// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include "MDBalancer.h"
#include "ScrubStack.h"
#include "SnapClient.h"

#include "include/ceph_fs.h"
#include "include/filepath.h"

#include "msg/Message.h"
#include "msg/Messenger.h"

#include "common/errno.h"
#include "common/safe_io.h"
#include "common/perf_counters.h"
#include "common/MemoryModel.h"
#include "osdc/Journaler.h"
#include "osdc/Filer.h"

#include "events/ESubtreeMap.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/EImportFinish.h"
#include "events/EFragment.h"
#include "events/ECommitted.h"
#include "events/ESessions.h"

#include "messages/MGenericMessage.h"

#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSCacheRejoin.h"

#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"

//#include "messages/MInodeUpdate.h"
#include "messages/MDirUpdate.h"
#include "messages/MCacheExpire.h"

#include "messages/MInodeFileCaps.h"

#include "messages/MLock.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"

#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"

#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"

#include "messages/MClientRequest.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"
#include "messages/MClientQuota.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MMDSFragmentNotify.h"

#include "messages/MGatherCaps.h"

#include "common/Timer.h"

#include "perfglue/heap_profiler.h"

#include "common/config.h"
#include "include/assert.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, mds)
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}

set<int> SimpleLock::empty_gather_set;
/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSInternalContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};
/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
};
class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  filer(m->objecter, m->finisher),
  exceeded_size_limit(false),
  stray_manager(m, purge_queue_)
{
  migrator.reset(new Migrator(mds, this));

  for (int i = 0; i < NUM_STRAY; ++i) {
    strays[i] = NULL;
  }

  num_inodes_with_caps = 0;

  max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
    (g_conf->mds_dir_max_commit_size << 20) :
    (0.9 *(g_conf->osd_max_write_size << 20));

  discover_last_tid = 0;
  open_ino_last_tid = 0;
  find_ino_peer_last_tid = 0;

  client_lease_durations[0] = 5.0;
  client_lease_durations[1] = 30.0;
  client_lease_durations[2] = 300.0;

  resolves_pending = false;
  rejoins_pending = false;
  cap_imports_num_opening = 0;

  opening_root = open = false;

  lru.lru_set_max(g_conf->mds_cache_size);
  lru.lru_set_midpoint(g_conf->mds_cache_mid);

  bottom_lru.lru_set_max(0);
  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf->mds_decay_halflife);

  did_shutdown_log_cap = false;
}

MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}
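
// log_stat: publish the current cache gauges (inode counts, pinned
// inodes, inodes with caps, total caps) to the MDS perf counters.
// Purely observational; nothing in the cache is modified here.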
void MDCache::log_stat()
{
  mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
}
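
// shutdown: called as the rank winds down.  A non-empty LRU at this
// point only produces a warning; it does not block shutdown.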
bool MDCache::shutdown()
{
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
  }
  return true;
}
// ====================================================================
// some inode functions

void MDCache::add_inode(CInode *in)
{
  // add to lru, inode map
  assert(inode_map.count(in->vino()) == 0);  // should be no dup inos!
  inode_map[ in->vino() ] = in;

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == MDS_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }

  if (CInode::count() >
      g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
    exceeded_size_limit = true;
  }
}
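
// remove_inode: the inverse of add_inode().  The caller must have
// dropped every reference first (see the get_num_ref() assert below).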
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn);   // leave dentry ... FIXME?
  }

  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  // remove from inode map
  inode_map.erase(o->vino());

  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  assert(o->get_num_ref() == 0);
  delete o;
}
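
// Default layouts: regular file data goes to the first data pool in the
// MDSMap, while the MDS journal lives in the metadata pool, optionally
// sized by mds_log_segment_size.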
file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_first_data_pool();
  return result;
}

file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_metadata_pool();
  if (g_conf->mds_log_segment_size > 0) {
    result.object_size = g_conf->mds_log_segment_size;
    result.stripe_unit = g_conf->mds_log_segment_size;
  }
  return result;
}
void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;
  in->inode.btime = ceph_clock_now();
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
    ++in->inode.rstat.rsubdirs;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}
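
// create_system_inode: allocate a fresh CInode, stamp it with the fixed
// system ino/mode via create_unlinked_system_inode(), and register it in
// the cache.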
CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}
CInode *MDCache::create_root_inode()
{
  CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
  i->inode.uid = g_conf->mds_root_ino_uid;
  i->inode.gid = g_conf->mds_root_ino_gid;
  i->inode.layout = default_file_layout;
  i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
  return i;
}
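
// create_empty_hierarchy: used when a brand-new filesystem is created;
// writes an empty root dirfrag and stores the root inode, completing via
// the supplied gather.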
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
  rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;

  root->inode.dirstat = rootdir->fnode.fragstat;
  root->inode.rstat = rootdir->fnode.rstat;
  ++root->inode.rstat.rsubdirs;
  root->inode.accounted_rstat = root->inode.rstat;

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->store(gather->new_sub());
}
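
// create_mydir_hierarchy: builds this rank's private ~mdsN directory and
// its strayN subdirectories, then commits them through the gather.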
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;

    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->_mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;
  CDentry *dn;
  version_t dpv;
  MDSInternalContextBase *fin;
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};
void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks.  hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
}
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mds->locker->drop_locks(mut.get());

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention.  Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};
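
// open_root_inode: the root-auth rank loads the root inode from the
// metadata pool; every other rank discovers it from the auth MDS.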
void MDCache::open_root_inode(MDSInternalContextBase *c)
{
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    CInode *in;
    in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
    in->fetch(c);
  } else {
    discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
  }
}
void MDCache::open_mydir_inode(MDSInternalContextBase *c)
{
  MDSGatherBuilder gather(g_ceph_context);

  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
  in->fetch(gather.new_sub());

  gather.set_finisher(c);
  gather.activate();
}
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      discover_dir_frag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}
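
// populate_mydir: ensure ~mdsN and all of its stray dirfrags are loaded
// (re-entering via C_MDS_RetryOpenRoot after each fetch) before declaring
// the cache open and waking waiters.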
void MDCache::populate_mydir()
{
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it.  Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }

    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    list<frag_t> ls;
    strays[i]->dirfragtree.get_leaves(ls);
    for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
      frag_t fg = *p;
      CDir *dir = strays[i]->get_dirfrag(fg);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, fg);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  stray_manager.set_num_strays(num_strays);

  dout(10) << "populate_mydir done" << dendl;
  open = true;
  mds->queue_waiters(waiting_for_open);
}
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
CDir *MDCache::get_stray_dir(CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CInode *strayi = get_stray();

  frag_t fg = strayi->pick_dirfrag(straydname);
  CDir *straydir = strayi->get_dirfrag(fg);
  return straydir;
}
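
// get_or_create_stray_dentry: returns the dentry under which an unlinked
// inode will be parked; a null dentry is created if none exists yet, and
// it is tagged STATE_STRAY either way.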
CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
{
  CDir *straydir = get_stray_dir(in);
  string straydname;
  in->name_stray_dentry(straydname);
  CDentry *straydn = straydir->lookup(straydname);
  if (!straydn) {
    straydn = straydir->add_null_dentry(straydname);
  } else {
    assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  return straydn;
}
MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}
// ====================================================================
// subtree management

void MDCache::list_subtrees(list<CDir*>& ls)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    ls.push_back(p->first);
  }
}
/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool do_eval)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  if (mds->is_any_replay() || mds->is_resolve())
    do_eval = false;

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }

    if (do_eval)
      eval_subtree_root(dir->get_inode());
  }
}
void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  assert(subtrees.count(dir));
  set<CDir*> oldbounds = subtrees[dir];

  // try merge at my root
  try_subtree_merge_at(dir);

  // try merge at my old bounds
  for (set<CDir*>::iterator p = oldbounds.begin();
       p != oldbounds.end();
       ++p)
    try_subtree_merge_at(*p);
}
class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};
void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;
  assert(subtrees.count(dir));

  if (mds->is_any_replay() || mds->is_resolve())
    do_eval = false;

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                              // we have a parent,
      parent->dir_auth == dir->dir_auth &&          // auth matches,
      dir->dir_auth.second == CDIR_AUTH_UNKNOWN &&  // auth is unambiguous,
      !dir->state_test(CDir::STATE_EXPORTBOUND) &&  // not an exportbound,
      !dir->state_test(CDir::STATE_AUXSUBTREE)) {   // not aux subtree
    // merge with parent.
    dout(10) << "  subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    for (set<CDir*>::iterator p = subtrees[dir].begin();
         p != subtrees[dir].end();
         ++p)
      subtrees[parent].insert(*p);

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(dir);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }

    if (do_eval)
      eval_subtree_root(dir->get_inode());
  }
}
void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mds->locker->drop_locks(mut.get());

  in->auth_unpin(this);
}
void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  //  (we should scatter the filelock on subtree bounds)
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}
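
// adjust_bounded_subtree_auth: like adjust_subtree_auth(), but the caller
// also supplies the bound dirfrags (export/import boundaries), so any
// intervening or stray subtrees are swallowed until the recorded bounds
// match the given set exactly.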
void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bound = *p;

    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << "  new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      } else {
        dout(10) << "  want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          dout(10) << "  new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << "  swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    } else {
      dout(10) << "  already have bound " << *bound << dendl;
    }
  }

  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << "  swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);
}
/*
 * return a set of CDir*'s that correspond to the given bound set.  Only adjust
 * fragmentation as necessary to get an equivalent bounding set.  That is, only
 * split if one of our frags spans the provided bounding set.  Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    byino[p->ino].insert(p->frag);
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
      frag_t fg = *q;
      list<frag_t> fgls;
      diri->dirfragtree.get_leaves_under(fg, fgls);
      if (fgls.empty()) {
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        list<frag_t> ls;
        tmpdft.get_leaves_under(approx_fg, ls);
        for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
          if (p->second.get().count(*r) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, *r);
            all = false;
          }
        }
        if (all)
          fgls.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, fgls);
      }
      dout(10) << " frag " << fg << " contains " << fgls << dendl;
      for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
        CDir *dir = diri->get_dirfrag(*r);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}
void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}
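
// map_dirfrag_set: resolve a list of dirfrag ids to the CDir objects we
// actually have in cache, expanding each requested frag to the leaves of
// the inode's current fragtree.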
void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) << "map_dirfrag_set " << dfs << dendl;

  // group by inode
  map<inodeno_t, fragset_t> ino_fragset;
  for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    ino_fragset[p->ino].insert(p->frag);

  // get frags
  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
       p != ino_fragset.end();
       ++p) {
    CInode *in = get_inode(p->first);
    if (!in)
      continue;

    list<frag_t> fglist;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      in->dirfragtree.get_leaves_under(*q, fglist);

    dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
             << " on " << *in << dendl;

    for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
      CDir *dir = in->get_dirfrag(*q);
      if (dir)
        result.insert(dir);
    }
  }
}
CDir *MDCache::get_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_parent_dir();
    if (!dir)
      return 0;             // none
  }
}

CDir *MDCache::get_projected_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_projected_parent_dir();
    if (!dir)
      return 0;             // none
  }
}
void MDCache::remove_subtree(CDir *dir)
{
  dout(10) << "remove_subtree " << *dir << dendl;
  assert(subtrees.count(dir));
  assert(subtrees[dir].empty());
  subtrees.erase(dir);
  dir->put(CDir::PIN_SUBTREE);
  if (dir->get_parent_dir()) {
    CDir *p = get_subtree_root(dir->get_parent_dir());
    assert(subtrees[p].count(dir));
    subtrees[p].erase(dir);
  }
}
void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  assert(subtrees.count(dir));
  bounds = subtrees[dir];
}
void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  if (subtrees.count(dir)) {
    // just copy them, dir is a subtree.
    get_subtree_bounds(dir, bounds);
  } else {
    // find them
    CDir *root = get_subtree_root(dir);
    for (set<CDir*>::iterator p = subtrees[root].begin();
         p != subtrees[root].end();
         ++p) {
      CDir *t = *p;
      while (t) {
        t = t->get_parent_dir();
        assert(t);
        if (t == dir) {
          bounds.insert(*p);
          continue;
        }
        if (t == root)
          break;
      }
    }
  }
}
void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));
  if (bounds != subtrees[dir]) {
    dout(0) << "verify_subtree_bounds failed" << dendl;
    set<CDir*> b = bounds;
    for (auto &cd : subtrees[dir]) {
      if (bounds.count(cd)) {
        b.erase(cd);
        continue;
      }
      dout(0) << "  missing bound " << *cd << dendl;
    }
    for (const auto &cd : b)
      dout(0) << "    extra bound " << *cd << dendl;
  }
  assert(bounds == subtrees[dir]);
}
void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));

  // make sure that any bounds i do have are properly noted as such.
  int failed = 0;
  for (const auto &fg : bounds) {
    CDir *bd = get_dirfrag(fg);
    if (!bd) continue;
    if (subtrees[dir].count(bd) == 0) {
      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
      failed++;
    }
  }
  assert(failed == 0);
}
void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
{
  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
           << " to " << *newdir << dendl;
  projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
}
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir,
                                          bool pop, bool imported)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    assert(p != projected_subtree_renames.end());
    assert(!p->second.empty());
    assert(p->second.front().first == olddir);
    assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust subtree
  list<CDir*> dfls;
  // make sure subtree dirfrags are at the front of the list
  diri->get_subtree_dirfrags(dfls);
  diri->get_nested_dirfrags(dfls);
  for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
    CDir *dir = *p;

    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
      continue;
    }

    if (dir->is_subtree_root()) {
      // children are fine.  change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      assert(subtrees[oldparent].count(dir));
      subtrees[oldparent].erase(dir);
      assert(subtrees.count(newparent));
      subtrees[newparent].insert(dir);
      try_subtree_merge_at(dir, !imported);
    } else {
      // see if any old bounds move to the new parent.
      list<CDir*> tomove;
      for (set<CDir*>::iterator p = subtrees[oldparent].begin();
           p != subtrees[oldparent].end();
           ++p) {
        CDir *bound = *p;
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
        CDir *bound = *p;
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        subtrees[oldparent].erase(bound);
        subtrees[newparent].insert(bound);
      }

      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority(), !imported);  // caller is responsible for *diri.
        try_subtree_merge_at(dir, !imported);
      }
    }
  }
}
void MDCache::get_fullauth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      s.insert(root);
  }
}

void MDCache::get_auth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_auth())
      s.insert(root);
  }
}
int MDCache::num_subtrees()
{
  return subtrees.size();
}

int MDCache::num_subtrees_fullauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      n++;
  }
  return n;
}

int MDCache::num_subtrees_fullnonauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_nonauth())
      n++;
  }
  return n;
}
// ===================================
// journal and snap/cow helpers

/*
 * find first inode in cache that follows given snapid.  otherwise, return current.
 */
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
  assert(in->last == CEPH_NOSNAP);

  SnapRealm *realm = in->find_snaprealm();
  const set<snapid_t>& snaps = realm->get_snaps();
  dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
  dout(10) << " snaps " << snaps << dendl;

  for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows);  // first item > follows
       p != snaps.end();
       ++p) {
    CInode *t = get_inode(in->ino(), *p);
    if (t) {
      in = t;
      dout(10) << "pick_inode_snap  snap " << *p << " found " << *in << dendl;
      break;
    }
  }
  return in;
}
/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items.  instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply().  that
 * means a special case in the dir commit clean sweep assertions.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  assert(last >= in->first);

  SnapRealm *realm = in->find_snaprealm();
  const set<snapid_t>& snaps = realm->get_snaps();

  // make sure snap inode's last match existing snapshots.
  // MDCache::pick_inode_snap() requires this.
  snapid_t last_snap = last;
  if (snaps.count(last) == 0) {
    set<snapid_t>::const_iterator p = snaps.upper_bound(last);
    if (p != snaps.begin()) {
      --p;
      if (*p >= in->first)
        last_snap = *p;
    }
  }

  CInode *oldin = new CInode(this, true, in->first, last_snap);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->symlink = in->symlink;
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    CInode *head_in = get_inode(in->ino());
    if (head_in->split_need_snapflush(oldin, in)) {
      oldin->client_snap_caps = in->client_snap_caps;
      for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
           p != in->client_snap_caps.end();
           ++p) {
        SimpleLock *lock = oldin->get_lock(p->first);
        for (auto q = p->second.begin(); q != p->second.end(); ++q) {
          oldin->auth_pin(lock);
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          lock->get_wrlock(true);
        }
      }
    }
    return oldin;
  }

  // clone caps?
  for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
       p != in->client_caps.end();
       ++p) {
    client_t client = p->first;
    Capability *cap = p->second;
    int issued = cap->issued();
    if ((issued & CEPH_CAP_ANY_WR) &&
        cap->client_follows < last) {
      for (int i = 0; i < num_cinode_locks; i++) {
        if (issued & cinode_lock_info[i].wr_caps) {
          int lockid = cinode_lock_info[i].lock;
          SimpleLock *lock = oldin->get_lock(lockid);
          oldin->client_snap_caps[lockid].insert(client);
          oldin->auth_pin(lock);
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          lock->get_wrlock(true);
          dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
                   << " wrlock lock " << *lock << " on " << *oldin << dendl;
        }
      }
      cap->client_follows = last;

      // we need snapflushes for any intervening snaps
      dout(10) << "  snaps " << snaps << dendl;
      for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
           q != snaps.end() && *q <= last;
           ++q) {
        in->add_need_snapflush(oldin, *q, client);
      }
    } else {
      dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
    }
  }

  return oldin;
}
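
// journal_cow_dentry: if a snapshot exists in the range being overwritten,
// clone the dentry (and, via cow_inode(), its primary inode) into an
// old [first,follows] instance and journal it so the snapshotted version
// survives the upcoming update.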
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  assert(!dnl->is_null());

  if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
    // multiversion inode.
    CInode *in = dnl->get_inode();
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = realm->get_newest_snap();

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here?  hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, false);
        }
      }

      if (in->snaprealm) {
        realm = in->snaprealm;
        follows = realm->get_newest_seq();
      } else
        follows = dir_follows;
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP)
        follows = realm->get_newest_seq();
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, false);
  } else {
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP)
      follows = realm->get_newest_seq();

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << "    dn " << *dn << dendl;
    if (in) {
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush)
        mut->ls->open_files.push_back(&oldin->item_open_file);
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}
void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
                                CInode *in, snapid_t follows,
                                CInode **pcow_inode)
{
  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
  CDentry *dn = in->get_projected_parent_dn();
  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
}
void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  if (in->is_base()) {
    metablob->add_root(true, in, in->get_projected_inode());
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows = in->first - 1;
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
                        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}
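
// The helpers below push rstat/fragstat deltas between an inode and its
// parent dirfrag, splitting the [first,last] snapid interval so that each
// on-disk segment is updated exactly once.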
// nested ---------------------------------------------------------------

void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  inode_t *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << "    frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME.  this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename).  but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the slave rename prep case. so we should
      // avoid updating the inode.
      assert(linkunlink < 0);
      assert(cur->is_frozen_inode());
      update = false;
    }
    _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
                                 linkunlink, update);
  }

  if (g_conf->mds_snap_rstat) {
    for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
         p != cur->dirty_old_rstats.end();
         ++p) {
      old_inode_t& old = cur->old_inodes[*p];
      snapid_t ofirst = MAX(old.first, floor);
      set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
      if (q == snaps.end() || *q > *p)
        continue;
      _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}
void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
                                           CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  inode           rstat " << inode.rstat << dendl;
  dout(20) << "  inode accounted_rstat " << inode.accounted_rstat << dendl;
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode.rstat);
    delta.sub(inode.accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode.accounted_rstat);
  } else {
    delta.add(inode.rstat);
  }
  dout(20) << "                  delta " << delta << dendl;

  if (update_inode)
    inode.accounted_rstat = inode.rstat;

  while (last >= ofirst) {
    /*
     * pick fnode version to update.  at each iteration, we want to
     * pick a segment ending in 'last' to update.  split as necessary
     * to make that work.  then, adjust first up so that we only
     * update one segment at a time.  then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    fnode_t *pf = parent->get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      if (g_conf->mds_snap_rstat)
        first = MAX(ofirst, parent->first);
      else
        first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
          !(pf->rstat == pf->accounted_rstat)) {
        dout(10) << "  target snapped and not fully accounted, cow to dirty_old_rstat ["
                 << parent->first << "," << (first-1) << "] "
                 << " " << *prstat << "/" << pf->accounted_rstat
                 << dendl;
        parent->dirty_old_rstat[first-1].first = parent->first;
        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
      if (p == parent->dirty_old_rstat.end()) {
        dout(20) << "  no dirty_old_rstat with last >= last " << last << dendl;
        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
          dout(20) << "  last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
          first = parent->dirty_old_rstat.rbegin()->first+1;
        }
      } else {
        // *p last is >= last
        if (p->second.first <= last) {
          // *p intersects [first,last]
          if (p->second.first < first) {
            dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
            parent->dirty_old_rstat[first-1] = p->second;
            p->second.first = first;
          }
          if (p->second.first > first)
            first = p->second.first;
          if (last < p->first) {
            dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
            parent->dirty_old_rstat[last] = p->second;
            p->second.first = last+1;
          }
        } else {
          // *p is to the _right_ of [first,last]
          p = parent->dirty_old_rstat.lower_bound(first);
          // new *p last is >= first
          if (p->second.first <= last &&  // new *p isn't also to the right, and
              p->first >= first) {        // it intersects our first bit,
            dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
            first = p->first+1;
          }
          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
        }
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << "  project to [" << first << "," << last << "] " << *prstat << dendl;
    assert(last >= first);
    prstat->add(delta);
    if (update_inode)
      inode.accounted_rstat = inode.rstat;
    dout(20) << "      result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    last = first-1;
  }
}
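
// project_rstat_frag_to_inode: the upward direction -- fold a dirfrag's
// rstat delta into the parent inode, either the head inode or the matching
// old_inode snapshot segments.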
void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                          snapid_t ofirst, snapid_t last,
                                          CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  frag           rstat " << rstat << dendl;
  dout(20) << "  frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << "                 delta " << delta << dendl;

  while (last >= ofirst) {
    inode_t *pi;
    snapid_t first;
    if (last == pin->last) {
      pi = pin->get_projected_inode();
      first = MAX(ofirst, pin->first);
      if (first > pin->first) {
        old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
        dout(20) << "   cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (last >= pin->first) {
        first = pin->first;
        pin->cow_old_inode(last, cow_head);
      } else {
        // our life is easier here because old_inodes is not sparse
        // (although it may not begin at snapid 1)
        compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
        if (p == pin->old_inodes.end()) {
          dout(10) << " no old_inode <= " << last << ", done." << dendl;
          break;
        }
        first = p->second.first;
        if (first > last) {
          dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
          //assert(p == pin->old_inodes.begin());
          break;
        }
        if (p->first > last) {
          dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
                   << (last+1) << "," << p->first << "]" << dendl;
          pin->old_inodes[last] = p->second;
          p->second.first = last+1;
          pin->dirty_old_rstats.insert(p->first);
        }
      }
      if (first < ofirst) {
        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
                 << first << "," << ofirst-1 << "]" << dendl;
        pin->old_inodes[ofirst-1] = pin->old_inodes[last];
        pin->dirty_old_rstats.insert(ofirst-1);
        pin->old_inodes[last].first = first = ofirst;
      }
      pi = &pin->old_inodes[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << "        result [" << first << "," << last << "] " << pi->rstat << dendl;

    last = first-1;
  }
}
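
// broadcast_quota_to_client: push the latest rstat/quota to every client
// session that advertises CEPH_FEATURE_MDS_QUOTA, but only when usage has
// moved enough relative to the limit to matter; replicas of the inode are
// sent MGatherCaps.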
void MDCache::broadcast_quota_to_client(CInode *in)
{
  if (!in->is_auth() || in->is_frozen())
    return;

  inode_t *i = in->get_projected_inode();

  if (!i->quota.is_enable())
    return;

  for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
       it != in->client_caps.end();
       ++it) {
    Session *session = mds->get_session(it->first);
    if (!session || !session->connection ||
        !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
      continue;

    Capability *cap = it->second;
    if (cap->last_rbytes == i->rstat.rbytes &&
        cap->last_rsize == i->rstat.rsize())
      continue;

    if (i->quota.max_files > 0) {
      if (i->rstat.rsize() >= i->quota.max_files)
        goto update;

      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
          abs(cap->last_rsize - i->rstat.rsize()))
        goto update;
    }

    if (i->quota.max_bytes > 0) {
      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
        goto update;

      if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
          abs(cap->last_rbytes - i->rstat.rbytes))
        goto update;
    }

    continue;

update:
    cap->last_rsize = i->rstat.rsize();
    cap->last_rbytes = i->rstat.rbytes;

    MClientQuota *msg = new MClientQuota();
    msg->ino = in->ino();
    msg->rstat = i->rstat;
    msg->quota = i->quota;
    mds->send_message_client_counted(msg, session->connection);
  }
  for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
       it != in->replicas_end();
       ++it) {
    MGatherCaps *msg = new MGatherCaps;
    msg->ino = in->ino();
    mds->send_message_mds(msg, it->first);
  }
}
/*
 * NOTE: we _have_ to delay the scatter if we are called during a
 * rejoin, because we can't twiddle locks between when the
 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
 * (no requests), and a survivor acks immediately.  _except_ that
 * during rejoin_(weak|strong) processing, we may complete a lock
 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
 * scatterlock state in that case or the lock states will get out of
 * sync between the auth and replica.
 *
 * the simple solution is to never do the scatter here.  instead, put
 * the scatterlock on a list if it isn't already wrlockable.  this is
 * probably the best plan anyway, since we avoid too many
 * scatters/locks under normal usage.
 */
/*
 * some notes on dirlock/nestlock scatterlock semantics:
 *
 * the fragstat (dirlock) will never be updated without
 * dirlock+nestlock wrlock held by the caller.
 *
 * the rstat (nestlock) _may_ get updated without a wrlock when nested
 * data is pushed up the tree.  this could be changed with some
 * restructuring here, but in its current form we ensure that the
 * fragstat+rstat _always_ reflect an accurate summation over the dir
 * frag, which is nice.  and, we only need to track frags that need to
 * be nudged (and not inodes with pending rstat changes that need to
 * be pushed into the frag).  a consequence of this is that the
 * accounted_rstat on scatterlock sync may not match our current
 * rstat.  this is normal and expected.
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                       CInode *in, CDir *parent,
                                       int flags, int linkunlink,
                                       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
           << (do_parent_mtime ? " do_parent_mtime":"")
           << " linkunlink=" << linkunlink
           << (primary_dn ? " primary_dn":" remote_dn")
           << (shallow ? " SHALLOW":"")
           << " follows " << cfollows
           << " " << *in << dendl;

  if (!parent) {
    assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  while (parent) {
    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
    assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);
    mut->add_projected_fnode(parent);

    fnode_t *pf = parent->project_fnode();
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      assert(mut->wrlocks.count(&pin->filelock));
      assert(mut->wrlocks.count(&pin->nestlock));
      assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
        pf->fragstat.mtime = mut->get_op_stamp();
        pf->fragstat.change_attr++;
        dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
        if (pf->fragstat.mtime > pf->rstat.rctime) {
          dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
          pf->rstat.rctime = pf->fragstat.mtime;
        } else {
          dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
        }
      }
      if (linkunlink) {
        dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
        if (in->is_dir()) {
          pf->fragstat.nsubdirs += linkunlink;
          //pf->rstat.rsubdirs += linkunlink;
        } else {
          pf->fragstat.nfiles += linkunlink;
          //pf->rstat.rfiles += linkunlink;
        }
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
                                pin->versionlock.can_wrlock())) {
      dout(20) << " unwritable parent nestlock " << pin->nestlock
               << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
        assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
      }

      if (mut->wrlocks.count(&pin->nestlock) == 0) {
        dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
        mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
        follows = prealm->get_newest_seq();

      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    if (!stop && !first &&
        g_conf->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf->mds_dirstat_min_interval) {
        dout(10) << "predirty_journal_parents last prop " << since_last_prop
                 << " < " << g_conf->mds_dirstat_min_interval
                 << ", stopping" << dendl;
        stop = true;
      } else {
        dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
    if (!stop &&
        mut->wrlocks.count(&pin->nestlock) == 0 &&
        (!pin->versionlock.can_wrlock() ||  // make sure we can take versionlock, too
         !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
         )) {  // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
               << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
        mds->locker->mark_updated_scatterlock(&pin->filelock);
        mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
        mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->wrlocks.count(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    assert(mut->wrlocks.count(&pin->nestlock) ||
           mut->is_slave());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    mut->add_projected_inode(pin);
    lsi.push_front(pin);

    pin->pre_cow_old_inode();  // avoid cow mayhem!

    inode_t *pi = pin->project_inode();
    pi->version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
        pi->mtime = pi->ctime = pi->dirstat.mtime;
      if (touched_chattr)
        pi->change_attr = pi->dirstat.change_attr;
      dout(20) << "predirty_journal_parents     gives " << pi->dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
        if (pi->dirstat.size() < 0)
          assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
        if (pi->dirstat.size() != pf->fragstat.size()) {
          mds->clog->error() << "unmatched fragstat size on single dirfrag "
             << parent->dirfrag() << ", inode has " << pi->dirstat
             << ", dirfrag has " << pf->fragstat;

          // trust the dirfrag for now
          pi->dirstat = pf->fragstat;

          assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
        }
      }
    }

    /*
     * the rule here is to follow the _oldest_ parent with dirty rstat
     * data.  if we don't propagate all data, we add ourselves to the
     * nudge list.  that way all rstat data will (eventually) get
     * pushed up the tree.
     *
     * actually, no.  for now, silently drop rstats for old parents.  we need
     * hard link backpointers to do the above properly.
     */

    // stop?
    if (pin->is_base())
      break;
    parentdn = pin->get_projected_parent_dn();
    assert(parentdn);

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf->mds_snap_rstat) {
      for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
           p != parent->dirty_old_rstat.end();
           ++p)
        project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
                                    p->first, pin, true);//false);
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi->rstat.rbytes != pf->rstat.rbytes) {
        mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
          << parent->dirfrag() << ", inode has " << pi->rstat
          << ", dirfrag has " << pf->rstat;

        // trust the dirfrag for now
        pi->rstat = pf->rstat;

        assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);

    // next parent!
    cur = pin;
    parent = parentdn->get_dir();
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  assert(parent);
  assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (list<CInode*>::iterator p = lsi.begin();
       p != lsi.end();
       ++p) {
    CInode *cur = *p;
    journal_dirty_inode(mut.get(), blob, cur);
  }
}
// ===================================
// slave requests

/*
 * some handlers for master requests with slaves.  we need to make
 * sure slaves journal commits before we forget we mastered them and
 * remove them from the uncommitted_masters map (used during recovery
 * to commit|abort slaves).
 */
struct C_MDC_CommittedMaster : public MDCacheLogContext {
  metareqid_t reqid;
  C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_master_commit(reqid);
  }
};

void MDCache::log_master_commit(metareqid_t reqid)
{
  dout(10) << "log_master_commit " << reqid << dendl;
  uncommitted_masters[reqid].committing = true;
  mds->mdlog->start_submit_entry(new ECommitted(reqid),
                                 new C_MDC_CommittedMaster(this, reqid));
}

void MDCache::_logged_master_commit(metareqid_t reqid)
{
  dout(10) << "_logged_master_commit " << reqid << dendl;
  assert(uncommitted_masters.count(reqid));
  uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
  mds->queue_waiters(uncommitted_masters[reqid].waiters);
  uncommitted_masters.erase(reqid);
}
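// An uncommitted_masters entry survives until every slave has journaled its
// commit and replied (see committed_master_slave below); only then do we log
// ECommitted and drop the entry, so a recovering slave can always learn the
// final disposition of the request from us.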
void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
{
  dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
  assert(uncommitted_masters.count(r));
  uncommitted_masters[r].slaves.erase(from);
  if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
    log_master_commit(r);
}

void MDCache::logged_master_update(metareqid_t reqid)
{
  dout(10) << "logged_master_update " << reqid << dendl;
  assert(uncommitted_masters.count(reqid));
  uncommitted_masters[reqid].safe = true;
  if (pending_masters.count(reqid)) {
    pending_masters.erase(reqid);
    if (pending_masters.empty())
      process_delayed_resolve();
  }
}
/*
 * Master may crash after receiving all slaves' commit acks, but before journalling
 * the final commit. Slaves may crash after journalling the slave commit, but before
 * sending commit ack to the master. Commit masters with no uncommitted slave when
 * resolve finishes.
 */
void MDCache::finish_committed_masters()
{
  for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
       p != uncommitted_masters.end();
       ++p) {
    p->second.recovering = false;
    if (!p->second.committing && p->second.slaves.empty()) {
      dout(10) << "finish_committed_masters " << p->first << dendl;
      log_master_commit(p->first);
    }
  }
}
/*
 * at end of resolve... we must journal a commit|abort for all slave
 * updates, before moving on.
 *
 * this is so that the master can safely journal ECommitted on ops it
 * masters when it reaches up:active (all other recovering nodes must
 * complete resolve before that happens).
 */
struct C_MDC_SlaveCommit : public MDCacheLogContext {
  mds_rank_t from;
  metareqid_t reqid;
  C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_slave_commit(from, reqid);
  }
};

void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
{
  dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;

  // send a message
  MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, from);
}
// ====================================================================
// import map, recovery

void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                                      map<dirfrag_t,vector<dirfrag_t> >& subtrees)
{
  if (subtrees.count(oldparent)) {
    vector<dirfrag_t>& v = subtrees[oldparent];
    dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
    for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
      if (*it == df) {
        v.erase(it);
        break;
      }
  }
  if (subtrees.count(newparent)) {
    vector<dirfrag_t>& v = subtrees[newparent];
    dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
    v.push_back(df);
  }
}
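// create_subtree_map journals an ESubtreeMap snapshot of which subtrees this
// rank is authoritative for (plus any still-ambiguous imports), together with
// enough of a spanning tree in the metablob to anchor them at the root, so
// that replay can rebuild the subtree map from the journal alone.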
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
           << num_subtrees_fullauth() << " fullauth"
           << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // journal subtree as "ours" if we are
    //   me, -2
    //   me, me
    //   me, !me (may be importing and ambiguous!)

    // so not
    //   !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
        my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds
    for (set<CDir*>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      CDir *bound = *q;
      dout(15) << " subtree bound " << *bound << dendl;
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
       p != projected_subtree_renames.end();
       ++p) {
    for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
      CInode *diri = p->first;
      CDir *olddir = q->first;
      CDir *newdir = q->second;
      dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      list<CDir*> dfls;
      diri->get_dirfrags(dfls);
      for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
        CDir *dir = *p;
        dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
        CDir *oldparent = get_projected_subtree_root(olddir);
        dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
        CDir *newparent = get_projected_subtree_root(newdir);
        dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

        if (oldparent == newparent) {
          dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
                   << oldparent->dirfrag() << dendl;
          continue;
        }

        if (dir->is_subtree_root()) {
          if (le->subtrees.count(newparent->dirfrag()) &&
              oldparent->get_dir_auth() != newparent->get_dir_auth())
            dirs_to_add[dir->dirfrag()] = dir;
          // children are fine.  change parent.
          _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
                                  le->subtrees);
        } else {
          // mid-subtree.
          if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
            dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
            // if oldparent is auth, subtree is mine; include it.
            if (le->subtrees.count(oldparent->dirfrag())) {
              dirs_to_add[dir->dirfrag()] = dir;
              le->subtrees[dir->dirfrag()].clear();
            }
            // if newparent is auth, subtree is a new bound
            if (le->subtrees.count(newparent->dirfrag())) {
              dirs_to_add[dir->dirfrag()] = dir;
              le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag());  // newparent is auth; new bound
            }
            newparent = dir;
          }

          // see if any old bounds move to the new parent.
          for (set<CDir*>::iterator p = subtrees[oldparent].begin();
               p != subtrees[oldparent].end();
               ++p) {
            CDir *bound = *p;
            if (dir->contains(bound->get_parent_dir()))
              _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
                                      le->subtrees);
          }
        }
      }
    }
  }

  // simplify the journaled map.  our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing.  but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
    if (le->ambiguous_subtrees.count(p->first))
      continue;
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (le->subtrees.count(b) &&
          le->ambiguous_subtrees.count(b) == 0) {
        vector<dirfrag_t>& bb = le->subtrees[b];
        dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
        for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
          p->second.push_back(*r);
        dirs_to_add.erase(b);
        le->subtrees.erase(b);
        p->second.erase(p->second.begin() + i);
      } else {
        ++i;
      }
    }
  }

  for (auto p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
void MDCache::dump_resolve_status(Formatter *f) const
{
  f->open_object_section("resolve_status");
  f->dump_stream("resolve_gather") << resolve_gather;
  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
  f->close_section();
}

void MDCache::resolve_start(MDSInternalContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN.  during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // unknown.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  resolve_gather = recovery_set;
}
void MDCache::send_resolves()
{
  send_slave_resolves();
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
             << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!need_resolve_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
             << need_resolve_rollback << ")" << dendl;
    return;
  }
  send_subtree_resolves();
}
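// Resolve is sent in two parts: send_slave_resolves() advertises our
// uncommitted slave requests to their masters, and send_subtree_resolves()
// claims subtrees once all resolve acks and pending rollbacks have completed.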
void MDCache::send_slave_resolves()
{
  dout(10) << "send_slave_resolves" << dendl;

  map<mds_rank_t, MMDSResolve*> resolves;

  if (mds->is_resolve()) {
    for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
         p != uncommitted_slave_updates.end();
         ++p) {
      resolves[p->first] = new MMDSResolve;
      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
           q != p->second.end();
           ++q) {
        dout(10) << " including uncommitted " << q->first << dendl;
        resolves[p->first]->add_slave_request(q->first, false);
      }
    }
  } else {
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
         p != active_requests.end();
         ++p) {
      MDRequestRef& mdr = p->second;
      if (!mdr->is_slave())
        continue;
      if (!mdr->slave_did_prepare() && !mdr->committing) {
        continue;
      }
      mds_rank_t master = mdr->slave_to_mds;
      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
        dout(10) << " including uncommitted " << *mdr << dendl;
        if (!resolves.count(master))
          resolves[master] = new MMDSResolve;
        if (!mdr->committing &&
            mdr->has_more() && mdr->more()->is_inode_exporter) {
          // re-send cap exports
          CInode *in = mdr->more()->rename_inode;
          map<client_t, Capability::Export> cap_map;
          in->export_client_caps(cap_map);
          bufferlist bl;
          ::encode(in->ino(), bl);
          ::encode(cap_map, bl);
          resolves[master]->add_slave_request(p->first, bl);
        } else {
          resolves[master]->add_slave_request(p->first, mdr->committing);
        }
      }
    }
  }

  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
       p != resolves.end();
       ++p) {
    dout(10) << "sending slave resolve to mds." << p->first << dendl;
    mds->send_message_mds(p->second, p->first);
    resolve_ack_gather.insert(p->first);
  }
}
void MDCache::send_subtree_resolves()
{
  dout(10) << "send_subtree_resolves" << dendl;

  if (migrator->is_exporting() || migrator->is_importing()) {
    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
    migrator->show_importing();
    migrator->show_exporting();
    resolves_pending = true;
    return;  // not now
  }

  map<mds_rank_t, MMDSResolve*> resolves;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (*p == mds->get_nodeid())
      continue;
    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
      resolves[*p] = new MMDSResolve;
  }

  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;

  // known
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only our subtrees
    if (dir->authority().first != mds->get_nodeid())
      continue;

    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
      continue;  // we'll add it below

    if (migrator->is_ambiguous_import(dir->dirfrag())) {
      // ambiguous (mid-import)
      set<CDir*> bounds;
      get_subtree_bounds(dir, bounds);
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
        dfls.push_back((*q)->dirfrag());

      my_ambig_imports[dir->dirfrag()] = dfls;
      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;

    } else {
      // not ambiguous.
      for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
           q != resolves.end();
           ++q)
        resolves[q->first]->add_subtree(dir->dirfrag());
      // bounds too
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = subtrees[dir].begin();
           q != subtrees[dir].end();
           ++q) {
        CDir *bound = *q;
        dfls.push_back(bound->dirfrag());
      }

      my_subtrees[dir->dirfrag()] = dfls;
      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
    }
  }

  // ambiguous
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
       p != my_ambiguous_imports.end();
       ++p) {
    my_ambig_imports[p->first] = p->second;
    dout(10) << " ambig " << p->first << " " << p->second << dendl;
  }

  // simplify the claimed subtree.
  for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (my_subtrees.count(b)) {
        vector<dirfrag_t>& bb = my_subtrees[b];
        dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
        for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
          p->second.push_back(*r);
        my_subtrees.erase(b);
        p->second.erase(p->second.begin() + i);
      } else {
        ++i;
      }
    }
  }

  // send
  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
       p != resolves.end();
       ++p) {
    MMDSResolve* m = p->second;
    m->subtrees = my_subtrees;
    m->ambiguous_imports = my_ambig_imports;
    dout(10) << "sending subtree resolve to mds." << p->first << dendl;
    mds->send_message_mds(m, p->first);
  }
  resolves_pending = false;
}
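// Each MMDSResolve sent above therefore carries three things: the subtrees we
// claim (with their bounds), the imports we are still ambiguous about, and --
// from send_slave_resolves() -- any uncommitted slave requests of ours.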
2870 void MDCache::handle_mds_failure(mds_rank_t who
)
2872 dout(7) << "handle_mds_failure mds." << who
<< dendl
;
2874 dout(1) << "handle_mds_failure mds." << who
<< " : recovery peers are " << recovery_set
<< dendl
;
2876 resolve_gather
.insert(who
);
2877 discard_delayed_resolve(who
);
2878 ambiguous_slave_updates
.erase(who
);
2880 rejoin_gather
.insert(who
);
2881 rejoin_sent
.erase(who
); // i need to send another
2882 rejoin_ack_sent
.erase(who
); // i need to send another
2883 rejoin_ack_gather
.erase(who
); // i'll need/get another.
2885 dout(10) << " resolve_gather " << resolve_gather
<< dendl
;
2886 dout(10) << " resolve_ack_gather " << resolve_ack_gather
<< dendl
;
2887 dout(10) << " rejoin_sent " << rejoin_sent
<< dendl
;
2888 dout(10) << " rejoin_gather " << rejoin_gather
<< dendl
;
2889 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather
<< dendl
;
2892 // tell the migrator too.
2893 migrator
->handle_mds_failure_or_stop(who
);
2895 // clean up any requests slave to/from this node
2896 list
<MDRequestRef
> finish
;
2897 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2898 p
!= active_requests
.end();
2900 MDRequestRef
& mdr
= p
->second
;
2901 // slave to the failed node?
2902 if (mdr
->slave_to_mds
== who
) {
2903 if (mdr
->slave_did_prepare()) {
2904 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2905 if (is_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
))
2906 remove_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
2908 if (!mdr
->more()->waiting_on_slave
.empty()) {
2909 assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2910 // will rollback, no need to wait
2911 if (mdr
->slave_request
) {
2912 mdr
->slave_request
->put();
2913 mdr
->slave_request
= 0;
2915 mdr
->more()->waiting_on_slave
.clear();
2917 } else if (!mdr
->committing
) {
2918 dout(10) << " slave request " << *mdr
<< " has no prepare, finishing up" << dendl
;
2919 if (mdr
->slave_request
|| mdr
->slave_rolling_back())
2920 mdr
->aborted
= true;
2922 finish
.push_back(mdr
);
2926 if (mdr
->is_slave() && mdr
->slave_did_prepare()) {
2927 if (mdr
->more()->waiting_on_slave
.count(who
)) {
2928 assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2929 dout(10) << " slave request " << *mdr
<< " no longer need rename notity ack from mds."
2931 mdr
->more()->waiting_on_slave
.erase(who
);
2932 if (mdr
->more()->waiting_on_slave
.empty() && mdr
->slave_request
)
2933 mds
->queue_waiter(new C_MDS_RetryRequest(this, mdr
));
2936 if (mdr
->more()->srcdn_auth_mds
== who
&&
2937 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(mdr
->slave_to_mds
)) {
2938 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2939 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2940 add_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
2942 } else if (mdr
->slave_request
) {
2943 MMDSSlaveRequest
*slave_req
= mdr
->slave_request
;
2944 // FIXME: Slave rename request can arrive after we notice mds failure.
2945 // This can cause mds to crash (does not affect integrity of FS).
2946 if (slave_req
->get_op() == MMDSSlaveRequest::OP_RENAMEPREP
&&
2947 slave_req
->srcdn_auth
== who
)
2948 slave_req
->mark_interrupted();
2951 // failed node is slave?
2952 if (mdr
->is_master() && !mdr
->committing
) {
2953 if (mdr
->more()->srcdn_auth_mds
== who
) {
2954 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
2955 << who
<< " to recover" << dendl
;
2956 assert(mdr
->more()->witnessed
.count(who
) == 0);
2957 if (mdr
->more()->is_ambiguous_auth
)
2958 mdr
->clear_ambiguous_auth();
2959 // rename srcdn's auth mds failed, all witnesses will rollback
2960 mdr
->more()->witnessed
.clear();
2961 pending_masters
.erase(p
->first
);
2964 if (mdr
->more()->witnessed
.count(who
)) {
2965 mds_rank_t srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
2966 if (srcdn_auth
>= 0 && mdr
->more()->waiting_on_slave
.count(srcdn_auth
)) {
2967 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
2968 << mdr
->more()->srcdn_auth_mds
<< " to reply" << dendl
;
2969 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2970 // until either the request is committing or the slave also fails.
2971 assert(mdr
->more()->waiting_on_slave
.size() == 1);
2972 pending_masters
.insert(p
->first
);
2974 dout(10) << " master request " << *mdr
<< " no longer witnessed by slave mds."
2975 << who
<< " to recover" << dendl
;
2976 if (srcdn_auth
>= 0)
2977 assert(mdr
->more()->witnessed
.count(srcdn_auth
) == 0);
2979 // discard this peer's prepare (if any)
2980 mdr
->more()->witnessed
.erase(who
);
2984 if (mdr
->more()->waiting_on_slave
.count(who
)) {
2985 dout(10) << " master request " << *mdr
<< " waiting for slave mds." << who
2986 << " to recover" << dendl
;
2987 // retry request when peer recovers
2988 mdr
->more()->waiting_on_slave
.erase(who
);
2989 if (mdr
->more()->waiting_on_slave
.empty())
2990 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(this, mdr
));
2993 if (mdr
->locking
&& mdr
->locking_target_mds
== who
)
2994 mdr
->finish_locking(mdr
->locking
);
2998 for (map
<metareqid_t
, umaster
>::iterator p
= uncommitted_masters
.begin();
2999 p
!= uncommitted_masters
.end();
3001 // The failed MDS may have already committed the slave update
3002 if (p
->second
.slaves
.count(who
)) {
3003 p
->second
.recovering
= true;
3004 p
->second
.slaves
.erase(who
);
3008 while (!finish
.empty()) {
3009 dout(10) << "cleaning up slave request " << *finish
.front() << dendl
;
3010 request_finish(finish
.front());
3014 kick_find_ino_peers(who
);
3015 kick_open_ino_peers(who
);
3017 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
3018 p
!= fragments
.end(); ) {
3019 dirfrag_t df
= p
->first
;
3020 fragment_info_t
& info
= p
->second
;
3022 if (info
.is_fragmenting())
3024 dout(10) << "cancelling fragment " << df
<< " bit " << info
.bits
<< dendl
;
3026 info
.dirs
.swap(dirs
);
3027 fragments
.erase(df
);
3028 fragment_unmark_unfreeze_dirs(dirs
);
3031 // MDCache::shutdown_export_strays() always exports strays to mds.0
3032 if (who
== mds_rank_t(0))
3033 shutdown_exported_strays
.clear();
3039 * handle_mds_recovery - called on another node's transition
3040 * from resolve -> active.
3042 void MDCache::handle_mds_recovery(mds_rank_t who
)
3044 dout(7) << "handle_mds_recovery mds." << who
<< dendl
;
3046 // exclude all discover waiters. kick_discovers() will do the job
3047 static const uint64_t i_mask
= CInode::WAIT_ANY_MASK
& ~CInode::WAIT_DIR
;
3048 static const uint64_t d_mask
= CDir::WAIT_ANY_MASK
& ~CDir::WAIT_DENTRY
;
3050 list
<MDSInternalContextBase
*> waiters
;
3052 // wake up any waiters in their subtrees
3053 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3054 p
!= subtrees
.end();
3056 CDir
*dir
= p
->first
;
3058 if (dir
->authority().first
!= who
||
3059 dir
->authority().second
== mds
->get_nodeid())
3061 assert(!dir
->is_auth());
3067 while (!q
.empty()) {
3068 CDir
*d
= q
.front();
3070 d
->take_waiting(d_mask
, waiters
);
3072 // inode waiters too
3073 for (CDir::map_t::iterator p
= d
->items
.begin();
3074 p
!= d
->items
.end();
3076 CDentry
*dn
= p
->second
;
3077 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3078 if (dnl
->is_primary()) {
3079 dnl
->get_inode()->take_waiting(i_mask
, waiters
);
3083 dnl
->get_inode()->get_dirfrags(ls
);
3084 for (list
<CDir
*>::iterator p
= ls
.begin();
3088 if (!subdir
->is_subtree_root())
3089 q
.push_back(subdir
);
3096 kick_open_ino_peers(who
);
3097 kick_find_ino_peers(who
);
3100 mds
->queue_waiters(waiters
);
3103 void MDCache::set_recovery_set(set
<mds_rank_t
>& s
)
3105 dout(7) << "set_recovery_set " << s
<< dendl
;
3111 * during resolve state, we share resolves to determine who
3112 * is authoritative for which trees. we expect to get an resolve
3113 * from _everyone_ in the recovery_set (the mds cluster at the time of
3114 * the first failure).
3116 * This functions puts the passed message before returning
3118 void MDCache::handle_resolve(MMDSResolve
*m
)
3120 dout(7) << "handle_resolve from " << m
->get_source() << dendl
;
3121 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3123 if (mds
->get_state() < MDSMap::STATE_RESOLVE
) {
3124 if (mds
->get_want_state() == CEPH_MDS_STATE_RESOLVE
) {
3125 mds
->wait_for_resolve(new C_MDS_RetryMessage(mds
, m
));
3128 // wait until we reach the resolve stage!
3133 discard_delayed_resolve(from
);
3135 // ambiguous slave requests?
3136 if (!m
->slave_requests
.empty()) {
3137 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3138 for (auto p
= m
->slave_requests
.begin(); p
!= m
->slave_requests
.end(); ++p
) {
3139 if (uncommitted_masters
.count(p
->first
) && !uncommitted_masters
[p
->first
].safe
) {
3140 assert(!p
->second
.committing
);
3141 pending_masters
.insert(p
->first
);
3145 if (!pending_masters
.empty()) {
3146 dout(10) << " still have pending updates, delay processing slave resolve" << dendl
;
3147 delayed_resolve
[from
] = m
;
3152 MMDSResolveAck
*ack
= new MMDSResolveAck
;
3153 for (auto p
= m
->slave_requests
.begin(); p
!= m
->slave_requests
.end(); ++p
) {
3154 if (uncommitted_masters
.count(p
->first
)) { //mds->sessionmap.have_completed_request(p->first)) {
3156 if (p
->second
.committing
) {
3157 // already committing, waiting for the OP_COMMITTED slave reply
3158 dout(10) << " already committing slave request " << *p
<< " noop "<< dendl
;
3160 dout(10) << " ambiguous slave request " << *p
<< " will COMMIT" << dendl
;
3161 ack
->add_commit(p
->first
);
3163 uncommitted_masters
[p
->first
].slaves
.insert(from
); // wait for slave OP_COMMITTED before we log ECommitted
3165 if (p
->second
.inode_caps
.length() > 0) {
3166 // slave wants to export caps (rename)
3167 assert(mds
->is_resolve());
3170 map
<client_t
,Capability::Export
> cap_exports
;
3171 bufferlist::iterator q
= p
->second
.inode_caps
.begin();
3173 ::decode(cap_exports
, q
);
3175 assert(get_inode(ino
));
3177 for (map
<client_t
,Capability::Export
>::iterator q
= cap_exports
.begin();
3178 q
!= cap_exports
.end();
3180 Capability::Import
& im
= rejoin_imported_caps
[from
][ino
][q
->first
];
3181 im
.cap_id
= ++last_cap_id
; // assign a new cap ID
3183 im
.mseq
= q
->second
.mseq
;
3186 // will process these caps in rejoin stage
3187 rejoin_slave_exports
[ino
].first
= from
;
3188 rejoin_slave_exports
[ino
].second
.swap(cap_exports
);
3190 // send information of imported caps back to slave
3191 ::encode(rejoin_imported_caps
[from
][ino
], ack
->commit
[p
->first
]);
3195 dout(10) << " ambiguous slave request " << *p
<< " will ABORT" << dendl
;
3196 assert(!p
->second
.committing
);
3197 ack
->add_abort(p
->first
);
3200 mds
->send_message(ack
, m
->get_connection());
3205 if (!resolve_ack_gather
.empty() || !need_resolve_rollback
.empty()) {
3206 dout(10) << "delay processing subtree resolve" << dendl
;
3207 delayed_resolve
[from
] = m
;
3211 bool survivor
= false;
3212 // am i a surviving ambiguous importer?
3213 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3215 // check for any import success/failure (from this node)
3216 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
3217 while (p
!= my_ambiguous_imports
.end()) {
3218 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator next
= p
;
3220 CDir
*dir
= get_dirfrag(p
->first
);
3222 dout(10) << "checking ambiguous import " << *dir
<< dendl
;
3223 if (migrator
->is_importing(dir
->dirfrag()) &&
3224 migrator
->get_import_peer(dir
->dirfrag()) == from
) {
3225 assert(migrator
->get_import_state(dir
->dirfrag()) == Migrator::IMPORT_ACKING
);
3227 // check if sender claims the subtree
3228 bool claimed_by_sender
= false;
3229 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= m
->subtrees
.begin();
3230 q
!= m
->subtrees
.end();
3232 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3233 CDir
*base
= get_force_dirfrag(q
->first
, false);
3234 if (!base
|| !base
->contains(dir
))
3235 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3239 get_force_dirfrag_bound_set(q
->second
, bounds
);
3240 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
3242 if (bound
->contains(dir
)) {
3243 inside
= false; // nope, bound is dir or parent of dir, not inside.
3248 claimed_by_sender
= true;
3251 my_ambiguous_imports
.erase(p
); // no longer ambiguous.
3252 if (claimed_by_sender
) {
3253 dout(7) << "ambiguous import failed on " << *dir
<< dendl
;
3254 migrator
->import_reverse(dir
);
3256 dout(7) << "ambiguous import succeeded on " << *dir
<< dendl
;
3257 migrator
->import_finish(dir
, true);
3264 // update my dir_auth values
3265 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3266 // migrations between other nodes)
3267 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator pi
= m
->subtrees
.begin();
3268 pi
!= m
->subtrees
.end();
3270 dout(10) << "peer claims " << pi
->first
<< " bounds " << pi
->second
<< dendl
;
3271 CDir
*dir
= get_force_dirfrag(pi
->first
, !survivor
);
3274 adjust_bounded_subtree_auth(dir
, pi
->second
, from
);
3275 try_subtree_merge(dir
);
3280 // note ambiguous imports too
3281 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator pi
= m
->ambiguous_imports
.begin();
3282 pi
!= m
->ambiguous_imports
.end();
3284 dout(10) << "noting ambiguous import on " << pi
->first
<< " bounds " << pi
->second
<< dendl
;
3285 other_ambiguous_imports
[from
][pi
->first
].swap( pi
->second
);
3288 // did i get them all?
3289 resolve_gather
.erase(from
);
3291 maybe_resolve_finish();
void MDCache::process_delayed_resolve()
{
  dout(10) << "process_delayed_resolve" << dendl;
  map<mds_rank_t, MMDSResolve*> tmp;
  tmp.swap(delayed_resolve);
  for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
    handle_resolve(p->second);
}

void MDCache::discard_delayed_resolve(mds_rank_t who)
{
  if (delayed_resolve.count(who)) {
    delayed_resolve[who]->put();
    delayed_resolve.erase(who);
  }
}
void MDCache::maybe_resolve_finish()
{
  assert(resolve_ack_gather.empty());
  assert(need_resolve_rollback.empty());

  if (!resolve_gather.empty()) {
    dout(10) << "maybe_resolve_finish still waiting for resolves ("
             << resolve_gather << ")" << dendl;
    return;
  }

  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
  disambiguate_my_imports();
  finish_committed_masters();

  if (resolve_done) {
    assert(mds->is_resolve());
    trim_unlinked_inodes();
    recalc_auth_bits(false);
    resolve_done.release()->complete(0);
  } else {
    maybe_send_pending_rejoins();
  }
}
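// Once every rank in resolve_gather has reported and all acks/rollbacks are
// done, a recovering mds trims unlinked inodes, recalculates auth bits and
// completes resolve_done; a surviving mds just flushes any pending rejoins.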
3338 /* This functions puts the passed message before returning */
3339 void MDCache::handle_resolve_ack(MMDSResolveAck
*ack
)
3341 dout(10) << "handle_resolve_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
3342 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3344 if (!resolve_ack_gather
.count(from
) ||
3345 mds
->mdsmap
->get_state(from
) < MDSMap::STATE_RESOLVE
) {
3350 if (ambiguous_slave_updates
.count(from
)) {
3351 assert(mds
->mdsmap
->is_clientreplay_or_active_or_stopping(from
));
3352 assert(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3355 for (map
<metareqid_t
, bufferlist
>::iterator p
= ack
->commit
.begin();
3356 p
!= ack
->commit
.end();
3358 dout(10) << " commit on slave " << p
->first
<< dendl
;
3360 if (ambiguous_slave_updates
.count(from
)) {
3361 remove_ambiguous_slave_update(p
->first
, from
);
3365 if (mds
->is_resolve()) {
3367 MDSlaveUpdate
*su
= get_uncommitted_slave_update(p
->first
, from
);
3371 mds
->mdlog
->start_submit_entry(new ESlaveUpdate(mds
->mdlog
, "unknown", p
->first
, from
,
3372 ESlaveUpdate::OP_COMMIT
, su
->origop
),
3373 new C_MDC_SlaveCommit(this, from
, p
->first
));
3374 mds
->mdlog
->flush();
3376 finish_uncommitted_slave_update(p
->first
, from
);
3378 MDRequestRef mdr
= request_get(p
->first
);
3379 // information about master imported caps
3380 if (p
->second
.length() > 0)
3381 mdr
->more()->inode_import
.claim(p
->second
);
3383 assert(mdr
->slave_request
== 0); // shouldn't be doing anything!
3384 request_finish(mdr
);
3388 for (vector
<metareqid_t
>::iterator p
= ack
->abort
.begin();
3389 p
!= ack
->abort
.end();
3391 dout(10) << " abort on slave " << *p
<< dendl
;
3393 if (mds
->is_resolve()) {
3394 MDSlaveUpdate
*su
= get_uncommitted_slave_update(*p
, from
);
3397 // perform rollback (and journal a rollback entry)
3398 // note: this will hold up the resolve a bit, until the rollback entries journal.
3399 MDRequestRef null_ref
;
3400 switch (su
->origop
) {
3401 case ESlaveUpdate::LINK
:
3402 mds
->server
->do_link_rollback(su
->rollback
, from
, null_ref
);
3404 case ESlaveUpdate::RENAME
:
3405 mds
->server
->do_rename_rollback(su
->rollback
, from
, null_ref
);
3407 case ESlaveUpdate::RMDIR
:
3408 mds
->server
->do_rmdir_rollback(su
->rollback
, from
, null_ref
);
3414 MDRequestRef mdr
= request_get(*p
);
3415 mdr
->aborted
= true;
3416 if (mdr
->slave_request
) {
3417 if (mdr
->slave_did_prepare()) // journaling slave prepare ?
3418 add_rollback(*p
, from
);
3420 request_finish(mdr
);
3425 if (!ambiguous_slave_updates
.count(from
))
3426 resolve_ack_gather
.erase(from
);
3427 if (resolve_ack_gather
.empty() && need_resolve_rollback
.empty()) {
3428 send_subtree_resolves();
3429 process_delayed_resolve();
3435 void MDCache::add_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
, MDSlaveUpdate
*su
)
3437 assert(uncommitted_slave_updates
[master
].count(reqid
) == 0);
3438 uncommitted_slave_updates
[master
][reqid
] = su
;
3439 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
)
3440 uncommitted_slave_rename_olddir
[*p
]++;
3441 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
)
3442 uncommitted_slave_unlink
[*p
]++;
3445 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
)
3447 assert(uncommitted_slave_updates
[master
].count(reqid
));
3448 MDSlaveUpdate
* su
= uncommitted_slave_updates
[master
][reqid
];
3450 uncommitted_slave_updates
[master
].erase(reqid
);
3451 if (uncommitted_slave_updates
[master
].empty())
3452 uncommitted_slave_updates
.erase(master
);
3453 // discard the non-auth subtree we renamed out of
3454 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
) {
3456 map
<CInode
*, int>::iterator it
= uncommitted_slave_rename_olddir
.find(diri
);
3457 assert(it
!= uncommitted_slave_rename_olddir
.end());
3459 if (it
->second
== 0) {
3460 uncommitted_slave_rename_olddir
.erase(it
);
3462 diri
->get_dirfrags(ls
);
3463 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
3464 CDir
*root
= get_subtree_root(*q
);
3465 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
3466 try_trim_non_auth_subtree(root
);
3472 assert(it
->second
> 0);
3474 // removed the inodes that were unlinked by slave update
3475 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
) {
3477 map
<CInode
*, int>::iterator it
= uncommitted_slave_unlink
.find(in
);
3478 assert(it
!= uncommitted_slave_unlink
.end());
3480 if (it
->second
== 0) {
3481 uncommitted_slave_unlink
.erase(it
);
3482 if (!in
->get_projected_parent_dn())
3483 mds
->mdcache
->remove_inode_recursive(in
);
3485 assert(it
->second
> 0);
3490 MDSlaveUpdate
* MDCache::get_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
)
3493 MDSlaveUpdate
* su
= NULL
;
3494 if (uncommitted_slave_updates
.count(master
) &&
3495 uncommitted_slave_updates
[master
].count(reqid
)) {
3496 su
= uncommitted_slave_updates
[master
][reqid
];
3502 void MDCache::finish_rollback(metareqid_t reqid
) {
3503 assert(need_resolve_rollback
.count(reqid
));
3504 if (mds
->is_resolve())
3505 finish_uncommitted_slave_update(reqid
, need_resolve_rollback
[reqid
]);
3506 need_resolve_rollback
.erase(reqid
);
3507 if (resolve_ack_gather
.empty() && need_resolve_rollback
.empty()) {
3508 send_subtree_resolves();
3509 process_delayed_resolve();
3513 void MDCache::disambiguate_other_imports()
3515 dout(10) << "disambiguate_other_imports" << dendl
;
3517 bool recovering
= !(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3518 // other nodes' ambiguous imports
3519 for (map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > >::iterator p
= other_ambiguous_imports
.begin();
3520 p
!= other_ambiguous_imports
.end();
3522 mds_rank_t who
= p
->first
;
3523 dout(10) << "ambiguous imports for mds." << who
<< dendl
;
3525 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= p
->second
.begin();
3526 q
!= p
->second
.end();
3528 dout(10) << " ambiguous import " << q
->first
<< " bounds " << q
->second
<< dendl
;
3529 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3530 CDir
*dir
= get_force_dirfrag(q
->first
, recovering
);
3533 if (dir
->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3534 dir
->authority() == CDIR_AUTH_UNDEF
) { // resolving
3535 dout(10) << " mds." << who
<< " did import " << *dir
<< dendl
;
3536 adjust_bounded_subtree_auth(dir
, q
->second
, who
);
3537 try_subtree_merge(dir
);
3539 dout(10) << " mds." << who
<< " did not import " << *dir
<< dendl
;
3543 other_ambiguous_imports
.clear();
3546 void MDCache::disambiguate_my_imports()
3548 dout(10) << "disambiguate_my_imports" << dendl
;
3550 if (!mds
->is_resolve()) {
3551 assert(my_ambiguous_imports
.empty());
3555 disambiguate_other_imports();
3557 // my ambiguous imports
3558 mds_authority_t
me_ambig(mds
->get_nodeid(), mds
->get_nodeid());
3559 while (!my_ambiguous_imports
.empty()) {
3560 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= my_ambiguous_imports
.begin();
3562 CDir
*dir
= get_dirfrag(q
->first
);
3565 if (dir
->authority() != me_ambig
) {
3566 dout(10) << "ambiguous import auth known, must not be me " << *dir
<< dendl
;
3567 cancel_ambiguous_import(dir
);
3569 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false));
3571 // subtree may have been swallowed by another node claiming dir
3573 CDir
*root
= get_subtree_root(dir
);
3575 dout(10) << " subtree root is " << *root
<< dendl
;
3576 assert(root
->dir_auth
.first
!= mds
->get_nodeid()); // no us!
3577 try_trim_non_auth_subtree(root
);
3579 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir
<< dendl
;
3580 finish_ambiguous_import(q
->first
);
3581 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3584 assert(my_ambiguous_imports
.empty());
3585 mds
->mdlog
->flush();
3587 // verify all my subtrees are unambiguous!
3588 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3589 p
!= subtrees
.end();
3591 CDir
*dir
= p
->first
;
3592 if (dir
->is_ambiguous_dir_auth()) {
3593 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir
<< dendl
;
3595 assert(!dir
->is_ambiguous_dir_auth());
3602 void MDCache::add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
)
3604 assert(my_ambiguous_imports
.count(base
) == 0);
3605 my_ambiguous_imports
[base
] = bounds
;
3609 void MDCache::add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
)
3612 vector
<dirfrag_t
> binos
;
3613 for (set
<CDir
*>::iterator p
= bounds
.begin();
3616 binos
.push_back((*p
)->dirfrag());
3618 // note: this can get called twice if the exporter fails during recovery
3619 if (my_ambiguous_imports
.count(base
->dirfrag()))
3620 my_ambiguous_imports
.erase(base
->dirfrag());
3622 add_ambiguous_import(base
->dirfrag(), binos
);
3625 void MDCache::cancel_ambiguous_import(CDir
*dir
)
3627 dirfrag_t df
= dir
->dirfrag();
3628 assert(my_ambiguous_imports
.count(df
));
3629 dout(10) << "cancel_ambiguous_import " << df
3630 << " bounds " << my_ambiguous_imports
[df
]
3633 my_ambiguous_imports
.erase(df
);
3636 void MDCache::finish_ambiguous_import(dirfrag_t df
)
3638 assert(my_ambiguous_imports
.count(df
));
3639 vector
<dirfrag_t
> bounds
;
3640 bounds
.swap(my_ambiguous_imports
[df
]);
3641 my_ambiguous_imports
.erase(df
);
3643 dout(10) << "finish_ambiguous_import " << df
3644 << " bounds " << bounds
3646 CDir
*dir
= get_dirfrag(df
);
3649 // adjust dir_auth, import maps
3650 adjust_bounded_subtree_auth(dir
, bounds
, mds
->get_nodeid());
3651 try_subtree_merge(dir
);
3654 void MDCache::remove_inode_recursive(CInode
*in
)
3656 dout(10) << "remove_inode_recursive " << *in
<< dendl
;
3658 in
->get_dirfrags(ls
);
3659 list
<CDir
*>::iterator p
= ls
.begin();
3660 while (p
!= ls
.end()) {
3661 CDir
*subdir
= *p
++;
3663 dout(10) << " removing dirfrag " << subdir
<< dendl
;
3664 CDir::map_t::iterator q
= subdir
->items
.begin();
3665 while (q
!= subdir
->items
.end()) {
3666 CDentry
*dn
= q
->second
;
3668 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3669 if (dnl
->is_primary()) {
3670 CInode
*tin
= dnl
->get_inode();
3671 subdir
->unlink_inode(dn
, false);
3672 remove_inode_recursive(tin
);
3674 subdir
->remove_dentry(dn
);
3677 if (subdir
->is_subtree_root())
3678 remove_subtree(subdir
);
3679 in
->close_dirfrag(subdir
->dirfrag().frag
);
3684 bool MDCache::expire_recursive(
3686 map
<mds_rank_t
, MCacheExpire
*>& expiremap
)
3688 assert(!in
->is_auth());
3690 dout(10) << __func__
<< ":" << *in
<< dendl
;
3692 // Recurse into any dirfrags beneath this inode
3694 in
->get_dirfrags(ls
);
3695 for (auto subdir
: ls
) {
3696 if (!in
->is_mdsdir() && subdir
->is_subtree_root()) {
3697 dout(10) << __func__
<< ": stray still has subtree " << *in
<< dendl
;
3701 for (auto &it
: subdir
->items
) {
3702 CDentry
*dn
= it
.second
;
3703 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3704 if (dnl
->is_primary()) {
3705 CInode
*tin
= dnl
->get_inode();
3707 /* Remote strays with linkage (i.e. hardlinks) should not be
3708 * expired, because they may be the target of
3709 * a rename() as the owning MDS shuts down */
3710 if (!tin
->is_stray() && tin
->inode
.nlink
) {
3711 dout(10) << __func__
<< ": stray still has linkage " << *tin
<< dendl
;
3715 const bool abort
= expire_recursive(tin
, expiremap
);
3720 if (dn
->lru_is_expireable()) {
3721 trim_dentry(dn
, expiremap
);
3723 dout(10) << __func__
<< ": stray dn is not expireable " << *dn
<< dendl
;
3732 void MDCache::trim_unlinked_inodes()
3734 dout(7) << "trim_unlinked_inodes" << dendl
;
3736 for (ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator p
= inode_map
.begin();
3737 p
!= inode_map
.end();
3739 CInode
*in
= p
->second
;
3740 if (in
->get_parent_dn() == NULL
&& !in
->is_base()) {
3741 dout(7) << " will trim from " << *in
<< dendl
;
3745 for (list
<CInode
*>::iterator p
= q
.begin(); p
!= q
.end(); ++p
)
3746 remove_inode_recursive(*p
);
3749 /** recalc_auth_bits()
3750 * once subtree auth is disambiguated, we need to adjust all the
3751 * auth and dirty bits in our cache before moving on.
3753 void MDCache::recalc_auth_bits(bool replay
)
3755 dout(7) << "recalc_auth_bits " << (replay
? "(replay)" : "") << dendl
;
3758 root
->inode_auth
.first
= mds
->mdsmap
->get_root();
3759 bool auth
= mds
->get_nodeid() == root
->inode_auth
.first
;
3761 root
->state_set(CInode::STATE_AUTH
);
3763 root
->state_clear(CInode::STATE_AUTH
);
3765 root
->state_set(CInode::STATE_REJOINING
);
3769 set
<CInode
*> subtree_inodes
;
3770 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3771 p
!= subtrees
.end();
3773 if (p
->first
->dir_auth
.first
== mds
->get_nodeid())
3774 subtree_inodes
.insert(p
->first
->inode
);
3777 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3778 p
!= subtrees
.end();
3780 if (p
->first
->inode
->is_mdsdir()) {
3781 CInode
*in
= p
->first
->inode
;
3782 bool auth
= in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid());
3784 in
->state_set(CInode::STATE_AUTH
);
3786 in
->state_clear(CInode::STATE_AUTH
);
3788 in
->state_set(CInode::STATE_REJOINING
);
3792 list
<CDir
*> dfq
; // dirfrag queue
3793 dfq
.push_back(p
->first
);
3795 bool auth
= p
->first
->authority().first
== mds
->get_nodeid();
3796 dout(10) << " subtree auth=" << auth
<< " for " << *p
->first
<< dendl
;
3798 while (!dfq
.empty()) {
3799 CDir
*dir
= dfq
.front();
3804 dir
->state_set(CDir::STATE_AUTH
);
3806 dir
->state_clear(CDir::STATE_AUTH
);
3808 // close empty non-auth dirfrag
3809 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0) {
3810 dir
->inode
->close_dirfrag(dir
->get_frag());
3813 dir
->state_set(CDir::STATE_REJOINING
);
3814 dir
->state_clear(CDir::STATE_COMPLETE
);
3815 if (dir
->is_dirty())
3820 // dentries in this dir
3821 for (CDir::map_t::iterator q
= dir
->items
.begin();
3822 q
!= dir
->items
.end();
3825 CDentry
*dn
= q
->second
;
3826 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3828 dn
->state_set(CDentry::STATE_AUTH
);
3830 dn
->state_clear(CDentry::STATE_AUTH
);
3832 dn
->state_set(CDentry::STATE_REJOINING
);
3838 if (dnl
->is_primary()) {
3840 CInode
*in
= dnl
->get_inode();
3842 in
->state_set(CInode::STATE_AUTH
);
3844 in
->state_clear(CInode::STATE_AUTH
);
3846 in
->state_set(CInode::STATE_REJOINING
);
3849 if (in
->is_dirty_parent())
3850 in
->clear_dirty_parent();
3851 // avoid touching scatterlocks for our subtree roots!
3852 if (subtree_inodes
.count(in
) == 0)
3853 in
->clear_scatter_dirty();
3858 in
->get_nested_dirfrags(dfq
);
// ===========================================================================
// REJOIN

/*
 * notes on scatterlock recovery:
 *
 * - recovering inode replica sends scatterlock data for any subtree
 *   roots (the only ones that are possibly dirty).
 *
 * - surviving auth incorporates any provided scatterlock data.  any
 *   pending gathers are then finished, as with the other lock types.
 *
 * that takes care of surviving auth + (recovering replica)*.
 *
 * - surviving replica sends strong_inode, which includes current
 *   scatterlock state, AND any dirty scatterlock data.  this
 *   provides the recovering auth with everything it might need.
 *
 * - recovering auth must pick initial scatterlock state based on
 *   (weak|strong) rejoins.
 *   - always assimilate scatterlock data (it can't hurt)
 *   - any surviving replica in SCATTER state -> SCATTER.  otherwise, SYNC.
 *   - include base inode in ack for all inodes that saw scatterlock content
 *
 * also, for scatter gather,
 *
 * - auth increments {frag,r}stat.version on completion of any gather.
 *
 * - auth incorporates changes in a gather _only_ if the version
 *   matches.
 *
 * - replica discards changes any time the scatterlock syncs, and
 *   after recovery.
 */

void MDCache::dump_rejoin_status(Formatter *f) const
{
  f->open_object_section("rejoin_status");
  f->dump_stream("rejoin_gather") << rejoin_gather;
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
3914 void MDCache::rejoin_start(MDSInternalContext
*rejoin_done_
)
3916 dout(10) << "rejoin_start" << dendl
;
3917 assert(!rejoin_done
);
3918 rejoin_done
.reset(rejoin_done_
);
3920 rejoin_gather
= recovery_set
;
3921 // need finish opening cap inodes before sending cache rejoins
3922 rejoin_gather
.insert(mds
->get_nodeid());
3923 process_imported_caps();
3929 * this initiates rejoin. it shoudl be called before we get any
3930 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3932 * we start out by sending rejoins to everyone in the recovery set.
3934 * if we are rejoin, send for all regions in our cache.
3935 * if we are active|stopping, send only to nodes that are are rejoining.
void MDCache::rejoin_send_rejoins()
{
  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;

  if (rejoin_gather.count(mds->get_nodeid())) {
    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
    rejoins_pending = true;
    return;
  }
  if (!resolve_gather.empty()) {
    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
            << resolve_gather << ")" << dendl;
    rejoins_pending = true;
    return;
  }

  assert(!migrator->is_importing());
  assert(!migrator->is_exporting());

  if (!mds->is_rejoin()) {
    disambiguate_other_imports();
  }

  map<mds_rank_t, MMDSCacheRejoin*> rejoins;

  // if i am rejoining, send a rejoin to everyone.
  // otherwise, just send to others who are rejoining.
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (*p == mds->get_nodeid())  continue;  // nothing to myself!
    if (rejoin_sent.count(*p)) continue;     // already sent a rejoin to this node!
    if (mds->is_rejoin())
      rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
    else if (mds->mdsmap->is_rejoin(*p))
      rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
  }

  if (mds->is_rejoin()) {
    map<client_t, set<mds_rank_t> > client_exports;
    for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
      assert(cap_export_targets.count(p->first));
      mds_rank_t target = cap_export_targets[p->first];
      if (rejoins.count(target) == 0)
        continue;
      rejoins[target]->cap_exports[p->first] = p->second;
      for (auto q = p->second.begin(); q != p->second.end(); ++q)
        client_exports[q->first].insert(target);
    }
    for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
         p != client_exports.end();
         ++p) {
      entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
      for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        rejoins[*q]->client_map[p->first] = inst;
    }
  }

  // check all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    assert(dir->is_subtree_root());
    if (dir->is_ambiguous_dir_auth()) {
      // exporter is recovering, importer is survivor.
      assert(rejoins.count(dir->authority().first));
      assert(!rejoins.count(dir->authority().second));
      continue;
    }

    if (dir->is_auth())
      continue;  // skip my own regions!

    mds_rank_t auth = dir->get_dir_auth().first;
    if (rejoins.count(auth) == 0)
      continue;   // don't care about this node's subtrees

    rejoin_walk(dir, rejoins[auth]);
  }

  // rejoin root inodes, too
  for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
       p != rejoins.end();
       ++p) {
    if (mds->is_rejoin()) {
      if (p->first == 0 && root) {
        p->second->add_weak_inode(root->vino());
        if (root->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on root " << *root << dendl;
          p->second->add_scatterlock_state(root);
        }
      }
      if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
        p->second->add_weak_inode(in->vino());
      }
    } else {
      if (p->first == 0 && root) {
        p->second->add_strong_inode(root->vino(),
                                    root->get_replica_nonce(),
                                    root->get_caps_wanted(),
                                    root->filelock.get_state(),
                                    root->nestlock.get_state(),
                                    root->dirfragtreelock.get_state());
        root->state_set(CInode::STATE_REJOINING);
        if (root->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on root " << *root << dendl;
          p->second->add_scatterlock_state(root);
        }
      }

      if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
        p->second->add_strong_inode(in->vino(),
                                    in->get_replica_nonce(),
                                    in->get_caps_wanted(),
                                    in->filelock.get_state(),
                                    in->nestlock.get_state(),
                                    in->dirfragtreelock.get_state());
        in->state_set(CInode::STATE_REJOINING);
      }
    }
  }

  if (!mds->is_rejoin()) {
    // i am survivor.  send strong rejoin.
    // note request remote_auth_pins, xlocks
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
         p != active_requests.end();
         ++p) {
      MDRequestRef& mdr = p->second;
      if (mdr->is_slave())
        continue;
      for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
           q != mdr->remote_auth_pins.end();
           ++q) {
        if (!q->first->is_auth()) {
          assert(q->second == q->first->authority().first);
          if (rejoins.count(q->second) == 0) continue;
          MMDSCacheRejoin *rejoin = rejoins[q->second];

          dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
          MDSCacheObjectInfo i;
          q->first->set_object_info(i);
          if (i.ino)
            rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
          else
            rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);

          if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
              mdr->more()->rename_inode == q->first)
            rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
                                             mdr->reqid, mdr->attempt);
        }
      }
      for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
           q != mdr->xlocks.end();
           ++q) {
        if (!(*q)->get_parent()->is_auth()) {
          mds_rank_t who = (*q)->get_parent()->authority().first;
          if (rejoins.count(who) == 0) continue;
          MMDSCacheRejoin *rejoin = rejoins[who];

          dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
          MDSCacheObjectInfo i;
          (*q)->get_parent()->set_object_info(i);
          if (i.ino)
            rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
                                    mdr->reqid, mdr->attempt);
          else
            rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
                                     mdr->reqid, mdr->attempt);
        }
      }
      for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
           q != mdr->remote_wrlocks.end();
           ++q) {
        mds_rank_t who = q->second;
        if (rejoins.count(who) == 0) continue;
        MMDSCacheRejoin *rejoin = rejoins[who];

        dout(15) << " " << *mdr << " wrlock on " << q->second
                 << " " << q->first->get_parent() << dendl;
        MDSCacheObjectInfo i;
        q->first->get_parent()->set_object_info(i);
        rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
                                 mdr->reqid, mdr->attempt);
      }
    }
  }

  // send the messages
  for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
       p != rejoins.end();
       ++p) {
    assert(rejoin_sent.count(p->first) == 0);
    assert(rejoin_ack_gather.count(p->first) == 0);
    rejoin_sent.insert(p->first);
    rejoin_ack_gather.insert(p->first);
    mds->send_message_mds(p->second, p->first);
  }
  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
  rejoins_pending = false;

  if (mds->is_rejoin() && rejoins.empty()) {
    dout(10) << "nothing to rejoin" << dendl;
    rejoin_gather_finish();
  }
}

/**
 * rejoin_walk - build rejoin declarations for a subtree
 *
 * @param dir subtree root
 * @param rejoin rejoin message
 *
 * from a rejoining node:
 *  weak dirfrag
 *  weak dentries (w/ connectivity)
 *
 * from a surviving node:
 *  strong dirfrag
 *  strong dentries (no connectivity!)
 *  strong inodes
 */
void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
{
  dout(10) << "rejoin_walk " << *dir << dendl;

  list<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (CDir::map_t::iterator p = dir->items.begin();
         p != dir->items.end();
         ++p) {
      CDentry *dn = p->second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
      in->get_nested_dirfrags(nested);
      if (in->is_dirty_scattered()) {
        dout(10) << " sending scatterlock state on " << *in << dendl;
        rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    for (CDir::map_t::iterator p = dir->items.begin();
         p != dir->items.end();
         ++p) {
      CDentry *dn = p->second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
                                dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
                                dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
                                dnl->is_remote() ? dnl->get_remote_d_type():0,
                                dn->get_replica_nonce(),
                                dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        dout(15) << " add_strong_inode " << *in << dendl;
        rejoin->add_strong_inode(in->vino(),
                                 in->get_replica_nonce(),
                                 in->get_caps_wanted(),
                                 in->filelock.get_state(),
                                 in->nestlock.get_state(),
                                 in->dirfragtreelock.get_state());
        in->state_set(CInode::STATE_REJOINING);
        in->get_nested_dirfrags(nested);
        if (in->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on " << *in << dendl;
          rejoin->add_scatterlock_state(in);
        }
      }
    }
  }

  // recurse into nested dirs
  for (list<CDir*>::iterator p = nested.begin();
       p != nested.end();
       ++p)
    rejoin_walk(*p, rejoin);
}

/*
 *  - reply with the lockstate
 *
 * if i am active|stopping,
 *  - remove source from replica list for everything not referenced here.
 * This function puts the passed message before returning.
 */
void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
{
  dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
          << " (" << m->get_payload().length() << " bytes)"
          << dendl;

  switch (m->op) {
  case MMDSCacheRejoin::OP_WEAK:
    handle_cache_rejoin_weak(m);
    break;
  case MMDSCacheRejoin::OP_STRONG:
    handle_cache_rejoin_strong(m);
    break;
  case MMDSCacheRejoin::OP_ACK:
    handle_cache_rejoin_ack(m);
    break;
  }
  m->put();
}

/*
 * handle_cache_rejoin_weak
 *
 * the sender
 *  - is recovering from their journal.
 *  - may have incorrect (out of date) inode contents
 *  - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
 *
 * if the sender didn't trim_non_auth(), they
 *  - may have incorrect (out of date) dentry/inode linkage
 *  - may have deleted/purged inodes
 * and i may have to go to disk to get accurate inode contents.  yuck.
 * This function DOES NOT put the passed message before returning
 */
void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
{
  mds_rank_t from = mds_rank_t(weak->get_source().num());

  // possible response(s)
  MMDSCacheRejoin *ack = 0;        // if survivor
  set<vinodeno_t> acked_inodes;    // if survivor
  set<SimpleLock *> gather_locks;  // if survivor
  bool survivor = false;           // am i a survivor?

  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    dout(10) << "i am a survivor, and will ack immediately" << dendl;
    ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);

    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;

    // check cap exports
    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      assert(!in || in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
        Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
        Capability::Import& im = imported_caps[p->first][q->first];
        if (cap) {
          im.cap_id = cap->get_cap_id();
          im.issue_seq = cap->get_last_seq();
          im.mseq = cap->get_mseq();
        }
      }
      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
    }

    ::encode(imported_caps, ack->imported_caps);
  } else {
    assert(mds->is_rejoin());

    // we may have already received a strong rejoin from the sender.
    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
    assert(gather_locks.empty());

    // check cap exports.
    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());

    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      assert(in && in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
        cap_imports[p->first][q->first][from] = q->second;
      }
    }
  }

  // assimilate any potentially dirty scatterlock state
  for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
       p != weak->inode_scatterlocks.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
    rejoin_potential_updated_scatterlocks.insert(in);
  }

  // recovering peer may send incorrect dirfrags here.  we need to
  // infer which dirfrag they meant.  the ack will include a
  // strong_dirfrag that will set them straight on the fragmentation.

  set<CDir*> dirs_to_share;
  for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
       p != weak->weak_dirfrags.end();
       ++p) {
    CInode *diri = get_inode(p->ino);
    if (!diri)
      dout(0) << " missing dir ino " << p->ino << dendl;
    assert(diri);

    list<frag_t> ls;
    if (diri->dirfragtree.is_leaf(p->frag)) {
      ls.push_back(p->frag);
    } else {
      diri->dirfragtree.get_leaves_under(p->frag, ls);
      if (ls.empty())
        ls.push_back(diri->dirfragtree[p->frag.value()]);
    }
    for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
      frag_t fg = *q;
      CDir *dir = diri->get_dirfrag(fg);
      if (!dir) {
        dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
        continue;
      }
      if (dirs_to_share.count(dir)) {
        dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
      } else {
        dirs_to_share.insert(dir);
        unsigned nonce = dir->add_replica(from);
        dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
        if (ack) {
          ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
          ack->add_dirfrag_base(dir);
        }
      }
    }
  }

  for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
       p != weak->weak.end();
       ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      dout(0) << " missing dir ino " << p->first << dendl;
    assert(diri);

    CDir *dir = 0;
    for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      // locate proper dirfrag.
      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
      frag_t fg = diri->pick_dirfrag(q->first.name);
      if (!dir || dir->get_frag() != fg) {
        dir = diri->get_dirfrag(fg);
        if (!dir)
          dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
        assert(dir);
        assert(dirs_to_share.count(dir));
      }

      CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
      assert(dn);
      CDentry::linkage_t *dnl = dn->get_linkage();
      assert(dnl->is_primary());

      if (survivor && dn->is_replica(from))
        dentry_remove_replica(dn, from, gather_locks);
      unsigned dnonce = dn->add_replica(from);
      dout(10) << " have " << *dn << dendl;
      if (ack)
        ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
                               dnl->get_inode()->ino(), inodeno_t(0), 0,
                               dnonce, dn->lock.get_replica_state());

      CInode *in = dnl->get_inode();
      assert(in);

      if (survivor && in->is_replica(from))
        inode_remove_replica(in, from, true, gather_locks);
      unsigned inonce = in->add_replica(from);
      dout(10) << " have " << *in << dendl;

      // scatter the dirlock, just in case?
      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
        in->filelock.set_state(LOCK_MIX);

      if (ack) {
        acked_inodes.insert(in->vino());
        ack->add_inode_base(in, mds->mdsmap->get_up_features());
        bufferlist bl;
        in->_encode_locks_state_for_rejoin(bl, from);
        ack->add_inode_locks(in, inonce, bl);
      }
    }
  }

  // weak base inodes?  (root, stray, etc.)
  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
       p != weak->weak_inodes.end();
       ++p) {
    CInode *in = get_inode(*p);
    assert(in);   // hmm fixme wrt stray?
    if (survivor && in->is_replica(from))
      inode_remove_replica(in, from, true, gather_locks);
    unsigned inonce = in->add_replica(from);
    dout(10) << " have base " << *in << dendl;

    if (ack) {
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
      bufferlist bl;
      in->_encode_locks_state_for_rejoin(bl, from);
      ack->add_inode_locks(in, inonce, bl);
    }
  }

  assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (survivor) {
    // survivor.  do everything now.
    for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
         p != weak->inode_scatterlocks.end();
         ++p) {
      CInode *in = get_inode(p->first);
      assert(in);
      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
    }

    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
    mds->send_message(ack, weak->get_connection());

    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
      if (!(*p)->is_stable())
        mds->locker->eval_gather(*p);
    }
  } else {
    if (rejoin_gather.empty()) {
      rejoin_gather_finish();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
    }
  }
}

class C_MDC_RejoinGatherFinish : public MDCacheContext {
public:
  explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->rejoin_gather_finish();
  }
};

/*
 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
 *
 * all validated replicas are acked with a strong nonce, etc.  if that isn't in the
 * ack, the replica dne, and we can remove it from our replica maps.
 */
void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
                                             set<vinodeno_t>& acked_inodes,
                                             set<SimpleLock *>& gather_locks)
{
  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;

  for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    CInode *in = p->second;

    if (in->is_auth() &&
        in->is_replica(from) &&
        (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
      inode_remove_replica(in, from, false, gather_locks);
      dout(10) << " rem " << *in << dendl;
    }

    if (!in->is_dir()) continue;

    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (list<CDir*>::iterator p = dfs.begin();
         p != dfs.end();
         ++p) {
      CDir *dir = *p;

      if (dir->is_auth() &&
          dir->is_replica(from) &&
          (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
        dir->remove_replica(from);
        dout(10) << " rem " << *dir << dendl;
      }

      // dentries
      for (CDir::map_t::iterator p = dir->items.begin();
           p != dir->items.end();
           ++p) {
        CDentry *dn = p->second;

        if (dn->is_replica(from) &&
            (ack == NULL ||
             ack->strong_dentries.count(dir->dirfrag()) == 0 ||
             ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
          dentry_remove_replica(dn, from, gather_locks);
          dout(10) << " rem " << *dn << dendl;
        }
      }
    }
  }
}

CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
{
  CInode *in = new CInode(this, true, 1, last);
  in->inode.ino = ino;
  in->state_set(CInode::STATE_REJOINUNDEF);
  add_inode(in);
  rejoin_undef_inodes.insert(in);
  dout(10) << " invented " << *in << dendl;
  return in;
}

CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
{
  CInode *in = get_inode(df.ino);
  if (!in)
    in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
  if (!in->is_dir()) {
    assert(in->state_test(CInode::STATE_REJOINUNDEF));
    in->inode.mode = S_IFDIR;
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
  }
  CDir *dir = in->get_or_open_dirfrag(this, df.frag);
  dir->state_set(CDir::STATE_REJOINUNDEF);
  rejoin_undef_dirfrags.insert(dir);
  dout(10) << " invented " << *dir << dendl;
  return dir;
}

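// Handle a strong rejoin from a surviving peer while we are recovering:
// assimilate its scatterlock data, re-create (or invent) the dirfrags,
// dentries and inodes it claims replicas of, and re-establish its remote
// auth_pins, xlocks and wrlocks via slave requests.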
/* This function DOES NOT put the passed message before returning */
void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
{
  mds_rank_t from = mds_rank_t(strong->get_source().num());

  // only a recovering node will get a strong rejoin.
  assert(mds->is_rejoin());

  // assimilate any potentially dirty scatterlock state
  for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
       p != strong->inode_scatterlocks.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
    rejoin_potential_updated_scatterlocks.insert(in);
  }

  rejoin_unlinked_inodes[from].clear();

  // surviving peer may send incorrect dirfrag here (maybe they didn't
  // get the fragment notify, or maybe we rolled back?).  we need to
  // infer the right frag and get them with the program.  somehow.
  // we don't normally send ACK.. so we'll need to bundle this with
  // MISSING or something.

  // strong dirfrags/dentries.
  //  also process auth_pins, xlocks.
  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
       p != strong->strong_dirfrags.end();
       ++p) {
    CInode *diri = get_inode(p->first.ino);
    if (!diri)
      diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
    CDir *dir = diri->get_dirfrag(p->first.frag);
    bool refragged = false;
    if (dir) {
      dout(10) << " have " << *dir << dendl;
    } else {
      if (diri->state_test(CInode::STATE_REJOINUNDEF))
        dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
      else if (diri->dirfragtree.is_leaf(p->first.frag))
        dir = rejoin_invent_dirfrag(p->first);
    }
    if (dir) {
      dir->add_replica(from, p->second.nonce);
      dir->dir_rep = p->second.dir_rep;
    } else {
      dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
      list<frag_t> ls;
      diri->dirfragtree.get_leaves_under(p->first.frag, ls);
      if (ls.empty())
        ls.push_back(diri->dirfragtree[p->first.frag.value()]);
      dout(10) << " maps to frag(s) " << ls << dendl;
      for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
        CDir *dir = diri->get_dirfrag(*q);
        if (!dir)
          dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
        else
          dout(10) << " have(approx) " << *dir << dendl;
        dir->add_replica(from, p->second.nonce);
        dir->dir_rep = p->second.dir_rep;
      }
      refragged = true;
    }

    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
         q != dmap.end();
         ++q) {
      CDentry *dn;
      if (!refragged)
        dn = dir->lookup(q->first.name, q->first.snapid);
      else {
        frag_t fg = diri->pick_dirfrag(q->first.name);
        dir = diri->get_dirfrag(fg);
        assert(dir);
        dn = dir->lookup(q->first.name, q->first.snapid);
      }
      if (!dn) {
        if (q->second.is_remote()) {
          dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
                                      q->second.first, q->first.snapid);
        } else if (q->second.is_null()) {
          dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
        } else {
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
          dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
        }
        dout(10) << " invented " << *dn << dendl;
      }
      CDentry::linkage_t *dnl = dn->get_linkage();

      if (strong->authpinned_dentries.count(p->first) &&
          strong->authpinned_dentries[p->first].count(q->first)) {
        for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
             r != strong->authpinned_dentries[p->first][q->first].end();
             ++r) {
          dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;

          // get/create slave mdrequest
          MDRequestRef mdr;
          if (have_request(r->reqid))
            mdr = request_get(r->reqid);
          else
            mdr = request_start_slave(r->reqid, r->attempt, strong);
          mdr->auth_pin(dn);
        }
      }

      if (strong->xlocked_dentries.count(p->first) &&
          strong->xlocked_dentries[p->first].count(q->first)) {
        MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
        dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
        MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
        assert(mdr->is_auth_pinned(dn));
        if (!mdr->xlocks.count(&dn->versionlock)) {
          assert(dn->versionlock.can_xlock_local());
          dn->versionlock.get_xlock(mdr, mdr->get_client());
          mdr->xlocks.insert(&dn->versionlock);
          mdr->locks.insert(&dn->versionlock);
        }
        if (dn->lock.is_stable())
          dn->auth_pin(&dn->lock);
        dn->lock.set_state(LOCK_XLOCK);
        dn->lock.get_xlock(mdr, mdr->get_client());
        mdr->xlocks.insert(&dn->lock);
        mdr->locks.insert(&dn->lock);
      }

      dn->add_replica(from, q->second.nonce);
      dout(10) << " have " << *dn << dendl;

      if (dnl->is_primary()) {
        if (q->second.is_primary()) {
          if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
            // the survivor missed MDentryUnlink+MDentryLink messages ?
            assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
            CInode *in = get_inode(q->second.ino, q->first.snapid);
            assert(in);
            assert(in->get_parent_dn());
            rejoin_unlinked_inodes[from].insert(in);
            dout(7) << " sender has primary dentry but wrong inode" << dendl;
          }
        } else {
          // the survivor missed MDentryLink message ?
          assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
          dout(7) << " sender doesn't have primary dentry" << dendl;
        }
      } else {
        if (q->second.is_primary()) {
          // the survivor missed MDentryUnlink message ?
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          assert(in);
          assert(in->get_parent_dn());
          rejoin_unlinked_inodes[from].insert(in);
          dout(7) << " sender has primary dentry but we don't" << dendl;
        }
      }
    }
  }

  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
       p != strong->strong_inodes.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->add_replica(from, p->second.nonce);
    dout(10) << " have " << *in << dendl;

    MMDSCacheRejoin::inode_strong &is = p->second;

    if (is.caps_wanted) {
      in->mds_caps_wanted[from] = is.caps_wanted;
      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
               << " on " << *in << dendl;
    }

    // infer state from replica state:
    //  * go to MIX if they might have wrlocks
    //  * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
    in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);

    if (strong->authpinned_inodes.count(in->vino())) {
      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
           r != strong->authpinned_inodes[in->vino()].end();
           ++r) {
        dout(10) << " inode authpin by " << *r << " on " << *in << dendl;

        // get/create slave mdrequest
        MDRequestRef mdr;
        if (have_request(r->reqid))
          mdr = request_get(r->reqid);
        else
          mdr = request_start_slave(r->reqid, r->attempt, strong);
        if (strong->frozen_authpin_inodes.count(in->vino())) {
          assert(!in->get_num_auth_pins());
          mdr->freeze_auth_pin(in);
        } else {
          assert(!in->is_frozen_auth_pin());
        }
        mdr->auth_pin(in);
      }
    }
    if (strong->xlocked_inodes.count(in->vino())) {
      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
           q != strong->xlocked_inodes[in->vino()].end();
           ++q) {
        SimpleLock *lock = in->get_lock(q->first);
        dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
        MDRequestRef mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
        assert(mdr->is_auth_pinned(in));
        if (!mdr->xlocks.count(&in->versionlock)) {
          assert(in->versionlock.can_xlock_local());
          in->versionlock.get_xlock(mdr, mdr->get_client());
          mdr->xlocks.insert(&in->versionlock);
          mdr->locks.insert(&in->versionlock);
        }
        if (lock->is_stable())
          in->auth_pin(lock);
        lock->set_state(LOCK_XLOCK);
        if (lock == &in->filelock)
          in->loner_cap = -1;
        lock->get_xlock(mdr, mdr->get_client());
        mdr->xlocks.insert(lock);
        mdr->locks.insert(lock);
      }
    }
  }
  for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
       p != strong->wrlocked_inodes.end();
       ++p) {
    CInode *in = get_inode(p->first);
    for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      SimpleLock *lock = in->get_lock(q->first);
      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
           r != q->second.end();
           ++r) {
        dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
        MDRequestRef mdr = request_get(r->reqid);  // should have this from auth_pin above.
        assert(mdr->is_auth_pinned(in));
        lock->set_state(LOCK_MIX);
        if (lock == &in->filelock)
          in->loner_cap = -1;
        lock->get_wrlock(true);
        mdr->wrlocks.insert(lock);
        mdr->locks.insert(lock);
      }
    }
  }

  assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (rejoin_gather.empty()) {
    rejoin_gather_finish();
  } else {
    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
  }
}

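// Handle a rejoin ack from an auth MDS: adopt the authoritative dirfrag set
// and dentry/inode linkage it reports, decode the full dirfrag and inode
// bases and lock states, export any caps the peer claimed from us, and
// move on once all expected acks have arrived.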
/* This function DOES NOT put the passed message before returning */
void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
{
  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // for sending cache expire message
  set<CInode*> isolated_inodes;
  set<CInode*> refragged_inodes;

  // dirs
  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
       p != ack->strong_dirfrags.end();
       ++p) {
    // we may have had incorrect dir fragmentation; refragment based
    // on what the auth tells us.
    CDir *dir = get_dirfrag(p->first);
    if (!dir) {
      dir = get_force_dirfrag(p->first, false);
      if (dir)
        refragged_inodes.insert(dir->get_inode());
    }
    if (!dir) {
      CInode *diri = get_inode(p->first.ino);
      if (!diri) {
        // barebones inode; the full inode loop below will clean up.
        diri = new CInode(this, false);
        diri->inode.ino = p->first.ino;
        diri->inode.mode = S_IFDIR;
        diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
        add_inode(diri);
        if (MDS_INO_MDSDIR(from) == p->first.ino) {
          diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
          dout(10) << " add inode " << *diri << dendl;
        } else {
          diri->inode_auth = CDIR_AUTH_DEFAULT;
          isolated_inodes.insert(diri);
          dout(10) << " unconnected dirfrag " << p->first << dendl;
        }
      }
      // barebones dirfrag; the full dirfrag loop below will clean up.
      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
      if (MDS_INO_MDSDIR(from) == p->first.ino ||
          (dir->authority() != CDIR_AUTH_UNDEF &&
           dir->authority().first != from))
        adjust_subtree_auth(dir, from);
      dout(10) << " add dirfrag " << *dir << dendl;
    }

    dir->set_replica_nonce(p->second.nonce);
    dir->state_clear(CDir::STATE_REJOINING);
    dout(10) << " got " << *dir << dendl;

    // dentries
    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
         q != dmap.end();
         ++q) {
      CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
      if (!dn)
        dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);

      CDentry::linkage_t *dnl = dn->get_linkage();

      assert(dn->last == q->first.snapid);
      if (dn->first != q->second.first) {
        dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
        dn->first = q->second.first;
      }

      // may have bad linkage if we missed dentry link/unlink messages
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        if (!q->second.is_primary() ||
            vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
          dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
          dir->unlink_inode(dn);
        }
      } else if (dnl->is_remote()) {
        if (!q->second.is_remote() ||
            q->second.remote_ino != dnl->get_remote_ino() ||
            q->second.remote_d_type != dnl->get_remote_d_type()) {
          dout(10) << " had bad linkage for " << *dn << dendl;
          dir->unlink_inode(dn);
        }
      } else {
        if (!q->second.is_null())
          dout(10) << " had bad linkage for " << *dn << dendl;
      }

      // hmm, did we have the proper linkage here?
      if (dnl->is_null() && !q->second.is_null()) {
        if (q->second.is_remote()) {
          dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
        } else {
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          if (!in) {
            // barebones inode; assume it's dir, the full inode loop below will clean up.
            in = new CInode(this, false, q->second.first, q->first.snapid);
            in->inode.ino = q->second.ino;
            in->inode.mode = S_IFDIR;
            in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
            add_inode(in);
            dout(10) << " add inode " << *in << dendl;
          } else if (in->get_parent_dn()) {
            dout(10) << " had bad linkage for " << *(in->get_parent_dn())
                     << ", unlinking " << *in << dendl;
            in->get_parent_dir()->unlink_inode(in->get_parent_dn());
          }
          dn->dir->link_primary_inode(dn, in);
          isolated_inodes.erase(in);
        }
      }

      dn->set_replica_nonce(q->second.nonce);
      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
      dn->state_clear(CDentry::STATE_REJOINING);
      dout(10) << " got " << *dn << dendl;
    }
  }

  for (set<CInode*>::iterator p = refragged_inodes.begin();
       p != refragged_inodes.end();
       ++p) {
    list<CDir*> ls;
    (*p)->get_nested_dirfrags(ls);
    for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
      if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
        continue;
      assert((*q)->get_num_any() == 0);
      (*p)->close_dirfrag((*q)->get_frag());
    }
  }

  // full dirfrags
  for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
       p != ack->dirfrag_bases.end();
       ++p) {
    CDir *dir = get_dirfrag(p->first);
    assert(dir);
    bufferlist::iterator q = p->second.begin();
    dir->_decode_base(q);
    dout(10) << " got dir replica " << *dir << dendl;
  }

  // full inodes
  bufferlist::iterator p = ack->inode_base.begin();
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    bufferlist basebl;
    ::decode(ino, p);
    ::decode(last, p);
    ::decode(basebl, p);
    CInode *in = get_inode(ino, last);
    assert(in);
    bufferlist::iterator q = basebl.begin();
    in->_decode_base(q);
    dout(10) << " got inode base " << *in << dendl;
  }

  // inodes
  p = ack->inode_locks.begin();
  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    __u32 nonce;
    bufferlist lockbl;
    ::decode(ino, p);
    ::decode(last, p);
    ::decode(nonce, p);
    ::decode(lockbl, p);

    CInode *in = get_inode(ino, last);
    assert(in);
    in->set_replica_nonce(nonce);
    bufferlist::iterator q = lockbl.begin();
    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
    in->state_clear(CInode::STATE_REJOINING);
    dout(10) << " got inode locks " << *in << dendl;
  }

  // FIXME: This can happen if entire subtree, together with the inode subtree root
  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
  assert(isolated_inodes.empty());

  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
  bufferlist::iterator bp = ack->imported_caps.begin();
  ::decode(peer_imported, bp);

  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
       p != peer_imported.end();
       ++p) {
    assert(cap_exports.count(p->first));
    assert(cap_export_targets.count(p->first));
    assert(cap_export_targets[p->first] == from);
    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      assert(cap_exports[p->first].count(q->first));

      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));

      // mark client caps stale.
      MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
                                       cap_exports[p->first][q->first].capinfo.cap_id, 0,
                                       mds->get_osd_epoch_barrier());
      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
                      (q->second.cap_id > 0 ? from : -1), 0);
      mds->send_message_client_counted(m, session);

      cap_exports[p->first].erase(q->first);
    }
    assert(cap_exports[p->first].empty());
  }

  // done?
  assert(rejoin_ack_gather.count(from));
  rejoin_ack_gather.erase(from);
  if (mds->is_rejoin()) {
    if (rejoin_gather.empty()) {
      // eval unstable scatter locks after all wrlocks are rejoined.
      while (!rejoin_eval_locks.empty()) {
        SimpleLock *lock = rejoin_eval_locks.front();
        rejoin_eval_locks.pop_front();
        if (!lock->is_stable())
          mds->locker->eval_gather(lock);
      }
    }

    if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
        rejoin_ack_gather.empty()) {
      // finally, kickstart past snap parent opens
      open_snap_parents();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
              << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
    }
  } else {
    mds->queue_waiters(rejoin_waiters);
  }
}

/*
 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
 *
 * FIXME: wait, can this actually happen?  a survivor should generate cache trim
 * messages that clean these guys up...
 */
void MDCache::rejoin_trim_undef_inodes()
{
  dout(10) << "rejoin_trim_undef_inodes" << dendl;

  while (!rejoin_undef_inodes.empty()) {
    set<CInode*>::iterator p = rejoin_undef_inodes.begin();
    CInode *in = *p;
    rejoin_undef_inodes.erase(p);

    in->clear_replica_map();

    // close out dirfrags
    if (in->is_dir()) {
      list<CDir*> dfls;
      in->get_dirfrags(dfls);
      for (list<CDir*>::iterator p = dfls.begin();
           p != dfls.end();
           ++p) {
        CDir *dir = *p;
        dir->clear_replica_map();

        for (CDir::map_t::iterator p = dir->items.begin();
             p != dir->items.end();
             ++p) {
          CDentry *dn = p->second;
          dn->clear_replica_map();

          dout(10) << " trimming " << *dn << dendl;
          dir->remove_dentry(dn);
        }

        dout(10) << " trimming " << *dir << dendl;
        in->close_dirfrag(dir->dirfrag().frag);
      }
    }

    CDentry *dn = in->get_parent_dn();
    if (dn) {
      dn->clear_replica_map();
      dout(10) << " trimming " << *dn << dendl;
      dn->dir->remove_dentry(dn);
    } else {
      dout(10) << " trimming " << *in << dendl;
      remove_inode(in);
    }
  }

  assert(rejoin_undef_inodes.empty());
}

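// Called once rejoins have been received from every gathered rank: finish
// opening undefined inodes/dirfrags and imported caps, choose lock states,
// and (if our own ack was the last thing outstanding) proceed to opening
// past snap parents.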
void MDCache::rejoin_gather_finish()
{
  dout(10) << "rejoin_gather_finish" << dendl;
  assert(mds->is_rejoin());

  if (open_undef_inodes_dirfrags())
    return;

  if (process_imported_caps())
    return;

  choose_lock_states_and_reconnect_caps();

  identify_files_to_recover();

  // signal completion of fetches, rejoin_gather_finish, etc.
  assert(rejoin_ack_gather.count(mds->get_nodeid()));
  rejoin_ack_gather.erase(mds->get_nodeid());

  // did we already get our acks too?
  if (rejoin_ack_gather.empty()) {
    // finally, kickstart past snap parent opens
    open_snap_parents();
  }
}

class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
  inodeno_t ino;
public:
  C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->rejoin_open_ino_finish(ino, r);
  }
};

void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
{
  dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;

  if (ret < 0) {
    cap_imports_missing.insert(ino);
  } else if (ret == mds->get_nodeid()) {
    assert(get_inode(ino));
  } else {
    auto p = cap_imports.find(ino);
    assert(p != cap_imports.end());
    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
      assert(q->second.count(MDS_RANK_NONE));
      assert(q->second.size() == 1);
      rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
    }
    cap_imports.erase(p);
  }

  assert(cap_imports_num_opening > 0);
  cap_imports_num_opening--;

  if (cap_imports_num_opening == 0) {
    if (rejoin_gather.empty())
      rejoin_gather_finish();
    else if (rejoin_gather.count(mds->get_nodeid()))
      process_imported_caps();
  }
}

class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
public:
  map<client_t,entity_inst_t> client_map;
  map<client_t,uint64_t> sseqmap;

  C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
    MDCacheLogContext(c), client_map(cm) {}
  void finish(int r) override {
    mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
  }
};

void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
                                          map<client_t,uint64_t>& sseqmap)
{
  dout(10) << "rejoin_open_sessions_finish" << dendl;
  mds->server->finish_force_open_sessions(client_map, sseqmap);
  if (rejoin_gather.empty())
    rejoin_gather_finish();
}

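// Reconnect caps that clients reported during the reconnect stage.  Returns
// true while there is still asynchronous work to wait for (missing inodes
// being opened, or client sessions still being journaled); the pending
// callbacks re-enter the rejoin machinery when that work completes.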
bool MDCache::process_imported_caps()
{
  dout(10) << "process_imported_caps" << dendl;

  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
    CInode *in = get_inode(p->first);
    if (in) {
      assert(in->is_auth());
      cap_imports_missing.erase(p->first);
      continue;
    }
    if (cap_imports_missing.count(p->first) > 0)
      continue;

    cap_imports_num_opening++;
    dout(10) << " opening missing ino " << p->first << dendl;
    open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
  }

  if (cap_imports_num_opening > 0)
    return true;

  // called by rejoin_gather_finish() ?
  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    // if sessions for imported caps are all open ?
    for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
         p != rejoin_client_map.end();
         ++p) {
      if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
        C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
        version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
        ESessions *le = new ESessions(pv, rejoin_client_map);
        mds->mdlog->start_submit_entry(le, finish);
        mds->mdlog->flush();
        rejoin_client_map.clear();
        return true;
      }
    }
    rejoin_client_map.clear();

    // process caps that were exported by slave rename
    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
         p != rejoin_slave_exports.end();
         ++p) {
      CInode *in = get_inode(p->first);
      assert(in);
      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
           q != p->second.second.end();
           ++q) {
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));

        Capability *cap = in->get_client_cap(q->first);
        if (!cap)
          cap = in->add_client_cap(q->first, session);
        cap->merge(q->second, true);

        Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
        assert(cap->get_last_seq() == im.issue_seq);
        assert(cap->get_mseq() == im.mseq);
        cap->set_cap_id(im.cap_id);
        // send cap import because we assigned a new cap ID
        do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
                      p->second.first, CEPH_CAP_FLAG_AUTH);
      }
    }
    rejoin_slave_exports.clear();
    rejoin_imported_caps.clear();

    // process cap imports
    //  ino -> client -> frommds -> capex
    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
      CInode *in = get_inode(p->first);
      if (!in) {
        dout(10) << " still missing ino " << p->first
                 << ", will try again after replayed client requests" << dendl;
        ++p;
        continue;
      }
      assert(in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
          Capability *cap = in->reconnect_cap(q->first, r->second, session);
          add_reconnected_cap(q->first, in->ino(), r->second);
          if (r->first >= 0) {
            if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
              cap->inc_mseq();
            do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);

            Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
            im.cap_id = cap->get_cap_id();
            im.issue_seq = cap->get_last_seq();
            im.mseq = cap->get_mseq();
          }
        }
      }
      cap_imports.erase(p++);  // remove and move on
    }
  } else {
    rejoin_gather.erase(mds->get_nodeid());
    maybe_send_pending_rejoins();

    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
      rejoin_gather_finish();
  }
  return false;
}

void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
{
  // are this realm's parents fully open?
  if (realm->have_past_parents_open()) {
    dout(10) << " have past snap parents for realm " << *realm
             << " on " << *realm->inode << dendl;
    if (reconnect) {
      // finish off client snaprealm reconnects?
      auto p = reconnected_snaprealms.find(realm->inode->ino());
      if (p != reconnected_snaprealms.end()) {
        for (auto q = p->second.begin(); q != p->second.end(); ++q)
          finish_snaprealm_reconnect(q->first, realm, q->second);
        reconnected_snaprealms.erase(p);
      }
    }
  } else {
    if (!missing_snap_parents.count(realm->inode)) {
      dout(10) << " MISSING past snap parents for realm " << *realm
               << " on " << *realm->inode << dendl;
      realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
      missing_snap_parents[realm->inode].size();   // just to get it into the map!
    } else {
      dout(10) << " (already) MISSING past snap parents for realm " << *realm
               << " on " << *realm->inode << dendl;
    }
  }
}

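// A reconnected cap told us it still has dirty data from before snapid
// 'snap_follows'.  Walk the snapshotted inodes past that point and mark
// their locks as needing a snapflush from this client.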
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
                                     client_t client, snapid_t snap_follows)
{
  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;

  const set<snapid_t>& snaps = realm->get_snaps();
  snapid_t follows = snap_follows;

  while (true) {
    CInode *in = pick_inode_snap(head_in, follows);
    if (in == head_in)
      break;
    dout(10) << " need snapflush from client." << client << " on " << *in << dendl;

    /* TODO: we can check the reconnected/flushing caps to find
     *       which locks need gathering */
    for (int i = 0; i < num_cinode_locks; i++) {
      int lockid = cinode_lock_info[i].lock;
      SimpleLock *lock = in->get_lock(lockid);
      assert(lock);
      in->client_snap_caps[lockid].insert(client);
      in->auth_pin(lock);
      lock->set_state(LOCK_SNAP_SYNC);
      lock->get_wrlock(true);
    }

    for (auto p = snaps.lower_bound(in->first);
         p != snaps.end() && *p <= in->last;
         ++p)
      head_in->add_need_snapflush(in, *p, client);

    follows = in->last;
  }
}

/*
 * choose lock states based on reconnected caps
 */
void MDCache::choose_lock_states_and_reconnect_caps()
{
  dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;

  map<client_t,MClientSnap*> splits;

  for (ceph::unordered_map<vinodeno_t,CInode*>::iterator i = inode_map.begin();
       i != inode_map.end();
       ++i) {
    CInode *in = i->second;

    if (in->last != CEPH_NOSNAP)
      continue;

    if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
      in->mark_dirty_rstat();

    auto p = reconnected_caps.find(in->ino());

    int dirty_caps = 0;
    if (p != reconnected_caps.end()) {
      for (const auto &it : p->second)
        dirty_caps |= it.second.dirty_caps;
    }
    in->choose_lock_states(dirty_caps);
    dout(15) << " chose lock states on " << *in << dendl;

    SnapRealm *realm = in->find_snaprealm();

    check_realm_past_parents(realm, realm == in->snaprealm);

    if (p != reconnected_caps.end()) {
      bool missing_snap_parent = false;
      // also, make sure client's cap is in the correct snaprealm.
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
          if (realm->have_past_parents_open()) {
            rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
          } else {
            missing_snap_parent = true;
          }
        }

        if (q->second.realm_ino == realm->inode->ino()) {
          dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
        } else {
          dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
                   << " != " << realm->inode->ino() << dendl;
          if (realm->have_past_parents_open()) {
            // ok, include in a split message _now_.
            prepare_realm_split(realm, q->first, in->ino(), splits);
          } else {
            // send the split later.
            missing_snap_parent = true;
          }
        }
      }
      if (missing_snap_parent)
        missing_snap_parents[realm->inode].insert(in);
    }
  }

  send_snaps(splits);
}

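// Queue (or extend) a CEPH_SNAP_OP_SPLIT notification for a client whose
// reconnected cap turned out to belong to a different snaprealm than the
// client thought.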
void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                                  map<client_t,MClientSnap*>& splits)
{
  MClientSnap *snap;
  if (splits.count(client) == 0) {
    splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
    snap->head.split = realm->inode->ino();
    realm->build_snap_trace(snap->bl);

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p)
      snap->split_realms.push_back((*p)->inode->ino());
  } else
    snap = splits[client];
  snap->split_inos.push_back(ino);
}

void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
{
  dout(10) << "send_snaps" << dendl;

  for (map<client_t,MClientSnap*>::iterator p = splits.begin();
       p != splits.end();
       ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
    if (session) {
      dout(10) << " client." << p->first
               << " split " << p->second->head.split
               << " inos " << p->second->split_inos
               << dendl;
      mds->send_message_client_counted(p->second, session);
    } else {
      dout(10) << " no session for client." << p->first << dendl;
      p->second->put();
    }
  }
}

/*
 * remove any items from logsegment open_file lists that don't have
 * any caps
 */
void MDCache::clean_open_file_lists()
{
  dout(10) << "clean_open_file_lists" << dendl;

  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;

    elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
    while (!q.end()) {
      CInode *in = *q;
      ++q;
      if (in->last == CEPH_NOSNAP) {
        if (!in->is_any_caps_wanted()) {
          dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
          in->item_open_file.remove_myself();
        }
      } else if (in->last != CEPH_NOSNAP) {
        if (in->client_snap_caps.empty()) {
          dout(10) << " unlisting flushed snap inode " << *in << dendl;
          in->item_open_file.remove_myself();
        }
      }
    }
  }
}

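// Reconnect a single client cap that another MDS handed to us in its weak
// rejoin; nothing useful to do if the client no longer has a session.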
Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
{
  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
           << " on " << *in << dendl;
  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
  if (!session) {
    dout(10) << " no session for client." << client << dendl;
    return NULL;
  }

  Capability *cap = in->reconnect_cap(client, icr, session);

  if (frommds >= 0) {
    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
      cap->inc_mseq();
    do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
  }

  return cap;
}

void MDCache::export_remaining_imported_caps()
{
  dout(10) << "export_remaining_imported_caps" << dendl;

  stringstream warn_str;

  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
    warn_str << " ino " << p->first << "\n";
    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
      if (session) {
        // mark client caps stale.
        MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
        stale->set_cap_peer(0, 0, 0, -1, 0);
        mds->send_message_client_counted(stale, q->first);
      }
    }

    mds->heartbeat_reset();
  }

  for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
       p != cap_reconnect_waiters.end();
       ++p)
    mds->queue_waiters(p->second);

  cap_imports.clear();
  cap_reconnect_waiters.clear();

  if (warn_str.peek() != EOF) {
    mds->clog->warn() << "failed to reconnect caps for missing inodes:";
    mds->clog->warn(warn_str);
  }
}

void MDCache::try_reconnect_cap(CInode *in, Session *session)
{
  client_t client = session->info.get_client();
  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
  if (rc) {
    in->reconnect_cap(client, *rc, session);
    dout(10) << "try_reconnect_cap client." << client
             << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
             << " issue " << ccap_string(rc->capinfo.issued)
             << " on " << *in << dendl;
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
        auto q = p->second.find(client);
        if (q != p->second.end())
          dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
}

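// Send the CEPH_CAP_OP_IMPORT message for a reconnected cap, or park it in
// delayed_imported_caps if the realm's past snap parents are not open yet.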
// cap imports and delayed snap parent opens

void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
                            uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                            int peer, int p_flags)
{
  client_t client = session->info.inst.name.num();
  SnapRealm *realm = in->find_snaprealm();
  if (realm->have_past_parents_open()) {
    dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
    if (cap->get_last_seq() == 0) // reconnected cap
      cap->inc_last_seq();
    cap->set_last_issue();
    cap->set_last_issue_stamp(ceph_clock_now());
    MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
                                        in->ino(),
                                        realm->inode->ino(),
                                        cap->get_cap_id(), cap->get_last_seq(),
                                        cap->pending(), cap->wanted(), 0,
                                        cap->get_mseq(), mds->get_osd_epoch_barrier());
    in->encode_cap_message(reap, cap);
    realm->build_snap_trace(reap->snapbl);
    reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
    mds->send_message_client_counted(reap, session);
  } else {
    dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
             << cap->get_mseq() << " on " << *in << dendl;
    cap->inc_suppress();
    delayed_imported_caps[client].insert(in);
    missing_snap_parents[in].size();
  }
}

void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  assert(delayed_imported_caps.empty());
}

struct C_MDC_OpenSnapParents : public MDCacheContext {
  explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snap_parents();
  }
};

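// Open the past snap parents of every realm recorded in
// missing_snap_parents.  Realms that still need I/O are gathered and this
// function re-runs when they complete; once everything is open the delayed
// cap imports are performed and the rejoin_done context is completed.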
void MDCache::open_snap_parents()
{
  dout(10) << "open_snap_parents" << dendl;

  map<client_t,MClientSnap*> splits;
  MDSGatherBuilder gather(g_ceph_context);

  auto p = missing_snap_parents.begin();
  while (p != missing_snap_parents.end()) {
    CInode *in = p->first;
    assert(in->snaprealm);
    if (in->snaprealm->open_parents(gather.new_sub())) {
      dout(10) << " past parents now open on " << *in << dendl;

      for (CInode *child : p->second) {
        auto q = reconnected_caps.find(child->ino());
        assert(q != reconnected_caps.end());
        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
          if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
            rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
          }
          // make sure client's cap is in the correct snaprealm.
          if (r->second.realm_ino != in->ino()) {
            prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
          }
        }
      }

      missing_snap_parents.erase(p++);

      in->put(CInode::PIN_OPENINGSNAPPARENTS);

      // finish off client snaprealm reconnects?
      map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
      if (q != reconnected_snaprealms.end()) {
        for (map<client_t,snapid_t>::iterator r = q->second.begin();
             r != q->second.end();
             ++r)
          finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
        reconnected_snaprealms.erase(q);
      }
    } else {
      dout(10) << " opening past parents on " << *in << dendl;
      ++p;
    }
  }

  if (gather.has_subs()) {
    dout(10) << "open_snap_parents - waiting for "
             << gather.num_subs_remaining() << dendl;
    gather.set_finisher(new C_MDC_OpenSnapParents(this));
    gather.activate();
  } else {
    if (!reconnected_snaprealms.empty()) {
      stringstream warn_str;
      for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
           p != reconnected_snaprealms.end();
           ++p) {
        warn_str << " unconnected snaprealm " << p->first << "\n";
        for (map<client_t,snapid_t>::iterator q = p->second.begin();
             q != p->second.end();
             ++q)
          warn_str << "  client." << q->first << " snapid " << q->second << "\n";
      }
      mds->clog->warn() << "open_snap_parents has:";
      mds->clog->warn(warn_str);
    }
    assert(rejoin_waiters.empty());
    assert(missing_snap_parents.empty());
    dout(10) << "open_snap_parents - all open" << dendl;
    do_delayed_cap_imports();

    assert(rejoin_done);
    rejoin_done.release()->complete(0);
    reconnected_caps.clear();
  }
}

bool MDCache::open_undef_inodes_dirfrags()
{
  dout(10) << "open_undef_inodes_dirfrags "
	   << rejoin_undef_inodes.size() << " inodes "
	   << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;

  set<CDir*> fetch_queue = rejoin_undef_dirfrags;

  for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
       p != rejoin_undef_inodes.end();
       ++p) {
    CInode *in = *p;
    assert(!in->is_base());
    fetch_queue.insert(in->get_parent_dir());
  }

  if (fetch_queue.empty())
    return false;

  MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
  for (set<CDir*>::iterator p = fetch_queue.begin();
       p != fetch_queue.end();
       ++p) {
    CDir *dir = *p;
    CInode *diri = dir->get_inode();
    if (diri->state_test(CInode::STATE_REJOINUNDEF))
      continue;
    if (dir->state_test(CDir::STATE_REJOINUNDEF))
      assert(diri->dirfragtree.is_leaf(dir->get_frag()));
    dir->fetch(gather.new_sub());
  }
  assert(gather.has_subs());
  gather.activate();
  return true;
}
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
    if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      CDir *dir = in->get_dirfrag(frag_t());
      assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();

      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
	rejoin_undef_dirfrags.insert(*p);
    }
  }
}
void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
{
  if (seq < realm->get_newest_seq()) {
    dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
	     << realm->get_newest_seq() << " on " << *realm << dendl;
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    if (session) {
      MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
      realm->build_snap_trace(snap->bl);
      mds->send_message_client_counted(snap, session);
    } else {
      dout(10) << " ...or not, no session for this client!" << dendl;
    }
  } else {
    dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
	     << " on " << *realm << dendl;
  }
}
5904 void MDCache::rejoin_send_acks()
5906 dout(7) << "rejoin_send_acks" << dendl
;
5909 for (map
<mds_rank_t
, set
<CInode
*> >::iterator p
= rejoin_unlinked_inodes
.begin();
5910 p
!= rejoin_unlinked_inodes
.end();
5912 for (set
<CInode
*>::iterator q
= p
->second
.begin();
5913 q
!= p
->second
.end();
5916 dout(7) << " unlinked inode " << *in
<< dendl
;
5918 if (!in
->is_replica(p
->first
))
5921 CDentry
*dn
= in
->get_parent_dn();
5922 if (dn
->is_replica(p
->first
))
5924 dn
->add_replica(p
->first
);
5925 CDir
*dir
= dn
->get_dir();
5926 if (dir
->is_replica(p
->first
))
5928 dir
->add_replica(p
->first
);
5929 in
= dir
->get_inode();
5930 if (in
->is_replica(p
->first
))
5937 rejoin_unlinked_inodes
.clear();
5939 // send acks to everyone in the recovery set
5940 map
<mds_rank_t
,MMDSCacheRejoin
*> acks
;
5941 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
5942 p
!= recovery_set
.end();
5944 if (rejoin_ack_sent
.count(*p
))
5946 acks
[*p
] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK
);
5949 rejoin_ack_sent
= recovery_set
;
5952 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
5953 p
!= subtrees
.end();
5955 CDir
*dir
= p
->first
;
5956 if (!dir
->is_auth())
5958 dout(10) << "subtree " << *dir
<< dendl
;
5960 // auth items in this subtree
5964 while (!dq
.empty()) {
5965 CDir
*dir
= dq
.front();
5969 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= dir
->replicas_begin();
5970 r
!= dir
->replicas_end();
5972 auto it
= acks
.find(r
->first
);
5973 if (it
== acks
.end())
5975 it
->second
->add_strong_dirfrag(dir
->dirfrag(), ++r
->second
, dir
->dir_rep
);
5976 it
->second
->add_dirfrag_base(dir
);
5979 for (CDir::map_t::iterator q
= dir
->items
.begin();
5980 q
!= dir
->items
.end();
5982 CDentry
*dn
= q
->second
;
5983 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5987 if (dnl
->is_primary())
5988 in
= dnl
->get_inode();
5991 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= dn
->replicas_begin();
5992 r
!= dn
->replicas_end();
5994 auto it
= acks
.find(r
->first
);
5995 if (it
== acks
.end())
5997 it
->second
->add_strong_dentry(dir
->dirfrag(), dn
->name
, dn
->first
, dn
->last
,
5998 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
5999 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
6000 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
6002 dn
->lock
.get_replica_state());
6003 // peer missed MDentrylink message ?
6004 if (in
&& !in
->is_replica(r
->first
))
6005 in
->add_replica(r
->first
);
6011 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= in
->replicas_begin();
6012 r
!= in
->replicas_end();
6014 auto it
= acks
.find(r
->first
);
6015 if (it
== acks
.end())
6017 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6019 in
->_encode_locks_state_for_rejoin(bl
, r
->first
);
6020 it
->second
->add_inode_locks(in
, ++r
->second
, bl
);
6023 // subdirs in this subtree?
6024 in
->get_nested_dirfrags(dq
);
6030 if (root
&& root
->is_auth())
6031 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= root
->replicas_begin();
6032 r
!= root
->replicas_end();
6034 auto it
= acks
.find(r
->first
);
6035 if (it
== acks
.end())
6037 it
->second
->add_inode_base(root
, mds
->mdsmap
->get_up_features());
6039 root
->_encode_locks_state_for_rejoin(bl
, r
->first
);
6040 it
->second
->add_inode_locks(root
, ++r
->second
, bl
);
6043 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= myin
->replicas_begin();
6044 r
!= myin
->replicas_end();
6046 auto it
= acks
.find(r
->first
);
6047 if (it
== acks
.end())
6049 it
->second
->add_inode_base(myin
, mds
->mdsmap
->get_up_features());
6051 myin
->_encode_locks_state_for_rejoin(bl
, r
->first
);
6052 it
->second
->add_inode_locks(myin
, ++r
->second
, bl
);
6055 // include inode base for any inodes whose scatterlocks may have updated
6056 for (set
<CInode
*>::iterator p
= rejoin_potential_updated_scatterlocks
.begin();
6057 p
!= rejoin_potential_updated_scatterlocks
.end();
6060 for (compact_map
<mds_rank_t
,unsigned>::iterator r
= in
->replicas_begin();
6061 r
!= in
->replicas_end();
6063 auto it
= acks
.find(r
->first
);
6064 if (it
== acks
.end())
6066 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6071 for (auto p
= acks
.begin(); p
!= acks
.end(); ++p
) {
6072 ::encode(rejoin_imported_caps
[p
->first
], p
->second
->imported_caps
);
6073 mds
->send_message_mds(p
->second
, p
->first
);
6076 rejoin_imported_caps
.clear();
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    CInode *in = p->second;
    if (in->is_head() && in->is_any_caps()) {
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
	mds->locker->issue_caps(in);
    }
  }
}
// ===============================================================================

struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
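
// Queue a file for size/mtime recovery.  If snapshots cover part of the
// inode's [first,last] range, journal copy-on-write inodes for those snaps
// and queue them for recovery as well; the head inode is enqueued on the
// recovery queue either way.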
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  assert(in->is_auth());

  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(*s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    inode_t *pi = in->project_inode();
    pi->version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }

  recovery_queue.enqueue(in);
}
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
/*
 * called after recovery to recover file sizes for previously opened (for write)
 * files.  that is, those where max_size > size.
 */
void MDCache::identify_files_to_recover()
{
  dout(10) << "identify_files_to_recover" << dendl;
  for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    CInode *in = p->second;
    if (!in->is_auth())
      continue;

    if (in->last != CEPH_NOSNAP)
      continue;

    // Only normal files need file size recovery
    if (!in->is_file()) {
      continue;
    }

    bool recover = false;
    for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
	 p != in->inode.client_ranges.end();
	 ++p) {
      Capability *cap = in->get_client_cap(p->first);
      if (!cap) {
	dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
	recover = true;
	break;
      }
    }

    if (recover) {
      if (in->filelock.is_stable()) {
	in->auth_pin(&in->filelock);
      } else {
	assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
      }
      in->filelock.set_state(LOCK_PRE_SCAN);
      rejoin_recover_q.push_back(in);
    } else {
      rejoin_check_q.push_back(in);
    }
  }
}
void MDCache::start_files_to_recover()
{
  for (CInode *in : rejoin_check_q) {
    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
      mds->locker->issue_caps(in);
    mds->locker->check_inode_max_size(in);
  }
  rejoin_check_q.clear();
  for (CInode *in : rejoin_recover_q) {
    mds->locker->file_recover(&in->filelock);
  }
  if (!rejoin_recover_q.empty()) {
    rejoin_recover_q.clear();
    do_file_recover();
  }
}

void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
// ===============================================================================


// ----------------------------
// truncate

class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};

void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  inode_t *pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
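
// Truncation pipeline: truncate_inode() pins the inode and log segment,
// _truncate_inode() issues the asynchronous Filer truncate under the right
// snap context, truncate_inode_finish() journals the completion, and
// truncate_inode_logged() drops the pins and wakes WAIT_TRUNC waiters.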
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c), in(i), ls(l) {}
  void finish(int r) override {
    assert(r == 0 || r == -ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
};

void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  inode_t *pi = &in->inode;
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  assert(pi->is_truncating());
  assert(pi->truncate_size < (1ULL << 63));
  assert(pi->truncate_from < (1ULL << 63));
  assert(pi->truncate_size < pi->truncate_from);

  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
				  mds->finisher));
}
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};

void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  pi->truncate_from = 0;
  pi->truncate_pending--;

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  mut->add_projected_inode(in);

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);
  CDentry *dn = in->get_projected_parent_dn();
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_primary_dentry(dn, in, true);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);

  journal_dirty_inode(mut.get(), &le->metablob, in);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}

void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  list<MDSInternalContextBase*> waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
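
// Truncates that were in flight when the MDS failed are re-registered
// against their log segment during journal replay (add_recovered_truncate)
// and restarted by start_recovered_truncates() when recovery finishes.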
void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "add_recovered_truncate " << *in << " in log segment "
	   << ls->seq << "/" << ls->offset << dendl;
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
}

void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
	   << ls->seq << "/" << ls->offset << dendl;
  // if we have the logseg the truncate started in, it must be in our list.
  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);
  in->put(CInode::PIN_TRUNCATING);
}

void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
6426 // ================================================================================
6431 * note: only called while MDS is active or stopping... NOT during recovery.
6432 * however, we may expire a replica whose authority is recovering.
6435 bool MDCache::trim(int max
, int count
)
6439 max
= lru
.lru_get_size() - count
;
6442 } else if (max
< 0) {
6443 max
= g_conf
->mds_cache_size
;
6447 dout(7) << "trim max=" << max
<< " cur=" << lru
.lru_get_size()
6448 << "/" << bottom_lru
.lru_get_size() << dendl
;
6450 // process delayed eval_stray()
6451 stray_manager
.advance_delayed();
6453 map
<mds_rank_t
, MCacheExpire
*> expiremap
;
6454 bool is_standby_replay
= mds
->is_standby_replay();
6455 int unexpirable
= 0;
6456 list
<CDentry
*> unexpirables
;
6459 CDentry
*dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
6462 if (trim_dentry(dn
, expiremap
)) {
6463 unexpirables
.push_back(dn
);
6468 for(auto dn
: unexpirables
)
6469 bottom_lru
.lru_insert_mid(dn
);
6470 unexpirables
.clear();
6472 // trim dentries from the LRU: only enough to satisfy `max`,
6473 while (lru
.lru_get_size() + unexpirable
> (unsigned)max
) {
6474 CDentry
*dn
= static_cast<CDentry
*>(lru
.lru_expire());
6478 if ((is_standby_replay
&& dn
->get_linkage()->inode
&&
6479 dn
->get_linkage()->inode
->item_open_file
.is_on_list()) ||
6480 trim_dentry(dn
, expiremap
)) {
6481 unexpirables
.push_back(dn
);
6485 for(auto dn
: unexpirables
)
6486 lru
.lru_insert_mid(dn
);
6487 unexpirables
.clear();
6489 // trim non-auth, non-bound subtrees
6490 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
6491 p
!= subtrees
.end();) {
6492 CDir
*dir
= p
->first
;
6494 CInode
*diri
= dir
->get_inode();
6495 if (dir
->is_auth()) {
6496 if (!diri
->is_auth() && !diri
->is_base() &&
6497 dir
->get_num_head_items() == 0) {
6498 if (dir
->state_test(CDir::STATE_EXPORTING
) ||
6499 dir
->is_freezing() || dir
->is_frozen())
6502 migrator
->export_empty_import(dir
);
6505 if (!diri
->is_auth()) {
6506 if (dir
->get_num_ref() > 1) // only subtree pin
6509 diri
->get_subtree_dirfrags(ls
);
6510 if (diri
->get_num_ref() > (int)ls
.size()) // only pinned by subtrees
6513 // don't trim subtree root if its auth MDS is recovering.
6514 // This simplify the cache rejoin code.
6515 if (dir
->is_subtree_root() &&
6516 rejoin_ack_gather
.count(dir
->get_dir_auth().first
))
6518 trim_dirfrag(dir
, 0, expiremap
);
6524 if (max
== 0 && root
) {
6526 root
->get_dirfrags(ls
);
6527 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6529 if (dir
->get_num_ref() == 1) // subtree pin
6530 trim_dirfrag(dir
, 0, expiremap
);
6532 if (root
->get_num_ref() == 0)
6533 trim_inode(0, root
, 0, expiremap
);
6536 std::set
<mds_rank_t
> stopping
;
6537 mds
->mdsmap
->get_mds_set(stopping
, MDSMap::STATE_STOPPING
);
6538 stopping
.erase(mds
->get_nodeid());
6539 for (auto rank
: stopping
) {
6540 CInode
* mdsdir_in
= get_inode(MDS_INO_MDSDIR(rank
));
6544 if (expiremap
.count(rank
) == 0) {
6545 expiremap
[rank
] = new MCacheExpire(mds
->get_nodeid());
6548 dout(20) << __func__
<< ": try expiring " << *mdsdir_in
<< " for stopping mds." << mds
<< dendl
;
6550 const bool aborted
= expire_recursive(mdsdir_in
, expiremap
);
6552 dout(20) << __func__
<< ": successfully expired mdsdir" << dendl
;
6554 mdsdir_in
->get_dirfrags(ls
);
6555 for (auto dir
: ls
) {
6556 if (dir
->get_num_ref() == 1) // subtree pin
6557 trim_dirfrag(dir
, dir
, expiremap
);
6559 if (mdsdir_in
->get_num_ref() == 0)
6560 trim_inode(NULL
, mdsdir_in
, NULL
, expiremap
);
6562 dout(20) << __func__
<< ": some unexpirable contents in mdsdir" << dendl
;
6566 // Other rank's base inodes (when I'm stopping)
6568 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
6569 p
!= base_inodes
.end(); ++p
) {
6570 if (MDS_INO_MDSDIR_OWNER((*p
)->ino()) != mds
->get_nodeid()) {
6571 dout(20) << __func__
<< ": maybe trimming base: " << *(*p
) << dendl
;
6572 if ((*p
)->get_num_ref() == 0) {
6573 trim_inode(NULL
, *p
, NULL
, expiremap
);
6579 // send any expire messages
6580 send_expire_messages(expiremap
);
void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
{
  for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
       it != expiremap.end();
       ++it) {
    if (mds->is_cluster_degraded() &&
	(mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
	 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
	  rejoin_sent.count(it->first) == 0))) {
      it->second->put();
      continue;
    }
    dout(7) << "sending cache_expire to " << it->first << dendl;
    mds->send_message_mds(it->second, it->first);
  }
}
6604 bool MDCache::trim_dentry(CDentry
*dn
, map
<mds_rank_t
, MCacheExpire
*>& expiremap
)
6606 dout(12) << "trim_dentry " << *dn
<< dendl
;
6608 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6610 CDir
*dir
= dn
->get_dir();
6613 CDir
*con
= get_subtree_root(dir
);
6615 dout(12) << " in container " << *con
<< dendl
;
6617 dout(12) << " no container; under a not-yet-linked dir" << dendl
;
6618 assert(dn
->is_auth());
6621 // If replica dentry is not readable, it's likely we will receive
6622 // MDentryLink/MDentryUnlink message soon (It's possible we first
6623 // receive a MDentryUnlink message, then MDentryLink message)
6624 // MDentryLink message only replicates an inode, so we should
6625 // avoid trimming the inode's parent dentry. This is because that
6626 // unconnected replicas are problematic for subtree migration.
6627 if (!dn
->is_auth() && !dn
->lock
.can_read(-1) &&
6628 !dn
->get_dir()->get_inode()->is_stray())
6631 // adjust the dir state
6632 // NOTE: we can safely remove a clean, null dentry without effecting
6633 // directory completeness.
6634 // (check this _before_ we unlink the inode, below!)
6635 bool clear_complete
= false;
6636 if (!(dnl
->is_null() && dn
->is_clean()))
6637 clear_complete
= true;
6639 // unlink the dentry
6640 if (dnl
->is_remote()) {
6642 dir
->unlink_inode(dn
, false);
6643 } else if (dnl
->is_primary()) {
6644 // expire the inode, too.
6645 CInode
*in
= dnl
->get_inode();
6647 if (trim_inode(dn
, in
, con
, expiremap
))
6648 return true; // purging stray instead of trimming
6650 assert(dnl
->is_null());
6653 if (!dn
->is_auth()) {
6654 // notify dentry authority.
6655 mds_authority_t auth
= dn
->authority();
6657 for (int p
=0; p
<2; p
++) {
6658 mds_rank_t a
= auth
.first
;
6659 if (p
) a
= auth
.second
;
6660 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6661 if (mds
->get_nodeid() == auth
.second
&&
6662 con
->is_importing()) break; // don't send any expire while importing.
6663 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6665 dout(12) << " sending expire to mds." << a
<< " on " << *dn
<< dendl
;
6666 assert(a
!= mds
->get_nodeid());
6667 if (expiremap
.count(a
) == 0)
6668 expiremap
[a
] = new MCacheExpire(mds
->get_nodeid());
6669 expiremap
[a
]->add_dentry(con
->dirfrag(), dir
->dirfrag(), dn
->name
, dn
->last
, dn
->get_replica_nonce());
6674 if (dn
->last
== CEPH_NOSNAP
&& dir
->is_auth())
6675 dir
->add_to_bloom(dn
);
6676 dir
->remove_dentry(dn
);
6679 dir
->state_clear(CDir::STATE_COMPLETE
);
6681 if (mds
->logger
) mds
->logger
->inc(l_mds_inodes_expired
);
6686 void MDCache::trim_dirfrag(CDir
*dir
, CDir
*con
, map
<mds_rank_t
, MCacheExpire
*>& expiremap
)
6688 dout(15) << "trim_dirfrag " << *dir
<< dendl
;
6690 if (dir
->is_subtree_root()) {
6691 assert(!dir
->is_auth() ||
6692 (!dir
->is_replicated() && dir
->inode
->is_base()));
6693 remove_subtree(dir
); // remove from subtree map
6695 assert(dir
->get_num_ref() == 0);
6697 CInode
*in
= dir
->get_inode();
6699 if (!dir
->is_auth()) {
6700 mds_authority_t auth
= dir
->authority();
6702 // was this an auth delegation? (if so, slightly modified container)
6704 if (dir
->is_subtree_root()) {
6705 dout(12) << " subtree root, container is " << *dir
<< dendl
;
6707 condf
= dir
->dirfrag();
6709 condf
= con
->dirfrag();
6712 for (int p
=0; p
<2; p
++) {
6713 mds_rank_t a
= auth
.first
;
6714 if (p
) a
= auth
.second
;
6715 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6716 if (mds
->get_nodeid() == auth
.second
&&
6717 con
->is_importing()) break; // don't send any expire while importing.
6718 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6720 dout(12) << " sending expire to mds." << a
<< " on " << *dir
<< dendl
;
6721 assert(a
!= mds
->get_nodeid());
6722 if (expiremap
.count(a
) == 0)
6723 expiremap
[a
] = new MCacheExpire(mds
->get_nodeid());
6724 expiremap
[a
]->add_dir(condf
, dir
->dirfrag(), dir
->replica_nonce
);
6728 in
->close_dirfrag(dir
->dirfrag().frag
);
6732 * Try trimming an inode from the cache
6734 * @return true if the inode is still in cache, else false if it was trimmed
6736 bool MDCache::trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, map
<mds_rank_t
, MCacheExpire
*>& expiremap
)
6738 dout(15) << "trim_inode " << *in
<< dendl
;
6739 assert(in
->get_num_ref() == 0);
6742 // If replica inode's dirfragtreelock is not readable, it's likely
6743 // some dirfrags of the inode are being fragmented and we will receive
6744 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6745 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6746 // This is because that unconnected replicas are problematic for
6747 // subtree migration.
6749 if (!in
->is_auth() && !in
->dirfragtreelock
.can_read(-1))
6754 in
->get_dirfrags(dfls
);
6755 for (list
<CDir
*>::iterator p
= dfls
.begin(); p
!= dfls
.end(); ++p
) {
6757 assert(!dir
->is_subtree_root());
6758 trim_dirfrag(dir
, con
? con
:dir
, expiremap
); // if no container (e.g. root dirfrag), use *p
6763 if (in
->is_auth()) {
6764 // eval stray after closing dirfrags
6765 if (dn
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
6766 maybe_eval_stray(in
);
6767 if (dn
->state_test(CDentry::STATE_PURGING
) || dn
->get_num_ref() > 0)
6771 mds_authority_t auth
= in
->authority();
6775 df
= con
->dirfrag();
6777 df
= dirfrag_t(0,frag_t()); // must be a root or stray inode.
6779 for (int p
=0; p
<2; p
++) {
6780 mds_rank_t a
= auth
.first
;
6781 if (p
) a
= auth
.second
;
6782 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6783 if (con
&& mds
->get_nodeid() == auth
.second
&&
6784 con
->is_importing()) break; // don't send any expire while importing.
6785 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6787 dout(12) << " sending expire to mds." << a
<< " on " << *in
<< dendl
;
6788 assert(a
!= mds
->get_nodeid());
6789 if (expiremap
.count(a
) == 0)
6790 expiremap
[a
] = new MCacheExpire(mds
->get_nodeid());
6791 expiremap
[a
]->add_inode(df
, in
->vino(), in
->get_replica_nonce());
6796 if (in->is_auth()) {
6797 if (in->hack_accessed)
6798 mds->logger->inc("outt");
6800 mds->logger->inc("outut");
6801 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6808 dn
->get_dir()->unlink_inode(dn
, false);
/*
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to
 * attach our own subtrees to the root.
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 *  we may not have accurate linkage for non-auth items, which means we will
 *  not know which subtree an item falls into and cannot be sure to declare
 *  it to the correct authority.
 */
6830 void MDCache::trim_non_auth()
6832 dout(7) << "trim_non_auth" << dendl
;
6834 // temporarily pin all subtree roots
6835 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
6836 p
!= subtrees
.end();
6838 p
->first
->get(CDir::PIN_SUBTREETEMP
);
6840 list
<CDentry
*> auth_list
;
6842 // trim non-auth items from the lru
6845 if (bottom_lru
.lru_get_size() > 0)
6846 dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
6847 if (!dn
&& lru
.lru_get_size() > 0)
6848 dn
= static_cast<CDentry
*>(lru
.lru_expire());
6852 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6854 if (dn
->is_auth()) {
6855 // add back into lru (at the top)
6856 auth_list
.push_back(dn
);
6858 if (dnl
->is_remote() && dnl
->get_inode() && !dnl
->get_inode()->is_auth())
6859 dn
->unlink_remote(dnl
);
6861 // non-auth. expire.
6862 CDir
*dir
= dn
->get_dir();
6865 // unlink the dentry
6866 dout(10) << " removing " << *dn
<< dendl
;
6867 if (dnl
->is_remote()) {
6868 dir
->unlink_inode(dn
, false);
6870 else if (dnl
->is_primary()) {
6871 CInode
*in
= dnl
->get_inode();
6872 dout(10) << " removing " << *in
<< dendl
;
6874 in
->get_dirfrags(ls
);
6875 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6877 assert(!subdir
->is_subtree_root());
6878 in
->close_dirfrag(subdir
->dirfrag().frag
);
6880 dir
->unlink_inode(dn
, false);
6884 assert(dnl
->is_null());
6887 assert(!dir
->has_bloom());
6888 dir
->remove_dentry(dn
);
6889 // adjust the dir state
6890 dir
->state_clear(CDir::STATE_COMPLETE
); // dir incomplete!
6891 // close empty non-auth dirfrag
6892 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0)
6893 dir
->inode
->close_dirfrag(dir
->get_frag());
6897 for (auto dn
: auth_list
) {
6898 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
6899 bottom_lru
.lru_insert_mid(dn
);
6901 lru
.lru_insert_top(dn
);
6904 // move everything in the pintail to the top bit of the lru.
6905 lru
.lru_touch_entire_pintail();
6907 // unpin all subtrees
6908 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
6909 p
!= subtrees
.end();
6911 p
->first
->put(CDir::PIN_SUBTREETEMP
);
6913 if (lru
.lru_get_size() == 0 &&
6914 bottom_lru
.lru_get_size() == 0) {
6915 // root, stray, etc.?
6916 ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator p
= inode_map
.begin();
6917 while (p
!= inode_map
.end()) {
6918 ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator next
= p
;
6920 CInode
*in
= p
->second
;
6921 if (!in
->is_auth()) {
6923 in
->get_dirfrags(ls
);
6924 for (list
<CDir
*>::iterator p
= ls
.begin();
6927 dout(10) << " removing " << **p
<< dendl
;
6928 assert((*p
)->get_num_ref() == 1); // SUBTREE
6929 remove_subtree((*p
));
6930 in
->close_dirfrag((*p
)->dirfrag().frag
);
6932 dout(10) << " removing " << *in
<< dendl
;
6933 assert(!in
->get_parent_dn());
6934 assert(in
->get_num_ref() == 0);
6945 * Recursively trim the subtree rooted at directory to remove all
6946 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6947 * of those links. This is used to clear invalid data out of the cache.
6948 * Note that it doesn't clear the passed-in directory, since that's not
6951 bool MDCache::trim_non_auth_subtree(CDir
*dir
)
6953 dout(10) << "trim_non_auth_subtree(" << dir
<< ") " << *dir
<< dendl
;
6955 bool keep_dir
= !can_trim_non_auth_dirfrag(dir
);
6957 CDir::map_t::iterator j
= dir
->begin();
6958 CDir::map_t::iterator i
= j
;
6959 while (j
!= dir
->end()) {
6961 CDentry
*dn
= i
->second
;
6962 dout(10) << "trim_non_auth_subtree(" << dir
<< ") Checking dentry " << dn
<< dendl
;
6963 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6964 if (dnl
->is_primary()) { // check for subdirectories, etc
6965 CInode
*in
= dnl
->get_inode();
6966 bool keep_inode
= false;
6968 list
<CDir
*> subdirs
;
6969 in
->get_dirfrags(subdirs
);
6970 for (list
<CDir
*>::iterator subdir
= subdirs
.begin();
6971 subdir
!= subdirs
.end();
6973 if ((*subdir
)->is_subtree_root()) {
6975 dout(10) << "trim_non_auth_subtree(" << dir
<< ") keeping " << **subdir
<< dendl
;
6977 if (trim_non_auth_subtree(*subdir
))
6980 in
->close_dirfrag((*subdir
)->get_frag());
6981 dir
->state_clear(CDir::STATE_COMPLETE
); // now incomplete!
6987 if (!keep_inode
) { // remove it!
6988 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing inode " << in
<< " with dentry" << dn
<< dendl
;
6989 dir
->unlink_inode(dn
, false);
6991 assert(!dir
->has_bloom());
6992 dir
->remove_dentry(dn
);
6994 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping inode " << in
<< " with dentry " << dn
<<dendl
;
6995 dn
->state_clear(CDentry::STATE_AUTH
);
6996 in
->state_clear(CInode::STATE_AUTH
);
6998 } else if (keep_dir
&& dnl
->is_null()) { // keep null dentry for slave rollback
6999 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping dentry " << dn
<<dendl
;
7000 } else { // just remove it
7001 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing dentry " << dn
<< dendl
;
7002 if (dnl
->is_remote())
7003 dir
->unlink_inode(dn
, false);
7004 dir
->remove_dentry(dn
);
7007 dir
->state_clear(CDir::STATE_AUTH
);
7009 * We've now checked all our children and deleted those that need it.
7010 * Now return to caller, and tell them if *we're* a keeper.
7012 return keep_dir
|| dir
->get_num_any();
7016 * during replay, when we determine a subtree is no longer ours, we
7017 * try to trim it from our cache. because subtrees must be connected
7018 * to the root, the fact that we can trim this tree may mean that our
7019 * children or parents can also be trimmed.
7021 void MDCache::try_trim_non_auth_subtree(CDir
*dir
)
7023 dout(10) << "try_trim_nonauth_subtree " << *dir
<< dendl
;
7025 // can we now trim child subtrees?
7027 get_subtree_bounds(dir
, bounds
);
7028 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
7030 if (bd
->get_dir_auth().first
!= mds
->get_nodeid() && // we are not auth
7031 bd
->get_num_any() == 0 && // and empty
7032 can_trim_non_auth_dirfrag(bd
)) {
7033 CInode
*bi
= bd
->get_inode();
7034 dout(10) << " closing empty non-auth child subtree " << *bd
<< dendl
;
7037 bi
->close_dirfrag(bd
->get_frag());
7041 if (trim_non_auth_subtree(dir
)) {
7043 try_subtree_merge(dir
);
7045 // can we trim this subtree (and possibly our ancestors) too?
7047 CInode
*diri
= dir
->get_inode();
7048 if (diri
->is_base()) {
7049 if (!diri
->is_root() && diri
->authority().first
!= mds
->get_nodeid()) {
7050 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7051 remove_subtree(dir
);
7053 diri
->close_dirfrag(dir
->get_frag());
7055 dout(10) << " removing " << *diri
<< dendl
;
7056 assert(!diri
->get_parent_dn());
7057 assert(diri
->get_num_ref() == 0);
7063 CDir
*psub
= get_subtree_root(diri
->get_parent_dir());
7064 dout(10) << " parent subtree is " << *psub
<< dendl
;
7065 if (psub
->get_dir_auth().first
== mds
->get_nodeid())
7066 break; // we are auth, keep.
7068 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7069 remove_subtree(dir
);
7071 diri
->close_dirfrag(dir
->get_frag());
7073 dout(10) << " parent subtree also non-auth: " << *psub
<< dendl
;
7074 if (trim_non_auth_subtree(psub
))
void MDCache::standby_trim_segment(LogSegment *ls)
{
  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
  }
}
7118 /* This function DOES put the passed message before returning */
7119 void MDCache::handle_cache_expire(MCacheExpire
*m
)
7121 mds_rank_t from
= mds_rank_t(m
->get_from());
7123 dout(7) << "cache_expire from mds." << from
<< dendl
;
7125 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
7130 set
<SimpleLock
*> gather_locks
;
7132 for (map
<dirfrag_t
,MCacheExpire::realm
>::iterator p
= m
->realms
.begin();
7133 p
!= m
->realms
.end();
7136 if (p
->first
.ino
> 0) {
7137 CInode
*expired_inode
= get_inode(p
->first
.ino
);
7138 assert(expired_inode
); // we had better have this.
7139 CDir
*parent_dir
= expired_inode
->get_approx_dirfrag(p
->first
.frag
);
7142 int export_state
= -1;
7143 if (parent_dir
->is_auth() && parent_dir
->is_exporting()) {
7144 export_state
= migrator
->get_export_state(parent_dir
);
7145 assert(export_state
>= 0);
7148 if (!parent_dir
->is_auth() ||
7149 (export_state
!= -1 &&
7150 ((export_state
== Migrator::EXPORT_WARNING
&&
7151 migrator
->export_has_warned(parent_dir
,from
)) ||
7152 export_state
== Migrator::EXPORT_EXPORTING
||
7153 export_state
== Migrator::EXPORT_LOGGINGFINISH
||
7154 (export_state
== Migrator::EXPORT_NOTIFYING
&&
7155 !migrator
->export_has_notified(parent_dir
,from
))))) {
7158 dout(7) << "delaying nonauth|warned expires for " << *parent_dir
<< dendl
;
7159 assert(parent_dir
->is_frozen_tree_root());
7161 // make a message container
7162 if (delayed_expire
[parent_dir
].count(from
) == 0)
7163 delayed_expire
[parent_dir
][from
] = new MCacheExpire(from
);
7165 // merge these expires into it
7166 delayed_expire
[parent_dir
][from
]->add_realm(p
->first
, p
->second
);
7169 assert(export_state
<= Migrator::EXPORT_PREPPING
||
7170 (export_state
== Migrator::EXPORT_WARNING
&&
7171 !migrator
->export_has_warned(parent_dir
, from
)));
7173 dout(7) << "expires for " << *parent_dir
<< dendl
;
7175 dout(7) << "containerless expires (root, stray inodes)" << dendl
;
7179 for (map
<vinodeno_t
,uint32_t>::iterator it
= p
->second
.inodes
.begin();
7180 it
!= p
->second
.inodes
.end();
7182 CInode
*in
= get_inode(it
->first
);
7183 unsigned nonce
= it
->second
;
7186 dout(0) << " inode expire on " << it
->first
<< " from " << from
7187 << ", don't have it" << dendl
;
7190 assert(in
->is_auth());
7191 dout(20) << __func__
<< ": expiring inode " << *in
<< dendl
;
7194 if (nonce
== in
->get_replica_nonce(from
)) {
7195 // remove from our cached_by
7196 dout(7) << " inode expire on " << *in
<< " from mds." << from
7197 << " cached_by was " << in
->get_replicas() << dendl
;
7198 inode_remove_replica(in
, from
, false, gather_locks
);
7201 // this is an old nonce, ignore expire.
7202 dout(7) << " inode expire on " << *in
<< " from mds." << from
7203 << " with old nonce " << nonce
7204 << " (current " << in
->get_replica_nonce(from
) << "), dropping"
7210 for (map
<dirfrag_t
,uint32_t>::iterator it
= p
->second
.dirs
.begin();
7211 it
!= p
->second
.dirs
.end();
7213 CDir
*dir
= get_dirfrag(it
->first
);
7214 unsigned nonce
= it
->second
;
7217 CInode
*diri
= get_inode(it
->first
.ino
);
7219 if (mds
->is_rejoin() &&
7220 rejoin_ack_gather
.count(mds
->get_nodeid()) && // haven't sent rejoin ack yet
7221 !diri
->is_replica(from
)) {
7223 diri
->get_nested_dirfrags(ls
);
7224 dout(7) << " dir expire on dirfrag " << it
->first
<< " from mds." << from
7225 << " while rejoining, inode isn't replicated" << dendl
;
7226 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
7228 if (dir
->is_replica(from
)) {
7229 dout(7) << " dir expire on " << *dir
<< " from mds." << from
<< dendl
;
7230 dir
->remove_replica(from
);
7235 CDir
*other
= diri
->get_approx_dirfrag(it
->first
.frag
);
7237 dout(7) << " dir expire on dirfrag " << it
->first
<< " from mds." << from
7238 << " have " << *other
<< ", mismatched frags, dropping" << dendl
;
7242 dout(0) << " dir expire on " << it
->first
<< " from " << from
7243 << ", don't have it" << dendl
;
7246 dout(20) << __func__
<< ": expiring dirfrag " << *dir
<< dendl
;
7248 assert(dir
->is_auth());
7251 if (nonce
== dir
->get_replica_nonce(from
)) {
7252 // remove from our cached_by
7253 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7254 << " replicas was " << dir
->replica_map
<< dendl
;
7255 dir
->remove_replica(from
);
7258 // this is an old nonce, ignore expire.
7259 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7260 << " with old nonce " << nonce
<< " (current " << dir
->get_replica_nonce(from
)
7261 << "), dropping" << dendl
;
7266 for (map
<dirfrag_t
, map
<pair
<string
,snapid_t
>,uint32_t> >::iterator pd
= p
->second
.dentries
.begin();
7267 pd
!= p
->second
.dentries
.end();
7269 dout(10) << " dn expires in dir " << pd
->first
<< dendl
;
7270 CInode
*diri
= get_inode(pd
->first
.ino
);
7272 CDir
*dir
= diri
->get_dirfrag(pd
->first
.frag
);
7275 dout(0) << " dn expires on " << pd
->first
<< " from " << from
7276 << ", must have refragmented" << dendl
;
7278 assert(dir
->is_auth());
7281 for (map
<pair
<string
,snapid_t
>,uint32_t>::iterator p
= pd
->second
.begin();
7282 p
!= pd
->second
.end();
7284 unsigned nonce
= p
->second
;
7288 dn
= dir
->lookup(p
->first
.first
, p
->first
.second
);
7290 // which dirfrag for this dentry?
7291 CDir
*dir
= diri
->get_dirfrag(diri
->pick_dirfrag(p
->first
.first
));
7293 assert(dir
->is_auth());
7294 dn
= dir
->lookup(p
->first
.first
, p
->first
.second
);
7299 dout(0) << " missing dentry for " << p
->first
.first
<< " snap " << p
->first
.second
<< " in " << *dir
<< dendl
;
7301 dout(0) << " missing dentry for " << p
->first
.first
<< " snap " << p
->first
.second
<< dendl
;
7305 if (nonce
== dn
->get_replica_nonce(from
)) {
7306 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
<< dendl
;
7307 dentry_remove_replica(dn
, from
, gather_locks
);
7310 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
7311 << " with old nonce " << nonce
<< " (current " << dn
->get_replica_nonce(from
)
7312 << "), dropping" << dendl
;
7321 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
7322 if (!(*p
)->is_stable())
7323 mds
->locker
->eval_gather(*p
);
void MDCache::process_delayed_expire(CDir *dir)
{
  dout(7) << "process_delayed_expire on " << *dir << dendl;
  for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
       p != delayed_expire[dir].end();
       ++p)
    handle_cache_expire(p->second);
  delayed_expire.erase(dir);
}

void MDCache::discard_delayed_expire(CDir *dir)
{
  dout(7) << "discard_delayed_expire on " << *dir << dendl;
  for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
       p != delayed_expire[dir].end();
       ++p)
    p->second->put();
  delayed_expire.erase(dir);
}
void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
				   set<SimpleLock *>& gather_locks)
{
  in->remove_replica(from);
  in->mds_caps_wanted.erase(from);

  // note: this code calls _eval more often than it needs to!
  if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
  if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
  if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
  if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
  if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
  if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);

  // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
  // don't remove the recovering mds from the lock's gathering list, because
  // it may hold rejoined wrlocks.
  if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
  if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
  if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
}

void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
{
  dn->remove_replica(from);

  if (dn->lock.remove_replica(from))
    gather_locks.insert(&dn->lock);

  // Replicated strays might now be eligible for purge
  CDentry::linkage_t *dnl = dn->get_linkage();
  if (dnl->is_primary()) {
    maybe_eval_stray(dnl->get_inode());
  }
}
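
// Expire client dentry leases whose ttl has passed, one lease pool at a
// time; each pool's list is scanned from the front, and the scan stops at
// the first lease that has not yet expired.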
void MDCache::trim_client_leases()
{
  utime_t now = ceph_clock_now();

  dout(10) << "trim_client_leases" << dendl;

  for (int pool=0; pool<client_lease_pools; pool++) {
    int before = client_leases[pool].size();
    if (client_leases[pool].empty())
      continue;

    while (!client_leases[pool].empty()) {
      ClientLease *r = client_leases[pool].front();
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    int after = client_leases[pool].size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
  }
}
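
// Sample process memory, log cache/caps statistics, ask clients to release
// state when we exceed mds_cache_size, and return freed heap pages to the
// OS (tcmalloc builds only) once the cache is back under its limit.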
void MDCache::check_memory_usage()
{
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  static MemoryModel::snap baseline = last;

  // check client caps
  assert(CInode::count() == inode_map.size());
  float caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (float)Capability::count() / (float)CInode::count();

  dout(2) << "check_memory_usage"
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", buffers " << (buffer::get_total_alloc() >> 10)
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
	  << dendl;

  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());

  if (num_inodes_with_caps > g_conf->mds_cache_size) {
    float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
    if (ratio < 1.0) {
      last_recall_state = ceph_clock_now();
      mds->server->recall_client_state(ratio);
    }
  }

  // If the cache size had exceeded its limit, but we're back in bounds
  // now, free any unused pool memory so that our memory usage isn't
  // permanently bloated.
  if (exceeded_size_limit
      && CInode::count() <=
        g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
    // Only do this once we are back in bounds: otherwise the releases would
    // slow down whatever process caused us to exceed bounds to begin with
    if (ceph_using_tcmalloc()) {
      dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
	      << dendl;
      ceph_heap_release_free_memory();
    }
    exceeded_size_limit = false;
  }
}
// =========================================================================================

class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};

void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf->get_val("debug_mds", &o, sizeof(old_val));
  g_conf->set_val("debug_mds", "10");
  g_conf->apply_changes(NULL);
  show_cache();
  g_conf->set_val("debug_mds", old_val);
  g_conf->apply_changes(NULL);
  mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;

  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}

void MDCache::shutdown_start()
{
  dout(2) << "shutdown_start" << dendl;

  if (g_conf->mds_shutdown_check)
    mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  //  g_conf->debug_mds = 10;
}
7513 bool MDCache::shutdown_pass()
7515 dout(7) << "shutdown_pass" << dendl
;
7517 if (mds
->is_stopped()) {
7518 dout(7) << " already shut down" << dendl
;
7525 if (!shutdown_export_strays()) {
7526 dout(7) << "waiting for strays to migrate" << dendl
;
7530 // drop our reference to our stray dir inode
7531 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7533 strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
7534 strays
[i
]->state_clear(CInode::STATE_STRAYPINNED
);
7535 strays
[i
]->put(CInode::PIN_STRAY
);
7536 strays
[i
]->put_stickydirs();
7542 dout(5) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7545 int num_auth_subtree
= 0;
7546 if (!subtrees
.empty() &&
7547 mds
->get_nodeid() != 0 &&
7548 migrator
->get_export_queue_size() == 0) {
7549 dout(7) << "looking for subtrees to export to mds0" << dendl
;
7551 for (map
<CDir
*, set
<CDir
*> >::iterator it
= subtrees
.begin();
7552 it
!= subtrees
.end();
7554 CDir
*dir
= it
->first
;
7555 if (dir
->get_inode()->is_mdsdir())
7557 if (dir
->is_auth()) {
7559 if (dir
->is_frozen() ||
7560 dir
->is_freezing() ||
7561 dir
->is_ambiguous_dir_auth() ||
7562 dir
->state_test(CDir::STATE_EXPORTING
))
7567 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7569 mds_rank_t dest
= dir
->get_inode()->authority().first
;
7570 if (dest
> 0 && !mds
->mdsmap
->is_active(dest
))
7572 dout(7) << "sending " << *dir
<< " back to mds." << dest
<< dendl
;
7573 migrator
->export_dir_nicely(dir
, dest
);
7577 if (num_auth_subtree
> 0) {
7578 dout(7) << "still have " << num_auth_subtree
<< " auth subtrees" << dendl
;
7583 // close out any sessions (and open files!) before we try to trim the log, etc.
7584 if (mds
->sessionmap
.have_unclosed_sessions()) {
7585 if (!mds
->server
->terminating_sessions
)
7586 mds
->server
->terminate_sessions();
7590 CDir
*mydir
= myin
? myin
->get_dirfrag(frag_t()) : NULL
;
7591 if (mydir
&& !mydir
->is_subtree_root())
7594 // subtrees map not empty yet?
7595 if (subtrees
.size() > (mydir
? 1 : 0)) {
7596 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl
;
7598 migrator
->show_importing();
7599 migrator
->show_exporting();
7600 if (!migrator
->is_importing() && !migrator
->is_exporting())
7604 assert(!migrator
->is_exporting());
7605 assert(!migrator
->is_importing());
7608 // flush what we can from the log
7609 mds
->mdlog
->trim(0);
7610 if (mds
->mdlog
->get_num_segments() > 1) {
7611 dout(7) << "still >1 segments, waiting for log to trim" << dendl
;
7615 // (only do this once!)
7616 if (!mds
->mdlog
->is_capped()) {
7617 dout(7) << "capping the log" << dendl
;
7622 if (!mds
->mdlog
->empty()) {
7623 dout(7) << "waiting for log to flush.. " << mds
->mdlog
->get_num_events()
7624 << " in " << mds
->mdlog
->get_num_segments() << " segments" << dendl
;
7628 if (!did_shutdown_log_cap
) {
7629 // flush journal header
7630 dout(7) << "writing header for (now-empty) journal" << dendl
;
7631 assert(mds
->mdlog
->empty());
7632 mds
->mdlog
->write_head(0);
7633 // NOTE: filer active checker below will block us until this completes.
7634 did_shutdown_log_cap
= true;
7639 if (mds
->objecter
->is_active()) {
7640 dout(7) << "objecter still active" << dendl
;
7641 mds
->objecter
->dump_active();
7645 // trim what we can from the cache
7646 if (lru
.lru_get_size() > 0 || bottom_lru
.lru_get_size() > 0) {
7647 dout(7) << "there's still stuff in the cache: " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7653 // make mydir subtree go away
7655 if (mydir
->get_num_ref() > 1) { // subtree pin
7656 dout(7) << "there's still reference to mydir " << *mydir
<< dendl
;
7661 remove_subtree(mydir
);
7662 myin
->close_dirfrag(mydir
->get_frag());
7664 assert(subtrees
.empty());
7670 dout(2) << "shutdown done." << dendl
;
7674 bool MDCache::shutdown_export_strays()
7676 if (mds
->get_nodeid() == 0)
7679 dout(10) << "shutdown_export_strays" << dendl
;
7681 bool mds0_active
= mds
->mdsmap
->is_active(mds_rank_t(0));
7686 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7690 strays
[i
]->get_dirfrags(dfs
);
7693 for (std::list
<CDir
*>::iterator dfs_i
= dfs
.begin();
7694 dfs_i
!= dfs
.end(); ++dfs_i
)
7698 if (!dir
->is_complete()) {
7705 for (CDir::map_t::iterator p
= dir
->items
.begin();
7706 p
!= dir
->items
.end();
7708 CDentry
*dn
= p
->second
;
7709 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7716 if (dn
->state_test(CDentry::STATE_PURGING
)) {
7717 // Don't try to migrate anything that is actually
7718 // being purged right now
7722 if (shutdown_exported_strays
.count(dnl
->get_inode()->ino()) == 0) {
7723 shutdown_exported_strays
.insert(dnl
->get_inode()->ino());
7724 stray_manager
.migrate_stray(dn
, mds_rank_t(0)); // send to root!
7726 dout(10) << "already exporting " << *dn
<< dendl
;
// ========= messaging ==============

/* This function DOES put the passed message before returning */
void MDCache::dispatch(Message *m)
{
  switch (m->get_type()) {

  case MSG_MDS_RESOLVE:
    handle_resolve(static_cast<MMDSResolve*>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
    break;

  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
    break;

  case MSG_MDS_DISCOVER:
    handle_discover(static_cast<MDiscover*>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(static_cast<MDiscoverReply*>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(static_cast<MDirUpdate*>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(static_cast<MCacheExpire*>(m));
    break;

  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(static_cast<MDentryLink*>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
    break;

  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
    break;

  case MSG_MDS_FINDINO:
    handle_find_ino(static_cast<MMDSFindIno*>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(static_cast<MMDSFindInoReply*>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(static_cast<MMDSOpenIno*>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(static_cast<MMDSOpenInoReply*>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    assert(0 == "cache unknown message");
  }
}
MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
{
  if (mdr) {
    dout(20) << "_get_waiter retryrequest" << dendl;
    return new C_MDS_RetryRequest(this, mdr);
  } else if (req) {
    dout(20) << "_get_waiter retrymessage" << dendl;
    return new C_MDS_RetryMessage(mds, req);
  }
}
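/*
 * path_traverse -- walk 'path' starting at its base ino, recording the
 * dentries crossed in *pdnvec.  The 'onfail' mode decides what happens when
 * we are not auth for part of the path: MDS_TRAVERSE_DISCOVER and
 * MDS_TRAVERSE_DISCOVERXLOCK issue discovers to the auth MDS, while
 * MDS_TRAVERSE_FORWARD forwards the originating request there.
 */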
int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
			   const filepath& path,              // what
			   vector<CDentry*> *pdnvec,          // result
			   CInode **pin,
			   int onfail)
{
  bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
  bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
  bool forward = (onfail == MDS_TRAVERSE_FORWARD);

  assert(mdr || req || fin);
  assert(!forward || mdr || req);     // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (!cur) {
    if (MDS_INO_IS_MDSDIR(path.get_ino()))
      open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
    //ceph_abort();  // hrm.. broken
  }
  if (cur->state_test(CInode::STATE_PURGING))

  // make sure snaprealm are open...
  if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
      !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
  }

  unsigned depth = 0;
  while (depth < path.depth()) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
	     << "' snapid " << snapid << dendl;

    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
    }

    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
    }
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      mdr->snapid = snapid;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
	// parent dir frozen_dir?
	if (cur->is_frozen()) {
	  dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
	  cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
	}
	curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
	dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
	discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
		      null_okay);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
      }
    }

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    if (curdir->is_frozen()) {
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      if (onfinish) delete onfinish;
    }

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
	      << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;

    // null and last_bit and xlocked by me?
    if (dnl && dnl->is_null() && null_okay) {
      dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
      if (pdnvec)
	pdnvec->push_back(dn);
    }

    if (dn &&
	dn->lock.is_xlocked() &&
	dn->lock.get_xlock_by() != mdr &&
	!dn->lock.can_read(client) &&
	(dnl->is_null() || forward)) {
      dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
      if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
      mds->mdlog->flush();
    }

    // can we conclude ENOENT?
    if (dnl && dnl->is_null()) {
      if (dn->lock.can_read(client) ||
	  (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
	dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
	if (depth == path.depth() - 1)
	  pdnvec->push_back(dn);
	else
	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
      } else {
	dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
	dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
      }
    }

    if (dnl && !dnl->is_null()) {
      CInode *in = dnl->get_inode();

      // do we have inode?
      if (!in) {
	assert(dnl->is_remote());
	// do i have it?
	in = get_inode(dnl->get_remote_ino());
	if (in) {
	  dout(7) << "linking in remote in " << *in << dendl;
	  dn->link_remote(dnl, in);
	} else {
	  dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
	  assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
	  if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
	    dout(4) << "traverse: remote dentry points to damaged ino "
	  }
	  open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
			     (null_okay && depth == path.depth() - 1));
	  if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
	}
      }

      // make sure snaprealm are open...
      if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
	  !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
      }

      // add to trace, continue.
      if (pdnvec)
	pdnvec->push_back(dn);
    }

    // MISS.  dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
	  (snapid == CEPH_NOSNAP &&
	   curdir->has_bloom() &&
	   !curdir->is_in_bloom(path[depth]))) {
	// instantiate a null dn?
	if (depth < path.depth()-1) {
	  dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
	} else if (dn) {
	  ceph_abort(); // should have fallen out in ->is_null() check above
	} else if (curdir->is_frozen()) {
	  dout(20) << " not adding null to frozen dir " << dendl;
	} else if (snapid < CEPH_MAXSNAP) {
	  dout(20) << " not adding null for snapid " << snapid << dendl;
	} else {
	  // create a null dentry
	  dn = curdir->add_null_dentry(path[depth]);
	  dout(20) << " added null " << *dn << dendl;
	}
	if (dn)
	  pdnvec->push_back(dn);
	else
	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
      }

      // Check DamageTable for missing fragments before trying to fetch
      // them
      if (mds->damage_table.is_dirfrag_damaged(curdir)) {
	dout(4) << "traverse: damaged dirfrag " << *curdir
		<< ", blocking fetch" << dendl;
      }

      // directory isn't complete; reload
      dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
      curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
      if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
    } else {
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();

      if (forward &&
	  snapid && mdr && mdr->client_request &&
	  (int)depth < mdr->client_request->get_num_fwd()) {
	dout(7) << "traverse: snap " << snapid << " and depth " << depth
		<< " < fwd " << mdr->client_request->get_num_fwd()
		<< ", discovering instead of forwarding" << dendl;
      }

      if ((discover || null_okay)) {
	dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
	discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
		      null_okay);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
      }
      if (forward) {
	dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

	if (curdir->is_ambiguous_auth()) {
	  // wait
	  dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
	  curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
	}

	dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
	if (mdr)
	  request_forward(mdr, dauth.first);
	else
	  mds->forward_message_mds(req, dauth.first);

	if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
	assert(fin == NULL);
      }
    }

    ceph_abort(); // i shouldn't get here
  }

  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    assert(mdr->snapid == snapid);
}
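/*
 * cache_traverse -- walk a filepath using only inodes and dentries that are
 * already in the cache; unlike path_traverse this never discovers or fetches
 * anything.
 */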
CInode *MDCache::cache_traverse(const filepath& fp)
{
  dout(10) << "cache_traverse " << fp << dendl;

  CInode *in;
  in = get_inode(fp.get_ino());

  for (unsigned i = 0; i < fp.depth(); i++) {
    const string& dname = fp[i];
    frag_t fg = in->pick_dirfrag(dname);
    dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
    CDir *curdir = in->get_dirfrag(fg);

    CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);

    in = dn->get_linkage()->get_inode();
  }

  dout(10) << " got " << *in << dendl;
  return in;
}
/*
 * open_remote_dir -- open up a remote dirfrag
 *
 * @param diri base inode
 * @param approxfg approximate fragment.
 * @param fin completion callback
 */
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
{
  dout(10) << "open_remote_dir on " << *diri << dendl;
  assert(diri->is_dir());
  assert(!diri->is_auth());
  assert(diri->get_dirfrag(approxfg) == 0);

  mds_rank_t auth = diri->authority().first;

  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
    discover_dir_frag(diri, approxfg, fin);
  } else {
    // mds is down or recovering.  forge a replica!
    forge_replica_dir(diri, approxfg, auth);
    mds->queue_waiter(fin);
  }
}
/*
 * get_dentry_inode - get or open inode
 *
 * @param dn the dentry
 * @param mdr current request
 *
 * will return inode for primary, or link up/open up remote link's inode as necessary.
 * If it's not available right now, puts mdr on wait list and returns null.
 */
CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
{
  CDentry::linkage_t *dnl;
  if (projected)
    dnl = dn->get_projected_linkage();
  else
    dnl = dn->get_linkage();

  assert(!dnl->is_null());

  if (dnl->is_primary())
    return dnl->get_inode();

  assert(dnl->is_remote());
  CInode *in = get_inode(dnl->get_remote_ino());
  if (in) {
    dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
    dn->link_remote(dnl, in);
  } else {
    dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
    open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
  }
}
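// Completion context for open_remote_dentry(): once open_ino() has located
// the remote inode (or failed), hand the result to
// _open_remote_dentry_finish() and drop the pin taken on the dentry.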
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;
  inodeno_t ino;
  MDSInternalContextBase *onfinish;
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    dn->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
  }
};

void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
{
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
  open_ino(ino, pool,
	   new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
}
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
					 bool want_xlocked, int r)
{
  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
    dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
    dn->state_set(CDentry::STATE_BADREMOTEINO);

    std::string path;
    CDir *dir = dn->get_dir();
    dir->get_inode()->make_path_string(path);
    path = path + "/" + dn->get_name();

    bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
    if (fatal) {
      mds->damaged();
      ceph_abort();  // unreachable, damaged() respawns us
    }
  }
  fin->complete(r < 0 ? r : 0);
}
void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
{
  // empty trace if we're a base inode
  CInode *parent = in->get_parent_inode();
  make_trace(trace, parent);

  CDentry *dn = in->get_parent_dn();
  dout(15) << "make_trace adding " << *dn << dendl;
  trace.push_back(dn);
}
// -------------------------------------------------------------------------------
// Open inode by inode number
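// The open-by-ino machinery below locates an arbitrary inode by number:
// fetch its backtrace object, walk the recorded ancestors (fetching or
// discovering dirfrags as needed), and fall back to asking peer MDSs via
// MMDSOpenIno when the ancestry cannot be resolved locally.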
8303 class C_IO_MDC_OpenInoBacktraceFetched
: public MDCacheIOContext
{
8307 C_IO_MDC_OpenInoBacktraceFetched(MDCache
*c
, inodeno_t i
) :
8308 MDCacheIOContext(c
), ino(i
) {}
8309 void finish(int r
) override
{
8310 mdcache
->_open_ino_backtrace_fetched(ino
, bl
, r
);
8314 struct C_MDC_OpenInoTraverseDir
: public MDCacheContext
{
8319 C_MDC_OpenInoTraverseDir(MDCache
*c
, inodeno_t i
, MMDSOpenIno
*m
, bool p
) :
8320 MDCacheContext(c
), ino(i
), msg(m
), parent(p
) {}
8321 void finish(int r
) override
{
8322 if (r
< 0 && !parent
)
8325 mdcache
->handle_open_ino(msg
, r
);
8328 assert(mdcache
->opening_inodes
.count(ino
));
8329 mdcache
->_open_ino_traverse_dir(ino
, mdcache
->opening_inodes
[ino
], r
);
8333 struct C_MDC_OpenInoParentOpened
: public MDCacheContext
{
8336 C_MDC_OpenInoParentOpened(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
8337 void finish(int r
) override
{
8338 mdcache
->_open_ino_parent_opened(ino
, r
);
8342 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
)
8344 dout(10) << "_open_ino_backtrace_fetched ino " << ino
<< " errno " << err
<< dendl
;
8346 assert(opening_inodes
.count(ino
));
8347 open_ino_info_t
& info
= opening_inodes
[ino
];
8349 CInode
*in
= get_inode(ino
);
8351 dout(10) << " found cached " << *in
<< dendl
;
8352 open_ino_finish(ino
, info
, in
->authority().first
);
8356 inode_backtrace_t backtrace
;
8359 ::decode(backtrace
, bl
);
8360 } catch (const buffer::error
&decode_exc
) {
8361 derr
<< "corrupt backtrace on ino x0" << std::hex
<< ino
8362 << std::dec
<< ": " << decode_exc
<< dendl
;
8363 open_ino_finish(ino
, info
, -EIO
);
8366 if (backtrace
.pool
!= info
.pool
&& backtrace
.pool
!= -1) {
8367 dout(10) << " old object in pool " << info
.pool
8368 << ", retrying pool " << backtrace
.pool
<< dendl
;
8369 info
.pool
= backtrace
.pool
;
8370 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8371 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8372 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8373 new C_OnFinisher(fin
, mds
->finisher
));
8376 } else if (err
== -ENOENT
) {
8377 int64_t meta_pool
= mds
->mdsmap
->get_metadata_pool();
8378 if (info
.pool
!= meta_pool
) {
8379 dout(10) << " no object in pool " << info
.pool
8380 << ", retrying pool " << meta_pool
<< dendl
;
8381 info
.pool
= meta_pool
;
8382 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8383 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8384 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8385 new C_OnFinisher(fin
, mds
->finisher
));
8388 err
= 0; // backtrace.ancestors.empty() is checked below
8392 if (backtrace
.ancestors
.empty()) {
8393 dout(10) << " got empty backtrace " << dendl
;
8395 } else if (!info
.ancestors
.empty()) {
8396 if (info
.ancestors
[0] == backtrace
.ancestors
[0]) {
8397 dout(10) << " got same parents " << info
.ancestors
[0] << " 2 times" << dendl
;
8405 dout(0) << " failed to open ino " << ino
<< " err " << err
<< "/" << info
.last_err
<< dendl
;
8407 err
= info
.last_err
;
8408 open_ino_finish(ino
, info
, err
);
8412 dout(10) << " got backtrace " << backtrace
<< dendl
;
8413 info
.ancestors
= backtrace
.ancestors
;
8415 _open_ino_traverse_dir(ino
, info
, 0);
8418 void MDCache::_open_ino_parent_opened(inodeno_t ino
, int ret
)
8420 dout(10) << "_open_ino_parent_opened ino " << ino
<< " ret " << ret
<< dendl
;
8422 assert(opening_inodes
.count(ino
));
8423 open_ino_info_t
& info
= opening_inodes
[ino
];
8425 CInode
*in
= get_inode(ino
);
8427 dout(10) << " found cached " << *in
<< dendl
;
8428 open_ino_finish(ino
, info
, in
->authority().first
);
8432 if (ret
== mds
->get_nodeid()) {
8433 _open_ino_traverse_dir(ino
, info
, 0);
8436 mds_rank_t checked_rank
= mds_rank_t(ret
);
8437 info
.check_peers
= true;
8438 info
.auth_hint
= checked_rank
;
8439 info
.checked
.erase(checked_rank
);
8441 do_open_ino(ino
, info
, ret
);
8445 void MDCache::_open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8447 dout(10) << __func__
<< ": ino " << ino
<< " ret " << ret
<< dendl
;
8449 CInode
*in
= get_inode(ino
);
8451 dout(10) << " found cached " << *in
<< dendl
;
8452 open_ino_finish(ino
, info
, in
->authority().first
);
8457 do_open_ino(ino
, info
, ret
);
8461 mds_rank_t hint
= info
.auth_hint
;
8462 ret
= open_ino_traverse_dir(ino
, NULL
, info
.ancestors
,
8463 info
.discover
, info
.want_xlocked
, &hint
);
8466 if (hint
!= mds
->get_nodeid())
8467 info
.auth_hint
= hint
;
8468 do_open_ino(ino
, info
, ret
);
8471 void MDCache::_open_ino_fetch_dir(inodeno_t ino
, MMDSOpenIno
*m
, CDir
*dir
, bool parent
)
8473 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
8474 assert(dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()));
8475 dir
->fetch(new C_MDC_OpenInoTraverseDir(this, ino
, m
, parent
));
8478 int MDCache::open_ino_traverse_dir(inodeno_t ino
, MMDSOpenIno
*m
,
8479 vector
<inode_backpointer_t
>& ancestors
,
8480 bool discover
, bool want_xlocked
, mds_rank_t
*hint
)
8482 dout(10) << "open_ino_traverse_dir ino " << ino
<< " " << ancestors
<< dendl
;
8484 for (unsigned i
= 0; i
< ancestors
.size(); i
++) {
8485 CInode
*diri
= get_inode(ancestors
[i
].dirino
);
8488 if (discover
&& MDS_INO_IS_MDSDIR(ancestors
[i
].dirino
)) {
8489 open_foreign_mdsdir(ancestors
[i
].dirino
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8495 if (diri
->state_test(CInode::STATE_REJOINUNDEF
)) {
8496 CDir
*dir
= diri
->get_parent_dir();
8497 while (dir
->state_test(CDir::STATE_REJOINUNDEF
) &&
8498 dir
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
))
8499 dir
= dir
->get_inode()->get_parent_dir();
8500 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8504 if (!diri
->is_dir()) {
8505 dout(10) << " " << *diri
<< " is not dir" << dendl
;
8511 string
&name
= ancestors
[i
].dname
;
8512 frag_t fg
= diri
->pick_dirfrag(name
);
8513 CDir
*dir
= diri
->get_dirfrag(fg
);
8515 if (diri
->is_auth()) {
8516 if (diri
->is_frozen()) {
8517 dout(10) << " " << *diri
<< " is frozen, waiting " << dendl
;
8518 diri
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8521 dir
= diri
->get_or_open_dirfrag(this, fg
);
8522 } else if (discover
) {
8523 open_remote_dirfrag(diri
, fg
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8528 inodeno_t next_ino
= i
> 0 ? ancestors
[i
- 1].dirino
: ino
;
8529 CDentry
*dn
= dir
->lookup(name
);
8530 CDentry::linkage_t
*dnl
= dn
? dn
->get_linkage() : NULL
;
8531 if (dir
->is_auth()) {
8532 if (dnl
&& dnl
->is_primary() &&
8533 dnl
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
8534 dout(10) << " fetching undef " << *dnl
->get_inode() << dendl
;
8535 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8539 if (!dnl
&& !dir
->is_complete() &&
8540 (!dir
->has_bloom() || dir
->is_in_bloom(name
))) {
8541 dout(10) << " fetching incomplete " << *dir
<< dendl
;
8542 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8546 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8549 } else if (discover
) {
8551 filepath
path(name
, 0);
8552 discover_path(dir
, CEPH_NOSNAP
, path
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0),
8553 (i
== 0 && want_xlocked
));
8556 if (dnl
->is_null() && !dn
->lock
.can_read(-1)) {
8557 dout(10) << " null " << *dn
<< " is not readable, waiting" << dendl
;
8558 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8561 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8567 *hint
= dir
? dir
->authority().first
: diri
->authority().first
;
8573 void MDCache::open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8575 dout(10) << "open_ino_finish ino " << ino
<< " ret " << ret
<< dendl
;
8577 list
<MDSInternalContextBase
*> waiters
;
8578 waiters
.swap(info
.waiters
);
8579 opening_inodes
.erase(ino
);
8580 finish_contexts(g_ceph_context
, waiters
, ret
);
8583 void MDCache::do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
)
8585 if (err
< 0 && err
!= -EAGAIN
) {
8586 info
.checked
.clear();
8587 info
.checked
.insert(mds
->get_nodeid());
8588 info
.checking
= MDS_RANK_NONE
;
8589 info
.check_peers
= true;
8590 info
.fetch_backtrace
= true;
8591 if (info
.discover
) {
8592 info
.discover
= false;
8593 info
.ancestors
.clear();
8595 if (err
!= -ENOENT
&& err
!= -ENOTDIR
)
8596 info
.last_err
= err
;
8599 if (info
.check_peers
) {
8600 info
.check_peers
= false;
8601 info
.checking
= MDS_RANK_NONE
;
8602 do_open_ino_peer(ino
, info
);
8603 } else if (info
.fetch_backtrace
) {
8604 info
.check_peers
= true;
8605 info
.fetch_backtrace
= false;
8606 info
.checking
= mds
->get_nodeid();
8607 info
.checked
.clear();
8608 info
.checked
.insert(mds
->get_nodeid());
8609 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8610 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8611 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8612 new C_OnFinisher(fin
, mds
->finisher
));
8614 assert(!info
.ancestors
.empty());
8615 info
.checking
= mds
->get_nodeid();
8616 open_ino(info
.ancestors
[0].dirino
, mds
->mdsmap
->get_metadata_pool(),
8617 new C_MDC_OpenInoParentOpened(this, ino
), info
.want_replica
);
8621 void MDCache::do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
)
8623 set
<mds_rank_t
> all
, active
;
8624 mds
->mdsmap
->get_mds_set(all
);
8625 mds
->mdsmap
->get_clientreplay_or_active_or_stopping_mds_set(active
);
8626 if (mds
->get_state() == MDSMap::STATE_REJOIN
)
8627 mds
->mdsmap
->get_mds_set(active
, MDSMap::STATE_REJOIN
);
8629 dout(10) << "do_open_ino_peer " << ino
<< " active " << active
8630 << " all " << all
<< " checked " << info
.checked
<< dendl
;
8632 mds_rank_t peer
= MDS_RANK_NONE
;
8633 if (info
.auth_hint
>= 0) {
8634 if (active
.count(info
.auth_hint
)) {
8635 peer
= info
.auth_hint
;
8636 info
.auth_hint
= MDS_RANK_NONE
;
8639 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
8640 if (*p
!= mds
->get_nodeid() && info
.checked
.count(*p
) == 0) {
8646 if (all
.size() > active
.size() && all
!= info
.checked
) {
8647 dout(10) << " waiting for more peers to be active" << dendl
;
8649 dout(10) << " all MDS peers have been checked " << dendl
;
8650 do_open_ino(ino
, info
, 0);
8653 info
.checking
= peer
;
8654 vector
<inode_backpointer_t
> *pa
= NULL
;
8655 // got backtrace from peer or backtrace just fetched
8656 if (info
.discover
|| !info
.fetch_backtrace
)
8657 pa
= &info
.ancestors
;
8658 mds
->send_message_mds(new MMDSOpenIno(info
.tid
, ino
, pa
), peer
);
8662 void MDCache::handle_open_ino(MMDSOpenIno
*m
, int err
)
8664 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
8665 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
8670 dout(10) << "handle_open_ino " << *m
<< " err " << err
<< dendl
;
8672 inodeno_t ino
= m
->ino
;
8673 MMDSOpenInoReply
*reply
;
8674 CInode
*in
= get_inode(ino
);
8676 dout(10) << " have " << *in
<< dendl
;
8677 reply
= new MMDSOpenInoReply(m
->get_tid(), ino
, mds_rank_t(0));
8678 if (in
->is_auth()) {
8681 CDentry
*pdn
= in
->get_parent_dn();
8684 CInode
*diri
= pdn
->get_dir()->get_inode();
8685 reply
->ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->name
,
8686 in
->inode
.version
));
8690 reply
->hint
= in
->authority().first
;
8692 } else if (err
< 0) {
8693 reply
= new MMDSOpenInoReply(m
->get_tid(), ino
, MDS_RANK_NONE
, err
);
8695 mds_rank_t hint
= MDS_RANK_NONE
;
8696 int ret
= open_ino_traverse_dir(ino
, m
, m
->ancestors
, false, false, &hint
);
8699 reply
= new MMDSOpenInoReply(m
->get_tid(), ino
, hint
, ret
);
8701 m
->get_connection()->send_message(reply
);
8705 void MDCache::handle_open_ino_reply(MMDSOpenInoReply
*m
)
8707 dout(10) << "handle_open_ino_reply " << *m
<< dendl
;
8709 inodeno_t ino
= m
->ino
;
8710 mds_rank_t from
= mds_rank_t(m
->get_source().num());
8711 auto it
= opening_inodes
.find(ino
);
8712 if (it
!= opening_inodes
.end() && it
->second
.checking
== from
) {
8713 open_ino_info_t
& info
= it
->second
;
8714 info
.checking
= MDS_RANK_NONE
;
8715 info
.checked
.insert(from
);
8717 CInode
*in
= get_inode(ino
);
8719 dout(10) << " found cached " << *in
<< dendl
;
8720 open_ino_finish(ino
, info
, in
->authority().first
);
8721 } else if (!m
->ancestors
.empty()) {
8722 dout(10) << " found ino " << ino
<< " on mds." << from
<< dendl
;
8723 if (!info
.want_replica
) {
8724 open_ino_finish(ino
, info
, from
);
8729 info
.ancestors
= m
->ancestors
;
8730 info
.auth_hint
= from
;
8731 info
.checking
= mds
->get_nodeid();
8732 info
.discover
= true;
8733 _open_ino_traverse_dir(ino
, info
, 0);
8734 } else if (m
->error
) {
8735 dout(10) << " error " << m
->error
<< " from mds." << from
<< dendl
;
8736 do_open_ino(ino
, info
, m
->error
);
8738 if (m
->hint
>= 0 && m
->hint
!= mds
->get_nodeid()) {
8739 info
.auth_hint
= m
->hint
;
8740 info
.checked
.erase(m
->hint
);
8742 do_open_ino_peer(ino
, info
);
8748 void MDCache::kick_open_ino_peers(mds_rank_t who
)
8750 dout(10) << "kick_open_ino_peers mds." << who
<< dendl
;
8752 for (map
<inodeno_t
, open_ino_info_t
>::iterator p
= opening_inodes
.begin();
8753 p
!= opening_inodes
.end();
8755 open_ino_info_t
& info
= p
->second
;
8756 if (info
.checking
== who
) {
8757 dout(10) << " kicking ino " << p
->first
<< " who was checking mds." << who
<< dendl
;
8758 info
.checking
= MDS_RANK_NONE
;
8759 do_open_ino_peer(p
->first
, info
);
8760 } else if (info
.checking
== MDS_RANK_NONE
) {
8761 dout(10) << " kicking ino " << p
->first
<< " who was waiting" << dendl
;
8762 do_open_ino_peer(p
->first
, info
);
8767 void MDCache::open_ino(inodeno_t ino
, int64_t pool
, MDSInternalContextBase
* fin
,
8768 bool want_replica
, bool want_xlocked
)
8770 dout(10) << "open_ino " << ino
<< " pool " << pool
<< " want_replica "
8771 << want_replica
<< dendl
;
8773 if (opening_inodes
.count(ino
)) {
8774 open_ino_info_t
& info
= opening_inodes
[ino
];
8776 info
.want_replica
= true;
8777 if (want_xlocked
&& !info
.want_xlocked
) {
8778 if (!info
.ancestors
.empty()) {
8779 CInode
*diri
= get_inode(info
.ancestors
[0].dirino
);
8781 frag_t fg
= diri
->pick_dirfrag(info
.ancestors
[0].dname
);
8782 CDir
*dir
= diri
->get_dirfrag(fg
);
8783 if (dir
&& !dir
->is_auth()) {
8784 filepath
path(info
.ancestors
[0].dname
, 0);
8785 discover_path(dir
, CEPH_NOSNAP
, path
, NULL
, true);
8789 info
.want_xlocked
= true;
8792 info
.waiters
.push_back(fin
);
8794 open_ino_info_t
& info
= opening_inodes
[ino
];
8795 info
.checked
.insert(mds
->get_nodeid());
8796 info
.want_replica
= want_replica
;
8797 info
.want_xlocked
= want_xlocked
;
8798 info
.tid
= ++open_ino_last_tid
;
8799 info
.pool
= pool
>= 0 ? pool
: default_file_layout
.pool_id
;
8800 info
.waiters
.push_back(fin
);
8801 do_open_ino(ino
, info
, 0);
/* ---------------------------- */

/*
 * search for a given inode on MDS peers.  optionally start with the given node.

  - recover from mds node failure, recovery
 */
void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  assert(!have_inode(ino));

  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];

  fip.checked.insert(mds->get_nodeid());
  _do_find_ino_peer(fip);
}

void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);

  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
	   << " active " << active << " all " << all
	   << " checked " << fip.checked
	   << dendl;

  mds_rank_t m = MDS_RANK_NONE;
  if (fip.hint >= 0) {
    m = fip.hint;
    fip.hint = MDS_RANK_NONE;
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() &&
	  fip.checked.count(*p) == 0) {
	m = *p;
	break;
      }
  }

  if (m == MDS_RANK_NONE) {
    if (all.size() > active.size()) {
      dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
    } else {
      dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
      fip.fin->complete(-ESTALE);
      find_ino_peer.erase(fip.tid);
    }
  } else {
    fip.checking = m;
    mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
  }
}
void MDCache::handle_find_ino(MMDSFindIno *m)
{
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
  }

  dout(10) << "handle_find_ino " << *m << dendl;
  MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
  CInode *in = get_inode(m->ino);
  if (in) {
    in->make_path(r->path);
    dout(10) << " have " << r->path << " " << *in << dendl;
  }
  m->get_connection()->send_message(r);
}
void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
{
  map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
  if (p != find_ino_peer.end()) {
    dout(10) << "handle_find_ino_reply " << *m << dendl;
    find_ino_peer_info_t& fip = p->second;

    if (get_inode(fip.ino)) {
      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
      mds->queue_waiter(fip.fin);
      find_ino_peer.erase(p);
    }

    mds_rank_t from = mds_rank_t(m->get_source().num());
    if (fip.checking == from)
      fip.checking = MDS_RANK_NONE;
    fip.checked.insert(from);

    if (!m->path.empty()) {
      vector<CDentry*> trace;
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
	      << ", retrying" << dendl;
      fip.checked.clear();
      _do_find_ino_peer(fip);
    } else {
      _do_find_ino_peer(fip);
    }
  } else {
    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
  }
}
void MDCache::kick_find_ino_peers(mds_rank_t who)
{
  // find_ino_peers requests we should move on from
  for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
       p != find_ino_peer.end();
       ++p) {
    find_ino_peer_info_t& fip = p->second;
    if (fip.checking == who) {
      dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
      fip.checking = MDS_RANK_NONE;
      _do_find_ino_peer(fip);
    } else if (fip.checking == MDS_RANK_NONE) {
      dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
      _do_find_ino_peer(fip);
    }
  }
}
/* ---------------------------- */
8949 int MDCache::get_num_client_requests()
8952 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
8953 p
!= active_requests
.end();
8955 MDRequestRef
& mdr
= p
->second
;
8956 if (mdr
->reqid
.name
.is_client() && !mdr
->is_slave())
8962 /* This function takes over the reference to the passed Message */
8963 MDRequestRef
MDCache::request_start(MClientRequest
*req
)
8965 // did we win a forward race against a slave?
8966 if (active_requests
.count(req
->get_reqid())) {
8967 MDRequestRef
& mdr
= active_requests
[req
->get_reqid()];
8969 if (mdr
->is_slave()) {
8970 dout(10) << "request_start already had " << *mdr
<< ", waiting for finish" << dendl
;
8971 mdr
->more()->waiting_for_finish
.push_back(new C_MDS_RetryMessage(mds
, req
));
8973 dout(10) << "request_start already processing " << *mdr
<< ", dropping new msg" << dendl
;
8976 return MDRequestRef();
8979 // register new client request
8980 MDRequestImpl::Params params
;
8981 params
.reqid
= req
->get_reqid();
8982 params
.attempt
= req
->get_num_fwd();
8983 params
.client_req
= req
;
8984 params
.initiated
= req
->get_recv_stamp();
8985 params
.throttled
= req
->get_throttle_stamp();
8986 params
.all_read
= req
->get_recv_complete_stamp();
8987 params
.dispatched
= req
->get_dispatch_stamp();
8990 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
>(params
);
8991 active_requests
[params
.reqid
] = mdr
;
8992 mdr
->set_op_stamp(req
->get_stamp());
8993 dout(7) << "request_start " << *mdr
<< dendl
;
8997 MDRequestRef
MDCache::request_start_slave(metareqid_t ri
, __u32 attempt
, Message
*m
)
8999 int by
= m
->get_source().num();
9000 MDRequestImpl::Params params
;
9002 params
.attempt
= attempt
;
9003 params
.triggering_slave_req
= m
;
9004 params
.slave_to
= by
;
9005 params
.initiated
= m
->get_recv_stamp();
9006 params
.throttled
= m
->get_throttle_stamp();
9007 params
.all_read
= m
->get_recv_complete_stamp();
9008 params
.dispatched
= m
->get_dispatch_stamp();
9010 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
>(params
);
9011 assert(active_requests
.count(mdr
->reqid
) == 0);
9012 active_requests
[mdr
->reqid
] = mdr
;
9013 dout(7) << "request_start_slave " << *mdr
<< " by mds." << by
<< dendl
;
9017 MDRequestRef
MDCache::request_start_internal(int op
)
9019 MDRequestImpl::Params params
;
9020 params
.reqid
.name
= entity_name_t::MDS(mds
->get_nodeid());
9021 params
.reqid
.tid
= mds
->issue_tid();
9022 params
.initiated
= ceph_clock_now();
9023 params
.internal_op
= op
;
9025 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
>(params
);
9027 assert(active_requests
.count(mdr
->reqid
) == 0);
9028 active_requests
[mdr
->reqid
] = mdr
;
9029 dout(7) << "request_start_internal " << *mdr
<< " op " << op
<< dendl
;
9033 MDRequestRef
MDCache::request_get(metareqid_t rid
)
9035 ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.find(rid
);
9036 assert(p
!= active_requests
.end());
9037 dout(7) << "request_get " << rid
<< " " << *p
->second
<< dendl
;
9041 void MDCache::request_finish(MDRequestRef
& mdr
)
9043 dout(7) << "request_finish " << *mdr
<< dendl
;
9044 mdr
->mark_event("finishing request");
9047 if (mdr
->has_more() && mdr
->more()->slave_commit
) {
9048 Context
*fin
= mdr
->more()->slave_commit
;
9049 mdr
->more()->slave_commit
= 0;
9052 mdr
->aborted
= false;
9054 mdr
->more()->slave_rolling_back
= true;
9057 mdr
->committing
= true;
9059 fin
->complete(ret
); // this must re-call request_finish.
9063 request_cleanup(mdr
);
9067 void MDCache::request_forward(MDRequestRef
& mdr
, mds_rank_t who
, int port
)
9069 mdr
->mark_event("forwarding request");
9070 if (mdr
->client_request
&& mdr
->client_request
->get_source().is_client()) {
9071 dout(7) << "request_forward " << *mdr
<< " to mds." << who
<< " req "
9072 << *mdr
->client_request
<< dendl
;
9073 mds
->forward_message_mds(mdr
->client_request
, who
);
9074 mdr
->client_request
= 0;
9075 if (mds
->logger
) mds
->logger
->inc(l_mds_forward
);
9076 } else if (mdr
->internal_op
>= 0) {
9077 dout(10) << "request_forward on internal op; cancelling" << dendl
;
9078 mdr
->internal_op_finish
->complete(-EXDEV
);
9080 dout(7) << "request_forward drop " << *mdr
<< " req " << *mdr
->client_request
9081 << " was from mds" << dendl
;
9083 request_cleanup(mdr
);
9087 void MDCache::dispatch_request(MDRequestRef
& mdr
)
9089 if (mdr
->client_request
) {
9090 mds
->server
->dispatch_client_request(mdr
);
9091 } else if (mdr
->slave_request
) {
9092 mds
->server
->dispatch_slave_request(mdr
);
9094 switch (mdr
->internal_op
) {
9095 case CEPH_MDS_OP_FRAGMENTDIR
:
9096 dispatch_fragment_dir(mdr
);
9098 case CEPH_MDS_OP_EXPORTDIR
:
9099 migrator
->dispatch_export_dir(mdr
, 0);
9101 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9102 enqueue_scrub_work(mdr
);
9104 case CEPH_MDS_OP_FLUSH
:
9105 flush_dentry_work(mdr
);
9107 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9108 repair_dirfrag_stats_work(mdr
);
9110 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9111 repair_inode_stats_work(mdr
);
9120 void MDCache::request_drop_foreign_locks(MDRequestRef
& mdr
)
9122 if (!mdr
->has_more())
9126 // (will implicitly drop remote dn pins)
9127 for (set
<mds_rank_t
>::iterator p
= mdr
->more()->slaves
.begin();
9128 p
!= mdr
->more()->slaves
.end();
9130 MMDSSlaveRequest
*r
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
9131 MMDSSlaveRequest::OP_FINISH
);
9133 if (mdr
->killed
&& !mdr
->committing
) {
9135 } else if (mdr
->more()->srcdn_auth_mds
== *p
&&
9136 mdr
->more()->inode_import
.length() > 0) {
9137 // information about rename imported caps
9138 r
->inode_export
.claim(mdr
->more()->inode_import
);
9141 mds
->send_message_mds(r
, *p
);
9144 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9145 * implicitly. Note that we don't call the finishers -- there shouldn't
9146 * be any on a remote lock and the request finish wakes up all
9147 * the waiters anyway! */
9148 set
<SimpleLock
*>::iterator p
= mdr
->xlocks
.begin();
9149 while (p
!= mdr
->xlocks
.end()) {
9150 if ((*p
)->get_parent()->is_auth())
9153 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9154 << " on " << *(*p
)->get_parent() << dendl
;
9156 mdr
->locks
.erase(*p
);
9157 mdr
->xlocks
.erase(p
++);
9161 map
<SimpleLock
*, mds_rank_t
>::iterator q
= mdr
->remote_wrlocks
.begin();
9162 while (q
!= mdr
->remote_wrlocks
.end()) {
9163 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q
->first
9164 << " on mds." << q
->second
9165 << " on " << *(q
->first
)->get_parent() << dendl
;
9166 mdr
->locks
.erase(q
->first
);
9167 mdr
->remote_wrlocks
.erase(q
++);
9170 mdr
->more()->slaves
.clear(); /* we no longer have requests out to them, and
9171 * leaving them in can cause double-notifies as
9172 * this function can get called more than once */
9175 void MDCache::request_drop_non_rdlocks(MDRequestRef
& mdr
)
9177 request_drop_foreign_locks(mdr
);
9178 mds
->locker
->drop_non_rdlocks(mdr
.get());
9181 void MDCache::request_drop_locks(MDRequestRef
& mdr
)
9183 request_drop_foreign_locks(mdr
);
9184 mds
->locker
->drop_locks(mdr
.get());
9187 void MDCache::request_cleanup(MDRequestRef
& mdr
)
9189 dout(15) << "request_cleanup " << *mdr
<< dendl
;
9191 if (mdr
->has_more()) {
9192 if (mdr
->more()->is_ambiguous_auth
)
9193 mdr
->clear_ambiguous_auth();
9194 if (!mdr
->more()->waiting_for_finish
.empty())
9195 mds
->queue_waiters(mdr
->more()->waiting_for_finish
);
9198 request_drop_locks(mdr
);
9200 // drop (local) auth pins
9201 mdr
->drop_local_auth_pins();
9204 for (set
<CInode
*>::iterator p
= mdr
->stickydirs
.begin();
9205 p
!= mdr
->stickydirs
.end();
9207 (*p
)->put_stickydirs();
9209 mds
->locker
->kick_cap_releases(mdr
);
9214 // remove from session
9215 mdr
->item_session_request
.remove_myself();
9218 active_requests
.erase(mdr
->reqid
);
9223 mdr
->mark_event("cleaned up request");
9226 void MDCache::request_kill(MDRequestRef
& mdr
)
9228 // rollback slave requests is tricky. just let the request proceed.
9229 if (mdr
->done_locking
&& mdr
->has_more() &&
9230 (!mdr
->more()->witnessed
.empty() || !mdr
->more()->waiting_on_slave
.empty())) {
9231 dout(10) << "request_kill " << *mdr
<< " -- already started slave requests, no-op" << dendl
;
9233 assert(mdr
->used_prealloc_ino
== 0);
9234 assert(mdr
->prealloc_inos
.empty());
9236 mdr
->session
= NULL
;
9237 mdr
->item_session_request
.remove_myself();
9242 mdr
->mark_event("killing request");
9244 if (mdr
->committing
) {
9245 dout(10) << "request_kill " << *mdr
<< " -- already committing, no-op" << dendl
;
9247 dout(10) << "request_kill " << *mdr
<< dendl
;
9248 request_cleanup(mdr
);
9252 // -------------------------------------------------------------------------------
9255 struct C_MDC_snaprealm_create_finish
: public MDCacheLogContext
{
9259 C_MDC_snaprealm_create_finish(MDCache
*c
, MDRequestRef
& m
,
9260 MutationRef
& mu
, CInode
*i
) :
9261 MDCacheLogContext(c
), mdr(m
), mut(mu
), in(i
) {}
9262 void finish(int r
) override
{
9263 mdcache
->_snaprealm_create_finish(mdr
, mut
, in
);
9267 void MDCache::snaprealm_create(MDRequestRef
& mdr
, CInode
*in
)
9269 dout(10) << "snaprealm_create " << *in
<< dendl
;
9270 assert(!in
->snaprealm
);
9273 if (!mdr
->more()->stid
) {
9274 mds
->snapclient
->prepare_create_realm(in
->ino(), &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
9275 new C_MDS_RetryRequest(this, mdr
));
9279 MutationRef
mut(new MutationImpl());
9280 mut
->ls
= mds
->mdlog
->get_current_segment();
9281 EUpdate
*le
= new EUpdate(mds
->mdlog
, "snaprealm_create");
9282 mds
->mdlog
->start_entry(le
);
9284 le
->metablob
.add_table_transaction(TABLE_SNAP
, mdr
->more()->stid
);
9286 inode_t
*pi
= in
->project_inode();
9287 pi
->version
= in
->pre_dirty();
9288 pi
->rstat
.rsnaprealms
++;
9290 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
9294 sr_t
*newsnap
= in
->project_snaprealm(seq
);
9296 newsnap
->last_created
= seq
;
9298 predirty_journal_parents(mut
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
9299 journal_cow_inode(mut
, &le
->metablob
, in
);
9300 le
->metablob
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
9302 mds
->server
->submit_mdlog_entry(le
,
9303 new C_MDC_snaprealm_create_finish(this, mdr
,
9306 mds
->mdlog
->flush();
9310 void MDCache::do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool nosend
)
9312 dout(10) << "do_realm_invalidate_and_update_notify " << *in
->snaprealm
<< " " << *in
<< dendl
;
9314 vector
<inodeno_t
> split_inos
;
9315 vector
<inodeno_t
> split_realms
;
9317 if (snapop
== CEPH_SNAP_OP_SPLIT
) {
9318 // notify clients of update|split
9319 for (elist
<CInode
*>::iterator p
= in
->snaprealm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
9321 split_inos
.push_back((*p
)->ino());
9323 for (set
<SnapRealm
*>::iterator p
= in
->snaprealm
->open_children
.begin();
9324 p
!= in
->snaprealm
->open_children
.end();
9326 split_realms
.push_back((*p
)->inode
->ino());
9330 in
->snaprealm
->build_snap_trace(snapbl
);
9332 set
<SnapRealm
*> past_children
;
9333 map
<client_t
, MClientSnap
*> updates
;
9335 q
.push_back(in
->snaprealm
);
9336 while (!q
.empty()) {
9337 SnapRealm
*realm
= q
.front();
9340 dout(10) << " realm " << *realm
<< " on " << *realm
->inode
<< dendl
;
9341 realm
->invalidate_cached_snaps();
9343 for (map
<client_t
, xlist
<Capability
*>* >::iterator p
= realm
->client_caps
.begin();
9344 p
!= realm
->client_caps
.end();
9346 assert(!p
->second
->empty());
9347 if (!nosend
&& updates
.count(p
->first
) == 0) {
9348 MClientSnap
*update
= new MClientSnap(snapop
);
9349 update
->head
.split
= in
->ino();
9350 update
->split_inos
= split_inos
;
9351 update
->split_realms
= split_realms
;
9352 update
->bl
= snapbl
;
9353 updates
[p
->first
] = update
;
9357 if (snapop
== CEPH_SNAP_OP_UPDATE
|| snapop
== CEPH_SNAP_OP_DESTROY
) {
9358 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9359 p
!= realm
->open_past_children
.end();
9361 past_children
.insert(*p
);
9364 // notify for active children, too.
9365 dout(10) << " " << realm
<< " open_children are " << realm
->open_children
<< dendl
;
9366 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9367 p
!= realm
->open_children
.end();
9373 send_snaps(updates
);
9375 // notify past children and their descendants if we update/delete old snapshots
9376 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9377 p
!= past_children
.end();
9381 while (!q
.empty()) {
9382 SnapRealm
*realm
= q
.front();
9385 realm
->invalidate_cached_snaps();
9387 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9388 p
!= realm
->open_children
.end();
9390 if (past_children
.count(*p
) == 0)
9394 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9395 p
!= realm
->open_past_children
.end();
9397 if (past_children
.count(*p
) == 0) {
9399 past_children
.insert(*p
);
9404 if (snapop
== CEPH_SNAP_OP_DESTROY
) {
9405 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9406 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9407 p
!= past_children
.end();
9409 maybe_eval_stray((*p
)->inode
, true);
9413 void MDCache::_snaprealm_create_finish(MDRequestRef
& mdr
, MutationRef
& mut
, CInode
*in
)
9415 dout(10) << "_snaprealm_create_finish " << *in
<< dendl
;
9418 in
->pop_and_dirty_projected_inode(mut
->ls
);
9420 mds
->locker
->drop_locks(mut
.get());
9423 // tell table we've committed
9424 mds
->snapclient
->commit(mdr
->more()->stid
, mut
->ls
);
9427 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
9431 in
->open_snaprealm();
9432 in
->snaprealm
->srnode
.seq
= seq
;
9433 in
->snaprealm
->srnode
.created
= seq
;
9434 bool ok
= in
->snaprealm
->_open_parents(NULL
);
9437 do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
);
9440 static int count = 5;
9442 ceph_abort(); // hack test test **********
9446 mdr
->more()->stid
= 0; // caller will likely need to reuse this
9447 dispatch_request(mdr
);
9451 // -------------------------------------------------------------------------------
9454 struct C_MDC_RetryScanStray
: public MDCacheContext
{
9456 C_MDC_RetryScanStray(MDCache
*c
, dirfrag_t n
) : MDCacheContext(c
), next(n
) { }
9457 void finish(int r
) override
{
9458 mdcache
->scan_stray_dir(next
);
9462 void MDCache::scan_stray_dir(dirfrag_t next
)
9464 dout(10) << "scan_stray_dir " << next
<< dendl
;
9467 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
9468 if (strays
[i
]->ino() < next
.ino
)
9470 strays
[i
]->get_dirfrags(ls
);
9473 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
9475 if (dir
->dirfrag() < next
)
9477 if (!dir
->is_complete()) {
9478 dir
->fetch(new C_MDC_RetryScanStray(this, dir
->dirfrag()));
9481 for (CDir::map_t::iterator q
= dir
->items
.begin(); q
!= dir
->items
.end(); ++q
) {
9482 CDentry
*dn
= q
->second
;
9483 dn
->state_set(CDentry::STATE_STRAY
);
9484 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
9485 if (dnl
->is_primary()) {
9486 CInode
*in
= dnl
->get_inode();
9487 if (in
->inode
.nlink
== 0)
9488 in
->state_set(CInode::STATE_ORPHAN
);
9489 maybe_eval_stray(in
);
9495 void MDCache::fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
)
9497 object_t oid
= CInode::get_object_name(ino
, frag_t(), "");
9498 mds
->objecter
->getxattr(oid
, object_locator_t(pool
), "parent", CEPH_NOSNAP
, &bl
, 0, fin
);
9505 // ========================================================================================
9509 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9510 to the parent metadata object in the cache (pinning it).
9512 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9516 void MDCache::_send_discover(discover_info_t
& d
)
9518 MDiscover
*dis
= new MDiscover(d
.ino
, d
.frag
, d
.snap
, d
.want_path
,
9519 d
.want_base_dir
, d
.want_xlocked
);
9520 dis
->set_tid(d
.tid
);
9521 mds
->send_message_mds(dis
, d
.mds
);
9524 void MDCache::discover_base_ino(inodeno_t want_ino
,
9525 MDSInternalContextBase
*onfinish
,
9528 dout(7) << "discover_base_ino " << want_ino
<< " from mds." << from
<< dendl
;
9529 if (waiting_for_base_ino
[from
].count(want_ino
) == 0) {
9530 discover_info_t
& d
= _create_discover(from
);
9534 waiting_for_base_ino
[from
][want_ino
].push_back(onfinish
);
9538 void MDCache::discover_dir_frag(CInode
*base
,
9540 MDSInternalContextBase
*onfinish
,
9544 from
= base
->authority().first
;
9546 dirfrag_t
df(base
->ino(), approx_fg
);
9547 dout(7) << "discover_dir_frag " << df
9548 << " from mds." << from
<< dendl
;
9550 if (!base
->is_waiting_for_dir(approx_fg
) || !onfinish
) {
9551 discover_info_t
& d
= _create_discover(from
);
9553 d
.ino
= base
->ino();
9555 d
.want_base_dir
= true;
9560 base
->add_dir_waiter(approx_fg
, onfinish
);
9563 struct C_MDC_RetryDiscoverPath
: public MDCacheContext
{
9568 C_MDC_RetryDiscoverPath(MDCache
*c
, CInode
*b
, snapid_t s
, filepath
&p
, mds_rank_t f
) :
9569 MDCacheContext(c
), base(b
), snapid(s
), path(p
), from(f
) {}
9570 void finish(int r
) override
{
9571 mdcache
->discover_path(base
, snapid
, path
, 0, from
);
9575 void MDCache::discover_path(CInode
*base
,
9578 MDSInternalContextBase
*onfinish
,
9583 from
= base
->authority().first
;
9585 dout(7) << "discover_path " << base
->ino() << " " << want_path
<< " snap " << snap
<< " from mds." << from
9586 << (want_xlocked
? " want_xlocked":"")
9589 if (base
->is_ambiguous_auth()) {
9590 dout(10) << " waiting for single auth on " << *base
<< dendl
;
9592 onfinish
= new C_MDC_RetryDiscoverPath(this, base
, snap
, want_path
, from
);
9593 base
->add_waiter(CInode::WAIT_SINGLEAUTH
, onfinish
);
9595 } else if (from
== mds
->get_nodeid()) {
9596 list
<MDSInternalContextBase
*> finished
;
9597 base
->take_waiting(CInode::WAIT_DIR
, finished
);
9598 mds
->queue_waiters(finished
);
9602 frag_t fg
= base
->pick_dirfrag(want_path
[0]);
9603 if ((want_xlocked
&& want_path
.depth() == 1) ||
9604 !base
->is_waiting_for_dir(fg
) || !onfinish
) {
9605 discover_info_t
& d
= _create_discover(from
);
9606 d
.ino
= base
->ino();
9610 d
.want_path
= want_path
;
9611 d
.want_base_dir
= true;
9612 d
.want_xlocked
= want_xlocked
;
9618 base
->add_dir_waiter(fg
, onfinish
);
9621 struct C_MDC_RetryDiscoverPath2
: public MDCacheContext
{
9625 C_MDC_RetryDiscoverPath2(MDCache
*c
, CDir
*b
, snapid_t s
, filepath
&p
) :
9626 MDCacheContext(c
), base(b
), snapid(s
), path(p
) {}
9627 void finish(int r
) override
{
9628 mdcache
->discover_path(base
, snapid
, path
, 0);
9632 void MDCache::discover_path(CDir
*base
,
9635 MDSInternalContextBase
*onfinish
,
9638 mds_rank_t from
= base
->authority().first
;
9640 dout(7) << "discover_path " << base
->dirfrag() << " " << want_path
<< " snap " << snap
<< " from mds." << from
9641 << (want_xlocked
? " want_xlocked":"")
9644 if (base
->is_ambiguous_auth()) {
9645 dout(7) << " waiting for single auth on " << *base
<< dendl
;
9647 onfinish
= new C_MDC_RetryDiscoverPath2(this, base
, snap
, want_path
);
9648 base
->add_waiter(CDir::WAIT_SINGLEAUTH
, onfinish
);
9650 } else if (from
== mds
->get_nodeid()) {
9651 list
<MDSInternalContextBase
*> finished
;
9652 base
->take_sub_waiting(finished
);
9653 mds
->queue_waiters(finished
);
9657 if ((want_xlocked
&& want_path
.depth() == 1) ||
9658 !base
->is_waiting_for_dentry(want_path
[0].c_str(), snap
) || !onfinish
) {
9659 discover_info_t
& d
= _create_discover(from
);
9660 d
.ino
= base
->ino();
9661 d
.pin_base(base
->inode
);
9662 d
.frag
= base
->get_frag();
9664 d
.want_path
= want_path
;
9665 d
.want_base_dir
= false;
9666 d
.want_xlocked
= want_xlocked
;
9672 base
->add_dentry_waiter(want_path
[0], snap
, onfinish
);
9675 void MDCache::kick_discovers(mds_rank_t who
)
9677 for (map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.begin();
9678 p
!= discovers
.end();
9680 if (p
->second
.mds
!= who
)
9682 _send_discover(p
->second
);
9687 /* This function DOES put the passed message before returning */
9688 void MDCache::handle_discover(MDiscover
*dis
)
9690 mds_rank_t whoami
= mds
->get_nodeid();
9691 mds_rank_t from
= mds_rank_t(dis
->get_source().num());
9693 assert(from
!= whoami
);
9695 if (mds
->get_state() <= MDSMap::STATE_REJOIN
) {
9696 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
9697 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
9702 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
9703 // delay processing request from survivor because we may not yet choose lock states.
9704 if (!mds
->mdsmap
->is_rejoin(from
)) {
9705 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl
;
9706 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, dis
));
9713 MDiscoverReply
*reply
= new MDiscoverReply(dis
);
9715 snapid_t snapid
= dis
->get_snapid();
9718 if (MDS_INO_IS_BASE(dis
->get_base_ino()) &&
9719 !dis
->wants_base_dir() && dis
->get_want().depth() == 0) {
9721 dout(7) << "handle_discover from mds." << from
9722 << " wants base + " << dis
->get_want().get_path()
9723 << " snap " << snapid
9726 cur
= get_inode(dis
->get_base_ino());
9730 reply
->starts_with
= MDiscoverReply::INODE
;
9731 replicate_inode(cur
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
9732 dout(10) << "added base " << *cur
<< dendl
;
9735 // there's a base inode
9736 cur
= get_inode(dis
->get_base_ino(), snapid
);
9737 if (!cur
&& snapid
!= CEPH_NOSNAP
) {
9738 cur
= get_inode(dis
->get_base_ino());
9739 if (cur
&& !cur
->is_multiversion())
9740 cur
= NULL
; // nope!
9744 dout(7) << "handle_discover mds." << from
9745 << " don't have base ino " << dis
->get_base_ino() << "." << snapid
9747 if (!dis
->wants_base_dir() && dis
->get_want().depth() > 0)
9748 reply
->set_error_dentry(dis
->get_dentry(0));
9749 reply
->set_flag_error_dir();
9750 } else if (dis
->wants_base_dir()) {
9751 dout(7) << "handle_discover mds." << from
9752 << " wants basedir+" << dis
->get_want().get_path()
9756 dout(7) << "handle_discover mds." << from
9757 << " wants " << dis
->get_want().get_path()
9766 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9767 for (unsigned i
= 0;
9768 cur
&& (i
< dis
->get_want().depth() || dis
->get_want().depth() == 0);
9771 // -- figure out the dir
9773 // is *cur even a dir at all?
9774 if (!cur
->is_dir()) {
9775 dout(7) << *cur
<< " not a dir" << dendl
;
9776 reply
->set_flag_error_dir();
9782 if (dis
->get_want().depth()) {
9784 fg
= cur
->pick_dirfrag(dis
->get_dentry(i
));
9786 // requester explicity specified the frag
9787 assert(dis
->wants_base_dir() || MDS_INO_IS_BASE(dis
->get_base_ino()));
9788 fg
= dis
->get_base_dir_frag();
9789 if (!cur
->dirfragtree
.is_leaf(fg
))
9790 fg
= cur
->dirfragtree
[fg
.value()];
9792 CDir
*curdir
= cur
->get_dirfrag(fg
);
9794 if ((!curdir
&& !cur
->is_auth()) ||
9795 (curdir
&& !curdir
->is_auth())) {
9798 * ONLY set flag if empty!!
9799 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9800 * resulting in duplicate discovers in flight,
9801 * which can wreak havoc when discovering rename srcdn (which may move)
9804 if (reply
->is_empty()) {
9805 // only hint if empty.
9806 // someday this could be better, but right now the waiter logic isn't smart enough.
9810 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir
<< dendl
;
9811 reply
->set_dir_auth_hint(curdir
->authority().first
);
9813 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9815 reply
->set_dir_auth_hint(cur
->authority().first
);
9818 // note error dentry, if any
9819 // NOTE: important, as it allows requester to issue an equivalent discover
9820 // to whomever we hint at.
9821 if (dis
->get_want().depth() > i
)
9822 reply
->set_error_dentry(dis
->get_dentry(i
));
9828 if (!curdir
) { // open dir?
9829 if (cur
->is_frozen()) {
9830 if (!reply
->is_empty()) {
9831 dout(7) << *cur
<< " is frozen, non-empty reply, stopping" << dendl
;
9834 dout(7) << *cur
<< " is frozen, empty reply, waiting" << dendl
;
9835 cur
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
9839 curdir
= cur
->get_or_open_dirfrag(this, fg
);
9840 } else if (curdir
->is_frozen_tree() ||
9841 (curdir
->is_frozen_dir() && fragment_are_all_frozen(curdir
))) {
9842 if (!reply
->is_empty()) {
9843 dout(7) << *curdir
<< " is frozen, non-empty reply, stopping" << dendl
;
9846 if (dis
->wants_base_dir() && dis
->get_base_dir_frag() != curdir
->get_frag()) {
9847 dout(7) << *curdir
<< " is frozen, dirfrag mismatch, stopping" << dendl
;
9848 reply
->set_flag_error_dir();
9851 dout(7) << *curdir
<< " is frozen, empty reply, waiting" << dendl
;
9852 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
9858 if (curdir
->get_version() == 0) {
9859 // fetch newly opened dir
9860 } else if (reply
->is_empty() && !dis
->wants_base_dir()) {
9861 dout(7) << "handle_discover not adding unwanted base dir " << *curdir
<< dendl
;
9862 // make sure the base frag is correct, though, in case there was a refragment since the
9863 // original request was sent.
9864 reply
->set_base_dir_frag(curdir
->get_frag());
9866 assert(!curdir
->is_ambiguous_auth()); // would be frozen.
9867 if (!reply
->trace
.length())
9868 reply
->starts_with
= MDiscoverReply::DIR;
9869 replicate_dir(curdir
, from
, reply
->trace
);
9870 dout(7) << "handle_discover added dir " << *curdir
<< dendl
;
9875 if (curdir
->get_version() == 0) {
9876 // fetch newly opened dir
9877 assert(!curdir
->has_bloom());
9878 } else if (dis
->get_want().depth() > 0) {
9880 dn
= curdir
->lookup(dis
->get_dentry(i
), snapid
);
9886 if (!curdir
->is_complete() &&
9887 (!curdir
->has_bloom() || curdir
->is_in_bloom(dis
->get_dentry(i
)))) {
9889 dout(7) << "incomplete dir contents for " << *curdir
<< ", fetching" << dendl
;
9890 if (reply
->is_empty()) {
9892 curdir
->fetch(new C_MDS_RetryMessage(mds
, dis
),
9893 dis
->wants_base_dir() && curdir
->get_version() == 0);
9897 // initiate fetch, but send what we have so far
9904 dout(7) << "dentry " << dis
->get_dentry(i
) << " dne, returning null in "
9905 << *curdir
<< dendl
;
9906 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
));
9910 // don't add replica to purging dentry/inode
9911 if (dn
->state_test(CDentry::STATE_PURGING
)) {
9912 if (reply
->is_empty())
9913 reply
->set_flag_error_dn(dis
->get_dentry(i
));
9917 CDentry::linkage_t
*dnl
= dn
->get_linkage();
9920 // ...always block on non-tail items (they are unrelated)
9921 // ...allow xlocked tail discovery _only_ if explicitly requested
9922 bool tailitem
= (dis
->get_want().depth() == 0) || (i
== dis
->get_want().depth() - 1);
9923 if (dn
->lock
.is_xlocked()) {
9924 // is this the last (tail) item in the discover traversal?
9925 if (tailitem
&& dis
->wants_xlocked()) {
9926 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn
<< dendl
;
9927 } else if (reply
->is_empty()) {
9928 dout(7) << "handle_discover blocking on xlocked " << *dn
<< dendl
;
9929 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryMessage(mds
, dis
));
9933 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn
<< dendl
;
9939 if (dnl
->is_primary() && dnl
->get_inode()->is_frozen_inode()) {
9940 if (tailitem
&& dis
->wants_xlocked()) {
9941 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
9942 } else if (reply
->is_empty()) {
9943 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
9944 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
9948 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
9954 if (!reply
->trace
.length())
9955 reply
->starts_with
= MDiscoverReply::DENTRY
;
9956 replicate_dentry(dn
, from
, reply
->trace
);
9957 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
9959 if (!dnl
->is_primary()) break; // stop on null or remote link.
9962 CInode
*next
= dnl
->get_inode();
9963 assert(next
->is_auth());
9965 replicate_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
9966 dout(7) << "handle_discover added inode " << *next
<< dendl
;
9968 // descend, keep going.
9974 assert(!reply
->is_empty());
9975 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
9976 mds
->send_message(reply
, dis
->get_connection());
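/*
 * Illustrative sketch, not part of the original source: the reply assembled
 * above is a flat bufferlist "trace" of repeating (dir, dentry, inode) items,
 * with reply->starts_with recording which item type comes first.  Roughly,
 * the peer decodes it like this (mirroring handle_discover_reply below):
 *
 *   bufferlist::iterator p = reply->trace.begin();
 *   int next = reply->starts_with;              // INODE, DIR or DENTRY
 *   while (!p.end()) {
 *     if (next == MDiscoverReply::DIR)
 *       curdir = add_replica_dir(p, cur, from, finished);
 *     CDentry *dn = add_replica_dentry(p, curdir, finished);
 *     cur = add_replica_inode(p, dn, finished);
 *     next = MDiscoverReply::DIR;
 *   }
 */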
/* This function DOES put the passed message before returning */
void MDCache::handle_discover_reply(MDiscoverReply *m)
{
9985 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
9986 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
9991 dout(7) << "discover_reply " << *m
<< dendl
;
9992 if (m
->is_flag_error_dir())
9993 dout(7) << " flag error, dir" << dendl
;
9994 if (m
->is_flag_error_dn())
9995 dout(7) << " flag error, dentry = " << m
->get_error_dentry() << dendl
;
9997 list
<MDSInternalContextBase
*> finished
, error
;
9998 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10001 CInode
*cur
= get_inode(m
->get_base_ino());
10002 bufferlist::iterator p
= m
->trace
.begin();
10004 int next
= m
->starts_with
;
10006 // decrement discover counters
10007 if (m
->get_tid()) {
10008 map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.find(m
->get_tid());
10009 if (p
!= discovers
.end()) {
10010 dout(10) << " found tid " << m
->get_tid() << dendl
;
10011 discovers
.erase(p
);
10013 dout(10) << " tid " << m
->get_tid() << " not found, must be dup reply" << dendl
;
10017 // discover may start with an inode
10018 if (!p
.end() && next
== MDiscoverReply::INODE
) {
10019 cur
= add_replica_inode(p
, NULL
, finished
);
10020 dout(7) << "discover_reply got base inode " << *cur
<< dendl
;
10021 assert(cur
->is_base());
10023 next
= MDiscoverReply::DIR;
10026 if (cur
->is_base() &&
10027 waiting_for_base_ino
[from
].count(cur
->ino())) {
10028 finished
.swap(waiting_for_base_ino
[from
][cur
->ino()]);
10029 waiting_for_base_ino
[from
].erase(cur
->ino());
10034 // loop over discover results.
10035 // indexes follow each ([[dir] dentry] inode)
10036 // can start, end with any type.
10041 if (next
== MDiscoverReply::DIR) {
10042 curdir
= add_replica_dir(p
, cur
, mds_rank_t(m
->get_source().num()), finished
);
10043 if (cur
->ino() == m
->get_base_ino() && curdir
->get_frag() != m
->get_base_dir_frag()) {
10044 assert(m
->get_wanted_base_dir());
10045 cur
->take_dir_waiting(m
->get_base_dir_frag(), finished
);
10048 // note: this can only happen on our first pass around this loop.
10049 if (p
.end() && m
->is_flag_error_dn()) {
10050 fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10051 curdir
= cur
->get_dirfrag(fg
);
10053 curdir
= cur
->get_dirfrag(m
->get_base_dir_frag());
10060 CDentry
*dn
= add_replica_dentry(p
, curdir
, finished
);
10066 cur
= add_replica_inode(p
, dn
, finished
);
10068 next
= MDiscoverReply::DIR;
10072 // or dir_auth hint?
10073 if (m
->is_flag_error_dir() && !cur
->is_dir()) {
10075 cur
->take_waiting(CInode::WAIT_DIR
, error
);
10076 } else if (m
->is_flag_error_dir() || m
->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN
) {
10077 mds_rank_t who
= m
->get_dir_auth_hint();
10078 if (who
== mds
->get_nodeid()) who
= -1;
10080 dout(7) << " dir_auth_hint is " << m
->get_dir_auth_hint() << dendl
;
10083 if (m
->get_wanted_base_dir()) {
10084 frag_t fg
= m
->get_base_dir_frag();
10085 CDir
*dir
= cur
->get_dirfrag(fg
);
10087 if (cur
->is_waiting_for_dir(fg
)) {
10088 if (cur
->is_auth())
10089 cur
->take_waiting(CInode::WAIT_DIR
, finished
);
10090 else if (dir
|| !cur
->dirfragtree
.is_leaf(fg
))
10091 cur
->take_dir_waiting(fg
, finished
);
10093 discover_dir_frag(cur
, fg
, 0, who
);
10095 dout(7) << " doing nothing, nobody is waiting for dir" << dendl
;
10099 if (m
->get_error_dentry().length()) {
10100 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10101 CDir
*dir
= cur
->get_dirfrag(fg
);
10103 if (dir
&& dir
->is_waiting_for_dentry(m
->get_error_dentry(), m
->get_wanted_snapid())) {
10104 if (dir
->is_auth() || dir
->lookup(m
->get_error_dentry())) {
10105 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10106 m
->get_wanted_snapid(), finished
);
10108 filepath
relpath(m
->get_error_dentry(), 0);
10109 discover_path(dir
, m
->get_wanted_snapid(), relpath
, 0, m
->get_wanted_xlocked());
10112 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10113 << m
->get_error_dentry() << dendl
;
10115 } else if (m
->is_flag_error_dn()) {
10116 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10117 CDir
*dir
= cur
->get_dirfrag(fg
);
10119 if (dir
->is_auth()) {
10120 dir
->take_sub_waiting(finished
);
10122 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10123 m
->get_wanted_snapid(), error
);
10129 finish_contexts(g_ceph_context
, error
, -ENOENT
); // finish errors directly
10130 mds
->queue_waiters(finished
);
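/*
 * Illustrative sketch (assumption, not original source): the waiter pattern
 * used throughout this function.  Callers that could not complete a lookup
 * park a context on the object they are waiting for; when a discover reply
 * fills in the missing replica, those contexts are collected into 'finished'
 * and re-queued, while waiters that can never succeed are completed with an
 * error code immediately:
 *
 *   list<MDSInternalContextBase*> finished, error;
 *   dir->take_dentry_waiting(dname, snapid, snapid, finished); // will retry
 *   finish_contexts(g_ceph_context, error, -ENOENT);           // hard failure
 *   mds->queue_waiters(finished);                              // retry later
 */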
10138 // ----------------------------
10141 CDir
*MDCache::add_replica_dir(bufferlist::iterator
& p
, CInode
*diri
, mds_rank_t from
,
10142 list
<MDSInternalContextBase
*>& finished
)
10147 assert(diri
->ino() == df
.ino
);
10149 // add it (_replica_)
10150 CDir
*dir
= diri
->get_dirfrag(df
.frag
);
10153 // had replica. update w/ new nonce.
10154 dir
->decode_replica(p
);
10155 dout(7) << "add_replica_dir had " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10157 // force frag to leaf in the diri tree
10158 if (!diri
->dirfragtree
.is_leaf(df
.frag
)) {
10159 dout(7) << "add_replica_dir forcing frag " << df
.frag
<< " to leaf in the fragtree "
10160 << diri
->dirfragtree
<< dendl
;
10161 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, df
.frag
);
10165 dir
= diri
->add_dirfrag( new CDir(diri
, df
.frag
, this, false) );
10166 dir
->decode_replica(p
);
10168 // is this a dir_auth delegation boundary?
10169 if (from
!= diri
->authority().first
||
10170 diri
->is_ambiguous_auth() ||
10172 adjust_subtree_auth(dir
, from
);
10174 dout(7) << "add_replica_dir added " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10177 diri
->take_dir_waiting(df
.frag
, finished
);
CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, mds_rank_t from)
{
  assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN);

  // forge a replica.
  CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) );

  // i'm assuming this is a subtree root.
  adjust_subtree_auth(dir, from);

  dout(7) << "forge_replica_dir added " << *dir << " while mds." << from << " is down" << dendl;

  return dir;
}
10198 CDentry
*MDCache::add_replica_dentry(bufferlist::iterator
& p
, CDir
*dir
, list
<MDSInternalContextBase
*>& finished
)
10205 CDentry
*dn
= dir
->lookup(name
, last
);
10209 dn
->decode_replica(p
, false);
10210 dout(7) << "add_replica_dentry had " << *dn
<< dendl
;
10212 dn
= dir
->add_null_dentry(name
, 1 /* this will get updated below */, last
);
10213 dn
->decode_replica(p
, true);
10214 dout(7) << "add_replica_dentry added " << *dn
<< dendl
;
10217 dir
->take_dentry_waiting(name
, dn
->first
, dn
->last
, finished
);
10222 CInode
*MDCache::add_replica_inode(bufferlist::iterator
& p
, CDentry
*dn
, list
<MDSInternalContextBase
*>& finished
)
10228 CInode
*in
= get_inode(ino
, last
);
10230 in
= new CInode(this, false, 1, last
);
10231 in
->decode_replica(p
, true);
10233 if (in
->ino() == MDS_INO_ROOT
)
10234 in
->inode_auth
.first
= 0;
10235 else if (in
->is_mdsdir())
10236 in
->inode_auth
.first
= in
->ino() - MDS_INO_MDSDIR_OFFSET
;
10237 dout(10) << "add_replica_inode added " << *in
<< dendl
;
10239 assert(dn
->get_linkage()->is_null());
10240 dn
->dir
->link_primary_inode(dn
, in
);
10243 in
->decode_replica(p
, false);
10244 dout(10) << "add_replica_inode had " << *in
<< dendl
;
10248 if (!dn
->get_linkage()->is_primary() || dn
->get_linkage()->get_inode() != in
)
10249 dout(10) << "add_replica_inode different linkage in dentry " << *dn
<< dendl
;
void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist &bl)
{
  uint64_t features = mds->mdsmap->get_up_features();
  replicate_inode(get_myin(), who, bl, features);
  replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  replicate_inode(straydn->get_dir()->inode, who, bl, features);
  replicate_dir(straydn->get_dir(), who, bl);
  replicate_dentry(straydn, who, bl);
}
CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
{
  list<MDSInternalContextBase*> finished;
  bufferlist::iterator p = bl.begin();

  CInode *mdsin = add_replica_inode(p, NULL, finished);
  CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
  CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
  CInode *strayin = add_replica_inode(p, straydirdn, finished);
  CDir *straydir = add_replica_dir(p, strayin, from, finished);
  CDentry *straydn = add_replica_dentry(p, straydir, finished);
  if (!finished.empty())
    mds->queue_waiters(finished);

  return straydn;
}
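/*
 * Illustrative note (not part of the original source): replicate_stray() and
 * add_replica_stray() must stay in lock step.  The encoder writes six items
 * in a fixed order and the decoder consumes them in exactly the same order,
 * so any change on one side has to be mirrored on the other:
 *
 *   encode: myin inode, mdsdir dirfrag, straydir dentry,
 *           straydir inode, straydir dirfrag, stray dentry
 *   decode: add_replica_inode, add_replica_dir, add_replica_dentry,
 *           add_replica_inode, add_replica_dir, add_replica_dentry
 */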
10285 int MDCache::send_dir_updates(CDir
*dir
, bool bcast
)
10287 // this is an FYI, re: replication
10289 set
<mds_rank_t
> who
;
10291 mds
->get_mds_map()->get_active_mds_set(who
);
10293 for (compact_map
<mds_rank_t
,unsigned>::iterator p
= dir
->replicas_begin();
10294 p
!= dir
->replicas_end();
10296 who
.insert(p
->first
);
10299 dout(7) << "sending dir_update on " << *dir
<< " bcast " << bcast
<< " to " << who
<< dendl
;
10302 dir
->inode
->make_path(path
);
10304 mds_rank_t whoami
= mds
->get_nodeid();
10305 for (set
<mds_rank_t
>::iterator it
= who
.begin();
10308 if (*it
== whoami
) continue;
10309 //if (*it == except) continue;
10310 dout(7) << "sending dir_update on " << *dir
<< " to " << *it
<< dendl
;
10312 mds
->send_message_mds(new MDirUpdate(mds
->get_nodeid(),
10324 /* This function DOES put the passed message before returning */
10325 void MDCache::handle_dir_update(MDirUpdate
*m
)
10327 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10329 dout(5) << "dir_update on " << m
->get_dirfrag() << ", don't have it" << dendl
;
10332 if (m
->should_discover()) {
10334 // this is key to avoid a fragtree update race, among other things.
10335 m
->tried_discover();
10336 vector
<CDentry
*> trace
;
10338 filepath path
= m
->get_path();
10339 dout(5) << "trying discover on dir_update for " << path
<< dendl
;
10340 MDRequestRef null_ref
;
10341 int r
= path_traverse(null_ref
, m
, NULL
, path
, &trace
, &in
, MDS_TRAVERSE_DISCOVER
);
10345 open_remote_dirfrag(in
, m
->get_dirfrag().frag
,
10346 new C_MDS_RetryMessage(mds
, m
));
10355 dout(5) << "dir_update on " << *dir
<< dendl
;
10356 dir
->dir_rep
= m
->get_dir_rep();
10357 dir
->dir_rep_by
= m
->get_dir_rep_by();
10369 void MDCache::send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
)
10371 dout(7) << "send_dentry_link " << *dn
<< dendl
;
10373 CDir
*subtree
= get_subtree_root(dn
->get_dir());
10374 for (compact_map
<mds_rank_t
,unsigned>::iterator p
= dn
->replicas_begin();
10375 p
!= dn
->replicas_end();
10377 // don't tell (rename) witnesses; they already know
10378 if (mdr
.get() && mdr
->more()->witnessed
.count(p
->first
))
10380 if (mds
->mdsmap
->get_state(p
->first
) < MDSMap::STATE_REJOIN
||
10381 (mds
->mdsmap
->get_state(p
->first
) == MDSMap::STATE_REJOIN
&&
10382 rejoin_gather
.count(p
->first
)))
10384 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10385 MDentryLink
*m
= new MDentryLink(subtree
->dirfrag(), dn
->get_dir()->dirfrag(),
10386 dn
->name
, dnl
->is_primary());
10387 if (dnl
->is_primary()) {
10388 dout(10) << " primary " << *dnl
->get_inode() << dendl
;
10389 replicate_inode(dnl
->get_inode(), p
->first
, m
->bl
,
10390 mds
->mdsmap
->get_up_features());
10391 } else if (dnl
->is_remote()) {
10392 inodeno_t ino
= dnl
->get_remote_ino();
10393 __u8 d_type
= dnl
->get_remote_d_type();
10394 dout(10) << " remote " << ino
<< " " << d_type
<< dendl
;
10395 ::encode(ino
, m
->bl
);
10396 ::encode(d_type
, m
->bl
);
10398 ceph_abort(); // aie, bad caller!
10399 mds
->send_message_mds(m
, p
->first
);
10403 /* This function DOES put the passed message before returning */
10404 void MDCache::handle_dentry_link(MDentryLink
*m
)
10407 CDentry
*dn
= NULL
;
10408 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10410 dout(7) << "handle_dentry_link don't have dirfrag " << m
->get_dirfrag() << dendl
;
10412 dn
= dir
->lookup(m
->get_dn());
10414 dout(7) << "handle_dentry_link don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
10416 dout(7) << "handle_dentry_link on " << *dn
<< dendl
;
10417 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10419 assert(!dn
->is_auth());
10420 assert(dnl
->is_null());
10424 bufferlist::iterator p
= m
->bl
.begin();
10425 list
<MDSInternalContextBase
*> finished
;
10427 if (m
->get_is_primary()) {
10429 add_replica_inode(p
, dn
, finished
);
10431 // remote link, easy enough.
10435 ::decode(d_type
, p
);
10436 dir
->link_remote_inode(dn
, ino
, d_type
);
10442 if (!finished
.empty())
10443 mds
->queue_waiters(finished
);
10452 void MDCache::send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
)
10454 dout(10) << "send_dentry_unlink " << *dn
<< dendl
;
10455 // share unlink news with replicas
10456 set
<mds_rank_t
> replicas
;
10457 dn
->list_replicas(replicas
);
10459 straydn
->list_replicas(replicas
);
10460 for (set
<mds_rank_t
>::iterator it
= replicas
.begin();
10461 it
!= replicas
.end();
10463 // don't tell (rmdir) witnesses; they already know
10464 if (mdr
.get() && mdr
->more()->witnessed
.count(*it
))
10467 if (mds
->mdsmap
->get_state(*it
) < MDSMap::STATE_REJOIN
||
10468 (mds
->mdsmap
->get_state(*it
) == MDSMap::STATE_REJOIN
&&
10469 rejoin_gather
.count(*it
)))
10472 MDentryUnlink
*unlink
= new MDentryUnlink(dn
->get_dir()->dirfrag(), dn
->name
);
10474 replicate_stray(straydn
, *it
, unlink
->straybl
);
10475 mds
->send_message_mds(unlink
, *it
);
10479 /* This function DOES put the passed message before returning */
10480 void MDCache::handle_dentry_unlink(MDentryUnlink
*m
)
10483 CDentry
*straydn
= NULL
;
10484 if (m
->straybl
.length())
10485 straydn
= add_replica_stray(m
->straybl
, mds_rank_t(m
->get_source().num()));
10487 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10489 dout(7) << "handle_dentry_unlink don't have dirfrag " << m
->get_dirfrag() << dendl
;
10491 CDentry
*dn
= dir
->lookup(m
->get_dn());
10493 dout(7) << "handle_dentry_unlink don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
10495 dout(7) << "handle_dentry_unlink on " << *dn
<< dendl
;
10496 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10499 if (dnl
->is_primary()) {
10500 CInode
*in
= dnl
->get_inode();
10501 dn
->dir
->unlink_inode(dn
);
10503 straydn
->dir
->link_primary_inode(straydn
, in
);
10505 // in->first is lazily updated on replica; drag it forward so
10506 // that we always keep it in sync with the dentry
10507 assert(straydn
->first
>= in
->first
);
10508 in
->first
= straydn
->first
;
10510 // update subtree map?
10512 adjust_subtree_after_rename(in
, dir
, false);
10514 // send caps to auth (if we're not already)
10515 if (in
->is_any_caps() &&
10516 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
10517 migrator
->export_caps(in
);
10522 assert(dnl
->is_remote());
10523 dn
->dir
->unlink_inode(dn
);
10525 assert(dnl
->is_null());
10529 // race with trim_dentry()
10531 assert(straydn
->get_num_ref() == 0);
10532 assert(straydn
->get_linkage()->is_null());
10533 map
<mds_rank_t
, MCacheExpire
*> expiremap
;
10534 trim_dentry(straydn
, expiremap
);
10535 send_expire_messages(expiremap
);
10547 // ===================================================================
10551 // ===================================================================
/**
 * adjust_dir_fragments -- adjust fragmentation for a directory
 *
 * @param diri directory inode
 * @param basefrag base fragment
 * @param bits bit adjustment. positive for split, negative for merge.
 */
void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
				   list<CDir*>& resultfrags,
				   list<MDSInternalContextBase*>& waiters,
				   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
	   << " on " << *diri << dendl;

  list<CDir*> srcfrags;
  diri->get_dirfrags_under(basefrag, srcfrags);

  adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
}
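/*
 * Illustrative example (assumption, not original source): what a split or
 * merge means in terms of frag_t values.  A fragment is a bit prefix of the
 * dentry hash space; splitting by N bits replaces one leaf with 2^N children,
 * merging does the reverse:
 *
 *   adjust_dir_fragments(diri, frag_t(), +2, out, waiters, false);
 *     // the root fragment becomes 4 = 2^2 leaves covering hash
 *     // prefixes 00, 01, 10 and 11
 *
 *   adjust_dir_fragments(diri, frag_t(), -2, out, waiters, false);
 *     // those 4 leaves collapse back into the single root fragment
 */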
10576 CDir
*MDCache::force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
)
10578 CDir
*dir
= diri
->get_dirfrag(fg
);
10582 dout(10) << "force_dir_fragment " << fg
<< " on " << *diri
<< dendl
;
10584 list
<CDir
*> src
, result
;
10585 list
<MDSInternalContextBase
*> waiters
;
10588 frag_t parent
= diri
->dirfragtree
.get_branch_or_leaf(fg
);
10590 CDir
*pdir
= diri
->get_dirfrag(parent
);
10592 int split
= fg
.bits() - parent
.bits();
10593 dout(10) << " splitting parent by " << split
<< " " << *pdir
<< dendl
;
10594 src
.push_back(pdir
);
10595 adjust_dir_fragments(diri
, src
, parent
, split
, result
, waiters
, replay
);
10596 dir
= diri
->get_dirfrag(fg
);
10598 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
10602 if (parent
== frag_t())
10604 frag_t last
= parent
;
10605 parent
= parent
.parent();
10606 dout(10) << " " << last
<< " parent is " << parent
<< dendl
;
10610 // hoover up things under fg?
10611 diri
->get_dirfrags_under(fg
, src
);
10613 dout(10) << "force_dir_fragment no frags under " << fg
<< dendl
;
10615 dout(10) << " will combine frags under " << fg
<< ": " << src
<< dendl
;
10616 adjust_dir_fragments(diri
, src
, fg
, 0, result
, waiters
, replay
);
10617 dir
= result
.front();
10618 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
10622 mds
->queue_waiters(waiters
);
10626 void MDCache::adjust_dir_fragments(CInode
*diri
,
10627 list
<CDir
*>& srcfrags
,
10628 frag_t basefrag
, int bits
,
10629 list
<CDir
*>& resultfrags
,
10630 list
<MDSInternalContextBase
*>& waiters
,
10633 dout(10) << "adjust_dir_fragments " << basefrag
<< " bits " << bits
10634 << " srcfrags " << srcfrags
10635 << " on " << *diri
<< dendl
;
10638 // yuck. we may have discovered the inode while it was being fragmented.
10639 if (!diri
->dirfragtree
.is_leaf(basefrag
))
10640 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, basefrag
);
10643 diri
->dirfragtree
.split(basefrag
, bits
);
10644 dout(10) << " new fragtree is " << diri
->dirfragtree
<< dendl
;
10646 if (srcfrags
.empty())
10650 CDir
*parent_dir
= diri
->get_parent_dir();
10651 CDir
*parent_subtree
= 0;
10653 parent_subtree
= get_subtree_root(parent_dir
);
10657 assert(srcfrags
.size() == 1);
10658 CDir
*dir
= srcfrags
.front();
10660 dir
->split(bits
, resultfrags
, waiters
, replay
);
10662 // did i change the subtree map?
10663 if (dir
->is_subtree_root()) {
10664 // new frags are now separate subtrees
10665 for (list
<CDir
*>::iterator p
= resultfrags
.begin();
10666 p
!= resultfrags
.end();
10668 subtrees
[*p
].clear(); // new frag is now its own subtree
10671 if (parent_subtree
) {
10672 assert(subtrees
[parent_subtree
].count(dir
));
10673 subtrees
[parent_subtree
].erase(dir
);
10674 for (list
<CDir
*>::iterator p
= resultfrags
.begin();
10675 p
!= resultfrags
.end();
10677 assert((*p
)->is_subtree_root());
10678 subtrees
[parent_subtree
].insert(*p
);
10682 // adjust my bounds.
10684 bounds
.swap(subtrees
[dir
]);
10685 subtrees
.erase(dir
);
10686 for (set
<CDir
*>::iterator p
= bounds
.begin();
10689 CDir
*frag
= get_subtree_root((*p
)->get_parent_dir());
10690 subtrees
[frag
].insert(*p
);
10695 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10696 dir
->dir_auth
= CDIR_AUTH_DEFAULT
;
10699 diri
->close_dirfrag(dir
->get_frag());
10704 // are my constituent bits subtrees? if so, i will be too.
10705 // (it's all or none, actually.)
10706 bool any_subtree
= false;
10707 for (CDir
*dir
: srcfrags
) {
10708 if (dir
->is_subtree_root()) {
10709 any_subtree
= true;
10713 set
<CDir
*> new_bounds
;
10715 for (CDir
*dir
: srcfrags
) {
10716 // this simplifies the code that finds subtrees underneath the dirfrag
10717 if (!dir
->is_subtree_root()) {
10718 dir
->state_set(CDir::STATE_AUXSUBTREE
);
10719 adjust_subtree_auth(dir
, mds
->get_nodeid());
10723 for (CDir
*dir
: srcfrags
) {
10724 assert(dir
->is_subtree_root());
10725 dout(10) << " taking srcfrag subtree bounds from " << *dir
<< dendl
;
10726 map
<CDir
*, set
<CDir
*> >::iterator q
= subtrees
.find(dir
);
10727 set
<CDir
*>::iterator r
= q
->second
.begin();
10728 while (r
!= subtrees
[dir
].end()) {
10729 new_bounds
.insert(*r
);
10730 subtrees
[dir
].erase(r
++);
10734 // remove myself as my parent's bound
10735 if (parent_subtree
)
10736 subtrees
[parent_subtree
].erase(dir
);
10741 CDir
*f
= new CDir(diri
, basefrag
, this, srcfrags
.front()->is_auth());
10742 f
->merge(srcfrags
, waiters
, replay
);
10745 assert(f
->is_subtree_root());
10746 subtrees
[f
].swap(new_bounds
);
10747 if (parent_subtree
)
10748 subtrees
[parent_subtree
].insert(f
);
10753 resultfrags
.push_back(f
);
class C_MDC_FragmentFrozen : public MDSInternalContext {
  MDCache *mdcache;
  MDRequestRef mdr;
public:
  C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
    MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_frozen(mdr, r);
  }
};
10769 bool MDCache::can_fragment(CInode
*diri
, list
<CDir
*>& dirs
)
10771 if (is_readonly()) {
10772 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl
;
10775 if (mds
->is_cluster_degraded()) {
10776 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl
;
10779 if (diri
->get_parent_dir() &&
10780 diri
->get_parent_dir()->get_inode()->is_stray()) {
10781 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl
;
10784 if (diri
->is_mdsdir() || diri
->is_stray() || diri
->ino() == MDS_INO_CEPH
) {
10785 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl
;
10789 if (diri
->scrub_is_in_progress()) {
10790 dout(7) << "can_fragment: scrub in progress" << dendl
;
10794 for (list
<CDir
*>::iterator p
= dirs
.begin(); p
!= dirs
.end(); ++p
) {
10796 if (dir
->state_test(CDir::STATE_FRAGMENTING
)) {
10797 dout(7) << "can_fragment: already fragmenting " << *dir
<< dendl
;
10800 if (!dir
->is_auth()) {
10801 dout(7) << "can_fragment: not auth on " << *dir
<< dendl
;
10804 if (dir
->is_bad()) {
10805 dout(7) << "can_fragment: bad dirfrag " << *dir
<< dendl
;
10808 if (dir
->is_frozen() ||
10809 dir
->is_freezing()) {
10810 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl
;
10818 void MDCache::split_dir(CDir
*dir
, int bits
)
10820 dout(7) << __func__
<< " " << *dir
<< " bits " << bits
<< dendl
;
10821 assert(dir
->is_auth());
10822 CInode
*diri
= dir
->inode
;
10825 dirs
.push_back(dir
);
10827 if (!can_fragment(diri
, dirs
)) {
10828 dout(7) << __func__
<< " cannot fragment right now, dropping" << dendl
;
10832 if (dir
->frag
.bits() + bits
> 24) {
10833 dout(7) << __func__
<< " frag bits > 24, dropping" << dendl
;
10837 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
10838 mdr
->more()->fragment_base
= dir
->dirfrag();
10840 assert(fragments
.count(dir
->dirfrag()) == 0);
10841 fragment_info_t
& info
= fragments
[dir
->dirfrag()];
10843 info
.dirs
.push_back(dir
);
10845 info
.last_cum_auth_pins_change
= ceph_clock_now();
10847 fragment_freeze_dirs(dirs
);
10848 // initial mark+complete pass
10849 fragment_mark_and_complete(mdr
);
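/*
 * Illustrative note (assumption, not original source): the "frag bits > 24"
 * check above bounds the depth of the fragment tree -- each split adds
 * 'bits' to the prefix length of every resulting fragment, and longer
 * prefixes are not representable here.  A split request then walks through
 * roughly these stages:
 *
 *   split_dir(dir, 3)
 *     -> can_fragment() sanity checks
 *     -> fragment_freeze_dirs() + fragment_mark_and_complete()
 *     -> fragment_frozen() -> dispatch_fragment_dir()
 *     -> EFragment OP_PREPARE journaled, dirfrags adjusted (1 -> 2^3 = 8)
 *     -> _fragment_logged() -> _fragment_stored() -> notify replicas
 *     -> EFragment OP_COMMIT, old objects removed, OP_FINISH
 */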
10852 void MDCache::merge_dir(CInode
*diri
, frag_t frag
)
10854 dout(7) << "merge_dir to " << frag
<< " on " << *diri
<< dendl
;
10857 if (!diri
->get_dirfrags_under(frag
, dirs
)) {
10858 dout(7) << "don't have all frags under " << frag
<< " for " << *diri
<< dendl
;
10862 if (diri
->dirfragtree
.is_leaf(frag
)) {
10863 dout(10) << " " << frag
<< " already a leaf for " << *diri
<< dendl
;
10867 if (!can_fragment(diri
, dirs
))
10870 CDir
*first
= dirs
.front();
10871 int bits
= first
->get_frag().bits() - frag
.bits();
10872 dout(10) << " we are merginb by " << bits
<< " bits" << dendl
;
10874 dirfrag_t
basedirfrag(diri
->ino(), frag
);
10875 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
10876 mdr
->more()->fragment_base
= basedirfrag
;
10878 assert(fragments
.count(basedirfrag
) == 0);
10879 fragment_info_t
& info
= fragments
[basedirfrag
];
10883 info
.last_cum_auth_pins_change
= ceph_clock_now();
10885 fragment_freeze_dirs(dirs
);
10886 // initial mark+complete pass
10887 fragment_mark_and_complete(mdr
);
void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
{
  for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
    CDir *dir = *p;
    dir->auth_pin(dir); // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    assert(dir->is_freezing_dir());
  }
}
class C_MDC_FragmentMarking : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_mark_and_complete(mdr);
  }
};
10910 void MDCache::fragment_mark_and_complete(MDRequestRef
& mdr
)
10912 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
10913 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
10914 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
10915 dout(7) << "fragment_mark_and_complete " << basedirfrag
<< " must have aborted" << dendl
;
10916 request_finish(mdr
);
10920 fragment_info_t
& info
= it
->second
;
10921 CInode
*diri
= info
.dirs
.front()->get_inode();
10922 dout(10) << "fragment_mark_and_complete " << info
.dirs
<< " on " << *diri
<< dendl
;
10924 MDSGatherBuilder
gather(g_ceph_context
);
10926 for (list
<CDir
*>::iterator p
= info
.dirs
.begin();
10927 p
!= info
.dirs
.end();
10932 if (!dir
->is_complete()) {
10933 dout(15) << " fetching incomplete " << *dir
<< dendl
;
10934 dir
->fetch(gather
.new_sub(), true); // ignore authpinnability
10936 } else if (dir
->get_frag() == frag_t()) {
10937 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
10938 // the operation. To avoid CDir::fetch() complaining about missing object,
10939 // we commit new dirfrag first.
10940 if (dir
->state_test(CDir::STATE_CREATING
)) {
10941 dout(15) << " waiting until new dir gets journaled " << *dir
<< dendl
;
10942 dir
->add_waiter(CDir::WAIT_CREATED
, gather
.new_sub());
10944 } else if (dir
->is_new()) {
10945 dout(15) << " committing new " << *dir
<< dendl
;
10946 assert(dir
->is_dirty());
10947 dir
->commit(0, gather
.new_sub(), true);
10954 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
10955 dout(15) << " marking " << *dir
<< dendl
;
10956 for (CDir::map_t::iterator p
= dir
->items
.begin();
10957 p
!= dir
->items
.end();
10959 CDentry
*dn
= p
->second
;
10960 dn
->get(CDentry::PIN_FRAGMENTING
);
10961 assert(!dn
->state_test(CDentry::STATE_FRAGMENTING
));
10962 dn
->state_set(CDentry::STATE_FRAGMENTING
);
10964 dir
->state_set(CDir::STATE_DNPINNEDFRAG
);
10965 dir
->auth_unpin(dir
);
10967 dout(15) << " already marked " << *dir
<< dendl
;
10970 if (gather
.has_subs()) {
10971 gather
.set_finisher(new C_MDC_FragmentMarking(this, mdr
));
10976 for (list
<CDir
*>::iterator p
= info
.dirs
.begin();
10977 p
!= info
.dirs
.end();
10980 if (!dir
->is_frozen_dir()) {
10981 assert(dir
->is_freezing_dir());
10982 dir
->add_waiter(CDir::WAIT_FROZEN
, gather
.new_sub());
10985 if (gather
.has_subs()) {
10986 gather
.set_finisher(new C_MDC_FragmentFrozen(this, mdr
));
10988 // flush log so that request auth_pins are retired
10989 mds
->mdlog
->flush();
10993 fragment_frozen(mdr
, 0);
10996 void MDCache::fragment_unmark_unfreeze_dirs(list
<CDir
*>& dirs
)
10998 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs
<< dendl
;
10999 for (list
<CDir
*>::iterator p
= dirs
.begin(); p
!= dirs
.end(); ++p
) {
11001 dout(10) << " frag " << *dir
<< dendl
;
11003 assert(dir
->state_test(CDir::STATE_FRAGMENTING
));
11004 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11006 if (dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11007 dir
->state_clear(CDir::STATE_DNPINNEDFRAG
);
11009 for (CDir::map_t::iterator p
= dir
->items
.begin();
11010 p
!= dir
->items
.end();
11012 CDentry
*dn
= p
->second
;
11013 assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11014 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11015 dn
->put(CDentry::PIN_FRAGMENTING
);
11018 dir
->auth_unpin(dir
);
11021 dir
->unfreeze_dir();
11025 bool MDCache::fragment_are_all_frozen(CDir
*dir
)
11027 assert(dir
->is_frozen_dir());
11028 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11029 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11030 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11032 if (p
->first
.frag
.contains(dir
->get_frag()))
11033 return p
->second
.all_frozen
;
11039 void MDCache::fragment_freeze_inc_num_waiters(CDir
*dir
)
11041 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11042 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11043 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11045 if (p
->first
.frag
.contains(dir
->get_frag())) {
11046 p
->second
.num_remote_waiters
++;
11053 void MDCache::find_stale_fragment_freeze()
11055 dout(10) << "find_stale_fragment_freeze" << dendl
;
11056 // see comment in Migrator::find_stale_export_freeze()
11057 utime_t now
= ceph_clock_now();
11058 utime_t cutoff
= now
;
11059 cutoff
-= g_conf
->mds_freeze_tree_timeout
;
11061 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
11062 p
!= fragments
.end(); ) {
11063 dirfrag_t df
= p
->first
;
11064 fragment_info_t
& info
= p
->second
;
11066 if (info
.all_frozen
)
11069 int total_auth_pins
= 0;
11070 for (list
<CDir
*>::iterator q
= info
.dirs
.begin();
11071 q
!= info
.dirs
.end();
11074 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11075 total_auth_pins
= -1;
11078 if (dir
->is_frozen_dir())
11080 total_auth_pins
+= dir
->get_auth_pins() + dir
->get_dir_auth_pins();
11082 if (total_auth_pins
< 0)
11084 if (info
.last_cum_auth_pins
!= total_auth_pins
) {
11085 info
.last_cum_auth_pins
= total_auth_pins
;
11086 info
.last_cum_auth_pins_change
= now
;
11089 if (info
.last_cum_auth_pins_change
>= cutoff
)
11091 dir
= info
.dirs
.front();
11092 if (info
.num_remote_waiters
> 0 ||
11093 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
11094 dout(10) << " cancel fragmenting " << df
<< " bit " << info
.bits
<< dendl
;
11096 info
.dirs
.swap(dirs
);
11097 fragments
.erase(df
);
11098 fragment_unmark_unfreeze_dirs(dirs
);
class C_MDC_FragmentPrep : public MDCacheLogContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_logged(mdr);
  }
};

class C_MDC_FragmentStore : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_stored(mdr);
  }
};

class C_MDC_FragmentCommit : public MDCacheLogContext {
  dirfrag_t basedirfrag;
  list<CDir*> resultfrags;
public:
  C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
    MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
  void finish(int r) override {
    mdcache->_fragment_committed(basedirfrag, resultfrags);
  }
};

class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
  dirfrag_t basedirfrag;
  list<CDir*> resultfrags;
public:
  C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
    MDCacheIOContext(m), basedirfrag(f) {
    resultfrags.swap(l);
  }
  void finish(int r) override {
    assert(r == 0 || r == -ENOENT);
    mdcache->_fragment_finish(basedirfrag, resultfrags);
  }
};
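/*
 * Illustrative summary (not part of the original source): how the four
 * completion contexts above chain the stages of a fragment operation:
 *
 *   journal EFragment OP_PREPARE  --C_MDC_FragmentPrep-->     _fragment_logged()
 *   commit new dirfrag objects    --C_MDC_FragmentStore-->    _fragment_stored()
 *   journal EFragment OP_COMMIT   --C_MDC_FragmentCommit-->   _fragment_committed()
 *   delete old dirfrag objects    --C_IO_MDC_FragmentFinish-> _fragment_finish()
 *
 * The last one is an MDCacheIOContext because it completes from an objecter
 * (RADOS I/O) callback rather than from the journaler.
 */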
11146 void MDCache::fragment_frozen(MDRequestRef
& mdr
, int r
)
11148 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11149 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11150 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11151 dout(7) << "fragment_frozen " << basedirfrag
<< " must have aborted" << dendl
;
11152 request_finish(mdr
);
11157 fragment_info_t
& info
= it
->second
;
11158 dout(10) << "fragment_frozen " << basedirfrag
.frag
<< " by " << info
.bits
11159 << " on " << info
.dirs
.front()->get_inode() << dendl
;
11161 info
.all_frozen
= true;
11162 dispatch_fragment_dir(mdr
);
11165 void MDCache::dispatch_fragment_dir(MDRequestRef
& mdr
)
11167 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11168 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11169 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11170 dout(7) << "dispatch_fragment_dir " << basedirfrag
<< " must have aborted" << dendl
;
11171 request_finish(mdr
);
11175 fragment_info_t
& info
= it
->second
;
11176 CInode
*diri
= info
.dirs
.front()->get_inode();
11178 dout(10) << "dispatch_fragment_dir " << basedirfrag
<< " bits " << info
.bits
11179 << " on " << *diri
<< dendl
;
11180 if (!mdr
->aborted
) {
11181 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
11182 wrlocks
.insert(&diri
->dirfragtreelock
);
11183 // prevent a racing gather on any other scatterlocks too
11184 wrlocks
.insert(&diri
->nestlock
);
11185 wrlocks
.insert(&diri
->filelock
);
11186 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
, NULL
, NULL
, true))
11191 if (mdr
->aborted
) {
11192 dout(10) << " can't auth_pin " << *diri
<< ", requeuing dir "
11193 << info
.dirs
.front()->dirfrag() << dendl
;
11195 mds
->balancer
->queue_split(info
.dirs
.front(), false);
11197 mds
->balancer
->queue_merge(info
.dirs
.front());
11198 fragment_unmark_unfreeze_dirs(info
.dirs
);
11199 fragments
.erase(it
);
11200 request_finish(mdr
);
11204 mdr
->ls
= mds
->mdlog
->get_current_segment();
11205 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_PREPARE
, basedirfrag
, info
.bits
);
11206 mds
->mdlog
->start_entry(le
);
11208 for (list
<CDir
*>::iterator p
= info
.dirs
.begin(); p
!= info
.dirs
.end(); ++p
) {
11210 dirfrag_rollback rollback
;
11211 rollback
.fnode
= dir
->fnode
;
11212 le
->add_orig_frag(dir
->get_frag(), &rollback
);
11216 list
<MDSInternalContextBase
*> waiters
;
11217 adjust_dir_fragments(diri
, info
.dirs
, basedirfrag
.frag
, info
.bits
,
11218 info
.resultfrags
, waiters
, false);
11219 if (g_conf
->mds_debug_frag
)
11220 diri
->verify_dirfrags();
11221 mds
->queue_waiters(waiters
);
11223 for (list
<frag_t
>::iterator p
= le
->orig_frags
.begin(); p
!= le
->orig_frags
.end(); ++p
)
11224 assert(!diri
->dirfragtree
.is_leaf(*p
));
11226 le
->metablob
.add_dir_context(*info
.resultfrags
.begin());
11227 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11228 p
!= info
.resultfrags
.end();
11230 if (diri
->is_auth()) {
11231 le
->metablob
.add_fragmented_dir(*p
, false, false);
11233 (*p
)->state_set(CDir::STATE_DIRTYDFT
);
11234 le
->metablob
.add_fragmented_dir(*p
, false, true);
11239 if (diri
->is_auth()) {
11240 // journal dirfragtree
11241 inode_t
*pi
= diri
->project_inode();
11242 pi
->version
= diri
->pre_dirty();
11243 journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11245 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
11246 mdr
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
11247 mdr
->add_updated_lock(&diri
->dirfragtreelock
);
11252 mds->locker->mark_updated_scatterlock(&diri->filelock);
11253 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11254 mut->add_updated_lock(&diri->filelock);
11257 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11258 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11259 mut->add_updated_lock(&diri->nestlock);
11262 add_uncommitted_fragment(basedirfrag
, info
.bits
, le
->orig_frags
, mdr
->ls
);
11263 mds
->server
->submit_mdlog_entry(le
, new C_MDC_FragmentPrep(this, mdr
),
11265 mds
->mdlog
->flush();
11268 void MDCache::_fragment_logged(MDRequestRef
& mdr
)
11270 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11271 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11272 assert(it
!= fragments
.end());
11273 fragment_info_t
&info
= it
->second
;
11274 CInode
*diri
= info
.resultfrags
.front()->get_inode();
11276 dout(10) << "fragment_logged " << basedirfrag
<< " bits " << info
.bits
11277 << " on " << *diri
<< dendl
;
11279 if (diri
->is_auth())
11280 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
11282 mdr
->apply(); // mark scatterlock
11284 // store resulting frags
11285 MDSGatherBuilder
gather(g_ceph_context
, new C_MDC_FragmentStore(this, mdr
));
11287 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11288 p
!= info
.resultfrags
.end();
11291 dout(10) << " storing result frag " << *dir
<< dendl
;
11293 // freeze and store them too
11294 dir
->auth_pin(this);
11295 dir
->state_set(CDir::STATE_FRAGMENTING
);
11296 dir
->commit(0, gather
.new_sub(), true); // ignore authpinnability
11302 void MDCache::_fragment_stored(MDRequestRef
& mdr
)
11304 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11305 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11306 assert(it
!= fragments
.end());
11307 fragment_info_t
&info
= it
->second
;
11308 CInode
*diri
= info
.resultfrags
.front()->get_inode();
11310 dout(10) << "fragment_stored " << basedirfrag
<< " bits " << info
.bits
11311 << " on " << *diri
<< dendl
;
11314 CDir
*first
= *info
.resultfrags
.begin();
11315 for (compact_map
<mds_rank_t
,unsigned>::iterator p
= first
->replicas_begin();
11316 p
!= first
->replicas_end();
11318 if (mds
->mdsmap
->get_state(p
->first
) < MDSMap::STATE_REJOIN
||
11319 (mds
->mdsmap
->get_state(p
->first
) == MDSMap::STATE_REJOIN
&&
11320 rejoin_gather
.count(p
->first
)))
11323 MMDSFragmentNotify
*notify
= new MMDSFragmentNotify(basedirfrag
, info
.bits
);
11325 // freshly replicate new dirs to peers
11326 for (list
<CDir
*>::iterator q
= info
.resultfrags
.begin();
11327 q
!= info
.resultfrags
.end();
11329 replicate_dir(*q
, p
->first
, notify
->basebl
);
11331 mds
->send_message_mds(notify
, p
->first
);
11335 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_COMMIT
, basedirfrag
, info
.bits
);
11336 mds
->mdlog
->start_submit_entry(le
, new C_MDC_FragmentCommit(this, basedirfrag
,
11337 info
.resultfrags
));
11339 mds
->locker
->drop_locks(mdr
.get());
11341 // unfreeze resulting frags
11342 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11343 p
!= info
.resultfrags
.end();
11346 dout(10) << " result frag " << *dir
<< dendl
;
11348 for (CDir::map_t::iterator p
= dir
->items
.begin();
11349 p
!= dir
->items
.end();
11351 CDentry
*dn
= p
->second
;
11352 assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11353 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11354 dn
->put(CDentry::PIN_FRAGMENTING
);
11358 dir
->unfreeze_dir();
11361 fragments
.erase(it
);
11362 request_finish(mdr
);
11365 void MDCache::_fragment_committed(dirfrag_t basedirfrag
, list
<CDir
*>& resultfrags
)
11367 dout(10) << "fragment_committed " << basedirfrag
<< dendl
;
11368 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11369 assert(it
!= uncommitted_fragments
.end());
11370 ufragment
&uf
= it
->second
;
11372 // remove old frags
11373 C_GatherBuilder
gather(
11376 new C_IO_MDC_FragmentFinish(this, basedirfrag
, resultfrags
),
11379 SnapContext nullsnapc
;
11380 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
11381 for (list
<frag_t
>::iterator p
= uf
.old_frags
.begin();
11382 p
!= uf
.old_frags
.end();
11384 object_t oid
= CInode::get_object_name(basedirfrag
.ino
, *p
, "");
11385 ObjectOperation op
;
11386 if (*p
== frag_t()) {
11387 // backtrace object
11388 dout(10) << " truncate orphan dirfrag " << oid
<< dendl
;
11392 dout(10) << " removing orphan dirfrag " << oid
<< dendl
;
11395 mds
->objecter
->mutate(oid
, oloc
, op
, nullsnapc
,
11396 ceph::real_clock::now(),
11397 0, gather
.new_sub());
11400 assert(gather
.has_subs());
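/*
 * Illustrative note (assumption, not original source): each retired fragment
 * corresponds to its own object in the metadata pool, named via
 * CInode::get_object_name(ino, frag, "").  The base fragment's object is only
 * truncated, since it also carries the inode backtrace, while the other
 * orphaned fragment objects are removed outright; the gather built above
 * fires C_IO_MDC_FragmentFinish once every mutation has completed.
 */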
11404 void MDCache::_fragment_finish(dirfrag_t basedirfrag
, list
<CDir
*>& resultfrags
)
11406 dout(10) << "fragment_finish " << basedirfrag
<< "resultfrags.size="
11407 << resultfrags
.size() << dendl
;
11408 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11409 assert(it
!= uncommitted_fragments
.end());
11410 ufragment
&uf
= it
->second
;
11412 // unmark & auth_unpin
11413 for (const auto &dir
: resultfrags
) {
11414 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11415 dir
->auth_unpin(this);
11417 // In case the resulting fragments are beyond the split size,
11418 // we might need to split them again right away (they could
11419 // have been taking inserts between unfreezing and getting here).
11421 mds
->balancer
->maybe_fragment(dir
, false);
11425 if (resultfrags
.size() > 1) {
11426 mds
->logger
->inc(l_mds_dir_split
);
11428 mds
->logger
->inc(l_mds_dir_merge
);
11432 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_FINISH
, basedirfrag
, uf
.bits
);
11433 mds
->mdlog
->start_submit_entry(le
);
11435 finish_uncommitted_fragment(basedirfrag
, EFragment::OP_FINISH
);
11438 /* This function DOES put the passed message before returning */
11439 void MDCache::handle_fragment_notify(MMDSFragmentNotify
*notify
)
11441 dout(10) << "handle_fragment_notify " << *notify
<< " from " << notify
->get_source() << dendl
;
11443 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
11448 CInode
*diri
= get_inode(notify
->get_ino());
11450 frag_t base
= notify
->get_basefrag();
11451 int bits
= notify
->get_bits();
11454 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11455 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11456 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11457 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11464 list
<MDSInternalContextBase
*> waiters
;
11465 list
<CDir
*> resultfrags
;
11466 adjust_dir_fragments(diri
, base
, bits
, resultfrags
, waiters
, false);
11467 if (g_conf
->mds_debug_frag
)
11468 diri
->verify_dirfrags();
11470 for (list
<CDir
*>::iterator p
= resultfrags
.begin(); p
!= resultfrags
.end(); ++p
)
11471 diri
->take_dir_waiting((*p
)->get_frag(), waiters
);
11473 // add new replica dirs values
11474 bufferlist::iterator p
= notify
->basebl
.begin();
11476 add_replica_dir(p
, diri
, mds_rank_t(notify
->get_source().num()), waiters
);
11478 mds
->queue_waiters(waiters
);
11486 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, list
<frag_t
>& old_frags
,
11487 LogSegment
*ls
, bufferlist
*rollback
)
11489 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag
<< " bits " << bits
<< dendl
;
11490 assert(!uncommitted_fragments
.count(basedirfrag
));
11491 ufragment
& uf
= uncommitted_fragments
[basedirfrag
];
11492 uf
.old_frags
= old_frags
;
11495 ls
->uncommitted_fragments
.insert(basedirfrag
);
11497 uf
.rollback
.swap(*rollback
);
11500 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
)
11502 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11503 << " op " << EFragment::op_name(op
) << dendl
;
11504 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11505 if (it
!= uncommitted_fragments
.end()) {
11506 ufragment
& uf
= it
->second
;
11507 if (op
!= EFragment::OP_FINISH
&& !uf
.old_frags
.empty()) {
11508 uf
.committed
= true;
11510 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
11511 mds
->queue_waiters(uf
.waiters
);
11512 uncommitted_fragments
.erase(it
);
11517 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag
, list
<frag_t
>& old_frags
)
11519 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11520 << " old_frags (" << old_frags
<< ")" << dendl
;
11521 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11522 if (it
!= uncommitted_fragments
.end()) {
11523 ufragment
& uf
= it
->second
;
11524 if (!uf
.old_frags
.empty()) {
11525 uf
.old_frags
.swap(old_frags
);
11526 uf
.committed
= true;
11528 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
11529 uncommitted_fragments
.erase(it
);
11534 void MDCache::rollback_uncommitted_fragments()
11536 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments
.size() << " pending" << dendl
;
11537 for (map
<dirfrag_t
, ufragment
>::iterator p
= uncommitted_fragments
.begin();
11538 p
!= uncommitted_fragments
.end();
11540 ufragment
&uf
= p
->second
;
11541 CInode
*diri
= get_inode(p
->first
.ino
);
11544 if (uf
.committed
) {
11546 diri
->get_dirfrags_under(p
->first
.frag
, frags
);
11547 for (list
<CDir
*>::iterator q
= frags
.begin(); q
!= frags
.end(); ++q
) {
11549 dir
->auth_pin(this);
11550 dir
->state_set(CDir::STATE_FRAGMENTING
);
11552 _fragment_committed(p
->first
, frags
);
11556 dout(10) << " rolling back " << p
->first
<< " refragment by " << uf
.bits
<< " bits" << dendl
;
11558 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
11559 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_ROLLBACK
, p
->first
, uf
.bits
);
11560 mds
->mdlog
->start_entry(le
);
11561 bool diri_auth
= (diri
->authority() != CDIR_AUTH_UNDEF
);
11563 list
<frag_t
> old_frags
;
11564 diri
->dirfragtree
.get_leaves_under(p
->first
.frag
, old_frags
);
11566 list
<CDir
*> resultfrags
;
11567 if (uf
.old_frags
.empty()) {
11568 // created by old format EFragment
11569 list
<MDSInternalContextBase
*> waiters
;
11570 adjust_dir_fragments(diri
, p
->first
.frag
, -uf
.bits
, resultfrags
, waiters
, true);
11572 bufferlist::iterator bp
= uf
.rollback
.begin();
11573 for (list
<frag_t
>::iterator q
= uf
.old_frags
.begin(); q
!= uf
.old_frags
.end(); ++q
) {
11574 CDir
*dir
= force_dir_fragment(diri
, *q
);
11575 resultfrags
.push_back(dir
);
11577 dirfrag_rollback rollback
;
11578 ::decode(rollback
, bp
);
11580 dir
->set_version(rollback
.fnode
.version
);
11581 dir
->fnode
= rollback
.fnode
;
11583 dir
->_mark_dirty(ls
);
11585 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
11586 dout(10) << " dirty nestinfo on " << *dir
<< dendl
;
11587 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
11588 ls
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
11590 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
11591 dout(10) << " dirty fragstat on " << *dir
<< dendl
;
11592 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
11593 ls
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
11596 le
->add_orig_frag(dir
->get_frag());
11597 le
->metablob
.add_dir_context(dir
);
11599 le
->metablob
.add_fragmented_dir(dir
, true, false);
11601 dout(10) << " dirty dirfragtree on " << *dir
<< dendl
;
11602 dir
->state_set(CDir::STATE_DIRTYDFT
);
11603 le
->metablob
.add_fragmented_dir(dir
, true, true);
11609 diri
->project_inode()->version
= diri
->pre_dirty();
11610 diri
->pop_and_dirty_projected_inode(ls
); // hacky
11611 le
->metablob
.add_primary_dentry(diri
->get_projected_parent_dn(), diri
, true);
11613 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
11614 ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
11617 if (g_conf
->mds_debug_frag
)
11618 diri
->verify_dirfrags();
11620 for (list
<frag_t
>::iterator q
= old_frags
.begin(); q
!= old_frags
.end(); ++q
)
11621 assert(!diri
->dirfragtree
.is_leaf(*q
));
11623 for (list
<CDir
*>::iterator q
= resultfrags
.begin(); q
!= resultfrags
.end(); ++q
) {
11625 dir
->auth_pin(this);
11626 dir
->state_set(CDir::STATE_FRAGMENTING
);
11629 mds
->mdlog
->submit_entry(le
);
11631 uf
.old_frags
.swap(old_frags
);
11632 _fragment_committed(p
->first
, resultfrags
);
11636 void MDCache::force_readonly()
11641 dout(1) << "force file system read-only" << dendl
;
11642 mds
->clog
->warn() << "force file system read-only";
11646 mds
->server
->force_clients_readonly();
11648 // revoke write caps
11649 for (ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator p
= inode_map
.begin();
11650 p
!= inode_map
.end();
11652 CInode
*in
= p
->second
;
11654 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
11657 mds
->mdlog
->flush();
11661 // ==============================================================
11664 void MDCache::show_subtrees(int dbl
)
11666 if (g_conf
->mds_thrash_exports
)
11669 //dout(10) << "show_subtrees" << dendl;
11671 if (!g_conf
->subsys
.should_gather(ceph_subsys_mds
, dbl
))
11672 return; // i won't print anything.
11674 if (subtrees
.empty()) {
11675 dout(dbl
) << "show_subtrees - no subtrees" << dendl
;
11680 list
<CDir
*> basefrags
;
11681 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
11682 p
!= base_inodes
.end();
11684 (*p
)->get_dirfrags(basefrags
);
11685 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11686 dout(15) << "show_subtrees" << dendl
;
11689 list
<pair
<CDir
*,int> > q
;
11694 for (list
<CDir
*>::iterator p
= basefrags
.begin(); p
!= basefrags
.end(); ++p
)
11695 q
.push_back(pair
<CDir
*,int>(*p
, 0));
11697 set
<CDir
*> subtrees_seen
;
11700 while (!q
.empty()) {
11701 CDir
*dir
= q
.front().first
;
11702 int d
= q
.front().second
;
11705 if (subtrees
.count(dir
) == 0) continue;
11707 subtrees_seen
.insert(dir
);
11709 if (d
> depth
) depth
= d
;
11712 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11713 if (seen
.count(dir
)) dout(0) << "aah, already seen " << *dir
<< dendl
;
11714 assert(seen
.count(dir
) == 0);
11718 if (!subtrees
[dir
].empty()) {
11719 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
11720 p
!= subtrees
[dir
].end();
11722 //dout(25) << " saw sub " << **p << dendl;
11723 q
.push_front(pair
<CDir
*,int>(*p
, d
+1));
11730 for (list
<CDir
*>::iterator p
= basefrags
.begin(); p
!= basefrags
.end(); ++p
)
11731 q
.push_back(pair
<CDir
*,int>(*p
, 0));
11733 while (!q
.empty()) {
11734 CDir
*dir
= q
.front().first
;
11735 int d
= q
.front().second
;
11738 if (subtrees
.count(dir
) == 0) continue;
11741 while ((unsigned)d
< indent
.size())
11745 string pad
= "______________________________________";
11746 pad
.resize(depth
*2+1-indent
.size());
11747 if (!subtrees
[dir
].empty())
11748 pad
[0] = '.'; // parent
11752 if (dir
->is_auth())
11758 if (dir
->get_dir_auth().second
== CDIR_AUTH_UNKNOWN
)
11759 snprintf(s
, sizeof(s
), "%2d ", int(dir
->get_dir_auth().first
));
11761 snprintf(s
, sizeof(s
), "%2d,%2d", int(dir
->get_dir_auth().first
), int(dir
->get_dir_auth().second
));
11764 dout(dbl
) << indent
<< "|_" << pad
<< s
<< " " << auth
<< *dir
<< dendl
;
11766 if (dir
->ino() == MDS_INO_ROOT
)
11767 assert(dir
->inode
== root
);
11768 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
11769 assert(dir
->inode
== myin
);
11770 if (dir
->inode
->is_stray() && (MDS_INO_STRAY_OWNER(dir
->ino()) == mds
->get_nodeid()))
11771 assert(strays
[MDS_INO_STRAY_INDEX(dir
->ino())] == dir
->inode
);
11774 if (!subtrees
[dir
].empty()) {
11775 // more at my level?
11776 if (!q
.empty() && q
.front().second
== d
)
11781 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
11782 p
!= subtrees
[dir
].end();
11784 q
.push_front(pair
<CDir
*,int>(*p
, d
+2));
11788 // verify there isn't stray crap in subtree map
11790 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
11791 p
!= subtrees
.end();
11793 if (subtrees_seen
.count(p
->first
)) continue;
11794 dout(10) << "*** stray/lost entry in subtree map: " << *p
->first
<< dendl
;
11801 void MDCache::show_cache()
11803 dout(7) << "show_cache" << dendl
;
11805 for (ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator it
= inode_map
.begin();
11806 it
!= inode_map
.end();
11809 if (!it
->second
->parent
)
11810 dout(7) << " unlinked " << *it
->second
<< dendl
;
11814 it
->second
->get_dirfrags(dfs
);
11815 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
11817 dout(7) << " dirfrag " << *dir
<< dendl
;
11819 for (CDir::map_t::iterator p
= dir
->items
.begin();
11820 p
!= dir
->items
.end();
11822 CDentry
*dn
= p
->second
;
11823 dout(7) << " dentry " << *dn
<< dendl
;
11824 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11825 if (dnl
->is_primary() && dnl
->get_inode())
11826 dout(7) << " inode " << *dnl
->get_inode() << dendl
;
int MDCache::dump_cache(std::string const &file_name)
{
  return dump_cache(file_name.c_str(), NULL);
}

int MDCache::dump_cache(Formatter *f)
{
  return dump_cache(NULL, f);
}

int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
{
  return dump_cache(NULL, f, dump_root, depth);
}
11848 * Dump the metadata cache, either to a Formatter, if
11849 * provided, else to a plain text file.
11851 int MDCache::dump_cache(const char *fn
, Formatter
*f
,
11852 const string
& dump_root
, int depth
)
11858 f
->open_array_section("inodes");
11862 snprintf(deffn
, sizeof(deffn
), "cachedump.%d.mds%d", (int)mds
->mdsmap
->get_epoch(), int(mds
->get_nodeid()));
11866 dout(1) << "dump_cache to " << fn
<< dendl
;
11868 fd
= ::open(fn
, O_WRONLY
|O_CREAT
|O_EXCL
, 0600);
11870 derr
<< "failed to open " << fn
<< ": " << cpp_strerror(errno
) << dendl
;
11875 for (ceph::unordered_map
<vinodeno_t
,CInode
*>::iterator it
= inode_map
.begin();
11876 it
!= inode_map
.end();
11878 CInode
*in
= it
->second
;
11880 if (!dump_root
.empty()) {
11885 in
->make_path_string(ipath
);
11887 if (dump_root
.length() > ipath
.length() ||
11888 !equal(dump_root
.begin(), dump_root
.end(), ipath
.begin()))
11892 count(ipath
.begin() + dump_root
.length(), ipath
.end(), '/') > depth
)
11897 f
->open_object_section("inode");
11901 ss
<< *in
<< std::endl
;
11902 std::string s
= ss
.str();
11903 r
= safe_write(fd
, s
.c_str(), s
.length());
11910 in
->get_dirfrags(dfs
);
11912 f
->open_array_section("dirfrags");
11914 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
11917 f
->open_object_section("dir");
11921 tt
<< " " << *dir
<< std::endl
;
11922 string t
= tt
.str();
11923 r
= safe_write(fd
, t
.c_str(), t
.length());
11930 f
->open_array_section("dentries");
11932 for (CDir::map_t::iterator q
= dir
->items
.begin();
11933 q
!= dir
->items
.end();
11935 CDentry
*dn
= q
->second
;
11937 f
->open_object_section("dentry");
11939 f
->close_section();
11942 uu
<< " " << *dn
<< std::endl
;
11943 string u
= uu
.str();
11944 r
= safe_write(fd
, u
.c_str(), u
.length());
11951 f
->close_section(); //dentries
11953 dir
->check_rstats();
11955 f
->close_section(); //dir
11959 f
->close_section(); // dirfrags
11963 f
->close_section(); // inode
11969 f
->close_section(); // inodes
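// The usual way to reach dump_cache() is the MDS admin socket; something like
//   ceph daemon mds.<id> dump cache [/path/to/outfile]
// lands here via MDSRank's command handling (treat the exact command spelling
// as an assumption in this note). With a Formatter the cache is emitted as
// nested JSON sections (inodes -> dirfrags -> dentries); without one, each
// inode, dirfrag and dentry is written as a plain-text line via safe_write().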
C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
  : MDSInternalContext(c->mds), cache(c), mdr(r)
{}

void C_MDS_RetryRequest::finish(int r)
{
  cache->dispatch_request(mdr);
}
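// C_MDS_RetryRequest is the standard "try again later" continuation for
// internal ops: when a function below cannot make progress (frozen dir,
// missing or unfetched dirfrag, contended lock), it drops what it holds,
// queues one of these on the appropriate waiter list, and returns; the
// request is then re-dispatched from scratch once the blocker clears. The
// repair_* and enqueue_scrub_work() paths below all follow this pattern.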
class C_MDS_EnqueueScrub : public Context
{
  Formatter *formatter;
  Context *on_finish;
public:
  ScrubHeaderRef header;
  C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
    formatter(f), on_finish(fin), header(nullptr) {}

  Context *take_finisher() {
    Context *fin = on_finish;
    on_finish = NULL;
    return fin;
  }

  void finish(int r) override {
    if (r < 0) { // we failed the lookup or something; dump ourselves
      formatter->open_object_section("results");
      formatter->dump_int("return_code", r);
      formatter->close_section(); // results
    }
    if (on_finish)
      on_finish->complete(r);
  }
};
void MDCache::enqueue_scrub(
    const string& path,
    const std::string &tag,
    bool force, bool recursive, bool repair,
    Formatter *f, Context *fin)
{
  dout(10) << __func__ << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
  filepath fp(path.c_str());
  mdr->set_filepath(fp);

  C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
  cs->header = std::make_shared<ScrubHeader>(
      tag, force, recursive, repair, f);

  mdr->internal_op_finish = cs;
  enqueue_scrub_work(mdr);
}
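// Illustrative caller sketch (not a real call site; the actual one lives in
// MDSRank's asok/scrub command handling): a caller supplies a Formatter for
// the result and a completion Context, e.g.
//
//   Formatter *f = new JSONFormatter(true);
//   mdcache->enqueue_scrub("/dir/to/check", "", false /*force*/,
//                          true /*recursive*/, false /*repair*/, f, on_finish);
//
// For recursive scrubs the finisher is not handed to the ScrubStack (see
// enqueue_scrub_work() below), so the caller gets an immediate acknowledgement
// instead of waiting for the whole subtree to be scrubbed.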
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);

  // TODO: Remove this restriction
  assert(in->is_auth());

  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef &header = cs->header;

  // Cannot scrub same dentry twice at same time
  if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
    mds->server->respond_to_request(mdr, -EBUSY);
    return;
  }

  header->set_origin(in);

  // only set completion context for non-recursive scrub, because we don't
  // want to block asok caller on long running scrub
  if (!header->get_recursive()) {
    Context *fin = cs->take_finisher();
    mds->scrubstack->enqueue_inode_top(in, header,
                                       new MDSInternalContextWrapper(mds, fin));
  } else {
    mds->scrubstack->enqueue_inode_bottom(in, header, NULL);
  }

  mds->server->respond_to_request(mdr, 0);
}
struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
  MDRequestRef mdr;
  C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
    MDCacheLogContext(c), mdr(m) {}
  void finish(int r) override {
    get_mds()->server->respond_to_request(mdr, r);
  }
};
void MDCache::repair_dirfrag_stats(CDir *dir)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
  mdr->internal_op_private = dir;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_dirfrag_stats_work(mdr);
}
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
    return;
  }

  mdr->auth_pin(dir);

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *diri = dir->inode;
  rdlocks.insert(&diri->dirfragtreelock);
  wrlocks.insert(&diri->nestlock);
  wrlocks.insert(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  frag_info_t frag_info;
  nest_info_t nest_info;
  for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
        frag_info.nsubdirs++;
      else
        frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  fnode_t *pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  pf = dir->project_fnode();
  pf->version = dir->pre_dirty();
  mdr->add_projected_fnode(dir);

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
}
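// Summary of the flow above: take the dirfragtree rdlock plus the filelock
// and nestlock wrlocks, make sure the dirfrag is complete in memory, recount
// fragstat (file/subdir counts) and rstat (recursive stats) from the cached
// dentries, and only if the recount disagrees with the projected fnode do we
// project a new fnode, mark the corresponding scatterlocks dirty and journal
// an EUpdate("repair_dirfrag"); C_MDC_RepairDirfragStats then replies to the
// internal request once the journal entry is committed.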
void MDCache::repair_inode_stats(CInode *diri)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
  mdr->internal_op_private = diri;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_inode_stats_work(mdr);
}
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -ENOTDIR);
    return;
  }

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  std::list<frag_t> frags;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  rdlocks.insert(&diri->dirfragtreelock);
  wrlocks.insert(&diri->nestlock);
  wrlocks.insert(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  diri->dirfragtree.get_leaves(frags);
  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
    CDir *dir = diri->get_dirfrag(*p);
    if (!dir) {
      assert(mdr->is_auth_pinned(diri));
      dir = diri->get_or_open_dirfrag(this, *p);
    }
    if (dir->get_version() == 0) {
      assert(dir->is_auth());
      dir->fetch(new C_MDS_RetryRequest(this, mdr));
      return;
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  mds->locker->drop_locks(mdr.get());

do_rdlocks:
  // force the scatter-gather process
  rdlocks.insert(&diri->dirfragtreelock);
  rdlocks.insert(&diri->nestlock);
  rdlocks.insert(&diri->filelock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs++; // it gets one to account for self

  diri->dirfragtree.get_leaves(frags);
  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
    CDir *dir = diri->get_dirfrag(*p);

    assert(dir->get_version() > 0);
    dir_info.add(dir->fnode.accounted_fragstat);
    nest_info.add(dir->fnode.accounted_rstat);
  }

  if (!dir_info.same_sums(diri->inode.dirstat) ||
      !nest_info.same_sums(diri->inode.rstat)) {
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
             << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
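// repair_inode_stats_work() repairs the *inode's* dirstat/rstat rather than a
// single dirfrag's: it first makes sure every leaf dirfrag is open and
// fetched, marks the inode's filelock and nestlock scatterlocks dirty, then
// drops the locks and re-takes them in rdlock mode. Re-acquiring the
// scatterlocks forces the scatter-gather cycle, which folds each dirfrag's
// accounted_fragstat/accounted_rstat back into the inode; the final pass just
// sums the accounted values and logs if they still disagree with the inode's
// dirstat/rstat.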
void MDCache::flush_dentry(const string& path, Context *fin)
{
  if (is_readonly()) {
    dout(10) << __func__ << ": read-only FS" << dendl;
    fin->complete(-EROFS);
    return;
  }
  dout(10) << "flush_dentry " << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
  filepath fp(path.c_str());
  mdr->set_filepath(fp);
  mdr->internal_op_finish = fin;
  flush_dentry_work(mdr);
}
class C_FinishIOMDR : public MDSInternalContextBase {
protected:
  MDSRank *mds;
  MDRequestRef mdr;
  MDSRank *get_mds() override { return mds; }
public:
  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
};
void MDCache::flush_dentry_work(MDRequestRef& mdr)
{
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);

  // TODO: Is this necessary? Fix it if so
  assert(in->is_auth());
  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);

  in->flush(new C_FinishIOMDR(mds, mdr));
}
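// flush_dentry() is driven by an admin command (the "flush_path"-style asok
// handler in MDSRank calls into it; the exact command name is an assumption
// in this note). It resolves the path, takes the rdlocks via
// rdlock_path_pin_ref(), and then asks the inode to flush its dirty metadata;
// C_FinishIOMDR turns the completion back into a reply on the internal
// MDRequest.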
/*
 * Initialize performance counters with global perfcounter
 * collection.
 */
void MDCache::register_perfcounters()
{
  PerfCountersBuilder pcb(g_ceph_context,
                          "mds_cache", l_mdc_first, l_mdc_last);

  /* Stray/purge statistics */
  pcb.add_u64(l_mdc_num_strays, "num_strays",
              "Stray dentries", "stry");
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");

  pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");

  /* Recovery queue statistics */
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
              "Files waiting for recovery", "recy");
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
  pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
                      "File recoveries completed", "recd");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
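// These counters show up under the "mds_cache" logger, e.g. in the output of
// the admin socket "perf dump" command (ceph daemon mds.<id> perf dump). The
// stray and recovery-queue gauges are also wired into StrayManager and
// RecoveryQueue via set_logger(), so they update the same PerfCounters
// instance registered here.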
void MDCache::activate_stray_manager()
{
  if (open)
    stray_manager.activate();
  else
    wait_for_open(
      new MDSInternalContextWrapper(mds,
        new FunctionContext([this](int r){
          stray_manager.activate();
        })
      )
    );
}
/**
 * Call this when putting references to an inode/dentry or
 * when attempting to trim it.
 *
 * If this inode is no longer linked by anyone, and this MDS
 * rank holds the primary dentry, and that dentry is in a stray
 * directory, then give up the dentry to the StrayManager, never
 * to be seen again by MDCache.
 *
 * @param delay if true, then purgeable inodes are stashed until
 *              the next trim(), rather than being purged right
 *              away.
 */
void MDCache::maybe_eval_stray(CInode *in, bool delay) {
  if (in->inode.nlink > 0 || in->is_base() || is_readonly() || mds->is_standby_replay())
    return;

  CDentry *dn = in->get_projected_parent_dn();

  if (dn->state_test(CDentry::STATE_PURGING)) {
    /* We have already entered the purging process, no need
     * to re-evaluate me ! */
    return;
  }

  if (dn->get_projected_linkage()->is_primary() &&
      dn->get_dir()->get_inode()->is_stray()) {
    stray_manager.eval_stray(dn, delay);
  }
}
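// Example of the intended flow (descriptive only, no new behaviour): when the
// last link to a file is unlinked, its primary dentry is moved into this
// rank's stray directory. Each time a reference is put or trim() considers
// the inode, maybe_eval_stray() runs; once nlink is 0, nothing else pins it
// and this rank is auth for the stray dentry, StrayManager::eval_stray()
// takes over and the inode is purged (or reintegrated if it is still
// referenced via a remote link).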
void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
  dout(10) << __func__ << " " << *diri << dendl;

  assert(diri->get_projected_parent_dir()->inode->is_stray());

  diri->get_dirfrags(ls);
  for (auto p : ls) {
    if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
      p->try_remove_dentries_for_stray();
  }

  if (!diri->snaprealm) {
    if (diri->is_auth())
      diri->clear_dirty_rstat();
    diri->clear_scatter_dirty();