1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
20 #include <string_view>
28 #include "MDBalancer.h"
30 #include "ScrubStack.h"
32 #include "SnapClient.h"
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
45 #include "messages/MClientCaps.h"
47 #include "msg/Message.h"
48 #include "msg/Messenger.h"
50 #include "common/MemoryModel.h"
51 #include "common/errno.h"
52 #include "common/perf_counters.h"
53 #include "common/safe_io.h"
55 #include "osdc/Journaler.h"
56 #include "osdc/Filer.h"
58 #include "events/ESubtreeMap.h"
59 #include "events/EUpdate.h"
60 #include "events/ESlaveUpdate.h"
61 #include "events/EImportFinish.h"
62 #include "events/EFragment.h"
63 #include "events/ECommitted.h"
64 #include "events/EPurged.h"
65 #include "events/ESessions.h"
69 #include "common/Timer.h"
71 #include "perfglue/heap_profiler.h"
74 #include "common/config.h"
75 #include "include/ceph_assert.h"
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_mds
80 #define dout_prefix _prefix(_dout, mds)
81 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
82 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache ";
// Definition of SimpleLock's shared empty gather set: a common sentinel so
// locks with no gatherers can reference one static empty set instead of
// each owning their own.
85 set
<int> SimpleLock::empty_gather_set
;
89 * All non-I/O contexts that require a reference
90 * to an MDCache instance descend from this.
// get_mds() asserts the mdcache backpointer is non-null before use.
// NOTE(review): the return statement and access specifiers are elided in
// this view (original lines dropped) -- confirm against the full source.
92 class MDCacheContext
: public virtual MDSContext
{
95 MDSRank
*get_mds() override
97 ceph_assert(mdcache
!= NULL
);
101 explicit MDCacheContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
106 * Only for contexts called back from an I/O completion
108 * Note: duplication of members wrt MDCacheContext, because
109 * it's the lesser of two evils compared with introducing
110 * yet another piece of (multiple) inheritance.
// Same pattern as MDCacheContext but derived from MDSIOContextBase; `track`
// controls whether the base class tracks this in-flight I/O context.
112 class MDCacheIOContext
: public virtual MDSIOContextBase
{
115 MDSRank
*get_mds() override
117 ceph_assert(mdcache
!= NULL
);
121 explicit MDCacheIOContext(MDCache
*mdc_
, bool track
=true) :
122 MDSIOContextBase(track
), mdcache(mdc_
) {}
// Context flavor for callbacks completed from MDLog journal flushes; holds
// the same mdcache backpointer and asserts it is set before dereference.
125 class MDCacheLogContext
: public virtual MDSLogContextBase
{
128 MDSRank
*get_mds() override
130 ceph_assert(mdcache
!= NULL
);
134 explicit MDCacheLogContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
// Constructor: wires the cache to its MDSRank and PurgeQueue, snapshots the
// config-derived tunables (cache size/reservation/health thresholds and the
// ephemeral export-pin options), configures the two LRUs and decay rate, and
// finally spawns the background "upkeep" thread that periodically trims the
// cache and releases free heap memory.
// NOTE(review): several original lines are elided in this view (numbering
// gaps), including the member-initializer tail and the thread's loop-control
// statements -- confirm against the full source before editing.
137 MDCache::MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
) :
140 filer(m
->objecter
, m
->finisher
),
141 stray_manager(m
, purge_queue_
),
143 trim_counter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"))
145 migrator
.reset(new Migrator(mds
, this));
// max_dir_commit_size: explicit config value (MiB -> bytes), else 90% of the
// OSD max write size so a single dirfrag commit fits in one OSD op.
147 max_dir_commit_size
= g_conf()->mds_dir_max_commit_size
?
148 (g_conf()->mds_dir_max_commit_size
<< 20) :
149 (0.9 *(g_conf()->osd_max_write_size
<< 20));
151 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
152 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
153 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
154 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
156 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
157 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
158 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
160 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
// bottom_lru holds items that should be trimmed first: midpoint of 0 means
// everything inserts at the bottom segment.
162 bottom_lru
.lru_set_midpoint(0);
164 decayrate
.set_halflife(g_conf()->mds_decay_halflife
);
// Upkeep thread: wakes at the shorter of the trim/release intervals, drops
// upkeep_mutex before taking mds_lock (lock-order comment below), trims
// client leases/caps and returns free memory to the OS.
166 upkeeper
= std::thread([this]() {
167 std::unique_lock
lock(upkeep_mutex
);
168 while (!upkeep_trim_shutdown
.load()) {
169 auto now
= clock::now();
170 auto since
= now
-upkeep_last_trim
;
171 auto trim_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_trim_interval"));
172 if (since
>= trim_interval
*.90) {
173 lock
.unlock(); /* mds_lock -> upkeep_mutex */
174 std::scoped_lock
mds_lock(mds
->mds_lock
);
176 if (upkeep_trim_shutdown
.load())
178 if (mds
->is_cache_trimmable()) {
179 dout(20) << "upkeep thread trimming cache; last trim " << since
<< " ago" << dendl
;
180 trim_client_leases();
182 check_memory_usage();
183 auto flags
= Server::RecallFlags::ENFORCE_MAX
|Server::RecallFlags::ENFORCE_LIVENESS
;
184 mds
->server
->recall_client_state(nullptr, flags
);
185 upkeep_last_trim
= now
= clock::now();
187 dout(10) << "cache not ready for trimming" << dendl
;
190 trim_interval
-= since
;
192 since
= now
-upkeep_last_release
;
193 auto release_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_release_free_interval"));
194 if (since
>= release_interval
) {
195 /* XXX not necessary once MDCache uses PriorityCache */
196 dout(10) << "releasing free memory" << dendl
;
197 ceph_heap_release_free_memory();
198 upkeep_last_release
= clock::now();
200 release_interval
-= since
;
202 auto interval
= std::min(release_interval
, trim_interval
);
203 dout(20) << "upkeep thread waiting interval " << interval
<< dendl
;
204 upkeep_cvar
.wait_for(lock
, interval
);
// NOTE(review): fragment of the MDCache destructor -- its signature and the
// thread join call are elided in this view. Visible here: deregister the
// perf counters and check whether the upkeep thread is joinable.
212 g_ceph_context
->get_perfcounters_collection()->remove(logger
.get());
214 if (upkeeper
.joinable())
// Apply runtime configuration changes relevant to the cache. For the
// ephemeral pin toggles, re-evaluate every currently pinned inode (copied to
// a vector first, since re-evaluation can remove entries from the live set)
// and nudge the balancer; finally forward the change set to the migrator and
// balancer for their own knobs.
218 void MDCache::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mdsmap
)
220 dout(20) << "config changes: " << changed
<< dendl
;
221 if (changed
.count("mds_cache_memory_limit"))
222 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
223 if (changed
.count("mds_cache_reservation"))
224 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
225 if (changed
.count("mds_export_ephemeral_distributed")) {
226 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
227 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl
;
228 /* copy to vector to avoid removals during iteration */
229 std::vector
<CInode
*> migrate
;
230 migrate
.assign(dist_ephemeral_pins
.begin(), dist_ephemeral_pins
.end());
231 for (auto& in
: migrate
) {
232 in
->maybe_ephemeral_dist();
234 mds
->balancer
->handle_export_pins();
236 if (changed
.count("mds_export_ephemeral_random")) {
237 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
238 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl
;
239 /* copy to vector to avoid removals during iteration */
240 std::vector
<CInode
*> migrate
;
241 migrate
.assign(rand_ephemeral_pins
.begin(), rand_ephemeral_pins
.end());
242 for (auto& in
: migrate
) {
243 in
->maybe_ephemeral_rand();
245 mds
->balancer
->handle_export_pins();
247 if (changed
.count("mds_export_ephemeral_random_max")) {
248 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
250 if (changed
.count("mds_health_cache_threshold"))
251 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
252 if (changed
.count("mds_cache_mid"))
253 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
254 if (changed
.count("mds_cache_trim_decay_rate")) {
255 trim_counter
= DecayCounter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"));
257 if (changed
.count("mds_forward_all_requests_to_auth")){
258 forward_all_requests_to_auth
= g_conf().get_val
<bool>("mds_forward_all_requests_to_auth");
// Propagate to subsystems that track their own config subsets.
261 migrator
->handle_conf_change(changed
, mdsmap
);
262 mds
->balancer
->handle_conf_change(changed
, mdsmap
);
// Export current cache gauges to the perf counters: LRU sizes/segments,
// inodes holding caps, total Capability count, and the root inode's
// recursive stats (rfiles/rbytes/rsnaps).
// NOTE(review): the root->... counters presumably sit behind a null check on
// `root` in the elided lines -- confirm against the full source.
265 void MDCache::log_stat()
267 mds
->logger
->set(l_mds_inodes
, lru
.lru_get_size());
268 mds
->logger
->set(l_mds_inodes_pinned
, lru
.lru_get_num_pinned());
269 mds
->logger
->set(l_mds_inodes_top
, lru
.lru_get_top());
270 mds
->logger
->set(l_mds_inodes_bottom
, lru
.lru_get_bot());
271 mds
->logger
->set(l_mds_inodes_pin_tail
, lru
.lru_get_pintail());
272 mds
->logger
->set(l_mds_inodes_with_caps
, num_inodes_with_caps
);
273 mds
->logger
->set(l_mds_caps
, Capability::count());
275 mds
->logger
->set(l_mds_root_rfiles
, root
->inode
.rstat
.rfiles
);
276 mds
->logger
->set(l_mds_root_rbytes
, root
->inode
.rstat
.rbytes
);
277 mds
->logger
->set(l_mds_root_rsnaps
, root
->inode
.rstat
.rsnaps
);
// Shut down the cache: signal the upkeep thread to exit (flag + condvar
// under upkeep_mutex) and warn if entries remain in the LRU at shutdown.
284 bool MDCache::shutdown()
287 std::scoped_lock
lock(upkeep_mutex
);
288 upkeep_trim_shutdown
= true;
289 upkeep_cvar
.notify_one();
291 if (lru
.lru_get_size() > 0) {
292 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl
;
301 // ====================================================================
302 // some inode functions
// Register a new CInode in the cache: head inodes (last == CEPH_NOSNAP) go
// in inode_map keyed by ino, snapped inodes in snap_inode_map keyed by vino;
// duplicates are asserted against. System inodes (ino below
// MDS_INO_SYSTEM_BASE) are additionally classified as root / mdsdir / stray
// and tracked in base_inodes. Sets exceeded_size_limit when the cache is
// already over its memory target, and lets the inode pick up a distributed
// ephemeral pin.
304 void MDCache::add_inode(CInode
*in
)
306 // add to lru, inode map
307 if (in
->last
== CEPH_NOSNAP
) {
308 auto &p
= inode_map
[in
->ino()];
309 ceph_assert(!p
); // should be no dup inos!
312 auto &p
= snap_inode_map
[in
->vino()];
313 ceph_assert(!p
); // should be no dup inos!
317 if (in
->ino() < MDS_INO_SYSTEM_BASE
) {
318 if (in
->ino() == MDS_INO_ROOT
)
320 else if (in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
322 else if (in
->is_stray()) {
323 if (MDS_INO_STRAY_OWNER(in
->ino()) == mds
->get_nodeid()) {
324 strays
[MDS_INO_STRAY_INDEX(in
->ino())] = in
;
328 base_inodes
.insert(in
);
331 if (cache_toofull()) {
332 exceeded_size_limit
= true;
335 in
->maybe_ephemeral_dist(false);
// Remove a CInode from the cache: unlink it from its (clean) parent dentry,
// clear dirty-parent/scatter state, drop it from the open-file list and the
// export-pin queues, clear ephemeral pins, erase it from the ino/vino maps,
// and clear the root/myin/stray bookkeeping for system inodes. Asserts the
// inode's refcount has dropped to zero before destruction.
338 void MDCache::remove_inode(CInode
*o
)
340 dout(14) << "remove_inode " << *o
<< dendl
;
342 if (o
->get_parent_dn()) {
343 // FIXME: multiple parents?
344 CDentry
*dn
= o
->get_parent_dn();
345 ceph_assert(!dn
->is_dirty());
346 dn
->dir
->unlink_inode(dn
); // leave dentry ... FIXME?
351 if (o
->is_dirty_parent())
352 o
->clear_dirty_parent();
354 o
->clear_scatter_dirty();
356 o
->item_open_file
.remove_myself();
358 if (o
->state_test(CInode::STATE_QUEUEDEXPORTPIN
))
359 export_pin_queue
.erase(o
);
361 if (o
->state_test(CInode::STATE_DELAYEDEXPORTPIN
))
362 export_pin_delayed_queue
.erase(o
);
364 o
->set_ephemeral_dist(false);
365 o
->set_ephemeral_rand(false);
367 // remove from inode map
368 if (o
->last
== CEPH_NOSNAP
) {
369 inode_map
.erase(o
->ino());
371 o
->item_caps
.remove_myself();
372 snap_inode_map
.erase(o
->vino());
375 if (o
->ino() < MDS_INO_SYSTEM_BASE
) {
376 if (o
== root
) root
= 0;
377 if (o
== myin
) myin
= 0;
379 if (MDS_INO_STRAY_OWNER(o
->ino()) == mds
->get_nodeid()) {
380 strays
[MDS_INO_STRAY_INDEX(o
->ino())] = 0;
384 base_inodes
.erase(o
);
388 ceph_assert(o
->get_num_ref() == 0);
392 file_layout_t
MDCache::gen_default_file_layout(const MDSMap
&mdsmap
)
394 file_layout_t result
= file_layout_t::get_default();
395 result
.pool_id
= mdsmap
.get_first_data_pool();
399 file_layout_t
MDCache::gen_default_log_layout(const MDSMap
&mdsmap
)
401 file_layout_t result
= file_layout_t::get_default();
402 result
.pool_id
= mdsmap
.get_metadata_pool();
403 if (g_conf()->mds_log_segment_size
> 0) {
404 result
.object_size
= g_conf()->mds_log_segment_size
;
405 result
.stripe_unit
= g_conf()->mds_log_segment_size
;
410 void MDCache::init_layouts()
412 default_file_layout
= gen_default_file_layout(*(mds
->mdsmap
));
413 default_log_layout
= gen_default_log_layout(*(mds
->mdsmap
));
// Initialize a freshly allocated system CInode (root, mdsdir, stray) that is
// not yet linked into any directory: seed version/xattr_version, a 0500-based
// mode, timestamps, and layout; for directories also the dir hash and a
// self-counting rstat. Authority is this rank for the root, or the owning
// rank derived from the mdsdir ino offset otherwise. Finishes by opening a
// fresh, parentless snaprealm.
// NOTE(review): the trailing `int mode` parameter and several statements are
// on lines elided from this view -- confirm against the full source.
416 void MDCache::create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
420 in
->inode
.version
= 1;
421 in
->inode
.xattr_version
= 1;
422 in
->inode
.mode
= 0500 | mode
;
426 in
->inode
.btime
= ceph_clock_now();
428 in
->inode
.truncate_size
= -1ull;
429 in
->inode
.change_attr
= 0;
430 in
->inode
.export_pin
= MDS_RANK_NONE
;
432 // FIPS zeroization audit 20191117: this memset is not security related.
433 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
434 if (in
->inode
.is_dir()) {
435 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
436 in
->inode
.rstat
.rsubdirs
= 1; /* itself */
437 in
->inode
.rstat
.rctime
= in
->inode
.ctime
;
439 in
->inode
.layout
= default_file_layout
;
440 ++in
->inode
.rstat
.rfiles
;
442 in
->inode
.accounted_rstat
= in
->inode
.rstat
;
446 in
->inode_auth
= mds_authority_t(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
);
448 in
->inode_auth
= mds_authority_t(mds_rank_t(in
->ino() - MDS_INO_MDSDIR_OFFSET
), CDIR_AUTH_UNKNOWN
);
449 in
->open_snaprealm(); // empty snaprealm
450 ceph_assert(!in
->snaprealm
->parent
); // created its own
451 in
->snaprealm
->srnode
.seq
= 1;
// Allocate a new CInode, initialize it as an unlinked system inode with the
// given ino and mode, and (in the elided tail) hand it back to the caller.
455 CInode
*MDCache::create_system_inode(inodeno_t ino
, int mode
)
457 dout(0) << "creating system inode with ino:" << ino
<< dendl
;
458 CInode
*in
= new CInode(this);
459 create_unlinked_system_inode(in
, ino
, mode
);
// Create the root ("/") inode: a 0755 directory owned by the configured
// root uid/gid, carrying the default file layout pointed at the first
// data pool.
464 CInode
*MDCache::create_root_inode()
466 CInode
*i
= create_system_inode(MDS_INO_ROOT
, S_IFDIR
|0755);
467 i
->inode
.uid
= g_conf()->mds_root_ino_uid
;
468 i
->inode
.gid
= g_conf()->mds_root_ino_gid
;
469 i
->inode
.layout
= default_file_layout
;
470 i
->inode
.layout
.pool_id
= mds
->mdsmap
->get_first_data_pool();
// Bootstrap an empty filesystem: create the root inode and its (empty)
// dirfrag, claim subtree authority for this rank, sanity-check that the
// fresh fragstat/rstat are self-consistent, then mark the dirfrag and root
// inode dirty and commit/flush them via the gather.
474 void MDCache::create_empty_hierarchy(MDSGather
*gather
)
477 CInode
*root
= create_root_inode();
479 // force empty root dir
480 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
481 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
482 rootdir
->dir_rep
= CDir::REP_ALL
; //NONE;
484 ceph_assert(rootdir
->fnode
.accounted_fragstat
== rootdir
->fnode
.fragstat
);
485 ceph_assert(rootdir
->fnode
.fragstat
== root
->inode
.dirstat
);
486 ceph_assert(rootdir
->fnode
.accounted_rstat
== rootdir
->fnode
.rstat
);
487 /* Do not update rootdir rstat information of the fragment, rstat upkeep magic
488 * assumes version 0 is stale/invalid.
491 rootdir
->mark_complete();
492 rootdir
->mark_dirty(rootdir
->pre_dirty(), mds
->mdlog
->get_current_segment());
493 rootdir
->commit(0, gather
->new_sub());
496 root
->mark_dirty(root
->pre_dirty(), mds
->mdlog
->get_current_segment());
497 root
->mark_dirty_parent(mds
->mdlog
->get_current_segment(), true);
498 root
->flush(gather
->new_sub());
// Bootstrap this rank's private ~mds<rank> directory: create the mdsdir
// inode and dirfrag, take subtree authority, then create the NUM_STRAY
// "strayN" subdirectories, linking each as a primary dentry, accumulating
// their stats into mydir's fragstat/rstat, and committing/storing each via
// the gather. Finishes by reconciling mydir's accounted stats and myin's
// dirstat/rstat, then committing mydir and storing the mdsdir inode itself.
501 void MDCache::create_mydir_hierarchy(MDSGather
*gather
)
504 CInode
*my
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
);
506 CDir
*mydir
= my
->get_or_open_dirfrag(this, frag_t());
507 adjust_subtree_auth(mydir
, mds
->get_nodeid());
509 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
512 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
513 CInode
*stray
= create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
);
514 CDir
*straydir
= stray
->get_or_open_dirfrag(this, frag_t());
516 name
<< "stray" << i
;
517 CDentry
*sdn
= mydir
->add_primary_dentry(name
.str(), stray
);
518 sdn
->_mark_dirty(mds
->mdlog
->get_current_segment());
520 stray
->inode
.dirstat
= straydir
->fnode
.fragstat
;
522 mydir
->fnode
.rstat
.add(stray
->inode
.rstat
);
523 mydir
->fnode
.fragstat
.nsubdirs
++;
525 straydir
->mark_complete();
526 straydir
->mark_dirty(straydir
->pre_dirty(), ls
);
527 straydir
->commit(0, gather
->new_sub());
528 stray
->mark_dirty_parent(ls
, true);
529 stray
->store_backtrace(gather
->new_sub());
532 mydir
->fnode
.accounted_fragstat
= mydir
->fnode
.fragstat
;
533 mydir
->fnode
.accounted_rstat
= mydir
->fnode
.rstat
;
535 myin
->inode
.dirstat
= mydir
->fnode
.fragstat
;
536 myin
->inode
.rstat
= mydir
->fnode
.rstat
;
537 ++myin
->inode
.rstat
.rsubdirs
;
538 myin
->inode
.accounted_rstat
= myin
->inode
.rstat
;
540 mydir
->mark_complete();
541 mydir
->mark_dirty(mydir
->pre_dirty(), ls
);
542 mydir
->commit(0, gather
->new_sub());
544 myin
->store(gather
->new_sub());
// Journal-commit callback for _create_system_file(): once the EUpdate is
// durable, hand the mutation, dentry, projected dentry version and the
// caller's completion back to _create_system_file_finish().
547 struct C_MDC_CreateSystemFile
: public MDCacheLogContext
{
552 C_MDC_CreateSystemFile(MDCache
*c
, MutationRef
& mu
, CDentry
*d
, version_t v
, MDSContext
*f
) :
553 MDCacheLogContext(c
), mut(mu
), dn(d
), dpv(v
), fin(f
) {}
554 void finish(int r
) override
{
555 mdcache
->_create_system_file_finish(mut
, dn
, dpv
, fin
);
// Create a system file/dir `name` inside `dir`, linking inode `in` and
// journaling the creation: project the dentry linkage, seed rstat, pick the
// snapid range from the parent realm, force-wrlock the parent's file/nest
// locks, then journal either a primary dentry (normal case) or a remote
// dentry plus a root entry (for an mdsdir, which is its own subtree root).
// The EUpdate is submitted with a C_MDC_CreateSystemFile completion that
// finalizes the projection and ultimately runs `fin`.
559 void MDCache::_create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
)
561 dout(10) << "_create_system_file " << name
<< " in " << *dir
<< dendl
;
562 CDentry
*dn
= dir
->add_null_dentry(name
);
564 dn
->push_projected_linkage(in
);
565 version_t dpv
= dn
->pre_dirty();
568 if (in
->inode
.is_dir()) {
569 in
->inode
.rstat
.rsubdirs
= 1;
571 mdir
= in
->get_or_open_dirfrag(this, frag_t());
572 mdir
->mark_complete();
575 in
->inode
.rstat
.rfiles
= 1;
576 in
->inode
.version
= dn
->pre_dirty();
578 SnapRealm
*realm
= dir
->get_inode()->find_snaprealm();
579 dn
->first
= in
->first
= realm
->get_newest_seq() + 1;
581 MutationRef
mut(new MutationImpl());
583 // force some locks. hacky.
584 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, mut
);
585 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, mut
);
587 mut
->ls
= mds
->mdlog
->get_current_segment();
588 EUpdate
*le
= new EUpdate(mds
->mdlog
, "create system file");
589 mds
->mdlog
->start_entry(le
);
591 if (!in
->is_mdsdir()) {
592 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
593 le
->metablob
.add_primary_dentry(dn
, in
, true);
595 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_DIR
, 1);
596 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
597 dn
->push_projected_linkage(in
->ino(), in
->d_type());
598 le
->metablob
.add_remote_dentry(dn
, true, in
->ino(), in
->d_type());
599 le
->metablob
.add_root(true, in
);
602 le
->metablob
.add_new_dir(mdir
); // dirty AND complete AND new
604 mds
->mdlog
->submit_entry(le
, new C_MDC_CreateSystemFile(this, mut
, dn
, dpv
, fin
));
// Journal-commit continuation for _create_system_file(): materialize the
// projected dentry linkage, mark dentry/inode (and a directory's fresh
// dirfrag) dirty in the mutation's log segment, then drop the forced locks.
// The elided tail presumably completes `fin` -- confirm against full source.
608 void MDCache::_create_system_file_finish(MutationRef
& mut
, CDentry
*dn
, version_t dpv
, MDSContext
*fin
)
610 dout(10) << "_create_system_file_finish " << *dn
<< dendl
;
612 dn
->pop_projected_linkage();
613 dn
->mark_dirty(dpv
, mut
->ls
);
615 CInode
*in
= dn
->get_linkage()->get_inode();
617 in
->mark_dirty(in
->inode
.version
+ 1, mut
->ls
);
619 if (in
->inode
.is_dir()) {
620 CDir
*dir
= in
->get_dirfrag(frag_t());
622 dir
->mark_dirty(1, mut
->ls
);
623 dir
->mark_new(mut
->ls
);
627 mds
->locker
->drop_locks(mut
.get());
632 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
633 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
// Retry context for the open_root()/populate_mydir() state machine: on
// success it re-enters the open path; on failure it marks the rank damaged
// (suicide() is unsafe inside a Finisher callback) and aborts.
638 struct C_MDS_RetryOpenRoot
: public MDSInternalContext
{
640 explicit C_MDS_RetryOpenRoot(MDCache
*c
) : MDSInternalContext(c
->mds
), cache(c
) {}
641 void finish(int r
) override
{
643 // If we can't open root, something disastrous has happened: mark
644 // this rank damaged for operator intervention. Note that
645 // it is not okay to call suicide() here because we are in
646 // a Finisher callback.
647 cache
->mds
->damaged();
648 ceph_abort(); // damaged should never return
// Load the root inode: if this rank is the root's auth, create a placeholder
// system inode (marked "initially inaccurate" until fetched); otherwise
// discover it from the auth rank.
655 void MDCache::open_root_inode(MDSContext
*c
)
657 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
659 in
= create_system_inode(MDS_INO_ROOT
, S_IFDIR
|0755); // initially inaccurate!
662 discover_base_ino(MDS_INO_ROOT
, c
, mds
->mdsmap
->get_root());
// Load this rank's ~mds<rank> inode by creating a placeholder system inode
// (initially inaccurate; the elided tail presumably fetches it, completing
// `c` -- confirm against the full source).
666 void MDCache::open_mydir_inode(MDSContext
*c
)
668 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
// Open the ~mds<rank> dirfrag: wraps a lambda in an MDSInternalContextWrapper
// that, once the mydir inode is available, opens its root fragment and takes
// subtree authority for this rank.
672 void MDCache::open_mydir_frag(MDSContext
*c
)
675 new MDSInternalContextWrapper(mds
,
676 new LambdaContext([this, c
](int r
) {
681 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
683 adjust_subtree_auth(mydir
, mds
->get_nodeid());
// Multi-pass state machine to bring root and ~mds<rank> fully into cache,
// re-entered via C_MDS_RetryOpenRoot after each asynchronous step: open the
// root inode, then (as auth) open/claim/fetch the root dirfrag, or (as
// non-auth) open the remote dirfrag; then fetch the mdsdir inode and claim
// its dirfrag's subtree authority.
690 void MDCache::open_root()
692 dout(10) << "open_root" << dendl
;
695 open_root_inode(new C_MDS_RetryOpenRoot(this));
698 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
699 ceph_assert(root
->is_auth());
700 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
701 ceph_assert(rootdir
);
702 if (!rootdir
->is_subtree_root())
703 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
704 if (!rootdir
->is_complete()) {
705 rootdir
->fetch(new C_MDS_RetryOpenRoot(this));
709 ceph_assert(!root
->is_auth());
710 CDir
*rootdir
= root
->get_dirfrag(frag_t());
712 open_remote_dirfrag(root
, frag_t(), new C_MDS_RetryOpenRoot(this));
718 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
719 in
->fetch(new C_MDS_RetryOpenRoot(this));
722 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
724 adjust_subtree_auth(mydir
, mds
->get_nodeid());
// Multi-pass population of ~mds<rank> (re-entered via C_MDS_RetryOpenRoot
// after each async fetch/create): ensure mydir is complete, recreate it if
// the fragment was unreadable (BADFRAG), then open-or-create each of the
// NUM_STRAY stray directories (accepting legacy "stray" for index 0),
// pinning each stray exactly once and fetching any unloaded stray dirfrags.
// When everything is in cache, wake the open waiters and hand the stray
// count to the StrayManager.
729 void MDCache::populate_mydir()
732 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
735 dout(10) << "populate_mydir " << *mydir
<< dendl
;
737 if (!mydir
->is_complete()) {
738 mydir
->fetch(new C_MDS_RetryOpenRoot(this));
742 if (mydir
->get_version() == 0 && mydir
->state_test(CDir::STATE_BADFRAG
)) {
743 // A missing dirfrag, we will recreate it. Before that, we must dirty
744 // it before dirtying any of the strays we create within it.
745 mds
->clog
->warn() << "fragment " << mydir
->dirfrag() << " was unreadable, "
747 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
748 mydir
->state_clear(CDir::STATE_BADFRAG
);
749 mydir
->mark_complete();
750 mydir
->mark_dirty(mydir
->pre_dirty(), ls
);
753 // open or create stray
754 uint64_t num_strays
= 0;
755 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
757 name
<< "stray" << i
;
758 CDentry
*straydn
= mydir
->lookup(name
.str());
760 // allow for older fs's with stray instead of stray0
761 if (straydn
== NULL
&& i
== 0)
762 straydn
= mydir
->lookup("stray");
764 if (!straydn
|| !straydn
->get_linkage()->get_inode()) {
765 _create_system_file(mydir
, name
.str().c_str(), create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
),
766 new C_MDS_RetryOpenRoot(this));
769 ceph_assert(straydn
);
770 ceph_assert(strays
[i
]);
771 // we make multiple passes through this method; make sure we only pin each stray once.
772 if (!strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
773 strays
[i
]->get(CInode::PIN_STRAY
);
774 strays
[i
]->state_set(CInode::STATE_STRAYPINNED
);
775 strays
[i
]->get_stickydirs();
777 dout(20) << " stray num " << i
<< " is " << *strays
[i
] << dendl
;
781 strays
[i
]->dirfragtree
.get_leaves(leaves
);
782 for (const auto& leaf
: leaves
) {
783 CDir
*dir
= strays
[i
]->get_dirfrag(leaf
);
785 dir
= strays
[i
]->get_or_open_dirfrag(this, leaf
);
788 // DamageTable applies special handling to strays: it will
789 // have damaged() us out if one is damaged.
790 ceph_assert(!dir
->state_test(CDir::STATE_BADFRAG
));
792 if (dir
->get_version() == 0) {
793 dir
->fetch(new C_MDS_RetryOpenRoot(this));
797 if (dir
->get_frag_size() > 0)
798 num_strays
+= dir
->get_frag_size();
803 dout(10) << "populate_mydir done" << dendl
;
806 mds
->queue_waiters(waiting_for_open
);
808 stray_manager
.set_num_strays(num_strays
);
809 stray_manager
.activate();
// Discover another rank's ~mds<rank> inode; the owner rank is recovered from
// the low bits of the mdsdir ino (MAX_MDS is a power of two, so the mask
// extracts the rank offset).
814 void MDCache::open_foreign_mdsdir(inodeno_t ino
, MDSContext
*fin
)
816 discover_base_ino(ino
, fin
, mds_rank_t(ino
& (MAX_MDS
-1)));
// Pick the stray dirfrag that would hold `in`: derive its stray dentry name,
// select a stray inode, hash the name to a fragment, and assert the dirfrag
// exists (the elided tail presumably returns it -- confirm against source).
819 CDir
*MDCache::get_stray_dir(CInode
*in
)
822 in
->name_stray_dentry(straydname
);
824 CInode
*strayi
= get_stray();
826 frag_t fg
= strayi
->pick_dirfrag(straydname
);
827 CDir
*straydir
= strayi
->get_dirfrag(fg
);
828 ceph_assert(straydir
);
// Find (or create as a null dentry) the stray dentry for `in` in its stray
// dirfrag, assert the projected linkage is still null, and flag the dentry
// with STATE_STRAY.
832 CDentry
*MDCache::get_or_create_stray_dentry(CInode
*in
)
834 CDir
*straydir
= get_stray_dir(in
);
836 in
->name_stray_dentry(straydname
);
837 CDentry
*straydn
= straydir
->lookup(straydname
);
839 straydn
= straydir
->add_null_dentry(straydname
);
842 ceph_assert(straydn
->get_projected_linkage()->is_null());
845 straydn
->state_set(CDentry::STATE_STRAY
);
// Resolve an MDSCacheObjectInfo to the live cache object it names: an inode
// (by ino+snapid), else the dirfrag, else -- when a dentry name is present --
// the dentry looked up within that dirfrag.
851 MDSCacheObject
*MDCache::get_object(const MDSCacheObjectInfo
&info
)
855 return get_inode(info
.ino
, info
.snapid
);
858 CDir
*dir
= get_dirfrag(info
.dirfrag
);
861 if (info
.dname
.length())
862 return dir
->lookup(info
.dname
, info
.snapid
);
868 // ====================================================================
869 // consistent hash ring
872 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
874 mds_rank_t
MDCache::hash_into_rank_bucket(inodeno_t ino
)
876 const mds_rank_t max_mds
= mds
->mdsmap
->get_max_mds();
877 uint64_t hash
= rjhash64(ino
);
878 int64_t b
= -1, j
= 0;
879 while (j
< max_mds
) {
881 hash
= hash
*2862933555777941757ULL + 1;
882 j
= (b
+ 1) * (double(1LL << 31) / double((hash
>> 33) + 1));
884 // verify bounds before returning
885 auto result
= mds_rank_t(b
);
886 ceph_assert(result
>= 0 && result
< max_mds
);
891 // ====================================================================
892 // subtree management
895 * adjust the dir_auth of a subtree.
896 * merge with parent and/or child subtrees, if it is appropriate.
897 * merge can ONLY happen if both parent and child have unambiguous auth.
// Set the dir_auth of `dir`, making it a subtree root if it is not one yet:
// locate the enclosing subtree root, install the new authority, re-parent
// any sibling bounds that now nest beneath `dir`, and record `dir` as a
// bound of its parent subtree. When requested (and auth), subtract `dir`'s
// recursive popularity from its ancestors up to the subtree root, and give
// a newly-auth inode the chance to pin itself.
899 void MDCache::adjust_subtree_auth(CDir
*dir
, mds_authority_t auth
, bool adjust_pop
)
901 dout(7) << "adjust_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
902 << " on " << *dir
<< dendl
;
907 if (dir
->inode
->is_base()) {
908 root
= dir
; // bootstrap hack.
909 if (subtrees
.count(root
) == 0) {
911 root
->get(CDir::PIN_SUBTREE
);
914 root
= get_subtree_root(dir
); // subtree root
917 ceph_assert(subtrees
.count(root
));
918 dout(7) << " current root is " << *root
<< dendl
;
921 // i am already a subtree.
922 dir
->set_dir_auth(auth
);
924 // i am a new subtree.
925 dout(10) << " new subtree at " << *dir
<< dendl
;
926 ceph_assert(subtrees
.count(dir
) == 0);
927 subtrees
[dir
]; // create empty subtree bounds list for me.
928 dir
->get(CDir::PIN_SUBTREE
);
931 dir
->set_dir_auth(auth
);
933 // move items nested beneath me, under me.
934 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
935 while (p
!= subtrees
[root
].end()) {
936 set
<CDir
*>::iterator next
= p
;
938 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
940 dout(10) << " claiming child bound " << **p
<< dendl
;
941 subtrees
[dir
].insert(*p
);
942 subtrees
[root
].erase(p
);
947 // i am a bound of the parent subtree.
948 subtrees
[root
].insert(dir
);
950 // i am now the subtree root.
953 // adjust recursive pop counters
954 if (adjust_pop
&& dir
->is_auth()) {
955 CDir
*p
= dir
->get_parent_dir();
957 p
->pop_auth_subtree
.sub(dir
->pop_auth_subtree
);
958 if (p
->is_subtree_root()) break;
959 p
= p
->inode
->get_parent_dir();
964 if (dir
->is_auth()) {
965 /* do this now that we are auth for the CDir */
966 dir
->inode
->maybe_pin();
// Attempt to merge subtree `dir` with its parent, then retry the merge at
// each of its former bounds (captured before the first merge can mutate the
// bounds set). Any affected auth subtree-root inodes are re-evaluated
// afterwards, except during replay/resolve when lock state is not ready.
973 void MDCache::try_subtree_merge(CDir
*dir
)
975 dout(7) << "try_subtree_merge " << *dir
<< dendl
;
976 // record my old bounds
977 auto oldbounds
= subtrees
.at(dir
);
979 set
<CInode
*> to_eval
;
980 // try merge at my root
981 try_subtree_merge_at(dir
, &to_eval
);
983 // try merge at my old bounds
984 for (auto bound
: oldbounds
)
985 try_subtree_merge_at(bound
, &to_eval
);
987 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
988 for(auto in
: to_eval
)
989 eval_subtree_root(in
);
// Journal writeback callback used by subtree merging: once the log entry is
// durable, complete the writebehind for the merged subtree's inode.
993 class C_MDC_SubtreeMergeWB
: public MDCacheLogContext
{
997 C_MDC_SubtreeMergeWB(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) : MDCacheLogContext(mdc
), in(i
), mut(m
) {}
998 void finish(int r
) override
{
999 mdcache
->subtree_merge_writebehind_finish(in
, mut
);
// Merge the subtree rooted at `dir` into its parent subtree when their
// authorities match. Ineligible cases bail out early: ambiguous auth
// (second != CDIR_AUTH_UNKNOWN is the ambiguity marker here), export
// bounds, and aux subtrees. On merge: reset dir_auth to default, move our
// bounds under the parent, drop the subtree pin and bound registration,
// and (optionally, when auth) fold our popularity back into the ancestors
// up to the parent subtree root. Auth subtree-root inodes are collected in
// `to_eval` for later lock re-evaluation.
// NOTE(review): the `cur` referenced in the pop_lru_subdirs line is declared
// on a line elided from this view -- confirm against the full source.
1003 void MDCache::try_subtree_merge_at(CDir
*dir
, set
<CInode
*> *to_eval
, bool adjust_pop
)
1005 dout(10) << "try_subtree_merge_at " << *dir
<< dendl
;
1007 if (dir
->dir_auth
.second
!= CDIR_AUTH_UNKNOWN
||
1008 dir
->state_test(CDir::STATE_EXPORTBOUND
) ||
1009 dir
->state_test(CDir::STATE_AUXSUBTREE
))
1012 auto it
= subtrees
.find(dir
);
1013 ceph_assert(it
!= subtrees
.end());
1015 // merge with parent?
1017 if (!dir
->inode
->is_base())
1018 parent
= get_subtree_root(dir
->get_parent_dir());
1020 if (parent
!= dir
&& // we have a parent,
1021 parent
->dir_auth
== dir
->dir_auth
) { // auth matches,
1022 // merge with parent.
1023 dout(10) << " subtree merge at " << *dir
<< dendl
;
1024 dir
->set_dir_auth(CDIR_AUTH_DEFAULT
);
1026 // move our bounds under the parent
1027 subtrees
[parent
].insert(it
->second
.begin(), it
->second
.end());
1029 // we are no longer a subtree or bound
1030 dir
->put(CDir::PIN_SUBTREE
);
1032 subtrees
[parent
].erase(dir
);
1034 // adjust popularity?
1035 if (adjust_pop
&& dir
->is_auth()) {
1037 CDir
*p
= dir
->get_parent_dir();
1039 p
->pop_auth_subtree
.add(dir
->pop_auth_subtree
);
1040 p
->pop_lru_subdirs
.push_front(&cur
->get_inode()->item_pop_lru
);
1041 if (p
->is_subtree_root()) break;
1043 p
= p
->inode
->get_parent_dir();
1047 if (to_eval
&& dir
->get_inode()->is_auth())
1048 to_eval
->insert(dir
->get_inode());
// Completion for C_MDC_SubtreeMergeWB: commit the projected inode into the
// mutation's log segment, release the mutation's locks, and drop the auth
// pin taken for the writebehind.
1054 void MDCache::subtree_merge_writebehind_finish(CInode
*in
, MutationRef
& mut
)
1056 dout(10) << "subtree_merge_writebehind_finish on " << in
<< dendl
;
1057 in
->pop_and_dirty_projected_inode(mut
->ls
);
1060 mds
->locker
->drop_locks(mut
.get());
1063 in
->auth_unpin(this);
1066 void MDCache::eval_subtree_root(CInode
*diri
)
1068 // evaluate subtree inode filelock?
1069 // (we should scatter the filelock on subtree bounds)
1070 ceph_assert(diri
->is_auth());
1071 mds
->locker
->try_eval(diri
, CEPH_LOCK_IFILE
| CEPH_LOCK_INEST
);
// Like adjust_subtree_auth(), but additionally forces the subtree's bound
// set to exactly match `bounds`: install the new authority at `dir`
// (creating the subtree if needed and re-parenting nested bounds), then for
// each requested bound either restore the old auth at a brand-new bound,
// create it beneath an intermediate subtree (swallowing any ambiguous
// intervening subtrees into `auth`), or accept it if already present.
// Finally, extra bounds not in the requested set are swallowed into this
// subtree until the bound sets converge, and the result is verified.
// Affected auth subtree-root inodes are re-evaluated unless in
// replay/resolve.
1075 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
)
1077 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1079 << " bounds " << bounds
1085 if (dir
->ino() == MDS_INO_ROOT
) {
1086 root
= dir
; // bootstrap hack.
1087 if (subtrees
.count(root
) == 0) {
1089 root
->get(CDir::PIN_SUBTREE
);
1092 root
= get_subtree_root(dir
); // subtree root
1095 ceph_assert(subtrees
.count(root
));
1096 dout(7) << " current root is " << *root
<< dendl
;
1098 mds_authority_t oldauth
= dir
->authority();
1101 // i am already a subtree.
1102 dir
->set_dir_auth(auth
);
1104 // i am a new subtree.
1105 dout(10) << " new subtree at " << *dir
<< dendl
;
1106 ceph_assert(subtrees
.count(dir
) == 0);
1107 subtrees
[dir
]; // create empty subtree bounds list for me.
1108 dir
->get(CDir::PIN_SUBTREE
);
1111 dir
->set_dir_auth(auth
);
1113 // move items nested beneath me, under me.
1114 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1115 while (p
!= subtrees
[root
].end()) {
1116 set
<CDir
*>::iterator next
= p
;
1118 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
1120 dout(10) << " claiming child bound " << **p
<< dendl
;
1121 subtrees
[dir
].insert(*p
);
1122 subtrees
[root
].erase(p
);
1127 // i am a bound of the parent subtree.
1128 subtrees
[root
].insert(dir
);
1130 // i am now the subtree root.
1134 set
<CInode
*> to_eval
;
1136 // verify/adjust bounds.
1137 // - these may be new, or
1138 // - beneath existing ambiguous bounds (which will be collapsed),
1139 // - but NOT beneath unambiguous bounds.
1140 for (const auto& bound
: bounds
) {
1142 if (subtrees
[dir
].count(bound
) == 0) {
1143 if (get_subtree_root(bound
) == dir
) {
1144 dout(10) << " new bound " << *bound
<< ", adjusting auth back to old " << oldauth
<< dendl
;
1145 adjust_subtree_auth(bound
, oldauth
); // otherwise, adjust at bound.
1148 dout(10) << " want bound " << *bound
<< dendl
;
1149 CDir
*t
= get_subtree_root(bound
->get_parent_dir());
1150 if (subtrees
[t
].count(bound
) == 0) {
1151 ceph_assert(t
!= dir
);
1152 dout(10) << " new bound " << *bound
<< dendl
;
1153 adjust_subtree_auth(bound
, t
->authority());
1155 // make sure it's nested beneath ambiguous subtree(s)
1157 while (subtrees
[dir
].count(t
) == 0)
1158 t
= get_subtree_root(t
->get_parent_dir());
1159 dout(10) << " swallowing intervening subtree at " << *t
<< dendl
;
1160 adjust_subtree_auth(t
, auth
);
1161 try_subtree_merge_at(t
, &to_eval
);
1162 t
= get_subtree_root(bound
->get_parent_dir());
1163 if (t
== dir
) break;
1168 dout(10) << " already have bound " << *bound
<< dendl
;
1171 // merge stray bounds?
1172 while (!subtrees
[dir
].empty()) {
1173 set
<CDir
*> copy
= subtrees
[dir
];
1174 for (set
<CDir
*>::iterator p
= copy
.begin(); p
!= copy
.end(); ++p
) {
1175 if (bounds
.count(*p
) == 0) {
1177 dout(10) << " swallowing extra subtree at " << *stray
<< dendl
;
1178 adjust_subtree_auth(stray
, auth
);
1179 try_subtree_merge_at(stray
, &to_eval
);
1182 // swallowing subtree may add new subtree bounds
1183 if (copy
== subtrees
[dir
])
1187 // bound should now match.
1188 verify_subtree_bounds(dir
, bounds
);
1192 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
1193 for(auto in
: to_eval
)
1194 eval_subtree_root(in
);
1200 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1201 * fragmentation as necessary to get an equivalent bounding set. That is, only
1202 * split if one of our frags spans the provided bounding set. Never merge.
1204 void MDCache::get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
)
1206 dout(10) << "get_force_dirfrag_bound_set " << dfs
<< dendl
;
1209 map
<inodeno_t
, fragset_t
> byino
;
1210 for (auto& frag
: dfs
) {
1211 byino
[frag
.ino
].insert_raw(frag
.frag
);
1213 dout(10) << " by ino: " << byino
<< dendl
;
1215 for (map
<inodeno_t
,fragset_t
>::iterator p
= byino
.begin(); p
!= byino
.end(); ++p
) {
1216 p
->second
.simplify();
1217 CInode
*diri
= get_inode(p
->first
);
1220 dout(10) << " checking fragset " << p
->second
.get() << " on " << *diri
<< dendl
;
1223 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
1224 tmpdft
.force_to_leaf(g_ceph_context
, *q
);
1226 for (const auto& fg
: p
->second
) {
1228 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1229 if (leaves
.empty()) {
1231 frag_t approx_fg
= diri
->dirfragtree
[fg
.value()];
1232 frag_vec_t approx_leaves
;
1233 tmpdft
.get_leaves_under(approx_fg
, approx_leaves
);
1234 for (const auto& leaf
: approx_leaves
) {
1235 if (p
->second
.get().count(leaf
) == 0) {
1236 // not bound, so the resolve message is from auth MDS of the dirfrag
1237 force_dir_fragment(diri
, leaf
);
1242 leaves
.push_back(approx_fg
);
1244 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1246 dout(10) << " frag " << fg
<< " contains " << leaves
<< dendl
;
1247 for (const auto& leaf
: leaves
) {
1248 CDir
*dir
= diri
->get_dirfrag(leaf
);
// Convenience overload: resolve dirfrag_t bounds to CDir*'s via
// get_force_dirfrag_bound_set(), then delegate to the CDir*-set overload.
// NOTE(review): the declaration of the local 'bounds' set is on an elided
// line in this extract.
1256 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bound_dfs
, const mds_authority_t
&auth
)
1258 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1259 << " on " << *dir
<< " bound_dfs " << bound_dfs
<< dendl
;
1262 get_force_dirfrag_bound_set(bound_dfs
, bounds
);
1263 adjust_bounded_subtree_auth(dir
, bounds
, auth
);
// Map a list of dirfrag_t's onto the CDir*'s currently in cache.  Frags are
// grouped per inode and simplified first; insertion of each found CDir into
// 'result' presumably happens on an elided line — TODO confirm.
// NOTE(review): this extract has lines elided (braces, the 'fgs' declaration,
// loop increments); comments describe only the visible code.
1266 void MDCache::map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
)
1268 dout(10) << "map_dirfrag_set " << dfs
<< dendl
;
// Group requested frags by inode; insert_raw defers normalization to
// simplify() below.
1271 map
<inodeno_t
, fragset_t
> ino_fragset
;
1272 for (const auto &df
: dfs
) {
1273 ino_fragset
[df
.ino
].insert_raw(df
.frag
);
1276 for (map
<inodeno_t
, fragset_t
>::iterator p
= ino_fragset
.begin();
1277 p
!= ino_fragset
.end();
1279 p
->second
.simplify();
1280 CInode
*in
= get_inode(p
->first
);
// Expand each requested frag to the leaves actually present in the inode's
// fragtree ('fgs' is declared on an elided line).
1285 for (const auto& fg
: p
->second
) {
1286 in
->dirfragtree
.get_leaves_under(fg
, fgs
);
1289 dout(15) << "map_dirfrag_set " << p
->second
<< " -> " << fgs
1290 << " on " << *in
<< dendl
;
1292 for (const auto& fg
: fgs
) {
1293 CDir
*dir
= in
->get_dirfrag(fg
);
// Walk up the parent chain from 'dir' until a dirfrag flagged as a subtree
// root is found.
// NOTE(review): the enclosing loop and return statements are on lines elided
// from this extract.
1302 CDir
*MDCache::get_subtree_root(CDir
*dir
)
1304 // find the underlying dir that delegates (or is about to delegate) auth
1306 if (dir
->is_subtree_root())
1308 dir
= dir
->get_inode()->get_parent_dir();
// Same as get_subtree_root(), but walks the *projected* parent chain so that
// in-flight (not yet applied) renames are taken into account.
// NOTE(review): the enclosing loop and return statements are on lines elided
// from this extract.
1314 CDir
*MDCache::get_projected_subtree_root(CDir
*dir
)
1316 // find the underlying dir that delegates (or is about to delegate) auth
1318 if (dir
->is_subtree_root())
1320 dir
= dir
->get_inode()->get_projected_parent_dir();
// Remove 'dir' from the subtree map: drop its PIN_SUBTREE reference and, if
// it has a parent, erase it from its parent subtree's bound set (asserting it
// was present exactly once).
// NOTE(review): some lines (e.g. the erase of the subtrees entry itself and
// closing braces) are elided from this extract.
1326 void MDCache::remove_subtree(CDir
*dir
)
1328 dout(10) << "remove_subtree " << *dir
<< dendl
;
1329 auto it
= subtrees
.find(dir
);
1330 ceph_assert(it
!= subtrees
.end());
// Drop the subtree pin taken when this dirfrag became a subtree root.
1332 dir
->put(CDir::PIN_SUBTREE
);
1333 if (dir
->get_parent_dir()) {
// Unregister 'dir' as a bound of the enclosing subtree.  The inner 'it'
// intentionally shadows the outer one.
1334 CDir
*p
= get_subtree_root(dir
->get_parent_dir());
1335 auto it
= subtrees
.find(p
);
1336 ceph_assert(it
!= subtrees
.end());
1337 auto count
= it
->second
.erase(dir
);
1338 ceph_assert(count
== 1);
// Copy the bound set of subtree root 'dir' into 'bounds'.  'dir' must be a
// registered subtree root (asserted).
1342 void MDCache::get_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1344 ceph_assert(subtrees
.count(dir
));
1345 bounds
= subtrees
[dir
];
// Compute the bounds 'dir' *would* have if it were a subtree root: if it
// already is one, copy its bounds; otherwise scan the enclosing subtree's
// bounds and (on elided lines) keep those that fall beneath 'dir' by walking
// each bound's ancestry ('t') upward.
// NOTE(review): the loop body that declares 't' and tests ancestry against
// 'dir' is partially elided from this extract.
1348 void MDCache::get_wouldbe_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1350 if (subtrees
.count(dir
)) {
1351 // just copy them, dir is a subtree.
1352 get_subtree_bounds(dir
, bounds
);
// Not a subtree root: inspect the bounds of the subtree containing 'dir'.
1355 CDir
*root
= get_subtree_root(dir
);
1356 for (set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1357 p
!= subtrees
[root
].end();
1361 t
= t
->get_parent_dir();
// Debug-only check that the recorded bound set for subtree root 'dir'
// matches the expected 'bounds'; on mismatch, log the missing/extra bounds
// and then assert.
// NOTE(review): the erase of matched entries from the working copy 'b' is on
// an elided line in this extract.
1372 void MDCache::verify_subtree_bounds(CDir
*dir
, const set
<CDir
*>& bounds
)
1374 // for debugging only.
1375 ceph_assert(subtrees
.count(dir
));
1376 if (bounds
!= subtrees
[dir
]) {
1377 dout(0) << "verify_subtree_bounds failed" << dendl
;
// 'b' starts as the expected set; recorded bounds found in it are reported
// or removed so that what remains is the "extra" expected bounds.
1378 set
<CDir
*> b
= bounds
;
1379 for (auto &cd
: subtrees
[dir
]) {
1380 if (bounds
.count(cd
)) {
1384 dout(0) << " missing bound " << *cd
<< dendl
;
1386 for (const auto &cd
: b
)
1387 dout(0) << " extra bound " << *cd
<< dendl
;
1389 ceph_assert(bounds
== subtrees
[dir
]);
// Debug-only overload taking dirfrag_t bounds: for each bound dirfrag that is
// in cache, verify it is recorded as a bound of subtree root 'dir'.  A
// 'failed' counter (incremented on elided lines) must end at zero.
1392 void MDCache::verify_subtree_bounds(CDir
*dir
, const list
<dirfrag_t
>& bounds
)
1394 // for debugging only.
1395 ceph_assert(subtrees
.count(dir
));
1397 // make sure that any bounds i do have are properly noted as such.
1399 for (const auto &fg
: bounds
) {
1400 CDir
*bd
= get_dirfrag(fg
);
1402 if (subtrees
[dir
].count(bd
) == 0) {
1403 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd
<< dendl
;
1407 ceph_assert(failed
== 0);
// Record a pending (projected) rename of 'diri' from 'olddir' to 'newdir'.
// The entry is queued per-inode and consumed in FIFO order by
// adjust_subtree_after_rename() once the rename is applied.
1410 void MDCache::project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
)
1412 dout(10) << "project_subtree_rename " << *diri
<< " from " << *olddir
1413 << " to " << *newdir
<< dendl
;
1414 projected_subtree_renames
[diri
].push_back(pair
<CDir
*,CDir
*>(olddir
, newdir
));
// Fix up the subtree map after a rename of 'diri' is applied: pop and check
// the matching entry queued by project_subtree_rename(), then re-home each of
// diri's dirfrags from the old enclosing subtree to the new one, moving
// bounds and re-evaluating auth as needed.
// NOTE(review): this extract has lines elided (braces, some else branches);
// comments describe only the visible code.
1417 void MDCache::adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
)
1419 dout(10) << "adjust_subtree_after_rename " << *diri
<< " from " << *olddir
<< dendl
;
1421 CDir
*newdir
= diri
->get_parent_dir();
// Consume the oldest projected-rename record for this inode and verify it
// matches the rename that actually happened.
1424 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > >::iterator p
= projected_subtree_renames
.find(diri
);
1425 ceph_assert(p
!= projected_subtree_renames
.end());
1426 ceph_assert(!p
->second
.empty());
1427 ceph_assert(p
->second
.front().first
== olddir
);
1428 ceph_assert(p
->second
.front().second
== newdir
);
1429 p
->second
.pop_front();
1430 if (p
->second
.empty())
1431 projected_subtree_renames
.erase(p
);
1434 // adjust total auth pin of freezing subtree
1435 if (olddir
!= newdir
) {
1436 auto&& dfls
= diri
->get_nested_dirfrags();
1437 for (const auto& dir
: dfls
)
1438 olddir
->adjust_freeze_after_rename(dir
);
1442 // N.B. make sure subtree dirfrags are at the front of the list
1443 auto dfls
= diri
->get_subtree_dirfrags();
1444 diri
->get_nested_dirfrags(dfls
);
1445 for (const auto& dir
: dfls
) {
1446 dout(10) << "dirfrag " << *dir
<< dendl
;
// Determine which subtree this dirfrag belonged to before the rename and
// which one it belongs to now.
1447 CDir
*oldparent
= get_subtree_root(olddir
);
1448 dout(10) << " old parent " << *oldparent
<< dendl
;
1449 CDir
*newparent
= get_subtree_root(newdir
);
1450 dout(10) << " new parent " << *newparent
<< dendl
;
1452 auto& oldbounds
= subtrees
[oldparent
];
1453 auto& newbounds
= subtrees
[newparent
];
1455 if (olddir
!= newdir
)
1456 mds
->balancer
->adjust_pop_for_rename(olddir
, dir
, false);
1458 if (oldparent
== newparent
) {
1459 dout(10) << "parent unchanged for " << *dir
<< " at " << *oldparent
<< dendl
;
1460 } else if (dir
->is_subtree_root()) {
1461 // children are fine. change parent.
1462 dout(10) << "moving " << *dir
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1464 auto n
= oldbounds
.erase(dir
);
1465 ceph_assert(n
== 1);
1467 newbounds
.insert(dir
);
1468 // caller is responsible for 'eval diri'
1469 try_subtree_merge_at(dir
, NULL
, false);
1473 // see if any old bounds move to the new parent.
1474 std::vector
<CDir
*> tomove
;
1475 for (const auto& bound
: oldbounds
) {
1476 CDir
*broot
= get_subtree_root(bound
->get_parent_dir());
1477 if (broot
!= oldparent
) {
1478 ceph_assert(broot
== newparent
);
1479 tomove
.push_back(bound
);
// Moves are collected first and applied here to avoid mutating 'oldbounds'
// while iterating over it.
1482 for (const auto& bound
: tomove
) {
1483 dout(10) << "moving bound " << *bound
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1484 oldbounds
.erase(bound
);
1485 newbounds
.insert(bound
);
1489 if (oldparent
->authority() != newparent
->authority()) {
1490 adjust_subtree_auth(dir
, oldparent
->authority(), false);
1491 // caller is responsible for 'eval diri'
1492 try_subtree_merge_at(dir
, NULL
, false);
1496 if (olddir
!= newdir
)
1497 mds
->balancer
->adjust_pop_for_rename(newdir
, dir
, true);
1503 // ===================================
1504 // journal and snap/cow helpers
1508 * find first inode in cache that follows given snapid. otherwise, return current.
// Find the first snapped inode in cache (same ino) whose range follows the
// given snapid; the head inode is returned otherwise (on lines elided from
// this extract).  'in' must be the head inode (last == CEPH_NOSNAP).
1510 CInode
*MDCache::pick_inode_snap(CInode
*in
, snapid_t follows
)
1512 dout(10) << "pick_inode_snap follows " << follows
<< " on " << *in
<< dendl
;
1513 ceph_assert(in
->last
== CEPH_NOSNAP
);
// snap_inode_map is keyed by (ino, last); upper_bound finds the first
// snapped version strictly after 'follows'.
1515 auto p
= snap_inode_map
.upper_bound(vinodeno_t(in
->ino(), follows
));
1516 if (p
!= snap_inode_map
.end() && p
->second
->ino() == in
->ino()) {
1517 dout(10) << "pick_inode_snap found " << *p
->second
<< dendl
;
1526 * note: i'm currently cheating wrt dirty and inode.version on cow
1527 * items. instead of doing a full dir predirty, i just take the
1528 * original item's version, and set the dirty flag (via
1529 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1530 * means a special case in the dir commit clean sweep assertions.
// Clone 'in' into a new snapped CInode covering [in->first, last], copying
// the previous projected inode/xattrs/symlink, and set up snapflush state:
// clients holding writable caps with client_follows < last are recorded on
// the clone and the relevant locks are put into LOCK_SNAP_SYNC (gathering).
// See the "cheating wrt dirty and inode.version on cow items" comment above.
// NOTE(review): this extract has lines elided (registration of 'oldin' in the
// cache, some braces/else branches); comments describe only the visible code.
1533 CInode
*MDCache::cow_inode(CInode
*in
, snapid_t last
)
1535 ceph_assert(last
>= in
->first
);
// Build the snapped clone from the *previous* projected state, since the
// current projection belongs to the update being journaled.
1537 CInode
*oldin
= new CInode(this, true, in
->first
, last
);
1538 oldin
->inode
= *in
->get_previous_projected_inode();
1539 oldin
->xattrs
= *in
->get_previous_projected_xattrs();
1540 oldin
->symlink
= in
->symlink
;
1541 oldin
->inode
.trim_client_ranges(last
);
1543 if (in
->first
< in
->oldest_snap
)
1544 in
->oldest_snap
= in
->first
;
1548 dout(10) << "cow_inode " << *in
<< " to " << *oldin
<< dendl
;
// Case 1: 'in' is itself already a snapped inode — split its need_snapflush
// tracking with the head inode and hand part of the snap-cap state to the
// clone.
1551 if (in
->last
!= CEPH_NOSNAP
) {
1552 CInode
*head_in
= get_inode(in
->ino());
1553 ceph_assert(head_in
);
1554 auto ret
= head_in
->split_need_snapflush(oldin
, in
);
1556 oldin
->client_snap_caps
= in
->client_snap_caps
;
1557 if (!oldin
->client_snap_caps
.empty()) {
// Put each tracked cinode lock on the clone into the gathering state,
// auth-pinning on the transition and taking a wrlock.
1558 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1559 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1561 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1562 ceph_assert(lock
->is_stable());
1563 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1564 oldin
->auth_pin(lock
);
1566 lock
->get_wrlock(true);
// The snapped 'in' no longer needs its own snap-cap tracking: move it off
// and release any gathered lock state back to SYNC, waking waiters.
1571 auto client_snap_caps
= std::move(in
->client_snap_caps
);
1572 in
->client_snap_caps
.clear();
1573 in
->item_open_file
.remove_myself();
1574 in
->item_caps
.remove_myself();
1576 if (!client_snap_caps
.empty()) {
1577 MDSContext::vec finished
;
1578 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1579 SimpleLock
*lock
= in
->get_lock(cinode_lock_info
[i
].lock
);
1581 ceph_assert(lock
->get_state() == LOCK_SNAP_SYNC
); // gathering
1583 if (!lock
->get_num_wrlocks()) {
1584 lock
->set_state(LOCK_SYNC
);
1585 lock
->take_waiting(SimpleLock::WAIT_STABLE
|SimpleLock::WAIT_RD
, finished
);
1586 in
->auth_unpin(lock
);
1589 mds
->queue_waiters(finished
);
// Case 2: 'in' is the head inode — scan client caps and record, per client
// with writable caps not yet flushed past 'last', the snapflushes needed for
// every intervening snap in the realm.
1595 if (!in
->client_caps
.empty()) {
1596 const set
<snapid_t
>& snaps
= in
->find_snaprealm()->get_snaps();
1598 for (auto &p
: in
->client_caps
) {
1599 client_t client
= p
.first
;
1600 Capability
*cap
= &p
.second
;
// A cap that needs snapflush is treated as if all writable bits were
// issued.
1601 int issued
= cap
->need_snapflush() ? CEPH_CAP_ANY_WR
: cap
->issued();
1602 if ((issued
& CEPH_CAP_ANY_WR
) &&
1603 cap
->client_follows
< last
) {
1604 dout(10) << " client." << client
<< " cap " << ccap_string(issued
) << dendl
;
1605 oldin
->client_snap_caps
.insert(client
);
1606 cap
->client_follows
= last
;
1608 // we need snapflushes for any intervening snaps
1609 dout(10) << " snaps " << snaps
<< dendl
;
1610 for (auto q
= snaps
.lower_bound(oldin
->first
);
1611 q
!= snaps
.end() && *q
<= last
;
1613 in
->add_need_snapflush(oldin
, *q
, client
);
1616 dout(10) << " ignoring client." << client
<< " cap follows " << cap
->client_follows
<< dendl
;
// As in case 1: put the clone's tracked locks into the gathering state if
// any client snap caps were recorded.
1620 if (!oldin
->client_snap_caps
.empty()) {
1621 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1622 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1624 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1625 ceph_assert(lock
->is_stable());
1626 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1627 oldin
->auth_pin(lock
);
1629 lock
->get_wrlock(true);
// Copy-on-write a dentry (and, for a primary link, its inode) before a
// journaled update, so snapshot state in [first, follows] is preserved.
// Multiversion/cow-head inodes are cloned in place via cow_old_inode();
// otherwise a snapped sibling dentry (primary or remote) is added and
// recorded in the metablob and mutation.
// NOTE(review): this extract has lines elided (early returns, braces, some
// else branches); comments describe only the visible code.
1636 void MDCache::journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
,
1637 CDentry
*dn
, snapid_t follows
,
1638 CInode
**pcow_inode
, CDentry::linkage_t
*dnl
)
1641 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl
;
1644 dout(10) << "journal_cow_dentry follows " << follows
<< " on " << *dn
<< dendl
;
1645 ceph_assert(dn
->is_auth());
1647 // nothing to cow on a null dentry, fix caller
// Default 'dnl' to the projected linkage when the caller did not supply one
// (assignment guarded by an elided null check — TODO confirm).
1649 dnl
= dn
->get_projected_linkage();
1650 ceph_assert(!dnl
->is_null());
1652 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
1653 bool cow_head
= false;
1654 if (in
&& in
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
1655 ceph_assert(in
->is_frozen_inode());
// Multiversion (or cow-head) inode: the inode itself carries its snapped
// history, so cow via cow_old_inode() rather than a sibling dentry.
1658 if (in
&& (in
->is_multiversion() || cow_head
)) {
1659 // multiversion inode.
1660 SnapRealm
*realm
= NULL
;
// Sub-case: 'dn' is not the projected primary parent (e.g. a remote link):
// cow against the *directory's* realm using the global snap seq.
1662 if (in
->get_projected_parent_dn() != dn
) {
1663 ceph_assert(follows
== CEPH_NOSNAP
);
1664 realm
= dn
->dir
->inode
->find_snaprealm();
1665 snapid_t dir_follows
= get_global_snaprealm()->get_newest_seq();
1666 ceph_assert(dir_follows
>= realm
->get_newest_seq());
1668 if (dir_follows
+1 > dn
->first
) {
1669 snapid_t oldfirst
= dn
->first
;
1670 dn
->first
= dir_follows
+1;
1671 if (realm
->has_snaps_in_range(oldfirst
, dir_follows
)) {
// Preserve the old link as a snapped remote dentry covering
// [oldfirst, dir_follows].
1672 CDentry
*olddn
= dn
->dir
->add_remote_dentry(dn
->get_name(), in
->ino(), in
->d_type(),
1673 oldfirst
, dir_follows
);
1675 dout(10) << " olddn " << *olddn
<< dendl
;
1676 metablob
->add_remote_dentry(olddn
, true);
1677 mut
->add_cow_dentry(olddn
);
1678 // FIXME: adjust link count here? hmm.
1680 if (dir_follows
+1 > in
->first
)
1681 in
->cow_old_inode(dir_follows
, cow_head
);
1685 follows
= dir_follows
;
1686 if (in
->snaprealm
) {
1687 realm
= in
->snaprealm
;
1688 ceph_assert(follows
>= realm
->get_newest_seq());
// Sub-case: 'dn' is the projected primary parent — use the inode's own (or
// found) realm, defaulting 'follows' to the global newest snap seq.
1691 realm
= in
->find_snaprealm();
1692 if (follows
== CEPH_NOSNAP
) {
1693 follows
= get_global_snaprealm()->get_newest_seq();
1694 ceph_assert(follows
>= realm
->get_newest_seq());
1699 if (follows
< in
->first
) {
1700 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *in
<< dendl
;
// No snapshot actually falls in [first, follows]: just advance 'first'
// instead of cloning.
1704 if (!realm
->has_snaps_in_range(in
->first
, follows
)) {
1705 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *in
<< dendl
;
1706 in
->first
= follows
+ 1;
1710 in
->cow_old_inode(follows
, cow_head
);
// Non-multiversion path: cow by inserting a snapped sibling dentry in the
// directory, using the directory inode's realm.
1713 SnapRealm
*realm
= dn
->dir
->inode
->find_snaprealm();
1714 if (follows
== CEPH_NOSNAP
) {
1715 follows
= get_global_snaprealm()->get_newest_seq();
1716 ceph_assert(follows
>= realm
->get_newest_seq());
1720 if (follows
< dn
->first
) {
1721 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *dn
<< dendl
;
1725 // update dn.first before adding old dentry to cdir's map
1726 snapid_t oldfirst
= dn
->first
;
1727 dn
->first
= follows
+1;
1729 if (!realm
->has_snaps_in_range(oldfirst
, follows
)) {
1730 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *dn
<< dendl
;
1732 in
->first
= follows
+1;
1736 dout(10) << " dn " << *dn
<< dendl
;
// Primary link: clone the inode and link the clone under a snapped primary
// dentry covering [oldfirst, follows]; journal it and register it with the
// mutation (presumably within an is_primary() branch — the guard is elided).
1738 CInode
*oldin
= cow_inode(in
, follows
);
1739 mut
->add_cow_inode(oldin
);
1741 *pcow_inode
= oldin
;
1742 CDentry
*olddn
= dn
->dir
->add_primary_dentry(dn
->get_name(), oldin
, oldfirst
, follows
);
// See the note above this function: the clone takes its version from
// pre_dirty() rather than a full predirty pass.
1743 oldin
->inode
.version
= olddn
->pre_dirty();
1744 dout(10) << " olddn " << *olddn
<< dendl
;
1745 bool need_snapflush
= !oldin
->client_snap_caps
.empty();
1746 if (need_snapflush
) {
1747 mut
->ls
->open_files
.push_back(&oldin
->item_open_file
);
1748 mds
->locker
->mark_need_snapflush_inode(oldin
);
1750 metablob
->add_primary_dentry(olddn
, 0, true, false, false, need_snapflush
);
1751 mut
->add_cow_dentry(olddn
);
// Remote link: preserve the old remote dentry instead.
1753 ceph_assert(dnl
->is_remote());
1754 CDentry
*olddn
= dn
->dir
->add_remote_dentry(dn
->get_name(), dnl
->get_remote_ino(), dnl
->get_remote_d_type(),
1757 dout(10) << " olddn " << *olddn
<< dendl
;
1758 metablob
->add_remote_dentry(olddn
, true);
1759 mut
->add_cow_dentry(olddn
);
// Thin wrapper: cow an inode by cow'ing its projected parent dentry via
// journal_cow_dentry().
1765 void MDCache::journal_cow_inode(MutationRef
& mut
, EMetaBlob
*metablob
,
1766 CInode
*in
, snapid_t follows
,
1767 CInode
**pcow_inode
)
1769 dout(10) << "journal_cow_inode follows " << follows
<< " on " << *in
<< dendl
;
1770 CDentry
*dn
= in
->get_projected_parent_dn();
1771 journal_cow_dentry(mut
.get(), metablob
, dn
, follows
, pcow_inode
);
// Journal a dirty inode: base inodes go into the metablob as a root; others
// cow their parent dentry first (unless null) and are then journaled as a
// primary dentry, flagging a pool change when the backtrace was updated.
// NOTE(review): braces and the else structure are partially elided from this
// extract.
1774 void MDCache::journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
)
1776 if (in
->is_base()) {
1777 metablob
->add_root(true, in
);
// For a snapped inode with no explicit 'follows', cow against the snap just
// before its own range.
1779 if (follows
== CEPH_NOSNAP
&& in
->last
!= CEPH_NOSNAP
)
1780 follows
= in
->first
- 1;
1781 CDentry
*dn
= in
->get_projected_parent_dn();
1782 if (!dn
->get_projected_linkage()->is_null()) // no need to cow a null dentry
1783 journal_cow_dentry(mut
, metablob
, dn
, follows
);
1784 if (in
->get_projected_inode()->is_backtrace_updated()) {
// The backtrace changed; note whether the data pool changed too, so the
// journal entry can record it.
1785 bool dirty_pool
= in
->get_projected_inode()->layout
.pool_id
!=
1786 in
->get_previous_projected_inode()->layout
.pool_id
;
1787 metablob
->add_primary_dentry(dn
, in
, true, true, dirty_pool
);
1789 metablob
->add_primary_dentry(dn
, in
, true);
1796 // nested ---------------------------------------------------------------
// Push an inode's (projected) rstat changes into its parent dirfrag for the
// range starting at 'first', plus any dirty old_inode rstats when snapshot
// rstats are enabled.  The effective floor is the parent dentry's first.
// NOTE(review): this extract has lines elided (early return, the 'update'
// flag computation, braces); comments describe only the visible code.
1798 void MDCache::project_rstat_inode_to_frag(CInode
*cur
, CDir
*parent
, snapid_t first
,
1799 int linkunlink
, SnapRealm
*prealm
)
1801 CDentry
*parentdn
= cur
->get_projected_parent_dn();
1802 CInode::mempool_inode
*curi
= cur
->get_projected_inode();
1804 if (cur
->first
> first
)
1807 dout(10) << "projected_rstat_inode_to_frag first " << first
<< " linkunlink " << linkunlink
1808 << " " << *cur
<< dendl
;
1809 dout(20) << " frag head is [" << parent
->first
<< ",head] " << dendl
;
1810 dout(20) << " inode update is [" << first
<< "," << cur
->last
<< "]" << dendl
;
1813 * FIXME. this incompletely propagates rstats to _old_ parents
1814 * (i.e. shortly after a directory rename). but we need full
1815 * blown hard link backpointers to make this work properly...
// The parent dentry's first bounds how far back this projection can reach.
1817 snapid_t floor
= parentdn
->first
;
1818 dout(20) << " floor of " << floor
<< " from parent dn " << *parentdn
<< dendl
;
1821 prealm
= parent
->inode
->find_snaprealm();
1822 const set
<snapid_t
> snaps
= prealm
->get_snaps();
// For a snapped inode, skip the projection entirely when no snapshot falls
// within its update range.
1824 if (cur
->last
!= CEPH_NOSNAP
) {
1825 ceph_assert(cur
->dirty_old_rstats
.empty());
1826 set
<snapid_t
>::const_iterator q
= snaps
.lower_bound(std::max(first
, floor
));
1827 if (q
== snaps
.end() || *q
> cur
->last
)
1831 if (cur
->last
>= floor
) {
1833 if (cur
->state_test(CInode::STATE_AMBIGUOUSAUTH
) && cur
->is_auth()) {
1834 // rename src inode is not projected in the slave rename prep case. so we should
1835 // avoid updating the inode.
1836 ceph_assert(linkunlink
< 0);
1837 ceph_assert(cur
->is_frozen_inode());
// Project the current inode's rstat over [max(first, floor), cur->last];
// 'update' is computed on an elided line.
1840 _project_rstat_inode_to_frag(*curi
, std::max(first
, floor
), cur
->last
, parent
,
1841 linkunlink
, update
);
// With snapshot rstats enabled, also project each dirty old_inode rstat
// (skipping ranges with no snapshot in them).
1844 if (g_conf()->mds_snap_rstat
) {
1845 for (const auto &p
: cur
->dirty_old_rstats
) {
1846 auto &old
= cur
->old_inodes
[p
];
1847 snapid_t ofirst
= std::max(old
.first
, floor
);
1848 auto it
= snaps
.lower_bound(ofirst
);
1849 if (it
== snaps
.end() || *it
> p
)
1852 _project_rstat_inode_to_frag(old
.inode
, ofirst
, p
, parent
, 0, false);
1855 cur
->dirty_old_rstats
.clear();
// Apply the delta between an inode's rstat and accounted_rstat to the parent
// dirfrag's fnode over [ofirst, last], walking the range newest-to-oldest and
// splitting head/dirty_old_rstat segments so each iteration updates exactly
// one segment.
// NOTE(review): this extract has lines elided (the 'first' declaration, the
// delta application to *prstat, loop tail, braces); comments describe only
// the visible code.
1859 void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode
& inode
, snapid_t ofirst
, snapid_t last
,
1860 CDir
*parent
, int linkunlink
, bool update_inode
)
1862 dout(10) << "_project_rstat_inode_to_frag [" << ofirst
<< "," << last
<< "]" << dendl
;
1863 dout(20) << " inode rstat " << inode
.rstat
<< dendl
;
1864 dout(20) << " inode accounted_rstat " << inode
.accounted_rstat
<< dendl
;
// delta = rstat - accounted_rstat, except: on unlink only subtract the
// accounted part, on link only add the current rstat.
1866 if (linkunlink
== 0) {
1867 delta
.add(inode
.rstat
);
1868 delta
.sub(inode
.accounted_rstat
);
1869 } else if (linkunlink
< 0) {
1870 delta
.sub(inode
.accounted_rstat
);
1872 delta
.add(inode
.rstat
);
1874 dout(20) << " delta " << delta
<< dendl
;
// Mark the inode's rstat as fully accounted (guarded by update_inode on an
// elided line — TODO confirm).
1877 inode
.accounted_rstat
= inode
.rstat
;
1879 while (last
>= ofirst
) {
1881 * pick fnode version to update. at each iteration, we want to
1882 * pick a segment ending in 'last' to update. split as necessary
1883 * to make that work. then, adjust first up so that we only
1884 * update one segment at a time. then loop to cover the whole
1885 * [ofirst,last] interval.
1887 nest_info_t
*prstat
;
1889 fnode_t
*pf
= parent
->get_projected_fnode();
// Head segment: update the live fnode rstat, possibly cow'ing the
// not-fully-accounted portion into dirty_old_rstat first.
1890 if (last
== CEPH_NOSNAP
) {
1891 if (g_conf()->mds_snap_rstat
)
1892 first
= std::max(ofirst
, parent
->first
);
1894 first
= parent
->first
;
1895 prstat
= &pf
->rstat
;
1896 dout(20) << " projecting to head [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1898 if (first
> parent
->first
&&
1899 !(pf
->rstat
== pf
->accounted_rstat
)) {
1900 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1901 << parent
->first
<< "," << (first
-1) << "] "
1902 << " " << *prstat
<< "/" << pf
->accounted_rstat
1904 parent
->dirty_old_rstat
[first
-1].first
= parent
->first
;
1905 parent
->dirty_old_rstat
[first
-1].rstat
= pf
->rstat
;
1906 parent
->dirty_old_rstat
[first
-1].accounted_rstat
= pf
->accounted_rstat
;
1908 parent
->first
= first
;
1909 } else if (!g_conf()->mds_snap_rstat
) {
1910 // drop snapshots' rstats
// Snapped range overlapping the frag head: split a new dirty_old_rstat
// segment off the head fnode and update that.
1912 } else if (last
>= parent
->first
) {
1913 first
= parent
->first
;
1914 parent
->dirty_old_rstat
[last
].first
= first
;
1915 parent
->dirty_old_rstat
[last
].rstat
= pf
->rstat
;
1916 parent
->dirty_old_rstat
[last
].accounted_rstat
= pf
->accounted_rstat
;
1917 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1918 dout(10) << " projecting to newly split dirty_old_fnode [" << first
<< "," << last
<< "] "
1919 << " " << *prstat
<< "/" << pf
->accounted_rstat
<< dendl
;
1921 // be careful, dirty_old_rstat is a _sparse_ map.
1922 // sorry, this is ugly.
1925 // find any intersection with last
1926 auto it
= parent
->dirty_old_rstat
.lower_bound(last
);
1927 if (it
== parent
->dirty_old_rstat
.end()) {
1928 dout(20) << " no dirty_old_rstat with last >= last " << last
<< dendl
;
1929 if (!parent
->dirty_old_rstat
.empty() && parent
->dirty_old_rstat
.rbegin()->first
>= first
) {
1930 dout(20) << " last dirty_old_rstat ends at " << parent
->dirty_old_rstat
.rbegin()->first
<< dendl
;
1931 first
= parent
->dirty_old_rstat
.rbegin()->first
+1;
1934 // *it last is >= last
1935 if (it
->second
.first
<= last
) {
1936 // *it intersects [first,last]
// Trim the found segment so [first,last] maps onto exactly one entry:
// split off the left and/or right portions that fall outside our range.
1937 if (it
->second
.first
< first
) {
1938 dout(10) << " splitting off left bit [" << it
->second
.first
<< "," << first
-1 << "]" << dendl
;
1939 parent
->dirty_old_rstat
[first
-1] = it
->second
;
1940 it
->second
.first
= first
;
1942 if (it
->second
.first
> first
)
1943 first
= it
->second
.first
;
1944 if (last
< it
->first
) {
1945 dout(10) << " splitting off right bit [" << last
+1 << "," << it
->first
<< "]" << dendl
;
1946 parent
->dirty_old_rstat
[last
] = it
->second
;
1947 it
->second
.first
= last
+1;
1950 // *it is to the _right_ of [first,last]
1951 it
= parent
->dirty_old_rstat
.lower_bound(first
);
1952 // new *it last is >= first
1953 if (it
->second
.first
<= last
&& // new *it isn't also to the right, and
1954 it
->first
>= first
) { // it intersects our first bit,
1955 dout(10) << " staying to the right of [" << it
->second
.first
<< "," << it
->first
<< "]..." << dendl
;
1956 first
= it
->first
+1;
1958 dout(10) << " projecting to new dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1961 dout(20) << " projecting to dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1962 parent
->dirty_old_rstat
[last
].first
= first
;
1963 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
// The delta is applied to *prstat on an elided line before logging the
// result.
1967 dout(20) << " project to [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1968 ceph_assert(last
>= first
);
1971 inode
.accounted_rstat
= inode
.rstat
;
1972 dout(20) << " result [" << first
<< "," << last
<< "] " << *prstat
<< " " << *parent
<< dendl
;
// Propagate a dirfrag's rstat delta (rstat - accounted_rstat) up into the
// parent inode 'pin' over [ofirst, last], walking newest-to-oldest and
// splitting old_inodes entries so each iteration updates one segment.
// NOTE(review): this extract has lines elided (the 'first' declaration, loop
// tail, some braces/else branches); comments describe only the visible code.
1978 void MDCache::project_rstat_frag_to_inode(nest_info_t
& rstat
, nest_info_t
& accounted_rstat
,
1979 snapid_t ofirst
, snapid_t last
,
1980 CInode
*pin
, bool cow_head
)
1982 dout(10) << "project_rstat_frag_to_inode [" << ofirst
<< "," << last
<< "]" << dendl
;
1983 dout(20) << " frag rstat " << rstat
<< dendl
;
1984 dout(20) << " frag accounted_rstat " << accounted_rstat
<< dendl
;
1985 nest_info_t delta
= rstat
;
1986 delta
.sub(accounted_rstat
);
1987 dout(20) << " delta " << delta
<< dendl
;
1989 while (last
>= ofirst
) {
1990 CInode::mempool_inode
*pi
;
// Head segment: update the projected inode, cow'ing old state first if the
// segment does not start at pin->first.
1992 if (last
== pin
->last
) {
1993 pi
= pin
->get_projected_inode();
1994 first
= std::max(ofirst
, pin
->first
);
1995 if (first
> pin
->first
) {
1996 auto &old
= pin
->cow_old_inode(first
-1, cow_head
);
1997 dout(20) << " cloned old_inode rstat is " << old
.inode
.rstat
<< dendl
;
2000 if (last
>= pin
->first
) {
2002 pin
->cow_old_inode(last
, cow_head
);
2004 // our life is easier here because old_inodes is not sparse
2005 // (although it may not begin at snapid 1)
2006 auto it
= pin
->old_inodes
.lower_bound(last
);
2007 if (it
== pin
->old_inodes
.end()) {
2008 dout(10) << " no old_inode <= " << last
<< ", done." << dendl
;
2011 first
= it
->second
.first
;
2013 dout(10) << " oldest old_inode is [" << first
<< "," << it
->first
<< "], done." << dendl
;
2014 //assert(p == pin->old_inodes.begin());
// Split the found old_inode on the right and/or left so that [first,last]
// maps onto exactly one entry, marking split pieces dirty for rstat.
2017 if (it
->first
> last
) {
2018 dout(10) << " splitting right old_inode [" << first
<< "," << it
->first
<< "] to ["
2019 << (last
+1) << "," << it
->first
<< "]" << dendl
;
2020 pin
->old_inodes
[last
] = it
->second
;
2021 it
->second
.first
= last
+1;
2022 pin
->dirty_old_rstats
.insert(it
->first
);
2025 if (first
< ofirst
) {
2026 dout(10) << " splitting left old_inode [" << first
<< "," << last
<< "] to ["
2027 << first
<< "," << ofirst
-1 << "]" << dendl
;
2028 pin
->old_inodes
[ofirst
-1] = pin
->old_inodes
[last
];
2029 pin
->dirty_old_rstats
.insert(ofirst
-1);
2030 pin
->old_inodes
[last
].first
= first
= ofirst
;
2032 pi
= &pin
->old_inodes
[last
].inode
;
2033 pin
->dirty_old_rstats
.insert(last
);
2035 dout(20) << " projecting to [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
2036 pi
->rstat
.add(delta
);
2037 dout(20) << " result [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
// Send updated quota/rstat info for 'in' to clients holding caps (skipping
// noquota caps, an optionally excluded client, and clients whose cached
// values are close enough), and ask replicas to gather caps.
// NOTE(review): this extract has lines elided (early returns, 'continue'
// statements, the quota_change handling, braces); comments describe only the
// visible code.
2043 void MDCache::broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
, bool quota_change
)
// Only an active/stopping MDS broadcasts.
2045 if (!(mds
->is_active() || mds
->is_stopping()))
2048 if (!in
->is_auth() || in
->is_frozen())
2051 auto i
= in
->get_projected_inode();
2053 if (!i
->quota
.is_enable() &&
2057 // create snaprealm for quota inode (quota was set before mimic)
2058 if (!in
->get_projected_srnode())
2059 mds
->server
->create_quota_realm(in
);
2061 for (auto &p
: in
->client_caps
) {
2062 Capability
*cap
= &p
.second
;
2063 if (cap
->is_noquota())
2066 if (exclude_ct
>= 0 && exclude_ct
!= p
.first
)
// Skip clients whose last-seen rstat matches the current one.
2069 if (cap
->last_rbytes
== i
->rstat
.rbytes
&&
2070 cap
->last_rsize
== i
->rstat
.rsize())
// Only notify when near the file-count limit, or when the drift since the
// last update is significant relative to the remaining headroom (>> 4
// heuristic).
2073 if (i
->quota
.max_files
> 0) {
2074 if (i
->rstat
.rsize() >= i
->quota
.max_files
)
2077 if ((abs(cap
->last_rsize
- i
->quota
.max_files
) >> 4) <
2078 abs(cap
->last_rsize
- i
->rstat
.rsize()))
// Same idea for bytes, with an earlier trigger at 7/8 of max_bytes.
2082 if (i
->quota
.max_bytes
> 0) {
2083 if (i
->rstat
.rbytes
> i
->quota
.max_bytes
- (i
->quota
.max_bytes
>> 3))
2086 if ((abs(cap
->last_rbytes
- i
->quota
.max_bytes
) >> 4) <
2087 abs(cap
->last_rbytes
- i
->rstat
.rbytes
))
// Remember what we told this client, then send the quota message.
2094 cap
->last_rsize
= i
->rstat
.rsize();
2095 cap
->last_rbytes
= i
->rstat
.rbytes
;
2097 auto msg
= make_message
<MClientQuota
>();
2098 msg
->ino
= in
->ino();
2099 msg
->rstat
= i
->rstat
;
2100 msg
->quota
= i
->quota
;
2101 mds
->send_message_client_counted(msg
, cap
->get_session());
// Ask every replica MDS to gather caps for this inode.
2103 for (const auto &it
: in
->get_replicas()) {
2104 auto msg
= make_message
<MGatherCaps
>();
2105 msg
->ino
= in
->ino();
2106 mds
->send_message_mds(msg
, it
.first
);
2111 * NOTE: we _have_ to delay the scatter if we are called during a
2112 * rejoin, because we can't twiddle locks between when the
2113 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2114 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2115 * (no requests), and a survivor acks immediately. _except_ that
2116 * during rejoin_(weak|strong) processing, we may complete a lock
2117 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2118 * scatterlock state in that case or the lock states will get out of
2119 * sync between the auth and replica.
2121 * the simple solution is to never do the scatter here. instead, put
2122 * the scatterlock on a list if it isn't already wrlockable. this is
2123 * probably the best plan anyway, since we avoid too many
2124 * scatters/locks under normal usage.
2127 * some notes on dirlock/nestlock scatterlock semantics:
2129 * the fragstat (dirlock) will never be updated without
2130 * dirlock+nestlock wrlock held by the caller.
2132 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2133 * data is pushed up the tree. this could be changed with some
2134 * restructuring here, but in its current form we ensure that the
2135 * fragstat+rstat _always_ reflect an accurate summation over the dir
2136 * frag, which is nice. and, we only need to track frags that need to
2137 * be nudged (and not inodes with pending rstat changes that need to
2138 * be pushed into the frag). a consequence of this is that the
2139 * accounted_rstat on scatterlock sync may not match our current
2140 * rstat. this is normal and expected.
2142 void MDCache::predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
2143 CInode
*in
, CDir
*parent
,
2144 int flags
, int linkunlink
,
2147 bool primary_dn
= flags
& PREDIRTY_PRIMARY
;
2148 bool do_parent_mtime
= flags
& PREDIRTY_DIR
;
2149 bool shallow
= flags
& PREDIRTY_SHALLOW
;
2151 ceph_assert(mds
->mdlog
->entry_is_open());
2153 // make sure stamp is set
2154 if (mut
->get_mds_stamp() == utime_t())
2155 mut
->set_mds_stamp(ceph_clock_now());
2160 dout(10) << "predirty_journal_parents"
2161 << (do_parent_mtime
? " do_parent_mtime":"")
2162 << " linkunlink=" << linkunlink
2163 << (primary_dn
? " primary_dn":" remote_dn")
2164 << (shallow
? " SHALLOW":"")
2165 << " follows " << cfollows
2166 << " " << *in
<< dendl
;
2169 ceph_assert(primary_dn
);
2170 parent
= in
->get_projected_parent_dn()->get_dir();
2173 if (flags
== 0 && linkunlink
== 0) {
2174 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl
;
2175 blob
->add_dir_context(parent
);
2179 // build list of inodes to wrlock, dirty, and update
2182 CDentry
*parentdn
= NULL
;
2185 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2186 ceph_assert(parent
->is_auth());
2188 // opportunistically adjust parent dirfrag
2189 CInode
*pin
= parent
->get_inode();
2192 mut
->auth_pin(parent
);
2193 mut
->add_projected_fnode(parent
);
2195 fnode_t
*pf
= parent
->project_fnode();
2196 pf
->version
= parent
->pre_dirty();
2198 if (do_parent_mtime
|| linkunlink
) {
2199 ceph_assert(mut
->is_wrlocked(&pin
->filelock
));
2200 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
));
2201 ceph_assert(cfollows
== CEPH_NOSNAP
);
2203 // update stale fragstat/rstat?
2204 parent
->resync_accounted_fragstat();
2205 parent
->resync_accounted_rstat();
2207 if (do_parent_mtime
) {
2208 pf
->fragstat
.mtime
= mut
->get_op_stamp();
2209 pf
->fragstat
.change_attr
++;
2210 dout(10) << "predirty_journal_parents bumping change_attr to " << pf
->fragstat
.change_attr
<< " on " << parent
<< dendl
;
2211 if (pf
->fragstat
.mtime
> pf
->rstat
.rctime
) {
2212 dout(10) << "predirty_journal_parents updating mtime on " << *parent
<< dendl
;
2213 pf
->rstat
.rctime
= pf
->fragstat
.mtime
;
2215 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent
<< dendl
;
2219 dout(10) << "predirty_journal_parents updating size on " << *parent
<< dendl
;
2221 pf
->fragstat
.nsubdirs
+= linkunlink
;
2222 //pf->rstat.rsubdirs += linkunlink;
2224 pf
->fragstat
.nfiles
+= linkunlink
;
2225 //pf->rstat.rfiles += linkunlink;
2232 // don't update parent this pass
2233 } else if (!linkunlink
&& !(pin
->nestlock
.can_wrlock(-1) &&
2234 pin
->versionlock
.can_wrlock())) {
2235 dout(20) << " unwritable parent nestlock " << pin
->nestlock
2236 << ", marking dirty rstat on " << *cur
<< dendl
;
2237 cur
->mark_dirty_rstat();
2239 // if we don't hold a wrlock reference on this nestlock, take one,
2240 // because we are about to write into the dirfrag fnode and that needs
2241 // to commit before the lock can cycle.
2243 ceph_assert(pin
->nestlock
.get_num_wrlocks() || mut
->is_slave());
2246 if (!mut
->is_wrlocked(&pin
->nestlock
)) {
2247 dout(10) << " taking wrlock on " << pin
->nestlock
<< " on " << *pin
<< dendl
;
2248 mds
->locker
->wrlock_force(&pin
->nestlock
, mut
);
2251 // now we can project the inode rstat diff the dirfrag
2252 SnapRealm
*prealm
= pin
->find_snaprealm();
2254 snapid_t follows
= cfollows
;
2255 if (follows
== CEPH_NOSNAP
)
2256 follows
= prealm
->get_newest_seq();
2258 snapid_t first
= follows
+1;
2260 // first, if the frag is stale, bring it back in sync.
2261 parent
->resync_accounted_rstat();
2263 // now push inode rstats into frag
2264 project_rstat_inode_to_frag(cur
, parent
, first
, linkunlink
, prealm
);
2265 cur
->clear_dirty_rstat();
2269 if (!pin
->is_auth() || (!mut
->is_auth_pinned(pin
) && !pin
->can_auth_pin())) {
2270 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin
<< dendl
;
2274 // delay propagating until later?
2275 if (!stop
&& !first
&&
2276 g_conf()->mds_dirstat_min_interval
> 0) {
2277 double since_last_prop
= mut
->get_mds_stamp() - pin
->last_dirstat_prop
;
2278 if (since_last_prop
< g_conf()->mds_dirstat_min_interval
) {
2279 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2280 << " < " << g_conf()->mds_dirstat_min_interval
2281 << ", stopping" << dendl
;
2284 dout(10) << "predirty_journal_parents last prop " << since_last_prop
<< " ago, continuing" << dendl
;
2288 // can cast only because i'm passing nowait=true in the sole user
2290 !mut
->is_wrlocked(&pin
->nestlock
) &&
2291 (!pin
->versionlock
.can_wrlock() || // make sure we can take versionlock, too
2292 !mds
->locker
->wrlock_try(&pin
->nestlock
, mut
)
2293 )) { // ** do not initiate.. see above comment **
2294 dout(10) << "predirty_journal_parents can't wrlock one of " << pin
->versionlock
<< " or " << pin
->nestlock
2295 << " on " << *pin
<< dendl
;
2299 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin
<< dendl
;
2300 mds
->locker
->mark_updated_scatterlock(&pin
->nestlock
);
2301 mut
->ls
->dirty_dirfrag_nest
.push_back(&pin
->item_dirty_dirfrag_nest
);
2302 mut
->add_updated_lock(&pin
->nestlock
);
2303 if (do_parent_mtime
|| linkunlink
) {
2304 mds
->locker
->mark_updated_scatterlock(&pin
->filelock
);
2305 mut
->ls
->dirty_dirfrag_dir
.push_back(&pin
->item_dirty_dirfrag_dir
);
2306 mut
->add_updated_lock(&pin
->filelock
);
2310 if (!mut
->is_wrlocked(&pin
->versionlock
))
2311 mds
->locker
->local_wrlock_grab(&pin
->versionlock
, mut
);
2313 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
) || mut
->is_slave());
2315 pin
->last_dirstat_prop
= mut
->get_mds_stamp();
2319 mut
->add_projected_inode(pin
);
2320 lsi
.push_front(pin
);
2322 pin
->pre_cow_old_inode(); // avoid cow mayhem!
2324 auto &pi
= pin
->project_inode();
2325 pi
.inode
.version
= pin
->pre_dirty();
2328 if (do_parent_mtime
|| linkunlink
) {
2329 dout(20) << "predirty_journal_parents add_delta " << pf
->fragstat
<< dendl
;
2330 dout(20) << "predirty_journal_parents - " << pf
->accounted_fragstat
<< dendl
;
2331 bool touched_mtime
= false, touched_chattr
= false;
2332 pi
.inode
.dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2333 pf
->accounted_fragstat
= pf
->fragstat
;
2335 pi
.inode
.mtime
= pi
.inode
.ctime
= pi
.inode
.dirstat
.mtime
;
2337 pi
.inode
.change_attr
= pi
.inode
.dirstat
.change_attr
;
2338 dout(20) << "predirty_journal_parents gives " << pi
.inode
.dirstat
<< " on " << *pin
<< dendl
;
2340 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2341 if (pi
.inode
.dirstat
.size() < 0)
2342 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter
);
2343 if (pi
.inode
.dirstat
.size() != pf
->fragstat
.size()) {
2344 mds
->clog
->error() << "unmatched fragstat size on single dirfrag "
2345 << parent
->dirfrag() << ", inode has " << pi
.inode
.dirstat
2346 << ", dirfrag has " << pf
->fragstat
;
2348 // trust the dirfrag for now
2349 pi
.inode
.dirstat
= pf
->fragstat
;
2351 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter
);
2357 * the rule here is to follow the _oldest_ parent with dirty rstat
2358 * data. if we don't propagate all data, we add ourselves to the
2359 * nudge list. that way all rstat data will (eventually) get
2360 * pushed up the tree.
2362 * actually, no. for now, silently drop rstats for old parents. we need
2363 * hard link backpointers to do the above properly.
2369 parentdn
= pin
->get_projected_parent_dn();
2370 ceph_assert(parentdn
);
2373 dout(10) << "predirty_journal_parents frag->inode on " << *parent
<< dendl
;
2375 // first, if the frag is stale, bring it back in sync.
2376 parent
->resync_accounted_rstat();
2378 if (g_conf()->mds_snap_rstat
) {
2379 for (auto &p
: parent
->dirty_old_rstat
) {
2380 project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
, p
.second
.first
,
2381 p
.first
, pin
, true);
2384 parent
->dirty_old_rstat
.clear();
2385 project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
, parent
->first
, CEPH_NOSNAP
, pin
, true);//false);
2387 pf
->accounted_rstat
= pf
->rstat
;
2389 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2390 if (pi
.inode
.rstat
.rbytes
!= pf
->rstat
.rbytes
) {
2391 mds
->clog
->error() << "unmatched rstat rbytes on single dirfrag "
2392 << parent
->dirfrag() << ", inode has " << pi
.inode
.rstat
2393 << ", dirfrag has " << pf
->rstat
;
2395 // trust the dirfrag for now
2396 pi
.inode
.rstat
= pf
->rstat
;
2398 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter
);
2402 parent
->check_rstats();
2403 broadcast_quota_to_client(pin
);
2406 parent
= parentdn
->get_dir();
2408 do_parent_mtime
= false;
2413 // now, stick it in the blob
2414 ceph_assert(parent
);
2415 ceph_assert(parent
->is_auth());
2416 blob
->add_dir_context(parent
);
2417 blob
->add_dir(parent
, true);
2418 for (const auto& in
: lsi
) {
2419 journal_dirty_inode(mut
.get(), blob
, in
);
2428 // ===================================
2433 * some handlers for master requests with slaves. we need to make
2434 * sure slaves journal commits before we forget we mastered them and
2435 * remove them from the uncommitted_masters map (used during recovery
2436 * to commit|abort slaves).
2438 struct C_MDC_CommittedMaster
: public MDCacheLogContext
{
2440 C_MDC_CommittedMaster(MDCache
*s
, metareqid_t r
) : MDCacheLogContext(s
), reqid(r
) {}
2441 void finish(int r
) override
{
2442 mdcache
->_logged_master_commit(reqid
);
2446 void MDCache::log_master_commit(metareqid_t reqid
)
2448 dout(10) << "log_master_commit " << reqid
<< dendl
;
2449 uncommitted_masters
[reqid
].committing
= true;
2450 mds
->mdlog
->start_submit_entry(new ECommitted(reqid
),
2451 new C_MDC_CommittedMaster(this, reqid
));
2454 void MDCache::_logged_master_commit(metareqid_t reqid
)
2456 dout(10) << "_logged_master_commit " << reqid
<< dendl
;
2457 ceph_assert(uncommitted_masters
.count(reqid
));
2458 uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2459 mds
->queue_waiters(uncommitted_masters
[reqid
].waiters
);
2460 uncommitted_masters
.erase(reqid
);
2465 void MDCache::committed_master_slave(metareqid_t r
, mds_rank_t from
)
2467 dout(10) << "committed_master_slave mds." << from
<< " on " << r
<< dendl
;
2468 ceph_assert(uncommitted_masters
.count(r
));
2469 uncommitted_masters
[r
].slaves
.erase(from
);
2470 if (!uncommitted_masters
[r
].recovering
&& uncommitted_masters
[r
].slaves
.empty())
2471 log_master_commit(r
);
2474 void MDCache::logged_master_update(metareqid_t reqid
)
2476 dout(10) << "logged_master_update " << reqid
<< dendl
;
2477 ceph_assert(uncommitted_masters
.count(reqid
));
2478 uncommitted_masters
[reqid
].safe
= true;
2479 auto p
= pending_masters
.find(reqid
);
2480 if (p
!= pending_masters
.end()) {
2481 pending_masters
.erase(p
);
2482 if (pending_masters
.empty())
2483 process_delayed_resolve();
2488 * Master may crash after receiving all slaves' commit acks, but before journalling
2489 * the final commit. Slaves may crash after journalling the slave commit, but before
2490 * sending commit ack to the master. Commit masters with no uncommitted slave when
2493 void MDCache::finish_committed_masters()
2495 for (map
<metareqid_t
, umaster
>::iterator p
= uncommitted_masters
.begin();
2496 p
!= uncommitted_masters
.end();
2498 p
->second
.recovering
= false;
2499 if (!p
->second
.committing
&& p
->second
.slaves
.empty()) {
2500 dout(10) << "finish_committed_masters " << p
->first
<< dendl
;
2501 log_master_commit(p
->first
);
2507 * at end of resolve... we must journal a commit|abort for all slave
2508 * updates, before moving on.
2510 * this is so that the master can safely journal ECommitted on ops it
2511 * masters when it reaches up:active (all other recovering nodes must
2512 * complete resolve before that happens).
2514 struct C_MDC_SlaveCommit
: public MDCacheLogContext
{
2517 C_MDC_SlaveCommit(MDCache
*c
, int f
, metareqid_t r
) : MDCacheLogContext(c
), from(f
), reqid(r
) {}
2518 void finish(int r
) override
{
2519 mdcache
->_logged_slave_commit(from
, reqid
);
2523 void MDCache::_logged_slave_commit(mds_rank_t from
, metareqid_t reqid
)
2525 dout(10) << "_logged_slave_commit from mds." << from
<< " " << reqid
<< dendl
;
2528 auto req
= make_message
<MMDSSlaveRequest
>(reqid
, 0, MMDSSlaveRequest::OP_COMMITTED
);
2529 mds
->send_message_mds(req
, from
);
2537 // ====================================================================
2538 // import map, recovery
2540 void MDCache::_move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
2541 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
)
2543 if (subtrees
.count(oldparent
)) {
2544 vector
<dirfrag_t
>& v
= subtrees
[oldparent
];
2545 dout(10) << " removing " << df
<< " from " << oldparent
<< " bounds " << v
<< dendl
;
2546 for (vector
<dirfrag_t
>::iterator it
= v
.begin(); it
!= v
.end(); ++it
)
2552 if (subtrees
.count(newparent
)) {
2553 vector
<dirfrag_t
>& v
= subtrees
[newparent
];
2554 dout(10) << " adding " << df
<< " to " << newparent
<< " bounds " << v
<< dendl
;
2559 ESubtreeMap
*MDCache::create_subtree_map()
2561 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2562 << num_subtrees_fullauth() << " fullauth"
2567 ESubtreeMap
*le
= new ESubtreeMap();
2568 mds
->mdlog
->_start_entry(le
);
2570 map
<dirfrag_t
, CDir
*> dirs_to_add
;
2573 CDir
* mydir
= myin
->get_dirfrag(frag_t());
2574 dirs_to_add
[mydir
->dirfrag()] = mydir
;
2577 // include all auth subtrees, and their bounds.
2578 // and a spanning tree to tie it to the root.
2579 for (auto& [dir
, bounds
] : subtrees
) {
2580 // journal subtree as "ours" if we are
2583 // me, !me (may be importing and ambiguous!)
2587 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2590 if (migrator
->is_ambiguous_import(dir
->dirfrag()) ||
2591 my_ambiguous_imports
.count(dir
->dirfrag())) {
2592 dout(15) << " ambig subtree " << *dir
<< dendl
;
2593 le
->ambiguous_subtrees
.insert(dir
->dirfrag());
2595 dout(15) << " auth subtree " << *dir
<< dendl
;
2598 dirs_to_add
[dir
->dirfrag()] = dir
;
2599 le
->subtrees
[dir
->dirfrag()].clear();
2602 size_t nbounds
= bounds
.size();
2604 dout(15) << " subtree has " << nbounds
<< " bounds" << dendl
;
2606 for (auto& bound
: bounds
) {
2608 dout(15) << " subtree bound " << *bound
<< dendl
;
2610 dirs_to_add
[bound
->dirfrag()] = bound
;
2611 le
->subtrees
[dir
->dirfrag()].push_back(bound
->dirfrag());
2615 // apply projected renames
2616 for (const auto& [diri
, renames
] : projected_subtree_renames
) {
2617 for (const auto& [olddir
, newdir
] : renames
) {
2618 dout(15) << " adjusting for projected rename of " << *diri
<< " to " << *newdir
<< dendl
;
2620 auto&& dfls
= diri
->get_dirfrags();
2621 for (const auto& dir
: dfls
) {
2622 dout(15) << "dirfrag " << dir
->dirfrag() << " " << *dir
<< dendl
;
2623 CDir
*oldparent
= get_projected_subtree_root(olddir
);
2624 dout(15) << " old parent " << oldparent
->dirfrag() << " " << *oldparent
<< dendl
;
2625 CDir
*newparent
= get_projected_subtree_root(newdir
);
2626 dout(15) << " new parent " << newparent
->dirfrag() << " " << *newparent
<< dendl
;
2628 if (oldparent
== newparent
) {
2629 dout(15) << "parent unchanged for " << dir
->dirfrag() << " at "
2630 << oldparent
->dirfrag() << dendl
;
2634 if (dir
->is_subtree_root()) {
2635 if (le
->subtrees
.count(newparent
->dirfrag()) &&
2636 oldparent
->get_dir_auth() != newparent
->get_dir_auth())
2637 dirs_to_add
[dir
->dirfrag()] = dir
;
2638 // children are fine. change parent.
2639 _move_subtree_map_bound(dir
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2644 if (oldparent
->get_dir_auth() != newparent
->get_dir_auth()) {
2645 dout(10) << " creating subtree for " << dir
->dirfrag() << dendl
;
2646 // if oldparent is auth, subtree is mine; include it.
2647 if (le
->subtrees
.count(oldparent
->dirfrag())) {
2648 dirs_to_add
[dir
->dirfrag()] = dir
;
2649 le
->subtrees
[dir
->dirfrag()].clear();
2651 // if newparent is auth, subtree is a new bound
2652 if (le
->subtrees
.count(newparent
->dirfrag())) {
2653 dirs_to_add
[dir
->dirfrag()] = dir
;
2654 le
->subtrees
[newparent
->dirfrag()].push_back(dir
->dirfrag()); // newparent is auth; new bound
2659 // see if any old bounds move to the new parent.
2660 for (auto& bound
: subtrees
.at(oldparent
)) {
2661 if (dir
->contains(bound
->get_parent_dir()))
2662 _move_subtree_map_bound(bound
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2670 // simplify the journaled map. our in memory map may have more
2671 // subtrees than needed due to migrations that are just getting
2672 // started or just completing. but on replay, the "live" map will
2673 // be simple and we can do a straight comparison.
2674 for (auto& [frag
, bfrags
] : le
->subtrees
) {
2675 if (le
->ambiguous_subtrees
.count(frag
))
2678 while (i
< bfrags
.size()) {
2679 dirfrag_t b
= bfrags
[i
];
2680 if (le
->subtrees
.count(b
) &&
2681 le
->ambiguous_subtrees
.count(b
) == 0) {
2682 auto& bb
= le
->subtrees
.at(b
);
2683 dout(10) << "simplify: " << frag
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2684 for (auto& r
: bb
) {
2685 bfrags
.push_back(r
);
2687 dirs_to_add
.erase(b
);
2688 le
->subtrees
.erase(b
);
2689 bfrags
.erase(bfrags
.begin() + i
);
2696 for (auto &p
: dirs_to_add
) {
2697 CDir
*dir
= p
.second
;
2698 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
2699 le
->metablob
.add_dir(dir
, false);
2702 dout(15) << " subtrees " << le
->subtrees
<< dendl
;
2703 dout(15) << " ambiguous_subtrees " << le
->ambiguous_subtrees
<< dendl
;
2705 //le->metablob.print(cout);
2706 le
->expire_pos
= mds
->mdlog
->journaler
->get_expire_pos();
2710 void MDCache::dump_resolve_status(Formatter
*f
) const
2712 f
->open_object_section("resolve_status");
2713 f
->dump_stream("resolve_gather") << resolve_gather
;
2714 f
->dump_stream("resolve_ack_gather") << resolve_gather
;
2718 void MDCache::resolve_start(MDSContext
*resolve_done_
)
2720 dout(10) << "resolve_start" << dendl
;
2721 ceph_assert(!resolve_done
);
2722 resolve_done
.reset(resolve_done_
);
2724 if (mds
->mdsmap
->get_root() != mds
->get_nodeid()) {
2725 // if we don't have the root dir, adjust it to UNKNOWN. during
2726 // resolve we want mds0 to explicit claim the portion of it that
2727 // it owns, so that anything beyond its bounds get left as
2729 CDir
*rootdir
= root
->get_dirfrag(frag_t());
2731 adjust_subtree_auth(rootdir
, CDIR_AUTH_UNKNOWN
);
2733 resolve_gather
= recovery_set
;
2735 resolve_snapclient_commits
= mds
->snapclient
->get_journaled_tids();
2738 void MDCache::send_resolves()
2740 send_slave_resolves();
2742 if (!resolve_done
) {
2743 // I'm survivor: refresh snap cache
2744 mds
->snapclient
->sync(
2745 new MDSInternalContextWrapper(mds
,
2746 new LambdaContext([this](int r
) {
2747 maybe_finish_slave_resolve();
2751 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl
;
2754 if (!resolve_ack_gather
.empty()) {
2755 dout(10) << "send_resolves still waiting for resolve ack from ("
2756 << resolve_ack_gather
<< ")" << dendl
;
2759 if (!resolve_need_rollback
.empty()) {
2760 dout(10) << "send_resolves still waiting for rollback to commit on ("
2761 << resolve_need_rollback
<< ")" << dendl
;
2765 send_subtree_resolves();
2768 void MDCache::send_slave_resolves()
2770 dout(10) << "send_slave_resolves" << dendl
;
2772 map
<mds_rank_t
, ref_t
<MMDSResolve
>> resolves
;
2774 if (mds
->is_resolve()) {
2775 for (map
<metareqid_t
, uslave
>::iterator p
= uncommitted_slaves
.begin();
2776 p
!= uncommitted_slaves
.end();
2778 mds_rank_t master
= p
->second
.master
;
2779 auto &m
= resolves
[master
];
2780 if (!m
) m
= make_message
<MMDSResolve
>();
2781 m
->add_slave_request(p
->first
, false);
2784 set
<mds_rank_t
> resolve_set
;
2785 mds
->mdsmap
->get_mds_set(resolve_set
, MDSMap::STATE_RESOLVE
);
2786 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2787 p
!= active_requests
.end();
2789 MDRequestRef
& mdr
= p
->second
;
2790 if (!mdr
->is_slave())
2792 if (!mdr
->slave_did_prepare() && !mdr
->committing
) {
2795 mds_rank_t master
= mdr
->slave_to_mds
;
2796 if (resolve_set
.count(master
) || is_ambiguous_slave_update(p
->first
, master
)) {
2797 dout(10) << " including uncommitted " << *mdr
<< dendl
;
2798 if (!resolves
.count(master
))
2799 resolves
[master
] = make_message
<MMDSResolve
>();
2800 if (!mdr
->committing
&&
2801 mdr
->has_more() && mdr
->more()->is_inode_exporter
) {
2802 // re-send cap exports
2803 CInode
*in
= mdr
->more()->rename_inode
;
2804 map
<client_t
, Capability::Export
> cap_map
;
2805 in
->export_client_caps(cap_map
);
2807 MMDSResolve::slave_inode_cap
inode_caps(in
->ino(), cap_map
);
2808 encode(inode_caps
, bl
);
2809 resolves
[master
]->add_slave_request(p
->first
, bl
);
2811 resolves
[master
]->add_slave_request(p
->first
, mdr
->committing
);
2817 for (auto &p
: resolves
) {
2818 dout(10) << "sending slave resolve to mds." << p
.first
<< dendl
;
2819 mds
->send_message_mds(p
.second
, p
.first
);
2820 resolve_ack_gather
.insert(p
.first
);
2824 void MDCache::send_subtree_resolves()
2826 dout(10) << "send_subtree_resolves" << dendl
;
2828 if (migrator
->is_exporting() || migrator
->is_importing()) {
2829 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl
;
2830 migrator
->show_importing();
2831 migrator
->show_exporting();
2832 resolves_pending
= true;
2836 map
<mds_rank_t
, ref_t
<MMDSResolve
>> resolves
;
2837 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
2838 p
!= recovery_set
.end();
2840 if (*p
== mds
->get_nodeid())
2842 if (mds
->is_resolve() || mds
->mdsmap
->is_resolve(*p
))
2843 resolves
[*p
] = make_message
<MMDSResolve
>();
2846 map
<dirfrag_t
, vector
<dirfrag_t
> > my_subtrees
;
2847 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambig_imports
;
2850 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
2851 p
!= subtrees
.end();
2853 CDir
*dir
= p
->first
;
2855 // only our subtrees
2856 if (dir
->authority().first
!= mds
->get_nodeid())
2859 if (mds
->is_resolve() && my_ambiguous_imports
.count(dir
->dirfrag()))
2860 continue; // we'll add it below
2862 if (migrator
->is_ambiguous_import(dir
->dirfrag())) {
2863 // ambiguous (mid-import)
2865 get_subtree_bounds(dir
, bounds
);
2866 vector
<dirfrag_t
> dfls
;
2867 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
2868 dfls
.push_back((*q
)->dirfrag());
2870 my_ambig_imports
[dir
->dirfrag()] = dfls
;
2871 dout(10) << " ambig " << dir
->dirfrag() << " " << dfls
<< dendl
;
2874 for (auto &q
: resolves
) {
2875 resolves
[q
.first
]->add_subtree(dir
->dirfrag());
2878 vector
<dirfrag_t
> dfls
;
2879 for (set
<CDir
*>::iterator q
= subtrees
[dir
].begin();
2880 q
!= subtrees
[dir
].end();
2883 dfls
.push_back(bound
->dirfrag());
2886 my_subtrees
[dir
->dirfrag()] = dfls
;
2887 dout(10) << " claim " << dir
->dirfrag() << " " << dfls
<< dendl
;
2892 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
2893 p
!= my_ambiguous_imports
.end();
2895 my_ambig_imports
[p
->first
] = p
->second
;
2896 dout(10) << " ambig " << p
->first
<< " " << p
->second
<< dendl
;
2899 // simplify the claimed subtree.
2900 for (auto p
= my_subtrees
.begin(); p
!= my_subtrees
.end(); ++p
) {
2902 while (i
< p
->second
.size()) {
2903 dirfrag_t b
= p
->second
[i
];
2904 if (my_subtrees
.count(b
)) {
2905 vector
<dirfrag_t
>& bb
= my_subtrees
[b
];
2906 dout(10) << " simplify: " << p
->first
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2907 for (vector
<dirfrag_t
>::iterator r
= bb
.begin(); r
!= bb
.end(); ++r
)
2908 p
->second
.push_back(*r
);
2909 my_subtrees
.erase(b
);
2910 p
->second
.erase(p
->second
.begin() + i
);
2918 for (auto &p
: resolves
) {
2919 const ref_t
<MMDSResolve
> &m
= p
.second
;
2920 if (mds
->is_resolve()) {
2921 m
->add_table_commits(TABLE_SNAP
, resolve_snapclient_commits
);
2923 m
->add_table_commits(TABLE_SNAP
, mds
->snapclient
->get_journaled_tids());
2925 m
->subtrees
= my_subtrees
;
2926 m
->ambiguous_imports
= my_ambig_imports
;
2927 dout(10) << "sending subtee resolve to mds." << p
.first
<< dendl
;
2928 mds
->send_message_mds(m
, p
.first
);
2930 resolves_pending
= false;
2933 void MDCache::maybe_finish_slave_resolve() {
2934 if (resolve_ack_gather
.empty() && resolve_need_rollback
.empty()) {
2935 // snap cache get synced or I'm in resolve state
2936 if (mds
->snapclient
->is_synced() || resolve_done
)
2937 send_subtree_resolves();
2938 process_delayed_resolve();
2942 void MDCache::handle_mds_failure(mds_rank_t who
)
2944 dout(7) << "handle_mds_failure mds." << who
<< dendl
;
2946 dout(1) << "handle_mds_failure mds." << who
<< " : recovery peers are " << recovery_set
<< dendl
;
2948 resolve_gather
.insert(who
);
2949 discard_delayed_resolve(who
);
2950 ambiguous_slave_updates
.erase(who
);
2952 rejoin_gather
.insert(who
);
2953 rejoin_sent
.erase(who
); // i need to send another
2954 rejoin_ack_sent
.erase(who
); // i need to send another
2955 rejoin_ack_gather
.erase(who
); // i'll need/get another.
2957 dout(10) << " resolve_gather " << resolve_gather
<< dendl
;
2958 dout(10) << " resolve_ack_gather " << resolve_ack_gather
<< dendl
;
2959 dout(10) << " rejoin_sent " << rejoin_sent
<< dendl
;
2960 dout(10) << " rejoin_gather " << rejoin_gather
<< dendl
;
2961 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather
<< dendl
;
2964 // tell the migrator too.
2965 migrator
->handle_mds_failure_or_stop(who
);
2967 // tell the balancer too.
2968 mds
->balancer
->handle_mds_failure(who
);
2970 // clean up any requests slave to/from this node
2971 list
<MDRequestRef
> finish
;
2972 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2973 p
!= active_requests
.end();
2975 MDRequestRef
& mdr
= p
->second
;
2976 // slave to the failed node?
2977 if (mdr
->slave_to_mds
== who
) {
2978 if (mdr
->slave_did_prepare()) {
2979 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2980 if (is_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
))
2981 remove_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
2983 if (!mdr
->more()->waiting_on_slave
.empty()) {
2984 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2985 // will rollback, no need to wait
2986 mdr
->reset_slave_request();
2987 mdr
->more()->waiting_on_slave
.clear();
2989 } else if (!mdr
->committing
) {
2990 dout(10) << " slave request " << *mdr
<< " has no prepare, finishing up" << dendl
;
2991 if (mdr
->slave_request
|| mdr
->slave_rolling_back())
2992 mdr
->aborted
= true;
2994 finish
.push_back(mdr
);
2998 if (mdr
->is_slave() && mdr
->slave_did_prepare()) {
2999 if (mdr
->more()->waiting_on_slave
.count(who
)) {
3000 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
3001 dout(10) << " slave request " << *mdr
<< " no longer need rename notity ack from mds."
3003 mdr
->more()->waiting_on_slave
.erase(who
);
3004 if (mdr
->more()->waiting_on_slave
.empty() && mdr
->slave_request
)
3005 mds
->queue_waiter(new C_MDS_RetryRequest(this, mdr
));
3008 if (mdr
->more()->srcdn_auth_mds
== who
&&
3009 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(mdr
->slave_to_mds
)) {
3010 // rename srcdn's auth mds failed, resolve even I'm a survivor.
3011 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
3012 add_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
3014 } else if (mdr
->slave_request
) {
3015 const cref_t
<MMDSSlaveRequest
> &slave_req
= mdr
->slave_request
;
3016 // FIXME: Slave rename request can arrive after we notice mds failure.
3017 // This can cause mds to crash (does not affect integrity of FS).
3018 if (slave_req
->get_op() == MMDSSlaveRequest::OP_RENAMEPREP
&&
3019 slave_req
->srcdn_auth
== who
)
3020 slave_req
->mark_interrupted();
3023 // failed node is slave?
3024 if (mdr
->is_master() && !mdr
->committing
) {
3025 if (mdr
->more()->srcdn_auth_mds
== who
) {
3026 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
3027 << who
<< " to recover" << dendl
;
3028 ceph_assert(mdr
->more()->witnessed
.count(who
) == 0);
3029 if (mdr
->more()->is_ambiguous_auth
)
3030 mdr
->clear_ambiguous_auth();
3031 // rename srcdn's auth mds failed, all witnesses will rollback
3032 mdr
->more()->witnessed
.clear();
3033 pending_masters
.erase(p
->first
);
3036 if (mdr
->more()->witnessed
.count(who
)) {
3037 mds_rank_t srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
3038 if (srcdn_auth
>= 0 && mdr
->more()->waiting_on_slave
.count(srcdn_auth
)) {
3039 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
3040 << mdr
->more()->srcdn_auth_mds
<< " to reply" << dendl
;
3041 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
3042 // until either the request is committing or the slave also fails.
3043 ceph_assert(mdr
->more()->waiting_on_slave
.size() == 1);
3044 pending_masters
.insert(p
->first
);
3046 dout(10) << " master request " << *mdr
<< " no longer witnessed by slave mds."
3047 << who
<< " to recover" << dendl
;
3048 if (srcdn_auth
>= 0)
3049 ceph_assert(mdr
->more()->witnessed
.count(srcdn_auth
) == 0);
3051 // discard this peer's prepare (if any)
3052 mdr
->more()->witnessed
.erase(who
);
3056 if (mdr
->more()->waiting_on_slave
.count(who
)) {
3057 dout(10) << " master request " << *mdr
<< " waiting for slave mds." << who
3058 << " to recover" << dendl
;
3059 // retry request when peer recovers
3060 mdr
->more()->waiting_on_slave
.erase(who
);
3061 if (mdr
->more()->waiting_on_slave
.empty())
3062 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(this, mdr
));
3065 if (mdr
->locking
&& mdr
->locking_target_mds
== who
)
3066 mdr
->finish_locking(mdr
->locking
);
3070 for (map
<metareqid_t
, umaster
>::iterator p
= uncommitted_masters
.begin();
3071 p
!= uncommitted_masters
.end();
3073 // The failed MDS may have already committed the slave update
3074 if (p
->second
.slaves
.count(who
)) {
3075 p
->second
.recovering
= true;
3076 p
->second
.slaves
.erase(who
);
3080 while (!finish
.empty()) {
3081 dout(10) << "cleaning up slave request " << *finish
.front() << dendl
;
3082 request_finish(finish
.front());
3086 kick_find_ino_peers(who
);
3087 kick_open_ino_peers(who
);
3089 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
3090 p
!= fragments
.end(); ) {
3091 dirfrag_t df
= p
->first
;
3092 fragment_info_t
& info
= p
->second
;
3094 if (info
.is_fragmenting()) {
3095 if (info
.notify_ack_waiting
.erase(who
) &&
3096 info
.notify_ack_waiting
.empty()) {
3097 fragment_drop_locks(info
);
3098 fragment_maybe_finish(p
++);
3106 dout(10) << "cancelling fragment " << df
<< " bit " << info
.bits
<< dendl
;
3107 std::vector
<CDir
*> dirs
;
3108 info
.dirs
.swap(dirs
);
3109 fragments
.erase(df
);
3110 fragment_unmark_unfreeze_dirs(dirs
);
3113 // MDCache::shutdown_export_strays() always exports strays to mds.0
3114 if (who
== mds_rank_t(0))
3115 shutdown_exporting_strays
.clear();
3121 * handle_mds_recovery - called on another node's transition
3122 * from resolve -> active.
3124 void MDCache::handle_mds_recovery(mds_rank_t who
)
3126 dout(7) << "handle_mds_recovery mds." << who
<< dendl
;
3128 // exclude all discover waiters. kick_discovers() will do the job
3129 static const uint64_t i_mask
= CInode::WAIT_ANY_MASK
& ~CInode::WAIT_DIR
;
3130 static const uint64_t d_mask
= CDir::WAIT_ANY_MASK
& ~CDir::WAIT_DENTRY
;
3132 MDSContext::vec waiters
;
3134 // wake up any waiters in their subtrees
3135 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3136 p
!= subtrees
.end();
3138 CDir
*dir
= p
->first
;
3140 if (dir
->authority().first
!= who
||
3141 dir
->authority().second
== mds
->get_nodeid())
3143 ceph_assert(!dir
->is_auth());
3146 std::queue
<CDir
*> q
;
3149 while (!q
.empty()) {
3150 CDir
*d
= q
.front();
3152 d
->take_waiting(d_mask
, waiters
);
3154 // inode waiters too
3155 for (auto &p
: d
->items
) {
3156 CDentry
*dn
= p
.second
;
3157 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3158 if (dnl
->is_primary()) {
3159 dnl
->get_inode()->take_waiting(i_mask
, waiters
);
3162 auto&& ls
= dnl
->get_inode()->get_dirfrags();
3163 for (const auto& subdir
: ls
) {
3164 if (!subdir
->is_subtree_root())
3172 kick_open_ino_peers(who
);
3173 kick_find_ino_peers(who
);
3176 mds
->queue_waiters(waiters
);
3179 void MDCache::set_recovery_set(set
<mds_rank_t
>& s
)
3181 dout(7) << "set_recovery_set " << s
<< dendl
;
3187 * during resolve state, we share resolves to determine who
3188 * is authoritative for which trees. we expect to get an resolve
3189 * from _everyone_ in the recovery_set (the mds cluster at the time of
3190 * the first failure).
3192 * This functions puts the passed message before returning
3194 void MDCache::handle_resolve(const cref_t
<MMDSResolve
> &m
)
3196 dout(7) << "handle_resolve from " << m
->get_source() << dendl
;
3197 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3199 if (mds
->get_state() < MDSMap::STATE_RESOLVE
) {
3200 if (mds
->get_want_state() == CEPH_MDS_STATE_RESOLVE
) {
3201 mds
->wait_for_resolve(new C_MDS_RetryMessage(mds
, m
));
3204 // wait until we reach the resolve stage!
3208 discard_delayed_resolve(from
);
3210 // ambiguous slave requests?
3211 if (!m
->slave_requests
.empty()) {
3212 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3213 for (auto p
= m
->slave_requests
.begin(); p
!= m
->slave_requests
.end(); ++p
) {
3214 if (uncommitted_masters
.count(p
->first
) && !uncommitted_masters
[p
->first
].safe
) {
3215 ceph_assert(!p
->second
.committing
);
3216 pending_masters
.insert(p
->first
);
3220 if (!pending_masters
.empty()) {
3221 dout(10) << " still have pending updates, delay processing slave resolve" << dendl
;
3222 delayed_resolve
[from
] = m
;
3227 auto ack
= make_message
<MMDSResolveAck
>();
3228 for (const auto &p
: m
->slave_requests
) {
3229 if (uncommitted_masters
.count(p
.first
)) { //mds->sessionmap.have_completed_request(p.first)) {
3231 if (p
.second
.committing
) {
3232 // already committing, waiting for the OP_COMMITTED slave reply
3233 dout(10) << " already committing slave request " << p
<< " noop "<< dendl
;
3235 dout(10) << " ambiguous slave request " << p
<< " will COMMIT" << dendl
;
3236 ack
->add_commit(p
.first
);
3238 uncommitted_masters
[p
.first
].slaves
.insert(from
); // wait for slave OP_COMMITTED before we log ECommitted
3240 if (p
.second
.inode_caps
.length() > 0) {
3241 // slave wants to export caps (rename)
3242 ceph_assert(mds
->is_resolve());
3243 MMDSResolve::slave_inode_cap inode_caps
;
3244 auto q
= p
.second
.inode_caps
.cbegin();
3245 decode(inode_caps
, q
);
3246 inodeno_t ino
= inode_caps
.ino
;
3247 map
<client_t
,Capability::Export
> cap_exports
= inode_caps
.cap_exports
;
3248 ceph_assert(get_inode(ino
));
3250 for (map
<client_t
,Capability::Export
>::iterator q
= cap_exports
.begin();
3251 q
!= cap_exports
.end();
3253 Capability::Import
& im
= rejoin_imported_caps
[from
][ino
][q
->first
];
3254 im
.cap_id
= ++last_cap_id
; // assign a new cap ID
3256 im
.mseq
= q
->second
.mseq
;
3258 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
3260 rejoin_client_map
.emplace(q
->first
, session
->info
.inst
);
3263 // will process these caps in rejoin stage
3264 rejoin_slave_exports
[ino
].first
= from
;
3265 rejoin_slave_exports
[ino
].second
.swap(cap_exports
);
3267 // send information of imported caps back to slave
3268 encode(rejoin_imported_caps
[from
][ino
], ack
->commit
[p
.first
]);
3272 dout(10) << " ambiguous slave request " << p
<< " will ABORT" << dendl
;
3273 ceph_assert(!p
.second
.committing
);
3274 ack
->add_abort(p
.first
);
3277 mds
->send_message(ack
, m
->get_connection());
3281 if (!resolve_ack_gather
.empty() || !resolve_need_rollback
.empty()) {
3282 dout(10) << "delay processing subtree resolve" << dendl
;
3283 delayed_resolve
[from
] = m
;
3287 bool survivor
= false;
3288 // am i a surviving ambiguous importer?
3289 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3291 // check for any import success/failure (from this node)
3292 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
3293 while (p
!= my_ambiguous_imports
.end()) {
3294 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator next
= p
;
3296 CDir
*dir
= get_dirfrag(p
->first
);
3298 dout(10) << "checking ambiguous import " << *dir
<< dendl
;
3299 if (migrator
->is_importing(dir
->dirfrag()) &&
3300 migrator
->get_import_peer(dir
->dirfrag()) == from
) {
3301 ceph_assert(migrator
->get_import_state(dir
->dirfrag()) == Migrator::IMPORT_ACKING
);
3303 // check if sender claims the subtree
3304 bool claimed_by_sender
= false;
3305 for (const auto &q
: m
->subtrees
) {
3306 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3307 CDir
*base
= get_force_dirfrag(q
.first
, false);
3308 if (!base
|| !base
->contains(dir
))
3309 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3313 get_force_dirfrag_bound_set(q
.second
, bounds
);
3314 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
3316 if (bound
->contains(dir
)) {
3317 inside
= false; // nope, bound is dir or parent of dir, not inside.
3322 claimed_by_sender
= true;
3325 my_ambiguous_imports
.erase(p
); // no longer ambiguous.
3326 if (claimed_by_sender
) {
3327 dout(7) << "ambiguous import failed on " << *dir
<< dendl
;
3328 migrator
->import_reverse(dir
);
3330 dout(7) << "ambiguous import succeeded on " << *dir
<< dendl
;
3331 migrator
->import_finish(dir
, true);
3338 // update my dir_auth values
3339 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3340 // migrations between other nodes)
3341 for (const auto& p
: m
->subtrees
) {
3342 dout(10) << "peer claims " << p
.first
<< " bounds " << p
.second
<< dendl
;
3343 CDir
*dir
= get_force_dirfrag(p
.first
, !survivor
);
3346 adjust_bounded_subtree_auth(dir
, p
.second
, from
);
3347 try_subtree_merge(dir
);
3352 // note ambiguous imports too
3353 for (const auto& p
: m
->ambiguous_imports
) {
3354 dout(10) << "noting ambiguous import on " << p
.first
<< " bounds " << p
.second
<< dendl
;
3355 other_ambiguous_imports
[from
][p
.first
] = p
.second
;
3358 // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload
3359 // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
3360 for (const auto& p
: m
->table_clients
) {
3361 dout(10) << " noting " << get_mdstable_name(p
.type
)
3362 << " pending_commits " << p
.pending_commits
<< dendl
;
3363 MDSTableClient
*client
= mds
->get_table_client(p
.type
);
3364 for (const auto& q
: p
.pending_commits
)
3365 client
->notify_commit(q
);
3368 // did i get them all?
3369 resolve_gather
.erase(from
);
3371 maybe_resolve_finish();
3374 void MDCache::process_delayed_resolve()
3376 dout(10) << "process_delayed_resolve" << dendl
;
3377 map
<mds_rank_t
, cref_t
<MMDSResolve
>> tmp
;
3378 tmp
.swap(delayed_resolve
);
3379 for (auto &p
: tmp
) {
3380 handle_resolve(p
.second
);
3384 void MDCache::discard_delayed_resolve(mds_rank_t who
)
3386 delayed_resolve
.erase(who
);
3389 void MDCache::maybe_resolve_finish()
3391 ceph_assert(resolve_ack_gather
.empty());
3392 ceph_assert(resolve_need_rollback
.empty());
3394 if (!resolve_gather
.empty()) {
3395 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3396 << resolve_gather
<< ")" << dendl
;
3400 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl
;
3401 disambiguate_my_imports();
3402 finish_committed_masters();
3405 ceph_assert(mds
->is_resolve());
3406 trim_unlinked_inodes();
3407 recalc_auth_bits(false);
3408 resolve_done
.release()->complete(0);
3411 maybe_send_pending_rejoins();
3415 void MDCache::handle_resolve_ack(const cref_t
<MMDSResolveAck
> &ack
)
3417 dout(10) << "handle_resolve_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
3418 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3420 if (!resolve_ack_gather
.count(from
) ||
3421 mds
->mdsmap
->get_state(from
) < MDSMap::STATE_RESOLVE
) {
3425 if (ambiguous_slave_updates
.count(from
)) {
3426 ceph_assert(mds
->mdsmap
->is_clientreplay_or_active_or_stopping(from
));
3427 ceph_assert(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3430 for (const auto &p
: ack
->commit
) {
3431 dout(10) << " commit on slave " << p
.first
<< dendl
;
3433 if (ambiguous_slave_updates
.count(from
)) {
3434 remove_ambiguous_slave_update(p
.first
, from
);
3438 if (mds
->is_resolve()) {
3440 MDSlaveUpdate
*su
= get_uncommitted_slave(p
.first
, from
);
3444 mds
->mdlog
->start_submit_entry(new ESlaveUpdate(mds
->mdlog
, "unknown", p
.first
, from
,
3445 ESlaveUpdate::OP_COMMIT
, su
->origop
),
3446 new C_MDC_SlaveCommit(this, from
, p
.first
));
3447 mds
->mdlog
->flush();
3449 finish_uncommitted_slave(p
.first
);
3451 MDRequestRef mdr
= request_get(p
.first
);
3452 // information about master imported caps
3453 if (p
.second
.length() > 0)
3454 mdr
->more()->inode_import
.share(p
.second
);
3456 ceph_assert(mdr
->slave_request
== 0); // shouldn't be doing anything!
3457 request_finish(mdr
);
3461 for (const auto &metareq
: ack
->abort
) {
3462 dout(10) << " abort on slave " << metareq
<< dendl
;
3464 if (mds
->is_resolve()) {
3465 MDSlaveUpdate
*su
= get_uncommitted_slave(metareq
, from
);
3468 // perform rollback (and journal a rollback entry)
3469 // note: this will hold up the resolve a bit, until the rollback entries journal.
3470 MDRequestRef null_ref
;
3471 switch (su
->origop
) {
3472 case ESlaveUpdate::LINK
:
3473 mds
->server
->do_link_rollback(su
->rollback
, from
, null_ref
);
3475 case ESlaveUpdate::RENAME
:
3476 mds
->server
->do_rename_rollback(su
->rollback
, from
, null_ref
);
3478 case ESlaveUpdate::RMDIR
:
3479 mds
->server
->do_rmdir_rollback(su
->rollback
, from
, null_ref
);
3485 MDRequestRef mdr
= request_get(metareq
);
3486 mdr
->aborted
= true;
3487 if (mdr
->slave_request
) {
3488 if (mdr
->slave_did_prepare()) // journaling slave prepare ?
3489 add_rollback(metareq
, from
);
3491 request_finish(mdr
);
3496 if (!ambiguous_slave_updates
.count(from
)) {
3497 resolve_ack_gather
.erase(from
);
3498 maybe_finish_slave_resolve();
3502 void MDCache::add_uncommitted_slave(metareqid_t reqid
, LogSegment
*ls
, mds_rank_t master
, MDSlaveUpdate
*su
)
3504 auto const &ret
= uncommitted_slaves
.emplace(std::piecewise_construct
,
3505 std::forward_as_tuple(reqid
),
3506 std::forward_as_tuple());
3507 ceph_assert(ret
.second
);
3508 ls
->uncommitted_slaves
.insert(reqid
);
3509 uslave
&u
= ret
.first
->second
;
3513 if (su
== nullptr) {
3516 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
)
3517 uncommitted_slave_rename_olddir
[*p
]++;
3518 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
)
3519 uncommitted_slave_unlink
[*p
]++;
3522 void MDCache::finish_uncommitted_slave(metareqid_t reqid
, bool assert_exist
)
3524 auto it
= uncommitted_slaves
.find(reqid
);
3525 if (it
== uncommitted_slaves
.end()) {
3526 ceph_assert(!assert_exist
);
3529 uslave
&u
= it
->second
;
3530 MDSlaveUpdate
* su
= u
.su
;
3532 if (!u
.waiters
.empty()) {
3533 mds
->queue_waiters(u
.waiters
);
3535 u
.ls
->uncommitted_slaves
.erase(reqid
);
3536 uncommitted_slaves
.erase(it
);
3538 if (su
== nullptr) {
3541 // discard the non-auth subtree we renamed out of
3542 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
) {
3544 map
<CInode
*, int>::iterator it
= uncommitted_slave_rename_olddir
.find(diri
);
3545 ceph_assert(it
!= uncommitted_slave_rename_olddir
.end());
3547 if (it
->second
== 0) {
3548 uncommitted_slave_rename_olddir
.erase(it
);
3549 auto&& ls
= diri
->get_dirfrags();
3550 for (const auto& dir
: ls
) {
3551 CDir
*root
= get_subtree_root(dir
);
3552 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
3553 try_trim_non_auth_subtree(root
);
3559 ceph_assert(it
->second
> 0);
3561 // removed the inodes that were unlinked by slave update
3562 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
) {
3564 map
<CInode
*, int>::iterator it
= uncommitted_slave_unlink
.find(in
);
3565 ceph_assert(it
!= uncommitted_slave_unlink
.end());
3567 if (it
->second
== 0) {
3568 uncommitted_slave_unlink
.erase(it
);
3569 if (!in
->get_projected_parent_dn())
3570 mds
->mdcache
->remove_inode_recursive(in
);
3572 ceph_assert(it
->second
> 0);
3577 MDSlaveUpdate
* MDCache::get_uncommitted_slave(metareqid_t reqid
, mds_rank_t master
)
3580 MDSlaveUpdate
* su
= nullptr;
3581 auto it
= uncommitted_slaves
.find(reqid
);
3582 if (it
!= uncommitted_slaves
.end() &&
3583 it
->second
.master
== master
) {
3589 void MDCache::finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
) {
3590 auto p
= resolve_need_rollback
.find(mdr
->reqid
);
3591 ceph_assert(p
!= resolve_need_rollback
.end());
3592 if (mds
->is_resolve()) {
3593 finish_uncommitted_slave(reqid
, false);
3595 finish_uncommitted_slave(mdr
->reqid
, mdr
->more()->slave_update_journaled
);
3597 resolve_need_rollback
.erase(p
);
3598 maybe_finish_slave_resolve();
3601 void MDCache::disambiguate_other_imports()
3603 dout(10) << "disambiguate_other_imports" << dendl
;
3605 bool recovering
= !(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3606 // other nodes' ambiguous imports
3607 for (map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > >::iterator p
= other_ambiguous_imports
.begin();
3608 p
!= other_ambiguous_imports
.end();
3610 mds_rank_t who
= p
->first
;
3611 dout(10) << "ambiguous imports for mds." << who
<< dendl
;
3613 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= p
->second
.begin();
3614 q
!= p
->second
.end();
3616 dout(10) << " ambiguous import " << q
->first
<< " bounds " << q
->second
<< dendl
;
3617 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3618 CDir
*dir
= get_force_dirfrag(q
->first
, recovering
);
3621 if (dir
->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3622 dir
->authority() == CDIR_AUTH_UNDEF
) { // resolving
3623 dout(10) << " mds." << who
<< " did import " << *dir
<< dendl
;
3624 adjust_bounded_subtree_auth(dir
, q
->second
, who
);
3625 try_subtree_merge(dir
);
3627 dout(10) << " mds." << who
<< " did not import " << *dir
<< dendl
;
3631 other_ambiguous_imports
.clear();
3634 void MDCache::disambiguate_my_imports()
3636 dout(10) << "disambiguate_my_imports" << dendl
;
3638 if (!mds
->is_resolve()) {
3639 ceph_assert(my_ambiguous_imports
.empty());
3643 disambiguate_other_imports();
3645 // my ambiguous imports
3646 mds_authority_t
me_ambig(mds
->get_nodeid(), mds
->get_nodeid());
3647 while (!my_ambiguous_imports
.empty()) {
3648 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= my_ambiguous_imports
.begin();
3650 CDir
*dir
= get_dirfrag(q
->first
);
3653 if (dir
->authority() != me_ambig
) {
3654 dout(10) << "ambiguous import auth known, must not be me " << *dir
<< dendl
;
3655 cancel_ambiguous_import(dir
);
3657 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false));
3659 // subtree may have been swallowed by another node claiming dir
3661 CDir
*root
= get_subtree_root(dir
);
3663 dout(10) << " subtree root is " << *root
<< dendl
;
3664 ceph_assert(root
->dir_auth
.first
!= mds
->get_nodeid()); // no us!
3665 try_trim_non_auth_subtree(root
);
3667 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir
<< dendl
;
3668 finish_ambiguous_import(q
->first
);
3669 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3672 ceph_assert(my_ambiguous_imports
.empty());
3673 mds
->mdlog
->flush();
3675 // verify all my subtrees are unambiguous!
3676 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3677 p
!= subtrees
.end();
3679 CDir
*dir
= p
->first
;
3680 if (dir
->is_ambiguous_dir_auth()) {
3681 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir
<< dendl
;
3683 ceph_assert(!dir
->is_ambiguous_dir_auth());
3690 void MDCache::add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
)
3692 ceph_assert(my_ambiguous_imports
.count(base
) == 0);
3693 my_ambiguous_imports
[base
] = bounds
;
3697 void MDCache::add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
)
3700 vector
<dirfrag_t
> binos
;
3701 for (set
<CDir
*>::iterator p
= bounds
.begin();
3704 binos
.push_back((*p
)->dirfrag());
3706 // note: this can get called twice if the exporter fails during recovery
3707 if (my_ambiguous_imports
.count(base
->dirfrag()))
3708 my_ambiguous_imports
.erase(base
->dirfrag());
3710 add_ambiguous_import(base
->dirfrag(), binos
);
3713 void MDCache::cancel_ambiguous_import(CDir
*dir
)
3715 dirfrag_t df
= dir
->dirfrag();
3716 ceph_assert(my_ambiguous_imports
.count(df
));
3717 dout(10) << "cancel_ambiguous_import " << df
3718 << " bounds " << my_ambiguous_imports
[df
]
3721 my_ambiguous_imports
.erase(df
);
3724 void MDCache::finish_ambiguous_import(dirfrag_t df
)
3726 ceph_assert(my_ambiguous_imports
.count(df
));
3727 vector
<dirfrag_t
> bounds
;
3728 bounds
.swap(my_ambiguous_imports
[df
]);
3729 my_ambiguous_imports
.erase(df
);
3731 dout(10) << "finish_ambiguous_import " << df
3732 << " bounds " << bounds
3734 CDir
*dir
= get_dirfrag(df
);
3737 // adjust dir_auth, import maps
3738 adjust_bounded_subtree_auth(dir
, bounds
, mds
->get_nodeid());
3739 try_subtree_merge(dir
);
3742 void MDCache::remove_inode_recursive(CInode
*in
)
3744 dout(10) << "remove_inode_recursive " << *in
<< dendl
;
3745 auto&& ls
= in
->get_dirfrags();
3746 for (const auto& subdir
: ls
) {
3747 dout(10) << " removing dirfrag " << *subdir
<< dendl
;
3748 auto it
= subdir
->items
.begin();
3749 while (it
!= subdir
->items
.end()) {
3750 CDentry
*dn
= it
->second
;
3752 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3753 if (dnl
->is_primary()) {
3754 CInode
*tin
= dnl
->get_inode();
3755 subdir
->unlink_inode(dn
, false);
3756 remove_inode_recursive(tin
);
3758 subdir
->remove_dentry(dn
);
3761 if (subdir
->is_subtree_root())
3762 remove_subtree(subdir
);
3763 in
->close_dirfrag(subdir
->dirfrag().frag
);
3768 bool MDCache::expire_recursive(CInode
*in
, expiremap
&expiremap
)
3770 ceph_assert(!in
->is_auth());
3772 dout(10) << __func__
<< ":" << *in
<< dendl
;
3774 // Recurse into any dirfrags beneath this inode
3775 auto&& ls
= in
->get_dirfrags();
3776 for (const auto& subdir
: ls
) {
3777 if (!in
->is_mdsdir() && subdir
->is_subtree_root()) {
3778 dout(10) << __func__
<< ": stray still has subtree " << *in
<< dendl
;
3782 for (auto &it
: subdir
->items
) {
3783 CDentry
*dn
= it
.second
;
3784 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3785 if (dnl
->is_primary()) {
3786 CInode
*tin
= dnl
->get_inode();
3788 /* Remote strays with linkage (i.e. hardlinks) should not be
3789 * expired, because they may be the target of
3790 * a rename() as the owning MDS shuts down */
3791 if (!tin
->is_stray() && tin
->inode
.nlink
) {
3792 dout(10) << __func__
<< ": stray still has linkage " << *tin
<< dendl
;
3796 const bool abort
= expire_recursive(tin
, expiremap
);
3801 if (dn
->lru_is_expireable()) {
3802 trim_dentry(dn
, expiremap
);
3804 dout(10) << __func__
<< ": stray dn is not expireable " << *dn
<< dendl
;
3813 void MDCache::trim_unlinked_inodes()
3815 dout(7) << "trim_unlinked_inodes" << dendl
;
3818 for (auto &p
: inode_map
) {
3819 CInode
*in
= p
.second
;
3820 if (in
->get_parent_dn() == NULL
&& !in
->is_base()) {
3821 dout(7) << " will trim from " << *in
<< dendl
;
3825 if (!(++count
% 1000))
3826 mds
->heartbeat_reset();
3828 for (auto& in
: q
) {
3829 remove_inode_recursive(in
);
3831 if (!(++count
% 1000))
3832 mds
->heartbeat_reset();
3836 /** recalc_auth_bits()
3837 * once subtree auth is disambiguated, we need to adjust all the
3838 * auth and dirty bits in our cache before moving on.
3840 void MDCache::recalc_auth_bits(bool replay
)
3842 dout(7) << "recalc_auth_bits " << (replay
? "(replay)" : "") << dendl
;
3845 root
->inode_auth
.first
= mds
->mdsmap
->get_root();
3846 bool auth
= mds
->get_nodeid() == root
->inode_auth
.first
;
3848 root
->state_set(CInode::STATE_AUTH
);
3850 root
->state_clear(CInode::STATE_AUTH
);
3852 root
->state_set(CInode::STATE_REJOINING
);
3856 set
<CInode
*> subtree_inodes
;
3857 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3858 p
!= subtrees
.end();
3860 if (p
->first
->dir_auth
.first
== mds
->get_nodeid())
3861 subtree_inodes
.insert(p
->first
->inode
);
3864 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3865 p
!= subtrees
.end();
3867 if (p
->first
->inode
->is_mdsdir()) {
3868 CInode
*in
= p
->first
->inode
;
3869 bool auth
= in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid());
3871 in
->state_set(CInode::STATE_AUTH
);
3873 in
->state_clear(CInode::STATE_AUTH
);
3875 in
->state_set(CInode::STATE_REJOINING
);
3879 std::queue
<CDir
*> dfq
; // dirfrag queue
3882 bool auth
= p
->first
->authority().first
== mds
->get_nodeid();
3883 dout(10) << " subtree auth=" << auth
<< " for " << *p
->first
<< dendl
;
3885 while (!dfq
.empty()) {
3886 CDir
*dir
= dfq
.front();
3891 dir
->state_set(CDir::STATE_AUTH
);
3893 dir
->state_clear(CDir::STATE_AUTH
);
3895 // close empty non-auth dirfrag
3896 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0) {
3897 dir
->inode
->close_dirfrag(dir
->get_frag());
3900 dir
->state_set(CDir::STATE_REJOINING
);
3901 dir
->state_clear(CDir::STATE_COMPLETE
);
3902 if (dir
->is_dirty())
3907 // dentries in this dir
3908 for (auto &p
: dir
->items
) {
3910 CDentry
*dn
= p
.second
;
3911 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3913 dn
->state_set(CDentry::STATE_AUTH
);
3915 dn
->state_clear(CDentry::STATE_AUTH
);
3917 dn
->state_set(CDentry::STATE_REJOINING
);
3923 if (dnl
->is_primary()) {
3925 CInode
*in
= dnl
->get_inode();
3927 in
->state_set(CInode::STATE_AUTH
);
3929 in
->state_clear(CInode::STATE_AUTH
);
3931 in
->state_set(CInode::STATE_REJOINING
);
3934 if (in
->is_dirty_parent())
3935 in
->clear_dirty_parent();
3936 // avoid touching scatterlocks for our subtree roots!
3937 if (subtree_inodes
.count(in
) == 0)
3938 in
->clear_scatter_dirty();
3943 auto&& dfv
= in
->get_nested_dirfrags();
3944 for (const auto& dir
: dfv
) {
3959 // ===========================================================================
3963 * notes on scatterlock recovery:
3965 * - recovering inode replica sends scatterlock data for any subtree
3966 * roots (the only ones that are possibly dirty).
3968 * - surviving auth incorporates any provided scatterlock data. any
3969 * pending gathers are then finished, as with the other lock types.
3971 * that takes care of surviving auth + (recovering replica)*.
3973 * - surviving replica sends strong_inode, which includes current
3974 * scatterlock state, AND any dirty scatterlock data. this
3975 * provides the recovering auth with everything it might need.
3977 * - recovering auth must pick initial scatterlock state based on
3978 * (weak|strong) rejoins.
3979 * - always assimilate scatterlock data (it can't hurt)
3980 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3981 * - include base inode in ack for all inodes that saw scatterlock content
3983 * also, for scatter gather,
3985 * - auth increments {frag,r}stat.version on completion of any gather.
3987 * - auth incorporates changes in a gather _only_ if the version
3990 * - replica discards changes any time the scatterlock syncs, and
3994 void MDCache::dump_rejoin_status(Formatter
*f
) const
3996 f
->open_object_section("rejoin_status");
3997 f
->dump_stream("rejoin_gather") << rejoin_gather
;
3998 f
->dump_stream("rejoin_ack_gather") << rejoin_ack_gather
;
3999 f
->dump_unsigned("num_opening_inodes", cap_imports_num_opening
);
4003 void MDCache::rejoin_start(MDSContext
*rejoin_done_
)
4005 dout(10) << "rejoin_start" << dendl
;
4006 ceph_assert(!rejoin_done
);
4007 rejoin_done
.reset(rejoin_done_
);
4009 rejoin_gather
= recovery_set
;
4010 // need finish opening cap inodes before sending cache rejoins
4011 rejoin_gather
.insert(mds
->get_nodeid());
4012 process_imported_caps();
4018 * this initiates rejoin. it should be called before we get any
4019 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
4021 * we start out by sending rejoins to everyone in the recovery set.
4023 * if we are rejoin, send for all regions in our cache.
4024 * if we are active|stopping, send only to nodes that are rejoining.
4026 void MDCache::rejoin_send_rejoins()
4028 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set
<< dendl
;
4030 if (rejoin_gather
.count(mds
->get_nodeid())) {
4031 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl
;
4032 rejoins_pending
= true;
4035 if (!resolve_gather
.empty()) {
4036 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
4037 << resolve_gather
<< ")" << dendl
;
4038 rejoins_pending
= true;
4042 ceph_assert(!migrator
->is_importing());
4043 ceph_assert(!migrator
->is_exporting());
4045 if (!mds
->is_rejoin()) {
4046 disambiguate_other_imports();
4049 map
<mds_rank_t
, ref_t
<MMDSCacheRejoin
>> rejoins
;
4052 // if i am rejoining, send a rejoin to everyone.
4053 // otherwise, just send to others who are rejoining.
4054 for (const auto& rank
: recovery_set
) {
4055 if (rank
== mds
->get_nodeid()) continue; // nothing to myself!
4056 if (rejoin_sent
.count(rank
)) continue; // already sent a rejoin to this node!
4057 if (mds
->is_rejoin())
4058 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_WEAK
);
4059 else if (mds
->mdsmap
->is_rejoin(rank
))
4060 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_STRONG
);
4063 if (mds
->is_rejoin()) {
4064 map
<client_t
, pair
<Session
*, set
<mds_rank_t
> > > client_exports
;
4065 for (auto& p
: cap_exports
) {
4066 mds_rank_t target
= p
.second
.first
;
4067 if (rejoins
.count(target
) == 0)
4069 for (auto q
= p
.second
.second
.begin(); q
!= p
.second
.second
.end(); ) {
4070 Session
*session
= nullptr;
4071 auto it
= client_exports
.find(q
->first
);
4072 if (it
!= client_exports
.end()) {
4073 session
= it
->second
.first
;
4075 it
->second
.second
.insert(target
);
4077 session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
4078 auto& r
= client_exports
[q
->first
];
4081 r
.second
.insert(target
);
4086 // remove reconnect with no session
4087 p
.second
.second
.erase(q
++);
4090 rejoins
[target
]->cap_exports
[p
.first
] = p
.second
.second
;
4092 for (auto& p
: client_exports
) {
4093 Session
*session
= p
.second
.first
;
4094 for (auto& q
: p
.second
.second
) {
4095 auto rejoin
= rejoins
[q
];
4096 rejoin
->client_map
[p
.first
] = session
->info
.inst
;
4097 rejoin
->client_metadata_map
[p
.first
] = session
->info
.client_metadata
;
4103 // check all subtrees
4104 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
4105 p
!= subtrees
.end();
4107 CDir
*dir
= p
->first
;
4108 ceph_assert(dir
->is_subtree_root());
4109 if (dir
->is_ambiguous_dir_auth()) {
4110 // exporter is recovering, importer is survivor.
4111 ceph_assert(rejoins
.count(dir
->authority().first
));
4112 ceph_assert(!rejoins
.count(dir
->authority().second
));
4118 continue; // skip my own regions!
4120 mds_rank_t auth
= dir
->get_dir_auth().first
;
4121 ceph_assert(auth
>= 0);
4122 if (rejoins
.count(auth
) == 0)
4123 continue; // don't care about this node's subtrees
4125 rejoin_walk(dir
, rejoins
[auth
]);
4128 // rejoin root inodes, too
4129 for (auto &p
: rejoins
) {
4130 if (mds
->is_rejoin()) {
4132 if (p
.first
== 0 && root
) {
4133 p
.second
->add_weak_inode(root
->vino());
4134 if (root
->is_dirty_scattered()) {
4135 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4136 p
.second
->add_scatterlock_state(root
);
4139 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4141 p
.second
->add_weak_inode(in
->vino());
4145 if (p
.first
== 0 && root
) {
4146 p
.second
->add_strong_inode(root
->vino(),
4147 root
->get_replica_nonce(),
4148 root
->get_caps_wanted(),
4149 root
->filelock
.get_state(),
4150 root
->nestlock
.get_state(),
4151 root
->dirfragtreelock
.get_state());
4152 root
->state_set(CInode::STATE_REJOINING
);
4153 if (root
->is_dirty_scattered()) {
4154 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4155 p
.second
->add_scatterlock_state(root
);
4159 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4160 p
.second
->add_strong_inode(in
->vino(),
4161 in
->get_replica_nonce(),
4162 in
->get_caps_wanted(),
4163 in
->filelock
.get_state(),
4164 in
->nestlock
.get_state(),
4165 in
->dirfragtreelock
.get_state());
4166 in
->state_set(CInode::STATE_REJOINING
);
4171 if (!mds
->is_rejoin()) {
4172 // i am survivor. send strong rejoin.
4173 // note request remote_auth_pins, xlocks
4174 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
4175 p
!= active_requests
.end();
4177 MDRequestRef
& mdr
= p
->second
;
4178 if (mdr
->is_slave())
4181 for (const auto& q
: mdr
->object_states
) {
4182 if (q
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
4184 if (!q
.first
->is_auth()) {
4185 mds_rank_t target
= q
.second
.remote_auth_pinned
;
4186 ceph_assert(target
== q
.first
->authority().first
);
4187 if (rejoins
.count(target
) == 0) continue;
4188 const auto& rejoin
= rejoins
[target
];
4190 dout(15) << " " << *mdr
<< " authpin on " << *q
.first
<< dendl
;
4191 MDSCacheObjectInfo i
;
4192 q
.first
->set_object_info(i
);
4194 rejoin
->add_inode_authpin(vinodeno_t(i
.ino
, i
.snapid
), mdr
->reqid
, mdr
->attempt
);
4196 rejoin
->add_dentry_authpin(i
.dirfrag
, i
.dname
, i
.snapid
, mdr
->reqid
, mdr
->attempt
);
4198 if (mdr
->has_more() && mdr
->more()->is_remote_frozen_authpin
&&
4199 mdr
->more()->rename_inode
== q
.first
)
4200 rejoin
->add_inode_frozen_authpin(vinodeno_t(i
.ino
, i
.snapid
),
4201 mdr
->reqid
, mdr
->attempt
);
4205 for (const auto& q
: mdr
->locks
) {
4207 auto obj
= lock
->get_parent();
4208 if (q
.is_xlock() && !obj
->is_auth()) {
4209 mds_rank_t who
= obj
->authority().first
;
4210 if (rejoins
.count(who
) == 0) continue;
4211 const auto& rejoin
= rejoins
[who
];
4213 dout(15) << " " << *mdr
<< " xlock on " << *lock
<< " " << *obj
<< dendl
;
4214 MDSCacheObjectInfo i
;
4215 obj
->set_object_info(i
);
4217 rejoin
->add_inode_xlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4218 mdr
->reqid
, mdr
->attempt
);
4220 rejoin
->add_dentry_xlock(i
.dirfrag
, i
.dname
, i
.snapid
,
4221 mdr
->reqid
, mdr
->attempt
);
4222 } else if (q
.is_remote_wrlock()) {
4223 mds_rank_t who
= q
.wrlock_target
;
4224 if (rejoins
.count(who
) == 0) continue;
4225 const auto& rejoin
= rejoins
[who
];
4227 dout(15) << " " << *mdr
<< " wrlock on " << *lock
<< " " << *obj
<< dendl
;
4228 MDSCacheObjectInfo i
;
4229 obj
->set_object_info(i
);
4231 rejoin
->add_inode_wrlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4232 mdr
->reqid
, mdr
->attempt
);
4238 // send the messages
4239 for (auto &p
: rejoins
) {
4240 ceph_assert(rejoin_sent
.count(p
.first
) == 0);
4241 ceph_assert(rejoin_ack_gather
.count(p
.first
) == 0);
4242 rejoin_sent
.insert(p
.first
);
4243 rejoin_ack_gather
.insert(p
.first
);
4244 mds
->send_message_mds(p
.second
, p
.first
);
4246 rejoin_ack_gather
.insert(mds
->get_nodeid()); // we need to complete rejoin_gather_finish, too
4247 rejoins_pending
= false;
4250 if (mds
->is_rejoin() && rejoin_gather
.empty()) {
4251 dout(10) << "nothing to rejoin" << dendl
;
4252 rejoin_gather_finish();
4258 * rejoin_walk - build rejoin declarations for a subtree
4260 * @param dir subtree root
4261 * @param rejoin rejoin message
4263 * from a rejoining node:
4265 * weak dentries (w/ connectivity)
4267 * from a surviving node:
4269 * strong dentries (no connectivity!)
4272 void MDCache::rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
)
4274 dout(10) << "rejoin_walk " << *dir
<< dendl
;
4276 std::vector
<CDir
*> nested
; // finish this dir, then do nested items
4278 if (mds
->is_rejoin()) {
4280 rejoin
->add_weak_dirfrag(dir
->dirfrag());
4281 for (auto &p
: dir
->items
) {
4282 CDentry
*dn
= p
.second
;
4283 ceph_assert(dn
->last
== CEPH_NOSNAP
);
4284 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4285 dout(15) << " add_weak_primary_dentry " << *dn
<< dendl
;
4286 ceph_assert(dnl
->is_primary());
4287 CInode
*in
= dnl
->get_inode();
4288 ceph_assert(dnl
->get_inode()->is_dir());
4289 rejoin
->add_weak_primary_dentry(dir
->ino(), dn
->get_name(), dn
->first
, dn
->last
, in
->ino());
4291 auto&& dirs
= in
->get_nested_dirfrags();
4292 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4294 if (in
->is_dirty_scattered()) {
4295 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4296 rejoin
->add_scatterlock_state(in
);
4301 dout(15) << " add_strong_dirfrag " << *dir
<< dendl
;
4302 rejoin
->add_strong_dirfrag(dir
->dirfrag(), dir
->get_replica_nonce(), dir
->get_dir_rep());
4303 dir
->state_set(CDir::STATE_REJOINING
);
4305 for (auto it
= dir
->items
.begin(); it
!= dir
->items
.end(); ) {
4306 CDentry
*dn
= it
->second
;
4308 dn
->state_set(CDentry::STATE_REJOINING
);
4309 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4310 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
4312 // trim snap dentries. because they may have been pruned by
4313 // their auth mds (snap deleted)
4314 if (dn
->last
!= CEPH_NOSNAP
) {
4315 if (in
&& !in
->remote_parents
.empty()) {
4316 // unlink any stale remote snap dentry.
4317 for (auto it2
= in
->remote_parents
.begin(); it2
!= in
->remote_parents
.end(); ) {
4318 CDentry
*remote_dn
= *it2
;
4320 ceph_assert(remote_dn
->last
!= CEPH_NOSNAP
);
4321 remote_dn
->unlink_remote(remote_dn
->get_linkage());
4324 if (dn
->lru_is_expireable()) {
4325 if (!dnl
->is_null())
4326 dir
->unlink_inode(dn
, false);
4329 dir
->remove_dentry(dn
);
4332 // Inventing null/remote dentry shouldn't cause problem
4333 ceph_assert(!dnl
->is_primary());
4337 dout(15) << " add_strong_dentry " << *dn
<< dendl
;
4338 rejoin
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
4339 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
4340 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
4341 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
4342 dn
->get_replica_nonce(),
4343 dn
->lock
.get_state());
4344 dn
->state_set(CDentry::STATE_REJOINING
);
4345 if (dnl
->is_primary()) {
4346 CInode
*in
= dnl
->get_inode();
4347 dout(15) << " add_strong_inode " << *in
<< dendl
;
4348 rejoin
->add_strong_inode(in
->vino(),
4349 in
->get_replica_nonce(),
4350 in
->get_caps_wanted(),
4351 in
->filelock
.get_state(),
4352 in
->nestlock
.get_state(),
4353 in
->dirfragtreelock
.get_state());
4354 in
->state_set(CInode::STATE_REJOINING
);
4356 auto&& dirs
= in
->get_nested_dirfrags();
4357 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4359 if (in
->is_dirty_scattered()) {
4360 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4361 rejoin
->add_scatterlock_state(in
);
4367 // recurse into nested dirs
4368 for (const auto& dir
: nested
) {
4369 rejoin_walk(dir
, rejoin
);
4376 * - reply with the lockstate
4378 * if i am active|stopping,
4379 * - remove source from replica list for everything not referenced here.
4381 void MDCache::handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
)
4383 dout(7) << "handle_cache_rejoin " << *m
<< " from " << m
->get_source()
4384 << " (" << m
->get_payload().length() << " bytes)"
4388 case MMDSCacheRejoin::OP_WEAK
:
4389 handle_cache_rejoin_weak(m
);
4391 case MMDSCacheRejoin::OP_STRONG
:
4392 handle_cache_rejoin_strong(m
);
4394 case MMDSCacheRejoin::OP_ACK
:
4395 handle_cache_rejoin_ack(m
);
4405 * handle_cache_rejoin_weak
4408 * - is recovering from their journal.
4409 * - may have incorrect (out of date) inode contents
4410 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4412 * if the sender didn't trim_non_auth(), they
4413 * - may have incorrect (out of date) dentry/inode linkage
4414 * - may have deleted/purged inodes
4415 * and i may have to go to disk to get accurate inode contents. yuck.
4417 void MDCache::handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &weak
)
4419 mds_rank_t from
= mds_rank_t(weak
->get_source().num());
4421 // possible response(s)
4422 ref_t
<MMDSCacheRejoin
> ack
; // if survivor
4423 set
<vinodeno_t
> acked_inodes
; // if survivor
4424 set
<SimpleLock
*> gather_locks
; // if survivor
4425 bool survivor
= false; // am i a survivor?
4427 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
4429 dout(10) << "i am a surivivor, and will ack immediately" << dendl
;
4430 ack
= make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
4432 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
4434 // check cap exports
4435 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4436 CInode
*in
= get_inode(p
->first
);
4437 ceph_assert(!in
|| in
->is_auth());
4438 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4439 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< " on " << *in
<< dendl
;
4440 Capability
*cap
= rejoin_import_cap(in
, q
->first
, q
->second
, from
);
4441 Capability::Import
& im
= imported_caps
[p
->first
][q
->first
];
4443 im
.cap_id
= cap
->get_cap_id();
4444 im
.issue_seq
= cap
->get_last_seq();
4445 im
.mseq
= cap
->get_mseq();
4450 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
4453 encode(imported_caps
, ack
->imported_caps
);
4455 ceph_assert(mds
->is_rejoin());
4457 // we may have already received a strong rejoin from the sender.
4458 rejoin_scour_survivor_replicas(from
, NULL
, acked_inodes
, gather_locks
);
4459 ceph_assert(gather_locks
.empty());
4461 // check cap exports.
4462 rejoin_client_map
.insert(weak
->client_map
.begin(), weak
->client_map
.end());
4463 rejoin_client_metadata_map
.insert(weak
->client_metadata_map
.begin(),
4464 weak
->client_metadata_map
.end());
4466 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4467 CInode
*in
= get_inode(p
->first
);
4468 ceph_assert(!in
|| in
->is_auth());
4470 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4471 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< dendl
;
4472 cap_imports
[p
->first
][q
->first
][from
] = q
->second
;
4477 // assimilate any potentially dirty scatterlock state
4478 for (const auto &p
: weak
->inode_scatterlocks
) {
4479 CInode
*in
= get_inode(p
.first
);
4481 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4482 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4483 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4485 rejoin_potential_updated_scatterlocks
.insert(in
);
4488 // recovering peer may send incorrect dirfrags here. we need to
4489 // infer which dirfrag they meant. the ack will include a
4490 // strong_dirfrag that will set them straight on the fragmentation.
4493 set
<CDir
*> dirs_to_share
;
4494 for (const auto &p
: weak
->weak_dirfrags
) {
4495 CInode
*diri
= get_inode(p
.ino
);
4497 dout(0) << " missing dir ino " << p
.ino
<< dendl
;
4501 if (diri
->dirfragtree
.is_leaf(p
.frag
)) {
4502 leaves
.push_back(p
.frag
);
4504 diri
->dirfragtree
.get_leaves_under(p
.frag
, leaves
);
4506 leaves
.push_back(diri
->dirfragtree
[p
.frag
.value()]);
4508 for (const auto& leaf
: leaves
) {
4509 CDir
*dir
= diri
->get_dirfrag(leaf
);
4511 dout(0) << " missing dir for " << p
.frag
<< " (which maps to " << leaf
<< ") on " << *diri
<< dendl
;
4515 if (dirs_to_share
.count(dir
)) {
4516 dout(10) << " already have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4518 dirs_to_share
.insert(dir
);
4519 unsigned nonce
= dir
->add_replica(from
);
4520 dout(10) << " have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4522 ack
->add_strong_dirfrag(dir
->dirfrag(), nonce
, dir
->dir_rep
);
4523 ack
->add_dirfrag_base(dir
);
4529 for (const auto &p
: weak
->weak
) {
4530 CInode
*diri
= get_inode(p
.first
);
4532 dout(0) << " missing dir ino " << p
.first
<< dendl
;
4537 for (const auto &q
: p
.second
) {
4538 // locate proper dirfrag.
4539 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4540 frag_t fg
= diri
->pick_dirfrag(q
.first
.name
);
4541 if (!dir
|| dir
->get_frag() != fg
) {
4542 dir
= diri
->get_dirfrag(fg
);
4544 dout(0) << " missing dir frag " << fg
<< " on " << *diri
<< dendl
;
4546 ceph_assert(dirs_to_share
.count(dir
));
4550 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
4552 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4553 ceph_assert(dnl
->is_primary());
4555 if (survivor
&& dn
->is_replica(from
))
4556 dentry_remove_replica(dn
, from
, gather_locks
);
4557 unsigned dnonce
= dn
->add_replica(from
);
4558 dout(10) << " have " << *dn
<< dendl
;
4560 ack
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
4561 dnl
->get_inode()->ino(), inodeno_t(0), 0,
4562 dnonce
, dn
->lock
.get_replica_state());
4565 CInode
*in
= dnl
->get_inode();
4568 if (survivor
&& in
->is_replica(from
))
4569 inode_remove_replica(in
, from
, true, gather_locks
);
4570 unsigned inonce
= in
->add_replica(from
);
4571 dout(10) << " have " << *in
<< dendl
;
4573 // scatter the dirlock, just in case?
4574 if (!survivor
&& in
->is_dir() && in
->has_subtree_root_dirfrag())
4575 in
->filelock
.set_state(LOCK_MIX
);
4578 acked_inodes
.insert(in
->vino());
4579 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4581 in
->_encode_locks_state_for_rejoin(bl
, from
);
4582 ack
->add_inode_locks(in
, inonce
, bl
);
4587 // weak base inodes? (root, stray, etc.)
4588 for (set
<vinodeno_t
>::iterator p
= weak
->weak_inodes
.begin();
4589 p
!= weak
->weak_inodes
.end();
4591 CInode
*in
= get_inode(*p
);
4592 ceph_assert(in
); // hmm fixme wrt stray?
4593 if (survivor
&& in
->is_replica(from
))
4594 inode_remove_replica(in
, from
, true, gather_locks
);
4595 unsigned inonce
= in
->add_replica(from
);
4596 dout(10) << " have base " << *in
<< dendl
;
4599 acked_inodes
.insert(in
->vino());
4600 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4602 in
->_encode_locks_state_for_rejoin(bl
, from
);
4603 ack
->add_inode_locks(in
, inonce
, bl
);
4607 ceph_assert(rejoin_gather
.count(from
));
4608 rejoin_gather
.erase(from
);
4610 // survivor. do everything now.
4611 for (const auto &p
: weak
->inode_scatterlocks
) {
4612 CInode
*in
= get_inode(p
.first
);
4614 dout(10) << " including base inode (due to potential scatterlock update) " << *in
<< dendl
;
4615 acked_inodes
.insert(in
->vino());
4616 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4619 rejoin_scour_survivor_replicas(from
, ack
, acked_inodes
, gather_locks
);
4620 mds
->send_message(ack
, weak
->get_connection());
4622 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
4623 if (!(*p
)->is_stable())
4624 mds
->locker
->eval_gather(*p
);
4628 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4629 rejoin_gather_finish();
4631 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4637 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4639 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4640 * ack, the replica dne, and we can remove it from our replica maps.
4642 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
4643 set
<vinodeno_t
>& acked_inodes
,
4644 set
<SimpleLock
*>& gather_locks
)
4646 dout(10) << "rejoin_scour_survivor_replicas from mds." << from
<< dendl
;
4648 auto scour_func
= [this, from
, ack
, &acked_inodes
, &gather_locks
] (CInode
*in
) {
4650 if (in
->is_auth() &&
4651 in
->is_replica(from
) &&
4652 (ack
== NULL
|| acked_inodes
.count(in
->vino()) == 0)) {
4653 inode_remove_replica(in
, from
, false, gather_locks
);
4654 dout(10) << " rem " << *in
<< dendl
;
4660 const auto&& dfs
= in
->get_dirfrags();
4661 for (const auto& dir
: dfs
) {
4662 if (!dir
->is_auth())
4665 if (dir
->is_replica(from
) &&
4666 (ack
== NULL
|| ack
->strong_dirfrags
.count(dir
->dirfrag()) == 0)) {
4667 dir
->remove_replica(from
);
4668 dout(10) << " rem " << *dir
<< dendl
;
4672 for (auto &p
: dir
->items
) {
4673 CDentry
*dn
= p
.second
;
4675 if (dn
->is_replica(from
)) {
4677 const auto it
= ack
->strong_dentries
.find(dir
->dirfrag());
4678 if (it
!= ack
->strong_dentries
.end() && it
->second
.count(string_snap_t(dn
->get_name(), dn
->last
)) > 0) {
4682 dentry_remove_replica(dn
, from
, gather_locks
);
4683 dout(10) << " rem " << *dn
<< dendl
;
4689 for (auto &p
: inode_map
)
4690 scour_func(p
.second
);
4691 for (auto &p
: snap_inode_map
)
4692 scour_func(p
.second
);
4696 CInode
*MDCache::rejoin_invent_inode(inodeno_t ino
, snapid_t last
)
4698 CInode
*in
= new CInode(this, true, 1, last
);
4699 in
->inode
.ino
= ino
;
4700 in
->state_set(CInode::STATE_REJOINUNDEF
);
4702 rejoin_undef_inodes
.insert(in
);
4703 dout(10) << " invented " << *in
<< dendl
;
4707 CDir
*MDCache::rejoin_invent_dirfrag(dirfrag_t df
)
4709 CInode
*in
= get_inode(df
.ino
);
4711 in
= rejoin_invent_inode(df
.ino
, CEPH_NOSNAP
);
4712 if (!in
->is_dir()) {
4713 ceph_assert(in
->state_test(CInode::STATE_REJOINUNDEF
));
4714 in
->inode
.mode
= S_IFDIR
;
4715 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4717 CDir
*dir
= in
->get_or_open_dirfrag(this, df
.frag
);
4718 dir
->state_set(CDir::STATE_REJOINUNDEF
);
4719 rejoin_undef_dirfrags
.insert(dir
);
4720 dout(10) << " invented " << *dir
<< dendl
;
4724 void MDCache::handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &strong
)
4726 mds_rank_t from
= mds_rank_t(strong
->get_source().num());
4728 // only a recovering node will get a strong rejoin.
4729 if (!mds
->is_rejoin()) {
4730 if (mds
->get_want_state() == MDSMap::STATE_REJOIN
) {
4731 mds
->wait_for_rejoin(new C_MDS_RetryMessage(mds
, strong
));
4734 ceph_abort_msg("got unexpected rejoin message during recovery");
4737 // assimilate any potentially dirty scatterlock state
4738 for (const auto &p
: strong
->inode_scatterlocks
) {
4739 CInode
*in
= get_inode(p
.first
);
4741 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4742 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4743 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4744 rejoin_potential_updated_scatterlocks
.insert(in
);
4747 rejoin_unlinked_inodes
[from
].clear();
4749 // surviving peer may send incorrect dirfrag here (maybe they didn't
4750 // get the fragment notify, or maybe we rolled back?). we need to
4751 // infer the right frag and get them with the program. somehow.
4752 // we don't normally send ACK.. so we'll need to bundle this with
4753 // MISSING or something.
4755 // strong dirfrags/dentries.
4756 // also process auth_pins, xlocks.
4757 for (const auto &p
: strong
->strong_dirfrags
) {
4758 auto& dirfrag
= p
.first
;
4759 CInode
*diri
= get_inode(dirfrag
.ino
);
4761 diri
= rejoin_invent_inode(dirfrag
.ino
, CEPH_NOSNAP
);
4762 CDir
*dir
= diri
->get_dirfrag(dirfrag
.frag
);
4763 bool refragged
= false;
4765 dout(10) << " have " << *dir
<< dendl
;
4767 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
4768 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), frag_t()));
4769 else if (diri
->dirfragtree
.is_leaf(dirfrag
.frag
))
4770 dir
= rejoin_invent_dirfrag(dirfrag
);
4773 dir
->add_replica(from
, p
.second
.nonce
);
4774 dir
->dir_rep
= p
.second
.dir_rep
;
4776 dout(10) << " frag " << dirfrag
<< " doesn't match dirfragtree " << *diri
<< dendl
;
4778 diri
->dirfragtree
.get_leaves_under(dirfrag
.frag
, leaves
);
4780 leaves
.push_back(diri
->dirfragtree
[dirfrag
.frag
.value()]);
4781 dout(10) << " maps to frag(s) " << leaves
<< dendl
;
4782 for (const auto& leaf
: leaves
) {
4783 CDir
*dir
= diri
->get_dirfrag(leaf
);
4785 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), leaf
));
4787 dout(10) << " have(approx) " << *dir
<< dendl
;
4788 dir
->add_replica(from
, p
.second
.nonce
);
4789 dir
->dir_rep
= p
.second
.dir_rep
;
4794 const auto it
= strong
->strong_dentries
.find(dirfrag
);
4795 if (it
!= strong
->strong_dentries
.end()) {
4796 const auto& dmap
= it
->second
;
4797 for (const auto &q
: dmap
) {
4798 const string_snap_t
& ss
= q
.first
;
4799 const MMDSCacheRejoin::dn_strong
& d
= q
.second
;
4802 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4804 frag_t fg
= diri
->pick_dirfrag(ss
.name
);
4805 dir
= diri
->get_dirfrag(fg
);
4807 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4810 if (d
.is_remote()) {
4811 dn
= dir
->add_remote_dentry(ss
.name
, d
.remote_ino
, d
.remote_d_type
, d
.first
, ss
.snapid
);
4812 } else if (d
.is_null()) {
4813 dn
= dir
->add_null_dentry(ss
.name
, d
.first
, ss
.snapid
);
4815 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4816 if (!in
) in
= rejoin_invent_inode(d
.ino
, ss
.snapid
);
4817 dn
= dir
->add_primary_dentry(ss
.name
, in
, d
.first
, ss
.snapid
);
4819 dout(10) << " invented " << *dn
<< dendl
;
4821 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4824 const auto pinned_it
= strong
->authpinned_dentries
.find(dirfrag
);
4825 if (pinned_it
!= strong
->authpinned_dentries
.end()) {
4826 const auto slave_reqid_it
= pinned_it
->second
.find(ss
);
4827 if (slave_reqid_it
!= pinned_it
->second
.end()) {
4828 for (const auto &r
: slave_reqid_it
->second
) {
4829 dout(10) << " dn authpin by " << r
<< " on " << *dn
<< dendl
;
4831 // get/create slave mdrequest
4833 if (have_request(r
.reqid
))
4834 mdr
= request_get(r
.reqid
);
4836 mdr
= request_start_slave(r
.reqid
, r
.attempt
, strong
);
4843 const auto xlocked_it
= strong
->xlocked_dentries
.find(dirfrag
);
4844 if (xlocked_it
!= strong
->xlocked_dentries
.end()) {
4845 const auto ss_req_it
= xlocked_it
->second
.find(ss
);
4846 if (ss_req_it
!= xlocked_it
->second
.end()) {
4847 const MMDSCacheRejoin::slave_reqid
& r
= ss_req_it
->second
;
4848 dout(10) << " dn xlock by " << r
<< " on " << *dn
<< dendl
;
4849 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4850 ceph_assert(mdr
->is_auth_pinned(dn
));
4851 if (!mdr
->is_xlocked(&dn
->versionlock
)) {
4852 ceph_assert(dn
->versionlock
.can_xlock_local());
4853 dn
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4854 mdr
->emplace_lock(&dn
->versionlock
, MutationImpl::LockOp::XLOCK
);
4856 if (dn
->lock
.is_stable())
4857 dn
->auth_pin(&dn
->lock
);
4858 dn
->lock
.set_state(LOCK_XLOCK
);
4859 dn
->lock
.get_xlock(mdr
, mdr
->get_client());
4860 mdr
->emplace_lock(&dn
->lock
, MutationImpl::LockOp::XLOCK
);
4864 dn
->add_replica(from
, d
.nonce
);
4865 dout(10) << " have " << *dn
<< dendl
;
4867 if (dnl
->is_primary()) {
4868 if (d
.is_primary()) {
4869 if (vinodeno_t(d
.ino
, ss
.snapid
) != dnl
->get_inode()->vino()) {
4870 // the survivor missed MDentryUnlink+MDentryLink messages ?
4871 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4872 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4874 ceph_assert(in
->get_parent_dn());
4875 rejoin_unlinked_inodes
[from
].insert(in
);
4876 dout(7) << " sender has primary dentry but wrong inode" << dendl
;
4879 // the survivor missed MDentryLink message ?
4880 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4881 dout(7) << " sender doesn't have primay dentry" << dendl
;
4884 if (d
.is_primary()) {
4885 // the survivor missed MDentryUnlink message ?
4886 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4888 ceph_assert(in
->get_parent_dn());
4889 rejoin_unlinked_inodes
[from
].insert(in
);
4890 dout(7) << " sender has primary dentry but we don't" << dendl
;
4897 for (const auto &p
: strong
->strong_inodes
) {
4898 CInode
*in
= get_inode(p
.first
);
4900 in
->add_replica(from
, p
.second
.nonce
);
4901 dout(10) << " have " << *in
<< dendl
;
4903 const MMDSCacheRejoin::inode_strong
& is
= p
.second
;
4906 if (is
.caps_wanted
) {
4907 in
->set_mds_caps_wanted(from
, is
.caps_wanted
);
4908 dout(15) << " inode caps_wanted " << ccap_string(is
.caps_wanted
)
4909 << " on " << *in
<< dendl
;
4913 // infer state from replica state:
4914 // * go to MIX if they might have wrlocks
4915 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4916 in
->filelock
.infer_state_from_strong_rejoin(is
.filelock
, !in
->is_dir()); // maybe also go to LOCK
4917 in
->nestlock
.infer_state_from_strong_rejoin(is
.nestlock
, false);
4918 in
->dirfragtreelock
.infer_state_from_strong_rejoin(is
.dftlock
, false);
4921 const auto authpinned_inodes_it
= strong
->authpinned_inodes
.find(in
->vino());
4922 if (authpinned_inodes_it
!= strong
->authpinned_inodes
.end()) {
4923 for (const auto& r
: authpinned_inodes_it
->second
) {
4924 dout(10) << " inode authpin by " << r
<< " on " << *in
<< dendl
;
4926 // get/create slave mdrequest
4928 if (have_request(r
.reqid
))
4929 mdr
= request_get(r
.reqid
);
4931 mdr
= request_start_slave(r
.reqid
, r
.attempt
, strong
);
4932 if (strong
->frozen_authpin_inodes
.count(in
->vino())) {
4933 ceph_assert(!in
->get_num_auth_pins());
4934 mdr
->freeze_auth_pin(in
);
4936 ceph_assert(!in
->is_frozen_auth_pin());
4942 const auto xlocked_inodes_it
= strong
->xlocked_inodes
.find(in
->vino());
4943 if (xlocked_inodes_it
!= strong
->xlocked_inodes
.end()) {
4944 for (const auto &q
: xlocked_inodes_it
->second
) {
4945 SimpleLock
*lock
= in
->get_lock(q
.first
);
4946 dout(10) << " inode xlock by " << q
.second
<< " on " << *lock
<< " on " << *in
<< dendl
;
4947 MDRequestRef mdr
= request_get(q
.second
.reqid
); // should have this from auth_pin above.
4948 ceph_assert(mdr
->is_auth_pinned(in
));
4949 if (!mdr
->is_xlocked(&in
->versionlock
)) {
4950 ceph_assert(in
->versionlock
.can_xlock_local());
4951 in
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4952 mdr
->emplace_lock(&in
->versionlock
, MutationImpl::LockOp::XLOCK
);
4954 if (lock
->is_stable())
4956 lock
->set_state(LOCK_XLOCK
);
4957 if (lock
== &in
->filelock
)
4959 lock
->get_xlock(mdr
, mdr
->get_client());
4960 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
4965 for (const auto &p
: strong
->wrlocked_inodes
) {
4966 CInode
*in
= get_inode(p
.first
);
4967 for (const auto &q
: p
.second
) {
4968 SimpleLock
*lock
= in
->get_lock(q
.first
);
4969 for (const auto &r
: q
.second
) {
4970 dout(10) << " inode wrlock by " << r
<< " on " << *lock
<< " on " << *in
<< dendl
;
4971 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4973 ceph_assert(mdr
->is_auth_pinned(in
));
4974 lock
->set_state(LOCK_MIX
);
4975 if (lock
== &in
->filelock
)
4977 lock
->get_wrlock(true);
4978 mdr
->emplace_lock(lock
, MutationImpl::LockOp::WRLOCK
);
4984 ceph_assert(rejoin_gather
.count(from
));
4985 rejoin_gather
.erase(from
);
4986 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4987 rejoin_gather_finish();
4989 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4993 void MDCache::handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &ack
)
4995 dout(7) << "handle_cache_rejoin_ack from " << ack
->get_source() << dendl
;
4996 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
4998 ceph_assert(mds
->get_state() >= MDSMap::STATE_REJOIN
);
4999 bool survivor
= !mds
->is_rejoin();
5001 // for sending cache expire message
5002 set
<CInode
*> isolated_inodes
;
5003 set
<CInode
*> refragged_inodes
;
5004 list
<pair
<CInode
*,int> > updated_realms
;
5007 for (const auto &p
: ack
->strong_dirfrags
) {
5008 // we may have had incorrect dir fragmentation; refragment based
5009 // on what they auth tells us.
5010 CDir
*dir
= get_dirfrag(p
.first
);
5012 dir
= get_force_dirfrag(p
.first
, false);
5014 refragged_inodes
.insert(dir
->get_inode());
5017 CInode
*diri
= get_inode(p
.first
.ino
);
5019 // barebones inode; the full inode loop below will clean up.
5020 diri
= new CInode(this, false);
5021 diri
->inode
.ino
= p
.first
.ino
;
5022 diri
->inode
.mode
= S_IFDIR
;
5023 diri
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
5025 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
) {
5026 diri
->inode_auth
= mds_authority_t(from
, CDIR_AUTH_UNKNOWN
);
5027 dout(10) << " add inode " << *diri
<< dendl
;
5029 diri
->inode_auth
= CDIR_AUTH_DEFAULT
;
5030 isolated_inodes
.insert(diri
);
5031 dout(10) << " unconnected dirfrag " << p
.first
<< dendl
;
5034 // barebones dirfrag; the full dirfrag loop below will clean up.
5035 dir
= diri
->add_dirfrag(new CDir(diri
, p
.first
.frag
, this, false));
5036 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
||
5037 (dir
->authority() != CDIR_AUTH_UNDEF
&&
5038 dir
->authority().first
!= from
))
5039 adjust_subtree_auth(dir
, from
);
5040 dout(10) << " add dirfrag " << *dir
<< dendl
;
5043 dir
->set_replica_nonce(p
.second
.nonce
);
5044 dir
->state_clear(CDir::STATE_REJOINING
);
5045 dout(10) << " got " << *dir
<< dendl
;
5048 auto it
= ack
->strong_dentries
.find(p
.first
);
5049 if (it
!= ack
->strong_dentries
.end()) {
5050 for (const auto &q
: it
->second
) {
5051 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
5053 dn
= dir
->add_null_dentry(q
.first
.name
, q
.second
.first
, q
.first
.snapid
);
5055 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5057 ceph_assert(dn
->last
== q
.first
.snapid
);
5058 if (dn
->first
!= q
.second
.first
) {
5059 dout(10) << " adjust dn.first " << dn
->first
<< " -> " << q
.second
.first
<< " on " << *dn
<< dendl
;
5060 dn
->first
= q
.second
.first
;
5063 // may have bad linkage if we missed dentry link/unlink messages
5064 if (dnl
->is_primary()) {
5065 CInode
*in
= dnl
->get_inode();
5066 if (!q
.second
.is_primary() ||
5067 vinodeno_t(q
.second
.ino
, q
.first
.snapid
) != in
->vino()) {
5068 dout(10) << " had bad linkage for " << *dn
<< ", unlinking " << *in
<< dendl
;
5069 dir
->unlink_inode(dn
);
5071 } else if (dnl
->is_remote()) {
5072 if (!q
.second
.is_remote() ||
5073 q
.second
.remote_ino
!= dnl
->get_remote_ino() ||
5074 q
.second
.remote_d_type
!= dnl
->get_remote_d_type()) {
5075 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5076 dir
->unlink_inode(dn
);
5079 if (!q
.second
.is_null())
5080 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5083 // hmm, did we have the proper linkage here?
5084 if (dnl
->is_null() && !q
.second
.is_null()) {
5085 if (q
.second
.is_remote()) {
5086 dn
->dir
->link_remote_inode(dn
, q
.second
.remote_ino
, q
.second
.remote_d_type
);
5088 CInode
*in
= get_inode(q
.second
.ino
, q
.first
.snapid
);
5090 // barebones inode; assume it's dir, the full inode loop below will clean up.
5091 in
= new CInode(this, false, q
.second
.first
, q
.first
.snapid
);
5092 in
->inode
.ino
= q
.second
.ino
;
5093 in
->inode
.mode
= S_IFDIR
;
5094 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
5096 dout(10) << " add inode " << *in
<< dendl
;
5097 } else if (in
->get_parent_dn()) {
5098 dout(10) << " had bad linkage for " << *(in
->get_parent_dn())
5099 << ", unlinking " << *in
<< dendl
;
5100 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
5102 dn
->dir
->link_primary_inode(dn
, in
);
5103 isolated_inodes
.erase(in
);
5107 dn
->set_replica_nonce(q
.second
.nonce
);
5108 dn
->lock
.set_state_rejoin(q
.second
.lock
, rejoin_waiters
, survivor
);
5109 dn
->state_clear(CDentry::STATE_REJOINING
);
5110 dout(10) << " got " << *dn
<< dendl
;
5115 for (const auto& in
: refragged_inodes
) {
5116 auto&& ls
= in
->get_nested_dirfrags();
5117 for (const auto& dir
: ls
) {
5118 if (dir
->is_auth() || ack
->strong_dirfrags
.count(dir
->dirfrag()))
5120 ceph_assert(dir
->get_num_any() == 0);
5121 in
->close_dirfrag(dir
->get_frag());
5126 for (const auto &p
: ack
->dirfrag_bases
) {
5127 CDir
*dir
= get_dirfrag(p
.first
);
5129 auto q
= p
.second
.cbegin();
5130 dir
->_decode_base(q
);
5131 dout(10) << " got dir replica " << *dir
<< dendl
;
5135 auto p
= ack
->inode_base
.cbegin();
5143 CInode
*in
= get_inode(ino
, last
);
5145 auto q
= basebl
.cbegin();
5148 sseq
= in
->snaprealm
->srnode
.seq
;
5149 in
->_decode_base(q
);
5150 if (in
->snaprealm
&& in
->snaprealm
->srnode
.seq
!= sseq
) {
5151 int snap_op
= sseq
> 0 ? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
;
5152 updated_realms
.push_back(pair
<CInode
*,int>(in
, snap_op
));
5154 dout(10) << " got inode base " << *in
<< dendl
;
5158 p
= ack
->inode_locks
.cbegin();
5159 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5170 CInode
*in
= get_inode(ino
, last
);
5172 in
->set_replica_nonce(nonce
);
5173 auto q
= lockbl
.cbegin();
5174 in
->_decode_locks_rejoin(q
, rejoin_waiters
, rejoin_eval_locks
, survivor
);
5175 in
->state_clear(CInode::STATE_REJOINING
);
5176 dout(10) << " got inode locks " << *in
<< dendl
;
5179 // FIXME: This can happen if entire subtree, together with the inode subtree root
5180 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5181 ceph_assert(isolated_inodes
.empty());
5183 map
<inodeno_t
,map
<client_t
,Capability::Import
> > peer_imported
;
5184 auto bp
= ack
->imported_caps
.cbegin();
5185 decode(peer_imported
, bp
);
5187 for (map
<inodeno_t
,map
<client_t
,Capability::Import
> >::iterator p
= peer_imported
.begin();
5188 p
!= peer_imported
.end();
5190 auto& ex
= cap_exports
.at(p
->first
);
5191 ceph_assert(ex
.first
== from
);
5192 for (map
<client_t
,Capability::Import
>::iterator q
= p
->second
.begin();
5193 q
!= p
->second
.end();
5195 auto r
= ex
.second
.find(q
->first
);
5196 ceph_assert(r
!= ex
.second
.end());
5198 dout(10) << " exporting caps for client." << q
->first
<< " ino " << p
->first
<< dendl
;
5199 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5201 dout(10) << " no session for client." << p
->first
<< dendl
;
5206 // mark client caps stale.
5207 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
, 0,
5208 r
->second
.capinfo
.cap_id
, 0,
5209 mds
->get_osd_epoch_barrier());
5210 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
5211 (q
->second
.cap_id
> 0 ? from
: -1), 0);
5212 mds
->send_message_client_counted(m
, session
);
5216 ceph_assert(ex
.second
.empty());
5219 for (auto p
: updated_realms
) {
5220 CInode
*in
= p
.first
;
5221 bool notify_clients
;
5222 if (mds
->is_rejoin()) {
5223 if (!rejoin_pending_snaprealms
.count(in
)) {
5224 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5225 rejoin_pending_snaprealms
.insert(in
);
5227 notify_clients
= false;
5229 // notify clients if I'm survivor
5230 notify_clients
= true;
5232 do_realm_invalidate_and_update_notify(in
, p
.second
, notify_clients
);
5236 ceph_assert(rejoin_ack_gather
.count(from
));
5237 rejoin_ack_gather
.erase(from
);
5239 if (rejoin_gather
.empty()) {
5240 // eval unstable scatter locks after all wrlocks are rejoined.
5241 while (!rejoin_eval_locks
.empty()) {
5242 SimpleLock
*lock
= rejoin_eval_locks
.front();
5243 rejoin_eval_locks
.pop_front();
5244 if (!lock
->is_stable())
5245 mds
->locker
->eval_gather(lock
);
5249 if (rejoin_gather
.empty() && // make sure we've gotten our FULL inodes, too.
5250 rejoin_ack_gather
.empty()) {
5251 // finally, kickstart past snap parent opens
5254 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")"
5255 << ", rejoin_ack from (" << rejoin_ack_gather
<< ")" << dendl
;
5259 mds
->queue_waiters(rejoin_waiters
);
5264 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5266 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5267 * messages that clean these guys up...
5269 void MDCache::rejoin_trim_undef_inodes()
5271 dout(10) << "rejoin_trim_undef_inodes" << dendl
;
5273 while (!rejoin_undef_inodes
.empty()) {
5274 set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5276 rejoin_undef_inodes
.erase(p
);
5278 in
->clear_replica_map();
5280 // close out dirfrags
5282 const auto&& dfls
= in
->get_dirfrags();
5283 for (const auto& dir
: dfls
) {
5284 dir
->clear_replica_map();
5286 for (auto &p
: dir
->items
) {
5287 CDentry
*dn
= p
.second
;
5288 dn
->clear_replica_map();
5290 dout(10) << " trimming " << *dn
<< dendl
;
5291 dir
->remove_dentry(dn
);
5294 dout(10) << " trimming " << *dir
<< dendl
;
5295 in
->close_dirfrag(dir
->dirfrag().frag
);
5299 CDentry
*dn
= in
->get_parent_dn();
5301 dn
->clear_replica_map();
5302 dout(10) << " trimming " << *dn
<< dendl
;
5303 dn
->dir
->remove_dentry(dn
);
5305 dout(10) << " trimming " << *in
<< dendl
;
5310 ceph_assert(rejoin_undef_inodes
.empty());
5313 void MDCache::rejoin_gather_finish()
5315 dout(10) << "rejoin_gather_finish" << dendl
;
5316 ceph_assert(mds
->is_rejoin());
5317 ceph_assert(rejoin_ack_gather
.count(mds
->get_nodeid()));
5319 if (open_undef_inodes_dirfrags())
5322 if (process_imported_caps())
5325 choose_lock_states_and_reconnect_caps();
5327 identify_files_to_recover();
5330 // signal completion of fetches, rejoin_gather_finish, etc.
5331 rejoin_ack_gather
.erase(mds
->get_nodeid());
5333 // did we already get our acks too?
5334 if (rejoin_ack_gather
.empty()) {
5335 // finally, open snaprealms
5340 class C_MDC_RejoinOpenInoFinish
: public MDCacheContext
{
5343 C_MDC_RejoinOpenInoFinish(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
5344 void finish(int r
) override
{
5345 mdcache
->rejoin_open_ino_finish(ino
, r
);
5349 void MDCache::rejoin_open_ino_finish(inodeno_t ino
, int ret
)
5351 dout(10) << "open_caps_inode_finish ino " << ino
<< " ret " << ret
<< dendl
;
5354 cap_imports_missing
.insert(ino
);
5355 } else if (ret
== mds
->get_nodeid()) {
5356 ceph_assert(get_inode(ino
));
5358 auto p
= cap_imports
.find(ino
);
5359 ceph_assert(p
!= cap_imports
.end());
5360 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5361 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5362 ceph_assert(q
->second
.size() == 1);
5363 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5365 cap_imports
.erase(p
);
5368 ceph_assert(cap_imports_num_opening
> 0);
5369 cap_imports_num_opening
--;
5371 if (cap_imports_num_opening
== 0) {
5372 if (rejoin_gather
.empty())
5373 rejoin_gather_finish();
5374 else if (rejoin_gather
.count(mds
->get_nodeid()))
5375 process_imported_caps();
5379 class C_MDC_RejoinSessionsOpened
: public MDCacheLogContext
{
5381 map
<client_t
,pair
<Session
*,uint64_t> > session_map
;
5382 C_MDC_RejoinSessionsOpened(MDCache
*c
) : MDCacheLogContext(c
) {}
5383 void finish(int r
) override
{
5384 ceph_assert(r
== 0);
5385 mdcache
->rejoin_open_sessions_finish(session_map
);
/*
 * Called once the ESessions entry journaling force-opened client sessions
 * has committed: finalize the sessions on the server, stash the
 * client -> (Session*, connection seq) map for the rest of rejoin, and
 * resume rejoin_gather_finish() if nothing else is pending.
 */
void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
{
  dout(10) << "rejoin_open_sessions_finish" << dendl;
  mds->server->finish_force_open_sessions(session_map);
  // keep the map for cap-import processing; swap avoids copying
  rejoin_session_map.swap(session_map);
  if (rejoin_gather.empty())
    rejoin_gather_finish();
}
5398 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
)
5400 auto p
= cap_imports
.find(ino
);
5401 if (p
!= cap_imports
.end()) {
5402 dout(10) << __func__
<< " ino " << ino
<< " ret " << ret
<< dendl
;
5404 cap_imports_missing
.insert(ino
);
5405 } else if (ret
!= mds
->get_nodeid()) {
5406 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5407 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5408 ceph_assert(q
->second
.size() == 1);
5409 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5411 cap_imports
.erase(p
);
5416 bool MDCache::process_imported_caps()
5418 dout(10) << "process_imported_caps" << dendl
;
5420 if (!open_file_table
.is_prefetched() &&
5421 open_file_table
.prefetch_inodes()) {
5422 open_file_table
.wait_for_prefetch(
5423 new MDSInternalContextWrapper(mds
,
5424 new LambdaContext([this](int r
) {
5425 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5426 process_imported_caps();
5433 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5434 CInode
*in
= get_inode(p
->first
);
5436 ceph_assert(in
->is_auth());
5437 cap_imports_missing
.erase(p
->first
);
5440 if (cap_imports_missing
.count(p
->first
) > 0)
5443 cap_imports_num_opening
++;
5444 dout(10) << " opening missing ino " << p
->first
<< dendl
;
5445 open_ino(p
->first
, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p
->first
), false);
5446 if (!(cap_imports_num_opening
% 1000))
5447 mds
->heartbeat_reset();
5450 if (cap_imports_num_opening
> 0)
5453 // called by rejoin_gather_finish() ?
5454 if (rejoin_gather
.count(mds
->get_nodeid()) == 0) {
5455 if (!rejoin_client_map
.empty() &&
5456 rejoin_session_map
.empty()) {
5457 C_MDC_RejoinSessionsOpened
*finish
= new C_MDC_RejoinSessionsOpened(this);
5458 version_t pv
= mds
->server
->prepare_force_open_sessions(rejoin_client_map
,
5459 rejoin_client_metadata_map
,
5460 finish
->session_map
);
5461 ESessions
*le
= new ESessions(pv
, std::move(rejoin_client_map
),
5462 std::move(rejoin_client_metadata_map
));
5463 mds
->mdlog
->start_submit_entry(le
, finish
);
5464 mds
->mdlog
->flush();
5465 rejoin_client_map
.clear();
5466 rejoin_client_metadata_map
.clear();
5470 // process caps that were exported by slave rename
5471 for (map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > >::iterator p
= rejoin_slave_exports
.begin();
5472 p
!= rejoin_slave_exports
.end();
5474 CInode
*in
= get_inode(p
->first
);
5476 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.second
.begin();
5477 q
!= p
->second
.second
.end();
5479 auto r
= rejoin_session_map
.find(q
->first
);
5480 if (r
== rejoin_session_map
.end())
5483 Session
*session
= r
->second
.first
;
5484 Capability
*cap
= in
->get_client_cap(q
->first
);
5486 cap
= in
->add_client_cap(q
->first
, session
);
5487 // add empty item to reconnected_caps
5488 (void)reconnected_caps
[p
->first
][q
->first
];
5490 cap
->merge(q
->second
, true);
5492 Capability::Import
& im
= rejoin_imported_caps
[p
->second
.first
][p
->first
][q
->first
];
5493 ceph_assert(cap
->get_last_seq() == im
.issue_seq
);
5494 ceph_assert(cap
->get_mseq() == im
.mseq
);
5495 cap
->set_cap_id(im
.cap_id
);
5496 // send cap import because we assigned a new cap ID
5497 do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
, q
->second
.mseq
- 1,
5498 p
->second
.first
, CEPH_CAP_FLAG_AUTH
);
5501 rejoin_slave_exports
.clear();
5502 rejoin_imported_caps
.clear();
5504 // process cap imports
5505 // ino -> client -> frommds -> capex
5506 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ) {
5507 CInode
*in
= get_inode(p
->first
);
5509 dout(10) << " still missing ino " << p
->first
5510 << ", will try again after replayed client requests" << dendl
;
5514 ceph_assert(in
->is_auth());
5515 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5518 auto r
= rejoin_session_map
.find(q
->first
);
5519 session
= (r
!= rejoin_session_map
.end() ? r
->second
.first
: nullptr);
5522 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5525 (void)rejoin_imported_caps
[r
->first
][p
->first
][q
->first
]; // all are zero
5529 Capability
*cap
= in
->reconnect_cap(q
->first
, r
->second
, session
);
5530 add_reconnected_cap(q
->first
, in
->ino(), r
->second
);
5531 if (r
->first
>= 0) {
5532 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5534 do_cap_import(session
, in
, cap
, r
->second
.capinfo
.cap_id
, 0, 0, r
->first
, 0);
5536 Capability::Import
& im
= rejoin_imported_caps
[r
->first
][p
->first
][q
->first
];
5537 im
.cap_id
= cap
->get_cap_id();
5538 im
.issue_seq
= cap
->get_last_seq();
5539 im
.mseq
= cap
->get_mseq();
5543 cap_imports
.erase(p
++); // remove and move on
5548 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5549 rejoin_gather
.erase(mds
->get_nodeid());
5550 ceph_assert(!rejoin_ack_gather
.count(mds
->get_nodeid()));
5551 maybe_send_pending_rejoins();
5556 void MDCache::rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
,
5557 client_t client
, snapid_t snap_follows
)
5559 dout(10) << "rebuild_need_snapflush " << snap_follows
<< " on " << *head_in
<< dendl
;
5561 if (!realm
->has_snaps_in_range(snap_follows
+ 1, head_in
->first
- 1))
5564 const set
<snapid_t
>& snaps
= realm
->get_snaps();
5565 snapid_t follows
= snap_follows
;
5568 CInode
*in
= pick_inode_snap(head_in
, follows
);
5572 bool need_snapflush
= false;
5573 for (auto p
= snaps
.lower_bound(std::max
<snapid_t
>(in
->first
, (follows
+ 1)));
5574 p
!= snaps
.end() && *p
<= in
->last
;
5576 head_in
->add_need_snapflush(in
, *p
, client
);
5577 need_snapflush
= true;
5580 if (!need_snapflush
)
5583 dout(10) << " need snapflush from client." << client
<< " on " << *in
<< dendl
;
5585 if (in
->client_snap_caps
.empty()) {
5586 for (int i
= 0; i
< num_cinode_locks
; i
++) {
5587 int lockid
= cinode_lock_info
[i
].lock
;
5588 SimpleLock
*lock
= in
->get_lock(lockid
);
5591 lock
->set_state(LOCK_SNAP_SYNC
);
5592 lock
->get_wrlock(true);
5595 in
->client_snap_caps
.insert(client
);
5596 mds
->locker
->mark_need_snapflush_inode(in
);
5601 * choose lock states based on reconnected caps
5603 void MDCache::choose_lock_states_and_reconnect_caps()
5605 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl
;
5608 for (auto p
: inode_map
) {
5609 CInode
*in
= p
.second
;
5610 if (in
->last
!= CEPH_NOSNAP
)
5613 if (in
->is_auth() && !in
->is_base() && in
->inode
.is_dirty_rstat())
5614 in
->mark_dirty_rstat();
5617 auto q
= reconnected_caps
.find(in
->ino());
5618 if (q
!= reconnected_caps
.end()) {
5619 for (const auto &it
: q
->second
)
5620 dirty_caps
|= it
.second
.dirty_caps
;
5622 in
->choose_lock_states(dirty_caps
);
5623 dout(15) << " chose lock states on " << *in
<< dendl
;
5625 if (in
->snaprealm
&& !rejoin_pending_snaprealms
.count(in
)) {
5626 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5627 rejoin_pending_snaprealms
.insert(in
);
5630 if (!(++count
% 1000))
5631 mds
->heartbeat_reset();
5635 void MDCache::prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
5636 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5638 ref_t
<MClientSnap
> snap
;
5639 auto it
= splits
.find(client
);
5640 if (it
!= splits
.end()) {
5642 snap
->head
.op
= CEPH_SNAP_OP_SPLIT
;
5644 snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5645 splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
5646 snap
->head
.split
= realm
->inode
->ino();
5647 snap
->bl
= realm
->get_snap_trace();
5649 for (const auto& child
: realm
->open_children
)
5650 snap
->split_realms
.push_back(child
->inode
->ino());
5652 snap
->split_inos
.push_back(ino
);
5655 void MDCache::prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
,
5656 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5658 ceph_assert(parent_realm
);
5660 vector
<inodeno_t
> split_inos
;
5661 vector
<inodeno_t
> split_realms
;
5663 for (elist
<CInode
*>::iterator p
= realm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
5666 split_inos
.push_back((*p
)->ino());
5667 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
5668 p
!= realm
->open_children
.end();
5670 split_realms
.push_back((*p
)->inode
->ino());
5672 for (const auto& p
: realm
->client_caps
) {
5673 ceph_assert(!p
.second
->empty());
5674 auto em
= splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple());
5676 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5677 update
->head
.split
= parent_realm
->inode
->ino();
5678 update
->split_inos
= split_inos
;
5679 update
->split_realms
= split_realms
;
5680 update
->bl
= parent_realm
->get_snap_trace();
5681 em
.first
->second
= std::move(update
);
5686 void MDCache::send_snaps(map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5688 dout(10) << "send_snaps" << dendl
;
5690 for (auto &p
: splits
) {
5691 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
.first
.v
));
5693 dout(10) << " client." << p
.first
5694 << " split " << p
.second
->head
.split
5695 << " inos " << p
.second
->split_inos
5697 mds
->send_message_client_counted(p
.second
, session
);
5699 dout(10) << " no session for client." << p
.first
<< dendl
;
5707 * remove any items from logsegment open_file lists that don't have
5710 void MDCache::clean_open_file_lists()
5712 dout(10) << "clean_open_file_lists" << dendl
;
5714 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
5715 p
!= mds
->mdlog
->segments
.end();
5717 LogSegment
*ls
= p
->second
;
5719 elist
<CInode
*>::iterator q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5723 if (in
->last
== CEPH_NOSNAP
) {
5724 dout(10) << " unlisting unwanted/capless inode " << *in
<< dendl
;
5725 in
->item_open_file
.remove_myself();
5727 if (in
->client_snap_caps
.empty()) {
5728 dout(10) << " unlisting flushed snap inode " << *in
<< dendl
;
5729 in
->item_open_file
.remove_myself();
5736 void MDCache::dump_openfiles(Formatter
*f
)
5738 f
->open_array_section("openfiles");
5739 for (auto p
= mds
->mdlog
->segments
.begin();
5740 p
!= mds
->mdlog
->segments
.end();
5742 LogSegment
*ls
= p
->second
;
5744 auto q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5748 if ((in
->last
== CEPH_NOSNAP
&& !in
->is_any_caps_wanted())
5749 || (in
->last
!= CEPH_NOSNAP
&& in
->client_snap_caps
.empty()))
5751 f
->open_object_section("file");
5752 in
->dump(f
, CInode::DUMP_PATH
| CInode::DUMP_INODE_STORE_BASE
| CInode::DUMP_CAPS
);
5759 Capability
* MDCache::rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
)
5761 dout(10) << "rejoin_import_cap for client." << client
<< " from mds." << frommds
5762 << " on " << *in
<< dendl
;
5763 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
5765 dout(10) << " no session for client." << client
<< dendl
;
5769 Capability
*cap
= in
->reconnect_cap(client
, icr
, session
);
5772 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5774 do_cap_import(session
, in
, cap
, icr
.capinfo
.cap_id
, 0, 0, frommds
, 0);
5780 void MDCache::export_remaining_imported_caps()
5782 dout(10) << "export_remaining_imported_caps" << dendl
;
5784 stringstream warn_str
;
5787 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5788 warn_str
<< " ino " << p
->first
<< "\n";
5789 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5790 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5792 // mark client caps stale.
5793 auto stale
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
,
5795 mds
->get_osd_epoch_barrier());
5796 stale
->set_cap_peer(0, 0, 0, -1, 0);
5797 mds
->send_message_client_counted(stale
, q
->first
);
5801 if (!(++count
% 1000))
5802 mds
->heartbeat_reset();
5805 for (map
<inodeno_t
, MDSContext::vec
>::iterator p
= cap_reconnect_waiters
.begin();
5806 p
!= cap_reconnect_waiters
.end();
5808 mds
->queue_waiters(p
->second
);
5810 cap_imports
.clear();
5811 cap_reconnect_waiters
.clear();
5813 if (warn_str
.peek() != EOF
) {
5814 mds
->clog
->warn() << "failed to reconnect caps for missing inodes:";
5815 mds
->clog
->warn(warn_str
);
5819 Capability
* MDCache::try_reconnect_cap(CInode
*in
, Session
*session
)
5821 client_t client
= session
->info
.get_client();
5822 Capability
*cap
= nullptr;
5823 const cap_reconnect_t
*rc
= get_replay_cap_reconnect(in
->ino(), client
);
5825 cap
= in
->reconnect_cap(client
, *rc
, session
);
5826 dout(10) << "try_reconnect_cap client." << client
5827 << " reconnect wanted " << ccap_string(rc
->capinfo
.wanted
)
5828 << " issue " << ccap_string(rc
->capinfo
.issued
)
5829 << " on " << *in
<< dendl
;
5830 remove_replay_cap_reconnect(in
->ino(), client
);
5832 if (in
->is_replicated()) {
5833 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
5836 auto p
= reconnected_caps
.find(in
->ino());
5837 if (p
!= reconnected_caps
.end()) {
5838 auto q
= p
->second
.find(client
);
5839 if (q
!= p
->second
.end())
5840 dirty_caps
= q
->second
.dirty_caps
;
5842 in
->choose_lock_states(dirty_caps
);
5843 dout(15) << " chose lock states on " << *in
<< dendl
;
5846 map
<inodeno_t
, MDSContext::vec
>::iterator it
=
5847 cap_reconnect_waiters
.find(in
->ino());
5848 if (it
!= cap_reconnect_waiters
.end()) {
5849 mds
->queue_waiters(it
->second
);
5850 cap_reconnect_waiters
.erase(it
);
5859 // cap imports and delayed snap parent opens
5861 void MDCache::do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
5862 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
5863 int peer
, int p_flags
)
5865 SnapRealm
*realm
= in
->find_snaprealm();
5866 if (realm
->have_past_parents_open()) {
5867 dout(10) << "do_cap_import " << session
->info
.inst
.name
<< " mseq " << cap
->get_mseq() << " on " << *in
<< dendl
;
5868 if (cap
->get_last_seq() == 0) // reconnected cap
5869 cap
->inc_last_seq();
5870 cap
->set_last_issue();
5871 cap
->set_last_issue_stamp(ceph_clock_now());
5873 auto reap
= make_message
<MClientCaps
>(
5874 CEPH_CAP_OP_IMPORT
, in
->ino(), realm
->inode
->ino(), cap
->get_cap_id(),
5875 cap
->get_last_seq(), cap
->pending(), cap
->wanted(), 0, cap
->get_mseq(),
5876 mds
->get_osd_epoch_barrier());
5877 in
->encode_cap_message(reap
, cap
);
5878 reap
->snapbl
= realm
->get_snap_trace();
5879 reap
->set_cap_peer(p_cap_id
, p_seq
, p_mseq
, peer
, p_flags
);
5880 mds
->send_message_client_counted(reap
, session
);
/*
 * Process cap imports that were deferred; in the current flow every delayed
 * import must already have been drained by the time this runs, so the body
 * reduces to asserting the queue is empty.
 */
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  ceph_assert(delayed_imported_caps.empty());
}
/*
 * Gather-completion context: re-enter MDCache::open_snaprealms() once all
 * outstanding past-parent opens queued by a previous pass have finished.
 */
struct C_MDC_OpenSnapRealms : public MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};
5900 void MDCache::open_snaprealms()
5902 dout(10) << "open_snaprealms" << dendl
;
5904 MDSGatherBuilder
gather(g_ceph_context
);
5906 auto it
= rejoin_pending_snaprealms
.begin();
5907 while (it
!= rejoin_pending_snaprealms
.end()) {
5909 SnapRealm
*realm
= in
->snaprealm
;
5911 if (realm
->have_past_parents_open() ||
5912 realm
->open_parents(gather
.new_sub())) {
5913 dout(10) << " past parents now open on " << *in
<< dendl
;
5915 map
<client_t
,ref_t
<MClientSnap
>> splits
;
5916 // finish off client snaprealm reconnects?
5917 map
<inodeno_t
,map
<client_t
,snapid_t
> >::iterator q
= reconnected_snaprealms
.find(in
->ino());
5918 if (q
!= reconnected_snaprealms
.end()) {
5919 for (const auto& r
: q
->second
)
5920 finish_snaprealm_reconnect(r
.first
, realm
, r
.second
, splits
);
5921 reconnected_snaprealms
.erase(q
);
5924 for (elist
<CInode
*>::iterator p
= realm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
5927 auto q
= reconnected_caps
.find(child
->ino());
5928 ceph_assert(q
!= reconnected_caps
.end());
5929 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5930 Capability
*cap
= child
->get_client_cap(r
->first
);
5933 if (r
->second
.snap_follows
> 0) {
5934 if (r
->second
.snap_follows
< child
->first
- 1) {
5935 rebuild_need_snapflush(child
, realm
, r
->first
, r
->second
.snap_follows
);
5936 } else if (r
->second
.snapflush
) {
5937 // When processing a cap flush message that is re-sent, it's possble
5938 // that the sender has already released all WR caps. So we should
5939 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5940 cap
->mark_needsnapflush();
5943 // make sure client's cap is in the correct snaprealm.
5944 if (r
->second
.realm_ino
!= in
->ino()) {
5945 prepare_realm_split(realm
, r
->first
, child
->ino(), splits
);
5950 rejoin_pending_snaprealms
.erase(it
++);
5951 in
->put(CInode::PIN_OPENINGSNAPPARENTS
);
5955 dout(10) << " opening past parents on " << *in
<< dendl
;
5960 if (gather
.has_subs()) {
5961 if (gather
.num_subs_remaining() == 0) {
5963 gather
.set_finisher(new C_MDSInternalNoop
);
5966 // for multimds, must succeed the first time
5967 ceph_assert(recovery_set
.empty());
5969 dout(10) << "open_snaprealms - waiting for "
5970 << gather
.num_subs_remaining() << dendl
;
5971 gather
.set_finisher(new C_MDC_OpenSnapRealms(this));
5977 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE
);
5979 if (!reconnected_snaprealms
.empty()) {
5980 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl
;
5981 for (auto& p
: reconnected_snaprealms
) {
5982 stringstream warn_str
;
5983 warn_str
<< " " << p
.first
<< " {";
5985 for (auto& q
: p
.second
) {
5988 warn_str
<< "client." << q
.first
<< "/" << q
.second
;
5991 dout(5) << warn_str
.str() << dendl
;
5994 ceph_assert(rejoin_waiters
.empty());
5995 ceph_assert(rejoin_pending_snaprealms
.empty());
5996 dout(10) << "open_snaprealms - all open" << dendl
;
5997 do_delayed_cap_imports();
5999 ceph_assert(rejoin_done
);
6000 rejoin_done
.release()->complete(0);
6001 reconnected_caps
.clear();
6004 bool MDCache::open_undef_inodes_dirfrags()
6006 dout(10) << "open_undef_inodes_dirfrags "
6007 << rejoin_undef_inodes
.size() << " inodes "
6008 << rejoin_undef_dirfrags
.size() << " dirfrags" << dendl
;
6010 set
<CDir
*> fetch_queue
= rejoin_undef_dirfrags
;
6012 for (set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
6013 p
!= rejoin_undef_inodes
.end();
6016 ceph_assert(!in
->is_base());
6017 fetch_queue
.insert(in
->get_parent_dir());
6020 if (fetch_queue
.empty())
6023 MDSGatherBuilder
gather(g_ceph_context
,
6024 new MDSInternalContextWrapper(mds
,
6025 new LambdaContext([this](int r
) {
6026 if (rejoin_gather
.empty())
6027 rejoin_gather_finish();
6032 for (set
<CDir
*>::iterator p
= fetch_queue
.begin();
6033 p
!= fetch_queue
.end();
6036 CInode
*diri
= dir
->get_inode();
6037 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
6039 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
6040 ceph_assert(diri
->dirfragtree
.is_leaf(dir
->get_frag()));
6041 dir
->fetch(gather
.new_sub());
6043 ceph_assert(gather
.has_subs());
6048 void MDCache::opened_undef_inode(CInode
*in
) {
6049 dout(10) << "opened_undef_inode " << *in
<< dendl
;
6050 rejoin_undef_inodes
.erase(in
);
6052 // FIXME: re-hash dentries if necessary
6053 ceph_assert(in
->inode
.dir_layout
.dl_dir_hash
== g_conf()->mds_default_dir_hash
);
6054 if (in
->get_num_dirfrags() && !in
->dirfragtree
.is_leaf(frag_t())) {
6055 CDir
*dir
= in
->get_dirfrag(frag_t());
6057 rejoin_undef_dirfrags
.erase(dir
);
6058 in
->force_dirfrags();
6059 auto&& ls
= in
->get_dirfrags();
6060 for (const auto& dir
: ls
) {
6061 rejoin_undef_dirfrags
.insert(dir
);
6067 void MDCache::finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
6068 map
<client_t
,ref_t
<MClientSnap
>>& updates
)
6070 if (seq
< realm
->get_newest_seq()) {
6071 dout(10) << "finish_snaprealm_reconnect client." << client
<< " has old seq " << seq
<< " < "
6072 << realm
->get_newest_seq() << " on " << *realm
<< dendl
;
6073 auto snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_UPDATE
);
6074 snap
->bl
= realm
->get_snap_trace();
6075 for (const auto& child
: realm
->open_children
)
6076 snap
->split_realms
.push_back(child
->inode
->ino());
6077 updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
6079 dout(10) << "finish_snaprealm_reconnect client." << client
<< " up to date"
6080 << " on " << *realm
<< dendl
;
6086 void MDCache::rejoin_send_acks()
6088 dout(7) << "rejoin_send_acks" << dendl
;
6091 for (map
<mds_rank_t
, set
<CInode
*> >::iterator p
= rejoin_unlinked_inodes
.begin();
6092 p
!= rejoin_unlinked_inodes
.end();
6094 for (set
<CInode
*>::iterator q
= p
->second
.begin();
6095 q
!= p
->second
.end();
6098 dout(7) << " unlinked inode " << *in
<< dendl
;
6100 if (!in
->is_replica(p
->first
))
6103 CDentry
*dn
= in
->get_parent_dn();
6104 if (dn
->is_replica(p
->first
))
6106 dn
->add_replica(p
->first
);
6107 CDir
*dir
= dn
->get_dir();
6108 if (dir
->is_replica(p
->first
))
6110 dir
->add_replica(p
->first
);
6111 in
= dir
->get_inode();
6112 if (in
->is_replica(p
->first
))
6114 in
->add_replica(p
->first
);
6120 rejoin_unlinked_inodes
.clear();
6122 // send acks to everyone in the recovery set
6123 map
<mds_rank_t
,ref_t
<MMDSCacheRejoin
>> acks
;
6124 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
6125 p
!= recovery_set
.end();
6127 if (rejoin_ack_sent
.count(*p
))
6129 acks
[*p
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
6132 rejoin_ack_sent
= recovery_set
;
6135 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
6136 p
!= subtrees
.end();
6138 CDir
*dir
= p
->first
;
6139 if (!dir
->is_auth())
6141 dout(10) << "subtree " << *dir
<< dendl
;
6143 // auth items in this subtree
6144 std::queue
<CDir
*> dq
;
6147 while (!dq
.empty()) {
6148 CDir
*dir
= dq
.front();
6152 for (auto &r
: dir
->get_replicas()) {
6153 auto it
= acks
.find(r
.first
);
6154 if (it
== acks
.end())
6156 it
->second
->add_strong_dirfrag(dir
->dirfrag(), ++r
.second
, dir
->dir_rep
);
6157 it
->second
->add_dirfrag_base(dir
);
6160 for (auto &p
: dir
->items
) {
6161 CDentry
*dn
= p
.second
;
6162 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6166 if (dnl
->is_primary())
6167 in
= dnl
->get_inode();
6170 for (auto &r
: dn
->get_replicas()) {
6171 auto it
= acks
.find(r
.first
);
6172 if (it
== acks
.end())
6174 it
->second
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
6175 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
6176 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
6177 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
6179 dn
->lock
.get_replica_state());
6180 // peer missed MDentrylink message ?
6181 if (in
&& !in
->is_replica(r
.first
))
6182 in
->add_replica(r
.first
);
6188 for (auto &r
: in
->get_replicas()) {
6189 auto it
= acks
.find(r
.first
);
6190 if (it
== acks
.end())
6192 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6194 in
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6195 it
->second
->add_inode_locks(in
, ++r
.second
, bl
);
6198 // subdirs in this subtree?
6200 auto&& dirs
= in
->get_nested_dirfrags();
6201 for (const auto& dir
: dirs
) {
6210 if (root
&& root
->is_auth())
6211 for (auto &r
: root
->get_replicas()) {
6212 auto it
= acks
.find(r
.first
);
6213 if (it
== acks
.end())
6215 it
->second
->add_inode_base(root
, mds
->mdsmap
->get_up_features());
6217 root
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6218 it
->second
->add_inode_locks(root
, ++r
.second
, bl
);
6221 for (auto &r
: myin
->get_replicas()) {
6222 auto it
= acks
.find(r
.first
);
6223 if (it
== acks
.end())
6225 it
->second
->add_inode_base(myin
, mds
->mdsmap
->get_up_features());
6227 myin
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6228 it
->second
->add_inode_locks(myin
, ++r
.second
, bl
);
6231 // include inode base for any inodes whose scatterlocks may have updated
6232 for (set
<CInode
*>::iterator p
= rejoin_potential_updated_scatterlocks
.begin();
6233 p
!= rejoin_potential_updated_scatterlocks
.end();
6236 for (const auto &r
: in
->get_replicas()) {
6237 auto it
= acks
.find(r
.first
);
6238 if (it
== acks
.end())
6240 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6245 for (auto p
= acks
.begin(); p
!= acks
.end(); ++p
) {
6246 encode(rejoin_imported_caps
[p
->first
], p
->second
->imported_caps
);
6247 mds
->send_message_mds(p
->second
, p
->first
);
6250 rejoin_imported_caps
.clear();
6253 class C_MDC_ReIssueCaps
: public MDCacheContext
{
6256 C_MDC_ReIssueCaps(MDCache
*mdc
, CInode
*i
) :
6257 MDCacheContext(mdc
), in(i
)
6259 in
->get(CInode::PIN_PTRWAITER
);
6261 void finish(int r
) override
{
6262 if (!mdcache
->mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6263 mdcache
->mds
->locker
->issue_caps(in
);
6264 in
->put(CInode::PIN_PTRWAITER
);
6268 void MDCache::reissue_all_caps()
6270 dout(10) << "reissue_all_caps" << dendl
;
6273 for (auto &p
: inode_map
) {
6275 CInode
*in
= p
.second
;
6276 if (in
->is_head() && in
->is_any_caps()) {
6277 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6278 if (in
->is_frozen_inode()) {
6279 in
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDC_ReIssueCaps(this, in
));
6282 if (!mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6283 n
+= mds
->locker
->issue_caps(in
);
6286 if ((count
% 1000) + n
>= 1000)
6287 mds
->heartbeat_reset();
6293 // ===============================================================================
6295 struct C_MDC_QueuedCow
: public MDCacheContext
{
6298 C_MDC_QueuedCow(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) :
6299 MDCacheContext(mdc
), in(i
), mut(m
) {}
6300 void finish(int r
) override
{
6301 mdcache
->_queued_file_recover_cow(in
, mut
);
6306 void MDCache::queue_file_recover(CInode
*in
)
6308 dout(10) << "queue_file_recover " << *in
<< dendl
;
6309 ceph_assert(in
->is_auth());
6313 SnapRealm *realm = in->find_snaprealm();
6314 set<snapid_t> s = realm->get_snaps();
6315 while (!s.empty() && *s.begin() < in->first)
6317 while (!s.empty() && *s.rbegin() > in->last)
6318 s.erase(*s.rbegin());
6319 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6321 CInode::mempool_inode pi = in->project_inode();
6322 pi->version = in->pre_dirty();
6324 auto mut(std::make_shared<MutationImpl>());
6325 mut->ls = mds->mdlog->get_current_segment();
6326 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6327 mds->mdlog->start_entry(le);
6328 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6330 s.erase(*s.begin());
6331 while (!s.empty()) {
6332 snapid_t snapid = *s.begin();
6333 CInode *cow_inode = 0;
6334 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6335 ceph_assert(cow_inode);
6336 recovery_queue.enqueue(cow_inode);
6337 s.erase(*s.begin());
6340 in->parent->first = in->first;
6341 le->metablob.add_primary_dentry(in->parent, in, true);
6342 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6343 mds->mdlog->flush();
6347 recovery_queue
.enqueue(in
);
6350 void MDCache::_queued_file_recover_cow(CInode
*in
, MutationRef
& mut
)
6352 in
->pop_and_dirty_projected_inode(mut
->ls
);
6354 mds
->locker
->drop_locks(mut
.get());
6360 * called after recovery to recover file sizes for previously opened (for write)
6361 * files. that is, those where max_size > size.
6363 void MDCache::identify_files_to_recover()
6365 dout(10) << "identify_files_to_recover" << dendl
;
6367 for (auto &p
: inode_map
) {
6368 CInode
*in
= p
.second
;
6372 if (in
->last
!= CEPH_NOSNAP
)
6375 // Only normal files need file size recovery
6376 if (!in
->is_file()) {
6380 bool recover
= false;
6381 for (map
<client_t
,client_writeable_range_t
>::iterator p
= in
->inode
.client_ranges
.begin();
6382 p
!= in
->inode
.client_ranges
.end();
6384 Capability
*cap
= in
->get_client_cap(p
->first
);
6386 cap
->mark_clientwriteable();
6388 dout(10) << " client." << p
->first
<< " has range " << p
->second
<< " but no cap on " << *in
<< dendl
;
6395 if (in
->filelock
.is_stable()) {
6396 in
->auth_pin(&in
->filelock
);
6398 ceph_assert(in
->filelock
.get_state() == LOCK_XLOCKSNAP
);
6400 in
->filelock
.set_state(LOCK_PRE_SCAN
);
6401 rejoin_recover_q
.push_back(in
);
6403 rejoin_check_q
.push_back(in
);
6406 if (!(++count
% 1000))
6407 mds
->heartbeat_reset();
6411 void MDCache::start_files_to_recover()
6413 for (CInode
*in
: rejoin_check_q
) {
6414 if (in
->filelock
.get_state() == LOCK_XLOCKSNAP
)
6415 mds
->locker
->issue_caps(in
);
6416 mds
->locker
->check_inode_max_size(in
);
6418 rejoin_check_q
.clear();
6419 for (CInode
*in
: rejoin_recover_q
) {
6420 mds
->locker
->file_recover(&in
->filelock
);
6422 if (!rejoin_recover_q
.empty()) {
6423 rejoin_recover_q
.clear();
/*
 * Advance the file recovery queue; the actual size/mtime probes run
 * asynchronously from the queue.
 */
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6433 // ===============================================================================
6436 // ----------------------------
6439 class C_MDC_RetryTruncate
: public MDCacheContext
{
6443 C_MDC_RetryTruncate(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6444 MDCacheContext(c
), in(i
), ls(l
) {}
6445 void finish(int r
) override
{
6446 mdcache
->_truncate_inode(in
, ls
);
6450 void MDCache::truncate_inode(CInode
*in
, LogSegment
*ls
)
6452 auto pi
= in
->get_projected_inode();
6453 dout(10) << "truncate_inode "
6454 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6458 ls
->truncating_inodes
.insert(in
);
6459 in
->get(CInode::PIN_TRUNCATING
);
6462 if (!in
->client_need_snapflush
.empty() &&
6463 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6464 ceph_assert(in
->filelock
.is_xlocked());
6465 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6466 mds
->locker
->issue_caps(in
);
6470 _truncate_inode(in
, ls
);
6473 struct C_IO_MDC_TruncateFinish
: public MDCacheIOContext
{
6476 C_IO_MDC_TruncateFinish(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6477 MDCacheIOContext(c
, false), in(i
), ls(l
) {
6479 void finish(int r
) override
{
6480 ceph_assert(r
== 0 || r
== -ENOENT
);
6481 mdcache
->truncate_inode_finish(in
, ls
);
6483 void print(ostream
& out
) const override
{
6484 out
<< "file_truncate(" << in
->ino() << ")";
6488 void MDCache::_truncate_inode(CInode
*in
, LogSegment
*ls
)
6490 auto pi
= &in
->inode
;
6491 dout(10) << "_truncate_inode "
6492 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6493 << " on " << *in
<< dendl
;
6495 ceph_assert(pi
->is_truncating());
6496 ceph_assert(pi
->truncate_size
< (1ULL << 63));
6497 ceph_assert(pi
->truncate_from
< (1ULL << 63));
6498 ceph_assert(pi
->truncate_size
< pi
->truncate_from
);
6501 SnapRealm
*realm
= in
->find_snaprealm();
6502 SnapContext nullsnap
;
6503 const SnapContext
*snapc
;
6505 dout(10) << " realm " << *realm
<< dendl
;
6506 snapc
= &realm
->get_snap_context();
6508 dout(10) << " NO realm, using null context" << dendl
;
6510 ceph_assert(in
->last
== CEPH_NOSNAP
);
6512 dout(10) << "_truncate_inode snapc " << snapc
<< " on " << *in
<< dendl
;
6513 filer
.truncate(in
->inode
.ino
, &in
->inode
.layout
, *snapc
,
6514 pi
->truncate_size
, pi
->truncate_from
-pi
->truncate_size
,
6515 pi
->truncate_seq
, ceph::real_time::min(), 0,
6516 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in
, ls
),
6520 struct C_MDC_TruncateLogged
: public MDCacheLogContext
{
6523 C_MDC_TruncateLogged(MDCache
*m
, CInode
*i
, MutationRef
& mu
) :
6524 MDCacheLogContext(m
), in(i
), mut(mu
) {}
6525 void finish(int r
) override
{
6526 mdcache
->truncate_inode_logged(in
, mut
);
6530 void MDCache::truncate_inode_finish(CInode
*in
, LogSegment
*ls
)
6532 dout(10) << "truncate_inode_finish " << *in
<< dendl
;
6534 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6535 ceph_assert(p
!= ls
->truncating_inodes
.end());
6536 ls
->truncating_inodes
.erase(p
);
6539 auto &pi
= in
->project_inode();
6540 pi
.inode
.version
= in
->pre_dirty();
6541 pi
.inode
.truncate_from
= 0;
6542 pi
.inode
.truncate_pending
--;
6544 MutationRef
mut(new MutationImpl());
6545 mut
->ls
= mds
->mdlog
->get_current_segment();
6546 mut
->add_projected_inode(in
);
6548 EUpdate
*le
= new EUpdate(mds
->mdlog
, "truncate finish");
6549 mds
->mdlog
->start_entry(le
);
6550 CDentry
*dn
= in
->get_projected_parent_dn();
6551 le
->metablob
.add_dir_context(dn
->get_dir());
6552 le
->metablob
.add_primary_dentry(dn
, in
, true);
6553 le
->metablob
.add_truncate_finish(in
->ino(), ls
->seq
);
6555 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
6556 mds
->mdlog
->submit_entry(le
, new C_MDC_TruncateLogged(this, in
, mut
));
6558 // flush immediately if there are readers/writers waiting
6559 if (in
->is_waiter_for(CInode::WAIT_TRUNC
) ||
6560 (in
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
6561 mds
->mdlog
->flush();
6564 void MDCache::truncate_inode_logged(CInode
*in
, MutationRef
& mut
)
6566 dout(10) << "truncate_inode_logged " << *in
<< dendl
;
6568 mds
->locker
->drop_locks(mut
.get());
6571 in
->put(CInode::PIN_TRUNCATING
);
6572 in
->auth_unpin(this);
6574 MDSContext::vec waiters
;
6575 in
->take_waiting(CInode::WAIT_TRUNC
, waiters
);
6576 mds
->queue_waiters(waiters
);
6580 void MDCache::add_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6582 dout(20) << "add_recovered_truncate " << *in
<< " in log segment "
6583 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6584 ls
->truncating_inodes
.insert(in
);
6585 in
->get(CInode::PIN_TRUNCATING
);
6588 void MDCache::remove_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6590 dout(20) << "remove_recovered_truncate " << *in
<< " in log segment "
6591 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6592 // if we have the logseg the truncate started in, it must be in our list.
6593 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6594 ceph_assert(p
!= ls
->truncating_inodes
.end());
6595 ls
->truncating_inodes
.erase(p
);
6596 in
->put(CInode::PIN_TRUNCATING
);
6599 void MDCache::start_recovered_truncates()
6601 dout(10) << "start_recovered_truncates" << dendl
;
6602 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
6603 p
!= mds
->mdlog
->segments
.end();
6605 LogSegment
*ls
= p
->second
;
6606 for (set
<CInode
*>::iterator q
= ls
->truncating_inodes
.begin();
6607 q
!= ls
->truncating_inodes
.end();
6612 if (!in
->client_need_snapflush
.empty() &&
6613 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6614 ceph_assert(in
->filelock
.is_stable());
6615 in
->filelock
.set_state(LOCK_XLOCKDONE
);
6616 in
->auth_pin(&in
->filelock
);
6617 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6618 // start_files_to_recover will revoke caps
6621 _truncate_inode(in
, ls
);
6627 class C_MDS_purge_completed_finish
: public MDCacheLogContext
{
6628 interval_set
<inodeno_t
> inos
;
6629 version_t inotablev
;
6632 C_MDS_purge_completed_finish(MDCache
*m
,
6633 interval_set
<inodeno_t
> i
,
6636 : MDCacheLogContext(m
),
6640 void finish(int r
) override
{
6643 ls
->purge_inodes_finish(inos
);
6644 mdcache
->mds
->inotable
->apply_release_ids(inos
);
6645 assert(mdcache
->mds
->inotable
->get_version() == inotablev
);
6650 void MDCache::start_purge_inodes(){
6651 dout(10) << "start_purge_inodes" << dendl
;
6652 for (auto& p
: mds
->mdlog
->segments
){
6653 LogSegment
*ls
= p
.second
;
6654 if (ls
->purge_inodes
.size()){
6655 purge_inodes(ls
->purge_inodes
, ls
);
6660 void MDCache::purge_inodes(const interval_set
<inodeno_t
>& inos
, LogSegment
*ls
)
6662 auto cb
= new LambdaContext([this, inos
, ls
](int r
){
6663 assert(r
== 0 || r
== -2);
6664 mds
->inotable
->project_release_ids(inos
);
6665 version_t piv
= mds
->inotable
->get_projected_version();
6667 mds
->mdlog
->start_submit_entry(new EPurged(inos
, piv
, ls
->seq
),
6668 new C_MDS_purge_completed_finish(this, inos
, piv
, ls
));
6669 mds
->mdlog
->flush();
6672 dout(10) << __func__
<< " start purge data : " << inos
<< dendl
;
6673 C_GatherBuilder
gather(g_ceph_context
,
6674 new C_OnFinisher( new MDSIOContextWrapper(mds
, cb
), mds
->finisher
));
6675 SnapContext nullsnapc
;
6676 uint64_t num
= Striper::get_num_objects(default_file_layout
, default_file_layout
.get_period());
6677 for (auto p
= inos
.begin();
6680 dout(10) << __func__
6681 << " prealloc_inos : " << inos
.size()
6682 << " start : " << p
.get_start().val
6683 << " length : " << p
.get_len() << " "
6684 << " seq : " << ls
->seq
<< dendl
;
6686 for (_inodeno_t i
= 0; i
< p
.get_len(); i
++){
6687 dout(20) << __func__
<< " : " << p
.get_start() + i
<< dendl
;
6688 filer
.purge_range(p
.get_start() + i
,
6689 &default_file_layout
,
6692 ceph::real_clock::now(),
6693 0, gather
.new_sub());
6699 // ================================================================================
6702 std::pair
<bool, uint64_t> MDCache::trim_lru(uint64_t count
, expiremap
& expiremap
)
6704 bool is_standby_replay
= mds
->is_standby_replay();
6705 std::vector
<CDentry
*> unexpirables
;
6706 uint64_t trimmed
= 0;
6708 auto trim_threshold
= g_conf().get_val
<Option::size_t>("mds_cache_trim_threshold");
6710 dout(7) << "trim_lru trimming " << count
6711 << " items from LRU"
6712 << " size=" << lru
.lru_get_size()
6713 << " mid=" << lru
.lru_get_top()
6714 << " pintail=" << lru
.lru_get_pintail()
6715 << " pinned=" << lru
.lru_get_num_pinned()
6718 const uint64_t trim_counter_start
= trim_counter
.get();
6719 bool throttled
= false;
6721 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6722 if (throttled
) break;
6723 CDentry
*dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
6726 if (trim_dentry(dn
, expiremap
)) {
6727 unexpirables
.push_back(dn
);
6733 for (auto &dn
: unexpirables
) {
6734 bottom_lru
.lru_insert_mid(dn
);
6736 unexpirables
.clear();
6738 // trim dentries from the LRU until count is reached
6739 // if mds is in standbyreplay and will trim all inodes which aren't in segments
6740 while (!throttled
&& (cache_toofull() || count
> 0 || is_standby_replay
)) {
6741 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6742 if (throttled
) break;
6743 CDentry
*dn
= static_cast<CDentry
*>(lru
.lru_expire());
6747 if ((is_standby_replay
&& dn
->get_linkage()->inode
&&
6748 dn
->get_linkage()->inode
->item_open_file
.is_on_list())) {
6749 // we move the inodes that need to be trimmed to the end of the lru queue.
6750 // refer to MDCache::standby_trim_segment
6751 lru
.lru_insert_bot(dn
);
6753 } else if (trim_dentry(dn
, expiremap
)) {
6754 unexpirables
.push_back(dn
);
6757 if (count
> 0) count
--;
6760 trim_counter
.hit(trimmed
);
6762 for (auto &dn
: unexpirables
) {
6763 lru
.lru_insert_mid(dn
);
6765 unexpirables
.clear();
6767 dout(7) << "trim_lru trimmed " << trimmed
<< " items" << dendl
;
6768 return std::pair
<bool, uint64_t>(throttled
, trimmed
);
6772 * note: only called while MDS is active or stopping... NOT during recovery.
6773 * however, we may expire a replica whose authority is recovering.
6775 * @param count is number of dentries to try to expire
6777 std::pair
<bool, uint64_t> MDCache::trim(uint64_t count
)
6779 uint64_t used
= cache_size();
6780 uint64_t limit
= cache_memory_limit
;
6781 expiremap expiremap
;
6783 dout(7) << "trim bytes_used=" << bytes2str(used
)
6784 << " limit=" << bytes2str(limit
)
6785 << " reservation=" << cache_reservation
6786 << "% count=" << count
<< dendl
;
6788 // process delayed eval_stray()
6789 stray_manager
.advance_delayed();
6791 auto result
= trim_lru(count
, expiremap
);
6792 auto& trimmed
= result
.second
;
6794 // trim non-auth, non-bound subtrees
6795 for (auto p
= subtrees
.begin(); p
!= subtrees
.end();) {
6796 CDir
*dir
= p
->first
;
6798 CInode
*diri
= dir
->get_inode();
6799 if (dir
->is_auth()) {
6800 if (diri
->is_auth() && !diri
->is_base()) {
6801 /* this situation should correspond to an export pin */
6802 if (dir
->get_num_head_items() == 0 && dir
->get_num_ref() == 1) {
6803 /* pinned empty subtree, try to drop */
6804 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
6805 dout(20) << "trimming empty pinned subtree " << *dir
<< dendl
;
6806 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
6807 remove_subtree(dir
);
6808 diri
->close_dirfrag(dir
->dirfrag().frag
);
6811 } else if (!diri
->is_auth() && !diri
->is_base() && dir
->get_num_head_items() == 0) {
6812 if (dir
->state_test(CDir::STATE_EXPORTING
) ||
6813 !(mds
->is_active() || mds
->is_stopping()) ||
6814 dir
->is_freezing() || dir
->is_frozen())
6817 migrator
->export_empty_import(dir
);
6820 } else if (!diri
->is_auth() && dir
->get_num_ref() <= 1) {
6822 auto&& ls
= diri
->get_subtree_dirfrags();
6823 if (diri
->get_num_ref() > (int)ls
.size()) // only pinned by subtrees
6826 // don't trim subtree root if its auth MDS is recovering.
6827 // This simplify the cache rejoin code.
6828 if (dir
->is_subtree_root() && rejoin_ack_gather
.count(dir
->get_dir_auth().first
))
6830 trim_dirfrag(dir
, 0, expiremap
);
6836 if (mds
->is_stopping() && root
) {
6837 auto&& ls
= root
->get_dirfrags();
6838 for (const auto& dir
: ls
) {
6839 if (dir
->get_num_ref() == 1) { // subtree pin
6840 trim_dirfrag(dir
, 0, expiremap
);
6844 if (root
->get_num_ref() == 0) {
6845 trim_inode(0, root
, 0, expiremap
);
6850 std::set
<mds_rank_t
> stopping
;
6851 mds
->mdsmap
->get_mds_set(stopping
, MDSMap::STATE_STOPPING
);
6852 stopping
.erase(mds
->get_nodeid());
6853 for (auto rank
: stopping
) {
6854 CInode
* mdsdir_in
= get_inode(MDS_INO_MDSDIR(rank
));
6858 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(rank
), std::forward_as_tuple());
6860 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid());
6863 dout(20) << __func__
<< ": try expiring " << *mdsdir_in
<< " for stopping mds." << mds
<< dendl
;
6865 const bool aborted
= expire_recursive(mdsdir_in
, expiremap
);
6867 dout(20) << __func__
<< ": successfully expired mdsdir" << dendl
;
6868 auto&& ls
= mdsdir_in
->get_dirfrags();
6869 for (auto dir
: ls
) {
6870 if (dir
->get_num_ref() == 1) { // subtree pin
6871 trim_dirfrag(dir
, dir
, expiremap
);
6875 if (mdsdir_in
->get_num_ref() == 0) {
6876 trim_inode(NULL
, mdsdir_in
, NULL
, expiremap
);
6880 dout(20) << __func__
<< ": some unexpirable contents in mdsdir" << dendl
;
6884 // Other rank's base inodes (when I'm stopping)
6885 if (mds
->is_stopping()) {
6886 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
6887 p
!= base_inodes
.end();) {
6888 CInode
*base_in
= *p
;
6890 if (MDS_INO_IS_MDSDIR(base_in
->ino()) &&
6891 MDS_INO_MDSDIR_OWNER(base_in
->ino()) != mds
->get_nodeid()) {
6892 dout(20) << __func__
<< ": maybe trimming base: " << *base_in
<< dendl
;
6893 if (base_in
->get_num_ref() == 0) {
6894 trim_inode(NULL
, base_in
, NULL
, expiremap
);
6901 // send any expire messages
6902 send_expire_messages(expiremap
);
6907 void MDCache::send_expire_messages(expiremap
& expiremap
)
6910 for (const auto &p
: expiremap
) {
6911 if (mds
->is_cluster_degraded() &&
6912 (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
6913 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
6914 rejoin_sent
.count(p
.first
) == 0))) {
6917 dout(7) << "sending cache_expire to " << p
.first
<< dendl
;
6918 mds
->send_message_mds(p
.second
, p
.first
);
6924 bool MDCache::trim_dentry(CDentry
*dn
, expiremap
& expiremap
)
6926 dout(12) << "trim_dentry " << *dn
<< dendl
;
6928 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6930 CDir
*dir
= dn
->get_dir();
6933 CDir
*con
= get_subtree_root(dir
);
6935 dout(12) << " in container " << *con
<< dendl
;
6937 dout(12) << " no container; under a not-yet-linked dir" << dendl
;
6938 ceph_assert(dn
->is_auth());
6941 // If replica dentry is not readable, it's likely we will receive
6942 // MDentryLink/MDentryUnlink message soon (It's possible we first
6943 // receive a MDentryUnlink message, then MDentryLink message)
6944 // MDentryLink message only replicates an inode, so we should
6945 // avoid trimming the inode's parent dentry. This is because that
6946 // unconnected replicas are problematic for subtree migration.
6947 if (!dn
->is_auth() && !dn
->lock
.can_read(-1) &&
6948 !dn
->get_dir()->get_inode()->is_stray())
6951 // adjust the dir state
6952 // NOTE: we can safely remove a clean, null dentry without effecting
6953 // directory completeness.
6954 // (check this _before_ we unlink the inode, below!)
6955 bool clear_complete
= false;
6956 if (!(dnl
->is_null() && dn
->is_clean()))
6957 clear_complete
= true;
6959 // unlink the dentry
6960 if (dnl
->is_remote()) {
6962 dir
->unlink_inode(dn
, false);
6963 } else if (dnl
->is_primary()) {
6964 // expire the inode, too.
6965 CInode
*in
= dnl
->get_inode();
6967 if (trim_inode(dn
, in
, con
, expiremap
))
6968 return true; // purging stray instead of trimming
6970 ceph_assert(dnl
->is_null());
6973 if (!dn
->is_auth()) {
6974 // notify dentry authority.
6975 mds_authority_t auth
= dn
->authority();
6977 for (int p
=0; p
<2; p
++) {
6978 mds_rank_t a
= auth
.first
;
6979 if (p
) a
= auth
.second
;
6980 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6981 if (mds
->get_nodeid() == auth
.second
&&
6982 con
->is_importing()) break; // don't send any expire while importing.
6983 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6985 dout(12) << " sending expire to mds." << a
<< " on " << *dn
<< dendl
;
6986 ceph_assert(a
!= mds
->get_nodeid());
6987 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
6989 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid());
6990 em
.first
->second
->add_dentry(con
->dirfrag(), dir
->dirfrag(), dn
->get_name(), dn
->last
, dn
->get_replica_nonce());
6995 if (dn
->last
== CEPH_NOSNAP
&& dir
->is_auth())
6996 dir
->add_to_bloom(dn
);
6997 dir
->remove_dentry(dn
);
7000 dir
->state_clear(CDir::STATE_COMPLETE
);
7002 if (mds
->logger
) mds
->logger
->inc(l_mds_inodes_expired
);
7007 void MDCache::trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
)
7009 dout(15) << "trim_dirfrag " << *dir
<< dendl
;
7011 if (dir
->is_subtree_root()) {
7012 ceph_assert(!dir
->is_auth() ||
7013 (!dir
->is_replicated() && dir
->inode
->is_base()));
7014 remove_subtree(dir
); // remove from subtree map
7016 ceph_assert(dir
->get_num_ref() == 0);
7018 CInode
*in
= dir
->get_inode();
7020 if (!dir
->is_auth()) {
7021 mds_authority_t auth
= dir
->authority();
7023 // was this an auth delegation? (if so, slightly modified container)
7025 if (dir
->is_subtree_root()) {
7026 dout(12) << " subtree root, container is " << *dir
<< dendl
;
7028 condf
= dir
->dirfrag();
7030 condf
= con
->dirfrag();
7033 for (int p
=0; p
<2; p
++) {
7034 mds_rank_t a
= auth
.first
;
7035 if (p
) a
= auth
.second
;
7036 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
7037 if (mds
->get_nodeid() == auth
.second
&&
7038 con
->is_importing()) break; // don't send any expire while importing.
7039 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
7041 dout(12) << " sending expire to mds." << a
<< " on " << *dir
<< dendl
;
7042 ceph_assert(a
!= mds
->get_nodeid());
7043 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
7045 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid()); /* new */
7046 em
.first
->second
->add_dir(condf
, dir
->dirfrag(), dir
->replica_nonce
);
7050 in
->close_dirfrag(dir
->dirfrag().frag
);
7054 * Try trimming an inode from the cache
7056 * @return true if the inode is still in cache, else false if it was trimmed
7058 bool MDCache::trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
& expiremap
)
7060 dout(15) << "trim_inode " << *in
<< dendl
;
7061 ceph_assert(in
->get_num_ref() == 0);
7064 // If replica inode's dirfragtreelock is not readable, it's likely
7065 // some dirfrags of the inode are being fragmented and we will receive
7066 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7067 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
7068 // This is because that unconnected replicas are problematic for
7069 // subtree migration.
7071 if (!in
->is_auth() && !mds
->locker
->rdlock_try(&in
->dirfragtreelock
, -1)) {
7076 auto&& dfls
= in
->get_dirfrags();
7077 for (const auto& dir
: dfls
) {
7078 ceph_assert(!dir
->is_subtree_root());
7079 trim_dirfrag(dir
, con
? con
:dir
, expiremap
); // if no container (e.g. root dirfrag), use *p
7084 if (in
->is_auth()) {
7085 // eval stray after closing dirfrags
7086 if (dn
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
7087 maybe_eval_stray(in
);
7088 if (dn
->state_test(CDentry::STATE_PURGING
) || dn
->get_num_ref() > 0)
7092 mds_authority_t auth
= in
->authority();
7096 df
= con
->dirfrag();
7098 df
= dirfrag_t(0,frag_t()); // must be a root or stray inode.
7100 for (int p
=0; p
<2; p
++) {
7101 mds_rank_t a
= auth
.first
;
7102 if (p
) a
= auth
.second
;
7103 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
7104 if (con
&& mds
->get_nodeid() == auth
.second
&&
7105 con
->is_importing()) break; // don't send any expire while importing.
7106 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
7108 dout(12) << " sending expire to mds." << a
<< " on " << *in
<< dendl
;
7109 ceph_assert(a
!= mds
->get_nodeid());
7110 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
7112 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid()); /* new */
7113 em
.first
->second
->add_inode(df
, in
->vino(), in
->get_replica_nonce());
7118 if (in->is_auth()) {
7119 if (in->hack_accessed)
7120 mds->logger->inc("outt");
7122 mds->logger->inc("outut");
7123 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7130 dn
->get_dir()->unlink_inode(dn
, false);
7137 * trim_non_auth - remove any non-auth items from our cache
7139 * this reduces the amount of non-auth metadata in our cache, reducing the
7140 * load incurred by the rejoin phase.
7142 * the only non-auth items that remain are those that are needed to
7143 * attach our own subtrees to the root.
7145 * when we are done, all dentries will be in the top bit of the lru.
7147 * why we have to do this:
7148 * we may not have accurate linkage for non-auth items. which means we will
7149 * know which subtree it falls into, and can not be sure to declare it to the
7150 * correct authority.
7152 void MDCache::trim_non_auth()
7154 dout(7) << "trim_non_auth" << dendl
;
7156 // temporarily pin all subtree roots
7157 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7158 p
!= subtrees
.end();
7160 p
->first
->get(CDir::PIN_SUBTREETEMP
);
7162 list
<CDentry
*> auth_list
;
7164 // trim non-auth items from the lru
7167 if (bottom_lru
.lru_get_size() > 0)
7168 dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
7169 if (!dn
&& lru
.lru_get_size() > 0)
7170 dn
= static_cast<CDentry
*>(lru
.lru_expire());
7174 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7176 if (dn
->is_auth()) {
7177 // add back into lru (at the top)
7178 auth_list
.push_back(dn
);
7180 if (dnl
->is_remote() && dnl
->get_inode() && !dnl
->get_inode()->is_auth())
7181 dn
->unlink_remote(dnl
);
7183 // non-auth. expire.
7184 CDir
*dir
= dn
->get_dir();
7187 // unlink the dentry
7188 dout(10) << " removing " << *dn
<< dendl
;
7189 if (dnl
->is_remote()) {
7190 dir
->unlink_inode(dn
, false);
7192 else if (dnl
->is_primary()) {
7193 CInode
*in
= dnl
->get_inode();
7194 dout(10) << " removing " << *in
<< dendl
;
7195 auto&& ls
= in
->get_dirfrags();
7196 for (const auto& subdir
: ls
) {
7197 ceph_assert(!subdir
->is_subtree_root());
7198 in
->close_dirfrag(subdir
->dirfrag().frag
);
7200 dir
->unlink_inode(dn
, false);
7204 ceph_assert(dnl
->is_null());
7207 ceph_assert(!dir
->has_bloom());
7208 dir
->remove_dentry(dn
);
7209 // adjust the dir state
7210 dir
->state_clear(CDir::STATE_COMPLETE
); // dir incomplete!
7211 // close empty non-auth dirfrag
7212 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0)
7213 dir
->inode
->close_dirfrag(dir
->get_frag());
7217 for (const auto& dn
: auth_list
) {
7218 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
7219 bottom_lru
.lru_insert_mid(dn
);
7221 lru
.lru_insert_top(dn
);
7224 // move everything in the pintail to the top bit of the lru.
7225 lru
.lru_touch_entire_pintail();
7227 // unpin all subtrees
7228 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7229 p
!= subtrees
.end();
7231 p
->first
->put(CDir::PIN_SUBTREETEMP
);
7233 if (lru
.lru_get_size() == 0 &&
7234 bottom_lru
.lru_get_size() == 0) {
7235 // root, stray, etc.?
7236 auto p
= inode_map
.begin();
7237 while (p
!= inode_map
.end()) {
7238 CInode
*in
= p
->second
;
7240 if (!in
->is_auth()) {
7241 auto&& ls
= in
->get_dirfrags();
7242 for (const auto& dir
: ls
) {
7243 dout(10) << " removing " << *dir
<< dendl
;
7244 ceph_assert(dir
->get_num_ref() == 1); // SUBTREE
7245 remove_subtree(dir
);
7246 in
->close_dirfrag(dir
->dirfrag().frag
);
7248 dout(10) << " removing " << *in
<< dendl
;
7249 ceph_assert(!in
->get_parent_dn());
7250 ceph_assert(in
->get_num_ref() == 0);
7260 * Recursively trim the subtree rooted at directory to remove all
7261 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7262 * of those links. This is used to clear invalid data out of the cache.
7263 * Note that it doesn't clear the passed-in directory, since that's not
7266 bool MDCache::trim_non_auth_subtree(CDir
*dir
)
7268 dout(10) << "trim_non_auth_subtree(" << dir
<< ") " << *dir
<< dendl
;
7270 bool keep_dir
= !can_trim_non_auth_dirfrag(dir
);
7272 auto j
= dir
->begin();
7274 while (j
!= dir
->end()) {
7276 CDentry
*dn
= i
->second
;
7277 dout(10) << "trim_non_auth_subtree(" << dir
<< ") Checking dentry " << dn
<< dendl
;
7278 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7279 if (dnl
->is_primary()) { // check for subdirectories, etc
7280 CInode
*in
= dnl
->get_inode();
7281 bool keep_inode
= false;
7283 auto&& subdirs
= in
->get_dirfrags();
7284 for (const auto& subdir
: subdirs
) {
7285 if (subdir
->is_subtree_root()) {
7287 dout(10) << "trim_non_auth_subtree(" << dir
<< ") keeping " << *subdir
<< dendl
;
7289 if (trim_non_auth_subtree(subdir
))
7292 in
->close_dirfrag(subdir
->get_frag());
7293 dir
->state_clear(CDir::STATE_COMPLETE
); // now incomplete!
7299 if (!keep_inode
) { // remove it!
7300 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing inode " << in
<< " with dentry" << dn
<< dendl
;
7301 dir
->unlink_inode(dn
, false);
7303 ceph_assert(!dir
->has_bloom());
7304 dir
->remove_dentry(dn
);
7306 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping inode " << in
<< " with dentry " << dn
<<dendl
;
7307 dn
->state_clear(CDentry::STATE_AUTH
);
7308 in
->state_clear(CInode::STATE_AUTH
);
7310 } else if (keep_dir
&& dnl
->is_null()) { // keep null dentry for slave rollback
7311 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping dentry " << dn
<<dendl
;
7312 } else { // just remove it
7313 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing dentry " << dn
<< dendl
;
7314 if (dnl
->is_remote())
7315 dir
->unlink_inode(dn
, false);
7316 dir
->remove_dentry(dn
);
7319 dir
->state_clear(CDir::STATE_AUTH
);
7321 * We've now checked all our children and deleted those that need it.
7322 * Now return to caller, and tell them if *we're* a keeper.
7324 return keep_dir
|| dir
->get_num_any();
7328 * during replay, when we determine a subtree is no longer ours, we
7329 * try to trim it from our cache. because subtrees must be connected
7330 * to the root, the fact that we can trim this tree may mean that our
7331 * children or parents can also be trimmed.
7333 void MDCache::try_trim_non_auth_subtree(CDir
*dir
)
7335 dout(10) << "try_trim_nonauth_subtree " << *dir
<< dendl
;
7337 // can we now trim child subtrees?
7339 get_subtree_bounds(dir
, bounds
);
7340 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
7342 if (bd
->get_dir_auth().first
!= mds
->get_nodeid() && // we are not auth
7343 bd
->get_num_any() == 0 && // and empty
7344 can_trim_non_auth_dirfrag(bd
)) {
7345 CInode
*bi
= bd
->get_inode();
7346 dout(10) << " closing empty non-auth child subtree " << *bd
<< dendl
;
7349 bi
->close_dirfrag(bd
->get_frag());
7353 if (trim_non_auth_subtree(dir
)) {
7355 try_subtree_merge(dir
);
7357 // can we trim this subtree (and possibly our ancestors) too?
7359 CInode
*diri
= dir
->get_inode();
7360 if (diri
->is_base()) {
7361 if (!diri
->is_root() && diri
->authority().first
!= mds
->get_nodeid()) {
7362 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7363 remove_subtree(dir
);
7365 diri
->close_dirfrag(dir
->get_frag());
7367 dout(10) << " removing " << *diri
<< dendl
;
7368 ceph_assert(!diri
->get_parent_dn());
7369 ceph_assert(diri
->get_num_ref() == 0);
7375 CDir
*psub
= get_subtree_root(diri
->get_parent_dir());
7376 dout(10) << " parent subtree is " << *psub
<< dendl
;
7377 if (psub
->get_dir_auth().first
== mds
->get_nodeid())
7378 break; // we are auth, keep.
7380 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7381 remove_subtree(dir
);
7383 diri
->close_dirfrag(dir
->get_frag());
7385 dout(10) << " parent subtree also non-auth: " << *psub
<< dendl
;
7386 if (trim_non_auth_subtree(psub
))
7395 void MDCache::standby_trim_segment(LogSegment
*ls
)
7397 auto try_trim_inode
= [this](CInode
*in
) {
7398 if (in
->get_num_ref() == 0 &&
7399 !in
->item_open_file
.is_on_list() &&
7400 in
->parent
!= NULL
&&
7401 in
->parent
->get_num_ref() == 0){
7402 touch_dentry_bottom(in
->parent
);
7406 auto try_trim_dentry
= [this](CDentry
*dn
) {
7407 if (dn
->get_num_ref() > 0)
7409 auto in
= dn
->get_linkage()->inode
;
7410 if(in
&& in
->item_open_file
.is_on_list())
7412 touch_dentry_bottom(dn
);
7415 ls
->new_dirfrags
.clear_list();
7416 ls
->open_files
.clear_list();
7418 while (!ls
->dirty_dirfrags
.empty()) {
7419 CDir
*dir
= ls
->dirty_dirfrags
.front();
7422 try_trim_inode(dir
->inode
);
7424 while (!ls
->dirty_inodes
.empty()) {
7425 CInode
*in
= ls
->dirty_inodes
.front();
7429 while (!ls
->dirty_dentries
.empty()) {
7430 CDentry
*dn
= ls
->dirty_dentries
.front();
7432 try_trim_dentry(dn
);
7434 while (!ls
->dirty_parent_inodes
.empty()) {
7435 CInode
*in
= ls
->dirty_parent_inodes
.front();
7436 in
->clear_dirty_parent();
7439 while (!ls
->dirty_dirfrag_dir
.empty()) {
7440 CInode
*in
= ls
->dirty_dirfrag_dir
.front();
7441 in
->filelock
.remove_dirty();
7444 while (!ls
->dirty_dirfrag_nest
.empty()) {
7445 CInode
*in
= ls
->dirty_dirfrag_nest
.front();
7446 in
->nestlock
.remove_dirty();
7449 while (!ls
->dirty_dirfrag_dirfragtree
.empty()) {
7450 CInode
*in
= ls
->dirty_dirfrag_dirfragtree
.front();
7451 in
->dirfragtreelock
.remove_dirty();
7454 while (!ls
->truncating_inodes
.empty()) {
7455 auto it
= ls
->truncating_inodes
.begin();
7457 ls
->truncating_inodes
.erase(it
);
7458 in
->put(CInode::PIN_TRUNCATING
);
7463 void MDCache::handle_cache_expire(const cref_t
<MCacheExpire
> &m
)
7465 mds_rank_t from
= mds_rank_t(m
->get_from());
7467 dout(7) << "cache_expire from mds." << from
<< dendl
;
7469 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
7473 set
<SimpleLock
*> gather_locks
;
7475 for (const auto &p
: m
->realms
) {
7477 if (p
.first
.ino
> 0) {
7478 CInode
*expired_inode
= get_inode(p
.first
.ino
);
7479 ceph_assert(expired_inode
); // we had better have this.
7480 CDir
*parent_dir
= expired_inode
->get_approx_dirfrag(p
.first
.frag
);
7481 ceph_assert(parent_dir
);
7483 int export_state
= -1;
7484 if (parent_dir
->is_auth() && parent_dir
->is_exporting()) {
7485 export_state
= migrator
->get_export_state(parent_dir
);
7486 ceph_assert(export_state
>= 0);
7489 if (!parent_dir
->is_auth() ||
7490 (export_state
!= -1 &&
7491 ((export_state
== Migrator::EXPORT_WARNING
&&
7492 migrator
->export_has_warned(parent_dir
,from
)) ||
7493 export_state
== Migrator::EXPORT_EXPORTING
||
7494 export_state
== Migrator::EXPORT_LOGGINGFINISH
||
7495 (export_state
== Migrator::EXPORT_NOTIFYING
&&
7496 !migrator
->export_has_notified(parent_dir
,from
))))) {
7499 dout(7) << "delaying nonauth|warned expires for " << *parent_dir
<< dendl
;
7500 ceph_assert(parent_dir
->is_frozen_tree_root());
7502 // make a message container
7504 auto em
= delayed_expire
[parent_dir
].emplace(std::piecewise_construct
, std::forward_as_tuple(from
), std::forward_as_tuple());
7506 em
.first
->second
= make_message
<MCacheExpire
>(from
); /* new */
7508 // merge these expires into it
7509 em
.first
->second
->add_realm(p
.first
, p
.second
);
7512 ceph_assert(export_state
<= Migrator::EXPORT_PREPPING
||
7513 (export_state
== Migrator::EXPORT_WARNING
&&
7514 !migrator
->export_has_warned(parent_dir
, from
)));
7516 dout(7) << "expires for " << *parent_dir
<< dendl
;
7518 dout(7) << "containerless expires (root, stray inodes)" << dendl
;
7522 for (const auto &q
: p
.second
.inodes
) {
7523 CInode
*in
= get_inode(q
.first
);
7524 unsigned nonce
= q
.second
;
7527 dout(0) << " inode expire on " << q
.first
<< " from " << from
7528 << ", don't have it" << dendl
;
7531 ceph_assert(in
->is_auth());
7532 dout(20) << __func__
<< ": expiring inode " << *in
<< dendl
;
7535 if (nonce
== in
->get_replica_nonce(from
)) {
7536 // remove from our cached_by
7537 dout(7) << " inode expire on " << *in
<< " from mds." << from
7538 << " cached_by was " << in
->get_replicas() << dendl
;
7539 inode_remove_replica(in
, from
, false, gather_locks
);
7542 // this is an old nonce, ignore expire.
7543 dout(7) << " inode expire on " << *in
<< " from mds." << from
7544 << " with old nonce " << nonce
7545 << " (current " << in
->get_replica_nonce(from
) << "), dropping"
7551 for (const auto &q
: p
.second
.dirs
) {
7552 CDir
*dir
= get_dirfrag(q
.first
);
7553 unsigned nonce
= q
.second
;
7556 CInode
*diri
= get_inode(q
.first
.ino
);
7558 if (mds
->is_rejoin() &&
7559 rejoin_ack_gather
.count(mds
->get_nodeid()) && // haven't sent rejoin ack yet
7560 !diri
->is_replica(from
)) {
7561 auto&& ls
= diri
->get_nested_dirfrags();
7562 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7563 << " while rejoining, inode isn't replicated" << dendl
;
7564 for (const auto& d
: ls
) {
7566 if (dir
->is_replica(from
)) {
7567 dout(7) << " dir expire on " << *dir
<< " from mds." << from
<< dendl
;
7568 dir
->remove_replica(from
);
7573 CDir
*other
= diri
->get_approx_dirfrag(q
.first
.frag
);
7575 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7576 << " have " << *other
<< ", mismatched frags, dropping" << dendl
;
7580 dout(0) << " dir expire on " << q
.first
<< " from " << from
7581 << ", don't have it" << dendl
;
7584 dout(20) << __func__
<< ": expiring dirfrag " << *dir
<< dendl
;
7586 ceph_assert(dir
->is_auth());
7589 if (nonce
== dir
->get_replica_nonce(from
)) {
7590 // remove from our cached_by
7591 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7592 << " replicas was " << dir
->get_replicas() << dendl
;
7593 dir
->remove_replica(from
);
7596 // this is an old nonce, ignore expire.
7597 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7598 << " with old nonce " << nonce
<< " (current " << dir
->get_replica_nonce(from
)
7599 << "), dropping" << dendl
;
7604 for (const auto &pd
: p
.second
.dentries
) {
7605 dout(10) << " dn expires in dir " << pd
.first
<< dendl
;
7606 CInode
*diri
= get_inode(pd
.first
.ino
);
7608 CDir
*dir
= diri
->get_dirfrag(pd
.first
.frag
);
7611 dout(0) << " dn expires on " << pd
.first
<< " from " << from
7612 << ", must have refragmented" << dendl
;
7614 ceph_assert(dir
->is_auth());
7617 for (const auto &p
: pd
.second
) {
7618 unsigned nonce
= p
.second
;
7622 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7624 // which dirfrag for this dentry?
7625 CDir
*dir
= diri
->get_dirfrag(diri
->pick_dirfrag(p
.first
.first
));
7627 ceph_assert(dir
->is_auth());
7628 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7633 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< " in " << *dir
<< dendl
;
7635 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< dendl
;
7639 if (nonce
== dn
->get_replica_nonce(from
)) {
7640 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
<< dendl
;
7641 dentry_remove_replica(dn
, from
, gather_locks
);
7644 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
7645 << " with old nonce " << nonce
<< " (current " << dn
->get_replica_nonce(from
)
7646 << "), dropping" << dendl
;
7652 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
7653 if (!(*p
)->is_stable())
7654 mds
->locker
->eval_gather(*p
);
7658 void MDCache::process_delayed_expire(CDir
*dir
)
7660 dout(7) << "process_delayed_expire on " << *dir
<< dendl
;
7661 for (const auto &p
: delayed_expire
[dir
]) {
7662 handle_cache_expire(p
.second
);
7664 delayed_expire
.erase(dir
);
7667 void MDCache::discard_delayed_expire(CDir
*dir
)
7669 dout(7) << "discard_delayed_expire on " << *dir
<< dendl
;
7670 delayed_expire
.erase(dir
);
7673 void MDCache::inode_remove_replica(CInode
*in
, mds_rank_t from
, bool rejoin
,
7674 set
<SimpleLock
*>& gather_locks
)
7676 in
->remove_replica(from
);
7677 in
->set_mds_caps_wanted(from
, 0);
7679 // note: this code calls _eval more often than it needs to!
7681 if (in
->authlock
.remove_replica(from
)) gather_locks
.insert(&in
->authlock
);
7682 if (in
->linklock
.remove_replica(from
)) gather_locks
.insert(&in
->linklock
);
7683 if (in
->snaplock
.remove_replica(from
)) gather_locks
.insert(&in
->snaplock
);
7684 if (in
->xattrlock
.remove_replica(from
)) gather_locks
.insert(&in
->xattrlock
);
7685 if (in
->flocklock
.remove_replica(from
)) gather_locks
.insert(&in
->flocklock
);
7686 if (in
->policylock
.remove_replica(from
)) gather_locks
.insert(&in
->policylock
);
7688 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7689 // Don't remove the recovering mds from lock's gathering list because
7690 // it may hold rejoined wrlocks.
7691 if (in
->dirfragtreelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->dirfragtreelock
);
7692 if (in
->filelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->filelock
);
7693 if (in
->nestlock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->nestlock
);
7696 void MDCache::dentry_remove_replica(CDentry
*dn
, mds_rank_t from
, set
<SimpleLock
*>& gather_locks
)
7698 dn
->remove_replica(from
);
7701 if (dn
->lock
.remove_replica(from
))
7702 gather_locks
.insert(&dn
->lock
);
7704 // Replicated strays might now be elegible for purge
7705 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7706 if (dnl
->is_primary()) {
7707 maybe_eval_stray(dnl
->get_inode());
7711 void MDCache::trim_client_leases()
7713 utime_t now
= ceph_clock_now();
7715 dout(10) << "trim_client_leases" << dendl
;
7717 std::size_t pool
= 0;
7718 for (const auto& list
: client_leases
) {
7723 auto before
= list
.size();
7724 while (!list
.empty()) {
7725 ClientLease
*r
= list
.front();
7726 if (r
->ttl
> now
) break;
7727 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
7728 dout(10) << " expiring client." << r
->client
<< " lease of " << *dn
<< dendl
;
7729 dn
->remove_client_lease(r
, mds
->locker
);
7731 auto after
= list
.size();
7732 dout(10) << "trim_client_leases pool " << pool
<< " trimmed "
7733 << (before
-after
) << " leases, " << after
<< " left" << dendl
;
7738 void MDCache::check_memory_usage()
7740 static MemoryModel
mm(g_ceph_context
);
7741 static MemoryModel::snap last
;
7743 static MemoryModel::snap baseline
= last
;
7745 // check client caps
7746 ceph_assert(CInode::count() == inode_map
.size() + snap_inode_map
.size() + num_shadow_inodes
);
7747 double caps_per_inode
= 0.0;
7748 if (CInode::count())
7749 caps_per_inode
= (double)Capability::count() / (double)CInode::count();
7751 dout(2) << "Memory usage: "
7752 << " total " << last
.get_total()
7753 << ", rss " << last
.get_rss()
7754 << ", heap " << last
.get_heap()
7755 << ", baseline " << baseline
.get_heap()
7756 << ", " << num_inodes_with_caps
<< " / " << CInode::count() << " inodes have caps"
7757 << ", " << Capability::count() << " caps, " << caps_per_inode
<< " caps per inode"
7760 mds
->update_mlogger();
7761 mds
->mlogger
->set(l_mdm_rss
, last
.get_rss());
7762 mds
->mlogger
->set(l_mdm_heap
, last
.get_heap());
7764 if (cache_toofull()) {
7765 mds
->server
->recall_client_state(nullptr, Server::RecallFlags::TRIM
);
7768 // If the cache size had exceeded its limit, but we're back in bounds
7769 // now, free any unused pool memory so that our memory usage isn't
7770 // permanently bloated.
7771 if (exceeded_size_limit
&& !cache_toofull()) {
7772 // Only do this once we are back in bounds: otherwise the releases would
7773 // slow down whatever process caused us to exceed bounds to begin with
7774 if (ceph_using_tcmalloc()) {
7775 dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
7777 ceph_heap_release_free_memory();
7779 exceeded_size_limit
= false;
7785 // =========================================================================================
7788 class C_MDC_ShutdownCheck
: public MDCacheContext
{
7790 explicit C_MDC_ShutdownCheck(MDCache
*m
) : MDCacheContext(m
) {}
7791 void finish(int) override
{
7792 mdcache
->shutdown_check();
7796 void MDCache::shutdown_check()
7798 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl
;
7801 char old_val
[32] = { 0 };
7803 g_conf().get_val("debug_mds", &o
, sizeof(old_val
));
7804 g_conf().set_val("debug_mds", "10");
7805 g_conf().apply_changes(nullptr);
7807 g_conf().set_val("debug_mds", old_val
);
7808 g_conf().apply_changes(nullptr);
7809 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7812 dout(0) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7813 dout(0) << "log len " << mds
->mdlog
->get_num_events() << dendl
;
7816 if (mds
->objecter
->is_active()) {
7817 dout(0) << "objecter still active" << dendl
;
7818 mds
->objecter
->dump_active();
7823 void MDCache::shutdown_start()
7825 dout(5) << "shutdown_start" << dendl
;
7827 if (g_conf()->mds_shutdown_check
)
7828 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7830 // g_conf()->debug_mds = 10;
7835 bool MDCache::shutdown_pass()
7837 dout(7) << "shutdown_pass" << dendl
;
7839 if (mds
->is_stopped()) {
7840 dout(7) << " already shut down" << dendl
;
7847 bool strays_all_exported
= shutdown_export_strays();
7851 dout(5) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7855 dout(10) << "Migrating any ephemerally pinned inodes" << dendl
;
7856 /* copy to vector to avoid removals during iteration */
7857 std::vector
<CInode
*> migrate
;
7858 migrate
.assign(rand_ephemeral_pins
.begin(), rand_ephemeral_pins
.end());
7859 for (auto& in
: migrate
) {
7860 in
->maybe_ephemeral_rand();
7862 migrate
.assign(dist_ephemeral_pins
.begin(), dist_ephemeral_pins
.end());
7863 for (auto& in
: migrate
) {
7864 in
->maybe_ephemeral_dist();
7866 mds
->balancer
->handle_export_pins();
7869 // Export all subtrees to another active (usually rank 0) if not rank 0
7870 int num_auth_subtree
= 0;
7871 if (!subtrees
.empty() && mds
->get_nodeid() != 0) {
7872 dout(7) << "looking for subtrees to export" << dendl
;
7873 std::vector
<CDir
*> ls
;
7874 for (auto& [dir
, bounds
] : subtrees
) {
7875 dout(10) << " examining " << *dir
<< " bounds " << bounds
<< dendl
;
7876 if (dir
->get_inode()->is_mdsdir() || !dir
->is_auth())
7879 if (dir
->is_frozen() ||
7880 dir
->is_freezing() ||
7881 dir
->is_ambiguous_dir_auth() ||
7882 dir
->state_test(CDir::STATE_EXPORTING
) ||
7883 dir
->get_inode()->is_ephemerally_pinned()) {
7889 migrator
->clear_export_queue();
7891 for (const auto& dir
: ls
) {
7892 mds_rank_t dest
= dir
->get_inode()->authority().first
;
7893 if (dest
> 0 && !mds
->mdsmap
->is_active(dest
))
7895 dout(7) << "sending " << *dir
<< " back to mds." << dest
<< dendl
;
7896 migrator
->export_dir_nicely(dir
, dest
);
7900 if (!strays_all_exported
) {
7901 dout(7) << "waiting for strays to migrate" << dendl
;
7905 if (num_auth_subtree
> 0) {
7906 ceph_assert(mds
->get_nodeid() > 0);
7907 dout(7) << "still have " << num_auth_subtree
<< " auth subtrees" << dendl
;
7912 // close out any sessions (and open files!) before we try to trim the log, etc.
7913 if (mds
->sessionmap
.have_unclosed_sessions()) {
7914 if (!mds
->server
->terminating_sessions
)
7915 mds
->server
->terminate_sessions();
7919 // Fully trim the log so that all objects in cache are clean and may be
7920 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7921 // trim the log such that the cache eventually becomes clean.
7922 if (mds
->mdlog
->get_num_segments() > 0) {
7923 auto ls
= mds
->mdlog
->get_current_segment();
7924 if (ls
->num_events
> 1 || !ls
->dirty_dirfrags
.empty()) {
7925 // Current segment contains events other than subtreemap or
7926 // there are dirty dirfrags (see CDir::log_mark_dirty())
7927 mds
->mdlog
->start_new_segment();
7928 mds
->mdlog
->flush();
7931 mds
->mdlog
->trim_all();
7932 if (mds
->mdlog
->get_num_segments() > 1) {
7933 dout(7) << "still >1 segments, waiting for log to trim" << dendl
;
7937 // drop our reference to our stray dir inode
7938 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7940 strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
7941 strays
[i
]->state_clear(CInode::STATE_STRAYPINNED
);
7942 strays
[i
]->put(CInode::PIN_STRAY
);
7943 strays
[i
]->put_stickydirs();
7947 CDir
*mydir
= myin
? myin
->get_dirfrag(frag_t()) : NULL
;
7948 if (mydir
&& !mydir
->is_subtree_root())
7951 // subtrees map not empty yet?
7952 if (subtrees
.size() > (mydir
? 1 : 0)) {
7953 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl
;
7955 migrator
->show_importing();
7956 migrator
->show_exporting();
7957 if (!migrator
->is_importing() && !migrator
->is_exporting())
7961 ceph_assert(!migrator
->is_exporting());
7962 ceph_assert(!migrator
->is_importing());
7964 // replicas may dirty scatter locks
7965 if (myin
&& myin
->is_replicated()) {
7966 dout(7) << "still have replicated objects" << dendl
;
7970 if ((myin
&& myin
->get_num_auth_pins()) ||
7971 (mydir
&& (mydir
->get_auth_pins() || mydir
->get_dir_auth_pins()))) {
7972 dout(7) << "still have auth pinned objects" << dendl
;
7976 // (only do this once!)
7977 if (!mds
->mdlog
->is_capped()) {
7978 dout(7) << "capping the log" << dendl
;
7982 if (!mds
->mdlog
->empty())
7983 mds
->mdlog
->trim(0);
7985 if (!mds
->mdlog
->empty()) {
7986 dout(7) << "waiting for log to flush.. " << mds
->mdlog
->get_num_events()
7987 << " in " << mds
->mdlog
->get_num_segments() << " segments" << dendl
;
7991 if (!did_shutdown_log_cap
) {
7992 // flush journal header
7993 dout(7) << "writing header for (now-empty) journal" << dendl
;
7994 ceph_assert(mds
->mdlog
->empty());
7995 mds
->mdlog
->write_head(0);
7996 // NOTE: filer active checker below will block us until this completes.
7997 did_shutdown_log_cap
= true;
8002 if (mds
->objecter
->is_active()) {
8003 dout(7) << "objecter still active" << dendl
;
8004 mds
->objecter
->dump_active();
8008 // trim what we can from the cache
8009 if (lru
.lru_get_size() > 0 || bottom_lru
.lru_get_size() > 0) {
8010 dout(7) << "there's still stuff in the cache: " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
8016 // make mydir subtree go away
8018 if (mydir
->get_num_ref() > 1) { // subtree pin
8019 dout(7) << "there's still reference to mydir " << *mydir
<< dendl
;
8024 remove_subtree(mydir
);
8025 myin
->close_dirfrag(mydir
->get_frag());
8027 ceph_assert(subtrees
.empty());
8034 if (global_snaprealm
) {
8035 remove_inode(global_snaprealm
->inode
);
8036 global_snaprealm
= nullptr;
8040 dout(5) << "shutdown done." << dendl
;
8044 bool MDCache::shutdown_export_strays()
8046 static const unsigned MAX_EXPORTING
= 100;
8048 if (mds
->get_nodeid() == 0)
8051 if (shutdown_exporting_strays
.size() * 3 >= MAX_EXPORTING
* 2)
8054 dout(10) << "shutdown_export_strays " << shutdown_export_next
.first
8055 << " '" << shutdown_export_next
.second
<< "'" << dendl
;
8057 bool mds0_active
= mds
->mdsmap
->is_active(mds_rank_t(0));
8058 bool all_exported
= false;
8061 auto next
= shutdown_export_next
;
8063 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
8064 CInode
*strayi
= strays
[i
];
8066 !strayi
->state_test(CInode::STATE_STRAYPINNED
))
8068 if (strayi
->ino() < next
.first
.ino
)
8072 strayi
->get_dirfrags(dfls
);
8074 while (!dfls
.empty()) {
8075 CDir
*dir
= dfls
.front();
8078 if (dir
->dirfrag() < next
.first
)
8080 if (next
.first
< dir
->dirfrag()) {
8081 next
.first
= dir
->dirfrag();
8082 next
.second
.clear();
8085 if (!dir
->is_complete()) {
8086 MDSContext
*fin
= nullptr;
8087 if (shutdown_exporting_strays
.empty()) {
8088 fin
= new MDSInternalContextWrapper(mds
,
8089 new LambdaContext([this](int r
) {
8090 shutdown_export_strays();
8098 CDir::dentry_key_map::iterator it
;
8099 if (next
.second
.empty()) {
8102 auto hash
= ceph_frag_value(strayi
->hash_dentry_name(next
.second
));
8103 it
= dir
->lower_bound(dentry_key_t(0, next
.second
, hash
));
8106 for (; it
!= dir
->end(); ++it
) {
8107 CDentry
*dn
= it
->second
;
8108 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8112 if (!mds0_active
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
8113 next
.second
= it
->first
.name
;
8117 auto ret
= shutdown_exporting_strays
.insert(dnl
->get_inode()->ino());
8119 dout(10) << "already exporting/purging " << *dn
<< dendl
;
8123 // Don't try to migrate anything that is actually
8124 // being purged right now
8125 if (!dn
->state_test(CDentry::STATE_PURGING
))
8126 stray_manager
.migrate_stray(dn
, mds_rank_t(0)); // send to root!
8128 if (shutdown_exporting_strays
.size() >= MAX_EXPORTING
) {
8130 if (it
!= dir
->end()) {
8131 next
.second
= it
->first
.name
;
8134 next
.first
.ino
.val
++;
8136 next
.first
= dfls
.front()->dirfrag();
8137 next
.second
.clear();
8145 if (shutdown_exporting_strays
.empty()) {
8146 dirfrag_t
first_df(MDS_INO_STRAY(mds
->get_nodeid(), 0), 0);
8147 if (first_df
< shutdown_export_next
.first
||
8148 !shutdown_export_next
.second
.empty()) {
8149 shutdown_export_next
.first
= first_df
;
8150 shutdown_export_next
.second
.clear();
8153 all_exported
= true;
8157 shutdown_export_next
= next
;
8158 return all_exported
;
8161 // ========= messaging ==============
8163 void MDCache::dispatch(const cref_t
<Message
> &m
)
8165 switch (m
->get_type()) {
8168 case MSG_MDS_RESOLVE
:
8169 handle_resolve(ref_cast
<MMDSResolve
>(m
));
8171 case MSG_MDS_RESOLVEACK
:
8172 handle_resolve_ack(ref_cast
<MMDSResolveAck
>(m
));
8176 case MSG_MDS_CACHEREJOIN
:
8177 handle_cache_rejoin(ref_cast
<MMDSCacheRejoin
>(m
));
8180 case MSG_MDS_DISCOVER
:
8181 handle_discover(ref_cast
<MDiscover
>(m
));
8183 case MSG_MDS_DISCOVERREPLY
:
8184 handle_discover_reply(ref_cast
<MDiscoverReply
>(m
));
8187 case MSG_MDS_DIRUPDATE
:
8188 handle_dir_update(ref_cast
<MDirUpdate
>(m
));
8191 case MSG_MDS_CACHEEXPIRE
:
8192 handle_cache_expire(ref_cast
<MCacheExpire
>(m
));
8195 case MSG_MDS_DENTRYLINK
:
8196 handle_dentry_link(ref_cast
<MDentryLink
>(m
));
8198 case MSG_MDS_DENTRYUNLINK
:
8199 handle_dentry_unlink(ref_cast
<MDentryUnlink
>(m
));
8202 case MSG_MDS_FRAGMENTNOTIFY
:
8203 handle_fragment_notify(ref_cast
<MMDSFragmentNotify
>(m
));
8205 case MSG_MDS_FRAGMENTNOTIFYACK
:
8206 handle_fragment_notify_ack(ref_cast
<MMDSFragmentNotifyAck
>(m
));
8209 case MSG_MDS_FINDINO
:
8210 handle_find_ino(ref_cast
<MMDSFindIno
>(m
));
8212 case MSG_MDS_FINDINOREPLY
:
8213 handle_find_ino_reply(ref_cast
<MMDSFindInoReply
>(m
));
8216 case MSG_MDS_OPENINO
:
8217 handle_open_ino(ref_cast
<MMDSOpenIno
>(m
));
8219 case MSG_MDS_OPENINOREPLY
:
8220 handle_open_ino_reply(ref_cast
<MMDSOpenInoReply
>(m
));
8223 case MSG_MDS_SNAPUPDATE
:
8224 handle_snap_update(ref_cast
<MMDSSnapUpdate
>(m
));
8228 derr
<< "cache unknown message " << m
->get_type() << dendl
;
8229 ceph_abort_msg("cache unknown message");
8233 int MDCache::path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
8234 const filepath
& path
, int flags
,
8235 vector
<CDentry
*> *pdnvec
, CInode
**pin
)
8237 bool discover
= (flags
& MDS_TRAVERSE_DISCOVER
);
8238 bool forward
= !discover
;
8239 bool path_locked
= (flags
& MDS_TRAVERSE_PATH_LOCKED
);
8240 bool want_dentry
= (flags
& MDS_TRAVERSE_WANT_DENTRY
);
8241 bool want_auth
= (flags
& MDS_TRAVERSE_WANT_AUTH
);
8242 bool rdlock_snap
= (flags
& (MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_SNAP2
));
8243 bool rdlock_path
= (flags
& MDS_TRAVERSE_RDLOCK_PATH
);
8244 bool xlock_dentry
= (flags
& MDS_TRAVERSE_XLOCK_DENTRY
);
8245 bool rdlock_authlock
= (flags
& MDS_TRAVERSE_RDLOCK_AUTHLOCK
);
8248 ceph_assert(mdr
); // forward requires a request
8250 snapid_t snapid
= CEPH_NOSNAP
;
8252 mdr
->snapid
= snapid
;
8254 client_t client
= (mdr
&& mdr
->reqid
.name
.is_client()) ? mdr
->reqid
.name
.num() : -1;
8256 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse
);
8258 dout(7) << "traverse: opening base ino " << path
.get_ino() << " snap " << snapid
<< dendl
;
8259 CInode
*cur
= get_inode(path
.get_ino());
8261 if (MDS_INO_IS_MDSDIR(path
.get_ino())) {
8262 open_foreign_mdsdir(path
.get_ino(), cf
.build());
8265 if (MDS_INO_IS_STRAY(path
.get_ino())) {
8266 mds_rank_t rank
= MDS_INO_STRAY_OWNER(path
.get_ino());
8267 unsigned idx
= MDS_INO_STRAY_INDEX(path
.get_ino());
8268 filepath
path(strays
[idx
]->get_parent_dn()->get_name(),
8269 MDS_INO_MDSDIR(rank
));
8270 MDRequestRef null_ref
;
8271 return path_traverse(null_ref
, cf
, path
, MDS_TRAVERSE_DISCOVER
, nullptr);
8275 if (cur
->state_test(CInode::STATE_PURGING
))
8278 // make sure snaprealm are open...
8279 if (mdr
&& cur
->snaprealm
&& !cur
->snaprealm
->have_past_parents_open() &&
8280 !cur
->snaprealm
->open_parents(cf
.build())) {
8284 if (flags
& MDS_TRAVERSE_CHECK_LOCKCACHE
)
8285 mds
->locker
->find_and_attach_lock_cache(mdr
, cur
);
8287 if (mdr
&& mdr
->lock_cache
) {
8288 if (flags
& MDS_TRAVERSE_WANT_DIRLAYOUT
)
8289 mdr
->dir_layout
= mdr
->lock_cache
->get_dir_layout();
8290 } else if (rdlock_snap
) {
8291 int n
= (flags
& MDS_TRAVERSE_RDLOCK_SNAP2
) ? 1 : 0;
8292 if ((n
== 0 && !(mdr
->locking_state
& MutationImpl::SNAP_LOCKED
)) ||
8293 (n
== 1 && !(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
))) {
8294 bool want_layout
= (flags
& MDS_TRAVERSE_WANT_DIRLAYOUT
);
8295 if (!mds
->locker
->try_rdlock_snap_layout(cur
, mdr
, n
, want_layout
))
8306 MutationImpl::LockOpVec lov
;
8308 for (unsigned depth
= 0; depth
< path
.depth(); ) {
8309 dout(12) << "traverse: path seg depth " << depth
<< " '" << path
[depth
]
8310 << "' snapid " << snapid
<< dendl
;
8312 if (!cur
->is_dir()) {
8313 dout(7) << "traverse: " << *cur
<< " not a dir " << dendl
;
8317 // walk into snapdir?
8318 if (path
[depth
].length() == 0) {
8319 dout(10) << "traverse: snapdir" << dendl
;
8320 if (!mdr
|| depth
> 0) // snapdir must be the first component
8322 snapid
= CEPH_SNAPDIR
;
8323 mdr
->snapid
= snapid
;
8327 // walk thru snapdir?
8328 if (snapid
== CEPH_SNAPDIR
) {
8331 SnapRealm
*realm
= cur
->find_snaprealm();
8332 snapid
= realm
->resolve_snapname(path
[depth
], cur
->ino());
8333 dout(10) << "traverse: snap " << path
[depth
] << " -> " << snapid
<< dendl
;
8336 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8339 mdr
->snapid
= snapid
;
8345 frag_t fg
= cur
->pick_dirfrag(path
[depth
]);
8346 CDir
*curdir
= cur
->get_dirfrag(fg
);
8348 if (cur
->is_auth()) {
8349 // parent dir frozen_dir?
8350 if (cur
->is_frozen()) {
8351 dout(7) << "traverse: " << *cur
<< " is frozen, waiting" << dendl
;
8352 cur
->add_waiter(CDir::WAIT_UNFREEZE
, cf
.build());
8355 curdir
= cur
->get_or_open_dirfrag(this, fg
);
8358 dout(10) << "traverse: need dirfrag " << fg
<< ", doing discover from " << *cur
<< dendl
;
8359 discover_path(cur
, snapid
, path
.postfixpath(depth
), cf
.build(),
8361 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8365 ceph_assert(curdir
);
8367 #ifdef MDS_VERIFY_FRAGSTAT
8368 if (curdir
->is_complete())
8369 curdir
->verify_fragstat();
8374 if (curdir->is_frozen()) {
8376 // FIXME: traverse is allowed?
8377 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8378 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8379 if (onfinish) delete onfinish;
8384 if (want_auth
&& want_dentry
&& depth
== path
.depth() - 1) {
8385 if (curdir
->is_ambiguous_auth()) {
8386 dout(10) << "waiting for single auth on " << *curdir
<< dendl
;
8387 curdir
->add_waiter(CInode::WAIT_SINGLEAUTH
, cf
.build());
8390 if (!curdir
->is_auth()) {
8391 dout(10) << "fw to auth for " << *curdir
<< dendl
;
8392 request_forward(mdr
, curdir
->authority().first
);
8397 // Before doing dirfrag->dn lookup, compare with DamageTable's
8398 // record of which dentries were unreadable
8399 if (mds
->damage_table
.is_dentry_damaged(curdir
, path
[depth
], snapid
)) {
8400 dout(4) << "traverse: stopped lookup at damaged dentry "
8401 << *curdir
<< "/" << path
[depth
] << " snap=" << snapid
<< dendl
;
8406 CDentry
*dn
= curdir
->lookup(path
[depth
], snapid
);
8408 if (dn
->state_test(CDentry::STATE_PURGING
))
8413 if (xlock_dentry
&& depth
== path
.depth() - 1) {
8414 if (depth
> 0 || !mdr
->lock_cache
) {
8415 lov
.add_wrlock(&cur
->filelock
);
8416 lov
.add_wrlock(&cur
->nestlock
);
8417 if (rdlock_authlock
)
8418 lov
.add_rdlock(&cur
->authlock
);
8420 lov
.add_xlock(&dn
->lock
);
8422 // force client to flush async dir operation if necessary
8423 if (cur
->filelock
.is_cached())
8424 lov
.add_wrlock(&cur
->filelock
);
8425 lov
.add_rdlock(&dn
->lock
);
8427 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8428 dout(10) << "traverse: failed to rdlock " << dn
->lock
<< " " << *dn
<< dendl
;
8431 } else if (!path_locked
&&
8432 !dn
->lock
.can_read(client
) &&
8433 !(dn
->lock
.is_xlocked() && dn
->lock
.get_xlock_by() == mdr
)) {
8434 dout(10) << "traverse: non-readable dentry at " << *dn
<< dendl
;
8435 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, cf
.build());
8437 mds
->logger
->inc(l_mds_traverse_lock
);
8438 if (dn
->is_auth() && dn
->lock
.is_unstable_and_locked())
8439 mds
->mdlog
->flush();
8444 pdnvec
->push_back(dn
);
8446 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8447 // can we conclude ENOENT?
8448 if (dnl
->is_null()) {
8449 dout(10) << "traverse: null+readable dentry at " << *dn
<< dendl
;
8450 if (depth
== path
.depth() - 1) {
8455 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8460 // do we have inode?
8461 CInode
*in
= dnl
->get_inode();
8463 ceph_assert(dnl
->is_remote());
8465 in
= get_inode(dnl
->get_remote_ino());
8467 dout(7) << "linking in remote in " << *in
<< dendl
;
8468 dn
->link_remote(dnl
, in
);
8470 dout(7) << "remote link to " << dnl
->get_remote_ino() << ", which i don't have" << dendl
;
8471 ceph_assert(mdr
); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8472 if (mds
->damage_table
.is_remote_damaged(dnl
->get_remote_ino())) {
8473 dout(4) << "traverse: remote dentry points to damaged ino "
8477 open_remote_dentry(dn
, true, cf
.build(),
8478 (path_locked
&& depth
== path
.depth() - 1));
8479 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_remote_ino
);
8485 // make sure snaprealm are open...
8486 if (mdr
&& cur
->snaprealm
&& !cur
->snaprealm
->have_past_parents_open() &&
8487 !cur
->snaprealm
->open_parents(cf
.build())) {
8491 if (rdlock_snap
&& !(want_dentry
&& depth
== path
.depth() - 1)) {
8493 lov
.add_rdlock(&cur
->snaplock
);
8494 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8495 dout(10) << "traverse: failed to rdlock " << cur
->snaplock
<< " " << *cur
<< dendl
;
8500 // add to trace, continue.
8510 // MISS. dentry doesn't exist.
8511 dout(12) << "traverse: miss on dentry " << path
[depth
] << " in " << *curdir
<< dendl
;
8513 if (curdir
->is_auth()) {
8515 if (curdir
->is_complete() ||
8516 (snapid
== CEPH_NOSNAP
&&
8517 curdir
->has_bloom() &&
8518 !curdir
->is_in_bloom(path
[depth
]))) {
8521 // instantiate a null dn?
8522 if (depth
< path
.depth() - 1) {
8523 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl
;
8524 } else if (snapid
< CEPH_MAXSNAP
) {
8525 dout(20) << " not adding null for snapid " << snapid
<< dendl
;
8526 } else if (curdir
->is_frozen()) {
8527 dout(7) << "traverse: " << *curdir
<< " is frozen, waiting" << dendl
;
8528 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, cf
.build());
8531 // create a null dentry
8532 dn
= curdir
->add_null_dentry(path
[depth
]);
8533 dout(20) << " added null " << *dn
<< dendl
;
8538 if (depth
> 0 || !mdr
->lock_cache
) {
8539 lov
.add_wrlock(&cur
->filelock
);
8540 lov
.add_wrlock(&cur
->nestlock
);
8541 if (rdlock_authlock
)
8542 lov
.add_rdlock(&cur
->authlock
);
8544 lov
.add_xlock(&dn
->lock
);
8546 // force client to flush async dir operation if necessary
8547 if (cur
->filelock
.is_cached())
8548 lov
.add_wrlock(&cur
->filelock
);
8549 lov
.add_rdlock(&dn
->lock
);
8551 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8552 dout(10) << "traverse: failed to rdlock " << dn
->lock
<< " " << *dn
<< dendl
;
8558 pdnvec
->push_back(dn
);
8562 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8568 // Check DamageTable for missing fragments before trying to fetch
8570 if (mds
->damage_table
.is_dirfrag_damaged(curdir
)) {
8571 dout(4) << "traverse: damaged dirfrag " << *curdir
8572 << ", blocking fetch" << dendl
;
8576 // directory isn't complete; reload
8577 dout(7) << "traverse: incomplete dir contents for " << *cur
<< ", fetching" << dendl
;
8579 curdir
->fetch(cf
.build(), path
[depth
]);
8580 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_dir_fetch
);
8584 // dirfrag/dentry is not mine.
8585 mds_authority_t dauth
= curdir
->authority();
8587 if (!forward_all_requests_to_auth
&&
8589 mdr
&& mdr
->client_request
&&
8590 (int)depth
< mdr
->client_request
->get_num_fwd()){
8591 dout(7) << "traverse: snap " << snapid
<< " and depth " << depth
8592 << " < fwd " << mdr
->client_request
->get_num_fwd()
8593 << ", discovering instead of forwarding" << dendl
;
8598 dout(7) << "traverse: discover from " << path
[depth
] << " from " << *curdir
<< dendl
;
8599 discover_path(curdir
, snapid
, path
.postfixpath(depth
), cf
.build(),
8601 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8606 dout(7) << "traverse: not auth for " << path
<< " in " << *curdir
<< dendl
;
8608 if (curdir
->is_ambiguous_auth()) {
8610 dout(7) << "traverse: waiting for single auth in " << *curdir
<< dendl
;
8611 curdir
->add_waiter(CDir::WAIT_SINGLEAUTH
, cf
.build());
8615 dout(7) << "traverse: forwarding, not auth for " << *curdir
<< dendl
;
8617 request_forward(mdr
, dauth
.first
);
8619 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_forward
);
8624 ceph_abort(); // i shouldn't get here
8627 if (want_auth
&& !want_dentry
) {
8628 if (cur
->is_ambiguous_auth()) {
8629 dout(10) << "waiting for single auth on " << *cur
<< dendl
;
8630 cur
->add_waiter(CInode::WAIT_SINGLEAUTH
, cf
.build());
8633 if (!cur
->is_auth()) {
8634 dout(10) << "fw to auth for " << *cur
<< dendl
;
8635 request_forward(mdr
, cur
->authority().first
);
8641 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_hit
);
8642 dout(10) << "path_traverse finish on snapid " << snapid
<< dendl
;
8644 ceph_assert(mdr
->snapid
== snapid
);
8646 if (flags
& MDS_TRAVERSE_RDLOCK_SNAP
)
8647 mdr
->locking_state
|= MutationImpl::SNAP_LOCKED
;
8648 else if (flags
& MDS_TRAVERSE_RDLOCK_SNAP2
)
8649 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
8652 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
8657 CInode
*MDCache::cache_traverse(const filepath
& fp
)
8659 dout(10) << "cache_traverse " << fp
<< dendl
;
8663 in
= get_inode(fp
.get_ino());
8669 for (unsigned i
= 0; i
< fp
.depth(); i
++) {
8670 std::string_view dname
= fp
[i
];
8671 frag_t fg
= in
->pick_dirfrag(dname
);
8672 dout(20) << " " << i
<< " " << dname
<< " frag " << fg
<< " from " << *in
<< dendl
;
8673 CDir
*curdir
= in
->get_dirfrag(fg
);
8676 CDentry
*dn
= curdir
->lookup(dname
, CEPH_NOSNAP
);
8679 in
= dn
->get_linkage()->get_inode();
8683 dout(10) << " got " << *in
<< dendl
;
8689 * open_remote_dir -- open up a remote dirfrag
8691 * @param diri base inode
8692 * @param approxfg approximate fragment.
8693 * @param fin completion callback
8695 void MDCache::open_remote_dirfrag(CInode
*diri
, frag_t approxfg
, MDSContext
*fin
)
8697 dout(10) << "open_remote_dir on " << *diri
<< dendl
;
8698 ceph_assert(diri
->is_dir());
8699 ceph_assert(!diri
->is_auth());
8700 ceph_assert(diri
->get_dirfrag(approxfg
) == 0);
8702 discover_dir_frag(diri
, approxfg
, fin
);
8707 * get_dentry_inode - get or open inode
8709 * @param dn the dentry
8710 * @param mdr current request
8712 * will return inode for primary, or link up/open up remote link's inode as necessary.
8713 * If it's not available right now, puts mdr on wait list and returns null.
8715 CInode
*MDCache::get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
)
8717 CDentry::linkage_t
*dnl
;
8719 dnl
= dn
->get_projected_linkage();
8721 dnl
= dn
->get_linkage();
8723 ceph_assert(!dnl
->is_null());
8725 if (dnl
->is_primary())
8728 ceph_assert(dnl
->is_remote());
8729 CInode
*in
= get_inode(dnl
->get_remote_ino());
8731 dout(7) << "get_dentry_inode linking in remote in " << *in
<< dendl
;
8732 dn
->link_remote(dnl
, in
);
8735 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn
<< dendl
;
8736 open_remote_dentry(dn
, projected
, new C_MDS_RetryRequest(this, mdr
));
8741 struct C_MDC_OpenRemoteDentry
: public MDCacheContext
{
8744 MDSContext
*onfinish
;
8746 C_MDC_OpenRemoteDentry(MDCache
*m
, CDentry
*d
, inodeno_t i
, MDSContext
*f
, bool wx
) :
8747 MDCacheContext(m
), dn(d
), ino(i
), onfinish(f
), want_xlocked(wx
) {
8748 dn
->get(MDSCacheObject::PIN_PTRWAITER
);
8750 void finish(int r
) override
{
8751 mdcache
->_open_remote_dentry_finish(dn
, ino
, onfinish
, want_xlocked
, r
);
8752 dn
->put(MDSCacheObject::PIN_PTRWAITER
);
8756 void MDCache::open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
, bool want_xlocked
)
8758 dout(10) << "open_remote_dentry " << *dn
<< dendl
;
8759 CDentry::linkage_t
*dnl
= projected
? dn
->get_projected_linkage() : dn
->get_linkage();
8760 inodeno_t ino
= dnl
->get_remote_ino();
8761 int64_t pool
= dnl
->get_remote_d_type() == DT_DIR
? mds
->mdsmap
->get_metadata_pool() : -1;
8763 new C_MDC_OpenRemoteDentry(this, dn
, ino
, fin
, want_xlocked
), true, want_xlocked
); // backtrace
8766 void MDCache::_open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
8767 bool want_xlocked
, int r
)
8770 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8771 if (dnl
->is_remote() && dnl
->get_remote_ino() == ino
) {
8772 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn
<< dendl
;
8773 dn
->state_set(CDentry::STATE_BADREMOTEINO
);
8776 CDir
*dir
= dn
->get_dir();
8778 dir
->get_inode()->make_path_string(path
);
8780 path
+= dn
->get_name();
8783 bool fatal
= mds
->damage_table
.notify_remote_damaged(ino
, path
);
8786 ceph_abort(); // unreachable, damaged() respawns us
8792 fin
->complete(r
< 0 ? r
: 0);
8796 void MDCache::make_trace(vector
<CDentry
*>& trace
, CInode
*in
)
8798 // empty trace if we're a base inode
8802 CInode
*parent
= in
->get_parent_inode();
8803 ceph_assert(parent
);
8804 make_trace(trace
, parent
);
8806 CDentry
*dn
= in
->get_parent_dn();
8807 dout(15) << "make_trace adding " << *dn
<< dendl
;
8808 trace
.push_back(dn
);
8812 // -------------------------------------------------------------------------------
8813 // Open inode by inode number
8815 class C_IO_MDC_OpenInoBacktraceFetched
: public MDCacheIOContext
{
8819 C_IO_MDC_OpenInoBacktraceFetched(MDCache
*c
, inodeno_t i
) :
8820 MDCacheIOContext(c
), ino(i
) {}
8821 void finish(int r
) override
{
8822 mdcache
->_open_ino_backtrace_fetched(ino
, bl
, r
);
8824 void print(ostream
& out
) const override
{
8825 out
<< "openino_backtrace_fetch" << ino
<< ")";
8829 struct C_MDC_OpenInoTraverseDir
: public MDCacheContext
{
8831 cref_t
<MMDSOpenIno
> msg
;
8834 C_MDC_OpenInoTraverseDir(MDCache
*c
, inodeno_t i
, const cref_t
<MMDSOpenIno
> &m
, bool p
) :
8835 MDCacheContext(c
), ino(i
), msg(m
), parent(p
) {}
8836 void finish(int r
) override
{
8837 if (r
< 0 && !parent
)
8840 mdcache
->handle_open_ino(msg
, r
);
8843 auto& info
= mdcache
->opening_inodes
.at(ino
);
8844 mdcache
->_open_ino_traverse_dir(ino
, info
, r
);
8848 struct C_MDC_OpenInoParentOpened
: public MDCacheContext
{
8851 C_MDC_OpenInoParentOpened(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
8852 void finish(int r
) override
{
8853 mdcache
->_open_ino_parent_opened(ino
, r
);
8857 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
)
8859 dout(10) << "_open_ino_backtrace_fetched ino " << ino
<< " errno " << err
<< dendl
;
8861 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8863 CInode
*in
= get_inode(ino
);
8865 dout(10) << " found cached " << *in
<< dendl
;
8866 open_ino_finish(ino
, info
, in
->authority().first
);
8870 inode_backtrace_t backtrace
;
8873 decode(backtrace
, bl
);
8874 } catch (const buffer::error
&decode_exc
) {
8875 derr
<< "corrupt backtrace on ino x0" << std::hex
<< ino
8876 << std::dec
<< ": " << decode_exc
<< dendl
;
8877 open_ino_finish(ino
, info
, -EIO
);
8880 if (backtrace
.pool
!= info
.pool
&& backtrace
.pool
!= -1) {
8881 dout(10) << " old object in pool " << info
.pool
8882 << ", retrying pool " << backtrace
.pool
<< dendl
;
8883 info
.pool
= backtrace
.pool
;
8884 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8885 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8886 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8887 new C_OnFinisher(fin
, mds
->finisher
));
8890 } else if (err
== -ENOENT
) {
8891 int64_t meta_pool
= mds
->mdsmap
->get_metadata_pool();
8892 if (info
.pool
!= meta_pool
) {
8893 dout(10) << " no object in pool " << info
.pool
8894 << ", retrying pool " << meta_pool
<< dendl
;
8895 info
.pool
= meta_pool
;
8896 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8897 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8898 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8899 new C_OnFinisher(fin
, mds
->finisher
));
8902 err
= 0; // backtrace.ancestors.empty() is checked below
8906 if (backtrace
.ancestors
.empty()) {
8907 dout(10) << " got empty backtrace " << dendl
;
8909 } else if (!info
.ancestors
.empty()) {
8910 if (info
.ancestors
[0] == backtrace
.ancestors
[0]) {
8911 dout(10) << " got same parents " << info
.ancestors
[0] << " 2 times" << dendl
;
8919 dout(0) << " failed to open ino " << ino
<< " err " << err
<< "/" << info
.last_err
<< dendl
;
8921 err
= info
.last_err
;
8922 open_ino_finish(ino
, info
, err
);
8926 dout(10) << " got backtrace " << backtrace
<< dendl
;
8927 info
.ancestors
= backtrace
.ancestors
;
8929 _open_ino_traverse_dir(ino
, info
, 0);
8932 void MDCache::_open_ino_parent_opened(inodeno_t ino
, int ret
)
8934 dout(10) << "_open_ino_parent_opened ino " << ino
<< " ret " << ret
<< dendl
;
8936 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8938 CInode
*in
= get_inode(ino
);
8940 dout(10) << " found cached " << *in
<< dendl
;
8941 open_ino_finish(ino
, info
, in
->authority().first
);
8945 if (ret
== mds
->get_nodeid()) {
8946 _open_ino_traverse_dir(ino
, info
, 0);
8949 mds_rank_t checked_rank
= mds_rank_t(ret
);
8950 info
.check_peers
= true;
8951 info
.auth_hint
= checked_rank
;
8952 info
.checked
.erase(checked_rank
);
8954 do_open_ino(ino
, info
, ret
);
8958 void MDCache::_open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8960 dout(10) << __func__
<< ": ino " << ino
<< " ret " << ret
<< dendl
;
8962 CInode
*in
= get_inode(ino
);
8964 dout(10) << " found cached " << *in
<< dendl
;
8965 open_ino_finish(ino
, info
, in
->authority().first
);
8970 do_open_ino(ino
, info
, ret
);
8974 mds_rank_t hint
= info
.auth_hint
;
8975 ret
= open_ino_traverse_dir(ino
, NULL
, info
.ancestors
,
8976 info
.discover
, info
.want_xlocked
, &hint
);
8979 if (hint
!= mds
->get_nodeid())
8980 info
.auth_hint
= hint
;
8981 do_open_ino(ino
, info
, ret
);
8984 void MDCache::_open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, CDir
*dir
, bool parent
)
8986 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
8987 ceph_assert(dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()));
8988 dir
->fetch(new C_MDC_OpenInoTraverseDir(this, ino
, m
, parent
));
8990 mds
->logger
->inc(l_mds_openino_dir_fetch
);
8993 int MDCache::open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
8994 const vector
<inode_backpointer_t
>& ancestors
,
8995 bool discover
, bool want_xlocked
, mds_rank_t
*hint
)
8997 dout(10) << "open_ino_traverse_dir ino " << ino
<< " " << ancestors
<< dendl
;
8999 for (unsigned i
= 0; i
< ancestors
.size(); i
++) {
9000 const auto& ancestor
= ancestors
.at(i
);
9001 CInode
*diri
= get_inode(ancestor
.dirino
);
9004 if (discover
&& MDS_INO_IS_MDSDIR(ancestor
.dirino
)) {
9005 open_foreign_mdsdir(ancestor
.dirino
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9011 if (diri
->state_test(CInode::STATE_REJOINUNDEF
)) {
9012 CDir
*dir
= diri
->get_parent_dir();
9013 while (dir
->state_test(CDir::STATE_REJOINUNDEF
) &&
9014 dir
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
))
9015 dir
= dir
->get_inode()->get_parent_dir();
9016 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
9020 if (!diri
->is_dir()) {
9021 dout(10) << " " << *diri
<< " is not dir" << dendl
;
9027 const string
& name
= ancestor
.dname
;
9028 frag_t fg
= diri
->pick_dirfrag(name
);
9029 CDir
*dir
= diri
->get_dirfrag(fg
);
9031 if (diri
->is_auth()) {
9032 if (diri
->is_frozen()) {
9033 dout(10) << " " << *diri
<< " is frozen, waiting " << dendl
;
9034 diri
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9037 dir
= diri
->get_or_open_dirfrag(this, fg
);
9038 } else if (discover
) {
9039 open_remote_dirfrag(diri
, fg
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9044 inodeno_t next_ino
= i
> 0 ? ancestors
.at(i
-1).dirino
: ino
;
9045 CDentry
*dn
= dir
->lookup(name
);
9046 CDentry::linkage_t
*dnl
= dn
? dn
->get_linkage() : NULL
;
9047 if (dir
->is_auth()) {
9048 if (dnl
&& dnl
->is_primary() &&
9049 dnl
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
9050 dout(10) << " fetching undef " << *dnl
->get_inode() << dendl
;
9051 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
9055 if (!dnl
&& !dir
->is_complete() &&
9056 (!dir
->has_bloom() || dir
->is_in_bloom(name
))) {
9057 dout(10) << " fetching incomplete " << *dir
<< dendl
;
9058 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
9062 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
9065 } else if (discover
) {
9067 filepath
path(name
, 0);
9068 discover_path(dir
, CEPH_NOSNAP
, path
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0),
9069 (i
== 0 && want_xlocked
));
9072 if (dnl
->is_null() && !dn
->lock
.can_read(-1)) {
9073 dout(10) << " null " << *dn
<< " is not readable, waiting" << dendl
;
9074 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9077 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
9083 *hint
= dir
? dir
->authority().first
: diri
->authority().first
;
9089 void MDCache::open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
9091 dout(10) << "open_ino_finish ino " << ino
<< " ret " << ret
<< dendl
;
9093 MDSContext::vec waiters
;
9094 waiters
.swap(info
.waiters
);
9095 opening_inodes
.erase(ino
);
9096 finish_contexts(g_ceph_context
, waiters
, ret
);
9099 void MDCache::do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
)
9101 if (err
< 0 && err
!= -EAGAIN
) {
9102 info
.checked
.clear();
9103 info
.checking
= MDS_RANK_NONE
;
9104 info
.check_peers
= true;
9105 info
.fetch_backtrace
= true;
9106 if (info
.discover
) {
9107 info
.discover
= false;
9108 info
.ancestors
.clear();
9110 if (err
!= -ENOENT
&& err
!= -ENOTDIR
)
9111 info
.last_err
= err
;
9114 if (info
.check_peers
|| info
.discover
) {
9115 if (info
.discover
) {
9116 // got backtrace from peer, but failed to find inode. re-check peers
9117 info
.discover
= false;
9118 info
.ancestors
.clear();
9119 info
.checked
.clear();
9121 info
.check_peers
= false;
9122 info
.checking
= MDS_RANK_NONE
;
9123 do_open_ino_peer(ino
, info
);
9124 } else if (info
.fetch_backtrace
) {
9125 info
.check_peers
= true;
9126 info
.fetch_backtrace
= false;
9127 info
.checking
= mds
->get_nodeid();
9128 info
.checked
.clear();
9129 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
9130 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
9131 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
9132 new C_OnFinisher(fin
, mds
->finisher
));
9134 ceph_assert(!info
.ancestors
.empty());
9135 info
.checking
= mds
->get_nodeid();
9136 open_ino(info
.ancestors
[0].dirino
, mds
->mdsmap
->get_metadata_pool(),
9137 new C_MDC_OpenInoParentOpened(this, ino
), info
.want_replica
);
9141 void MDCache::do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
)
9143 set
<mds_rank_t
> all
, active
;
9144 mds
->mdsmap
->get_mds_set(all
);
9145 if (mds
->get_state() == MDSMap::STATE_REJOIN
)
9146 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_REJOIN
);
9148 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9150 dout(10) << "do_open_ino_peer " << ino
<< " active " << active
9151 << " all " << all
<< " checked " << info
.checked
<< dendl
;
9153 mds_rank_t whoami
= mds
->get_nodeid();
9154 mds_rank_t peer
= MDS_RANK_NONE
;
9155 if (info
.auth_hint
>= 0 && info
.auth_hint
!= whoami
) {
9156 if (active
.count(info
.auth_hint
)) {
9157 peer
= info
.auth_hint
;
9158 info
.auth_hint
= MDS_RANK_NONE
;
9161 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9162 if (*p
!= whoami
&& info
.checked
.count(*p
) == 0) {
9169 if (all
!= info
.checked
) {
9170 dout(10) << " waiting for more peers to be active" << dendl
;
9172 dout(10) << " all MDS peers have been checked " << dendl
;
9173 do_open_ino(ino
, info
, 0);
9176 info
.checking
= peer
;
9177 vector
<inode_backpointer_t
> *pa
= NULL
;
9178 // got backtrace from peer or backtrace just fetched
9179 if (info
.discover
|| !info
.fetch_backtrace
)
9180 pa
= &info
.ancestors
;
9181 mds
->send_message_mds(make_message
<MMDSOpenIno
>(info
.tid
, ino
, pa
), peer
);
9183 mds
->logger
->inc(l_mds_openino_peer_discover
);
9187 void MDCache::handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
)
9189 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
9190 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
9194 dout(10) << "handle_open_ino " << *m
<< " err " << err
<< dendl
;
9196 auto from
= mds_rank_t(m
->get_source().num());
9197 inodeno_t ino
= m
->ino
;
9198 ref_t
<MMDSOpenInoReply
> reply
;
9199 CInode
*in
= get_inode(ino
);
9201 dout(10) << " have " << *in
<< dendl
;
9202 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, mds_rank_t(0));
9203 if (in
->is_auth()) {
9206 CDentry
*pdn
= in
->get_parent_dn();
9209 CInode
*diri
= pdn
->get_dir()->get_inode();
9210 reply
->ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(),
9211 in
->inode
.version
));
9215 reply
->hint
= in
->authority().first
;
9217 } else if (err
< 0) {
9218 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, MDS_RANK_NONE
, err
);
9220 mds_rank_t hint
= MDS_RANK_NONE
;
9221 int ret
= open_ino_traverse_dir(ino
, m
, m
->ancestors
, false, false, &hint
);
9224 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, hint
, ret
);
9226 mds
->send_message_mds(reply
, from
);
9229 void MDCache::handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
)
9231 dout(10) << "handle_open_ino_reply " << *m
<< dendl
;
9233 inodeno_t ino
= m
->ino
;
9234 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9235 auto it
= opening_inodes
.find(ino
);
9236 if (it
!= opening_inodes
.end() && it
->second
.checking
== from
) {
9237 open_ino_info_t
& info
= it
->second
;
9238 info
.checking
= MDS_RANK_NONE
;
9239 info
.checked
.insert(from
);
9241 CInode
*in
= get_inode(ino
);
9243 dout(10) << " found cached " << *in
<< dendl
;
9244 open_ino_finish(ino
, info
, in
->authority().first
);
9245 } else if (!m
->ancestors
.empty()) {
9246 dout(10) << " found ino " << ino
<< " on mds." << from
<< dendl
;
9247 if (!info
.want_replica
) {
9248 open_ino_finish(ino
, info
, from
);
9252 info
.ancestors
= m
->ancestors
;
9253 info
.auth_hint
= from
;
9254 info
.checking
= mds
->get_nodeid();
9255 info
.discover
= true;
9256 _open_ino_traverse_dir(ino
, info
, 0);
9257 } else if (m
->error
) {
9258 dout(10) << " error " << m
->error
<< " from mds." << from
<< dendl
;
9259 do_open_ino(ino
, info
, m
->error
);
9261 if (m
->hint
>= 0 && m
->hint
!= mds
->get_nodeid()) {
9262 info
.auth_hint
= m
->hint
;
9263 info
.checked
.erase(m
->hint
);
9265 do_open_ino_peer(ino
, info
);
9270 void MDCache::kick_open_ino_peers(mds_rank_t who
)
9272 dout(10) << "kick_open_ino_peers mds." << who
<< dendl
;
9274 for (map
<inodeno_t
, open_ino_info_t
>::iterator p
= opening_inodes
.begin();
9275 p
!= opening_inodes
.end();
9277 open_ino_info_t
& info
= p
->second
;
9278 if (info
.checking
== who
) {
9279 dout(10) << " kicking ino " << p
->first
<< " who was checking mds." << who
<< dendl
;
9280 info
.checking
= MDS_RANK_NONE
;
9281 do_open_ino_peer(p
->first
, info
);
9282 } else if (info
.checking
== MDS_RANK_NONE
) {
9283 dout(10) << " kicking ino " << p
->first
<< " who was waiting" << dendl
;
9284 do_open_ino_peer(p
->first
, info
);
9289 void MDCache::open_ino(inodeno_t ino
, int64_t pool
, MDSContext
* fin
,
9290 bool want_replica
, bool want_xlocked
)
9292 dout(10) << "open_ino " << ino
<< " pool " << pool
<< " want_replica "
9293 << want_replica
<< dendl
;
9295 auto it
= opening_inodes
.find(ino
);
9296 if (it
!= opening_inodes
.end()) {
9297 open_ino_info_t
& info
= it
->second
;
9299 info
.want_replica
= true;
9300 if (want_xlocked
&& !info
.want_xlocked
) {
9301 if (!info
.ancestors
.empty()) {
9302 CInode
*diri
= get_inode(info
.ancestors
[0].dirino
);
9304 frag_t fg
= diri
->pick_dirfrag(info
.ancestors
[0].dname
);
9305 CDir
*dir
= diri
->get_dirfrag(fg
);
9306 if (dir
&& !dir
->is_auth()) {
9307 filepath
path(info
.ancestors
[0].dname
, 0);
9308 discover_path(dir
, CEPH_NOSNAP
, path
, NULL
, true);
9312 info
.want_xlocked
= true;
9315 info
.waiters
.push_back(fin
);
9317 open_ino_info_t
& info
= opening_inodes
[ino
];
9318 info
.want_replica
= want_replica
;
9319 info
.want_xlocked
= want_xlocked
;
9320 info
.tid
= ++open_ino_last_tid
;
9321 info
.pool
= pool
>= 0 ? pool
: default_file_layout
.pool_id
;
9322 info
.waiters
.push_back(fin
);
9323 if (mds
->is_rejoin() &&
9324 open_file_table
.get_ancestors(ino
, info
.ancestors
, info
.auth_hint
)) {
9325 info
.fetch_backtrace
= false;
9326 info
.checking
= mds
->get_nodeid();
9327 _open_ino_traverse_dir(ino
, info
, 0);
9329 do_open_ino(ino
, info
, 0);
9334 /* ---------------------------- */
9337 * search for a given inode on MDS peers. optionally start with the given node.
9341 - recover from mds node failure, recovery
9345 void MDCache::find_ino_peers(inodeno_t ino
, MDSContext
*c
,
9346 mds_rank_t hint
, bool path_locked
)
9348 dout(5) << "find_ino_peers " << ino
<< " hint " << hint
<< dendl
;
9349 CInode
*in
= get_inode(ino
);
9350 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
9351 c
->complete(-ESTALE
);
9356 ceph_tid_t tid
= ++find_ino_peer_last_tid
;
9357 find_ino_peer_info_t
& fip
= find_ino_peer
[tid
];
9361 fip
.path_locked
= path_locked
;
9363 _do_find_ino_peer(fip
);
9366 void MDCache::_do_find_ino_peer(find_ino_peer_info_t
& fip
)
9368 set
<mds_rank_t
> all
, active
;
9369 mds
->mdsmap
->get_mds_set(all
);
9370 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9372 dout(10) << "_do_find_ino_peer " << fip
.tid
<< " " << fip
.ino
9373 << " active " << active
<< " all " << all
9374 << " checked " << fip
.checked
9377 mds_rank_t m
= MDS_RANK_NONE
;
9378 if (fip
.hint
>= 0) {
9380 fip
.hint
= MDS_RANK_NONE
;
9382 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9383 if (*p
!= mds
->get_nodeid() &&
9384 fip
.checked
.count(*p
) == 0) {
9389 if (m
== MDS_RANK_NONE
) {
9390 all
.erase(mds
->get_nodeid());
9391 if (all
!= fip
.checked
) {
9392 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl
;
9394 dout(10) << "_do_find_ino_peer failed on " << fip
.ino
<< dendl
;
9395 fip
.fin
->complete(-ESTALE
);
9396 find_ino_peer
.erase(fip
.tid
);
9400 mds
->send_message_mds(make_message
<MMDSFindIno
>(fip
.tid
, fip
.ino
), m
);
9404 void MDCache::handle_find_ino(const cref_t
<MMDSFindIno
> &m
)
9406 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
9410 dout(10) << "handle_find_ino " << *m
<< dendl
;
9411 auto r
= make_message
<MMDSFindInoReply
>(m
->tid
);
9412 CInode
*in
= get_inode(m
->ino
);
9414 in
->make_path(r
->path
);
9415 dout(10) << " have " << r
->path
<< " " << *in
<< dendl
;
9417 mds
->send_message_mds(r
, mds_rank_t(m
->get_source().num()));
9421 void MDCache::handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
)
9423 auto p
= find_ino_peer
.find(m
->tid
);
9424 if (p
!= find_ino_peer
.end()) {
9425 dout(10) << "handle_find_ino_reply " << *m
<< dendl
;
9426 find_ino_peer_info_t
& fip
= p
->second
;
9429 if (get_inode(fip
.ino
)) {
9430 dout(10) << "handle_find_ino_reply successfully found " << fip
.ino
<< dendl
;
9431 mds
->queue_waiter(fip
.fin
);
9432 find_ino_peer
.erase(p
);
9436 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9437 if (fip
.checking
== from
)
9438 fip
.checking
= MDS_RANK_NONE
;
9439 fip
.checked
.insert(from
);
9441 if (!m
->path
.empty()) {
9443 vector
<CDentry
*> trace
;
9444 CF_MDS_RetryMessageFactory
cf(mds
, m
);
9445 MDRequestRef null_ref
;
9446 int flags
= MDS_TRAVERSE_DISCOVER
;
9447 if (fip
.path_locked
)
9448 flags
|= MDS_TRAVERSE_PATH_LOCKED
;
9449 int r
= path_traverse(null_ref
, cf
, m
->path
, flags
, &trace
);
9452 dout(0) << "handle_find_ino_reply failed with " << r
<< " on " << m
->path
9453 << ", retrying" << dendl
;
9454 fip
.checked
.clear();
9455 _do_find_ino_peer(fip
);
9458 _do_find_ino_peer(fip
);
9461 dout(10) << "handle_find_ino_reply tid " << m
->tid
<< " dne" << dendl
;
9465 void MDCache::kick_find_ino_peers(mds_rank_t who
)
9467 // find_ino_peers requests we should move on from
9468 for (map
<ceph_tid_t
,find_ino_peer_info_t
>::iterator p
= find_ino_peer
.begin();
9469 p
!= find_ino_peer
.end();
9471 find_ino_peer_info_t
& fip
= p
->second
;
9472 if (fip
.checking
== who
) {
9473 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was checking mds." << who
<< dendl
;
9474 fip
.checking
= MDS_RANK_NONE
;
9475 _do_find_ino_peer(fip
);
9476 } else if (fip
.checking
== MDS_RANK_NONE
) {
9477 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was waiting" << dendl
;
9478 _do_find_ino_peer(fip
);
9483 /* ---------------------------- */
9485 int MDCache::get_num_client_requests()
9488 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
9489 p
!= active_requests
.end();
9491 MDRequestRef
& mdr
= p
->second
;
9492 if (mdr
->reqid
.name
.is_client() && !mdr
->is_slave())
9498 MDRequestRef
MDCache::request_start(const cref_t
<MClientRequest
>& req
)
9500 // did we win a forward race against a slave?
9501 if (active_requests
.count(req
->get_reqid())) {
9502 MDRequestRef
& mdr
= active_requests
[req
->get_reqid()];
9504 if (mdr
->is_slave()) {
9505 dout(10) << "request_start already had " << *mdr
<< ", waiting for finish" << dendl
;
9506 mdr
->more()->waiting_for_finish
.push_back(new C_MDS_RetryMessage(mds
, req
));
9508 dout(10) << "request_start already processing " << *mdr
<< ", dropping new msg" << dendl
;
9510 return MDRequestRef();
9513 // register new client request
9514 MDRequestImpl::Params params
;
9515 params
.reqid
= req
->get_reqid();
9516 params
.attempt
= req
->get_num_fwd();
9517 params
.client_req
= req
;
9518 params
.initiated
= req
->get_recv_stamp();
9519 params
.throttled
= req
->get_throttle_stamp();
9520 params
.all_read
= req
->get_recv_complete_stamp();
9521 params
.dispatched
= req
->get_dispatch_stamp();
9524 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9525 active_requests
[params
.reqid
] = mdr
;
9526 mdr
->set_op_stamp(req
->get_stamp());
9527 dout(7) << "request_start " << *mdr
<< dendl
;
9531 MDRequestRef
MDCache::request_start_slave(metareqid_t ri
, __u32 attempt
, const cref_t
<Message
> &m
)
9533 int by
= m
->get_source().num();
9534 MDRequestImpl::Params params
;
9536 params
.attempt
= attempt
;
9537 params
.triggering_slave_req
= m
;
9538 params
.slave_to
= by
;
9539 params
.initiated
= m
->get_recv_stamp();
9540 params
.throttled
= m
->get_throttle_stamp();
9541 params
.all_read
= m
->get_recv_complete_stamp();
9542 params
.dispatched
= m
->get_dispatch_stamp();
9544 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9545 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9546 active_requests
[mdr
->reqid
] = mdr
;
9547 dout(7) << "request_start_slave " << *mdr
<< " by mds." << by
<< dendl
;
9551 MDRequestRef
MDCache::request_start_internal(int op
)
9553 utime_t now
= ceph_clock_now();
9554 MDRequestImpl::Params params
;
9555 params
.reqid
.name
= entity_name_t::MDS(mds
->get_nodeid());
9556 params
.reqid
.tid
= mds
->issue_tid();
9557 params
.initiated
= now
;
9558 params
.throttled
= now
;
9559 params
.all_read
= now
;
9560 params
.dispatched
= now
;
9561 params
.internal_op
= op
;
9563 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9565 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9566 active_requests
[mdr
->reqid
] = mdr
;
9567 dout(7) << "request_start_internal " << *mdr
<< " op " << op
<< dendl
;
9571 MDRequestRef
MDCache::request_get(metareqid_t rid
)
9573 ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.find(rid
);
9574 ceph_assert(p
!= active_requests
.end());
9575 dout(7) << "request_get " << rid
<< " " << *p
->second
<< dendl
;
9579 void MDCache::request_finish(MDRequestRef
& mdr
)
9581 dout(7) << "request_finish " << *mdr
<< dendl
;
9582 mdr
->mark_event("finishing request");
9585 if (mdr
->has_more() && mdr
->more()->slave_commit
) {
9586 Context
*fin
= mdr
->more()->slave_commit
;
9587 mdr
->more()->slave_commit
= 0;
9590 mdr
->aborted
= false;
9592 mdr
->more()->slave_rolling_back
= true;
9595 mdr
->committing
= true;
9597 fin
->complete(ret
); // this must re-call request_finish.
9601 switch(mdr
->internal_op
) {
9602 case CEPH_MDS_OP_FRAGMENTDIR
:
9603 logger
->inc(l_mdss_ireq_fragmentdir
);
9605 case CEPH_MDS_OP_EXPORTDIR
:
9606 logger
->inc(l_mdss_ireq_exportdir
);
9608 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9609 logger
->inc(l_mdss_ireq_enqueue_scrub
);
9611 case CEPH_MDS_OP_FLUSH
:
9612 logger
->inc(l_mdss_ireq_flush
);
9614 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9615 logger
->inc(l_mdss_ireq_fragstats
);
9617 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9618 logger
->inc(l_mdss_ireq_inodestats
);
9622 request_cleanup(mdr
);
9626 void MDCache::request_forward(MDRequestRef
& mdr
, mds_rank_t who
, int port
)
9628 mdr
->mark_event("forwarding request");
9629 if (mdr
->client_request
&& mdr
->client_request
->get_source().is_client()) {
9630 dout(7) << "request_forward " << *mdr
<< " to mds." << who
<< " req "
9631 << *mdr
->client_request
<< dendl
;
9632 if (mdr
->is_batch_head
) {
9633 int mask
= mdr
->client_request
->head
.args
.getattr
.mask
;
9635 switch (mdr
->client_request
->get_op()) {
9636 case CEPH_MDS_OP_GETATTR
:
9638 CInode
* in
= mdr
->in
[0];
9640 auto it
= in
->batch_ops
.find(mask
);
9641 if (it
!= in
->batch_ops
.end()) {
9642 it
->second
->forward(who
);
9643 in
->batch_ops
.erase(it
);
9648 case CEPH_MDS_OP_LOOKUP
:
9650 if (mdr
->dn
[0].size()) {
9651 CDentry
* dn
= mdr
->dn
[0].back();
9652 auto it
= dn
->batch_ops
.find(mask
);
9653 if (it
!= dn
->batch_ops
.end()) {
9654 it
->second
->forward(who
);
9655 dn
->batch_ops
.erase(it
);
9664 mds
->forward_message_mds(mdr
->release_client_request(), who
);
9666 if (mds
->logger
) mds
->logger
->inc(l_mds_forward
);
9667 } else if (mdr
->internal_op
>= 0) {
9668 dout(10) << "request_forward on internal op; cancelling" << dendl
;
9669 mdr
->internal_op_finish
->complete(-EXDEV
);
9671 dout(7) << "request_forward drop " << *mdr
<< " req " << *mdr
->client_request
9672 << " was from mds" << dendl
;
9674 request_cleanup(mdr
);
9678 void MDCache::dispatch_request(MDRequestRef
& mdr
)
9680 if (mdr
->client_request
) {
9681 mds
->server
->dispatch_client_request(mdr
);
9682 } else if (mdr
->slave_request
) {
9683 mds
->server
->dispatch_slave_request(mdr
);
9685 switch (mdr
->internal_op
) {
9686 case CEPH_MDS_OP_FRAGMENTDIR
:
9687 dispatch_fragment_dir(mdr
);
9689 case CEPH_MDS_OP_EXPORTDIR
:
9690 migrator
->dispatch_export_dir(mdr
, 0);
9692 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9693 enqueue_scrub_work(mdr
);
9695 case CEPH_MDS_OP_FLUSH
:
9696 flush_dentry_work(mdr
);
9698 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9699 repair_dirfrag_stats_work(mdr
);
9701 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9702 repair_inode_stats_work(mdr
);
9704 case CEPH_MDS_OP_UPGRADE_SNAPREALM
:
9705 upgrade_inode_snaprealm_work(mdr
);
9714 void MDCache::request_drop_foreign_locks(MDRequestRef
& mdr
)
9716 if (!mdr
->has_more())
9720 // (will implicitly drop remote dn pins)
9721 for (set
<mds_rank_t
>::iterator p
= mdr
->more()->slaves
.begin();
9722 p
!= mdr
->more()->slaves
.end();
9724 auto r
= make_message
<MMDSSlaveRequest
>(mdr
->reqid
, mdr
->attempt
,
9725 MMDSSlaveRequest::OP_FINISH
);
9727 if (mdr
->killed
&& !mdr
->committing
) {
9729 } else if (mdr
->more()->srcdn_auth_mds
== *p
&&
9730 mdr
->more()->inode_import
.length() > 0) {
9731 // information about rename imported caps
9732 r
->inode_export
.claim(mdr
->more()->inode_import
);
9735 mds
->send_message_mds(r
, *p
);
9738 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9739 * implicitly. Note that we don't call the finishers -- there shouldn't
9740 * be any on a remote lock and the request finish wakes up all
9741 * the waiters anyway! */
9743 for (auto it
= mdr
->locks
.begin(); it
!= mdr
->locks
.end(); ) {
9744 SimpleLock
*lock
= it
->lock
;
9745 if (it
->is_xlock() && !lock
->get_parent()->is_auth()) {
9746 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9747 << " on " << lock
->get_parent() << dendl
;
9749 mdr
->locks
.erase(it
++);
9750 } else if (it
->is_remote_wrlock()) {
9751 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9752 << " on mds." << it
->wrlock_target
<< " on " << *lock
->get_parent() << dendl
;
9753 if (it
->is_wrlock()) {
9754 it
->clear_remote_wrlock();
9757 mdr
->locks
.erase(it
++);
9764 mdr
->more()->slaves
.clear(); /* we no longer have requests out to them, and
9765 * leaving them in can cause double-notifies as
9766 * this function can get called more than once */
9769 void MDCache::request_drop_non_rdlocks(MDRequestRef
& mdr
)
9771 request_drop_foreign_locks(mdr
);
9772 mds
->locker
->drop_non_rdlocks(mdr
.get());
9775 void MDCache::request_drop_locks(MDRequestRef
& mdr
)
9777 request_drop_foreign_locks(mdr
);
9778 mds
->locker
->drop_locks(mdr
.get());
9781 void MDCache::request_cleanup(MDRequestRef
& mdr
)
9783 dout(15) << "request_cleanup " << *mdr
<< dendl
;
9785 if (mdr
->has_more()) {
9786 if (mdr
->more()->is_ambiguous_auth
)
9787 mdr
->clear_ambiguous_auth();
9788 if (!mdr
->more()->waiting_for_finish
.empty())
9789 mds
->queue_waiters(mdr
->more()->waiting_for_finish
);
9792 request_drop_locks(mdr
);
9794 // drop (local) auth pins
9795 mdr
->drop_local_auth_pins();
9798 mdr
->put_stickydirs();
9800 mds
->locker
->kick_cap_releases(mdr
);
9805 // remove from session
9806 mdr
->item_session_request
.remove_myself();
9809 active_requests
.erase(mdr
->reqid
);
9814 mdr
->mark_event("cleaned up request");
9817 void MDCache::request_kill(MDRequestRef
& mdr
)
9819 // rollback slave requests is tricky. just let the request proceed.
9820 if (mdr
->has_more() &&
9821 (!mdr
->more()->witnessed
.empty() || !mdr
->more()->waiting_on_slave
.empty())) {
9822 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
9823 ceph_assert(mdr
->more()->witnessed
.empty());
9824 mdr
->aborted
= true;
9825 dout(10) << "request_kill " << *mdr
<< " -- waiting for slave reply, delaying" << dendl
;
9827 dout(10) << "request_kill " << *mdr
<< " -- already started slave prep, no-op" << dendl
;
9830 ceph_assert(mdr
->used_prealloc_ino
== 0);
9831 ceph_assert(mdr
->prealloc_inos
.empty());
9833 mdr
->session
= NULL
;
9834 mdr
->item_session_request
.remove_myself();
9839 mdr
->mark_event("killing request");
9841 if (mdr
->committing
) {
9842 dout(10) << "request_kill " << *mdr
<< " -- already committing, no-op" << dendl
;
9844 dout(10) << "request_kill " << *mdr
<< dendl
;
9845 request_cleanup(mdr
);
9849 // -------------------------------------------------------------------------------
9852 void MDCache::create_global_snaprealm()
9854 CInode
*in
= new CInode(this); // dummy inode
9855 create_unlinked_system_inode(in
, MDS_INO_GLOBAL_SNAPREALM
, S_IFDIR
|0755);
9857 global_snaprealm
= in
->snaprealm
;
// do_realm_invalidate_and_update_notify: after a snaprealm operation (snapop) on
// inode `in`, walk the realm and its open children, invalidating cached snap sets,
// and (when notify_clients) build one MClientSnap per client holding caps so that
// clients learn of the split/update. For UPDATE/DESTROY, past children (and their
// descendants) are also invalidated; for DESTROY, stray inodes under past children
// are re-evaluated.
// NOTE(review): extraction-mangled chunk — statements split across lines and several
// original lines dropped (numbering jumps, e.g. loop bodies/braces). Code kept
// byte-identical; comments only.
9860 void MDCache::do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
)
9862 dout(10) << "do_realm_invalidate_and_update_notify " << *in
->snaprealm
<< " " << *in
<< dendl
;
// Inos with caps under this realm (split targets) and child realm inos.
9864 vector
<inodeno_t
> split_inos
;
9865 vector
<inodeno_t
> split_realms
;
9867 if (notify_clients
) {
9868 ceph_assert(in
->snaprealm
->have_past_parents_open());
9869 if (snapop
== CEPH_SNAP_OP_SPLIT
) {
9870 // notify clients of update|split
9871 for (elist
<CInode
*>::iterator p
= in
->snaprealm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
9873 split_inos
.push_back((*p
)->ino());
9875 for (set
<SnapRealm
*>::iterator p
= in
->snaprealm
->open_children
.begin();
9876 p
!= in
->snaprealm
->open_children
.end();
9878 split_realms
.push_back((*p
)->inode
->ino());
// BFS over this realm and its open children; collect per-client snap updates.
9882 set
<SnapRealm
*> past_children
;
9883 map
<client_t
, ref_t
<MClientSnap
>> updates
;
9885 q
.push_back(in
->snaprealm
);
9886 while (!q
.empty()) {
9887 SnapRealm
*realm
= q
.front();
9890 dout(10) << " realm " << *realm
<< " on " << *realm
->inode
<< dendl
;
// Drop the realm's cached snap set; it will be recomputed lazily.
9891 realm
->invalidate_cached_snaps();
9893 if (notify_clients
) {
9894 for (const auto& p
: realm
->client_caps
) {
9895 const auto& client
= p
.first
;
9896 const auto& caps
= p
.second
;
9897 ceph_assert(!caps
->empty());
// One MClientSnap per client; emplace avoids overwriting an existing entry.
9899 auto em
= updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple());
9901 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
9902 update
->head
.split
= in
->ino();
9903 update
->split_inos
= split_inos
;
9904 update
->split_realms
= split_realms
;
9905 update
->bl
= in
->snaprealm
->get_snap_trace();
9906 em
.first
->second
= std::move(update
);
// For update/destroy of old snapshots, remember past children to handle below.
9911 if (snapop
== CEPH_SNAP_OP_UPDATE
|| snapop
== CEPH_SNAP_OP_DESTROY
) {
9912 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9913 p
!= realm
->open_past_children
.end();
9915 past_children
.insert(*p
);
9918 // notify for active children, too.
9919 dout(10) << " " << realm
<< " open_children are " << realm
->open_children
<< dendl
;
9920 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9921 p
!= realm
->open_children
.end();
9927 send_snaps(updates
);
9929 // notify past children and their descendants if we update/delete old snapshots
9930 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9931 p
!= past_children
.end();
9935 while (!q
.empty()) {
9936 SnapRealm
*realm
= q
.front();
9939 realm
->invalidate_cached_snaps();
// Enqueue open children not already tracked as past children.
9941 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9942 p
!= realm
->open_children
.end();
9944 if (past_children
.count(*p
) == 0)
9948 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9949 p
!= realm
->open_past_children
.end();
9951 if (past_children
.count(*p
) == 0) {
9953 past_children
.insert(*p
);
9958 if (snapop
== CEPH_SNAP_OP_DESTROY
) {
9959 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9960 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9961 p
!= past_children
.end();
9963 maybe_eval_stray((*p
)->inode
, true);
// send_snap_update: as the auth MDS for `in`, broadcast the inode's encoded snap
// state (MMDSSnapUpdate with table stid and op) to all MDSs at/above RESOLVE and to
// all replicas, excluding ourselves; then refresh the global snaprealm notification.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
9967 void MDCache::send_snap_update(CInode
*in
, version_t stid
, int snap_op
)
9969 dout(10) << __func__
<< " " << *in
<< " stid " << stid
<< dendl
;
9970 ceph_assert(in
->is_auth());
// Recipients: every rank at state >= RESOLVE plus any rank replicating this inode.
9972 set
<mds_rank_t
> mds_set
;
9974 mds
->mdsmap
->get_mds_set_lower_bound(mds_set
, MDSMap::STATE_RESOLVE
);
9975 mds_set
.erase(mds
->get_nodeid());
9977 in
->list_replicas(mds_set
);
9980 if (!mds_set
.empty()) {
// Encode the snap state once, reuse the blob for every recipient.
9981 bufferlist snap_blob
;
9982 in
->encode_snap(snap_blob
);
9984 for (auto p
: mds_set
) {
9985 auto m
= make_message
<MMDSSnapUpdate
>(in
->ino(), stid
, snap_op
);
9986 m
->snap_blob
= snap_blob
;
9987 mds
->send_message_mds(m
, p
);
9992 notify_global_snaprealm_update(snap_op
);
// handle_snap_update: replica-side handler for MMDSSnapUpdate. Ignores/defers while
// too early in startup, acks the snap-table tid, decodes the auth's snap blob into
// the local replica inode (when past rejoin for it), and triggers realm invalidation
// and client notification. During rejoin, inodes are queued in
// rejoin_pending_snaprealms instead of notifying clients immediately.
// NOTE(review): extraction-mangled chunk (dropped lines, e.g. early-return bodies);
// code kept byte-identical, comments only.
9995 void MDCache::handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
)
9997 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9998 dout(10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
// Too early in boot to process snap updates.
10000 if (mds
->get_state() < MDSMap::STATE_RESOLVE
&&
10001 mds
->get_want_state() != CEPH_MDS_STATE_RESOLVE
) {
10005 // null rejoin_done means open_snaprealms() has already been called
10006 bool notify_clients
= mds
->get_state() > MDSMap::STATE_REJOIN
||
10007 (mds
->is_rejoin() && !rejoin_done
);
10009 if (m
->get_tid() > 0) {
// Acknowledge the snaptable transaction this update corresponds to.
10010 mds
->snapclient
->notify_commit(m
->get_tid());
10011 if (notify_clients
)
10012 notify_global_snaprealm_update(m
->get_snap_op());
10015 CInode
*in
= get_inode(m
->get_ino());
10017 ceph_assert(!in
->is_auth());
10018 if (mds
->get_state() > MDSMap::STATE_REJOIN
||
10019 (mds
->is_rejoin() && !in
->is_rejoining())) {
// Apply the auth MDS's encoded snap state to our replica.
10020 auto p
= m
->snap_blob
.cbegin();
10021 in
->decode_snap(p
);
10023 if (!notify_clients
) {
// Still rejoining: defer client notification; pin until snap parents open.
10024 if (!rejoin_pending_snaprealms
.count(in
)) {
10025 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
10026 rejoin_pending_snaprealms
.insert(in
);
10029 do_realm_invalidate_and_update_notify(in
, m
->get_snap_op(), notify_clients
);
// notify_global_snaprealm_update: push the global snaprealm's snap trace to every
// open/stale client session via MClientSnap. Any op other than DESTROY is collapsed
// to UPDATE before sending.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10034 void MDCache::notify_global_snaprealm_update(int snap_op
)
10036 if (snap_op
!= CEPH_SNAP_OP_DESTROY
)
10037 snap_op
= CEPH_SNAP_OP_UPDATE
;
10038 set
<Session
*> sessions
;
10039 mds
->sessionmap
.get_client_session_set(sessions
);
10040 for (auto &session
: sessions
) {
// Skip sessions that are neither open nor stale (continue body dropped by extraction).
10041 if (!session
->is_open() && !session
->is_stale())
10043 auto update
= make_message
<MClientSnap
>(snap_op
);
10044 update
->head
.split
= global_snaprealm
->inode
->ino();
10045 update
->bl
= global_snaprealm
->get_snap_trace();
10046 mds
->send_message_client_counted(update
, session
);
10050 // -------------------------------------------------------------------------------
// C_MDC_RetryScanStray: completion context that resumes scan_stray_dir() from the
// dirfrag `next` once an async fetch finishes.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10053 struct C_MDC_RetryScanStray
: public MDCacheContext
{
10055 C_MDC_RetryScanStray(MDCache
*c
, dirfrag_t n
) : MDCacheContext(c
), next(n
) { }
10056 void finish(int r
) override
{
10057 mdcache
->scan_stray_dir(next
);
// scan_stray_dir: walk the stray directories starting from dirfrag `next`, marking
// every dentry STATE_STRAY, flagging nlink==0 primary inodes as ORPHAN, and queueing
// them for stray evaluation. Incomplete dirfrags are fetched asynchronously and the
// scan resumes via C_MDC_RetryScanStray.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10061 void MDCache::scan_stray_dir(dirfrag_t next
)
10063 dout(10) << "scan_stray_dir " << next
<< dendl
;
10065 std::vector
<CDir
*> ls
;
10066 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
// Skip stray dirs already processed (before the resume point).
10067 if (strays
[i
]->ino() < next
.ino
)
10069 strays
[i
]->get_dirfrags(ls
);
10072 for (const auto& dir
: ls
) {
10073 if (dir
->dirfrag() < next
)
10075 if (!dir
->is_complete()) {
// Fetch, then restart from this dirfrag.
10076 dir
->fetch(new C_MDC_RetryScanStray(this, dir
->dirfrag()));
10079 for (auto &p
: dir
->items
) {
10080 CDentry
*dn
= p
.second
;
10081 dn
->state_set(CDentry::STATE_STRAY
);
10082 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
10083 if (dnl
->is_primary()) {
10084 CInode
*in
= dnl
->get_inode();
10085 if (in
->inode
.nlink
== 0)
10086 in
->state_set(CInode::STATE_ORPHAN
);
10087 maybe_eval_stray(in
);
// fetch_backtrace: asynchronously read the "parent" xattr (the backtrace) of the
// object backing `ino` in `pool`, into `bl`; `fin` fires on completion. Also bumps
// the openino backtrace-fetch perf counter.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10093 void MDCache::fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
)
10095 object_t oid
= CInode::get_object_name(ino
, frag_t(), "");
10096 mds
->objecter
->getxattr(oid
, object_locator_t(pool
), "parent", CEPH_NOSNAP
, &bl
, 0, fin
);
10098 mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
10105 // ========================================================================================
10109 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10110 to the parent metadata object in the cache (pinning it).
10112 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
// _send_discover: build an MDiscover from the tracked discover_info_t (ino, frag,
// snap, wanted path, base-dir/path-locked flags), stamp it with the tracking tid,
// and send it to the target MDS.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10116 void MDCache::_send_discover(discover_info_t
& d
)
10118 auto dis
= make_message
<MDiscover
>(d
.ino
, d
.frag
, d
.snap
, d
.want_path
,
10119 d
.want_base_dir
, d
.path_locked
);
10120 dis
->set_tid(d
.tid
);
10121 mds
->send_message_mds(dis
, d
.mds
);
// discover_base_ino: ask MDS `from` for a base inode (root/mdsdir/stray). A new
// discover is created only if no one is already waiting for this ino from that MDS;
// `onfinish` is queued in waiting_for_base_ino either way.
// NOTE(review): extraction-mangled chunk (parameter list partially dropped); code
// kept byte-identical, comments only.
10124 void MDCache::discover_base_ino(inodeno_t want_ino
,
10125 MDSContext
*onfinish
,
10128 dout(7) << "discover_base_ino " << want_ino
<< " from mds." << from
<< dendl
;
10129 if (waiting_for_base_ino
[from
].count(want_ino
) == 0) {
10130 discover_info_t
& d
= _create_discover(from
);
10134 waiting_for_base_ino
[from
][want_ino
].push_back(onfinish
);
// discover_dir_frag: request replication of a dirfrag of `base` from its auth MDS
// (or an explicit `from`). A new discover is issued unless a waiter already exists
// for this frag; `onfinish` is registered as a dir waiter on the base inode.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10138 void MDCache::discover_dir_frag(CInode
*base
,
10140 MDSContext
*onfinish
,
10144 from
= base
->authority().first
;
10146 dirfrag_t
df(base
->ino(), approx_fg
);
10147 dout(7) << "discover_dir_frag " << df
10148 << " from mds." << from
<< dendl
;
// Only start a new discover if nobody is already waiting (or no callback given).
10150 if (!base
->is_waiting_for_dir(approx_fg
) || !onfinish
) {
10151 discover_info_t
& d
= _create_discover(from
);
10153 d
.ino
= base
->ino();
10154 d
.frag
= approx_fg
;
10155 d
.want_base_dir
= true;
10160 base
->add_dir_waiter(approx_fg
, onfinish
);
// C_MDC_RetryDiscoverPath: retry context for the CInode-based discover_path(),
// re-issuing the same (base, snapid, path, from) discover when triggered.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10163 struct C_MDC_RetryDiscoverPath
: public MDCacheContext
{
10168 C_MDC_RetryDiscoverPath(MDCache
*c
, CInode
*b
, snapid_t s
, filepath
&p
, mds_rank_t f
) :
10169 MDCacheContext(c
), base(b
), snapid(s
), path(p
), from(f
) {}
10170 void finish(int r
) override
{
10171 mdcache
->discover_path(base
, snapid
, path
, 0, from
);
// discover_path (CInode overload): discover a path under inode `base` from its auth
// MDS. Waits out ambiguous auth (retrying via C_MDC_RetryDiscoverPath); if we turn
// out to be auth ourselves, just wake dir waiters. Otherwise issues a discover for
// the first path component's dirfrag (unless one is already pending) and registers
// `onfinish` as a dir waiter.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10175 void MDCache::discover_path(CInode
*base
,
10177 filepath want_path
,
10178 MDSContext
*onfinish
,
10183 from
= base
->authority().first
;
10185 dout(7) << "discover_path " << base
->ino() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10186 << (path_locked
? " path_locked":"")
10189 if (base
->is_ambiguous_auth()) {
10190 dout(10) << " waiting for single auth on " << *base
<< dendl
;
10192 onfinish
= new C_MDC_RetryDiscoverPath(this, base
, snap
, want_path
, from
);
10193 base
->add_waiter(CInode::WAIT_SINGLEAUTH
, onfinish
);
10195 } else if (from
== mds
->get_nodeid()) {
// We are the auth: nothing to discover, just requeue dir waiters.
10196 MDSContext::vec finished
;
10197 base
->take_waiting(CInode::WAIT_DIR
, finished
);
10198 mds
->queue_waiters(finished
);
10202 frag_t fg
= base
->pick_dirfrag(want_path
[0]);
// Issue a fresh discover for locked single-component paths, or when no one is
// already waiting on this frag (or no callback was supplied).
10203 if ((path_locked
&& want_path
.depth() == 1) ||
10204 !base
->is_waiting_for_dir(fg
) || !onfinish
) {
10205 discover_info_t
& d
= _create_discover(from
);
10206 d
.ino
= base
->ino();
10210 d
.want_path
= want_path
;
10211 d
.want_base_dir
= true;
10212 d
.path_locked
= path_locked
;
10218 base
->add_dir_waiter(fg
, onfinish
);
// C_MDC_RetryDiscoverPath2: retry context for the CDir-based discover_path(),
// re-issuing the same (base dir, snapid, path) discover when triggered.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10221 struct C_MDC_RetryDiscoverPath2
: public MDCacheContext
{
10225 C_MDC_RetryDiscoverPath2(MDCache
*c
, CDir
*b
, snapid_t s
, filepath
&p
) :
10226 MDCacheContext(c
), base(b
), snapid(s
), path(p
) {}
10227 void finish(int r
) override
{
10228 mdcache
->discover_path(base
, snapid
, path
, 0);
// discover_path (CDir overload): like the CInode overload but anchored at an already
// replicated dirfrag `base`; discovers the first dentry of want_path from the frag's
// auth MDS, with the same ambiguous-auth wait and local-auth shortcuts, and queues
// `onfinish` as a dentry waiter.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10232 void MDCache::discover_path(CDir
*base
,
10234 filepath want_path
,
10235 MDSContext
*onfinish
,
10238 mds_rank_t from
= base
->authority().first
;
10240 dout(7) << "discover_path " << base
->dirfrag() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10241 << (path_locked
? " path_locked":"")
10244 if (base
->is_ambiguous_auth()) {
10245 dout(7) << " waiting for single auth on " << *base
<< dendl
;
10247 onfinish
= new C_MDC_RetryDiscoverPath2(this, base
, snap
, want_path
);
10248 base
->add_waiter(CDir::WAIT_SINGLEAUTH
, onfinish
);
10250 } else if (from
== mds
->get_nodeid()) {
// We are the auth: wake anything waiting under this dirfrag.
10251 MDSContext::vec finished
;
10252 base
->take_sub_waiting(finished
);
10253 mds
->queue_waiters(finished
);
10257 if ((path_locked
&& want_path
.depth() == 1) ||
10258 !base
->is_waiting_for_dentry(want_path
[0].c_str(), snap
) || !onfinish
) {
10259 discover_info_t
& d
= _create_discover(from
);
10260 d
.ino
= base
->ino();
// Pin the base inode so it cannot be trimmed while the discover is in flight.
10261 d
.pin_base(base
->inode
);
10262 d
.frag
= base
->get_frag();
10264 d
.want_path
= want_path
;
10265 d
.want_base_dir
= false;
10266 d
.path_locked
= path_locked
;
10272 base
->add_dentry_waiter(want_path
[0], snap
, onfinish
);
// kick_discovers: re-send every outstanding discover that targets MDS `who`
// (e.g. presumably after that rank restarts/recovers — TODO confirm caller context).
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10275 void MDCache::kick_discovers(mds_rank_t who
)
10277 for (map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.begin();
10278 p
!= discovers
.end();
10280 if (p
->second
.mds
!= who
)
10282 _send_discover(p
->second
);
// handle_discover: auth-side handler for MDiscover. Builds an MDiscoverReply that
// contains a trace of replicas — ([[dir] dentry] inode)* — walking the requested
// path from the base inode. Handles base-inode-only requests, missing base inodes,
// non-auth dirfrags (dir_auth hints), frozen inodes/dirs (wait or stop), incomplete
// dirs (fetch and retry), purging dentries, xlocked dentries (only tail discovery
// when path_locked), and frozen tail inodes. Finally the reply is sent back to the
// requesting MDS.
// NOTE(review): extraction-mangled chunk — many original lines were dropped by the
// extraction (loop increments, braces, whole branches; original numbering jumps).
// Code is kept byte-identical; only comments are added.
10287 void MDCache::handle_discover(const cref_t
<MDiscover
> &dis
)
10289 mds_rank_t whoami
= mds
->get_nodeid();
10290 mds_rank_t from
= mds_rank_t(dis
->get_source().num());
10292 ceph_assert(from
!= whoami
);
10294 if (mds
->get_state() <= MDSMap::STATE_REJOIN
) {
10295 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
10296 mds
->get_want_state() < CEPH_MDS_STATE_REJOIN
) {
10300 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10301 // delay processing request from survivor because we may not yet choose lock states.
10302 if (!mds
->mdsmap
->is_rejoin(from
)) {
10303 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl
;
10304 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, dis
));
10311 auto reply
= make_message
<MDiscoverReply
>(*dis
);
10313 snapid_t snapid
= dis
->get_snapid();
// Case 1: bare base-inode request (root/mdsdir), no dir and no path wanted.
10316 if (MDS_INO_IS_BASE(dis
->get_base_ino()) &&
10317 !dis
->wants_base_dir() && dis
->get_want().depth() == 0) {
10319 dout(7) << "handle_discover from mds." << from
10320 << " wants base + " << dis
->get_want().get_path()
10321 << " snap " << snapid
10324 cur
= get_inode(dis
->get_base_ino());
10328 reply
->starts_with
= MDiscoverReply::INODE
;
10329 encode_replica_inode(cur
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10330 dout(10) << "added base " << *cur
<< dendl
;
10333 // there's a base inode
10334 cur
= get_inode(dis
->get_base_ino(), snapid
);
10335 if (!cur
&& snapid
!= CEPH_NOSNAP
) {
// Fall back to the head inode for snapped lookups on multiversion inodes.
10336 cur
= get_inode(dis
->get_base_ino());
10337 if (cur
&& !cur
->is_multiversion())
10338 cur
= NULL
; // nope!
10342 dout(7) << "handle_discover mds." << from
10343 << " don't have base ino " << dis
->get_base_ino() << "." << snapid
10345 if (!dis
->wants_base_dir() && dis
->get_want().depth() > 0)
10346 reply
->set_error_dentry(dis
->get_dentry(0));
10347 reply
->set_flag_error_dir();
10348 } else if (dis
->wants_base_dir()) {
10349 dout(7) << "handle_discover mds." << from
10350 << " wants basedir+" << dis
->get_want().get_path()
10354 dout(7) << "handle_discover mds." << from
10355 << " wants " << dis
->get_want().get_path()
10361 ceph_assert(reply
);
10364 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10365 for (unsigned i
= 0;
10366 cur
&& (i
< dis
->get_want().depth() || dis
->get_want().depth() == 0);
10369 // -- figure out the dir
10371 // is *cur even a dir at all?
10372 if (!cur
->is_dir()) {
10373 dout(7) << *cur
<< " not a dir" << dendl
;
10374 reply
->set_flag_error_dir();
10380 if (dis
->get_want().depth()) {
10381 // dentry specifies
10382 fg
= cur
->pick_dirfrag(dis
->get_dentry(i
));
10384 // requester explicity specified the frag
10385 ceph_assert(dis
->wants_base_dir() || MDS_INO_IS_BASE(dis
->get_base_ino()));
10386 fg
= dis
->get_base_dir_frag();
10387 if (!cur
->dirfragtree
.is_leaf(fg
))
10388 fg
= cur
->dirfragtree
[fg
.value()];
10390 CDir
*curdir
= cur
->get_dirfrag(fg
);
// Non-auth dirfrag: stop; hint the real auth only on an empty reply.
10392 if ((!curdir
&& !cur
->is_auth()) ||
10393 (curdir
&& !curdir
->is_auth())) {
10396 * ONLY set flag if empty!!
10397 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10398 * resulting in duplicate discovers in flight,
10399 * which can wreak havoc when discovering rename srcdn (which may move)
10402 if (reply
->is_empty()) {
10403 // only hint if empty.
10404 // someday this could be better, but right now the waiter logic isn't smart enough.
10408 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir
<< dendl
;
10409 reply
->set_dir_auth_hint(curdir
->authority().first
);
10411 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10413 reply
->set_dir_auth_hint(cur
->authority().first
);
10416 // note error dentry, if any
10417 // NOTE: important, as it allows requester to issue an equivalent discover
10418 // to whomever we hint at.
10419 if (dis
->get_want().depth() > i
)
10420 reply
->set_error_dentry(dis
->get_dentry(i
));
10426 if (!curdir
) { // open dir?
10427 if (cur
->is_frozen()) {
10428 if (!reply
->is_empty()) {
10429 dout(7) << *cur
<< " is frozen, non-empty reply, stopping" << dendl
;
10432 dout(7) << *cur
<< " is frozen, empty reply, waiting" << dendl
;
10433 cur
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10436 curdir
= cur
->get_or_open_dirfrag(this, fg
);
10437 } else if (curdir
->is_frozen_tree() ||
10438 (curdir
->is_frozen_dir() && fragment_are_all_frozen(curdir
))) {
10439 if (!reply
->is_empty()) {
10440 dout(7) << *curdir
<< " is frozen, non-empty reply, stopping" << dendl
;
10443 if (dis
->wants_base_dir() && dis
->get_base_dir_frag() != curdir
->get_frag()) {
10444 dout(7) << *curdir
<< " is frozen, dirfrag mismatch, stopping" << dendl
;
10445 reply
->set_flag_error_dir();
10448 dout(7) << *curdir
<< " is frozen, empty reply, waiting" << dendl
;
10449 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
// Add the dirfrag replica to the reply (skipped for unwanted base dirs).
10454 if (curdir
->get_version() == 0) {
10455 // fetch newly opened dir
10456 } else if (reply
->is_empty() && !dis
->wants_base_dir()) {
10457 dout(7) << "handle_discover not adding unwanted base dir " << *curdir
<< dendl
;
10458 // make sure the base frag is correct, though, in there was a refragment since the
10459 // original request was sent.
10460 reply
->set_base_dir_frag(curdir
->get_frag());
10462 ceph_assert(!curdir
->is_ambiguous_auth()); // would be frozen.
10463 if (!reply
->trace
.length())
10464 reply
->starts_with
= MDiscoverReply::DIR;
10465 encode_replica_dir(curdir
, from
, reply
->trace
);
10466 dout(7) << "handle_discover added dir " << *curdir
<< dendl
;
// -- look up the wanted dentry within the dirfrag.
10471 if (curdir
->get_version() == 0) {
10472 // fetch newly opened dir
10473 ceph_assert(!curdir
->has_bloom());
10474 } else if (dis
->get_want().depth() > 0) {
10476 dn
= curdir
->lookup(dis
->get_dentry(i
), snapid
);
// Dentry not in cache: fetch unless the bloom filter proves it absent.
10482 if (!curdir
->is_complete() &&
10483 !(snapid
== CEPH_NOSNAP
&&
10484 curdir
->has_bloom() &&
10485 !curdir
->is_in_bloom(dis
->get_dentry(i
)))) {
10487 dout(7) << "incomplete dir contents for " << *curdir
<< ", fetching" << dendl
;
10488 if (reply
->is_empty()) {
10490 curdir
->fetch(new C_MDS_RetryMessage(mds
, dis
),
10491 dis
->wants_base_dir() && curdir
->get_version() == 0);
10494 // initiate fetch, but send what we have so far
10500 if (snapid
!= CEPH_NOSNAP
&& !reply
->is_empty()) {
10501 dout(7) << "dentry " << dis
->get_dentry(i
) << " snap " << snapid
10502 << " dne, non-empty reply, stopping" << dendl
;
10506 // send null dentry
10507 dout(7) << "dentry " << dis
->get_dentry(i
) << " dne, returning null in "
10508 << *curdir
<< dendl
;
10509 if (snapid
== CEPH_NOSNAP
)
10510 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
));
10512 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
), snapid
, snapid
);
10516 // don't add replica to purging dentry/inode
10517 if (dn
->state_test(CDentry::STATE_PURGING
)) {
10518 if (reply
->is_empty())
10519 reply
->set_flag_error_dn(dis
->get_dentry(i
));
10523 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10526 // ...always block on non-tail items (they are unrelated)
10527 // ...allow xlocked tail disocvery _only_ if explicitly requested
10528 if (dn
->lock
.is_xlocked()) {
10529 // is this the last (tail) item in the discover traversal?
10530 if (dis
->is_path_locked()) {
10531 dout(7) << "handle_discover allowing discovery of xlocked " << *dn
<< dendl
;
10532 } else if (reply
->is_empty()) {
10533 dout(7) << "handle_discover blocking on xlocked " << *dn
<< dendl
;
10534 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryMessage(mds
, dis
));
10537 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn
<< dendl
;
10543 bool tailitem
= (dis
->get_want().depth() == 0) || (i
== dis
->get_want().depth() - 1);
10544 if (dnl
->is_primary() && dnl
->get_inode()->is_frozen_inode()) {
10545 if (tailitem
&& dis
->is_path_locked()) {
10546 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
10547 } else if (reply
->is_empty()) {
10548 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
10549 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10552 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
// Add the dentry replica to the trace.
10558 if (!reply
->trace
.length())
10559 reply
->starts_with
= MDiscoverReply::DENTRY
;
10560 encode_replica_dentry(dn
, from
, reply
->trace
);
10561 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
10563 if (!dnl
->is_primary()) break; // stop on null or remote link.
// Add the linked inode replica and continue down the path.
10566 CInode
*next
= dnl
->get_inode();
10567 ceph_assert(next
->is_auth());
10569 encode_replica_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10570 dout(7) << "handle_discover added inode " << *next
<< dendl
;
10572 // descend, keep going.
10578 ceph_assert(!reply
->is_empty());
10579 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
10580 mds
->send_message(reply
, dis
->get_connection());
// handle_discover_reply: requester-side handler for MDiscoverReply. Retires the
// tracked discover tid (dup replies are ignored), decodes the replica trace —
// optional base inode, then repeating ([dir] dentry inode) — waking base-ino
// waiters as it goes, and finally dispatches on the reply's error/hint flags:
// re-issue discovers toward a dir_auth hint, requeue dir/dentry waiters, or fail
// waiters with -ENOENT for flagged error dentries.
// NOTE(review): extraction-mangled chunk — many original lines dropped (loop
// header around 10640, several branch bodies). Code kept byte-identical.
10583 void MDCache::handle_discover_reply(const cref_t
<MDiscoverReply
> &m
)
10586 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10587 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10591 dout(7) << "discover_reply " << *m
<< dendl
;
10592 if (m
->is_flag_error_dir())
10593 dout(7) << " flag error, dir" << dendl
;
10594 if (m
->is_flag_error_dn())
10595 dout(7) << " flag error, dentry = " << m
->get_error_dentry() << dendl
;
10597 MDSContext::vec finished
, error
;
10598 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10601 CInode
*cur
= get_inode(m
->get_base_ino());
10602 auto p
= m
->trace
.cbegin();
10604 int next
= m
->starts_with
;
10606 // decrement discover counters
10607 if (m
->get_tid()) {
10608 map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.find(m
->get_tid());
10609 if (p
!= discovers
.end()) {
10610 dout(10) << " found tid " << m
->get_tid() << dendl
;
10611 discovers
.erase(p
);
10613 dout(10) << " tid " << m
->get_tid() << " not found, must be dup reply" << dendl
;
10617 // discover may start with an inode
10618 if (!p
.end() && next
== MDiscoverReply::INODE
) {
10619 decode_replica_inode(cur
, p
, NULL
, finished
);
10620 dout(7) << "discover_reply got base inode " << *cur
<< dendl
;
10621 ceph_assert(cur
->is_base());
10623 next
= MDiscoverReply::DIR;
// Wake anyone waiting for this base inode from that MDS.
10626 if (cur
->is_base() &&
10627 waiting_for_base_ino
[from
].count(cur
->ino())) {
10628 finished
.swap(waiting_for_base_ino
[from
][cur
->ino()]);
10629 waiting_for_base_ino
[from
].erase(cur
->ino());
10634 // loop over discover results.
10635 // indexes follow each ([[dir] dentry] inode)
10636 // can start, end with any type.
10640 CDir
*curdir
= nullptr;
10641 if (next
== MDiscoverReply::DIR) {
10642 decode_replica_dir(curdir
, p
, cur
, mds_rank_t(m
->get_source().num()), finished
);
10643 if (cur
->ino() == m
->get_base_ino() && curdir
->get_frag() != m
->get_base_dir_frag()) {
// The auth refragmented since we asked; move waiters to the new frag.
10644 ceph_assert(m
->get_wanted_base_dir());
10645 cur
->take_dir_waiting(m
->get_base_dir_frag(), finished
);
10648 // note: this can only happen our first way around this loop.
10649 if (p
.end() && m
->is_flag_error_dn()) {
10650 fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10651 curdir
= cur
->get_dirfrag(fg
);
10653 curdir
= cur
->get_dirfrag(m
->get_base_dir_frag());
// Decode the next dentry + inode pair of the trace.
10660 CDentry
*dn
= nullptr;
10661 decode_replica_dentry(dn
, p
, curdir
, finished
);
10667 decode_replica_inode(cur
, p
, dn
, finished
);
10669 next
= MDiscoverReply::DIR;
10673 // or dir_auth hint?
10674 if (m
->is_flag_error_dir() && !cur
->is_dir()) {
// Wanted a dir under something that is not a directory: fail dir waiters.
10676 cur
->take_waiting(CInode::WAIT_DIR
, error
);
10677 } else if (m
->is_flag_error_dir() || m
->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN
) {
10678 mds_rank_t who
= m
->get_dir_auth_hint();
10679 if (who
== mds
->get_nodeid()) who
= -1;
10681 dout(7) << " dir_auth_hint is " << m
->get_dir_auth_hint() << dendl
;
10684 if (m
->get_wanted_base_dir()) {
10685 frag_t fg
= m
->get_base_dir_frag();
10686 CDir
*dir
= cur
->get_dirfrag(fg
);
10688 if (cur
->is_waiting_for_dir(fg
)) {
10689 if (cur
->is_auth())
10690 cur
->take_waiting(CInode::WAIT_DIR
, finished
);
10691 else if (dir
|| !cur
->dirfragtree
.is_leaf(fg
))
10692 cur
->take_dir_waiting(fg
, finished
);
10694 discover_dir_frag(cur
, fg
, 0, who
);
10696 dout(7) << " doing nothing, nobody is waiting for dir" << dendl
;
10700 if (m
->get_error_dentry().length()) {
10701 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10702 CDir
*dir
= cur
->get_dirfrag(fg
);
10704 if (dir
&& dir
->is_waiting_for_dentry(m
->get_error_dentry(), m
->get_wanted_snapid())) {
10705 if (dir
->is_auth() || dir
->lookup(m
->get_error_dentry())) {
10706 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10707 m
->get_wanted_snapid(), finished
);
10709 filepath
relpath(m
->get_error_dentry(), 0);
10710 discover_path(dir
, m
->get_wanted_snapid(), relpath
, 0, m
->is_path_locked());
10713 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10714 << m
->get_error_dentry() << dendl
;
10716 } else if (m
->is_flag_error_dn()) {
10717 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10718 CDir
*dir
= cur
->get_dirfrag(fg
);
10720 if (dir
->is_auth()) {
10721 dir
->take_sub_waiting(finished
);
10723 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10724 m
->get_wanted_snapid(), error
);
10730 finish_contexts(g_ceph_context
, error
, -ENOENT
); // finish errors directly
10731 mds
->queue_waiters(finished
);
10736 // ----------------------------
// encode_replica_dir: register MDS `to` as a replica of `dir` (obtaining a nonce)
// and append the dirfrag's base state to `bl` (versioned via ENCODE_START).
// NOTE(review): extraction-mangled chunk (some encode lines dropped); code kept
// byte-identical, comments only.
10740 void MDCache::encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
)
10742 ENCODE_START(1, 1, bl
);
10743 dirfrag_t df
= dir
->dirfrag();
10745 __u32 nonce
= dir
->add_replica(to
);
10747 dir
->_encode_base(bl
);
// encode_replica_dentry: register `to` as a replica of dentry `dn` and append its
// name, [first,last] snap range, remote linkage (ino + d_type), lock replica state,
// and a need_recover flag (set while this MDS is not yet ACTIVE) to `bl`.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10751 void MDCache::encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
)
10753 ENCODE_START(1, 1, bl
);
10754 encode(dn
->get_name(), bl
);
10755 encode(dn
->last
, bl
);
10757 __u32 nonce
= dn
->add_replica(to
);
10759 encode(dn
->first
, bl
);
10760 encode(dn
->linkage
.remote_ino
, bl
);
10761 encode(dn
->linkage
.remote_d_type
, bl
);
10762 dn
->lock
.encode_state_for_replica(bl
);
10763 bool need_recover
= mds
->get_state() < MDSMap::STATE_ACTIVE
;
10764 encode(need_recover
, bl
);
// encode_replica_inode: register `to` as a replica of auth inode `in` and append
// its ino, last snapid, base state, replica lock state, and (v2) state bits to bl.
// NOTE(review): extraction-mangled chunk (trailing encode of `state` and
// ENCODE_FINISH dropped by extraction); code kept byte-identical, comments only.
10768 void MDCache::encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
10771 ENCODE_START(2, 1, bl
);
10772 ceph_assert(in
->is_auth());
10773 encode(in
->inode
.ino
, bl
); // bleh, minor assymetry here
10774 encode(in
->last
, bl
);
10776 __u32 nonce
= in
->add_replica(to
);
10779 in
->_encode_base(bl
, features
);
10780 in
->_encode_locks_state_for_replica(bl
, mds
->get_state() < MDSMap::STATE_ACTIVE
);
10782 __u32 state
= in
->state
;
// decode_replica_dir: decode a replica dirfrag under inode `diri` from iterator p.
// If the frag already exists locally it is updated with the new nonce/base state;
// otherwise the frag is forced to a leaf in diri's fragtree if needed, a new
// replica CDir is added, and subtree auth is adjusted when this frag is a dir_auth
// delegation boundary. Dir waiters for the frag are collected into `finished`.
// NOTE(review): extraction-mangled chunk (decode of df/nonce and some braces
// dropped); code kept byte-identical, comments only.
10788 void MDCache::decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
,
10789 MDSContext::vec
& finished
)
10791 DECODE_START(1, p
);
10795 ceph_assert(diri
->ino() == df
.ino
);
10797 // add it (_replica_)
10798 dir
= diri
->get_dirfrag(df
.frag
);
10801 // had replica. update w/ new nonce.
10804 dir
->set_replica_nonce(nonce
);
10805 dir
->_decode_base(p
);
10806 dout(7) << __func__
<< " had " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10808 // force frag to leaf in the diri tree
10809 if (!diri
->dirfragtree
.is_leaf(df
.frag
)) {
10810 dout(7) << __func__
<< " forcing frag " << df
.frag
<< " to leaf in the fragtree "
10811 << diri
->dirfragtree
<< dendl
;
10812 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, df
.frag
);
10815 dir
= diri
->add_dirfrag( new CDir(diri
, df
.frag
, this, false) );
10818 dir
->set_replica_nonce(nonce
);
10819 dir
->_decode_base(p
);
10820 // is this a dir_auth delegation boundary?
10821 if (from
!= diri
->authority().first
||
10822 diri
->is_ambiguous_auth() ||
10824 adjust_subtree_auth(dir
, from
);
10826 dout(7) << __func__
<< " added " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
// Wake anyone waiting for this dirfrag to appear.
10828 diri
->take_dir_waiting(df
.frag
, finished
);
// decode_replica_dentry: decode a replica dentry into `dir` from iterator p.
// Looks up (name, last); reuses the existing dentry or adds a null one, applies
// nonce/first/lock state, links a remote inode when the encoded remote_ino is set,
// marks the lock for recovery when the auth flagged need_recover, and collects
// dentry waiters into `finished`.
// NOTE(review): extraction-mangled chunk (decode of name/last/nonce/rino and some
// branches dropped); code kept byte-identical, comments only.
10833 void MDCache::decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
)
10835 DECODE_START(1, p
);
10841 dn
= dir
->lookup(name
, last
);
10844 bool is_new
= false;
10847 dout(7) << __func__
<< " had " << *dn
<< dendl
;
10850 dn
= dir
->add_null_dentry(name
, 1 /* this will get updated below */, last
);
10851 dout(7) << __func__
<< " added " << *dn
<< dendl
;
10856 dn
->set_replica_nonce(nonce
);
10857 decode(dn
->first
, p
);
10860 unsigned char rdtype
;
10863 dn
->lock
.decode_state(p
, is_new
);
10866 decode(need_recover
, p
);
10870 dir
->link_remote_inode(dn
, rino
, rdtype
);
10872 dn
->lock
.mark_need_recover();
10875 dir
->take_dentry_waiting(name
, dn
->first
, dn
->last
, finished
);
// decode_replica_inode: decode a replica inode from iterator p; creates a new
// replica CInode (fixing inode_auth for root/mdsdir and linking it as the primary
// of `dn` when given) or updates the existing one. v2 additionally decodes state
// bits, masked to MASK_STATE_REPLICATED, honoring the random-ephemeral-pin flag.
// NOTE(review): extraction-mangled chunk (decode of ino/last/nonce/s and braces
// dropped); code kept byte-identical, comments only.
10879 void MDCache::decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
)
10881 DECODE_START(2, p
);
10888 in
= get_inode(ino
, last
);
10890 in
= new CInode(this, false, 1, last
);
10891 in
->set_replica_nonce(nonce
);
10892 in
->_decode_base(p
);
10893 in
->_decode_locks_state_for_replica(p
, true);
// Base inodes have well-known auth ranks.
10895 if (in
->ino() == MDS_INO_ROOT
)
10896 in
->inode_auth
.first
= 0;
10897 else if (in
->is_mdsdir())
10898 in
->inode_auth
.first
= in
->ino() - MDS_INO_MDSDIR_OFFSET
;
10899 dout(10) << __func__
<< " added " << *in
<< dendl
;
10901 ceph_assert(dn
->get_linkage()->is_null());
10902 dn
->dir
->link_primary_inode(dn
, in
);
// Existing replica: refresh nonce, base, and lock state.
10905 in
->set_replica_nonce(nonce
);
10906 in
->_decode_base(p
);
10907 in
->_decode_locks_state_for_replica(p
, false);
10908 dout(10) << __func__
<< " had " << *in
<< dendl
;
10912 if (!dn
->get_linkage()->is_primary() || dn
->get_linkage()->get_inode() != in
)
10913 dout(10) << __func__
<< " different linkage in dentry " << *dn
<< dendl
;
10916 if (struct_v
>= 2) {
10919 s
&= CInode::MASK_STATE_REPLICATED
;
10920 if (s
& CInode::STATE_RANDEPHEMERALPIN
) {
10921 dout(10) << "replica inode is random ephemeral pinned" << dendl
;
10922 in
->set_ephemeral_rand(true);
// encode_replica_stray: bundle everything a peer needs to replicate a stray dentry:
// our mdsdir inode, the stray dir's parent dir + dentry + inode, the stray dirfrag
// itself, and finally the stray dentry. Decoded by decode_replica_stray in the same
// order.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10930 void MDCache::encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
)
10932 ENCODE_START(1, 1, bl
);
10933 uint64_t features
= mds
->mdsmap
->get_up_features();
10934 encode_replica_inode(get_myin(), who
, bl
, features
);
10935 encode_replica_dir(straydn
->get_dir()->inode
->get_parent_dn()->get_dir(), who
, bl
);
10936 encode_replica_dentry(straydn
->get_dir()->inode
->get_parent_dn(), who
, bl
);
10937 encode_replica_inode(straydn
->get_dir()->inode
, who
, bl
, features
);
10938 encode_replica_dir(straydn
->get_dir(), who
, bl
);
10939 encode_replica_dentry(straydn
, who
, bl
);
// decode_replica_stray: inverse of encode_replica_stray — decode, in order, the
// mdsdir inode, mdsdir frag, stray-dir dentry, stray-dir inode, stray dirfrag, and
// the stray dentry itself, then queue any waiters collected along the way.
// NOTE(review): extraction-mangled chunk; code kept byte-identical, comments only.
10943 void MDCache::decode_replica_stray(CDentry
*&straydn
, const bufferlist
&bl
, mds_rank_t from
)
10945 MDSContext::vec finished
;
10946 auto p
= bl
.cbegin();
10948 DECODE_START(1, p
);
10949 CInode
*mdsin
= nullptr;
10950 decode_replica_inode(mdsin
, p
, NULL
, finished
);
10951 CDir
*mdsdir
= nullptr;
10952 decode_replica_dir(mdsdir
, p
, mdsin
, from
, finished
);
10953 CDentry
*straydirdn
= nullptr;
10954 decode_replica_dentry(straydirdn
, p
, mdsdir
, finished
);
10955 CInode
*strayin
= nullptr;
10956 decode_replica_inode(strayin
, p
, straydirdn
, finished
);
10957 CDir
*straydir
= nullptr;
10958 decode_replica_dir(straydir
, p
, strayin
, from
, finished
);
10960 decode_replica_dentry(straydn
, p
, straydir
, finished
);
10961 if (!finished
.empty())
10962 mds
->queue_waiters(finished
);
// send_dir_updates: advisory broadcast of `dir`'s replication info (dir_rep and
// dir_rep_by) via MDirUpdate to all active MDSs plus current replicas of the frag,
// skipping ourselves. `bcast` is forwarded in the message (used by the receiver —
// presumably to trigger discovery; see handle_dir_update).
// NOTE(review): extraction-mangled chunk (path declaration and loop bodies partly
// dropped); code kept byte-identical, comments only.
10967 int MDCache::send_dir_updates(CDir
*dir
, bool bcast
)
10969 // this is an FYI, re: replication
10971 set
<mds_rank_t
> who
;
10973 mds
->get_mds_map()->get_active_mds_set(who
);
10975 for (const auto &p
: dir
->get_replicas()) {
10976 who
.insert(p
.first
);
10980 dout(7) << "sending dir_update on " << *dir
<< " bcast " << bcast
<< " to " << who
<< dendl
;
10983 dir
->inode
->make_path(path
);
10985 mds_rank_t whoami
= mds
->get_nodeid();
10986 for (set
<mds_rank_t
>::iterator it
= who
.begin();
10989 if (*it
== whoami
) continue;
10990 //if (*it == except) continue;
10991 dout(7) << "sending dir_update on " << *dir
<< " to " << *it
<< dendl
;
10993 std::set
<int32_t> s
;
10994 for (const auto &r
: dir
->dir_rep_by
) {
10997 mds
->send_message_mds(make_message
<MDirUpdate
>(mds
->get_nodeid(), dir
->dirfrag(), dir
->dir_rep
, s
, path
, bcast
), *it
);
// handle_dir_update: receiver side of send_dir_updates. If we do not have the
// dirfrag and the message asks for discovery, traverse toward it (retrying the
// message after a discover / open_remote_dirfrag). If we already have the frag and
// have not gone through discovery, overwrite its dir_rep and dir_rep_by from the
// message.
// NOTE(review): extraction-mangled chunk (early returns/braces dropped, `in`
// declaration missing); code kept byte-identical, comments only.
11003 void MDCache::handle_dir_update(const cref_t
<MDirUpdate
> &m
)
11005 dirfrag_t df
= m
->get_dirfrag();
11006 CDir
*dir
= get_dirfrag(df
);
11008 dout(5) << "dir_update on " << df
<< ", don't have it" << dendl
;
11011 if (m
->should_discover()) {
11013 // this is key to avoid a fragtree update race, among other things.
11014 m
->inc_tried_discover();
11015 vector
<CDentry
*> trace
;
11017 filepath path
= m
->get_path();
11018 dout(5) << "trying discover on dir_update for " << path
<< dendl
;
11019 CF_MDS_RetryMessageFactory
cf(mds
, m
);
11020 MDRequestRef null_ref
;
11021 int r
= path_traverse(null_ref
, cf
, path
, MDS_TRAVERSE_DISCOVER
, &trace
, &in
);
11025 in
->ino() == df
.ino
&&
11026 in
->get_approx_dirfrag(df
.frag
) == NULL
) {
11027 open_remote_dirfrag(in
, df
.frag
, new C_MDS_RetryMessage(mds
, m
));
11035 if (!m
->has_tried_discover()) {
11036 // Update if it already exists. Othwerwise it got updated by discover reply.
11037 dout(5) << "dir_update on " << *dir
<< dendl
;
11038 dir
->dir_rep
= m
->get_dir_rep();
11039 dir
->dir_rep_by
.clear();
11040 for (const auto &e
: m
->get_dir_rep_by()) {
11041 dir
->dir_rep_by
.insert(e
);
11052 void MDCache::encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
)
11054 ENCODE_START(1, 1, bl
);
11055 inodeno_t ino
= dnl
->get_remote_ino();
11057 __u8 d_type
= dnl
->get_remote_d_type();
11058 encode(d_type
, bl
);
11062 void MDCache::decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
)
11064 DECODE_START(1, p
);
11069 dout(10) << __func__
<< " remote " << ino
<< " " << d_type
<< dendl
;
11070 dir
->link_remote_inode(dn
, ino
, d_type
);
11074 void MDCache::send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
)
11076 dout(7) << __func__
<< " " << *dn
<< dendl
;
11078 CDir
*subtree
= get_subtree_root(dn
->get_dir());
11079 for (const auto &p
: dn
->get_replicas()) {
11080 // don't tell (rename) witnesses; they already know
11081 if (mdr
.get() && mdr
->more()->witnessed
.count(p
.first
))
11083 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
11084 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
11085 rejoin_gather
.count(p
.first
)))
11087 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11088 auto m
= make_message
<MDentryLink
>(subtree
->dirfrag(), dn
->get_dir()->dirfrag(), dn
->get_name(), dnl
->is_primary());
11089 if (dnl
->is_primary()) {
11090 dout(10) << __func__
<< " primary " << *dnl
->get_inode() << dendl
;
11091 encode_replica_inode(dnl
->get_inode(), p
.first
, m
->bl
,
11092 mds
->mdsmap
->get_up_features());
11093 } else if (dnl
->is_remote()) {
11094 encode_remote_dentry_link(dnl
, m
->bl
);
11096 ceph_abort(); // aie, bad caller!
11097 mds
->send_message_mds(m
, p
.first
);
11101 void MDCache::handle_dentry_link(const cref_t
<MDentryLink
> &m
)
11103 CDentry
*dn
= NULL
;
11104 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
11106 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
11108 dn
= dir
->lookup(m
->get_dn());
11110 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
11112 dout(7) << __func__
<< " on " << *dn
<< dendl
;
11113 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11115 ceph_assert(!dn
->is_auth());
11116 ceph_assert(dnl
->is_null());
11120 auto p
= m
->bl
.cbegin();
11121 MDSContext::vec finished
;
11123 if (m
->get_is_primary()) {
11125 CInode
*in
= nullptr;
11126 decode_replica_inode(in
, p
, dn
, finished
);
11128 // remote link, easy enough.
11129 decode_remote_dentry_link(dir
, dn
, p
);
11135 if (!finished
.empty())
11136 mds
->queue_waiters(finished
);
11144 void MDCache::send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
)
11146 dout(10) << __func__
<< " " << *dn
<< dendl
;
11147 // share unlink news with replicas
11148 set
<mds_rank_t
> replicas
;
11149 dn
->list_replicas(replicas
);
11152 straydn
->list_replicas(replicas
);
11153 CInode
*strayin
= straydn
->get_linkage()->get_inode();
11154 strayin
->encode_snap_blob(snapbl
);
11156 for (set
<mds_rank_t
>::iterator it
= replicas
.begin();
11157 it
!= replicas
.end();
11159 // don't tell (rmdir) witnesses; they already know
11160 if (mdr
.get() && mdr
->more()->witnessed
.count(*it
))
11163 if (mds
->mdsmap
->get_state(*it
) < MDSMap::STATE_REJOIN
||
11164 (mds
->mdsmap
->get_state(*it
) == MDSMap::STATE_REJOIN
&&
11165 rejoin_gather
.count(*it
)))
11168 auto unlink
= make_message
<MDentryUnlink
>(dn
->get_dir()->dirfrag(), dn
->get_name());
11170 encode_replica_stray(straydn
, *it
, unlink
->straybl
);
11171 unlink
->snapbl
= snapbl
;
11173 mds
->send_message_mds(unlink
, *it
);
11177 void MDCache::handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
)
11180 CDentry
*straydn
= nullptr;
11181 if (m
->straybl
.length())
11182 decode_replica_stray(straydn
, m
->straybl
, mds_rank_t(m
->get_source().num()));
11184 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
11186 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
11188 CDentry
*dn
= dir
->lookup(m
->get_dn());
11190 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
11192 dout(7) << __func__
<< " on " << *dn
<< dendl
;
11193 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11196 if (dnl
->is_primary()) {
11197 CInode
*in
= dnl
->get_inode();
11198 dn
->dir
->unlink_inode(dn
);
11199 ceph_assert(straydn
);
11200 straydn
->dir
->link_primary_inode(straydn
, in
);
11202 // in->first is lazily updated on replica; drag it forward so
11203 // that we always keep it in sync with the dnq
11204 ceph_assert(straydn
->first
>= in
->first
);
11205 in
->first
= straydn
->first
;
11207 // update subtree map?
11209 adjust_subtree_after_rename(in
, dir
, false);
11211 if (m
->snapbl
.length()) {
11212 bool hadrealm
= (in
->snaprealm
? true : false);
11213 in
->decode_snap_blob(m
->snapbl
);
11214 ceph_assert(in
->snaprealm
);
11215 ceph_assert(in
->snaprealm
->have_past_parents_open());
11217 do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
11220 // send caps to auth (if we're not already)
11221 if (in
->is_any_caps() &&
11222 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
11223 migrator
->export_caps(in
);
11227 ceph_assert(!straydn
);
11228 ceph_assert(dnl
->is_remote());
11229 dn
->dir
->unlink_inode(dn
);
11231 ceph_assert(dnl
->is_null());
11235 // race with trim_dentry()
11237 ceph_assert(straydn
->get_num_ref() == 0);
11238 ceph_assert(straydn
->get_linkage()->is_null());
11240 trim_dentry(straydn
, ex
);
11241 send_expire_messages(ex
);
11250 // ===================================================================
11254 // ===================================================================
11259 * adjust_dir_fragments -- adjust fragmentation for a directory
11261 * @param diri directory inode
11262 * @param basefrag base fragment
11263 * @param bits bit adjustment. positive for split, negative for merge.
11265 void MDCache::adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
11266 std::vector
<CDir
*>* resultfrags
,
11267 MDSContext::vec
& waiters
,
11270 dout(10) << "adjust_dir_fragments " << basefrag
<< " " << bits
11271 << " on " << *diri
<< dendl
;
11273 auto&& p
= diri
->get_dirfrags_under(basefrag
);
11275 adjust_dir_fragments(diri
, p
.second
, basefrag
, bits
, resultfrags
, waiters
, replay
);
11278 CDir
*MDCache::force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
)
11280 CDir
*dir
= diri
->get_dirfrag(fg
);
11284 dout(10) << "force_dir_fragment " << fg
<< " on " << *diri
<< dendl
;
11286 std::vector
<CDir
*> src
, result
;
11287 MDSContext::vec waiters
;
11290 frag_t parent
= diri
->dirfragtree
.get_branch_or_leaf(fg
);
11292 CDir
*pdir
= diri
->get_dirfrag(parent
);
11294 int split
= fg
.bits() - parent
.bits();
11295 dout(10) << " splitting parent by " << split
<< " " << *pdir
<< dendl
;
11296 src
.push_back(pdir
);
11297 adjust_dir_fragments(diri
, src
, parent
, split
, &result
, waiters
, replay
);
11298 dir
= diri
->get_dirfrag(fg
);
11300 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11304 if (parent
== frag_t())
11306 frag_t last
= parent
;
11307 parent
= parent
.parent();
11308 dout(10) << " " << last
<< " parent is " << parent
<< dendl
;
11312 // hoover up things under fg?
11314 auto&& p
= diri
->get_dirfrags_under(fg
);
11315 src
.insert(std::end(src
), std::cbegin(p
.second
), std::cend(p
.second
));
11318 dout(10) << "force_dir_fragment no frags under " << fg
<< dendl
;
11320 dout(10) << " will combine frags under " << fg
<< ": " << src
<< dendl
;
11321 adjust_dir_fragments(diri
, src
, fg
, 0, &result
, waiters
, replay
);
11322 dir
= result
.front();
11323 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11327 mds
->queue_waiters(waiters
);
11331 void MDCache::adjust_dir_fragments(CInode
*diri
,
11332 const std::vector
<CDir
*>& srcfrags
,
11333 frag_t basefrag
, int bits
,
11334 std::vector
<CDir
*>* resultfrags
,
11335 MDSContext::vec
& waiters
,
11338 dout(10) << "adjust_dir_fragments " << basefrag
<< " bits " << bits
11339 << " srcfrags " << srcfrags
11340 << " on " << *diri
<< dendl
;
11343 // yuck. we may have discovered the inode while it was being fragmented.
11344 if (!diri
->dirfragtree
.is_leaf(basefrag
))
11345 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, basefrag
);
11348 diri
->dirfragtree
.split(basefrag
, bits
);
11349 dout(10) << " new fragtree is " << diri
->dirfragtree
<< dendl
;
11351 if (srcfrags
.empty())
11355 CDir
*parent_dir
= diri
->get_parent_dir();
11356 CDir
*parent_subtree
= 0;
11358 parent_subtree
= get_subtree_root(parent_dir
);
11360 ceph_assert(srcfrags
.size() >= 1);
11363 ceph_assert(srcfrags
.size() == 1);
11364 CDir
*dir
= srcfrags
.front();
11366 dir
->split(bits
, resultfrags
, waiters
, replay
);
11368 // did i change the subtree map?
11369 if (dir
->is_subtree_root()) {
11370 // new frags are now separate subtrees
11371 for (const auto& dir
: *resultfrags
) {
11372 subtrees
[dir
].clear(); // new frag is now its own subtree
11376 if (parent_subtree
) {
11377 ceph_assert(subtrees
[parent_subtree
].count(dir
));
11378 subtrees
[parent_subtree
].erase(dir
);
11379 for (const auto& dir
: *resultfrags
) {
11380 ceph_assert(dir
->is_subtree_root());
11381 subtrees
[parent_subtree
].insert(dir
);
11385 // adjust my bounds.
11387 bounds
.swap(subtrees
[dir
]);
11388 subtrees
.erase(dir
);
11389 for (set
<CDir
*>::iterator p
= bounds
.begin();
11392 CDir
*frag
= get_subtree_root((*p
)->get_parent_dir());
11393 subtrees
[frag
].insert(*p
);
11399 diri
->close_dirfrag(dir
->get_frag());
11404 // are my constituent bits subtrees? if so, i will be too.
11405 // (it's all or none, actually.)
11406 bool any_subtree
= false, any_non_subtree
= false;
11407 for (const auto& dir
: srcfrags
) {
11408 if (dir
->is_subtree_root())
11409 any_subtree
= true;
11411 any_non_subtree
= true;
11413 ceph_assert(!any_subtree
|| !any_non_subtree
);
11415 set
<CDir
*> new_bounds
;
11417 for (const auto& dir
: srcfrags
) {
11418 // this simplifies the code that find subtrees underneath the dirfrag
11419 if (!dir
->is_subtree_root()) {
11420 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11421 adjust_subtree_auth(dir
, mds
->get_nodeid());
11425 for (const auto& dir
: srcfrags
) {
11426 ceph_assert(dir
->is_subtree_root());
11427 dout(10) << " taking srcfrag subtree bounds from " << *dir
<< dendl
;
11428 map
<CDir
*, set
<CDir
*> >::iterator q
= subtrees
.find(dir
);
11429 set
<CDir
*>::iterator r
= q
->second
.begin();
11430 while (r
!= subtrees
[dir
].end()) {
11431 new_bounds
.insert(*r
);
11432 subtrees
[dir
].erase(r
++);
11436 // remove myself as my parent's bound
11437 if (parent_subtree
)
11438 subtrees
[parent_subtree
].erase(dir
);
11443 CDir
*f
= new CDir(diri
, basefrag
, this, srcfrags
.front()->is_auth());
11444 f
->merge(srcfrags
, waiters
, replay
);
11447 ceph_assert(f
->is_subtree_root());
11448 subtrees
[f
].swap(new_bounds
);
11449 if (parent_subtree
)
11450 subtrees
[parent_subtree
].insert(f
);
11455 resultfrags
->push_back(f
);
11460 class C_MDC_FragmentFrozen
: public MDSInternalContext
{
11464 C_MDC_FragmentFrozen(MDCache
*m
, MDRequestRef
& r
) :
11465 MDSInternalContext(m
->mds
), mdcache(m
), mdr(r
) {}
11466 void finish(int r
) override
{
11467 mdcache
->fragment_frozen(mdr
, r
);
11471 bool MDCache::can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
)
11473 if (is_readonly()) {
11474 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl
;
11477 if (mds
->is_cluster_degraded()) {
11478 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl
;
11481 if (diri
->get_parent_dir() &&
11482 diri
->get_parent_dir()->get_inode()->is_stray()) {
11483 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl
;
11486 if (diri
->is_mdsdir() || diri
->is_stray() || diri
->ino() == MDS_INO_CEPH
) {
11487 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl
;
11491 if (diri
->scrub_is_in_progress()) {
11492 dout(7) << "can_fragment: scrub in progress" << dendl
;
11496 for (const auto& dir
: dirs
) {
11497 if (dir
->state_test(CDir::STATE_FRAGMENTING
)) {
11498 dout(7) << "can_fragment: already fragmenting " << *dir
<< dendl
;
11501 if (!dir
->is_auth()) {
11502 dout(7) << "can_fragment: not auth on " << *dir
<< dendl
;
11505 if (dir
->is_bad()) {
11506 dout(7) << "can_fragment: bad dirfrag " << *dir
<< dendl
;
11509 if (dir
->is_frozen() ||
11510 dir
->is_freezing()) {
11511 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl
;
11519 void MDCache::split_dir(CDir
*dir
, int bits
)
11521 dout(7) << __func__
<< " " << *dir
<< " bits " << bits
<< dendl
;
11522 ceph_assert(dir
->is_auth());
11523 CInode
*diri
= dir
->inode
;
11525 std::vector
<CDir
*> dirs
;
11526 dirs
.push_back(dir
);
11528 if (!can_fragment(diri
, dirs
)) {
11529 dout(7) << __func__
<< " cannot fragment right now, dropping" << dendl
;
11533 if (dir
->frag
.bits() + bits
> 24) {
11534 dout(7) << __func__
<< " frag bits > 24, dropping" << dendl
;
11538 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11539 mdr
->more()->fragment_base
= dir
->dirfrag();
11541 ceph_assert(fragments
.count(dir
->dirfrag()) == 0);
11542 fragment_info_t
& info
= fragments
[dir
->dirfrag()];
11544 info
.dirs
.push_back(dir
);
11546 info
.last_cum_auth_pins_change
= ceph_clock_now();
11548 fragment_freeze_dirs(dirs
);
11549 // initial mark+complete pass
11550 fragment_mark_and_complete(mdr
);
11553 void MDCache::merge_dir(CInode
*diri
, frag_t frag
)
11555 dout(7) << "merge_dir to " << frag
<< " on " << *diri
<< dendl
;
11557 auto&& [all
, dirs
] = diri
->get_dirfrags_under(frag
);
11559 dout(7) << "don't have all frags under " << frag
<< " for " << *diri
<< dendl
;
11563 if (diri
->dirfragtree
.is_leaf(frag
)) {
11564 dout(10) << " " << frag
<< " already a leaf for " << *diri
<< dendl
;
11568 if (!can_fragment(diri
, dirs
))
11571 CDir
*first
= dirs
.front();
11572 int bits
= first
->get_frag().bits() - frag
.bits();
11573 dout(10) << " we are merging by " << bits
<< " bits" << dendl
;
11575 dirfrag_t
basedirfrag(diri
->ino(), frag
);
11576 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11577 mdr
->more()->fragment_base
= basedirfrag
;
11579 ceph_assert(fragments
.count(basedirfrag
) == 0);
11580 fragment_info_t
& info
= fragments
[basedirfrag
];
11584 info
.last_cum_auth_pins_change
= ceph_clock_now();
11586 fragment_freeze_dirs(dirs
);
11587 // initial mark+complete pass
11588 fragment_mark_and_complete(mdr
);
11591 void MDCache::fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
)
11593 bool any_subtree
= false, any_non_subtree
= false;
11594 for (const auto& dir
: dirs
) {
11595 dir
->auth_pin(dir
); // until we mark and complete them
11596 dir
->state_set(CDir::STATE_FRAGMENTING
);
11598 ceph_assert(dir
->is_freezing_dir());
11600 if (dir
->is_subtree_root())
11601 any_subtree
= true;
11603 any_non_subtree
= true;
11606 if (any_subtree
&& any_non_subtree
) {
11607 // either all dirfrags are subtree roots or all are not.
11608 for (const auto& dir
: dirs
) {
11609 if (dir
->is_subtree_root()) {
11610 ceph_assert(dir
->state_test(CDir::STATE_AUXSUBTREE
));
11612 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11613 adjust_subtree_auth(dir
, mds
->get_nodeid());
11619 class C_MDC_FragmentMarking
: public MDCacheContext
{
11622 C_MDC_FragmentMarking(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11623 void finish(int r
) override
{
11624 mdcache
->fragment_mark_and_complete(mdr
);
11628 void MDCache::fragment_mark_and_complete(MDRequestRef
& mdr
)
11630 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11631 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11632 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11633 dout(7) << "fragment_mark_and_complete " << basedirfrag
<< " must have aborted" << dendl
;
11634 request_finish(mdr
);
11638 fragment_info_t
& info
= it
->second
;
11639 CInode
*diri
= info
.dirs
.front()->get_inode();
11640 dout(10) << "fragment_mark_and_complete " << info
.dirs
<< " on " << *diri
<< dendl
;
11642 MDSGatherBuilder
gather(g_ceph_context
);
11644 for (const auto& dir
: info
.dirs
) {
11646 if (!dir
->is_complete()) {
11647 dout(15) << " fetching incomplete " << *dir
<< dendl
;
11648 dir
->fetch(gather
.new_sub(), true); // ignore authpinnability
11650 } else if (dir
->get_frag() == frag_t()) {
11651 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11652 // the operation. To avoid CDir::fetch() complaining about missing object,
11653 // we commit new dirfrag first.
11654 if (dir
->state_test(CDir::STATE_CREATING
)) {
11655 dout(15) << " waiting until new dir gets journaled " << *dir
<< dendl
;
11656 dir
->add_waiter(CDir::WAIT_CREATED
, gather
.new_sub());
11658 } else if (dir
->is_new()) {
11659 dout(15) << " committing new " << *dir
<< dendl
;
11660 ceph_assert(dir
->is_dirty());
11661 dir
->commit(0, gather
.new_sub(), true);
11668 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11669 dout(15) << " marking " << *dir
<< dendl
;
11670 for (auto &p
: dir
->items
) {
11671 CDentry
*dn
= p
.second
;
11672 dn
->get(CDentry::PIN_FRAGMENTING
);
11673 ceph_assert(!dn
->state_test(CDentry::STATE_FRAGMENTING
));
11674 dn
->state_set(CDentry::STATE_FRAGMENTING
);
11676 dir
->state_set(CDir::STATE_DNPINNEDFRAG
);
11677 dir
->auth_unpin(dir
);
11679 dout(15) << " already marked " << *dir
<< dendl
;
11682 if (gather
.has_subs()) {
11683 gather
.set_finisher(new C_MDC_FragmentMarking(this, mdr
));
11688 for (const auto& dir
: info
.dirs
) {
11689 if (!dir
->is_frozen_dir()) {
11690 ceph_assert(dir
->is_freezing_dir());
11691 dir
->add_waiter(CDir::WAIT_FROZEN
, gather
.new_sub());
11694 if (gather
.has_subs()) {
11695 gather
.set_finisher(new C_MDC_FragmentFrozen(this, mdr
));
11697 // flush log so that request auth_pins are retired
11698 mds
->mdlog
->flush();
11702 fragment_frozen(mdr
, 0);
11705 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
)
11707 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs
<< dendl
;
11708 for (const auto& dir
: dirs
) {
11709 dout(10) << " frag " << *dir
<< dendl
;
11711 ceph_assert(dir
->state_test(CDir::STATE_FRAGMENTING
));
11712 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11714 if (dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11715 dir
->state_clear(CDir::STATE_DNPINNEDFRAG
);
11717 for (auto &p
: dir
->items
) {
11718 CDentry
*dn
= p
.second
;
11719 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11720 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11721 dn
->put(CDentry::PIN_FRAGMENTING
);
11724 dir
->auth_unpin(dir
);
11727 dir
->unfreeze_dir();
11731 bool MDCache::fragment_are_all_frozen(CDir
*dir
)
11733 ceph_assert(dir
->is_frozen_dir());
11734 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11735 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11736 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11738 if (p
->first
.frag
.contains(dir
->get_frag()))
11739 return p
->second
.all_frozen
;
11745 void MDCache::fragment_freeze_inc_num_waiters(CDir
*dir
)
11747 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11748 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11749 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11751 if (p
->first
.frag
.contains(dir
->get_frag())) {
11752 p
->second
.num_remote_waiters
++;
11759 void MDCache::find_stale_fragment_freeze()
11761 dout(10) << "find_stale_fragment_freeze" << dendl
;
11762 // see comment in Migrator::find_stale_export_freeze()
11763 utime_t now
= ceph_clock_now();
11764 utime_t cutoff
= now
;
11765 cutoff
-= g_conf()->mds_freeze_tree_timeout
;
11767 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
11768 p
!= fragments
.end(); ) {
11769 dirfrag_t df
= p
->first
;
11770 fragment_info_t
& info
= p
->second
;
11772 if (info
.all_frozen
)
11775 int total_auth_pins
= 0;
11776 for (const auto& d
: info
.dirs
) {
11778 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11779 total_auth_pins
= -1;
11782 if (dir
->is_frozen_dir())
11784 total_auth_pins
+= dir
->get_auth_pins() + dir
->get_dir_auth_pins();
11786 if (total_auth_pins
< 0)
11788 if (info
.last_cum_auth_pins
!= total_auth_pins
) {
11789 info
.last_cum_auth_pins
= total_auth_pins
;
11790 info
.last_cum_auth_pins_change
= now
;
11793 if (info
.last_cum_auth_pins_change
>= cutoff
)
11795 dir
= info
.dirs
.front();
11796 if (info
.num_remote_waiters
> 0 ||
11797 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
11798 dout(10) << " cancel fragmenting " << df
<< " bit " << info
.bits
<< dendl
;
11799 std::vector
<CDir
*> dirs
;
11800 info
.dirs
.swap(dirs
);
11801 fragments
.erase(df
);
11802 fragment_unmark_unfreeze_dirs(dirs
);
11807 class C_MDC_FragmentPrep
: public MDCacheLogContext
{
11810 C_MDC_FragmentPrep(MDCache
*m
, MDRequestRef
& r
) : MDCacheLogContext(m
), mdr(r
) {}
11811 void finish(int r
) override
{
11812 mdcache
->_fragment_logged(mdr
);
11816 class C_MDC_FragmentStore
: public MDCacheContext
{
11819 C_MDC_FragmentStore(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11820 void finish(int r
) override
{
11821 mdcache
->_fragment_stored(mdr
);
11825 class C_MDC_FragmentCommit
: public MDCacheLogContext
{
11826 dirfrag_t basedirfrag
;
11829 C_MDC_FragmentCommit(MDCache
*m
, dirfrag_t df
, const MDRequestRef
& r
) :
11830 MDCacheLogContext(m
), basedirfrag(df
), mdr(r
) {}
11831 void finish(int r
) override
{
11832 mdcache
->_fragment_committed(basedirfrag
, mdr
);
11836 class C_IO_MDC_FragmentPurgeOld
: public MDCacheIOContext
{
11837 dirfrag_t basedirfrag
;
11841 C_IO_MDC_FragmentPurgeOld(MDCache
*m
, dirfrag_t f
, int b
,
11842 const MDRequestRef
& r
) :
11843 MDCacheIOContext(m
), basedirfrag(f
), bits(b
), mdr(r
) {}
11844 void finish(int r
) override
{
11845 ceph_assert(r
== 0 || r
== -ENOENT
);
11846 mdcache
->_fragment_old_purged(basedirfrag
, bits
, mdr
);
11848 void print(ostream
& out
) const override
{
11849 out
<< "fragment_purge_old(" << basedirfrag
<< ")";
11853 void MDCache::fragment_frozen(MDRequestRef
& mdr
, int r
)
11855 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11856 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11857 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11858 dout(7) << "fragment_frozen " << basedirfrag
<< " must have aborted" << dendl
;
11859 request_finish(mdr
);
11863 ceph_assert(r
== 0);
11864 fragment_info_t
& info
= it
->second
;
11865 dout(10) << "fragment_frozen " << basedirfrag
.frag
<< " by " << info
.bits
11866 << " on " << info
.dirs
.front()->get_inode() << dendl
;
11868 info
.all_frozen
= true;
11869 dispatch_fragment_dir(mdr
);
11872 void MDCache::dispatch_fragment_dir(MDRequestRef
& mdr
)
11874 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11875 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11876 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11877 dout(7) << "dispatch_fragment_dir " << basedirfrag
<< " must have aborted" << dendl
;
11878 request_finish(mdr
);
11882 fragment_info_t
& info
= it
->second
;
11883 CInode
*diri
= info
.dirs
.front()->get_inode();
11885 dout(10) << "dispatch_fragment_dir " << basedirfrag
<< " bits " << info
.bits
11886 << " on " << *diri
<< dendl
;
11888 if (mdr
->more()->slave_error
)
11889 mdr
->aborted
= true;
11891 if (!mdr
->aborted
) {
11892 MutationImpl::LockOpVec lov
;
11893 lov
.add_wrlock(&diri
->dirfragtreelock
);
11894 // prevent a racing gather on any other scatterlocks too
11895 lov
.lock_scatter_gather(&diri
->nestlock
);
11896 lov
.lock_scatter_gather(&diri
->filelock
);
11897 if (!mds
->locker
->acquire_locks(mdr
, lov
, NULL
, true)) {
11903 if (mdr
->aborted
) {
11904 dout(10) << " can't auth_pin " << *diri
<< ", requeuing dir "
11905 << info
.dirs
.front()->dirfrag() << dendl
;
11907 mds
->balancer
->queue_split(info
.dirs
.front(), false);
11909 mds
->balancer
->queue_merge(info
.dirs
.front());
11910 fragment_unmark_unfreeze_dirs(info
.dirs
);
11911 fragments
.erase(it
);
11912 request_finish(mdr
);
11916 mdr
->ls
= mds
->mdlog
->get_current_segment();
11917 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_PREPARE
, basedirfrag
, info
.bits
);
11918 mds
->mdlog
->start_entry(le
);
11920 for (const auto& dir
: info
.dirs
) {
11921 dirfrag_rollback rollback
;
11922 rollback
.fnode
= dir
->fnode
;
11923 le
->add_orig_frag(dir
->get_frag(), &rollback
);
11927 MDSContext::vec waiters
;
11928 adjust_dir_fragments(diri
, info
.dirs
, basedirfrag
.frag
, info
.bits
,
11929 &info
.resultfrags
, waiters
, false);
11930 if (g_conf()->mds_debug_frag
)
11931 diri
->verify_dirfrags();
11932 mds
->queue_waiters(waiters
);
11934 for (const auto& fg
: le
->orig_frags
)
11935 ceph_assert(!diri
->dirfragtree
.is_leaf(fg
));
11937 le
->metablob
.add_dir_context(info
.resultfrags
.front());
11938 for (const auto& dir
: info
.resultfrags
) {
11939 if (diri
->is_auth()) {
11940 le
->metablob
.add_fragmented_dir(dir
, false, false);
11942 dir
->state_set(CDir::STATE_DIRTYDFT
);
11943 le
->metablob
.add_fragmented_dir(dir
, false, true);
11948 if (diri
->is_auth()) {
11949 // journal dirfragtree
11950 auto &pi
= diri
->project_inode();
11951 pi
.inode
.version
= diri
->pre_dirty();
11952 journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11954 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
11955 mdr
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
11956 mdr
->add_updated_lock(&diri
->dirfragtreelock
);
11961 mds->locker->mark_updated_scatterlock(&diri->filelock);
11962 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11963 mut->add_updated_lock(&diri->filelock);
11966 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11967 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11968 mut->add_updated_lock(&diri->nestlock);
11971 add_uncommitted_fragment(basedirfrag
, info
.bits
, le
->orig_frags
, mdr
->ls
);
11972 mds
->server
->submit_mdlog_entry(le
, new C_MDC_FragmentPrep(this, mdr
),
11974 mds
->mdlog
->flush();
11977 void MDCache::_fragment_logged(MDRequestRef
& mdr
)
11979 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11980 auto& info
= fragments
.at(basedirfrag
);
11981 CInode
*diri
= info
.resultfrags
.front()->get_inode();
11983 dout(10) << "fragment_logged " << basedirfrag
<< " bits " << info
.bits
11984 << " on " << *diri
<< dendl
;
11985 mdr
->mark_event("prepare logged");
11987 if (diri
->is_auth())
11988 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
11990 mdr
->apply(); // mark scatterlock
11992 // store resulting frags
11993 MDSGatherBuilder
gather(g_ceph_context
, new C_MDC_FragmentStore(this, mdr
));
11995 for (const auto& dir
: info
.resultfrags
) {
11996 dout(10) << " storing result frag " << *dir
<< dendl
;
11998 // freeze and store them too
11999 dir
->auth_pin(this);
12000 dir
->state_set(CDir::STATE_FRAGMENTING
);
12001 dir
->commit(0, gather
.new_sub(), true); // ignore authpinnability
12007 void MDCache::_fragment_stored(MDRequestRef
& mdr
)
12009 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
12010 fragment_info_t
&info
= fragments
.at(basedirfrag
);
12011 CDir
*first
= info
.resultfrags
.front();
12012 CInode
*diri
= first
->get_inode();
12014 dout(10) << "fragment_stored " << basedirfrag
<< " bits " << info
.bits
12015 << " on " << *diri
<< dendl
;
12016 mdr
->mark_event("new frags stored");
12019 mds_rank_t diri_auth
= (first
->is_subtree_root() && !diri
->is_auth()) ?
12020 diri
->authority().first
: CDIR_AUTH_UNKNOWN
;
12021 for (const auto &p
: first
->get_replicas()) {
12022 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
12023 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
12024 rejoin_gather
.count(p
.first
)))
12027 auto notify
= make_message
<MMDSFragmentNotify
>(basedirfrag
, info
.bits
, mdr
->reqid
.tid
);
12028 if (diri_auth
!= CDIR_AUTH_UNKNOWN
&& // subtree root
12029 diri_auth
!= p
.first
) { // not auth mds of diri
12031 * In the nornal case, mds does not trim dir inode whose child dirfrags
12032 * are likely being fragmented (see trim_inode()). But when fragmenting
12033 * subtree roots, following race can happen:
12035 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
12036 * mds.c and drops wrlock on dirfragtreelock.
12037 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
12038 * SYNC and send lock message mds.c
12039 * - mds.c receives the lock message and changes dirfragtreelock state
12041 * - mds.c trim dirfrag and dir inode from its cache
12042 * - mds.c receives the fragment_notify message
12044 * So we need to ensure replicas have received the notify, then unlock
12045 * the dirfragtreelock.
12047 notify
->mark_ack_wanted();
12048 info
.notify_ack_waiting
.insert(p
.first
);
12051 // freshly replicate new dirs to peers
12052 for (const auto& dir
: info
.resultfrags
) {
12053 encode_replica_dir(dir
, p
.first
, notify
->basebl
);
12056 mds
->send_message_mds(notify
, p
.first
);
12060 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_COMMIT
, basedirfrag
, info
.bits
);
12061 mds
->mdlog
->start_submit_entry(le
, new C_MDC_FragmentCommit(this, basedirfrag
, mdr
));
12064 // unfreeze resulting frags
12065 for (const auto& dir
: info
.resultfrags
) {
12066 dout(10) << " result frag " << *dir
<< dendl
;
12068 for (auto &p
: dir
->items
) {
12069 CDentry
*dn
= p
.second
;
12070 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
12071 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
12072 dn
->put(CDentry::PIN_FRAGMENTING
);
12076 dir
->unfreeze_dir();
12079 if (info
.notify_ack_waiting
.empty()) {
12080 fragment_drop_locks(info
);
12082 mds
->locker
->drop_locks_for_fragment_unfreeze(mdr
.get());
12086 void MDCache::_fragment_committed(dirfrag_t basedirfrag
, const MDRequestRef
& mdr
)
12088 dout(10) << "fragment_committed " << basedirfrag
<< dendl
;
12090 mdr
->mark_event("commit logged");
12092 ufragment
&uf
= uncommitted_fragments
.at(basedirfrag
);
12094 // remove old frags
12095 C_GatherBuilder
gather(
12098 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag
, uf
.bits
, mdr
),
12101 SnapContext nullsnapc
;
12102 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
12103 for (const auto& fg
: uf
.old_frags
) {
12104 object_t oid
= CInode::get_object_name(basedirfrag
.ino
, fg
, "");
12105 ObjectOperation op
;
12106 if (fg
== frag_t()) {
12107 // backtrace object
12108 dout(10) << " truncate orphan dirfrag " << oid
<< dendl
;
12112 dout(10) << " removing orphan dirfrag " << oid
<< dendl
;
12115 mds
->objecter
->mutate(oid
, oloc
, op
, nullsnapc
,
12116 ceph::real_clock::now(),
12117 0, gather
.new_sub());
12120 ceph_assert(gather
.has_subs());
// Called after the old dirfrag objects have been purged from RADOS:
// journal EFragment OP_FINISH, bump the split/merge perf counter, and
// finish (or wait on notify acks for) the in-memory fragment record.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12124 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag
, int bits
, const MDRequestRef
& mdr
)
12126 dout(10) << "fragment_old_purged " << basedirfrag
<< dendl
;
12128 mdr
->mark_event("old frags purged");
12130 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_FINISH
, basedirfrag
, bits
);
12131 mds
->mdlog
->start_submit_entry(le
);
12133 finish_uncommitted_fragment(basedirfrag
, EFragment::OP_FINISH
);
// positive bits = split, negative = merge (branch structure implied by
// the two counters; the surrounding if/else lines are elided here)
12137 mds
->logger
->inc(l_mds_dir_split
);
12139 mds
->logger
->inc(l_mds_dir_merge
);
12144 auto it
= fragments
.find(basedirfrag
);
12145 ceph_assert(it
!= fragments
.end());
12146 it
->second
.finishing
= true;
12147 if (it
->second
.notify_ack_waiting
.empty())
12148 fragment_maybe_finish(it
);
// NOTE(review): "wating" is a typo for "waiting" in this event string;
// it is runtime-visible text, so it is left untouched in this doc pass.
12150 mdr
->mark_event("wating for notify acks");
// Release the locks held for a fragmentation operation and retire its
// internal MDRequest. The mdr reference itself is kept (see the
// commented-out reset below) — presumably callers still need it; confirm.
12154 void MDCache::fragment_drop_locks(fragment_info_t
& info
)
12156 mds
->locker
->drop_locks(info
.mdr
.get());
12157 request_finish(info
.mdr
);
12158 //info.mdr.reset();
// Finish a fragmentation if it has reached the "finishing" state:
// clear the FRAGMENTING flag and auth pin on every result frag, give the
// balancer a chance to re-split them, and erase the tracking entry.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12161 void MDCache::fragment_maybe_finish(const fragment_info_iterator
& it
)
// not yet marked finishing (by _fragment_old_purged) — nothing to do yet
12163 if (!it
->second
.finishing
)
12166 // unmark & auth_unpin
12167 for (const auto &dir
: it
->second
.resultfrags
) {
12168 dir
->state_clear(CDir::STATE_FRAGMENTING
);
12169 dir
->auth_unpin(this);
12171 // In case the resulting fragments are beyond the split size,
12172 // we might need to split them again right away (they could
12173 // have been taking inserts between unfreezing and getting
12175 mds
->balancer
->maybe_fragment(dir
, false);
// the iterator is consumed here; callers must not use it afterwards
12178 fragments
.erase(it
);
// Handle an ack for a fragment-notify we sent to a replica MDS. When the
// last expected ack arrives, drop the fragmentation locks and (maybe)
// finish the fragment record.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12182 void MDCache::handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &ack
)
12184 dout(10) << "handle_fragment_notify_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
12185 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// ignore acks while we are not yet active (elided body presumably returns)
12187 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
12191 auto it
= fragments
.find(ack
->get_base_dirfrag());
// tid mismatch means the ack belongs to an older fragmentation attempt
12192 if (it
== fragments
.end() ||
12193 it
->second
.get_tid() != ack
->get_tid()) {
12194 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl
;
// erase() returns non-zero only if this rank was actually being waited on
12198 if (it
->second
.notify_ack_waiting
.erase(from
) &&
12199 it
->second
.notify_ack_waiting
.empty()) {
12200 fragment_drop_locks(it
->second
);
12201 fragment_maybe_finish(it
);
// Handle a fragment-notify from the auth MDS: replay the split/merge on
// our replica (adjust_dir_fragments), decode replica dir state for the
// new frags, wake waiters, and ack if requested.
// NOTE(review): extraction-garbled text — "¬ify" below is mojibake for
// "&notify" in the original; code bytes preserved exactly as found.
12205 void MDCache::handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> &notify
)
12207 dout(10) << "handle_fragment_notify " << *notify
<< " from " << notify
->get_source() << dendl
;
12208 mds_rank_t from
= mds_rank_t(notify
->get_source().num());
// too early in startup to apply replica fragment changes
12210 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
12214 CInode
*diri
= get_inode(notify
->get_ino());
12216 frag_t base
= notify
->get_basefrag();
12217 int bits
= notify
->get_bits();
// if our dirfragtree already reflects the change (learned during
// resolve/rejoin), this notify is stale and is ignored
12220 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12221 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12222 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12223 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12229 MDSContext::vec waiters
;
12230 std::vector
<CDir
*> resultfrags
;
12231 adjust_dir_fragments(diri
, base
, bits
, &resultfrags
, waiters
, false);
12232 if (g_conf()->mds_debug_frag
)
12233 diri
->verify_dirfrags();
12235 for (const auto& dir
: resultfrags
) {
12236 diri
->take_dir_waiting(dir
->get_frag(), waiters
);
12239 // add new replica dirs values
12240 auto p
= notify
->basebl
.cbegin();
12242 CDir
*tmp_dir
= nullptr;
12243 decode_replica_dir(tmp_dir
, p
, diri
, from
, waiters
);
12246 mds
->queue_waiters(waiters
);
12251 if (notify
->is_ack_wanted()) {
12252 auto ack
= make_message
<MMDSFragmentNotifyAck
>(notify
->get_base_dirfrag(),
12253 notify
->get_bits(), notify
->get_tid());
12254 mds
->send_message_mds(ack
, from
);
// Record a fragmentation that has been journaled (OP_PREPARE) but not yet
// committed, so it can be rolled back or completed after a crash. The
// rollback bufferlist is swapped into the record (rollback is consumed).
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12258 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frags
,
12259 LogSegment
*ls
, bufferlist
*rollback
)
12261 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag
<< " bits " << bits
<< dendl
;
// each basedirfrag may have at most one in-flight fragmentation
12262 ceph_assert(!uncommitted_fragments
.count(basedirfrag
));
12263 ufragment
& uf
= uncommitted_fragments
[basedirfrag
];
12264 uf
.old_frags
= old_frags
;
// tie the record to the log segment so trimming waits on it
12267 ls
->uncommitted_fragments
.insert(basedirfrag
);
12269 uf
.rollback
.swap(*rollback
);
// Resolve an uncommitted fragmentation record for the given op
// (OP_FINISH/OP_COMMIT/...): mark it committed when appropriate,
// detach it from its log segment, wake waiters, and drop the record.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12272 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
)
12274 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12275 << " op " << EFragment::op_name(op
) << dendl
;
12276 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
// silently a no-op when there is no matching record (e.g. replayed op)
12277 if (it
!= uncommitted_fragments
.end()) {
12278 ufragment
& uf
= it
->second
;
// non-FINISH with surviving old frags: old objects still need purging,
// so only flag the record committed here (erase path is elided/else)
12279 if (op
!= EFragment::OP_FINISH
&& !uf
.old_frags
.empty()) {
12280 uf
.committed
= true;
12282 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12283 mds
->queue_waiters(uf
.waiters
);
12284 uncommitted_fragments
.erase(it
);
// Roll back (or mark committed) an uncommitted fragmentation during
// journal replay, adopting the caller-supplied old_frags list.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12289 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
)
12291 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12292 << " old_frags (" << old_frags
<< ")" << dendl
;
12293 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
12294 if (it
!= uncommitted_fragments
.end()) {
12295 ufragment
& uf
= it
->second
;
// old frags still on disk: remember them and mark committed so the
// purge path will clean them up (erase path is elided/else)
12296 if (!uf
.old_frags
.empty()) {
12297 uf
.old_frags
= std::move(old_frags
);
12298 uf
.committed
= true;
12300 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12301 uncommitted_fragments
.erase(it
);
// Register a gather sub-context on every pending uncommitted fragment,
// so the gather fires once all of them have resolved.
12306 void MDCache::wait_for_uncommitted_fragments(MDSGather
*gather
)
12308 for (auto& p
: uncommitted_fragments
)
12309 p
.second
.waiters
.push_back(gather
->new_sub());
// After replay/resolve, walk all pending uncommitted fragmentations and
// either re-issue the commit purge (if already committed) or roll the
// dirfrags back to their pre-fragmentation state, journaling an
// EFragment OP_ROLLBACK and restoring fnodes from the saved rollback blob.
// NOTE(review): extraction-garbled text with elided interior lines
// (loop increments, else branches, closing braces); code bytes preserved
// exactly as found — do not treat this as a complete listing.
12312 void MDCache::rollback_uncommitted_fragments()
12314 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments
.size() << " pending" << dendl
;
12315 for (map
<dirfrag_t
, ufragment
>::iterator p
= uncommitted_fragments
.begin();
12316 p
!= uncommitted_fragments
.end();
12318 ufragment
&uf
= p
->second
;
12319 CInode
*diri
= get_inode(p
->first
.ino
);
// already committed: nothing to undo, just purge the old objects
12322 if (uf
.committed
) {
12323 _fragment_committed(p
->first
, MDRequestRef());
12327 dout(10) << " rolling back " << p
->first
<< " refragment by " << uf
.bits
<< " bits" << dendl
;
12329 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
12330 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_ROLLBACK
, p
->first
, uf
.bits
);
12331 mds
->mdlog
->start_entry(le
);
12332 bool diri_auth
= (diri
->authority() != CDIR_AUTH_UNDEF
);
// collect the leaves produced by the aborted fragmentation; they are
// swapped into uf.old_frags at the end so the purge removes them
12334 frag_vec_t old_frags
;
12335 diri
->dirfragtree
.get_leaves_under(p
->first
.frag
, old_frags
);
12337 std::vector
<CDir
*> resultfrags
;
// no saved old_frags: legacy journal entry, just invert the split
12338 if (uf
.old_frags
.empty()) {
12339 // created by old format EFragment
12340 MDSContext::vec waiters
;
12341 adjust_dir_fragments(diri
, p
->first
.frag
, -uf
.bits
, &resultfrags
, waiters
, true);
// new-format path: recreate each original frag and restore its fnode
// from the rollback bufferlist captured at prepare time
12343 auto bp
= uf
.rollback
.cbegin();
12344 for (const auto& fg
: uf
.old_frags
) {
12345 CDir
*dir
= force_dir_fragment(diri
, fg
);
12346 resultfrags
.push_back(dir
);
12348 dirfrag_rollback rollback
;
12349 decode(rollback
, bp
);
12351 dir
->set_version(rollback
.fnode
.version
);
12352 dir
->fnode
= rollback
.fnode
;
12354 dir
->_mark_dirty(ls
);
// restored rstat/fragstat may disagree with accounted values; mark the
// relevant scatterlocks dirty so scatter-gather reconciles them later
12356 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
12357 dout(10) << " dirty nestinfo on " << *dir
<< dendl
;
12358 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
12359 ls
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
12361 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
12362 dout(10) << " dirty fragstat on " << *dir
<< dendl
;
12363 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
12364 ls
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
12367 le
->add_orig_frag(dir
->get_frag());
12368 le
->metablob
.add_dir_context(dir
);
12370 le
->metablob
.add_fragmented_dir(dir
, true, false);
12372 dout(10) << " dirty dirfragtree on " << *dir
<< dendl
;
12373 dir
->state_set(CDir::STATE_DIRTYDFT
);
12374 le
->metablob
.add_fragmented_dir(dir
, true, true);
// bump and dirty the inode so the restored dirfragtree is journaled
12380 auto &pi
= diri
->project_inode();
12381 pi
.inode
.version
= diri
->pre_dirty();
12382 diri
->pop_and_dirty_projected_inode(ls
); // hacky
12383 le
->metablob
.add_primary_dentry(diri
->get_projected_parent_dn(), diri
, true);
12385 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
12386 ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
12389 if (g_conf()->mds_debug_frag
)
12390 diri
->verify_dirfrags();
// sanity: none of the aborted-result frags may remain leaves
12392 for (const auto& leaf
: old_frags
) {
12393 ceph_assert(!diri
->dirfragtree
.is_leaf(leaf
));
12396 mds
->mdlog
->submit_entry(le
);
12398 uf
.old_frags
.swap(old_frags
);
12399 _fragment_committed(p
->first
, MDRequestRef());
// Put the file system into forced read-only mode: warn via the cluster
// log, make the Server reject client writes, re-evaluate caps on every
// cached inode (revoking write caps), and flush the journal.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12403 void MDCache::force_readonly()
12408 dout(1) << "force file system read-only" << dendl
;
12409 mds
->clog
->warn() << "force file system read-only";
12413 mds
->server
->force_clients_readonly();
12415 // revoke write caps
12417 for (auto &p
: inode_map
) {
12418 CInode
*in
= p
.second
;
12420 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
// periodic heartbeat reset so a huge cache walk doesn't trip the
// internal beacon/heartbeat timeout
12421 if (!(++count
% 1000))
12422 mds
->heartbeat_reset();
12425 mds
->mdlog
->flush();
12429 // ==============================================================
// Debug-print the subtree map: a first BFS pass over base dirfrags
// measures depth and records seen subtrees, a second pass prints each
// subtree with indentation and auth information, and a final pass
// asserts no stray entries remain in the subtree map. Skips printing
// when output would not be gathered at level `dbl`, or when the map is
// too large/deep (unless force_print).
// NOTE(review): extraction-garbled text with elided interior lines;
// code bytes preserved exactly as found.
12432 void MDCache::show_subtrees(int dbl
, bool force_print
)
// thrashing generates too much churn to print usefully (elided return)
12434 if (g_conf()->mds_thrash_exports
)
12437 //dout(10) << "show_subtrees" << dendl;
12439 if (!g_conf()->subsys
.should_gather(ceph_subsys_mds
, dbl
))
12440 return; // i won't print anything.
12442 if (subtrees
.empty()) {
12443 dout(ceph::dout::need_dynamic(dbl
)) << "show_subtrees - no subtrees"
// suppress large subtree maps unless debug level 25+ or forced
12448 if (!force_print
&& subtrees
.size() > SUBTREES_COUNT_THRESHOLD
&&
12449 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12450 dout(ceph::dout::need_dynamic(dbl
)) << "number of subtrees = " << subtrees
.size() << "; not "
12451 "printing subtrees" << dendl
;
12456 std::vector
<CDir
*> basefrags
;
12457 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
12458 p
!= base_inodes
.end();
12460 (*p
)->get_dirfrags(basefrags
);
12461 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12462 dout(15) << "show_subtrees" << dendl
;
// work queue of (dirfrag, depth) pairs, seeded with base frags
12465 list
<pair
<CDir
*,int> > q
;
12470 for (const auto& dir
: basefrags
) {
12471 q
.emplace_back(dir
, 0);
12474 set
<CDir
*> subtrees_seen
;
// first pass: walk the subtree graph to find max depth and record
// every subtree root actually reachable from a base frag
12476 unsigned int depth
= 0;
12477 while (!q
.empty()) {
12478 CDir
*dir
= q
.front().first
;
12479 unsigned int d
= q
.front().second
;
12482 if (subtrees
.count(dir
) == 0) continue;
12484 subtrees_seen
.insert(dir
);
12486 if (d
> depth
) depth
= d
;
12489 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12490 if (seen
.count(dir
)) dout(0) << "aah, already seen " << *dir
<< dendl
;
12491 ceph_assert(seen
.count(dir
) == 0);
12495 if (!subtrees
[dir
].empty()) {
12496 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12497 p
!= subtrees
[dir
].end();
12499 //dout(25) << " saw sub " << **p << dendl;
12500 q
.push_front(pair
<CDir
*,int>(*p
, d
+1));
// suppress very deep trees unless debug level 25+ or forced
12505 if (!force_print
&& depth
> SUBTREES_DEPTH_THRESHOLD
&&
12506 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12507 dout(ceph::dout::need_dynamic(dbl
)) << "max depth among subtrees = " << depth
<< "; not printing "
12508 "subtrees" << dendl
;
// second pass: re-seed the queue and actually print each subtree
12513 for (const auto& dir
: basefrags
) {
12514 q
.emplace_back(dir
, 0);
12517 while (!q
.empty()) {
12518 CDir
*dir
= q
.front().first
;
12519 int d
= q
.front().second
;
12522 if (subtrees
.count(dir
) == 0) continue;
12525 while ((unsigned)d
< indent
.size())
// fixed-width leader; pad[0]='.' marks a subtree that has children
12529 string pad
= "______________________________________";
12530 pad
.resize(depth
*2+1-indent
.size());
12531 if (!subtrees
[dir
].empty())
12532 pad
[0] = '.'; // parent
12536 if (dir
->is_auth())
12542 if (dir
->get_dir_auth().second
== CDIR_AUTH_UNKNOWN
)
12543 snprintf(s
, sizeof(s
), "%2d ", int(dir
->get_dir_auth().first
));
12545 snprintf(s
, sizeof(s
), "%2d,%2d", int(dir
->get_dir_auth().first
), int(dir
->get_dir_auth().second
));
12548 dout(ceph::dout::need_dynamic(dbl
)) << indent
<< "|_" << pad
<< s
12549 << " " << auth
<< *dir
<< dendl
;
// sanity: well-known inodes must map to their cached singletons
12551 if (dir
->ino() == MDS_INO_ROOT
)
12552 ceph_assert(dir
->inode
== root
);
12553 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
12554 ceph_assert(dir
->inode
== myin
);
12555 if (dir
->inode
->is_stray() && (MDS_INO_STRAY_OWNER(dir
->ino()) == mds
->get_nodeid()))
12556 ceph_assert(strays
[MDS_INO_STRAY_INDEX(dir
->ino())] == dir
->inode
);
12559 if (!subtrees
[dir
].empty()) {
12560 // more at my level?
12561 if (!q
.empty() && q
.front().second
== d
)
12566 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12567 p
!= subtrees
[dir
].end();
12569 q
.push_front(pair
<CDir
*,int>(*p
, d
+2));
12573 // verify there isn't stray crap in subtree map
12575 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
12576 p
!= subtrees
.end();
12578 if (subtrees_seen
.count(p
->first
)) continue;
12579 dout(10) << "*** stray/lost entry in subtree map: " << *p
->first
<< dendl
;
12582 ceph_assert(lost
== 0);
// Debug-dump the whole cache at log level 7: for every cached inode
// (live and snapped), print its dirfrags, dentries, and linked inodes.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12585 void MDCache::show_cache()
12587 dout(7) << "show_cache" << dendl
;
// per-inode printer shared by both maps below
12589 auto show_func
= [this](CInode
*in
) {
12592 dout(7) << " unlinked " << *in
<< dendl
;
12595 auto&& dfs
= in
->get_dirfrags();
12596 for (const auto& dir
: dfs
) {
12597 dout(7) << " dirfrag " << *dir
<< dendl
;
12599 for (auto &p
: dir
->items
) {
12600 CDentry
*dn
= p
.second
;
12601 dout(7) << " dentry " << *dn
<< dendl
;
12602 CDentry::linkage_t
*dnl
= dn
->get_linkage();
12603 if (dnl
->is_primary() && dnl
->get_inode())
12604 dout(7) << " inode " << *dnl
->get_inode() << dendl
;
12609 for (auto &p
: inode_map
)
12610 show_func(p
.second
);
12611 for (auto &p
: snap_inode_map
)
12612 show_func(p
.second
);
// Emit a brief cache status to the formatter: currently just the
// mds_co mempool usage, nested as cache -> pool.
12615 void MDCache::cache_status(Formatter
*f
)
12617 f
->open_object_section("cache");
12619 f
->open_object_section("pool");
12620 mempool::get_pool(mempool::mds_co::id
).dump(f
);
12621 f
->close_section();
12623 f
->close_section();
// Recursively dump the subtree rooted at `in` to the formatter,
// depth-first, stopping below max_depth (max_depth < 0 = unlimited).
// Children are emitted before the inode itself.
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12626 void MDCache::dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
)
12629 if ((max_depth
>= 0) && (cur_depth
> max_depth
)) {
12632 auto&& ls
= in
->get_dirfrags();
12633 for (const auto &subdir
: ls
) {
12634 for (const auto &p
: subdir
->items
) {
12635 CDentry
*dn
= p
.second
;
// inner `in` shadows the parameter on purpose: the linked child inode
12636 CInode
*in
= dn
->get_linkage()->get_inode();
12638 dump_tree(in
, cur_depth
+ 1, max_depth
, f
);
12642 f
->open_object_section("inode");
12643 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12644 f
->close_section();
// Convenience overload: dump the cache to a plain-text file (no formatter).
12647 int MDCache::dump_cache(std::string_view file_name
)
12649 return dump_cache(file_name
, NULL
);
// Convenience overload: dump the cache to a Formatter (no file path).
12652 int MDCache::dump_cache(Formatter
*f
)
12654 return dump_cache(std::string_view(""), f
);
12658 * Dump the metadata cache, either to a Formatter, if
12659 * provided, else to a plain text file.
// Dump every cached inode (live and snapped) either to the formatter
// (JSON-ish "inodes" array) or, when f is null, as text lines written
// to `fn` (or an auto-generated cachedump.<epoch>.mds<rank> file).
// Refuses to run when cache size exceeds the configured dump threshold.
// NOTE(review): extraction-garbled text with elided interior lines
// (error returns, stream declarations, cleanup); bytes preserved as-is.
12661 int MDCache::dump_cache(std::string_view fn
, Formatter
*f
)
12665 // dumping large caches may cause mds to hang or worse get killed.
12666 // so, disallow the dump if the cache size exceeds the configured
12667 // threshold, which is 1G for formatter and unlimited for file (note
12668 // that this can be jacked up by the admin... and is nothing but foot
12669 // shooting, but the option itself is for devs and hence dangerous to
12670 // tune). TODO: remove this when fixed.
12671 uint64_t threshold
= f
?
12672 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_formatter") :
12673 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_file");
12675 if (threshold
&& cache_size() > threshold
) {
// formatter path reports the error in-band; file path goes to derr
12677 std::stringstream ss
;
12678 ss
<< "cache usage exceeds dump threshold";
12679 f
->open_object_section("result");
12680 f
->dump_string("error", ss
.str());
12681 f
->close_section();
12683 derr
<< "cache usage exceeds dump threshold" << dendl
;
12693 f
->open_array_section("inodes");
12695 char path
[PATH_MAX
] = "";
12697 snprintf(path
, sizeof path
, "%s", fn
.data());
12699 snprintf(path
, sizeof path
, "cachedump.%d.mds%d", (int)mds
->mdsmap
->get_epoch(), int(mds
->get_nodeid()));
12702 dout(1) << "dump_cache to " << path
<< dendl
;
// O_EXCL: refuse to clobber an existing dump file
12704 fd
= ::open(path
, O_WRONLY
|O_CREAT
|O_EXCL
|O_CLOEXEC
, 0600);
12706 derr
<< "failed to open " << path
<< ": " << cpp_strerror(errno
) << dendl
;
// per-inode writer: formatter branch dumps structured data; the file
// branch writes inode / dirfrag / dentry lines via safe_write
12711 auto dump_func
= [fd
, f
](CInode
*in
) {
12714 f
->open_object_section("inode");
12715 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12716 f
->close_section();
12720 ss
<< *in
<< std::endl
;
12721 std::string s
= ss
.str();
12722 r
= safe_write(fd
, s
.c_str(), s
.length());
12725 auto&& dfs
= in
->get_dirfrags();
12726 for (auto &dir
: dfs
) {
12728 tt
<< " " << *dir
<< std::endl
;
12729 std::string t
= tt
.str();
12730 r
= safe_write(fd
, t
.c_str(), t
.length());
12733 for (auto &p
: dir
->items
) {
12734 CDentry
*dn
= p
.second
;
12736 uu
<< " " << *dn
<< std::endl
;
12737 std::string u
= uu
.str();
12738 r
= safe_write(fd
, u
.c_str(), u
.length());
12742 dir
->check_rstats();
12747 for (auto &p
: inode_map
) {
12748 r
= dump_func(p
.second
);
12752 for (auto &p
: snap_inode_map
) {
12753 r
= dump_func(p
.second
);
12761 f
->close_section(); // inodes
// Retry context: holds the cache and the request so finish() can
// re-dispatch the request once the blocking condition clears.
12770 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
)
12771 : MDSInternalContext(c
->mds
), cache(c
), mdr(r
)
// On wakeup, re-dispatch the stashed request through the cache.
12774 void C_MDS_RetryRequest::finish(int r
)
12777 cache
->dispatch_request(mdr
);
// Completion for the `scrub start` admin command: formats the result for
// the CLI and forwards the (possibly remapped) return code to on_finish.
// take_finisher() transfers ownership of on_finish to the caller.
// NOTE(review): extraction-garbled text with elided members/lines;
// code bytes preserved exactly as found.
12781 class C_MDS_EnqueueScrub
: public Context
12784 Formatter
*formatter
;
12785 Context
*on_finish
;
12787 ScrubHeaderRef header
;
12788 C_MDS_EnqueueScrub(std::string_view tag
, Formatter
*f
, Context
*fin
) :
12789 tag(tag
), formatter(f
), on_finish(fin
), header(nullptr) {}
// hand the finisher to the caller (elided body presumably nulls on_finish)
12791 Context
*take_finisher() {
12792 Context
*fin
= on_finish
;
12797 void finish(int r
) override
{
12799 // since recursive scrub is asynchronous, dump minimal output
12800 // to not upset cli tools.
12801 if (header
&& header
->get_recursive()) {
12802 formatter
->open_object_section("results");
12803 formatter
->dump_int("return_code", 0);
12804 formatter
->dump_string("scrub_tag", tag
);
12805 formatter
->dump_string("mode", "asynchronous");
12806 formatter
->close_section(); // results
12808 } else { // we failed the lookup or something; dump ourselves
12809 formatter
->open_object_section("results");
12810 formatter
->dump_int("return_code", r
);
12811 formatter
->close_section(); // results
12812 r
= 0; // already dumped in formatter
12815 on_finish
->complete(r
);
// Entry point for the `scrub start` command: build an internal
// ENQUEUE_SCRUB request for `path` (with "~mdsdir" mapped to this rank's
// mdsdir), generate a UUID tag when none was given, attach a ScrubHeader,
// and kick off enqueue_scrub_work().
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
12819 void MDCache::enqueue_scrub(
12820 std::string_view path
,
12821 std::string_view tag
,
12822 bool force
, bool recursive
, bool repair
,
12823 Formatter
*f
, Context
*fin
)
12825 dout(10) << __func__
<< " " << path
<< dendl
;
12826 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB
);
12827 if (path
== "~mdsdir") {
12828 filepath
fp(MDS_INO_MDSDIR(mds
->get_nodeid()));
12829 mdr
->set_filepath(fp
);
12832 mdr
->set_filepath(path
);
// no user tag: mark the scrub internal and tag it with a random UUID
12835 bool is_internal
= false;
12836 std::string
tag_str(tag
);
12837 if (tag_str
.empty()) {
12839 uuid_gen
.generate_random();
12840 tag_str
= uuid_gen
.to_string();
12841 is_internal
= true;
12844 C_MDS_EnqueueScrub
*cs
= new C_MDS_EnqueueScrub(tag_str
, f
, fin
);
12845 cs
->header
= std::make_shared
<ScrubHeader
>(
12846 tag_str
, is_internal
, force
, recursive
, repair
, f
);
12848 mdr
->internal_op_finish
= cs
;
12849 enqueue_scrub_work(mdr
);
// Resolve the scrub target path, reject concurrent scrubs of the same
// inode, build the completion chain (recursive-finish bookkeeping, and a
// journal flush + trim when the scrub repaired anything), then push the
// inode onto the scrub stack (top for non-recursive, bottom otherwise).
// NOTE(review): extraction-garbled text with elided interior lines;
// code bytes preserved exactly as found.
12852 void MDCache::enqueue_scrub_work(MDRequestRef
& mdr
)
// may return null and queue a retry internally (elided early-return)
12854 CInode
*in
= mds
->server
->rdlock_path_pin_ref(mdr
, true);
12858 // TODO: Remove this restriction
12859 ceph_assert(in
->is_auth());
12861 C_MDS_EnqueueScrub
*cs
= static_cast<C_MDS_EnqueueScrub
*>(mdr
->internal_op_finish
);
12862 ScrubHeaderRef header
= cs
->header
;
12864 // Cannot scrub same dentry twice at same time
12865 if (in
->scrub_is_in_progress()) {
12866 mds
->server
->respond_to_request(mdr
, -EBUSY
);
12872 header
->set_origin(in
);
// recursive scrubs pin the origin for the duration and finish via
// recursive_scrub_finish(); non-recursive scrubs reuse the CLI finisher
12875 if (header
->get_recursive()) {
12876 header
->get_origin()->get(CInode::PIN_SCRUBQUEUE
);
12877 fin
= new MDSInternalContextWrapper(mds
,
12878 new LambdaContext([this, header
](int r
) {
12879 recursive_scrub_finish(header
);
12880 header
->get_origin()->put(CInode::PIN_SCRUBQUEUE
);
12884 fin
= cs
->take_finisher();
12887 // If the scrub did some repair, then flush the journal at the end of
12888 // the scrub. Otherwise in the case of e.g. rewriting a backtrace
12889 // the on disk state will still look damaged.
12890 auto scrub_finish
= new LambdaContext([this, header
, fin
](int r
){
// nothing repaired: skip the flush path (elided completion of fin)
12891 if (!header
->get_repaired()) {
12897 auto flush_finish
= new LambdaContext([this, fin
](int r
){
12898 dout(4) << "Expiring log segments because scrub did some repairs" << dendl
;
12899 mds
->mdlog
->trim_all();
12902 MDSGatherBuilder
gather(g_ceph_context
);
12903 auto& expiring_segments
= mds
->mdlog
->get_expiring_segments();
12904 for (auto logseg
: expiring_segments
)
12905 logseg
->wait_for_expiry(gather
.new_sub());
12906 ceph_assert(gather
.has_subs());
12907 gather
.set_finisher(new MDSInternalContextWrapper(mds
, fin
));
12912 dout(4) << "Flushing journal because scrub did some repairs" << dendl
;
12913 mds
->mdlog
->start_new_segment();
12914 mds
->mdlog
->flush();
12915 mds
->mdlog
->wait_for_safe(new MDSInternalContextWrapper(mds
, flush_finish
));
12918 if (!header
->get_recursive()) {
12919 mds
->scrubstack
->enqueue_inode_top(in
, header
,
12920 new MDSInternalContextWrapper(mds
, scrub_finish
));
12922 mds
->scrubstack
->enqueue_inode_bottom(in
, header
,
12923 new MDSInternalContextWrapper(mds
, scrub_finish
));
12926 mds
->server
->respond_to_request(mdr
, 0);
// Post-recursive-scrub hook: when a forced repair scrub of a base
// directory completes on a single-MDS cluster that is also the
// tableserver, tell the snapserver the base is fully scrubbed (used to
// track completion of old-format snaprealm conversion).
12930 void MDCache::recursive_scrub_finish(const ScrubHeaderRef
& header
)
12932 if (header
->get_origin()->is_base() &&
12933 header
->get_force() && header
->get_repair()) {
12934 // notify snapserver that base directory is recursively scrubbed.
12935 // After both root and mdsdir are recursively scrubbed, snapserver
12936 // knows that all old format snaprealms are converted to the new
// single in-MDS, no failed ranks, and we are the snap tableserver
12938 if (mds
->mdsmap
->get_num_in_mds() == 1 &&
12939 mds
->mdsmap
->get_num_failed_mds() == 0 &&
12940 mds
->mdsmap
->get_tableserver() == mds
->get_nodeid()) {
12941 mds
->mark_base_recursively_scrubbed(header
->get_origin()->ino());
// Log-completion context that replies to an internal MDRequest with the
// journal commit's result code.
12946 struct C_MDC_RespondInternalRequest
: public MDCacheLogContext
{
12948 C_MDC_RespondInternalRequest(MDCache
*c
, MDRequestRef
& m
) :
12949 MDCacheLogContext(c
), mdr(m
) {}
12950 void finish(int r
) override
{
12952 get_mds()->server
->respond_to_request(mdr
, r
);
// Kick off an internal REPAIR_FRAGSTATS request for a single dirfrag;
// the no-op finisher means completion is reported only via the request.
12956 void MDCache::repair_dirfrag_stats(CDir
*dir
)
12958 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS
);
12960 mdr
->internal_op_private
= dir
;
12961 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
12962 repair_dirfrag_stats_work(mdr
);
// Worker for repair_dirfrag_stats: auth-pin and lock the dirfrag, fetch
// it if incomplete, recompute fragstat/rstat from the (head) dentries,
// and if they disagree with the stored fnode, project corrected values
// and journal an EUpdate("repair_dirfrag").
// NOTE(review): extraction-garbled text with elided interior lines
// (returns, else branches, closing braces); code bytes preserved as-is.
12965 void MDCache::repair_dirfrag_stats_work(MDRequestRef
& mdr
)
12967 CDir
*dir
= static_cast<CDir
*>(mdr
->internal_op_private
);
12968 dout(10) << __func__
<< " " << *dir
<< dendl
;
12970 if (!dir
->is_auth()) {
12971 mds
->server
->respond_to_request(mdr
, -ESTALE
);
// frozen dir: queue a retry on unfreeze and back off all pins/locks
12975 if (!mdr
->is_auth_pinned(dir
) && !dir
->can_auth_pin()) {
12976 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(this, mdr
));
12978 mds
->locker
->drop_locks(mdr
.get());
12979 mdr
->drop_local_auth_pins();
12980 if (mdr
->is_any_remote_auth_pin())
12981 mds
->locker
->notify_freeze_waiter(dir
);
12985 mdr
->auth_pin(dir
);
12987 MutationImpl::LockOpVec lov
;
12988 CInode
*diri
= dir
->inode
;
12989 lov
.add_rdlock(&diri
->dirfragtreelock
);
12990 lov
.add_wrlock(&diri
->nestlock
);
12991 lov
.add_wrlock(&diri
->filelock
);
12992 if (!mds
->locker
->acquire_locks(mdr
, lov
))
// need all dentries in cache to recount; fetch retries the request
12995 if (!dir
->is_complete()) {
12996 dir
->fetch(new C_MDS_RetryRequest(this, mdr
));
// recount head dentries: primary links split dirs/files, remote links
// count as files, and primaries contribute their accounted_rstat
13000 frag_info_t frag_info
;
13001 nest_info_t nest_info
;
13002 for (auto it
= dir
->begin(); it
!= dir
->end(); ++it
) {
13003 CDentry
*dn
= it
->second
;
13004 if (dn
->last
!= CEPH_NOSNAP
)
13006 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
13007 if (dnl
->is_primary()) {
13008 CInode
*in
= dnl
->get_inode();
13009 nest_info
.add(in
->get_projected_inode()->accounted_rstat
);
13011 frag_info
.nsubdirs
++;
13013 frag_info
.nfiles
++;
13014 } else if (dnl
->is_remote())
13015 frag_info
.nfiles
++;
13018 fnode_t
*pf
= dir
->get_projected_fnode();
13019 bool good_fragstat
= frag_info
.same_sums(pf
->fragstat
);
13020 bool good_rstat
= nest_info
.same_sums(pf
->rstat
);
13021 if (good_fragstat
&& good_rstat
) {
13022 dout(10) << __func__
<< " no corruption found" << dendl
;
13023 mds
->server
->respond_to_request(mdr
, 0);
13027 pf
= dir
->project_fnode();
13028 pf
->version
= dir
->pre_dirty();
13029 mdr
->add_projected_fnode(dir
);
13031 mdr
->ls
= mds
->mdlog
->get_current_segment();
13032 EUpdate
*le
= new EUpdate(mds
->mdlog
, "repair_dirfrag");
13033 mds
->mdlog
->start_entry(le
);
// keep monotonic fields (mtime/change_attr/rctime) from going backwards
13035 if (!good_fragstat
) {
13036 if (pf
->fragstat
.mtime
> frag_info
.mtime
)
13037 frag_info
.mtime
= pf
->fragstat
.mtime
;
13038 if (pf
->fragstat
.change_attr
> frag_info
.change_attr
)
13039 frag_info
.change_attr
= pf
->fragstat
.change_attr
;
13040 pf
->fragstat
= frag_info
;
13041 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
13042 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
13043 mdr
->add_updated_lock(&diri
->filelock
);
13047 if (pf
->rstat
.rctime
> nest_info
.rctime
)
13048 nest_info
.rctime
= pf
->rstat
.rctime
;
13049 pf
->rstat
= nest_info
;
13050 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
13051 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
13052 mdr
->add_updated_lock(&diri
->nestlock
);
13055 le
->metablob
.add_dir_context(dir
);
13056 le
->metablob
.add_dir(dir
, true);
13058 mds
->mdlog
->submit_entry(le
, new C_MDC_RespondInternalRequest(this, mdr
));
// Kick off an internal REPAIR_INODESTATS request for a directory inode;
// the no-op finisher means completion is reported only via the request.
13061 void MDCache::repair_inode_stats(CInode
*diri
)
13063 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS
);
13065 mdr
->internal_op_private
= diri
;
13066 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
13067 repair_inode_stats_work(mdr
);
// Worker for repair_inode_stats: phase 1 (mdr->ls unset) fetches every
// dirfrag leaf and dirties filelock/nestlock to trigger scatter-gather;
// phase 2 re-takes rdlocks (forcing the gather) and verifies the summed
// accounted_fragstat/accounted_rstat now match the inode's dirstat/rstat.
// NOTE(review): extraction-garbled text with elided interior lines;
// code bytes preserved exactly as found.
13070 void MDCache::repair_inode_stats_work(MDRequestRef
& mdr
)
13072 CInode
*diri
= static_cast<CInode
*>(mdr
->internal_op_private
);
13073 dout(10) << __func__
<< " " << *diri
<< dendl
;
13075 if (!diri
->is_auth()) {
13076 mds
->server
->respond_to_request(mdr
, -ESTALE
);
13079 if (!diri
->is_dir()) {
13080 mds
->server
->respond_to_request(mdr
, -ENOTDIR
);
13084 MutationImpl::LockOpVec lov
;
// mdr->ls set means phase 1 already ran; skip to verification phase
13086 if (mdr
->ls
) // already marked filelock/nestlock dirty ?
13089 lov
.add_rdlock(&diri
->dirfragtreelock
);
13090 lov
.add_wrlock(&diri
->nestlock
);
13091 lov
.add_wrlock(&diri
->filelock
);
13092 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13095 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
13096 // the scatter-gather process, which will fix any fragstat/rstat errors.
13099 diri
->dirfragtree
.get_leaves(leaves
);
13100 for (const auto& leaf
: leaves
) {
13101 CDir
*dir
= diri
->get_dirfrag(leaf
);
13103 ceph_assert(mdr
->is_auth_pinned(diri
));
13104 dir
= diri
->get_or_open_dirfrag(this, leaf
);
// unfetched frag: fetch and retry the whole request
13106 if (dir
->get_version() == 0) {
13107 ceph_assert(dir
->is_auth());
13108 dir
->fetch(new C_MDS_RetryRequest(this, mdr
));
13114 diri
->state_set(CInode::STATE_REPAIRSTATS
);
13115 mdr
->ls
= mds
->mdlog
->get_current_segment();
13116 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
13117 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
13118 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
13119 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
13121 mds
->locker
->drop_locks(mdr
.get());
13124 // force the scatter-gather process
13126 lov
.add_rdlock(&diri
->dirfragtreelock
);
13127 lov
.add_rdlock(&diri
->nestlock
);
13128 lov
.add_rdlock(&diri
->filelock
);
13129 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13132 diri
->state_clear(CInode::STATE_REPAIRSTATS
);
13134 frag_info_t dir_info
;
13135 nest_info_t nest_info
;
13136 nest_info
.rsubdirs
= 1; // it gets one to account for self
13137 if (const sr_t
*srnode
= diri
->get_projected_srnode(); srnode
)
13138 nest_info
.rsnaps
= srnode
->snaps
.size();
13142 diri
->dirfragtree
.get_leaves(leaves
);
13143 for (const auto& leaf
: leaves
) {
13144 CDir
*dir
= diri
->get_dirfrag(leaf
);
13146 ceph_assert(dir
->get_version() > 0);
13147 dir_info
.add(dir
->fnode
.accounted_fragstat
);
13148 nest_info
.add(dir
->fnode
.accounted_rstat
);
// mismatch after the forced gather means the repair failed
13152 if (!dir_info
.same_sums(diri
->inode
.dirstat
) ||
13153 !nest_info
.same_sums(diri
->inode
.rstat
)) {
13154 dout(10) << __func__
<< " failed to fix fragstat/rstat on "
13158 mds
->server
->respond_to_request(mdr
, 0);
// Kick off an internal UPGRADE_SNAPREALM request for an inode with an
// old-format snaprealm; no-op finisher, completion via the request.
13161 void MDCache::upgrade_inode_snaprealm(CInode
*in
)
13163 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM
);
13165 mdr
->internal_op_private
= in
;
13166 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
13167 upgrade_inode_snaprealm_work(mdr
);
// Worker for upgrade_inode_snaprealm: xlock the inode's snaplock,
// project the inode (which upgrades the snaprealm format as a side
// effect), and journal the change via EUpdate("upgrade_snaprealm").
// NOTE(review): extraction-garbled text; code bytes preserved as-is.
13170 void MDCache::upgrade_inode_snaprealm_work(MDRequestRef
& mdr
)
13172 CInode
*in
= static_cast<CInode
*>(mdr
->internal_op_private
);
13173 dout(10) << __func__
<< " " << *in
<< dendl
;
13175 if (!in
->is_auth()) {
13176 mds
->server
->respond_to_request(mdr
, -ESTALE
);
13180 MutationImpl::LockOpVec lov
;
13181 lov
.add_xlock(&in
->snaplock
);
13182 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13185 // project_snaprealm() upgrades snaprealm format
13186 auto &pi
= in
->project_inode(false, true);
13187 mdr
->add_projected_inode(in
);
13188 pi
.inode
.version
= in
->pre_dirty();
13190 mdr
->ls
= mds
->mdlog
->get_current_segment();
13191 EUpdate
*le
= new EUpdate(mds
->mdlog
, "upgrade_snaprealm");
13192 mds
->mdlog
->start_entry(le
);
// base inodes journal as root entries; others via their parent dentry
13194 if (in
->is_base()) {
13195 le
->metablob
.add_root(true, in
);
13197 CDentry
*pdn
= in
->get_projected_parent_dn();
13198 le
->metablob
.add_dir_context(pdn
->get_dir());
13199 le
->metablob
.add_primary_dentry(pdn
, in
, true);
13202 mds
->mdlog
->submit_entry(le
, new C_MDC_RespondInternalRequest(this, mdr
));
// Flush the dentry at `path` to the journal/backing store via an
// internal FLUSH request; fails fast with -EROFS when read-only.
// NOTE(review): extraction-garbled text (fp construction elided);
// code bytes preserved exactly as found.
13205 void MDCache::flush_dentry(std::string_view path
, Context
*fin
)
13207 if (is_readonly()) {
13208 dout(10) << __func__
<< ": read-only FS" << dendl
;
13209 fin
->complete(-EROFS
);
13212 dout(10) << "flush_dentry " << path
<< dendl
;
13213 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FLUSH
);
13215 mdr
->set_filepath(fp
);
13216 mdr
->internal_op_finish
= fin
;
13217 flush_dentry_work(mdr
);
// I/O-completion context that responds to the stashed MDRequest with the
// operation's result code.
13220 class C_FinishIOMDR
: public MDSContext
{
13224 MDSRank
*get_mds() override
{ return mds
; }
13226 C_FinishIOMDR(MDSRank
*mds_
, MDRequestRef
& mdr_
) : mds(mds_
), mdr(mdr_
) {}
13227 void finish(int r
) override
{ mds
->server
->respond_to_request(mdr
, r
); }
// Worker for flush_dentry: resolve and pin the path's inode (may queue
// an internal retry and return null), then flush it, replying via
// C_FinishIOMDR when the flush completes.
13230 void MDCache::flush_dentry_work(MDRequestRef
& mdr
)
13232 MutationImpl::LockOpVec lov
;
13233 CInode
*in
= mds
->server
->rdlock_path_pin_ref(mdr
, true);
13237 ceph_assert(in
->is_auth());
13238 in
->flush(new C_FinishIOMDR(mds
, mdr
));
13243 * Initialize performance counters with global perfcounter
// Build and register the "mds_cache" perf counter set: stray/purge
// gauges and counters, recovery-queue statistics, and per-type internal
// request counters; then wire the logger into the recovery queue and
// stray manager.
13246 void MDCache::register_perfcounters()
13248 PerfCountersBuilder
pcb(g_ceph_context
, "mds_cache", l_mdc_first
, l_mdc_last
);
13250 // Stray/purge statistics
13251 pcb
.add_u64(l_mdc_num_strays
, "num_strays", "Stray dentries", "stry",
13252 PerfCountersBuilder::PRIO_INTERESTING
);
13253 pcb
.add_u64(l_mdc_num_recovering_enqueued
,
13254 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13255 PerfCountersBuilder::PRIO_INTERESTING
);
13256 pcb
.add_u64_counter(l_mdc_recovery_completed
,
13257 "recovery_completed", "File recoveries completed", "recd",
13258 PerfCountersBuilder::PRIO_INTERESTING
);
13260 // useful recovery queue statistics
// counters added after this default to PRIO_USEFUL
13261 pcb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
13262 pcb
.add_u64(l_mdc_num_recovering_processing
, "num_recovering_processing",
13263 "Files currently being recovered");
13264 pcb
.add_u64(l_mdc_num_recovering_prioritized
, "num_recovering_prioritized",
13265 "Files waiting for recovery with elevated priority");
13266 pcb
.add_u64_counter(l_mdc_recovery_started
, "recovery_started",
13267 "File recoveries started");
13269 // along with other stray dentries stats
13270 pcb
.add_u64(l_mdc_num_strays_delayed
, "num_strays_delayed",
13271 "Stray dentries delayed");
13272 pcb
.add_u64(l_mdc_num_strays_enqueuing
, "num_strays_enqueuing",
13273 "Stray dentries enqueuing for purge");
13274 pcb
.add_u64_counter(l_mdc_strays_created
, "strays_created",
13275 "Stray dentries created");
13276 pcb
.add_u64_counter(l_mdc_strays_enqueued
, "strays_enqueued",
13277 "Stray dentries enqueued for purge");
13278 pcb
.add_u64_counter(l_mdc_strays_reintegrated
, "strays_reintegrated",
13279 "Stray dentries reintegrated");
13280 pcb
.add_u64_counter(l_mdc_strays_migrated
, "strays_migrated",
13281 "Stray dentries migrated");
13283 // low prio internal request stats
13284 pcb
.add_u64_counter(l_mdss_ireq_enqueue_scrub
, "ireq_enqueue_scrub",
13285 "Internal Request type enqueue scrub");
13286 pcb
.add_u64_counter(l_mdss_ireq_exportdir
, "ireq_exportdir",
13287 "Internal Request type export dir");
13288 pcb
.add_u64_counter(l_mdss_ireq_flush
, "ireq_flush",
13289 "Internal Request type flush");
13290 pcb
.add_u64_counter(l_mdss_ireq_fragmentdir
, "ireq_fragmentdir",
13291 "Internal Request type fragmentdir");
13292 pcb
.add_u64_counter(l_mdss_ireq_fragstats
, "ireq_fragstats",
13293 "Internal Request type frag stats");
13294 pcb
.add_u64_counter(l_mdss_ireq_inodestats
, "ireq_inodestats",
13295 "Internal Request type inode stats");
13297 logger
.reset(pcb
.create_perf_counters());
13298 g_ceph_context
->get_perfcounters_collection()->add(logger
.get());
13299 recovery_queue
.set_logger(logger
.get());
13300 stray_manager
.set_logger(logger
.get());
/**
 * Call this when putting references to an inode/dentry or
 * when attempting to trim it.
 *
 * If this inode is no longer linked by anyone, and this MDS
 * rank holds the primary dentry, and that dentry is in a stray
 * directory, then give up the dentry to the StrayManager, never
 * to be seen again by MDCache.
 *
 * @param delay if true, then purgeable inodes are stashed til
 *              the next trim(), rather than being purged right
 *              away.
 */
13316 void MDCache::maybe_eval_stray(CInode
*in
, bool delay
) {
13317 if (in
->inode
.nlink
> 0 || in
->is_base() || is_readonly() ||
13318 mds
->get_state() <= MDSMap::STATE_REJOIN
)
13321 CDentry
*dn
= in
->get_projected_parent_dn();
13323 if (dn
->state_test(CDentry::STATE_PURGING
)) {
13324 /* We have already entered the purging process, no need
13325 * to re-evaluate me ! */
13329 if (dn
->get_dir()->get_inode()->is_stray()) {
13331 stray_manager
.queue_delayed(dn
);
13333 stray_manager
.eval_stray(dn
);
13337 void MDCache::clear_dirty_bits_for_stray(CInode
* diri
) {
13338 dout(10) << __func__
<< " " << *diri
<< dendl
;
13339 ceph_assert(diri
->get_projected_parent_dir()->inode
->is_stray());
13340 auto&& ls
= diri
->get_dirfrags();
13341 for (auto &p
: ls
) {
13342 if (p
->is_auth() && !(p
->is_frozen() || p
->is_freezing()))
13343 p
->try_remove_dentries_for_stray();
13345 if (!diri
->snaprealm
) {
13346 if (diri
->is_auth())
13347 diri
->clear_dirty_rstat();
13348 diri
->clear_scatter_dirty();
13352 bool MDCache::dump_inode(Formatter
*f
, uint64_t number
) {
13353 CInode
*in
= get_inode(number
);
13357 f
->open_object_section("inode");
13358 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_PATH
);
13359 f
->close_section();
13363 void MDCache::handle_mdsmap(const MDSMap
&mdsmap
, const MDSMap
&oldmap
) {
13364 // process export_pin_delayed_queue whenever a new MDSMap received
13365 auto &q
= export_pin_delayed_queue
;
13366 for (auto it
= q
.begin(); it
!= q
.end(); ) {
13368 mds_rank_t export_pin
= in
->get_export_pin(false);
13369 if (in
->is_ephemerally_pinned()) {
13370 dout(10) << "ephemeral export pin to " << export_pin
<< " for " << *in
<< dendl
;
13372 dout(10) << " delayed export_pin=" << export_pin
<< " on " << *in
13373 << " max_mds=" << mdsmap
.get_max_mds() << dendl
;
13374 if (export_pin
>= mdsmap
.get_max_mds()) {
13379 in
->state_clear(CInode::STATE_DELAYEDEXPORTPIN
);
13381 in
->queue_export_pin(export_pin
);
13384 if (mdsmap
.get_max_mds() != oldmap
.get_max_mds()) {
13385 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl
;
13386 /* copy to vector to avoid removals during iteration */
13387 std::vector
<CInode
*> migrate
;
13388 migrate
.assign(rand_ephemeral_pins
.begin(), rand_ephemeral_pins
.end());
13389 for (auto& in
: migrate
) {
13390 in
->maybe_ephemeral_rand();
13392 migrate
.assign(dist_ephemeral_pins
.begin(), dist_ephemeral_pins
.end());
13393 for (auto& in
: migrate
) {
13394 in
->maybe_ephemeral_dist();