1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "MDBalancer.h"
29 #include "include/filepath.h"
30 #include "common/likely.h"
32 #include "events/EExport.h"
33 #include "events/EImportStart.h"
34 #include "events/EImportFinish.h"
35 #include "events/ESessions.h"
37 #include "msg/Messenger.h"
39 #include "messages/MClientCaps.h"
42 * this is what the dir->dir_auth values look like
47 * me, me me - still me, but preparing for export
48 * me, them me - send MExportDir (peer is preparing)
49 * them, me me - journaled EExport
54 * me, them me - journaled EImportStart
58 * - auth bit is set if i am listed as first _or_ second dir_auth.
61 #include "common/config.h"
64 #define dout_context g_ceph_context
65 #define dout_subsys ceph_subsys_mds
67 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".mig " << __func__ << " "
71 class MigratorContext
: public MDSContext
{
74 MDSRank
*get_mds() override
{
78 explicit MigratorContext(Migrator
*mig_
) : mig(mig_
) {
79 ceph_assert(mig
!= NULL
);
83 class MigratorLogContext
: public MDSLogContextBase
{
86 MDSRank
*get_mds() override
{
90 explicit MigratorLogContext(Migrator
*mig_
) : mig(mig_
) {
91 ceph_assert(mig
!= NULL
);
95 void Migrator::dispatch(const cref_t
<Message
> &m
)
97 switch (m
->get_type()) {
99 case MSG_MDS_EXPORTDIRDISCOVER
:
100 handle_export_discover(ref_cast
<MExportDirDiscover
>(m
));
102 case MSG_MDS_EXPORTDIRPREP
:
103 handle_export_prep(ref_cast
<MExportDirPrep
>(m
));
105 case MSG_MDS_EXPORTDIR
:
106 if (unlikely(inject_session_race
)) {
107 dout(0) << "waiting for inject_session_race" << dendl
;
108 mds
->wait_for_any_client_connection(new C_MDS_RetryMessage(mds
, m
));
110 handle_export_dir(ref_cast
<MExportDir
>(m
));
113 case MSG_MDS_EXPORTDIRFINISH
:
114 handle_export_finish(ref_cast
<MExportDirFinish
>(m
));
116 case MSG_MDS_EXPORTDIRCANCEL
:
117 handle_export_cancel(ref_cast
<MExportDirCancel
>(m
));
121 case MSG_MDS_EXPORTDIRDISCOVERACK
:
122 handle_export_discover_ack(ref_cast
<MExportDirDiscoverAck
>(m
));
124 case MSG_MDS_EXPORTDIRPREPACK
:
125 handle_export_prep_ack(ref_cast
<MExportDirPrepAck
>(m
));
127 case MSG_MDS_EXPORTDIRACK
:
128 handle_export_ack(ref_cast
<MExportDirAck
>(m
));
130 case MSG_MDS_EXPORTDIRNOTIFYACK
:
131 handle_export_notify_ack(ref_cast
<MExportDirNotifyAck
>(m
));
134 // export 3rd party (dir_auth adjustments)
135 case MSG_MDS_EXPORTDIRNOTIFY
:
136 handle_export_notify(ref_cast
<MExportDirNotify
>(m
));
140 case MSG_MDS_EXPORTCAPS
:
141 handle_export_caps(ref_cast
<MExportCaps
>(m
));
143 case MSG_MDS_EXPORTCAPSACK
:
144 handle_export_caps_ack(ref_cast
<MExportCapsAck
>(m
));
146 case MSG_MDS_GATHERCAPS
:
147 handle_gather_caps(ref_cast
<MGatherCaps
>(m
));
151 derr
<< "migrator unknown message " << m
->get_type() << dendl
;
152 ceph_abort_msg("migrator unknown message");
157 class C_MDC_EmptyImport
: public MigratorContext
{
160 C_MDC_EmptyImport(Migrator
*m
, CDir
*d
) :
161 MigratorContext(m
), dir(d
) {
162 dir
->get(CDir::PIN_PTRWAITER
);
164 void finish(int r
) override
{
165 mig
->export_empty_import(dir
);
166 dir
->put(CDir::PIN_PTRWAITER
);
171 void Migrator::export_empty_import(CDir
*dir
)
173 dout(7) << *dir
<< dendl
;
174 ceph_assert(dir
->is_subtree_root());
176 if (dir
->inode
->is_auth()) {
177 dout(7) << " inode is auth" << dendl
;
180 if (!dir
->is_auth()) {
181 dout(7) << " not auth" << dendl
;
184 if (dir
->is_freezing() || dir
->is_frozen()) {
185 dout(7) << " freezing or frozen" << dendl
;
188 if (dir
->get_num_head_items() > 0) {
189 dout(7) << " not actually empty" << dendl
;
192 if (dir
->inode
->is_root()) {
193 dout(7) << " root" << dendl
;
197 mds_rank_t dest
= dir
->inode
->authority().first
;
198 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
200 dout(7) << " really empty, exporting to " << dest
<< dendl
;
201 assert (dest
!= mds
->get_nodeid());
203 dout(7) << "exporting to mds." << dest
204 << " empty import " << *dir
<< dendl
;
205 export_dir( dir
, dest
);
208 void Migrator::find_stale_export_freeze()
210 utime_t now
= ceph_clock_now();
211 utime_t cutoff
= now
;
212 cutoff
-= g_conf()->mds_freeze_tree_timeout
;
216 * We could have situations like:
218 * - mds.0 authpins an item in subtree A
219 * - mds.0 sends request to mds.1 to authpin an item in subtree B
220 * - mds.0 freezes subtree A
221 * - mds.1 authpins an item in subtree B
222 * - mds.1 sends request to mds.0 to authpin an item in subtree A
223 * - mds.1 freezes subtree B
224 * - mds.1 receives the remote authpin request from mds.0
225 * (wait because subtree B is freezing)
226 * - mds.0 receives the remote authpin request from mds.1
227 * (wait because subtree A is freezing)
230 * - client request authpins items in subtree B
232 * - import subtree A which is parent of subtree B
233 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
235 * - client request tries authpinning items in subtree A
236 * (wait because subtree A is freezing)
238 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
239 p
!= export_state
.end(); ) {
240 CDir
* dir
= p
->first
;
241 export_state_t
& stat
= p
->second
;
243 if (stat
.state
!= EXPORT_DISCOVERING
&& stat
.state
!= EXPORT_FREEZING
)
245 ceph_assert(dir
->freeze_tree_state
);
246 if (stat
.last_cum_auth_pins
!= dir
->freeze_tree_state
->auth_pins
) {
247 stat
.last_cum_auth_pins
= dir
->freeze_tree_state
->auth_pins
;
248 stat
.last_cum_auth_pins_change
= now
;
251 if (stat
.last_cum_auth_pins_change
>= cutoff
)
253 if (stat
.num_remote_waiters
> 0 ||
254 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
255 export_try_cancel(dir
);
260 void Migrator::export_try_cancel(CDir
*dir
, bool notify_peer
)
262 dout(10) << *dir
<< dendl
;
264 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
265 ceph_assert(it
!= export_state
.end());
267 int state
= it
->second
.state
;
270 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl
;
271 num_locking_exports
--;
272 it
->second
.state
= EXPORT_CANCELLED
;
273 dir
->auth_unpin(this);
275 case EXPORT_DISCOVERING
:
276 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl
;
277 it
->second
.state
= EXPORT_CANCELLED
;
278 dir
->unfreeze_tree(); // cancel the freeze
279 dir
->auth_unpin(this);
281 (!mds
->is_cluster_degraded() ||
282 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
283 mds
->send_message_mds(make_message
<MExportDirCancel
>(dir
->dirfrag(),
288 case EXPORT_FREEZING
:
289 dout(10) << "export state=freezing : canceling freeze" << dendl
;
290 it
->second
.state
= EXPORT_CANCELLED
;
291 dir
->unfreeze_tree(); // cancel the freeze
292 if (dir
->is_subtree_root())
293 mdcache
->try_subtree_merge(dir
);
295 (!mds
->is_cluster_degraded() ||
296 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
297 mds
->send_message_mds(make_message
<MExportDirCancel
>(dir
->dirfrag(),
302 // NOTE: state order reversal, warning comes after prepping
304 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl
;
305 it
->second
.state
= EXPORT_CANCELLING
;
308 case EXPORT_PREPPING
:
309 if (state
!= EXPORT_WARNING
) {
310 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl
;
311 it
->second
.state
= EXPORT_CANCELLED
;
317 mdcache
->get_subtree_bounds(dir
, bounds
);
318 for (set
<CDir
*>::iterator q
= bounds
.begin();
322 bd
->put(CDir::PIN_EXPORTBOUND
);
323 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
325 if (state
== EXPORT_WARNING
) {
327 export_notify_abort(dir
, it
->second
, bounds
);
328 // process delayed expires
329 mdcache
->process_delayed_expire(dir
);
332 dir
->unfreeze_tree();
333 mdcache
->try_subtree_merge(dir
);
335 (!mds
->is_cluster_degraded() ||
336 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
337 mds
->send_message_mds(make_message
<MExportDirCancel
>(dir
->dirfrag(),
342 case EXPORT_EXPORTING
:
343 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl
;
344 it
->second
.state
= EXPORT_CANCELLING
;
345 export_reverse(dir
, it
->second
);
348 case EXPORT_LOGGINGFINISH
:
349 case EXPORT_NOTIFYING
:
350 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl
;
351 // leave export_state, don't clean up now.
353 case EXPORT_CANCELLING
:
361 if (it
->second
.state
== EXPORT_CANCELLING
||
362 it
->second
.state
== EXPORT_CANCELLED
) {
364 mut
.swap(it
->second
.mut
);
366 if (it
->second
.state
== EXPORT_CANCELLED
) {
367 export_cancel_finish(it
);
371 if (state
== EXPORT_LOCKING
|| state
== EXPORT_DISCOVERING
) {
372 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(mut
.get());
374 mdcache
->request_kill(mdr
);
376 mds
->locker
->drop_locks(mut
.get());
380 mdcache
->show_subtrees();
382 maybe_do_queued_export();
386 void Migrator::export_cancel_finish(export_state_iterator
& it
)
388 CDir
*dir
= it
->first
;
389 bool unpin
= (it
->second
.state
== EXPORT_CANCELLING
);
390 auto parent
= std::move(it
->second
.parent
);
392 total_exporting_size
-= it
->second
.approx_size
;
393 export_state
.erase(it
);
395 ceph_assert(dir
->state_test(CDir::STATE_EXPORTING
));
396 dir
->clear_exporting();
399 // pinned by Migrator::export_notify_abort()
400 dir
->auth_unpin(this);
402 // send pending import_maps? (these need to go out when all exports have finished.)
403 mdcache
->maybe_send_pending_resolves();
406 child_export_finish(parent
, false);
409 // ==========================================================
410 // mds failure handling
412 void Migrator::handle_mds_failure_or_stop(mds_rank_t who
)
414 dout(5) << who
<< dendl
;
418 // first add an extra auth_pin on any freezes, so that canceling a
419 // nested freeze doesn't complete one further up the hierarchy and
420 // confuse the shit out of us. we'll remove it after canceling the
421 // freeze. this way no freeze completions run before we want them
423 std::vector
<CDir
*> pinned_dirs
;
424 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
425 p
!= export_state
.end();
427 if (p
->second
.state
== EXPORT_FREEZING
) {
428 CDir
*dir
= p
->first
;
429 dout(10) << "adding temp auth_pin on freezing " << *dir
<< dendl
;
431 pinned_dirs
.push_back(dir
);
435 map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
436 while (p
!= export_state
.end()) {
437 map
<CDir
*,export_state_t
>::iterator next
= p
;
439 CDir
*dir
= p
->first
;
442 // - that are going to the failed node
443 // - that aren't frozen yet (to avoid auth_pin deadlock)
444 // - they havne't prepped yet (they may need to discover bounds to do that)
445 if ((p
->second
.peer
== who
&&
446 p
->second
.state
!= EXPORT_CANCELLING
) ||
447 p
->second
.state
== EXPORT_LOCKING
||
448 p
->second
.state
== EXPORT_DISCOVERING
||
449 p
->second
.state
== EXPORT_FREEZING
||
450 p
->second
.state
== EXPORT_PREPPING
) {
451 // the guy i'm exporting to failed, or we're just freezing.
452 dout(10) << "cleaning up export state (" << p
->second
.state
<< ")"
453 << get_export_statename(p
->second
.state
) << " of " << *dir
<< dendl
;
454 export_try_cancel(dir
);
455 } else if (p
->second
.peer
!= who
) {
457 if (p
->second
.warning_ack_waiting
.erase(who
)) {
458 if (p
->second
.state
== EXPORT_WARNING
) {
459 p
->second
.notify_ack_waiting
.erase(who
); // they won't get a notify either.
460 // exporter waiting for warning acks, let's fake theirs.
461 dout(10) << "faking export_warning_ack from mds." << who
462 << " on " << *dir
<< " to mds." << p
->second
.peer
464 if (p
->second
.warning_ack_waiting
.empty())
468 if (p
->second
.notify_ack_waiting
.erase(who
)) {
469 // exporter is waiting for notify acks, fake it
470 dout(10) << "faking export_notify_ack from mds." << who
471 << " on " << *dir
<< " to mds." << p
->second
.peer
473 if (p
->second
.state
== EXPORT_NOTIFYING
) {
474 if (p
->second
.notify_ack_waiting
.empty())
476 } else if (p
->second
.state
== EXPORT_CANCELLING
) {
477 if (p
->second
.notify_ack_waiting
.empty()) {
478 export_cancel_finish(p
);
490 map
<dirfrag_t
,import_state_t
>::iterator q
= import_state
.begin();
491 while (q
!= import_state
.end()) {
492 map
<dirfrag_t
,import_state_t
>::iterator next
= q
;
494 dirfrag_t df
= q
->first
;
495 CInode
*diri
= mdcache
->get_inode(df
.ino
);
496 CDir
*dir
= mdcache
->get_dirfrag(df
);
498 if (q
->second
.peer
== who
) {
500 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
501 << get_import_statename(q
->second
.state
) << " of " << *dir
<< dendl
;
503 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
504 << get_import_statename(q
->second
.state
) << " of " << df
<< dendl
;
506 switch (q
->second
.state
) {
507 case IMPORT_DISCOVERING
:
508 dout(10) << "import state=discovering : clearing state" << dendl
;
509 import_reverse_discovering(df
);
512 case IMPORT_DISCOVERED
:
514 dout(10) << "import state=discovered : unpinning inode " << *diri
<< dendl
;
515 import_reverse_discovered(df
, diri
);
518 case IMPORT_PREPPING
:
520 dout(10) << "import state=prepping : unpinning base+bounds " << *dir
<< dendl
;
521 import_reverse_prepping(dir
, q
->second
);
526 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir
<< dendl
;
529 mdcache
->get_subtree_bounds(dir
, bounds
);
530 import_remove_pins(dir
, bounds
);
532 // adjust auth back to the exporter
533 mdcache
->adjust_subtree_auth(dir
, q
->second
.peer
);
535 // notify bystanders ; wait in aborting state
536 q
->second
.state
= IMPORT_ABORTING
;
537 import_notify_abort(dir
, bounds
);
538 ceph_assert(g_conf()->mds_kill_import_at
!= 10);
542 case IMPORT_LOGGINGSTART
:
544 dout(10) << "import state=loggingstart : reversing import on " << *dir
<< dendl
;
550 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
551 dout(10) << "import state=acking : noting ambiguous import " << *dir
<< dendl
;
554 mdcache
->get_subtree_bounds(dir
, bounds
);
555 mdcache
->add_ambiguous_import(dir
, bounds
);
559 case IMPORT_FINISHING
:
561 dout(10) << "import state=finishing : finishing import on " << *dir
<< dendl
;
562 import_finish(dir
, true);
565 case IMPORT_ABORTING
:
567 dout(10) << "import state=aborting : ignoring repeat failure " << *dir
<< dendl
;
571 auto bystanders_entry
= q
->second
.bystanders
.find(who
);
572 if (bystanders_entry
!= q
->second
.bystanders
.end()) {
573 q
->second
.bystanders
.erase(bystanders_entry
);
574 if (q
->second
.state
== IMPORT_ABORTING
) {
576 dout(10) << "faking export_notify_ack from mds." << who
577 << " on aborting import " << *dir
<< " from mds." << q
->second
.peer
579 if (q
->second
.bystanders
.empty())
580 import_reverse_unfreeze(dir
);
589 for (const auto& dir
: pinned_dirs
) {
590 dout(10) << "removing temp auth_pin on " << *dir
<< dendl
;
591 dir
->auth_unpin(this);
597 void Migrator::show_importing()
600 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
601 p
!= import_state
.end();
603 CDir
*dir
= mdcache
->get_dirfrag(p
->first
);
605 dout(10) << " importing from " << p
->second
.peer
606 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
607 << " " << p
->first
<< " " << *dir
<< dendl
;
609 dout(10) << " importing from " << p
->second
.peer
610 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
611 << " " << p
->first
<< dendl
;
616 void Migrator::show_exporting()
619 for (const auto& [dir
, state
] : export_state
) {
620 dout(10) << " exporting to " << state
.peer
621 << ": (" << state
.state
<< ") " << get_export_statename(state
.state
)
622 << " " << dir
->dirfrag() << " " << *dir
<< dendl
;
628 void Migrator::audit()
630 if (!g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 5>())
635 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
636 p
!= import_state
.end();
638 if (p
->second
.state
== IMPORT_DISCOVERING
)
640 if (p
->second
.state
== IMPORT_DISCOVERED
) {
641 CInode
*in
= mdcache
->get_inode(p
->first
.ino
);
645 CDir
*dir
= mdcache
->get_dirfrag(p
->first
);
647 if (p
->second
.state
== IMPORT_PREPPING
)
649 if (p
->second
.state
== IMPORT_ABORTING
) {
650 ceph_assert(!dir
->is_ambiguous_dir_auth());
651 ceph_assert(dir
->get_dir_auth().first
!= mds
->get_nodeid());
654 ceph_assert(dir
->is_ambiguous_dir_auth());
655 ceph_assert(dir
->authority().first
== mds
->get_nodeid() ||
656 dir
->authority().second
== mds
->get_nodeid());
661 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
662 p
!= export_state
.end();
664 CDir
*dir
= p
->first
;
665 if (p
->second
.state
== EXPORT_LOCKING
||
666 p
->second
.state
== EXPORT_DISCOVERING
||
667 p
->second
.state
== EXPORT_FREEZING
||
668 p
->second
.state
== EXPORT_CANCELLING
)
670 ceph_assert(dir
->is_ambiguous_dir_auth());
671 ceph_assert(dir
->authority().first
== mds
->get_nodeid() ||
672 dir
->authority().second
== mds
->get_nodeid());
675 // ambiguous+me subtrees should be importing|exporting
684 // ==========================================================
687 void Migrator::export_dir_nicely(CDir
*dir
, mds_rank_t dest
)
690 dout(7) << *dir
<< " to " << dest
<< dendl
;
691 export_queue
.push_back(pair
<dirfrag_t
,mds_rank_t
>(dir
->dirfrag(), dest
));
693 maybe_do_queued_export();
696 void Migrator::maybe_do_queued_export()
703 uint64_t max_total_size
= max_export_size
* 2;
705 while (!export_queue
.empty() &&
706 max_total_size
> total_exporting_size
&&
707 max_total_size
- total_exporting_size
>=
708 max_export_size
* (num_locking_exports
+ 1)) {
710 dirfrag_t df
= export_queue
.front().first
;
711 mds_rank_t dest
= export_queue
.front().second
;
712 export_queue
.pop_front();
714 CDir
*dir
= mdcache
->get_dirfrag(df
);
716 if (!dir
->is_auth()) continue;
718 dout(7) << "nicely exporting to mds." << dest
<< " " << *dir
<< dendl
;
720 export_dir(dir
, dest
);
729 class C_MDC_ExportFreeze
: public MigratorContext
{
730 CDir
*dir
; // dir i'm exporting
733 C_MDC_ExportFreeze(Migrator
*m
, CDir
*e
, uint64_t t
) :
734 MigratorContext(m
), dir(e
), tid(t
) {
735 dir
->get(CDir::PIN_PTRWAITER
);
737 void finish(int r
) override
{
739 mig
->export_frozen(dir
, tid
);
740 dir
->put(CDir::PIN_PTRWAITER
);
745 bool Migrator::export_try_grab_locks(CDir
*dir
, MutationRef
& mut
)
747 CInode
*diri
= dir
->get_inode();
749 if (!diri
->filelock
.can_wrlock(diri
->get_loner()) ||
750 !diri
->nestlock
.can_wrlock(diri
->get_loner()))
753 MutationImpl::LockOpVec lov
;
755 set
<CDir
*> wouldbe_bounds
;
756 set
<CInode
*> bound_inodes
;
757 mdcache
->get_wouldbe_subtree_bounds(dir
, wouldbe_bounds
);
758 for (auto& bound
: wouldbe_bounds
)
759 bound_inodes
.insert(bound
->get_inode());
760 for (auto& in
: bound_inodes
)
761 lov
.add_rdlock(&in
->dirfragtreelock
);
763 lov
.add_rdlock(&diri
->dirfragtreelock
);
767 lov
.add_rdlock(&in
->snaplock
);
768 CDentry
* pdn
= in
->get_projected_parent_dn();
771 in
= pdn
->get_dir()->get_inode();
774 if (!mds
->locker
->rdlock_try_set(lov
, mut
))
777 mds
->locker
->wrlock_force(&diri
->filelock
, mut
);
778 mds
->locker
->wrlock_force(&diri
->nestlock
, mut
);
784 /** export_dir(dir, dest)
785 * public method to initiate an export.
786 * will fail if the directory is freezing, frozen, unpinnable, or root.
788 void Migrator::export_dir(CDir
*dir
, mds_rank_t dest
)
790 ceph_assert(dir
->is_auth());
791 ceph_assert(dest
!= mds
->get_nodeid());
793 CDir
* parent
= dir
->inode
->get_projected_parent_dir();
794 if (!mds
->is_stopping() && !dir
->is_exportable(dest
) && dir
->get_num_head_items() > 0) {
795 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": dir is export pinned" << dendl
;
797 } else if (!(mds
->is_active() || mds
->is_stopping())) {
798 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": not active" << dendl
;
800 } else if (mdcache
->is_readonly()) {
801 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": read-only FS, no exports for now" << dendl
;
803 } else if (!mds
->mdsmap
->is_active(dest
)) {
804 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": destination not active" << dendl
;
806 } else if (mds
->is_cluster_degraded()) {
807 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": cluster degraded" << dendl
;
809 } else if (dir
->inode
->is_system()) {
810 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": is a system directory" << dendl
;
812 } else if (dir
->is_frozen() || dir
->is_freezing()) {
813 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": is frozen" << dendl
;
815 } else if (dir
->state_test(CDir::STATE_EXPORTING
)) {
816 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": already exporting" << dendl
;
818 } else if (parent
&& parent
->inode
->is_stray()
819 && parent
->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest
)) {
820 dout(7) << "Cannot export to mds." << dest
<< " " << *dir
<< ": in stray directory" << dendl
;
824 if (unlikely(g_conf()->mds_thrash_exports
)) {
825 // create random subtree bound (which will not be exported)
826 std::vector
<CDir
*> ls
;
827 for (auto p
= dir
->begin(); p
!= dir
->end(); ++p
) {
829 CDentry::linkage_t
*dnl
= dn
->get_linkage();
830 if (dnl
->is_primary()) {
831 CInode
*in
= dnl
->get_inode();
833 auto&& dirs
= in
->get_nested_dirfrags();
834 ls
.insert(std::end(ls
), std::begin(dirs
), std::end(dirs
));
839 int n
= rand() % ls
.size();
843 if (!(bd
->is_frozen() || bd
->is_freezing())) {
844 ceph_assert(bd
->is_auth());
845 dir
->state_set(CDir::STATE_AUXSUBTREE
);
846 mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
847 dout(7) << "create aux subtree " << *bd
<< " under " << *dir
<< dendl
;
852 dout(4) << "Starting export to mds." << dest
<< " " << *dir
<< dendl
;
854 mds
->hit_export_target(dest
, -1);
857 dir
->mark_exporting();
859 MDRequestRef mdr
= mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
860 mdr
->more()->export_dir
= dir
;
863 ceph_assert(export_state
.count(dir
) == 0);
864 export_state_t
& stat
= export_state
[dir
];
865 num_locking_exports
++;
866 stat
.state
= EXPORT_LOCKING
;
868 stat
.tid
= mdr
->reqid
.tid
;
871 mdcache
->dispatch_request(mdr
);
875 * check if directory is too large to be export in whole. If it is,
876 * choose some subdirs, whose total size is suitable.
878 void Migrator::maybe_split_export(CDir
* dir
, uint64_t max_size
, bool null_okay
,
879 vector
<pair
<CDir
*, size_t> >& results
)
881 static const unsigned frag_size
= 800;
882 static const unsigned inode_size
= 1000;
883 static const unsigned cap_size
= 80;
884 static const unsigned remote_size
= 10;
885 static const unsigned null_size
= 1;
887 // state for depth-first search
890 CDir::dentry_key_map::iterator iter
;
891 size_t dirfrag_size
= frag_size
;
892 size_t subdirs_size
= 0;
893 bool complete
= true;
894 vector
<CDir
*> siblings
;
895 vector
<pair
<CDir
*, size_t> > subdirs
;
896 LevelData(const LevelData
&) = default;
898 dir(d
), iter(d
->begin()) {}
901 vector
<LevelData
> stack
;
902 stack
.emplace_back(dir
);
904 size_t found_size
= 0;
905 size_t skipped_size
= 0;
908 auto& data
= stack
.back();
909 CDir
*cur
= data
.dir
;
910 auto& it
= data
.iter
;
911 auto& dirfrag_size
= data
.dirfrag_size
;
913 while(it
!= cur
->end()) {
914 CDentry
*dn
= it
->second
;
917 dirfrag_size
+= dn
->name
.size();
918 if (dn
->get_linkage()->is_null()) {
919 dirfrag_size
+= null_size
;
922 if (dn
->get_linkage()->is_remote()) {
923 dirfrag_size
+= remote_size
;
927 CInode
*in
= dn
->get_linkage()->get_inode();
928 dirfrag_size
+= inode_size
;
929 dirfrag_size
+= in
->get_client_caps().size() * cap_size
;
932 auto ls
= in
->get_nested_dirfrags();
933 std::reverse(ls
.begin(), ls
.end());
935 bool complete
= true;
936 for (auto p
= ls
.begin(); p
!= ls
.end(); ) {
937 if ((*p
)->state_test(CDir::STATE_EXPORTING
) ||
938 (*p
)->is_freezing_dir() || (*p
)->is_frozen_dir()) {
946 // skip exporting dir's ancestors. because they can't get
947 // frozen (exporting dir's parent inode is auth pinned).
948 for (auto p
= stack
.rbegin(); p
< stack
.rend(); ++p
) {
955 stack
.emplace_back(ls
.back());
957 stack
.back().siblings
.swap(ls
);
962 // did above loop push new dirfrag into the stack?
963 if (stack
.back().dir
!= cur
)
967 auto cur_size
= data
.subdirs_size
+ dirfrag_size
;
968 // we can do nothing with large dirfrag
969 if (cur_size
>= max_size
&& found_size
* 2 > max_size
)
972 found_size
+= dirfrag_size
;
974 if (stack
.size() > 1) {
975 auto& parent
= stack
[stack
.size() - 2];
976 parent
.subdirs
.emplace_back(cur
, cur_size
);
977 parent
.subdirs_size
+= cur_size
;
980 // can't merge current dirfrag to its parent if there is skipped subdir
981 results
.insert(results
.end(), data
.subdirs
.begin(), data
.subdirs
.end());
982 skipped_size
+= dirfrag_size
;
986 ls
.swap(data
.siblings
);
992 if (found_size
>= max_size
)
997 stack
.emplace_back(ls
.back());
999 stack
.back().siblings
.swap(ls
);
1003 for (auto& p
: stack
)
1004 results
.insert(results
.end(), p
.subdirs
.begin(), p
.subdirs
.end());
1006 if (results
.empty() && (!skipped_size
|| !null_okay
))
1007 results
.emplace_back(dir
, found_size
+ skipped_size
);
1010 class C_M_ExportDirWait
: public MigratorContext
{
1014 C_M_ExportDirWait(Migrator
*m
, MDRequestRef mdr
, int count
)
1015 : MigratorContext(m
), mdr(mdr
), count(count
) {}
1016 void finish(int r
) override
{
1017 mig
->dispatch_export_dir(mdr
, count
);
1021 void Migrator::dispatch_export_dir(MDRequestRef
& mdr
, int count
)
1023 CDir
*dir
= mdr
->more()->export_dir
;
1024 dout(7) << *mdr
<< " " << *dir
<< dendl
;
1026 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1027 if (it
== export_state
.end() || it
->second
.tid
!= mdr
->reqid
.tid
) {
1028 // export must have aborted.
1029 dout(7) << "export must have aborted " << *mdr
<< dendl
;
1030 ceph_assert(mdr
->killed
|| mdr
->aborted
);
1032 mdr
->aborted
= false;
1033 mdcache
->request_kill(mdr
);
1037 ceph_assert(it
->second
.state
== EXPORT_LOCKING
);
1039 if (mdr
->more()->peer_error
|| dir
->is_frozen() || dir
->is_freezing()) {
1040 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl
;
1041 export_try_cancel(dir
);
1045 mds_rank_t dest
= it
->second
.peer
;
1046 if (!mds
->is_export_target(dest
)) {
1047 dout(7) << "dest is not yet an export target" << dendl
;
1049 dout(7) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl
;
1050 export_try_cancel(dir
);
1054 mds
->locker
->drop_locks(mdr
.get());
1055 mdr
->drop_local_auth_pins();
1057 mds
->wait_for_mdsmap(mds
->mdsmap
->get_epoch(), new C_M_ExportDirWait(this, mdr
, count
+1));
1061 if (!dir
->inode
->get_parent_dn()) {
1062 dout(7) << "waiting for dir to become stable before export: " << *dir
<< dendl
;
1063 dir
->add_waiter(CDir::WAIT_CREATED
, new C_M_ExportDirWait(this, mdr
, 1));
1068 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
1069 MutationImpl::LockOpVec lov
;
1070 // If auth MDS of the subtree root inode is neither the exporter MDS
1071 // nor the importer MDS and it gathers subtree root's fragstat/neststat
1072 // while the subtree is exporting. It's possible that the exporter MDS
1073 // and the importer MDS both are auth MDS of the subtree root or both
1074 // are not auth MDS of the subtree root at the time they receive the
1075 // lock messages. So the auth MDS of the subtree root inode may get no
1076 // or duplicated fragstat/neststat for the subtree root dirfrag.
1077 lov
.lock_scatter_gather(&dir
->get_inode()->filelock
);
1078 lov
.lock_scatter_gather(&dir
->get_inode()->nestlock
);
1079 if (dir
->get_inode()->is_auth()) {
1080 dir
->get_inode()->filelock
.set_scatter_wanted();
1081 dir
->get_inode()->nestlock
.set_scatter_wanted();
1083 lov
.add_rdlock(&dir
->get_inode()->dirfragtreelock
);
1085 if (!mds
->locker
->acquire_locks(mdr
, lov
, nullptr, true)) {
1087 export_try_cancel(dir
);
1093 // NOTE: We need to take an rdlock on bounding dirfrags during
1094 // migration for a rather irritating reason: when we export the
1095 // bound inode, we need to send scatterlock state for the dirfrags
1096 // as well, so that the new auth also gets the correct info. If we
1097 // race with a refragment, this info is useless, as we can't
1098 // redivvy it up. And it's needed for the scatterlocks to work
1099 // properly: when the auth is in a sync/lock state it keeps each
1100 // dirfrag's portion in the local (auth OR replica) dirfrag.
1101 set
<CDir
*> wouldbe_bounds
;
1102 set
<CInode
*> bound_inodes
;
1103 mdcache
->get_wouldbe_subtree_bounds(dir
, wouldbe_bounds
);
1104 for (auto& bound
: wouldbe_bounds
)
1105 bound_inodes
.insert(bound
->get_inode());
1106 for (auto& in
: bound_inodes
)
1107 lov
.add_rdlock(&in
->dirfragtreelock
);
1109 if (!mds
->locker
->rdlock_try_set(lov
, mdr
))
1112 if (!mds
->locker
->try_rdlock_snap_layout(dir
->get_inode(), mdr
))
1115 mdr
->locking_state
|= MutationImpl::ALL_LOCKED
;
1118 ceph_assert(g_conf()->mds_kill_export_at
!= 1);
1120 auto parent
= it
->second
.parent
;
1122 vector
<pair
<CDir
*, size_t> > results
;
1123 maybe_split_export(dir
, max_export_size
, (bool)parent
, results
);
1125 if (results
.size() == 1 && results
.front().first
== dir
) {
1126 num_locking_exports
--;
1127 it
->second
.state
= EXPORT_DISCOVERING
;
1128 // send ExportDirDiscover (ask target)
1130 dir
->inode
->make_path(path
);
1131 auto discover
= make_message
<MExportDirDiscover
>(dir
->dirfrag(), path
,
1134 mds
->send_message_mds(discover
, dest
);
1135 ceph_assert(g_conf()->mds_kill_export_at
!= 2);
1137 it
->second
.last_cum_auth_pins_change
= ceph_clock_now();
1138 it
->second
.approx_size
= results
.front().second
;
1139 total_exporting_size
+= it
->second
.approx_size
;
1141 // start the freeze, but hold it up with an auth_pin.
1143 ceph_assert(dir
->is_freezing_tree());
1144 dir
->add_waiter(CDir::WAIT_FROZEN
, new C_MDC_ExportFreeze(this, dir
, it
->second
.tid
));
1149 parent
->pending_children
+= results
.size();
1151 parent
= std::make_shared
<export_base_t
>(dir
->dirfrag(), dest
,
1152 results
.size(), export_queue_gen
);
1155 if (results
.empty()) {
1156 dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
1157 << parent
->dirfrag
<< dendl
;
1158 parent
->restart
= true;
1160 dout(7) << "subtree is too large, splitting it into: " << dendl
;
1163 for (auto& p
: results
) {
1164 CDir
*sub
= p
.first
;
1165 ceph_assert(sub
!= dir
);
1166 dout(7) << " sub " << *sub
<< dendl
;
1168 sub
->auth_pin(this);
1169 sub
->mark_exporting();
1171 MDRequestRef _mdr
= mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
1172 _mdr
->more()->export_dir
= sub
;
1175 ceph_assert(export_state
.count(sub
) == 0);
1176 auto& stat
= export_state
[sub
];
1177 num_locking_exports
++;
1178 stat
.state
= EXPORT_LOCKING
;
1180 stat
.tid
= _mdr
->reqid
.tid
;
1182 stat
.parent
= parent
;
1183 mdcache
->dispatch_request(_mdr
);
1186 // cancel the original one
1187 export_try_cancel(dir
);
1190 void Migrator::child_export_finish(std::shared_ptr
<export_base_t
>& parent
, bool success
)
1193 parent
->restart
= true;
1194 if (--parent
->pending_children
== 0) {
1195 if (parent
->restart
&&
1196 parent
->export_queue_gen
== export_queue_gen
) {
1197 CDir
*origin
= mdcache
->get_dirfrag(parent
->dirfrag
);
1198 if (origin
&& origin
->is_auth()) {
1199 dout(7) << "child_export_finish requeue " << *origin
<< dendl
;
1200 export_queue
.emplace_front(origin
->dirfrag(), parent
->dest
);
1207 * called on receipt of MExportDirDiscoverAck
1208 * the importer now has the directory's _inode_ in memory, and pinned.
1210 void Migrator::handle_export_discover_ack(const cref_t
<MExportDirDiscoverAck
> &m
)
1212 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
1213 mds_rank_t
dest(m
->get_source().num());
1216 dout(7) << "from " << m
->get_source()
1217 << " on " << *dir
<< dendl
;
1219 mds
->hit_export_target(dest
, -1);
1221 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1222 if (it
== export_state
.end() ||
1223 it
->second
.tid
!= m
->get_tid() ||
1224 it
->second
.peer
!= dest
) {
1225 dout(7) << "must have aborted" << dendl
;
1227 ceph_assert(it
->second
.state
== EXPORT_DISCOVERING
);
1229 if (m
->is_success()) {
1230 // release locks to avoid deadlock
1231 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(it
->second
.mut
.get());
1233 mdcache
->request_finish(mdr
);
1234 it
->second
.mut
.reset();
1235 // freeze the subtree
1236 it
->second
.state
= EXPORT_FREEZING
;
1237 dir
->auth_unpin(this);
1238 ceph_assert(g_conf()->mds_kill_export_at
!= 3);
1241 dout(7) << "peer failed to discover (not active?), canceling" << dendl
;
1242 export_try_cancel(dir
, false);
1247 class C_M_ExportSessionsFlushed
: public MigratorContext
{
1251 C_M_ExportSessionsFlushed(Migrator
*m
, CDir
*d
, uint64_t t
) :
1252 MigratorContext(m
), dir(d
), tid(t
) {
1253 dir
->get(CDir::PIN_PTRWAITER
);
1255 void finish(int r
) override
{
1256 mig
->export_sessions_flushed(dir
, tid
);
1257 dir
->put(CDir::PIN_PTRWAITER
);
1261 void Migrator::export_sessions_flushed(CDir
*dir
, uint64_t tid
)
1263 dout(7) << *dir
<< dendl
;
1265 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1266 if (it
== export_state
.end() ||
1267 it
->second
.state
== EXPORT_CANCELLING
||
1268 it
->second
.tid
!= tid
) {
1269 // export must have aborted.
1270 dout(7) << "export must have aborted on " << dir
<< dendl
;
1274 ceph_assert(it
->second
.state
== EXPORT_PREPPING
|| it
->second
.state
== EXPORT_WARNING
);
1275 ceph_assert(it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0);
1276 it
->second
.warning_ack_waiting
.erase(MDS_RANK_NONE
);
1277 if (it
->second
.state
== EXPORT_WARNING
&& it
->second
.warning_ack_waiting
.empty())
1278 export_go(dir
); // start export.
1281 void Migrator::encode_export_prep_trace(bufferlist
&final_bl
, CDir
*bound
,
1282 CDir
*dir
, export_state_t
&es
,
1283 set
<inodeno_t
> &inodes_added
,
1284 set
<dirfrag_t
> &dirfrags_added
)
1286 ENCODE_START(1, 1, final_bl
);
1288 dout(7) << " started to encode dir " << *bound
<< dendl
;
1294 // don't repeat inodes
1295 if (inodes_added
.count(cur
->inode
->ino()))
1297 inodes_added
.insert(cur
->inode
->ino());
1299 // prepend dentry + inode
1300 ceph_assert(cur
->inode
->is_auth());
1302 mdcache
->encode_replica_dentry(cur
->inode
->parent
, es
.peer
, bl
);
1303 dout(7) << " added " << *cur
->inode
->parent
<< dendl
;
1304 mdcache
->encode_replica_inode(cur
->inode
, es
.peer
, bl
, mds
->mdsmap
->get_up_features());
1305 dout(7) << " added " << *cur
->inode
<< dendl
;
1306 bl
.claim_append(tracebl
);
1307 tracebl
= std::move(bl
);
1309 cur
= cur
->get_parent_dir();
1310 // don't repeat dirfrags
1311 if (dirfrags_added
.count(cur
->dirfrag()) || cur
== dir
) {
1312 start
= 'd'; // start with dentry
1315 dirfrags_added
.insert(cur
->dirfrag());
1318 mdcache
->encode_replica_dir(cur
, es
.peer
, bl
);
1319 dout(7) << " added " << *cur
<< dendl
;
1320 bl
.claim_append(tracebl
);
1321 tracebl
= std::move(bl
);
1322 start
= 'f'; // start with dirfrag
1324 dirfrag_t df
= cur
->dirfrag();
1325 encode(df
, final_bl
);
1326 encode(start
, final_bl
);
1327 final_bl
.claim_append(tracebl
);
1329 ENCODE_FINISH(final_bl
);
1332 void Migrator::export_frozen(CDir
*dir
, uint64_t tid
)
1334 dout(7) << *dir
<< dendl
;
1336 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1337 if (it
== export_state
.end() || it
->second
.tid
!= tid
) {
1338 dout(7) << "export must have aborted" << dendl
;
1342 ceph_assert(it
->second
.state
== EXPORT_FREEZING
);
1343 ceph_assert(dir
->is_frozen_tree_root());
1345 it
->second
.mut
= new MutationImpl();
1347 // ok, try to grab all my locks.
1348 CInode
*diri
= dir
->get_inode();
1349 if ((diri
->is_auth() && diri
->is_frozen()) ||
1350 !export_try_grab_locks(dir
, it
->second
.mut
)) {
1351 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1353 export_try_cancel(dir
);
1357 if (diri
->is_auth())
1358 it
->second
.mut
->auth_pin(diri
);
1360 mdcache
->show_subtrees();
1362 // CDir::_freeze_tree() should have forced it into subtree.
1363 ceph_assert(dir
->get_dir_auth() == mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
1366 mdcache
->get_subtree_bounds(dir
, bounds
);
1368 // generate prep message, log entry.
1369 auto prep
= make_message
<MExportDirPrep
>(dir
->dirfrag(), it
->second
.tid
);
1371 // include list of bystanders
1372 for (const auto &p
: dir
->get_replicas()) {
1373 if (p
.first
!= it
->second
.peer
) {
1374 dout(10) << "bystander mds." << p
.first
<< dendl
;
1375 prep
->add_bystander(p
.first
);
1379 // include base dirfrag
1380 mdcache
->encode_replica_dir(dir
, it
->second
.peer
, prep
->basedir
);
1383 * include spanning tree for all nested exports.
1384 * these need to be on the destination _before_ the final export so that
1385 * dir_auth updates on any nested exports are properly absorbed.
1386 * this includes inodes and dirfrags included in the subtree, but
1387 * only the inodes at the bounds.
1389 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1391 set
<inodeno_t
> inodes_added
;
1392 set
<dirfrag_t
> dirfrags_added
;
1395 for (auto &bound
: bounds
){
1397 bound
->get(CDir::PIN_EXPORTBOUND
);
1398 bound
->state_set(CDir::STATE_EXPORTBOUND
);
1400 dout(7) << " export bound " << *bound
<< dendl
;
1401 prep
->add_bound( bound
->dirfrag() );
1403 bufferlist final_bl
;
1404 encode_export_prep_trace(final_bl
, bound
, dir
, it
->second
, inodes_added
, dirfrags_added
);
1405 prep
->add_trace(final_bl
);
1409 it
->second
.state
= EXPORT_PREPPING
;
1410 mds
->send_message_mds(prep
, it
->second
.peer
);
1411 ceph_assert(g_conf()->mds_kill_export_at
!= 4);
1413 // make sure any new instantiations of caps are flushed out
1414 ceph_assert(it
->second
.warning_ack_waiting
.empty());
1416 set
<client_t
> export_client_set
;
1417 get_export_client_set(dir
, export_client_set
);
1419 MDSGatherBuilder
gather(g_ceph_context
);
1420 mds
->server
->flush_client_sessions(export_client_set
, gather
);
1421 if (gather
.has_subs()) {
1422 it
->second
.warning_ack_waiting
.insert(MDS_RANK_NONE
);
1423 gather
.set_finisher(new C_M_ExportSessionsFlushed(this, dir
, it
->second
.tid
));
1428 void Migrator::get_export_client_set(CDir
*dir
, set
<client_t
>& client_set
)
1432 while (!dfs
.empty()) {
1433 CDir
*dir
= dfs
.front();
1435 for (auto& p
: *dir
) {
1436 CDentry
*dn
= p
.second
;
1437 if (!dn
->get_linkage()->is_primary())
1439 CInode
*in
= dn
->get_linkage()->get_inode();
1442 auto&& ls
= in
->get_dirfrags();
1443 for (auto& q
: ls
) {
1444 if (!q
->state_test(CDir::STATE_EXPORTBOUND
)) {
1445 // include nested dirfrag
1446 ceph_assert(q
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1447 dfs
.push_back(q
); // it's ours, recurse (later)
1451 for (auto& q
: in
->get_client_caps()) {
1452 client_set
.insert(q
.first
);
1458 void Migrator::get_export_client_set(CInode
*in
, set
<client_t
>& client_set
)
1460 for (const auto &p
: in
->get_client_caps()) {
1461 client_set
.insert(p
.first
);
1465 void Migrator::handle_export_prep_ack(const cref_t
<MExportDirPrepAck
> &m
)
1467 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
1468 mds_rank_t
dest(m
->get_source().num());
1471 dout(7) << *dir
<< dendl
;
1473 mds
->hit_export_target(dest
, -1);
1475 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1476 if (it
== export_state
.end() ||
1477 it
->second
.tid
!= m
->get_tid() ||
1478 it
->second
.peer
!= mds_rank_t(m
->get_source().num())) {
1479 // export must have aborted.
1480 dout(7) << "export must have aborted" << dendl
;
1483 ceph_assert(it
->second
.state
== EXPORT_PREPPING
);
1485 if (!m
->is_success()) {
1486 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl
;
1487 export_try_cancel(dir
, false);
1491 ceph_assert(g_conf()->mds_kill_export_at
!= 5);
1494 mdcache
->get_subtree_bounds(dir
, bounds
);
1496 ceph_assert(it
->second
.warning_ack_waiting
.empty() ||
1497 (it
->second
.warning_ack_waiting
.size() == 1 &&
1498 it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0));
1499 ceph_assert(it
->second
.notify_ack_waiting
.empty());
1501 for (const auto &p
: dir
->get_replicas()) {
1502 if (p
.first
== it
->second
.peer
) continue;
1503 if (mds
->is_cluster_degraded() &&
1504 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(p
.first
))
1505 continue; // only if active
1506 it
->second
.warning_ack_waiting
.insert(p
.first
);
1507 it
->second
.notify_ack_waiting
.insert(p
.first
); // we'll eventually get a notifyack, too!
1509 auto notify
= make_message
<MExportDirNotify
>(dir
->dirfrag(), it
->second
.tid
, true,
1510 mds_authority_t(mds
->get_nodeid(),CDIR_AUTH_UNKNOWN
),
1511 mds_authority_t(mds
->get_nodeid(),it
->second
.peer
));
1512 for (auto &cdir
: bounds
) {
1513 notify
->get_bounds().push_back(cdir
->dirfrag());
1515 mds
->send_message_mds(notify
, p
.first
);
1519 it
->second
.state
= EXPORT_WARNING
;
1521 ceph_assert(g_conf()->mds_kill_export_at
!= 6);
1523 if (it
->second
.warning_ack_waiting
.empty())
1524 export_go(dir
); // start export.
1528 class C_M_ExportGo
: public MigratorContext
{
1532 C_M_ExportGo(Migrator
*m
, CDir
*d
, uint64_t t
) :
1533 MigratorContext(m
), dir(d
), tid(t
) {
1534 dir
->get(CDir::PIN_PTRWAITER
);
1536 void finish(int r
) override
{
1537 mig
->export_go_synced(dir
, tid
);
1538 dir
->put(CDir::PIN_PTRWAITER
);
1542 void Migrator::export_go(CDir
*dir
)
1544 auto it
= export_state
.find(dir
);
1545 ceph_assert(it
!= export_state
.end());
1546 dout(7) << *dir
<< " to " << it
->second
.peer
<< dendl
;
1548 // first sync log to flush out e.g. any cap imports
1549 mds
->mdlog
->wait_for_safe(new C_M_ExportGo(this, dir
, it
->second
.tid
));
1550 mds
->mdlog
->flush();
1553 void Migrator::export_go_synced(CDir
*dir
, uint64_t tid
)
1555 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1556 if (it
== export_state
.end() ||
1557 it
->second
.state
== EXPORT_CANCELLING
||
1558 it
->second
.tid
!= tid
) {
1559 // export must have aborted.
1560 dout(7) << "export must have aborted on " << dir
<< dendl
;
1563 ceph_assert(it
->second
.state
== EXPORT_WARNING
);
1564 mds_rank_t dest
= it
->second
.peer
;
1566 dout(7) << *dir
<< " to " << dest
<< dendl
;
1568 mdcache
->show_subtrees();
1570 it
->second
.state
= EXPORT_EXPORTING
;
1571 ceph_assert(g_conf()->mds_kill_export_at
!= 7);
1573 ceph_assert(dir
->is_frozen_tree_root());
1575 // set ambiguous auth
1576 mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), dest
);
1578 // take away the popularity we're sending.
1579 mds
->balancer
->subtract_export(dir
);
1581 // fill export message with cache data
1582 auto req
= make_message
<MExportDir
>(dir
->dirfrag(), it
->second
.tid
);
1583 map
<client_t
,entity_inst_t
> exported_client_map
;
1584 map
<client_t
,client_metadata_t
> exported_client_metadata_map
;
1585 uint64_t num_exported_inodes
= 0;
1586 encode_export_dir(req
->export_data
, dir
, // recur start point
1587 exported_client_map
, exported_client_metadata_map
,
1588 num_exported_inodes
);
1589 encode(exported_client_map
, req
->client_map
, mds
->mdsmap
->get_up_features());
1590 encode(exported_client_metadata_map
, req
->client_map
);
1592 // add bounds to message
1594 mdcache
->get_subtree_bounds(dir
, bounds
);
1595 for (set
<CDir
*>::iterator p
= bounds
.begin();
1598 req
->add_export((*p
)->dirfrag());
1601 mds
->send_message_mds(req
, dest
);
1602 ceph_assert(g_conf()->mds_kill_export_at
!= 8);
1604 mds
->hit_export_target(dest
, num_exported_inodes
+1);
1607 if (mds
->logger
) mds
->logger
->inc(l_mds_exported
);
1608 if (mds
->logger
) mds
->logger
->inc(l_mds_exported_inodes
, num_exported_inodes
);
1610 mdcache
->show_subtrees();
1614 /** encode_export_inode
1615 * update our local state for this inode to export.
1616 * encode relevant state to be sent over the wire.
1617 * used by: encode_export_dir, file_rename (if foreign)
1619 * FIXME: the separation between CInode.encode_export and these methods
1620 * is pretty arbitrary and dumb.
1622 void Migrator::encode_export_inode(CInode
*in
, bufferlist
& enc_state
,
1623 map
<client_t
,entity_inst_t
>& exported_client_map
,
1624 map
<client_t
,client_metadata_t
>& exported_client_metadata_map
)
1626 ENCODE_START(1, 1, enc_state
);
1627 dout(7) << *in
<< dendl
;
1628 ceph_assert(!in
->is_replica(mds
->get_nodeid()));
1630 encode(in
->ino(), enc_state
);
1631 encode(in
->last
, enc_state
);
1632 in
->encode_export(enc_state
);
1635 encode_export_inode_caps(in
, true, enc_state
, exported_client_map
, exported_client_metadata_map
);
1636 ENCODE_FINISH(enc_state
);
1639 void Migrator::encode_export_inode_caps(CInode
*in
, bool auth_cap
, bufferlist
& bl
,
1640 map
<client_t
,entity_inst_t
>& exported_client_map
,
1641 map
<client_t
,client_metadata_t
>& exported_client_metadata_map
)
1643 ENCODE_START(1, 1, bl
);
1644 dout(20) << *in
<< dendl
;
1646 map
<client_t
,Capability::Export
> cap_map
;
1647 in
->export_client_caps(cap_map
);
1648 encode(cap_map
, bl
);
1650 encode(in
->get_mds_caps_wanted(), bl
);
1652 in
->state_set(CInode::STATE_EXPORTINGCAPS
);
1653 in
->get(CInode::PIN_EXPORTINGCAPS
);
1656 // make note of clients named by exported capabilities
1657 for (const auto &p
: in
->get_client_caps()) {
1658 if (exported_client_map
.count(p
.first
))
1660 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
.first
.v
));
1661 exported_client_map
[p
.first
] = session
->info
.inst
;
1662 exported_client_metadata_map
[p
.first
] = session
->info
.client_metadata
;
1667 void Migrator::finish_export_inode_caps(CInode
*in
, mds_rank_t peer
,
1668 map
<client_t
,Capability::Import
>& peer_imported
)
1670 dout(20) << *in
<< dendl
;
1672 in
->state_clear(CInode::STATE_EXPORTINGCAPS
);
1673 in
->put(CInode::PIN_EXPORTINGCAPS
);
1675 // tell (all) clients about migrating caps..
1676 for (const auto &p
: in
->get_client_caps()) {
1677 const Capability
*cap
= &p
.second
;
1679 << " exported caps on " << *in
<< dendl
;
1680 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
1681 cap
->get_cap_id(), cap
->get_mseq(),
1682 mds
->get_osd_epoch_barrier());
1683 map
<client_t
,Capability::Import
>::iterator q
= peer_imported
.find(p
.first
);
1684 ceph_assert(q
!= peer_imported
.end());
1685 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
1686 (q
->second
.cap_id
> 0 ? peer
: -1), 0);
1687 mds
->send_message_client_counted(m
, p
.first
);
1689 in
->clear_client_caps_after_export();
1690 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
1693 void Migrator::finish_export_inode(CInode
*in
, mds_rank_t peer
,
1694 map
<client_t
,Capability::Import
>& peer_imported
,
1695 MDSContext::vec
& finished
)
1697 dout(12) << *in
<< dendl
;
1703 // clear/unpin cached_by (we're no longer the authority)
1704 in
->clear_replica_map();
1706 // twiddle lock states for auth -> replica transition
1707 in
->authlock
.export_twiddle();
1708 in
->linklock
.export_twiddle();
1709 in
->dirfragtreelock
.export_twiddle();
1710 in
->filelock
.export_twiddle();
1711 in
->nestlock
.export_twiddle();
1712 in
->xattrlock
.export_twiddle();
1713 in
->snaplock
.export_twiddle();
1714 in
->flocklock
.export_twiddle();
1715 in
->policylock
.export_twiddle();
1718 ceph_assert(in
->is_auth());
1719 in
->state_clear(CInode::STATE_AUTH
);
1720 in
->replica_nonce
= CInode::EXPORT_NONCE
;
1722 in
->clear_dirty_rstat();
1724 // no more auth subtree? clear scatter dirty
1725 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid()))
1726 in
->clear_scatter_dirty();
1728 in
->clear_dirty_parent();
1730 in
->clear_clientwriteable();
1732 in
->clear_file_locks();
1735 in
->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
1737 in
->finish_export();
1739 finish_export_inode_caps(in
, peer
, peer_imported
);
1742 void Migrator::encode_export_dir(bufferlist
& exportbl
,
1744 map
<client_t
,entity_inst_t
>& exported_client_map
,
1745 map
<client_t
,client_metadata_t
>& exported_client_metadata_map
,
1746 uint64_t &num_exported
)
1748 // This has to be declared before ENCODE_STARTED as it will need to be referenced after ENCODE_FINISH.
1749 std::vector
<CDir
*> subdirs
;
1751 ENCODE_START(1, 1, exportbl
);
1752 dout(7) << *dir
<< " " << dir
->get_num_head_items() << " head items" << dendl
;
1754 ceph_assert(dir
->get_projected_version() == dir
->get_version());
1756 #ifdef MDS_VERIFY_FRAGSTAT
1757 if (dir
->is_complete())
1758 dir
->verify_fragstat();
1762 dirfrag_t df
= dir
->dirfrag();
1763 encode(df
, exportbl
);
1764 dir
->encode_export(exportbl
);
1766 __u32 nden
= dir
->items
.size();
1767 encode(nden
, exportbl
);
1770 for (auto &p
: *dir
) {
1771 CDentry
*dn
= p
.second
;
1772 CInode
*in
= dn
->get_linkage()->get_inode();
1777 dout(7) << " exporting " << *dn
<< dendl
;
1780 encode(dn
->get_name(), exportbl
);
1781 encode(dn
->last
, exportbl
);
1784 dn
->encode_export(exportbl
);
1789 if (dn
->get_linkage()->is_null()) {
1790 exportbl
.append("N", 1); // null dentry
1794 if (dn
->get_linkage()->is_remote()) {
1795 inodeno_t ino
= dn
->get_linkage()->get_remote_ino();
1796 unsigned char d_type
= dn
->get_linkage()->get_remote_d_type();
1797 auto& alternate_name
= dn
->alternate_name
;
1799 CDentry::encode_remote(ino
, d_type
, alternate_name
, exportbl
);
1805 exportbl
.append("i", 1); // inode dentry
1807 ENCODE_START(2, 1, exportbl
);
1808 encode_export_inode(in
, exportbl
, exported_client_map
, exported_client_metadata_map
); // encode, and (update state for) export
1809 encode(dn
->alternate_name
, exportbl
);
1810 ENCODE_FINISH(exportbl
);
1813 auto&& dfs
= in
->get_dirfrags();
1814 for (const auto& t
: dfs
) {
1815 if (!t
->state_test(CDir::STATE_EXPORTBOUND
)) {
1816 // include nested dirfrag
1817 ceph_assert(t
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1818 subdirs
.push_back(t
); // it's ours, recurse (later)
1823 ENCODE_FINISH(exportbl
);
1825 for (const auto &dir
: subdirs
) {
1826 encode_export_dir(exportbl
, dir
, exported_client_map
, exported_client_metadata_map
, num_exported
);
1830 void Migrator::finish_export_dir(CDir
*dir
, mds_rank_t peer
,
1831 map
<inodeno_t
,map
<client_t
,Capability::Import
> >& peer_imported
,
1832 MDSContext::vec
& finished
, int *num_dentries
)
1834 dout(10) << *dir
<< dendl
;
1837 dir
->clear_replica_map();
1840 ceph_assert(dir
->is_auth());
1841 dir
->state_clear(CDir::STATE_AUTH
);
1842 dir
->remove_bloom();
1843 dir
->replica_nonce
= CDir::EXPORT_NONCE
;
1845 if (dir
->is_dirty())
1848 // suck up all waiters
1849 dir
->take_waiting(CDir::WAIT_ANY_MASK
, finished
); // all dir waiters
1852 dir
->finish_export();
1855 std::vector
<CDir
*> subdirs
;
1856 for (auto &p
: *dir
) {
1857 CDentry
*dn
= p
.second
;
1858 CInode
*in
= dn
->get_linkage()->get_inode();
1861 dn
->finish_export();
1864 if (dn
->get_linkage()->is_primary()) {
1865 finish_export_inode(in
, peer
, peer_imported
[in
->ino()], finished
);
1868 auto&& dirs
= in
->get_nested_dirfrags();
1869 subdirs
.insert(std::end(subdirs
), std::begin(dirs
), std::end(dirs
));
1872 mdcache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
1877 for (const auto& dir
: subdirs
) {
1878 finish_export_dir(dir
, peer
, peer_imported
, finished
, num_dentries
);
1882 class C_MDS_ExportFinishLogged
: public MigratorLogContext
{
1885 C_MDS_ExportFinishLogged(Migrator
*m
, CDir
*d
) : MigratorLogContext(m
), dir(d
) {}
1886 void finish(int r
) override
{
1887 mig
->export_logged_finish(dir
);
1893 * i should get an export_ack from the export target.
1895 void Migrator::handle_export_ack(const cref_t
<MExportDirAck
> &m
)
1897 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
1898 mds_rank_t
dest(m
->get_source().num());
1900 ceph_assert(dir
->is_frozen_tree_root()); // i'm exporting!
1903 dout(7) << *dir
<< dendl
;
1905 mds
->hit_export_target(dest
, -1);
1907 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1908 ceph_assert(it
!= export_state
.end());
1909 ceph_assert(it
->second
.state
== EXPORT_EXPORTING
);
1910 ceph_assert(it
->second
.tid
== m
->get_tid());
1912 auto bp
= m
->imported_caps
.cbegin();
1913 decode(it
->second
.peer_imported
, bp
);
1915 it
->second
.state
= EXPORT_LOGGINGFINISH
;
1916 ceph_assert(g_conf()->mds_kill_export_at
!= 9);
1918 mdcache
->get_subtree_bounds(dir
, bounds
);
1921 // include export bounds, to ensure they're in the journal.
1922 EExport
*le
= new EExport(mds
->mdlog
, dir
, it
->second
.peer
);;
1923 mds
->mdlog
->start_entry(le
);
1925 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
1926 le
->metablob
.add_dir(dir
, false);
1927 for (set
<CDir
*>::iterator p
= bounds
.begin();
1931 le
->get_bounds().insert(bound
->dirfrag());
1932 le
->metablob
.add_dir_context(bound
);
1933 le
->metablob
.add_dir(bound
, false);
1936 // list us second, them first.
1937 // this keeps authority().first in sync with subtree auth state in the journal.
1938 mdcache
->adjust_subtree_auth(dir
, it
->second
.peer
, mds
->get_nodeid());
1940 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1941 mds
->mdlog
->submit_entry(le
, new C_MDS_ExportFinishLogged(this, dir
));
1942 mds
->mdlog
->flush();
1943 ceph_assert(g_conf()->mds_kill_export_at
!= 10);
1946 void Migrator::export_notify_abort(CDir
*dir
, export_state_t
& stat
, set
<CDir
*>& bounds
)
1948 dout(7) << *dir
<< dendl
;
1950 ceph_assert(stat
.state
== EXPORT_CANCELLING
);
1952 if (stat
.notify_ack_waiting
.empty()) {
1953 stat
.state
= EXPORT_CANCELLED
;
1957 dir
->auth_pin(this);
1959 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
1960 p
!= stat
.notify_ack_waiting
.end();
1962 auto notify
= make_message
<MExportDirNotify
>(dir
->dirfrag(), stat
.tid
, true,
1963 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
1964 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
1965 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
1966 notify
->get_bounds().push_back((*i
)->dirfrag());
1967 mds
->send_message_mds(notify
, *p
);
1972 * this happens if the dest failes after i send the export data but before it is acked
1973 * that is, we don't know they safely received and logged it, so we reverse our changes
1976 void Migrator::export_reverse(CDir
*dir
, export_state_t
& stat
)
1978 dout(7) << *dir
<< dendl
;
1980 set
<CInode
*> to_eval
;
1983 mdcache
->get_subtree_bounds(dir
, bounds
);
1985 // remove exporting pins
1986 std::deque
<CDir
*> rq
;
1988 while (!rq
.empty()) {
1989 CDir
*t
= rq
.front();
1992 for (auto &p
: *t
) {
1993 CDentry
*dn
= p
.second
;
1995 if (!dn
->get_linkage()->is_primary())
1997 CInode
*in
= dn
->get_linkage()->get_inode();
1999 if (in
->state_test(CInode::STATE_EVALSTALECAPS
)) {
2000 in
->state_clear(CInode::STATE_EVALSTALECAPS
);
2004 auto&& dirs
= in
->get_nested_dirfrags();
2005 for (const auto& dir
: dirs
) {
2013 for (auto bd
: bounds
) {
2014 bd
->put(CDir::PIN_EXPORTBOUND
);
2015 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
2018 // notify bystanders
2019 export_notify_abort(dir
, stat
, bounds
);
2021 // unfreeze tree, with possible subtree merge.
2022 mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
2024 // process delayed expires
2025 mdcache
->process_delayed_expire(dir
);
2027 dir
->unfreeze_tree();
2028 mdcache
->try_subtree_merge(dir
);
2030 // revoke/resume stale caps
2031 for (auto in
: to_eval
) {
2032 bool need_issue
= false;
2033 for (auto &p
: in
->client_caps
) {
2034 Capability
*cap
= &p
.second
;
2035 if (!cap
->is_stale()) {
2041 (!in
->is_auth() || !mds
->locker
->eval(in
, CEPH_CAP_LOCKS
)))
2042 mds
->locker
->issue_caps(in
);
2045 mdcache
->show_cache();
2050 * once i get the ack, and logged the EExportFinish(true),
2051 * send notifies (if any), otherwise go straight to finish.
2054 void Migrator::export_logged_finish(CDir
*dir
)
2056 dout(7) << *dir
<< dendl
;
2058 export_state_t
& stat
= export_state
[dir
];
2062 mdcache
->get_subtree_bounds(dir
, bounds
);
2064 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
2065 p
!= stat
.notify_ack_waiting
.end();
2067 auto notify
= make_message
<MExportDirNotify
>(dir
->dirfrag(), stat
.tid
, true,
2068 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
2069 pair
<int,int>(stat
.peer
, CDIR_AUTH_UNKNOWN
));
2071 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2072 notify
->get_bounds().push_back((*i
)->dirfrag());
2074 mds
->send_message_mds(notify
, *p
);
2077 // wait for notifyacks
2078 stat
.state
= EXPORT_NOTIFYING
;
2079 ceph_assert(g_conf()->mds_kill_export_at
!= 11);
2081 // no notifies to wait for?
2082 if (stat
.notify_ack_waiting
.empty()) {
2083 export_finish(dir
); // skip notify/notify_ack stage.
2085 // notify peer to send cap import messages to clients
2086 if (!mds
->is_cluster_degraded() ||
2087 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(stat
.peer
)) {
2088 mds
->send_message_mds(make_message
<MExportDirFinish
>(dir
->dirfrag(), false, stat
.tid
), stat
.peer
);
2090 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl
;
2097 * i'll get an ack from each bystander.
2098 * when i get them all, do the export.
2100 * i'll get an ack from each bystander.
2101 * when i get them all, unfreeze and send the finish.
2103 void Migrator::handle_export_notify_ack(const cref_t
<MExportDirNotifyAck
> &m
)
2105 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
2106 mds_rank_t
dest(m
->get_source().num());
2108 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2110 mds
->hit_export_target(dest
, -1);
2112 auto export_state_entry
= export_state
.find(dir
);
2113 if (export_state_entry
!= export_state
.end()) {
2114 export_state_t
& stat
= export_state_entry
->second
;
2115 if (stat
.state
== EXPORT_WARNING
&&
2116 stat
.warning_ack_waiting
.erase(from
)) {
2117 // exporting. process warning.
2118 dout(7) << "from " << m
->get_source()
2119 << ": exporting, processing warning on " << *dir
<< dendl
;
2120 if (stat
.warning_ack_waiting
.empty())
2121 export_go(dir
); // start export.
2122 } else if (stat
.state
== EXPORT_NOTIFYING
&&
2123 stat
.notify_ack_waiting
.erase(from
)) {
2124 // exporting. process notify.
2125 dout(7) << "from " << m
->get_source()
2126 << ": exporting, processing notify on " << *dir
<< dendl
;
2127 if (stat
.notify_ack_waiting
.empty())
2129 } else if (stat
.state
== EXPORT_CANCELLING
&&
2130 m
->get_new_auth().second
== CDIR_AUTH_UNKNOWN
&& // not warning ack
2131 stat
.notify_ack_waiting
.erase(from
)) {
2132 dout(7) << "from " << m
->get_source()
2133 << ": cancelling export, processing notify on " << *dir
<< dendl
;
2134 if (stat
.notify_ack_waiting
.empty()) {
2135 export_cancel_finish(export_state_entry
);
2140 auto import_state_entry
= import_state
.find(dir
->dirfrag());
2141 if (import_state_entry
!= import_state
.end()) {
2142 import_state_t
& stat
= import_state_entry
->second
;
2143 if (stat
.state
== IMPORT_ABORTING
) {
2145 dout(7) << "from " << m
->get_source()
2146 << ": aborting import on " << *dir
<< dendl
;
2147 ceph_assert(stat
.bystanders
.count(from
));
2148 stat
.bystanders
.erase(from
);
2149 if (stat
.bystanders
.empty())
2150 import_reverse_unfreeze(dir
);
2156 void Migrator::export_finish(CDir
*dir
)
2158 dout(3) << *dir
<< dendl
;
2160 ceph_assert(g_conf()->mds_kill_export_at
!= 12);
2161 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
2162 if (it
== export_state
.end()) {
2163 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl
;
2167 // send finish/commit to new auth
2168 if (!mds
->is_cluster_degraded() ||
2169 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
)) {
2170 mds
->send_message_mds(make_message
<MExportDirFinish
>(dir
->dirfrag(), true, it
->second
.tid
), it
->second
.peer
);
2172 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl
;
2174 ceph_assert(g_conf()->mds_kill_export_at
!= 13);
2176 // finish export (adjust local cache state)
2177 int num_dentries
= 0;
2178 MDSContext::vec finished
;
2179 finish_export_dir(dir
, it
->second
.peer
,
2180 it
->second
.peer_imported
, finished
, &num_dentries
);
2182 ceph_assert(!dir
->is_auth());
2183 mdcache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2187 mdcache
->get_subtree_bounds(dir
, bounds
);
2188 for (set
<CDir
*>::iterator p
= bounds
.begin();
2192 bd
->put(CDir::PIN_EXPORTBOUND
);
2193 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
2196 if (dir
->state_test(CDir::STATE_AUXSUBTREE
))
2197 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
2199 // discard delayed expires
2200 mdcache
->discard_delayed_expire(dir
);
2202 dout(7) << "unfreezing" << dendl
;
2204 // unfreeze tree, with possible subtree merge.
2205 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
2206 dir
->unfreeze_tree();
2207 mdcache
->try_subtree_merge(dir
);
2209 // no more auth subtree? clear scatter dirty
2210 if (!dir
->get_inode()->is_auth() &&
2211 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2212 dir
->get_inode()->clear_scatter_dirty();
2213 // wake up scatter_nudge waiters
2214 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
2217 if (!finished
.empty())
2218 mds
->queue_waiters(finished
);
2220 MutationRef mut
= std::move(it
->second
.mut
);
2221 auto parent
= std::move(it
->second
.parent
);
2222 // remove from exporting list, clean up state
2223 total_exporting_size
-= it
->second
.approx_size
;
2224 export_state
.erase(it
);
2226 ceph_assert(dir
->state_test(CDir::STATE_EXPORTING
));
2227 dir
->clear_exporting();
2229 mdcache
->show_subtrees();
2232 mdcache
->trim(num_dentries
); // try trimming exported dentries
2234 // send pending import_maps?
2235 mdcache
->maybe_send_pending_resolves();
2237 // drop locks, unpin path
2239 mds
->locker
->drop_locks(mut
.get());
2244 child_export_finish(parent
, true);
2246 maybe_do_queued_export();
2251 class C_MDS_ExportDiscover
: public MigratorContext
{
2253 C_MDS_ExportDiscover(Migrator
*mig
, const cref_t
<MExportDirDiscover
>& m
) : MigratorContext(mig
), m(m
) {}
2254 void finish(int r
) override
{
2255 mig
->handle_export_discover(m
, true);
2258 cref_t
<MExportDirDiscover
> m
;
2261 class C_MDS_ExportDiscoverFactory
: public MDSContextFactory
{
2263 C_MDS_ExportDiscoverFactory(Migrator
*mig
, cref_t
<MExportDirDiscover
> m
) : mig(mig
), m(m
) {}
2264 MDSContext
*build() {
2265 return new C_MDS_ExportDiscover(mig
, m
);
2269 cref_t
<MExportDirDiscover
> m
;
2272 // ==========================================================
2275 void Migrator::handle_export_discover(const cref_t
<MExportDirDiscover
> &m
, bool started
)
2277 mds_rank_t from
= m
->get_source_mds();
2278 ceph_assert(from
!= mds
->get_nodeid());
2280 dout(7) << m
->get_path() << dendl
;
2282 // note import state
2283 dirfrag_t df
= m
->get_dirfrag();
2285 if (!mds
->is_active()) {
2286 dout(7) << " not active, send NACK " << dendl
;
2287 mds
->send_message_mds(make_message
<MExportDirDiscoverAck
>(df
, m
->get_tid(), false), from
);
2291 // only start discovering on this message once.
2292 import_state_t
*p_state
;
2293 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2295 ceph_assert(it
== import_state
.end());
2296 p_state
= &import_state
[df
];
2297 p_state
->state
= IMPORT_DISCOVERING
;
2298 p_state
->peer
= from
;
2299 p_state
->tid
= m
->get_tid();
2301 // am i retrying after ancient path_traverse results?
2302 if (it
== import_state
.end() ||
2303 it
->second
.peer
!= from
||
2304 it
->second
.tid
!= m
->get_tid()) {
2305 dout(7) << " dropping obsolete message" << dendl
;
2308 ceph_assert(it
->second
.state
== IMPORT_DISCOVERING
);
2309 p_state
= &it
->second
;
2312 C_MDS_ExportDiscoverFactory
cf(this, m
);
2313 if (!mdcache
->is_open()) {
2314 dout(10) << " waiting for root" << dendl
;
2315 mds
->mdcache
->wait_for_open(cf
.build());
2319 ceph_assert(g_conf()->mds_kill_import_at
!= 1);
2322 CInode
*in
= mdcache
->get_inode(m
->get_dirfrag().ino
);
2324 // must discover it!
2325 filepath
fpath(m
->get_path());
2326 vector
<CDentry
*> trace
;
2327 MDRequestRef null_ref
;
2328 int r
= mdcache
->path_traverse(null_ref
, cf
, fpath
,
2329 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_PATH_LOCKED
,
2333 dout(7) << "failed to discover or not dir " << m
->get_path() << ", NAK" << dendl
;
2334 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2337 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2341 dout(7) << "have " << df
<< " inode " << *in
<< dendl
;
2343 p_state
->state
= IMPORT_DISCOVERED
;
2345 // pin inode in the cache (for now)
2346 ceph_assert(in
->is_dir());
2347 in
->get(CInode::PIN_IMPORTING
);
2350 dout(7) << " sending export_discover_ack on " << *in
<< dendl
;
2351 mds
->send_message_mds(make_message
<MExportDirDiscoverAck
>(df
, m
->get_tid()), p_state
->peer
);
2352 ceph_assert(g_conf()->mds_kill_import_at
!= 2);
2355 void Migrator::import_reverse_discovering(dirfrag_t df
)
2357 import_state
.erase(df
);
2360 void Migrator::import_reverse_discovered(dirfrag_t df
, CInode
*diri
)
2363 diri
->put(CInode::PIN_IMPORTING
);
2364 import_state
.erase(df
);
2367 void Migrator::import_reverse_prepping(CDir
*dir
, import_state_t
& stat
)
2370 mdcache
->map_dirfrag_set(stat
.bound_ls
, bounds
);
2371 import_remove_pins(dir
, bounds
);
2372 import_reverse_final(dir
);
2375 void Migrator::handle_export_cancel(const cref_t
<MExportDirCancel
> &m
)
2377 dout(7) << "on " << m
->get_dirfrag() << dendl
;
2378 dirfrag_t df
= m
->get_dirfrag();
2379 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2380 if (it
== import_state
.end()) {
2381 ceph_abort_msg("got export_cancel in weird state");
2382 } else if (it
->second
.state
== IMPORT_DISCOVERING
) {
2383 import_reverse_discovering(df
);
2384 } else if (it
->second
.state
== IMPORT_DISCOVERED
) {
2385 CInode
*in
= mdcache
->get_inode(df
.ino
);
2387 import_reverse_discovered(df
, in
);
2388 } else if (it
->second
.state
== IMPORT_PREPPING
) {
2389 CDir
*dir
= mdcache
->get_dirfrag(df
);
2391 import_reverse_prepping(dir
, it
->second
);
2392 } else if (it
->second
.state
== IMPORT_PREPPED
) {
2393 CDir
*dir
= mdcache
->get_dirfrag(df
);
2396 mdcache
->get_subtree_bounds(dir
, bounds
);
2397 import_remove_pins(dir
, bounds
);
2398 // adjust auth back to the exportor
2399 mdcache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2400 import_reverse_unfreeze(dir
);
2402 ceph_abort_msg("got export_cancel in weird state");
2406 class C_MDS_ExportPrep
: public MigratorContext
{
2408 C_MDS_ExportPrep(Migrator
*mig
, const cref_t
<MExportDirPrep
>& m
) : MigratorContext(mig
), m(m
) {}
2409 void finish(int r
) override
{
2410 mig
->handle_export_prep(m
, true);
2413 cref_t
<MExportDirPrep
> m
;
2416 class C_MDS_ExportPrepFactory
: public MDSContextFactory
{
2418 C_MDS_ExportPrepFactory(Migrator
*mig
, cref_t
<MExportDirPrep
> m
) : mig(mig
), m(m
) {}
2419 MDSContext
*build() {
2420 return new C_MDS_ExportPrep(mig
, m
);
2424 cref_t
<MExportDirPrep
> m
;
2427 void Migrator::decode_export_prep_trace(bufferlist::const_iterator
& blp
, mds_rank_t oldauth
, MDSContext::vec
& finished
)
2429 DECODE_START(1, blp
);
2434 dout(10) << " trace from " << df
<< " start " << start
<< dendl
;
2436 CDir
*cur
= nullptr;
2438 cur
= mdcache
->get_dirfrag(df
);
2440 dout(10) << " had " << *cur
<< dendl
;
2441 } else if (start
== 'f') {
2442 CInode
*in
= mdcache
->get_inode(df
.ino
);
2444 dout(10) << " had " << *in
<< dendl
;
2445 mdcache
->decode_replica_dir(cur
, blp
, in
, oldauth
, finished
);
2446 dout(10) << " added " << *cur
<< dendl
;
2447 } else if (start
== '-') {
2450 ceph_abort_msg("unrecognized start char");
2452 while (!blp
.end()) {
2453 CDentry
*dn
= nullptr;
2454 mdcache
->decode_replica_dentry(dn
, blp
, cur
, finished
);
2455 dout(10) << " added " << *dn
<< dendl
;
2456 CInode
*in
= nullptr;
2457 mdcache
->decode_replica_inode(in
, blp
, dn
, finished
);
2458 dout(10) << " added " << *in
<< dendl
;
2461 mdcache
->decode_replica_dir(cur
, blp
, in
, oldauth
, finished
);
2462 dout(10) << " added " << *cur
<< dendl
;
2468 void Migrator::handle_export_prep(const cref_t
<MExportDirPrep
> &m
, bool did_assim
)
2470 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2471 ceph_assert(oldauth
!= mds
->get_nodeid());
2475 MDSContext::vec finished
;
2477 // assimilate root dir.
2478 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2480 ceph_assert(it
!= import_state
.end());
2481 ceph_assert(it
->second
.state
== IMPORT_DISCOVERED
);
2482 ceph_assert(it
->second
.peer
== oldauth
);
2483 diri
= mdcache
->get_inode(m
->get_dirfrag().ino
);
2485 auto p
= m
->basedir
.cbegin();
2486 mdcache
->decode_replica_dir(dir
, p
, diri
, oldauth
, finished
);
2487 dout(7) << "on " << *dir
<< " (first pass)" << dendl
;
2489 if (it
== import_state
.end() ||
2490 it
->second
.peer
!= oldauth
||
2491 it
->second
.tid
!= m
->get_tid()) {
2492 dout(7) << "obsolete message, dropping" << dendl
;
2495 ceph_assert(it
->second
.state
== IMPORT_PREPPING
);
2496 ceph_assert(it
->second
.peer
== oldauth
);
2498 dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
2500 dout(7) << "on " << *dir
<< " (subsequent pass)" << dendl
;
2501 diri
= dir
->get_inode();
2503 ceph_assert(dir
->is_auth() == false);
2505 mdcache
->show_subtrees();
2507 // build import bound map
2508 map
<inodeno_t
, fragset_t
> import_bound_fragset
;
2509 for (const auto &bound
: m
->get_bounds()) {
2510 dout(10) << " bound " << bound
<< dendl
;
2511 import_bound_fragset
[bound
.ino
].insert_raw(bound
.frag
);
2513 // assimilate contents?
2515 dout(7) << "doing assim on " << *dir
<< dendl
;
2517 // change import state
2518 it
->second
.state
= IMPORT_PREPPING
;
2519 it
->second
.bound_ls
= m
->get_bounds();
2520 it
->second
.bystanders
= m
->get_bystanders();
2521 ceph_assert(g_conf()->mds_kill_import_at
!= 3);
2524 dout(7) << "bystanders are " << it
->second
.bystanders
<< dendl
;
2527 diri
->put(CInode::PIN_IMPORTING
);
2528 dir
->get(CDir::PIN_IMPORTING
);
2529 dir
->state_set(CDir::STATE_IMPORTING
);
2531 // assimilate traces to exports
2532 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2533 for (const auto &bl
: m
->traces
) {
2534 auto blp
= bl
.cbegin();
2535 decode_export_prep_trace(blp
, oldauth
, finished
);
2538 // make bound sticky
2539 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2540 p
!= import_bound_fragset
.end();
2542 p
->second
.simplify();
2543 CInode
*in
= mdcache
->get_inode(p
->first
);
2545 in
->get_stickydirs();
2546 dout(7) << " set stickydirs on bound inode " << *in
<< dendl
;
2550 dout(7) << " not doing assim on " << *dir
<< dendl
;
2553 MDSGatherBuilder
gather(g_ceph_context
);
2555 if (!finished
.empty())
2556 mds
->queue_waiters(finished
);
2559 bool success
= true;
2560 if (mds
->is_active()) {
2562 set
<CDir
*> import_bounds
;
2563 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2564 p
!= import_bound_fragset
.end();
2566 CInode
*in
= mdcache
->get_inode(p
->first
);
2569 // map fragset into a frag_t list, based on the inode fragtree
2571 for (const auto& frag
: p
->second
) {
2572 in
->dirfragtree
.get_leaves_under(frag
, leaves
);
2574 dout(10) << " bound inode " << p
->first
<< " fragset " << p
->second
<< " maps to " << leaves
<< dendl
;
2576 for (const auto& leaf
: leaves
) {
2577 CDir
*bound
= mdcache
->get_dirfrag(dirfrag_t(p
->first
, leaf
));
2579 dout(7) << " opening bounding dirfrag " << leaf
<< " on " << *in
<< dendl
;
2580 mdcache
->open_remote_dirfrag(in
, leaf
, gather
.new_sub());
2584 if (!bound
->state_test(CDir::STATE_IMPORTBOUND
)) {
2585 dout(7) << " pinning import bound " << *bound
<< dendl
;
2586 bound
->get(CDir::PIN_IMPORTBOUND
);
2587 bound
->state_set(CDir::STATE_IMPORTBOUND
);
2589 dout(7) << " already pinned import bound " << *bound
<< dendl
;
2591 import_bounds
.insert(bound
);
2595 if (gather
.has_subs()) {
2596 C_MDS_ExportPrepFactory
cf(this, m
);
2597 gather
.set_finisher(cf
.build());
2602 dout(7) << " all ready, noting auth and freezing import region" << dendl
;
2604 if (!mdcache
->is_readonly() &&
2605 // for pinning scatter gather. loner has a higher chance to get wrlock
2606 diri
->filelock
.can_wrlock(diri
->get_loner()) &&
2607 diri
->nestlock
.can_wrlock(diri
->get_loner())) {
2608 it
->second
.mut
= new MutationImpl();
2609 // force some locks. hacky.
2610 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, it
->second
.mut
);
2611 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, it
->second
.mut
);
2613 // note that i am an ambiguous auth for this subtree.
2614 // specify bounds, since the exporter explicitly defines the region.
2615 mdcache
->adjust_bounded_subtree_auth(dir
, import_bounds
,
2616 pair
<int,int>(oldauth
, mds
->get_nodeid()));
2617 mdcache
->verify_subtree_bounds(dir
, import_bounds
);
2619 dir
->_freeze_tree();
2621 it
->second
.state
= IMPORT_PREPPED
;
2623 dout(7) << " couldn't acquire all needed locks, failing. " << *dir
<< dendl
;
2627 dout(7) << " not active, failing. " << *dir
<< dendl
;
2632 import_reverse_prepping(dir
, it
->second
);
2635 dout(7) << " sending export_prep_ack on " << *dir
<< dendl
;
2636 mds
->send_message(make_message
<MExportDirPrepAck
>(dir
->dirfrag(), success
, m
->get_tid()), m
->get_connection());
2638 ceph_assert(g_conf()->mds_kill_import_at
!= 4);
2644 class C_MDS_ImportDirLoggedStart
: public MigratorLogContext
{
2649 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
2651 C_MDS_ImportDirLoggedStart(Migrator
*m
, CDir
*d
, mds_rank_t f
) :
2652 MigratorLogContext(m
), df(d
->dirfrag()), dir(d
), from(f
) {
2653 dir
->get(CDir::PIN_PTRWAITER
);
2655 void finish(int r
) override
{
2656 mig
->import_logged_start(df
, dir
, from
, imported_session_map
);
2657 dir
->put(CDir::PIN_PTRWAITER
);
2661 void Migrator::handle_export_dir(const cref_t
<MExportDir
> &m
)
2663 ceph_assert(g_conf()->mds_kill_import_at
!= 5);
2664 CDir
*dir
= mdcache
->get_dirfrag(m
->dirfrag
);
2667 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2668 dout(7) << "importing " << *dir
<< " from " << oldauth
<< dendl
;
2670 ceph_assert(!dir
->is_auth());
2671 ceph_assert(dir
->freeze_tree_state
);
2673 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->dirfrag
);
2674 ceph_assert(it
!= import_state
.end());
2675 ceph_assert(it
->second
.state
== IMPORT_PREPPED
);
2676 ceph_assert(it
->second
.tid
== m
->get_tid());
2677 ceph_assert(it
->second
.peer
== oldauth
);
2679 if (!dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()))
2680 dir
->get_inode()->dirfragtree
.force_to_leaf(g_ceph_context
, dir
->get_frag());
2682 mdcache
->show_subtrees();
2684 C_MDS_ImportDirLoggedStart
*onlogged
= new C_MDS_ImportDirLoggedStart(this, dir
, oldauth
);
2686 // start the journal entry
2687 EImportStart
*le
= new EImportStart(mds
->mdlog
, dir
->dirfrag(), m
->bounds
, oldauth
);
2688 mds
->mdlog
->start_entry(le
);
2690 le
->metablob
.add_dir_context(dir
);
2692 // adjust auth (list us _first_)
2693 mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), oldauth
);
2695 // new client sessions, open these after we journal
2696 // include imported sessions in EImportStart
2697 auto cmp
= m
->client_map
.cbegin();
2698 map
<client_t
,entity_inst_t
> client_map
;
2699 map
<client_t
,client_metadata_t
> client_metadata_map
;
2700 decode(client_map
, cmp
);
2701 decode(client_metadata_map
, cmp
);
2702 ceph_assert(cmp
.end());
2703 le
->cmapv
= mds
->server
->prepare_force_open_sessions(client_map
, client_metadata_map
,
2704 onlogged
->imported_session_map
);
2705 encode(client_map
, le
->client_map
, mds
->mdsmap
->get_up_features());
2706 encode(client_metadata_map
, le
->client_map
);
2708 auto blp
= m
->export_data
.cbegin();
2709 int num_imported_inodes
= 0;
2710 while (!blp
.end()) {
2711 decode_import_dir(blp
,
2715 mds
->mdlog
->get_current_segment(),
2716 it
->second
.peer_exports
,
2717 it
->second
.updated_scatterlocks
,
2718 num_imported_inodes
);
2720 dout(10) << " " << m
->bounds
.size() << " imported bounds" << dendl
;
2722 // include bounds in EImportStart
2723 set
<CDir
*> import_bounds
;
2724 for (const auto &bound
: m
->bounds
) {
2725 CDir
*bd
= mdcache
->get_dirfrag(bound
);
2727 le
->metablob
.add_dir(bd
, false); // note that parent metadata is already in the event
2728 import_bounds
.insert(bd
);
2730 mdcache
->verify_subtree_bounds(dir
, import_bounds
);
2732 // adjust popularity
2733 mds
->balancer
->add_import(dir
);
2735 dout(7) << "did " << *dir
<< dendl
;
2738 it
->second
.state
= IMPORT_LOGGINGSTART
;
2739 ceph_assert(g_conf()->mds_kill_import_at
!= 6);
2742 mds
->mdlog
->submit_entry(le
, onlogged
);
2743 mds
->mdlog
->flush();
2747 mds
->logger
->inc(l_mds_imported
);
2748 mds
->logger
->inc(l_mds_imported_inodes
, num_imported_inodes
);
2754 * this is an import helper
2755 * called by import_finish, and import_reverse and friends.
2757 void Migrator::import_remove_pins(CDir
*dir
, set
<CDir
*>& bounds
)
2759 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2761 dir
->put(CDir::PIN_IMPORTING
);
2762 dir
->state_clear(CDir::STATE_IMPORTING
);
2766 for (list
<dirfrag_t
>::iterator p
= stat
.bound_ls
.begin();
2767 p
!= stat
.bound_ls
.end();
2769 if (did
.count(p
->ino
))
2772 CInode
*in
= mdcache
->get_inode(p
->ino
);
2774 in
->put_stickydirs();
2777 if (stat
.state
== IMPORT_PREPPING
) {
2778 for (auto bd
: bounds
) {
2779 if (bd
->state_test(CDir::STATE_IMPORTBOUND
)) {
2780 bd
->put(CDir::PIN_IMPORTBOUND
);
2781 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
2784 } else if (stat
.state
>= IMPORT_PREPPED
) {
2785 // bounding dirfrags
2786 for (auto bd
: bounds
) {
2787 ceph_assert(bd
->state_test(CDir::STATE_IMPORTBOUND
));
2788 bd
->put(CDir::PIN_IMPORTBOUND
);
2789 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
2794 class C_MDC_QueueContexts
: public MigratorContext
{
2796 MDSContext::vec contexts
;
2797 C_MDC_QueueContexts(Migrator
*m
) : MigratorContext(m
) {}
2798 void finish(int r
) override
{
2799 // execute contexts immediately after 'this' context
2800 get_mds()->queue_waiters_front(contexts
);
 * note: this does the full work of reversing an import and cleaning up.
2807 * called by both handle_mds_failure and by handle_resolve (if we are
2808 * a survivor coping with an exporter failure+recovery).
2810 void Migrator::import_reverse(CDir
*dir
)
2812 dout(7) << *dir
<< dendl
;
2814 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2815 stat
.state
= IMPORT_ABORTING
;
2818 mdcache
->get_subtree_bounds(dir
, bounds
);
2821 import_remove_pins(dir
, bounds
);
2823 // update auth, with possible subtree merge.
2824 ceph_assert(dir
->is_subtree_root());
2825 if (mds
->is_resolve())
2826 mdcache
->trim_non_auth_subtree(dir
);
2828 mdcache
->adjust_subtree_auth(dir
, stat
.peer
);
2830 auto fin
= new C_MDC_QueueContexts(this);
2831 if (!dir
->get_inode()->is_auth() &&
2832 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2833 dir
->get_inode()->clear_scatter_dirty();
2834 // wake up scatter_nudge waiters
2835 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2838 int num_dentries
= 0;
2839 // adjust auth bits.
2840 std::deque
<CDir
*> q
;
2842 while (!q
.empty()) {
2843 CDir
*cur
= q
.front();
2847 cur
->abort_import();
2849 for (auto &p
: *cur
) {
2850 CDentry
*dn
= p
.second
;
2853 dn
->state_clear(CDentry::STATE_AUTH
);
2854 dn
->clear_replica_map();
2855 dn
->set_replica_nonce(CDentry::EXPORT_NONCE
);
2860 if (dn
->get_linkage()->is_primary()) {
2861 CInode
*in
= dn
->get_linkage()->get_inode();
2862 in
->state_clear(CDentry::STATE_AUTH
);
2863 in
->clear_replica_map();
2864 in
->set_replica_nonce(CInode::EXPORT_NONCE
);
2867 in
->clear_dirty_rstat();
2868 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2869 in
->clear_scatter_dirty();
2870 in
->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2873 in
->clear_dirty_parent();
2875 in
->clear_clientwriteable();
2876 in
->state_clear(CInode::STATE_NEEDSRECOVER
);
2878 in
->authlock
.clear_gather();
2879 in
->linklock
.clear_gather();
2880 in
->dirfragtreelock
.clear_gather();
2881 in
->filelock
.clear_gather();
2883 in
->clear_file_locks();
2885 // non-bounding dir?
2886 auto&& dfs
= in
->get_dirfrags();
2887 for (const auto& dir
: dfs
) {
2888 if (bounds
.count(dir
) == 0)
2893 mdcache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
2898 dir
->add_waiter(CDir::WAIT_UNFREEZE
, fin
);
2900 if (stat
.state
== IMPORT_ACKING
) {
2901 // remove imported caps
2902 for (map
<CInode
*,map
<client_t
,Capability::Export
> >::iterator p
= stat
.peer_exports
.begin();
2903 p
!= stat
.peer_exports
.end();
2905 CInode
*in
= p
->first
;
2906 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
2907 q
!= p
->second
.end();
2909 Capability
*cap
= in
->get_client_cap(q
->first
);
2911 ceph_assert(!stat
.session_map
.count(q
->first
));
2914 if (cap
->is_importing())
2915 in
->remove_client_cap(q
->first
);
2917 cap
->clear_clientwriteable();
2919 in
->put(CInode::PIN_IMPORTINGCAPS
);
2921 for (auto& p
: stat
.session_map
) {
2922 Session
*session
= p
.second
.first
;
2923 session
->dec_importing();
2928 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false)); // log failure
2930 mdcache
->trim(num_dentries
); // try trimming dentries
2932 // notify bystanders; wait in aborting state
2933 import_notify_abort(dir
, bounds
);
2936 void Migrator::import_notify_finish(CDir
*dir
, set
<CDir
*>& bounds
)
2938 dout(7) << *dir
<< dendl
;
2940 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2941 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
2942 p
!= stat
.bystanders
.end();
2944 auto notify
= make_message
<MExportDirNotify
>(dir
->dirfrag(), stat
.tid
, false,
2945 pair
<int,int>(stat
.peer
, mds
->get_nodeid()),
2946 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
2947 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2948 notify
->get_bounds().push_back((*i
)->dirfrag());
2949 mds
->send_message_mds(notify
, *p
);
2953 void Migrator::import_notify_abort(CDir
*dir
, set
<CDir
*>& bounds
)
2955 dout(7) << *dir
<< dendl
;
2957 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2958 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
2959 p
!= stat
.bystanders
.end(); ) {
2960 if (mds
->is_cluster_degraded() &&
2961 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)) {
2962 // this can happen if both exporter and bystander fail in the same mdsmap epoch
2963 stat
.bystanders
.erase(p
++);
2966 auto notify
= make_message
<MExportDirNotify
>(dir
->dirfrag(), stat
.tid
, true,
2967 mds_authority_t(stat
.peer
, mds
->get_nodeid()),
2968 mds_authority_t(stat
.peer
, CDIR_AUTH_UNKNOWN
));
2969 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2970 notify
->get_bounds().push_back((*i
)->dirfrag());
2971 mds
->send_message_mds(notify
, *p
);
2974 if (stat
.bystanders
.empty()) {
2975 dout(7) << "no bystanders, finishing reverse now" << dendl
;
2976 import_reverse_unfreeze(dir
);
2978 ceph_assert(g_conf()->mds_kill_import_at
!= 10);
2982 void Migrator::import_reverse_unfreeze(CDir
*dir
)
2984 dout(7) << *dir
<< dendl
;
2985 ceph_assert(!dir
->is_auth());
2986 mdcache
->discard_delayed_expire(dir
);
2987 dir
->unfreeze_tree();
2988 if (dir
->is_subtree_root())
2989 mdcache
->try_subtree_merge(dir
);
2990 import_reverse_final(dir
);
2993 void Migrator::import_reverse_final(CDir
*dir
)
2995 dout(7) << *dir
<< dendl
;
2998 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
2999 ceph_assert(it
!= import_state
.end());
3001 MutationRef mut
= it
->second
.mut
;
3002 import_state
.erase(it
);
3004 // send pending import_maps?
3005 mdcache
->maybe_send_pending_resolves();
3008 mds
->locker
->drop_locks(mut
.get());
3012 mdcache
->show_subtrees();
3013 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
3019 void Migrator::import_logged_start(dirfrag_t df
, CDir
*dir
, mds_rank_t from
,
3020 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
)
3022 dout(7) << *dir
<< dendl
;
3024 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
3025 if (it
== import_state
.end() ||
3026 it
->second
.state
!= IMPORT_LOGGINGSTART
) {
3027 dout(7) << "import " << df
<< " must have aborted" << dendl
;
3028 mds
->server
->finish_force_open_sessions(imported_session_map
);
3033 it
->second
.state
= IMPORT_ACKING
;
3035 ceph_assert(g_conf()->mds_kill_import_at
!= 7);
3037 // force open client sessions and finish cap import
3038 mds
->server
->finish_force_open_sessions(imported_session_map
, false);
3040 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
3041 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
3042 p
!= it
->second
.peer_exports
.end();
3044 // parameter 'peer' is NONE, delay sending cap import messages to client
3045 finish_import_inode_caps(p
->first
, MDS_RANK_NONE
, true, imported_session_map
,
3046 p
->second
, imported_caps
[p
->first
->ino()]);
3049 it
->second
.session_map
.swap(imported_session_map
);
3051 // send notify's etc.
3052 dout(7) << "sending ack for " << *dir
<< " to old auth mds." << from
<< dendl
;
3054 // test surviving observer of a failed migration that did not complete
3055 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
3057 auto ack
= make_message
<MExportDirAck
>(dir
->dirfrag(), it
->second
.tid
);
3058 encode(imported_caps
, ack
->imported_caps
);
3060 mds
->send_message_mds(ack
, from
);
3061 ceph_assert(g_conf()->mds_kill_import_at
!= 8);
3063 mdcache
->show_subtrees();
3066 void Migrator::handle_export_finish(const cref_t
<MExportDirFinish
> &m
)
3068 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
3070 dout(7) << *dir
<< (m
->is_last() ? " last" : "") << dendl
;
3072 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
3073 ceph_assert(it
!= import_state
.end());
3074 ceph_assert(it
->second
.tid
== m
->get_tid());
3076 import_finish(dir
, false, m
->is_last());
3079 void Migrator::import_finish(CDir
*dir
, bool notify
, bool last
)
3081 dout(7) << *dir
<< dendl
;
3083 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
3084 ceph_assert(it
!= import_state
.end());
3085 ceph_assert(it
->second
.state
== IMPORT_ACKING
|| it
->second
.state
== IMPORT_FINISHING
);
3087 if (it
->second
.state
== IMPORT_ACKING
) {
3088 ceph_assert(dir
->is_auth());
3089 mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
3093 ceph_assert(g_conf()->mds_kill_import_at
!= 9);
3095 if (it
->second
.state
== IMPORT_ACKING
) {
3096 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
3097 p
!= it
->second
.peer_exports
.end();
3099 CInode
*in
= p
->first
;
3100 ceph_assert(in
->is_auth());
3101 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
3102 q
!= p
->second
.end();
3104 auto r
= it
->second
.session_map
.find(q
->first
);
3105 if (r
== it
->second
.session_map
.end())
3108 Session
*session
= r
->second
.first
;
3109 Capability
*cap
= in
->get_client_cap(q
->first
);
3111 cap
->merge(q
->second
, true);
3112 cap
->clear_importing();
3113 mdcache
->do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
,
3114 q
->second
.mseq
- 1, it
->second
.peer
, CEPH_CAP_FLAG_AUTH
);
3117 in
->replica_caps_wanted
= 0;
3119 for (auto& p
: it
->second
.session_map
) {
3120 Session
*session
= p
.second
.first
;
3121 session
->dec_importing();
3126 ceph_assert(it
->second
.state
== IMPORT_ACKING
);
3127 it
->second
.state
= IMPORT_FINISHING
;
3133 mdcache
->get_subtree_bounds(dir
, bounds
);
3136 import_notify_finish(dir
, bounds
);
3138 import_remove_pins(dir
, bounds
);
3140 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
3141 it
->second
.peer_exports
.swap(peer_exports
);
3143 // clear import state (we're done!)
3144 MutationRef mut
= it
->second
.mut
;
3145 import_state
.erase(it
);
3147 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3149 // process delayed expires
3150 mdcache
->process_delayed_expire(dir
);
3152 // unfreeze tree, with possible subtree merge.
3153 dir
->unfreeze_tree();
3154 mdcache
->try_subtree_merge(dir
);
3156 mdcache
->show_subtrees();
3157 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
3160 mds
->locker
->drop_locks(mut
.get());
3164 // re-eval imported caps
3165 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= peer_exports
.begin();
3166 p
!= peer_exports
.end();
3168 if (p
->first
->is_auth())
3169 mds
->locker
->eval(p
->first
, CEPH_CAP_LOCKS
, true);
3170 p
->first
->put(CInode::PIN_IMPORTINGCAPS
);
3173 // send pending import_maps?
3174 mdcache
->maybe_send_pending_resolves();
3176 // did i just import mydir?
3177 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
3178 mdcache
->populate_mydir();
3181 if (dir
->get_num_head_items() == 0 &&
3182 !dir
->inode
->is_auth()) {
3184 export_empty_import(dir
);
3188 void Migrator::decode_import_inode(CDentry
*dn
, bufferlist::const_iterator
& blp
,
3189 mds_rank_t oldauth
, LogSegment
*ls
,
3190 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
,
3191 list
<ScatterLock
*>& updated_scatterlocks
)
3195 DECODE_START(1, blp
);
3196 dout(15) << " on " << *dn
<< dendl
;
3203 in
= mdcache
->get_inode(ino
, last
);
3205 in
= new CInode(mds
->mdcache
, true, 2, last
);
3209 // state after link -- or not! -sage
3210 in
->decode_import(blp
, ls
); // cap imports are noted for later action
3213 decode_import_inode_caps(in
, true, blp
, peer_exports
);
3217 // link before state -- or not! -sage
3218 if (dn
->get_linkage()->get_inode() != in
) {
3219 ceph_assert(!dn
->get_linkage()->get_inode());
3220 dn
->dir
->link_primary_inode(dn
, in
);
3224 dn
->dir
->pop_lru_subdirs
.push_back(&in
->item_pop_lru
);
3228 mdcache
->add_inode(in
);
3229 dout(10) << "added " << *in
<< dendl
;
3231 dout(10) << " had " << *in
<< dendl
;
3234 if (in
->get_inode()->is_dirty_rstat())
3235 in
->mark_dirty_rstat();
3237 if (!in
->get_inode()->client_ranges
.empty())
3238 in
->mark_clientwriteable();
3240 // clear if dirtyscattered, since we're going to journal this
3241 // but not until we _actually_ finish the import...
3242 if (in
->filelock
.is_dirty()) {
3243 updated_scatterlocks
.push_back(&in
->filelock
);
3244 mds
->locker
->mark_updated_scatterlock(&in
->filelock
);
3247 if (in
->dirfragtreelock
.is_dirty()) {
3248 updated_scatterlocks
.push_back(&in
->dirfragtreelock
);
3249 mds
->locker
->mark_updated_scatterlock(&in
->dirfragtreelock
);
3252 // adjust replica list
3253 //assert(!in->is_replica(oldauth)); // not true on failed export
3254 in
->add_replica(oldauth
, CInode::EXPORT_NONCE
);
3255 if (in
->is_replica(mds
->get_nodeid()))
3256 in
->remove_replica(mds
->get_nodeid());
3258 if (in
->snaplock
.is_stable() &&
3259 in
->snaplock
.get_state() != LOCK_SYNC
)
3260 mds
->locker
->try_eval(&in
->snaplock
, NULL
);
3262 if (in
->policylock
.is_stable() &&
3263 in
->policylock
.get_state() != LOCK_SYNC
)
3264 mds
->locker
->try_eval(&in
->policylock
, NULL
);
3267 void Migrator::decode_import_inode_caps(CInode
*in
, bool auth_cap
,
3268 bufferlist::const_iterator
&blp
,
3269 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3271 DECODE_START(1, blp
);
3272 map
<client_t
,Capability::Export
> cap_map
;
3273 decode(cap_map
, blp
);
3275 mempool::mds_co::compact_map
<int32_t,int32_t> mds_wanted
;
3276 decode(mds_wanted
, blp
);
3277 mds_wanted
.erase(mds
->get_nodeid());
3278 in
->set_mds_caps_wanted(mds_wanted
);
3280 if (!cap_map
.empty() ||
3281 (auth_cap
&& (in
->get_caps_wanted() & ~CEPH_CAP_PIN
))) {
3282 peer_exports
[in
].swap(cap_map
);
3283 in
->get(CInode::PIN_IMPORTINGCAPS
);
3288 void Migrator::finish_import_inode_caps(CInode
*in
, mds_rank_t peer
, bool auth_cap
,
3289 const map
<client_t
,pair
<Session
*,uint64_t> >& session_map
,
3290 const map
<client_t
,Capability::Export
> &export_map
,
3291 map
<client_t
,Capability::Import
> &import_map
)
3293 const auto& client_ranges
= in
->get_projected_inode()->client_ranges
;
3294 auto r
= client_ranges
.cbegin();
3295 bool needs_recover
= false;
3297 for (auto& it
: export_map
) {
3298 dout(10) << "for client." << it
.first
<< " on " << *in
<< dendl
;
3300 auto p
= session_map
.find(it
.first
);
3301 if (p
== session_map
.end()) {
3302 dout(10) << " no session for client." << it
.first
<< dendl
;
3303 (void)import_map
[it
.first
];
3307 Session
*session
= p
->second
.first
;
3309 Capability
*cap
= in
->get_client_cap(it
.first
);
3311 cap
= in
->add_client_cap(it
.first
, session
);
3313 cap
->mark_importing();
3317 while (r
!= client_ranges
.cend() && r
->first
< it
.first
) {
3318 needs_recover
= true;
3321 if (r
!= client_ranges
.cend() && r
->first
== it
.first
) {
3322 cap
->mark_clientwriteable();
3327 // Always ask exporter mds to send cap export messages for auth caps.
3328 // For non-auth caps, ask exporter mds to send cap export messages to
3329 // clients who haven't opened sessions. The cap export messages will
3330 // make clients open sessions.
3331 if (auth_cap
|| !session
->get_connection()) {
3332 Capability::Import
& im
= import_map
[it
.first
];
3333 im
.cap_id
= cap
->get_cap_id();
3334 im
.mseq
= auth_cap
? it
.second
.mseq
: cap
->get_mseq();
3335 im
.issue_seq
= cap
->get_last_seq() + 1;
3339 cap
->merge(it
.second
, auth_cap
);
3340 mdcache
->do_cap_import(session
, in
, cap
, it
.second
.cap_id
,
3341 it
.second
.seq
, it
.second
.mseq
- 1, peer
,
3342 auth_cap
? CEPH_CAP_FLAG_AUTH
: CEPH_CAP_FLAG_RELEASE
);
3347 if (r
!= client_ranges
.cend())
3348 needs_recover
= true;
3350 in
->state_set(CInode::STATE_NEEDSRECOVER
);
3354 in
->replica_caps_wanted
= 0;
3355 in
->put(CInode::PIN_IMPORTINGCAPS
);
// Decode one exported dirfrag from the import byte stream and splice it —
// the CDir itself, then each dentry and (remote or inline) inode — into our
// cache, recording every imported piece in the journal entry's metablob.
//
// NOTE(review): this extract is whitespace-mangled and several original
// lines are not visible here (the parameters oldauth/import_root/le/ls
// used below, the dirfrag 'df' decode, the per-dentry dname/last/icode
// decodes, and most closing braces). Code is left byte-identical; only
// comments were added. Verify against the upstream file.
3359 void Migrator::decode_import_dir(bufferlist::const_iterator
& blp
,
3364 map
<CInode
*,map
<client_t
,Capability::Export
> >& peer_exports
,
3365 list
<ScatterLock
*>& updated_scatterlocks
, int &num_imported
)
// Versioned decode of the dirfrag payload starts here.
3367 DECODE_START(1, blp
);
// 'df' (a dirfrag_t) is decoded from blp on lines not visible in this
// extract; look up its inode and open/fetch the matching dirfrag.
3372 CInode
*diri
= mdcache
->get_inode(df
.ino
);
3374 CDir
*dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, df
.frag
);
3377 dout(7) << *dir
<< dendl
;
// Freshly created dirfrag: it must be brand new (version 0) and it
// inherits the frozen-tree state from the root of this import.
3379 if (!dir
->freeze_tree_state
) {
3380 ceph_assert(dir
->get_version() == 0);
3381 dir
->freeze_tree_state
= import_root
->freeze_tree_state
;
// Assimilate the exporter's dirfrag state into our cache.
3385 dir
->decode_import(blp
, ls
);
3387 // adjust replica list
3388 //assert(!dir->is_replica(oldauth)); // not true on failed export
3389 dir
->add_replica(oldauth
, CDir::EXPORT_NONCE
);
3390 if (dir
->is_replica(mds
->get_nodeid()))
3391 dir
->remove_replica(mds
->get_nodeid());
3393 // add to journal entry
3395 le
->metablob
.add_import_dir(dir
);
// NOTE(review): this local declaration shadows the 'int &num_imported'
// reference parameter above — increments inside the loop would update the
// local, never the caller's counter. Confirm upstream intent.
3397 int num_imported
= 0;
3399 // take all waiters on this dir
3400 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3401 // a replica's presence in my cache implies/forces its presence in authority's.
3402 MDSContext::vec waiters
;
3403 dir
->take_waiting(CDir::WAIT_ANY_MASK
, waiters
);
3404 for (auto c
: waiters
)
3405 dir
->add_waiter(CDir::WAIT_UNFREEZE
, c
); // UNFREEZE will get kicked both on success or failure
3407 dout(15) << "doing contents" << dendl
;
// Per-dentry loop; 'nden' (dentry count) is decoded on a line not visible
// here, as are the dname/last (snapid) decodes used below.
3413 for (; nden
>0; nden
--) {
3422 CDentry
*dn
= dir
->lookup_exact_snap(dname
, last
);
// Presumably guarded by a null-check on 'dn' (line not visible): create a
// null dentry when the exact-snap lookup missed.
3424 dn
= dir
->add_null_dentry(dname
, 1, last
);
3426 dn
->decode_import(blp
, ls
);
3428 dn
->add_replica(oldauth
, CDentry::EXPORT_NONCE
);
3429 if (dn
->is_replica(mds
->get_nodeid()))
3430 dn
->remove_replica(mds
->get_nodeid());
3432 // dentry lock in unreadable state can block path traverse
3433 if (dn
->lock
.get_state() != LOCK_SYNC
)
3434 mds
->locker
->try_eval(&dn
->lock
, NULL
);
3436 dout(15) << " got " << *dn
<< dendl
;
// Dispatch on the linkage type code 'icode' (read on a line not visible).
// This branch: null dentry — nothing further to decode.
3444 ceph_assert(dn
->get_linkage()->is_null());
// Remote link ('L'/'l'): the primary inode lives in another dirfrag;
// decode its ino, d_type and alternate_name.
3448 else if (icode
== 'L' || icode
== 'l') {
3451 unsigned char d_type
;
3452 mempool::mds_co::string alternate_name
;
3454 CDentry::decode_remote(icode
, ino
, d_type
, alternate_name
, blp
);
3456 if (dn
->get_linkage()->is_remote()) {
3457 ceph_assert(dn
->get_linkage()->get_remote_ino() == ino
);
3458 ceph_assert(dn
->get_alternate_name() == alternate_name
);
3460 dir
->link_remote_inode(dn
, ino
, d_type
);
3461 dn
->set_alternate_name(std::move(alternate_name
));
// Primary inode embedded inline ('I'/'i'); the v2 encoding additionally
// carries the dentry's alternate_name.
3464 else if (icode
== 'I' || icode
== 'i') {
3468 DECODE_START(2, blp
);
3469 decode_import_inode(dn
, blp
, oldauth
, ls
,
3470 peer_exports
, updated_scatterlocks
);
3471 ceph_assert(!dn
->is_projected());
3472 decode(dn
->alternate_name
, blp
);
// Fallback path (presumably the pre-versioned encoding; branch header not
// visible): decode the inode without the alternate_name field.
3475 decode_import_inode(dn
, blp
, oldauth
, ls
,
3476 peer_exports
, updated_scatterlocks
);
3480 // add dentry to journal entry
3482 le
->metablob
.add_import_dentry(dn
);
// Optional debug verification of frag accounting, compiled in only with
// MDS_VERIFY_FRAGSTAT.
3485 #ifdef MDS_VERIFY_FRAGSTAT
3486 if (dir
->is_complete())
3487 dir
->verify_fragstat();
// Importing this dirfrag may change where its tree should be pinned.
3490 dir
->inode
->maybe_export_pin();
3492 dout(7) << " done " << *dir
<< dendl
;
3500 // authority bystander
// Bystander handler for MExportDirNotify: two other ranks are migrating a
// subtree we replicate, and the exporter tells us the dirfrag's authority
// pair is changing from old_auth to new_auth. We update our subtree-auth
// bookkeeping and, for stage-2 notifies, ack back to the sender.
//
// NOTE(review): mangled extract — the not-ready early-return body, the
// 'if (!dir)' branch header, the declaration of 'have', and closing braces
// are on lines not visible here. Code left byte-identical; comments only.
3502 void Migrator::handle_export_notify(const cref_t
<MExportDirNotify
> &m
)
// Only process in steady states (clientreplay/active/stopping); the wait/
// retry body of this guard is not visible in this extract.
3504 if (!(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())) {
3508 CDir
*dir
= mdcache
->get_dirfrag(m
->get_dirfrag());
3510 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3511 mds_authority_t old_auth
= m
->get_old_auth();
3512 mds_authority_t new_auth
= m
->get_new_auth();
// Presumably inside an 'if (!dir)' branch (header not visible): we do not
// replicate the dirfrag, so there is nothing to adjust — just log it.
3515 dout(7) << old_auth
<< " -> " << new_auth
3516 << " on missing dir " << m
->get_dirfrag() << dendl
;
// Sanity log when our recorded authority disagrees with the notify.
3517 } else if (dir
->authority() != old_auth
) {
3518 dout(7) << "old_auth was " << dir
->authority()
3519 << " != " << old_auth
<< " -> " << new_auth
3520 << " on " << *dir
<< dendl
;
3522 dout(7) << old_auth
<< " -> " << new_auth
3523 << " on " << *dir
<< dendl
;
// Map the notify's bound dirfrags into 'have' (declared on a line not
// visible) and retarget the bounded subtree to the new authority.
3526 mdcache
->map_dirfrag_set(m
->get_bounds(), have
);
3527 mdcache
->adjust_bounded_subtree_auth(dir
, have
, new_auth
);
3530 mdcache
->try_subtree_merge(dir
);
// Stage-2 notifies request an ack so the exporter can finish the export.
3534 if (m
->wants_ack()) {
3535 mds
->send_message_mds(make_message
<MExportDirNotifyAck
>(m
->get_dirfrag(), m
->get_tid(), m
->get_new_auth()), from
);
3538 dout(7) << "no ack requested" << dendl
;
3543 void Migrator::export_caps(CInode
*in
)
3545 mds_rank_t dest
= in
->authority().first
;
3546 dout(7) << "to mds." << dest
<< " " << *in
<< dendl
;
3548 ceph_assert(in
->is_any_caps());
3549 ceph_assert(!in
->is_auth());
3550 ceph_assert(!in
->is_ambiguous_auth());
3551 ceph_assert(!in
->state_test(CInode::STATE_EXPORTINGCAPS
));
3553 auto ex
= make_message
<MExportCaps
>();
3554 ex
->ino
= in
->ino();
3556 encode_export_inode_caps(in
, false, ex
->cap_bl
, ex
->client_map
, ex
->client_metadata_map
);
3558 mds
->send_message_mds(ex
, dest
);
// Handle MExportCapsAck from the auth MDS after export_caps(): for every
// cap the importer accepted (its cap id still matches ours), tell the
// client its caps moved to the new rank and drop our local Capability,
// then stop wanting file caps for the inode and re-evaluate its locks.
//
// NOTE(review): mangled extract — the 'if (in)' guard around the body,
// the 'continue;' after the mismatch check, and closing braces are on
// lines not visible here. Code left byte-identical; comments only.
3561 void Migrator::handle_export_caps_ack(const cref_t
<MExportCapsAck
> &ack
)
3563 mds_rank_t from
= ack
->get_source().num();
3564 CInode
*in
= mdcache
->get_inode(ack
->ino
);
// We exported the caps, so by now the peer — not us — must be auth.
3566 ceph_assert(!in
->is_auth());
3568 dout(10) << *ack
<< " from "
3569 << ack
->get_source() << " on " << *in
<< dendl
;
// The ack carries two parallel decodes: the importer's per-client import
// records, and the cap ids it matched them against.
3571 map
<client_t
,Capability::Import
> imported_caps
;
3572 map
<client_t
,uint64_t> caps_ids
;
3573 auto blp
= ack
->cap_bl
.cbegin();
3574 decode(imported_caps
, blp
);
3575 decode(caps_ids
, blp
);
3577 for (auto& it
: imported_caps
) {
3578 Capability
*cap
= in
->get_client_cap(it
.first
);
// Skip clients whose cap vanished or was reissued since we exported
// (a 'continue;' follows on a line not visible in this extract).
3579 if (!cap
|| cap
->get_cap_id() != caps_ids
.at(it
.first
))
3582 dout(7) << " telling client." << it
.first
3583 << " exported caps on " << *in
<< dendl
;
// CEPH_CAP_OP_EXPORT message points the client at its new cap on the
// importing rank (peer cap id / issue_seq / mseq from the ack).
3584 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
3585 cap
->get_cap_id(), cap
->get_mseq(),
3586 mds
->get_osd_epoch_barrier());
3587 m
->set_cap_peer(it
.second
.cap_id
, it
.second
.issue_seq
, it
.second
.mseq
, from
, 0);
3588 mds
->send_message_client_counted(m
, it
.first
);
// Our copy of the cap is no longer needed once the client is told.
3590 in
->remove_client_cap(it
.first
);
// Update the auth about what file caps we (no longer) want, and let the
// locker re-evaluate the inode's cap-related locks.
3593 mds
->locker
->request_inode_file_caps(in
);
3594 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
// Handle MGatherCaps: the inode's auth MDS asks us to hand over any client
// caps we still hold on it, which we do by calling export_caps() when the
// inode is in a state where that is safe.
//
// NOTE(review): mangled extract — the null-check on 'in', one condition of
// the if (presumably '!in->is_auth() &&', original line 3608), and the
// export_caps(in) call itself are on lines not visible here. Code left
// byte-identical; comments only.
3598 void Migrator::handle_gather_caps(const cref_t
<MGatherCaps
> &m
)
3600 CInode
*in
= mdcache
->get_inode(m
->ino
);
3604 dout(10) << *m
<< " from " << m
->get_source()
3605 << " on " << *in
<< dendl
;
// Only export when we actually hold caps, authority is unambiguous, and
// no cap export is already in flight for this inode.
3607 if (in
->is_any_caps() &&
3609 !in
->is_ambiguous_auth() &&
3610 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
// Journal-completion context for handle_export_caps(): once the ESessions
// entry is safely logged, finish() hands the force-opened sessions and the
// decoded peer cap exports to Migrator::logged_import_caps().
//
// NOTE(review): mangled extract — the member declarations for 'in' and
// 'from' (initialized in the ctor below) and the class's closing braces
// are on lines not visible here. Code left byte-identical; comments only.
3614 class C_M_LoggedImportCaps
: public MigratorLogContext
{
// Sessions force-opened before journaling, keyed by client (filled in by
// prepare_force_open_sessions via handle_export_caps).
3618 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
// Per-inode cap exports decoded from the MExportCaps payload.
3619 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
3621 C_M_LoggedImportCaps(Migrator
*m
, CInode
*i
, mds_rank_t f
) : MigratorLogContext(m
), in(i
), from(f
) {}
// Invoked by the MDLog once the ESessions entry is durable.
3622 void finish(int r
) override
{
3623 mig
->logged_import_caps(in
, from
, imported_session_map
, peer_exports
);
// Auth-side handler for MExportCaps: a replica is handing us the client
// caps it held on one of our inodes. Force-open sessions for those
// clients, decode the cap exports, and journal the new sessions with an
// ESessions entry; C_M_LoggedImportCaps completes the import once the
// entry is durable.
//
// NOTE(review): mangled extract — the null/frozen checks on 'in', the body
// of the !can_auth_pin() branch (presumably add a waiter and return), the
// auth_pin, and closing braces are on lines not visible here. Code left
// byte-identical; comments only.
3627 void Migrator::handle_export_caps(const cref_t
<MExportCaps
> &ex
)
3629 dout(10) << *ex
<< " from " << ex
->get_source() << dendl
;
3630 CInode
*in
= mdcache
->get_inode(ex
->ino
);
// We are the importer here, so we must be auth for the inode.
3633 ceph_assert(in
->is_auth());
// Branch body not visible: presumably defer/retry when the inode cannot
// be auth-pinned right now.
3636 if (!in
->can_auth_pin()) {
// Copy the exporter's client identity/metadata maps; they are moved into
// the ESessions journal entry below.
3642 map
<client_t
,entity_inst_t
> client_map
{ex
->client_map
};
3643 map
<client_t
,client_metadata_t
> client_metadata_map
{ex
->client_metadata_map
};
3645 C_M_LoggedImportCaps
*finish
= new C_M_LoggedImportCaps(
3646 this, in
, mds_rank_t(ex
->get_source().num()));
// Force-open sessions for the exporting clients; pv is the projected
// sessionmap version the ESessions entry will record.
3648 version_t pv
= mds
->server
->prepare_force_open_sessions(client_map
, client_metadata_map
,
3649 finish
->imported_session_map
);
// Decode the per-client cap exports into the completion context.
3651 auto blp
= ex
->cap_bl
.cbegin();
3652 decode_import_inode_caps(in
, false, blp
, finish
->peer_exports
);
3653 ceph_assert(!finish
->peer_exports
.empty()); // thus, inode is pinned.
3655 // journal open client sessions
3656 ESessions
*le
= new ESessions(pv
, std::move(client_map
),
3657 std::move(client_metadata_map
));
3658 mds
->mdlog
->start_submit_entry(le
, finish
);
3659 mds
->mdlog
->flush();
// Completion path after the ESessions entry is journaled (see
// C_M_LoggedImportCaps): finish opening the client sessions, merge the
// exported caps into local Capability objects, and ack back to the
// exporting rank with our import records plus the matching peer cap ids
// so it can clean up its side.
//
// NOTE(review): mangled extract — the 'mds_rank_t from' parameter (used
// below), opening/closing braces, and blank lines are not visible here.
// Code left byte-identical; comments only.
3663 void Migrator::logged_import_caps(CInode
*in
,
3665 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
,
3666 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3668 dout(10) << *in
<< dendl
;
3669 // see export_go() vs export_go_synced()
3670 ceph_assert(in
->is_auth());
3672 // force open client sessions and finish cap import
3673 mds
->server
->finish_force_open_sessions(imported_session_map
);
// handle_export_caps() guaranteed this inode has decoded cap exports.
3675 auto it
= peer_exports
.find(in
);
3676 ceph_assert(it
!= peer_exports
.end());
3678 // clients will release caps from the exporter when they receive the cap import message.
3679 map
<client_t
,Capability::Import
> imported_caps
;
3680 finish_import_inode_caps(in
, from
, false, imported_session_map
, it
->second
, imported_caps
);
3681 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
// Ack to the exporter: our import records plus the exporter-side cap id
// for each client, so it can match and drop its exported caps.
3683 if (!imported_caps
.empty()) {
3684 auto ack
= make_message
<MExportCapsAck
>(in
->ino());
3685 map
<client_t
,uint64_t> peer_caps_ids
;
3686 for (auto &p
: imported_caps
)
3687 peer_caps_ids
[p
.first
] = it
->second
.at(p
.first
).cap_id
;
3689 encode(imported_caps
, ack
->cap_bl
);
3690 encode(peer_caps_ids
, ack
->cap_bl
);
3691 mds
->send_message_mds(ack
, from
);
// Drop the auth_pin taken when the import began (pin site not visible in
// this extract — presumably in handle_export_caps; verify upstream).
3694 in
->auth_unpin(this);
3697 Migrator::Migrator(MDSRank
*m
, MDCache
*c
) : mds(m
), mdcache(c
) {
3698 max_export_size
= g_conf().get_val
<Option::size_t>("mds_max_export_size");
3699 inject_session_race
= g_conf().get_val
<bool>("mds_inject_migrator_session_race");
// Apply runtime configuration changes relevant to the migrator: refresh
// max_export_size and the inject_session_race test hook when their options
// change (logging the latter, since it alters failure-injection behavior).
//
// NOTE(review): mangled extract — the function's closing braces fall past
// the end of this chunk. Code left byte-identical; comments only.
3702 void Migrator::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
)
3704 if (changed
.count("mds_max_export_size"))
3705 max_export_size
= g_conf().get_val
<Option::size_t>("mds_max_export_size");
3706 if (changed
.count("mds_inject_migrator_session_race")) {
3707 inject_session_race
= g_conf().get_val
<bool>("mds_inject_migrator_session_race");
// Loud (level 0) on purpose: this is a fault-injection switch.
3708 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race
<< dendl
;