1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "MDBalancer.h"
29 #include "include/filepath.h"
30 #include "common/likely.h"
32 #include "events/EExport.h"
33 #include "events/EImportStart.h"
34 #include "events/EImportFinish.h"
35 #include "events/ESessions.h"
37 #include "msg/Messenger.h"
39 #include "messages/MClientCaps.h"
41 #include "messages/MExportDirDiscover.h"
42 #include "messages/MExportDirDiscoverAck.h"
43 #include "messages/MExportDirCancel.h"
44 #include "messages/MExportDirPrep.h"
45 #include "messages/MExportDirPrepAck.h"
46 #include "messages/MExportDir.h"
47 #include "messages/MExportDirAck.h"
48 #include "messages/MExportDirNotify.h"
49 #include "messages/MExportDirNotifyAck.h"
50 #include "messages/MExportDirFinish.h"
52 #include "messages/MExportCaps.h"
53 #include "messages/MExportCapsAck.h"
54 #include "messages/MGatherCaps.h"
58 * this is what the dir->dir_auth values look like
63 * me, me me - still me, but preparing for export
64 * me, them me - send MExportDir (peer is preparing)
65 * them, me me - journaled EExport
70 * me, them me - journaled EImportStart
74 * - auth bit is set if i am listed as first _or_ second dir_auth.
77 #include "common/config.h"
80 #define dout_context g_ceph_context
81 #define dout_subsys ceph_subsys_mds
83 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
// NOTE(review): extraction dropped interior lines of this class (member
// declarations, method bodies, closing braces); text below is kept verbatim.
// MigratorContext: internal-context base for Migrator completion callbacks;
// it stores the owning Migrator ('mig') and exposes the MDSRank via
// get_mds() -- presumably returning mig's mds; TODO confirm in full source.
86 class MigratorContext
: public MDSInternalContextBase
{
89 MDSRank
*get_mds() override
{
93 explicit MigratorContext(Migrator
*mig_
) : mig(mig_
) {
// NOTE(review): extraction dropped interior lines of this class (members,
// bodies, closing braces); text below is kept verbatim.
// MigratorLogContext: like MigratorContext but derived from MDSLogContextBase,
// used as a journal-completion callback holding the owning Migrator.
98 class MigratorLogContext
: public MDSLogContextBase
{
101 MDSRank
*get_mds() override
{
105 explicit MigratorLogContext(Migrator
*mig_
) : mig(mig_
) {
// Migrator::dispatch: message demultiplexer for all export/import subtree
// migration messages. Routes each message type to its handler; unknown
// types abort via assert.
// NOTE(review): extraction dropped interior lines (the opening '{', the
// 'break;' after each case, 'return;' inside the race branch, 'default:'
// label, closing braces); text kept verbatim below.
110 /* This function DOES put the passed message before returning*/
111 void Migrator::dispatch(Message
*m
)
113 switch (m
->get_type()) {
115 case MSG_MDS_EXPORTDIRDISCOVER
:
116 handle_export_discover(static_cast<MExportDirDiscover
*>(m
));
118 case MSG_MDS_EXPORTDIRPREP
:
119 handle_export_prep(static_cast<MExportDirPrep
*>(m
));
121 case MSG_MDS_EXPORTDIR
:
// inject_session_race is a test hook: delay handling until a client
// connects, then retry the message.
122 if (unlikely(inject_session_race
)) {
123 dout(0) << "waiting for inject_session_race" << dendl
;
124 mds
->wait_for_any_client_connection(new C_MDS_RetryMessage(mds
, m
));
126 handle_export_dir(static_cast<MExportDir
*>(m
));
129 case MSG_MDS_EXPORTDIRFINISH
:
130 handle_export_finish(static_cast<MExportDirFinish
*>(m
));
132 case MSG_MDS_EXPORTDIRCANCEL
:
133 handle_export_cancel(static_cast<MExportDirCancel
*>(m
));
137 case MSG_MDS_EXPORTDIRDISCOVERACK
:
138 handle_export_discover_ack(static_cast<MExportDirDiscoverAck
*>(m
));
140 case MSG_MDS_EXPORTDIRPREPACK
:
141 handle_export_prep_ack(static_cast<MExportDirPrepAck
*>(m
));
143 case MSG_MDS_EXPORTDIRACK
:
144 handle_export_ack(static_cast<MExportDirAck
*>(m
));
146 case MSG_MDS_EXPORTDIRNOTIFYACK
:
147 handle_export_notify_ack(static_cast<MExportDirNotifyAck
*>(m
));
150 // export 3rd party (dir_auth adjustments)
151 case MSG_MDS_EXPORTDIRNOTIFY
:
152 handle_export_notify(static_cast<MExportDirNotify
*>(m
));
156 case MSG_MDS_EXPORTCAPS
:
157 handle_export_caps(static_cast<MExportCaps
*>(m
));
159 case MSG_MDS_GATHERCAPS
:
160 handle_gather_caps(static_cast<MGatherCaps
*>(m
));
// Unknown message type: log and abort.
164 derr
<< "migrator unknown message " << m
->get_type() << dendl
;
165 assert(0 == "migrator unknown message");
// C_MDC_EmptyImport: callback that retries exporting an empty imported
// subtree ('dir') back toward its authority once it becomes possible.
// NOTE(review): extraction dropped the 'CDir *dir;' member declaration and
// closing braces; text kept verbatim below.
170 class C_MDC_EmptyImport
: public MigratorContext
{
173 C_MDC_EmptyImport(Migrator
*m
, CDir
*d
) : MigratorContext(m
), dir(d
) {}
174 void finish(int r
) override
{
175 mig
->export_empty_import(dir
);
// Migrator::export_empty_import: if an imported subtree root is now empty
// (and we are not auth of its inode, not frozen, not root), export it back
// to the inode's authority. Each guard below bails out early.
// NOTE(review): extraction dropped the opening '{', the 'return;' inside
// each guard, and closing braces; text kept verbatim below.
180 void Migrator::export_empty_import(CDir
*dir
)
182 dout(7) << "export_empty_import " << *dir
<< dendl
;
183 assert(dir
->is_subtree_root());
185 if (dir
->inode
->is_auth()) {
186 dout(7) << " inode is auth" << dendl
;
189 if (!dir
->is_auth()) {
190 dout(7) << " not auth" << dendl
;
193 if (dir
->is_freezing() || dir
->is_frozen()) {
194 dout(7) << " freezing or frozen" << dendl
;
197 if (dir
->get_num_head_items() > 0) {
198 dout(7) << " not actually empty" << dendl
;
201 if (dir
->inode
->is_root()) {
202 dout(7) << " root" << dendl
;
// Target is the first (auth) rank of the containing inode's authority.
206 mds_rank_t dest
= dir
->inode
->authority().first
;
207 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
209 dout(7) << " really empty, exporting to " << dest
<< dendl
;
210 assert (dest
!= mds
->get_nodeid());
212 dout(7) << "exporting to mds." << dest
213 << " empty import " << *dir
<< dendl
;
214 export_dir( dir
, dest
);
// Migrator::find_stale_export_freeze: periodic scan of export_state for
// exports stuck in DISCOVERING/FREEZING longer than mds_freeze_tree_timeout
// without auth-pin progress; cancels them to break potential distributed
// freeze/authpin deadlocks (scenario described in the comment block below).
// NOTE(review): extraction dropped the opening '{', 'continue;' lines,
// iterator advancement, and closing braces; text kept verbatim below.
217 void Migrator::find_stale_export_freeze()
219 utime_t now
= ceph_clock_now();
220 utime_t cutoff
= now
;
221 cutoff
-= g_conf
->mds_freeze_tree_timeout
;
225 * We could have situations like:
227 * - mds.0 authpins an item in subtree A
228 * - mds.0 sends request to mds.1 to authpin an item in subtree B
229 * - mds.0 freezes subtree A
230 * - mds.1 authpins an item in subtree B
231 * - mds.1 sends request to mds.0 to authpin an item in subtree A
232 * - mds.1 freezes subtree B
233 * - mds.1 receives the remote authpin request from mds.0
234 * (wait because subtree B is freezing)
235 * - mds.0 receives the remote authpin request from mds.1
236 * (wait because subtree A is freezing)
239 * - client request authpins items in subtree B
241 * - import subtree A which is parent of subtree B
242 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
244 * - client request tries authpinning items in subtree A
245 * (wait because subtree A is freezing)
247 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
248 p
!= export_state
.end(); ) {
249 CDir
* dir
= p
->first
;
250 export_state_t
& stat
= p
->second
;
// Only exports still trying to freeze can be stale here.
252 if (stat
.state
!= EXPORT_DISCOVERING
&& stat
.state
!= EXPORT_FREEZING
)
// Auth-pin count changed => progress; reset the staleness timestamp.
254 if (stat
.last_cum_auth_pins
!= dir
->get_cum_auth_pins()) {
255 stat
.last_cum_auth_pins
= dir
->get_cum_auth_pins();
256 stat
.last_cum_auth_pins_change
= now
;
259 if (stat
.last_cum_auth_pins_change
>= cutoff
)
// Stale and either remote waiters exist or the parent is freezing:
// cancel the export.
261 if (stat
.num_remote_waiters
> 0 ||
262 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
263 export_try_cancel(dir
);
// Migrator::export_try_cancel: abort an in-progress export of 'dir',
// rolling back whatever the current export state has done (locks, freeze,
// bound pins, peer notification) and optionally telling the peer
// (notify_peer). States at/after LOGGINGFINISH are left to succeed.
// NOTE(review): extraction dropped many lines (opening '{', the 'switch'
// statement head, several 'case' labels and 'break;'s, 'if (notify_peer &&'
// heads before the is_cluster_degraded() conditions, loop bodies and
// closing braces); text kept verbatim below.
268 void Migrator::export_try_cancel(CDir
*dir
, bool notify_peer
)
270 dout(10) << "export_try_cancel " << *dir
<< dendl
;
272 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
273 assert(it
!= export_state
.end());
// Remember the pre-cancel state; used below to decide cleanup steps.
275 int state
= it
->second
.state
;
278 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl
;
279 it
->second
.state
= EXPORT_CANCELLED
;
280 dir
->auth_unpin(this);
282 case EXPORT_DISCOVERING
:
283 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl
;
284 it
->second
.state
= EXPORT_CANCELLED
;
285 dir
->unfreeze_tree(); // cancel the freeze
286 dir
->auth_unpin(this);
288 (!mds
->is_cluster_degraded() ||
289 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
290 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
293 case EXPORT_FREEZING
:
294 dout(10) << "export state=freezing : canceling freeze" << dendl
;
295 it
->second
.state
= EXPORT_CANCELLED
;
296 dir
->unfreeze_tree(); // cancel the freeze
297 if (dir
->is_subtree_root())
298 cache
->try_subtree_merge(dir
);
300 (!mds
->is_cluster_degraded() ||
301 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
302 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
305 // NOTE: state order reversal, warning comes after prepping
307 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl
;
308 it
->second
.state
= EXPORT_CANCELLING
;
311 case EXPORT_PREPPING
:
312 if (state
!= EXPORT_WARNING
) {
313 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl
;
314 it
->second
.state
= EXPORT_CANCELLED
;
// Unpin all subtree bound dirfrags that were pinned for the export.
320 cache
->get_subtree_bounds(dir
, bounds
);
321 for (set
<CDir
*>::iterator q
= bounds
.begin();
325 bd
->put(CDir::PIN_EXPORTBOUND
);
326 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
// If bystanders were already warned, notify them of the abort and
// replay any delayed cache expires.
328 if (state
== EXPORT_WARNING
) {
330 export_notify_abort(dir
, it
->second
, bounds
);
331 // process delayed expires
332 cache
->process_delayed_expire(dir
);
335 dir
->unfreeze_tree();
336 cache
->try_subtree_merge(dir
);
337 for (auto bd
: it
->second
.residual_dirs
) {
339 cache
->try_subtree_merge(bd
);
342 (!mds
->is_cluster_degraded() ||
343 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
344 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
347 case EXPORT_EXPORTING
:
348 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl
;
349 it
->second
.state
= EXPORT_CANCELLING
;
350 export_reverse(dir
, it
->second
);
353 case EXPORT_LOGGINGFINISH
:
354 case EXPORT_NOTIFYING
:
355 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl
;
356 // leave export_state, don't clean up now.
358 case EXPORT_CANCELLING
:
// Post-switch cleanup: take over the mutation, drop the export_state
// entry once fully cancelled, and finish/drop the internal request.
366 if (it
->second
.state
== EXPORT_CANCELLING
||
367 it
->second
.state
== EXPORT_CANCELLED
) {
369 mut
.swap(it
->second
.mut
);
371 if (it
->second
.state
== EXPORT_CANCELLED
) {
372 export_state
.erase(it
);
373 dir
->state_clear(CDir::STATE_EXPORTING
);
374 // send pending import_maps?
375 cache
->maybe_send_pending_resolves();
379 if (state
== EXPORT_LOCKING
|| state
== EXPORT_DISCOVERING
) {
380 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(mut
.get());
382 if (mdr
->more()->waiting_on_slave
.empty())
383 mds
->mdcache
->request_finish(mdr
);
385 mds
->locker
->drop_locks(mut
.get());
389 cache
->show_subtrees();
391 maybe_do_queued_export();
// Migrator::export_cancel_finish: final cleanup after a cancelled export's
// bystander notify-acks have all arrived: clear the EXPORTING flag, drop
// the auth_pin taken by export_notify_abort(), and flush pending resolves.
// NOTE(review): extraction dropped the opening/closing braces; text kept
// verbatim below.
395 void Migrator::export_cancel_finish(CDir
*dir
)
397 assert(dir
->state_test(CDir::STATE_EXPORTING
));
398 dir
->state_clear(CDir::STATE_EXPORTING
);
400 // pinned by Migrator::export_notify_abort()
401 dir
->auth_unpin(this);
402 // send pending import_maps? (these need to go out when all exports have finished.)
403 cache
->maybe_send_pending_resolves();
406 // ==========================================================
407 // mds failure handling
// Migrator::handle_mds_failure_or_stop: react to rank 'who' failing or
// stopping. For exports: cancel any export going to 'who' (or still in a
// pre-frozen state), and for exports to other peers fake the failed rank's
// warning/notify acks so the export can progress. For imports: roll back
// per-state (discovering/discovered/prepping/prepped/loggingstart/acking/
// finishing/aborting), and for bystander imports fake notify-acks.
// Temporary auth_pins are taken on freezing dirs up front so cancelling a
// nested freeze cannot complete an enclosing one prematurely.
// NOTE(review): extraction dropped many lines (opening braces, iterator
// increments like 'next = p; ++next;', several 'break;'s, dout message
// tails '<< dendl;', case labels, and closing braces); text kept verbatim.
409 void Migrator::handle_mds_failure_or_stop(mds_rank_t who
)
411 dout(5) << "handle_mds_failure_or_stop mds." << who
<< dendl
;
415 // first add an extra auth_pin on any freezes, so that canceling a
416 // nested freeze doesn't complete one further up the hierarchy and
417 // confuse the shit out of us. we'll remove it after canceling the
418 // freeze. this way no freeze completions run before we want them
420 list
<CDir
*> pinned_dirs
;
421 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
422 p
!= export_state
.end();
424 if (p
->second
.state
== EXPORT_FREEZING
) {
425 CDir
*dir
= p
->first
;
426 dout(10) << "adding temp auth_pin on freezing " << *dir
<< dendl
;
428 pinned_dirs
.push_back(dir
);
// Walk exports; export_try_cancel() may erase entries, hence the
// 'next' iterator pattern.
432 map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
433 while (p
!= export_state
.end()) {
434 map
<CDir
*,export_state_t
>::iterator next
= p
;
436 CDir
*dir
= p
->first
;
439 // - that are going to the failed node
440 // - that aren't frozen yet (to avoid auth_pin deadlock)
441 // - they havne't prepped yet (they may need to discover bounds to do that)
442 if ((p
->second
.peer
== who
&&
443 p
->second
.state
!= EXPORT_CANCELLING
) ||
444 p
->second
.state
== EXPORT_LOCKING
||
445 p
->second
.state
== EXPORT_DISCOVERING
||
446 p
->second
.state
== EXPORT_FREEZING
||
447 p
->second
.state
== EXPORT_PREPPING
) {
448 // the guy i'm exporting to failed, or we're just freezing.
449 dout(10) << "cleaning up export state (" << p
->second
.state
<< ")"
450 << get_export_statename(p
->second
.state
) << " of " << *dir
<< dendl
;
451 export_try_cancel(dir
);
452 } else if (p
->second
.peer
!= who
) {
// Peer is fine; 'who' was a bystander -- fake its pending acks.
454 if (p
->second
.warning_ack_waiting
.erase(who
)) {
455 if (p
->second
.state
== EXPORT_WARNING
) {
456 p
->second
.notify_ack_waiting
.erase(who
); // they won't get a notify either.
457 // exporter waiting for warning acks, let's fake theirs.
458 dout(10) << "faking export_warning_ack from mds." << who
459 << " on " << *dir
<< " to mds." << p
->second
.peer
461 if (p
->second
.warning_ack_waiting
.empty())
465 if (p
->second
.notify_ack_waiting
.erase(who
)) {
466 // exporter is waiting for notify acks, fake it
467 dout(10) << "faking export_notify_ack from mds." << who
468 << " on " << *dir
<< " to mds." << p
->second
.peer
470 if (p
->second
.state
== EXPORT_NOTIFYING
) {
471 if (p
->second
.notify_ack_waiting
.empty())
473 } else if (p
->second
.state
== EXPORT_CANCELLING
) {
474 if (p
->second
.notify_ack_waiting
.empty()) {
475 export_state
.erase(p
);
476 export_cancel_finish(dir
);
// Now walk imports with the same erase-safe iteration pattern.
488 map
<dirfrag_t
,import_state_t
>::iterator q
= import_state
.begin();
489 while (q
!= import_state
.end()) {
490 map
<dirfrag_t
,import_state_t
>::iterator next
= q
;
492 dirfrag_t df
= q
->first
;
493 CInode
*diri
= mds
->mdcache
->get_inode(df
.ino
);
494 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
496 if (q
->second
.peer
== who
) {
498 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
499 << get_import_statename(q
->second
.state
) << " of " << *dir
<< dendl
;
501 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
502 << get_import_statename(q
->second
.state
) << " of " << df
<< dendl
;
504 switch (q
->second
.state
) {
505 case IMPORT_DISCOVERING
:
506 dout(10) << "import state=discovering : clearing state" << dendl
;
507 import_reverse_discovering(df
);
510 case IMPORT_DISCOVERED
:
512 dout(10) << "import state=discovered : unpinning inode " << *diri
<< dendl
;
513 import_reverse_discovered(df
, diri
);
516 case IMPORT_PREPPING
:
518 dout(10) << "import state=prepping : unpinning base+bounds " << *dir
<< dendl
;
519 import_reverse_prepping(dir
, q
->second
);
524 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir
<< dendl
;
527 cache
->get_subtree_bounds(dir
, bounds
);
528 import_remove_pins(dir
, bounds
);
530 // adjust auth back to the exporter
531 cache
->adjust_subtree_auth(dir
, q
->second
.peer
);
533 // notify bystanders ; wait in aborting state
534 q
->second
.state
= IMPORT_ABORTING
;
535 import_notify_abort(dir
, bounds
);
536 assert(g_conf
->mds_kill_import_at
!= 10);
540 case IMPORT_LOGGINGSTART
:
542 dout(10) << "import state=loggingstart : reversing import on " << *dir
<< dendl
;
548 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
549 dout(10) << "import state=acking : noting ambiguous import " << *dir
<< dendl
;
552 cache
->get_subtree_bounds(dir
, bounds
);
553 cache
->add_ambiguous_import(dir
, bounds
);
557 case IMPORT_FINISHING
:
559 dout(10) << "import state=finishing : finishing import on " << *dir
<< dendl
;
560 import_finish(dir
, true);
563 case IMPORT_ABORTING
:
565 dout(10) << "import state=aborting : ignoring repeat failure " << *dir
<< dendl
;
// 'who' was a bystander of this import: drop it and, if we were
// aborting, fake its notify-ack; unfreeze once all acks are in.
569 auto bystanders_entry
= q
->second
.bystanders
.find(who
);
570 if (bystanders_entry
!= q
->second
.bystanders
.end()) {
571 q
->second
.bystanders
.erase(bystanders_entry
);
572 if (q
->second
.state
== IMPORT_ABORTING
) {
574 dout(10) << "faking export_notify_ack from mds." << who
575 << " on aborting import " << *dir
<< " from mds." << q
->second
.peer
577 if (q
->second
.bystanders
.empty())
578 import_reverse_unfreeze(dir
);
// Finally, drop the temporary auth_pins taken at the top.
587 while (!pinned_dirs
.empty()) {
588 CDir
*dir
= pinned_dirs
.front();
589 dout(10) << "removing temp auth_pin on " << *dir
<< dendl
;
590 dir
->auth_unpin(this);
591 pinned_dirs
.pop_front();
// Migrator::show_importing: debug dump (dout 10) of every in-flight import:
// source peer, state, dirfrag, and the CDir if it is in cache. The second
// dout form is presumably the else-branch for a null 'dir' -- the 'if/else'
// lines were dropped by the extraction; TODO confirm against full source.
597 void Migrator::show_importing()
599 dout(10) << "show_importing" << dendl
;
600 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
601 p
!= import_state
.end();
603 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
605 dout(10) << " importing from " << p
->second
.peer
606 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
607 << " " << p
->first
<< " " << *dir
<< dendl
;
609 dout(10) << " importing from " << p
->second
.peer
610 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
611 << " " << p
->first
<< dendl
;
// Migrator::show_exporting: debug dump (dout 10) of every in-flight export:
// destination peer, state, dirfrag and CDir.
// NOTE(review): extraction dropped the opening '{', loop increment and
// closing braces; text kept verbatim below.
616 void Migrator::show_exporting()
618 dout(10) << "show_exporting" << dendl
;
619 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
620 p
!= export_state
.end();
622 dout(10) << " exporting to " << p
->second
.peer
623 << ": (" << p
->second
.state
<< ") " << get_export_statename(p
->second
.state
)
624 << " " << p
->first
->dirfrag() << " " << *p
->first
<< dendl
;
// Migrator::audit: debug-only consistency check (skipped unless the mds
// subsystem gathers at level 5). Verifies dir_auth invariants for every
// import and export: early states have no ambiguity expectations, ABORTING
// imports must not be ambiguous and must not list us as auth, while
// mid-flight states must be ambiguous with us as one of the two auths.
// NOTE(review): extraction dropped opening braces, 'continue;' lines, loop
// increments, some asserts and closing braces; text kept verbatim below.
629 void Migrator::audit()
631 if (!g_conf
->subsys
.should_gather(ceph_subsys_mds
, 5))
636 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
637 p
!= import_state
.end();
639 if (p
->second
.state
== IMPORT_DISCOVERING
)
641 if (p
->second
.state
== IMPORT_DISCOVERED
) {
642 CInode
*in
= cache
->get_inode(p
->first
.ino
);
646 CDir
*dir
= cache
->get_dirfrag(p
->first
);
648 if (p
->second
.state
== IMPORT_PREPPING
)
650 if (p
->second
.state
== IMPORT_ABORTING
) {
651 assert(!dir
->is_ambiguous_dir_auth());
652 assert(dir
->get_dir_auth().first
!= mds
->get_nodeid());
655 assert(dir
->is_ambiguous_dir_auth());
656 assert(dir
->authority().first
== mds
->get_nodeid() ||
657 dir
->authority().second
== mds
->get_nodeid());
662 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
663 p
!= export_state
.end();
665 CDir
*dir
= p
->first
;
666 if (p
->second
.state
== EXPORT_LOCKING
||
667 p
->second
.state
== EXPORT_DISCOVERING
||
668 p
->second
.state
== EXPORT_FREEZING
||
669 p
->second
.state
== EXPORT_CANCELLING
)
671 assert(dir
->is_ambiguous_dir_auth());
672 assert(dir
->authority().first
== mds
->get_nodeid() ||
673 dir
->authority().second
== mds
->get_nodeid());
676 // ambiguous+me subtrees should be importing|exporting
685 // ==========================================================
// Migrator::export_dir_nicely: queue (dirfrag, dest) for a rate-limited
// export and kick the queue; maybe_do_queued_export() enforces the cap on
// concurrent exports.
// NOTE(review): extraction dropped the opening/closing braces; text kept
// verbatim below.
688 void Migrator::export_dir_nicely(CDir
*dir
, mds_rank_t dest
)
691 dout(7) << "export_dir_nicely " << *dir
<< " to " << dest
<< dendl
;
692 export_queue
.push_back(pair
<dirfrag_t
,mds_rank_t
>(dir
->dirfrag(), dest
));
694 maybe_do_queued_export();
// Migrator::maybe_do_queued_export: drain the "nicely" export queue while
// fewer than the cap (here, <= 4) of exports are in flight; skips dirfrags
// that are gone from cache or no longer auth.
// NOTE(review): extraction dropped the opening '{', the null-check on
// 'dir' ('if (!dir) continue;') and closing braces; text kept verbatim.
697 void Migrator::maybe_do_queued_export()
703 while (!export_queue
.empty() &&
704 export_state
.size() <= 4) {
705 dirfrag_t df
= export_queue
.front().first
;
706 mds_rank_t dest
= export_queue
.front().second
;
707 export_queue
.pop_front();
709 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
711 if (!dir
->is_auth()) continue;
713 dout(0) << "nicely exporting to mds." << dest
<< " " << *dir
<< dendl
;
715 export_dir(dir
, dest
);
// C_MDC_ExportFreeze: waiter fired when the export subtree finishes
// freezing; forwards to Migrator::export_frozen(ex, tid).
// NOTE(review): extraction dropped the 'uint64_t tid;' member, the body of
// the constructor, and closing braces; text kept verbatim below.
723 class C_MDC_ExportFreeze
: public MigratorContext
{
724 CDir
*ex
; // dir i'm exporting
727 C_MDC_ExportFreeze(Migrator
*m
, CDir
*e
, uint64_t t
) :
728 MigratorContext(m
), ex(e
), tid(t
) {
731 void finish(int r
) override
{
733 mig
->export_frozen(ex
, tid
);
// Migrator::get_export_lock_set: collect the rdlocks needed for exporting
// 'dir': the dentry locks on the path from the root to dir's inode, the
// inode's dirfragtreelock (scatter/gather race guard), and the
// dirfragtreelock of every would-be bound's inode (rationale in the NOTE
// comment below).
// NOTE(review): extraction dropped the opening '{', the path-trace loop's
// end condition/increment, and closing braces; text kept verbatim below.
738 void Migrator::get_export_lock_set(CDir
*dir
, set
<SimpleLock
*>& locks
)
741 vector
<CDentry
*> trace
;
742 cache
->make_trace(trace
, dir
->inode
);
743 for (vector
<CDentry
*>::iterator it
= trace
.begin();
746 locks
.insert(&(*it
)->lock
);
748 // prevent scatter gather race
749 locks
.insert(&dir
->get_inode()->dirfragtreelock
);
752 // NOTE: We need to take an rdlock on bounding dirfrags during
753 // migration for a rather irritating reason: when we export the
754 // bound inode, we need to send scatterlock state for the dirfrags
755 // as well, so that the new auth also gets the correct info. If we
756 // race with a refragment, this info is useless, as we can't
757 // redivvy it up. And it's needed for the scatterlocks to work
758 // properly: when the auth is in a sync/lock state it keeps each
759 // dirfrag's portion in the local (auth OR replica) dirfrag.
760 set
<CDir
*> wouldbe_bounds
;
761 cache
->get_wouldbe_subtree_bounds(dir
, wouldbe_bounds
);
762 for (set
<CDir
*>::iterator p
= wouldbe_bounds
.begin(); p
!= wouldbe_bounds
.end(); ++p
)
763 locks
.insert(&(*p
)->get_inode()->dirfragtreelock
);
// C_M_ExportDirWait: retry callback that re-enters dispatch_export_dir()
// with the saved request and retry count (used while waiting on MDSMap
// epochs or dir stabilization).
// NOTE(review): extraction dropped the member declarations (mdr, count)
// and closing braces; text kept verbatim below.
767 class C_M_ExportDirWait
: public MigratorContext
{
771 C_M_ExportDirWait(Migrator
*m
, MDRequestRef mdr
, int count
)
772 : MigratorContext(m
), mdr(mdr
), count(count
) {}
773 void finish(int r
) override
{
774 mig
->dispatch_export_dir(mdr
, count
);
// NOTE(review): extraction dropped the opening '{', the 'return;' in each
// guard, parts of the thrash-exports block (dentry fetch, list 'ls',
// picking 'bd'), stat.peer assignment, and closing braces; text verbatim.
779 /** export_dir(dir, dest)
780 * public method to initiate an export.
781 * will fail if the directory is freezing, frozen, unpinnable, or root.
783 void Migrator::export_dir(CDir
*dir
, mds_rank_t dest
)
785 dout(7) << "export_dir " << *dir
<< " to " << dest
<< dendl
;
786 assert(dir
->is_auth());
787 assert(dest
!= mds
->get_nodeid());
// Guards: refuse while not active/stopping, read-only, dest inactive,
// cluster degraded, system dirs, stray dirs (unless going to the
// owner's mdsdir), export-pinned, freezing/frozen, or already exporting.
789 if (!(mds
->is_active() || mds
->is_stopping())) {
790 dout(7) << "i'm not active, no exports for now" << dendl
;
793 if (mds
->mdcache
->is_readonly()) {
794 dout(7) << "read-only FS, no exports for now" << dendl
;
797 if (!mds
->mdsmap
->is_active(dest
)) {
798 dout(7) << "dest not active, no exports for now" << dendl
;
801 if (mds
->is_cluster_degraded()) {
802 dout(7) << "cluster degraded, no exports for now" << dendl
;
805 if (dir
->inode
->is_system()) {
806 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl
;
811 CDir
* parent_dir
= dir
->inode
->get_projected_parent_dir();
812 if (parent_dir
&& parent_dir
->inode
->is_stray()) {
813 if (parent_dir
->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest
)) {
814 dout(7) << "i won't export anything in stray" << dendl
;
818 if (!mds
->is_stopping() && !dir
->inode
->is_exportable(dest
)) {
819 dout(7) << "dir is export pinned" << dendl
;
824 if (dir
->is_frozen() ||
825 dir
->is_freezing()) {
826 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl
;
829 if (dir
->state_test(CDir::STATE_EXPORTING
)) {
830 dout(7) << "already exporting" << dendl
;
// Test-only thrashing: carve out a random aux subtree bound under 'dir'
// so it is NOT exported along with the rest.
834 if (g_conf
->mds_thrash_exports
) {
835 // create random subtree bound (which will not be exported)
837 for (auto p
= dir
->begin(); p
!= dir
->end(); ++p
) {
839 CDentry::linkage_t
*dnl
= dn
->get_linkage();
840 if (dnl
->is_primary()) {
841 CInode
*in
= dnl
->get_inode();
843 in
->get_nested_dirfrags(ls
);
847 int n
= rand() % ls
.size();
851 if (!(bd
->is_frozen() || bd
->is_freezing())) {
852 assert(bd
->is_auth());
853 dir
->state_set(CDir::STATE_AUXSUBTREE
);
854 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
855 dout(0) << "export_dir: create aux subtree " << *bd
<< " under " << *dir
<< dendl
;
// Record an export attempt against the target, mark the dir, and
// start the internal EXPORTDIR request in state EXPORT_LOCKING.
860 mds
->hit_export_target(ceph_clock_now(), dest
, -1);
863 dir
->state_set(CDir::STATE_EXPORTING
);
865 MDRequestRef mdr
= mds
->mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
866 mdr
->more()->export_dir
= dir
;
868 assert(export_state
.count(dir
) == 0);
869 export_state_t
& stat
= export_state
[dir
];
870 stat
.state
= EXPORT_LOCKING
;
872 stat
.tid
= mdr
->reqid
.tid
;
875 return mds
->mdcache
->dispatch_request(mdr
);
// Migrator::dispatch_export_dir: EXPORT_LOCKING phase. Validates the
// export is still pending, waits (with retry) for the dest to become an
// export target and for the dir to be stable, acquires the export lock
// set (path rdlocks plus filelock/nestlock wrlocks on the base inode),
// then sends MExportDirDiscover, moves to EXPORT_DISCOVERING, and starts
// freezing the tree held up by an auth_pin until the discover-ack.
// NOTE(review): extraction dropped many lines (opening '{', 'return;'s,
// the retry-count check before the three-epoch cancel, the 'path'
// declaration, the discover message's trailing ctor args, the freeze_tree
// + auth_pin calls, and closing braces); text kept verbatim below.
878 void Migrator::dispatch_export_dir(MDRequestRef
& mdr
, int count
)
880 dout(7) << "dispatch_export_dir " << *mdr
<< dendl
;
882 CDir
*dir
= mdr
->more()->export_dir
;
883 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
884 if (it
== export_state
.end() || it
->second
.tid
!= mdr
->reqid
.tid
) {
885 // export must have aborted.
886 dout(7) << "export must have aborted " << *mdr
<< dendl
;
887 mds
->mdcache
->request_finish(mdr
);
890 assert(it
->second
.state
== EXPORT_LOCKING
);
892 mds_rank_t dest
= it
->second
.peer
;
894 if (!mds
->is_export_target(dest
)) {
895 dout(7) << "dest is not yet an export target" << dendl
;
897 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl
;
898 export_try_cancel(dir
);
902 mds
->locker
->drop_locks(mdr
.get());
903 mdr
->drop_local_auth_pins();
// Wait for the next MDSMap epoch and retry with an incremented count.
905 mds
->wait_for_mdsmap(mds
->mdsmap
->get_epoch(), new C_M_ExportDirWait(this, mdr
, count
+1));
909 if (!dir
->inode
->get_parent_dn()) {
910 dout(7) << "waiting for dir to become stable before export: " << *dir
<< dendl
;
911 dir
->add_waiter(CDir::WAIT_CREATED
, new C_M_ExportDirWait(this, mdr
, 1));
915 if (mdr
->aborted
|| dir
->is_frozen() || dir
->is_freezing()) {
916 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl
;
917 export_try_cancel(dir
);
922 set
<SimpleLock
*> rdlocks
;
923 set
<SimpleLock
*> xlocks
;
924 set
<SimpleLock
*> wrlocks
;
925 get_export_lock_set(dir
, rdlocks
);
926 // If auth MDS of the subtree root inode is neither the exporter MDS
927 // nor the importer MDS and it gathers subtree root's fragstat/neststat
928 // while the subtree is exporting. It's possible that the exporter MDS
929 // and the importer MDS both are auth MDS of the subtree root or both
930 // are not auth MDS of the subtree root at the time they receive the
931 // lock messages. So the auth MDS of the subtree root inode may get no
932 // or duplicated fragstat/neststat for the subtree root dirfrag.
933 wrlocks
.insert(&dir
->get_inode()->filelock
);
934 wrlocks
.insert(&dir
->get_inode()->nestlock
);
935 if (dir
->get_inode()->is_auth()) {
936 dir
->get_inode()->filelock
.set_scatter_wanted();
937 dir
->get_inode()->nestlock
.set_scatter_wanted();
940 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
, NULL
, NULL
, true)) {
942 export_try_cancel(dir
);
946 assert(g_conf
->mds_kill_export_at
!= 1);
947 it
->second
.state
= EXPORT_DISCOVERING
;
949 // send ExportDirDiscover (ask target)
951 dir
->inode
->make_path(path
);
952 MExportDirDiscover
*discover
= new MExportDirDiscover(dir
->dirfrag(), path
,
955 mds
->send_message_mds(discover
, dest
);
956 assert(g_conf
->mds_kill_export_at
!= 2);
958 it
->second
.last_cum_auth_pins_change
= ceph_clock_now();
960 // start the freeze, but hold it up with an auth_pin.
962 assert(dir
->is_freezing_tree());
963 dir
->add_waiter(CDir::WAIT_FROZEN
, new C_MDC_ExportFreeze(this, dir
, it
->second
.tid
));
// NOTE(review): extraction dropped the opening '{', the 'return;' in the
// stale-message branch, the else-branch head before the failure path, the
// message put(), and closing braces; text kept verbatim below.
967 * called on receipt of MExportDirDiscoverAck
968 * the importer now has the directory's _inode_ in memory, and pinned.
970 * This function DOES put the passed message before returning
972 void Migrator::handle_export_discover_ack(MExportDirDiscoverAck
*m
)
974 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
975 mds_rank_t
dest(m
->get_source().num());
976 utime_t now
= ceph_clock_now();
979 dout(7) << "export_discover_ack from " << m
->get_source()
980 << " on " << *dir
<< dendl
;
982 mds
->hit_export_target(now
, dest
, -1);
// Ignore acks that don't match a live export (wrong tid/peer => aborted).
984 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
985 if (it
== export_state
.end() ||
986 it
->second
.tid
!= m
->get_tid() ||
987 it
->second
.peer
!= dest
) {
988 dout(7) << "must have aborted" << dendl
;
990 assert(it
->second
.state
== EXPORT_DISCOVERING
);
992 if (m
->is_success()) {
993 // release locks to avoid deadlock
994 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(it
->second
.mut
.get());
996 mds
->mdcache
->request_finish(mdr
);
997 it
->second
.mut
.reset();
998 // freeze the subtree
999 it
->second
.state
= EXPORT_FREEZING
;
1000 dir
->auth_unpin(this);
1001 assert(g_conf
->mds_kill_export_at
!= 3);
1004 dout(7) << "peer failed to discover (not active?), canceling" << dendl
;
// notify_peer=false: the peer already knows (it rejected the discover).
1005 export_try_cancel(dir
, false);
// C_M_ExportSessionsFlushed: gather-finisher fired once all client
// sessions touched by the export have been flushed; forwards to
// Migrator::export_sessions_flushed(dir, tid).
// NOTE(review): extraction dropped member declarations (dir, tid) and
// closing braces; text kept verbatim below.
1012 class C_M_ExportSessionsFlushed
: public MigratorContext
{
1016 C_M_ExportSessionsFlushed(Migrator
*m
, CDir
*d
, uint64_t t
)
1017 : MigratorContext(m
), dir(d
), tid(t
) {
1018 assert(dir
!= NULL
);
1020 void finish(int r
) override
{
1021 mig
->export_sessions_flushed(dir
, tid
);
// Migrator::export_sessions_flushed: clear the MDS_RANK_NONE placeholder
// from warning_ack_waiting once client sessions are flushed; if we were in
// EXPORT_WARNING and that was the last outstanding ack, start the export.
// NOTE(review): extraction dropped the opening '{', the 'return;' in the
// aborted branch, and closing braces; text kept verbatim below.
1025 void Migrator::export_sessions_flushed(CDir
*dir
, uint64_t tid
)
1027 dout(7) << "export_sessions_flushed " << *dir
<< dendl
;
1029 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1030 if (it
== export_state
.end() ||
1031 it
->second
.state
== EXPORT_CANCELLING
||
1032 it
->second
.tid
!= tid
) {
1033 // export must have aborted.
1034 dout(7) << "export must have aborted on " << dir
<< dendl
;
1038 assert(it
->second
.state
== EXPORT_PREPPING
|| it
->second
.state
== EXPORT_WARNING
);
1039 assert(it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0);
1040 it
->second
.warning_ack_waiting
.erase(MDS_RANK_NONE
);
1041 if (it
->second
.state
== EXPORT_WARNING
&& it
->second
.warning_ack_waiting
.empty())
1042 export_go(dir
); // start export.
// Migrator::export_frozen: runs when the subtree is fully frozen. Tries to
// take the full export lock set without blocking; on failure the export is
// torn down and the peer sent MExportDirCancel. On success it records the
// locks in a mutation, sizes the export (check_export_size), then builds
// and sends MExportDirPrep containing bystanders, the base dirfrag replica,
// the bounds, and a spanning-tree trace (dirfrag/dentry/inode replicas) for
// each bound; state moves to EXPORT_PREPPING. Finally it flushes any client
// sessions touched, tracked via an MDS_RANK_NONE warning-ack placeholder.
// NOTE(review): extraction dropped many lines (opening '{', 'return;'s,
// the 'bounds'/'bl'/'tracebl'/'cur'/'bound'/'start' declarations, loop
// heads and increments, and closing braces); text kept verbatim below.
1045 void Migrator::export_frozen(CDir
*dir
, uint64_t tid
)
1047 dout(7) << "export_frozen on " << *dir
<< dendl
;
1049 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1050 if (it
== export_state
.end() || it
->second
.tid
!= tid
) {
1051 dout(7) << "export must have aborted" << dendl
;
1055 assert(it
->second
.state
== EXPORT_FREEZING
);
1056 assert(dir
->is_frozen_tree_root());
1057 assert(dir
->get_cum_auth_pins() == 0);
1059 CInode
*diri
= dir
->get_inode();
1061 // ok, try to grab all my locks.
1062 set
<SimpleLock
*> rdlocks
;
1063 get_export_lock_set(dir
, rdlocks
);
1064 if ((diri
->is_auth() && diri
->is_frozen()) ||
1065 !mds
->locker
->can_rdlock_set(rdlocks
) ||
1066 !diri
->filelock
.can_wrlock(-1) ||
1067 !diri
->nestlock
.can_wrlock(-1)) {
// Couldn't get locks without blocking: unwind the freeze, cancel on
// the peer, and drop the export entirely.
1068 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1071 dir
->unfreeze_tree();
1072 cache
->try_subtree_merge(dir
);
1074 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
1075 export_state
.erase(it
);
1077 dir
->state_clear(CDir::STATE_EXPORTING
);
1078 cache
->maybe_send_pending_resolves();
// Locks available: take them all under a fresh mutation.
1082 it
->second
.mut
= new MutationImpl();
1083 if (diri
->is_auth())
1084 it
->second
.mut
->auth_pin(diri
);
1085 mds
->locker
->rdlock_take_set(rdlocks
, it
->second
.mut
);
1086 mds
->locker
->wrlock_force(&diri
->filelock
, it
->second
.mut
);
1087 mds
->locker
->wrlock_force(&diri
->nestlock
, it
->second
.mut
);
1089 cache
->show_subtrees();
1091 // CDir::_freeze_tree() should have forced it into subtree.
1092 assert(dir
->get_dir_auth() == mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
1094 set
<client_t
> export_client_set
;
1095 check_export_size(dir
, it
->second
, export_client_set
);
1099 cache
->get_subtree_bounds(dir
, bounds
);
1101 // generate prep message, log entry.
1102 MExportDirPrep
*prep
= new MExportDirPrep(dir
->dirfrag(), it
->second
.tid
);
1104 // include list of bystanders
1105 for (const auto &p
: dir
->get_replicas()) {
1106 if (p
.first
!= it
->second
.peer
) {
1107 dout(10) << "bystander mds." << p
.first
<< dendl
;
1108 prep
->add_bystander(p
.first
);
1112 // include base dirfrag
1113 cache
->replicate_dir(dir
, it
->second
.peer
, prep
->basedir
);
1116 * include spanning tree for all nested exports.
1117 * these need to be on the destination _before_ the final export so that
1118 * dir_auth updates on any nested exports are properly absorbed.
1119 * this includes inodes and dirfrags included in the subtree, but
1120 * only the inodes at the bounds.
1122 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1124 set
<inodeno_t
> inodes_added
;
1125 set
<dirfrag_t
> dirfrags_added
;
1128 for (set
<CDir
*>::iterator p
= bounds
.begin();
1134 assert(bound
->state_test(CDir::STATE_EXPORTBOUND
));
1136 dout(7) << " export bound " << *bound
<< dendl
;
1137 prep
->add_bound( bound
->dirfrag() );
1144 if (it
->second
.residual_dirs
.count(bound
)) {
1146 cache
->replicate_dir(bound
, it
->second
.peer
, tracebl
);
1147 dout(7) << " added " << *bound
<< dendl
;
// Walk up from the bound toward the base, prepending dentry+inode
// (and dirfrag) replicas; stop repeating ones already added.
1151 // don't repeat inodes
1152 if (inodes_added
.count(cur
->inode
->ino()))
1154 inodes_added
.insert(cur
->inode
->ino());
1156 // prepend dentry + inode
1157 assert(cur
->inode
->is_auth());
1159 cache
->replicate_dentry(cur
->inode
->parent
, it
->second
.peer
, bl
);
1160 dout(7) << " added " << *cur
->inode
->parent
<< dendl
;
1161 cache
->replicate_inode(cur
->inode
, it
->second
.peer
, bl
,
1162 mds
->mdsmap
->get_up_features());
1163 dout(7) << " added " << *cur
->inode
<< dendl
;
1164 bl
.claim_append(tracebl
);
1167 cur
= cur
->get_parent_dir();
1169 // don't repeat dirfrags
1170 if (dirfrags_added
.count(cur
->dirfrag()) ||
1172 start
= 'd'; // start with dentry
1175 dirfrags_added
.insert(cur
->dirfrag());
1178 cache
->replicate_dir(cur
, it
->second
.peer
, bl
);
1179 dout(7) << " added " << *cur
<< dendl
;
1180 bl
.claim_append(tracebl
);
1183 start
= 'f'; // start with dirfrag
// Encode the finished trace (dirfrag id + start tag + payload).
1185 bufferlist final_bl
;
1186 dirfrag_t df
= cur
->dirfrag();
1187 ::encode(df
, final_bl
);
1188 ::encode(start
, final_bl
);
1189 final_bl
.claim_append(tracebl
);
1190 prep
->add_trace(final_bl
);
1194 it
->second
.state
= EXPORT_PREPPING
;
1195 mds
->send_message_mds(prep
, it
->second
.peer
);
1196 assert (g_conf
->mds_kill_export_at
!= 4);
1198 // make sure any new instantiations of caps are flushed out
1199 assert(it
->second
.warning_ack_waiting
.empty());
1201 MDSGatherBuilder
gather(g_ceph_context
);
1202 mds
->server
->flush_client_sessions(export_client_set
, gather
);
1203 if (gather
.has_subs()) {
1204 it
->second
.warning_ack_waiting
.insert(MDS_RANK_NONE
);
1205 gather
.set_finisher(new C_M_ExportSessionsFlushed(this, dir
, it
->second
.tid
));
1210 void Migrator::check_export_size(CDir
*dir
, export_state_t
& stat
, set
<client_t
>& client_set
)
1212 const unsigned frag_size
= 800;
1213 const unsigned inode_size
= 1000;
1214 const unsigned cap_size
= 80;
1215 const unsigned link_size
= 10;
1216 const unsigned null_size
= 1;
1218 uint64_t max_size
= g_conf
->get_val
<uint64_t>("mds_max_export_size");
1219 uint64_t approx_size
= 0;
1223 while (!dfs
.empty()) {
1224 CDir
*dir
= dfs
.front();
1227 approx_size
+= frag_size
;
1228 for (auto &p
: *dir
) {
1229 CDentry
*dn
= p
.second
;
1230 if (dn
->get_linkage()->is_null()) {
1231 approx_size
+= null_size
;
1234 if (dn
->get_linkage()->is_remote()) {
1235 approx_size
+= link_size
;
1239 approx_size
+= inode_size
;
1240 CInode
*in
= dn
->get_linkage()->get_inode();
1244 in
->get_dirfrags(ls
);
1246 if (q
->is_subtree_root()) {
1247 q
->state_set(CDir::STATE_EXPORTBOUND
);
1248 q
->get(CDir::PIN_EXPORTBOUND
);
1250 // include nested dirfrag
1251 assert(q
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1256 for (map
<client_t
, Capability
*>::iterator q
= in
->client_caps
.begin();
1257 q
!= in
->client_caps
.end();
1259 approx_size
+= cap_size
;
1260 client_set
.insert(q
->first
);
1264 if (approx_size
>= max_size
)
1268 while (!dfs
.empty()) {
1269 CDir
*dir
= dfs
.front();
1272 dout(7) << "check_export_size: creating bound " << *dir
<< dendl
;
1273 assert(dir
->is_auth());
1274 dir
->state_set(CDir::STATE_EXPORTBOUND
);
1275 dir
->get(CDir::PIN_EXPORTBOUND
);
1277 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
1278 // Another choice here is finishing all WAIT_UNFREEZE contexts and keeping
1279 // the newly created subtree unfreeze.
1280 dir
->_freeze_tree();
1282 stat
.residual_dirs
.insert(dir
);
1286 void Migrator::get_export_client_set(CInode
*in
, set
<client_t
>& client_set
)
1288 for (map
<client_t
, Capability
*>::iterator q
= in
->client_caps
.begin();
1289 q
!= in
->client_caps
.end();
1291 client_set
.insert(q
->first
);
1294 /* This function DOES put the passed message before returning*/
1295 void Migrator::handle_export_prep_ack(MExportDirPrepAck
*m
)
1297 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1298 mds_rank_t
dest(m
->get_source().num());
1299 utime_t now
= ceph_clock_now();
1302 dout(7) << "export_prep_ack " << *dir
<< dendl
;
1304 mds
->hit_export_target(now
, dest
, -1);
1306 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1307 if (it
== export_state
.end() ||
1308 it
->second
.tid
!= m
->get_tid() ||
1309 it
->second
.peer
!= mds_rank_t(m
->get_source().num())) {
1310 // export must have aborted.
1311 dout(7) << "export must have aborted" << dendl
;
1315 assert(it
->second
.state
== EXPORT_PREPPING
);
1317 if (!m
->is_success()) {
1318 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl
;
1319 export_try_cancel(dir
, false);
1324 assert (g_conf
->mds_kill_export_at
!= 5);
1327 cache
->get_subtree_bounds(dir
, bounds
);
1329 assert(it
->second
.warning_ack_waiting
.empty() ||
1330 (it
->second
.warning_ack_waiting
.size() == 1 &&
1331 it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0));
1332 assert(it
->second
.notify_ack_waiting
.empty());
1334 for (const auto &p
: dir
->get_replicas()) {
1335 if (p
.first
== it
->second
.peer
) continue;
1336 if (mds
->is_cluster_degraded() &&
1337 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(p
.first
))
1338 continue; // only if active
1339 it
->second
.warning_ack_waiting
.insert(p
.first
);
1340 it
->second
.notify_ack_waiting
.insert(p
.first
); // we'll eventually get a notifyack, too!
1342 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), it
->second
.tid
, true,
1343 mds_authority_t(mds
->get_nodeid(),CDIR_AUTH_UNKNOWN
),
1344 mds_authority_t(mds
->get_nodeid(),it
->second
.peer
));
1345 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
1346 notify
->get_bounds().push_back((*q
)->dirfrag());
1347 mds
->send_message_mds(notify
, p
.first
);
1351 it
->second
.state
= EXPORT_WARNING
;
1353 assert(g_conf
->mds_kill_export_at
!= 6);
1355 if (it
->second
.warning_ack_waiting
.empty())
1356 export_go(dir
); // start export.
1363 class C_M_ExportGo
: public MigratorContext
{
1367 C_M_ExportGo(Migrator
*m
, CDir
*d
, uint64_t t
) :
1368 MigratorContext(m
), dir(d
), tid(t
) {
1369 assert(dir
!= NULL
);
1371 void finish(int r
) override
{
1372 mig
->export_go_synced(dir
, tid
);
1376 void Migrator::export_go(CDir
*dir
)
1378 auto it
= export_state
.find(dir
);
1379 assert(it
!= export_state
.end());
1380 dout(7) << "export_go " << *dir
<< " to " << it
->second
.peer
<< dendl
;
1382 // first sync log to flush out e.g. any cap imports
1383 mds
->mdlog
->wait_for_safe(new C_M_ExportGo(this, dir
, it
->second
.tid
));
1384 mds
->mdlog
->flush();
1387 void Migrator::export_go_synced(CDir
*dir
, uint64_t tid
)
1389 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1390 if (it
== export_state
.end() ||
1391 it
->second
.state
== EXPORT_CANCELLING
||
1392 it
->second
.tid
!= tid
) {
1393 // export must have aborted.
1394 dout(7) << "export must have aborted on " << dir
<< dendl
;
1397 assert(it
->second
.state
== EXPORT_WARNING
);
1398 mds_rank_t dest
= it
->second
.peer
;
1400 dout(7) << "export_go_synced " << *dir
<< " to " << dest
<< dendl
;
1402 cache
->show_subtrees();
1404 it
->second
.state
= EXPORT_EXPORTING
;
1405 assert(g_conf
->mds_kill_export_at
!= 7);
1407 assert(dir
->is_frozen_tree_root());
1408 assert(dir
->get_cum_auth_pins() == 0);
1410 // set ambiguous auth
1411 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), dest
);
1413 // take away the popularity we're sending.
1414 utime_t now
= ceph_clock_now();
1415 mds
->balancer
->subtract_export(dir
, now
);
1417 // fill export message with cache data
1418 MExportDir
*req
= new MExportDir(dir
->dirfrag(), it
->second
.tid
);
1419 map
<client_t
,entity_inst_t
> exported_client_map
;
1420 uint64_t num_exported_inodes
= encode_export_dir(req
->export_data
,
1421 dir
, // recur start point
1422 exported_client_map
,
1424 ::encode(exported_client_map
, req
->client_map
,
1425 mds
->mdsmap
->get_up_features());
1427 // add bounds to message
1429 cache
->get_subtree_bounds(dir
, bounds
);
1430 for (set
<CDir
*>::iterator p
= bounds
.begin();
1433 req
->add_export((*p
)->dirfrag());
1436 mds
->send_message_mds(req
, dest
);
1437 assert(g_conf
->mds_kill_export_at
!= 8);
1439 mds
->hit_export_target(now
, dest
, num_exported_inodes
+1);
1442 if (mds
->logger
) mds
->logger
->inc(l_mds_exported
);
1443 if (mds
->logger
) mds
->logger
->inc(l_mds_exported_inodes
, num_exported_inodes
);
1445 cache
->show_subtrees();
1449 /** encode_export_inode
1450 * update our local state for this inode to export.
1451 * encode relevant state to be sent over the wire.
1452 * used by: encode_export_dir, file_rename (if foreign)
1454 * FIXME: the separation between CInode.encode_export and these methods
1455 * is pretty arbitrary and dumb.
1457 void Migrator::encode_export_inode(CInode
*in
, bufferlist
& enc_state
,
1458 map
<client_t
,entity_inst_t
>& exported_client_map
)
1460 dout(7) << "encode_export_inode " << *in
<< dendl
;
1461 assert(!in
->is_replica(mds
->get_nodeid()));
1464 if (!in
->is_replicated()) {
1465 in
->replicate_relax_locks();
1466 dout(20) << " did replicate_relax_locks, now " << *in
<< dendl
;
1469 ::encode(in
->inode
.ino
, enc_state
);
1470 ::encode(in
->last
, enc_state
);
1471 in
->encode_export(enc_state
);
1474 encode_export_inode_caps(in
, true, enc_state
, exported_client_map
);
1477 void Migrator::encode_export_inode_caps(CInode
*in
, bool auth_cap
, bufferlist
& bl
,
1478 map
<client_t
,entity_inst_t
>& exported_client_map
)
1480 dout(20) << "encode_export_inode_caps " << *in
<< dendl
;
1483 map
<client_t
,Capability::Export
> cap_map
;
1484 in
->export_client_caps(cap_map
);
1485 ::encode(cap_map
, bl
);
1487 ::encode(in
->get_mds_caps_wanted(), bl
);
1489 in
->state_set(CInode::STATE_EXPORTINGCAPS
);
1490 in
->get(CInode::PIN_EXPORTINGCAPS
);
1493 // make note of clients named by exported capabilities
1494 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1495 it
!= in
->client_caps
.end();
1497 exported_client_map
[it
->first
] = mds
->sessionmap
.get_inst(entity_name_t::CLIENT(it
->first
.v
));
1500 void Migrator::finish_export_inode_caps(CInode
*in
, mds_rank_t peer
,
1501 map
<client_t
,Capability::Import
>& peer_imported
)
1503 dout(20) << "finish_export_inode_caps " << *in
<< dendl
;
1505 in
->state_clear(CInode::STATE_EXPORTINGCAPS
);
1506 in
->put(CInode::PIN_EXPORTINGCAPS
);
1508 // tell (all) clients about migrating caps..
1509 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1510 it
!= in
->client_caps
.end();
1512 Capability
*cap
= it
->second
;
1513 dout(7) << "finish_export_inode_caps telling client." << it
->first
1514 << " exported caps on " << *in
<< dendl
;
1515 MClientCaps
*m
= new MClientCaps(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
1516 cap
->get_cap_id(), cap
->get_mseq(), mds
->get_osd_epoch_barrier());
1518 map
<client_t
,Capability::Import
>::iterator q
= peer_imported
.find(it
->first
);
1519 assert(q
!= peer_imported
.end());
1520 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
1521 (q
->second
.cap_id
> 0 ? peer
: -1), 0);
1522 mds
->send_message_client_counted(m
, it
->first
);
1524 in
->clear_client_caps_after_export();
1525 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
1528 void Migrator::finish_export_inode(CInode
*in
, utime_t now
, mds_rank_t peer
,
1529 map
<client_t
,Capability::Import
>& peer_imported
,
1530 list
<MDSInternalContextBase
*>& finished
)
1532 dout(12) << "finish_export_inode " << *in
<< dendl
;
1538 // clear/unpin cached_by (we're no longer the authority)
1539 in
->clear_replica_map();
1541 // twiddle lock states for auth -> replica transition
1542 in
->authlock
.export_twiddle();
1543 in
->linklock
.export_twiddle();
1544 in
->dirfragtreelock
.export_twiddle();
1545 in
->filelock
.export_twiddle();
1546 in
->nestlock
.export_twiddle();
1547 in
->xattrlock
.export_twiddle();
1548 in
->snaplock
.export_twiddle();
1549 in
->flocklock
.export_twiddle();
1550 in
->policylock
.export_twiddle();
1553 assert(in
->is_auth());
1554 in
->state_clear(CInode::STATE_AUTH
);
1555 in
->replica_nonce
= CInode::EXPORT_NONCE
;
1557 in
->clear_dirty_rstat();
1559 // no more auth subtree? clear scatter dirty
1560 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid()))
1561 in
->clear_scatter_dirty();
1563 in
->item_open_file
.remove_myself();
1565 in
->clear_dirty_parent();
1567 in
->clear_file_locks();
1570 in
->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
1572 in
->finish_export(now
);
1574 finish_export_inode_caps(in
, peer
, peer_imported
);
1577 uint64_t Migrator::encode_export_dir(bufferlist
& exportbl
,
1579 map
<client_t
,entity_inst_t
>& exported_client_map
,
1582 uint64_t num_exported
= 0;
1584 dout(7) << "encode_export_dir " << *dir
<< " " << dir
->get_num_head_items() << " head items" << dendl
;
1586 assert(dir
->get_projected_version() == dir
->get_version());
1588 #ifdef MDS_VERIFY_FRAGSTAT
1589 if (dir
->is_complete())
1590 dir
->verify_fragstat();
1594 dirfrag_t df
= dir
->dirfrag();
1595 ::encode(df
, exportbl
);
1596 dir
->encode_export(exportbl
);
1598 __u32 nden
= dir
->items
.size();
1599 ::encode(nden
, exportbl
);
1602 list
<CDir
*> subdirs
;
1603 for (auto &p
: *dir
) {
1604 CDentry
*dn
= p
.second
;
1605 CInode
*in
= dn
->get_linkage()->get_inode();
1607 if (!dn
->is_replicated())
1608 dn
->lock
.replicate_relax();
1613 dout(7) << "encode_export_dir exporting " << *dn
<< dendl
;
1616 ::encode(dn
->get_name(), exportbl
);
1617 ::encode(dn
->last
, exportbl
);
1620 dn
->encode_export(exportbl
);
1625 if (dn
->get_linkage()->is_null()) {
1626 exportbl
.append("N", 1); // null dentry
1630 if (dn
->get_linkage()->is_remote()) {
1632 exportbl
.append("L", 1); // remote link
1634 inodeno_t ino
= dn
->get_linkage()->get_remote_ino();
1635 unsigned char d_type
= dn
->get_linkage()->get_remote_d_type();
1636 ::encode(ino
, exportbl
);
1637 ::encode(d_type
, exportbl
);
1643 exportbl
.append("I", 1); // inode dentry
1645 encode_export_inode(in
, exportbl
, exported_client_map
); // encode, and (update state for) export
1649 in
->get_dirfrags(dfs
);
1650 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
1652 if (!t
->state_test(CDir::STATE_EXPORTBOUND
)) {
1653 // include nested dirfrag
1654 assert(t
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1655 subdirs
.push_front(t
); // it's ours, recurse (later)
1661 for (auto &dir
: subdirs
)
1662 num_exported
+= encode_export_dir(exportbl
, dir
, exported_client_map
, now
);
1664 return num_exported
;
1667 void Migrator::finish_export_dir(CDir
*dir
, utime_t now
, mds_rank_t peer
,
1668 map
<inodeno_t
,map
<client_t
,Capability::Import
> >& peer_imported
,
1669 list
<MDSInternalContextBase
*>& finished
, int *num_dentries
)
1671 dout(10) << "finish_export_dir " << *dir
<< dendl
;
1674 dir
->clear_replica_map();
1677 assert(dir
->is_auth());
1678 dir
->state_clear(CDir::STATE_AUTH
);
1679 dir
->remove_bloom();
1680 dir
->replica_nonce
= CDir::EXPORT_NONCE
;
1682 if (dir
->is_dirty())
1685 // suck up all waiters
1686 dir
->take_waiting(CDir::WAIT_ANY_MASK
, finished
); // all dir waiters
1689 dir
->finish_export(now
);
1692 list
<CDir
*> subdirs
;
1693 for (auto &p
: *dir
) {
1694 CDentry
*dn
= p
.second
;
1695 CInode
*in
= dn
->get_linkage()->get_inode();
1698 dn
->finish_export();
1701 if (dn
->get_linkage()->is_primary()) {
1702 finish_export_inode(in
, now
, peer
, peer_imported
[in
->ino()], finished
);
1705 in
->get_nested_dirfrags(subdirs
);
1708 cache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
1713 for (list
<CDir
*>::iterator it
= subdirs
.begin(); it
!= subdirs
.end(); ++it
)
1714 finish_export_dir(*it
, now
, peer
, peer_imported
, finished
, num_dentries
);
1717 class C_MDS_ExportFinishLogged
: public MigratorLogContext
{
1720 C_MDS_ExportFinishLogged(Migrator
*m
, CDir
*d
) : MigratorLogContext(m
), dir(d
) {}
1721 void finish(int r
) override
{
1722 mig
->export_logged_finish(dir
);
1728 * i should get an export_ack from the export target.
1730 * This function DOES put the passed message before returning
1732 void Migrator::handle_export_ack(MExportDirAck
*m
)
1734 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1735 mds_rank_t
dest(m
->get_source().num());
1736 utime_t now
= ceph_clock_now();
1738 assert(dir
->is_frozen_tree_root()); // i'm exporting!
1741 dout(7) << "handle_export_ack " << *dir
<< dendl
;
1743 mds
->hit_export_target(now
, dest
, -1);
1745 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1746 assert(it
!= export_state
.end());
1747 assert(it
->second
.state
== EXPORT_EXPORTING
);
1748 assert(it
->second
.tid
== m
->get_tid());
1750 bufferlist::iterator bp
= m
->imported_caps
.begin();
1751 ::decode(it
->second
.peer_imported
, bp
);
1753 it
->second
.state
= EXPORT_LOGGINGFINISH
;
1754 assert (g_conf
->mds_kill_export_at
!= 9);
1756 cache
->get_subtree_bounds(dir
, bounds
);
1759 // include export bounds, to ensure they're in the journal.
1760 EExport
*le
= new EExport(mds
->mdlog
, dir
, it
->second
.peer
);;
1761 mds
->mdlog
->start_entry(le
);
1763 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
1764 le
->metablob
.add_dir(dir
, false);
1765 for (set
<CDir
*>::iterator p
= bounds
.begin();
1769 le
->get_bounds().insert(bound
->dirfrag());
1770 le
->metablob
.add_dir_context(bound
);
1771 le
->metablob
.add_dir(bound
, false);
1774 // list us second, them first.
1775 // this keeps authority().first in sync with subtree auth state in the journal.
1776 cache
->adjust_subtree_auth(dir
, it
->second
.peer
, mds
->get_nodeid());
1778 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1779 mds
->mdlog
->submit_entry(le
, new C_MDS_ExportFinishLogged(this, dir
));
1780 mds
->mdlog
->flush();
1781 assert (g_conf
->mds_kill_export_at
!= 10);
1786 void Migrator::export_notify_abort(CDir
*dir
, export_state_t
& stat
, set
<CDir
*>& bounds
)
1788 dout(7) << "export_notify_abort " << *dir
<< dendl
;
1790 assert(stat
.state
== EXPORT_CANCELLING
);
1792 if (stat
.notify_ack_waiting
.empty()) {
1793 stat
.state
= EXPORT_CANCELLED
;
1797 dir
->auth_pin(this);
1799 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
1800 p
!= stat
.notify_ack_waiting
.end();
1802 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
1803 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
1804 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
1805 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
1806 notify
->get_bounds().push_back((*i
)->dirfrag());
1807 mds
->send_message_mds(notify
, *p
);
1812  * this happens if the dest fails after i send the export data but before it is acked
1813 * that is, we don't know they safely received and logged it, so we reverse our changes
1816 void Migrator::export_reverse(CDir
*dir
, export_state_t
& stat
)
1818 dout(7) << "export_reverse " << *dir
<< dendl
;
1820 set
<CInode
*> to_eval
;
1823 cache
->get_subtree_bounds(dir
, bounds
);
1825 // remove exporting pins
1828 while (!rq
.empty()) {
1829 CDir
*t
= rq
.front();
1832 for (auto &p
: *t
) {
1833 CDentry
*dn
= p
.second
;
1835 if (!dn
->get_linkage()->is_primary())
1837 CInode
*in
= dn
->get_linkage()->get_inode();
1839 if (in
->state_test(CInode::STATE_EVALSTALECAPS
)) {
1840 in
->state_clear(CInode::STATE_EVALSTALECAPS
);
1844 in
->get_nested_dirfrags(rq
);
1849 for (auto bd
: bounds
) {
1850 bd
->put(CDir::PIN_EXPORTBOUND
);
1851 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
1854 // notify bystanders
1855 export_notify_abort(dir
, stat
, bounds
);
1857 // unfreeze tree, with possible subtree merge.
1858 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
1860 // process delayed expires
1861 cache
->process_delayed_expire(dir
);
1863 dir
->unfreeze_tree();
1864 cache
->try_subtree_merge(dir
);
1865 for (auto bd
: stat
.residual_dirs
) {
1866 bd
->unfreeze_tree();
1867 cache
->try_subtree_merge(bd
);
1870 // revoke/resume stale caps
1871 for (auto in
: to_eval
) {
1872 bool need_issue
= false;
1873 for (auto& p
: in
->get_client_caps()) {
1874 Capability
*cap
= p
.second
;
1875 if (cap
->is_stale()) {
1876 mds
->locker
->revoke_stale_caps(cap
);
1882 (!in
->is_auth() || !mds
->locker
->eval(in
, CEPH_CAP_LOCKS
)))
1883 mds
->locker
->issue_caps(in
);
1886 cache
->show_cache();
1891 * once i get the ack, and logged the EExportFinish(true),
1892 * send notifies (if any), otherwise go straight to finish.
1895 void Migrator::export_logged_finish(CDir
*dir
)
1897 dout(7) << "export_logged_finish " << *dir
<< dendl
;
1899 export_state_t
& stat
= export_state
[dir
];
1903 cache
->get_subtree_bounds(dir
, bounds
);
1905 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
1906 p
!= stat
.notify_ack_waiting
.end();
1908 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
1909 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
1910 pair
<int,int>(stat
.peer
, CDIR_AUTH_UNKNOWN
));
1912 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
1913 notify
->get_bounds().push_back((*i
)->dirfrag());
1915 mds
->send_message_mds(notify
, *p
);
1918 // wait for notifyacks
1919 stat
.state
= EXPORT_NOTIFYING
;
1920 assert (g_conf
->mds_kill_export_at
!= 11);
1922 // no notifies to wait for?
1923 if (stat
.notify_ack_waiting
.empty()) {
1924 export_finish(dir
); // skip notify/notify_ack stage.
1926 // notify peer to send cap import messages to clients
1927 if (!mds
->is_cluster_degraded() ||
1928 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(stat
.peer
)) {
1929 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), false, stat
.tid
), stat
.peer
);
1931 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl
;
1938 * i'll get an ack from each bystander.
1939 * when i get them all, do the export.
1941 * i'll get an ack from each bystander.
1942 * when i get them all, unfreeze and send the finish.
1944 * This function DOES put the passed message before returning
1946 void Migrator::handle_export_notify_ack(MExportDirNotifyAck
*m
)
1948 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1949 mds_rank_t
dest(m
->get_source().num());
1950 utime_t now
= ceph_clock_now();
1952 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1954 mds
->hit_export_target(now
, dest
, -1);
1956 auto export_state_entry
= export_state
.find(dir
);
1957 if (export_state_entry
!= export_state
.end()) {
1958 export_state_t
& stat
= export_state_entry
->second
;
1959 if (stat
.state
== EXPORT_WARNING
&&
1960 stat
.warning_ack_waiting
.erase(from
)) {
1961 // exporting. process warning.
1962 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1963 << ": exporting, processing warning on " << *dir
<< dendl
;
1964 if (stat
.warning_ack_waiting
.empty())
1965 export_go(dir
); // start export.
1966 } else if (stat
.state
== EXPORT_NOTIFYING
&&
1967 stat
.notify_ack_waiting
.erase(from
)) {
1968 // exporting. process notify.
1969 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1970 << ": exporting, processing notify on " << *dir
<< dendl
;
1971 if (stat
.notify_ack_waiting
.empty())
1973 } else if (stat
.state
== EXPORT_CANCELLING
&&
1974 m
->get_new_auth().second
== CDIR_AUTH_UNKNOWN
&& // not warning ack
1975 stat
.notify_ack_waiting
.erase(from
)) {
1976 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1977 << ": cancelling export, processing notify on " << *dir
<< dendl
;
1978 if (stat
.notify_ack_waiting
.empty()) {
1979 export_state
.erase(export_state_entry
);
1980 export_cancel_finish(dir
);
1985 auto import_state_entry
= import_state
.find(dir
->dirfrag());
1986 if (import_state_entry
!= import_state
.end()) {
1987 import_state_t
& stat
= import_state_entry
->second
;
1988 if (stat
.state
== IMPORT_ABORTING
) {
1990 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1991 << ": aborting import on " << *dir
<< dendl
;
1992 assert(stat
.bystanders
.count(from
));
1993 stat
.bystanders
.erase(from
);
1994 if (stat
.bystanders
.empty())
1995 import_reverse_unfreeze(dir
);
2003 void Migrator::export_finish(CDir
*dir
)
2005 dout(5) << "export_finish " << *dir
<< dendl
;
2007 assert (g_conf
->mds_kill_export_at
!= 12);
2008 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
2009 if (it
== export_state
.end()) {
2010 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl
;
2014 // send finish/commit to new auth
2015 if (!mds
->is_cluster_degraded() ||
2016 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
)) {
2017 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), true, it
->second
.tid
), it
->second
.peer
);
2019 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl
;
2021 assert(g_conf
->mds_kill_export_at
!= 13);
2023 // finish export (adjust local cache state)
2024 int num_dentries
= 0;
2025 list
<MDSInternalContextBase
*> finished
;
2026 finish_export_dir(dir
, ceph_clock_now(), it
->second
.peer
,
2027 it
->second
.peer_imported
, finished
, &num_dentries
);
2029 assert(!dir
->is_auth());
2030 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2034 cache
->get_subtree_bounds(dir
, bounds
);
2035 for (set
<CDir
*>::iterator p
= bounds
.begin();
2039 bd
->put(CDir::PIN_EXPORTBOUND
);
2040 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
2043 if (dir
->state_test(CDir::STATE_AUXSUBTREE
))
2044 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
2046 // discard delayed expires
2047 cache
->discard_delayed_expire(dir
);
2049 dout(7) << "export_finish unfreezing" << dendl
;
2051 // unfreeze tree, with possible subtree merge.
2052 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
2053 dir
->unfreeze_tree();
2054 cache
->try_subtree_merge(dir
);
2055 for (auto bd
: it
->second
.residual_dirs
) {
2056 export_queue
.push_front(pair
<dirfrag_t
,mds_rank_t
>(bd
->dirfrag(), it
->second
.peer
));
2057 bd
->take_waiting(CDir::WAIT_ANY_MASK
, finished
);
2058 bd
->unfreeze_tree();
2059 cache
->try_subtree_merge(bd
);
2062 // no more auth subtree? clear scatter dirty
2063 if (!dir
->get_inode()->is_auth() &&
2064 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2065 dir
->get_inode()->clear_scatter_dirty();
2066 // wake up scatter_nudge waiters
2067 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
2070 if (!finished
.empty())
2071 mds
->queue_waiters(finished
);
2073 MutationRef mut
= it
->second
.mut
;
2074 // remove from exporting list, clean up state
2075 export_state
.erase(it
);
2076 dir
->state_clear(CDir::STATE_EXPORTING
);
2078 cache
->show_subtrees();
2081 cache
->trim(num_dentries
); // try trimming exported dentries
2083 // send pending import_maps?
2084 mds
->mdcache
->maybe_send_pending_resolves();
2086 // drop locks, unpin path
2088 mds
->locker
->drop_locks(mut
.get());
2092 maybe_do_queued_export();
2102 // ==========================================================
2105 void Migrator::handle_export_discover(MExportDirDiscover
*m
)
2107 mds_rank_t from
= m
->get_source_mds();
2108 assert(from
!= mds
->get_nodeid());
2110 dout(7) << "handle_export_discover on " << m
->get_path() << dendl
;
2112 // note import state
2113 dirfrag_t df
= m
->get_dirfrag();
2115 if (!mds
->is_active()) {
2116 dout(7) << " not active, send NACK " << dendl
;
2117 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid(), false), from
);
2122 // only start discovering on this message once.
2123 import_state_t
*p_state
;
2124 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2126 assert(it
== import_state
.end());
2128 p_state
= &import_state
[df
];
2129 p_state
->state
= IMPORT_DISCOVERING
;
2130 p_state
->peer
= from
;
2131 p_state
->tid
= m
->get_tid();
2133 // am i retrying after ancient path_traverse results?
2134 if (it
== import_state
.end() ||
2135 it
->second
.peer
!= from
||
2136 it
->second
.tid
!= m
->get_tid()) {
2137 dout(7) << " dropping obsolete message" << dendl
;
2141 assert(it
->second
.state
== IMPORT_DISCOVERING
);
2142 p_state
= &it
->second
;
2145 if (!mds
->mdcache
->is_open()) {
2146 dout(5) << " waiting for root" << dendl
;
2147 mds
->mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, m
));
2151 assert (g_conf
->mds_kill_import_at
!= 1);
2154 CInode
*in
= cache
->get_inode(m
->get_dirfrag().ino
);
2156 // must discover it!
2157 filepath
fpath(m
->get_path());
2158 vector
<CDentry
*> trace
;
2159 MDRequestRef null_ref
;
2160 int r
= cache
->path_traverse(null_ref
, m
, NULL
, fpath
, &trace
, NULL
, MDS_TRAVERSE_DISCOVER
);
2163 dout(7) << "handle_export_discover_2 failed to discover or not dir " << m
->get_path() << ", NAK" << dendl
;
2164 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2167 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2171 dout(7) << "handle_export_discover have " << df
<< " inode " << *in
<< dendl
;
2173 p_state
->state
= IMPORT_DISCOVERED
;
2175 // pin inode in the cache (for now)
2176 assert(in
->is_dir());
2177 in
->get(CInode::PIN_IMPORTING
);
2180 dout(7) << " sending export_discover_ack on " << *in
<< dendl
;
2181 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid()), p_state
->peer
);
2183 assert (g_conf
->mds_kill_import_at
!= 2);
2186 void Migrator::import_reverse_discovering(dirfrag_t df
)
2188 import_state
.erase(df
);
2191 void Migrator::import_reverse_discovered(dirfrag_t df
, CInode
*diri
)
2194 diri
->put(CInode::PIN_IMPORTING
);
2195 import_state
.erase(df
);
2198 void Migrator::import_reverse_prepping(CDir
*dir
, import_state_t
& stat
)
2201 cache
->map_dirfrag_set(stat
.bound_ls
, bounds
);
2202 import_remove_pins(dir
, bounds
);
2203 import_reverse_final(dir
);
2206 /* This function DOES put the passed message before returning*/
2207 void Migrator::handle_export_cancel(MExportDirCancel
*m
)
2209 dout(7) << "handle_export_cancel on " << m
->get_dirfrag() << dendl
;
2210 dirfrag_t df
= m
->get_dirfrag();
2211 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2212 if (it
== import_state
.end()) {
2213 assert(0 == "got export_cancel in weird state");
2214 } else if (it
->second
.state
== IMPORT_DISCOVERING
) {
2215 import_reverse_discovering(df
);
2216 } else if (it
->second
.state
== IMPORT_DISCOVERED
) {
2217 CInode
*in
= cache
->get_inode(df
.ino
);
2219 import_reverse_discovered(df
, in
);
2220 } else if (it
->second
.state
== IMPORT_PREPPING
) {
2221 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2223 import_reverse_prepping(dir
, it
->second
);
2224 } else if (it
->second
.state
== IMPORT_PREPPED
) {
2225 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2228 cache
->get_subtree_bounds(dir
, bounds
);
2229 import_remove_pins(dir
, bounds
);
2230 // adjust auth back to the exporter
2231 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2232 import_reverse_unfreeze(dir
);
2234 assert(0 == "got export_cancel in weird state");
2239 /* This function DOES put the passed message before returning*/
2240 void Migrator::handle_export_prep(MExportDirPrep
*m
)
2242 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2243 assert(oldauth
!= mds
->get_nodeid());
2247 list
<MDSInternalContextBase
*> finished
;
2249 // assimilate root dir.
2250 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2251 if (!m
->did_assim()) {
2252 assert(it
!= import_state
.end());
2253 assert(it
->second
.state
== IMPORT_DISCOVERED
);
2254 assert(it
->second
.peer
== oldauth
);
2255 diri
= cache
->get_inode(m
->get_dirfrag().ino
);
2257 bufferlist::iterator p
= m
->basedir
.begin();
2258 dir
= cache
->add_replica_dir(p
, diri
, oldauth
, finished
);
2259 dout(7) << "handle_export_prep on " << *dir
<< " (first pass)" << dendl
;
2261 if (it
== import_state
.end() ||
2262 it
->second
.peer
!= oldauth
||
2263 it
->second
.tid
!= m
->get_tid()) {
2264 dout(7) << "handle_export_prep obsolete message, dropping" << dendl
;
2268 assert(it
->second
.state
== IMPORT_PREPPING
);
2269 assert(it
->second
.peer
== oldauth
);
2271 dir
= cache
->get_dirfrag(m
->get_dirfrag());
2273 dout(7) << "handle_export_prep on " << *dir
<< " (subsequent pass)" << dendl
;
2274 diri
= dir
->get_inode();
2276 assert(dir
->is_auth() == false);
2278 cache
->show_subtrees();
2280 // build import bound map
2281 map
<inodeno_t
, fragset_t
> import_bound_fragset
;
2282 for (list
<dirfrag_t
>::iterator p
= m
->get_bounds().begin();
2283 p
!= m
->get_bounds().end();
2285 dout(10) << " bound " << *p
<< dendl
;
2286 import_bound_fragset
[p
->ino
].insert(p
->frag
);
2289 // assimilate contents?
2290 if (!m
->did_assim()) {
2291 dout(7) << "doing assim on " << *dir
<< dendl
;
2292 m
->mark_assim(); // only do this the first time!
2294 // change import state
2295 it
->second
.state
= IMPORT_PREPPING
;
2296 it
->second
.bound_ls
= m
->get_bounds();
2297 it
->second
.bystanders
= m
->get_bystanders();
2298 assert(g_conf
->mds_kill_import_at
!= 3);
2301 dout(7) << "bystanders are " << it
->second
.bystanders
<< dendl
;
2304 diri
->put(CInode::PIN_IMPORTING
);
2305 dir
->get(CDir::PIN_IMPORTING
);
2306 dir
->state_set(CDir::STATE_IMPORTING
);
2308 // assimilate traces to exports
2309 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2310 for (list
<bufferlist
>::iterator p
= m
->traces
.begin();
2311 p
!= m
->traces
.end();
2313 bufferlist::iterator q
= p
->begin();
2318 dout(10) << " trace from " << df
<< " start " << start
<< " len " << p
->length() << dendl
;
2322 cur
= cache
->get_dirfrag(df
);
2324 dout(10) << " had " << *cur
<< dendl
;
2325 } else if (start
== 'f') {
2326 CInode
*in
= cache
->get_inode(df
.ino
);
2328 dout(10) << " had " << *in
<< dendl
;
2329 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2330 dout(10) << " added " << *cur
<< dendl
;
2331 } else if (start
== '-') {
2334 assert(0 == "unrecognized start char");
2337 CDentry
*dn
= cache
->add_replica_dentry(q
, cur
, finished
);
2338 dout(10) << " added " << *dn
<< dendl
;
2339 CInode
*in
= cache
->add_replica_inode(q
, dn
, finished
);
2340 dout(10) << " added " << *in
<< dendl
;
2343 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2344 dout(10) << " added " << *cur
<< dendl
;
2348 // make bound sticky
2349 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2350 p
!= import_bound_fragset
.end();
2352 CInode
*in
= cache
->get_inode(p
->first
);
2354 in
->get_stickydirs();
2355 dout(7) << " set stickydirs on bound inode " << *in
<< dendl
;
2359 dout(7) << " not doing assim on " << *dir
<< dendl
;
2362 if (!finished
.empty())
2363 mds
->queue_waiters(finished
);
2366 bool success
= true;
2367 if (mds
->is_active()) {
2369 set
<CDir
*> import_bounds
;
2370 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2371 p
!= import_bound_fragset
.end();
2373 CInode
*in
= cache
->get_inode(p
->first
);
2376 // map fragset into a frag_t list, based on the inode fragtree
2377 list
<frag_t
> fglist
;
2378 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2379 in
->dirfragtree
.get_leaves_under(*q
, fglist
);
2380 dout(10) << " bound inode " << p
->first
<< " fragset " << p
->second
<< " maps to " << fglist
<< dendl
;
2382 for (list
<frag_t
>::iterator q
= fglist
.begin();
2385 CDir
*bound
= cache
->get_dirfrag(dirfrag_t(p
->first
, *q
));
2387 dout(7) << " opening bounding dirfrag " << *q
<< " on " << *in
<< dendl
;
2388 cache
->open_remote_dirfrag(in
, *q
,
2389 new C_MDS_RetryMessage(mds
, m
));
2393 if (!bound
->state_test(CDir::STATE_IMPORTBOUND
)) {
2394 dout(7) << " pinning import bound " << *bound
<< dendl
;
2395 bound
->get(CDir::PIN_IMPORTBOUND
);
2396 bound
->state_set(CDir::STATE_IMPORTBOUND
);
2398 dout(7) << " already pinned import bound " << *bound
<< dendl
;
2400 import_bounds
.insert(bound
);
2404 dout(7) << " all ready, noting auth and freezing import region" << dendl
;
2406 if (!mds
->mdcache
->is_readonly() &&
2407 dir
->get_inode()->filelock
.can_wrlock(-1) &&
2408 dir
->get_inode()->nestlock
.can_wrlock(-1)) {
2409 it
->second
.mut
= new MutationImpl();
2410 // force some locks. hacky.
2411 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, it
->second
.mut
);
2412 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, it
->second
.mut
);
2414 // note that i am an ambiguous auth for this subtree.
2415 // specify bounds, since the exporter explicitly defines the region.
2416 cache
->adjust_bounded_subtree_auth(dir
, import_bounds
,
2417 pair
<int,int>(oldauth
, mds
->get_nodeid()));
2418 cache
->verify_subtree_bounds(dir
, import_bounds
);
2420 dir
->_freeze_tree();
2422 it
->second
.state
= IMPORT_PREPPED
;
2424 dout(7) << " couldn't acquire all needed locks, failing. " << *dir
<< dendl
;
2428 dout(7) << " not active, failing. " << *dir
<< dendl
;
2433 import_reverse_prepping(dir
, it
->second
);
2436 dout(7) << " sending export_prep_ack on " << *dir
<< dendl
;
2437 mds
->send_message(new MExportDirPrepAck(dir
->dirfrag(), success
, m
->get_tid()), m
->get_connection());
2439 assert(g_conf
->mds_kill_import_at
!= 4);
2447 class C_MDS_ImportDirLoggedStart
: public MigratorLogContext
{
2452 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
2454 C_MDS_ImportDirLoggedStart(Migrator
*m
, CDir
*d
, mds_rank_t f
) :
2455 MigratorLogContext(m
), df(d
->dirfrag()), dir(d
), from(f
) {
2457 void finish(int r
) override
{
2458 mig
->import_logged_start(df
, dir
, from
, imported_session_map
);
2462 /* This function DOES put the passed message before returning*/
2463 void Migrator::handle_export_dir(MExportDir
*m
)
2465 assert (g_conf
->mds_kill_import_at
!= 5);
2466 CDir
*dir
= cache
->get_dirfrag(m
->dirfrag
);
2469 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2470 dout(7) << "handle_export_dir importing " << *dir
<< " from " << oldauth
<< dendl
;
2472 assert(!dir
->is_auth());
2474 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->dirfrag
);
2475 assert(it
!= import_state
.end());
2476 assert(it
->second
.state
== IMPORT_PREPPED
);
2477 assert(it
->second
.tid
== m
->get_tid());
2478 assert(it
->second
.peer
== oldauth
);
2480 utime_t now
= ceph_clock_now();
2482 if (!dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()))
2483 dir
->get_inode()->dirfragtree
.force_to_leaf(g_ceph_context
, dir
->get_frag());
2485 cache
->show_subtrees();
2487 C_MDS_ImportDirLoggedStart
*onlogged
= new C_MDS_ImportDirLoggedStart(this, dir
, oldauth
);
2489 // start the journal entry
2490 EImportStart
*le
= new EImportStart(mds
->mdlog
, dir
->dirfrag(), m
->bounds
, oldauth
);
2491 mds
->mdlog
->start_entry(le
);
2493 le
->metablob
.add_dir_context(dir
);
2495 // adjust auth (list us _first_)
2496 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), oldauth
);
2498 // new client sessions, open these after we journal
2499 // include imported sessions in EImportStart
2500 bufferlist::iterator cmp
= m
->client_map
.begin();
2501 map
<client_t
,entity_inst_t
> client_map
;
2502 decode(client_map
, cmp
);
2504 le
->cmapv
= mds
->server
->prepare_force_open_sessions(client_map
, onlogged
->imported_session_map
);
2505 encode(client_map
, le
->client_map
, mds
->mdsmap
->get_up_features());
2507 bufferlist::iterator blp
= m
->export_data
.begin();
2508 int num_imported_inodes
= 0;
2509 while (!blp
.end()) {
2510 num_imported_inodes
+=
2511 decode_import_dir(blp
,
2515 mds
->mdlog
->get_current_segment(),
2516 it
->second
.peer_exports
,
2517 it
->second
.updated_scatterlocks
,
2520 dout(10) << " " << m
->bounds
.size() << " imported bounds" << dendl
;
2522 // include bounds in EImportStart
2523 set
<CDir
*> import_bounds
;
2524 for (vector
<dirfrag_t
>::iterator p
= m
->bounds
.begin();
2525 p
!= m
->bounds
.end();
2527 CDir
*bd
= cache
->get_dirfrag(*p
);
2529 le
->metablob
.add_dir(bd
, false); // note that parent metadata is already in the event
2530 import_bounds
.insert(bd
);
2532 cache
->verify_subtree_bounds(dir
, import_bounds
);
2534 // adjust popularity
2535 mds
->balancer
->add_import(dir
, now
);
2537 dout(7) << "handle_export_dir did " << *dir
<< dendl
;
2540 it
->second
.state
= IMPORT_LOGGINGSTART
;
2541 assert (g_conf
->mds_kill_import_at
!= 6);
2544 mds
->mdlog
->submit_entry(le
, onlogged
);
2545 mds
->mdlog
->flush();
2549 mds
->logger
->inc(l_mds_imported
);
2550 mds
->logger
->inc(l_mds_imported_inodes
, num_imported_inodes
);
2558 * this is an import helper
2559 * called by import_finish, and import_reverse and friends.
2561 void Migrator::import_remove_pins(CDir
*dir
, set
<CDir
*>& bounds
)
2563 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2565 dir
->put(CDir::PIN_IMPORTING
);
2566 dir
->state_clear(CDir::STATE_IMPORTING
);
2570 for (list
<dirfrag_t
>::iterator p
= stat
.bound_ls
.begin();
2571 p
!= stat
.bound_ls
.end();
2573 if (did
.count(p
->ino
))
2576 CInode
*in
= cache
->get_inode(p
->ino
);
2578 in
->put_stickydirs();
2581 if (stat
.state
== IMPORT_PREPPING
) {
2582 for (auto bd
: bounds
) {
2583 if (bd
->state_test(CDir::STATE_IMPORTBOUND
)) {
2584 bd
->put(CDir::PIN_IMPORTBOUND
);
2585 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
2588 } else if (stat
.state
>= IMPORT_PREPPED
) {
2589 // bounding dirfrags
2590 for (auto bd
: bounds
) {
2591 assert(bd
->state_test(CDir::STATE_IMPORTBOUND
));
2592 bd
->put(CDir::PIN_IMPORTBOUND
);
2593 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
2600 * note: this does teh full work of reversing and import and cleaning up
2602 * called by both handle_mds_failure and by handle_resolve (if we are
2603 * a survivor coping with an exporter failure+recovery).
2605 void Migrator::import_reverse(CDir
*dir
)
2607 dout(7) << "import_reverse " << *dir
<< dendl
;
2609 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2610 stat
.state
= IMPORT_ABORTING
;
2613 cache
->get_subtree_bounds(dir
, bounds
);
2616 import_remove_pins(dir
, bounds
);
2618 // update auth, with possible subtree merge.
2619 assert(dir
->is_subtree_root());
2620 if (mds
->is_resolve())
2621 cache
->trim_non_auth_subtree(dir
);
2623 cache
->adjust_subtree_auth(dir
, stat
.peer
);
2625 C_ContextsBase
<MDSInternalContextBase
, MDSInternalContextGather
> *fin
= new C_ContextsBase
<MDSInternalContextBase
, MDSInternalContextGather
>(g_ceph_context
);
2626 if (!dir
->get_inode()->is_auth() &&
2627 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2628 dir
->get_inode()->clear_scatter_dirty();
2629 // wake up scatter_nudge waiters
2630 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2633 int num_dentries
= 0;
2634 // adjust auth bits.
2637 while (!q
.empty()) {
2638 CDir
*cur
= q
.front();
2642 assert(cur
->is_auth());
2643 cur
->state_clear(CDir::STATE_AUTH
);
2644 cur
->remove_bloom();
2645 cur
->clear_replica_map();
2646 cur
->set_replica_nonce(CDir::EXPORT_NONCE
);
2647 if (cur
->is_dirty())
2650 for (auto &p
: *cur
) {
2651 CDentry
*dn
= p
.second
;
2654 dn
->state_clear(CDentry::STATE_AUTH
);
2655 dn
->clear_replica_map();
2656 dn
->set_replica_nonce(CDentry::EXPORT_NONCE
);
2661 if (dn
->get_linkage()->is_primary()) {
2662 CInode
*in
= dn
->get_linkage()->get_inode();
2663 in
->state_clear(CDentry::STATE_AUTH
);
2664 in
->clear_replica_map();
2665 in
->set_replica_nonce(CInode::EXPORT_NONCE
);
2668 in
->clear_dirty_rstat();
2669 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2670 in
->clear_scatter_dirty();
2671 in
->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2674 in
->clear_dirty_parent();
2676 in
->authlock
.clear_gather();
2677 in
->linklock
.clear_gather();
2678 in
->dirfragtreelock
.clear_gather();
2679 in
->filelock
.clear_gather();
2681 in
->clear_file_locks();
2683 // non-bounding dir?
2685 in
->get_dirfrags(dfs
);
2686 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
)
2687 if (bounds
.count(*p
) == 0)
2691 cache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
2696 dir
->add_waiter(CDir::WAIT_UNFREEZE
, fin
);
2698 if (stat
.state
== IMPORT_ACKING
) {
2699 // remove imported caps
2700 for (map
<CInode
*,map
<client_t
,Capability::Export
> >::iterator p
= stat
.peer_exports
.begin();
2701 p
!= stat
.peer_exports
.end();
2703 CInode
*in
= p
->first
;
2704 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
2705 q
!= p
->second
.end();
2707 Capability
*cap
= in
->get_client_cap(q
->first
);
2709 assert(!stat
.session_map
.count(q
->first
));
2712 if (cap
->is_importing())
2713 in
->remove_client_cap(q
->first
);
2715 in
->put(CInode::PIN_IMPORTINGCAPS
);
2717 for (auto& p
: stat
.session_map
) {
2718 Session
*session
= p
.second
.first
;
2719 session
->dec_importing();
2724 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false)); // log failure
2726 cache
->trim(num_dentries
); // try trimming dentries
2728 // notify bystanders; wait in aborting state
2729 import_notify_abort(dir
, bounds
);
2732 void Migrator::import_notify_finish(CDir
*dir
, set
<CDir
*>& bounds
)
2734 dout(7) << "import_notify_finish " << *dir
<< dendl
;
2736 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2737 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
2738 p
!= stat
.bystanders
.end();
2740 MExportDirNotify
*notify
=
2741 new MExportDirNotify(dir
->dirfrag(), stat
.tid
, false,
2742 pair
<int,int>(stat
.peer
, mds
->get_nodeid()),
2743 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
2744 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2745 notify
->get_bounds().push_back((*i
)->dirfrag());
2746 mds
->send_message_mds(notify
, *p
);
2750 void Migrator::import_notify_abort(CDir
*dir
, set
<CDir
*>& bounds
)
2752 dout(7) << "import_notify_abort " << *dir
<< dendl
;
2754 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2755 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
2756 p
!= stat
.bystanders
.end(); ) {
2757 if (mds
->is_cluster_degraded() &&
2758 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)) {
2759 // this can happen if both exporter and bystander fail in the same mdsmap epoch
2760 stat
.bystanders
.erase(p
++);
2763 MExportDirNotify
*notify
=
2764 new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
2765 mds_authority_t(stat
.peer
, mds
->get_nodeid()),
2766 mds_authority_t(stat
.peer
, CDIR_AUTH_UNKNOWN
));
2767 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2768 notify
->get_bounds().push_back((*i
)->dirfrag());
2769 mds
->send_message_mds(notify
, *p
);
2772 if (stat
.bystanders
.empty()) {
2773 dout(7) << "no bystanders, finishing reverse now" << dendl
;
2774 import_reverse_unfreeze(dir
);
2776 assert (g_conf
->mds_kill_import_at
!= 10);
2780 void Migrator::import_reverse_unfreeze(CDir
*dir
)
2782 dout(7) << "import_reverse_unfreeze " << *dir
<< dendl
;
2783 assert(!dir
->is_auth());
2784 cache
->discard_delayed_expire(dir
);
2785 dir
->unfreeze_tree();
2786 if (dir
->is_subtree_root())
2787 cache
->try_subtree_merge(dir
);
2788 import_reverse_final(dir
);
2791 void Migrator::import_reverse_final(CDir
*dir
)
2793 dout(7) << "import_reverse_final " << *dir
<< dendl
;
2796 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
2797 assert(it
!= import_state
.end());
2799 MutationRef mut
= it
->second
.mut
;
2800 import_state
.erase(it
);
2802 // send pending import_maps?
2803 mds
->mdcache
->maybe_send_pending_resolves();
2806 mds
->locker
->drop_locks(mut
.get());
2810 cache
->show_subtrees();
2811 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2817 void Migrator::import_logged_start(dirfrag_t df
, CDir
*dir
, mds_rank_t from
,
2818 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
)
2820 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
2821 if (it
== import_state
.end() ||
2822 it
->second
.state
!= IMPORT_LOGGINGSTART
) {
2823 dout(7) << "import " << df
<< " must have aborted" << dendl
;
2824 mds
->server
->finish_force_open_sessions(imported_session_map
);
2828 dout(7) << "import_logged " << *dir
<< dendl
;
2831 it
->second
.state
= IMPORT_ACKING
;
2833 assert (g_conf
->mds_kill_import_at
!= 7);
2835 // force open client sessions and finish cap import
2836 mds
->server
->finish_force_open_sessions(imported_session_map
, false);
2838 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
2839 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
2840 p
!= it
->second
.peer_exports
.end();
2842 // parameter 'peer' is NONE, delay sending cap import messages to client
2843 finish_import_inode_caps(p
->first
, MDS_RANK_NONE
, true, imported_session_map
,
2844 p
->second
, imported_caps
[p
->first
->ino()]);
2847 it
->second
.session_map
.swap(imported_session_map
);
2849 // send notify's etc.
2850 dout(7) << "sending ack for " << *dir
<< " to old auth mds." << from
<< dendl
;
2852 // test surviving observer of a failed migration that did not complete
2853 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
2855 MExportDirAck
*ack
= new MExportDirAck(dir
->dirfrag(), it
->second
.tid
);
2856 ::encode(imported_caps
, ack
->imported_caps
);
2858 mds
->send_message_mds(ack
, from
);
2859 assert (g_conf
->mds_kill_import_at
!= 8);
2861 cache
->show_subtrees();
2864 /* This function DOES put the passed message before returning*/
2865 void Migrator::handle_export_finish(MExportDirFinish
*m
)
2867 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
2869 dout(7) << "handle_export_finish on " << *dir
<< (m
->is_last() ? " last" : "") << dendl
;
2871 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2872 assert(it
!= import_state
.end());
2873 assert(it
->second
.tid
== m
->get_tid());
2875 import_finish(dir
, false, m
->is_last());
2880 void Migrator::import_finish(CDir
*dir
, bool notify
, bool last
)
2882 dout(7) << "import_finish on " << *dir
<< dendl
;
2884 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
2885 assert(it
!= import_state
.end());
2886 assert(it
->second
.state
== IMPORT_ACKING
|| it
->second
.state
== IMPORT_FINISHING
);
2888 if (it
->second
.state
== IMPORT_ACKING
) {
2889 assert(dir
->is_auth());
2890 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
2894 assert(g_conf
->mds_kill_import_at
!= 9);
2896 if (it
->second
.state
== IMPORT_ACKING
) {
2897 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
2898 p
!= it
->second
.peer_exports
.end();
2900 CInode
*in
= p
->first
;
2901 assert(in
->is_auth());
2902 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
2903 q
!= p
->second
.end();
2905 auto r
= it
->second
.session_map
.find(q
->first
);
2906 if (r
== it
->second
.session_map
.end())
2909 Session
*session
= r
->second
.first
;
2910 Capability
*cap
= in
->get_client_cap(q
->first
);
2912 cap
->merge(q
->second
, true);
2913 cap
->clear_importing();
2914 mds
->mdcache
->do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
,
2915 q
->second
.mseq
- 1, it
->second
.peer
, CEPH_CAP_FLAG_AUTH
);
2918 in
->replica_caps_wanted
= 0;
2920 for (auto& p
: it
->second
.session_map
) {
2921 Session
*session
= p
.second
.first
;
2922 session
->dec_importing();
2927 assert(it
->second
.state
== IMPORT_ACKING
);
2928 it
->second
.state
= IMPORT_FINISHING
;
2934 cache
->get_subtree_bounds(dir
, bounds
);
2937 import_notify_finish(dir
, bounds
);
2939 import_remove_pins(dir
, bounds
);
2941 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
2942 it
->second
.peer_exports
.swap(peer_exports
);
2944 // clear import state (we're done!)
2945 MutationRef mut
= it
->second
.mut
;
2946 import_state
.erase(it
);
2948 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
2950 // process delayed expires
2951 cache
->process_delayed_expire(dir
);
2953 // unfreeze tree, with possible subtree merge.
2954 dir
->unfreeze_tree();
2955 cache
->try_subtree_merge(dir
);
2957 cache
->show_subtrees();
2958 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2961 mds
->locker
->drop_locks(mut
.get());
2965 // re-eval imported caps
2966 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= peer_exports
.begin();
2967 p
!= peer_exports
.end();
2969 if (p
->first
->is_auth())
2970 mds
->locker
->eval(p
->first
, CEPH_CAP_LOCKS
, true);
2971 p
->first
->put(CInode::PIN_IMPORTINGCAPS
);
2974 // send pending import_maps?
2975 mds
->mdcache
->maybe_send_pending_resolves();
2977 // did i just import mydir?
2978 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
2979 cache
->populate_mydir();
2982 if (dir
->get_num_head_items() == 0 &&
2983 !dir
->inode
->is_auth()) {
2985 export_empty_import(dir
);
2990 void Migrator::decode_import_inode(CDentry
*dn
, bufferlist::iterator
& blp
,
2991 mds_rank_t oldauth
, LogSegment
*ls
,
2992 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
,
2993 list
<ScatterLock
*>& updated_scatterlocks
)
2995 dout(15) << "decode_import_inode on " << *dn
<< dendl
;
3000 ::decode(last
, blp
);
3003 CInode
*in
= cache
->get_inode(ino
, last
);
3005 in
= new CInode(mds
->mdcache
, true, 1, last
);
3009 // state after link -- or not! -sage
3010 in
->decode_import(blp
, ls
); // cap imports are noted for later action
3013 decode_import_inode_caps(in
, true, blp
, peer_exports
);
3015 // link before state -- or not! -sage
3016 if (dn
->get_linkage()->get_inode() != in
) {
3017 assert(!dn
->get_linkage()->get_inode());
3018 dn
->dir
->link_primary_inode(dn
, in
);
3022 dn
->dir
->pop_lru_subdirs
.push_back(&in
->item_pop_lru
);
3026 cache
->add_inode(in
);
3027 dout(10) << "added " << *in
<< dendl
;
3029 dout(10) << " had " << *in
<< dendl
;
3032 if (in
->inode
.is_dirty_rstat())
3033 in
->mark_dirty_rstat();
3035 // clear if dirtyscattered, since we're going to journal this
3036 // but not until we _actually_ finish the import...
3037 if (in
->filelock
.is_dirty()) {
3038 updated_scatterlocks
.push_back(&in
->filelock
);
3039 mds
->locker
->mark_updated_scatterlock(&in
->filelock
);
3042 if (in
->dirfragtreelock
.is_dirty()) {
3043 updated_scatterlocks
.push_back(&in
->dirfragtreelock
);
3044 mds
->locker
->mark_updated_scatterlock(&in
->dirfragtreelock
);
3047 // adjust replica list
3048 //assert(!in->is_replica(oldauth)); // not true on failed export
3049 in
->add_replica(oldauth
, CInode::EXPORT_NONCE
);
3050 if (in
->is_replica(mds
->get_nodeid()))
3051 in
->remove_replica(mds
->get_nodeid());
3054 void Migrator::decode_import_inode_caps(CInode
*in
, bool auth_cap
,
3055 bufferlist::iterator
&blp
,
3056 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3058 map
<client_t
,Capability::Export
> cap_map
;
3059 ::decode(cap_map
, blp
);
3061 ::decode(in
->get_mds_caps_wanted(), blp
);
3062 if (!cap_map
.empty() ||
3063 (auth_cap
&& (in
->get_caps_wanted() & ~CEPH_CAP_PIN
))) {
3064 peer_exports
[in
].swap(cap_map
);
3065 in
->get(CInode::PIN_IMPORTINGCAPS
);
3069 void Migrator::finish_import_inode_caps(CInode
*in
, mds_rank_t peer
, bool auth_cap
,
3070 const map
<client_t
,pair
<Session
*,uint64_t> >& session_map
,
3071 const map
<client_t
,Capability::Export
> &export_map
,
3072 map
<client_t
,Capability::Import
> &import_map
)
3074 for (auto& it
: export_map
) {
3075 dout(10) << "finish_import_inode_caps for client." << it
.first
<< " on " << *in
<< dendl
;
3077 auto p
= session_map
.find(it
.first
);
3078 if (p
== session_map
.end()) {
3079 dout(10) << " no session for client." << it
.first
<< dendl
;
3080 (void)import_map
[it
.first
];
3084 Session
*session
= p
->second
.first
;
3086 Capability
*cap
= in
->get_client_cap(it
.first
);
3088 cap
= in
->add_client_cap(it
.first
, session
);
3090 cap
->mark_importing();
3093 Capability::Import
& im
= import_map
[it
.first
];
3094 im
.cap_id
= cap
->get_cap_id();
3095 im
.mseq
= auth_cap
? it
.second
.mseq
: cap
->get_mseq();
3096 im
.issue_seq
= cap
->get_last_seq() + 1;
3099 cap
->merge(it
.second
, auth_cap
);
3100 mds
->mdcache
->do_cap_import(session
, in
, cap
, it
.second
.cap_id
,
3101 it
.second
.seq
, it
.second
.mseq
- 1, peer
,
3102 auth_cap
? CEPH_CAP_FLAG_AUTH
: CEPH_CAP_FLAG_RELEASE
);
3107 in
->replica_caps_wanted
= 0;
3108 in
->put(CInode::PIN_IMPORTINGCAPS
);
3112 int Migrator::decode_import_dir(bufferlist::iterator
& blp
,
3117 map
<CInode
*,map
<client_t
,Capability::Export
> >& peer_exports
,
3118 list
<ScatterLock
*>& updated_scatterlocks
, utime_t now
)
3124 CInode
*diri
= cache
->get_inode(df
.ino
);
3126 CDir
*dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, df
.frag
);
3129 dout(7) << "decode_import_dir " << *dir
<< dendl
;
3132 dir
->decode_import(blp
, now
, ls
);
3134 // adjust replica list
3135 //assert(!dir->is_replica(oldauth)); // not true on failed export
3136 dir
->add_replica(oldauth
, CDir::EXPORT_NONCE
);
3137 if (dir
->is_replica(mds
->get_nodeid()))
3138 dir
->remove_replica(mds
->get_nodeid());
3140 // add to journal entry
3142 le
->metablob
.add_import_dir(dir
);
3144 int num_imported
= 0;
3146 // take all waiters on this dir
3147 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3148 // a replica's presense in my cache implies/forces it's presense in authority's.
3149 list
<MDSInternalContextBase
*> waiters
;
3151 dir
->take_waiting(CDir::WAIT_ANY_MASK
, waiters
);
3152 for (list
<MDSInternalContextBase
*>::iterator it
= waiters
.begin();
3153 it
!= waiters
.end();
3155 import_root
->add_waiter(CDir::WAIT_UNFREEZE
, *it
); // UNFREEZE will get kicked both on success or failure
3157 dout(15) << "doing contents" << dendl
;
3161 ::decode(nden
, blp
);
3163 for (; nden
>0; nden
--) {
3169 ::decode(dname
, blp
);
3170 ::decode(last
, blp
);
3172 CDentry
*dn
= dir
->lookup_exact_snap(dname
, last
);
3174 dn
= dir
->add_null_dentry(dname
, 1, last
);
3176 dn
->decode_import(blp
, ls
);
3178 dn
->add_replica(oldauth
, CDentry::EXPORT_NONCE
);
3179 if (dn
->is_replica(mds
->get_nodeid()))
3180 dn
->remove_replica(mds
->get_nodeid());
3182 // dentry lock in unreadable state can block path traverse
3183 if (dn
->lock
.get_state() != LOCK_SYNC
)
3184 mds
->locker
->try_eval(&dn
->lock
, NULL
);
3186 dout(15) << "decode_import_dir got " << *dn
<< dendl
;
3190 ::decode(icode
, blp
);
3194 assert(dn
->get_linkage()->is_null());
3198 else if (icode
== 'L') {
3201 unsigned char d_type
;
3203 ::decode(d_type
, blp
);
3204 if (dn
->get_linkage()->is_remote()) {
3205 assert(dn
->get_linkage()->get_remote_ino() == ino
);
3207 dir
->link_remote_inode(dn
, ino
, d_type
);
3210 else if (icode
== 'I') {
3213 decode_import_inode(dn
, blp
, oldauth
, ls
,
3214 peer_exports
, updated_scatterlocks
);
3217 // add dentry to journal entry
3219 le
->metablob
.add_import_dentry(dn
);
3222 #ifdef MDS_VERIFY_FRAGSTAT
3223 if (dir
->is_complete())
3224 dir
->verify_fragstat();
3227 dir
->inode
->maybe_export_pin();
3229 dout(7) << "decode_import_dir done " << *dir
<< dendl
;
3230 return num_imported
;
3237 // authority bystander
3239 /* This function DOES put the passed message before returning*/
3240 void Migrator::handle_export_notify(MExportDirNotify
*m
)
3242 if (!(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())) {
3247 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
3249 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3250 mds_authority_t old_auth
= m
->get_old_auth();
3251 mds_authority_t new_auth
= m
->get_new_auth();
3254 dout(7) << "handle_export_notify " << old_auth
<< " -> " << new_auth
3255 << " on missing dir " << m
->get_dirfrag() << dendl
;
3256 } else if (dir
->authority() != old_auth
) {
3257 dout(7) << "handle_export_notify old_auth was " << dir
->authority()
3258 << " != " << old_auth
<< " -> " << new_auth
3259 << " on " << *dir
<< dendl
;
3261 dout(7) << "handle_export_notify " << old_auth
<< " -> " << new_auth
3262 << " on " << *dir
<< dendl
;
3265 cache
->map_dirfrag_set(m
->get_bounds(), have
);
3266 cache
->adjust_bounded_subtree_auth(dir
, have
, new_auth
);
3269 cache
->try_subtree_merge(dir
);
3273 if (m
->wants_ack()) {
3274 mds
->send_message_mds(new MExportDirNotifyAck(m
->get_dirfrag(), m
->get_tid(), m
->get_new_auth()), from
);
3277 dout(7) << "handle_export_notify no ack requested" << dendl
;
3284 void Migrator::export_caps(CInode
*in
)
3286 mds_rank_t dest
= in
->authority().first
;
3287 dout(7) << "export_caps to mds." << dest
<< " " << *in
<< dendl
;
3289 assert(in
->is_any_caps());
3290 assert(!in
->is_auth());
3291 assert(!in
->is_ambiguous_auth());
3292 assert(!in
->state_test(CInode::STATE_EXPORTINGCAPS
));
3294 MExportCaps
*ex
= new MExportCaps
;
3295 ex
->ino
= in
->ino();
3297 encode_export_inode_caps(in
, false, ex
->cap_bl
, ex
->client_map
);
3299 mds
->send_message_mds(ex
, dest
);
3302 void Migrator::handle_gather_caps(MGatherCaps
*m
)
3304 CInode
*in
= cache
->get_inode(m
->ino
);
3309 dout(10) << "handle_gather_caps " << *m
<< " from " << m
->get_source()
3312 if (in
->is_any_caps() &&
3314 !in
->is_ambiguous_auth() &&
3315 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
3322 class C_M_LoggedImportCaps
: public MigratorLogContext
{
3326 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
3327 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
3329 C_M_LoggedImportCaps(Migrator
*m
, CInode
*i
, mds_rank_t f
) : MigratorLogContext(m
), in(i
), from(f
) {}
3330 void finish(int r
) override
{
3331 mig
->logged_import_caps(in
, from
, imported_session_map
, peer_exports
);
3335 /* This function DOES put the passed message before returning*/
3336 void Migrator::handle_export_caps(MExportCaps
*ex
)
3338 dout(10) << "handle_export_caps " << *ex
<< " from " << ex
->get_source() << dendl
;
3339 CInode
*in
= cache
->get_inode(ex
->ino
);
3342 assert(in
->is_auth());
3345 if (!in
->can_auth_pin()) {
3352 map
<client_t
,entity_inst_t
> client_map
;
3353 client_map
.swap(ex
->client_map
);
3355 C_M_LoggedImportCaps
*finish
= new C_M_LoggedImportCaps(
3356 this, in
, mds_rank_t(ex
->get_source().num()));
3358 version_t pv
= mds
->server
->prepare_force_open_sessions(client_map
,
3359 finish
->imported_session_map
);
3361 bufferlist::iterator blp
= ex
->cap_bl
.begin();
3362 decode_import_inode_caps(in
, false, blp
, finish
->peer_exports
);
3363 assert(!finish
->peer_exports
.empty()); // thus, inode is pinned.
3365 // journal open client sessions
3367 ESessions
*le
= new ESessions(pv
, client_map
);
3368 mds
->mdlog
->start_submit_entry(le
, finish
);
3369 mds
->mdlog
->flush();
3375 void Migrator::logged_import_caps(CInode
*in
,
3377 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
,
3378 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3380 dout(10) << "logged_import_caps on " << *in
<< dendl
;
3381 // see export_go() vs export_go_synced()
3382 assert(in
->is_auth());
3384 // force open client sessions and finish cap import
3385 mds
->server
->finish_force_open_sessions(imported_session_map
);
3387 map
<client_t
,Capability::Import
> imported_caps
;
3389 auto it
= peer_exports
.find(in
);
3390 assert(it
!= peer_exports
.end());
3392 // clients will release caps from the exporter when they receive the cap import message.
3393 finish_import_inode_caps(in
, from
, false, imported_session_map
, it
->second
, imported_caps
);
3394 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
3395 in
->auth_unpin(this);
3398 void Migrator::handle_conf_change(const struct md_config_t
*conf
,
3399 const std::set
<std::string
> &changed
,
3400 const MDSMap
&mds_map
)
3402 if (changed
.count("mds_inject_migrator_session_race")) {
3403 inject_session_race
= conf
->get_val
<bool>("mds_inject_migrator_session_race");
3404 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race
<< dendl
;