1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "MDBalancer.h"
29 #include "include/filepath.h"
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/ESessions.h"
36 #include "msg/Messenger.h"
38 #include "messages/MClientCaps.h"
40 #include "messages/MExportDirDiscover.h"
41 #include "messages/MExportDirDiscoverAck.h"
42 #include "messages/MExportDirCancel.h"
43 #include "messages/MExportDirPrep.h"
44 #include "messages/MExportDirPrepAck.h"
45 #include "messages/MExportDir.h"
46 #include "messages/MExportDirAck.h"
47 #include "messages/MExportDirNotify.h"
48 #include "messages/MExportDirNotifyAck.h"
49 #include "messages/MExportDirFinish.h"
51 #include "messages/MExportCaps.h"
52 #include "messages/MExportCapsAck.h"
53 #include "messages/MGatherCaps.h"
57 * this is what the dir->dir_auth values look like
62 * me, me me - still me, but preparing for export
63 * me, them me - send MExportDir (peer is preparing)
64 * them, me me - journaled EExport
69 * me, them me - journaled EImportStart
73 * - auth bit is set if i am listed as first _or_ second dir_auth.
76 #include "common/config.h"
79 #define dout_context g_ceph_context
80 #define dout_subsys ceph_subsys_mds
82 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
85 class MigratorContext
: public MDSInternalContextBase
{
88 MDSRank
*get_mds() override
{
92 explicit MigratorContext(Migrator
*mig_
) : mig(mig_
) {
97 class MigratorLogContext
: public MDSLogContextBase
{
100 MDSRank
*get_mds() override
{
104 explicit MigratorLogContext(Migrator
*mig_
) : mig(mig_
) {
109 /* This function DOES put the passed message before returning*/
110 void Migrator::dispatch(Message
*m
)
112 switch (m
->get_type()) {
114 case MSG_MDS_EXPORTDIRDISCOVER
:
115 handle_export_discover(static_cast<MExportDirDiscover
*>(m
));
117 case MSG_MDS_EXPORTDIRPREP
:
118 handle_export_prep(static_cast<MExportDirPrep
*>(m
));
120 case MSG_MDS_EXPORTDIR
:
121 handle_export_dir(static_cast<MExportDir
*>(m
));
123 case MSG_MDS_EXPORTDIRFINISH
:
124 handle_export_finish(static_cast<MExportDirFinish
*>(m
));
126 case MSG_MDS_EXPORTDIRCANCEL
:
127 handle_export_cancel(static_cast<MExportDirCancel
*>(m
));
131 case MSG_MDS_EXPORTDIRDISCOVERACK
:
132 handle_export_discover_ack(static_cast<MExportDirDiscoverAck
*>(m
));
134 case MSG_MDS_EXPORTDIRPREPACK
:
135 handle_export_prep_ack(static_cast<MExportDirPrepAck
*>(m
));
137 case MSG_MDS_EXPORTDIRACK
:
138 handle_export_ack(static_cast<MExportDirAck
*>(m
));
140 case MSG_MDS_EXPORTDIRNOTIFYACK
:
141 handle_export_notify_ack(static_cast<MExportDirNotifyAck
*>(m
));
144 // export 3rd party (dir_auth adjustments)
145 case MSG_MDS_EXPORTDIRNOTIFY
:
146 handle_export_notify(static_cast<MExportDirNotify
*>(m
));
150 case MSG_MDS_EXPORTCAPS
:
151 handle_export_caps(static_cast<MExportCaps
*>(m
));
153 case MSG_MDS_GATHERCAPS
:
154 handle_gather_caps(static_cast<MGatherCaps
*>(m
));
158 derr
<< "migrator unknown message " << m
->get_type() << dendl
;
159 assert(0 == "migrator unknown message");
164 class C_MDC_EmptyImport
: public MigratorContext
{
167 C_MDC_EmptyImport(Migrator
*m
, CDir
*d
) : MigratorContext(m
), dir(d
) {}
168 void finish(int r
) override
{
169 mig
->export_empty_import(dir
);
174 void Migrator::export_empty_import(CDir
*dir
)
176 dout(7) << "export_empty_import " << *dir
<< dendl
;
177 assert(dir
->is_subtree_root());
179 if (dir
->inode
->is_auth()) {
180 dout(7) << " inode is auth" << dendl
;
183 if (!dir
->is_auth()) {
184 dout(7) << " not auth" << dendl
;
187 if (dir
->is_freezing() || dir
->is_frozen()) {
188 dout(7) << " freezing or frozen" << dendl
;
191 if (dir
->get_num_head_items() > 0) {
192 dout(7) << " not actually empty" << dendl
;
195 if (dir
->inode
->is_root()) {
196 dout(7) << " root" << dendl
;
200 mds_rank_t dest
= dir
->inode
->authority().first
;
201 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
203 dout(7) << " really empty, exporting to " << dest
<< dendl
;
204 assert (dest
!= mds
->get_nodeid());
206 dout(7) << "exporting to mds." << dest
207 << " empty import " << *dir
<< dendl
;
208 export_dir( dir
, dest
);
211 void Migrator::find_stale_export_freeze()
213 utime_t now
= ceph_clock_now();
214 utime_t cutoff
= now
;
215 cutoff
-= g_conf
->mds_freeze_tree_timeout
;
219 * We could have situations like:
221 * - mds.0 authpins an item in subtree A
222 * - mds.0 sends request to mds.1 to authpin an item in subtree B
223 * - mds.0 freezes subtree A
224 * - mds.1 authpins an item in subtree B
225 * - mds.1 sends request to mds.0 to authpin an item in subtree A
226 * - mds.1 freezes subtree B
227 * - mds.1 receives the remote authpin request from mds.0
228 * (wait because subtree B is freezing)
229 * - mds.0 receives the remote authpin request from mds.1
230 * (wait because subtree A is freezing)
233 * - client request authpins items in subtree B
235 * - import subtree A which is parent of subtree B
236 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
238 * - client request tries authpinning items in subtree A
239 * (wait because subtree A is freezing)
241 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
242 p
!= export_state
.end(); ) {
243 CDir
* dir
= p
->first
;
244 export_state_t
& stat
= p
->second
;
246 if (stat
.state
!= EXPORT_DISCOVERING
&& stat
.state
!= EXPORT_FREEZING
)
248 if (stat
.last_cum_auth_pins
!= dir
->get_cum_auth_pins()) {
249 stat
.last_cum_auth_pins
= dir
->get_cum_auth_pins();
250 stat
.last_cum_auth_pins_change
= now
;
253 if (stat
.last_cum_auth_pins_change
>= cutoff
)
255 if (stat
.num_remote_waiters
> 0 ||
256 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
257 export_try_cancel(dir
);
262 void Migrator::export_try_cancel(CDir
*dir
, bool notify_peer
)
264 dout(10) << "export_try_cancel " << *dir
<< dendl
;
266 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
267 assert(it
!= export_state
.end());
269 int state
= it
->second
.state
;
272 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl
;
273 it
->second
.state
= EXPORT_CANCELLED
;
274 dir
->auth_unpin(this);
276 case EXPORT_DISCOVERING
:
277 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl
;
278 it
->second
.state
= EXPORT_CANCELLED
;
279 dir
->unfreeze_tree(); // cancel the freeze
280 dir
->auth_unpin(this);
282 (!mds
->is_cluster_degraded() ||
283 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
284 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
287 case EXPORT_FREEZING
:
288 dout(10) << "export state=freezing : canceling freeze" << dendl
;
289 it
->second
.state
= EXPORT_CANCELLED
;
290 dir
->unfreeze_tree(); // cancel the freeze
291 if (dir
->is_subtree_root())
292 cache
->try_subtree_merge(dir
);
294 (!mds
->is_cluster_degraded() ||
295 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
296 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
299 // NOTE: state order reversal, warning comes after prepping
301 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl
;
302 it
->second
.state
= EXPORT_CANCELLING
;
305 case EXPORT_PREPPING
:
306 if (state
!= EXPORT_WARNING
) {
307 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl
;
308 it
->second
.state
= EXPORT_CANCELLED
;
314 cache
->get_subtree_bounds(dir
, bounds
);
315 for (set
<CDir
*>::iterator q
= bounds
.begin();
319 bd
->put(CDir::PIN_EXPORTBOUND
);
320 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
322 if (state
== EXPORT_WARNING
) {
324 export_notify_abort(dir
, bounds
);
325 // process delayed expires
326 cache
->process_delayed_expire(dir
);
329 dir
->unfreeze_tree();
330 cache
->try_subtree_merge(dir
);
332 (!mds
->is_cluster_degraded() ||
333 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
334 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
337 case EXPORT_EXPORTING
:
338 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl
;
339 it
->second
.state
= EXPORT_CANCELLING
;
343 case EXPORT_LOGGINGFINISH
:
344 case EXPORT_NOTIFYING
:
345 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl
;
346 // leave export_state, don't clean up now.
348 case EXPORT_CANCELLING
:
356 if (it
->second
.state
== EXPORT_CANCELLING
||
357 it
->second
.state
== EXPORT_CANCELLED
) {
359 mut
.swap(it
->second
.mut
);
361 if (it
->second
.state
== EXPORT_CANCELLED
) {
362 export_state
.erase(it
);
363 dir
->state_clear(CDir::STATE_EXPORTING
);
364 // send pending import_maps?
365 cache
->maybe_send_pending_resolves();
369 if (state
== EXPORT_LOCKING
|| state
== EXPORT_DISCOVERING
) {
370 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(mut
.get());
372 if (mdr
->more()->waiting_on_slave
.empty())
373 mds
->mdcache
->request_finish(mdr
);
375 mds
->locker
->drop_locks(mut
.get());
379 cache
->show_subtrees();
381 maybe_do_queued_export();
385 void Migrator::export_cancel_finish(CDir
*dir
)
387 assert(dir
->state_test(CDir::STATE_EXPORTING
));
388 dir
->state_clear(CDir::STATE_EXPORTING
);
390 // pinned by Migrator::export_notify_abort()
391 dir
->auth_unpin(this);
392 // send pending import_maps? (these need to go out when all exports have finished.)
393 cache
->maybe_send_pending_resolves();
396 // ==========================================================
397 // mds failure handling
399 void Migrator::handle_mds_failure_or_stop(mds_rank_t who
)
401 dout(5) << "handle_mds_failure_or_stop mds." << who
<< dendl
;
405 // first add an extra auth_pin on any freezes, so that canceling a
406 // nested freeze doesn't complete one further up the hierarchy and
407 // confuse the shit out of us. we'll remove it after canceling the
408 // freeze. this way no freeze completions run before we want them
410 list
<CDir
*> pinned_dirs
;
411 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
412 p
!= export_state
.end();
414 if (p
->second
.state
== EXPORT_FREEZING
) {
415 CDir
*dir
= p
->first
;
416 dout(10) << "adding temp auth_pin on freezing " << *dir
<< dendl
;
418 pinned_dirs
.push_back(dir
);
422 map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
423 while (p
!= export_state
.end()) {
424 map
<CDir
*,export_state_t
>::iterator next
= p
;
426 CDir
*dir
= p
->first
;
429 // - that are going to the failed node
430 // - that aren't frozen yet (to avoid auth_pin deadlock)
431 // - they havne't prepped yet (they may need to discover bounds to do that)
432 if ((p
->second
.peer
== who
&&
433 p
->second
.state
!= EXPORT_CANCELLING
) ||
434 p
->second
.state
== EXPORT_LOCKING
||
435 p
->second
.state
== EXPORT_DISCOVERING
||
436 p
->second
.state
== EXPORT_FREEZING
||
437 p
->second
.state
== EXPORT_PREPPING
) {
438 // the guy i'm exporting to failed, or we're just freezing.
439 dout(10) << "cleaning up export state (" << p
->second
.state
<< ")"
440 << get_export_statename(p
->second
.state
) << " of " << *dir
<< dendl
;
441 export_try_cancel(dir
);
442 } else if (p
->second
.peer
!= who
) {
444 if (p
->second
.warning_ack_waiting
.erase(who
)) {
445 if (p
->second
.state
== EXPORT_WARNING
) {
446 p
->second
.notify_ack_waiting
.erase(who
); // they won't get a notify either.
447 // exporter waiting for warning acks, let's fake theirs.
448 dout(10) << "faking export_warning_ack from mds." << who
449 << " on " << *dir
<< " to mds." << p
->second
.peer
451 if (p
->second
.warning_ack_waiting
.empty())
455 if (p
->second
.notify_ack_waiting
.erase(who
)) {
456 // exporter is waiting for notify acks, fake it
457 dout(10) << "faking export_notify_ack from mds." << who
458 << " on " << *dir
<< " to mds." << p
->second
.peer
460 if (p
->second
.state
== EXPORT_NOTIFYING
) {
461 if (p
->second
.notify_ack_waiting
.empty())
463 } else if (p
->second
.state
== EXPORT_CANCELLING
) {
464 if (p
->second
.notify_ack_waiting
.empty()) {
465 export_state
.erase(p
);
466 export_cancel_finish(dir
);
478 map
<dirfrag_t
,import_state_t
>::iterator q
= import_state
.begin();
479 while (q
!= import_state
.end()) {
480 map
<dirfrag_t
,import_state_t
>::iterator next
= q
;
482 dirfrag_t df
= q
->first
;
483 CInode
*diri
= mds
->mdcache
->get_inode(df
.ino
);
484 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
486 if (q
->second
.peer
== who
) {
488 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
489 << get_import_statename(q
->second
.state
) << " of " << *dir
<< dendl
;
491 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
492 << get_import_statename(q
->second
.state
) << " of " << df
<< dendl
;
494 switch (q
->second
.state
) {
495 case IMPORT_DISCOVERING
:
496 dout(10) << "import state=discovering : clearing state" << dendl
;
497 import_reverse_discovering(df
);
500 case IMPORT_DISCOVERED
:
502 dout(10) << "import state=discovered : unpinning inode " << *diri
<< dendl
;
503 import_reverse_discovered(df
, diri
);
506 case IMPORT_PREPPING
:
508 dout(10) << "import state=prepping : unpinning base+bounds " << *dir
<< dendl
;
509 import_reverse_prepping(dir
);
514 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir
<< dendl
;
517 cache
->get_subtree_bounds(dir
, bounds
);
518 import_remove_pins(dir
, bounds
);
520 // adjust auth back to the exporter
521 cache
->adjust_subtree_auth(dir
, q
->second
.peer
);
523 // notify bystanders ; wait in aborting state
524 import_state
[df
].state
= IMPORT_ABORTING
;
525 import_notify_abort(dir
, bounds
);
526 assert(g_conf
->mds_kill_import_at
!= 10);
530 case IMPORT_LOGGINGSTART
:
532 dout(10) << "import state=loggingstart : reversing import on " << *dir
<< dendl
;
538 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
539 dout(10) << "import state=acking : noting ambiguous import " << *dir
<< dendl
;
542 cache
->get_subtree_bounds(dir
, bounds
);
543 cache
->add_ambiguous_import(dir
, bounds
);
547 case IMPORT_FINISHING
:
549 dout(10) << "import state=finishing : finishing import on " << *dir
<< dendl
;
550 import_finish(dir
, true);
553 case IMPORT_ABORTING
:
555 dout(10) << "import state=aborting : ignoring repeat failure " << *dir
<< dendl
;
559 auto bystanders_entry
= q
->second
.bystanders
.find(who
);
560 if (bystanders_entry
!= q
->second
.bystanders
.end()) {
561 q
->second
.bystanders
.erase(bystanders_entry
);
562 if (q
->second
.state
== IMPORT_ABORTING
) {
564 dout(10) << "faking export_notify_ack from mds." << who
565 << " on aborting import " << *dir
<< " from mds." << q
->second
.peer
567 if (q
->second
.bystanders
.empty())
568 import_reverse_unfreeze(dir
);
577 while (!pinned_dirs
.empty()) {
578 CDir
*dir
= pinned_dirs
.front();
579 dout(10) << "removing temp auth_pin on " << *dir
<< dendl
;
580 dir
->auth_unpin(this);
581 pinned_dirs
.pop_front();
587 void Migrator::show_importing()
589 dout(10) << "show_importing" << dendl
;
590 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
591 p
!= import_state
.end();
593 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
595 dout(10) << " importing from " << p
->second
.peer
596 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
597 << " " << p
->first
<< " " << *dir
<< dendl
;
599 dout(10) << " importing from " << p
->second
.peer
600 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
601 << " " << p
->first
<< dendl
;
606 void Migrator::show_exporting()
608 dout(10) << "show_exporting" << dendl
;
609 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
610 p
!= export_state
.end();
612 dout(10) << " exporting to " << p
->second
.peer
613 << ": (" << p
->second
.state
<< ") " << get_export_statename(p
->second
.state
)
614 << " " << p
->first
->dirfrag() << " " << *p
->first
<< dendl
;
619 void Migrator::audit()
621 if (!g_conf
->subsys
.should_gather(ceph_subsys_mds
, 5))
626 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
627 p
!= import_state
.end();
629 if (p
->second
.state
== IMPORT_DISCOVERING
)
631 if (p
->second
.state
== IMPORT_DISCOVERED
) {
632 CInode
*in
= cache
->get_inode(p
->first
.ino
);
636 CDir
*dir
= cache
->get_dirfrag(p
->first
);
638 if (p
->second
.state
== IMPORT_PREPPING
)
640 if (p
->second
.state
== IMPORT_ABORTING
) {
641 assert(!dir
->is_ambiguous_dir_auth());
642 assert(dir
->get_dir_auth().first
!= mds
->get_nodeid());
645 assert(dir
->is_ambiguous_dir_auth());
646 assert(dir
->authority().first
== mds
->get_nodeid() ||
647 dir
->authority().second
== mds
->get_nodeid());
652 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
653 p
!= export_state
.end();
655 CDir
*dir
= p
->first
;
656 if (p
->second
.state
== EXPORT_LOCKING
||
657 p
->second
.state
== EXPORT_DISCOVERING
||
658 p
->second
.state
== EXPORT_FREEZING
||
659 p
->second
.state
== EXPORT_CANCELLING
)
661 assert(dir
->is_ambiguous_dir_auth());
662 assert(dir
->authority().first
== mds
->get_nodeid() ||
663 dir
->authority().second
== mds
->get_nodeid());
666 // ambiguous+me subtrees should be importing|exporting
675 // ==========================================================
678 void Migrator::export_dir_nicely(CDir
*dir
, mds_rank_t dest
)
681 dout(7) << "export_dir_nicely " << *dir
<< " to " << dest
<< dendl
;
682 export_queue
.push_back(pair
<dirfrag_t
,mds_rank_t
>(dir
->dirfrag(), dest
));
684 maybe_do_queued_export();
687 void Migrator::maybe_do_queued_export()
693 while (!export_queue
.empty() &&
694 export_state
.size() <= 4) {
695 dirfrag_t df
= export_queue
.front().first
;
696 mds_rank_t dest
= export_queue
.front().second
;
697 export_queue
.pop_front();
699 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
701 if (!dir
->is_auth()) continue;
703 dout(0) << "nicely exporting to mds." << dest
<< " " << *dir
<< dendl
;
705 export_dir(dir
, dest
);
713 class C_MDC_ExportFreeze
: public MigratorContext
{
714 CDir
*ex
; // dir i'm exporting
717 C_MDC_ExportFreeze(Migrator
*m
, CDir
*e
, uint64_t t
) :
718 MigratorContext(m
), ex(e
), tid(t
) {
721 void finish(int r
) override
{
723 mig
->export_frozen(ex
, tid
);
728 void Migrator::get_export_lock_set(CDir
*dir
, set
<SimpleLock
*>& locks
)
731 vector
<CDentry
*> trace
;
732 cache
->make_trace(trace
, dir
->inode
);
733 for (vector
<CDentry
*>::iterator it
= trace
.begin();
736 locks
.insert(&(*it
)->lock
);
738 // prevent scatter gather race
739 locks
.insert(&dir
->get_inode()->dirfragtreelock
);
742 // NOTE: We need to take an rdlock on bounding dirfrags during
743 // migration for a rather irritating reason: when we export the
744 // bound inode, we need to send scatterlock state for the dirfrags
745 // as well, so that the new auth also gets the correct info. If we
746 // race with a refragment, this info is useless, as we can't
747 // redivvy it up. And it's needed for the scatterlocks to work
748 // properly: when the auth is in a sync/lock state it keeps each
749 // dirfrag's portion in the local (auth OR replica) dirfrag.
750 set
<CDir
*> wouldbe_bounds
;
751 cache
->get_wouldbe_subtree_bounds(dir
, wouldbe_bounds
);
752 for (set
<CDir
*>::iterator p
= wouldbe_bounds
.begin(); p
!= wouldbe_bounds
.end(); ++p
)
753 locks
.insert(&(*p
)->get_inode()->dirfragtreelock
);
757 class C_M_ExportDirWait
: public MigratorContext
{
761 C_M_ExportDirWait(Migrator
*m
, MDRequestRef mdr
, int count
)
762 : MigratorContext(m
), mdr(mdr
), count(count
) {}
763 void finish(int r
) override
{
764 mig
->dispatch_export_dir(mdr
, count
);
769 /** export_dir(dir, dest)
770 * public method to initiate an export.
771 * will fail if the directory is freezing, frozen, unpinnable, or root.
773 void Migrator::export_dir(CDir
*dir
, mds_rank_t dest
)
775 dout(7) << "export_dir " << *dir
<< " to " << dest
<< dendl
;
776 assert(dir
->is_auth());
777 assert(dest
!= mds
->get_nodeid());
779 if (mds
->mdcache
->is_readonly()) {
780 dout(7) << "read-only FS, no exports for now" << dendl
;
783 if (!mds
->mdsmap
->is_active(dest
)) {
784 dout(7) << "dest not active, no exports for now" << dendl
;
787 if (mds
->is_cluster_degraded()) {
788 dout(7) << "cluster degraded, no exports for now" << dendl
;
791 if (dir
->inode
->is_system()) {
792 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl
;
797 if (!dir
->inode
->is_base() && dir
->inode
->get_projected_parent_dir()->inode
->is_stray() &&
798 dir
->inode
->get_projected_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest
)) {
799 dout(7) << "i won't export anything in stray" << dendl
;
803 if (dir
->is_frozen() ||
804 dir
->is_freezing()) {
805 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl
;
808 if (dir
->state_test(CDir::STATE_EXPORTING
)) {
809 dout(7) << "already exporting" << dendl
;
813 if (!mds
->is_stopping() && !dir
->inode
->is_exportable(dest
)) {
814 dout(7) << "dir is export pinned" << dendl
;
818 if (dest
== mds
->get_nodeid() || !mds
->mdsmap
->is_up(dest
)) {
819 dout(7) << "cannot export: dest " << dest
<< " is me or is not active" << dendl
;
823 if (g_conf
->mds_thrash_exports
) {
824 // create random subtree bound (which will not be exported)
826 for (auto p
= dir
->begin(); p
!= dir
->end(); ++p
) {
828 CDentry::linkage_t
*dnl
= dn
->get_linkage();
829 if (dnl
->is_primary()) {
830 CInode
*in
= dnl
->get_inode();
832 in
->get_nested_dirfrags(ls
);
836 int n
= rand() % ls
.size();
840 if (!(bd
->is_frozen() || bd
->is_freezing())) {
841 assert(bd
->is_auth());
842 dir
->state_set(CDir::STATE_AUXSUBTREE
);
843 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
844 dout(0) << "export_dir: create aux subtree " << *bd
<< " under " << *dir
<< dendl
;
849 mds
->hit_export_target(ceph_clock_now(), dest
, -1);
852 dir
->state_set(CDir::STATE_EXPORTING
);
854 MDRequestRef mdr
= mds
->mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
855 mdr
->more()->export_dir
= dir
;
857 assert(export_state
.count(dir
) == 0);
858 export_state_t
& stat
= export_state
[dir
];
859 stat
.state
= EXPORT_LOCKING
;
861 stat
.tid
= mdr
->reqid
.tid
;
864 return mds
->mdcache
->dispatch_request(mdr
);
867 void Migrator::dispatch_export_dir(MDRequestRef
& mdr
, int count
)
869 dout(7) << "dispatch_export_dir " << *mdr
<< dendl
;
871 CDir
*dir
= mdr
->more()->export_dir
;
872 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
873 if (it
== export_state
.end() || it
->second
.tid
!= mdr
->reqid
.tid
) {
874 // export must have aborted.
875 dout(7) << "export must have aborted " << *mdr
<< dendl
;
876 mds
->mdcache
->request_finish(mdr
);
879 assert(it
->second
.state
== EXPORT_LOCKING
);
881 mds_rank_t dest
= it
->second
.peer
;
883 if (!mds
->is_export_target(dest
)) {
884 dout(7) << "dest is not yet an export target" << dendl
;
886 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl
;
887 export_try_cancel(dir
);
891 mds
->locker
->drop_locks(mdr
.get());
892 mdr
->drop_local_auth_pins();
894 mds
->wait_for_mdsmap(mds
->mdsmap
->get_epoch(), new C_M_ExportDirWait(this, mdr
, count
+1));
898 if (!dir
->inode
->get_parent_dn()) {
899 dout(7) << "waiting for dir to become stable before export: " << *dir
<< dendl
;
900 dir
->add_waiter(CDir::WAIT_CREATED
, new C_M_ExportDirWait(this, mdr
, 1));
904 if (mdr
->aborted
|| dir
->is_frozen() || dir
->is_freezing()) {
905 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl
;
906 export_try_cancel(dir
);
911 set
<SimpleLock
*> rdlocks
;
912 set
<SimpleLock
*> xlocks
;
913 set
<SimpleLock
*> wrlocks
;
914 get_export_lock_set(dir
, rdlocks
);
915 // If auth MDS of the subtree root inode is neither the exporter MDS
916 // nor the importer MDS and it gathers subtree root's fragstat/neststat
917 // while the subtree is exporting. It's possible that the exporter MDS
918 // and the importer MDS both are auth MDS of the subtree root or both
919 // are not auth MDS of the subtree root at the time they receive the
920 // lock messages. So the auth MDS of the subtree root inode may get no
921 // or duplicated fragstat/neststat for the subtree root dirfrag.
922 wrlocks
.insert(&dir
->get_inode()->filelock
);
923 wrlocks
.insert(&dir
->get_inode()->nestlock
);
924 if (dir
->get_inode()->is_auth()) {
925 dir
->get_inode()->filelock
.set_scatter_wanted();
926 dir
->get_inode()->nestlock
.set_scatter_wanted();
929 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
, NULL
, NULL
, true)) {
931 export_try_cancel(dir
);
935 assert(g_conf
->mds_kill_export_at
!= 1);
936 it
->second
.state
= EXPORT_DISCOVERING
;
938 // send ExportDirDiscover (ask target)
940 dir
->inode
->make_path(path
);
941 MExportDirDiscover
*discover
= new MExportDirDiscover(dir
->dirfrag(), path
,
944 mds
->send_message_mds(discover
, dest
);
945 assert(g_conf
->mds_kill_export_at
!= 2);
947 it
->second
.last_cum_auth_pins_change
= ceph_clock_now();
949 // start the freeze, but hold it up with an auth_pin.
951 assert(dir
->is_freezing_tree());
952 dir
->add_waiter(CDir::WAIT_FROZEN
, new C_MDC_ExportFreeze(this, dir
, it
->second
.tid
));
956 * called on receipt of MExportDirDiscoverAck
957 * the importer now has the directory's _inode_ in memory, and pinned.
959 * This function DOES put the passed message before returning
961 void Migrator::handle_export_discover_ack(MExportDirDiscoverAck
*m
)
963 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
964 mds_rank_t
dest(m
->get_source().num());
965 utime_t now
= ceph_clock_now();
968 dout(7) << "export_discover_ack from " << m
->get_source()
969 << " on " << *dir
<< dendl
;
971 mds
->hit_export_target(now
, dest
, -1);
973 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
974 if (it
== export_state
.end() ||
975 it
->second
.tid
!= m
->get_tid() ||
976 it
->second
.peer
!= dest
) {
977 dout(7) << "must have aborted" << dendl
;
979 assert(it
->second
.state
== EXPORT_DISCOVERING
);
981 if (m
->is_success()) {
982 // release locks to avoid deadlock
983 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(it
->second
.mut
.get());
985 mds
->mdcache
->request_finish(mdr
);
986 it
->second
.mut
.reset();
987 // freeze the subtree
988 it
->second
.state
= EXPORT_FREEZING
;
989 dir
->auth_unpin(this);
990 assert(g_conf
->mds_kill_export_at
!= 3);
993 dout(7) << "peer failed to discover (not active?), canceling" << dendl
;
994 export_try_cancel(dir
, false);
1001 class C_M_ExportSessionsFlushed
: public MigratorContext
{
1005 C_M_ExportSessionsFlushed(Migrator
*m
, CDir
*d
, uint64_t t
)
1006 : MigratorContext(m
), dir(d
), tid(t
) {
1007 assert(dir
!= NULL
);
1009 void finish(int r
) override
{
1010 mig
->export_sessions_flushed(dir
, tid
);
1014 void Migrator::export_sessions_flushed(CDir
*dir
, uint64_t tid
)
1016 dout(7) << "export_sessions_flushed " << *dir
<< dendl
;
1018 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1019 if (it
== export_state
.end() ||
1020 it
->second
.state
== EXPORT_CANCELLING
||
1021 it
->second
.tid
!= tid
) {
1022 // export must have aborted.
1023 dout(7) << "export must have aborted on " << dir
<< dendl
;
1027 assert(it
->second
.state
== EXPORT_PREPPING
|| it
->second
.state
== EXPORT_WARNING
);
1028 assert(it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0);
1029 it
->second
.warning_ack_waiting
.erase(MDS_RANK_NONE
);
1030 if (it
->second
.state
== EXPORT_WARNING
&& it
->second
.warning_ack_waiting
.empty())
1031 export_go(dir
); // start export.
1034 void Migrator::export_frozen(CDir
*dir
, uint64_t tid
)
1036 dout(7) << "export_frozen on " << *dir
<< dendl
;
1038 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1039 if (it
== export_state
.end() || it
->second
.tid
!= tid
) {
1040 dout(7) << "export must have aborted" << dendl
;
1044 assert(it
->second
.state
== EXPORT_FREEZING
);
1045 assert(dir
->is_frozen_tree_root());
1046 assert(dir
->get_cum_auth_pins() == 0);
1048 CInode
*diri
= dir
->get_inode();
1050 // ok, try to grab all my locks.
1051 set
<SimpleLock
*> rdlocks
;
1052 get_export_lock_set(dir
, rdlocks
);
1053 if ((diri
->is_auth() && diri
->is_frozen()) ||
1054 !mds
->locker
->can_rdlock_set(rdlocks
) ||
1055 !diri
->filelock
.can_wrlock(-1) ||
1056 !diri
->nestlock
.can_wrlock(-1)) {
1057 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1060 dir
->unfreeze_tree();
1061 cache
->try_subtree_merge(dir
);
1063 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
1064 export_state
.erase(it
);
1066 dir
->state_clear(CDir::STATE_EXPORTING
);
1067 cache
->maybe_send_pending_resolves();
1071 it
->second
.mut
= new MutationImpl();
1072 if (diri
->is_auth())
1073 it
->second
.mut
->auth_pin(diri
);
1074 mds
->locker
->rdlock_take_set(rdlocks
, it
->second
.mut
);
1075 mds
->locker
->wrlock_force(&diri
->filelock
, it
->second
.mut
);
1076 mds
->locker
->wrlock_force(&diri
->nestlock
, it
->second
.mut
);
1078 cache
->show_subtrees();
1080 // CDir::_freeze_tree() should have forced it into subtree.
1081 assert(dir
->get_dir_auth() == mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
1084 cache
->get_subtree_bounds(dir
, bounds
);
1086 // generate prep message, log entry.
1087 MExportDirPrep
*prep
= new MExportDirPrep(dir
->dirfrag(), it
->second
.tid
);
1089 // include list of bystanders
1090 for (compact_map
<mds_rank_t
,unsigned>::iterator p
= dir
->replicas_begin();
1091 p
!= dir
->replicas_end();
1093 if (p
->first
!= it
->second
.peer
) {
1094 dout(10) << "bystander mds." << p
->first
<< dendl
;
1095 prep
->add_bystander(p
->first
);
1099 // include base dirfrag
1100 cache
->replicate_dir(dir
, it
->second
.peer
, prep
->basedir
);
1103 * include spanning tree for all nested exports.
1104 * these need to be on the destination _before_ the final export so that
1105 * dir_auth updates on any nested exports are properly absorbed.
1106 * this includes inodes and dirfrags included in the subtree, but
1107 * only the inodes at the bounds.
1109 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1111 set
<inodeno_t
> inodes_added
;
1112 set
<dirfrag_t
> dirfrags_added
;
1115 for (set
<CDir
*>::iterator p
= bounds
.begin();
1121 bound
->get(CDir::PIN_EXPORTBOUND
);
1122 bound
->state_set(CDir::STATE_EXPORTBOUND
);
1124 dout(7) << " export bound " << *bound
<< dendl
;
1125 prep
->add_bound( bound
->dirfrag() );
1133 // don't repeat inodes
1134 if (inodes_added
.count(cur
->inode
->ino()))
1136 inodes_added
.insert(cur
->inode
->ino());
1138 // prepend dentry + inode
1139 assert(cur
->inode
->is_auth());
1141 cache
->replicate_dentry(cur
->inode
->parent
, it
->second
.peer
, bl
);
1142 dout(7) << " added " << *cur
->inode
->parent
<< dendl
;
1143 cache
->replicate_inode(cur
->inode
, it
->second
.peer
, bl
,
1144 mds
->mdsmap
->get_up_features());
1145 dout(7) << " added " << *cur
->inode
<< dendl
;
1146 bl
.claim_append(tracebl
);
1149 cur
= cur
->get_parent_dir();
1151 // don't repeat dirfrags
1152 if (dirfrags_added
.count(cur
->dirfrag()) ||
1154 start
= 'd'; // start with dentry
1157 dirfrags_added
.insert(cur
->dirfrag());
1160 cache
->replicate_dir(cur
, it
->second
.peer
, bl
);
1161 dout(7) << " added " << *cur
<< dendl
;
1162 bl
.claim_append(tracebl
);
1165 start
= 'f'; // start with dirfrag
1167 bufferlist final_bl
;
1168 dirfrag_t df
= cur
->dirfrag();
1169 ::encode(df
, final_bl
);
1170 ::encode(start
, final_bl
);
1171 final_bl
.claim_append(tracebl
);
1172 prep
->add_trace(final_bl
);
1176 it
->second
.state
= EXPORT_PREPPING
;
1177 mds
->send_message_mds(prep
, it
->second
.peer
);
1178 assert (g_conf
->mds_kill_export_at
!= 4);
1180 // make sure any new instantiations of caps are flushed out
1181 assert(it
->second
.warning_ack_waiting
.empty());
1183 set
<client_t
> export_client_set
;
1184 get_export_client_set(dir
, export_client_set
);
1186 MDSGatherBuilder
gather(g_ceph_context
);
1187 mds
->server
->flush_client_sessions(export_client_set
, gather
);
1188 if (gather
.has_subs()) {
1189 it
->second
.warning_ack_waiting
.insert(MDS_RANK_NONE
);
1190 gather
.set_finisher(new C_M_ExportSessionsFlushed(this, dir
, it
->second
.tid
));
1195 void Migrator::get_export_client_set(CDir
*dir
, set
<client_t
>& client_set
)
1199 while (!dfs
.empty()) {
1200 CDir
*dir
= dfs
.front();
1202 for (CDir::map_t::iterator p
= dir
->begin(); p
!= dir
->end(); ++p
) {
1203 CDentry
*dn
= p
->second
;
1204 if (!dn
->get_linkage()->is_primary())
1206 CInode
*in
= dn
->get_linkage()->get_inode();
1210 in
->get_dirfrags(ls
);
1211 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
1212 if (!(*q
)->state_test(CDir::STATE_EXPORTBOUND
)) {
1213 // include nested dirfrag
1214 assert((*q
)->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1215 dfs
.push_back(*q
); // it's ours, recurse (later)
1219 for (map
<client_t
, Capability
*>::iterator q
= in
->client_caps
.begin();
1220 q
!= in
->client_caps
.end();
1222 client_set
.insert(q
->first
);
1227 void Migrator::get_export_client_set(CInode
*in
, set
<client_t
>& client_set
)
1229 for (map
<client_t
, Capability
*>::iterator q
= in
->client_caps
.begin();
1230 q
!= in
->client_caps
.end();
1232 client_set
.insert(q
->first
);
1235 /* This function DOES put the passed message before returning*/
1236 void Migrator::handle_export_prep_ack(MExportDirPrepAck
*m
)
1238 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1239 mds_rank_t
dest(m
->get_source().num());
1240 utime_t now
= ceph_clock_now();
1243 dout(7) << "export_prep_ack " << *dir
<< dendl
;
1245 mds
->hit_export_target(now
, dest
, -1);
1247 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1248 if (it
== export_state
.end() ||
1249 it
->second
.tid
!= m
->get_tid() ||
1250 it
->second
.peer
!= mds_rank_t(m
->get_source().num())) {
1251 // export must have aborted.
1252 dout(7) << "export must have aborted" << dendl
;
1256 assert(it
->second
.state
== EXPORT_PREPPING
);
1258 if (!m
->is_success()) {
1259 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl
;
1260 export_try_cancel(dir
, false);
1265 assert (g_conf
->mds_kill_export_at
!= 5);
1268 cache
->get_subtree_bounds(dir
, bounds
);
1270 assert(it
->second
.warning_ack_waiting
.empty() ||
1271 (it
->second
.warning_ack_waiting
.size() == 1 &&
1272 it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0));
1273 assert(it
->second
.notify_ack_waiting
.empty());
1275 for (compact_map
<mds_rank_t
,unsigned>::iterator p
= dir
->replicas_begin();
1276 p
!= dir
->replicas_end();
1278 if (p
->first
== it
->second
.peer
) continue;
1279 if (mds
->is_cluster_degraded() &&
1280 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(p
->first
))
1281 continue; // only if active
1282 it
->second
.warning_ack_waiting
.insert(p
->first
);
1283 it
->second
.notify_ack_waiting
.insert(p
->first
); // we'll eventually get a notifyack, too!
1285 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), it
->second
.tid
, true,
1286 mds_authority_t(mds
->get_nodeid(),CDIR_AUTH_UNKNOWN
),
1287 mds_authority_t(mds
->get_nodeid(),it
->second
.peer
));
1288 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
1289 notify
->get_bounds().push_back((*q
)->dirfrag());
1290 mds
->send_message_mds(notify
, p
->first
);
1294 it
->second
.state
= EXPORT_WARNING
;
1296 assert(g_conf
->mds_kill_export_at
!= 6);
1298 if (it
->second
.warning_ack_waiting
.empty())
1299 export_go(dir
); // start export.
1306 class C_M_ExportGo
: public MigratorContext
{
1310 C_M_ExportGo(Migrator
*m
, CDir
*d
, uint64_t t
) :
1311 MigratorContext(m
), dir(d
), tid(t
) {
1312 assert(dir
!= NULL
);
1314 void finish(int r
) override
{
1315 mig
->export_go_synced(dir
, tid
);
1319 void Migrator::export_go(CDir
*dir
)
1321 assert(export_state
.count(dir
));
1322 dout(7) << "export_go " << *dir
<< " to " << export_state
[dir
].peer
<< dendl
;
1324 // first sync log to flush out e.g. any cap imports
1325 mds
->mdlog
->wait_for_safe(new C_M_ExportGo(this, dir
, export_state
[dir
].tid
));
1326 mds
->mdlog
->flush();
1329 void Migrator::export_go_synced(CDir
*dir
, uint64_t tid
)
1331 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1332 if (it
== export_state
.end() ||
1333 it
->second
.state
== EXPORT_CANCELLING
||
1334 it
->second
.tid
!= tid
) {
1335 // export must have aborted.
1336 dout(7) << "export must have aborted on " << dir
<< dendl
;
1339 assert(it
->second
.state
== EXPORT_WARNING
);
1340 mds_rank_t dest
= it
->second
.peer
;
1342 dout(7) << "export_go_synced " << *dir
<< " to " << dest
<< dendl
;
1344 cache
->show_subtrees();
1346 it
->second
.state
= EXPORT_EXPORTING
;
1347 assert(g_conf
->mds_kill_export_at
!= 7);
1349 assert(dir
->is_frozen_tree_root());
1350 assert(dir
->get_cum_auth_pins() == 0);
1352 // set ambiguous auth
1353 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), dest
);
1355 // take away the popularity we're sending.
1356 utime_t now
= ceph_clock_now();
1357 mds
->balancer
->subtract_export(dir
, now
);
1359 // fill export message with cache data
1360 MExportDir
*req
= new MExportDir(dir
->dirfrag(), it
->second
.tid
);
1361 map
<client_t
,entity_inst_t
> exported_client_map
;
1362 uint64_t num_exported_inodes
= encode_export_dir(req
->export_data
,
1363 dir
, // recur start point
1364 exported_client_map
,
1366 ::encode(exported_client_map
, req
->client_map
,
1367 mds
->mdsmap
->get_up_features());
1369 // add bounds to message
1371 cache
->get_subtree_bounds(dir
, bounds
);
1372 for (set
<CDir
*>::iterator p
= bounds
.begin();
1375 req
->add_export((*p
)->dirfrag());
1378 mds
->send_message_mds(req
, dest
);
1379 assert(g_conf
->mds_kill_export_at
!= 8);
1381 mds
->hit_export_target(now
, dest
, num_exported_inodes
+1);
1384 if (mds
->logger
) mds
->logger
->inc(l_mds_exported
);
1385 if (mds
->logger
) mds
->logger
->inc(l_mds_exported_inodes
, num_exported_inodes
);
1387 cache
->show_subtrees();
1391 /** encode_export_inode
1392 * update our local state for this inode to export.
1393 * encode relevant state to be sent over the wire.
1394 * used by: encode_export_dir, file_rename (if foreign)
1396 * FIXME: the separation between CInode.encode_export and these methods
1397 * is pretty arbitrary and dumb.
1399 void Migrator::encode_export_inode(CInode
*in
, bufferlist
& enc_state
,
1400 map
<client_t
,entity_inst_t
>& exported_client_map
)
1402 dout(7) << "encode_export_inode " << *in
<< dendl
;
1403 assert(!in
->is_replica(mds
->get_nodeid()));
1406 if (!in
->is_replicated()) {
1407 in
->replicate_relax_locks();
1408 dout(20) << " did replicate_relax_locks, now " << *in
<< dendl
;
1411 ::encode(in
->inode
.ino
, enc_state
);
1412 ::encode(in
->last
, enc_state
);
1413 in
->encode_export(enc_state
);
1416 encode_export_inode_caps(in
, true, enc_state
, exported_client_map
);
1419 void Migrator::encode_export_inode_caps(CInode
*in
, bool auth_cap
, bufferlist
& bl
,
1420 map
<client_t
,entity_inst_t
>& exported_client_map
)
1422 dout(20) << "encode_export_inode_caps " << *in
<< dendl
;
1425 map
<client_t
,Capability::Export
> cap_map
;
1426 in
->export_client_caps(cap_map
);
1427 ::encode(cap_map
, bl
);
1429 ::encode(in
->get_mds_caps_wanted(), bl
);
1431 in
->state_set(CInode::STATE_EXPORTINGCAPS
);
1432 in
->get(CInode::PIN_EXPORTINGCAPS
);
1435 // make note of clients named by exported capabilities
1436 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1437 it
!= in
->client_caps
.end();
1439 exported_client_map
[it
->first
] = mds
->sessionmap
.get_inst(entity_name_t::CLIENT(it
->first
.v
));
1442 void Migrator::finish_export_inode_caps(CInode
*in
, mds_rank_t peer
,
1443 map
<client_t
,Capability::Import
>& peer_imported
)
1445 dout(20) << "finish_export_inode_caps " << *in
<< dendl
;
1447 in
->state_clear(CInode::STATE_EXPORTINGCAPS
);
1448 in
->put(CInode::PIN_EXPORTINGCAPS
);
1450 // tell (all) clients about migrating caps..
1451 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1452 it
!= in
->client_caps
.end();
1454 Capability
*cap
= it
->second
;
1455 dout(7) << "finish_export_inode_caps telling client." << it
->first
1456 << " exported caps on " << *in
<< dendl
;
1457 MClientCaps
*m
= new MClientCaps(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
1458 cap
->get_cap_id(), cap
->get_mseq(), mds
->get_osd_epoch_barrier());
1460 map
<client_t
,Capability::Import
>::iterator q
= peer_imported
.find(it
->first
);
1461 assert(q
!= peer_imported
.end());
1462 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
, peer
, 0);
1463 mds
->send_message_client_counted(m
, it
->first
);
1465 in
->clear_client_caps_after_export();
1466 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
1469 void Migrator::finish_export_inode(CInode
*in
, utime_t now
, mds_rank_t peer
,
1470 map
<client_t
,Capability::Import
>& peer_imported
,
1471 list
<MDSInternalContextBase
*>& finished
)
1473 dout(12) << "finish_export_inode " << *in
<< dendl
;
1479 // clear/unpin cached_by (we're no longer the authority)
1480 in
->clear_replica_map();
1482 // twiddle lock states for auth -> replica transition
1483 in
->authlock
.export_twiddle();
1484 in
->linklock
.export_twiddle();
1485 in
->dirfragtreelock
.export_twiddle();
1486 in
->filelock
.export_twiddle();
1487 in
->nestlock
.export_twiddle();
1488 in
->xattrlock
.export_twiddle();
1489 in
->snaplock
.export_twiddle();
1490 in
->flocklock
.export_twiddle();
1491 in
->policylock
.export_twiddle();
1494 assert(in
->is_auth());
1495 in
->state_clear(CInode::STATE_AUTH
);
1496 in
->replica_nonce
= CInode::EXPORT_NONCE
;
1498 in
->clear_dirty_rstat();
1500 // no more auth subtree? clear scatter dirty
1501 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid()))
1502 in
->clear_scatter_dirty();
1504 in
->item_open_file
.remove_myself();
1506 in
->clear_dirty_parent();
1508 in
->clear_file_locks();
1511 in
->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
1513 in
->finish_export(now
);
1515 finish_export_inode_caps(in
, peer
, peer_imported
);
1518 uint64_t Migrator::encode_export_dir(bufferlist
& exportbl
,
1520 map
<client_t
,entity_inst_t
>& exported_client_map
,
1523 uint64_t num_exported
= 0;
1525 dout(7) << "encode_export_dir " << *dir
<< " " << dir
->get_num_head_items() << " head items" << dendl
;
1527 assert(dir
->get_projected_version() == dir
->get_version());
1529 #ifdef MDS_VERIFY_FRAGSTAT
1530 if (dir
->is_complete())
1531 dir
->verify_fragstat();
1535 dirfrag_t df
= dir
->dirfrag();
1536 ::encode(df
, exportbl
);
1537 dir
->encode_export(exportbl
);
1539 __u32 nden
= dir
->items
.size();
1540 ::encode(nden
, exportbl
);
1543 list
<CDir
*> subdirs
;
1544 CDir::map_t::iterator it
;
1545 for (it
= dir
->begin(); it
!= dir
->end(); ++it
) {
1546 CDentry
*dn
= it
->second
;
1547 CInode
*in
= dn
->get_linkage()->get_inode();
1549 if (!dn
->is_replicated())
1550 dn
->lock
.replicate_relax();
1555 dout(7) << "encode_export_dir exporting " << *dn
<< dendl
;
1558 ::encode(dn
->name
, exportbl
);
1559 ::encode(dn
->last
, exportbl
);
1562 dn
->encode_export(exportbl
);
1567 if (dn
->get_linkage()->is_null()) {
1568 exportbl
.append("N", 1); // null dentry
1572 if (dn
->get_linkage()->is_remote()) {
1574 exportbl
.append("L", 1); // remote link
1576 inodeno_t ino
= dn
->get_linkage()->get_remote_ino();
1577 unsigned char d_type
= dn
->get_linkage()->get_remote_d_type();
1578 ::encode(ino
, exportbl
);
1579 ::encode(d_type
, exportbl
);
1585 exportbl
.append("I", 1); // inode dentry
1587 encode_export_inode(in
, exportbl
, exported_client_map
); // encode, and (update state for) export
1591 in
->get_dirfrags(dfs
);
1592 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
1594 if (!t
->state_test(CDir::STATE_EXPORTBOUND
)) {
1595 // include nested dirfrag
1596 assert(t
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1597 subdirs
.push_back(t
); // it's ours, recurse (later)
1603 for (list
<CDir
*>::iterator it
= subdirs
.begin(); it
!= subdirs
.end(); ++it
)
1604 num_exported
+= encode_export_dir(exportbl
, *it
, exported_client_map
, now
);
1606 return num_exported
;
1609 void Migrator::finish_export_dir(CDir
*dir
, utime_t now
, mds_rank_t peer
,
1610 map
<inodeno_t
,map
<client_t
,Capability::Import
> >& peer_imported
,
1611 list
<MDSInternalContextBase
*>& finished
, int *num_dentries
)
1613 dout(10) << "finish_export_dir " << *dir
<< dendl
;
1616 dir
->clear_replica_map();
1619 assert(dir
->is_auth());
1620 dir
->state_clear(CDir::STATE_AUTH
);
1621 dir
->remove_bloom();
1622 dir
->replica_nonce
= CDir::EXPORT_NONCE
;
1624 if (dir
->is_dirty())
1627 // suck up all waiters
1628 dir
->take_waiting(CDir::WAIT_ANY_MASK
, finished
); // all dir waiters
1631 dir
->finish_export(now
);
1634 list
<CDir
*> subdirs
;
1635 CDir::map_t::iterator it
;
1636 for (it
= dir
->begin(); it
!= dir
->end(); ++it
) {
1637 CDentry
*dn
= it
->second
;
1638 CInode
*in
= dn
->get_linkage()->get_inode();
1641 dn
->finish_export();
1644 if (dn
->get_linkage()->is_primary()) {
1645 finish_export_inode(in
, now
, peer
, peer_imported
[in
->ino()], finished
);
1648 in
->get_nested_dirfrags(subdirs
);
1651 cache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
1656 for (list
<CDir
*>::iterator it
= subdirs
.begin(); it
!= subdirs
.end(); ++it
)
1657 finish_export_dir(*it
, now
, peer
, peer_imported
, finished
, num_dentries
);
1660 class C_MDS_ExportFinishLogged
: public MigratorLogContext
{
1663 C_MDS_ExportFinishLogged(Migrator
*m
, CDir
*d
) : MigratorLogContext(m
), dir(d
) {}
1664 void finish(int r
) override
{
1665 mig
->export_logged_finish(dir
);
1671 * i should get an export_ack from the export target.
1673 * This function DOES put the passed message before returning
1675 void Migrator::handle_export_ack(MExportDirAck
*m
)
1677 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1678 mds_rank_t
dest(m
->get_source().num());
1679 utime_t now
= ceph_clock_now();
1681 assert(dir
->is_frozen_tree_root()); // i'm exporting!
1684 dout(7) << "handle_export_ack " << *dir
<< dendl
;
1686 mds
->hit_export_target(now
, dest
, -1);
1688 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1689 assert(it
!= export_state
.end());
1690 assert(it
->second
.state
== EXPORT_EXPORTING
);
1691 assert(it
->second
.tid
== m
->get_tid());
1693 bufferlist::iterator bp
= m
->imported_caps
.begin();
1694 ::decode(it
->second
.peer_imported
, bp
);
1696 it
->second
.state
= EXPORT_LOGGINGFINISH
;
1697 assert (g_conf
->mds_kill_export_at
!= 9);
1699 cache
->get_subtree_bounds(dir
, bounds
);
1702 // include export bounds, to ensure they're in the journal.
1703 EExport
*le
= new EExport(mds
->mdlog
, dir
, it
->second
.peer
);;
1704 mds
->mdlog
->start_entry(le
);
1706 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
1707 le
->metablob
.add_dir(dir
, false);
1708 for (set
<CDir
*>::iterator p
= bounds
.begin();
1712 le
->get_bounds().insert(bound
->dirfrag());
1713 le
->metablob
.add_dir_context(bound
);
1714 le
->metablob
.add_dir(bound
, false);
1717 // list us second, them first.
1718 // this keeps authority().first in sync with subtree auth state in the journal.
1719 cache
->adjust_subtree_auth(dir
, it
->second
.peer
, mds
->get_nodeid());
1721 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1722 mds
->mdlog
->submit_entry(le
, new C_MDS_ExportFinishLogged(this, dir
));
1723 mds
->mdlog
->flush();
1724 assert (g_conf
->mds_kill_export_at
!= 10);
1729 void Migrator::export_notify_abort(CDir
*dir
, set
<CDir
*>& bounds
)
1731 dout(7) << "export_notify_abort " << *dir
<< dendl
;
1733 export_state_t
& stat
= export_state
[dir
];
1734 assert(stat
.state
== EXPORT_CANCELLING
);
1736 if (stat
.notify_ack_waiting
.empty()) {
1737 stat
.state
= EXPORT_CANCELLED
;
1741 dir
->auth_pin(this);
1743 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
1744 p
!= stat
.notify_ack_waiting
.end();
1746 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(),stat
.tid
, true,
1747 pair
<int,int>(mds
->get_nodeid(),stat
.peer
),
1748 pair
<int,int>(mds
->get_nodeid(),CDIR_AUTH_UNKNOWN
));
1749 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
1750 notify
->get_bounds().push_back((*i
)->dirfrag());
1751 mds
->send_message_mds(notify
, *p
);
1756 * this happens if hte dest failes after i send teh export data but before it is acked
1757 * that is, we don't know they safely received and logged it, so we reverse our changes
1760 void Migrator::export_reverse(CDir
*dir
)
1762 dout(7) << "export_reverse " << *dir
<< dendl
;
1764 set
<CInode
*> to_eval
;
1767 cache
->get_subtree_bounds(dir
, bounds
);
1769 // remove exporting pins
1772 while (!rq
.empty()) {
1773 CDir
*t
= rq
.front();
1776 for (CDir::map_t::iterator p
= t
->items
.begin(); p
!= t
->items
.end(); ++p
) {
1777 p
->second
->abort_export();
1778 if (!p
->second
->get_linkage()->is_primary())
1780 CInode
*in
= p
->second
->get_linkage()->get_inode();
1782 if (in
->state_test(CInode::STATE_EVALSTALECAPS
)) {
1783 in
->state_clear(CInode::STATE_EVALSTALECAPS
);
1787 in
->get_nested_dirfrags(rq
);
1792 for (const auto &bd
: bounds
) {
1793 bd
->put(CDir::PIN_EXPORTBOUND
);
1794 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
1797 // notify bystanders
1798 export_notify_abort(dir
, bounds
);
1800 // unfreeze tree, with possible subtree merge.
1801 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
1803 // process delayed expires
1804 cache
->process_delayed_expire(dir
);
1806 dir
->unfreeze_tree();
1807 cache
->try_subtree_merge(dir
);
1809 // revoke/resume stale caps
1810 for (auto in
: to_eval
) {
1811 bool need_issue
= false;
1812 for (auto& p
: in
->get_client_caps()) {
1813 Capability
*cap
= p
.second
;
1814 if (cap
->is_stale()) {
1815 mds
->locker
->revoke_stale_caps(cap
);
1821 (!in
->is_auth() || !mds
->locker
->eval(in
, CEPH_CAP_LOCKS
)))
1822 mds
->locker
->issue_caps(in
);
1825 cache
->show_cache();
1830 * once i get the ack, and logged the EExportFinish(true),
1831 * send notifies (if any), otherwise go straight to finish.
1834 void Migrator::export_logged_finish(CDir
*dir
)
1836 dout(7) << "export_logged_finish " << *dir
<< dendl
;
1838 export_state_t
& stat
= export_state
[dir
];
1842 cache
->get_subtree_bounds(dir
, bounds
);
1844 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
1845 p
!= stat
.notify_ack_waiting
.end();
1847 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
1848 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
1849 pair
<int,int>(stat
.peer
, CDIR_AUTH_UNKNOWN
));
1851 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
1852 notify
->get_bounds().push_back((*i
)->dirfrag());
1854 mds
->send_message_mds(notify
, *p
);
1857 // wait for notifyacks
1858 stat
.state
= EXPORT_NOTIFYING
;
1859 assert (g_conf
->mds_kill_export_at
!= 11);
1861 // no notifies to wait for?
1862 if (stat
.notify_ack_waiting
.empty()) {
1863 export_finish(dir
); // skip notify/notify_ack stage.
1865 // notify peer to send cap import messages to clients
1866 if (!mds
->is_cluster_degraded() ||
1867 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(stat
.peer
)) {
1868 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), false, stat
.tid
), stat
.peer
);
1870 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl
;
1877 * i'll get an ack from each bystander.
1878 * when i get them all, do the export.
1880 * i'll get an ack from each bystander.
1881 * when i get them all, unfreeze and send the finish.
1883 * This function DOES put the passed message before returning
1885 void Migrator::handle_export_notify_ack(MExportDirNotifyAck
*m
)
1887 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1888 mds_rank_t
dest(m
->get_source().num());
1889 utime_t now
= ceph_clock_now();
1891 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1893 mds
->hit_export_target(now
, dest
, -1);
1895 auto export_state_entry
= export_state
.find(dir
);
1896 if (export_state_entry
!= export_state
.end()) {
1897 export_state_t
& stat
= export_state_entry
->second
;
1898 if (stat
.state
== EXPORT_WARNING
&&
1899 stat
.warning_ack_waiting
.erase(from
)) {
1900 // exporting. process warning.
1901 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1902 << ": exporting, processing warning on " << *dir
<< dendl
;
1903 if (stat
.warning_ack_waiting
.empty())
1904 export_go(dir
); // start export.
1905 } else if (stat
.state
== EXPORT_NOTIFYING
&&
1906 stat
.notify_ack_waiting
.erase(from
)) {
1907 // exporting. process notify.
1908 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1909 << ": exporting, processing notify on " << *dir
<< dendl
;
1910 if (stat
.notify_ack_waiting
.empty())
1912 } else if (stat
.state
== EXPORT_CANCELLING
&&
1913 m
->get_new_auth().second
== CDIR_AUTH_UNKNOWN
&& // not warning ack
1914 stat
.notify_ack_waiting
.erase(from
)) {
1915 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1916 << ": cancelling export, processing notify on " << *dir
<< dendl
;
1917 if (stat
.notify_ack_waiting
.empty()) {
1918 export_state
.erase(export_state_entry
);
1919 export_cancel_finish(dir
);
1924 auto import_state_entry
= import_state
.find(dir
->dirfrag());
1925 if (import_state_entry
!= import_state
.end()) {
1926 import_state_t
& stat
= import_state_entry
->second
;
1927 if (stat
.state
== IMPORT_ABORTING
) {
1929 dout(7) << "handle_export_notify_ack from " << m
->get_source()
1930 << ": aborting import on " << *dir
<< dendl
;
1931 assert(stat
.bystanders
.count(from
));
1932 stat
.bystanders
.erase(from
);
1933 if (stat
.bystanders
.empty())
1934 import_reverse_unfreeze(dir
);
1942 void Migrator::export_finish(CDir
*dir
)
1944 dout(5) << "export_finish " << *dir
<< dendl
;
1946 assert (g_conf
->mds_kill_export_at
!= 12);
1947 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1948 if (it
== export_state
.end()) {
1949 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl
;
1953 // send finish/commit to new auth
1954 if (!mds
->is_cluster_degraded() ||
1955 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
)) {
1956 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), true, it
->second
.tid
), it
->second
.peer
);
1958 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl
;
1960 assert(g_conf
->mds_kill_export_at
!= 13);
1962 // finish export (adjust local cache state)
1963 int num_dentries
= 0;
1964 list
<MDSInternalContextBase
*> finished
;
1965 finish_export_dir(dir
, ceph_clock_now(), it
->second
.peer
,
1966 it
->second
.peer_imported
, finished
, &num_dentries
);
1968 assert(!dir
->is_auth());
1969 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
1973 cache
->get_subtree_bounds(dir
, bounds
);
1974 for (set
<CDir
*>::iterator p
= bounds
.begin();
1978 bd
->put(CDir::PIN_EXPORTBOUND
);
1979 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
1982 if (dir
->state_test(CDir::STATE_AUXSUBTREE
))
1983 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
1985 // discard delayed expires
1986 cache
->discard_delayed_expire(dir
);
1988 dout(7) << "export_finish unfreezing" << dendl
;
1990 // unfreeze tree, with possible subtree merge.
1991 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
1992 dir
->unfreeze_tree();
1993 cache
->try_subtree_merge(dir
);
1995 // no more auth subtree? clear scatter dirty
1996 if (!dir
->get_inode()->is_auth() &&
1997 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
1998 dir
->get_inode()->clear_scatter_dirty();
1999 // wake up scatter_nudge waiters
2000 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
2003 if (!finished
.empty())
2004 mds
->queue_waiters(finished
);
2006 MutationRef mut
= it
->second
.mut
;
2007 // remove from exporting list, clean up state
2008 export_state
.erase(it
);
2009 dir
->state_clear(CDir::STATE_EXPORTING
);
2011 cache
->show_subtrees();
2014 cache
->trim(-1, num_dentries
); // try trimming exported dentries
2016 // send pending import_maps?
2017 mds
->mdcache
->maybe_send_pending_resolves();
2019 // drop locks, unpin path
2021 mds
->locker
->drop_locks(mut
.get());
2025 maybe_do_queued_export();
2035 // ==========================================================
2038 void Migrator::handle_export_discover(MExportDirDiscover
*m
)
2040 mds_rank_t from
= m
->get_source_mds();
2041 assert(from
!= mds
->get_nodeid());
2043 dout(7) << "handle_export_discover on " << m
->get_path() << dendl
;
2045 // note import state
2046 dirfrag_t df
= m
->get_dirfrag();
2048 if (!mds
->is_active()) {
2049 dout(7) << " not active, send NACK " << dendl
;
2050 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid(), false), from
);
2055 // only start discovering on this message once.
2056 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2058 assert(it
== import_state
.end());
2060 import_state
[df
].state
= IMPORT_DISCOVERING
;
2061 import_state
[df
].peer
= from
;
2062 import_state
[df
].tid
= m
->get_tid();
2064 // am i retrying after ancient path_traverse results?
2065 if (it
== import_state
.end() ||
2066 it
->second
.peer
!= from
||
2067 it
->second
.tid
!= m
->get_tid()) {
2068 dout(7) << " dropping obsolete message" << dendl
;
2072 assert(it
->second
.state
== IMPORT_DISCOVERING
);
2075 if (!mds
->mdcache
->is_open()) {
2076 dout(5) << " waiting for root" << dendl
;
2077 mds
->mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, m
));
2081 assert (g_conf
->mds_kill_import_at
!= 1);
2084 CInode
*in
= cache
->get_inode(m
->get_dirfrag().ino
);
2086 // must discover it!
2087 filepath
fpath(m
->get_path());
2088 vector
<CDentry
*> trace
;
2089 MDRequestRef null_ref
;
2090 int r
= cache
->path_traverse(null_ref
, m
, NULL
, fpath
, &trace
, NULL
, MDS_TRAVERSE_DISCOVER
);
2093 dout(7) << "handle_export_discover_2 failed to discover or not dir " << m
->get_path() << ", NAK" << dendl
;
2094 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2097 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2101 dout(7) << "handle_export_discover have " << df
<< " inode " << *in
<< dendl
;
2103 import_state
[df
].state
= IMPORT_DISCOVERED
;
2105 // pin inode in the cache (for now)
2106 assert(in
->is_dir());
2107 in
->get(CInode::PIN_IMPORTING
);
2110 dout(7) << " sending export_discover_ack on " << *in
<< dendl
;
2111 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid()), import_state
[df
].peer
);
2113 assert (g_conf
->mds_kill_import_at
!= 2);
2116 void Migrator::import_reverse_discovering(dirfrag_t df
)
2118 import_state
.erase(df
);
2121 void Migrator::import_reverse_discovered(dirfrag_t df
, CInode
*diri
)
2124 diri
->put(CInode::PIN_IMPORTING
);
2125 import_state
.erase(df
);
2128 void Migrator::import_reverse_prepping(CDir
*dir
)
2131 cache
->map_dirfrag_set(import_state
[dir
->dirfrag()].bound_ls
, bounds
);
2132 import_remove_pins(dir
, bounds
);
2133 import_reverse_final(dir
);
2136 /* This function DOES put the passed message before returning*/
2137 void Migrator::handle_export_cancel(MExportDirCancel
*m
)
2139 dout(7) << "handle_export_cancel on " << m
->get_dirfrag() << dendl
;
2140 dirfrag_t df
= m
->get_dirfrag();
2141 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2142 if (it
== import_state
.end()) {
2143 assert(0 == "got export_cancel in weird state");
2144 } else if (it
->second
.state
== IMPORT_DISCOVERING
) {
2145 import_reverse_discovering(df
);
2146 } else if (it
->second
.state
== IMPORT_DISCOVERED
) {
2147 CInode
*in
= cache
->get_inode(df
.ino
);
2149 import_reverse_discovered(df
, in
);
2150 } else if (it
->second
.state
== IMPORT_PREPPING
) {
2151 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2153 import_reverse_prepping(dir
);
2154 } else if (it
->second
.state
== IMPORT_PREPPED
) {
2155 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2158 cache
->get_subtree_bounds(dir
, bounds
);
2159 import_remove_pins(dir
, bounds
);
2160 // adjust auth back to the exportor
2161 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2162 import_reverse_unfreeze(dir
);
2164 assert(0 == "got export_cancel in weird state");
2169 /* This function DOES put the passed message before returning*/
2170 void Migrator::handle_export_prep(MExportDirPrep
*m
)
2172 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2173 assert(oldauth
!= mds
->get_nodeid());
2177 list
<MDSInternalContextBase
*> finished
;
2179 // assimilate root dir.
2180 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2181 if (!m
->did_assim()) {
2182 assert(it
!= import_state
.end());
2183 assert(it
->second
.state
== IMPORT_DISCOVERED
);
2184 assert(it
->second
.peer
== oldauth
);
2185 diri
= cache
->get_inode(m
->get_dirfrag().ino
);
2187 bufferlist::iterator p
= m
->basedir
.begin();
2188 dir
= cache
->add_replica_dir(p
, diri
, oldauth
, finished
);
2189 dout(7) << "handle_export_prep on " << *dir
<< " (first pass)" << dendl
;
2191 if (it
== import_state
.end() ||
2192 it
->second
.peer
!= oldauth
||
2193 it
->second
.tid
!= m
->get_tid()) {
2194 dout(7) << "handle_export_prep obsolete message, dropping" << dendl
;
2198 assert(it
->second
.state
== IMPORT_PREPPING
);
2199 assert(it
->second
.peer
== oldauth
);
2201 dir
= cache
->get_dirfrag(m
->get_dirfrag());
2203 dout(7) << "handle_export_prep on " << *dir
<< " (subsequent pass)" << dendl
;
2204 diri
= dir
->get_inode();
2206 assert(dir
->is_auth() == false);
2208 cache
->show_subtrees();
2210 // build import bound map
2211 map
<inodeno_t
, fragset_t
> import_bound_fragset
;
2212 for (list
<dirfrag_t
>::iterator p
= m
->get_bounds().begin();
2213 p
!= m
->get_bounds().end();
2215 dout(10) << " bound " << *p
<< dendl
;
2216 import_bound_fragset
[p
->ino
].insert(p
->frag
);
2219 // assimilate contents?
2220 if (!m
->did_assim()) {
2221 dout(7) << "doing assim on " << *dir
<< dendl
;
2222 m
->mark_assim(); // only do this the first time!
2224 // change import state
2225 it
->second
.state
= IMPORT_PREPPING
;
2226 it
->second
.bound_ls
= m
->get_bounds();
2227 it
->second
.bystanders
= m
->get_bystanders();
2228 assert(g_conf
->mds_kill_import_at
!= 3);
2231 dout(7) << "bystanders are " << it
->second
.bystanders
<< dendl
;
2234 diri
->put(CInode::PIN_IMPORTING
);
2235 dir
->get(CDir::PIN_IMPORTING
);
2236 dir
->state_set(CDir::STATE_IMPORTING
);
2238 // assimilate traces to exports
2239 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2240 for (list
<bufferlist
>::iterator p
= m
->traces
.begin();
2241 p
!= m
->traces
.end();
2243 bufferlist::iterator q
= p
->begin();
2248 dout(10) << " trace from " << df
<< " start " << start
<< " len " << p
->length() << dendl
;
2252 cur
= cache
->get_dirfrag(df
);
2254 dout(10) << " had " << *cur
<< dendl
;
2255 } else if (start
== 'f') {
2256 CInode
*in
= cache
->get_inode(df
.ino
);
2258 dout(10) << " had " << *in
<< dendl
;
2259 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2260 dout(10) << " added " << *cur
<< dendl
;
2261 } else if (start
== '-') {
2264 assert(0 == "unrecognized start char");
2266 while (start
!= '-') {
2267 CDentry
*dn
= cache
->add_replica_dentry(q
, cur
, finished
);
2268 dout(10) << " added " << *dn
<< dendl
;
2269 CInode
*in
= cache
->add_replica_inode(q
, dn
, finished
);
2270 dout(10) << " added " << *in
<< dendl
;
2273 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2274 dout(10) << " added " << *cur
<< dendl
;
2278 // make bound sticky
2279 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2280 p
!= import_bound_fragset
.end();
2282 CInode
*in
= cache
->get_inode(p
->first
);
2284 in
->get_stickydirs();
2285 dout(7) << " set stickydirs on bound inode " << *in
<< dendl
;
2289 dout(7) << " not doing assim on " << *dir
<< dendl
;
2292 if (!finished
.empty())
2293 mds
->queue_waiters(finished
);
2296 bool success
= true;
2297 if (mds
->is_active()) {
2299 set
<CDir
*> import_bounds
;
2300 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2301 p
!= import_bound_fragset
.end();
2303 CInode
*in
= cache
->get_inode(p
->first
);
2306 // map fragset into a frag_t list, based on the inode fragtree
2307 list
<frag_t
> fglist
;
2308 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2309 in
->dirfragtree
.get_leaves_under(*q
, fglist
);
2310 dout(10) << " bound inode " << p
->first
<< " fragset " << p
->second
<< " maps to " << fglist
<< dendl
;
2312 for (list
<frag_t
>::iterator q
= fglist
.begin();
2315 CDir
*bound
= cache
->get_dirfrag(dirfrag_t(p
->first
, *q
));
2317 dout(7) << " opening bounding dirfrag " << *q
<< " on " << *in
<< dendl
;
2318 cache
->open_remote_dirfrag(in
, *q
,
2319 new C_MDS_RetryMessage(mds
, m
));
2323 if (!bound
->state_test(CDir::STATE_IMPORTBOUND
)) {
2324 dout(7) << " pinning import bound " << *bound
<< dendl
;
2325 bound
->get(CDir::PIN_IMPORTBOUND
);
2326 bound
->state_set(CDir::STATE_IMPORTBOUND
);
2328 dout(7) << " already pinned import bound " << *bound
<< dendl
;
2330 import_bounds
.insert(bound
);
2334 dout(7) << " all ready, noting auth and freezing import region" << dendl
;
2336 if (!mds
->mdcache
->is_readonly() &&
2337 dir
->get_inode()->filelock
.can_wrlock(-1) &&
2338 dir
->get_inode()->nestlock
.can_wrlock(-1)) {
2339 it
->second
.mut
= new MutationImpl();
2340 // force some locks. hacky.
2341 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, it
->second
.mut
);
2342 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, it
->second
.mut
);
2344 // note that i am an ambiguous auth for this subtree.
2345 // specify bounds, since the exporter explicitly defines the region.
2346 cache
->adjust_bounded_subtree_auth(dir
, import_bounds
,
2347 pair
<int,int>(oldauth
, mds
->get_nodeid()));
2348 cache
->verify_subtree_bounds(dir
, import_bounds
);
2350 dir
->_freeze_tree();
2352 it
->second
.state
= IMPORT_PREPPED
;
2354 dout(7) << " couldn't acquire all needed locks, failing. " << *dir
<< dendl
;
2358 dout(7) << " not active, failing. " << *dir
<< dendl
;
2363 import_reverse_prepping(dir
);
2366 dout(7) << " sending export_prep_ack on " << *dir
<< dendl
;
2367 mds
->send_message(new MExportDirPrepAck(dir
->dirfrag(), success
, m
->get_tid()), m
->get_connection());
2369 assert(g_conf
->mds_kill_import_at
!= 4);
/**
 * Log-completion context for an import: runs once the EImportStart
 * journal entry is durable, and resumes the import state machine via
 * Migrator::import_logged_start().
 */
class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
  dirfrag_t df;       // dirfrag being imported (captured at construction)
  CDir *dir;          // root of the imported subtree
  mds_rank_t from;    // exporting mds rank
public:
  // client sessions accompanying the imported caps; filled in by the
  // caller (handle_export_dir) before the log entry is submitted
  map<client_t,entity_inst_t> imported_client_map;
  map<client_t,uint64_t> sseqmap;

  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
    MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
  }
  void finish(int r) override {
    mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
  }
};
/* This function DOES put the passed message before returning*/
/**
 * Handle MExportDir from the exporter: assimilate the exported metadata,
 * journal an EImportStart, and move the import to IMPORT_LOGGINGSTART.
 * Precondition (asserted): we are already in IMPORT_PREPPED for this tid.
 */
void Migrator::handle_export_dir(MExportDir *m)
{
  assert (g_conf->mds_kill_import_at != 5);
  CDir *dir = cache->get_dirfrag(m->dirfrag);
  assert(dir);

  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
  assert(!dir->is_auth());

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_PREPPED);
  assert(it->second.tid == m->get_tid());
  assert(it->second.peer == oldauth);

  utime_t now = ceph_clock_now();

  // the imported frag must be a leaf in our fragtree before we can decode it
  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());

  cache->show_subtrees();

  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);

  // start the journal entry
  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
  mds->mdlog->start_entry(le);

  le->metablob.add_dir_context(dir);

  // adjust auth (list us _first_)
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);

  // new client sessions, open these after we journal
  // include imported sessions in EImportStart
  bufferlist::iterator cmp = m->client_map.begin();
  ::decode(onlogged->imported_client_map, cmp);
  assert(cmp.end());
  le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
  le->client_map.claim(m->client_map);

  // decode the exported metadata, one bound-to-bound dirfrag pass at a time
  bufferlist::iterator blp = m->export_data.begin();
  int num_imported_inodes = 0;
  while (!blp.end()) {
    num_imported_inodes +=
      decode_import_dir(blp,
			oldauth,
			dir,                 // import root
			le,
			mds->mdlog->get_current_segment(),
			it->second.peer_exports,
			it->second.updated_scatterlocks,
			now);
  }
  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;

  // include bounds in EImportStart
  set<CDir*> import_bounds;
  for (vector<dirfrag_t>::iterator p = m->bounds.begin();
       p != m->bounds.end();
       ++p) {
    CDir *bd = cache->get_dirfrag(*p);
    assert(bd);
    le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
    import_bounds.insert(bd);
  }
  cache->verify_subtree_bounds(dir, import_bounds);

  // adjust popularity
  mds->balancer->add_import(dir, now);

  dout(7) << "handle_export_dir did " << *dir << dendl;

  // note state
  it->second.state = IMPORT_LOGGINGSTART;
  assert (g_conf->mds_kill_import_at != 6);

  // log it
  mds->mdlog->submit_entry(le, onlogged);
  mds->mdlog->flush();

  // some stats
  if (mds->logger) {
    mds->logger->inc(l_mds_imported);
    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
  }

  m->put();
}
/*
 * this is an import helper
 *  called by import_finish, and import_reverse and friends.
 *
 * Drops the pins/state bits taken while the import was in flight:
 * the IMPORTING pin on the root, the stickydirs refs on each bound
 * inode, and the IMPORTBOUND pins on the bounding dirfrags.
 */
void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
{
  import_state_t& stat = import_state[dir->dirfrag()];
  // root
  dir->put(CDir::PIN_IMPORTING);
  dir->state_clear(CDir::STATE_IMPORTING);

  // bounding inodes: drop one stickydirs ref per distinct inode
  // (bound_ls may list several frags of the same inode)
  set<inodeno_t> did;
  for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
       p != stat.bound_ls.end();
       ++p) {
    if (did.count(p->ino))
      continue;
    did.insert(p->ino);
    CInode *in = cache->get_inode(p->ino);
    assert(in);
    in->put_stickydirs();
  }

  if (stat.state == IMPORT_PREPPING) {
    // during prepping only some bounds may have been pinned yet
    for (auto bd : bounds) {
      if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
	bd->put(CDir::PIN_IMPORTBOUND);
	bd->state_clear(CDir::STATE_IMPORTBOUND);
      }
    }
  } else if (stat.state >= IMPORT_PREPPED) {
    // bounding dirfrags
    for (auto bd : bounds) {
      assert(bd->state_test(CDir::STATE_IMPORTBOUND));
      bd->put(CDir::PIN_IMPORTBOUND);
      bd->state_clear(CDir::STATE_IMPORTBOUND);
    }
  }
}
/*
 * note: this does the full work of reversing an import and cleaning up
 *  state.
 * called by both handle_mds_failure and by handle_resolve (if we are
 *  a survivor coping with an exporter failure+recovery).
 */
void Migrator::import_reverse(CDir *dir)
{
  dout(7) << "import_reverse " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  stat.state = IMPORT_ABORTING;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove pins
  import_remove_pins(dir, bounds);

  // update auth, with possible subtree merge.
  assert(dir->is_subtree_root());
  if (mds->is_resolve())
    cache->trim_non_auth_subtree(dir);

  // hand authority back to the (failed) exporter
  cache->adjust_subtree_auth(dir, stat.peer);

  C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
  }

  int num_dentries = 0;
  // adjust auth bits: breadth-first walk of the imported region
  // (stopping at the bounds), stripping everything back to replica state.
  list<CDir*> q;
  q.push_back(dir);
  while (!q.empty()) {
    CDir *cur = q.front();
    q.pop_front();

    // dir
    assert(cur->is_auth());
    cur->state_clear(CDir::STATE_AUTH);
    cur->remove_bloom();
    cur->clear_replica_map();
    cur->set_replica_nonce(CDir::EXPORT_NONCE);
    if (cur->is_dirty())
      cur->mark_clean();

    CDir::map_t::iterator it;
    for (it = cur->begin(); it != cur->end(); ++it) {
      CDentry *dn = it->second;

      // dentry
      dn->state_clear(CDentry::STATE_AUTH);
      dn->clear_replica_map();
      dn->set_replica_nonce(CDentry::EXPORT_NONCE);
      if (dn->is_dirty())
	dn->mark_clean();

      // inode?
      if (dn->get_linkage()->is_primary()) {
	CInode *in = dn->get_linkage()->get_inode();
	in->state_clear(CDentry::STATE_AUTH);
	in->clear_replica_map();
	in->set_replica_nonce(CInode::EXPORT_NONCE);
	if (in->is_dirty())
	  in->mark_clean();
	in->clear_dirty_rstat();
	if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
	  in->clear_scatter_dirty();
	  in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
	}

	in->clear_dirty_parent();

	// discard any partially-gathered lock state
	in->authlock.clear_gather();
	in->linklock.clear_gather();
	in->dirfragtreelock.clear_gather();
	in->filelock.clear_gather();

	in->clear_file_locks();

	// non-bounding dir?
	list<CDir*> dfs;
	in->get_dirfrags(dfs);
	for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
	  if (bounds.count(*p) == 0)
	    q.push_back(*p);
      }

      cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
      ++num_dentries;
    }
  }

  dir->add_waiter(CDir::WAIT_UNFREEZE, fin);

  if (stat.state == IMPORT_ACKING) {
    // remove imported caps
    for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
	 p != stat.peer_exports.end();
	 ++p) {
      CInode *in = p->first;
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
	   q != p->second.end();
	   ++q) {
	Capability *cap = in->get_client_cap(q->first);
	assert(cap);
	if (cap->is_importing())
	  in->remove_client_cap(q->first);
      }
      in->put(CInode::PIN_IMPORTINGCAPS);
    }
    for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
	 p != stat.client_map.end();
	 ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
      assert(session);
      session->dec_importing();
    }
  }

  // log our failure
  mds->mdlog->start_submit_entry(new EImportFinish(dir, false));	// log failure

  cache->trim(-1, num_dentries); // try trimming dentries

  // notify bystanders; wait in aborting state
  import_notify_abort(dir, bounds);
}
2663 void Migrator::import_notify_finish(CDir
*dir
, set
<CDir
*>& bounds
)
2665 dout(7) << "import_notify_finish " << *dir
<< dendl
;
2667 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2668 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
2669 p
!= stat
.bystanders
.end();
2671 MExportDirNotify
*notify
=
2672 new MExportDirNotify(dir
->dirfrag(), stat
.tid
, false,
2673 pair
<int,int>(stat
.peer
, mds
->get_nodeid()),
2674 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
2675 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2676 notify
->get_bounds().push_back((*i
)->dirfrag());
2677 mds
->send_message_mds(notify
, *p
);
/**
 * Tell every surviving bystander that the import is being aborted
 * (auth reverts to the exporter).  We then wait in IMPORT_ABORTING
 * for their acks; if there are no bystanders left we can finish the
 * reversal immediately.
 */
void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_abort " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end(); ) {
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
      // this can happen if both exporter and bystander fail in the same mdsmap epoch
      stat.bystanders.erase(p++);   // post-increment keeps the iterator valid across erase
      continue;
    }
    MExportDirNotify *notify =
      new MExportDirNotify(dir->dirfrag(), stat.tid, true,
			   mds_authority_t(stat.peer, mds->get_nodeid()),
			   mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
    ++p;
  }
  if (stat.bystanders.empty()) {
    dout(7) << "no bystanders, finishing reverse now" << dendl;
    import_reverse_unfreeze(dir);
  } else {
    assert (g_conf->mds_kill_import_at != 10);
  }
}
2711 void Migrator::import_reverse_unfreeze(CDir
*dir
)
2713 dout(7) << "import_reverse_unfreeze " << *dir
<< dendl
;
2714 assert(!dir
->is_auth());
2715 cache
->discard_delayed_expire(dir
);
2716 dir
->unfreeze_tree();
2717 if (dir
->is_subtree_root())
2718 cache
->try_subtree_merge(dir
);
2719 import_reverse_final(dir
);
2722 void Migrator::import_reverse_final(CDir
*dir
)
2724 dout(7) << "import_reverse_final " << *dir
<< dendl
;
2727 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
2728 assert(it
!= import_state
.end());
2730 MutationRef mut
= it
->second
.mut
;
2731 import_state
.erase(it
);
2733 // send pending import_maps?
2734 mds
->mdcache
->maybe_send_pending_resolves();
2737 mds
->locker
->drop_locks(mut
.get());
2741 cache
->show_subtrees();
2742 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
/**
 * Continuation after EImportStart is journaled: finish opening the
 * imported client sessions, install the imported caps, and ack the
 * export back to the old auth.  Moves IMPORT_LOGGINGSTART -> IMPORT_ACKING.
 * If the import was aborted while we were journaling, only finish the
 * session bookkeeping and bail.
 */
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
				   map<client_t,entity_inst_t>& imported_client_map,
				   map<client_t,uint64_t>& sseqmap)
{
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  if (it == import_state.end() ||
      it->second.state != IMPORT_LOGGINGSTART) {
    dout(7) << "import " << df << " must have aborted" << dendl;
    mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
    return;
  }

  dout(7) << "import_logged " << *dir << dendl;

  // note state
  it->second.state = IMPORT_ACKING;

  assert (g_conf->mds_kill_import_at != 7);

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
  it->second.client_map.swap(imported_client_map);

  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
       p != it->second.peer_exports.end();
       ++p) {
    // parameter 'peer' is NONE, delay sending cap import messages to client
    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
  }

  // send notify's etc.
  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;

  // test surviving observer of a failed migration that did not complete
  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);

  MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
  ::encode(imported_caps, ack->imported_caps);

  mds->send_message_mds(ack, from);
  assert (g_conf->mds_kill_import_at != 8);

  cache->show_subtrees();
}
2794 /* This function DOES put the passed message before returning*/
2795 void Migrator::handle_export_finish(MExportDirFinish
*m
)
2797 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
2799 dout(7) << "handle_export_finish on " << *dir
<< (m
->is_last() ? " last" : "") << dendl
;
2801 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2802 assert(it
!= import_state
.end());
2803 assert(it
->second
.tid
== m
->get_tid());
2805 import_finish(dir
, false, m
->is_last());
/**
 * Complete a successful import.  On the first (IMPORT_ACKING) pass we
 * take sole authority and activate the imported caps; if 'last' is not
 * yet set we park in IMPORT_FINISHING and wait for the final
 * MExportDirFinish.  On the last pass we unpin, journal EImportFinish,
 * unfreeze, and clean up.  'notify' selects whether bystanders are told
 * (true when called from the ack path rather than handle_export_finish).
 */
void Migrator::import_finish(CDir *dir, bool notify, bool last)
{
  dout(7) << "import_finish on " << *dir << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);

  if (it->second.state == IMPORT_ACKING) {
    assert(dir->is_auth());
    // authority is no longer ambiguous: we are sole auth now
    cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
  }

  // log finish
  assert(g_conf->mds_kill_import_at != 9);

  if (it->second.state == IMPORT_ACKING) {
    // activate the caps we imported: merge into (or create) live client caps
    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
	p != it->second.peer_exports.end();
	++p) {
      CInode *in = p->first;
      assert(in->is_auth());
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
	   q != p->second.end();
	   ++q) {
	Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	assert(session);
	Capability *cap = in->get_client_cap(q->first);
	assert(cap);
	cap->merge(q->second, true);
	cap->clear_importing();
	mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
				    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
      }
      p->second.clear();
      in->replica_caps_wanted = 0;
    }
    for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
	 p != it->second.client_map.end();
	 ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
      assert(session);
      session->dec_importing();
    }
  }

  if (!last) {
    // wait for the final MExportDirFinish before tearing down
    assert(it->second.state == IMPORT_ACKING);
    it->second.state = IMPORT_FINISHING;
    return;
  }

  // remove pins
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  if (notify)
    import_notify_finish(dir, bounds);

  import_remove_pins(dir, bounds);

  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  it->second.peer_exports.swap(peer_exports);

  // clear import state (we're done!)
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));

  // process delayed expires
  cache->process_delayed_expire(dir);

  // unfreeze tree, with possible subtree merge.
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // re-eval imported caps
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
       p != peer_exports.end();
       ++p) {
    if (p->first->is_auth())
      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
    p->first->put(CInode::PIN_IMPORTINGCAPS);
  }

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // did i just import mydir?
  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
    cache->populate_mydir();

  // is it empty?  if so, consider sending it back to the exporter.
  if (dir->get_num_head_items() == 0 &&
      !dir->inode->is_auth()) {
    // reexport!
    export_empty_import(dir);
  }
}
/**
 * Decode one exported inode from the stream onto dentry 'dn': create or
 * look up the CInode, import its state and caps, link it, and note any
 * dirty scatterlocks for post-import journaling.
 */
void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
				   mds_rank_t oldauth, LogSegment *ls,
				   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
				   list<ScatterLock*>& updated_scatterlocks)
{
  dout(15) << "decode_import_inode on " << *dn << dendl;

  inodeno_t ino;
  snapid_t last;
  ::decode(ino, blp);
  ::decode(last, blp);

  bool added = false;
  CInode *in = cache->get_inode(ino, last);
  if (!in) {
    in = new CInode(mds->mdcache, true, 1, last);
    added = true;
  }

  // state after link  -- or not!  -sage
  in->decode_import(blp, ls);  // cap imports are noted for later action

  // caps
  decode_import_inode_caps(in, true, blp, peer_exports);

  // link before state  -- or not!  -sage
  if (dn->get_linkage()->get_inode() != in) {
    assert(!dn->get_linkage()->get_inode());
    dn->dir->link_primary_inode(dn, in);
  }

  // add inode?
  if (added) {
    cache->add_inode(in);
    dout(10) << "added " << *in << dendl;
  } else {
    dout(10) << " had " << *in << dendl;
  }

  if (in->inode.is_dirty_rstat())
    in->mark_dirty_rstat();

  // clear if dirtyscattered, since we're going to journal this
  //  but not until we _actually_ finish the import...
  if (in->filelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->filelock);
    mds->locker->mark_updated_scatterlock(&in->filelock);
  }

  if (in->dirfragtreelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->dirfragtreelock);
    mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
  }

  // adjust replica list
  //assert(!in->is_replica(oldauth));  // not true on failed export
  in->add_replica(oldauth, CInode::EXPORT_NONCE);
  if (in->is_replica(mds->get_nodeid()))
    in->remove_replica(mds->get_nodeid());
}
/**
 * Decode the cap export map (and, for auth imports, the mds_caps_wanted
 * map) for one inode.  If anything was exported, stash it in
 * peer_exports for later activation and pin the inode so it can't be
 * trimmed in the meantime.
 */
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
					bufferlist::iterator &blp,
					map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  map<client_t,Capability::Export> cap_map;
  ::decode(cap_map, blp);
  if (auth_cap)
    ::decode(in->get_mds_caps_wanted(), blp);
  if (!cap_map.empty() ||
      (auth_cap && !in->get_mds_caps_wanted().empty())) {
    peer_exports[in].swap(cap_map);
    in->get(CInode::PIN_IMPORTINGCAPS);
  }
}
/**
 * Turn the decoded cap exports for one inode into live client caps and
 * fill import_map with the data clients need to adopt them.  When
 * 'peer' is MDS_RANK_NONE (< 0) the client-facing import messages are
 * deferred (caller sends them later) and the importing flag is left set;
 * when peer >= 0 the caps are merged and announced immediately and the
 * IMPORTINGCAPS pin is dropped.
 */
void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
					map<client_t,Capability::Export> &export_map,
					map<client_t,Capability::Import> &import_map)
{
  for (map<client_t,Capability::Export>::iterator it = export_map.begin();
       it != export_map.end();
       ++it) {
    dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
    assert(session);

    Capability *cap = in->get_client_cap(it->first);
    if (!cap) {
      cap = in->add_client_cap(it->first, session);
      if (peer < 0)
	cap->mark_importing();
    }

    Capability::Import& im = import_map[it->first];
    im.cap_id = cap->get_cap_id();
    im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
    im.issue_seq = cap->get_last_seq() + 1;

    if (peer >= 0) {
      cap->merge(it->second, auth_cap);
      mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
				  it->second.seq, it->second.mseq - 1, peer,
				  auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
    }
  }

  if (peer >= 0) {
    in->replica_caps_wanted = 0;
    in->put(CInode::PIN_IMPORTINGCAPS);
  }
}
/**
 * Decode one exported dirfrag (and its dentries/inodes, recursing via
 * decode_import_inode) from the export stream.  'le' may be NULL when
 * replaying without journaling.  Returns the number of dentries imported.
 */
int Migrator::decode_import_dir(bufferlist::iterator& blp,
				mds_rank_t oldauth,
				CDir *import_root,
				EImportStart *le,
				LogSegment *ls,
				map<CInode*,map<client_t,Capability::Export> >& peer_exports,
				list<ScatterLock*>& updated_scatterlocks, utime_t now)
{
  // set up dir
  dirfrag_t df;
  ::decode(df, blp);

  CInode *diri = cache->get_inode(df.ino);
  assert(diri);
  CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
  assert(dir);

  dout(7) << "decode_import_dir " << *dir << dendl;

  // assimilate state
  dir->decode_import(blp, now, ls);

  // adjust replica list
  //assert(!dir->is_replica(oldauth));    // not true on failed export
  dir->add_replica(oldauth, CDir::EXPORT_NONCE);
  if (dir->is_replica(mds->get_nodeid()))
    dir->remove_replica(mds->get_nodeid());

  // add to journal entry
  if (le)
    le->metablob.add_import_dir(dir);

  int num_imported = 0;

  // take all waiters on this dir
  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
  // a replica's presence in my cache implies/forces its presence in authority's.
  list<MDSInternalContextBase*> waiters;

  dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
  for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
       it != waiters.end();
       ++it)
    import_root->add_waiter(CDir::WAIT_UNFREEZE, *it);  // UNFREEZE will get kicked both on success or failure

  dout(15) << "doing contents" << dendl;

  // contents
  __u32 nden;
  ::decode(nden, blp);

  for (; nden>0; nden--) {
    num_imported++;

    // dentry
    string dname;
    snapid_t last;
    ::decode(dname, blp);
    ::decode(last, blp);

    CDentry *dn = dir->lookup_exact_snap(dname, last);
    if (!dn)
      dn = dir->add_null_dentry(dname, 1, last);

    dn->decode_import(blp, ls);

    dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
    if (dn->is_replica(mds->get_nodeid()))
      dn->remove_replica(mds->get_nodeid());

    // dentry lock in unreadable state can block path traverse
    if (dn->lock.get_state() != LOCK_SYNC)
      mds->locker->try_eval(&dn->lock, NULL);

    dout(15) << "decode_import_dir got " << *dn << dendl;

    // points to... ('N' null, 'L' remote link, 'I' primary inode)
    char icode;
    ::decode(icode, blp);

    if (icode == 'N') {
      // null dentry
      assert(dn->get_linkage()->is_null());

      // fall thru
    }
    else if (icode == 'L') {
      // remote link
      inodeno_t ino;
      unsigned char d_type;
      ::decode(ino, blp);
      ::decode(d_type, blp);
      if (dn->get_linkage()->is_remote()) {
	assert(dn->get_linkage()->get_remote_ino() == ino);
      } else {
	dir->link_remote_inode(dn, ino, d_type);
      }
    }
    else if (icode == 'I') {
      // inode
      assert(le);
      decode_import_inode(dn, blp, oldauth, ls,
			  peer_exports, updated_scatterlocks);
    }

    // add dentry to journal entry
    if (le)
      le->metablob.add_import_dentry(dn);
  }

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  dir->inode->maybe_export_pin();

  dout(7) << "decode_import_dir done " << *dir << dendl;
  return num_imported;
}
// authority bystander

/* This function DOES put the passed message before returning*/
/**
 * As a bystander to someone else's migration, adjust our cached notion
 * of who is auth for the affected subtree, and ack if requested.
 */
void Migrator::handle_export_notify(MExportDirNotify *m)
{
  if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
    m->put();
    return;
  }

  CDir *dir = cache->get_dirfrag(m->get_dirfrag());

  mds_rank_t from = mds_rank_t(m->get_source().num());
  mds_authority_t old_auth = m->get_old_auth();
  mds_authority_t new_auth = m->get_new_auth();

  if (!dir) {
    // we don't have it cached; nothing to adjust
    dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
	    << " on missing dir " << m->get_dirfrag() << dendl;
  } else if (dir->authority() != old_auth) {
    // stale notify: our view already moved on
    dout(7) << "handle_export_notify old_auth was " << dir->authority()
	    << " != " << old_auth << " -> " << new_auth
	    << " on " << *dir << dendl;
  } else {
    dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
	    << " on " << *dir << dendl;
    // adjust auth
    set<CDir*> have;
    cache->map_dirfrag_set(m->get_bounds(), have);
    cache->adjust_bounded_subtree_auth(dir, have, new_auth);

    // induce a merge?
    cache->try_subtree_merge(dir);
  }

  // send ack
  if (m->wants_ack()) {
    mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
  } else {
    // aborted.  no ack.
    dout(7) << "handle_export_notify no ack requested" << dendl;
  }

  m->put();
}
3205 void Migrator::export_caps(CInode
*in
)
3207 mds_rank_t dest
= in
->authority().first
;
3208 dout(7) << "export_caps to mds." << dest
<< " " << *in
<< dendl
;
3210 assert(in
->is_any_caps());
3211 assert(!in
->is_auth());
3212 assert(!in
->is_ambiguous_auth());
3213 assert(!in
->state_test(CInode::STATE_EXPORTINGCAPS
));
3215 MExportCaps
*ex
= new MExportCaps
;
3216 ex
->ino
= in
->ino();
3218 encode_export_inode_caps(in
, false, ex
->cap_bl
, ex
->client_map
);
3220 mds
->send_message_mds(ex
, dest
);
/**
 * Handle a request from an inode's authority asking us (a replica that
 * holds caps for it) to hand those caps over via export_caps().
 */
void Migrator::handle_gather_caps(MGatherCaps *m)
{
  CInode *in = cache->get_inode(m->ino);

  if (!in)
    goto out;

  dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
	   << " on " << *in << dendl;
  // only export if we actually hold caps, are a plain (non-auth,
  // unambiguous) replica, and no cap export is already in flight
  if (in->is_any_caps() &&
      !in->is_auth() &&
      !in->is_ambiguous_auth() &&
      !in->state_test(CInode::STATE_EXPORTINGCAPS))
    export_caps(in);

out:
  m->put();
}
/**
 * Log-completion context for a standalone cap import (handle_export_caps):
 * once the ESessions entry is durable, finish via
 * Migrator::logged_import_caps().
 */
class C_M_LoggedImportCaps : public MigratorLogContext {
  CInode *in;          // inode receiving the caps
  mds_rank_t from;     // rank that exported them
public:
  // filled in by handle_export_caps before the log entry is submitted
  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  map<client_t,entity_inst_t> client_map;
  map<client_t,uint64_t> sseqmap;

  C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
  void finish(int r) override {
    mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
  }
};
3257 /* This function DOES put the passed message before returning*/
3258 void Migrator::handle_export_caps(MExportCaps
*ex
)
3260 dout(10) << "handle_export_caps " << *ex
<< " from " << ex
->get_source() << dendl
;
3261 CInode
*in
= cache
->get_inode(ex
->ino
);
3264 assert(in
->is_auth());
3267 if (in
->is_frozen())
3270 C_M_LoggedImportCaps
*finish
= new C_M_LoggedImportCaps(
3271 this, in
, mds_rank_t(ex
->get_source().num()));
3272 finish
->client_map
= ex
->client_map
;
3275 bufferlist::iterator blp
= ex
->cap_bl
.begin();
3276 decode_import_inode_caps(in
, false, blp
, finish
->peer_exports
);
3277 assert(!finish
->peer_exports
.empty()); // thus, inode is pinned.
3279 // journal open client sessions
3280 version_t pv
= mds
->server
->prepare_force_open_sessions(finish
->client_map
, finish
->sseqmap
);
3282 ESessions
*le
= new ESessions(pv
, ex
->client_map
);
3283 mds
->mdlog
->start_submit_entry(le
, finish
);
3284 mds
->mdlog
->flush();
/**
 * Continuation after the ESessions entry from handle_export_caps is
 * durable: finish opening the client sessions, install the imported
 * caps (clients release the exporter's caps when they get the import
 * message), and re-evaluate the inode's cap-related locks.
 */
void Migrator::logged_import_caps(CInode *in,
				  mds_rank_t from,
				  map<CInode*, map<client_t,Capability::Export> >& peer_exports,
				  map<client_t,entity_inst_t>& client_map,
				  map<client_t,uint64_t>& sseqmap)
{
  dout(10) << "logged_import_caps on " << *in << dendl;
  // see export_go() vs export_go_synced()
  assert(in->is_auth());

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(client_map, sseqmap);

  map<client_t,Capability::Import> imported_caps;

  assert(peer_exports.count(in));
  // clients will release caps from the exporter when they receive the cap import message.
  finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
  mds->locker->eval(in, CEPH_CAP_LOCKS, true);
}