// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
24 #include "MDBalancer.h"
29 #include "include/filepath.h"
30 #include "common/likely.h"
32 #include "events/EExport.h"
33 #include "events/EImportStart.h"
34 #include "events/EImportFinish.h"
35 #include "events/ESessions.h"
37 #include "msg/Messenger.h"
39 #include "messages/MClientCaps.h"
41 #include "messages/MExportDirDiscover.h"
42 #include "messages/MExportDirDiscoverAck.h"
43 #include "messages/MExportDirCancel.h"
44 #include "messages/MExportDirPrep.h"
45 #include "messages/MExportDirPrepAck.h"
46 #include "messages/MExportDir.h"
47 #include "messages/MExportDirAck.h"
48 #include "messages/MExportDirNotify.h"
49 #include "messages/MExportDirNotifyAck.h"
50 #include "messages/MExportDirFinish.h"
52 #include "messages/MExportCaps.h"
53 #include "messages/MExportCapsAck.h"
54 #include "messages/MGatherCaps.h"
/*
 * this is what the dir->dir_auth values look like
 *
 *   dir_auth   authbits
 * export:
 *   me          me     - before
 *   me, me      me     - still me, but preparing for export
 *   me, them    me     - send MExportDir (peer is preparing)
 *   them, me    me     - journaled EExport
 *   them        them   - done
 *
 * import:
 *   them        them   - before
 *   me, them    me     - journaled EImportStart
 *   me          me     - done
 *
 * which implies:
 *   - auth bit is set if i am listed as first _or_ second dir_auth.
 */
77 #include "common/config.h"
80 #define dout_context g_ceph_context
81 #define dout_subsys ceph_subsys_mds
83 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
86 class MigratorContext
: public MDSInternalContextBase
{
89 MDSRank
*get_mds() override
{
93 explicit MigratorContext(Migrator
*mig_
) : mig(mig_
) {
98 class MigratorLogContext
: public MDSLogContextBase
{
101 MDSRank
*get_mds() override
{
105 explicit MigratorLogContext(Migrator
*mig_
) : mig(mig_
) {
110 /* This function DOES put the passed message before returning*/
111 void Migrator::dispatch(Message
*m
)
113 if (unlikely(inject_message_loss
)) {
114 if (inject_message_loss
== m
->get_type() - MDS_PORT_MIGRATOR
) {
115 dout(0) << "inject message loss " << *m
<< dendl
;
121 switch (m
->get_type()) {
123 case MSG_MDS_EXPORTDIRDISCOVER
:
124 handle_export_discover(static_cast<MExportDirDiscover
*>(m
));
126 case MSG_MDS_EXPORTDIRPREP
:
127 handle_export_prep(static_cast<MExportDirPrep
*>(m
));
129 case MSG_MDS_EXPORTDIR
:
130 if (unlikely(inject_session_race
)) {
131 dout(0) << "waiting for inject_session_race" << dendl
;
132 mds
->wait_for_any_client_connection(new C_MDS_RetryMessage(mds
, m
));
134 handle_export_dir(static_cast<MExportDir
*>(m
));
137 case MSG_MDS_EXPORTDIRFINISH
:
138 handle_export_finish(static_cast<MExportDirFinish
*>(m
));
140 case MSG_MDS_EXPORTDIRCANCEL
:
141 handle_export_cancel(static_cast<MExportDirCancel
*>(m
));
145 case MSG_MDS_EXPORTDIRDISCOVERACK
:
146 handle_export_discover_ack(static_cast<MExportDirDiscoverAck
*>(m
));
148 case MSG_MDS_EXPORTDIRPREPACK
:
149 handle_export_prep_ack(static_cast<MExportDirPrepAck
*>(m
));
151 case MSG_MDS_EXPORTDIRACK
:
152 handle_export_ack(static_cast<MExportDirAck
*>(m
));
154 case MSG_MDS_EXPORTDIRNOTIFYACK
:
155 handle_export_notify_ack(static_cast<MExportDirNotifyAck
*>(m
));
158 // export 3rd party (dir_auth adjustments)
159 case MSG_MDS_EXPORTDIRNOTIFY
:
160 handle_export_notify(static_cast<MExportDirNotify
*>(m
));
164 case MSG_MDS_EXPORTCAPS
:
165 handle_export_caps(static_cast<MExportCaps
*>(m
));
167 case MSG_MDS_EXPORTCAPSACK
:
168 handle_export_caps_ack(static_cast<MExportCapsAck
*>(m
));
170 case MSG_MDS_GATHERCAPS
:
171 handle_gather_caps(static_cast<MGatherCaps
*>(m
));
175 derr
<< "migrator unknown message " << m
->get_type() << dendl
;
176 assert(0 == "migrator unknown message");
181 class C_MDC_EmptyImport
: public MigratorContext
{
184 C_MDC_EmptyImport(Migrator
*m
, CDir
*d
) : MigratorContext(m
), dir(d
) {}
185 void finish(int r
) override
{
186 mig
->export_empty_import(dir
);
191 void Migrator::export_empty_import(CDir
*dir
)
193 dout(7) << "export_empty_import " << *dir
<< dendl
;
194 assert(dir
->is_subtree_root());
196 if (dir
->inode
->is_auth()) {
197 dout(7) << " inode is auth" << dendl
;
200 if (!dir
->is_auth()) {
201 dout(7) << " not auth" << dendl
;
204 if (dir
->is_freezing() || dir
->is_frozen()) {
205 dout(7) << " freezing or frozen" << dendl
;
208 if (dir
->get_num_head_items() > 0) {
209 dout(7) << " not actually empty" << dendl
;
212 if (dir
->inode
->is_root()) {
213 dout(7) << " root" << dendl
;
217 mds_rank_t dest
= dir
->inode
->authority().first
;
218 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
220 dout(7) << " really empty, exporting to " << dest
<< dendl
;
221 assert (dest
!= mds
->get_nodeid());
223 dout(7) << "exporting to mds." << dest
224 << " empty import " << *dir
<< dendl
;
225 export_dir( dir
, dest
);
228 void Migrator::find_stale_export_freeze()
230 utime_t now
= ceph_clock_now();
231 utime_t cutoff
= now
;
232 cutoff
-= g_conf
->mds_freeze_tree_timeout
;
236 * We could have situations like:
238 * - mds.0 authpins an item in subtree A
239 * - mds.0 sends request to mds.1 to authpin an item in subtree B
240 * - mds.0 freezes subtree A
241 * - mds.1 authpins an item in subtree B
242 * - mds.1 sends request to mds.0 to authpin an item in subtree A
243 * - mds.1 freezes subtree B
244 * - mds.1 receives the remote authpin request from mds.0
245 * (wait because subtree B is freezing)
246 * - mds.0 receives the remote authpin request from mds.1
247 * (wait because subtree A is freezing)
250 * - client request authpins items in subtree B
252 * - import subtree A which is parent of subtree B
253 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
255 * - client request tries authpinning items in subtree A
256 * (wait because subtree A is freezing)
258 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
259 p
!= export_state
.end(); ) {
260 CDir
* dir
= p
->first
;
261 export_state_t
& stat
= p
->second
;
263 if (stat
.state
!= EXPORT_DISCOVERING
&& stat
.state
!= EXPORT_FREEZING
)
265 if (stat
.last_cum_auth_pins
!= dir
->get_cum_auth_pins()) {
266 stat
.last_cum_auth_pins
= dir
->get_cum_auth_pins();
267 stat
.last_cum_auth_pins_change
= now
;
270 if (stat
.last_cum_auth_pins_change
>= cutoff
)
272 if (stat
.num_remote_waiters
> 0 ||
273 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
274 export_try_cancel(dir
);
279 void Migrator::export_try_cancel(CDir
*dir
, bool notify_peer
)
281 dout(10) << "export_try_cancel " << *dir
<< dendl
;
283 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
284 assert(it
!= export_state
.end());
286 int state
= it
->second
.state
;
289 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl
;
290 num_locking_exports
--;
291 it
->second
.state
= EXPORT_CANCELLED
;
292 dir
->auth_unpin(this);
294 case EXPORT_DISCOVERING
:
295 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl
;
296 it
->second
.state
= EXPORT_CANCELLED
;
297 dir
->unfreeze_tree(); // cancel the freeze
298 dir
->auth_unpin(this);
300 (!mds
->is_cluster_degraded() ||
301 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
302 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
305 case EXPORT_FREEZING
:
306 dout(10) << "export state=freezing : canceling freeze" << dendl
;
307 it
->second
.state
= EXPORT_CANCELLED
;
308 dir
->unfreeze_tree(); // cancel the freeze
309 if (dir
->is_subtree_root())
310 cache
->try_subtree_merge(dir
);
312 (!mds
->is_cluster_degraded() ||
313 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
314 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
317 // NOTE: state order reversal, warning comes after prepping
319 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl
;
320 it
->second
.state
= EXPORT_CANCELLING
;
323 case EXPORT_PREPPING
:
324 if (state
!= EXPORT_WARNING
) {
325 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl
;
326 it
->second
.state
= EXPORT_CANCELLED
;
332 cache
->get_subtree_bounds(dir
, bounds
);
333 for (set
<CDir
*>::iterator q
= bounds
.begin();
337 bd
->put(CDir::PIN_EXPORTBOUND
);
338 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
340 if (state
== EXPORT_WARNING
) {
342 export_notify_abort(dir
, it
->second
, bounds
);
343 // process delayed expires
344 cache
->process_delayed_expire(dir
);
347 dir
->unfreeze_tree();
348 cache
->try_subtree_merge(dir
);
350 (!mds
->is_cluster_degraded() ||
351 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
))) // tell them.
352 mds
->send_message_mds(new MExportDirCancel(dir
->dirfrag(), it
->second
.tid
), it
->second
.peer
);
355 case EXPORT_EXPORTING
:
356 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl
;
357 it
->second
.state
= EXPORT_CANCELLING
;
358 export_reverse(dir
, it
->second
);
361 case EXPORT_LOGGINGFINISH
:
362 case EXPORT_NOTIFYING
:
363 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl
;
364 // leave export_state, don't clean up now.
366 case EXPORT_CANCELLING
:
374 if (it
->second
.state
== EXPORT_CANCELLING
||
375 it
->second
.state
== EXPORT_CANCELLED
) {
377 mut
.swap(it
->second
.mut
);
379 if (it
->second
.state
== EXPORT_CANCELLED
) {
380 export_cancel_finish(it
);
384 if (state
== EXPORT_LOCKING
|| state
== EXPORT_DISCOVERING
) {
385 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(mut
.get());
387 mds
->mdcache
->request_kill(mdr
);
389 mds
->locker
->drop_locks(mut
.get());
393 cache
->show_subtrees();
395 maybe_do_queued_export();
399 void Migrator::export_cancel_finish(export_state_iterator
& it
)
401 CDir
*dir
= it
->first
;
402 bool unpin
= (it
->second
.state
== EXPORT_CANCELLING
);
403 auto parent
= std::move(it
->second
.parent
);
405 total_exporting_size
-= it
->second
.approx_size
;
406 export_state
.erase(it
);
408 assert(dir
->state_test(CDir::STATE_EXPORTING
));
409 dir
->clear_exporting();
412 // pinned by Migrator::export_notify_abort()
413 dir
->auth_unpin(this);
415 // send pending import_maps? (these need to go out when all exports have finished.)
416 cache
->maybe_send_pending_resolves();
419 child_export_finish(parent
, false);
422 // ==========================================================
423 // mds failure handling
425 void Migrator::handle_mds_failure_or_stop(mds_rank_t who
)
427 dout(5) << "handle_mds_failure_or_stop mds." << who
<< dendl
;
431 // first add an extra auth_pin on any freezes, so that canceling a
432 // nested freeze doesn't complete one further up the hierarchy and
433 // confuse the shit out of us. we'll remove it after canceling the
434 // freeze. this way no freeze completions run before we want them
436 list
<CDir
*> pinned_dirs
;
437 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
438 p
!= export_state
.end();
440 if (p
->second
.state
== EXPORT_FREEZING
) {
441 CDir
*dir
= p
->first
;
442 dout(10) << "adding temp auth_pin on freezing " << *dir
<< dendl
;
444 pinned_dirs
.push_back(dir
);
448 map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
449 while (p
!= export_state
.end()) {
450 map
<CDir
*,export_state_t
>::iterator next
= p
;
452 CDir
*dir
= p
->first
;
455 // - that are going to the failed node
456 // - that aren't frozen yet (to avoid auth_pin deadlock)
457 // - they havne't prepped yet (they may need to discover bounds to do that)
458 if ((p
->second
.peer
== who
&&
459 p
->second
.state
!= EXPORT_CANCELLING
) ||
460 p
->second
.state
== EXPORT_LOCKING
||
461 p
->second
.state
== EXPORT_DISCOVERING
||
462 p
->second
.state
== EXPORT_FREEZING
||
463 p
->second
.state
== EXPORT_PREPPING
) {
464 // the guy i'm exporting to failed, or we're just freezing.
465 dout(10) << "cleaning up export state (" << p
->second
.state
<< ")"
466 << get_export_statename(p
->second
.state
) << " of " << *dir
<< dendl
;
467 export_try_cancel(dir
);
468 } else if (p
->second
.peer
!= who
) {
470 if (p
->second
.warning_ack_waiting
.erase(who
)) {
471 if (p
->second
.state
== EXPORT_WARNING
) {
472 p
->second
.notify_ack_waiting
.erase(who
); // they won't get a notify either.
473 // exporter waiting for warning acks, let's fake theirs.
474 dout(10) << "faking export_warning_ack from mds." << who
475 << " on " << *dir
<< " to mds." << p
->second
.peer
477 if (p
->second
.warning_ack_waiting
.empty())
481 if (p
->second
.notify_ack_waiting
.erase(who
)) {
482 // exporter is waiting for notify acks, fake it
483 dout(10) << "faking export_notify_ack from mds." << who
484 << " on " << *dir
<< " to mds." << p
->second
.peer
486 if (p
->second
.state
== EXPORT_NOTIFYING
) {
487 if (p
->second
.notify_ack_waiting
.empty())
489 } else if (p
->second
.state
== EXPORT_CANCELLING
) {
490 if (p
->second
.notify_ack_waiting
.empty()) {
491 export_cancel_finish(p
);
503 map
<dirfrag_t
,import_state_t
>::iterator q
= import_state
.begin();
504 while (q
!= import_state
.end()) {
505 map
<dirfrag_t
,import_state_t
>::iterator next
= q
;
507 dirfrag_t df
= q
->first
;
508 CInode
*diri
= mds
->mdcache
->get_inode(df
.ino
);
509 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
511 if (q
->second
.peer
== who
) {
513 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
514 << get_import_statename(q
->second
.state
) << " of " << *dir
<< dendl
;
516 dout(10) << "cleaning up import state (" << q
->second
.state
<< ")"
517 << get_import_statename(q
->second
.state
) << " of " << df
<< dendl
;
519 switch (q
->second
.state
) {
520 case IMPORT_DISCOVERING
:
521 dout(10) << "import state=discovering : clearing state" << dendl
;
522 import_reverse_discovering(df
);
525 case IMPORT_DISCOVERED
:
527 dout(10) << "import state=discovered : unpinning inode " << *diri
<< dendl
;
528 import_reverse_discovered(df
, diri
);
531 case IMPORT_PREPPING
:
533 dout(10) << "import state=prepping : unpinning base+bounds " << *dir
<< dendl
;
534 import_reverse_prepping(dir
, q
->second
);
539 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir
<< dendl
;
542 cache
->get_subtree_bounds(dir
, bounds
);
543 import_remove_pins(dir
, bounds
);
545 // adjust auth back to the exporter
546 cache
->adjust_subtree_auth(dir
, q
->second
.peer
);
548 // notify bystanders ; wait in aborting state
549 q
->second
.state
= IMPORT_ABORTING
;
550 import_notify_abort(dir
, bounds
);
551 assert(g_conf
->mds_kill_import_at
!= 10);
555 case IMPORT_LOGGINGSTART
:
557 dout(10) << "import state=loggingstart : reversing import on " << *dir
<< dendl
;
563 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
564 dout(10) << "import state=acking : noting ambiguous import " << *dir
<< dendl
;
567 cache
->get_subtree_bounds(dir
, bounds
);
568 cache
->add_ambiguous_import(dir
, bounds
);
572 case IMPORT_FINISHING
:
574 dout(10) << "import state=finishing : finishing import on " << *dir
<< dendl
;
575 import_finish(dir
, true);
578 case IMPORT_ABORTING
:
580 dout(10) << "import state=aborting : ignoring repeat failure " << *dir
<< dendl
;
584 auto bystanders_entry
= q
->second
.bystanders
.find(who
);
585 if (bystanders_entry
!= q
->second
.bystanders
.end()) {
586 q
->second
.bystanders
.erase(bystanders_entry
);
587 if (q
->second
.state
== IMPORT_ABORTING
) {
589 dout(10) << "faking export_notify_ack from mds." << who
590 << " on aborting import " << *dir
<< " from mds." << q
->second
.peer
592 if (q
->second
.bystanders
.empty())
593 import_reverse_unfreeze(dir
);
602 while (!pinned_dirs
.empty()) {
603 CDir
*dir
= pinned_dirs
.front();
604 dout(10) << "removing temp auth_pin on " << *dir
<< dendl
;
605 dir
->auth_unpin(this);
606 pinned_dirs
.pop_front();
612 void Migrator::show_importing()
614 dout(10) << "show_importing" << dendl
;
615 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
616 p
!= import_state
.end();
618 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
620 dout(10) << " importing from " << p
->second
.peer
621 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
622 << " " << p
->first
<< " " << *dir
<< dendl
;
624 dout(10) << " importing from " << p
->second
.peer
625 << ": (" << p
->second
.state
<< ") " << get_import_statename(p
->second
.state
)
626 << " " << p
->first
<< dendl
;
631 void Migrator::show_exporting()
633 dout(10) << "show_exporting" << dendl
;
634 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
635 p
!= export_state
.end();
637 dout(10) << " exporting to " << p
->second
.peer
638 << ": (" << p
->second
.state
<< ") " << get_export_statename(p
->second
.state
)
639 << " " << p
->first
->dirfrag() << " " << *p
->first
<< dendl
;
644 void Migrator::audit()
646 if (!g_conf
->subsys
.should_gather(ceph_subsys_mds
, 5))
651 for (map
<dirfrag_t
,import_state_t
>::iterator p
= import_state
.begin();
652 p
!= import_state
.end();
654 if (p
->second
.state
== IMPORT_DISCOVERING
)
656 if (p
->second
.state
== IMPORT_DISCOVERED
) {
657 CInode
*in
= cache
->get_inode(p
->first
.ino
);
661 CDir
*dir
= cache
->get_dirfrag(p
->first
);
663 if (p
->second
.state
== IMPORT_PREPPING
)
665 if (p
->second
.state
== IMPORT_ABORTING
) {
666 assert(!dir
->is_ambiguous_dir_auth());
667 assert(dir
->get_dir_auth().first
!= mds
->get_nodeid());
670 assert(dir
->is_ambiguous_dir_auth());
671 assert(dir
->authority().first
== mds
->get_nodeid() ||
672 dir
->authority().second
== mds
->get_nodeid());
677 for (map
<CDir
*,export_state_t
>::iterator p
= export_state
.begin();
678 p
!= export_state
.end();
680 CDir
*dir
= p
->first
;
681 if (p
->second
.state
== EXPORT_LOCKING
||
682 p
->second
.state
== EXPORT_DISCOVERING
||
683 p
->second
.state
== EXPORT_FREEZING
||
684 p
->second
.state
== EXPORT_CANCELLING
)
686 assert(dir
->is_ambiguous_dir_auth());
687 assert(dir
->authority().first
== mds
->get_nodeid() ||
688 dir
->authority().second
== mds
->get_nodeid());
691 // ambiguous+me subtrees should be importing|exporting
700 // ==========================================================
703 void Migrator::export_dir_nicely(CDir
*dir
, mds_rank_t dest
)
706 dout(7) << "export_dir_nicely " << *dir
<< " to " << dest
<< dendl
;
707 export_queue
.push_back(pair
<dirfrag_t
,mds_rank_t
>(dir
->dirfrag(), dest
));
709 maybe_do_queued_export();
712 void Migrator::maybe_do_queued_export()
719 uint64_t max_total_size
= max_export_size
* 2;
721 while (!export_queue
.empty() &&
722 max_total_size
> total_exporting_size
&&
723 max_total_size
- total_exporting_size
>=
724 max_export_size
* (num_locking_exports
+ 1)) {
726 dirfrag_t df
= export_queue
.front().first
;
727 mds_rank_t dest
= export_queue
.front().second
;
728 export_queue
.pop_front();
730 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
732 if (!dir
->is_auth()) continue;
734 dout(0) << "nicely exporting to mds." << dest
<< " " << *dir
<< dendl
;
736 export_dir(dir
, dest
);
745 class C_MDC_ExportFreeze
: public MigratorContext
{
746 CDir
*ex
; // dir i'm exporting
749 C_MDC_ExportFreeze(Migrator
*m
, CDir
*e
, uint64_t t
) :
750 MigratorContext(m
), ex(e
), tid(t
) {
753 void finish(int r
) override
{
755 mig
->export_frozen(ex
, tid
);
760 void Migrator::get_export_lock_set(CDir
*dir
, set
<SimpleLock
*>& locks
)
763 vector
<CDentry
*> trace
;
764 cache
->make_trace(trace
, dir
->inode
);
765 for (vector
<CDentry
*>::iterator it
= trace
.begin();
768 locks
.insert(&(*it
)->lock
);
770 // prevent scatter gather race
771 locks
.insert(&dir
->get_inode()->dirfragtreelock
);
774 // NOTE: We need to take an rdlock on bounding dirfrags during
775 // migration for a rather irritating reason: when we export the
776 // bound inode, we need to send scatterlock state for the dirfrags
777 // as well, so that the new auth also gets the correct info. If we
778 // race with a refragment, this info is useless, as we can't
779 // redivvy it up. And it's needed for the scatterlocks to work
780 // properly: when the auth is in a sync/lock state it keeps each
781 // dirfrag's portion in the local (auth OR replica) dirfrag.
782 set
<CDir
*> wouldbe_bounds
;
783 cache
->get_wouldbe_subtree_bounds(dir
, wouldbe_bounds
);
784 for (set
<CDir
*>::iterator p
= wouldbe_bounds
.begin(); p
!= wouldbe_bounds
.end(); ++p
)
785 locks
.insert(&(*p
)->get_inode()->dirfragtreelock
);
789 /** export_dir(dir, dest)
790 * public method to initiate an export.
791 * will fail if the directory is freezing, frozen, unpinnable, or root.
793 void Migrator::export_dir(CDir
*dir
, mds_rank_t dest
)
795 dout(7) << "export_dir " << *dir
<< " to " << dest
<< dendl
;
796 assert(dir
->is_auth());
797 assert(dest
!= mds
->get_nodeid());
799 if (!(mds
->is_active() || mds
->is_stopping())) {
800 dout(7) << "i'm not active, no exports for now" << dendl
;
803 if (mds
->mdcache
->is_readonly()) {
804 dout(7) << "read-only FS, no exports for now" << dendl
;
807 if (!mds
->mdsmap
->is_active(dest
)) {
808 dout(7) << "dest not active, no exports for now" << dendl
;
811 if (mds
->is_cluster_degraded()) {
812 dout(7) << "cluster degraded, no exports for now" << dendl
;
815 if (dir
->inode
->is_system()) {
816 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl
;
821 CDir
* parent_dir
= dir
->inode
->get_projected_parent_dir();
822 if (parent_dir
&& parent_dir
->inode
->is_stray()) {
823 if (parent_dir
->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest
)) {
824 dout(7) << "i won't export anything in stray" << dendl
;
828 if (!mds
->is_stopping() && !dir
->inode
->is_exportable(dest
)) {
829 dout(7) << "dir is export pinned" << dendl
;
834 if (dir
->is_frozen() ||
835 dir
->is_freezing()) {
836 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl
;
839 if (dir
->state_test(CDir::STATE_EXPORTING
)) {
840 dout(7) << "already exporting" << dendl
;
844 if (g_conf
->mds_thrash_exports
) {
845 // create random subtree bound (which will not be exported)
847 for (auto p
= dir
->begin(); p
!= dir
->end(); ++p
) {
849 CDentry::linkage_t
*dnl
= dn
->get_linkage();
850 if (dnl
->is_primary()) {
851 CInode
*in
= dnl
->get_inode();
853 in
->get_nested_dirfrags(ls
);
857 int n
= rand() % ls
.size();
861 if (!(bd
->is_frozen() || bd
->is_freezing())) {
862 assert(bd
->is_auth());
863 dir
->state_set(CDir::STATE_AUXSUBTREE
);
864 mds
->mdcache
->adjust_subtree_auth(dir
, mds
->get_nodeid());
865 dout(0) << "export_dir: create aux subtree " << *bd
<< " under " << *dir
<< dendl
;
870 mds
->hit_export_target(ceph_clock_now(), dest
, -1);
873 dir
->mark_exporting();
875 MDRequestRef mdr
= mds
->mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
876 mdr
->more()->export_dir
= dir
;
878 assert(export_state
.count(dir
) == 0);
879 export_state_t
& stat
= export_state
[dir
];
880 num_locking_exports
++;
881 stat
.state
= EXPORT_LOCKING
;
883 stat
.tid
= mdr
->reqid
.tid
;
886 mds
->mdcache
->dispatch_request(mdr
);
890 * check if directory is too large to be export in whole. If it is,
891 * choose some subdirs, whose total size is suitable.
893 void Migrator::maybe_split_export(CDir
* dir
, uint64_t max_size
, bool null_okay
,
894 vector
<pair
<CDir
*, size_t> >& results
)
896 static const unsigned frag_size
= 800;
897 static const unsigned inode_size
= 1000;
898 static const unsigned cap_size
= 80;
899 static const unsigned remote_size
= 10;
900 static const unsigned null_size
= 1;
902 // state for depth-first search
905 CDir::dentry_key_map::iterator iter
;
906 size_t dirfrag_size
= frag_size
;
907 size_t subdirs_size
= 0;
908 bool complete
= true;
909 vector
<CDir
*> siblings
;
910 vector
<pair
<CDir
*, size_t> > subdirs
;
911 LevelData(const LevelData
&) = default;
913 dir(d
), iter(d
->begin()) {}
916 vector
<LevelData
> stack
;
917 stack
.emplace_back(dir
);
919 size_t found_size
= 0;
920 size_t skipped_size
= 0;
923 auto& data
= stack
.back();
924 CDir
*cur
= data
.dir
;
925 auto& it
= data
.iter
;
926 auto& dirfrag_size
= data
.dirfrag_size
;
928 while(it
!= cur
->end()) {
929 CDentry
*dn
= it
->second
;
932 dirfrag_size
+= dn
->name
.size();
933 if (dn
->get_linkage()->is_null()) {
934 dirfrag_size
+= null_size
;
937 if (dn
->get_linkage()->is_remote()) {
938 dirfrag_size
+= remote_size
;
942 CInode
*in
= dn
->get_linkage()->get_inode();
943 dirfrag_size
+= inode_size
;
944 dirfrag_size
+= in
->get_client_caps().size() * cap_size
;
948 in
->get_nested_dirfrags(ls
);
949 std::reverse(ls
.begin(), ls
.end());
951 bool complete
= true;
952 for (auto p
= ls
.begin(); p
!= ls
.end(); ) {
953 if ((*p
)->state_test(CDir::STATE_EXPORTING
) ||
954 (*p
)->is_freezing_dir() || (*p
)->is_frozen_dir()) {
962 // skip exporting dir's ancestors. because they can't get
963 // frozen (exporting dir's parent inode is auth pinned).
964 for (auto p
= stack
.rbegin(); p
< stack
.rend(); ++p
) {
971 stack
.emplace_back(ls
.back());
973 stack
.back().siblings
.swap(ls
);
978 // did above loop push new dirfrag into the stack?
979 if (stack
.back().dir
!= cur
)
983 auto cur_size
= data
.subdirs_size
+ dirfrag_size
;
984 // we can do nothing with large dirfrag
985 if (cur_size
>= max_size
&& found_size
* 2 > max_size
)
988 found_size
+= dirfrag_size
;
990 if (stack
.size() > 1) {
991 auto& parent
= stack
[stack
.size() - 2];
992 parent
.subdirs
.emplace_back(cur
, cur_size
);
993 parent
.subdirs_size
+= cur_size
;
996 // can't merge current dirfrag to its parent if there is skipped subdir
997 results
.insert(results
.end(), data
.subdirs
.begin(), data
.subdirs
.end());
998 skipped_size
+= dirfrag_size
;
1002 ls
.swap(data
.siblings
);
1008 if (found_size
>= max_size
)
1013 stack
.emplace_back(ls
.back());
1015 stack
.back().siblings
.swap(ls
);
1019 for (auto& p
: stack
)
1020 results
.insert(results
.end(), p
.subdirs
.begin(), p
.subdirs
.end());
1022 if (results
.empty() && (!skipped_size
|| !null_okay
))
1023 results
.emplace_back(dir
, found_size
+ skipped_size
);
1026 class C_M_ExportDirWait
: public MigratorContext
{
1030 C_M_ExportDirWait(Migrator
*m
, MDRequestRef mdr
, int count
)
1031 : MigratorContext(m
), mdr(mdr
), count(count
) {}
1032 void finish(int r
) override
{
1033 mig
->dispatch_export_dir(mdr
, count
);
1037 void Migrator::dispatch_export_dir(MDRequestRef
& mdr
, int count
)
1039 CDir
*dir
= mdr
->more()->export_dir
;
1040 dout(7) << "dispatch_export_dir " << *mdr
<< " " << *dir
<< dendl
;
1042 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1043 if (it
== export_state
.end() || it
->second
.tid
!= mdr
->reqid
.tid
) {
1044 // export must have aborted.
1045 dout(7) << "export must have aborted " << *mdr
<< dendl
;
1046 assert(mdr
->killed
|| mdr
->aborted
);
1048 mdr
->aborted
= false;
1049 mds
->mdcache
->request_kill(mdr
);
1053 assert(it
->second
.state
== EXPORT_LOCKING
);
1055 mds_rank_t dest
= it
->second
.peer
;
1057 if (!mds
->is_export_target(dest
)) {
1058 dout(7) << "dest is not yet an export target" << dendl
;
1060 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl
;
1061 export_try_cancel(dir
);
1065 mds
->locker
->drop_locks(mdr
.get());
1066 mdr
->drop_local_auth_pins();
1068 mds
->wait_for_mdsmap(mds
->mdsmap
->get_epoch(), new C_M_ExportDirWait(this, mdr
, count
+1));
1072 if (!dir
->inode
->get_parent_dn()) {
1073 dout(7) << "waiting for dir to become stable before export: " << *dir
<< dendl
;
1074 dir
->add_waiter(CDir::WAIT_CREATED
, new C_M_ExportDirWait(this, mdr
, 1));
1078 if (mdr
->aborted
|| dir
->is_frozen() || dir
->is_freezing()) {
1079 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl
;
1080 export_try_cancel(dir
);
1085 set
<SimpleLock
*> rdlocks
;
1086 set
<SimpleLock
*> xlocks
;
1087 set
<SimpleLock
*> wrlocks
;
1088 get_export_lock_set(dir
, rdlocks
);
1089 // If auth MDS of the subtree root inode is neither the exporter MDS
1090 // nor the importer MDS and it gathers subtree root's fragstat/neststat
1091 // while the subtree is exporting. It's possible that the exporter MDS
1092 // and the importer MDS both are auth MDS of the subtree root or both
1093 // are not auth MDS of the subtree root at the time they receive the
1094 // lock messages. So the auth MDS of the subtree root inode may get no
1095 // or duplicated fragstat/neststat for the subtree root dirfrag.
1096 wrlocks
.insert(&dir
->get_inode()->filelock
);
1097 wrlocks
.insert(&dir
->get_inode()->nestlock
);
1098 if (dir
->get_inode()->is_auth()) {
1099 dir
->get_inode()->filelock
.set_scatter_wanted();
1100 dir
->get_inode()->nestlock
.set_scatter_wanted();
1103 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
, NULL
, NULL
, true)) {
1105 export_try_cancel(dir
);
1109 assert(g_conf
->mds_kill_export_at
!= 1);
1111 auto parent
= it
->second
.parent
;
1113 vector
<pair
<CDir
*, size_t> > results
;
1114 maybe_split_export(dir
, max_export_size
, (bool)parent
, results
);
1116 if (results
.size() == 1 && results
.front().first
== dir
) {
1117 num_locking_exports
--;
1118 it
->second
.state
= EXPORT_DISCOVERING
;
1119 // send ExportDirDiscover (ask target)
1121 dir
->inode
->make_path(path
);
1122 MExportDirDiscover
*discover
= new MExportDirDiscover(dir
->dirfrag(), path
,
1123 mds
->get_nodeid(), it
->second
.tid
);
1124 mds
->send_message_mds(discover
, dest
);
1125 assert(g_conf
->mds_kill_export_at
!= 2);
1127 it
->second
.last_cum_auth_pins_change
= ceph_clock_now();
1128 it
->second
.approx_size
= results
.front().second
;
1129 it
->second
.orig_size
= it
->second
.approx_size
;
1130 total_exporting_size
+= it
->second
.approx_size
;
1132 // start the freeze, but hold it up with an auth_pin.
1134 assert(dir
->is_freezing_tree());
1135 dir
->add_waiter(CDir::WAIT_FROZEN
, new C_MDC_ExportFreeze(this, dir
, it
->second
.tid
));
1140 parent
->pending_children
+= results
.size();
1142 parent
= std::make_shared
<export_base_t
>(dir
->dirfrag(), dest
,
1143 results
.size(), export_queue_gen
);
1146 if (results
.empty()) {
1147 dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
1148 << parent
->dirfrag
<< dendl
;
1149 parent
->restart
= true;
1151 dout(7) << "subtree is too large, splitting it into: " << dendl
;
1154 for (auto& p
: results
) {
1155 CDir
*sub
= p
.first
;
1157 dout(7) << " sub " << *sub
<< dendl
;
1159 sub
->auth_pin(this);
1160 sub
->mark_exporting();
1162 MDRequestRef _mdr
= mds
->mdcache
->request_start_internal(CEPH_MDS_OP_EXPORTDIR
);
1163 _mdr
->more()->export_dir
= sub
;
1165 assert(export_state
.count(sub
) == 0);
1166 auto& stat
= export_state
[sub
];
1167 num_locking_exports
++;
1168 stat
.state
= EXPORT_LOCKING
;
1170 stat
.tid
= _mdr
->reqid
.tid
;
1172 stat
.parent
= parent
;
1173 mds
->mdcache
->dispatch_request(_mdr
);
1176 // cancel the original one
1177 export_try_cancel(dir
);
1180 void Migrator::restart_export_dir(CDir
*dir
, uint64_t tid
)
1182 auto it
= export_state
.find(dir
);
1183 if (it
== export_state
.end() || it
->second
.tid
!= tid
)
1185 if (it
->second
.state
!= EXPORT_DISCOVERING
&&
1186 it
->second
.state
!= EXPORT_FREEZING
)
1189 dout(7) << "restart_export_dir " << *dir
<< dendl
;
1191 std::shared_ptr
<export_base_t
> parent
;
1192 parent
.swap(it
->second
.parent
);
1194 export_queue
.emplace_front(dir
->dirfrag(), it
->second
.peer
);
1196 export_try_cancel(dir
);
1199 child_export_finish(parent
, true);
1202 class C_MDC_RestartExportDir
: public MigratorContext
{
1206 C_MDC_RestartExportDir(Migrator
*m
, CDir
*d
, uint64_t t
) :
1207 MigratorContext(m
), dir(d
), tid(t
) {}
1208 void finish(int r
) override
{
1209 mig
->restart_export_dir(dir
, tid
);
1213 bool Migrator::adjust_export_size(export_state_t
&stat
, CDir
*dir
)
1215 if (dir
->state_test(CDir::STATE_EXPORTING
) ||
1216 dir
->is_freezing_dir() || dir
->is_frozen_dir())
1219 if (stat
.approx_size
>= max_export_size
&&
1220 stat
.approx_size
>= stat
.orig_size
* 2)
1223 vector
<pair
<CDir
*, size_t> > results
;
1224 maybe_split_export(dir
, max_export_size
, true, results
);
1225 if (results
.size() == 1 && results
.front().first
== dir
) {
1226 auto size
= results
.front().second
;
1227 stat
.approx_size
+= size
;
1228 total_exporting_size
+= size
;
1235 void Migrator::adjust_export_after_rename(CInode
* diri
, CDir
*olddir
)
1237 CDir
*newdir
= diri
->get_parent_dir();
1238 if (newdir
== olddir
)
1241 CDir
*freezing_dir
= newdir
->get_freezing_tree_root();
1242 CDir
*old_freezing_dir
= olddir
->get_freezing_tree_root();
1243 if (!freezing_dir
|| freezing_dir
== old_freezing_dir
)
1246 dout(7) << "adjust_export_after_rename " << *diri
<< dendl
;
1248 auto &stat
= export_state
.at(freezing_dir
);
1249 assert(stat
.state
== EXPORT_DISCOVERING
||
1250 stat
.state
== EXPORT_FREEZING
);
1252 if (g_conf
->mds_thrash_exports
) {
1253 if (rand() % 3 == 0) {
1254 mds
->queue_waiter_front(new C_MDC_RestartExportDir(this, freezing_dir
, stat
.tid
));
1260 diri
->get_nested_dirfrags(ls
);
1262 if (!adjust_export_size(stat
, d
)) {
1263 mds
->queue_waiter_front(new C_MDC_RestartExportDir(this, freezing_dir
, stat
.tid
));
1269 void Migrator::child_export_finish(std::shared_ptr
<export_base_t
>& parent
, bool success
)
1272 parent
->restart
= true;
1273 if (--parent
->pending_children
== 0) {
1274 if (parent
->restart
&&
1275 parent
->export_queue_gen
== export_queue_gen
) {
1276 CDir
*origin
= mds
->mdcache
->get_dirfrag(parent
->dirfrag
);
1277 if (origin
&& origin
->is_auth()) {
1278 dout(7) << "child_export_finish requeue " << *origin
<< dendl
;
1279 export_queue
.emplace_front(origin
->dirfrag(), parent
->dest
);
1286 * called on receipt of MExportDirDiscoverAck
1287 * the importer now has the directory's _inode_ in memory, and pinned.
1289 * This function DOES put the passed message before returning
1291 void Migrator::handle_export_discover_ack(MExportDirDiscoverAck
*m
)
1293 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1294 mds_rank_t
dest(m
->get_source().num());
1295 utime_t now
= ceph_clock_now();
1298 dout(7) << "export_discover_ack from " << m
->get_source()
1299 << " on " << *dir
<< dendl
;
1301 mds
->hit_export_target(now
, dest
, -1);
1303 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1304 if (it
== export_state
.end() ||
1305 it
->second
.tid
!= m
->get_tid() ||
1306 it
->second
.peer
!= dest
) {
1307 dout(7) << "must have aborted" << dendl
;
1309 assert(it
->second
.state
== EXPORT_DISCOVERING
);
1311 if (m
->is_success()) {
1312 // release locks to avoid deadlock
1313 MDRequestRef mdr
= static_cast<MDRequestImpl
*>(it
->second
.mut
.get());
1315 mds
->mdcache
->request_finish(mdr
);
1316 it
->second
.mut
.reset();
1317 // freeze the subtree
1318 it
->second
.state
= EXPORT_FREEZING
;
1319 dir
->auth_unpin(this);
1320 assert(g_conf
->mds_kill_export_at
!= 3);
1323 dout(7) << "peer failed to discover (not active?), canceling" << dendl
;
1324 export_try_cancel(dir
, false);
1331 class C_M_ExportSessionsFlushed
: public MigratorContext
{
1335 C_M_ExportSessionsFlushed(Migrator
*m
, CDir
*d
, uint64_t t
)
1336 : MigratorContext(m
), dir(d
), tid(t
) {
1337 assert(dir
!= NULL
);
1339 void finish(int r
) override
{
1340 mig
->export_sessions_flushed(dir
, tid
);
1344 void Migrator::export_sessions_flushed(CDir
*dir
, uint64_t tid
)
1346 dout(7) << "export_sessions_flushed " << *dir
<< dendl
;
1348 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1349 if (it
== export_state
.end() ||
1350 it
->second
.state
== EXPORT_CANCELLING
||
1351 it
->second
.tid
!= tid
) {
1352 // export must have aborted.
1353 dout(7) << "export must have aborted on " << dir
<< dendl
;
1357 assert(it
->second
.state
== EXPORT_PREPPING
|| it
->second
.state
== EXPORT_WARNING
);
1358 assert(it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0);
1359 it
->second
.warning_ack_waiting
.erase(MDS_RANK_NONE
);
1360 if (it
->second
.state
== EXPORT_WARNING
&& it
->second
.warning_ack_waiting
.empty())
1361 export_go(dir
); // start export.
1364 void Migrator::export_frozen(CDir
*dir
, uint64_t tid
)
1366 dout(7) << "export_frozen on " << *dir
<< dendl
;
1368 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1369 if (it
== export_state
.end() || it
->second
.tid
!= tid
) {
1370 dout(7) << "export must have aborted" << dendl
;
1374 assert(it
->second
.state
== EXPORT_FREEZING
);
1375 assert(dir
->is_frozen_tree_root());
1376 assert(dir
->get_cum_auth_pins() == 0);
1378 CInode
*diri
= dir
->get_inode();
1380 // ok, try to grab all my locks.
1381 set
<SimpleLock
*> rdlocks
;
1382 get_export_lock_set(dir
, rdlocks
);
1383 if ((diri
->is_auth() && diri
->is_frozen()) ||
1384 !mds
->locker
->can_rdlock_set(rdlocks
) ||
1385 !diri
->filelock
.can_wrlock(-1) ||
1386 !diri
->nestlock
.can_wrlock(-1)) {
1387 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1389 export_try_cancel(dir
);
1393 it
->second
.mut
= new MutationImpl();
1394 if (diri
->is_auth())
1395 it
->second
.mut
->auth_pin(diri
);
1396 mds
->locker
->rdlock_take_set(rdlocks
, it
->second
.mut
);
1397 mds
->locker
->wrlock_force(&diri
->filelock
, it
->second
.mut
);
1398 mds
->locker
->wrlock_force(&diri
->nestlock
, it
->second
.mut
);
1400 cache
->show_subtrees();
1402 // CDir::_freeze_tree() should have forced it into subtree.
1403 assert(dir
->get_dir_auth() == mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
1406 cache
->get_subtree_bounds(dir
, bounds
);
1408 // generate prep message, log entry.
1409 MExportDirPrep
*prep
= new MExportDirPrep(dir
->dirfrag(), it
->second
.tid
);
1411 // include list of bystanders
1412 for (const auto &p
: dir
->get_replicas()) {
1413 if (p
.first
!= it
->second
.peer
) {
1414 dout(10) << "bystander mds." << p
.first
<< dendl
;
1415 prep
->add_bystander(p
.first
);
1419 // include base dirfrag
1420 cache
->replicate_dir(dir
, it
->second
.peer
, prep
->basedir
);
1423 * include spanning tree for all nested exports.
1424 * these need to be on the destination _before_ the final export so that
1425 * dir_auth updates on any nested exports are properly absorbed.
1426 * this includes inodes and dirfrags included in the subtree, but
1427 * only the inodes at the bounds.
1429 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1431 set
<inodeno_t
> inodes_added
;
1432 set
<dirfrag_t
> dirfrags_added
;
1435 for (set
<CDir
*>::iterator p
= bounds
.begin();
1441 bound
->get(CDir::PIN_EXPORTBOUND
);
1442 bound
->state_set(CDir::STATE_EXPORTBOUND
);
1444 dout(7) << " export bound " << *bound
<< dendl
;
1445 prep
->add_bound( bound
->dirfrag() );
1453 // don't repeat inodes
1454 if (inodes_added
.count(cur
->inode
->ino()))
1456 inodes_added
.insert(cur
->inode
->ino());
1458 // prepend dentry + inode
1459 assert(cur
->inode
->is_auth());
1461 cache
->replicate_dentry(cur
->inode
->parent
, it
->second
.peer
, bl
);
1462 dout(7) << " added " << *cur
->inode
->parent
<< dendl
;
1463 cache
->replicate_inode(cur
->inode
, it
->second
.peer
, bl
,
1464 mds
->mdsmap
->get_up_features());
1465 dout(7) << " added " << *cur
->inode
<< dendl
;
1466 bl
.claim_append(tracebl
);
1469 cur
= cur
->get_parent_dir();
1471 // don't repeat dirfrags
1472 if (dirfrags_added
.count(cur
->dirfrag()) ||
1474 start
= 'd'; // start with dentry
1477 dirfrags_added
.insert(cur
->dirfrag());
1480 cache
->replicate_dir(cur
, it
->second
.peer
, bl
);
1481 dout(7) << " added " << *cur
<< dendl
;
1482 bl
.claim_append(tracebl
);
1485 start
= 'f'; // start with dirfrag
1487 bufferlist final_bl
;
1488 dirfrag_t df
= cur
->dirfrag();
1489 ::encode(df
, final_bl
);
1490 ::encode(start
, final_bl
);
1491 final_bl
.claim_append(tracebl
);
1492 prep
->add_trace(final_bl
);
1496 it
->second
.state
= EXPORT_PREPPING
;
1497 mds
->send_message_mds(prep
, it
->second
.peer
);
1498 assert (g_conf
->mds_kill_export_at
!= 4);
1500 // make sure any new instantiations of caps are flushed out
1501 assert(it
->second
.warning_ack_waiting
.empty());
1503 set
<client_t
> export_client_set
;
1504 get_export_client_set(dir
, export_client_set
);
1506 MDSGatherBuilder
gather(g_ceph_context
);
1507 mds
->server
->flush_client_sessions(export_client_set
, gather
);
1508 if (gather
.has_subs()) {
1509 it
->second
.warning_ack_waiting
.insert(MDS_RANK_NONE
);
1510 gather
.set_finisher(new C_M_ExportSessionsFlushed(this, dir
, it
->second
.tid
));
1515 void Migrator::get_export_client_set(CDir
*dir
, set
<client_t
>& client_set
)
1519 while (!dfs
.empty()) {
1520 CDir
*dir
= dfs
.front();
1522 for (auto& p
: *dir
) {
1523 CDentry
*dn
= p
.second
;
1524 if (!dn
->get_linkage()->is_primary())
1526 CInode
*in
= dn
->get_linkage()->get_inode();
1530 in
->get_dirfrags(ls
);
1531 for (auto& q
: ls
) {
1532 if (!q
->state_test(CDir::STATE_EXPORTBOUND
)) {
1533 // include nested dirfrag
1534 assert(q
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1535 dfs
.push_back(q
); // it's ours, recurse (later)
1539 for (auto& q
: in
->get_client_caps()) {
1540 client_set
.insert(q
.first
);
1546 void Migrator::get_export_client_set(CInode
*in
, set
<client_t
>& client_set
)
1548 for (map
<client_t
, Capability
*>::iterator q
= in
->client_caps
.begin();
1549 q
!= in
->client_caps
.end();
1551 client_set
.insert(q
->first
);
1554 /* This function DOES put the passed message before returning*/
1555 void Migrator::handle_export_prep_ack(MExportDirPrepAck
*m
)
1557 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1558 mds_rank_t
dest(m
->get_source().num());
1559 utime_t now
= ceph_clock_now();
1562 dout(7) << "export_prep_ack " << *dir
<< dendl
;
1564 mds
->hit_export_target(now
, dest
, -1);
1566 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1567 if (it
== export_state
.end() ||
1568 it
->second
.tid
!= m
->get_tid() ||
1569 it
->second
.peer
!= mds_rank_t(m
->get_source().num())) {
1570 // export must have aborted.
1571 dout(7) << "export must have aborted" << dendl
;
1575 assert(it
->second
.state
== EXPORT_PREPPING
);
1577 if (!m
->is_success()) {
1578 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl
;
1579 export_try_cancel(dir
, false);
1584 assert (g_conf
->mds_kill_export_at
!= 5);
1587 cache
->get_subtree_bounds(dir
, bounds
);
1589 assert(it
->second
.warning_ack_waiting
.empty() ||
1590 (it
->second
.warning_ack_waiting
.size() == 1 &&
1591 it
->second
.warning_ack_waiting
.count(MDS_RANK_NONE
) > 0));
1592 assert(it
->second
.notify_ack_waiting
.empty());
1594 for (const auto &p
: dir
->get_replicas()) {
1595 if (p
.first
== it
->second
.peer
) continue;
1596 if (mds
->is_cluster_degraded() &&
1597 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(p
.first
))
1598 continue; // only if active
1599 it
->second
.warning_ack_waiting
.insert(p
.first
);
1600 it
->second
.notify_ack_waiting
.insert(p
.first
); // we'll eventually get a notifyack, too!
1602 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), it
->second
.tid
, true,
1603 mds_authority_t(mds
->get_nodeid(),CDIR_AUTH_UNKNOWN
),
1604 mds_authority_t(mds
->get_nodeid(),it
->second
.peer
));
1605 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
1606 notify
->get_bounds().push_back((*q
)->dirfrag());
1607 mds
->send_message_mds(notify
, p
.first
);
1611 it
->second
.state
= EXPORT_WARNING
;
1613 assert(g_conf
->mds_kill_export_at
!= 6);
1615 if (it
->second
.warning_ack_waiting
.empty())
1616 export_go(dir
); // start export.
1623 class C_M_ExportGo
: public MigratorContext
{
1627 C_M_ExportGo(Migrator
*m
, CDir
*d
, uint64_t t
) :
1628 MigratorContext(m
), dir(d
), tid(t
) {
1629 assert(dir
!= NULL
);
1631 void finish(int r
) override
{
1632 mig
->export_go_synced(dir
, tid
);
1636 void Migrator::export_go(CDir
*dir
)
1638 auto it
= export_state
.find(dir
);
1639 assert(it
!= export_state
.end());
1640 dout(7) << "export_go " << *dir
<< " to " << it
->second
.peer
<< dendl
;
1642 // first sync log to flush out e.g. any cap imports
1643 mds
->mdlog
->wait_for_safe(new C_M_ExportGo(this, dir
, it
->second
.tid
));
1644 mds
->mdlog
->flush();
1647 void Migrator::export_go_synced(CDir
*dir
, uint64_t tid
)
1649 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
1650 if (it
== export_state
.end() ||
1651 it
->second
.state
== EXPORT_CANCELLING
||
1652 it
->second
.tid
!= tid
) {
1653 // export must have aborted.
1654 dout(7) << "export must have aborted on " << dir
<< dendl
;
1657 assert(it
->second
.state
== EXPORT_WARNING
);
1658 mds_rank_t dest
= it
->second
.peer
;
1660 dout(7) << "export_go_synced " << *dir
<< " to " << dest
<< dendl
;
1662 cache
->show_subtrees();
1664 it
->second
.state
= EXPORT_EXPORTING
;
1665 assert(g_conf
->mds_kill_export_at
!= 7);
1667 assert(dir
->is_frozen_tree_root());
1668 assert(dir
->get_cum_auth_pins() == 0);
1670 // set ambiguous auth
1671 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), dest
);
1673 // take away the popularity we're sending.
1674 utime_t now
= ceph_clock_now();
1675 mds
->balancer
->subtract_export(dir
, now
);
1677 // fill export message with cache data
1678 MExportDir
*req
= new MExportDir(dir
->dirfrag(), it
->second
.tid
);
1679 map
<client_t
,entity_inst_t
> exported_client_map
;
1680 uint64_t num_exported_inodes
= encode_export_dir(req
->export_data
,
1681 dir
, // recur start point
1682 exported_client_map
,
1684 ::encode(exported_client_map
, req
->client_map
,
1685 mds
->mdsmap
->get_up_features());
1687 // add bounds to message
1689 cache
->get_subtree_bounds(dir
, bounds
);
1690 for (set
<CDir
*>::iterator p
= bounds
.begin();
1693 req
->add_export((*p
)->dirfrag());
1696 mds
->send_message_mds(req
, dest
);
1697 assert(g_conf
->mds_kill_export_at
!= 8);
1699 mds
->hit_export_target(now
, dest
, num_exported_inodes
+1);
1702 if (mds
->logger
) mds
->logger
->inc(l_mds_exported
);
1703 if (mds
->logger
) mds
->logger
->inc(l_mds_exported_inodes
, num_exported_inodes
);
1705 cache
->show_subtrees();
1709 /** encode_export_inode
1710 * update our local state for this inode to export.
1711 * encode relevant state to be sent over the wire.
1712 * used by: encode_export_dir, file_rename (if foreign)
1714 * FIXME: the separation between CInode.encode_export and these methods
1715 * is pretty arbitrary and dumb.
1717 void Migrator::encode_export_inode(CInode
*in
, bufferlist
& enc_state
,
1718 map
<client_t
,entity_inst_t
>& exported_client_map
)
1720 dout(7) << "encode_export_inode " << *in
<< dendl
;
1721 assert(!in
->is_replica(mds
->get_nodeid()));
1724 if (!in
->is_replicated()) {
1725 in
->replicate_relax_locks();
1726 dout(20) << " did replicate_relax_locks, now " << *in
<< dendl
;
1729 ::encode(in
->inode
.ino
, enc_state
);
1730 ::encode(in
->last
, enc_state
);
1731 in
->encode_export(enc_state
);
1734 encode_export_inode_caps(in
, true, enc_state
, exported_client_map
);
1737 void Migrator::encode_export_inode_caps(CInode
*in
, bool auth_cap
, bufferlist
& bl
,
1738 map
<client_t
,entity_inst_t
>& exported_client_map
)
1740 dout(20) << "encode_export_inode_caps " << *in
<< dendl
;
1743 map
<client_t
,Capability::Export
> cap_map
;
1744 in
->export_client_caps(cap_map
);
1745 ::encode(cap_map
, bl
);
1747 ::encode(in
->get_mds_caps_wanted(), bl
);
1749 in
->state_set(CInode::STATE_EXPORTINGCAPS
);
1750 in
->get(CInode::PIN_EXPORTINGCAPS
);
1753 // make note of clients named by exported capabilities
1754 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1755 it
!= in
->client_caps
.end();
1757 exported_client_map
[it
->first
] = mds
->sessionmap
.get_inst(entity_name_t::CLIENT(it
->first
.v
));
1760 void Migrator::finish_export_inode_caps(CInode
*in
, mds_rank_t peer
,
1761 map
<client_t
,Capability::Import
>& peer_imported
)
1763 dout(20) << "finish_export_inode_caps " << *in
<< dendl
;
1765 in
->state_clear(CInode::STATE_EXPORTINGCAPS
);
1766 in
->put(CInode::PIN_EXPORTINGCAPS
);
1768 // tell (all) clients about migrating caps..
1769 for (map
<client_t
, Capability
*>::iterator it
= in
->client_caps
.begin();
1770 it
!= in
->client_caps
.end();
1772 Capability
*cap
= it
->second
;
1773 dout(7) << "finish_export_inode_caps telling client." << it
->first
1774 << " exported caps on " << *in
<< dendl
;
1775 MClientCaps
*m
= new MClientCaps(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
1776 cap
->get_cap_id(), cap
->get_mseq(), mds
->get_osd_epoch_barrier());
1778 map
<client_t
,Capability::Import
>::iterator q
= peer_imported
.find(it
->first
);
1779 assert(q
!= peer_imported
.end());
1780 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
1781 (q
->second
.cap_id
> 0 ? peer
: -1), 0);
1782 mds
->send_message_client_counted(m
, it
->first
);
1784 in
->clear_client_caps_after_export();
1785 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
1788 void Migrator::finish_export_inode(CInode
*in
, utime_t now
, mds_rank_t peer
,
1789 map
<client_t
,Capability::Import
>& peer_imported
,
1790 list
<MDSInternalContextBase
*>& finished
)
1792 dout(12) << "finish_export_inode " << *in
<< dendl
;
1798 // clear/unpin cached_by (we're no longer the authority)
1799 in
->clear_replica_map();
1801 // twiddle lock states for auth -> replica transition
1802 in
->authlock
.export_twiddle();
1803 in
->linklock
.export_twiddle();
1804 in
->dirfragtreelock
.export_twiddle();
1805 in
->filelock
.export_twiddle();
1806 in
->nestlock
.export_twiddle();
1807 in
->xattrlock
.export_twiddle();
1808 in
->snaplock
.export_twiddle();
1809 in
->flocklock
.export_twiddle();
1810 in
->policylock
.export_twiddle();
1813 assert(in
->is_auth());
1814 in
->state_clear(CInode::STATE_AUTH
);
1815 in
->replica_nonce
= CInode::EXPORT_NONCE
;
1817 in
->clear_dirty_rstat();
1819 // no more auth subtree? clear scatter dirty
1820 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid()))
1821 in
->clear_scatter_dirty();
1823 in
->item_open_file
.remove_myself();
1825 in
->clear_dirty_parent();
1827 in
->clear_file_locks();
1830 in
->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
1832 in
->finish_export(now
);
1834 finish_export_inode_caps(in
, peer
, peer_imported
);
1837 uint64_t Migrator::encode_export_dir(bufferlist
& exportbl
,
1839 map
<client_t
,entity_inst_t
>& exported_client_map
,
1842 uint64_t num_exported
= 0;
1844 dout(7) << "encode_export_dir " << *dir
<< " " << dir
->get_num_head_items() << " head items" << dendl
;
1846 assert(dir
->get_projected_version() == dir
->get_version());
1848 #ifdef MDS_VERIFY_FRAGSTAT
1849 if (dir
->is_complete())
1850 dir
->verify_fragstat();
1854 dirfrag_t df
= dir
->dirfrag();
1855 ::encode(df
, exportbl
);
1856 dir
->encode_export(exportbl
);
1858 __u32 nden
= dir
->items
.size();
1859 ::encode(nden
, exportbl
);
1862 list
<CDir
*> subdirs
;
1863 for (auto &p
: *dir
) {
1864 CDentry
*dn
= p
.second
;
1865 CInode
*in
= dn
->get_linkage()->get_inode();
1867 if (!dn
->is_replicated())
1868 dn
->lock
.replicate_relax();
1873 dout(7) << "encode_export_dir exporting " << *dn
<< dendl
;
1876 ::encode(dn
->get_name(), exportbl
);
1877 ::encode(dn
->last
, exportbl
);
1880 dn
->encode_export(exportbl
);
1885 if (dn
->get_linkage()->is_null()) {
1886 exportbl
.append("N", 1); // null dentry
1890 if (dn
->get_linkage()->is_remote()) {
1892 exportbl
.append("L", 1); // remote link
1894 inodeno_t ino
= dn
->get_linkage()->get_remote_ino();
1895 unsigned char d_type
= dn
->get_linkage()->get_remote_d_type();
1896 ::encode(ino
, exportbl
);
1897 ::encode(d_type
, exportbl
);
1903 exportbl
.append("I", 1); // inode dentry
1905 encode_export_inode(in
, exportbl
, exported_client_map
); // encode, and (update state for) export
1909 in
->get_dirfrags(dfs
);
1910 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
1912 if (!t
->state_test(CDir::STATE_EXPORTBOUND
)) {
1913 // include nested dirfrag
1914 assert(t
->get_dir_auth().first
== CDIR_AUTH_PARENT
);
1915 subdirs
.push_front(t
); // it's ours, recurse (later)
1921 for (auto &dir
: subdirs
)
1922 num_exported
+= encode_export_dir(exportbl
, dir
, exported_client_map
, now
);
1924 return num_exported
;
1927 void Migrator::finish_export_dir(CDir
*dir
, utime_t now
, mds_rank_t peer
,
1928 map
<inodeno_t
,map
<client_t
,Capability::Import
> >& peer_imported
,
1929 list
<MDSInternalContextBase
*>& finished
, int *num_dentries
)
1931 dout(10) << "finish_export_dir " << *dir
<< dendl
;
1934 dir
->clear_replica_map();
1937 assert(dir
->is_auth());
1938 dir
->state_clear(CDir::STATE_AUTH
);
1939 dir
->remove_bloom();
1940 dir
->replica_nonce
= CDir::EXPORT_NONCE
;
1942 if (dir
->is_dirty())
1945 // suck up all waiters
1946 dir
->take_waiting(CDir::WAIT_ANY_MASK
, finished
); // all dir waiters
1949 dir
->finish_export(now
);
1952 list
<CDir
*> subdirs
;
1953 for (auto &p
: *dir
) {
1954 CDentry
*dn
= p
.second
;
1955 CInode
*in
= dn
->get_linkage()->get_inode();
1958 dn
->finish_export();
1961 if (dn
->get_linkage()->is_primary()) {
1962 finish_export_inode(in
, now
, peer
, peer_imported
[in
->ino()], finished
);
1965 in
->get_nested_dirfrags(subdirs
);
1968 cache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
1973 for (list
<CDir
*>::iterator it
= subdirs
.begin(); it
!= subdirs
.end(); ++it
)
1974 finish_export_dir(*it
, now
, peer
, peer_imported
, finished
, num_dentries
);
1977 class C_MDS_ExportFinishLogged
: public MigratorLogContext
{
1980 C_MDS_ExportFinishLogged(Migrator
*m
, CDir
*d
) : MigratorLogContext(m
), dir(d
) {}
1981 void finish(int r
) override
{
1982 mig
->export_logged_finish(dir
);
1988 * i should get an export_ack from the export target.
1990 * This function DOES put the passed message before returning
1992 void Migrator::handle_export_ack(MExportDirAck
*m
)
1994 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
1995 mds_rank_t
dest(m
->get_source().num());
1996 utime_t now
= ceph_clock_now();
1998 assert(dir
->is_frozen_tree_root()); // i'm exporting!
2001 dout(7) << "handle_export_ack " << *dir
<< dendl
;
2003 mds
->hit_export_target(now
, dest
, -1);
2005 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
2006 assert(it
!= export_state
.end());
2007 assert(it
->second
.state
== EXPORT_EXPORTING
);
2008 assert(it
->second
.tid
== m
->get_tid());
2010 bufferlist::iterator bp
= m
->imported_caps
.begin();
2011 ::decode(it
->second
.peer_imported
, bp
);
2013 it
->second
.state
= EXPORT_LOGGINGFINISH
;
2014 assert (g_conf
->mds_kill_export_at
!= 9);
2016 cache
->get_subtree_bounds(dir
, bounds
);
2019 // include export bounds, to ensure they're in the journal.
2020 EExport
*le
= new EExport(mds
->mdlog
, dir
, it
->second
.peer
);;
2021 mds
->mdlog
->start_entry(le
);
2023 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
2024 le
->metablob
.add_dir(dir
, false);
2025 for (set
<CDir
*>::iterator p
= bounds
.begin();
2029 le
->get_bounds().insert(bound
->dirfrag());
2030 le
->metablob
.add_dir_context(bound
);
2031 le
->metablob
.add_dir(bound
, false);
2034 // list us second, them first.
2035 // this keeps authority().first in sync with subtree auth state in the journal.
2036 cache
->adjust_subtree_auth(dir
, it
->second
.peer
, mds
->get_nodeid());
2038 // log export completion, then finish (unfreeze, trigger finish context, etc.)
2039 mds
->mdlog
->submit_entry(le
, new C_MDS_ExportFinishLogged(this, dir
));
2040 mds
->mdlog
->flush();
2041 assert (g_conf
->mds_kill_export_at
!= 10);
2046 void Migrator::export_notify_abort(CDir
*dir
, export_state_t
& stat
, set
<CDir
*>& bounds
)
2048 dout(7) << "export_notify_abort " << *dir
<< dendl
;
2050 assert(stat
.state
== EXPORT_CANCELLING
);
2052 if (stat
.notify_ack_waiting
.empty()) {
2053 stat
.state
= EXPORT_CANCELLED
;
2057 dir
->auth_pin(this);
2059 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
2060 p
!= stat
.notify_ack_waiting
.end();
2062 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
2063 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
2064 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
2065 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2066 notify
->get_bounds().push_back((*i
)->dirfrag());
2067 mds
->send_message_mds(notify
, *p
);
2072 * this happens if hte dest failes after i send teh export data but before it is acked
2073 * that is, we don't know they safely received and logged it, so we reverse our changes
2076 void Migrator::export_reverse(CDir
*dir
, export_state_t
& stat
)
2078 dout(7) << "export_reverse " << *dir
<< dendl
;
2080 set
<CInode
*> to_eval
;
2083 cache
->get_subtree_bounds(dir
, bounds
);
2085 // remove exporting pins
2088 while (!rq
.empty()) {
2089 CDir
*t
= rq
.front();
2092 for (auto &p
: *t
) {
2093 CDentry
*dn
= p
.second
;
2095 if (!dn
->get_linkage()->is_primary())
2097 CInode
*in
= dn
->get_linkage()->get_inode();
2099 if (in
->state_test(CInode::STATE_EVALSTALECAPS
)) {
2100 in
->state_clear(CInode::STATE_EVALSTALECAPS
);
2104 in
->get_nested_dirfrags(rq
);
2109 for (auto bd
: bounds
) {
2110 bd
->put(CDir::PIN_EXPORTBOUND
);
2111 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
2114 // notify bystanders
2115 export_notify_abort(dir
, stat
, bounds
);
2117 // unfreeze tree, with possible subtree merge.
2118 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
2120 // process delayed expires
2121 cache
->process_delayed_expire(dir
);
2123 dir
->unfreeze_tree();
2124 cache
->try_subtree_merge(dir
);
2126 // revoke/resume stale caps
2127 for (auto in
: to_eval
) {
2128 bool need_issue
= false;
2129 for (auto& p
: in
->get_client_caps()) {
2130 Capability
*cap
= p
.second
;
2131 if (!cap
->is_stale()) {
2137 (!in
->is_auth() || !mds
->locker
->eval(in
, CEPH_CAP_LOCKS
)))
2138 mds
->locker
->issue_caps(in
);
2141 cache
->show_cache();
2146 * once i get the ack, and logged the EExportFinish(true),
2147 * send notifies (if any), otherwise go straight to finish.
2150 void Migrator::export_logged_finish(CDir
*dir
)
2152 dout(7) << "export_logged_finish " << *dir
<< dendl
;
2154 export_state_t
& stat
= export_state
[dir
];
2158 cache
->get_subtree_bounds(dir
, bounds
);
2160 for (set
<mds_rank_t
>::iterator p
= stat
.notify_ack_waiting
.begin();
2161 p
!= stat
.notify_ack_waiting
.end();
2163 MExportDirNotify
*notify
= new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
2164 pair
<int,int>(mds
->get_nodeid(), stat
.peer
),
2165 pair
<int,int>(stat
.peer
, CDIR_AUTH_UNKNOWN
));
2167 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
2168 notify
->get_bounds().push_back((*i
)->dirfrag());
2170 mds
->send_message_mds(notify
, *p
);
2173 // wait for notifyacks
2174 stat
.state
= EXPORT_NOTIFYING
;
2175 assert (g_conf
->mds_kill_export_at
!= 11);
2177 // no notifies to wait for?
2178 if (stat
.notify_ack_waiting
.empty()) {
2179 export_finish(dir
); // skip notify/notify_ack stage.
2181 // notify peer to send cap import messages to clients
2182 if (!mds
->is_cluster_degraded() ||
2183 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(stat
.peer
)) {
2184 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), false, stat
.tid
), stat
.peer
);
2186 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl
;
2193 * i'll get an ack from each bystander.
2194 * when i get them all, do the export.
2196 * i'll get an ack from each bystander.
2197 * when i get them all, unfreeze and send the finish.
2199 * This function DOES put the passed message before returning
2201 void Migrator::handle_export_notify_ack(MExportDirNotifyAck
*m
)
2203 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
2204 mds_rank_t
dest(m
->get_source().num());
2205 utime_t now
= ceph_clock_now();
2207 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2209 mds
->hit_export_target(now
, dest
, -1);
2211 auto export_state_entry
= export_state
.find(dir
);
2212 if (export_state_entry
!= export_state
.end()) {
2213 export_state_t
& stat
= export_state_entry
->second
;
2214 if (stat
.state
== EXPORT_WARNING
&&
2215 stat
.warning_ack_waiting
.erase(from
)) {
2216 // exporting. process warning.
2217 dout(7) << "handle_export_notify_ack from " << m
->get_source()
2218 << ": exporting, processing warning on " << *dir
<< dendl
;
2219 if (stat
.warning_ack_waiting
.empty())
2220 export_go(dir
); // start export.
2221 } else if (stat
.state
== EXPORT_NOTIFYING
&&
2222 stat
.notify_ack_waiting
.erase(from
)) {
2223 // exporting. process notify.
2224 dout(7) << "handle_export_notify_ack from " << m
->get_source()
2225 << ": exporting, processing notify on " << *dir
<< dendl
;
2226 if (stat
.notify_ack_waiting
.empty())
2228 } else if (stat
.state
== EXPORT_CANCELLING
&&
2229 m
->get_new_auth().second
== CDIR_AUTH_UNKNOWN
&& // not warning ack
2230 stat
.notify_ack_waiting
.erase(from
)) {
2231 dout(7) << "handle_export_notify_ack from " << m
->get_source()
2232 << ": cancelling export, processing notify on " << *dir
<< dendl
;
2233 if (stat
.notify_ack_waiting
.empty()) {
2234 export_cancel_finish(export_state_entry
);
2239 auto import_state_entry
= import_state
.find(dir
->dirfrag());
2240 if (import_state_entry
!= import_state
.end()) {
2241 import_state_t
& stat
= import_state_entry
->second
;
2242 if (stat
.state
== IMPORT_ABORTING
) {
2244 dout(7) << "handle_export_notify_ack from " << m
->get_source()
2245 << ": aborting import on " << *dir
<< dendl
;
2246 assert(stat
.bystanders
.count(from
));
2247 stat
.bystanders
.erase(from
);
2248 if (stat
.bystanders
.empty())
2249 import_reverse_unfreeze(dir
);
// Finalize a completed subtree export on the exporting MDS: tell the new
// auth (if it is still usable) via MExportDirFinish, strip local
// auth/bound/pin state, unfreeze the tree and release the export_state
// bookkeeping for this dirfrag.
// NOTE(review): this listing is missing brace/return-only lines dropped by
// extraction; the control-flow boundaries below follow the surviving
// comments and conditions.
2257 void Migrator::export_finish(CDir
*dir
)
2259 dout(5) << "export_finish " << *dir
<< dendl
;
2261 assert (g_conf
->mds_kill_export_at
!= 12);
// look up the in-flight export record for this dirfrag
2262 map
<CDir
*,export_state_t
>::iterator it
= export_state
.find(dir
);
2263 if (it
== export_state
.end()) {
2264 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl
;
2268 // send finish/commit to new auth (only if the peer is still reachable
// per the current mdsmap)
2269 if (!mds
->is_cluster_degraded() ||
2270 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(it
->second
.peer
)) {
2271 mds
->send_message_mds(new MExportDirFinish(dir
->dirfrag(), true, it
->second
.tid
), it
->second
.peer
);
2273 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl
;
2275 assert(g_conf
->mds_kill_export_at
!= 13);
2277 // finish export (adjust local cache state)
2278 int num_dentries
= 0;
2279 list
<MDSInternalContextBase
*> finished
;
2280 finish_export_dir(dir
, ceph_clock_now(), it
->second
.peer
,
2281 it
->second
.peer_imported
, finished
, &num_dentries
);
// we are no longer auth for this subtree; record the peer as auth
2283 assert(!dir
->is_auth());
2284 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
// drop the EXPORTBOUND pins/state taken on each bound dirfrag
2288 cache
->get_subtree_bounds(dir
, bounds
);
2289 for (set
<CDir
*>::iterator p
= bounds
.begin();
2293 bd
->put(CDir::PIN_EXPORTBOUND
);
2294 bd
->state_clear(CDir::STATE_EXPORTBOUND
);
2297 if (dir
->state_test(CDir::STATE_AUXSUBTREE
))
2298 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
2300 // discard delayed expires
2301 cache
->discard_delayed_expire(dir
);
2303 dout(7) << "export_finish unfreezing" << dendl
;
2305 // unfreeze tree, with possible subtree merge.
2306 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
2307 dir
->unfreeze_tree();
2308 cache
->try_subtree_merge(dir
);
2310 // no more auth subtree? clear scatter dirty
2311 if (!dir
->get_inode()->is_auth() &&
2312 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2313 dir
->get_inode()->clear_scatter_dirty();
2314 // wake up scatter_nudge waiters
2315 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, finished
);
2318 if (!finished
.empty())
2319 mds
->queue_waiters(finished
);
// keep the mutation and parent-export handle alive past the erase() below
2321 MutationRef mut
= std::move(it
->second
.mut
);
2322 auto parent
= std::move(it
->second
.parent
);
2323 // remove from exporting list, clean up state
2324 total_exporting_size
-= it
->second
.approx_size
;
2325 export_state
.erase(it
);
2327 assert(dir
->state_test(CDir::STATE_EXPORTING
));
2328 dir
->clear_exporting();
2330 cache
->show_subtrees();
2333 cache
->trim(num_dentries
); // try trimming exported dentries
2335 // send pending import_maps?
2336 mds
->mdcache
->maybe_send_pending_resolves();
2338 // drop locks, unpin path
2340 mds
->locker
->drop_locks(mut
.get());
// let any queued parent/child export proceed, then kick the export queue
2345 child_export_finish(parent
, true);
2347 maybe_do_queued_export();
2357 // ==========================================================
// Importer-side handler for MExportDirDiscover: record a new import in
// IMPORT_DISCOVERING state, discover and pin the base inode of the subtree
// being exported to us, then reply with MExportDirDiscoverAck.
// NACKs immediately if this rank is not active.
2360 void Migrator::handle_export_discover(MExportDirDiscover
*m
)
2362 mds_rank_t from
= m
->get_source_mds();
2363 assert(from
!= mds
->get_nodeid());
2365 dout(7) << "handle_export_discover on " << m
->get_path() << dendl
;
2367 // note import state
2368 dirfrag_t df
= m
->get_dirfrag();
2370 if (!mds
->is_active()) {
2371 dout(7) << " not active, send NACK " << dendl
;
2372 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid(), false), from
);
2377 // only start discovering on this message once.
2378 import_state_t
*p_state
;
2379 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
// NOTE(review): in the original these next two sections are opposite
// branches of an if/else on m->started() -- first delivery creates the
// import_state entry, a retry validates the existing one. The
// branch-delimiting lines were dropped by extraction.
2381 assert(it
== import_state
.end());
2383 p_state
= &import_state
[df
];
2384 p_state
->state
= IMPORT_DISCOVERING
;
2385 p_state
->peer
= from
;
2386 p_state
->tid
= m
->get_tid();
2388 // am i retrying after ancient path_traverse results?
2389 if (it
== import_state
.end() ||
2390 it
->second
.peer
!= from
||
2391 it
->second
.tid
!= m
->get_tid()) {
2392 dout(7) << " dropping obsolete message" << dendl
;
2396 assert(it
->second
.state
== IMPORT_DISCOVERING
);
2397 p_state
= &it
->second
;
// defer until the root is open; the message is retried afterwards
2400 if (!mds
->mdcache
->is_open()) {
2401 dout(5) << " waiting for root" << dendl
;
2402 mds
->mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, m
));
2406 assert (g_conf
->mds_kill_import_at
!= 1);
2409 CInode
*in
= cache
->get_inode(m
->get_dirfrag().ino
);
2411 // must discover it!
2412 filepath
fpath(m
->get_path());
2413 vector
<CDentry
*> trace
;
2414 MDRequestRef null_ref
;
2415 int r
= cache
->path_traverse(null_ref
, m
, NULL
, fpath
, &trace
, NULL
, MDS_TRAVERSE_DISCOVER
);
2418 dout(7) << "handle_export_discover failed to discover or not dir " << m
->get_path() << ", NAK" << dendl
;
2419 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2422 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2426 dout(7) << "handle_export_discover have " << df
<< " inode " << *in
<< dendl
;
2428 p_state
->state
= IMPORT_DISCOVERED
;
2430 // pin inode in the cache (for now)
2431 assert(in
->is_dir());
2432 in
->get(CInode::PIN_IMPORTING
);
// ack (success) back to the exporter
2435 dout(7) << " sending export_discover_ack on " << *in
<< dendl
;
2436 mds
->send_message_mds(new MExportDirDiscoverAck(df
, m
->get_tid()), p_state
->peer
);
2438 assert (g_conf
->mds_kill_import_at
!= 2);
// Abort an import still in the DISCOVERING state: no pins or replicated
// cache state exist yet, so dropping the import_state entry suffices.
2441 void Migrator::import_reverse_discovering(dirfrag_t df
)
2443 import_state
.erase(df
);
// Abort an import in the DISCOVERED state: drop the PIN_IMPORTING
// reference taken on the base inode during discovery, then erase the
// import_state entry.
2446 void Migrator::import_reverse_discovered(dirfrag_t df
, CInode
*diri
)
2449 diri
->put(CInode::PIN_IMPORTING
);
2450 import_state
.erase(df
);
// Abort an import in the PREPPING state: resolve the recorded bound
// dirfrag list into CDirs, drop the pins taken so far, and run the common
// final-cleanup path.
2453 void Migrator::import_reverse_prepping(CDir
*dir
, import_state_t
& stat
)
2456 cache
->map_dirfrag_set(stat
.bound_ls
, bounds
);
2457 import_remove_pins(dir
, bounds
);
2458 import_reverse_final(dir
);
// Importer-side handler for MExportDirCancel: the exporter gave up, so
// unwind however far this import has progressed, dispatching on the
// recorded import state.
2461 /* This function DOES put the passed message before returning*/
2462 void Migrator::handle_export_cancel(MExportDirCancel
*m
)
2464 dout(7) << "handle_export_cancel on " << m
->get_dirfrag() << dendl
;
2465 dirfrag_t df
= m
->get_dirfrag();
2466 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(df
);
2467 if (it
== import_state
.end()) {
2468 assert(0 == "got export_cancel in weird state");
2469 } else if (it
->second
.state
== IMPORT_DISCOVERING
) {
2470 import_reverse_discovering(df
);
2471 } else if (it
->second
.state
== IMPORT_DISCOVERED
) {
// DISCOVERED: the base inode was pinned; unpin and drop state
2472 CInode
*in
= cache
->get_inode(df
.ino
);
2474 import_reverse_discovered(df
, in
);
2475 } else if (it
->second
.state
== IMPORT_PREPPING
) {
2476 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2478 import_reverse_prepping(dir
, it
->second
);
2479 } else if (it
->second
.state
== IMPORT_PREPPED
) {
// PREPPED: bounds are pinned and the region is frozen under ambiguous
// auth; undo all of that
2480 CDir
*dir
= mds
->mdcache
->get_dirfrag(df
);
2483 cache
->get_subtree_bounds(dir
, bounds
);
2484 import_remove_pins(dir
, bounds
);
2485 // adjust auth back to the exporter
2486 cache
->adjust_subtree_auth(dir
, it
->second
.peer
);
2487 import_reverse_unfreeze(dir
);
2489 assert(0 == "got export_cancel in weird state");
// Importer-side handler for MExportDirPrep. On the first delivery
// ("assim" pass) it replicates the base dirfrag, pins it, assimilates the
// dentry/inode traces down to each export bound, and makes the bound
// inodes sticky. On every pass it then tries to open and pin all bound
// dirfrags (retrying the message as remote frags are fetched); once all
// bounds are present it records ambiguous auth [oldauth,me], freezes the
// region and moves to IMPORT_PREPPED, finally acking success/failure.
2494 /* This function DOES put the passed message before returning*/
2495 void Migrator::handle_export_prep(MExportDirPrep
*m
)
2497 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2498 assert(oldauth
!= mds
->get_nodeid());
2502 list
<MDSInternalContextBase
*> finished
;
2504 // assimilate root dir.
2505 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
2506 if (!m
->did_assim()) {
// first pass: state must still be DISCOVERED from handle_export_discover
2507 assert(it
!= import_state
.end());
2508 assert(it
->second
.state
== IMPORT_DISCOVERED
);
2509 assert(it
->second
.peer
== oldauth
);
2510 diri
= cache
->get_inode(m
->get_dirfrag().ino
);
// replicate the base dirfrag from the message payload
2512 bufferlist::iterator p
= m
->basedir
.begin();
2513 dir
= cache
->add_replica_dir(p
, diri
, oldauth
, finished
);
2514 dout(7) << "handle_export_prep on " << *dir
<< " (first pass)" << dendl
;
// subsequent (retried) pass: validate against recorded peer/tid
2516 if (it
== import_state
.end() ||
2517 it
->second
.peer
!= oldauth
||
2518 it
->second
.tid
!= m
->get_tid()) {
2519 dout(7) << "handle_export_prep obsolete message, dropping" << dendl
;
2523 assert(it
->second
.state
== IMPORT_PREPPING
);
2524 assert(it
->second
.peer
== oldauth
);
2526 dir
= cache
->get_dirfrag(m
->get_dirfrag());
2528 dout(7) << "handle_export_prep on " << *dir
<< " (subsequent pass)" << dendl
;
2529 diri
= dir
->get_inode();
2531 assert(dir
->is_auth() == false);
2533 cache
->show_subtrees();
2535 // build import bound map
2536 map
<inodeno_t
, fragset_t
> import_bound_fragset
;
2537 for (list
<dirfrag_t
>::iterator p
= m
->get_bounds().begin();
2538 p
!= m
->get_bounds().end();
2540 dout(10) << " bound " << *p
<< dendl
;
2541 import_bound_fragset
[p
->ino
].insert(p
->frag
);
2544 // assimilate contents?
2545 if (!m
->did_assim()) {
2546 dout(7) << "doing assim on " << *dir
<< dendl
;
2547 m
->mark_assim(); // only do this the first time!
2549 // change import state
2550 it
->second
.state
= IMPORT_PREPPING
;
2551 it
->second
.bound_ls
= m
->get_bounds();
2552 it
->second
.bystanders
= m
->get_bystanders();
2553 assert(g_conf
->mds_kill_import_at
!= 3);
2556 dout(7) << "bystanders are " << it
->second
.bystanders
<< dendl
;
// move the importing pin from the base inode to the base dirfrag
2559 diri
->put(CInode::PIN_IMPORTING
);
2560 dir
->get(CDir::PIN_IMPORTING
);
2561 dir
->state_set(CDir::STATE_IMPORTING
);
2563 // assimilate traces to exports
2564 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2565 for (list
<bufferlist
>::iterator p
= m
->traces
.begin();
2566 p
!= m
->traces
.end();
2568 bufferlist::iterator q
= p
->begin();
2573 dout(10) << " trace from " << df
<< " start " << start
<< " len " << p
->length() << dendl
;
// 'd': dirfrag already in cache
2577 cur
= cache
->get_dirfrag(df
);
2579 dout(10) << " had " << *cur
<< dendl
;
2580 } else if (start
== 'f') {
// 'f': inode known, replicate its dirfrag from the trace
2581 CInode
*in
= cache
->get_inode(df
.ino
);
2583 dout(10) << " had " << *in
<< dendl
;
2584 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2585 dout(10) << " added " << *cur
<< dendl
;
2586 } else if (start
== '-') {
2589 assert(0 == "unrecognized start char");
// walk the remainder of the trace: (dentry inode [dir])*
2592 CDentry
*dn
= cache
->add_replica_dentry(q
, cur
, finished
);
2593 dout(10) << " added " << *dn
<< dendl
;
2594 CInode
*in
= cache
->add_replica_inode(q
, dn
, finished
);
2595 dout(10) << " added " << *in
<< dendl
;
2598 cur
= cache
->add_replica_dir(q
, in
, oldauth
, finished
);
2599 dout(10) << " added " << *cur
<< dendl
;
2603 // make bound sticky
2604 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2605 p
!= import_bound_fragset
.end();
2607 CInode
*in
= cache
->get_inode(p
->first
);
2609 in
->get_stickydirs();
2610 dout(7) << " set stickydirs on bound inode " << *in
<< dendl
;
2614 dout(7) << " not doing assim on " << *dir
<< dendl
;
2617 if (!finished
.empty())
2618 mds
->queue_waiters(finished
);
2621 bool success
= true;
2622 if (mds
->is_active()) {
// open all bounds; retry this message when a remote dirfrag arrives
2624 set
<CDir
*> import_bounds
;
2625 for (map
<inodeno_t
,fragset_t
>::iterator p
= import_bound_fragset
.begin();
2626 p
!= import_bound_fragset
.end();
2628 CInode
*in
= cache
->get_inode(p
->first
);
2631 // map fragset into a frag_t list, based on the inode fragtree
2632 list
<frag_t
> fglist
;
2633 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2634 in
->dirfragtree
.get_leaves_under(*q
, fglist
);
2635 dout(10) << " bound inode " << p
->first
<< " fragset " << p
->second
<< " maps to " << fglist
<< dendl
;
2637 for (list
<frag_t
>::iterator q
= fglist
.begin();
2640 CDir
*bound
= cache
->get_dirfrag(dirfrag_t(p
->first
, *q
));
2642 dout(7) << " opening bounding dirfrag " << *q
<< " on " << *in
<< dendl
;
2643 cache
->open_remote_dirfrag(in
, *q
,
2644 new C_MDS_RetryMessage(mds
, m
));
2648 if (!bound
->state_test(CDir::STATE_IMPORTBOUND
)) {
2649 dout(7) << " pinning import bound " << *bound
<< dendl
;
2650 bound
->get(CDir::PIN_IMPORTBOUND
);
2651 bound
->state_set(CDir::STATE_IMPORTBOUND
);
2653 dout(7) << " already pinned import bound " << *bound
<< dendl
;
2655 import_bounds
.insert(bound
);
2659 dout(7) << " all ready, noting auth and freezing import region" << dendl
;
2661 if (!mds
->mdcache
->is_readonly() &&
2662 diri
->filelock
.can_wrlock(-1) &&
2663 diri
->nestlock
.can_wrlock(-1)) {
2664 it
->second
.mut
= new MutationImpl();
2665 // force some locks. hacky.
2666 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, it
->second
.mut
);
2667 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, it
->second
.mut
);
2669 // note that i am an ambiguous auth for this subtree.
2670 // specify bounds, since the exporter explicitly defines the region.
2671 cache
->adjust_bounded_subtree_auth(dir
, import_bounds
,
2672 pair
<int,int>(oldauth
, mds
->get_nodeid()));
2673 cache
->verify_subtree_bounds(dir
, import_bounds
);
2675 dir
->_freeze_tree();
2677 it
->second
.state
= IMPORT_PREPPED
;
2679 dout(7) << " couldn't acquire all needed locks, failing. " << *dir
<< dendl
;
2683 dout(7) << " not active, failing. " << *dir
<< dendl
;
// on failure, unwind whatever prepping did before acking
2688 import_reverse_prepping(dir
, it
->second
);
2691 dout(7) << " sending export_prep_ack on " << *dir
<< dendl
;
2692 mds
->send_message(new MExportDirPrepAck(dir
->dirfrag(), success
, m
->get_tid()), m
->get_connection());
2694 assert(g_conf
->mds_kill_import_at
!= 4);
// Journal-commit context for EImportStart: once the journal entry is
// safely logged, drives Migrator::import_logged_start() with the session
// map that handle_export_dir() populated via prepare_force_open_sessions.
2702 class C_MDS_ImportDirLoggedStart
: public MigratorLogContext
{
// sessions force-opened for the imported clients (filled by the caller)
2707 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
2709 C_MDS_ImportDirLoggedStart(Migrator
*m
, CDir
*d
, mds_rank_t f
) :
2710 MigratorLogContext(m
), df(d
->dirfrag()), dir(d
), from(f
) {
2712 void finish(int r
) override
{
2713 mig
->import_logged_start(df
, dir
, from
, imported_session_map
);
// Importer-side handler for MExportDir: receives the actual exported
// metadata. Decodes the client map and the dirfrag payload into the
// cache, builds the EImportStart journal event (base dir, bounds, client
// map), flips subtree auth to [me,oldauth], and submits/flushes the log;
// C_MDS_ImportDirLoggedStart continues the import once it commits.
2717 /* This function DOES put the passed message before returning*/
2718 void Migrator::handle_export_dir(MExportDir
*m
)
2720 assert (g_conf
->mds_kill_import_at
!= 5);
2721 CDir
*dir
= cache
->get_dirfrag(m
->dirfrag
);
2724 mds_rank_t oldauth
= mds_rank_t(m
->get_source().num());
2725 dout(7) << "handle_export_dir importing " << *dir
<< " from " << oldauth
<< dendl
;
2727 assert(!dir
->is_auth());
// must be exactly the prepped import we agreed to with this peer/tid
2729 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->dirfrag
);
2730 assert(it
!= import_state
.end());
2731 assert(it
->second
.state
== IMPORT_PREPPED
);
2732 assert(it
->second
.tid
== m
->get_tid());
2733 assert(it
->second
.peer
== oldauth
);
2735 utime_t now
= ceph_clock_now();
// the imported frag must be a leaf of the inode's fragtree locally
2737 if (!dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()))
2738 dir
->get_inode()->dirfragtree
.force_to_leaf(g_ceph_context
, dir
->get_frag());
2740 cache
->show_subtrees();
2742 C_MDS_ImportDirLoggedStart
*onlogged
= new C_MDS_ImportDirLoggedStart(this, dir
, oldauth
);
2744 // start the journal entry
2745 EImportStart
*le
= new EImportStart(mds
->mdlog
, dir
->dirfrag(), m
->bounds
, oldauth
);
2746 mds
->mdlog
->start_entry(le
);
2748 le
->metablob
.add_dir_context(dir
);
2750 // adjust auth (list us _first_)
2751 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), oldauth
);
2753 // new client sessions, open these after we journal
2754 // include imported sessions in EImportStart
2755 bufferlist::iterator cmp
= m
->client_map
.begin();
2756 map
<client_t
,entity_inst_t
> client_map
;
2757 decode(client_map
, cmp
);
2759 le
->cmapv
= mds
->server
->prepare_force_open_sessions(client_map
, onlogged
->imported_session_map
);
2760 encode(client_map
, le
->client_map
, mds
->mdsmap
->get_up_features());
// decode the exported dirfrags (possibly several) from the payload
2762 bufferlist::iterator blp
= m
->export_data
.begin();
2763 int num_imported_inodes
= 0;
2764 while (!blp
.end()) {
2765 num_imported_inodes
+=
2766 decode_import_dir(blp
,
2770 mds
->mdlog
->get_current_segment(),
2771 it
->second
.peer_exports
,
2772 it
->second
.updated_scatterlocks
,
2775 dout(10) << " " << m
->bounds
.size() << " imported bounds" << dendl
;
2777 // include bounds in EImportStart
2778 set
<CDir
*> import_bounds
;
2779 for (vector
<dirfrag_t
>::iterator p
= m
->bounds
.begin();
2780 p
!= m
->bounds
.end();
2782 CDir
*bd
= cache
->get_dirfrag(*p
);
2784 le
->metablob
.add_dir(bd
, false); // note that parent metadata is already in the event
2785 import_bounds
.insert(bd
);
2787 cache
->verify_subtree_bounds(dir
, import_bounds
);
2789 // adjust popularity
2790 mds
->balancer
->add_import(dir
, now
);
2792 dout(7) << "handle_export_dir did " << *dir
<< dendl
;
2795 it
->second
.state
= IMPORT_LOGGINGSTART
;
2796 assert (g_conf
->mds_kill_import_at
!= 6);
// commit the journal entry; onlogged fires when it is durable
2799 mds
->mdlog
->submit_entry(le
, onlogged
);
2800 mds
->mdlog
->flush();
// perf counters for the balancer/metrics
2804 mds
->logger
->inc(l_mds_imported
);
2805 mds
->logger
->inc(l_mds_imported_inodes
, num_imported_inodes
);
2813 * this is an import helper: drops the pins/sticky refs taken while the
* import was being set up (base dirfrag, sticky bound inodes, and -- once
* the import reached PREPPING/PREPPED -- the IMPORTBOUND pins).
2814 * called by import_finish, and import_reverse and friends.
2816 void Migrator::import_remove_pins(CDir
*dir
, set
<CDir
*>& bounds
)
2818 import_state_t
& stat
= import_state
[dir
->dirfrag()];
// release the base dirfrag's importing pin/state
2820 dir
->put(CDir::PIN_IMPORTING
);
2821 dir
->state_clear(CDir::STATE_IMPORTING
);
// drop stickydirs on each distinct bound inode ('did' de-dups by ino)
2825 for (list
<dirfrag_t
>::iterator p
= stat
.bound_ls
.begin();
2826 p
!= stat
.bound_ls
.end();
2828 if (did
.count(p
->ino
))
2831 CInode
*in
= cache
->get_inode(p
->ino
);
2833 in
->put_stickydirs();
2836 if (stat
.state
== IMPORT_PREPPING
) {
// during PREPPING only some bounds may have been pinned yet
2837 for (auto bd
: bounds
) {
2838 if (bd
->state_test(CDir::STATE_IMPORTBOUND
)) {
2839 bd
->put(CDir::PIN_IMPORTBOUND
);
2840 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
2843 } else if (stat
.state
>= IMPORT_PREPPED
) {
2844 // bounding dirfrags
2845 for (auto bd
: bounds
) {
2846 assert(bd
->state_test(CDir::STATE_IMPORTBOUND
));
2847 bd
->put(CDir::PIN_IMPORTBOUND
);
2848 bd
->state_clear(CDir::STATE_IMPORTBOUND
);
// Helper context that, when fired, re-queues its collected contexts at the
// front of the MDS waiter queue so they run immediately after this one.
2853 class C_MDC_QueueContexts
: public MigratorContext
{
2855 list
<MDSInternalContextBase
*> contexts
;
2856 C_MDC_QueueContexts(Migrator
*m
) : MigratorContext(m
) {}
2857 void finish(int r
) override
{
2858 // execute contexts immediately after 'this' context
2859 get_mds()->queue_waiters_front(contexts
);
2864 * note: this does the full work of reversing an import and cleaning up.
2866 * called by both handle_mds_failure and by handle_resolve (if we are
2867 * a survivor coping with an exporter failure+recovery).
2869 void Migrator::import_reverse(CDir
*dir
)
2871 dout(7) << "import_reverse " << *dir
<< dendl
;
2873 import_state_t
& stat
= import_state
[dir
->dirfrag()];
2874 stat
.state
= IMPORT_ABORTING
;
2877 cache
->get_subtree_bounds(dir
, bounds
);
// drop all pins taken while setting up the import
2880 import_remove_pins(dir
, bounds
);
2882 // update auth, with possible subtree merge.
2883 assert(dir
->is_subtree_root());
2884 if (mds
->is_resolve())
2885 cache
->trim_non_auth_subtree(dir
);
2887 cache
->adjust_subtree_auth(dir
, stat
.peer
);
2889 auto fin
= new C_MDC_QueueContexts(this);
2890 if (!dir
->get_inode()->is_auth() &&
2891 !dir
->get_inode()->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2892 dir
->get_inode()->clear_scatter_dirty();
2893 // wake up scatter_nudge waiters
2894 dir
->get_inode()->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2897 int num_dentries
= 0;
2898 // adjust auth bits.
// breadth-first walk over the imported region, stripping AUTH state and
// replica bookkeeping from every dirfrag, dentry and inode we took on
2901 while (!q
.empty()) {
2902 CDir
*cur
= q
.front();
2906 assert(cur
->is_auth());
2907 cur
->state_clear(CDir::STATE_AUTH
);
2908 cur
->remove_bloom();
2909 cur
->clear_replica_map();
2910 cur
->set_replica_nonce(CDir::EXPORT_NONCE
);
2911 if (cur
->is_dirty())
2914 for (auto &p
: *cur
) {
2915 CDentry
*dn
= p
.second
;
// dentries: un-auth, reset replica info
2918 dn
->state_clear(CDentry::STATE_AUTH
);
2919 dn
->clear_replica_map();
2920 dn
->set_replica_nonce(CDentry::EXPORT_NONCE
);
2925 if (dn
->get_linkage()->is_primary()) {
2926 CInode
*in
= dn
->get_linkage()->get_inode();
2927 in
->state_clear(CDentry::STATE_AUTH
);
2928 in
->clear_replica_map();
2929 in
->set_replica_nonce(CInode::EXPORT_NONCE
);
2932 in
->clear_dirty_rstat();
2933 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
2934 in
->clear_scatter_dirty();
2935 in
->take_waiting(CInode::WAIT_ANY_MASK
, fin
->contexts
);
2938 in
->clear_dirty_parent();
// reset lock gather state on the inode's locks
2940 in
->authlock
.clear_gather();
2941 in
->linklock
.clear_gather();
2942 in
->dirfragtreelock
.clear_gather();
2943 in
->filelock
.clear_gather();
2945 in
->clear_file_locks();
2947 // non-bounding dir?
2949 in
->get_dirfrags(dfs
);
2950 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
)
2951 if (bounds
.count(*p
) == 0)
2955 cache
->touch_dentry_bottom(dn
); // move dentry to tail of LRU
// run deferred contexts once the tree is unfrozen
2960 dir
->add_waiter(CDir::WAIT_UNFREEZE
, fin
);
2962 if (stat
.state
== IMPORT_ACKING
) {
2963 // remove imported caps
2964 for (map
<CInode
*,map
<client_t
,Capability::Export
> >::iterator p
= stat
.peer_exports
.begin();
2965 p
!= stat
.peer_exports
.end();
2967 CInode
*in
= p
->first
;
2968 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
2969 q
!= p
->second
.end();
2971 Capability
*cap
= in
->get_client_cap(q
->first
);
2973 assert(!stat
.session_map
.count(q
->first
));
2976 if (cap
->is_importing())
2977 in
->remove_client_cap(q
->first
);
2979 in
->put(CInode::PIN_IMPORTINGCAPS
);
2981 for (auto& p
: stat
.session_map
) {
2982 Session
*session
= p
.second
.first
;
2983 session
->dec_importing();
2988 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false)); // log failure
2990 cache
->trim(num_dentries
); // try trimming dentries
2992 // notify bystanders; wait in aborting state
2993 import_notify_abort(dir
, bounds
);
// Tell every bystander rank that the import completed: auth transitions
// from (peer, me) to (me, unknown) for this subtree, with the bounds
// enumerated in each notify.
2996 void Migrator::import_notify_finish(CDir
*dir
, set
<CDir
*>& bounds
)
2998 dout(7) << "import_notify_finish " << *dir
<< dendl
;
3000 import_state_t
& stat
= import_state
[dir
->dirfrag()];
3001 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
3002 p
!= stat
.bystanders
.end();
3004 MExportDirNotify
*notify
=
3005 new MExportDirNotify(dir
->dirfrag(), stat
.tid
, false,
3006 pair
<int,int>(stat
.peer
, mds
->get_nodeid()),
3007 pair
<int,int>(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
));
3008 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
3009 notify
->get_bounds().push_back((*i
)->dirfrag());
3010 mds
->send_message_mds(notify
, *p
);
// Tell every (still-reachable) bystander rank that the import is being
// aborted: auth reverts from (peer, me) to (peer, unknown). Bystanders
// that have failed in the meantime are dropped from the set. If nothing
// is left to wait for, finish the reversal immediately.
3014 void Migrator::import_notify_abort(CDir
*dir
, set
<CDir
*>& bounds
)
3016 dout(7) << "import_notify_abort " << *dir
<< dendl
;
3018 import_state_t
& stat
= import_state
[dir
->dirfrag()];
3019 for (set
<mds_rank_t
>::iterator p
= stat
.bystanders
.begin();
3020 p
!= stat
.bystanders
.end(); ) {
3021 if (mds
->is_cluster_degraded() &&
3022 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)) {
3023 // this can happen if both exporter and bystander fail in the same mdsmap epoch
3024 stat
.bystanders
.erase(p
++);
3027 MExportDirNotify
*notify
=
3028 new MExportDirNotify(dir
->dirfrag(), stat
.tid
, true,
3029 mds_authority_t(stat
.peer
, mds
->get_nodeid()),
3030 mds_authority_t(stat
.peer
, CDIR_AUTH_UNKNOWN
));
3031 for (set
<CDir
*>::iterator i
= bounds
.begin(); i
!= bounds
.end(); ++i
)
3032 notify
->get_bounds().push_back((*i
)->dirfrag());
3033 mds
->send_message_mds(notify
, *p
);
3036 if (stat
.bystanders
.empty()) {
3037 dout(7) << "no bystanders, finishing reverse now" << dendl
;
3038 import_reverse_unfreeze(dir
);
3040 assert (g_conf
->mds_kill_import_at
!= 10);
// Last unwinding step for an aborted import: drop delayed expires,
// unfreeze the (now non-auth) tree, try a subtree merge, and run the
// common final cleanup.
3044 void Migrator::import_reverse_unfreeze(CDir
*dir
)
3046 dout(7) << "import_reverse_unfreeze " << *dir
<< dendl
;
3047 assert(!dir
->is_auth());
3048 cache
->discard_delayed_expire(dir
);
3049 dir
->unfreeze_tree();
3050 if (dir
->is_subtree_root())
3051 cache
->try_subtree_merge(dir
);
3052 import_reverse_final(dir
);
// Shared tail of every import-abort path: erase the import_state entry,
// kick pending resolves, and drop any locks the import's mutation held.
3055 void Migrator::import_reverse_final(CDir
*dir
)
3057 dout(7) << "import_reverse_final " << *dir
<< dendl
;
3060 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
3061 assert(it
!= import_state
.end());
// keep the mutation alive across the erase so we can drop its locks
3063 MutationRef mut
= it
->second
.mut
;
3064 import_state
.erase(it
);
3066 // send pending import_maps?
3067 mds
->mdcache
->maybe_send_pending_resolves();
3070 mds
->locker
->drop_locks(mut
.get());
3074 cache
->show_subtrees();
3075 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
// Runs once the EImportStart journal entry is durable: finish forcing open
// the imported client sessions, attach the exported caps to our inodes
// (deferred client notification), and ack the exporter with
// MExportDirAck carrying the per-inode cap import map. Moves the import
// to IMPORT_ACKING. If the import aborted while logging, only the session
// bookkeeping is finished.
3081 void Migrator::import_logged_start(dirfrag_t df
, CDir
*dir
, mds_rank_t from
,
3082 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
)
3084 map
<dirfrag_t
, import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
3085 if (it
== import_state
.end() ||
3086 it
->second
.state
!= IMPORT_LOGGINGSTART
) {
3087 dout(7) << "import " << df
<< " must have aborted" << dendl
;
3088 mds
->server
->finish_force_open_sessions(imported_session_map
);
3092 dout(7) << "import_logged " << *dir
<< dendl
;
3095 it
->second
.state
= IMPORT_ACKING
;
3097 assert (g_conf
->mds_kill_import_at
!= 7);
3099 // force open client sessions and finish cap import
3100 mds
->server
->finish_force_open_sessions(imported_session_map
, false);
3102 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
3103 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
3104 p
!= it
->second
.peer_exports
.end();
3106 // parameter 'peer' is NONE, delay sending cap import messages to client
3107 finish_import_inode_caps(p
->first
, MDS_RANK_NONE
, true, imported_session_map
,
3108 p
->second
, imported_caps
[p
->first
->ino()]);
3111 it
->second
.session_map
.swap(imported_session_map
);
3113 // send notify's etc.
3114 dout(7) << "sending ack for " << *dir
<< " to old auth mds." << from
<< dendl
;
3116 // test surviving observer of a failed migration that did not complete
3117 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
3119 MExportDirAck
*ack
= new MExportDirAck(dir
->dirfrag(), it
->second
.tid
);
3120 ::encode(imported_caps
, ack
->imported_caps
);
3122 mds
->send_message_mds(ack
, from
);
3123 assert (g_conf
->mds_kill_import_at
!= 8);
3125 cache
->show_subtrees();
// Importer-side handler for MExportDirFinish (the exporter's commit):
// validates the tid against our import record and completes the import
// via import_finish() (notify=false: the exporter already knows).
3128 /* This function DOES put the passed message before returning*/
3129 void Migrator::handle_export_finish(MExportDirFinish
*m
)
3131 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
3133 dout(7) << "handle_export_finish on " << *dir
<< (m
->is_last() ? " last" : "") << dendl
;
3135 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(m
->get_dirfrag());
3136 assert(it
!= import_state
.end());
3137 assert(it
->second
.tid
== m
->get_tid());
3139 import_finish(dir
, false, m
->is_last());
// Complete a successful import: take unambiguous auth of the subtree,
// merge the exported client caps into our Capability objects (sending the
// deferred cap-import messages), log EImportFinish(success), unpin the
// bounds, unfreeze the tree, and clean up import_state. 'last' selects
// the full-completion path; 'notify' selects whether bystanders are told.
3144 void Migrator::import_finish(CDir
*dir
, bool notify
, bool last
)
3146 dout(7) << "import_finish on " << *dir
<< dendl
;
3148 map
<dirfrag_t
,import_state_t
>::iterator it
= import_state
.find(dir
->dirfrag());
3149 assert(it
!= import_state
.end());
3150 assert(it
->second
.state
== IMPORT_ACKING
|| it
->second
.state
== IMPORT_FINISHING
);
3152 if (it
->second
.state
== IMPORT_ACKING
) {
// auth was ambiguous [me,oldauth]; make it unambiguously ours
3153 assert(dir
->is_auth());
3154 cache
->adjust_subtree_auth(dir
, mds
->get_nodeid(), mds
->get_nodeid());
3158 assert(g_conf
->mds_kill_import_at
!= 9);
3160 if (it
->second
.state
== IMPORT_ACKING
) {
// merge exported caps into our cap objects and tell the clients
3161 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= it
->second
.peer_exports
.begin();
3162 p
!= it
->second
.peer_exports
.end();
3164 CInode
*in
= p
->first
;
3165 assert(in
->is_auth());
3166 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.begin();
3167 q
!= p
->second
.end();
3169 auto r
= it
->second
.session_map
.find(q
->first
);
3170 if (r
== it
->second
.session_map
.end())
3173 Session
*session
= r
->second
.first
;
3174 Capability
*cap
= in
->get_client_cap(q
->first
);
3176 cap
->merge(q
->second
, true);
3177 cap
->clear_importing();
3178 mds
->mdcache
->do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
,
3179 q
->second
.mseq
- 1, it
->second
.peer
, CEPH_CAP_FLAG_AUTH
);
3182 in
->replica_caps_wanted
= 0;
3184 for (auto& p
: it
->second
.session_map
) {
3185 Session
*session
= p
.second
.first
;
3186 session
->dec_importing();
// not 'last': stay in the protocol, just advance to FINISHING
3191 assert(it
->second
.state
== IMPORT_ACKING
);
3192 it
->second
.state
= IMPORT_FINISHING
;
3198 cache
->get_subtree_bounds(dir
, bounds
);
3201 import_notify_finish(dir
, bounds
);
3203 import_remove_pins(dir
, bounds
);
// swap out the cap map before erasing state; re-evaluated below
3205 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
3206 it
->second
.peer_exports
.swap(peer_exports
);
3208 // clear import state (we're done!)
3209 MutationRef mut
= it
->second
.mut
;
3210 import_state
.erase(it
);
3212 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3214 // process delayed expires
3215 cache
->process_delayed_expire(dir
);
3217 // unfreeze tree, with possible subtree merge.
3218 dir
->unfreeze_tree();
3219 cache
->try_subtree_merge(dir
);
3221 cache
->show_subtrees();
3222 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
3225 mds
->locker
->drop_locks(mut
.get());
3229 // re-eval imported caps
3230 for (map
<CInode
*, map
<client_t
,Capability::Export
> >::iterator p
= peer_exports
.begin();
3231 p
!= peer_exports
.end();
3233 if (p
->first
->is_auth())
3234 mds
->locker
->eval(p
->first
, CEPH_CAP_LOCKS
, true);
3235 p
->first
->put(CInode::PIN_IMPORTINGCAPS
);
3238 // send pending import_maps?
3239 mds
->mdcache
->maybe_send_pending_resolves();
3241 // did i just import mydir?
3242 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
3243 cache
->populate_mydir();
// if the dir came back empty and its inode isn't ours, hand it back
3246 if (dir
->get_num_head_items() == 0 &&
3247 !dir
->inode
->is_auth()) {
3249 export_empty_import(dir
);
// Decode one exported inode from the payload, link it under dentry 'dn'
// (creating the CInode if we don't already have it), record its exported
// client caps into peer_exports, re-dirty scatterlocks that arrived
// dirty, and fix up the replica list vs. the old auth.
3254 void Migrator::decode_import_inode(CDentry
*dn
, bufferlist::iterator
& blp
,
3255 mds_rank_t oldauth
, LogSegment
*ls
,
3256 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
,
3257 list
<ScatterLock
*>& updated_scatterlocks
)
3259 dout(15) << "decode_import_inode on " << *dn
<< dendl
;
3264 ::decode(last
, blp
);
// reuse an existing CInode if we already have (ino, last) cached
3267 CInode
*in
= cache
->get_inode(ino
, last
);
3269 in
= new CInode(mds
->mdcache
, true, 1, last
);
3273 // state after link -- or not! -sage
3274 in
->decode_import(blp
, ls
); // cap imports are noted for later action
3277 decode_import_inode_caps(in
, true, blp
, peer_exports
);
3279 // link before state -- or not! -sage
3280 if (dn
->get_linkage()->get_inode() != in
) {
3281 assert(!dn
->get_linkage()->get_inode());
3282 dn
->dir
->link_primary_inode(dn
, in
);
3286 dn
->dir
->pop_lru_subdirs
.push_back(&in
->item_pop_lru
);
// newly created inode: register it with the cache
3290 cache
->add_inode(in
);
3291 dout(10) << "added " << *in
<< dendl
;
3293 dout(10) << " had " << *in
<< dendl
;
3296 if (in
->inode
.is_dirty_rstat())
3297 in
->mark_dirty_rstat();
3299 // clear if dirtyscattered, since we're going to journal this
3300 // but not until we _actually_ finish the import...
3301 if (in
->filelock
.is_dirty()) {
3302 updated_scatterlocks
.push_back(&in
->filelock
);
3303 mds
->locker
->mark_updated_scatterlock(&in
->filelock
);
3306 if (in
->dirfragtreelock
.is_dirty()) {
3307 updated_scatterlocks
.push_back(&in
->dirfragtreelock
);
3308 mds
->locker
->mark_updated_scatterlock(&in
->dirfragtreelock
);
3311 // adjust replica list
3312 //assert(!in->is_replica(oldauth)); // not true on failed export
3313 in
->add_replica(oldauth
, CInode::EXPORT_NONCE
);
3314 if (in
->is_replica(mds
->get_nodeid()))
3315 in
->remove_replica(mds
->get_nodeid());
// Decode the per-client cap export map (and mds_caps_wanted) for one
// inode. If any caps were exported -- or this is the auth cap and clients
// want more than PIN -- stash them in peer_exports and pin the inode with
// PIN_IMPORTINGCAPS until finish_import_inode_caps runs.
3318 void Migrator::decode_import_inode_caps(CInode
*in
, bool auth_cap
,
3319 bufferlist::iterator
&blp
,
3320 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3322 map
<client_t
,Capability::Export
> cap_map
;
3323 ::decode(cap_map
, blp
);
3325 ::decode(in
->get_mds_caps_wanted(), blp
);
3326 if (!cap_map
.empty() ||
3327 (auth_cap
&& (in
->get_caps_wanted() & ~CEPH_CAP_PIN
))) {
3328 peer_exports
[in
].swap(cap_map
);
3329 in
->get(CInode::PIN_IMPORTINGCAPS
);
3333 void Migrator::finish_import_inode_caps(CInode
*in
, mds_rank_t peer
, bool auth_cap
,
3334 const map
<client_t
,pair
<Session
*,uint64_t> >& session_map
,
3335 const map
<client_t
,Capability::Export
> &export_map
,
3336 map
<client_t
,Capability::Import
> &import_map
)
3338 for (auto& it
: export_map
) {
3339 dout(10) << "finish_import_inode_caps for client." << it
.first
<< " on " << *in
<< dendl
;
3341 auto p
= session_map
.find(it
.first
);
3342 if (p
== session_map
.end()) {
3343 dout(10) << " no session for client." << it
.first
<< dendl
;
3344 (void)import_map
[it
.first
];
3348 Session
*session
= p
->second
.first
;
3350 Capability
*cap
= in
->get_client_cap(it
.first
);
3352 cap
= in
->add_client_cap(it
.first
, session
);
3354 cap
->mark_importing();
3357 // Always ask exporter mds to send cap export messages for auth caps.
3358 // For non-auth caps, ask exporter mds to send cap export messages to
3359 // clients who haven't opened sessions. The cap export messages will
3360 // make clients open sessions.
3361 if (auth_cap
|| session
->connection
== nullptr) {
3362 Capability::Import
& im
= import_map
[it
.first
];
3363 im
.cap_id
= cap
->get_cap_id();
3364 im
.mseq
= auth_cap
? it
.second
.mseq
: cap
->get_mseq();
3365 im
.issue_seq
= cap
->get_last_seq() + 1;
3369 cap
->merge(it
.second
, auth_cap
);
3370 mds
->mdcache
->do_cap_import(session
, in
, cap
, it
.second
.cap_id
,
3371 it
.second
.seq
, it
.second
.mseq
- 1, peer
,
3372 auth_cap
? CEPH_CAP_FLAG_AUTH
: CEPH_CAP_FLAG_RELEASE
);
3377 in
->replica_caps_wanted
= 0;
3378 in
->put(CInode::PIN_IMPORTINGCAPS
);
// Decode one exported dirfrag from the payload into the cache: decode the
// CDir itself, re-home any waiters onto the import root's UNFREEZE list,
// then decode each dentry and its linkage ('N' null / 'L' remote /
// 'I' primary inode), adding everything to the EImportStart metablob.
// Returns the number of inodes imported from this dirfrag.
3382 int Migrator::decode_import_dir(bufferlist::iterator
& blp
,
3387 map
<CInode
*,map
<client_t
,Capability::Export
> >& peer_exports
,
3388 list
<ScatterLock
*>& updated_scatterlocks
, utime_t now
)
3394 CInode
*diri
= cache
->get_inode(df
.ino
);
3396 CDir
*dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, df
.frag
);
3399 dout(7) << "decode_import_dir " << *dir
<< dendl
;
3402 dir
->decode_import(blp
, now
, ls
);
3404 // adjust replica list
3405 //assert(!dir->is_replica(oldauth)); // not true on failed export
3406 dir
->add_replica(oldauth
, CDir::EXPORT_NONCE
);
3407 if (dir
->is_replica(mds
->get_nodeid()))
3408 dir
->remove_replica(mds
->get_nodeid());
3410 // add to journal entry
3412 le
->metablob
.add_import_dir(dir
);
3414 int num_imported
= 0;
3416 // take all waiters on this dir
3417 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3418 // a replica's presence in my cache implies/forces its presence in the authority's.
3419 list
<MDSInternalContextBase
*> waiters
;
3421 dir
->take_waiting(CDir::WAIT_ANY_MASK
, waiters
);
3422 for (list
<MDSInternalContextBase
*>::iterator it
= waiters
.begin();
3423 it
!= waiters
.end();
3425 import_root
->add_waiter(CDir::WAIT_UNFREEZE
, *it
); // UNFREEZE will get kicked both on success or failure
3427 dout(15) << "doing contents" << dendl
;
// dentry count followed by per-dentry records
3431 ::decode(nden
, blp
);
3433 for (; nden
>0; nden
--) {
3439 ::decode(dname
, blp
);
3440 ::decode(last
, blp
);
3442 CDentry
*dn
= dir
->lookup_exact_snap(dname
, last
);
3444 dn
= dir
->add_null_dentry(dname
, 1, last
);
3446 dn
->decode_import(blp
, ls
);
3448 dn
->add_replica(oldauth
, CDentry::EXPORT_NONCE
);
3449 if (dn
->is_replica(mds
->get_nodeid()))
3450 dn
->remove_replica(mds
->get_nodeid());
3452 // dentry lock in unreadable state can block path traverse
3453 if (dn
->lock
.get_state() != LOCK_SYNC
)
3454 mds
->locker
->try_eval(&dn
->lock
, NULL
);
3456 dout(15) << "decode_import_dir got " << *dn
<< dendl
;
// linkage type tag: null / remote link / primary inode
3460 ::decode(icode
, blp
);
3464 assert(dn
->get_linkage()->is_null());
3468 else if (icode
== 'L') {
3471 unsigned char d_type
;
3473 ::decode(d_type
, blp
);
3474 if (dn
->get_linkage()->is_remote()) {
3475 assert(dn
->get_linkage()->get_remote_ino() == ino
);
3477 dir
->link_remote_inode(dn
, ino
, d_type
);
3480 else if (icode
== 'I') {
3483 decode_import_inode(dn
, blp
, oldauth
, ls
,
3484 peer_exports
, updated_scatterlocks
);
3487 // add dentry to journal entry
3489 le
->metablob
.add_import_dentry(dn
);
3492 #ifdef MDS_VERIFY_FRAGSTAT
3493 if (dir
->is_complete())
3494 dir
->verify_fragstat();
3497 dir
->inode
->maybe_export_pin();
3499 dout(7) << "decode_import_dir done " << *dir
<< dendl
;
3500 return num_imported
;
3507 // authority bystander
3509 /* This function DOES put the passed message before returning*/
// Bystander handler: another rank tells us authority for a dirfrag moved
// old_auth -> new_auth.  If we have the dirfrag, adjust the bounded subtree
// authority and try a subtree merge; ack back if the sender requested one.
3510 void Migrator::handle_export_notify(MExportDirNotify
*m
)
// only meaningful once we are clientreplay/active/stopping
// (the early-return body of this guard is elided from this excerpt)
3512 if (!(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping())) {
3517 CDir
*dir
= cache
->get_dirfrag(m
->get_dirfrag());
3519 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3520 mds_authority_t old_auth
= m
->get_old_auth();
3521 mds_authority_t new_auth
= m
->get_new_auth();
// presumably the !dir branch (guard elided): we don't have the dirfrag, log only
3524 dout(7) << "handle_export_notify " << old_auth
<< " -> " << new_auth
3525 << " on missing dir " << m
->get_dirfrag() << dendl
;
// our recorded authority disagrees with the sender's old_auth -- log the mismatch
3526 } else if (dir
->authority() != old_auth
) {
3527 dout(7) << "handle_export_notify old_auth was " << dir
->authority()
3528 << " != " << old_auth
<< " -> " << new_auth
3529 << " on " << *dir
<< dendl
;
// normal case: we agree on the old authority
3531 dout(7) << "handle_export_notify " << old_auth
<< " -> " << new_auth
3532 << " on " << *dir
<< dendl
;
// map the bound dirfrags we hold locally and apply the new authority
3535 cache
->map_dirfrag_set(m
->get_bounds(), have
);
3536 cache
->adjust_bounded_subtree_auth(dir
, have
, new_auth
);
3539 cache
->try_subtree_merge(dir
);
// ack back to the sender when requested
3543 if (m
->wants_ack()) {
3544 mds
->send_message_mds(new MExportDirNotifyAck(m
->get_dirfrag(), m
->get_tid(), m
->get_new_auth()), from
);
3547 dout(7) << "handle_export_notify no ack requested" << dendl
;
3554 void Migrator::export_caps(CInode
*in
)
3556 mds_rank_t dest
= in
->authority().first
;
3557 dout(7) << "export_caps to mds." << dest
<< " " << *in
<< dendl
;
3559 assert(in
->is_any_caps());
3560 assert(!in
->is_auth());
3561 assert(!in
->is_ambiguous_auth());
3562 assert(!in
->state_test(CInode::STATE_EXPORTINGCAPS
));
3564 MExportCaps
*ex
= new MExportCaps
;
3565 ex
->ino
= in
->ino();
3567 encode_export_inode_caps(in
, false, ex
->cap_bl
, ex
->client_map
);
3569 mds
->send_message_mds(ex
, dest
);
3572 /* This function DOES put the passed message before returning*/
// Exporter side, after export_caps(): the auth MDS acks which caps it
// imported.  For each cap whose id still matches ours, tell the client the
// cap was exported (so it follows to the new MDS), drop our local copy,
// then re-evaluate the inode's cap/lock state.
3573 void Migrator::handle_export_caps_ack(MExportCapsAck
*ack
)
3575 mds_rank_t from
= ack
->get_source().num();
3576 CInode
*in
= cache
->get_inode(ack
->ino
);
3578 assert(!in
->is_auth());
3580 dout(10) << "handle_export_caps_ack " << *ack
<< " from "
3581 << ack
->get_source() << " on " << *in
<< dendl
;
// ack payload: per-client import records, plus the cap ids the importer saw
3583 map
<client_t
,Capability::Import
> imported_caps
;
3584 map
<client_t
,uint64_t> caps_ids
;
3585 auto blp
= ack
->cap_bl
.begin();
3586 ::decode(imported_caps
, blp
);
3587 ::decode(caps_ids
, blp
);
3589 for (auto& it
: imported_caps
) {
3590 Capability
*cap
= in
->get_client_cap(it
.first
);
// skip if the client released the cap meanwhile, or it was reissued with a
// different id (the 'continue' statement is elided from this excerpt)
3591 if (!cap
|| cap
->get_cap_id() != caps_ids
.at(it
.first
))
3594 dout(7) << __func__
<< " telling client." << it
.first
3595 << " exported caps on " << *in
<< dendl
;
// CEPH_CAP_OP_EXPORT message carries our (old) cap identity plus the
// peer (new) identity so the client can switch over
3596 MClientCaps
*m
= new MClientCaps(CEPH_CAP_OP_EXPORT
, in
->ino(), 0,
3597 cap
->get_cap_id(), cap
->get_mseq(),
3598 mds
->get_osd_epoch_barrier());
3599 m
->set_cap_peer(it
.second
.cap_id
, it
.second
.issue_seq
, it
.second
.mseq
, from
, 0);
3600 mds
->send_message_client_counted(m
, it
.first
);
3602 in
->remove_client_cap(it
.first
);
// refresh replica cap-wanted state with the auth MDS and re-eval locks
3605 mds
->locker
->request_inode_file_caps(in
);
3606 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
// Another rank asks us to hand over this inode's client caps.  If the inode
// qualifies (holds caps, unambiguous authority, not already exporting caps),
// presumably export_caps() is invoked on the elided lines that follow the
// visible condition -- confirm against the full file.
3612 void Migrator::handle_gather_caps(MGatherCaps
*m
)
3614 CInode
*in
= cache
->get_inode(m
->ino
);
// NOTE(review): a null-check / early-out for 'in' appears to be elided here
3618 dout(10) << "handle_gather_caps " << *m
<< " from " << m
->get_source()
3619 << " on " << *in
<< dendl
;
3621 if (in
->is_any_caps() &&
3623 !in
->is_ambiguous_auth() &&
3624 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
// Log-completion context for handle_export_caps(): once the ESessions journal
// entry is durable, finish the cap import via logged_import_caps().
// Carries the session map built by prepare_force_open_sessions() and the
// decoded per-inode cap exports across the journal flush.
// NOTE(review): the member declarations for 'in' and 'from' used by the
// constructor are on lines elided from this excerpt.
3631 class C_M_LoggedImportCaps
: public MigratorLogContext
{
3635 map
<client_t
,pair
<Session
*,uint64_t> > imported_session_map
;
3636 map
<CInode
*, map
<client_t
,Capability::Export
> > peer_exports
;
3638 C_M_LoggedImportCaps(Migrator
*m
, CInode
*i
, mds_rank_t f
) : MigratorLogContext(m
), in(i
), from(f
) {}
3639 void finish(int r
) override
{
3640 mig
->logged_import_caps(in
, from
, imported_session_map
, peer_exports
);
3644 /* This function DOES put the passed message before returning*/
// Importer (auth) side of export_caps(): decode the replica's cap bundle for
// this inode, force-open sessions for the clients involved, and journal them
// (ESessions).  C_M_LoggedImportCaps completes the import once the entry is
// durable.
3645 void Migrator::handle_export_caps(MExportCaps
*ex
)
3647 dout(10) << "handle_export_caps " << *ex
<< " from " << ex
->get_source() << dendl
;
3648 CInode
*in
= cache
->get_inode(ex
->ino
);
3651 assert(in
->is_auth());
// cannot auth-pin right now (e.g. freezing) -- the retry/defer path for this
// guard is elided from this excerpt
3654 if (!in
->can_auth_pin()) {
// take ownership of the client map shipped with the message
3661 map
<client_t
,entity_inst_t
> client_map
;
3662 client_map
.swap(ex
->client_map
);
3664 C_M_LoggedImportCaps
*finish
= new C_M_LoggedImportCaps(
3665 this, in
, mds_rank_t(ex
->get_source().num()));
// pre-open sessions for the bundled clients; pv versions the ESessions entry
3667 version_t pv
= mds
->server
->prepare_force_open_sessions(client_map
,
3668 finish
->imported_session_map
);
3670 bufferlist::iterator blp
= ex
->cap_bl
.begin();
3671 decode_import_inode_caps(in
, false, blp
, finish
->peer_exports
);
3672 assert(!finish
->peer_exports
.empty()); // thus, inode is pinned.
3674 // journal open client sessions
3676 ESessions
*le
= new ESessions(pv
, client_map
);
3677 mds
->mdlog
->start_submit_entry(le
, finish
);
3678 mds
->mdlog
->flush();
// Completion of handle_export_caps() after the ESessions entry committed:
// finalize the forced-open sessions, import the caps recorded for this inode,
// and ack back to the sender so its clients can release the old caps.
// NOTE(review): the 'mds_rank_t from' parameter line is elided here.
3684 void Migrator::logged_import_caps(CInode
*in
,
3686 map
<client_t
,pair
<Session
*,uint64_t> >& imported_session_map
,
3687 map
<CInode
*, map
<client_t
,Capability::Export
> >& peer_exports
)
3689 dout(10) << "logged_import_caps on " << *in
<< dendl
;
3690 // see export_go() vs export_go_synced()
3691 assert(in
->is_auth());
3693 // force open client sessions and finish cap import
3694 mds
->server
->finish_force_open_sessions(imported_session_map
);
// the exports for this inode were stashed by decode_import_inode_caps()
3696 auto it
= peer_exports
.find(in
);
3697 assert(it
!= peer_exports
.end());
3699 // clients will release caps from the exporter when they receive the cap import message.
3700 map
<client_t
,Capability::Import
> imported_caps
;
3701 finish_import_inode_caps(in
, from
, false, imported_session_map
, it
->second
, imported_caps
);
3702 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
// tell the exporter which caps we imported, keyed by the cap ids it sent us
3704 if (!imported_caps
.empty()) {
3705 MExportCapsAck
*ack
= new MExportCapsAck(in
->ino());
3706 map
<client_t
,uint64_t> peer_caps_ids
;
3707 for (auto &p
: imported_caps
)
3708 peer_caps_ids
[p
.first
] = it
->second
.at(p
.first
).cap_id
;
3710 ::encode(imported_caps
, ack
->cap_bl
);
3711 ::encode(peer_caps_ids
, ack
->cap_bl
);
3712 mds
->send_message_mds(ack
, from
);
// presumably balances an auth_pin taken in handle_export_caps on a line not
// visible in this excerpt -- confirm against the full file
3715 in
->auth_unpin(this);
3718 Migrator::Migrator(MDSRank
*m
, MDCache
*c
) : mds(m
), cache(c
) {
3719 max_export_size
= g_conf
->get_val
<uint64_t>("mds_max_export_size");
3720 inject_session_race
= g_conf
->get_val
<bool>("mds_inject_migrator_session_race");
// Config-observer callback: refresh the migrator's tunables when any of the
// watched options change at runtime.
3723 void Migrator::handle_conf_change(const struct md_config_t
*conf
,
3724 const std::set
<std::string
> &changed
,
3725 const MDSMap
&mds_map
)
3727 if (changed
.count("mds_max_export_size"))
3728 max_export_size
= conf
->get_val
<uint64_t>("mds_max_export_size");
3730 if (changed
.count("mds_inject_migrator_session_race")) {
3731 inject_session_race
= conf
->get_val
<bool>("mds_inject_migrator_session_race");
3732 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race
<< dendl
;
3735 if (changed
.count("mds_inject_migrator_message_loss")) {
// NOTE(review): this branch reads the global g_conf while the two branches
// above use the 'conf' argument passed to this observer -- likely an
// unintentional inconsistency; confirm whether it should be conf->get_val.
3736 inject_message_loss
= g_conf
->get_val
<int64_t>("mds_inject_migrator_message_loss");
3737 dout(0) << "mds_inject_migrator_message_loss is " << inject_message_loss
<< dendl
;