1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MDSRank.h"
16#include "MDCache.h"
17#include "CInode.h"
18#include "CDir.h"
19#include "CDentry.h"
20#include "Migrator.h"
21#include "Locker.h"
22#include "Server.h"
23
24#include "MDBalancer.h"
25#include "MDLog.h"
26#include "MDSMap.h"
27#include "Mutation.h"
28
29#include "include/filepath.h"
30#include "common/likely.h"
31
32#include "events/EExport.h"
33#include "events/EImportStart.h"
34#include "events/EImportFinish.h"
35#include "events/ESessions.h"
36
37#include "msg/Messenger.h"
38
39#include "messages/MClientCaps.h"
40
41#include "messages/MExportDirDiscover.h"
42#include "messages/MExportDirDiscoverAck.h"
43#include "messages/MExportDirCancel.h"
44#include "messages/MExportDirPrep.h"
45#include "messages/MExportDirPrepAck.h"
46#include "messages/MExportDir.h"
47#include "messages/MExportDirAck.h"
48#include "messages/MExportDirNotify.h"
49#include "messages/MExportDirNotifyAck.h"
50#include "messages/MExportDirFinish.h"
51
52#include "messages/MExportCaps.h"
53#include "messages/MExportCapsAck.h"
54#include "messages/MGatherCaps.h"
55
56
57/*
58 * this is what the dir->dir_auth values look like
59 *
60 * dir_auth authbits
 61 * export:
62 * me me - before
63 * me, me me - still me, but preparing for export
64 * me, them me - send MExportDir (peer is preparing)
65 * them, me me - journaled EExport
66 * them them - done
67 *
68 * import:
69 * them them - before
70 * me, them me - journaled EImportStart
71 * me me - done
72 *
73 * which implies:
74 * - auth bit is set if i am listed as first _or_ second dir_auth.
75 */
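// A sketch (not code from this file) of the rule above, with 'whoami' standing
// in for this MDS rank:
//   bool am_auth = (dir_auth.first == whoami) || (dir_auth.second == whoami);
// The real test lives in CDir/MDCache; this only illustrates the
// "first _or_ second" auth-bit convention.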
76
77#include "common/config.h"
78
79
80#define dout_context g_ceph_context
81#define dout_subsys ceph_subsys_mds
82#undef dout_prefix
83#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
84
85
86class MigratorContext : public MDSInternalContextBase {
87protected:
88 Migrator *mig;
89 MDSRank *get_mds() override {
90 return mig->mds;
91 }
92public:
93 explicit MigratorContext(Migrator *mig_) : mig(mig_) {
94 assert(mig != NULL);
95 }
96};
97
98class MigratorLogContext : public MDSLogContextBase {
99protected:
100 Migrator *mig;
101 MDSRank *get_mds() override {
102 return mig->mds;
103 }
104public:
105 explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
106 assert(mig != NULL);
107 }
108};
109
110/* This function DOES put the passed message before returning*/
111void Migrator::dispatch(Message *m)
112{
113 switch (m->get_type()) {
114 // import
115 case MSG_MDS_EXPORTDIRDISCOVER:
116 handle_export_discover(static_cast<MExportDirDiscover*>(m));
117 break;
118 case MSG_MDS_EXPORTDIRPREP:
119 handle_export_prep(static_cast<MExportDirPrep*>(m));
120 break;
121 case MSG_MDS_EXPORTDIR:
122 if (unlikely(inject_session_race)) {
123 dout(0) << "waiting for inject_session_race" << dendl;
124 mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
125 } else {
126 handle_export_dir(static_cast<MExportDir*>(m));
127 }
128 break;
129 case MSG_MDS_EXPORTDIRFINISH:
130 handle_export_finish(static_cast<MExportDirFinish*>(m));
131 break;
132 case MSG_MDS_EXPORTDIRCANCEL:
133 handle_export_cancel(static_cast<MExportDirCancel*>(m));
134 break;
135
136 // export
137 case MSG_MDS_EXPORTDIRDISCOVERACK:
138 handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
139 break;
140 case MSG_MDS_EXPORTDIRPREPACK:
141 handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
142 break;
143 case MSG_MDS_EXPORTDIRACK:
144 handle_export_ack(static_cast<MExportDirAck*>(m));
145 break;
146 case MSG_MDS_EXPORTDIRNOTIFYACK:
147 handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
148 break;
149
150 // export 3rd party (dir_auth adjustments)
151 case MSG_MDS_EXPORTDIRNOTIFY:
152 handle_export_notify(static_cast<MExportDirNotify*>(m));
153 break;
154
155 // caps
156 case MSG_MDS_EXPORTCAPS:
157 handle_export_caps(static_cast<MExportCaps*>(m));
158 break;
159 case MSG_MDS_GATHERCAPS:
160 handle_gather_caps(static_cast<MGatherCaps*>(m));
161 break;
162
163 default:
164 derr << "migrator unknown message " << m->get_type() << dendl;
165 assert(0 == "migrator unknown message");
166 }
167}
168
169
170class C_MDC_EmptyImport : public MigratorContext {
171 CDir *dir;
172public:
173 C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
174 void finish(int r) override {
175 mig->export_empty_import(dir);
176 }
177};
178
179
180void Migrator::export_empty_import(CDir *dir)
181{
182 dout(7) << "export_empty_import " << *dir << dendl;
183 assert(dir->is_subtree_root());
184
185 if (dir->inode->is_auth()) {
186 dout(7) << " inode is auth" << dendl;
187 return;
188 }
189 if (!dir->is_auth()) {
190 dout(7) << " not auth" << dendl;
191 return;
192 }
193 if (dir->is_freezing() || dir->is_frozen()) {
194 dout(7) << " freezing or frozen" << dendl;
195 return;
196 }
197 if (dir->get_num_head_items() > 0) {
198 dout(7) << " not actually empty" << dendl;
199 return;
200 }
201 if (dir->inode->is_root()) {
202 dout(7) << " root" << dendl;
203 return;
204 }
205
206 mds_rank_t dest = dir->inode->authority().first;
207 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
208
209 dout(7) << " really empty, exporting to " << dest << dendl;
210 assert (dest != mds->get_nodeid());
211
212 dout(7) << "exporting to mds." << dest
213 << " empty import " << *dir << dendl;
214 export_dir( dir, dest );
215}
216
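// Scan exports still in DISCOVERING or FREEZING and cancel any whose
// cumulative auth_pin count has not changed for mds_freeze_tree_timeout
// seconds while remote waiters exist or the parent dir is itself freezing;
// this breaks the freeze/authpin deadlocks described below.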
217void Migrator::find_stale_export_freeze()
218{
219 utime_t now = ceph_clock_now();
220 utime_t cutoff = now;
221 cutoff -= g_conf->mds_freeze_tree_timeout;
222
223
224 /*
225 * We could have situations like:
226 *
227 * - mds.0 authpins an item in subtree A
228 * - mds.0 sends request to mds.1 to authpin an item in subtree B
229 * - mds.0 freezes subtree A
230 * - mds.1 authpins an item in subtree B
231 * - mds.1 sends request to mds.0 to authpin an item in subtree A
232 * - mds.1 freezes subtree B
233 * - mds.1 receives the remote authpin request from mds.0
234 * (wait because subtree B is freezing)
235 * - mds.0 receives the remote authpin request from mds.1
236 * (wait because subtree A is freezing)
237 *
238 *
239 * - client request authpins items in subtree B
240 * - freeze subtree B
241 * - import subtree A which is parent of subtree B
242 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
243 * - freeze subtree A
244 * - client request tries authpinning items in subtree A
245 * (wait because subtree A is freezing)
246 */
247 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
248 p != export_state.end(); ) {
249 CDir* dir = p->first;
250 export_state_t& stat = p->second;
251 ++p;
252 if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
253 continue;
254 if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
255 stat.last_cum_auth_pins = dir->get_cum_auth_pins();
256 stat.last_cum_auth_pins_change = now;
257 continue;
258 }
259 if (stat.last_cum_auth_pins_change >= cutoff)
260 continue;
261 if (stat.num_remote_waiters > 0 ||
262 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
263 export_try_cancel(dir);
264 }
265 }
266}
267
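// Abort an in-progress export of 'dir', undoing whatever the current state
// requires: unfreezing, unpinning bounds, notifying bystanders, and (unless
// notify_peer is false) telling the importer via MExportDirCancel.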
268void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
269{
270 dout(10) << "export_try_cancel " << *dir << dendl;
271
272 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
273 assert(it != export_state.end());
274
275 int state = it->second.state;
276 switch (state) {
277 case EXPORT_LOCKING:
278 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
279 it->second.state = EXPORT_CANCELLED;
280 dir->auth_unpin(this);
281 break;
282 case EXPORT_DISCOVERING:
283 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
284 it->second.state = EXPORT_CANCELLED;
285 dir->unfreeze_tree(); // cancel the freeze
286 dir->auth_unpin(this);
287 if (notify_peer &&
288 (!mds->is_cluster_degraded() ||
289 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
290 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
291 break;
292
293 case EXPORT_FREEZING:
294 dout(10) << "export state=freezing : canceling freeze" << dendl;
295 it->second.state = EXPORT_CANCELLED;
296 dir->unfreeze_tree(); // cancel the freeze
297 if (dir->is_subtree_root())
298 cache->try_subtree_merge(dir);
299 if (notify_peer &&
300 (!mds->is_cluster_degraded() ||
301 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
302 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
303 break;
304
305 // NOTE: state order reversal, warning comes after prepping
306 case EXPORT_WARNING:
307 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
308 it->second.state = EXPORT_CANCELLING;
309 // fall-thru
310
311 case EXPORT_PREPPING:
312 if (state != EXPORT_WARNING) {
313 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
314 it->second.state = EXPORT_CANCELLED;
315 }
316
317 {
318 // unpin bounds
319 set<CDir*> bounds;
320 cache->get_subtree_bounds(dir, bounds);
321 for (set<CDir*>::iterator q = bounds.begin();
322 q != bounds.end();
323 ++q) {
324 CDir *bd = *q;
325 bd->put(CDir::PIN_EXPORTBOUND);
326 bd->state_clear(CDir::STATE_EXPORTBOUND);
327 }
328 if (state == EXPORT_WARNING) {
329 // notify bystanders
330	export_notify_abort(dir, it->second, bounds);
331 // process delayed expires
332 cache->process_delayed_expire(dir);
333 }
334 }
335 dir->unfreeze_tree();
336	cache->try_subtree_merge(dir);
337 for (auto bd : it->second.residual_dirs) {
338 bd->unfreeze_tree();
339 cache->try_subtree_merge(bd);
340 }
341 if (notify_peer &&
342 (!mds->is_cluster_degraded() ||
343 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
344 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
345 break;
346
347 case EXPORT_EXPORTING:
348 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
349 it->second.state = EXPORT_CANCELLING;
350	export_reverse(dir, it->second);
351 break;
352
353 case EXPORT_LOGGINGFINISH:
354 case EXPORT_NOTIFYING:
355 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
356 // leave export_state, don't clean up now.
357 break;
358 case EXPORT_CANCELLING:
359 break;
360
361 default:
362 ceph_abort();
363 }
364
365 // finish clean-up?
366 if (it->second.state == EXPORT_CANCELLING ||
367 it->second.state == EXPORT_CANCELLED) {
368 MutationRef mut;
369 mut.swap(it->second.mut);
370
371 if (it->second.state == EXPORT_CANCELLED) {
372 export_state.erase(it);
373 dir->state_clear(CDir::STATE_EXPORTING);
374 // send pending import_maps?
375 cache->maybe_send_pending_resolves();
376 }
377
378 // drop locks
379 if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
380 MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
381 assert(mdr);
382 if (mdr->more()->waiting_on_slave.empty())
383 mds->mdcache->request_finish(mdr);
384 } else if (mut) {
385 mds->locker->drop_locks(mut.get());
386 mut->cleanup();
387 }
388
389 cache->show_subtrees();
390
391 maybe_do_queued_export();
392 }
393}
394
395void Migrator::export_cancel_finish(CDir *dir)
396{
397 assert(dir->state_test(CDir::STATE_EXPORTING));
398 dir->state_clear(CDir::STATE_EXPORTING);
399
400 // pinned by Migrator::export_notify_abort()
401 dir->auth_unpin(this);
402 // send pending import_maps? (these need to go out when all exports have finished.)
403 cache->maybe_send_pending_resolves();
404}
405
406// ==========================================================
407// mds failure handling
408
409void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
410{
411 dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;
412
413 // check my exports
414
415 // first add an extra auth_pin on any freezes, so that canceling a
416 // nested freeze doesn't complete one further up the hierarchy and
417 // confuse the shit out of us. we'll remove it after canceling the
418 // freeze. this way no freeze completions run before we want them
419 // to.
420 list<CDir*> pinned_dirs;
421 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
422 p != export_state.end();
423 ++p) {
424 if (p->second.state == EXPORT_FREEZING) {
425 CDir *dir = p->first;
426 dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
427 dir->auth_pin(this);
428 pinned_dirs.push_back(dir);
429 }
430 }
431
432 map<CDir*,export_state_t>::iterator p = export_state.begin();
433 while (p != export_state.end()) {
434 map<CDir*,export_state_t>::iterator next = p;
435 ++next;
436 CDir *dir = p->first;
437
438 // abort exports:
439 // - that are going to the failed node
440 // - that aren't frozen yet (to avoid auth_pin deadlock)
441 // - that haven't prepped yet (they may need to discover bounds to do that)
442 if ((p->second.peer == who &&
443 p->second.state != EXPORT_CANCELLING) ||
444 p->second.state == EXPORT_LOCKING ||
445 p->second.state == EXPORT_DISCOVERING ||
446 p->second.state == EXPORT_FREEZING ||
447 p->second.state == EXPORT_PREPPING) {
448 // the guy i'm exporting to failed, or we're just freezing.
449 dout(10) << "cleaning up export state (" << p->second.state << ")"
450 << get_export_statename(p->second.state) << " of " << *dir << dendl;
451 export_try_cancel(dir);
452 } else if (p->second.peer != who) {
453 // bystander failed.
454 if (p->second.warning_ack_waiting.erase(who)) {
455 if (p->second.state == EXPORT_WARNING) {
456 p->second.notify_ack_waiting.erase(who); // they won't get a notify either.
457 // exporter waiting for warning acks, let's fake theirs.
458 dout(10) << "faking export_warning_ack from mds." << who
459 << " on " << *dir << " to mds." << p->second.peer
460 << dendl;
461 if (p->second.warning_ack_waiting.empty())
462 export_go(dir);
463 }
464 }
465 if (p->second.notify_ack_waiting.erase(who)) {
466 // exporter is waiting for notify acks, fake it
467 dout(10) << "faking export_notify_ack from mds." << who
468 << " on " << *dir << " to mds." << p->second.peer
469 << dendl;
470 if (p->second.state == EXPORT_NOTIFYING) {
471 if (p->second.notify_ack_waiting.empty())
472 export_finish(dir);
473 } else if (p->second.state == EXPORT_CANCELLING) {
474 if (p->second.notify_ack_waiting.empty()) {
475 export_state.erase(p);
476 export_cancel_finish(dir);
477 }
478 }
479 }
480 }
481
482 // next!
483 p = next;
484 }
485
486
487 // check my imports
488 map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
489 while (q != import_state.end()) {
490 map<dirfrag_t,import_state_t>::iterator next = q;
491 ++next;
492 dirfrag_t df = q->first;
493 CInode *diri = mds->mdcache->get_inode(df.ino);
494 CDir *dir = mds->mdcache->get_dirfrag(df);
495
496 if (q->second.peer == who) {
497 if (dir)
498 dout(10) << "cleaning up import state (" << q->second.state << ")"
499 << get_import_statename(q->second.state) << " of " << *dir << dendl;
500 else
501 dout(10) << "cleaning up import state (" << q->second.state << ")"
502 << get_import_statename(q->second.state) << " of " << df << dendl;
503
504 switch (q->second.state) {
505 case IMPORT_DISCOVERING:
506 dout(10) << "import state=discovering : clearing state" << dendl;
507 import_reverse_discovering(df);
508 break;
509
510 case IMPORT_DISCOVERED:
511 assert(diri);
512 dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
513 import_reverse_discovered(df, diri);
514 break;
515
516 case IMPORT_PREPPING:
517 assert(dir);
518 dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
519	import_reverse_prepping(dir, q->second);
520 break;
521
522 case IMPORT_PREPPED:
523 assert(dir);
524 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
525 {
526 set<CDir*> bounds;
527 cache->get_subtree_bounds(dir, bounds);
528 import_remove_pins(dir, bounds);
529
530 // adjust auth back to the exporter
531 cache->adjust_subtree_auth(dir, q->second.peer);
532
533 // notify bystanders ; wait in aborting state
534	q->second.state = IMPORT_ABORTING;
535 import_notify_abort(dir, bounds);
536 assert(g_conf->mds_kill_import_at != 10);
537 }
538 break;
539
540 case IMPORT_LOGGINGSTART:
541 assert(dir);
542 dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
543 import_reverse(dir);
544 break;
545
546 case IMPORT_ACKING:
547 assert(dir);
548 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
549 dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
550 {
551 set<CDir*> bounds;
552 cache->get_subtree_bounds(dir, bounds);
553 cache->add_ambiguous_import(dir, bounds);
554 }
555 break;
556
557 case IMPORT_FINISHING:
558 assert(dir);
559 dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
560 import_finish(dir, true);
561 break;
562
563 case IMPORT_ABORTING:
564 assert(dir);
565 dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
566 break;
567 }
568 } else {
569 auto bystanders_entry = q->second.bystanders.find(who);
570 if (bystanders_entry != q->second.bystanders.end()) {
571 q->second.bystanders.erase(bystanders_entry);
572 if (q->second.state == IMPORT_ABORTING) {
573 assert(dir);
574 dout(10) << "faking export_notify_ack from mds." << who
575 << " on aborting import " << *dir << " from mds." << q->second.peer
576 << dendl;
577	if (q->second.bystanders.empty())
578	  import_reverse_unfreeze(dir);
579 }
580 }
581 }
582
583 // next!
584 q = next;
585 }
586
587 while (!pinned_dirs.empty()) {
588 CDir *dir = pinned_dirs.front();
589 dout(10) << "removing temp auth_pin on " << *dir << dendl;
590 dir->auth_unpin(this);
591 pinned_dirs.pop_front();
592 }
593}
594
595
596
597void Migrator::show_importing()
598{
599 dout(10) << "show_importing" << dendl;
600 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
601 p != import_state.end();
602 ++p) {
603 CDir *dir = mds->mdcache->get_dirfrag(p->first);
604 if (dir) {
605 dout(10) << " importing from " << p->second.peer
606 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
607 << " " << p->first << " " << *dir << dendl;
608 } else {
609 dout(10) << " importing from " << p->second.peer
610 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
611 << " " << p->first << dendl;
612 }
613 }
614}
615
616void Migrator::show_exporting()
617{
618 dout(10) << "show_exporting" << dendl;
619 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
620 p != export_state.end();
621 ++p)
622 dout(10) << " exporting to " << p->second.peer
623 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
624 << " " << p->first->dirfrag() << " " << *p->first << dendl;
625}
626
627
628
629void Migrator::audit()
630{
631 if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
632 return; // hrm.
633
634 // import_state
635 show_importing();
636 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
637 p != import_state.end();
638 ++p) {
639 if (p->second.state == IMPORT_DISCOVERING)
640 continue;
641 if (p->second.state == IMPORT_DISCOVERED) {
642 CInode *in = cache->get_inode(p->first.ino);
643 assert(in);
644 continue;
645 }
646 CDir *dir = cache->get_dirfrag(p->first);
647 assert(dir);
648 if (p->second.state == IMPORT_PREPPING)
649 continue;
650 if (p->second.state == IMPORT_ABORTING) {
651 assert(!dir->is_ambiguous_dir_auth());
652 assert(dir->get_dir_auth().first != mds->get_nodeid());
653 continue;
654 }
655 assert(dir->is_ambiguous_dir_auth());
656 assert(dir->authority().first == mds->get_nodeid() ||
657 dir->authority().second == mds->get_nodeid());
658 }
659
660 // export_state
661 show_exporting();
662 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
663 p != export_state.end();
664 ++p) {
665 CDir *dir = p->first;
666 if (p->second.state == EXPORT_LOCKING ||
667 p->second.state == EXPORT_DISCOVERING ||
668 p->second.state == EXPORT_FREEZING ||
669 p->second.state == EXPORT_CANCELLING)
670 continue;
671 assert(dir->is_ambiguous_dir_auth());
672 assert(dir->authority().first == mds->get_nodeid() ||
673 dir->authority().second == mds->get_nodeid());
674 }
675
676 // ambiguous+me subtrees should be importing|exporting
677
678 // write me
679}
680
681
682
683
684
685// ==========================================================
686// EXPORT
687
688void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
689{
690 // enqueue
691 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
692 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
693
694 maybe_do_queued_export();
695}
696
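// Drain the export queue, starting queued exports only while export_state
// holds at most four in-flight exports; the static 'running' flag guards
// against recursive re-entry while the loop is running.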
697void Migrator::maybe_do_queued_export()
698{
699 static bool running;
700 if (running)
701 return;
702 running = true;
703 while (!export_queue.empty() &&
704 export_state.size() <= 4) {
705 dirfrag_t df = export_queue.front().first;
706 mds_rank_t dest = export_queue.front().second;
707 export_queue.pop_front();
708
709 CDir *dir = mds->mdcache->get_dirfrag(df);
710 if (!dir) continue;
711 if (!dir->is_auth()) continue;
712
713 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
714
715 export_dir(dir, dest);
716 }
717 running = false;
718}
719
720
721
722
723class C_MDC_ExportFreeze : public MigratorContext {
724 CDir *ex; // dir i'm exporting
725 uint64_t tid;
726public:
727 C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
728 MigratorContext(m), ex(e), tid(t) {
729 assert(ex != NULL);
730 }
731 void finish(int r) override {
732 if (r >= 0)
733 mig->export_frozen(ex, tid);
734 }
735};
736
737
738void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
739{
740 // path
741 vector<CDentry*> trace;
742 cache->make_trace(trace, dir->inode);
743 for (vector<CDentry*>::iterator it = trace.begin();
744 it != trace.end();
745 ++it)
746 locks.insert(&(*it)->lock);
747
748 // prevent scatter gather race
749 locks.insert(&dir->get_inode()->dirfragtreelock);
750
751 // bound dftlocks:
752 // NOTE: We need to take an rdlock on bounding dirfrags during
753 // migration for a rather irritating reason: when we export the
754 // bound inode, we need to send scatterlock state for the dirfrags
755 // as well, so that the new auth also gets the correct info. If we
756 // race with a refragment, this info is useless, as we can't
757 // redivvy it up. And it's needed for the scatterlocks to work
758 // properly: when the auth is in a sync/lock state it keeps each
759 // dirfrag's portion in the local (auth OR replica) dirfrag.
760 set<CDir*> wouldbe_bounds;
761 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
762 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
763 locks.insert(&(*p)->get_inode()->dirfragtreelock);
764}
765
766
767class C_M_ExportDirWait : public MigratorContext {
768 MDRequestRef mdr;
769 int count;
770public:
771  C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
772 : MigratorContext(m), mdr(mdr), count(count) {}
773 void finish(int r) override {
774 mig->dispatch_export_dir(mdr, count);
775 }
776};
777
778
779/** export_dir(dir, dest)
780 * public method to initiate an export.
781 * will fail if the directory is freezing, frozen, unpinnable, or root.
782 */
783void Migrator::export_dir(CDir *dir, mds_rank_t dest)
784{
785 dout(7) << "export_dir " << *dir << " to " << dest << dendl;
786 assert(dir->is_auth());
787 assert(dest != mds->get_nodeid());
788
789 if (!(mds->is_active() || mds->is_stopping())) {
790 dout(7) << "i'm not active, no exports for now" << dendl;
791 return;
792 }
793 if (mds->mdcache->is_readonly()) {
794 dout(7) << "read-only FS, no exports for now" << dendl;
795 return;
796 }
797 if (!mds->mdsmap->is_active(dest)) {
798 dout(7) << "dest not active, no exports for now" << dendl;
799 return;
800 }
801 if (mds->is_cluster_degraded()) {
802 dout(7) << "cluster degraded, no exports for now" << dendl;
803 return;
804 }
805 if (dir->inode->is_system()) {
806 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
807 //ceph_abort();
808 return;
809 }
810
811 CDir* parent_dir = dir->inode->get_projected_parent_dir();
812 if (parent_dir && parent_dir->inode->is_stray()) {
813 if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
814 dout(7) << "i won't export anything in stray" << dendl;
815 return;
816 }
817 } else {
818 if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
819 dout(7) << "dir is export pinned" << dendl;
820 return;
821 }
822 }
823
824 if (dir->is_frozen() ||
825 dir->is_freezing()) {
826 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
827 return;
828 }
829 if (dir->state_test(CDir::STATE_EXPORTING)) {
830 dout(7) << "already exporting" << dendl;
831 return;
832 }
833
834 if (g_conf->mds_thrash_exports) {
835 // create random subtree bound (which will not be exported)
836 list<CDir*> ls;
837 for (auto p = dir->begin(); p != dir->end(); ++p) {
838 auto dn = p->second;
839 CDentry::linkage_t *dnl= dn->get_linkage();
840 if (dnl->is_primary()) {
841 CInode *in = dnl->get_inode();
842 if (in->is_dir())
843 in->get_nested_dirfrags(ls);
844 }
845 }
846 if (ls.size() > 0) {
847 int n = rand() % ls.size();
848 auto p = ls.begin();
849 while (n--) ++p;
850 CDir *bd = *p;
851 if (!(bd->is_frozen() || bd->is_freezing())) {
852 assert(bd->is_auth());
853 dir->state_set(CDir::STATE_AUXSUBTREE);
854 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
855 dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
856 }
857 }
858 }
859
860 mds->hit_export_target(ceph_clock_now(), dest, -1);
861
862 dir->auth_pin(this);
863 dir->state_set(CDir::STATE_EXPORTING);
864
865 MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
866 mdr->more()->export_dir = dir;
867
868 assert(export_state.count(dir) == 0);
869 export_state_t& stat = export_state[dir];
870 stat.state = EXPORT_LOCKING;
871 stat.peer = dest;
872 stat.tid = mdr->reqid.tid;
873 stat.mut = mdr;
874
875 return mds->mdcache->dispatch_request(mdr);
876}
877
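// Second stage of export_dir(), run as the internal CEPH_MDS_OP_EXPORTDIR
// request: confirm the destination is an export target, take the rdlocks and
// wrlocks gathered by get_export_lock_set(), then send MExportDirDiscover and
// start freezing the tree.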
878void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
879{
880 dout(7) << "dispatch_export_dir " << *mdr << dendl;
881
882 CDir *dir = mdr->more()->export_dir;
883 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
884 if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
885 // export must have aborted.
886 dout(7) << "export must have aborted " << *mdr << dendl;
887 mds->mdcache->request_finish(mdr);
888 return;
889 }
890 assert(it->second.state == EXPORT_LOCKING);
891
892 mds_rank_t dest = it->second.peer;
893
894 if (!mds->is_export_target(dest)) {
895 dout(7) << "dest is not yet an export target" << dendl;
896 if (count > 3) {
897 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
898 export_try_cancel(dir);
899 return;
900 }
901
902 mds->locker->drop_locks(mdr.get());
903 mdr->drop_local_auth_pins();
904
905	mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
906 return;
907 }
908
909 if (!dir->inode->get_parent_dn()) {
910 dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
911	dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
912 return;
913 }
914
915 if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
916 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
917 export_try_cancel(dir);
918 return;
919 }
920
921 // locks?
922 set<SimpleLock*> rdlocks;
923 set<SimpleLock*> xlocks;
924 set<SimpleLock*> wrlocks;
925 get_export_lock_set(dir, rdlocks);
926  // If the auth MDS of the subtree root inode is neither the exporter MDS
927  // nor the importer MDS, it may gather the subtree root's fragstat/neststat
928  // while the subtree is being exported. The exporter MDS and the importer
929  // MDS may both be auth for the subtree root, or both be non-auth, at the
930  // time they receive the lock messages, so the auth MDS of the subtree root
931  // inode may receive the fragstat/neststat for the subtree root dirfrag
932  // either not at all or twice.
933 wrlocks.insert(&dir->get_inode()->filelock);
934 wrlocks.insert(&dir->get_inode()->nestlock);
935 if (dir->get_inode()->is_auth()) {
936 dir->get_inode()->filelock.set_scatter_wanted();
937 dir->get_inode()->nestlock.set_scatter_wanted();
938 }
939
940 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
941 if (mdr->aborted)
942 export_try_cancel(dir);
943 return;
944 }
945
946 assert(g_conf->mds_kill_export_at != 1);
947 it->second.state = EXPORT_DISCOVERING;
948
949 // send ExportDirDiscover (ask target)
950 filepath path;
951 dir->inode->make_path(path);
952 MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
953 mds->get_nodeid(),
954 it->second.tid);
955 mds->send_message_mds(discover, dest);
956 assert(g_conf->mds_kill_export_at != 2);
957
958 it->second.last_cum_auth_pins_change = ceph_clock_now();
959
960 // start the freeze, but hold it up with an auth_pin.
961 dir->freeze_tree();
962 assert(dir->is_freezing_tree());
963 dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
964}
965
966/*
967 * called on receipt of MExportDirDiscoverAck
968 * the importer now has the directory's _inode_ in memory, and pinned.
969 *
970 * This function DOES put the passed message before returning
971 */
972void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
973{
974 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
975 mds_rank_t dest(m->get_source().num());
976 utime_t now = ceph_clock_now();
977 assert(dir);
978
979 dout(7) << "export_discover_ack from " << m->get_source()
980 << " on " << *dir << dendl;
981
982 mds->hit_export_target(now, dest, -1);
983
984 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
985 if (it == export_state.end() ||
986 it->second.tid != m->get_tid() ||
987 it->second.peer != dest) {
988 dout(7) << "must have aborted" << dendl;
989 } else {
990 assert(it->second.state == EXPORT_DISCOVERING);
991
992 if (m->is_success()) {
993 // release locks to avoid deadlock
994 MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
995 assert(mdr);
996 mds->mdcache->request_finish(mdr);
997 it->second.mut.reset();
998 // freeze the subtree
999 it->second.state = EXPORT_FREEZING;
1000 dir->auth_unpin(this);
1001 assert(g_conf->mds_kill_export_at != 3);
1002
1003 } else {
1004 dout(7) << "peer failed to discover (not active?), canceling" << dendl;
1005 export_try_cancel(dir, false);
1006 }
1007 }
1008
1009 m->put(); // done
1010}
1011
1012class C_M_ExportSessionsFlushed : public MigratorContext {
1013 CDir *dir;
1014 uint64_t tid;
1015public:
1016 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1017 : MigratorContext(m), dir(d), tid(t) {
1018 assert(dir != NULL);
1019 }
1020 void finish(int r) override {
1021 mig->export_sessions_flushed(dir, tid);
1022 }
1023};
1024
1025void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
1026{
1027 dout(7) << "export_sessions_flushed " << *dir << dendl;
1028
1029 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1030 if (it == export_state.end() ||
1031 it->second.state == EXPORT_CANCELLING ||
1032 it->second.tid != tid) {
1033 // export must have aborted.
1034 dout(7) << "export must have aborted on " << dir << dendl;
1035 return;
1036 }
1037
1038 assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
1039 assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
1040 it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
1041 if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
1042 export_go(dir); // start export.
1043}
1044
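// Called when the tree freeze completes. Re-checks that the export locks can
// still be taken (otherwise the freeze is undone and the export aborted),
// then builds and sends MExportDirPrep: the base dirfrag, the bystander list,
// and a spanning-tree trace out to every export bound.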
1045void Migrator::export_frozen(CDir *dir, uint64_t tid)
1046{
1047 dout(7) << "export_frozen on " << *dir << dendl;
1048
1049 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1050 if (it == export_state.end() || it->second.tid != tid) {
1051 dout(7) << "export must have aborted" << dendl;
1052 return;
1053 }
1054
1055 assert(it->second.state == EXPORT_FREEZING);
1056 assert(dir->is_frozen_tree_root());
1057 assert(dir->get_cum_auth_pins() == 0);
1058
1059 CInode *diri = dir->get_inode();
1060
1061 // ok, try to grab all my locks.
1062 set<SimpleLock*> rdlocks;
1063 get_export_lock_set(dir, rdlocks);
1064 if ((diri->is_auth() && diri->is_frozen()) ||
1065 !mds->locker->can_rdlock_set(rdlocks) ||
1066 !diri->filelock.can_wrlock(-1) ||
1067 !diri->nestlock.can_wrlock(-1)) {
1068 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1069 << *dir << dendl;
1070 // .. unwind ..
1071 dir->unfreeze_tree();
1072    cache->try_subtree_merge(dir);
1073
1074 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
1075    export_state.erase(it);
1076
1077 dir->state_clear(CDir::STATE_EXPORTING);
1078 cache->maybe_send_pending_resolves();
1079 return;
1080 }
1081
1082 it->second.mut = new MutationImpl();
1083 if (diri->is_auth())
1084 it->second.mut->auth_pin(diri);
1085 mds->locker->rdlock_take_set(rdlocks, it->second.mut);
1086 mds->locker->wrlock_force(&diri->filelock, it->second.mut);
1087 mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
1088
1089 cache->show_subtrees();
1090
1091 // CDir::_freeze_tree() should have forced it into subtree.
1092 assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
1093
1094 set<client_t> export_client_set;
1095 check_export_size(dir, it->second, export_client_set);
1096
1097  // note the bounds.
1098 set<CDir*> bounds;
1099 cache->get_subtree_bounds(dir, bounds);
1100
1101 // generate prep message, log entry.
1102 MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
1103
1104 // include list of bystanders
1105 for (const auto &p : dir->get_replicas()) {
1106 if (p.first != it->second.peer) {
1107 dout(10) << "bystander mds." << p.first << dendl;
1108 prep->add_bystander(p.first);
1109 }
1110 }
1111
1112 // include base dirfrag
1113 cache->replicate_dir(dir, it->second.peer, prep->basedir);
1114
1115 /*
1116 * include spanning tree for all nested exports.
1117 * these need to be on the destination _before_ the final export so that
1118 * dir_auth updates on any nested exports are properly absorbed.
1119 * this includes inodes and dirfrags included in the subtree, but
1120 * only the inodes at the bounds.
1121 *
1122 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1123 */
1124 set<inodeno_t> inodes_added;
1125 set<dirfrag_t> dirfrags_added;
1126
1127 // check bounds
1128 for (set<CDir*>::iterator p = bounds.begin();
1129 p != bounds.end();
1130 ++p) {
1131 CDir *bound = *p;
1132
1133 // pin it.
1134    assert(bound->state_test(CDir::STATE_EXPORTBOUND));
1135
1136 dout(7) << " export bound " << *bound << dendl;
1137 prep->add_bound( bound->dirfrag() );
1138
1139 // trace to bound
1140 bufferlist tracebl;
1141 CDir *cur = bound;
1142
1143    char start = '-';
1144 if (it->second.residual_dirs.count(bound)) {
1145 start = 'f';
1146 cache->replicate_dir(bound, it->second.peer, tracebl);
1147 dout(7) << " added " << *bound << dendl;
1148 }
1149
1150 while (1) {
1151 // don't repeat inodes
1152 if (inodes_added.count(cur->inode->ino()))
1153 break;
1154 inodes_added.insert(cur->inode->ino());
1155
1156 // prepend dentry + inode
1157 assert(cur->inode->is_auth());
1158 bufferlist bl;
1159 cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
1160 dout(7) << " added " << *cur->inode->parent << dendl;
1161 cache->replicate_inode(cur->inode, it->second.peer, bl,
1162 mds->mdsmap->get_up_features());
1163 dout(7) << " added " << *cur->inode << dendl;
1164 bl.claim_append(tracebl);
1165 tracebl.claim(bl);
1166
1167 cur = cur->get_parent_dir();
1168
1169 // don't repeat dirfrags
1170 if (dirfrags_added.count(cur->dirfrag()) ||
1171 cur == dir) {
1172 start = 'd'; // start with dentry
1173 break;
1174 }
1175 dirfrags_added.insert(cur->dirfrag());
1176
1177 // prepend dir
1178 cache->replicate_dir(cur, it->second.peer, bl);
1179 dout(7) << " added " << *cur << dendl;
1180 bl.claim_append(tracebl);
1181 tracebl.claim(bl);
1182
1183 start = 'f'; // start with dirfrag
1184 }
1185 bufferlist final_bl;
1186 dirfrag_t df = cur->dirfrag();
1187 ::encode(df, final_bl);
1188 ::encode(start, final_bl);
1189 final_bl.claim_append(tracebl);
1190 prep->add_trace(final_bl);
1191 }
1192
1193 // send.
1194 it->second.state = EXPORT_PREPPING;
1195 mds->send_message_mds(prep, it->second.peer);
1196 assert (g_conf->mds_kill_export_at != 4);
1197
1198 // make sure any new instantiations of caps are flushed out
1199 assert(it->second.warning_ack_waiting.empty());
1200
7c673cae
FG
1201 MDSGatherBuilder gather(g_ceph_context);
1202 mds->server->flush_client_sessions(export_client_set, gather);
1203 if (gather.has_subs()) {
1204 it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
1205 gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
1206 gather.activate();
1207 }
1208}
1209
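// Rough size accounting for the export: walk the subtree adding per-dirfrag,
// per-inode, per-cap, per-link and per-null estimates (the constants below
// are approximations, not exact wire sizes). Once mds_max_export_size is
// reached, the remaining dirfrags become new subtree bounds and are recorded
// in stat.residual_dirs so they stay behind on this rank.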
1210void Migrator::check_export_size(CDir *dir, export_state_t& stat, set<client_t>& client_set)
1211{
1212 const unsigned frag_size = 800;
1213 const unsigned inode_size = 1000;
1214 const unsigned cap_size = 80;
1215 const unsigned link_size = 10;
1216 const unsigned null_size = 1;
1217
1218 uint64_t max_size = g_conf->get_val<uint64_t>("mds_max_export_size");
1219 uint64_t approx_size = 0;
1220
1221 list<CDir*> dfs;
1222 dfs.push_back(dir);
1223 while (!dfs.empty()) {
1224 CDir *dir = dfs.front();
1225 dfs.pop_front();
1226
1227 approx_size += frag_size;
1228 for (auto &p : *dir) {
1229 CDentry *dn = p.second;
1230 if (dn->get_linkage()->is_null()) {
1231 approx_size += null_size;
1232 continue;
1233 }
1234 if (dn->get_linkage()->is_remote()) {
1235 approx_size += link_size;
1236	continue;
1237 }
1238
1239 approx_size += inode_size;
1240 CInode *in = dn->get_linkage()->get_inode();
1241 if (in->is_dir()) {
1242 // directory?
1243 list<CDir*> ls;
1244 in->get_dirfrags(ls);
1245 for (auto q : ls) {
1246 if (q->is_subtree_root()) {
1247 q->state_set(CDir::STATE_EXPORTBOUND);
1248 q->get(CDir::PIN_EXPORTBOUND);
1249 } else {
1250	    // include nested dirfrag
1251 assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
1252 dfs.push_front(q);
1253 }
1254 }
1255 }
1256 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1257 q != in->client_caps.end();
1258 ++q) {
1259 approx_size += cap_size;
1260	client_set.insert(q->first);
1261      }
1262    }
1263
1264 if (approx_size >= max_size)
1265 break;
1266 }
1267
1268 while (!dfs.empty()) {
1269 CDir *dir = dfs.front();
1270 dfs.pop_front();
1271
1272 dout(7) << "check_export_size: creating bound " << *dir << dendl;
1273 assert(dir->is_auth());
1274 dir->state_set(CDir::STATE_EXPORTBOUND);
1275 dir->get(CDir::PIN_EXPORTBOUND);
1276
1277 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
1278 // Another choice here is finishing all WAIT_UNFREEZE contexts and keeping
 1279 // the newly created subtree unfrozen.
1280 dir->_freeze_tree();
1281
1282 stat.residual_dirs.insert(dir);
1283 }
1284}
1285
1286void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1287{
1288 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1289 q != in->client_caps.end();
1290 ++q)
1291 client_set.insert(q->first);
1292}
1293
1294/* This function DOES put the passed message before returning*/
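// On a successful prep ack, send an MExportDirNotify "warning" to every other
// active replica of this dirfrag and wait for all of their acks (plus the
// MDS_RANK_NONE placeholder for the session flush, if still pending) before
// export_go() actually ships the subtree.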
1295void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
1296{
1297 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1298 mds_rank_t dest(m->get_source().num());
1299 utime_t now = ceph_clock_now();
1300 assert(dir);
1301
1302 dout(7) << "export_prep_ack " << *dir << dendl;
1303
1304 mds->hit_export_target(now, dest, -1);
1305
1306 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1307 if (it == export_state.end() ||
1308 it->second.tid != m->get_tid() ||
1309 it->second.peer != mds_rank_t(m->get_source().num())) {
1310 // export must have aborted.
1311 dout(7) << "export must have aborted" << dendl;
1312 m->put();
1313 return;
1314 }
1315 assert(it->second.state == EXPORT_PREPPING);
1316
1317 if (!m->is_success()) {
1318    dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
1319 export_try_cancel(dir, false);
1320 m->put();
1321 return;
1322 }
1323
1324 assert (g_conf->mds_kill_export_at != 5);
1325 // send warnings
1326 set<CDir*> bounds;
1327 cache->get_subtree_bounds(dir, bounds);
1328
1329 assert(it->second.warning_ack_waiting.empty() ||
1330 (it->second.warning_ack_waiting.size() == 1 &&
1331 it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
1332 assert(it->second.notify_ack_waiting.empty());
1333
1334 for (const auto &p : dir->get_replicas()) {
1335 if (p.first == it->second.peer) continue;
1336    if (mds->is_cluster_degraded() &&
1337	!mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
1338      continue; // only if active
1339 it->second.warning_ack_waiting.insert(p.first);
1340 it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too!
1341
1342 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
1343 mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1344 mds_authority_t(mds->get_nodeid(),it->second.peer));
1345 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
1346 notify->get_bounds().push_back((*q)->dirfrag());
1347    mds->send_message_mds(notify, p.first);
1348
1349 }
1350
1351 it->second.state = EXPORT_WARNING;
1352
1353 assert(g_conf->mds_kill_export_at != 6);
1354 // nobody to warn?
1355 if (it->second.warning_ack_waiting.empty())
1356 export_go(dir); // start export.
1357
1358 // done.
1359 m->put();
1360}
1361
1362
1363class C_M_ExportGo : public MigratorContext {
1364 CDir *dir;
1365 uint64_t tid;
1366public:
1367 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1368 MigratorContext(m), dir(d), tid(t) {
1369 assert(dir != NULL);
1370 }
1371 void finish(int r) override {
1372 mig->export_go_synced(dir, tid);
1373 }
1374};
1375
1376void Migrator::export_go(CDir *dir)
1377{
1378 auto it = export_state.find(dir);
1379 assert(it != export_state.end());
1380 dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl;
1381
1382 // first sync log to flush out e.g. any cap imports
1383  mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid));
1384 mds->mdlog->flush();
1385}
1386
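// Runs once the journal flush requested by export_go() is safe: flip the
// subtree to ambiguous auth (me, dest), encode the whole subtree plus the
// exported client map into MExportDir, list the bound dirfrags, and send it.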
1387void Migrator::export_go_synced(CDir *dir, uint64_t tid)
1388{
1389 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1390 if (it == export_state.end() ||
1391 it->second.state == EXPORT_CANCELLING ||
1392 it->second.tid != tid) {
1393 // export must have aborted.
1394 dout(7) << "export must have aborted on " << dir << dendl;
1395 return;
1396 }
1397 assert(it->second.state == EXPORT_WARNING);
1398 mds_rank_t dest = it->second.peer;
1399
1400 dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
1401
1402 cache->show_subtrees();
1403
1404 it->second.state = EXPORT_EXPORTING;
1405 assert(g_conf->mds_kill_export_at != 7);
1406
1407 assert(dir->is_frozen_tree_root());
1408 assert(dir->get_cum_auth_pins() == 0);
1409
1410 // set ambiguous auth
1411 cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
1412
1413 // take away the popularity we're sending.
1414 utime_t now = ceph_clock_now();
1415 mds->balancer->subtract_export(dir, now);
1416
1417 // fill export message with cache data
1418 MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
1419 map<client_t,entity_inst_t> exported_client_map;
1420 uint64_t num_exported_inodes = encode_export_dir(req->export_data,
1421 dir, // recur start point
1422 exported_client_map,
1423 now);
1424 ::encode(exported_client_map, req->client_map,
1425 mds->mdsmap->get_up_features());
1426
1427 // add bounds to message
1428 set<CDir*> bounds;
1429 cache->get_subtree_bounds(dir, bounds);
1430 for (set<CDir*>::iterator p = bounds.begin();
1431 p != bounds.end();
1432 ++p)
1433 req->add_export((*p)->dirfrag());
1434
1435 // send
1436 mds->send_message_mds(req, dest);
1437 assert(g_conf->mds_kill_export_at != 8);
1438
1439 mds->hit_export_target(now, dest, num_exported_inodes+1);
1440
1441 // stats
1442 if (mds->logger) mds->logger->inc(l_mds_exported);
1443 if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
1444
1445 cache->show_subtrees();
1446}
1447
1448
1449/** encode_export_inode
1450 * update our local state for this inode to export.
1451 * encode relevant state to be sent over the wire.
1452 * used by: encode_export_dir, file_rename (if foreign)
1453 *
1454 * FIXME: the separation between CInode.encode_export and these methods
1455 * is pretty arbitrary and dumb.
1456 */
1457void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
1458 map<client_t,entity_inst_t>& exported_client_map)
1459{
1460 dout(7) << "encode_export_inode " << *in << dendl;
1461 assert(!in->is_replica(mds->get_nodeid()));
1462
1463 // relax locks?
1464 if (!in->is_replicated()) {
1465 in->replicate_relax_locks();
1466 dout(20) << " did replicate_relax_locks, now " << *in << dendl;
1467 }
1468
1469 ::encode(in->inode.ino, enc_state);
1470 ::encode(in->last, enc_state);
1471 in->encode_export(enc_state);
1472
1473 // caps
1474 encode_export_inode_caps(in, true, enc_state, exported_client_map);
1475}
1476
1477void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
1478 map<client_t,entity_inst_t>& exported_client_map)
1479{
1480 dout(20) << "encode_export_inode_caps " << *in << dendl;
1481
1482 // encode caps
1483 map<client_t,Capability::Export> cap_map;
1484 in->export_client_caps(cap_map);
1485 ::encode(cap_map, bl);
1486 if (auth_cap) {
1487 ::encode(in->get_mds_caps_wanted(), bl);
1488
1489 in->state_set(CInode::STATE_EXPORTINGCAPS);
1490 in->get(CInode::PIN_EXPORTINGCAPS);
1491 }
1492
1493 // make note of clients named by exported capabilities
1494 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1495 it != in->client_caps.end();
1496 ++it)
1497 exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
1498}
1499
1500void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1501 map<client_t,Capability::Import>& peer_imported)
1502{
1503 dout(20) << "finish_export_inode_caps " << *in << dendl;
1504
1505 in->state_clear(CInode::STATE_EXPORTINGCAPS);
1506 in->put(CInode::PIN_EXPORTINGCAPS);
1507
1508 // tell (all) clients about migrating caps..
1509 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1510 it != in->client_caps.end();
1511 ++it) {
1512 Capability *cap = it->second;
1513 dout(7) << "finish_export_inode_caps telling client." << it->first
1514 << " exported caps on " << *in << dendl;
1515 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1516 cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
1517
1518 map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
1519 assert(q != peer_imported.end());
1520 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
1521 (q->second.cap_id > 0 ? peer : -1), 0);
1522 mds->send_message_client_counted(m, it->first);
1523 }
1524 in->clear_client_caps_after_export();
1525 mds->locker->eval(in, CEPH_CAP_LOCKS);
1526}
1527
1528void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
1529 map<client_t,Capability::Import>& peer_imported,
1530 list<MDSInternalContextBase*>& finished)
1531{
1532 dout(12) << "finish_export_inode " << *in << dendl;
1533
1534 // clean
1535 if (in->is_dirty())
1536 in->mark_clean();
1537
1538 // clear/unpin cached_by (we're no longer the authority)
1539 in->clear_replica_map();
1540
1541 // twiddle lock states for auth -> replica transition
1542 in->authlock.export_twiddle();
1543 in->linklock.export_twiddle();
1544 in->dirfragtreelock.export_twiddle();
1545 in->filelock.export_twiddle();
1546 in->nestlock.export_twiddle();
1547 in->xattrlock.export_twiddle();
1548 in->snaplock.export_twiddle();
1549 in->flocklock.export_twiddle();
1550 in->policylock.export_twiddle();
1551
1552 // mark auth
1553 assert(in->is_auth());
1554 in->state_clear(CInode::STATE_AUTH);
1555 in->replica_nonce = CInode::EXPORT_NONCE;
1556
1557 in->clear_dirty_rstat();
1558
1559 // no more auth subtree? clear scatter dirty
1560 if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
1561 in->clear_scatter_dirty();
1562
1563 in->item_open_file.remove_myself();
1564
1565 in->clear_dirty_parent();
1566
1567 in->clear_file_locks();
1568
1569 // waiters
1570 in->take_waiting(CInode::WAIT_ANY_MASK, finished);
1571
1572 in->finish_export(now);
1573
1574 finish_export_inode_caps(in, peer, peer_imported);
1575}
1576
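// Encode one dirfrag and its dentries into the MExportDir payload. After each
// dentry's name, snap last and state comes a one-byte tag: 'N' for a null
// dentry, 'L' for a remote link (ino + d_type), 'I' for a primary link
// followed by the full inode export; nested dirfrags are encoded recursively.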
1577uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
1578 CDir *dir,
1579 map<client_t,entity_inst_t>& exported_client_map,
1580 utime_t now)
1581{
1582 uint64_t num_exported = 0;
1583
1584 dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
1585
1586 assert(dir->get_projected_version() == dir->get_version());
1587
1588#ifdef MDS_VERIFY_FRAGSTAT
1589 if (dir->is_complete())
1590 dir->verify_fragstat();
1591#endif
1592
1593 // dir
1594 dirfrag_t df = dir->dirfrag();
1595 ::encode(df, exportbl);
1596 dir->encode_export(exportbl);
1597
1598 __u32 nden = dir->items.size();
1599 ::encode(nden, exportbl);
1600
1601 // dentries
1602 list<CDir*> subdirs;
1603 for (auto &p : *dir) {
1604 CDentry *dn = p.second;
1605 CInode *in = dn->get_linkage()->get_inode();
1606
1607 if (!dn->is_replicated())
1608 dn->lock.replicate_relax();
1609
1610 num_exported++;
1611
1612 // -- dentry
1613 dout(7) << "encode_export_dir exporting " << *dn << dendl;
1614
1615 // dn name
1616    ::encode(dn->get_name(), exportbl);
1617 ::encode(dn->last, exportbl);
1618
1619 // state
1620 dn->encode_export(exportbl);
1621
1622 // points to...
1623
1624 // null dentry?
1625 if (dn->get_linkage()->is_null()) {
1626 exportbl.append("N", 1); // null dentry
1627 continue;
1628 }
1629
1630 if (dn->get_linkage()->is_remote()) {
1631 // remote link
1632 exportbl.append("L", 1); // remote link
1633
1634 inodeno_t ino = dn->get_linkage()->get_remote_ino();
1635 unsigned char d_type = dn->get_linkage()->get_remote_d_type();
1636 ::encode(ino, exportbl);
1637 ::encode(d_type, exportbl);
1638 continue;
1639 }
1640
1641 // primary link
1642 // -- inode
1643 exportbl.append("I", 1); // inode dentry
1644
1645 encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export
1646
1647 // directory?
1648 list<CDir*> dfs;
1649 in->get_dirfrags(dfs);
1650 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
1651 CDir *t = *p;
1652 if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
1653 // include nested dirfrag
1654 assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
1655	subdirs.push_front(t);  // it's ours, recurse (later)
1656 }
1657 }
1658 }
1659
1660 // subdirs
1661 for (auto &dir : subdirs)
1662 num_exported += encode_export_dir(exportbl, dir, exported_client_map, now);
1663
1664 return num_exported;
1665}
1666
1667void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
1668 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
1669 list<MDSInternalContextBase*>& finished, int *num_dentries)
1670{
1671 dout(10) << "finish_export_dir " << *dir << dendl;
1672
1673 // release open_by
1674 dir->clear_replica_map();
1675
1676 // mark
1677 assert(dir->is_auth());
1678 dir->state_clear(CDir::STATE_AUTH);
1679 dir->remove_bloom();
1680 dir->replica_nonce = CDir::EXPORT_NONCE;
1681
1682 if (dir->is_dirty())
1683 dir->mark_clean();
1684
1685 // suck up all waiters
1686 dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters
1687
1688 // pop
1689 dir->finish_export(now);
1690
1691 // dentries
1692 list<CDir*> subdirs;
1693 for (auto &p : *dir) {
1694 CDentry *dn = p.second;
1695 CInode *in = dn->get_linkage()->get_inode();
1696
1697 // dentry
1698 dn->finish_export();
1699
1700 // inode?
1701 if (dn->get_linkage()->is_primary()) {
1702 finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);
1703
1704 // subdirs?
1705 in->get_nested_dirfrags(subdirs);
1706 }
1707
1708 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
1709 ++(*num_dentries);
1710 }
1711
1712 // subdirs
1713 for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
1714 finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
1715}
1716
1717class C_MDS_ExportFinishLogged : public MigratorLogContext {
1718 CDir *dir;
1719public:
1720 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1721 void finish(int r) override {
1722 mig->export_logged_finish(dir);
1723 }
1724};
1725
1726
1727/*
1728 * i should get an export_ack from the export target.
1729 *
1730 * This function DOES put the passed message before returning
1731 */
1732void Migrator::handle_export_ack(MExportDirAck *m)
1733{
1734 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1735 mds_rank_t dest(m->get_source().num());
1736 utime_t now = ceph_clock_now();
1737 assert(dir);
1738 assert(dir->is_frozen_tree_root()); // i'm exporting!
1739
1740 // yay!
1741 dout(7) << "handle_export_ack " << *dir << dendl;
1742
1743 mds->hit_export_target(now, dest, -1);
1744
1745 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1746 assert(it != export_state.end());
1747 assert(it->second.state == EXPORT_EXPORTING);
1748 assert(it->second.tid == m->get_tid());
1749
1750 bufferlist::iterator bp = m->imported_caps.begin();
1751 ::decode(it->second.peer_imported, bp);
1752
1753 it->second.state = EXPORT_LOGGINGFINISH;
1754 assert (g_conf->mds_kill_export_at != 9);
1755 set<CDir*> bounds;
1756 cache->get_subtree_bounds(dir, bounds);
1757
1758 // log completion.
1759 // include export bounds, to ensure they're in the journal.
1760  EExport *le = new EExport(mds->mdlog, dir, it->second.peer);
1761 mds->mdlog->start_entry(le);
1762
1763 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
1764  le->metablob.add_dir(dir, false);
1765 for (set<CDir*>::iterator p = bounds.begin();
1766 p != bounds.end();
1767 ++p) {
1768 CDir *bound = *p;
1769 le->get_bounds().insert(bound->dirfrag());
1770 le->metablob.add_dir_context(bound);
1771 le->metablob.add_dir(bound, false);
1772 }
1773
1774 // list us second, them first.
1775 // this keeps authority().first in sync with subtree auth state in the journal.
1776 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1777
7c673cae
FG
1778 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1779 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1780 mds->mdlog->flush();
1781 assert (g_conf->mds_kill_export_at != 10);
1782
1783 m->put();
1784}
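// [illustrative sketch, not part of the original source]
// Assuming mds_authority_t is a pair<mds_rank_t,mds_rank_t>: while the EExport entry is
// in flight the subtree auth is the ambiguous pair (peer, me) set above; export_finish()
// later collapses it to the peer alone.  'peer'/'my_rank' stand for it->second.peer and
// mds->get_nodeid():
//
//   cache->adjust_subtree_auth(dir, peer, my_rank);  // here: ambiguous, peer listed first
//   cache->adjust_subtree_auth(dir, peer);           // in export_finish(): peer only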
1785
b32b8144 1786void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
7c673cae
FG
1787{
1788 dout(7) << "export_notify_abort " << *dir << dendl;
1789
7c673cae
FG
1790 assert(stat.state == EXPORT_CANCELLING);
1791
1792 if (stat.notify_ack_waiting.empty()) {
1793 stat.state = EXPORT_CANCELLED;
1794 return;
1795 }
1796
1797 dir->auth_pin(this);
1798
1799 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1800 p != stat.notify_ack_waiting.end();
1801 ++p) {
b32b8144
FG
1802 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1803 pair<int,int>(mds->get_nodeid(), stat.peer),
1804 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
7c673cae
FG
1805 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1806 notify->get_bounds().push_back((*i)->dirfrag());
1807 mds->send_message_mds(notify, *p);
1808 }
1809}
1810
1811/*
1812 * this happens if the dest fails after i send the export data but before it is acked
1813 * that is, we don't know they safely received and logged it, so we reverse our changes
1814 * and go on.
1815 */
b32b8144 1816void Migrator::export_reverse(CDir *dir, export_state_t& stat)
7c673cae
FG
1817{
1818 dout(7) << "export_reverse " << *dir << dendl;
1819
1820 set<CInode*> to_eval;
1821
1822 set<CDir*> bounds;
1823 cache->get_subtree_bounds(dir, bounds);
1824
1825 // remove exporting pins
1826 list<CDir*> rq;
1827 rq.push_back(dir);
1828 while (!rq.empty()) {
1829 CDir *t = rq.front();
1830 rq.pop_front();
1831 t->abort_export();
94b18763
FG
1832 for (auto &p : *t) {
1833 CDentry *dn = p.second;
1834 dn->abort_export();
1835 if (!dn->get_linkage()->is_primary())
7c673cae 1836 continue;
94b18763 1837 CInode *in = dn->get_linkage()->get_inode();
7c673cae
FG
1838 in->abort_export();
1839 if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
1840 in->state_clear(CInode::STATE_EVALSTALECAPS);
1841 to_eval.insert(in);
1842 }
1843 if (in->is_dir())
1844 in->get_nested_dirfrags(rq);
1845 }
1846 }
1847
1848 // unpin bounds
b32b8144 1849 for (auto bd : bounds) {
7c673cae
FG
1850 bd->put(CDir::PIN_EXPORTBOUND);
1851 bd->state_clear(CDir::STATE_EXPORTBOUND);
1852 }
1853
7c673cae 1854 // notify bystanders
b32b8144 1855 export_notify_abort(dir, stat, bounds);
7c673cae 1856
224ce89b
WB
1857 // unfreeze tree, with possible subtree merge.
1858 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
1859
7c673cae
FG
1860 // process delayed expires
1861 cache->process_delayed_expire(dir);
224ce89b 1862
7c673cae 1863 dir->unfreeze_tree();
224ce89b 1864 cache->try_subtree_merge(dir);
b32b8144
FG
1865 for (auto bd : stat.residual_dirs) {
1866 bd->unfreeze_tree();
1867 cache->try_subtree_merge(bd);
1868 }
7c673cae
FG
1869
1870 // revoke/resume stale caps
1871 for (auto in : to_eval) {
1872 bool need_issue = false;
1873 for (auto& p : in->get_client_caps()) {
1874 Capability *cap = p.second;
1875 if (cap->is_stale()) {
1876 mds->locker->revoke_stale_caps(cap);
1877 } else {
1878 need_issue = true;
1879 }
1880 }
1881 if (need_issue &&
1882 (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
1883 mds->locker->issue_caps(in);
1884 }
1885
1886 cache->show_cache();
1887}
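// [illustrative sketch, not part of the original source]
// export_reverse() walks the subtree with an explicit work queue rather than recursion.
// Generic shape of that walk; visit() is a hypothetical stand-in for the inline abort work:
//
//   list<CDir*> q;
//   q.push_back(root);
//   while (!q.empty()) {
//     CDir *t = q.front(); q.pop_front();
//     visit(t);                                   // t->abort_export(), dentry/inode cleanup
//     for (auto &p : *t) {
//       if (!p.second->get_linkage()->is_primary())
//         continue;
//       CInode *in = p.second->get_linkage()->get_inode();
//       if (in->is_dir())
//         in->get_nested_dirfrags(q);             // enqueue child dirfrags
//     }
//   }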
1888
1889
1890/*
1891 * once i get the ack, and logged the EExportFinish(true),
1892 * send notifies (if any), otherwise go straight to finish.
1893 *
1894 */
1895void Migrator::export_logged_finish(CDir *dir)
1896{
1897 dout(7) << "export_logged_finish " << *dir << dendl;
1898
1899 export_state_t& stat = export_state[dir];
1900
1901 // send notifies
1902 set<CDir*> bounds;
1903 cache->get_subtree_bounds(dir, bounds);
1904
1905 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1906 p != stat.notify_ack_waiting.end();
1907 ++p) {
1908 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1909 pair<int,int>(mds->get_nodeid(), stat.peer),
1910 pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
1911
1912 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1913 notify->get_bounds().push_back((*i)->dirfrag());
1914
1915 mds->send_message_mds(notify, *p);
1916 }
1917
1918 // wait for notifyacks
1919 stat.state = EXPORT_NOTIFYING;
1920 assert (g_conf->mds_kill_export_at != 11);
1921
1922 // no notifies to wait for?
1923 if (stat.notify_ack_waiting.empty()) {
1924 export_finish(dir); // skip notify/notify_ack stage.
1925 } else {
1926 // notify peer to send cap import messages to clients
1927 if (!mds->is_cluster_degraded() ||
1928 mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
1929 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
1930 } else {
1931 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
1932 }
1933 }
1934}
1935
1936/*
1937 * warning:
1938 * i'll get an ack from each bystander.
1939 * when i get them all, do the export.
1940 * notify:
1941 * i'll get an ack from each bystander.
1942 * when i get them all, unfreeze and send the finish.
1943 *
1944 * This function DOES put the passed message before returning
1945 */
1946void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
1947{
1948 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1949 mds_rank_t dest(m->get_source().num());
1950 utime_t now = ceph_clock_now();
1951 assert(dir);
1952 mds_rank_t from = mds_rank_t(m->get_source().num());
1953
1954 mds->hit_export_target(now, dest, -1);
1955
1956 auto export_state_entry = export_state.find(dir);
1957 if (export_state_entry != export_state.end()) {
1958 export_state_t& stat = export_state_entry->second;
1959 if (stat.state == EXPORT_WARNING &&
1960 stat.warning_ack_waiting.erase(from)) {
1961 // exporting. process warning.
1962 dout(7) << "handle_export_notify_ack from " << m->get_source()
1963 << ": exporting, processing warning on " << *dir << dendl;
1964 if (stat.warning_ack_waiting.empty())
1965 export_go(dir); // start export.
1966 } else if (stat.state == EXPORT_NOTIFYING &&
1967 stat.notify_ack_waiting.erase(from)) {
1968 // exporting. process notify.
1969 dout(7) << "handle_export_notify_ack from " << m->get_source()
1970 << ": exporting, processing notify on " << *dir << dendl;
1971 if (stat.notify_ack_waiting.empty())
1972 export_finish(dir);
1973 } else if (stat.state == EXPORT_CANCELLING &&
1974 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
1975 stat.notify_ack_waiting.erase(from)) {
1976 dout(7) << "handle_export_notify_ack from " << m->get_source()
1977 << ": cancelling export, processing notify on " << *dir << dendl;
1978 if (stat.notify_ack_waiting.empty()) {
1979 export_state.erase(export_state_entry);
1980 export_cancel_finish(dir);
1981 }
1982 }
1983 }
1984 else {
1985 auto import_state_entry = import_state.find(dir->dirfrag());
1986 if (import_state_entry != import_state.end()) {
1987 import_state_t& stat = import_state_entry->second;
1988 if (stat.state == IMPORT_ABORTING) {
1989 // reversing import
1990 dout(7) << "handle_export_notify_ack from " << m->get_source()
1991 << ": aborting import on " << *dir << dendl;
1992 assert(stat.bystanders.count(from));
1993 stat.bystanders.erase(from);
1994 if (stat.bystanders.empty())
1995 import_reverse_unfreeze(dir);
1996 }
1997 }
1998 }
1999
2000 m->put();
2001}
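// [illustrative note, not part of the original source]
// How the branches above distinguish ack types: a warning-phase ack carries a concrete
// new_auth.second, while notify acks (including those for a cancelled export) carry
// CDIR_AUTH_UNKNOWN, so a late warning ack is not mistaken for a notify ack while
// cancelling.  A hypothetical helper expressing the same test:
//
//   static bool is_warning_ack(MExportDirNotifyAck *m) {
//     return m->get_new_auth().second != CDIR_AUTH_UNKNOWN;
//   }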
2002
2003void Migrator::export_finish(CDir *dir)
2004{
2005 dout(5) << "export_finish " << *dir << dendl;
2006
2007 assert (g_conf->mds_kill_export_at != 12);
2008 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
2009 if (it == export_state.end()) {
2010 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
2011 return;
2012 }
2013
2014 // send finish/commit to new auth
2015 if (!mds->is_cluster_degraded() ||
2016 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
2017 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
2018 } else {
2019 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
2020 }
2021 assert(g_conf->mds_kill_export_at != 13);
2022
2023 // finish export (adjust local cache state)
2024 int num_dentries = 0;
224ce89b 2025 list<MDSInternalContextBase*> finished;
7c673cae 2026 finish_export_dir(dir, ceph_clock_now(), it->second.peer,
224ce89b
WB
2027 it->second.peer_imported, finished, &num_dentries);
2028
2029 assert(!dir->is_auth());
2030 cache->adjust_subtree_auth(dir, it->second.peer);
2031
7c673cae
FG
2032 // unpin bounds
2033 set<CDir*> bounds;
2034 cache->get_subtree_bounds(dir, bounds);
2035 for (set<CDir*>::iterator p = bounds.begin();
2036 p != bounds.end();
2037 ++p) {
2038 CDir *bd = *p;
2039 bd->put(CDir::PIN_EXPORTBOUND);
2040 bd->state_clear(CDir::STATE_EXPORTBOUND);
2041 }
2042
2043 if (dir->state_test(CDir::STATE_AUXSUBTREE))
2044 dir->state_clear(CDir::STATE_AUXSUBTREE);
2045
224ce89b
WB
2046 // discard delayed expires
2047 cache->discard_delayed_expire(dir);
2048
2049 dout(7) << "export_finish unfreezing" << dendl;
2050
2051 // unfreeze tree, with possible subtree merge.
7c673cae 2052 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
224ce89b 2053 dir->unfreeze_tree();
7c673cae 2054 cache->try_subtree_merge(dir);
b32b8144
FG
2055 for (auto bd : it->second.residual_dirs) {
2056 export_queue.push_front(pair<dirfrag_t,mds_rank_t>(bd->dirfrag(), it->second.peer));
2057 bd->take_waiting(CDir::WAIT_ANY_MASK, finished);
2058 bd->unfreeze_tree();
2059 cache->try_subtree_merge(bd);
2060 }
7c673cae
FG
2061
2062 // no more auth subtree? clear scatter dirty
2063 if (!dir->get_inode()->is_auth() &&
2064 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2065 dir->get_inode()->clear_scatter_dirty();
2066 // wake up scatter_nudge waiters
224ce89b 2067 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
7c673cae
FG
2068 }
2069
224ce89b
WB
2070 if (!finished.empty())
2071 mds->queue_waiters(finished);
7c673cae
FG
2072
2073 MutationRef mut = it->second.mut;
2074 // remove from exporting list, clean up state
2075 export_state.erase(it);
2076 dir->state_clear(CDir::STATE_EXPORTING);
2077
2078 cache->show_subtrees();
2079 audit();
2080
181888fb 2081 cache->trim(num_dentries); // try trimming exported dentries
7c673cae
FG
2082
2083 // send pending import_maps?
2084 mds->mdcache->maybe_send_pending_resolves();
2085
2086 // drop locks, unpin path
2087 if (mut) {
2088 mds->locker->drop_locks(mut.get());
2089 mut->cleanup();
2090 }
2091
2092 maybe_do_queued_export();
2093}
2094
2095
2096
2097
2098
2099
2100
2101
2102// ==========================================================
2103// IMPORT
2104
2105void Migrator::handle_export_discover(MExportDirDiscover *m)
2106{
2107 mds_rank_t from = m->get_source_mds();
2108 assert(from != mds->get_nodeid());
2109
2110 dout(7) << "handle_export_discover on " << m->get_path() << dendl;
2111
2112 // note import state
2113 dirfrag_t df = m->get_dirfrag();
c07f9fc5
FG
2114
2115 if (!mds->is_active()) {
2116 dout(7) << " not active, send NACK " << dendl;
2117 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
2118 m->put();
2119 return;
2120 }
2121
7c673cae 2122 // only start discovering on this message once.
b32b8144 2123 import_state_t *p_state;
7c673cae
FG
2124 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2125 if (!m->started) {
2126 assert(it == import_state.end());
2127 m->started = true;
b32b8144
FG
2128 p_state = &import_state[df];
2129 p_state->state = IMPORT_DISCOVERING;
2130 p_state->peer = from;
2131 p_state->tid = m->get_tid();
7c673cae
FG
2132 } else {
2133 // am i retrying after ancient path_traverse results?
2134 if (it == import_state.end() ||
2135 it->second.peer != from ||
2136 it->second.tid != m->get_tid()) {
2137 dout(7) << " dropping obsolete message" << dendl;
2138 m->put();
2139 return;
2140 }
2141 assert(it->second.state == IMPORT_DISCOVERING);
b32b8144 2142 p_state = &it->second;
7c673cae
FG
2143 }
2144
2145 if (!mds->mdcache->is_open()) {
2146 dout(5) << " waiting for root" << dendl;
2147 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
2148 return;
2149 }
2150
2151 assert (g_conf->mds_kill_import_at != 1);
2152
2153 // do we have it?
2154 CInode *in = cache->get_inode(m->get_dirfrag().ino);
2155 if (!in) {
2156 // must discover it!
2157 filepath fpath(m->get_path());
2158 vector<CDentry*> trace;
2159 MDRequestRef null_ref;
2160 int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
2161 if (r > 0) return;
2162 if (r < 0) {
2163 dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
2164 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2165 }
2166
2167 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2168 }
2169
2170 // yay
2171 dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
2172
b32b8144 2173 p_state->state = IMPORT_DISCOVERED;
7c673cae
FG
2174
2175 // pin inode in the cache (for now)
2176 assert(in->is_dir());
2177 in->get(CInode::PIN_IMPORTING);
2178
2179 // reply
2180 dout(7) << " sending export_discover_ack on " << *in << dendl;
b32b8144 2181 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), p_state->peer);
7c673cae
FG
2182 m->put();
2183 assert (g_conf->mds_kill_import_at != 2);
2184}
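// [illustrative sketch, not part of the original source]
// The retry pattern used above: when a prerequisite (open root, path discovery) is not
// ready, the handler parks the *same* message behind a waiter and returns without
// m->put(); redelivery re-enters the handler with m->started already set, so the
// import_state entry is not created twice.  Minimal shape, where wait_for_it() stands
// for wait_for_open()/path_traverse's internal wait:
//
//   if (!ready) {
//     wait_for_it(new C_MDS_RetryMessage(mds, m));  // message is re-dispatched later
//     return;                                       // note: no m->put() on this path
//   }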
2185
2186void Migrator::import_reverse_discovering(dirfrag_t df)
2187{
2188 import_state.erase(df);
2189}
2190
2191void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
2192{
2193 // unpin base
2194 diri->put(CInode::PIN_IMPORTING);
2195 import_state.erase(df);
2196}
2197
b32b8144 2198void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
7c673cae
FG
2199{
2200 set<CDir*> bounds;
b32b8144 2201 cache->map_dirfrag_set(stat.bound_ls, bounds);
7c673cae
FG
2202 import_remove_pins(dir, bounds);
2203 import_reverse_final(dir);
2204}
2205
2206/* This function DOES put the passed message before returning*/
2207void Migrator::handle_export_cancel(MExportDirCancel *m)
2208{
2209 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2210 dirfrag_t df = m->get_dirfrag();
2211 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2212 if (it == import_state.end()) {
2213 assert(0 == "got export_cancel in weird state");
2214 } else if (it->second.state == IMPORT_DISCOVERING) {
2215 import_reverse_discovering(df);
2216 } else if (it->second.state == IMPORT_DISCOVERED) {
2217 CInode *in = cache->get_inode(df.ino);
2218 assert(in);
2219 import_reverse_discovered(df, in);
2220 } else if (it->second.state == IMPORT_PREPPING) {
2221 CDir *dir = mds->mdcache->get_dirfrag(df);
2222 assert(dir);
b32b8144 2223 import_reverse_prepping(dir, it->second);
7c673cae
FG
2224 } else if (it->second.state == IMPORT_PREPPED) {
2225 CDir *dir = mds->mdcache->get_dirfrag(df);
2226 assert(dir);
2227 set<CDir*> bounds;
2228 cache->get_subtree_bounds(dir, bounds);
2229 import_remove_pins(dir, bounds);
2230 // adjust auth back to the exporter
2231 cache->adjust_subtree_auth(dir, it->second.peer);
7c673cae
FG
2232 import_reverse_unfreeze(dir);
2233 } else {
2234 assert(0 == "got export_cancel in weird state");
2235 }
2236 m->put();
2237}
2238
2239/* This function DOES put the passed message before returning*/
2240void Migrator::handle_export_prep(MExportDirPrep *m)
2241{
2242 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2243 assert(oldauth != mds->get_nodeid());
2244
2245 CDir *dir;
2246 CInode *diri;
2247 list<MDSInternalContextBase*> finished;
2248
2249 // assimilate root dir.
2250 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2251 if (!m->did_assim()) {
2252 assert(it != import_state.end());
2253 assert(it->second.state == IMPORT_DISCOVERED);
31f18b77 2254 assert(it->second.peer == oldauth);
7c673cae
FG
2255 diri = cache->get_inode(m->get_dirfrag().ino);
2256 assert(diri);
2257 bufferlist::iterator p = m->basedir.begin();
2258 dir = cache->add_replica_dir(p, diri, oldauth, finished);
2259 dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
2260 } else {
2261 if (it == import_state.end() ||
2262 it->second.peer != oldauth ||
2263 it->second.tid != m->get_tid()) {
2264 dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
2265 m->put();
2266 return;
2267 }
2268 assert(it->second.state == IMPORT_PREPPING);
31f18b77 2269 assert(it->second.peer == oldauth);
7c673cae
FG
2270
2271 dir = cache->get_dirfrag(m->get_dirfrag());
2272 assert(dir);
2273 dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
2274 diri = dir->get_inode();
2275 }
2276 assert(dir->is_auth() == false);
2277
2278 cache->show_subtrees();
2279
2280 // build import bound map
2281 map<inodeno_t, fragset_t> import_bound_fragset;
2282 for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
2283 p != m->get_bounds().end();
2284 ++p) {
2285 dout(10) << " bound " << *p << dendl;
2286 import_bound_fragset[p->ino].insert(p->frag);
2287 }
2288
2289 // assimilate contents?
2290 if (!m->did_assim()) {
2291 dout(7) << "doing assim on " << *dir << dendl;
2292 m->mark_assim(); // only do this the first time!
2293
2294 // change import state
2295 it->second.state = IMPORT_PREPPING;
2296 it->second.bound_ls = m->get_bounds();
2297 it->second.bystanders = m->get_bystanders();
2298 assert(g_conf->mds_kill_import_at != 3);
2299
2300 // bystander list
2301 dout(7) << "bystanders are " << it->second.bystanders << dendl;
2302
2303 // move pin to dir
2304 diri->put(CInode::PIN_IMPORTING);
2305 dir->get(CDir::PIN_IMPORTING);
2306 dir->state_set(CDir::STATE_IMPORTING);
2307
2308 // assimilate traces to exports
2309 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2310 for (list<bufferlist>::iterator p = m->traces.begin();
2311 p != m->traces.end();
2312 ++p) {
2313 bufferlist::iterator q = p->begin();
2314 dirfrag_t df;
2315 ::decode(df, q);
2316 char start;
2317 ::decode(start, q);
2318 dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;
2319
2320 CDir *cur = 0;
2321 if (start == 'd') {
2322 cur = cache->get_dirfrag(df);
2323 assert(cur);
2324 dout(10) << " had " << *cur << dendl;
2325 } else if (start == 'f') {
2326 CInode *in = cache->get_inode(df.ino);
2327 assert(in);
2328 dout(10) << " had " << *in << dendl;
2329 cur = cache->add_replica_dir(q, in, oldauth, finished);
2330 dout(10) << " added " << *cur << dendl;
2331 } else if (start == '-') {
2332 // nothing
2333 } else
2334 assert(0 == "unrecognized start char");
2335
b32b8144 2336 while (!q.end()) {
7c673cae
FG
2337 CDentry *dn = cache->add_replica_dentry(q, cur, finished);
2338 dout(10) << " added " << *dn << dendl;
2339 CInode *in = cache->add_replica_inode(q, dn, finished);
2340 dout(10) << " added " << *in << dendl;
2341 if (q.end())
2342 break;
2343 cur = cache->add_replica_dir(q, in, oldauth, finished);
2344 dout(10) << " added " << *cur << dendl;
2345 }
2346 }
2347
2348 // make bound sticky
2349 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2350 p != import_bound_fragset.end();
2351 ++p) {
2352 CInode *in = cache->get_inode(p->first);
2353 assert(in);
2354 in->get_stickydirs();
2355 dout(7) << " set stickydirs on bound inode " << *in << dendl;
2356 }
2357
2358 } else {
2359 dout(7) << " not doing assim on " << *dir << dendl;
2360 }
2361
2362 if (!finished.empty())
2363 mds->queue_waiters(finished);
2364
2365
c07f9fc5
FG
2366 bool success = true;
2367 if (mds->is_active()) {
2368 // open all bounds
2369 set<CDir*> import_bounds;
2370 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2371 p != import_bound_fragset.end();
2372 ++p) {
2373 CInode *in = cache->get_inode(p->first);
2374 assert(in);
7c673cae 2375
c07f9fc5
FG
2376 // map fragset into a frag_t list, based on the inode fragtree
2377 list<frag_t> fglist;
2378 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2379 in->dirfragtree.get_leaves_under(*q, fglist);
2380 dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;
2381
2382 for (list<frag_t>::iterator q = fglist.begin();
2383 q != fglist.end();
2384 ++q) {
2385 CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
2386 if (!bound) {
2387 dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
2388 cache->open_remote_dirfrag(in, *q,
2389 new C_MDS_RetryMessage(mds, m));
2390 return;
2391 }
7c673cae 2392
c07f9fc5
FG
2393 if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
2394 dout(7) << " pinning import bound " << *bound << dendl;
2395 bound->get(CDir::PIN_IMPORTBOUND);
2396 bound->state_set(CDir::STATE_IMPORTBOUND);
2397 } else {
2398 dout(7) << " already pinned import bound " << *bound << dendl;
2399 }
2400 import_bounds.insert(bound);
7c673cae 2401 }
7c673cae 2402 }
7c673cae 2403
c07f9fc5
FG
2404 dout(7) << " all ready, noting auth and freezing import region" << dendl;
2405
2406 if (!mds->mdcache->is_readonly() &&
2407 dir->get_inode()->filelock.can_wrlock(-1) &&
2408 dir->get_inode()->nestlock.can_wrlock(-1)) {
2409 it->second.mut = new MutationImpl();
2410 // force some locks. hacky.
2411 mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
2412 mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
2413
2414 // note that i am an ambiguous auth for this subtree.
2415 // specify bounds, since the exporter explicitly defines the region.
2416 cache->adjust_bounded_subtree_auth(dir, import_bounds,
2417 pair<int,int>(oldauth, mds->get_nodeid()));
2418 cache->verify_subtree_bounds(dir, import_bounds);
2419 // freeze.
2420 dir->_freeze_tree();
2421 // note new state
2422 it->second.state = IMPORT_PREPPED;
2423 } else {
2424 dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
2425 success = false;
2426 }
7c673cae 2427 } else {
c07f9fc5 2428 dout(7) << " not active, failing. " << *dir << dendl;
7c673cae 2429 success = false;
7c673cae
FG
2430 }
2431
c07f9fc5 2432 if (!success)
b32b8144 2433 import_reverse_prepping(dir, it->second);
c07f9fc5 2434
7c673cae
FG
2435 // ok!
2436 dout(7) << " sending export_prep_ack on " << *dir << dendl;
2437 mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());
2438
2439 assert(g_conf->mds_kill_import_at != 4);
2440 // done
2441 m->put();
2442}
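// [illustrative note, not part of the original source]
// Trace layout assimilated above, as implied by the decode calls (not an authoritative
// wire spec):
//
//   dirfrag_t df
//   char start                       // 'd' dirfrag already cached, 'f' dir replica follows,
//                                    // '-' empty trace
//   start=='f': <CDir replica>
//   repeat until buffer end: <CDentry replica> <CInode replica> [<CDir replica>]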
2443
2444
2445
2446
2447class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
2448 dirfrag_t df;
2449 CDir *dir;
2450 mds_rank_t from;
2451public:
28e407b8 2452 map<client_t,pair<Session*,uint64_t> > imported_session_map;
7c673cae
FG
2453
2454 C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
2455 MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
2456 }
2457 void finish(int r) override {
28e407b8 2458 mig->import_logged_start(df, dir, from, imported_session_map);
7c673cae
FG
2459 }
2460};
2461
2462/* This function DOES put the passed message before returning*/
2463void Migrator::handle_export_dir(MExportDir *m)
2464{
2465 assert (g_conf->mds_kill_import_at != 5);
2466 CDir *dir = cache->get_dirfrag(m->dirfrag);
2467 assert(dir);
31f18b77
FG
2468
2469 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2470 dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
2471
2472 assert(!dir->is_auth());
7c673cae
FG
2473
2474 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
2475 assert(it != import_state.end());
2476 assert(it->second.state == IMPORT_PREPPED);
2477 assert(it->second.tid == m->get_tid());
31f18b77 2478 assert(it->second.peer == oldauth);
7c673cae
FG
2479
2480 utime_t now = ceph_clock_now();
7c673cae
FG
2481
2482 if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
2483 dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
2484
2485 cache->show_subtrees();
2486
31f18b77 2487 C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
7c673cae
FG
2488
2489 // start the journal entry
31f18b77 2490 EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
7c673cae
FG
2491 mds->mdlog->start_entry(le);
2492
2493 le->metablob.add_dir_context(dir);
2494
2495 // adjust auth (list us _first_)
2496 cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
2497
2498 // new client sessions, open these after we journal
2499 // include imported sessions in EImportStart
2500 bufferlist::iterator cmp = m->client_map.begin();
28e407b8
AA
2501 map<client_t,entity_inst_t> client_map;
2502 decode(client_map, cmp);
7c673cae 2503 assert(cmp.end());
28e407b8
AA
2504 le->cmapv = mds->server->prepare_force_open_sessions(client_map, onlogged->imported_session_map);
2505 encode(client_map, le->client_map, mds->mdsmap->get_up_features());
7c673cae
FG
2506
2507 bufferlist::iterator blp = m->export_data.begin();
2508 int num_imported_inodes = 0;
2509 while (!blp.end()) {
2510 num_imported_inodes +=
2511 decode_import_dir(blp,
2512 oldauth,
2513 dir, // import root
2514 le,
2515 mds->mdlog->get_current_segment(),
2516 it->second.peer_exports,
2517 it->second.updated_scatterlocks,
2518 now);
2519 }
2520 dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
2521
2522 // include bounds in EImportStart
2523 set<CDir*> import_bounds;
2524 for (vector<dirfrag_t>::iterator p = m->bounds.begin();
2525 p != m->bounds.end();
2526 ++p) {
2527 CDir *bd = cache->get_dirfrag(*p);
2528 assert(bd);
2529 le->metablob.add_dir(bd, false); // note that parent metadata is already in the event
2530 import_bounds.insert(bd);
2531 }
2532 cache->verify_subtree_bounds(dir, import_bounds);
2533
2534 // adjust popularity
2535 mds->balancer->add_import(dir, now);
2536
2537 dout(7) << "handle_export_dir did " << *dir << dendl;
2538
2539 // note state
2540 it->second.state = IMPORT_LOGGINGSTART;
2541 assert (g_conf->mds_kill_import_at != 6);
2542
2543 // log it
2544 mds->mdlog->submit_entry(le, onlogged);
2545 mds->mdlog->flush();
2546
2547 // some stats
2548 if (mds->logger) {
2549 mds->logger->inc(l_mds_imported);
2550 mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
2551 }
2552
2553 m->put();
2554}
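// [illustrative sketch, not part of the original source]
// The bufferlist decode-loop idiom used above (and again in decode_import_dir()): the
// export payload is a concatenation of per-dirfrag chunks with no explicit count, so it
// is consumed until the iterator reports end of buffer.  consume_one_chunk() is a
// hypothetical stand-in for decode_import_dir():
//
//   bufferlist::iterator blp = payload.begin();
//   while (!blp.end())
//     consume_one_chunk(blp);   // each call advances blp past one chunk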
2555
2556
2557/*
2558 * this is an import helper
2559 * called by import_finish, and import_reverse and friends.
2560 */
2561void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2562{
2563 import_state_t& stat = import_state[dir->dirfrag()];
2564 // root
2565 dir->put(CDir::PIN_IMPORTING);
2566 dir->state_clear(CDir::STATE_IMPORTING);
2567
2568 // bounding inodes
2569 set<inodeno_t> did;
2570 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2571 p != stat.bound_ls.end();
2572 ++p) {
2573 if (did.count(p->ino))
2574 continue;
2575 did.insert(p->ino);
2576 CInode *in = cache->get_inode(p->ino);
2577 assert(in);
2578 in->put_stickydirs();
2579 }
2580
2581 if (stat.state == IMPORT_PREPPING) {
2582 for (auto bd : bounds) {
2583 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2584 bd->put(CDir::PIN_IMPORTBOUND);
2585 bd->state_clear(CDir::STATE_IMPORTBOUND);
2586 }
2587 }
2588 } else if (stat.state >= IMPORT_PREPPED) {
2589 // bounding dirfrags
2590 for (auto bd : bounds) {
2591 assert(bd->state_test(CDir::STATE_IMPORTBOUND));
2592 bd->put(CDir::PIN_IMPORTBOUND);
2593 bd->state_clear(CDir::STATE_IMPORTBOUND);
2594 }
2595 }
2596}
2597
2598
2599/*
2600 * note: this does the full work of reversing an import and cleaning up
2601 * state.
2602 * called by both handle_mds_failure and by handle_resolve (if we are
2603 * a survivor coping with an exporter failure+recovery).
2604 */
2605void Migrator::import_reverse(CDir *dir)
2606{
2607 dout(7) << "import_reverse " << *dir << dendl;
2608
2609 import_state_t& stat = import_state[dir->dirfrag()];
2610 stat.state = IMPORT_ABORTING;
2611
2612 set<CDir*> bounds;
2613 cache->get_subtree_bounds(dir, bounds);
2614
2615 // remove pins
2616 import_remove_pins(dir, bounds);
2617
2618 // update auth, with possible subtree merge.
2619 assert(dir->is_subtree_root());
2620 if (mds->is_resolve())
2621 cache->trim_non_auth_subtree(dir);
2622
2623 cache->adjust_subtree_auth(dir, stat.peer);
2624
2625 C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
2626 if (!dir->get_inode()->is_auth() &&
2627 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2628 dir->get_inode()->clear_scatter_dirty();
2629 // wake up scatter_nudge waiters
2630 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2631 }
2632
2633 int num_dentries = 0;
2634 // adjust auth bits.
2635 list<CDir*> q;
2636 q.push_back(dir);
2637 while (!q.empty()) {
2638 CDir *cur = q.front();
2639 q.pop_front();
2640
2641 // dir
2642 assert(cur->is_auth());
2643 cur->state_clear(CDir::STATE_AUTH);
2644 cur->remove_bloom();
2645 cur->clear_replica_map();
2646 cur->set_replica_nonce(CDir::EXPORT_NONCE);
2647 if (cur->is_dirty())
2648 cur->mark_clean();
2649
94b18763
FG
2650 for (auto &p : *cur) {
2651 CDentry *dn = p.second;
7c673cae
FG
2652
2653 // dentry
2654 dn->state_clear(CDentry::STATE_AUTH);
2655 dn->clear_replica_map();
2656 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2657 if (dn->is_dirty())
2658 dn->mark_clean();
2659
2660 // inode?
2661 if (dn->get_linkage()->is_primary()) {
2662 CInode *in = dn->get_linkage()->get_inode();
2663 in->state_clear(CDentry::STATE_AUTH);
2664 in->clear_replica_map();
2665 in->set_replica_nonce(CInode::EXPORT_NONCE);
2666 if (in->is_dirty())
2667 in->mark_clean();
2668 in->clear_dirty_rstat();
2669 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2670 in->clear_scatter_dirty();
2671 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2672 }
2673
2674 in->clear_dirty_parent();
2675
2676 in->authlock.clear_gather();
2677 in->linklock.clear_gather();
2678 in->dirfragtreelock.clear_gather();
2679 in->filelock.clear_gather();
2680
2681 in->clear_file_locks();
2682
2683 // non-bounding dir?
2684 list<CDir*> dfs;
2685 in->get_dirfrags(dfs);
2686 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2687 if (bounds.count(*p) == 0)
2688 q.push_back(*p);
2689 }
2690
2691 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2692 ++num_dentries;
2693 }
2694 }
2695
2696 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
2697
2698 if (stat.state == IMPORT_ACKING) {
2699 // remove imported caps
2700 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
28e407b8
AA
2701 p != stat.peer_exports.end();
2702 ++p) {
7c673cae
FG
2703 CInode *in = p->first;
2704 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
28e407b8
AA
2705 q != p->second.end();
2706 ++q) {
7c673cae 2707 Capability *cap = in->get_client_cap(q->first);
28e407b8
AA
2708 if (!cap) {
2709 assert(!stat.session_map.count(q->first));
2710 continue;
2711 }
7c673cae
FG
2712 if (cap->is_importing())
2713 in->remove_client_cap(q->first);
2714 }
2715 in->put(CInode::PIN_IMPORTINGCAPS);
2716 }
28e407b8
AA
2717 for (auto& p : stat.session_map) {
2718 Session *session = p.second.first;
7c673cae
FG
2719 session->dec_importing();
2720 }
2721 }
2722
2723 // log our failure
2724 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2725
181888fb 2726 cache->trim(num_dentries); // try trimming dentries
7c673cae
FG
2727
2728 // notify bystanders; wait in aborting state
2729 import_notify_abort(dir, bounds);
2730}
2731
2732void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2733{
2734 dout(7) << "import_notify_finish " << *dir << dendl;
2735
2736 import_state_t& stat = import_state[dir->dirfrag()];
2737 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2738 p != stat.bystanders.end();
2739 ++p) {
2740 MExportDirNotify *notify =
2741 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
2742 pair<int,int>(stat.peer, mds->get_nodeid()),
2743 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
2744 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2745 notify->get_bounds().push_back((*i)->dirfrag());
2746 mds->send_message_mds(notify, *p);
2747 }
2748}
2749
2750void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
2751{
2752 dout(7) << "import_notify_abort " << *dir << dendl;
2753
2754 import_state_t& stat = import_state[dir->dirfrag()];
2755 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2756 p != stat.bystanders.end(); ) {
2757 if (mds->is_cluster_degraded() &&
2758 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
2759 // this can happen if both exporter and bystander fail in the same mdsmap epoch
2760 stat.bystanders.erase(p++);
2761 continue;
2762 }
2763 MExportDirNotify *notify =
2764 new MExportDirNotify(dir->dirfrag(), stat.tid, true,
2765 mds_authority_t(stat.peer, mds->get_nodeid()),
2766 mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
2767 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2768 notify->get_bounds().push_back((*i)->dirfrag());
2769 mds->send_message_mds(notify, *p);
2770 ++p;
2771 }
2772 if (stat.bystanders.empty()) {
2773 dout(7) << "no bystanders, finishing reverse now" << dendl;
2774 import_reverse_unfreeze(dir);
2775 } else {
2776 assert (g_conf->mds_kill_import_at != 10);
2777 }
2778}
2779
2780void Migrator::import_reverse_unfreeze(CDir *dir)
2781{
7c673cae 2782 dout(7) << "import_reverse_unfreeze " << *dir << dendl;
224ce89b 2783 assert(!dir->is_auth());
7c673cae 2784 cache->discard_delayed_expire(dir);
224ce89b
WB
2785 dir->unfreeze_tree();
2786 if (dir->is_subtree_root())
2787 cache->try_subtree_merge(dir);
7c673cae
FG
2788 import_reverse_final(dir);
2789}
2790
2791void Migrator::import_reverse_final(CDir *dir)
2792{
2793 dout(7) << "import_reverse_final " << *dir << dendl;
2794
2795 // clean up
2796 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2797 assert(it != import_state.end());
2798
2799 MutationRef mut = it->second.mut;
2800 import_state.erase(it);
2801
2802 // send pending import_maps?
2803 mds->mdcache->maybe_send_pending_resolves();
2804
2805 if (mut) {
2806 mds->locker->drop_locks(mut.get());
2807 mut->cleanup();
2808 }
2809
2810 cache->show_subtrees();
2811 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2812}
2813
2814
2815
2816
2817void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
28e407b8 2818 map<client_t,pair<Session*,uint64_t> >& imported_session_map)
7c673cae
FG
2819{
2820 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2821 if (it == import_state.end() ||
2822 it->second.state != IMPORT_LOGGINGSTART) {
2823 dout(7) << "import " << df << " must have aborted" << dendl;
28e407b8 2824 mds->server->finish_force_open_sessions(imported_session_map);
7c673cae
FG
2825 return;
2826 }
2827
2828 dout(7) << "import_logged " << *dir << dendl;
2829
2830 // note state
2831 it->second.state = IMPORT_ACKING;
2832
2833 assert (g_conf->mds_kill_import_at != 7);
2834
2835 // force open client sessions and finish cap import
28e407b8 2836 mds->server->finish_force_open_sessions(imported_session_map, false);
7c673cae
FG
2837
2838 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
2839 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2840 p != it->second.peer_exports.end();
2841 ++p) {
2842 // parameter 'peer' is NONE, delay sending cap import messages to client
28e407b8
AA
2843 finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
2844 p->second, imported_caps[p->first->ino()]);
7c673cae 2845 }
28e407b8
AA
2846
2847 it->second.session_map.swap(imported_session_map);
7c673cae
FG
2848
2849 // send notify's etc.
2850 dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
2851
2852 // test surviving observer of a failed migration that did not complete
2853 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
2854
2855 MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
2856 ::encode(imported_caps, ack->imported_caps);
2857
2858 mds->send_message_mds(ack, from);
2859 assert (g_conf->mds_kill_import_at != 8);
2860
2861 cache->show_subtrees();
2862}
2863
2864/* This function DOES put the passed message before returning*/
2865void Migrator::handle_export_finish(MExportDirFinish *m)
2866{
2867 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
2868 assert(dir);
2869 dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;
2870
2871 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2872 assert(it != import_state.end());
2873 assert(it->second.tid == m->get_tid());
2874
2875 import_finish(dir, false, m->is_last());
2876
2877 m->put();
2878}
2879
2880void Migrator::import_finish(CDir *dir, bool notify, bool last)
2881{
2882 dout(7) << "import_finish on " << *dir << dendl;
2883
2884 map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
2885 assert(it != import_state.end());
2886 assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
2887
224ce89b
WB
2888 if (it->second.state == IMPORT_ACKING) {
2889 assert(dir->is_auth());
2890 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
2891 }
2892
7c673cae
FG
2893 // log finish
2894 assert(g_conf->mds_kill_import_at != 9);
2895
2896 if (it->second.state == IMPORT_ACKING) {
2897 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2898 p != it->second.peer_exports.end();
2899 ++p) {
2900 CInode *in = p->first;
2901 assert(in->is_auth());
2902 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2903 q != p->second.end();
2904 ++q) {
28e407b8
AA
2905 auto r = it->second.session_map.find(q->first);
2906 if (r == it->second.session_map.end())
2907 continue;
2908
2909 Session *session = r->second.first;
7c673cae
FG
2910 Capability *cap = in->get_client_cap(q->first);
2911 assert(cap);
2912 cap->merge(q->second, true);
2913 cap->clear_importing();
2914 mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
2915 q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
2916 }
2917 p->second.clear();
2918 in->replica_caps_wanted = 0;
2919 }
28e407b8
AA
2920 for (auto& p : it->second.session_map) {
2921 Session *session = p.second.first;
7c673cae
FG
2922 session->dec_importing();
2923 }
2924 }
2925
2926 if (!last) {
2927 assert(it->second.state == IMPORT_ACKING);
2928 it->second.state = IMPORT_FINISHING;
2929 return;
2930 }
2931
2932 // remove pins
2933 set<CDir*> bounds;
2934 cache->get_subtree_bounds(dir, bounds);
2935
2936 if (notify)
2937 import_notify_finish(dir, bounds);
2938
2939 import_remove_pins(dir, bounds);
2940
2941 map<CInode*, map<client_t,Capability::Export> > peer_exports;
2942 it->second.peer_exports.swap(peer_exports);
2943
2944 // clear import state (we're done!)
2945 MutationRef mut = it->second.mut;
2946 import_state.erase(it);
2947
7c673cae
FG
2948 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
2949
7c673cae
FG
2950 // process delayed expires
2951 cache->process_delayed_expire(dir);
2952
224ce89b 2953 // unfreeze tree, with possible subtree merge.
7c673cae 2954 dir->unfreeze_tree();
224ce89b
WB
2955 cache->try_subtree_merge(dir);
2956
7c673cae
FG
2957 cache->show_subtrees();
2958 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2959
2960 if (mut) {
2961 mds->locker->drop_locks(mut.get());
2962 mut->cleanup();
2963 }
2964
2965 // re-eval imported caps
2966 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
2967 p != peer_exports.end();
2968 ++p) {
2969 if (p->first->is_auth())
2970 mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
2971 p->first->put(CInode::PIN_IMPORTINGCAPS);
2972 }
2973
2974 // send pending import_maps?
2975 mds->mdcache->maybe_send_pending_resolves();
2976
2977 // did i just import mydir?
2978 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
2979 cache->populate_mydir();
2980
2981 // is it empty?
2982 if (dir->get_num_head_items() == 0 &&
2983 !dir->inode->is_auth()) {
2984 // reexport!
2985 export_empty_import(dir);
2986 }
2987}
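// [illustrative note, not part of the original source]
// When bystanders must be notified, import_finish() is entered twice: the exporter first
// sends MExportDirFinish(last=false) from export_logged_finish() (IMPORT_ACKING ->
// IMPORT_FINISHING, early return above), then MExportDirFinish(last=true) from
// export_finish(), which performs the actual cleanup.  Rough state sketch:
//
//   IMPORT_ACKING --(finish, !last)--> IMPORT_FINISHING --(finish, last)--> (state erased)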
2988
2989
2990void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
2991 mds_rank_t oldauth, LogSegment *ls,
2992 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
2993 list<ScatterLock*>& updated_scatterlocks)
2994{
2995 dout(15) << "decode_import_inode on " << *dn << dendl;
2996
2997 inodeno_t ino;
2998 snapid_t last;
2999 ::decode(ino, blp);
3000 ::decode(last, blp);
3001
3002 bool added = false;
3003 CInode *in = cache->get_inode(ino, last);
3004 if (!in) {
3005 in = new CInode(mds->mdcache, true, 1, last);
3006 added = true;
3007 }
3008
3009 // state after link -- or not! -sage
3010 in->decode_import(blp, ls); // cap imports are noted for later action
3011
3012 // caps
3013 decode_import_inode_caps(in, true, blp, peer_exports);
3014
3015 // link before state -- or not! -sage
3016 if (dn->get_linkage()->get_inode() != in) {
3017 assert(!dn->get_linkage()->get_inode());
3018 dn->dir->link_primary_inode(dn, in);
3019 }
28e407b8
AA
3020
3021 if (in->is_dir())
3022 dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
7c673cae
FG
3023
3024 // add inode?
3025 if (added) {
3026 cache->add_inode(in);
3027 dout(10) << "added " << *in << dendl;
3028 } else {
3029 dout(10) << " had " << *in << dendl;
3030 }
3031
3032 if (in->inode.is_dirty_rstat())
3033 in->mark_dirty_rstat();
3034
3035 // clear if dirtyscattered, since we're going to journal this
3036 // but not until we _actually_ finish the import...
3037 if (in->filelock.is_dirty()) {
3038 updated_scatterlocks.push_back(&in->filelock);
3039 mds->locker->mark_updated_scatterlock(&in->filelock);
3040 }
3041
3042 if (in->dirfragtreelock.is_dirty()) {
3043 updated_scatterlocks.push_back(&in->dirfragtreelock);
3044 mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
3045 }
3046
3047 // adjust replica list
3048 //assert(!in->is_replica(oldauth)); // not true on failed export
3049 in->add_replica(oldauth, CInode::EXPORT_NONCE);
3050 if (in->is_replica(mds->get_nodeid()))
3051 in->remove_replica(mds->get_nodeid());
3052}
3053
3054void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
3055 bufferlist::iterator &blp,
3056 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
3057{
3058 map<client_t,Capability::Export> cap_map;
3059 ::decode(cap_map, blp);
3060 if (auth_cap)
3061 ::decode(in->get_mds_caps_wanted(), blp);
3062 if (!cap_map.empty() ||
b32b8144 3063 (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
7c673cae
FG
3064 peer_exports[in].swap(cap_map);
3065 in->get(CInode::PIN_IMPORTINGCAPS);
3066 }
3067}
3068
3069void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
28e407b8
AA
3070 const map<client_t,pair<Session*,uint64_t> >& session_map,
3071 const map<client_t,Capability::Export> &export_map,
7c673cae
FG
3072 map<client_t,Capability::Import> &import_map)
3073{
28e407b8
AA
3074 for (auto& it : export_map) {
3075 dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;
3076
3077 auto p = session_map.find(it.first);
3078 if (p == session_map.end()) {
3079 dout(10) << " no session for client." << it.first << dendl;
3080 (void)import_map[it.first];
3081 continue;
3082 }
7c673cae 3083
28e407b8
AA
3084 Session *session = p->second.first;
3085
3086 Capability *cap = in->get_client_cap(it.first);
7c673cae 3087 if (!cap) {
28e407b8 3088 cap = in->add_client_cap(it.first, session);
7c673cae
FG
3089 if (peer < 0)
3090 cap->mark_importing();
3091 }
3092
28e407b8 3093 Capability::Import& im = import_map[it.first];
7c673cae 3094 im.cap_id = cap->get_cap_id();
28e407b8 3095 im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
7c673cae
FG
3096 im.issue_seq = cap->get_last_seq() + 1;
3097
3098 if (peer >= 0) {
28e407b8
AA
3099 cap->merge(it.second, auth_cap);
3100 mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
3101 it.second.seq, it.second.mseq - 1, peer,
7c673cae
FG
3102 auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
3103 }
3104 }
3105
3106 if (peer >= 0) {
3107 in->replica_caps_wanted = 0;
3108 in->put(CInode::PIN_IMPORTINGCAPS);
3109 }
3110}
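// [illustrative note, not part of the original source]
// The two ways this helper is driven in this file:
//
//   // directory import (import_logged_start): defer client notification, peer is NONE
//   finish_import_inode_caps(in, MDS_RANK_NONE, true, session_map, exports, imports);
//   // plain cap migration (logged_import_caps): notify clients immediately
//   finish_import_inode_caps(in, from, false, session_map, exports, imports);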
3111
3112int Migrator::decode_import_dir(bufferlist::iterator& blp,
3113 mds_rank_t oldauth,
3114 CDir *import_root,
3115 EImportStart *le,
3116 LogSegment *ls,
3117 map<CInode*,map<client_t,Capability::Export> >& peer_exports,
3118 list<ScatterLock*>& updated_scatterlocks, utime_t now)
3119{
3120 // set up dir
3121 dirfrag_t df;
3122 ::decode(df, blp);
3123
3124 CInode *diri = cache->get_inode(df.ino);
3125 assert(diri);
3126 CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
3127 assert(dir);
3128
3129 dout(7) << "decode_import_dir " << *dir << dendl;
3130
3131 // assimilate state
3132 dir->decode_import(blp, now, ls);
3133
3134 // adjust replica list
3135 //assert(!dir->is_replica(oldauth)); // not true on failed export
3136 dir->add_replica(oldauth, CDir::EXPORT_NONCE);
3137 if (dir->is_replica(mds->get_nodeid()))
3138 dir->remove_replica(mds->get_nodeid());
3139
3140 // add to journal entry
3141 if (le)
3142 le->metablob.add_import_dir(dir);
3143
3144 int num_imported = 0;
3145
3146 // take all waiters on this dir
3147 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3148 // a replica's presence in my cache implies/forces its presence in the authority's.
3149 list<MDSInternalContextBase*> waiters;
3150
3151 dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
3152 for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
3153 it != waiters.end();
3154 ++it)
3155 import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure
3156
3157 dout(15) << "doing contents" << dendl;
3158
3159 // contents
3160 __u32 nden;
3161 ::decode(nden, blp);
3162
3163 for (; nden>0; nden--) {
3164 num_imported++;
3165
3166 // dentry
3167 string dname;
3168 snapid_t last;
3169 ::decode(dname, blp);
3170 ::decode(last, blp);
3171
3172 CDentry *dn = dir->lookup_exact_snap(dname, last);
3173 if (!dn)
3174 dn = dir->add_null_dentry(dname, 1, last);
3175
3176 dn->decode_import(blp, ls);
3177
3178 dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
3179 if (dn->is_replica(mds->get_nodeid()))
3180 dn->remove_replica(mds->get_nodeid());
3181
3182 // dentry lock in unreadable state can block path traverse
3183 if (dn->lock.get_state() != LOCK_SYNC)
3184 mds->locker->try_eval(&dn->lock, NULL);
3185
3186 dout(15) << "decode_import_dir got " << *dn << dendl;
3187
3188 // points to...
3189 char icode;
3190 ::decode(icode, blp);
3191
3192 if (icode == 'N') {
3193 // null dentry
3194 assert(dn->get_linkage()->is_null());
3195
3196 // fall thru
3197 }
3198 else if (icode == 'L') {
3199 // remote link
3200 inodeno_t ino;
3201 unsigned char d_type;
3202 ::decode(ino, blp);
3203 ::decode(d_type, blp);
3204 if (dn->get_linkage()->is_remote()) {
3205 assert(dn->get_linkage()->get_remote_ino() == ino);
3206 } else {
3207 dir->link_remote_inode(dn, ino, d_type);
3208 }
3209 }
3210 else if (icode == 'I') {
3211 // inode
3212 assert(le);
3213 decode_import_inode(dn, blp, oldauth, ls,
3214 peer_exports, updated_scatterlocks);
3215 }
3216
3217 // add dentry to journal entry
3218 if (le)
3219 le->metablob.add_import_dentry(dn);
3220 }
3221
3222#ifdef MDS_VERIFY_FRAGSTAT
3223 if (dir->is_complete())
3224 dir->verify_fragstat();
3225#endif
3226
3227 dir->inode->maybe_export_pin();
3228
3229 dout(7) << "decode_import_dir done " << *dir << dendl;
3230 return num_imported;
3231}
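// [illustrative note, not part of the original source]
// Per-dirfrag layout consumed above, as implied by the decode calls (not an authoritative
// wire spec):
//
//   dirfrag_t df
//   <CDir import state>               // dir->decode_import()
//   __u32 nden                        // number of dentries that follow
//   nden x {
//     string dname; snapid_t last;
//     <CDentry import state>          // dn->decode_import()
//     char icode;                     // 'N' null, 'L' remote link, 'I' primary inode
//     icode=='L': inodeno_t ino; unsigned char d_type;
//     icode=='I': <inode + caps>      // decode_import_inode()
//   }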
3232
3233
3234
3235
3236
3237// authority bystander
3238
3239/* This function DOES put the passed message before returning*/
3240void Migrator::handle_export_notify(MExportDirNotify *m)
3241{
3242 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
3243 m->put();
3244 return;
3245 }
3246
3247 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3248
3249 mds_rank_t from = mds_rank_t(m->get_source().num());
3250 mds_authority_t old_auth = m->get_old_auth();
3251 mds_authority_t new_auth = m->get_new_auth();
3252
3253 if (!dir) {
3254 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3255 << " on missing dir " << m->get_dirfrag() << dendl;
3256 } else if (dir->authority() != old_auth) {
3257 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3258 << " != " << old_auth << " -> " << new_auth
3259 << " on " << *dir << dendl;
3260 } else {
3261 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3262 << " on " << *dir << dendl;
3263 // adjust auth
3264 set<CDir*> have;
3265 cache->map_dirfrag_set(m->get_bounds(), have);
3266 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3267
3268 // induce a merge?
3269 cache->try_subtree_merge(dir);
3270 }
3271
3272 // send ack
3273 if (m->wants_ack()) {
3274 mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
3275 } else {
3276 // aborted. no ack.
3277 dout(7) << "handle_export_notify no ack requested" << dendl;
3278 }
3279
3280 m->put();
3281}
3282
3283/** cap exports **/
3284void Migrator::export_caps(CInode *in)
3285{
3286 mds_rank_t dest = in->authority().first;
3287 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3288
3289 assert(in->is_any_caps());
3290 assert(!in->is_auth());
3291 assert(!in->is_ambiguous_auth());
3292 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3293
3294 MExportCaps *ex = new MExportCaps;
3295 ex->ino = in->ino();
3296
3297 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3298
3299 mds->send_message_mds(ex, dest);
3300}
3301
3302void Migrator::handle_gather_caps(MGatherCaps *m)
3303{
3304 CInode *in = cache->get_inode(m->ino);
3305
3306 if (!in)
3307 goto out;
3308
3309 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
3310 << " on " << *in
3311 << dendl;
3312 if (in->is_any_caps() &&
3313 !in->is_auth() &&
3314 !in->is_ambiguous_auth() &&
3315 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3316 export_caps(in);
3317
3318out:
3319 m->put();
3320}
3321
3322class C_M_LoggedImportCaps : public MigratorLogContext {
3323 CInode *in;
3324 mds_rank_t from;
3325public:
28e407b8 3326 map<client_t,pair<Session*,uint64_t> > imported_session_map;
7c673cae 3327 map<CInode*, map<client_t,Capability::Export> > peer_exports;
7c673cae
FG
3328
3329 C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
3330 void finish(int r) override {
28e407b8 3331 mig->logged_import_caps(in, from, imported_session_map, peer_exports);
7c673cae
FG
3332 }
3333};
3334
3335/* This function DOES put the passed message before returning*/
3336void Migrator::handle_export_caps(MExportCaps *ex)
3337{
3338 dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
3339 CInode *in = cache->get_inode(ex->ino);
3340
3341 assert(in);
3342 assert(in->is_auth());
3343
3344 // FIXME
28e407b8
AA
3345 if (!in->can_auth_pin()) {
3346 ex->put();
7c673cae 3347 return;
28e407b8
AA
3348 }
3349
181888fb 3350 in->auth_pin(this);
7c673cae 3351
28e407b8
AA
3352 map<client_t,entity_inst_t> client_map;
3353 client_map.swap(ex->client_map);
3354
7c673cae
FG
3355 C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
3356 this, in, mds_rank_t(ex->get_source().num()));
7c673cae 3357
28e407b8
AA
3358 version_t pv = mds->server->prepare_force_open_sessions(client_map,
3359 finish->imported_session_map);
7c673cae
FG
3360 // decode new caps
3361 bufferlist::iterator blp = ex->cap_bl.begin();
3362 decode_import_inode_caps(in, false, blp, finish->peer_exports);
3363 assert(!finish->peer_exports.empty()); // thus, inode is pinned.
3364
3365 // journal open client sessions
7c673cae 3366
28e407b8 3367 ESessions *le = new ESessions(pv, client_map);
7c673cae
FG
3368 mds->mdlog->start_submit_entry(le, finish);
3369 mds->mdlog->flush();
3370
3371 ex->put();
3372}
3373
3374
3375void Migrator::logged_import_caps(CInode *in,
3376 mds_rank_t from,
28e407b8
AA
3377 map<client_t,pair<Session*,uint64_t> >& imported_session_map,
3378 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
7c673cae
FG
3379{
3380 dout(10) << "logged_import_caps on " << *in << dendl;
3381 // see export_go() vs export_go_synced()
3382 assert(in->is_auth());
3383
3384 // force open client sessions and finish cap import
28e407b8 3385 mds->server->finish_force_open_sessions(imported_session_map);
7c673cae
FG
3386
3387 map<client_t,Capability::Import> imported_caps;
3388
28e407b8
AA
3389 auto it = peer_exports.find(in);
3390 assert(it != peer_exports.end());
3391
7c673cae 3392 // clients will release caps from the exporter when they receive the cap import message.
28e407b8 3393 finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
7c673cae 3394 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
181888fb 3395 in->auth_unpin(this);
7c673cae 3396}
28e407b8
AA
3397
3398void Migrator::handle_conf_change(const struct md_config_t *conf,
3399 const std::set <std::string> &changed,
3400 const MDSMap &mds_map)
3401{
3402 if (changed.count("mds_inject_migrator_session_race")) {
3403 inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
3404 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
3405 }
3406}
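// [illustrative sketch, not part of the original source]
// Extending this observer for another option would follow the same pattern, assuming the
// (hypothetical) option is registered elsewhere and exposed via md_config_t::get_val<>():
//
//   if (changed.count("mds_some_hypothetical_option"))
//     some_member = conf->get_val<bool>("mds_some_hypothetical_option");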