// Source provenance: ceph.git — ceph/src/mds/Migrator.cc (sources updated to v12.1.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "MDSRank.h"
16 #include "MDCache.h"
17 #include "CInode.h"
18 #include "CDir.h"
19 #include "CDentry.h"
20 #include "Migrator.h"
21 #include "Locker.h"
22 #include "Server.h"
23
24 #include "MDBalancer.h"
25 #include "MDLog.h"
26 #include "MDSMap.h"
27 #include "Mutation.h"
28
29 #include "include/filepath.h"
30
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/ESessions.h"
35
36 #include "msg/Messenger.h"
37
38 #include "messages/MClientCaps.h"
39
40 #include "messages/MExportDirDiscover.h"
41 #include "messages/MExportDirDiscoverAck.h"
42 #include "messages/MExportDirCancel.h"
43 #include "messages/MExportDirPrep.h"
44 #include "messages/MExportDirPrepAck.h"
45 #include "messages/MExportDir.h"
46 #include "messages/MExportDirAck.h"
47 #include "messages/MExportDirNotify.h"
48 #include "messages/MExportDirNotifyAck.h"
49 #include "messages/MExportDirFinish.h"
50
51 #include "messages/MExportCaps.h"
52 #include "messages/MExportCapsAck.h"
53 #include "messages/MGatherCaps.h"
54
55
56 /*
57 * this is what the dir->dir_auth values look like
58 *
59 * dir_auth authbits
60 * export
61 * me me - before
62 * me, me me - still me, but preparing for export
63 * me, them me - send MExportDir (peer is preparing)
64 * them, me me - journaled EExport
65 * them them - done
66 *
67 * import:
68 * them them - before
69 * me, them me - journaled EImportStart
70 * me me - done
71 *
72 * which implies:
73 * - auth bit is set if i am listed as first _or_ second dir_auth.
74 */
75
76 #include "common/config.h"
77
78
79 #define dout_context g_ceph_context
80 #define dout_subsys ceph_subsys_mds
81 #undef dout_prefix
82 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
83
84
/**
 * Base class for Migrator-internal completion contexts.  Holds a back
 * pointer to the owning Migrator so that finish() implementations can
 * call back into it; get_mds() gives MDSInternalContextBase access to
 * the owning rank for its own bookkeeping.
 */
class MigratorContext : public MDSInternalContextBase {
protected:
  Migrator *mig;   // owning Migrator; never NULL (asserted in ctor)
  MDSRank *get_mds() override {
    return mig->mds;
  }
public:
  explicit MigratorContext(Migrator *mig_) : mig(mig_) {
    assert(mig != NULL);
  }
};
96
/**
 * Base class for Migrator contexts that complete via MDSLogContextBase
 * (i.e. after a journal write), as opposed to generic internal
 * contexts.  Same Migrator back-pointer contract as MigratorContext.
 */
class MigratorLogContext : public MDSLogContextBase {
protected:
  Migrator *mig;   // owning Migrator; never NULL (asserted in ctor)
  MDSRank *get_mds() override {
    return mig->mds;
  }
public:
  explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
    assert(mig != NULL);
  }
};
108
/* This function DOES put the passed message before returning*/
// Each handler below takes ownership of 'm' and is responsible for
// m->put(); callers must not touch the message after dispatch() returns.
void Migrator::dispatch(Message *m)
{
  switch (m->get_type()) {
    // import
  case MSG_MDS_EXPORTDIRDISCOVER:
    handle_export_discover(static_cast<MExportDirDiscover*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREP:
    handle_export_prep(static_cast<MExportDirPrep*>(m));
    break;
  case MSG_MDS_EXPORTDIR:
    handle_export_dir(static_cast<MExportDir*>(m));
    break;
  case MSG_MDS_EXPORTDIRFINISH:
    handle_export_finish(static_cast<MExportDirFinish*>(m));
    break;
  case MSG_MDS_EXPORTDIRCANCEL:
    handle_export_cancel(static_cast<MExportDirCancel*>(m));
    break;

    // export
  case MSG_MDS_EXPORTDIRDISCOVERACK:
    handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREPACK:
    handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRACK:
    handle_export_ack(static_cast<MExportDirAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRNOTIFYACK:
    handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
    break;

    // export 3rd party (dir_auth adjustments)
  case MSG_MDS_EXPORTDIRNOTIFY:
    handle_export_notify(static_cast<MExportDirNotify*>(m));
    break;

    // caps
  case MSG_MDS_EXPORTCAPS:
    handle_export_caps(static_cast<MExportCaps*>(m));
    break;
  case MSG_MDS_GATHERCAPS:
    handle_gather_caps(static_cast<MGatherCaps*>(m));
    break;

  default:
    derr << "migrator unknown message " << m->get_type() << dendl;
    assert(0 == "migrator unknown message");
  }
}
162
163
// Deferred re-check of an imported subtree: when the waited-on event
// fires, see whether 'dir' has become empty and can be exported back
// to the auth of its parent inode.
class C_MDC_EmptyImport : public MigratorContext {
  CDir *dir;   // candidate subtree root to hand back
public:
  C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
  void finish(int r) override {
    mig->export_empty_import(dir);
  }
};
172
173
/*
 * If a previously imported subtree has drained empty, hand it back to
 * the auth of its parent inode.  Each early return below logs the
 * reason at dout(7): we own the inode anyway, we aren't auth of the
 * dirfrag, it's mid-freeze, it still has head entries, or it's root.
 */
void Migrator::export_empty_import(CDir *dir)
{
  dout(7) << "export_empty_import " << *dir << dendl;
  assert(dir->is_subtree_root());

  if (dir->inode->is_auth()) {
    dout(7) << " inode is auth" << dendl;
    return;
  }
  if (!dir->is_auth()) {
    dout(7) << " not auth" << dendl;
    return;
  }
  if (dir->is_freezing() || dir->is_frozen()) {
    dout(7) << " freezing or frozen" << dendl;
    return;
  }
  if (dir->get_num_head_items() > 0) {
    dout(7) << " not actually empty" << dendl;
    return;
  }
  if (dir->inode->is_root()) {
    dout(7) << " root" << dendl;
    return;
  }

  // hand back to whoever is auth for the parent inode
  mds_rank_t dest = dir->inode->authority().first;
  //if (mds->is_shutting_down()) dest = 0;  // this is more efficient.

  dout(7) << " really empty, exporting to " << dest << dendl;
  assert (dest != mds->get_nodeid());

  dout(7) << "exporting to mds." << dest
           << " empty import " << *dir << dendl;
  export_dir( dir, dest );
}
210
/*
 * Periodic scan for exports stuck in DISCOVERING/FREEZING: if a
 * subtree's cumulative auth_pin count has been static for longer than
 * mds_freeze_tree_timeout AND someone could be deadlocked against the
 * freeze (a remote waiter, or a freezing parent dir), cancel the
 * export.  The two scenarios below motivate this.
 */
void Migrator::find_stale_export_freeze()
{
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_freeze_tree_timeout;


  /*
   * We could have situations like:
   *
   * - mds.0 authpins an item in subtree A
   * - mds.0 sends request to mds.1 to authpin an item in subtree B
   * - mds.0 freezes subtree A
   * - mds.1 authpins an item in subtree B
   * - mds.1 sends request to mds.0 to authpin an item in subtree A
   * - mds.1 freezes subtree B
   * - mds.1 receives the remote authpin request from mds.0
   *   (wait because subtree B is freezing)
   * - mds.0 receives the remote authpin request from mds.1
   *   (wait because subtree A is freezing)
   *
   *
   * - client request authpins items in subtree B
   * - freeze subtree B
   * - import subtree A which is parent of subtree B
   *   (authpins parent inode of subtree B, see CDir::set_dir_auth())
   * - freeze subtree A
   * - client request tries authpinning items in subtree A
   *   (wait because subtree A is freezing)
   */
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end(); ) {
    CDir* dir = p->first;
    export_state_t& stat = p->second;
    ++p;   // advance first: export_try_cancel() below may erase this entry
    if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
      continue;
    if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
      // pins still draining; reset the staleness clock
      stat.last_cum_auth_pins = dir->get_cum_auth_pins();
      stat.last_cum_auth_pins_change = now;
      continue;
    }
    if (stat.last_cum_auth_pins_change >= cutoff)
      continue;
    if (stat.num_remote_waiters > 0 ||
        (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      export_try_cancel(dir);
    }
  }
}
261
/*
 * Abort an in-progress export of 'dir', unwinding however much of the
 * export state machine has already run.  Which cleanup steps apply
 * depends on export_state_t::state:
 *  - up through PREPPING we can cancel outright (-> EXPORT_CANCELLED);
 *  - from WARNING on, bystanders were told, so we must gather their
 *    notify acks first (-> EXPORT_CANCELLING);
 *  - once EXPORTING, the data was sent and must be reversed;
 *  - after LOGGINGFINISH the export effectively succeeded: do nothing.
 * If notify_peer is set (and the peer is reachable), an
 * MExportDirCancel is sent so the importer cleans up too.
 */
void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
{
  dout(10) << "export_try_cancel " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  assert(it != export_state.end());

  int state = it->second.state;
  switch (state) {
  case EXPORT_LOCKING:
    dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->auth_unpin(this);
    break;
  case EXPORT_DISCOVERING:
    dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    dir->auth_unpin(this);
    if (notify_peer &&
        (!mds->is_cluster_degraded() ||
         mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_FREEZING:
    dout(10) << "export state=freezing : canceling freeze" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    // the freeze may have forced dir into being a subtree root; undo that
    if (dir->is_subtree_root())
      cache->try_subtree_merge(dir);
    if (notify_peer &&
        (!mds->is_cluster_degraded() ||
         mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

    // NOTE: state order reversal, warning comes after prepping
  case EXPORT_WARNING:
    dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
    // bystanders were warned; we must collect their notify acks before
    // the entry can be erased (see handle_export_notify_ack)
    it->second.state = EXPORT_CANCELLING;
    // fall-thru

  case EXPORT_PREPPING:
    if (state != EXPORT_WARNING) {
      dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
      it->second.state = EXPORT_CANCELLED;
    }

    {
      // unpin bounds
      set<CDir*> bounds;
      cache->get_subtree_bounds(dir, bounds);
      for (set<CDir*>::iterator q = bounds.begin();
           q != bounds.end();
           ++q) {
        CDir *bd = *q;
        bd->put(CDir::PIN_EXPORTBOUND);
        bd->state_clear(CDir::STATE_EXPORTBOUND);
      }
      if (state == EXPORT_WARNING) {
        // notify bystanders
        export_notify_abort(dir, bounds);
        // process delayed expires
        cache->process_delayed_expire(dir);
      }
    }
    dir->unfreeze_tree();
    cache->try_subtree_merge(dir);
    if (notify_peer &&
        (!mds->is_cluster_degraded() ||
         mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_EXPORTING:
    dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
    it->second.state = EXPORT_CANCELLING;
    export_reverse(dir);
    break;

  case EXPORT_LOGGINGFINISH:
  case EXPORT_NOTIFYING:
    dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
    // leave export_state, don't clean up now.
    break;
  case EXPORT_CANCELLING:
    break;

  default:
    ceph_abort();
  }

  // finish clean-up?
  if (it->second.state == EXPORT_CANCELLING ||
      it->second.state == EXPORT_CANCELLED) {
    MutationRef mut;
    mut.swap(it->second.mut);

    if (it->second.state == EXPORT_CANCELLED) {
      export_state.erase(it);
      dir->state_clear(CDir::STATE_EXPORTING);
      // send pending import_maps?
      cache->maybe_send_pending_resolves();
    }

    // drop locks
    if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
      // the early states hold their locks via an internal MDRequest
      MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
      assert(mdr);
      if (mdr->more()->waiting_on_slave.empty())
        mds->mdcache->request_finish(mdr);
    } else if (mut) {
      mds->locker->drop_locks(mut.get());
      mut->cleanup();
    }

    cache->show_subtrees();

    maybe_do_queued_export();
  }
}
384
/*
 * Final step of a canceled export once all bystander notify acks have
 * been gathered (EXPORT_CANCELLING).  Callers must already have erased
 * the export_state entry for 'dir'.
 */
void Migrator::export_cancel_finish(CDir *dir)
{
  assert(dir->state_test(CDir::STATE_EXPORTING));
  dir->state_clear(CDir::STATE_EXPORTING);

  // pinned by Migrator::export_notify_abort()
  dir->auth_unpin(this);
  // send pending import_maps? (these need to go out when all exports have finished.)
  cache->maybe_send_pending_resolves();
}
395
396 // ==========================================================
397 // mds failure handling
398
/*
 * Clean up any export/import state that involves mds 'who', which has
 * failed or is stopping.  Exports going to 'who' (or not yet frozen)
 * are canceled; where 'who' was merely a bystander, we fake the
 * warning/notify acks it will never send.  Imports from 'who' are
 * reversed, noted as ambiguous, or finished depending on how far
 * they had progressed.
 */
void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
{
  dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;

  // check my exports

  // first add an extra auth_pin on any freezes, so that canceling a
  // nested freeze doesn't complete one further up the hierarchy and
  // confuse the shit out of us. we'll remove it after canceling the
  // freeze. this way no freeze completions run before we want them
  // to.
  list<CDir*> pinned_dirs;
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end();
       ++p) {
    if (p->second.state == EXPORT_FREEZING) {
      CDir *dir = p->first;
      dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
      dir->auth_pin(this);
      pinned_dirs.push_back(dir);
    }
  }

  map<CDir*,export_state_t>::iterator p = export_state.begin();
  while (p != export_state.end()) {
    map<CDir*,export_state_t>::iterator next = p;
    ++next;   // taken now because the handlers below may erase p
    CDir *dir = p->first;

    // abort exports:
    //  - that are going to the failed node
    //  - that aren't frozen yet (to avoid auth_pin deadlock)
    //  - they haven't prepped yet (they may need to discover bounds to do that)
    if ((p->second.peer == who &&
         p->second.state != EXPORT_CANCELLING) ||
        p->second.state == EXPORT_LOCKING ||
        p->second.state == EXPORT_DISCOVERING ||
        p->second.state == EXPORT_FREEZING ||
        p->second.state == EXPORT_PREPPING) {
      // the guy i'm exporting to failed, or we're just freezing.
      dout(10) << "cleaning up export state (" << p->second.state << ")"
               << get_export_statename(p->second.state) << " of " << *dir << dendl;
      export_try_cancel(dir);
    } else if (p->second.peer != who) {
      // bystander failed.
      if (p->second.warning_ack_waiting.erase(who)) {
        if (p->second.state == EXPORT_WARNING) {
          p->second.notify_ack_waiting.erase(who);   // they won't get a notify either.
          // exporter waiting for warning acks, let's fake theirs.
          dout(10) << "faking export_warning_ack from mds." << who
                   << " on " << *dir << " to mds." << p->second.peer
                   << dendl;
          if (p->second.warning_ack_waiting.empty())
            export_go(dir);
        }
      }
      if (p->second.notify_ack_waiting.erase(who)) {
        // exporter is waiting for notify acks, fake it
        dout(10) << "faking export_notify_ack from mds." << who
                 << " on " << *dir << " to mds." << p->second.peer
                 << dendl;
        if (p->second.state == EXPORT_NOTIFYING) {
          if (p->second.notify_ack_waiting.empty())
            export_finish(dir);
        } else if (p->second.state == EXPORT_CANCELLING) {
          if (p->second.notify_ack_waiting.empty()) {
            export_state.erase(p);
            export_cancel_finish(dir);
          }
        }
      }
    }

    // next!
    p = next;
  }


  // check my imports
  map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
  while (q != import_state.end()) {
    map<dirfrag_t,import_state_t>::iterator next = q;
    ++next;   // taken now because the handlers below may erase q
    dirfrag_t df = q->first;
    CInode *diri = mds->mdcache->get_inode(df.ino);
    CDir *dir = mds->mdcache->get_dirfrag(df);

    if (q->second.peer == who) {
      if (dir)
        dout(10) << "cleaning up import state (" << q->second.state << ")"
                 << get_import_statename(q->second.state) << " of " << *dir << dendl;
      else
        dout(10) << "cleaning up import state (" << q->second.state << ")"
                 << get_import_statename(q->second.state) << " of " << df << dendl;

      switch (q->second.state) {
      case IMPORT_DISCOVERING:
        dout(10) << "import state=discovering : clearing state" << dendl;
        import_reverse_discovering(df);
        break;

      case IMPORT_DISCOVERED:
        assert(diri);
        dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
        import_reverse_discovered(df, diri);
        break;

      case IMPORT_PREPPING:
        assert(dir);
        dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
        import_reverse_prepping(dir);
        break;

      case IMPORT_PREPPED:
        assert(dir);
        dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
        {
          set<CDir*> bounds;
          cache->get_subtree_bounds(dir, bounds);
          import_remove_pins(dir, bounds);

          // adjust auth back to the exporter
          cache->adjust_subtree_auth(dir, q->second.peer);

          // notify bystanders ; wait in aborting state
          import_state[df].state = IMPORT_ABORTING;
          import_notify_abort(dir, bounds);
          assert(g_conf->mds_kill_import_at != 10);
        }
        break;

      case IMPORT_LOGGINGSTART:
        assert(dir);
        dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
        import_reverse(dir);
        break;

      case IMPORT_ACKING:
        assert(dir);
        // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
        dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
        {
          set<CDir*> bounds;
          cache->get_subtree_bounds(dir, bounds);
          cache->add_ambiguous_import(dir, bounds);
        }
        break;

      case IMPORT_FINISHING:
        assert(dir);
        dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
        import_finish(dir, true);
        break;

      case IMPORT_ABORTING:
        assert(dir);
        dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
        break;
      }
    } else {
      auto bystanders_entry = q->second.bystanders.find(who);
      if (bystanders_entry != q->second.bystanders.end()) {
        q->second.bystanders.erase(bystanders_entry);
        if (q->second.state == IMPORT_ABORTING) {
          assert(dir);
          dout(10) << "faking export_notify_ack from mds." << who
                   << " on aborting import " << *dir << " from mds." << q->second.peer
                   << dendl;
          if (q->second.bystanders.empty())
            import_reverse_unfreeze(dir);
        }
      }
    }

    // next!
    q = next;
  }

  // drop the temporary auth_pins we took on freezing subtrees above
  while (!pinned_dirs.empty()) {
    CDir *dir = pinned_dirs.front();
    dout(10) << "removing temp auth_pin on " << *dir << dendl;
    dir->auth_unpin(this);
    pinned_dirs.pop_front();
  }
}
584
585
586
587 void Migrator::show_importing()
588 {
589 dout(10) << "show_importing" << dendl;
590 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
591 p != import_state.end();
592 ++p) {
593 CDir *dir = mds->mdcache->get_dirfrag(p->first);
594 if (dir) {
595 dout(10) << " importing from " << p->second.peer
596 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
597 << " " << p->first << " " << *dir << dendl;
598 } else {
599 dout(10) << " importing from " << p->second.peer
600 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
601 << " " << p->first << dendl;
602 }
603 }
604 }
605
606 void Migrator::show_exporting()
607 {
608 dout(10) << "show_exporting" << dendl;
609 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
610 p != export_state.end();
611 ++p)
612 dout(10) << " exporting to " << p->second.peer
613 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
614 << " " << p->first->dirfrag() << " " << *p->first << dendl;
615 }
616
617
618
619 void Migrator::audit()
620 {
621 if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
622 return; // hrm.
623
624 // import_state
625 show_importing();
626 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
627 p != import_state.end();
628 ++p) {
629 if (p->second.state == IMPORT_DISCOVERING)
630 continue;
631 if (p->second.state == IMPORT_DISCOVERED) {
632 CInode *in = cache->get_inode(p->first.ino);
633 assert(in);
634 continue;
635 }
636 CDir *dir = cache->get_dirfrag(p->first);
637 assert(dir);
638 if (p->second.state == IMPORT_PREPPING)
639 continue;
640 if (p->second.state == IMPORT_ABORTING) {
641 assert(!dir->is_ambiguous_dir_auth());
642 assert(dir->get_dir_auth().first != mds->get_nodeid());
643 continue;
644 }
645 assert(dir->is_ambiguous_dir_auth());
646 assert(dir->authority().first == mds->get_nodeid() ||
647 dir->authority().second == mds->get_nodeid());
648 }
649
650 // export_state
651 show_exporting();
652 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
653 p != export_state.end();
654 ++p) {
655 CDir *dir = p->first;
656 if (p->second.state == EXPORT_LOCKING ||
657 p->second.state == EXPORT_DISCOVERING ||
658 p->second.state == EXPORT_FREEZING ||
659 p->second.state == EXPORT_CANCELLING)
660 continue;
661 assert(dir->is_ambiguous_dir_auth());
662 assert(dir->authority().first == mds->get_nodeid() ||
663 dir->authority().second == mds->get_nodeid());
664 }
665
666 // ambiguous+me subtrees should be importing|exporting
667
668 // write me
669 }
670
671
672
673
674
675 // ==========================================================
676 // EXPORT
677
678 void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
679 {
680 // enqueue
681 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
682 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
683
684 maybe_do_queued_export();
685 }
686
687 void Migrator::maybe_do_queued_export()
688 {
689 static bool running;
690 if (running)
691 return;
692 running = true;
693 while (!export_queue.empty() &&
694 export_state.size() <= 4) {
695 dirfrag_t df = export_queue.front().first;
696 mds_rank_t dest = export_queue.front().second;
697 export_queue.pop_front();
698
699 CDir *dir = mds->mdcache->get_dirfrag(df);
700 if (!dir) continue;
701 if (!dir->is_auth()) continue;
702
703 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
704
705 export_dir(dir, dest);
706 }
707 running = false;
708 }
709
710
711
712
// Waiter queued on CDir::WAIT_FROZEN: once the subtree freeze for this
// export completes successfully (r >= 0), continue via export_frozen().
// 'tid' lets export_frozen() detect that the export was since canceled
// or restarted.
class C_MDC_ExportFreeze : public MigratorContext {
  CDir *ex;   // dir i'm exporting
  uint64_t tid;
public:
  C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
	MigratorContext(m), ex(e), tid(t) {
        assert(ex != NULL);
  }
  void finish(int r) override {
    if (r >= 0)
      mig->export_frozen(ex, tid);
  }
};
726
727
728 void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
729 {
730 // path
731 vector<CDentry*> trace;
732 cache->make_trace(trace, dir->inode);
733 for (vector<CDentry*>::iterator it = trace.begin();
734 it != trace.end();
735 ++it)
736 locks.insert(&(*it)->lock);
737
738 // prevent scatter gather race
739 locks.insert(&dir->get_inode()->dirfragtreelock);
740
741 // bound dftlocks:
742 // NOTE: We need to take an rdlock on bounding dirfrags during
743 // migration for a rather irritating reason: when we export the
744 // bound inode, we need to send scatterlock state for the dirfrags
745 // as well, so that the new auth also gets the correct info. If we
746 // race with a refragment, this info is useless, as we can't
747 // redivvy it up. And it's needed for the scatterlocks to work
748 // properly: when the auth is in a sync/lock state it keeps each
749 // dirfrag's portion in the local (auth OR replica) dirfrag.
750 set<CDir*> wouldbe_bounds;
751 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
752 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
753 locks.insert(&(*p)->get_inode()->dirfragtreelock);
754 }
755
756
// Re-runs dispatch_export_dir() after a wait (for a new MDSMap, or for
// the dir to become stable).  'count' tracks how many MDSMap epochs we
// have retried waiting for the peer to become an export target.
class C_M_ExportDirWait : public MigratorContext {
  MDRequestRef mdr;
  int count;
public:
  C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
   : MigratorContext(m), mdr(mdr), count(count) {}
  void finish(int r) override {
    mig->dispatch_export_dir(mdr, count);
  }
};
767
768
/** export_dir(dir, dest)
 * public method to initiate an export.
 * will fail if the directory is freezing, frozen, unpinnable, or root.
 *
 * On success this only *starts* the export: it records an
 * EXPORT_LOCKING entry in export_state and kicks off an internal
 * CEPH_MDS_OP_EXPORTDIR request, which continues in
 * dispatch_export_dir().  Every early return below logs its reason.
 */
void Migrator::export_dir(CDir *dir, mds_rank_t dest)
{
  dout(7) << "export_dir " << *dir << " to " << dest << dendl;
  assert(dir->is_auth());
  assert(dest != mds->get_nodeid());

  if (mds->mdcache->is_readonly()) {
    dout(7) << "read-only FS, no exports for now" << dendl;
    return;
  }
  if (!mds->mdsmap->is_active(dest)) {
    dout(7) << "dest not active, no exports for now" << dendl;
    return;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "cluster degraded, no exports for now" << dendl;
    return;
  }
  if (dir->inode->is_system()) {
    dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
    //ceph_abort();
    return;
  }

  // stray contents may only migrate toward the rank owning that straydir
  if (!dir->inode->is_base() && dir->inode->get_projected_parent_dir()->inode->is_stray() &&
      dir->inode->get_projected_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
    dout(7) << "i won't export anything in stray" << dendl;
    return;
  }

  if (dir->is_frozen() ||
      dir->is_freezing()) {
    dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
    return;
  }
  if (dir->state_test(CDir::STATE_EXPORTING)) {
    dout(7) << "already exporting" << dendl;
    return;
  }

  if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
    dout(7) << "dir is export pinned" << dendl;
    return;
  }

  if (dest == mds->get_nodeid() || !mds->mdsmap->is_up(dest)) {
    dout(7) << "cannot export: dest " << dest << " is me or is not active" << dendl;
    return;
  }

  if (g_conf->mds_thrash_exports) {
    // create random subtree bound (which will not be exported)
    list<CDir*> ls;
    for (auto p = dir->begin(); p != dir->end(); ++p) {
      auto dn = p->second;
      CDentry::linkage_t *dnl= dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	if (in->is_dir())
	  in->get_nested_dirfrags(ls);
      }
    }
    if (ls.size() > 0) {
      int n = rand() % ls.size();
      auto p = ls.begin();
      while (n--) ++p;
      CDir *bd = *p;
      if (!(bd->is_frozen() || bd->is_freezing())) {
	assert(bd->is_auth());
	dir->state_set(CDir::STATE_AUXSUBTREE);
	mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
	dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
      }
    }
  }

  mds->hit_export_target(ceph_clock_now(), dest, -1);

  // pin + mark exporting; both are released on cancel/finish
  dir->auth_pin(this);
  dir->state_set(CDir::STATE_EXPORTING);

  MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
  mdr->more()->export_dir = dir;

  assert(export_state.count(dir) == 0);
  export_state_t& stat = export_state[dir];
  stat.state = EXPORT_LOCKING;
  stat.peer = dest;
  stat.tid = mdr->reqid.tid;
  stat.mut = mdr;

  return mds->mdcache->dispatch_request(mdr);
}
866
/*
 * Continuation of export_dir(), run as the internal EXPORTDIR request:
 * re-validates the export, acquires the full lock set, then sends the
 * MExportDirDiscover to the peer and begins freezing the subtree.
 * 'count' is the number of MDSMap epochs we have already waited for
 * 'dest' to become a valid export target.
 */
void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
{
  dout(7) << "dispatch_export_dir " << *mdr << dendl;

  CDir *dir = mdr->more()->export_dir;
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
    // export must have aborted.
    dout(7) << "export must have aborted " << *mdr << dendl;
    mds->mdcache->request_finish(mdr);
    return;
  }
  assert(it->second.state == EXPORT_LOCKING);

  mds_rank_t dest = it->second.peer;

  if (!mds->is_export_target(dest)) {
    dout(7) << "dest is not yet an export target" << dendl;
    if (count > 3) {
      dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
      export_try_cancel(dir);
      return;
    }

    // drop locks/pins and retry once the next MDSMap arrives
    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();

    mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
    return;
  }

  if (!dir->inode->get_parent_dn()) {
    dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
    dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
    return;
  }

  if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
    dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
    export_try_cancel(dir);
    return;
  }

  // locks?
  set<SimpleLock*> rdlocks;
  set<SimpleLock*> xlocks;
  set<SimpleLock*> wrlocks;
  get_export_lock_set(dir, rdlocks);
  // If auth MDS of the subtree root inode is neither the exporter MDS
  // nor the importer MDS and it gathers subtree root's fragstat/neststat
  // while the subtree is exporting. It's possible that the exporter MDS
  // and the importer MDS both are auth MDS of the subtree root or both
  // are not auth MDS of the subtree root at the time they receive the
  // lock messages. So the auth MDS of the subtree root inode may get no
  // or duplicated fragstat/neststat for the subtree root dirfrag.
  wrlocks.insert(&dir->get_inode()->filelock);
  wrlocks.insert(&dir->get_inode()->nestlock);
  if (dir->get_inode()->is_auth()) {
    dir->get_inode()->filelock.set_scatter_wanted();
    dir->get_inode()->nestlock.set_scatter_wanted();
  }

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
    if (mdr->aborted)
      export_try_cancel(dir);
    return;
  }

  assert(g_conf->mds_kill_export_at != 1);
  it->second.state = EXPORT_DISCOVERING;

  // send ExportDirDiscover (ask target)
  filepath path;
  dir->inode->make_path(path);
  MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
							mds->get_nodeid(),
							it->second.tid);
  mds->send_message_mds(discover, dest);
  assert(g_conf->mds_kill_export_at != 2);

  it->second.last_cum_auth_pins_change = ceph_clock_now();

  // start the freeze, but hold it up with an auth_pin.
  dir->freeze_tree();
  assert(dir->is_freezing_tree());
  dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
}
954
/*
 * called on receipt of MExportDirDiscoverAck
 * the importer now has the directory's _inode_ in memory, and pinned.
 *
 * This function DOES put the passed message before returning
 *
 * On a live export this retires the locking MDRequest (to avoid
 * deadlock while frozen) and moves to EXPORT_FREEZING, dropping the
 * auth_pin that was holding up the freeze started in
 * dispatch_export_dir().
 */
void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  mds_rank_t dest(m->get_source().num());
  utime_t now = ceph_clock_now();
  assert(dir);

  dout(7) << "export_discover_ack from " << m->get_source()
	  << " on " << *dir << dendl;

  mds->hit_export_target(now, dest, -1);

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.tid != m->get_tid() ||
      it->second.peer != dest) {
    // stale ack: export canceled or restarted since the discover was sent
    dout(7) << "must have aborted" << dendl;
  } else {
    assert(it->second.state == EXPORT_DISCOVERING);
    // release locks to avoid deadlock
    MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
    assert(mdr);
    mds->mdcache->request_finish(mdr);
    it->second.mut.reset();
    // freeze the subtree
    it->second.state = EXPORT_FREEZING;
    dir->auth_unpin(this);   // let the pending freeze_tree() complete
    assert(g_conf->mds_kill_export_at != 3);
  }

  m->put();  // done
}
993
// Completion for the client-session flush initiated during export
// prep; continues via export_sessions_flushed().  'tid' detects a
// canceled/restarted export.
class C_M_ExportSessionsFlushed : public MigratorContext {
  CDir *dir;
  uint64_t tid;
public:
  C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
   : MigratorContext(m), dir(d), tid(t) {
    assert(dir != NULL);
  }
  void finish(int r) override {
    mig->export_sessions_flushed(dir, tid);
  }
};
1006
/*
 * Called once the client sessions have been flushed for this export:
 * clears the MDS_RANK_NONE placeholder from warning_ack_waiting, and
 * if that was the last outstanding warning ack, starts the actual
 * export via export_go().
 */
void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
{
  dout(7) << "export_sessions_flushed " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.state == EXPORT_CANCELLING ||
      it->second.tid != tid) {
    // export must have aborted.
    dout(7) << "export must have aborted on " << dir << dendl;
    return;
  }

  assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
  assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
  it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
  if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
    export_go(dir);     // start export.
}
1026
/*
 * The subtree at 'dir' is now frozen.  Grab all locks needed for the
 * export (bailing out and cancelling if any are unavailable), pin the
 * export bounds, build the MExportDirPrep message — including the
 * replicated base dirfrag and a spanning-tree trace to each bound — and
 * move the state machine to EXPORT_PREPPING.  Finally, kick off a flush
 * of all client sessions holding caps inside the subtree.
 */
void Migrator::export_frozen(CDir *dir, uint64_t tid)
{
  dout(7) << "export_frozen on " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != tid) {
    // stale callback: export was cancelled or restarted
    dout(7) << "export must have aborted" << dendl;
    return;
  }

  assert(it->second.state == EXPORT_FREEZING);
  assert(dir->is_frozen_tree_root());
  assert(dir->get_cum_auth_pins() == 0);

  CInode *diri = dir->get_inode();

  // ok, try to grab all my locks.
  set<SimpleLock*> rdlocks;
  get_export_lock_set(dir, rdlocks);
  if ((diri->is_auth() && diri->is_frozen()) ||
      !mds->locker->can_rdlock_set(rdlocks) ||
      !diri->filelock.can_wrlock(-1) ||
      !diri->nestlock.can_wrlock(-1)) {
    dout(7) << "export_dir couldn't acquire all needed locks, failing. "
	    << *dir << dendl;
    // .. unwind ..
    dir->unfreeze_tree();
    cache->try_subtree_merge(dir);

    // tell the peer we gave up, and forget the export
    mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    export_state.erase(it);

    dir->state_clear(CDir::STATE_EXPORTING);
    cache->maybe_send_pending_resolves();
    return;
  }

  // take the locks for the duration of the export
  it->second.mut = new MutationImpl();
  if (diri->is_auth())
    it->second.mut->auth_pin(diri);
  mds->locker->rdlock_take_set(rdlocks, it->second.mut);
  mds->locker->wrlock_force(&diri->filelock, it->second.mut);
  mds->locker->wrlock_force(&diri->nestlock, it->second.mut);

  cache->show_subtrees();

  // CDir::_freeze_tree() should have forced it into subtree.
  assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
  // note the bounds.
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // generate prep message, log entry.
  MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);

  // include list of bystanders
  for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
       p != dir->replicas_end();
       ++p) {
    if (p->first != it->second.peer) {
      dout(10) << "bystander mds." << p->first << dendl;
      prep->add_bystander(p->first);
    }
  }

  // include base dirfrag
  cache->replicate_dir(dir, it->second.peer, prep->basedir);

  /*
   * include spanning tree for all nested exports.
   * these need to be on the destination _before_ the final export so that
   * dir_auth updates on any nested exports are properly absorbed.
   * this includes inodes and dirfrags included in the subtree, but
   * only the inodes at the bounds.
   *
   * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
   */
  set<inodeno_t> inodes_added;
  set<dirfrag_t> dirfrags_added;

  // check bounds
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bound = *p;

    // pin it.
    bound->get(CDir::PIN_EXPORTBOUND);
    bound->state_set(CDir::STATE_EXPORTBOUND);

    dout(7) << " export bound " << *bound << dendl;
    prep->add_bound( bound->dirfrag() );

    // trace to bound: built bottom-up by *prepending* each ancestor,
    // so the receiver decodes it top-down
    bufferlist tracebl;
    CDir *cur = bound;

    char start = '-';  // '-' = trace is empty (bound's inode already sent)
    while (1) {
      // don't repeat inodes
      if (inodes_added.count(cur->inode->ino()))
	break;
      inodes_added.insert(cur->inode->ino());

      // prepend dentry + inode
      assert(cur->inode->is_auth());
      bufferlist bl;
      cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
      dout(7) << " added " << *cur->inode->parent << dendl;
      cache->replicate_inode(cur->inode, it->second.peer, bl,
			     mds->mdsmap->get_up_features());
      dout(7) << " added " << *cur->inode << dendl;
      bl.claim_append(tracebl);
      tracebl.claim(bl);

      cur = cur->get_parent_dir();

      // don't repeat dirfrags
      if (dirfrags_added.count(cur->dirfrag()) ||
	  cur == dir) {
	start = 'd';  // start with dentry
	break;
      }
      dirfrags_added.insert(cur->dirfrag());

      // prepend dir
      cache->replicate_dir(cur, it->second.peer, bl);
      dout(7) << " added " << *cur << dendl;
      bl.claim_append(tracebl);
      tracebl.claim(bl);

      start = 'f';  // start with dirfrag
    }
    // final trace = starting dirfrag + start marker + accumulated path
    bufferlist final_bl;
    dirfrag_t df = cur->dirfrag();
    ::encode(df, final_bl);
    ::encode(start, final_bl);
    final_bl.claim_append(tracebl);
    prep->add_trace(final_bl);
  }

  // send.
  it->second.state = EXPORT_PREPPING;
  mds->send_message_mds(prep, it->second.peer);
  assert (g_conf->mds_kill_export_at != 4);  // debug kill point

  // make sure any new instantiations of caps are flushed out
  assert(it->second.warning_ack_waiting.empty());

  set<client_t> export_client_set;
  get_export_client_set(dir, export_client_set);

  MDSGatherBuilder gather(g_ceph_context);
  mds->server->flush_client_sessions(export_client_set, gather);
  if (gather.has_subs()) {
    // session flush in flight: track it as a pseudo warning-ack from
    // MDS_RANK_NONE (cleared in export_sessions_flushed)
    it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
    gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
    gather.activate();
  }
}
1187
1188 void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
1189 {
1190 list<CDir*> dfs;
1191 dfs.push_back(dir);
1192 while (!dfs.empty()) {
1193 CDir *dir = dfs.front();
1194 dfs.pop_front();
1195 for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p) {
1196 CDentry *dn = p->second;
1197 if (!dn->get_linkage()->is_primary())
1198 continue;
1199 CInode *in = dn->get_linkage()->get_inode();
1200 if (in->is_dir()) {
1201 // directory?
1202 list<CDir*> ls;
1203 in->get_dirfrags(ls);
1204 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
1205 if (!(*q)->state_test(CDir::STATE_EXPORTBOUND)) {
1206 // include nested dirfrag
1207 assert((*q)->get_dir_auth().first == CDIR_AUTH_PARENT);
1208 dfs.push_back(*q); // it's ours, recurse (later)
1209 }
1210 }
1211 }
1212 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1213 q != in->client_caps.end();
1214 ++q)
1215 client_set.insert(q->first);
1216 }
1217 }
1218 }
1219
1220 void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1221 {
1222 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1223 q != in->client_caps.end();
1224 ++q)
1225 client_set.insert(q->first);
1226 }
1227
1228 /* This function DOES put the passed message before returning*/
1229 void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
1230 {
1231 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1232 mds_rank_t dest(m->get_source().num());
1233 utime_t now = ceph_clock_now();
1234 assert(dir);
1235
1236 dout(7) << "export_prep_ack " << *dir << dendl;
1237
1238 mds->hit_export_target(now, dest, -1);
1239
1240 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1241 if (it == export_state.end() ||
1242 it->second.tid != m->get_tid() ||
1243 it->second.peer != mds_rank_t(m->get_source().num())) {
1244 // export must have aborted.
1245 dout(7) << "export must have aborted" << dendl;
1246 m->put();
1247 return;
1248 }
1249 assert(it->second.state == EXPORT_PREPPING);
1250
1251 if (!m->is_success()) {
1252 dout(7) << "peer couldn't acquire all needed locks, canceling" << dendl;
1253 export_try_cancel(dir, false);
1254 m->put();
1255 return;
1256 }
1257
1258 assert (g_conf->mds_kill_export_at != 5);
1259 // send warnings
1260 set<CDir*> bounds;
1261 cache->get_subtree_bounds(dir, bounds);
1262
1263 assert(it->second.warning_ack_waiting.empty() ||
1264 (it->second.warning_ack_waiting.size() == 1 &&
1265 it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
1266 assert(it->second.notify_ack_waiting.empty());
1267
1268 for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
1269 p != dir->replicas_end();
1270 ++p) {
1271 if (p->first == it->second.peer) continue;
1272 if (mds->is_cluster_degraded() &&
1273 !mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first))
1274 continue; // only if active
1275 it->second.warning_ack_waiting.insert(p->first);
1276 it->second.notify_ack_waiting.insert(p->first); // we'll eventually get a notifyack, too!
1277
1278 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
1279 mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1280 mds_authority_t(mds->get_nodeid(),it->second.peer));
1281 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
1282 notify->get_bounds().push_back((*q)->dirfrag());
1283 mds->send_message_mds(notify, p->first);
1284
1285 }
1286
1287 it->second.state = EXPORT_WARNING;
1288
1289 assert(g_conf->mds_kill_export_at != 6);
1290 // nobody to warn?
1291 if (it->second.warning_ack_waiting.empty())
1292 export_go(dir); // start export.
1293
1294 // done.
1295 m->put();
1296 }
1297
1298
1299 class C_M_ExportGo : public MigratorContext {
1300 CDir *dir;
1301 uint64_t tid;
1302 public:
1303 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1304 MigratorContext(m), dir(d), tid(t) {
1305 assert(dir != NULL);
1306 }
1307 void finish(int r) override {
1308 mig->export_go_synced(dir, tid);
1309 }
1310 };
1311
1312 void Migrator::export_go(CDir *dir)
1313 {
1314 assert(export_state.count(dir));
1315 dout(7) << "export_go " << *dir << " to " << export_state[dir].peer << dendl;
1316
1317 // first sync log to flush out e.g. any cap imports
1318 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, export_state[dir].tid));
1319 mds->mdlog->flush();
1320 }
1321
/*
 * Journal is flushed; actually perform the export.  Re-checks that the
 * export (tid) is still live, flips the subtree to ambiguous auth
 * (me, dest), encodes the entire subtree plus client map into an
 * MExportDir message, and moves to EXPORT_EXPORTING.
 */
void Migrator::export_go_synced(CDir *dir, uint64_t tid)
{
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.state == EXPORT_CANCELLING ||
      it->second.tid != tid) {
    // export must have aborted.
    dout(7) << "export must have aborted on " << dir << dendl;
    return;
  }
  assert(it->second.state == EXPORT_WARNING);
  mds_rank_t dest = it->second.peer;

  dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;

  cache->show_subtrees();

  it->second.state = EXPORT_EXPORTING;
  assert(g_conf->mds_kill_export_at != 7);  // debug kill point

  assert(dir->is_frozen_tree_root());
  assert(dir->get_cum_auth_pins() == 0);

  // set ambiguous auth
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);

  // take away the popularity we're sending.
  utime_t now = ceph_clock_now();
  mds->balancer->subtract_export(dir, now);

  // fill export message with cache data
  MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
  map<client_t,entity_inst_t> exported_client_map;
  uint64_t num_exported_inodes = encode_export_dir(req->export_data,
					      dir,   // recur start point
					      exported_client_map,
					      now);
  ::encode(exported_client_map, req->client_map,
	   mds->mdsmap->get_up_features());

  // add bounds to message
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p)
    req->add_export((*p)->dirfrag());

  // send
  mds->send_message_mds(req, dest);
  assert(g_conf->mds_kill_export_at != 8);  // debug kill point

  // record outgoing work against the target (inodes + the dir itself)
  mds->hit_export_target(now, dest, num_exported_inodes+1);

  // stats
  if (mds->logger) mds->logger->inc(l_mds_exported);
  if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);

  cache->show_subtrees();
}
1382
1383
/** encode_export_inode
 * update our local state for this inode to export.
 * encode relevant state to be sent over the wire.
 * used by: encode_export_dir, file_rename (if foreign)
 *
 * FIXME: the separation between CInode.encode_export and these methods
 * is pretty arbitrary and dumb.
 *
 * @param in                  inode to export (must not already be a replica here)
 * @param enc_state           bufferlist the wire encoding is appended to
 * @param exported_client_map out: client -> entity_inst_t for each exported cap holder
 */
void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
				   map<client_t,entity_inst_t>& exported_client_map)
{
  dout(7) << "encode_export_inode " << *in << dendl;
  assert(!in->is_replica(mds->get_nodeid()));

  // relax locks?
  if (!in->is_replicated()) {
    in->replicate_relax_locks();
    dout(20) << " did replicate_relax_locks, now " << *in << dendl;
  }

  // wire order matters: ino, snapid 'last', then the CInode payload
  ::encode(in->inode.ino, enc_state);
  ::encode(in->last, enc_state);
  in->encode_export(enc_state);

  // caps
  encode_export_inode_caps(in, true, enc_state, exported_client_map);
}
1411
/*
 * Encode the client capabilities of 'in' for the export payload.
 *
 * @param auth_cap            true when we export the authoritative caps; also
 *                            encodes mds_caps_wanted and pins the inode with
 *                            EXPORTINGCAPS until finish_export_inode_caps
 * @param bl                  bufferlist the cap encoding is appended to
 * @param exported_client_map out: entity_inst_t for every client named here
 */
void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
					map<client_t,entity_inst_t>& exported_client_map)
{
  dout(20) << "encode_export_inode_caps " << *in << dendl;

  // encode caps
  map<client_t,Capability::Export> cap_map;
  in->export_client_caps(cap_map);
  ::encode(cap_map, bl);
  if (auth_cap) {
    ::encode(in->get_mds_caps_wanted(), bl);

    // pin until the export completes (or is reversed)
    in->state_set(CInode::STATE_EXPORTINGCAPS);
    in->get(CInode::PIN_EXPORTINGCAPS);
  }

  // make note of clients named by exported capabilities
  for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
       it != in->client_caps.end();
       ++it)
    exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
}
1434
/*
 * The peer has imported the caps: unpin the inode and tell every client
 * that its caps for this inode migrated, including the peer-side cap
 * ids/seqs so the client can match them up.
 *
 * @param peer          rank that imported the caps
 * @param peer_imported per-client cap import info reported back by the peer
 */
void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
					map<client_t,Capability::Import>& peer_imported)
{
  dout(20) << "finish_export_inode_caps " << *in << dendl;

  // drop the pin taken in encode_export_inode_caps
  in->state_clear(CInode::STATE_EXPORTINGCAPS);
  in->put(CInode::PIN_EXPORTINGCAPS);

  // tell (all) clients about migrating caps..
  for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
       it != in->client_caps.end();
       ++it) {
    Capability *cap = it->second;
    dout(7) << "finish_export_inode_caps telling client." << it->first
	    << " exported caps on " << *in << dendl;
    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
				     cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());

    // attach the importing side's cap identity so the client can match it
    map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
    assert(q != peer_imported.end());
    m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, peer, 0);
    mds->send_message_client_counted(m, it->first);
  }
  in->clear_client_caps_after_export();
  mds->locker->eval(in, CEPH_CAP_LOCKS);
}
1461
/*
 * Finalize local state for an exported inode: demote it from auth to
 * replica (lock twiddles, clear dirty/replica state), collect its
 * waiters, and finish the cap export.
 *
 * @param finished out: contexts for all waiters on this inode, to be
 *                 queued by the caller once the whole export finishes
 */
void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
				   map<client_t,Capability::Import>& peer_imported,
				   list<MDSInternalContextBase*>& finished)
{
  dout(12) << "finish_export_inode " << *in << dendl;

  // clean
  if (in->is_dirty())
    in->mark_clean();

  // clear/unpin cached_by (we're no longer the authority)
  in->clear_replica_map();

  // twiddle lock states for auth -> replica transition
  in->authlock.export_twiddle();
  in->linklock.export_twiddle();
  in->dirfragtreelock.export_twiddle();
  in->filelock.export_twiddle();
  in->nestlock.export_twiddle();
  in->xattrlock.export_twiddle();
  in->snaplock.export_twiddle();
  in->flocklock.export_twiddle();
  in->policylock.export_twiddle();

  // mark auth
  assert(in->is_auth());
  in->state_clear(CInode::STATE_AUTH);
  in->replica_nonce = CInode::EXPORT_NONCE;

  in->clear_dirty_rstat();

  // no more auth subtree? clear scatter dirty
  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
    in->clear_scatter_dirty();

  // we no longer track this inode's open files
  in->item_open_file.remove_myself();

  in->clear_dirty_parent();

  in->clear_file_locks();

  // waiters
  in->take_waiting(CInode::WAIT_ANY_MASK, finished);

  in->finish_export(now);

  finish_export_inode_caps(in, peer, peer_imported);
}
1510
/*
 * Recursively encode a dirfrag and its dentries/inodes into the export
 * payload.  Nested dirfrags inside the subtree are queued and encoded
 * after the current frag; export bounds are not descended into.
 *
 * Wire format per frag: dirfrag_t, CDir payload, dentry count, then for
 * each dentry: name, snapid, dentry payload, and one of
 *   "N" (null), "L" + remote link info, or "I" + full inode export.
 *
 * @return number of inodes exported (primary dentries), recursively
 */
uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
				CDir *dir,
				map<client_t,entity_inst_t>& exported_client_map,
				utime_t now)
{
  uint64_t num_exported = 0;

  dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;

  assert(dir->get_projected_version() == dir->get_version());

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  // dir
  dirfrag_t df = dir->dirfrag();
  ::encode(df, exportbl);
  dir->encode_export(exportbl);

  __u32 nden = dir->items.size();
  ::encode(nden, exportbl);

  // dentries
  list<CDir*> subdirs;
  CDir::map_t::iterator it;
  for (it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    CInode *in = dn->get_linkage()->get_inode();

    if (!dn->is_replicated())
      dn->lock.replicate_relax();

    num_exported++;

    // -- dentry
    dout(7) << "encode_export_dir exporting " << *dn << dendl;

    // dn name
    ::encode(dn->name, exportbl);
    ::encode(dn->last, exportbl);

    // state
    dn->encode_export(exportbl);

    // points to...

    // null dentry?
    if (dn->get_linkage()->is_null()) {
      exportbl.append("N", 1);  // null dentry
      continue;
    }

    if (dn->get_linkage()->is_remote()) {
      // remote link
      exportbl.append("L", 1);  // remote link

      inodeno_t ino = dn->get_linkage()->get_remote_ino();
      unsigned char d_type = dn->get_linkage()->get_remote_d_type();
      ::encode(ino, exportbl);
      ::encode(d_type, exportbl);
      continue;
    }

    // primary link
    // -- inode
    exportbl.append("I", 1);    // inode dentry

    encode_export_inode(in, exportbl, exported_client_map);  // encode, and (update state for) export

    // directory?
    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
      CDir *t = *p;
      if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
	// include nested dirfrag
	assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
	subdirs.push_back(t);  // it's ours, recurse (later)
      }
    }
  }

  // subdirs
  for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
    num_exported += encode_export_dir(exportbl, *it, exported_client_map, now);

  return num_exported;
}
1601
/*
 * Recursively finalize local state for an exported dirfrag: demote it
 * from auth to replica, collect waiters, finish each dentry/inode, and
 * recurse into nested dirfrags.
 *
 * @param peer_imported per-inode, per-client cap import info from the peer
 * @param finished      out: waiter contexts to queue when the export completes
 * @param num_dentries  out: running count of dentries processed (for cache trim)
 */
void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
				 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
				 list<MDSInternalContextBase*>& finished, int *num_dentries)
{
  dout(10) << "finish_export_dir " << *dir << dendl;

  // release open_by
  dir->clear_replica_map();

  // mark
  assert(dir->is_auth());
  dir->state_clear(CDir::STATE_AUTH);
  dir->remove_bloom();
  dir->replica_nonce = CDir::EXPORT_NONCE;

  if (dir->is_dirty())
    dir->mark_clean();

  // suck up all waiters
  dir->take_waiting(CDir::WAIT_ANY_MASK, finished);    // all dir waiters

  // pop
  dir->finish_export(now);

  // dentries
  list<CDir*> subdirs;
  CDir::map_t::iterator it;
  for (it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    CInode *in = dn->get_linkage()->get_inode();

    // dentry
    dn->finish_export();

    // inode?
    if (dn->get_linkage()->is_primary()) {
      finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);

      // subdirs?
      in->get_nested_dirfrags(subdirs);
    }

    cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
    ++(*num_dentries);
  }

  // subdirs
  for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
    finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
}
1652
1653 class C_MDS_ExportFinishLogged : public MigratorLogContext {
1654 CDir *dir;
1655 public:
1656 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1657 void finish(int r) override {
1658 mig->export_logged_finish(dir);
1659 }
1660 };
1661
1662
1663 /*
1664 * i should get an export_ack from the export target.
1665 *
1666 * This function DOES put the passed message before returning
1667 */
1668 void Migrator::handle_export_ack(MExportDirAck *m)
1669 {
1670 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1671 mds_rank_t dest(m->get_source().num());
1672 utime_t now = ceph_clock_now();
1673 assert(dir);
1674 assert(dir->is_frozen_tree_root()); // i'm exporting!
1675
1676 // yay!
1677 dout(7) << "handle_export_ack " << *dir << dendl;
1678
1679 mds->hit_export_target(now, dest, -1);
1680
1681 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1682 assert(it != export_state.end());
1683 assert(it->second.state == EXPORT_EXPORTING);
1684 assert(it->second.tid == m->get_tid());
1685
1686 bufferlist::iterator bp = m->imported_caps.begin();
1687 ::decode(it->second.peer_imported, bp);
1688
1689 it->second.state = EXPORT_LOGGINGFINISH;
1690 assert (g_conf->mds_kill_export_at != 9);
1691 set<CDir*> bounds;
1692 cache->get_subtree_bounds(dir, bounds);
1693
1694 // log completion.
1695 // include export bounds, to ensure they're in the journal.
1696 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
1697 mds->mdlog->start_entry(le);
1698
1699 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
1700 le->metablob.add_dir(dir, false);
1701 for (set<CDir*>::iterator p = bounds.begin();
1702 p != bounds.end();
1703 ++p) {
1704 CDir *bound = *p;
1705 le->get_bounds().insert(bound->dirfrag());
1706 le->metablob.add_dir_context(bound);
1707 le->metablob.add_dir(bound, false);
1708 }
1709
1710 // list us second, them first.
1711 // this keeps authority().first in sync with subtree auth state in the journal.
1712 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1713
1714 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1715 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1716 mds->mdlog->flush();
1717 assert (g_conf->mds_kill_export_at != 10);
1718
1719 m->put();
1720 }
1721
1722 void Migrator::export_notify_abort(CDir *dir, set<CDir*>& bounds)
1723 {
1724 dout(7) << "export_notify_abort " << *dir << dendl;
1725
1726 export_state_t& stat = export_state[dir];
1727 assert(stat.state == EXPORT_CANCELLING);
1728
1729 if (stat.notify_ack_waiting.empty()) {
1730 stat.state = EXPORT_CANCELLED;
1731 return;
1732 }
1733
1734 dir->auth_pin(this);
1735
1736 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1737 p != stat.notify_ack_waiting.end();
1738 ++p) {
1739 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(),stat.tid, true,
1740 pair<int,int>(mds->get_nodeid(),stat.peer),
1741 pair<int,int>(mds->get_nodeid(),CDIR_AUTH_UNKNOWN));
1742 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1743 notify->get_bounds().push_back((*i)->dirfrag());
1744 mds->send_message_mds(notify, *p);
1745 }
1746 }
1747
/*
 * this happens if the dest fails after i send the export data but before it is acked
 * that is, we don't know they safely received and logged it, so we reverse our changes
 * and go on.
 */
void Migrator::export_reverse(CDir *dir)
{
  dout(7) << "export_reverse " << *dir << dendl;

  // inodes whose stale caps need re-evaluation after the rollback
  set<CInode*> to_eval;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove exporting pins
  list<CDir*> rq;
  rq.push_back(dir);
  while (!rq.empty()) {
    CDir *t = rq.front();
    rq.pop_front();
    t->abort_export();
    for (CDir::map_t::iterator p = t->items.begin(); p != t->items.end(); ++p) {
      // undo per-dentry and per-inode export state
      p->second->abort_export();
      if (!p->second->get_linkage()->is_primary())
	continue;
      CInode *in = p->second->get_linkage()->get_inode();
      in->abort_export();
      if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
	in->state_clear(CInode::STATE_EVALSTALECAPS);
	to_eval.insert(in);
      }
      if (in->is_dir())
	in->get_nested_dirfrags(rq);  // walk the whole subtree
    }
  }

  // unpin bounds
  for (const auto &bd : bounds) {
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  // notify bystanders
  export_notify_abort(dir, bounds);

  // unfreeze tree, with possible subtree merge.
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());

  // process delayed expires
  cache->process_delayed_expire(dir);

  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // revoke/resume stale caps
  for (auto in : to_eval) {
    bool need_issue = false;
    for (auto& p : in->get_client_caps()) {
      Capability *cap = p.second;
      if (cap->is_stale()) {
	mds->locker->revoke_stale_caps(cap);
      } else {
	need_issue = true;
      }
    }
    // reissue unless the lock eval already did it
    if (need_issue &&
	(!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
      mds->locker->issue_caps(in);
  }

  cache->show_cache();
}
1820
1821
/*
 * once i get the ack, and logged the EExportFinish(true),
 * send notifies (if any), otherwise go straight to finish.
 *
 * Bystanders are told the new authority is (peer, unknown); once all of
 * them ack, export_finish() runs.  The peer is also told (non-final
 * MExportDirFinish) so it can start issuing caps to clients.
 */
void Migrator::export_logged_finish(CDir *dir)
{
  dout(7) << "export_logged_finish " << *dir << dendl;

  export_state_t& stat = export_state[dir];

  // send notifies
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
       p != stat.notify_ack_waiting.end();
       ++p) {
    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
						    pair<int,int>(mds->get_nodeid(), stat.peer),
						    pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));

    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());

    mds->send_message_mds(notify, *p);
  }

  // wait for notifyacks
  stat.state = EXPORT_NOTIFYING;
  assert (g_conf->mds_kill_export_at != 11);  // debug kill point

  // no notifies to wait for?
  if (stat.notify_ack_waiting.empty()) {
    export_finish(dir);  // skip notify/notify_ack stage.
  } else {
    // notify peer to send cap import messages to clients
    if (!mds->is_cluster_degraded() ||
	mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
      mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
    } else {
      dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
    }
  }
}
1867
1868 /*
1869 * warning:
1870 * i'll get an ack from each bystander.
1871 * when i get them all, do the export.
1872 * notify:
1873 * i'll get an ack from each bystander.
1874 * when i get them all, unfreeze and send the finish.
1875 *
1876 * This function DOES put the passed message before returning
1877 */
1878 void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
1879 {
1880 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1881 mds_rank_t dest(m->get_source().num());
1882 utime_t now = ceph_clock_now();
1883 assert(dir);
1884 mds_rank_t from = mds_rank_t(m->get_source().num());
1885
1886 mds->hit_export_target(now, dest, -1);
1887
1888 auto export_state_entry = export_state.find(dir);
1889 if (export_state_entry != export_state.end()) {
1890 export_state_t& stat = export_state_entry->second;
1891 if (stat.state == EXPORT_WARNING &&
1892 stat.warning_ack_waiting.erase(from)) {
1893 // exporting. process warning.
1894 dout(7) << "handle_export_notify_ack from " << m->get_source()
1895 << ": exporting, processing warning on " << *dir << dendl;
1896 if (stat.warning_ack_waiting.empty())
1897 export_go(dir); // start export.
1898 } else if (stat.state == EXPORT_NOTIFYING &&
1899 stat.notify_ack_waiting.erase(from)) {
1900 // exporting. process notify.
1901 dout(7) << "handle_export_notify_ack from " << m->get_source()
1902 << ": exporting, processing notify on " << *dir << dendl;
1903 if (stat.notify_ack_waiting.empty())
1904 export_finish(dir);
1905 } else if (stat.state == EXPORT_CANCELLING &&
1906 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
1907 stat.notify_ack_waiting.erase(from)) {
1908 dout(7) << "handle_export_notify_ack from " << m->get_source()
1909 << ": cancelling export, processing notify on " << *dir << dendl;
1910 if (stat.notify_ack_waiting.empty()) {
1911 export_state.erase(export_state_entry);
1912 export_cancel_finish(dir);
1913 }
1914 }
1915 }
1916 else {
1917 auto import_state_entry = import_state.find(dir->dirfrag());
1918 if (import_state_entry != import_state.end()) {
1919 import_state_t& stat = import_state_entry->second;
1920 if (stat.state == IMPORT_ABORTING) {
1921 // reversing import
1922 dout(7) << "handle_export_notify_ack from " << m->get_source()
1923 << ": aborting import on " << *dir << dendl;
1924 assert(stat.bystanders.count(from));
1925 stat.bystanders.erase(from);
1926 if (stat.bystanders.empty())
1927 import_reverse_unfreeze(dir);
1928 }
1929 }
1930 }
1931
1932 m->put();
1933 }
1934
/*
 * All notify acks are in: complete the export.  Sends the final
 * MExportDirFinish to the peer, demotes the entire subtree to replica
 * state, unpins bounds, unfreezes (possibly merging subtrees), drops
 * the export-held locks, and trims the now-replicated dentries.
 */
void Migrator::export_finish(CDir *dir)
{
  dout(5) << "export_finish " << *dir << dendl;

  assert (g_conf->mds_kill_export_at != 12);  // debug kill point
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end()) {
    dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
    return;
  }

  // send finish/commit to new auth
  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
    mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
  } else {
    dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
  }
  assert(g_conf->mds_kill_export_at != 13);  // debug kill point

  // finish export (adjust local cache state)
  int num_dentries = 0;
  list<MDSInternalContextBase*> finished;
  finish_export_dir(dir, ceph_clock_now(), it->second.peer,
		    it->second.peer_imported, finished, &num_dentries);

  assert(!dir->is_auth());
  cache->adjust_subtree_auth(dir, it->second.peer);

  // unpin bounds
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bd = *p;
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  if (dir->state_test(CDir::STATE_AUXSUBTREE))
    dir->state_clear(CDir::STATE_AUXSUBTREE);

  // discard delayed expires
  cache->discard_delayed_expire(dir);

  dout(7) << "export_finish unfreezing" << dendl;

  // unfreeze tree, with possible subtree merge.
  //  (we do this _after_ removing EXPORTBOUND pins, to allow merges)
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // no more auth subtree? clear scatter dirty
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  // keep the mutation alive past erase so we can drop its locks below
  MutationRef mut = it->second.mut;
  // remove from exporting list, clean up state
  export_state.erase(it);
  dir->state_clear(CDir::STATE_EXPORTING);

  cache->show_subtrees();
  audit();

  cache->trim(-1, num_dentries); // try trimming exported dentries

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // drop locks, unpin path
  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  maybe_do_queued_export();
}
2020
2021
2022
2023
2024
2025
2026
2027
2028 // ==========================================================
2029 // IMPORT
2030
/*
 * Importer-side handler for MExportDirDiscover: the exporter is asking
 * whether we can receive a subtree rooted at m->get_dirfrag().  Record
 * (or validate) the import state, make sure the base inode is in our
 * cache (discovering it by path if necessary), pin it, and reply with
 * MExportDirDiscoverAck.
 *
 * This function puts m before returning, except when it requeues m on
 * a waiter (wait_for_open / path_traverse), which re-delivers it later.
 */
void Migrator::handle_export_discover(MExportDirDiscover *m)
{
  mds_rank_t from = m->get_source_mds();
  assert(from != mds->get_nodeid());

  dout(7) << "handle_export_discover on " << m->get_path() << dendl;

  // note import state
  dirfrag_t df = m->get_dirfrag();
  // only start discovering on this message once.
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (!m->started) {
    // first delivery: create the import record in DISCOVERING state
    assert(it == import_state.end());
    m->started = true;
    import_state[df].state = IMPORT_DISCOVERING;
    import_state[df].peer = from;
    import_state[df].tid = m->get_tid();
  } else {
    // am i retrying after ancient path_traverse results?
    // (the import may have been canceled while this message was requeued)
    if (it == import_state.end() ||
        it->second.peer != from ||
        it->second.tid != m->get_tid()) {
      dout(7) << " dropping obsolete message" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_DISCOVERING);
  }

  if (!mds->mdcache->is_open()) {
    // requeue until the root inode is open; the waiter re-delivers m
    dout(5) << " waiting for root" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  assert (g_conf->mds_kill_import_at != 1);

  // do we have it?
  CInode *in = cache->get_inode(m->get_dirfrag().ino);
  if (!in) {
    // must discover it!
    filepath fpath(m->get_path());
    vector<CDentry*> trace;
    MDRequestRef null_ref;
    // r > 0: traverse is in flight and will re-deliver m when done
    int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
    if (r > 0) return;
    if (r < 0) {
      dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
      ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
    }

    ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
  }

  // yay
  dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;

  import_state[df].state = IMPORT_DISCOVERED;

  // pin inode in the cache (for now)
  assert(in->is_dir());
  in->get(CInode::PIN_IMPORTING);

  // reply
  dout(7) << " sending export_discover_ack on " << *in << dendl;
  mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), import_state[df].peer);
  m->put();
  assert (g_conf->mds_kill_import_at != 2);
}
2100
/*
 * Abort an import still in the DISCOVERING state: nothing has been
 * pinned yet, so just forget the import record.
 */
void Migrator::import_reverse_discovering(dirfrag_t df)
{
  import_state.erase(df);
}
2105
/*
 * Abort an import in the DISCOVERED state: drop the PIN_IMPORTING ref
 * taken on the base inode in handle_export_discover, then forget the
 * import record.
 */
void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
{
  // unpin base
  diri->put(CInode::PIN_IMPORTING);
  import_state.erase(df);
}
2112
2113 void Migrator::import_reverse_prepping(CDir *dir)
2114 {
2115 set<CDir*> bounds;
2116 cache->map_dirfrag_set(import_state[dir->dirfrag()].bound_ls, bounds);
2117 import_remove_pins(dir, bounds);
2118 import_reverse_final(dir);
2119 }
2120
2121 /* This function DOES put the passed message before returning*/
2122 void Migrator::handle_export_cancel(MExportDirCancel *m)
2123 {
2124 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2125 dirfrag_t df = m->get_dirfrag();
2126 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2127 if (it == import_state.end()) {
2128 assert(0 == "got export_cancel in weird state");
2129 } else if (it->second.state == IMPORT_DISCOVERING) {
2130 import_reverse_discovering(df);
2131 } else if (it->second.state == IMPORT_DISCOVERED) {
2132 CInode *in = cache->get_inode(df.ino);
2133 assert(in);
2134 import_reverse_discovered(df, in);
2135 } else if (it->second.state == IMPORT_PREPPING) {
2136 CDir *dir = mds->mdcache->get_dirfrag(df);
2137 assert(dir);
2138 import_reverse_prepping(dir);
2139 } else if (it->second.state == IMPORT_PREPPED) {
2140 CDir *dir = mds->mdcache->get_dirfrag(df);
2141 assert(dir);
2142 set<CDir*> bounds;
2143 cache->get_subtree_bounds(dir, bounds);
2144 import_remove_pins(dir, bounds);
2145 // adjust auth back to the exportor
2146 cache->adjust_subtree_auth(dir, it->second.peer);
2147 import_reverse_unfreeze(dir);
2148 } else {
2149 assert(0 == "got export_cancel in weird state");
2150 }
2151 m->put();
2152 }
2153
/* This function DOES put the passed message before returning*/
/*
 * Importer-side handler for MExportDirPrep: replicate the base dirfrag
 * and the traces leading to each export bound, pin/mark the bounds,
 * then (if the needed locks can be taken) note ourselves as ambiguous
 * auth for the region and freeze it.  The message may be re-delivered
 * while remote bound dirfrags are being opened; m->did_assim() makes
 * the assimilation work run only on the first pass.
 */
void Migrator::handle_export_prep(MExportDirPrep *m)
{
  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  assert(oldauth != mds->get_nodeid());

  CDir *dir;
  CInode *diri;
  list<MDSInternalContextBase*> finished;

  // assimilate root dir.
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  if (!m->did_assim()) {
    // first pass: we must still be in DISCOVERED from the discover round
    assert(it != import_state.end());
    assert(it->second.state == IMPORT_DISCOVERED);
    assert(it->second.peer == oldauth);
    diri = cache->get_inode(m->get_dirfrag().ino);
    assert(diri);
    bufferlist::iterator p = m->basedir.begin();
    dir = cache->add_replica_dir(p, diri, oldauth, finished);
    dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
  } else {
    // retry pass: the import may have been canceled while we waited
    if (it == import_state.end() ||
        it->second.peer != oldauth ||
        it->second.tid != m->get_tid()) {
      dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_PREPPING);
    assert(it->second.peer == oldauth);

    dir = cache->get_dirfrag(m->get_dirfrag());
    assert(dir);
    dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
    diri = dir->get_inode();
  }
  assert(dir->is_auth() == false);

  cache->show_subtrees();

  // build import bound map
  map<inodeno_t, fragset_t> import_bound_fragset;
  for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
       p != m->get_bounds().end();
       ++p) {
    dout(10) << " bound " << *p << dendl;
    import_bound_fragset[p->ino].insert(p->frag);
  }

  // assimilate contents?
  if (!m->did_assim()) {
    dout(7) << "doing assim on " << *dir << dendl;
    m->mark_assim();  // only do this the first time!

    // change import state
    it->second.state = IMPORT_PREPPING;
    it->second.bound_ls = m->get_bounds();
    it->second.bystanders = m->get_bystanders();
    assert(g_conf->mds_kill_import_at != 3);

    // bystander list
    dout(7) << "bystanders are " << it->second.bystanders << dendl;

    // move pin to dir
    diri->put(CInode::PIN_IMPORTING);
    dir->get(CDir::PIN_IMPORTING);
    dir->state_set(CDir::STATE_IMPORTING);

    // assimilate traces to exports
    // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
    for (list<bufferlist>::iterator p = m->traces.begin();
         p != m->traces.end();
         ++p) {
      bufferlist::iterator q = p->begin();
      dirfrag_t df;
      ::decode(df, q);
      char start;
      ::decode(start, q);
      dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;

      CDir *cur = 0;
      if (start == 'd') {
        // trace starts at a dirfrag we already have
        cur = cache->get_dirfrag(df);
        assert(cur);
        dout(10) << " had " << *cur << dendl;
      } else if (start == 'f') {
        // trace starts with a dirfrag to replicate under a cached inode
        CInode *in = cache->get_inode(df.ino);
        assert(in);
        dout(10) << " had " << *in << dendl;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      } else if (start == '-') {
        // nothing
      } else
        assert(0 == "unrecognized start char");

      while (start != '-') {
        // replicate (dentry, inode [, dir]) triples until the trace ends
        CDentry *dn = cache->add_replica_dentry(q, cur, finished);
        dout(10) << " added " << *dn << dendl;
        CInode *in = cache->add_replica_inode(q, dn, finished);
        dout(10) << " added " << *in << dendl;
        if (q.end())
          break;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      }
    }

    // make bound sticky
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
         p != import_bound_fragset.end();
         ++p) {
      CInode *in = cache->get_inode(p->first);
      assert(in);
      in->get_stickydirs();
      dout(7) << " set stickydirs on bound inode " << *in << dendl;
    }

  } else {
    dout(7) << " not doing assim on " << *dir << dendl;
  }

  if (!finished.empty())
    mds->queue_waiters(finished);


  // open all bounds
  set<CDir*> import_bounds;
  for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
       p != import_bound_fragset.end();
       ++p) {
    CInode *in = cache->get_inode(p->first);
    assert(in);

    // map fragset into a frag_t list, based on the inode fragtree
    list<frag_t> fglist;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      in->dirfragtree.get_leaves_under(*q, fglist);
    dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;

    for (list<frag_t>::iterator q = fglist.begin();
         q != fglist.end();
         ++q) {
      CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
      if (!bound) {
        // not in cache yet; open it remotely and retry this message
        dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
        cache->open_remote_dirfrag(in, *q,
                                   new C_MDS_RetryMessage(mds, m));
        return;
      }

      if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
        dout(7) << " pinning import bound " << *bound << dendl;
        bound->get(CDir::PIN_IMPORTBOUND);
        bound->state_set(CDir::STATE_IMPORTBOUND);
      } else {
        dout(7) << " already pinned import bound " << *bound << dendl;
      }
      import_bounds.insert(bound);
    }
  }

  dout(7) << " all ready, noting auth and freezing import region" << dendl;

  bool success = true;
  if (!mds->mdcache->is_readonly() &&
      dir->get_inode()->filelock.can_wrlock(-1) &&
      dir->get_inode()->nestlock.can_wrlock(-1)) {
    it->second.mut = new MutationImpl();
    // force some locks.  hacky.
    mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
    mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);

    // note that i am an ambiguous auth for this subtree.
    // specify bounds, since the exporter explicitly defines the region.
    cache->adjust_bounded_subtree_auth(dir, import_bounds,
                                       pair<int,int>(oldauth, mds->get_nodeid()));
    cache->verify_subtree_bounds(dir, import_bounds);
    // freeze.
    dir->_freeze_tree();
    // note new state
    it->second.state = IMPORT_PREPPED;
  } else {
    // can't wrlock filelock/nestlock right now; NAK the prep
    dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
    success = false;
    import_reverse_prepping(dir);
  }

  // ok!
  dout(7) << " sending export_prep_ack on " << *dir << dendl;
  mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());

  assert(g_conf->mds_kill_import_at != 4);
  // done
  m->put();
}
2351
2352
2353
2354
/*
 * Journal-completion context for EImportStart: once the event is
 * durable, continue the import via Migrator::import_logged_start().
 * imported_client_map and sseqmap are filled in by handle_export_dir()
 * before the entry is submitted.
 */
class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
  dirfrag_t df;
  CDir *dir;
  mds_rank_t from;       // the exporting rank
public:
  map<client_t,entity_inst_t> imported_client_map;   // sessions decoded from MExportDir
  map<client_t,uint64_t> sseqmap;                    // session seqs from prepare_force_open_sessions

  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
    MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
  }
  void finish(int r) override {
    mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
  }
};
2370
/* This function DOES put the passed message before returning*/
/*
 * Receive the subtree payload from the exporter: decode the replicated
 * dirs/dentries/inodes into our cache, start journaling an EImportStart
 * (including the imported client sessions and the bounds), and move the
 * import to IMPORT_LOGGINGSTART.  import_logged_start() continues once
 * the journal entry commits (via C_MDS_ImportDirLoggedStart).
 */
void Migrator::handle_export_dir(MExportDir *m)
{
  assert (g_conf->mds_kill_import_at != 5);
  CDir *dir = cache->get_dirfrag(m->dirfrag);
  assert(dir);

  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;

  assert(!dir->is_auth());

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_PREPPED);
  assert(it->second.tid == m->get_tid());
  assert(it->second.peer == oldauth);

  utime_t now = ceph_clock_now();

  // make sure the imported frag is a leaf in our copy of the fragtree
  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());

  cache->show_subtrees();

  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);

  // start the journal entry
  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
  mds->mdlog->start_entry(le);

  le->metablob.add_dir_context(dir);

  // adjust auth (list us _first_)
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);

  // new client sessions, open these after we journal
  // include imported sessions in EImportStart
  bufferlist::iterator cmp = m->client_map.begin();
  ::decode(onlogged->imported_client_map, cmp);
  assert(cmp.end());
  le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
  le->client_map.claim(m->client_map);

  // decode dirfrag payloads until the export buffer is exhausted
  bufferlist::iterator blp = m->export_data.begin();
  int num_imported_inodes = 0;
  while (!blp.end()) {
    num_imported_inodes +=
      decode_import_dir(blp,
                        oldauth,
                        dir,                 // import root
                        le,
                        mds->mdlog->get_current_segment(),
                        it->second.peer_exports,
                        it->second.updated_scatterlocks,
                        now);
  }
  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;

  // include bounds in EImportStart
  set<CDir*> import_bounds;
  for (vector<dirfrag_t>::iterator p = m->bounds.begin();
       p != m->bounds.end();
       ++p) {
    CDir *bd = cache->get_dirfrag(*p);
    assert(bd);
    le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
    import_bounds.insert(bd);
  }
  cache->verify_subtree_bounds(dir, import_bounds);

  // adjust popularity
  mds->balancer->add_import(dir, now);

  dout(7) << "handle_export_dir did " << *dir << dendl;

  // note state
  it->second.state = IMPORT_LOGGINGSTART;
  assert (g_conf->mds_kill_import_at != 6);

  // log it
  mds->mdlog->submit_entry(le, onlogged);
  mds->mdlog->flush();

  // some stats
  if (mds->logger) {
    mds->logger->inc(l_mds_imported);
    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
  }

  m->put();
}
2463
2464
2465 /*
2466 * this is an import helper
2467 * called by import_finish, and import_reverse and friends.
2468 */
2469 void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2470 {
2471 import_state_t& stat = import_state[dir->dirfrag()];
2472 // root
2473 dir->put(CDir::PIN_IMPORTING);
2474 dir->state_clear(CDir::STATE_IMPORTING);
2475
2476 // bounding inodes
2477 set<inodeno_t> did;
2478 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2479 p != stat.bound_ls.end();
2480 ++p) {
2481 if (did.count(p->ino))
2482 continue;
2483 did.insert(p->ino);
2484 CInode *in = cache->get_inode(p->ino);
2485 assert(in);
2486 in->put_stickydirs();
2487 }
2488
2489 if (stat.state == IMPORT_PREPPING) {
2490 for (auto bd : bounds) {
2491 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2492 bd->put(CDir::PIN_IMPORTBOUND);
2493 bd->state_clear(CDir::STATE_IMPORTBOUND);
2494 }
2495 }
2496 } else if (stat.state >= IMPORT_PREPPED) {
2497 // bounding dirfrags
2498 for (auto bd : bounds) {
2499 assert(bd->state_test(CDir::STATE_IMPORTBOUND));
2500 bd->put(CDir::PIN_IMPORTBOUND);
2501 bd->state_clear(CDir::STATE_IMPORTBOUND);
2502 }
2503 }
2504 }
2505
2506
2507 /*
 * note: this does the full work of reversing an import and cleaning up
2509 * state.
2510 * called by both handle_mds_failure and by handle_resolve (if we are
2511 * a survivor coping with an exporter failure+recovery).
2512 */
2513 void Migrator::import_reverse(CDir *dir)
2514 {
2515 dout(7) << "import_reverse " << *dir << dendl;
2516
2517 import_state_t& stat = import_state[dir->dirfrag()];
2518 stat.state = IMPORT_ABORTING;
2519
2520 set<CDir*> bounds;
2521 cache->get_subtree_bounds(dir, bounds);
2522
2523 // remove pins
2524 import_remove_pins(dir, bounds);
2525
2526 // update auth, with possible subtree merge.
2527 assert(dir->is_subtree_root());
2528 if (mds->is_resolve())
2529 cache->trim_non_auth_subtree(dir);
2530
2531 cache->adjust_subtree_auth(dir, stat.peer);
2532
2533 C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
2534 if (!dir->get_inode()->is_auth() &&
2535 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2536 dir->get_inode()->clear_scatter_dirty();
2537 // wake up scatter_nudge waiters
2538 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2539 }
2540
2541 int num_dentries = 0;
2542 // adjust auth bits.
2543 list<CDir*> q;
2544 q.push_back(dir);
2545 while (!q.empty()) {
2546 CDir *cur = q.front();
2547 q.pop_front();
2548
2549 // dir
2550 assert(cur->is_auth());
2551 cur->state_clear(CDir::STATE_AUTH);
2552 cur->remove_bloom();
2553 cur->clear_replica_map();
2554 cur->set_replica_nonce(CDir::EXPORT_NONCE);
2555 if (cur->is_dirty())
2556 cur->mark_clean();
2557
2558 CDir::map_t::iterator it;
2559 for (it = cur->begin(); it != cur->end(); ++it) {
2560 CDentry *dn = it->second;
2561
2562 // dentry
2563 dn->state_clear(CDentry::STATE_AUTH);
2564 dn->clear_replica_map();
2565 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2566 if (dn->is_dirty())
2567 dn->mark_clean();
2568
2569 // inode?
2570 if (dn->get_linkage()->is_primary()) {
2571 CInode *in = dn->get_linkage()->get_inode();
2572 in->state_clear(CDentry::STATE_AUTH);
2573 in->clear_replica_map();
2574 in->set_replica_nonce(CInode::EXPORT_NONCE);
2575 if (in->is_dirty())
2576 in->mark_clean();
2577 in->clear_dirty_rstat();
2578 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2579 in->clear_scatter_dirty();
2580 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2581 }
2582
2583 in->clear_dirty_parent();
2584
2585 in->authlock.clear_gather();
2586 in->linklock.clear_gather();
2587 in->dirfragtreelock.clear_gather();
2588 in->filelock.clear_gather();
2589
2590 in->clear_file_locks();
2591
2592 // non-bounding dir?
2593 list<CDir*> dfs;
2594 in->get_dirfrags(dfs);
2595 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2596 if (bounds.count(*p) == 0)
2597 q.push_back(*p);
2598 }
2599
2600 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2601 ++num_dentries;
2602 }
2603 }
2604
2605 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
2606
2607 if (stat.state == IMPORT_ACKING) {
2608 // remove imported caps
2609 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
2610 p != stat.peer_exports.end();
2611 ++p) {
2612 CInode *in = p->first;
2613 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2614 q != p->second.end();
2615 ++q) {
2616 Capability *cap = in->get_client_cap(q->first);
2617 assert(cap);
2618 if (cap->is_importing())
2619 in->remove_client_cap(q->first);
2620 }
2621 in->put(CInode::PIN_IMPORTINGCAPS);
2622 }
2623 for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
2624 p != stat.client_map.end();
2625 ++p) {
2626 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
2627 assert(session);
2628 session->dec_importing();
2629 }
2630 }
2631
2632 // log our failure
2633 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2634
2635 cache->trim(-1, num_dentries); // try trimming dentries
2636
2637 // notify bystanders; wait in aborting state
2638 import_notify_abort(dir, bounds);
2639 }
2640
2641 void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2642 {
2643 dout(7) << "import_notify_finish " << *dir << dendl;
2644
2645 import_state_t& stat = import_state[dir->dirfrag()];
2646 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2647 p != stat.bystanders.end();
2648 ++p) {
2649 MExportDirNotify *notify =
2650 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
2651 pair<int,int>(stat.peer, mds->get_nodeid()),
2652 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
2653 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2654 notify->get_bounds().push_back((*i)->dirfrag());
2655 mds->send_message_mds(notify, *p);
2656 }
2657 }
2658
/*
 * Tell bystander MDSs that the import is being aborted and auth is
 * reverting to the exporter.  Bystanders that are down in a degraded
 * cluster are dropped from the set; if none remain, finish the
 * reversal immediately instead of waiting in IMPORT_ABORTING.
 * (The 'true' flag on MExportDirNotify presumably requests an ack,
 * matching the wait-for-acks behavior here -- confirm against
 * MExportDirNotify.)
 */
void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_abort " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end(); ) {
    if (mds->is_cluster_degraded() &&
        !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
      // this can happen if both exporter and bystander fail in the same mdsmap epoch
      stat.bystanders.erase(p++);   // post-increment: erase without invalidating p
      continue;
    }
    MExportDirNotify *notify =
      new MExportDirNotify(dir->dirfrag(), stat.tid, true,
                           mds_authority_t(stat.peer, mds->get_nodeid()),
                           mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
    ++p;
  }
  if (stat.bystanders.empty()) {
    dout(7) << "no bystanders, finishing reverse now" << dendl;
    import_reverse_unfreeze(dir);
  } else {
    assert (g_conf->mds_kill_import_at != 10);
  }
}
2688
/*
 * Unfreeze step of import reversal: discard delayed expires, thaw the
 * tree (allowing a subtree merge now that auth has reverted to the
 * exporter), then finish tearing down the import record.
 */
void Migrator::import_reverse_unfreeze(CDir *dir)
{
  dout(7) << "import_reverse_unfreeze " << *dir << dendl;
  assert(!dir->is_auth());
  cache->discard_delayed_expire(dir);
  dir->unfreeze_tree();
  if (dir->is_subtree_root())
    cache->try_subtree_merge(dir);
  import_reverse_final(dir);
}
2699
/*
 * Last step of every import-reversal path: erase the import record,
 * kick any pending resolves, and drop the locks held by the import's
 * mutation (taken in handle_export_prep).
 */
void Migrator::import_reverse_final(CDir *dir)
{
  dout(7) << "import_reverse_final " << *dir << dendl;

  // clean up
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());

  // keep the mutation alive past erase() so its locks can be dropped below
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
}
2722
2723
2724
2725
/*
 * EImportStart has committed to the journal.  Move to IMPORT_ACKING:
 * finish force-opening the imported client sessions, instantiate the
 * imported caps (deferring client notification until import_finish),
 * and send MExportDirAck (carrying the cap import map) back to the
 * exporter.  If the import was reversed while we journaled, just
 * finish the forced-open sessions and bail.
 */
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
                                   map<client_t,entity_inst_t>& imported_client_map,
                                   map<client_t,uint64_t>& sseqmap)
{
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  if (it == import_state.end() ||
      it->second.state != IMPORT_LOGGINGSTART) {
    // import was aborted while the journal entry was in flight
    dout(7) << "import " << df << " must have aborted" << dendl;
    mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
    return;
  }

  dout(7) << "import_logged " << *dir << dendl;

  // note state
  it->second.state = IMPORT_ACKING;

  assert (g_conf->mds_kill_import_at != 7);

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
  it->second.client_map.swap(imported_client_map);

  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
       p != it->second.peer_exports.end();
       ++p) {
    // parameter 'peer' is NONE, delay sending cap import messages to client
    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
  }

  // send notify's etc.
  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;

  // test surviving observer of a failed migration that did not complete
  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);

  MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
  ::encode(imported_caps, ack->imported_caps);

  mds->send_message_mds(ack, from);
  assert (g_conf->mds_kill_import_at != 8);

  cache->show_subtrees();
}
2771
/* This function DOES put the passed message before returning*/
/*
 * The exporter says the migration is committed on its side (possibly
 * in several messages; is_last() marks the final one).  Validate the
 * tid against our import record and complete the import.
 */
void Migrator::handle_export_finish(MExportDirFinish *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  assert(dir);
  dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  assert(it != import_state.end());
  assert(it->second.tid == m->get_tid());

  // notify=false: import_finish will not send MExportDirNotify to bystanders here
  import_finish(dir, false, m->is_last());

  m->put();
}
2787
/*
 * Complete an import.  Reached from IMPORT_ACKING (first finish) or
 * IMPORT_FINISHING (subsequent finishes).  On the ACKING pass we take
 * sole auth for the subtree and finalize imported caps; only when
 * 'last' is set do we drop the remaining pins, journal the successful
 * EImportFinish, unfreeze, and clear the import record.
 *
 * @param notify  if true, send MExportDirNotify to bystanders when done
 * @param last    true when no more MExportDirFinish messages are expected
 */
void Migrator::import_finish(CDir *dir, bool notify, bool last)
{
  dout(7) << "import_finish on " << *dir << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);

  if (it->second.state == IMPORT_ACKING) {
    // exporter has acked: the subtree is now unambiguously ours
    assert(dir->is_auth());
    cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
  }

  // log finish
  assert(g_conf->mds_kill_import_at != 9);

  if (it->second.state == IMPORT_ACKING) {
    // merge imported cap state into the live Capabilities and tell clients
    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
         p != it->second.peer_exports.end();
         ++p) {
      CInode *in = p->first;
      assert(in->is_auth());
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
           q != p->second.end();
           ++q) {
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
        assert(session);
        Capability *cap = in->get_client_cap(q->first);
        assert(cap);
        cap->merge(q->second, true);
        cap->clear_importing();
        mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
                                    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
      }
      p->second.clear();
      in->replica_caps_wanted = 0;
    }
    for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
         p != it->second.client_map.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
      assert(session);
      session->dec_importing();
    }
  }

  if (!last) {
    // more MExportDirFinish messages are coming; park in FINISHING
    assert(it->second.state == IMPORT_ACKING);
    it->second.state = IMPORT_FINISHING;
    return;
  }

  // remove pins
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  if (notify)
    import_notify_finish(dir, bounds);

  import_remove_pins(dir, bounds);

  // keep the cap map alive past the state erase; re-evaluated below
  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  it->second.peer_exports.swap(peer_exports);

  // clear import state (we're done!)
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));

  // process delayed expires
  cache->process_delayed_expire(dir);

  // unfreeze tree, with possible subtree merge.
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // re-eval imported caps
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
       p != peer_exports.end();
       ++p) {
    if (p->first->is_auth())
      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
    p->first->put(CInode::PIN_IMPORTINGCAPS);
  }

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // did i just import mydir?
  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
    cache->populate_mydir();

  // is it empty?
  if (dir->get_num_head_items() == 0 &&
      !dir->inode->is_auth()) {
    // reexport!
    export_empty_import(dir);
  }
}
2896
2897
/*
 * Decode one imported inode from the export stream and link it under
 * dn.  A previously unknown inode is created and added to the cache;
 * its caps are decoded into peer_exports for later processing; dirty
 * scatterlocks are queued so they get journaled when the import
 * actually finishes; the replica list is adjusted for the new auth.
 */
void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
                                   mds_rank_t oldauth, LogSegment *ls,
                                   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
                                   list<ScatterLock*>& updated_scatterlocks)
{
  dout(15) << "decode_import_inode on " << *dn << dendl;

  inodeno_t ino;
  snapid_t last;
  ::decode(ino, blp);
  ::decode(last, blp);

  bool added = false;
  CInode *in = cache->get_inode(ino, last);
  if (!in) {
    in = new CInode(mds->mdcache, true, 1, last);
    added = true;
  }

  // state after link  -- or not!  -sage
  in->decode_import(blp, ls);  // cap imports are noted for later action

  // caps
  decode_import_inode_caps(in, true, blp, peer_exports);

  // link before state  -- or not!  -sage
  if (dn->get_linkage()->get_inode() != in) {
    assert(!dn->get_linkage()->get_inode());
    dn->dir->link_primary_inode(dn, in);
  }

  // add inode?
  if (added) {
    cache->add_inode(in);
    dout(10) << "added " << *in << dendl;
  } else {
    dout(10) << " had " << *in << dendl;
  }

  if (in->inode.is_dirty_rstat())
    in->mark_dirty_rstat();

  // clear if dirtyscattered, since we're going to journal this
  // but not until we _actually_ finish the import...
  if (in->filelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->filelock);
    mds->locker->mark_updated_scatterlock(&in->filelock);
  }

  if (in->dirfragtreelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->dirfragtreelock);
    mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
  }

  // adjust replica list
  //assert(!in->is_replica(oldauth));  // not true on failed export
  in->add_replica(oldauth, CInode::EXPORT_NONCE);
  if (in->is_replica(mds->get_nodeid()))
    in->remove_replica(mds->get_nodeid());
}
2958
/*
 * Decode the per-client cap exports for one inode.  For the auth cap
 * bundle, also decode mds_caps_wanted.  Non-empty results are stashed
 * in peer_exports for later processing and the inode is pinned with
 * PIN_IMPORTINGCAPS until the caps are finalized (or the import is
 * reversed).
 */
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
                                        bufferlist::iterator &blp,
                                        map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  map<client_t,Capability::Export> cap_map;
  ::decode(cap_map, blp);
  if (auth_cap)
    ::decode(in->get_mds_caps_wanted(), blp);
  if (!cap_map.empty() ||
      (auth_cap && !in->get_mds_caps_wanted().empty())) {
    peer_exports[in].swap(cap_map);
    in->get(CInode::PIN_IMPORTINGCAPS);
  }
}
2973
/*
 * Turn previously decoded cap exports into live client Capabilities
 * on this inode, and record per-client import info for the ack.
 *
 * @param peer        exporting rank; MDS_RANK_NONE (<0) marks caps as
 *                    importing and defers the client-facing import
 *                    message (the caller sends it later)
 * @param auth_cap    whether these are the inode's auth caps
 * @param export_map  decoded per-client cap state from the exporter
 * @param import_map  filled with per-client import info (cap_id, mseq,
 *                    issue_seq) to return to the exporter
 */
void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
                                        map<client_t,Capability::Export> &export_map,
                                        map<client_t,Capability::Import> &import_map)
{
  for (map<client_t,Capability::Export>::iterator it = export_map.begin();
       it != export_map.end();
       ++it) {
    dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
    assert(session);

    Capability *cap = in->get_client_cap(it->first);
    if (!cap) {
      cap = in->add_client_cap(it->first, session);
      if (peer < 0)
        cap->mark_importing();   // cleared later (or on import_reverse)
    }

    Capability::Import& im = import_map[it->first];
    im.cap_id = cap->get_cap_id();
    im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
    im.issue_seq = cap->get_last_seq() + 1;

    if (peer >= 0) {
      // immediate path: merge state and notify the client now
      cap->merge(it->second, auth_cap);
      mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
                                  it->second.seq, it->second.mseq - 1, peer,
                                  auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
    }
  }

  if (peer >= 0) {
    in->replica_caps_wanted = 0;
    in->put(CInode::PIN_IMPORTINGCAPS);
  }
}
3010
/*
 * Decode one exported dirfrag from the import stream and splice it into
 * the local cache: the CDir itself, then every dentry it contains along
 * with each dentry's linkage (null / remote link / full inode).
 *
 * blp                  - cursor into the exported bufferlist (decode order
 *                        must mirror the exporter's encode order exactly)
 * oldauth              - rank we are importing from; added as a replica
 * import_root          - root dir of the whole import; collects waiters
 * le                   - EImportStart journal entry to populate (may be
 *                        NULL, e.g. on a replayed/partial pass)
 * ls                   - log segment for dirty-list bookkeeping
 * peer_exports         - out: caps decoded for imported inodes
 * updated_scatterlocks - out: scatterlocks found dirty during decode
 * now                  - timestamp applied to imported dir state
 *
 * Returns the number of dentries decoded from this dirfrag.
 */
int Migrator::decode_import_dir(bufferlist::iterator& blp,
				mds_rank_t oldauth,
				CDir *import_root,
				EImportStart *le,
				LogSegment *ls,
				map<CInode*,map<client_t,Capability::Export> >& peer_exports,
				list<ScatterLock*>& updated_scatterlocks, utime_t now)
{
  // set up dir: the parent inode must already be in cache (imported or
  // replicated earlier in the stream); the dirfrag itself may be new
  dirfrag_t df;
  ::decode(df, blp);

  CInode *diri = cache->get_inode(df.ino);
  assert(diri);
  CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
  assert(dir);

  dout(7) << "decode_import_dir " << *dir << dendl;

  // assimilate state
  dir->decode_import(blp, now, ls);

  // adjust replica list: the old auth now holds a replica of our copy
  //assert(!dir->is_replica(oldauth)); // not true on failed export
  dir->add_replica(oldauth, CDir::EXPORT_NONCE);
  if (dir->is_replica(mds->get_nodeid()))
    dir->remove_replica(mds->get_nodeid());  // we are auth now, not a replica

  // add to journal entry
  if (le)
    le->metablob.add_import_dir(dir);

  int num_imported = 0;

  // take all waiters on this dir
  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
  // a replica's presence in my cache implies/forces its presence in authority's.
  list<MDSInternalContextBase*> waiters;

  dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
  for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
       it != waiters.end();
       ++it)
    import_root->add_waiter(CDir::WAIT_UNFREEZE, *it);  // UNFREEZE will get kicked both on success or failure

  dout(15) << "doing contents" << dendl;

  // contents: dentry count, then one (name, snapid, dentry, linkage) record each
  __u32 nden;
  ::decode(nden, blp);

  for (; nden>0; nden--) {
    num_imported++;

    // dentry
    string dname;
    snapid_t last;
    ::decode(dname, blp);
    ::decode(last, blp);

    CDentry *dn = dir->lookup_exact_snap(dname, last);
    if (!dn)
      dn = dir->add_null_dentry(dname, 1, last);

    dn->decode_import(blp, ls);

    dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
    if (dn->is_replica(mds->get_nodeid()))
      dn->remove_replica(mds->get_nodeid());

    // dentry lock in unreadable state can block path traverse
    if (dn->lock.get_state() != LOCK_SYNC)
      mds->locker->try_eval(&dn->lock, NULL);

    dout(15) << "decode_import_dir got " << *dn << dendl;

    // points to... linkage tag: 'N' null, 'L' remote link, 'I' full inode
    char icode;
    ::decode(icode, blp);

    if (icode == 'N') {
      // null dentry
      assert(dn->get_linkage()->is_null());

      // fall thru
    }
    else if (icode == 'L') {
      // remote link: just (ino, d_type), the inode lives elsewhere
      inodeno_t ino;
      unsigned char d_type;
      ::decode(ino, blp);
      ::decode(d_type, blp);
      if (dn->get_linkage()->is_remote()) {
	assert(dn->get_linkage()->get_remote_ino() == ino);
      } else {
	dir->link_remote_inode(dn, ino, d_type);
      }
    }
    else if (icode == 'I') {
      // inode: full state follows; only valid when journaling (le set)
      assert(le);
      decode_import_inode(dn, blp, oldauth, ls,
			  peer_exports, updated_scatterlocks);
    }

    // add dentry to journal entry
    if (le)
      le->metablob.add_import_dentry(dn);
  }

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  dir->inode->maybe_export_pin();

  dout(7) << "decode_import_dir done " << *dir << dendl;
  return num_imported;
}
3131
3132
3133
3134
3135
3136 // authority bystander
3137
3138 /* This function DOES put the passed message before returning*/
3139 void Migrator::handle_export_notify(MExportDirNotify *m)
3140 {
3141 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
3142 m->put();
3143 return;
3144 }
3145
3146 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3147
3148 mds_rank_t from = mds_rank_t(m->get_source().num());
3149 mds_authority_t old_auth = m->get_old_auth();
3150 mds_authority_t new_auth = m->get_new_auth();
3151
3152 if (!dir) {
3153 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3154 << " on missing dir " << m->get_dirfrag() << dendl;
3155 } else if (dir->authority() != old_auth) {
3156 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3157 << " != " << old_auth << " -> " << new_auth
3158 << " on " << *dir << dendl;
3159 } else {
3160 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3161 << " on " << *dir << dendl;
3162 // adjust auth
3163 set<CDir*> have;
3164 cache->map_dirfrag_set(m->get_bounds(), have);
3165 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3166
3167 // induce a merge?
3168 cache->try_subtree_merge(dir);
3169 }
3170
3171 // send ack
3172 if (m->wants_ack()) {
3173 mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
3174 } else {
3175 // aborted. no ack.
3176 dout(7) << "handle_export_notify no ack requested" << dendl;
3177 }
3178
3179 m->put();
3180 }
3181
3182 /** cap exports **/
3183 void Migrator::export_caps(CInode *in)
3184 {
3185 mds_rank_t dest = in->authority().first;
3186 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3187
3188 assert(in->is_any_caps());
3189 assert(!in->is_auth());
3190 assert(!in->is_ambiguous_auth());
3191 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3192
3193 MExportCaps *ex = new MExportCaps;
3194 ex->ino = in->ino();
3195
3196 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3197
3198 mds->send_message_mds(ex, dest);
3199 }
3200
3201 void Migrator::handle_gather_caps(MGatherCaps *m)
3202 {
3203 CInode *in = cache->get_inode(m->ino);
3204
3205 if (!in)
3206 goto out;
3207
3208 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
3209 << " on " << *in
3210 << dendl;
3211 if (in->is_any_caps() &&
3212 !in->is_auth() &&
3213 !in->is_ambiguous_auth() &&
3214 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3215 export_caps(in);
3216
3217 out:
3218 m->put();
3219 }
3220
/**
 * Journal-completion context for an incoming cap export
 * (handle_export_caps): once the ESessions entry commits, hand the
 * decoded caps and client map back to Migrator::logged_import_caps().
 */
class C_M_LoggedImportCaps : public MigratorLogContext {
  CInode *in;       // inode receiving the caps
  mds_rank_t from;  // rank that exported the caps to us
public:
  // filled in by handle_export_caps() before the log entry is submitted
  map<CInode*, map<client_t,Capability::Export> > peer_exports;  // decoded cap exports, keyed by inode
  map<client_t,entity_inst_t> client_map;                        // clients that need sessions opened
  map<client_t,uint64_t> sseqmap;                                // session seqs from prepare_force_open_sessions()

  C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
  void finish(int r) override {
    mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
  }
};
3234
3235 /* This function DOES put the passed message before returning*/
3236 void Migrator::handle_export_caps(MExportCaps *ex)
3237 {
3238 dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
3239 CInode *in = cache->get_inode(ex->ino);
3240
3241 assert(in);
3242 assert(in->is_auth());
3243
3244 // FIXME
3245 if (in->is_frozen())
3246 return;
3247
3248 C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
3249 this, in, mds_rank_t(ex->get_source().num()));
3250 finish->client_map = ex->client_map;
3251
3252 // decode new caps
3253 bufferlist::iterator blp = ex->cap_bl.begin();
3254 decode_import_inode_caps(in, false, blp, finish->peer_exports);
3255 assert(!finish->peer_exports.empty()); // thus, inode is pinned.
3256
3257 // journal open client sessions
3258 version_t pv = mds->server->prepare_force_open_sessions(finish->client_map, finish->sseqmap);
3259
3260 ESessions *le = new ESessions(pv, ex->client_map);
3261 mds->mdlog->start_submit_entry(le, finish);
3262 mds->mdlog->flush();
3263
3264 ex->put();
3265 }
3266
3267
/*
 * Completion of handle_export_caps(), invoked by C_M_LoggedImportCaps
 * after the ESessions entry commits: finalize the client sessions, then
 * materialize the imported caps and re-evaluate the inode's cap locks.
 * Statement order matters: sessions must exist before caps reference
 * them, and the lock eval runs last so it sees the merged cap state.
 *
 * in           - auth inode the caps were exported to us for
 * from         - rank that exported the caps
 * peer_exports - decoded cap exports (must contain an entry for `in`)
 * client_map / sseqmap - session state prepared in handle_export_caps()
 */
void Migrator::logged_import_caps(CInode *in,
				  mds_rank_t from,
				  map<CInode*, map<client_t,Capability::Export> >& peer_exports,
				  map<client_t,entity_inst_t>& client_map,
				  map<client_t,uint64_t>& sseqmap)
{
  dout(10) << "logged_import_caps on " << *in << dendl;
  // see export_go() vs export_go_synced()
  assert(in->is_auth());

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(client_map, sseqmap);

  map<client_t,Capability::Import> imported_caps;

  assert(peer_exports.count(in));
  // clients will release caps from the exporter when they receive the cap import message.
  finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
  // re-evaluate cap-related locks now that the caps have landed
  mds->locker->eval(in, CEPH_CAP_LOCKS, true);
}