// ceph/src/mds/Migrator.cc  (git blame view, sources v12.2.5)
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MDSRank.h"
16#include "MDCache.h"
17#include "CInode.h"
18#include "CDir.h"
19#include "CDentry.h"
20#include "Migrator.h"
21#include "Locker.h"
22#include "Server.h"
23
24#include "MDBalancer.h"
25#include "MDLog.h"
26#include "MDSMap.h"
27#include "Mutation.h"
28
29#include "include/filepath.h"
30
31#include "events/EExport.h"
32#include "events/EImportStart.h"
33#include "events/EImportFinish.h"
34#include "events/ESessions.h"
35
36#include "msg/Messenger.h"
37
38#include "messages/MClientCaps.h"
39
40#include "messages/MExportDirDiscover.h"
41#include "messages/MExportDirDiscoverAck.h"
42#include "messages/MExportDirCancel.h"
43#include "messages/MExportDirPrep.h"
44#include "messages/MExportDirPrepAck.h"
45#include "messages/MExportDir.h"
46#include "messages/MExportDirAck.h"
47#include "messages/MExportDirNotify.h"
48#include "messages/MExportDirNotifyAck.h"
49#include "messages/MExportDirFinish.h"
50
51#include "messages/MExportCaps.h"
52#include "messages/MExportCapsAck.h"
53#include "messages/MGatherCaps.h"
54
55
56/*
57 * this is what the dir->dir_auth values look like
58 *
59 *              dir_auth      authbits
60 * export
61 *   me             me         - before
62 *   me, me         me         - still me, but preparing for export
63 *   me, them       me         - send MExportDir (peer is preparing)
64 *   them, me       me         - journaled EExport
65 *   them           them       - done
66 *
67 * import:
68 *   them           them       - before
69 *   me, them       me         - journaled EImportStart
70 *   me             me         - done
71 *
72 * which implies:
73 * - auth bit is set if i am listed as first _or_ second dir_auth.
74 */
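/*
 * Illustrative sketch (not part of the original file): mds_authority_t is a
 * pair<mds_rank_t,mds_rank_t>, whose second slot is CDIR_AUTH_UNKNOWN when no
 * migration is in flight.  The "auth bit" rule above boils down to:
 *
 *   bool dir_auth_lists_me(const mds_authority_t &a, mds_rank_t whoami) {
 *     return a.first == whoami || a.second == whoami;
 *   }
 *
 * so during the ambiguous phases ("me,them" / "them,me") both the exporter
 * and the importer still consider themselves auth, and the bit only clears
 * once dir_auth collapses back to a single rank.
 */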
75
76#include "common/config.h"
77
78
79#define dout_context g_ceph_context
80#define dout_subsys ceph_subsys_mds
81#undef dout_prefix
82#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
83
84
85class MigratorContext : public MDSInternalContextBase {
86protected:
87 Migrator *mig;
88 MDSRank *get_mds() override {
89 return mig->mds;
90 }
91public:
92 explicit MigratorContext(Migrator *mig_) : mig(mig_) {
93 assert(mig != NULL);
94 }
95};
96
97class MigratorLogContext : public MDSLogContextBase {
98protected:
99 Migrator *mig;
100 MDSRank *get_mds() override {
101 return mig->mds;
102 }
103public:
104 explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
105 assert(mig != NULL);
106 }
107};
108
109/* This function DOES put the passed message before returning*/
110void Migrator::dispatch(Message *m)
111{
112 switch (m->get_type()) {
113 // import
114 case MSG_MDS_EXPORTDIRDISCOVER:
115 handle_export_discover(static_cast<MExportDirDiscover*>(m));
116 break;
117 case MSG_MDS_EXPORTDIRPREP:
118 handle_export_prep(static_cast<MExportDirPrep*>(m));
119 break;
120 case MSG_MDS_EXPORTDIR:
121 handle_export_dir(static_cast<MExportDir*>(m));
122 break;
123 case MSG_MDS_EXPORTDIRFINISH:
124 handle_export_finish(static_cast<MExportDirFinish*>(m));
125 break;
126 case MSG_MDS_EXPORTDIRCANCEL:
127 handle_export_cancel(static_cast<MExportDirCancel*>(m));
128 break;
129
130 // export
131 case MSG_MDS_EXPORTDIRDISCOVERACK:
132 handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
133 break;
134 case MSG_MDS_EXPORTDIRPREPACK:
135 handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
136 break;
137 case MSG_MDS_EXPORTDIRACK:
138 handle_export_ack(static_cast<MExportDirAck*>(m));
139 break;
140 case MSG_MDS_EXPORTDIRNOTIFYACK:
141 handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
142 break;
143
144 // export 3rd party (dir_auth adjustments)
145 case MSG_MDS_EXPORTDIRNOTIFY:
146 handle_export_notify(static_cast<MExportDirNotify*>(m));
147 break;
148
149 // caps
150 case MSG_MDS_EXPORTCAPS:
151 handle_export_caps(static_cast<MExportCaps*>(m));
152 break;
153 case MSG_MDS_GATHERCAPS:
154 handle_gather_caps(static_cast<MGatherCaps*>(m));
155 break;
156
157 default:
158 derr << "migrator unknown message " << m->get_type() << dendl;
159 assert(0 == "migrator unknown message");
160 }
161}
162
163
164class C_MDC_EmptyImport : public MigratorContext {
165 CDir *dir;
166public:
167 C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
168 void finish(int r) override {
169 mig->export_empty_import(dir);
170 }
171};
172
173
174void Migrator::export_empty_import(CDir *dir)
175{
176 dout(7) << "export_empty_import " << *dir << dendl;
177 assert(dir->is_subtree_root());
178
179 if (dir->inode->is_auth()) {
180 dout(7) << " inode is auth" << dendl;
181 return;
182 }
183 if (!dir->is_auth()) {
184 dout(7) << " not auth" << dendl;
185 return;
186 }
187 if (dir->is_freezing() || dir->is_frozen()) {
188 dout(7) << " freezing or frozen" << dendl;
189 return;
190 }
191 if (dir->get_num_head_items() > 0) {
192 dout(7) << " not actually empty" << dendl;
193 return;
194 }
195 if (dir->inode->is_root()) {
196 dout(7) << " root" << dendl;
197 return;
198 }
199
200 mds_rank_t dest = dir->inode->authority().first;
201 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
202
203 dout(7) << " really empty, exporting to " << dest << dendl;
204 assert (dest != mds->get_nodeid());
205
206 dout(7) << "exporting to mds." << dest
207 << " empty import " << *dir << dendl;
208 export_dir( dir, dest );
209}
210
211void Migrator::find_stale_export_freeze()
212{
213 utime_t now = ceph_clock_now();
214 utime_t cutoff = now;
215 cutoff -= g_conf->mds_freeze_tree_timeout;
216
217
218 /*
219 * We could have situations like:
220 *
221 * - mds.0 authpins an item in subtree A
222 * - mds.0 sends request to mds.1 to authpin an item in subtree B
223 * - mds.0 freezes subtree A
224 * - mds.1 authpins an item in subtree B
225 * - mds.1 sends request to mds.0 to authpin an item in subtree A
226 * - mds.1 freezes subtree B
227 * - mds.1 receives the remote authpin request from mds.0
228 * (wait because subtree B is freezing)
229 * - mds.0 receives the remote authpin request from mds.1
230 * (wait because subtree A is freezing)
231 *
232 *
233 * - client request authpins items in subtree B
234 * - freeze subtree B
235 * - import subtree A which is parent of subtree B
236 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
237 * - freeze subtree A
238 * - client request tries authpinning items in subtree A
239 * (wait because subtree A is freezing)
240 */
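/*
 * Sketch of the staleness test applied by the loop below (illustrative, not
 * part of the original source): a freeze is considered stuck once the
 * subtree's cumulative auth_pin count has stopped changing for longer than
 * mds_freeze_tree_timeout, roughly
 *
 *   bool freeze_is_stale(const export_state_t &stat, CDir *dir, utime_t cutoff) {
 *     return (stat.state == EXPORT_DISCOVERING || stat.state == EXPORT_FREEZING) &&
 *            stat.last_cum_auth_pins == dir->get_cum_auth_pins() &&
 *            stat.last_cum_auth_pins_change < cutoff;
 *   }
 *
 * and even then the export is only cancelled if someone is actually blocked
 * on it: remote auth_pin waiters, or a freezing parent subtree.
 */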
241 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
242 p != export_state.end(); ) {
243 CDir* dir = p->first;
244 export_state_t& stat = p->second;
245 ++p;
246 if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
247 continue;
248 if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
249 stat.last_cum_auth_pins = dir->get_cum_auth_pins();
250 stat.last_cum_auth_pins_change = now;
251 continue;
252 }
253 if (stat.last_cum_auth_pins_change >= cutoff)
254 continue;
255 if (stat.num_remote_waiters > 0 ||
256 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
257 export_try_cancel(dir);
258 }
259 }
260}
261
262void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
263{
264 dout(10) << "export_try_cancel " << *dir << dendl;
265
266 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
267 assert(it != export_state.end());
268
269 int state = it->second.state;
270 switch (state) {
271 case EXPORT_LOCKING:
272 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
273 it->second.state = EXPORT_CANCELLED;
274 dir->auth_unpin(this);
275 break;
276 case EXPORT_DISCOVERING:
277 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
278 it->second.state = EXPORT_CANCELLED;
279 dir->unfreeze_tree(); // cancel the freeze
280 dir->auth_unpin(this);
281 if (notify_peer &&
282 (!mds->is_cluster_degraded() ||
283 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
284 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
285 break;
286
287 case EXPORT_FREEZING:
288 dout(10) << "export state=freezing : canceling freeze" << dendl;
289 it->second.state = EXPORT_CANCELLED;
290 dir->unfreeze_tree(); // cancel the freeze
291 if (dir->is_subtree_root())
292 cache->try_subtree_merge(dir);
293 if (notify_peer &&
294 (!mds->is_cluster_degraded() ||
295 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
296 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
297 break;
298
299 // NOTE: state order reversal, warning comes after prepping
300 case EXPORT_WARNING:
301 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
302 it->second.state = EXPORT_CANCELLING;
303 // fall-thru
304
305 case EXPORT_PREPPING:
306 if (state != EXPORT_WARNING) {
307 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
308 it->second.state = EXPORT_CANCELLED;
309 }
310
311 {
312 // unpin bounds
313 set<CDir*> bounds;
314 cache->get_subtree_bounds(dir, bounds);
315 for (set<CDir*>::iterator q = bounds.begin();
316 q != bounds.end();
317 ++q) {
318 CDir *bd = *q;
319 bd->put(CDir::PIN_EXPORTBOUND);
320 bd->state_clear(CDir::STATE_EXPORTBOUND);
321 }
322 if (state == EXPORT_WARNING) {
323 // notify bystanders
324 export_notify_abort(dir, it->second, bounds);
325 // process delayed expires
326 cache->process_delayed_expire(dir);
327 }
328 }
329 dir->unfreeze_tree();
330 cache->try_subtree_merge(dir);
331 for (auto bd : it->second.residual_dirs) {
332 bd->unfreeze_tree();
333 cache->try_subtree_merge(bd);
334 }
335 if (notify_peer &&
336 (!mds->is_cluster_degraded() ||
337 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
338 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
339 break;
340
341 case EXPORT_EXPORTING:
342 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
343 it->second.state = EXPORT_CANCELLING;
344 export_reverse(dir, it->second);
345 break;
346
347 case EXPORT_LOGGINGFINISH:
348 case EXPORT_NOTIFYING:
349 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
350 // leave export_state, don't clean up now.
351 break;
352 case EXPORT_CANCELLING:
353 break;
354
355 default:
356 ceph_abort();
357 }
358
359 // finish clean-up?
360 if (it->second.state == EXPORT_CANCELLING ||
361 it->second.state == EXPORT_CANCELLED) {
362 MutationRef mut;
363 mut.swap(it->second.mut);
364
365 if (it->second.state == EXPORT_CANCELLED) {
366 export_state.erase(it);
367 dir->state_clear(CDir::STATE_EXPORTING);
368 // send pending import_maps?
369 cache->maybe_send_pending_resolves();
370 }
371
372 // drop locks
373 if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
374 MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
375 assert(mdr);
376 if (mdr->more()->waiting_on_slave.empty())
377 mds->mdcache->request_finish(mdr);
378 } else if (mut) {
379 mds->locker->drop_locks(mut.get());
380 mut->cleanup();
381 }
382
383 cache->show_subtrees();
384
385 maybe_do_queued_export();
386 }
387}
388
389void Migrator::export_cancel_finish(CDir *dir)
390{
391 assert(dir->state_test(CDir::STATE_EXPORTING));
392 dir->state_clear(CDir::STATE_EXPORTING);
393
394 // pinned by Migrator::export_notify_abort()
395 dir->auth_unpin(this);
396 // send pending import_maps? (these need to go out when all exports have finished.)
397 cache->maybe_send_pending_resolves();
398}
399
400// ==========================================================
401// mds failure handling
402
403void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
404{
405 dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;
406
407 // check my exports
408
409 // first add an extra auth_pin on any freezes, so that canceling a
410 // nested freeze doesn't complete one further up the hierarchy and
411 // confuse the shit out of us. we'll remove it after canceling the
412 // freeze. this way no freeze completions run before we want them
413 // to.
414 list<CDir*> pinned_dirs;
415 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
416 p != export_state.end();
417 ++p) {
418 if (p->second.state == EXPORT_FREEZING) {
419 CDir *dir = p->first;
420 dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
421 dir->auth_pin(this);
422 pinned_dirs.push_back(dir);
423 }
424 }
425
426 map<CDir*,export_state_t>::iterator p = export_state.begin();
427 while (p != export_state.end()) {
428 map<CDir*,export_state_t>::iterator next = p;
429 ++next;
430 CDir *dir = p->first;
431
432 // abort exports:
433 // - that are going to the failed node
434 // - that aren't frozen yet (to avoid auth_pin deadlock)
435 // - that haven't prepped yet (they may need to discover bounds to do that)
436 if ((p->second.peer == who &&
437 p->second.state != EXPORT_CANCELLING) ||
438 p->second.state == EXPORT_LOCKING ||
439 p->second.state == EXPORT_DISCOVERING ||
440 p->second.state == EXPORT_FREEZING ||
441 p->second.state == EXPORT_PREPPING) {
442 // the guy i'm exporting to failed, or we're just freezing.
443 dout(10) << "cleaning up export state (" << p->second.state << ")"
444 << get_export_statename(p->second.state) << " of " << *dir << dendl;
445 export_try_cancel(dir);
446 } else if (p->second.peer != who) {
447 // bystander failed.
448 if (p->second.warning_ack_waiting.erase(who)) {
449 if (p->second.state == EXPORT_WARNING) {
450 p->second.notify_ack_waiting.erase(who); // they won't get a notify either.
451 // exporter waiting for warning acks, let's fake theirs.
452 dout(10) << "faking export_warning_ack from mds." << who
453 << " on " << *dir << " to mds." << p->second.peer
454 << dendl;
455 if (p->second.warning_ack_waiting.empty())
456 export_go(dir);
457 }
458 }
459 if (p->second.notify_ack_waiting.erase(who)) {
460 // exporter is waiting for notify acks, fake it
461 dout(10) << "faking export_notify_ack from mds." << who
462 << " on " << *dir << " to mds." << p->second.peer
463 << dendl;
464 if (p->second.state == EXPORT_NOTIFYING) {
465 if (p->second.notify_ack_waiting.empty())
466 export_finish(dir);
467 } else if (p->second.state == EXPORT_CANCELLING) {
468 if (p->second.notify_ack_waiting.empty()) {
469 export_state.erase(p);
470 export_cancel_finish(dir);
471 }
472 }
473 }
474 }
475
476 // next!
477 p = next;
478 }
479
480
481 // check my imports
482 map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
483 while (q != import_state.end()) {
484 map<dirfrag_t,import_state_t>::iterator next = q;
485 ++next;
486 dirfrag_t df = q->first;
487 CInode *diri = mds->mdcache->get_inode(df.ino);
488 CDir *dir = mds->mdcache->get_dirfrag(df);
489
490 if (q->second.peer == who) {
491 if (dir)
492 dout(10) << "cleaning up import state (" << q->second.state << ")"
493 << get_import_statename(q->second.state) << " of " << *dir << dendl;
494 else
495 dout(10) << "cleaning up import state (" << q->second.state << ")"
496 << get_import_statename(q->second.state) << " of " << df << dendl;
497
498 switch (q->second.state) {
499 case IMPORT_DISCOVERING:
500 dout(10) << "import state=discovering : clearing state" << dendl;
501 import_reverse_discovering(df);
502 break;
503
504 case IMPORT_DISCOVERED:
505 assert(diri);
506 dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
507 import_reverse_discovered(df, diri);
508 break;
509
510 case IMPORT_PREPPING:
511 assert(dir);
512 dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
513 import_reverse_prepping(dir, q->second);
514 break;
515
516 case IMPORT_PREPPED:
517 assert(dir);
518 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
519 {
520 set<CDir*> bounds;
521 cache->get_subtree_bounds(dir, bounds);
522 import_remove_pins(dir, bounds);
523
524 // adjust auth back to the exporter
525 cache->adjust_subtree_auth(dir, q->second.peer);
526
527 // notify bystanders ; wait in aborting state
528 q->second.state = IMPORT_ABORTING;
529 import_notify_abort(dir, bounds);
530 assert(g_conf->mds_kill_import_at != 10);
531 }
532 break;
533
534 case IMPORT_LOGGINGSTART:
535 assert(dir);
536 dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
537 import_reverse(dir);
538 break;
539
540 case IMPORT_ACKING:
541 assert(dir);
542 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
543 dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
544 {
545 set<CDir*> bounds;
546 cache->get_subtree_bounds(dir, bounds);
547 cache->add_ambiguous_import(dir, bounds);
548 }
549 break;
550
551 case IMPORT_FINISHING:
552 assert(dir);
553 dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
554 import_finish(dir, true);
555 break;
556
557 case IMPORT_ABORTING:
558 assert(dir);
559 dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
560 break;
561 }
562 } else {
563 auto bystanders_entry = q->second.bystanders.find(who);
564 if (bystanders_entry != q->second.bystanders.end()) {
565 q->second.bystanders.erase(bystanders_entry);
566 if (q->second.state == IMPORT_ABORTING) {
567 assert(dir);
568 dout(10) << "faking export_notify_ack from mds." << who
569 << " on aborting import " << *dir << " from mds." << q->second.peer
570 << dendl;
571 if (q->second.bystanders.empty())
572 import_reverse_unfreeze(dir);
573 }
574 }
575 }
576
577 // next!
578 q = next;
579 }
580
581 while (!pinned_dirs.empty()) {
582 CDir *dir = pinned_dirs.front();
583 dout(10) << "removing temp auth_pin on " << *dir << dendl;
584 dir->auth_unpin(this);
585 pinned_dirs.pop_front();
586 }
587}
588
589
590
591void Migrator::show_importing()
592{
593 dout(10) << "show_importing" << dendl;
594 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
595 p != import_state.end();
596 ++p) {
597 CDir *dir = mds->mdcache->get_dirfrag(p->first);
598 if (dir) {
599 dout(10) << " importing from " << p->second.peer
600 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
601 << " " << p->first << " " << *dir << dendl;
602 } else {
603 dout(10) << " importing from " << p->second.peer
604 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
605 << " " << p->first << dendl;
606 }
607 }
608}
609
610void Migrator::show_exporting()
611{
612 dout(10) << "show_exporting" << dendl;
613 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
614 p != export_state.end();
615 ++p)
616 dout(10) << " exporting to " << p->second.peer
617 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
618 << " " << p->first->dirfrag() << " " << *p->first << dendl;
619}
620
621
622
623void Migrator::audit()
624{
625 if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
626 return; // hrm.
627
628 // import_state
629 show_importing();
630 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
631 p != import_state.end();
632 ++p) {
633 if (p->second.state == IMPORT_DISCOVERING)
634 continue;
635 if (p->second.state == IMPORT_DISCOVERED) {
636 CInode *in = cache->get_inode(p->first.ino);
637 assert(in);
638 continue;
639 }
640 CDir *dir = cache->get_dirfrag(p->first);
641 assert(dir);
642 if (p->second.state == IMPORT_PREPPING)
643 continue;
644 if (p->second.state == IMPORT_ABORTING) {
645 assert(!dir->is_ambiguous_dir_auth());
646 assert(dir->get_dir_auth().first != mds->get_nodeid());
647 continue;
648 }
649 assert(dir->is_ambiguous_dir_auth());
650 assert(dir->authority().first == mds->get_nodeid() ||
651 dir->authority().second == mds->get_nodeid());
652 }
653
654 // export_state
655 show_exporting();
656 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
657 p != export_state.end();
658 ++p) {
659 CDir *dir = p->first;
660 if (p->second.state == EXPORT_LOCKING ||
661 p->second.state == EXPORT_DISCOVERING ||
662 p->second.state == EXPORT_FREEZING ||
663 p->second.state == EXPORT_CANCELLING)
664 continue;
665 assert(dir->is_ambiguous_dir_auth());
666 assert(dir->authority().first == mds->get_nodeid() ||
667 dir->authority().second == mds->get_nodeid());
668 }
669
670 // ambiguous+me subtrees should be importing|exporting
671
672 // write me
673}
674
675
676
677
678
679// ==========================================================
680// EXPORT
681
682void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
683{
684 // enqueue
685 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
686 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
687
688 maybe_do_queued_export();
689}
690
691void Migrator::maybe_do_queued_export()
692{
693 static bool running;
694 if (running)
695 return;
696 running = true;
697 while (!export_queue.empty() &&
698 export_state.size() <= 4) {
699 dirfrag_t df = export_queue.front().first;
700 mds_rank_t dest = export_queue.front().second;
701 export_queue.pop_front();
702
703 CDir *dir = mds->mdcache->get_dirfrag(df);
704 if (!dir) continue;
705 if (!dir->is_auth()) continue;
706
707 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
708
709 export_dir(dir, dest);
710 }
711 running = false;
712}
713
714
715
716
717class C_MDC_ExportFreeze : public MigratorContext {
718 CDir *ex; // dir i'm exporting
719 uint64_t tid;
720public:
721 C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
722 MigratorContext(m), ex(e), tid(t) {
723 assert(ex != NULL);
724 }
725 void finish(int r) override {
726 if (r >= 0)
727 mig->export_frozen(ex, tid);
728 }
729};
730
731
732void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
733{
734 // path
735 vector<CDentry*> trace;
736 cache->make_trace(trace, dir->inode);
737 for (vector<CDentry*>::iterator it = trace.begin();
738 it != trace.end();
739 ++it)
740 locks.insert(&(*it)->lock);
741
742 // prevent scatter gather race
743 locks.insert(&dir->get_inode()->dirfragtreelock);
744
745 // bound dftlocks:
746 // NOTE: We need to take an rdlock on bounding dirfrags during
747 // migration for a rather irritating reason: when we export the
748 // bound inode, we need to send scatterlock state for the dirfrags
749 // as well, so that the new auth also gets the correct info. If we
750 // race with a refragment, this info is useless, as we can't
751 // redivvy it up. And it's needed for the scatterlocks to work
752 // properly: when the auth is in a sync/lock state it keeps each
753 // dirfrag's portion in the local (auth OR replica) dirfrag.
754 set<CDir*> wouldbe_bounds;
755 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
756 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
757 locks.insert(&(*p)->get_inode()->dirfragtreelock);
758}
759
760
761 class C_M_ExportDirWait : public MigratorContext {
762 MDRequestRef mdr;
763 int count;
764public:
765 C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
766 : MigratorContext(m), mdr(mdr), count(count) {}
767 void finish(int r) override {
768 mig->dispatch_export_dir(mdr, count);
769 }
770};
771
772
773/** export_dir(dir, dest)
774 * public method to initiate an export.
775 * will fail if the directory is freezing, frozen, unpinnable, or root.
776 */
777void Migrator::export_dir(CDir *dir, mds_rank_t dest)
778{
779 dout(7) << "export_dir " << *dir << " to " << dest << dendl;
780 assert(dir->is_auth());
781 assert(dest != mds->get_nodeid());
782
783 if (!(mds->is_active() || mds->is_stopping())) {
784 dout(7) << "i'm not active, no exports for now" << dendl;
785 return;
786 }
787 if (mds->mdcache->is_readonly()) {
788 dout(7) << "read-only FS, no exports for now" << dendl;
789 return;
790 }
791 if (!mds->mdsmap->is_active(dest)) {
792 dout(7) << "dest not active, no exports for now" << dendl;
793 return;
794 }
795 if (mds->is_cluster_degraded()) {
796 dout(7) << "cluster degraded, no exports for now" << dendl;
797 return;
798 }
799 if (dir->inode->is_system()) {
800 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
801 //ceph_abort();
802 return;
803 }
804
805 CDir* parent_dir = dir->inode->get_projected_parent_dir();
806 if (parent_dir && parent_dir->inode->is_stray()) {
807 if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
808 dout(7) << "i won't export anything in stray" << dendl;
809 return;
810 }
811 } else {
812 if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
813 dout(7) << "dir is export pinned" << dendl;
814 return;
815 }
816 }
817
818 if (dir->is_frozen() ||
819 dir->is_freezing()) {
820 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
821 return;
822 }
823 if (dir->state_test(CDir::STATE_EXPORTING)) {
824 dout(7) << "already exporting" << dendl;
825 return;
826 }
827
828 if (g_conf->mds_thrash_exports) {
829 // create random subtree bound (which will not be exported)
830 list<CDir*> ls;
831 for (auto p = dir->begin(); p != dir->end(); ++p) {
832 auto dn = p->second;
833 CDentry::linkage_t *dnl= dn->get_linkage();
834 if (dnl->is_primary()) {
835 CInode *in = dnl->get_inode();
836 if (in->is_dir())
837 in->get_nested_dirfrags(ls);
838 }
839 }
840 if (ls.size() > 0) {
841 int n = rand() % ls.size();
842 auto p = ls.begin();
843 while (n--) ++p;
844 CDir *bd = *p;
845 if (!(bd->is_frozen() || bd->is_freezing())) {
846 assert(bd->is_auth());
847 dir->state_set(CDir::STATE_AUXSUBTREE);
848 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
849 dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
850 }
851 }
852 }
853
854 mds->hit_export_target(ceph_clock_now(), dest, -1);
855
856 dir->auth_pin(this);
857 dir->state_set(CDir::STATE_EXPORTING);
858
859 MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
860 mdr->more()->export_dir = dir;
861
862 assert(export_state.count(dir) == 0);
863 export_state_t& stat = export_state[dir];
864 stat.state = EXPORT_LOCKING;
865 stat.peer = dest;
866 stat.tid = mdr->reqid.tid;
867 stat.mut = mdr;
868
869 return mds->mdcache->dispatch_request(mdr);
870}
871
872void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
873{
874 dout(7) << "dispatch_export_dir " << *mdr << dendl;
875
876 CDir *dir = mdr->more()->export_dir;
877 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
878 if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
879 // export must have aborted.
880 dout(7) << "export must have aborted " << *mdr << dendl;
881 mds->mdcache->request_finish(mdr);
882 return;
883 }
884 assert(it->second.state == EXPORT_LOCKING);
885
886 mds_rank_t dest = it->second.peer;
887
888 if (!mds->is_export_target(dest)) {
889 dout(7) << "dest is not yet an export target" << dendl;
890 if (count > 3) {
891 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
892 export_try_cancel(dir);
893 return;
894 }
895
896 mds->locker->drop_locks(mdr.get());
897 mdr->drop_local_auth_pins();
898
899 mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
900 return;
901 }
902
903 if (!dir->inode->get_parent_dn()) {
904 dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
905 dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
906 return;
907 }
908
909 if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
910 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
911 export_try_cancel(dir);
912 return;
913 }
914
915 // locks?
916 set<SimpleLock*> rdlocks;
917 set<SimpleLock*> xlocks;
918 set<SimpleLock*> wrlocks;
919 get_export_lock_set(dir, rdlocks);
920 // If the auth MDS of the subtree root inode is neither the exporter MDS
921 // nor the importer MDS, it may gather the subtree root's fragstat/neststat
922 // while the subtree is being exported. The exporter MDS and the importer
923 // MDS may both be auth for the subtree root, or both be non-auth, at the
924 // time they receive the lock messages, so the auth MDS of the subtree
925 // root inode may get no, or duplicated, fragstat/neststat for the subtree
926 // root dirfrag.
927 wrlocks.insert(&dir->get_inode()->filelock);
928 wrlocks.insert(&dir->get_inode()->nestlock);
929 if (dir->get_inode()->is_auth()) {
930 dir->get_inode()->filelock.set_scatter_wanted();
931 dir->get_inode()->nestlock.set_scatter_wanted();
932 }
933
934 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
935 if (mdr->aborted)
936 export_try_cancel(dir);
937 return;
938 }
939
940 assert(g_conf->mds_kill_export_at != 1);
941 it->second.state = EXPORT_DISCOVERING;
942
943 // send ExportDirDiscover (ask target)
944 filepath path;
945 dir->inode->make_path(path);
946 MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
947 mds->get_nodeid(),
948 it->second.tid);
949 mds->send_message_mds(discover, dest);
950 assert(g_conf->mds_kill_export_at != 2);
951
952 it->second.last_cum_auth_pins_change = ceph_clock_now();
953
954 // start the freeze, but hold it up with an auth_pin.
955 dir->freeze_tree();
956 assert(dir->is_freezing_tree());
957 dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
958}
959
960/*
961 * called on receipt of MExportDirDiscoverAck
962 * the importer now has the directory's _inode_ in memory, and pinned.
963 *
964 * This function DOES put the passed message before returning
965 */
966void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
967{
968 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
969 mds_rank_t dest(m->get_source().num());
970 utime_t now = ceph_clock_now();
971 assert(dir);
972
973 dout(7) << "export_discover_ack from " << m->get_source()
974 << " on " << *dir << dendl;
975
976 mds->hit_export_target(now, dest, -1);
977
978 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
979 if (it == export_state.end() ||
980 it->second.tid != m->get_tid() ||
981 it->second.peer != dest) {
982 dout(7) << "must have aborted" << dendl;
983 } else {
984 assert(it->second.state == EXPORT_DISCOVERING);
985
986 if (m->is_success()) {
987 // release locks to avoid deadlock
988 MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
989 assert(mdr);
990 mds->mdcache->request_finish(mdr);
991 it->second.mut.reset();
992 // freeze the subtree
993 it->second.state = EXPORT_FREEZING;
994 dir->auth_unpin(this);
995 assert(g_conf->mds_kill_export_at != 3);
996
997 } else {
998 dout(7) << "peer failed to discover (not active?), canceling" << dendl;
999 export_try_cancel(dir, false);
1000 }
1001 }
1002
1003 m->put(); // done
1004}
1005
1006class C_M_ExportSessionsFlushed : public MigratorContext {
1007 CDir *dir;
1008 uint64_t tid;
1009public:
1010 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1011 : MigratorContext(m), dir(d), tid(t) {
1012 assert(dir != NULL);
1013 }
1014 void finish(int r) override {
1015 mig->export_sessions_flushed(dir, tid);
1016 }
1017};
1018
1019void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
1020{
1021 dout(7) << "export_sessions_flushed " << *dir << dendl;
1022
1023 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1024 if (it == export_state.end() ||
1025 it->second.state == EXPORT_CANCELLING ||
1026 it->second.tid != tid) {
1027 // export must have aborted.
1028 dout(7) << "export must have aborted on " << dir << dendl;
1029 return;
1030 }
1031
1032 assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
1033 assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
1034 it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
1035 if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
1036 export_go(dir); // start export.
1037}
1038
1039void Migrator::export_frozen(CDir *dir, uint64_t tid)
1040{
1041 dout(7) << "export_frozen on " << *dir << dendl;
1042
1043 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1044 if (it == export_state.end() || it->second.tid != tid) {
1045 dout(7) << "export must have aborted" << dendl;
1046 return;
1047 }
1048
1049 assert(it->second.state == EXPORT_FREEZING);
1050 assert(dir->is_frozen_tree_root());
1051 assert(dir->get_cum_auth_pins() == 0);
1052
1053 CInode *diri = dir->get_inode();
1054
1055 // ok, try to grab all my locks.
1056 set<SimpleLock*> rdlocks;
1057 get_export_lock_set(dir, rdlocks);
1058 if ((diri->is_auth() && diri->is_frozen()) ||
1059 !mds->locker->can_rdlock_set(rdlocks) ||
1060 !diri->filelock.can_wrlock(-1) ||
1061 !diri->nestlock.can_wrlock(-1)) {
1062 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1063 << *dir << dendl;
1064 // .. unwind ..
1065 dir->unfreeze_tree();
1066 cache->try_subtree_merge(dir);
1067
1068 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
1069 export_state.erase(it);
1070
1071 dir->state_clear(CDir::STATE_EXPORTING);
1072 cache->maybe_send_pending_resolves();
1073 return;
1074 }
1075
1076 it->second.mut = new MutationImpl();
1077 if (diri->is_auth())
1078 it->second.mut->auth_pin(diri);
1079 mds->locker->rdlock_take_set(rdlocks, it->second.mut);
1080 mds->locker->wrlock_force(&diri->filelock, it->second.mut);
1081 mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
1082
1083 cache->show_subtrees();
1084
1085 // CDir::_freeze_tree() should have forced it into subtree.
1086 assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
1087
1088 set<client_t> export_client_set;
1089 check_export_size(dir, it->second, export_client_set);
1090
1091 // note the bounds.
1092 set<CDir*> bounds;
1093 cache->get_subtree_bounds(dir, bounds);
1094
1095 // generate prep message, log entry.
1096 MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
1097
1098 // include list of bystanders
1099 for (const auto &p : dir->get_replicas()) {
1100 if (p.first != it->second.peer) {
1101 dout(10) << "bystander mds." << p.first << dendl;
1102 prep->add_bystander(p.first);
1103 }
1104 }
1105
1106 // include base dirfrag
1107 cache->replicate_dir(dir, it->second.peer, prep->basedir);
1108
1109 /*
1110 * include spanning tree for all nested exports.
1111 * these need to be on the destination _before_ the final export so that
1112 * dir_auth updates on any nested exports are properly absorbed.
1113 * this includes inodes and dirfrags included in the subtree, but
1114 * only the inodes at the bounds.
1115 *
1116 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1117 */
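/*
 * Illustrative sketch (not part of the original file): the importer's
 * handle_export_prep (not shown in this excerpt) reads each trace top-down;
 * the loop below builds it bottom-up by prepending.  Roughly:
 *
 *   dirfrag_t df;
 *   char start;
 *   ::decode(df, blp);     // dirfrag the trace starts from
 *   ::decode(start, blp);  // '-' nothing more, 'f' dirfrag first, 'd' dentry first
 *   if (start == 'f')
 *     cache->add_replica_dir(...);        // consume the replicated dirfrag
 *   if (start != '-')
 *     while (trace data remains) {        // then dentry + inode pairs, each
 *       cache->add_replica_dentry(...);   // optionally followed by another
 *       cache->add_replica_inode(...);    // dirfrag, per the grammar above
 *     }
 */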
1118 set<inodeno_t> inodes_added;
1119 set<dirfrag_t> dirfrags_added;
1120
1121 // check bounds
1122 for (set<CDir*>::iterator p = bounds.begin();
1123 p != bounds.end();
1124 ++p) {
1125 CDir *bound = *p;
1126
1127 // pin it.
1128 assert(bound->state_test(CDir::STATE_EXPORTBOUND));
1129
1130 dout(7) << " export bound " << *bound << dendl;
1131 prep->add_bound( bound->dirfrag() );
1132
1133 // trace to bound
1134 bufferlist tracebl;
1135 CDir *cur = bound;
1136
1137 char start = '-';
1138 if (it->second.residual_dirs.count(bound)) {
1139 start = 'f';
1140 cache->replicate_dir(bound, it->second.peer, tracebl);
1141 dout(7) << " added " << *bound << dendl;
1142 }
1143
1144 while (1) {
1145 // don't repeat inodes
1146 if (inodes_added.count(cur->inode->ino()))
1147 break;
1148 inodes_added.insert(cur->inode->ino());
1149
1150 // prepend dentry + inode
1151 assert(cur->inode->is_auth());
1152 bufferlist bl;
1153 cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
1154 dout(7) << " added " << *cur->inode->parent << dendl;
1155 cache->replicate_inode(cur->inode, it->second.peer, bl,
1156 mds->mdsmap->get_up_features());
1157 dout(7) << " added " << *cur->inode << dendl;
1158 bl.claim_append(tracebl);
1159 tracebl.claim(bl);
1160
1161 cur = cur->get_parent_dir();
1162
1163 // don't repeat dirfrags
1164 if (dirfrags_added.count(cur->dirfrag()) ||
1165 cur == dir) {
1166 start = 'd'; // start with dentry
1167 break;
1168 }
1169 dirfrags_added.insert(cur->dirfrag());
1170
1171 // prepend dir
1172 cache->replicate_dir(cur, it->second.peer, bl);
1173 dout(7) << " added " << *cur << dendl;
1174 bl.claim_append(tracebl);
1175 tracebl.claim(bl);
1176
1177 start = 'f'; // start with dirfrag
1178 }
1179 bufferlist final_bl;
1180 dirfrag_t df = cur->dirfrag();
1181 ::encode(df, final_bl);
1182 ::encode(start, final_bl);
1183 final_bl.claim_append(tracebl);
1184 prep->add_trace(final_bl);
1185 }
1186
1187 // send.
1188 it->second.state = EXPORT_PREPPING;
1189 mds->send_message_mds(prep, it->second.peer);
1190 assert (g_conf->mds_kill_export_at != 4);
1191
1192 // make sure any new instantiations of caps are flushed out
1193 assert(it->second.warning_ack_waiting.empty());
1194
1195 MDSGatherBuilder gather(g_ceph_context);
1196 mds->server->flush_client_sessions(export_client_set, gather);
1197 if (gather.has_subs()) {
1198 it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
1199 gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
1200 gather.activate();
1201 }
1202}
1203
1204 void Migrator::check_export_size(CDir *dir, export_state_t& stat, set<client_t>& client_set)
1205 {
1206 const unsigned frag_size = 800;
1207 const unsigned inode_size = 1000;
1208 const unsigned cap_size = 80;
1209 const unsigned link_size = 10;
1210 const unsigned null_size = 1;
1211
1212 uint64_t max_size = g_conf->get_val<uint64_t>("mds_max_export_size");
1213 uint64_t approx_size = 0;
1214
1215 list<CDir*> dfs;
1216 dfs.push_back(dir);
1217 while (!dfs.empty()) {
1218 CDir *dir = dfs.front();
1219 dfs.pop_front();
1220
1221 approx_size += frag_size;
1222 for (auto &p : *dir) {
1223 CDentry *dn = p.second;
1224 if (dn->get_linkage()->is_null()) {
1225 approx_size += null_size;
1226 continue;
1227 }
1228 if (dn->get_linkage()->is_remote()) {
1229 approx_size += link_size;
1230 continue;
1231 }
1232
1233 approx_size += inode_size;
1234 CInode *in = dn->get_linkage()->get_inode();
1235 if (in->is_dir()) {
1236 // directory?
1237 list<CDir*> ls;
1238 in->get_dirfrags(ls);
1239 for (auto q : ls) {
1240 if (q->is_subtree_root()) {
1241 q->state_set(CDir::STATE_EXPORTBOUND);
1242 q->get(CDir::PIN_EXPORTBOUND);
1243 } else {
1244 // include nested dirfrag
1245 assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
1246 dfs.push_front(q);
1247 }
1248 }
1249 }
1250 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1251 q != in->client_caps.end();
1252 ++q) {
1253 approx_size += cap_size;
1254 client_set.insert(q->first);
1255 }
1256 }
1257
1258 if (approx_size >= max_size)
1259 break;
1260 }
1261
1262 while (!dfs.empty()) {
1263 CDir *dir = dfs.front();
1264 dfs.pop_front();
1265
1266 dout(7) << "check_export_size: creating bound " << *dir << dendl;
1267 assert(dir->is_auth());
1268 dir->state_set(CDir::STATE_EXPORTBOUND);
1269 dir->get(CDir::PIN_EXPORTBOUND);
1270
1271 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
1272 // Another choice here is finishing all WAIT_UNFREEZE contexts and keeping
1273 // the newly created subtree unfreeze.
1274 dir->_freeze_tree();
1275
1276 stat.residual_dirs.insert(dir);
1277 }
1278}
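/*
 * Worked example (illustrative, not in the original source): with the
 * constants above, one dirfrag holding 10,000 primary dentries whose inodes
 * each carry a single client cap is estimated at
 *
 *   800 + 10,000 * (1000 + 80) = 10,800,800 bytes  (~10.3 MiB)
 *
 * Once the running estimate reaches mds_max_export_size, the walk stops and
 * every dirfrag still queued is turned into an export bound (residual dir)
 * here, so it stays behind as its own frozen subtree instead of being
 * exported.
 */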
1279
1280void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1281{
1282 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1283 q != in->client_caps.end();
1284 ++q)
1285 client_set.insert(q->first);
1286}
1287
1288/* This function DOES put the passed message before returning*/
1289void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
1290{
1291 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1292 mds_rank_t dest(m->get_source().num());
1293 utime_t now = ceph_clock_now();
1294 assert(dir);
1295
1296 dout(7) << "export_prep_ack " << *dir << dendl;
1297
1298 mds->hit_export_target(now, dest, -1);
1299
1300 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1301 if (it == export_state.end() ||
1302 it->second.tid != m->get_tid() ||
1303 it->second.peer != mds_rank_t(m->get_source().num())) {
1304 // export must have aborted.
1305 dout(7) << "export must have aborted" << dendl;
1306 m->put();
1307 return;
1308 }
1309 assert(it->second.state == EXPORT_PREPPING);
1310
1311 if (!m->is_success()) {
1312 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
1313 export_try_cancel(dir, false);
1314 m->put();
1315 return;
1316 }
1317
1318 assert (g_conf->mds_kill_export_at != 5);
1319 // send warnings
1320 set<CDir*> bounds;
1321 cache->get_subtree_bounds(dir, bounds);
1322
1323 assert(it->second.warning_ack_waiting.empty() ||
1324 (it->second.warning_ack_waiting.size() == 1 &&
1325 it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
1326 assert(it->second.notify_ack_waiting.empty());
1327
1328 for (const auto &p : dir->get_replicas()) {
1329 if (p.first == it->second.peer) continue;
1330 if (mds->is_cluster_degraded() &&
1331 !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
1332 continue; // only if active
1333 it->second.warning_ack_waiting.insert(p.first);
1334 it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too!
7c673cae
FG
1335
1336 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
1337 mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1338 mds_authority_t(mds->get_nodeid(),it->second.peer));
1339 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
1340 notify->get_bounds().push_back((*q)->dirfrag());
1341 mds->send_message_mds(notify, p.first);
1342
1343 }
1344
1345 it->second.state = EXPORT_WARNING;
1346
1347 assert(g_conf->mds_kill_export_at != 6);
1348 // nobody to warn?
1349 if (it->second.warning_ack_waiting.empty())
1350 export_go(dir); // start export.
1351
1352 // done.
1353 m->put();
1354}
1355
1356
1357class C_M_ExportGo : public MigratorContext {
1358 CDir *dir;
1359 uint64_t tid;
1360public:
1361 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1362 MigratorContext(m), dir(d), tid(t) {
1363 assert(dir != NULL);
1364 }
1365 void finish(int r) override {
1366 mig->export_go_synced(dir, tid);
1367 }
1368};
1369
1370void Migrator::export_go(CDir *dir)
1371{
1372 auto it = export_state.find(dir);
1373 assert(it != export_state.end());
1374 dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl;
1375
1376 // first sync log to flush out e.g. any cap imports
1377 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid));
1378 mds->mdlog->flush();
1379}
1380
1381void Migrator::export_go_synced(CDir *dir, uint64_t tid)
1382{
1383 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1384 if (it == export_state.end() ||
1385 it->second.state == EXPORT_CANCELLING ||
1386 it->second.tid != tid) {
1387 // export must have aborted.
1388 dout(7) << "export must have aborted on " << dir << dendl;
1389 return;
1390 }
1391 assert(it->second.state == EXPORT_WARNING);
1392 mds_rank_t dest = it->second.peer;
1393
1394 dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
1395
1396 cache->show_subtrees();
1397
1398 it->second.state = EXPORT_EXPORTING;
1399 assert(g_conf->mds_kill_export_at != 7);
1400
1401 assert(dir->is_frozen_tree_root());
1402 assert(dir->get_cum_auth_pins() == 0);
1403
1404 // set ambiguous auth
1405 cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
1406
1407 // take away the popularity we're sending.
1408 utime_t now = ceph_clock_now();
1409 mds->balancer->subtract_export(dir, now);
1410
1411 // fill export message with cache data
1412 MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
1413 map<client_t,entity_inst_t> exported_client_map;
1414 uint64_t num_exported_inodes = encode_export_dir(req->export_data,
1415 dir, // recur start point
1416 exported_client_map,
1417 now);
1418 ::encode(exported_client_map, req->client_map,
1419 mds->mdsmap->get_up_features());
1420
1421 // add bounds to message
1422 set<CDir*> bounds;
1423 cache->get_subtree_bounds(dir, bounds);
1424 for (set<CDir*>::iterator p = bounds.begin();
1425 p != bounds.end();
1426 ++p)
1427 req->add_export((*p)->dirfrag());
1428
1429 // send
1430 mds->send_message_mds(req, dest);
1431 assert(g_conf->mds_kill_export_at != 8);
1432
1433 mds->hit_export_target(now, dest, num_exported_inodes+1);
1434
1435 // stats
1436 if (mds->logger) mds->logger->inc(l_mds_exported);
1437 if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
1438
1439 cache->show_subtrees();
1440}
1441
1442
1443/** encode_export_inode
1444 * update our local state for this inode to export.
1445 * encode relevant state to be sent over the wire.
1446 * used by: encode_export_dir, file_rename (if foreign)
1447 *
1448 * FIXME: the separation between CInode.encode_export and these methods
1449 * is pretty arbitrary and dumb.
1450 */
1451void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
1452 map<client_t,entity_inst_t>& exported_client_map)
1453{
1454 dout(7) << "encode_export_inode " << *in << dendl;
1455 assert(!in->is_replica(mds->get_nodeid()));
1456
1457 // relax locks?
1458 if (!in->is_replicated()) {
1459 in->replicate_relax_locks();
1460 dout(20) << " did replicate_relax_locks, now " << *in << dendl;
1461 }
1462
1463 ::encode(in->inode.ino, enc_state);
1464 ::encode(in->last, enc_state);
1465 in->encode_export(enc_state);
1466
1467 // caps
1468 encode_export_inode_caps(in, true, enc_state, exported_client_map);
1469}
1470
1471void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
1472 map<client_t,entity_inst_t>& exported_client_map)
1473{
1474 dout(20) << "encode_export_inode_caps " << *in << dendl;
1475
1476 // encode caps
1477 map<client_t,Capability::Export> cap_map;
1478 in->export_client_caps(cap_map);
1479 ::encode(cap_map, bl);
1480 if (auth_cap) {
1481 ::encode(in->get_mds_caps_wanted(), bl);
1482
1483 in->state_set(CInode::STATE_EXPORTINGCAPS);
1484 in->get(CInode::PIN_EXPORTINGCAPS);
1485 }
1486
1487 // make note of clients named by exported capabilities
1488 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1489 it != in->client_caps.end();
1490 ++it)
1491 exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
1492}
1493
1494void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1495 map<client_t,Capability::Import>& peer_imported)
1496{
1497 dout(20) << "finish_export_inode_caps " << *in << dendl;
1498
1499 in->state_clear(CInode::STATE_EXPORTINGCAPS);
1500 in->put(CInode::PIN_EXPORTINGCAPS);
1501
1502 // tell (all) clients about migrating caps..
1503 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1504 it != in->client_caps.end();
1505 ++it) {
1506 Capability *cap = it->second;
1507 dout(7) << "finish_export_inode_caps telling client." << it->first
1508 << " exported caps on " << *in << dendl;
1509 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1510 cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
1511
1512 map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
1513 assert(q != peer_imported.end());
1514 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, peer, 0);
1515 mds->send_message_client_counted(m, it->first);
1516 }
1517 in->clear_client_caps_after_export();
1518 mds->locker->eval(in, CEPH_CAP_LOCKS);
1519}
1520
1521void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
1522 map<client_t,Capability::Import>& peer_imported,
1523 list<MDSInternalContextBase*>& finished)
1524{
1525 dout(12) << "finish_export_inode " << *in << dendl;
1526
1527 // clean
1528 if (in->is_dirty())
1529 in->mark_clean();
1530
1531 // clear/unpin cached_by (we're no longer the authority)
1532 in->clear_replica_map();
1533
1534 // twiddle lock states for auth -> replica transition
1535 in->authlock.export_twiddle();
1536 in->linklock.export_twiddle();
1537 in->dirfragtreelock.export_twiddle();
1538 in->filelock.export_twiddle();
1539 in->nestlock.export_twiddle();
1540 in->xattrlock.export_twiddle();
1541 in->snaplock.export_twiddle();
1542 in->flocklock.export_twiddle();
1543 in->policylock.export_twiddle();
1544
1545 // mark auth
1546 assert(in->is_auth());
1547 in->state_clear(CInode::STATE_AUTH);
1548 in->replica_nonce = CInode::EXPORT_NONCE;
1549
1550 in->clear_dirty_rstat();
1551
1552 // no more auth subtree? clear scatter dirty
1553 if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
1554 in->clear_scatter_dirty();
1555
1556 in->item_open_file.remove_myself();
1557
1558 in->clear_dirty_parent();
1559
1560 in->clear_file_locks();
1561
1562 // waiters
1563 in->take_waiting(CInode::WAIT_ANY_MASK, finished);
1564
1565 in->finish_export(now);
1566
1567 finish_export_inode_caps(in, peer, peer_imported);
1568}
1569
1570uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
1571 CDir *dir,
1572 map<client_t,entity_inst_t>& exported_client_map,
1573 utime_t now)
1574{
1575 uint64_t num_exported = 0;
1576
1577 dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
1578
1579 assert(dir->get_projected_version() == dir->get_version());
1580
1581#ifdef MDS_VERIFY_FRAGSTAT
1582 if (dir->is_complete())
1583 dir->verify_fragstat();
1584#endif
1585
1586 // dir
1587 dirfrag_t df = dir->dirfrag();
1588 ::encode(df, exportbl);
1589 dir->encode_export(exportbl);
1590
1591 __u32 nden = dir->items.size();
1592 ::encode(nden, exportbl);
1593
1594 // dentries
1595 list<CDir*> subdirs;
1596 for (auto &p : *dir) {
1597 CDentry *dn = p.second;
1598 CInode *in = dn->get_linkage()->get_inode();
1599
1600 if (!dn->is_replicated())
1601 dn->lock.replicate_relax();
1602
1603 num_exported++;
1604
1605 // -- dentry
1606 dout(7) << "encode_export_dir exporting " << *dn << dendl;
1607
1608 // dn name
1609 ::encode(dn->get_name(), exportbl);
1610 ::encode(dn->last, exportbl);
1611
1612 // state
1613 dn->encode_export(exportbl);
1614
1615 // points to...
1616
1617 // null dentry?
1618 if (dn->get_linkage()->is_null()) {
1619 exportbl.append("N", 1); // null dentry
1620 continue;
1621 }
1622
1623 if (dn->get_linkage()->is_remote()) {
1624 // remote link
1625 exportbl.append("L", 1); // remote link
1626
1627 inodeno_t ino = dn->get_linkage()->get_remote_ino();
1628 unsigned char d_type = dn->get_linkage()->get_remote_d_type();
1629 ::encode(ino, exportbl);
1630 ::encode(d_type, exportbl);
1631 continue;
1632 }
1633
1634 // primary link
1635 // -- inode
1636 exportbl.append("I", 1); // inode dentry
1637
1638 encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export
1639
1640 // directory?
1641 list<CDir*> dfs;
1642 in->get_dirfrags(dfs);
1643 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
1644 CDir *t = *p;
1645 if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
1646 // include nested dirfrag
1647 assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
1648 subdirs.push_front(t); // it's ours, recurse (later)
1649 }
1650 }
1651 }
1652
1653 // subdirs
1654 for (auto &dir : subdirs)
1655 num_exported += encode_export_dir(exportbl, dir, exported_client_map, now);
1656
1657 return num_exported;
1658}
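/*
 * Illustrative sketch (not part of the original file): the per-dirfrag wire
 * layout produced above is
 *
 *   dirfrag_t df, CDir::encode_export state, __u32 nden,
 *   then for each dentry: name, snapid 'last', CDentry::encode_export state,
 *   and a one-byte tag:
 *     'N'  null dentry
 *     'L'  remote link: inodeno_t + d_type
 *     'I'  primary link: encode_export_inode payload (ino, last, inode state,
 *          then the cap map from encode_export_inode_caps)
 *
 * with every nested (non-bound) dirfrag appended recursively in the same
 * format.  The importer's decode_import_dir / decode_import_inode (not shown
 * in this excerpt) consume it in the same order.
 */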
1659
1660void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
1661 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
1662 list<MDSInternalContextBase*>& finished, int *num_dentries)
1663{
1664 dout(10) << "finish_export_dir " << *dir << dendl;
1665
1666 // release open_by
1667 dir->clear_replica_map();
1668
1669 // mark
1670 assert(dir->is_auth());
1671 dir->state_clear(CDir::STATE_AUTH);
1672 dir->remove_bloom();
1673 dir->replica_nonce = CDir::EXPORT_NONCE;
1674
1675 if (dir->is_dirty())
1676 dir->mark_clean();
1677
1678 // suck up all waiters
1679 dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters
1680
1681 // pop
1682 dir->finish_export(now);
1683
1684 // dentries
1685 list<CDir*> subdirs;
1686 for (auto &p : *dir) {
1687 CDentry *dn = p.second;
1688 CInode *in = dn->get_linkage()->get_inode();
1689
1690 // dentry
1691 dn->finish_export();
1692
1693 // inode?
1694 if (dn->get_linkage()->is_primary()) {
1695 finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);
1696
1697 // subdirs?
1698 in->get_nested_dirfrags(subdirs);
1699 }
1700
1701 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
1702 ++(*num_dentries);
1703 }
1704
1705 // subdirs
1706 for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
1707 finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
1708}
1709
1710class C_MDS_ExportFinishLogged : public MigratorLogContext {
1711 CDir *dir;
1712public:
1713 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1714 void finish(int r) override {
1715 mig->export_logged_finish(dir);
1716 }
1717};
1718
1719
1720/*
1721 * i should get an export_ack from the export target.
1722 *
1723 * This function DOES put the passed message before returning
1724 */
1725void Migrator::handle_export_ack(MExportDirAck *m)
1726{
1727 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1728 mds_rank_t dest(m->get_source().num());
1729 utime_t now = ceph_clock_now();
1730 assert(dir);
1731 assert(dir->is_frozen_tree_root()); // i'm exporting!
1732
1733 // yay!
1734 dout(7) << "handle_export_ack " << *dir << dendl;
1735
1736 mds->hit_export_target(now, dest, -1);
1737
1738 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1739 assert(it != export_state.end());
1740 assert(it->second.state == EXPORT_EXPORTING);
1741 assert(it->second.tid == m->get_tid());
1742
1743 bufferlist::iterator bp = m->imported_caps.begin();
1744 ::decode(it->second.peer_imported, bp);
1745
1746 it->second.state = EXPORT_LOGGINGFINISH;
1747 assert (g_conf->mds_kill_export_at != 9);
1748 set<CDir*> bounds;
1749 cache->get_subtree_bounds(dir, bounds);
1750
1751 // log completion.
1752 // include export bounds, to ensure they're in the journal.
1753 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);
1754 mds->mdlog->start_entry(le);
1755
1756 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
1757 le->metablob.add_dir(dir, false);
1758 for (set<CDir*>::iterator p = bounds.begin();
1759 p != bounds.end();
1760 ++p) {
1761 CDir *bound = *p;
1762 le->get_bounds().insert(bound->dirfrag());
1763 le->metablob.add_dir_context(bound);
1764 le->metablob.add_dir(bound, false);
1765 }
1766
1767 // list us second, them first.
1768 // this keeps authority().first in sync with subtree auth state in the journal.
1769 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1770
1771 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1772 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1773 mds->mdlog->flush();
1774 assert (g_conf->mds_kill_export_at != 10);
1775
1776 m->put();
1777}
1778
1779 void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
1780{
1781 dout(7) << "export_notify_abort " << *dir << dendl;
1782
1783 assert(stat.state == EXPORT_CANCELLING);
1784
1785 if (stat.notify_ack_waiting.empty()) {
1786 stat.state = EXPORT_CANCELLED;
1787 return;
1788 }
1789
1790 dir->auth_pin(this);
1791
1792 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1793 p != stat.notify_ack_waiting.end();
1794 ++p) {
1795 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1796 pair<int,int>(mds->get_nodeid(), stat.peer),
1797 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
1798 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1799 notify->get_bounds().push_back((*i)->dirfrag());
1800 mds->send_message_mds(notify, *p);
1801 }
1802}
1803
1804/*
1805 * this happens if the dest fails after i send the export data but before it is acked
1806 * that is, we don't know they safely received and logged it, so we reverse our changes
1807 * and go on.
1808 */
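// Reversal, in brief: walk the subtree clearing the per-dir/dentry/inode
// export state, drop the EXPORTBOUND pins, tell bystanders the export is
// aborting, take the subtree auth back, process delayed cache expires,
// unfreeze (with a possible subtree merge), and re-evaluate any caps that
// were flagged stale during the attempt.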
b32b8144 1809void Migrator::export_reverse(CDir *dir, export_state_t& stat)
7c673cae
FG
1810{
1811 dout(7) << "export_reverse " << *dir << dendl;
1812
1813 set<CInode*> to_eval;
1814
1815 set<CDir*> bounds;
1816 cache->get_subtree_bounds(dir, bounds);
1817
1818 // remove exporting pins
1819 list<CDir*> rq;
1820 rq.push_back(dir);
1821 while (!rq.empty()) {
1822 CDir *t = rq.front();
1823 rq.pop_front();
1824 t->abort_export();
1825 for (auto &p : *t) {
1826 CDentry *dn = p.second;
1827 dn->abort_export();
1828 if (!dn->get_linkage()->is_primary())
1829 continue;
1830 CInode *in = dn->get_linkage()->get_inode();
1831 in->abort_export();
1832 if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
1833 in->state_clear(CInode::STATE_EVALSTALECAPS);
1834 to_eval.insert(in);
1835 }
1836 if (in->is_dir())
1837 in->get_nested_dirfrags(rq);
1838 }
1839 }
1840
1841 // unpin bounds
1842 for (auto bd : bounds) {
1843 bd->put(CDir::PIN_EXPORTBOUND);
1844 bd->state_clear(CDir::STATE_EXPORTBOUND);
1845 }
1846
1847 // notify bystanders
1848 export_notify_abort(dir, stat, bounds);
1849
1850 // unfreeze tree, with possible subtree merge.
1851 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
1852
1853 // process delayed expires
1854 cache->process_delayed_expire(dir);
1855
1856 dir->unfreeze_tree();
1857 cache->try_subtree_merge(dir);
1858 for (auto bd : stat.residual_dirs) {
1859 bd->unfreeze_tree();
1860 cache->try_subtree_merge(bd);
1861 }
1862
1863 // revoke/resume stale caps
1864 for (auto in : to_eval) {
1865 bool need_issue = false;
1866 for (auto& p : in->get_client_caps()) {
1867 Capability *cap = p.second;
1868 if (cap->is_stale()) {
1869 mds->locker->revoke_stale_caps(cap);
1870 } else {
1871 need_issue = true;
1872 }
1873 }
1874 if (need_issue &&
1875 (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
1876 mds->locker->issue_caps(in);
1877 }
1878
1879 cache->show_cache();
1880}
1881
1882
1883/*
1884 * once i get the ack, and logged the EExportFinish(true),
1885 * send notifies (if any), otherwise go straight to finish.
1886 *
1887 */
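// In other words: fan an MExportDirNotify out to every bystander recorded in
// notify_ack_waiting, enter EXPORT_NOTIFYING, and (if there are bystanders)
// send a non-final MExportDirFinish so the importer may start sending cap
// import messages to clients; with no bystanders we jump straight to
// export_finish().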
1888void Migrator::export_logged_finish(CDir *dir)
1889{
1890 dout(7) << "export_logged_finish " << *dir << dendl;
1891
1892 export_state_t& stat = export_state[dir];
1893
1894 // send notifies
1895 set<CDir*> bounds;
1896 cache->get_subtree_bounds(dir, bounds);
1897
1898 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1899 p != stat.notify_ack_waiting.end();
1900 ++p) {
1901 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1902 pair<int,int>(mds->get_nodeid(), stat.peer),
1903 pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
1904
1905 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1906 notify->get_bounds().push_back((*i)->dirfrag());
1907
1908 mds->send_message_mds(notify, *p);
1909 }
1910
1911 // wait for notifyacks
1912 stat.state = EXPORT_NOTIFYING;
1913 assert (g_conf->mds_kill_export_at != 11);
1914
1915 // no notifies to wait for?
1916 if (stat.notify_ack_waiting.empty()) {
1917 export_finish(dir); // skip notify/notify_ack stage.
1918 } else {
1919 // notify peer to send cap import messages to clients
1920 if (!mds->is_cluster_degraded() ||
1921 mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
1922 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
1923 } else {
1924 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
1925 }
1926 }
1927}
1928
1929/*
1930 * warning:
1931 * i'll get an ack from each bystander.
1932 * when i get them all, do the export.
1933 * notify:
1934 * i'll get an ack from each bystander.
1935 * when i get them all, unfreeze and send the finish.
1936 *
1937 * This function DOES put the passed message before returning
1938 */
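// The same ack message is interpreted four ways below, keyed off our state:
// a warning ack while EXPORT_WARNING (all in -> export_go), a notify ack
// while EXPORT_NOTIFYING (all in -> export_finish), a notify ack while
// EXPORT_CANCELLING (all in -> export_cancel_finish), and, on the import
// side, an abort ack while IMPORT_ABORTING (all in -> import_reverse_unfreeze).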
1939void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
1940{
1941 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1942 mds_rank_t dest(m->get_source().num());
1943 utime_t now = ceph_clock_now();
1944 assert(dir);
1945 mds_rank_t from = mds_rank_t(m->get_source().num());
1946
1947 mds->hit_export_target(now, dest, -1);
1948
1949 auto export_state_entry = export_state.find(dir);
1950 if (export_state_entry != export_state.end()) {
1951 export_state_t& stat = export_state_entry->second;
1952 if (stat.state == EXPORT_WARNING &&
1953 stat.warning_ack_waiting.erase(from)) {
1954 // exporting. process warning.
1955 dout(7) << "handle_export_notify_ack from " << m->get_source()
1956 << ": exporting, processing warning on " << *dir << dendl;
1957 if (stat.warning_ack_waiting.empty())
1958 export_go(dir); // start export.
1959 } else if (stat.state == EXPORT_NOTIFYING &&
1960 stat.notify_ack_waiting.erase(from)) {
1961 // exporting. process notify.
1962 dout(7) << "handle_export_notify_ack from " << m->get_source()
1963 << ": exporting, processing notify on " << *dir << dendl;
1964 if (stat.notify_ack_waiting.empty())
1965 export_finish(dir);
1966 } else if (stat.state == EXPORT_CANCELLING &&
1967 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
1968 stat.notify_ack_waiting.erase(from)) {
1969 dout(7) << "handle_export_notify_ack from " << m->get_source()
1970 << ": cancelling export, processing notify on " << *dir << dendl;
1971 if (stat.notify_ack_waiting.empty()) {
1972 export_state.erase(export_state_entry);
1973 export_cancel_finish(dir);
1974 }
1975 }
1976 }
1977 else {
1978 auto import_state_entry = import_state.find(dir->dirfrag());
1979 if (import_state_entry != import_state.end()) {
1980 import_state_t& stat = import_state_entry->second;
1981 if (stat.state == IMPORT_ABORTING) {
1982 // reversing import
1983 dout(7) << "handle_export_notify_ack from " << m->get_source()
1984 << ": aborting import on " << *dir << dendl;
1985 assert(stat.bystanders.count(from));
1986 stat.bystanders.erase(from);
1987 if (stat.bystanders.empty())
1988 import_reverse_unfreeze(dir);
1989 }
1990 }
1991 }
1992
1993 m->put();
1994}
1995
1996void Migrator::export_finish(CDir *dir)
1997{
1998 dout(5) << "export_finish " << *dir << dendl;
1999
2000 assert (g_conf->mds_kill_export_at != 12);
2001 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
2002 if (it == export_state.end()) {
2003 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
2004 return;
2005 }
2006
2007 // send finish/commit to new auth
2008 if (!mds->is_cluster_degraded() ||
2009 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
2010 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
2011 } else {
2012 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
2013 }
2014 assert(g_conf->mds_kill_export_at != 13);
2015
2016 // finish export (adjust local cache state)
2017 int num_dentries = 0;
2018 list<MDSInternalContextBase*> finished;
2019 finish_export_dir(dir, ceph_clock_now(), it->second.peer,
2020 it->second.peer_imported, finished, &num_dentries);
2021
2022 assert(!dir->is_auth());
2023 cache->adjust_subtree_auth(dir, it->second.peer);
2024
2025 // unpin bounds
2026 set<CDir*> bounds;
2027 cache->get_subtree_bounds(dir, bounds);
2028 for (set<CDir*>::iterator p = bounds.begin();
2029 p != bounds.end();
2030 ++p) {
2031 CDir *bd = *p;
2032 bd->put(CDir::PIN_EXPORTBOUND);
2033 bd->state_clear(CDir::STATE_EXPORTBOUND);
2034 }
2035
2036 if (dir->state_test(CDir::STATE_AUXSUBTREE))
2037 dir->state_clear(CDir::STATE_AUXSUBTREE);
2038
2039 // discard delayed expires
2040 cache->discard_delayed_expire(dir);
2041
2042 dout(7) << "export_finish unfreezing" << dendl;
2043
2044 // unfreeze tree, with possible subtree merge.
2045 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
2046 dir->unfreeze_tree();
2047 cache->try_subtree_merge(dir);
2048 for (auto bd : it->second.residual_dirs) {
2049 export_queue.push_front(pair<dirfrag_t,mds_rank_t>(bd->dirfrag(), it->second.peer));
2050 bd->take_waiting(CDir::WAIT_ANY_MASK, finished);
2051 bd->unfreeze_tree();
2052 cache->try_subtree_merge(bd);
2053 }
2054
2055 // no more auth subtree? clear scatter dirty
2056 if (!dir->get_inode()->is_auth() &&
2057 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2058 dir->get_inode()->clear_scatter_dirty();
2059 // wake up scatter_nudge waiters
2060 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
2061 }
2062
2063 if (!finished.empty())
2064 mds->queue_waiters(finished);
2065
2066 MutationRef mut = it->second.mut;
2067 // remove from exporting list, clean up state
2068 export_state.erase(it);
2069 dir->state_clear(CDir::STATE_EXPORTING);
2070
2071 cache->show_subtrees();
2072 audit();
2073
2074 cache->trim(num_dentries); // try trimming exported dentries
2075
2076 // send pending import_maps?
2077 mds->mdcache->maybe_send_pending_resolves();
2078
2079 // drop locks, unpin path
2080 if (mut) {
2081 mds->locker->drop_locks(mut.get());
2082 mut->cleanup();
2083 }
2084
2085 maybe_do_queued_export();
2086}
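// The tail of the exporter state machine, as exercised by the handlers above,
// is roughly:
//
//   EXPORT_WARNING      --(all warning acks)--> export_go()
//   EXPORT_EXPORTING    --(MExportDirAck)-----> EXPORT_LOGGINGFINISH
//   EXPORT_LOGGINGFINISH--(EExport journaled)-> EXPORT_NOTIFYING
//   EXPORT_NOTIFYING    --(all notify acks)---> export_finish()
//   EXPORT_CANCELLING   --(all notify acks)---> EXPORT_CANCELLED / export_cancel_finish()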
2087
2088
2089
2090
2091
2092
2093
2094
2095// ==========================================================
2096// IMPORT
2097
2098void Migrator::handle_export_discover(MExportDirDiscover *m)
2099{
2100 mds_rank_t from = m->get_source_mds();
2101 assert(from != mds->get_nodeid());
2102
2103 dout(7) << "handle_export_discover on " << m->get_path() << dendl;
2104
2105 // note import state
2106 dirfrag_t df = m->get_dirfrag();
2107
2108 if (!mds->is_active()) {
2109 dout(7) << " not active, send NACK " << dendl;
2110 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
2111 m->put();
2112 return;
2113 }
2114
2115 // only start discovering on this message once.
2116 import_state_t *p_state;
2117 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2118 if (!m->started) {
2119 assert(it == import_state.end());
2120 m->started = true;
2121 p_state = &import_state[df];
2122 p_state->state = IMPORT_DISCOVERING;
2123 p_state->peer = from;
2124 p_state->tid = m->get_tid();
2125 } else {
2126 // am i retrying after ancient path_traverse results?
2127 if (it == import_state.end() ||
2128 it->second.peer != from ||
2129 it->second.tid != m->get_tid()) {
2130 dout(7) << " dropping obsolete message" << dendl;
2131 m->put();
2132 return;
2133 }
2134 assert(it->second.state == IMPORT_DISCOVERING);
2135 p_state = &it->second;
2136 }
2137
2138 if (!mds->mdcache->is_open()) {
2139 dout(5) << " waiting for root" << dendl;
2140 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
2141 return;
2142 }
2143
2144 assert (g_conf->mds_kill_import_at != 1);
2145
2146 // do we have it?
2147 CInode *in = cache->get_inode(m->get_dirfrag().ino);
2148 if (!in) {
2149 // must discover it!
2150 filepath fpath(m->get_path());
2151 vector<CDentry*> trace;
2152 MDRequestRef null_ref;
2153 int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
2154 if (r > 0) return;
2155 if (r < 0) {
2156 dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
2157 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2158 }
2159
2160 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2161 }
2162
2163 // yay
2164 dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
2165
2166 p_state->state = IMPORT_DISCOVERED;
2167
2168 // pin inode in the cache (for now)
2169 assert(in->is_dir());
2170 in->get(CInode::PIN_IMPORTING);
2171
2172 // reply
2173 dout(7) << " sending export_discover_ack on " << *in << dendl;
2174 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), p_state->peer);
2175 m->put();
2176 assert (g_conf->mds_kill_import_at != 2);
2177}
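// Discover handshake, importer side: record IMPORT_DISCOVERING keyed by the
// dirfrag, traverse/discover the base inode if we do not have it yet, pin it,
// move to IMPORT_DISCOVERED and reply with MExportDirDiscoverAck; a NACK is
// sent instead if this rank is not active.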
2178
2179void Migrator::import_reverse_discovering(dirfrag_t df)
2180{
2181 import_state.erase(df);
2182}
2183
2184void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
2185{
2186 // unpin base
2187 diri->put(CInode::PIN_IMPORTING);
2188 import_state.erase(df);
2189}
2190
b32b8144 2191void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
7c673cae
FG
2192{
2193 set<CDir*> bounds;
2194 cache->map_dirfrag_set(stat.bound_ls, bounds);
2195 import_remove_pins(dir, bounds);
2196 import_reverse_final(dir);
2197}
2198
2199/* This function DOES put the passed message before returning*/
2200void Migrator::handle_export_cancel(MExportDirCancel *m)
2201{
2202 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2203 dirfrag_t df = m->get_dirfrag();
2204 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2205 if (it == import_state.end()) {
2206 assert(0 == "got export_cancel in weird state");
2207 } else if (it->second.state == IMPORT_DISCOVERING) {
2208 import_reverse_discovering(df);
2209 } else if (it->second.state == IMPORT_DISCOVERED) {
2210 CInode *in = cache->get_inode(df.ino);
2211 assert(in);
2212 import_reverse_discovered(df, in);
2213 } else if (it->second.state == IMPORT_PREPPING) {
2214 CDir *dir = mds->mdcache->get_dirfrag(df);
2215 assert(dir);
2216 import_reverse_prepping(dir, it->second);
2217 } else if (it->second.state == IMPORT_PREPPED) {
2218 CDir *dir = mds->mdcache->get_dirfrag(df);
2219 assert(dir);
2220 set<CDir*> bounds;
2221 cache->get_subtree_bounds(dir, bounds);
2222 import_remove_pins(dir, bounds);
2223 // adjust auth back to the exporter
2224 cache->adjust_subtree_auth(dir, it->second.peer);
2225 import_reverse_unfreeze(dir);
2226 } else {
2227 assert(0 == "got export_cancel in weird state");
2228 }
2229 m->put();
2230}
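// Cancellation unwinds exactly as far as the import has progressed:
// DISCOVERING just drops the state, DISCOVERED also unpins the base inode,
// PREPPING unwinds via import_reverse_prepping(), and PREPPED additionally
// hands the subtree auth back to the exporter and unfreezes.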
2231
2232/* This function DOES put the passed message before returning*/
2233void Migrator::handle_export_prep(MExportDirPrep *m)
2234{
2235 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2236 assert(oldauth != mds->get_nodeid());
2237
2238 CDir *dir;
2239 CInode *diri;
2240 list<MDSInternalContextBase*> finished;
2241
2242 // assimilate root dir.
2243 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2244 if (!m->did_assim()) {
2245 assert(it != import_state.end());
2246 assert(it->second.state == IMPORT_DISCOVERED);
2247 assert(it->second.peer == oldauth);
2248 diri = cache->get_inode(m->get_dirfrag().ino);
2249 assert(diri);
2250 bufferlist::iterator p = m->basedir.begin();
2251 dir = cache->add_replica_dir(p, diri, oldauth, finished);
2252 dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
2253 } else {
2254 if (it == import_state.end() ||
2255 it->second.peer != oldauth ||
2256 it->second.tid != m->get_tid()) {
2257 dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
2258 m->put();
2259 return;
2260 }
2261 assert(it->second.state == IMPORT_PREPPING);
2262 assert(it->second.peer == oldauth);
2263
2264 dir = cache->get_dirfrag(m->get_dirfrag());
2265 assert(dir);
2266 dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
2267 diri = dir->get_inode();
2268 }
2269 assert(dir->is_auth() == false);
2270
2271 cache->show_subtrees();
2272
2273 // build import bound map
2274 map<inodeno_t, fragset_t> import_bound_fragset;
2275 for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
2276 p != m->get_bounds().end();
2277 ++p) {
2278 dout(10) << " bound " << *p << dendl;
2279 import_bound_fragset[p->ino].insert(p->frag);
2280 }
2281
2282 // assimilate contents?
2283 if (!m->did_assim()) {
2284 dout(7) << "doing assim on " << *dir << dendl;
2285 m->mark_assim(); // only do this the first time!
2286
2287 // change import state
2288 it->second.state = IMPORT_PREPPING;
2289 it->second.bound_ls = m->get_bounds();
2290 it->second.bystanders = m->get_bystanders();
2291 assert(g_conf->mds_kill_import_at != 3);
2292
2293 // bystander list
2294 dout(7) << "bystanders are " << it->second.bystanders << dendl;
2295
2296 // move pin to dir
2297 diri->put(CInode::PIN_IMPORTING);
2298 dir->get(CDir::PIN_IMPORTING);
2299 dir->state_set(CDir::STATE_IMPORTING);
2300
2301 // assimilate traces to exports
2302 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2303 for (list<bufferlist>::iterator p = m->traces.begin();
2304 p != m->traces.end();
2305 ++p) {
2306 bufferlist::iterator q = p->begin();
2307 dirfrag_t df;
2308 ::decode(df, q);
2309 char start;
2310 ::decode(start, q);
2311 dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;
2312
2313 CDir *cur = 0;
2314 if (start == 'd') {
2315 cur = cache->get_dirfrag(df);
2316 assert(cur);
2317 dout(10) << " had " << *cur << dendl;
2318 } else if (start == 'f') {
2319 CInode *in = cache->get_inode(df.ino);
2320 assert(in);
2321 dout(10) << " had " << *in << dendl;
2322 cur = cache->add_replica_dir(q, in, oldauth, finished);
2323 dout(10) << " added " << *cur << dendl;
2324 } else if (start == '-') {
2325 // nothing
2326 } else
2327 assert(0 == "unrecognized start char");
2328
2329 while (!q.end()) {
2330 CDentry *dn = cache->add_replica_dentry(q, cur, finished);
2331 dout(10) << " added " << *dn << dendl;
2332 CInode *in = cache->add_replica_inode(q, dn, finished);
2333 dout(10) << " added " << *in << dendl;
2334 if (q.end())
2335 break;
2336 cur = cache->add_replica_dir(q, in, oldauth, finished);
2337 dout(10) << " added " << *cur << dendl;
2338 }
2339 }
2340
2341 // make bound sticky
2342 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2343 p != import_bound_fragset.end();
2344 ++p) {
2345 CInode *in = cache->get_inode(p->first);
2346 assert(in);
2347 in->get_stickydirs();
2348 dout(7) << " set stickydirs on bound inode " << *in << dendl;
2349 }
2350
2351 } else {
2352 dout(7) << " not doing assim on " << *dir << dendl;
2353 }
2354
2355 if (!finished.empty())
2356 mds->queue_waiters(finished);
2357
2358
2359 bool success = true;
2360 if (mds->is_active()) {
2361 // open all bounds
2362 set<CDir*> import_bounds;
2363 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2364 p != import_bound_fragset.end();
2365 ++p) {
2366 CInode *in = cache->get_inode(p->first);
2367 assert(in);
2368
2369 // map fragset into a frag_t list, based on the inode fragtree
2370 list<frag_t> fglist;
2371 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2372 in->dirfragtree.get_leaves_under(*q, fglist);
2373 dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;
2374
2375 for (list<frag_t>::iterator q = fglist.begin();
2376 q != fglist.end();
2377 ++q) {
2378 CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
2379 if (!bound) {
2380 dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
2381 cache->open_remote_dirfrag(in, *q,
2382 new C_MDS_RetryMessage(mds, m));
2383 return;
2384 }
2385
2386 if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
2387 dout(7) << " pinning import bound " << *bound << dendl;
2388 bound->get(CDir::PIN_IMPORTBOUND);
2389 bound->state_set(CDir::STATE_IMPORTBOUND);
2390 } else {
2391 dout(7) << " already pinned import bound " << *bound << dendl;
2392 }
2393 import_bounds.insert(bound);
2394 }
2395 }
2396
2397 dout(7) << " all ready, noting auth and freezing import region" << dendl;
2398
2399 if (!mds->mdcache->is_readonly() &&
2400 dir->get_inode()->filelock.can_wrlock(-1) &&
2401 dir->get_inode()->nestlock.can_wrlock(-1)) {
2402 it->second.mut = new MutationImpl();
2403 // force some locks. hacky.
2404 mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
2405 mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
2406
2407 // note that i am an ambiguous auth for this subtree.
2408 // specify bounds, since the exporter explicitly defines the region.
2409 cache->adjust_bounded_subtree_auth(dir, import_bounds,
2410 pair<int,int>(oldauth, mds->get_nodeid()));
2411 cache->verify_subtree_bounds(dir, import_bounds);
2412 // freeze.
2413 dir->_freeze_tree();
2414 // note new state
2415 it->second.state = IMPORT_PREPPED;
2416 } else {
2417 dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
2418 success = false;
2419 }
2420 } else {
2421 dout(7) << " not active, failing. " << *dir << dendl;
2422 success = false;
2423 }
2424
2425 if (!success)
2426 import_reverse_prepping(dir, it->second);
2427
2428 // ok!
2429 dout(7) << " sending export_prep_ack on " << *dir << dendl;
2430 mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());
2431
2432 assert(g_conf->mds_kill_import_at != 4);
2433 // done
2434 m->put();
2435}
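// Prep, in brief: on the first pass we assimilate the replicated base dir,
// the dentry/inode traces and the bound list sent by the exporter, make the
// bound inodes sticky, then (re)open every bounding dirfrag; once all bounds
// are open and the base inode's file/nest locks can be wrlocked we mark the
// subtree as ambiguously authoritative ((exporter, us)), freeze it, move to
// IMPORT_PREPPED and ack success -- otherwise we reverse the prep and nack.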
2436
2437
2438
2439
2440class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
2441 dirfrag_t df;
2442 CDir *dir;
2443 mds_rank_t from;
2444public:
2445 map<client_t,entity_inst_t> imported_client_map;
2446 map<client_t,uint64_t> sseqmap;
2447
2448 C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
2449 MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
2450 }
2451 void finish(int r) override {
2452 mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
2453 }
2454};
2455
2456/* This function DOES put the passed message before returning*/
2457void Migrator::handle_export_dir(MExportDir *m)
2458{
2459 assert (g_conf->mds_kill_import_at != 5);
2460 CDir *dir = cache->get_dirfrag(m->dirfrag);
2461 assert(dir);
2462
2463 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2464 dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
2465
2466 assert(!dir->is_auth());
2467
2468 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
2469 assert(it != import_state.end());
2470 assert(it->second.state == IMPORT_PREPPED);
2471 assert(it->second.tid == m->get_tid());
2472 assert(it->second.peer == oldauth);
2473
2474 utime_t now = ceph_clock_now();
2475
2476 if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
2477 dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
2478
2479 cache->show_subtrees();
2480
2481 C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
2482
2483 // start the journal entry
2484 EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
2485 mds->mdlog->start_entry(le);
2486
2487 le->metablob.add_dir_context(dir);
2488
2489 // adjust auth (list us _first_)
2490 cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
2491
2492 // new client sessions, open these after we journal
2493 // include imported sessions in EImportStart
2494 bufferlist::iterator cmp = m->client_map.begin();
2495 ::decode(onlogged->imported_client_map, cmp);
2496 assert(cmp.end());
2497 le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
2498 le->client_map.claim(m->client_map);
2499
2500 bufferlist::iterator blp = m->export_data.begin();
2501 int num_imported_inodes = 0;
2502 while (!blp.end()) {
2503 num_imported_inodes +=
2504 decode_import_dir(blp,
2505 oldauth,
2506 dir, // import root
2507 le,
2508 mds->mdlog->get_current_segment(),
2509 it->second.peer_exports,
2510 it->second.updated_scatterlocks,
2511 now);
2512 }
2513 dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
2514
2515 // include bounds in EImportStart
2516 set<CDir*> import_bounds;
2517 for (vector<dirfrag_t>::iterator p = m->bounds.begin();
2518 p != m->bounds.end();
2519 ++p) {
2520 CDir *bd = cache->get_dirfrag(*p);
2521 assert(bd);
2522 le->metablob.add_dir(bd, false); // note that parent metadata is already in the event
2523 import_bounds.insert(bd);
2524 }
2525 cache->verify_subtree_bounds(dir, import_bounds);
2526
2527 // adjust popularity
2528 mds->balancer->add_import(dir, now);
2529
2530 dout(7) << "handle_export_dir did " << *dir << dendl;
2531
2532 // note state
2533 it->second.state = IMPORT_LOGGINGSTART;
2534 assert (g_conf->mds_kill_import_at != 6);
2535
2536 // log it
2537 mds->mdlog->submit_entry(le, onlogged);
2538 mds->mdlog->flush();
2539
2540 // some stats
2541 if (mds->logger) {
2542 mds->logger->inc(l_mds_imported);
2543 mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
2544 }
2545
2546 m->put();
2547}
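// Note the ordering here: the bulk import is decoded and an EImportStart
// (covering the bounds and the imported client map) is journaled before we
// ack anything; the exporter only learns the import is durable when
// import_logged_start() sends MExportDirAck after the journal flush.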
2548
2549
2550/*
2551 * this is an import helper
2552 * called by import_finish, and import_reverse and friends.
2553 */
2554void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2555{
2556 import_state_t& stat = import_state[dir->dirfrag()];
2557 // root
2558 dir->put(CDir::PIN_IMPORTING);
2559 dir->state_clear(CDir::STATE_IMPORTING);
2560
2561 // bounding inodes
2562 set<inodeno_t> did;
2563 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2564 p != stat.bound_ls.end();
2565 ++p) {
2566 if (did.count(p->ino))
2567 continue;
2568 did.insert(p->ino);
2569 CInode *in = cache->get_inode(p->ino);
2570 assert(in);
2571 in->put_stickydirs();
2572 }
2573
2574 if (stat.state == IMPORT_PREPPING) {
2575 for (auto bd : bounds) {
2576 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2577 bd->put(CDir::PIN_IMPORTBOUND);
2578 bd->state_clear(CDir::STATE_IMPORTBOUND);
2579 }
2580 }
2581 } else if (stat.state >= IMPORT_PREPPED) {
2582 // bounding dirfrags
2583 for (auto bd : bounds) {
2584 assert(bd->state_test(CDir::STATE_IMPORTBOUND));
2585 bd->put(CDir::PIN_IMPORTBOUND);
2586 bd->state_clear(CDir::STATE_IMPORTBOUND);
2587 }
2588 }
2589}
2590
2591
2592/*
2593 * note: this does the full work of reversing an import and cleaning up
2594 * state.
2595 * called by both handle_mds_failure and by handle_resolve (if we are
2596 * a survivor coping with an exporter failure+recovery).
2597 */
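// Reversal on the import side, in brief: enter IMPORT_ABORTING, strip the
// auth bit and replica/dirty state from everything we had provisionally
// imported, drop imported client caps (if we had reached IMPORT_ACKING),
// journal EImportFinish(false), then notify bystanders and wait for their
// acks before unfreezing in import_reverse_unfreeze().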
2598void Migrator::import_reverse(CDir *dir)
2599{
2600 dout(7) << "import_reverse " << *dir << dendl;
2601
2602 import_state_t& stat = import_state[dir->dirfrag()];
2603 stat.state = IMPORT_ABORTING;
2604
2605 set<CDir*> bounds;
2606 cache->get_subtree_bounds(dir, bounds);
2607
2608 // remove pins
2609 import_remove_pins(dir, bounds);
2610
2611 // update auth, with possible subtree merge.
2612 assert(dir->is_subtree_root());
2613 if (mds->is_resolve())
2614 cache->trim_non_auth_subtree(dir);
2615
2616 cache->adjust_subtree_auth(dir, stat.peer);
2617
2618 C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
2619 if (!dir->get_inode()->is_auth() &&
2620 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2621 dir->get_inode()->clear_scatter_dirty();
2622 // wake up scatter_nudge waiters
2623 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2624 }
2625
2626 int num_dentries = 0;
2627 // adjust auth bits.
2628 list<CDir*> q;
2629 q.push_back(dir);
2630 while (!q.empty()) {
2631 CDir *cur = q.front();
2632 q.pop_front();
2633
2634 // dir
2635 assert(cur->is_auth());
2636 cur->state_clear(CDir::STATE_AUTH);
2637 cur->remove_bloom();
2638 cur->clear_replica_map();
2639 cur->set_replica_nonce(CDir::EXPORT_NONCE);
2640 if (cur->is_dirty())
2641 cur->mark_clean();
2642
2643 for (auto &p : *cur) {
2644 CDentry *dn = p.second;
2645
2646 // dentry
2647 dn->state_clear(CDentry::STATE_AUTH);
2648 dn->clear_replica_map();
2649 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2650 if (dn->is_dirty())
2651 dn->mark_clean();
2652
2653 // inode?
2654 if (dn->get_linkage()->is_primary()) {
2655 CInode *in = dn->get_linkage()->get_inode();
2656 in->state_clear(CDentry::STATE_AUTH);
2657 in->clear_replica_map();
2658 in->set_replica_nonce(CInode::EXPORT_NONCE);
2659 if (in->is_dirty())
2660 in->mark_clean();
2661 in->clear_dirty_rstat();
2662 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2663 in->clear_scatter_dirty();
2664 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2665 }
2666
2667 in->clear_dirty_parent();
2668
2669 in->authlock.clear_gather();
2670 in->linklock.clear_gather();
2671 in->dirfragtreelock.clear_gather();
2672 in->filelock.clear_gather();
2673
2674 in->clear_file_locks();
2675
2676 // non-bounding dir?
2677 list<CDir*> dfs;
2678 in->get_dirfrags(dfs);
2679 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2680 if (bounds.count(*p) == 0)
2681 q.push_back(*p);
2682 }
2683
2684 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2685 ++num_dentries;
2686 }
2687 }
2688
2689 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
2690
2691 if (stat.state == IMPORT_ACKING) {
2692 // remove imported caps
2693 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
2694 p != stat.peer_exports.end();
2695 ++p) {
2696 CInode *in = p->first;
2697 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2698 q != p->second.end();
2699 ++q) {
2700 Capability *cap = in->get_client_cap(q->first);
2701 assert(cap);
2702 if (cap->is_importing())
2703 in->remove_client_cap(q->first);
2704 }
2705 in->put(CInode::PIN_IMPORTINGCAPS);
2706 }
2707 for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
2708 p != stat.client_map.end();
2709 ++p) {
2710 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
2711 assert(session);
2712 session->dec_importing();
2713 }
2714 }
2715
2716 // log our failure
2717 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2718
2719 cache->trim(num_dentries); // try trimming dentries
2720
2721 // notify bystanders; wait in aborting state
2722 import_notify_abort(dir, bounds);
2723}
2724
2725void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2726{
2727 dout(7) << "import_notify_finish " << *dir << dendl;
2728
2729 import_state_t& stat = import_state[dir->dirfrag()];
2730 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2731 p != stat.bystanders.end();
2732 ++p) {
2733 MExportDirNotify *notify =
2734 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
2735 pair<int,int>(stat.peer, mds->get_nodeid()),
2736 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
2737 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2738 notify->get_bounds().push_back((*i)->dirfrag());
2739 mds->send_message_mds(notify, *p);
2740 }
2741}
2742
2743void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
2744{
2745 dout(7) << "import_notify_abort " << *dir << dendl;
2746
2747 import_state_t& stat = import_state[dir->dirfrag()];
2748 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2749 p != stat.bystanders.end(); ) {
2750 if (mds->is_cluster_degraded() &&
2751 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
2752 // this can happen if both exporter and bystander fail in the same mdsmap epoch
2753 stat.bystanders.erase(p++);
2754 continue;
2755 }
2756 MExportDirNotify *notify =
2757 new MExportDirNotify(dir->dirfrag(), stat.tid, true,
2758 mds_authority_t(stat.peer, mds->get_nodeid()),
2759 mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
2760 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2761 notify->get_bounds().push_back((*i)->dirfrag());
2762 mds->send_message_mds(notify, *p);
2763 ++p;
2764 }
2765 if (stat.bystanders.empty()) {
2766 dout(7) << "no bystanders, finishing reverse now" << dendl;
2767 import_reverse_unfreeze(dir);
2768 } else {
2769 assert (g_conf->mds_kill_import_at != 10);
2770 }
2771}
2772
2773void Migrator::import_reverse_unfreeze(CDir *dir)
2774{
2775 dout(7) << "import_reverse_unfreeze " << *dir << dendl;
2776 assert(!dir->is_auth());
2777 cache->discard_delayed_expire(dir);
2778 dir->unfreeze_tree();
2779 if (dir->is_subtree_root())
2780 cache->try_subtree_merge(dir);
2781 import_reverse_final(dir);
2782}
2783
2784void Migrator::import_reverse_final(CDir *dir)
2785{
2786 dout(7) << "import_reverse_final " << *dir << dendl;
2787
2788 // clean up
2789 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2790 assert(it != import_state.end());
2791
2792 MutationRef mut = it->second.mut;
2793 import_state.erase(it);
2794
2795 // send pending import_maps?
2796 mds->mdcache->maybe_send_pending_resolves();
2797
2798 if (mut) {
2799 mds->locker->drop_locks(mut.get());
2800 mut->cleanup();
2801 }
2802
2803 cache->show_subtrees();
2804 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2805}
2806
2807
2808
2809
2810void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
2811 map<client_t,entity_inst_t>& imported_client_map,
2812 map<client_t,uint64_t>& sseqmap)
2813{
2814 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2815 if (it == import_state.end() ||
2816 it->second.state != IMPORT_LOGGINGSTART) {
2817 dout(7) << "import " << df << " must have aborted" << dendl;
2818 mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
2819 return;
2820 }
2821
2822 dout(7) << "import_logged " << *dir << dendl;
2823
2824 // note state
2825 it->second.state = IMPORT_ACKING;
2826
2827 assert (g_conf->mds_kill_import_at != 7);
2828
2829 // force open client sessions and finish cap import
2830 mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
2831 it->second.client_map.swap(imported_client_map);
2832
2833 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
2834 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2835 p != it->second.peer_exports.end();
2836 ++p) {
2837 // parameter 'peer' is NONE, delay sending cap import messages to client
2838 finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
2839 }
2840
2841 // send notify's etc.
2842 dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
2843
2844 // test surviving observer of a failed migration that did not complete
2845 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
2846
2847 MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
2848 ::encode(imported_caps, ack->imported_caps);
2849
2850 mds->send_message_mds(ack, from);
2851 assert (g_conf->mds_kill_import_at != 8);
2852
2853 cache->show_subtrees();
2854}
2855
2856/* This function DOES put the passed message before returning*/
2857void Migrator::handle_export_finish(MExportDirFinish *m)
2858{
2859 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
2860 assert(dir);
2861 dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;
2862
2863 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2864 assert(it != import_state.end());
2865 assert(it->second.tid == m->get_tid());
2866
2867 import_finish(dir, false, m->is_last());
2868
2869 m->put();
2870}
2871
2872void Migrator::import_finish(CDir *dir, bool notify, bool last)
2873{
2874 dout(7) << "import_finish on " << *dir << dendl;
2875
2876 map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
2877 assert(it != import_state.end());
2878 assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
2879
2880 if (it->second.state == IMPORT_ACKING) {
2881 assert(dir->is_auth());
2882 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
2883 }
2884
2885 // log finish
2886 assert(g_conf->mds_kill_import_at != 9);
2887
2888 if (it->second.state == IMPORT_ACKING) {
2889 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2890 p != it->second.peer_exports.end();
2891 ++p) {
2892 CInode *in = p->first;
2893 assert(in->is_auth());
2894 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2895 q != p->second.end();
2896 ++q) {
2897 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
2898 assert(session);
2899 Capability *cap = in->get_client_cap(q->first);
2900 assert(cap);
2901 cap->merge(q->second, true);
2902 cap->clear_importing();
2903 mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
2904 q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
2905 }
2906 p->second.clear();
2907 in->replica_caps_wanted = 0;
2908 }
2909 for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
2910 p != it->second.client_map.end();
2911 ++p) {
2912 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
2913 assert(session);
2914 session->dec_importing();
2915 }
2916 }
2917
2918 if (!last) {
2919 assert(it->second.state == IMPORT_ACKING);
2920 it->second.state = IMPORT_FINISHING;
2921 return;
2922 }
2923
2924 // remove pins
2925 set<CDir*> bounds;
2926 cache->get_subtree_bounds(dir, bounds);
2927
2928 if (notify)
2929 import_notify_finish(dir, bounds);
2930
2931 import_remove_pins(dir, bounds);
2932
2933 map<CInode*, map<client_t,Capability::Export> > peer_exports;
2934 it->second.peer_exports.swap(peer_exports);
2935
2936 // clear import state (we're done!)
2937 MutationRef mut = it->second.mut;
2938 import_state.erase(it);
2939
2940 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
2941
2942 // process delayed expires
2943 cache->process_delayed_expire(dir);
2944
2945 // unfreeze tree, with possible subtree merge.
2946 dir->unfreeze_tree();
2947 cache->try_subtree_merge(dir);
2948
2949 cache->show_subtrees();
2950 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2951
2952 if (mut) {
2953 mds->locker->drop_locks(mut.get());
2954 mut->cleanup();
2955 }
2956
2957 // re-eval imported caps
2958 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
2959 p != peer_exports.end();
2960 ++p) {
2961 if (p->first->is_auth())
2962 mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
2963 p->first->put(CInode::PIN_IMPORTINGCAPS);
2964 }
2965
2966 // send pending import_maps?
2967 mds->mdcache->maybe_send_pending_resolves();
2968
2969 // did i just import mydir?
2970 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
2971 cache->populate_mydir();
2972
2973 // is it empty?
2974 if (dir->get_num_head_items() == 0 &&
2975 !dir->inode->is_auth()) {
2976 // reexport!
2977 export_empty_import(dir);
2978 }
2979}
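// Importer state sequence, as driven by the handlers above (success path):
//
//   IMPORT_DISCOVERING -> IMPORT_DISCOVERED                 (handle_export_discover)
//   IMPORT_DISCOVERED  -> IMPORT_PREPPING -> IMPORT_PREPPED (handle_export_prep)
//   IMPORT_PREPPED     -> IMPORT_LOGGINGSTART               (handle_export_dir)
//   IMPORT_LOGGINGSTART-> IMPORT_ACKING                     (import_logged_start)
//   IMPORT_ACKING      -> IMPORT_FINISHING -> done          (handle_export_finish/import_finish)
//
// with IMPORT_ABORTING used while a reversed import waits for bystander acks.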
2980
2981
2982void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
2983 mds_rank_t oldauth, LogSegment *ls,
2984 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
2985 list<ScatterLock*>& updated_scatterlocks)
2986{
2987 dout(15) << "decode_import_inode on " << *dn << dendl;
2988
2989 inodeno_t ino;
2990 snapid_t last;
2991 ::decode(ino, blp);
2992 ::decode(last, blp);
2993
2994 bool added = false;
2995 CInode *in = cache->get_inode(ino, last);
2996 if (!in) {
2997 in = new CInode(mds->mdcache, true, 1, last);
2998 added = true;
2999 }
3000
3001 // state after link -- or not! -sage
3002 in->decode_import(blp, ls); // cap imports are noted for later action
3003
3004 // caps
3005 decode_import_inode_caps(in, true, blp, peer_exports);
3006
3007 // link before state -- or not! -sage
3008 if (dn->get_linkage()->get_inode() != in) {
3009 assert(!dn->get_linkage()->get_inode());
3010 dn->dir->link_primary_inode(dn, in);
3011 }
3012
3013 // add inode?
3014 if (added) {
3015 cache->add_inode(in);
3016 dout(10) << "added " << *in << dendl;
3017 } else {
3018 dout(10) << " had " << *in << dendl;
3019 }
3020
3021 if (in->inode.is_dirty_rstat())
3022 in->mark_dirty_rstat();
3023
3024 // clear if dirtyscattered, since we're going to journal this
3025 // but not until we _actually_ finish the import...
3026 if (in->filelock.is_dirty()) {
3027 updated_scatterlocks.push_back(&in->filelock);
3028 mds->locker->mark_updated_scatterlock(&in->filelock);
3029 }
3030
3031 if (in->dirfragtreelock.is_dirty()) {
3032 updated_scatterlocks.push_back(&in->dirfragtreelock);
3033 mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
3034 }
3035
3036 // adjust replica list
3037 //assert(!in->is_replica(oldauth)); // not true on failed export
3038 in->add_replica(oldauth, CInode::EXPORT_NONCE);
3039 if (in->is_replica(mds->get_nodeid()))
3040 in->remove_replica(mds->get_nodeid());
3041}
3042
3043void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
3044 bufferlist::iterator &blp,
3045 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
3046{
3047 map<client_t,Capability::Export> cap_map;
3048 ::decode(cap_map, blp);
3049 if (auth_cap)
3050 ::decode(in->get_mds_caps_wanted(), blp);
3051 if (!cap_map.empty() ||
3052 (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
3053 peer_exports[in].swap(cap_map);
3054 in->get(CInode::PIN_IMPORTINGCAPS);
3055 }
3056}
3057
3058void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
3059 map<client_t,Capability::Export> &export_map,
3060 map<client_t,Capability::Import> &import_map)
3061{
3062 for (map<client_t,Capability::Export>::iterator it = export_map.begin();
3063 it != export_map.end();
3064 ++it) {
3065 dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
3066 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
3067 assert(session);
3068
3069 Capability *cap = in->get_client_cap(it->first);
3070 if (!cap) {
3071 cap = in->add_client_cap(it->first, session);
3072 if (peer < 0)
3073 cap->mark_importing();
3074 }
3075
3076 Capability::Import& im = import_map[it->first];
3077 im.cap_id = cap->get_cap_id();
3078 im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
3079 im.issue_seq = cap->get_last_seq() + 1;
3080
3081 if (peer >= 0) {
3082 cap->merge(it->second, auth_cap);
3083 mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
3084 it->second.seq, it->second.mseq - 1, peer,
3085 auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
3086 }
3087 }
3088
3089 if (peer >= 0) {
3090 in->replica_caps_wanted = 0;
3091 in->put(CInode::PIN_IMPORTINGCAPS);
3092 }
3093}
3094
3095int Migrator::decode_import_dir(bufferlist::iterator& blp,
3096 mds_rank_t oldauth,
3097 CDir *import_root,
3098 EImportStart *le,
3099 LogSegment *ls,
3100 map<CInode*,map<client_t,Capability::Export> >& peer_exports,
3101 list<ScatterLock*>& updated_scatterlocks, utime_t now)
3102{
3103 // set up dir
3104 dirfrag_t df;
3105 ::decode(df, blp);
3106
3107 CInode *diri = cache->get_inode(df.ino);
3108 assert(diri);
3109 CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
3110 assert(dir);
3111
3112 dout(7) << "decode_import_dir " << *dir << dendl;
3113
3114 // assimilate state
3115 dir->decode_import(blp, now, ls);
3116
3117 // adjust replica list
3118 //assert(!dir->is_replica(oldauth)); // not true on failed export
3119 dir->add_replica(oldauth, CDir::EXPORT_NONCE);
3120 if (dir->is_replica(mds->get_nodeid()))
3121 dir->remove_replica(mds->get_nodeid());
3122
3123 // add to journal entry
3124 if (le)
3125 le->metablob.add_import_dir(dir);
3126
3127 int num_imported = 0;
3128
3129 // take all waiters on this dir
3130 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3131 // a replica's presence in my cache implies/forces its presence in the authority's.
3132 list<MDSInternalContextBase*> waiters;
3133
3134 dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
3135 for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
3136 it != waiters.end();
3137 ++it)
3138 import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure
3139
3140 dout(15) << "doing contents" << dendl;
3141
3142 // contents
3143 __u32 nden;
3144 ::decode(nden, blp);
3145
3146 for (; nden>0; nden--) {
3147 num_imported++;
3148
3149 // dentry
3150 string dname;
3151 snapid_t last;
3152 ::decode(dname, blp);
3153 ::decode(last, blp);
3154
3155 CDentry *dn = dir->lookup_exact_snap(dname, last);
3156 if (!dn)
3157 dn = dir->add_null_dentry(dname, 1, last);
3158
3159 dn->decode_import(blp, ls);
3160
3161 dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
3162 if (dn->is_replica(mds->get_nodeid()))
3163 dn->remove_replica(mds->get_nodeid());
3164
3165 // dentry lock in unreadable state can block path traverse
3166 if (dn->lock.get_state() != LOCK_SYNC)
3167 mds->locker->try_eval(&dn->lock, NULL);
3168
3169 dout(15) << "decode_import_dir got " << *dn << dendl;
3170
3171 // points to...
3172 char icode;
3173 ::decode(icode, blp);
3174
3175 if (icode == 'N') {
3176 // null dentry
3177 assert(dn->get_linkage()->is_null());
3178
3179 // fall thru
3180 }
3181 else if (icode == 'L') {
3182 // remote link
3183 inodeno_t ino;
3184 unsigned char d_type;
3185 ::decode(ino, blp);
3186 ::decode(d_type, blp);
3187 if (dn->get_linkage()->is_remote()) {
3188 assert(dn->get_linkage()->get_remote_ino() == ino);
3189 } else {
3190 dir->link_remote_inode(dn, ino, d_type);
3191 }
3192 }
3193 else if (icode == 'I') {
3194 // inode
3195 assert(le);
3196 decode_import_inode(dn, blp, oldauth, ls,
3197 peer_exports, updated_scatterlocks);
3198 }
3199
3200 // add dentry to journal entry
3201 if (le)
3202 le->metablob.add_import_dentry(dn);
3203 }
3204
3205#ifdef MDS_VERIFY_FRAGSTAT
3206 if (dir->is_complete())
3207 dir->verify_fragstat();
3208#endif
3209
3210 dir->inode->maybe_export_pin();
3211
3212 dout(7) << "decode_import_dir done " << *dir << dendl;
3213 return num_imported;
3214}
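// Wire format consumed above, for reference: a dirfrag_t and the dir state,
// then a __u32 dentry count, and per dentry the name, snapid, dentry state
// and a type code -- 'N' for null, 'L' for a remote link (ino + d_type),
// 'I' for a full primary inode decoded by decode_import_inode().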
3215
3216
3217
3218
3219
3220// authority bystander
3221
3222/* This function DOES put the passed message before returning*/
3223void Migrator::handle_export_notify(MExportDirNotify *m)
3224{
3225 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
3226 m->put();
3227 return;
3228 }
3229
3230 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3231
3232 mds_rank_t from = mds_rank_t(m->get_source().num());
3233 mds_authority_t old_auth = m->get_old_auth();
3234 mds_authority_t new_auth = m->get_new_auth();
3235
3236 if (!dir) {
3237 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3238 << " on missing dir " << m->get_dirfrag() << dendl;
3239 } else if (dir->authority() != old_auth) {
3240 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3241 << " != " << old_auth << " -> " << new_auth
3242 << " on " << *dir << dendl;
3243 } else {
3244 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3245 << " on " << *dir << dendl;
3246 // adjust auth
3247 set<CDir*> have;
3248 cache->map_dirfrag_set(m->get_bounds(), have);
3249 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3250
3251 // induce a merge?
3252 cache->try_subtree_merge(dir);
3253 }
3254
3255 // send ack
3256 if (m->wants_ack()) {
3257 mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
3258 } else {
3259 // aborted. no ack.
3260 dout(7) << "handle_export_notify no ack requested" << dendl;
3261 }
3262
3263 m->put();
3264}
3265
3266/** cap exports **/
3267void Migrator::export_caps(CInode *in)
3268{
3269 mds_rank_t dest = in->authority().first;
3270 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3271
3272 assert(in->is_any_caps());
3273 assert(!in->is_auth());
3274 assert(!in->is_ambiguous_auth());
3275 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3276
3277 MExportCaps *ex = new MExportCaps;
3278 ex->ino = in->ino();
3279
3280 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3281
3282 mds->send_message_mds(ex, dest);
3283}
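// export_caps()/handle_export_caps() migrate client capabilities for a single
// inode to its authoritative MDS without moving any subtree: the receiver
// journals the imported sessions (ESessions) and finishes the cap import in
// logged_import_caps() once that entry is on disk.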
3284
3285void Migrator::handle_gather_caps(MGatherCaps *m)
3286{
3287 CInode *in = cache->get_inode(m->ino);
3288
3289 if (!in)
3290 goto out;
3291
3292 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
3293 << " on " << *in
3294 << dendl;
3295 if (in->is_any_caps() &&
3296 !in->is_auth() &&
3297 !in->is_ambiguous_auth() &&
3298 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3299 export_caps(in);
3300
3301out:
3302 m->put();
3303}
3304
3305class C_M_LoggedImportCaps : public MigratorLogContext {
3306 CInode *in;
3307 mds_rank_t from;
3308public:
3309 map<CInode*, map<client_t,Capability::Export> > peer_exports;
3310 map<client_t,entity_inst_t> client_map;
3311 map<client_t,uint64_t> sseqmap;
3312
3313 C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
3314 void finish(int r) override {
3315 mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
3316 }
3317};
3318
3319/* This function DOES put the passed message before returning*/
3320void Migrator::handle_export_caps(MExportCaps *ex)
3321{
3322 dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
3323 CInode *in = cache->get_inode(ex->ino);
3324
3325 assert(in);
3326 assert(in->is_auth());
3327
3328 // FIXME
3329 if (!in->can_auth_pin())
3330 return;
3331 in->auth_pin(this);
3332
3333 C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
3334 this, in, mds_rank_t(ex->get_source().num()));
3335 finish->client_map = ex->client_map;
3336
3337 // decode new caps
3338 bufferlist::iterator blp = ex->cap_bl.begin();
3339 decode_import_inode_caps(in, false, blp, finish->peer_exports);
3340 assert(!finish->peer_exports.empty()); // thus, inode is pinned.
3341
3342 // journal open client sessions
3343 version_t pv = mds->server->prepare_force_open_sessions(finish->client_map, finish->sseqmap);
3344
3345 ESessions *le = new ESessions(pv, ex->client_map);
3346 mds->mdlog->start_submit_entry(le, finish);
3347 mds->mdlog->flush();
3348
3349 ex->put();
3350}
3351
3352
3353void Migrator::logged_import_caps(CInode *in,
3354 mds_rank_t from,
3355 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
3356 map<client_t,entity_inst_t>& client_map,
3357 map<client_t,uint64_t>& sseqmap)
3358{
3359 dout(10) << "logged_import_caps on " << *in << dendl;
3360 // see export_go() vs export_go_synced()
3361 assert(in->is_auth());
3362
3363 // force open client sessions and finish cap import
3364 mds->server->finish_force_open_sessions(client_map, sseqmap);
3365
3366 map<client_t,Capability::Import> imported_caps;
3367
3368 assert(peer_exports.count(in));
3369 // clients will release caps from the exporter when they receive the cap import message.
3370 finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
3371 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
3372 in->auth_unpin(this);
3373 }