]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/Migrator.cc
update sources to 12.2.8
[ceph.git] / ceph / src / mds / Migrator.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MDSRank.h"
16#include "MDCache.h"
17#include "CInode.h"
18#include "CDir.h"
19#include "CDentry.h"
20#include "Migrator.h"
21#include "Locker.h"
22#include "Server.h"
23
24#include "MDBalancer.h"
25#include "MDLog.h"
26#include "MDSMap.h"
27#include "Mutation.h"
28
29#include "include/filepath.h"
28e407b8 30#include "common/likely.h"
7c673cae
FG
31
32#include "events/EExport.h"
33#include "events/EImportStart.h"
34#include "events/EImportFinish.h"
35#include "events/ESessions.h"
36
37#include "msg/Messenger.h"
38
39#include "messages/MClientCaps.h"
40
41#include "messages/MExportDirDiscover.h"
42#include "messages/MExportDirDiscoverAck.h"
43#include "messages/MExportDirCancel.h"
44#include "messages/MExportDirPrep.h"
45#include "messages/MExportDirPrepAck.h"
46#include "messages/MExportDir.h"
47#include "messages/MExportDirAck.h"
48#include "messages/MExportDirNotify.h"
49#include "messages/MExportDirNotifyAck.h"
50#include "messages/MExportDirFinish.h"
51
52#include "messages/MExportCaps.h"
53#include "messages/MExportCapsAck.h"
54#include "messages/MGatherCaps.h"
55
56
57/*
58 * this is what the dir->dir_auth values look like
59 *
60 * dir_auth authbits
61 * export
62 * me me - before
63 * me, me me - still me, but preparing for export
64 * me, them me - send MExportDir (peer is preparing)
65 * them, me me - journaled EExport
66 * them them - done
67 *
68 * import:
69 * them them - before
70 * me, them me - journaled EImportStart
71 * me me - done
72 *
73 * which implies:
74 * - auth bit is set if i am listed as first _or_ second dir_auth.
75 */
76
77#include "common/config.h"
78
79
80#define dout_context g_ceph_context
81#define dout_subsys ceph_subsys_mds
82#undef dout_prefix
83#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
84
85
// Base class for Migrator completion callbacks.  Carries a back-pointer
// to the owning Migrator so finish() implementations can call back into
// it, and resolves the MDSRank through that Migrator.
class MigratorContext : public MDSInternalContextBase {
protected:
  Migrator *mig;    // owning Migrator; never NULL (asserted in ctor)
  MDSRank *get_mds() override {
    return mig->mds;
  }
public:
  explicit MigratorContext(Migrator *mig_) : mig(mig_) {
    assert(mig != NULL);
  }
};
97
// Like MigratorContext, but for callbacks that complete after a journal
// (MDLog) write; derives from MDSLogContextBase instead.
class MigratorLogContext : public MDSLogContextBase {
protected:
  Migrator *mig;    // owning Migrator; never NULL (asserted in ctor)
  MDSRank *get_mds() override {
    return mig->mds;
  }
public:
  explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
    assert(mig != NULL);
  }
};
109
/* This function DOES put the passed message before returning*/
// Top-level message demultiplexer for the migrator: routes each
// MSG_MDS_EXPORT* / cap-migration message to its handler.  Unknown
// types abort.
void Migrator::dispatch(Message *m)
{
  // test hook: drop the configured message type on the floor
  // (inject_message_loss holds the type offset from MDS_PORT_MIGRATOR)
  if (unlikely(inject_message_loss)) {
    if (inject_message_loss == m->get_type() - MDS_PORT_MIGRATOR) {
      dout(0) << "inject message loss " << *m << dendl;
      m->put();
      return;
    }
  }

  switch (m->get_type()) {
    // import
  case MSG_MDS_EXPORTDIRDISCOVER:
    handle_export_discover(static_cast<MExportDirDiscover*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREP:
    handle_export_prep(static_cast<MExportDirPrep*>(m));
    break;
  case MSG_MDS_EXPORTDIR:
    // test hook: defer handling until some client (re)connects, to
    // exercise the session race path
    if (unlikely(inject_session_race)) {
      dout(0) << "waiting for inject_session_race" << dendl;
      mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
    } else {
      handle_export_dir(static_cast<MExportDir*>(m));
    }
    break;
  case MSG_MDS_EXPORTDIRFINISH:
    handle_export_finish(static_cast<MExportDirFinish*>(m));
    break;
  case MSG_MDS_EXPORTDIRCANCEL:
    handle_export_cancel(static_cast<MExportDirCancel*>(m));
    break;

    // export
  case MSG_MDS_EXPORTDIRDISCOVERACK:
    handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREPACK:
    handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRACK:
    handle_export_ack(static_cast<MExportDirAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRNOTIFYACK:
    handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
    break;

    // export 3rd party (dir_auth adjustments)
  case MSG_MDS_EXPORTDIRNOTIFY:
    handle_export_notify(static_cast<MExportDirNotify*>(m));
    break;

    // caps
  case MSG_MDS_EXPORTCAPS:
    handle_export_caps(static_cast<MExportCaps*>(m));
    break;
  case MSG_MDS_EXPORTCAPSACK:
    handle_export_caps_ack(static_cast<MExportCapsAck*>(m));
    break;
  case MSG_MDS_GATHERCAPS:
    handle_gather_caps(static_cast<MGatherCaps*>(m));
    break;

  default:
    derr << "migrator unknown message " << m->get_type() << dendl;
    assert(0 == "migrator unknown message");
  }
}
179
180
// Callback: re-check whether a (possibly now empty) imported subtree
// should be exported back to its parent's authority.
class C_MDC_EmptyImport : public MigratorContext {
  CDir *dir;   // subtree root to re-examine
public:
  C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
  void finish(int r) override {
    mig->export_empty_import(dir);
  }
};
189
190
// If *dir is an empty subtree root that we hold but whose inode is
// authoritative elsewhere, export it back to the inode's auth MDS.
// Each guard below bails out (with a debug line) when the dir is not
// eligible: inode auth is local, dir not auth, freeze in progress,
// dir not actually empty, or dir is the root.
void Migrator::export_empty_import(CDir *dir)
{
  dout(7) << "export_empty_import " << *dir << dendl;
  assert(dir->is_subtree_root());

  if (dir->inode->is_auth()) {
    dout(7) << " inode is auth" << dendl;
    return;
  }
  if (!dir->is_auth()) {
    dout(7) << " not auth" << dendl;
    return;
  }
  if (dir->is_freezing() || dir->is_frozen()) {
    dout(7) << " freezing or frozen" << dendl;
    return;
  }
  if (dir->get_num_head_items() > 0) {
    dout(7) << " not actually empty" << dendl;
    return;
  }
  if (dir->inode->is_root()) {
    dout(7) << " root" << dendl;
    return;
  }

  // hand the empty dirfrag back to whoever is auth for the inode
  mds_rank_t dest = dir->inode->authority().first;
  //if (mds->is_shutting_down()) dest = 0;  // this is more efficient.

  dout(7) << " really empty, exporting to " << dest << dendl;
  assert (dest != mds->get_nodeid());

  dout(7) << "exporting to mds." << dest
          << " empty import " << *dir << dendl;
  export_dir( dir, dest );
}
227
// Scan in-progress exports and cancel those stuck in DISCOVERING or
// FREEZING longer than mds_freeze_tree_timeout.  This breaks the
// distributed auth-pin deadlocks illustrated in the comment below: an
// export is considered stale when its cumulative auth_pin count has
// not changed since the cutoff AND either remote waiters are queued on
// it or its parent dir is itself freezing.
void Migrator::find_stale_export_freeze()
{
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_freeze_tree_timeout;


  /*
   * We could have situations like:
   *
   * - mds.0 authpins an item in subtree A
   * - mds.0 sends request to mds.1 to authpin an item in subtree B
   * - mds.0 freezes subtree A
   * - mds.1 authpins an item in subtree B
   * - mds.1 sends request to mds.0 to authpin an item in subtree A
   * - mds.1 freezes subtree B
   * - mds.1 receives the remote authpin request from mds.0
   *   (wait because subtree B is freezing)
   * - mds.0 receives the remote authpin request from mds.1
   *   (wait because subtree A is freezing)
   *
   *
   * - client request authpins items in subtree B
   * - freeze subtree B
   * - import subtree A which is parent of subtree B
   *   (authpins parent inode of subtree B, see CDir::set_dir_auth())
   * - freeze subtree A
   * - client request tries authpinning items in subtree A
   *   (wait because subtree A is freezing)
   */
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end(); ) {
    CDir* dir = p->first;
    export_state_t& stat = p->second;
    ++p;   // advance first: export_try_cancel() may erase this entry
    if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
      continue;
    // auth_pin count still changing => freeze is making progress
    if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
      stat.last_cum_auth_pins = dir->get_cum_auth_pins();
      stat.last_cum_auth_pins_change = now;
      continue;
    }
    if (stat.last_cum_auth_pins_change >= cutoff)
      continue;
    if (stat.num_remote_waiters > 0 ||
	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      export_try_cancel(dir);
    }
  }
}
278
// Attempt to cancel an in-progress export of *dir, undoing whatever the
// current export state has already done (auth_pins, freeze, bound pins,
// bystander notifications).  If notify_peer is set and the peer is
// reachable, an MExportDirCancel is sent.  Exports already past the
// point of no return (LOGGINGFINISH/NOTIFYING) are left to complete.
void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
{
  dout(10) << "export_try_cancel " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  assert(it != export_state.end());

  int state = it->second.state;   // remember pre-cancel state for cleanup below
  switch (state) {
  case EXPORT_LOCKING:
    dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->auth_unpin(this);
    break;
  case EXPORT_DISCOVERING:
    dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    dir->auth_unpin(this);
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_FREEZING:
    dout(10) << "export state=freezing : canceling freeze" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    if (dir->is_subtree_root())
      cache->try_subtree_merge(dir);
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

    // NOTE: state order reversal, warning comes after prepping
  case EXPORT_WARNING:
    dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
    // CANCELLING (not CANCELLED): we must wait for bystander notify_acks
    it->second.state = EXPORT_CANCELLING;
    // fall-thru

  case EXPORT_PREPPING:
    if (state != EXPORT_WARNING) {
      dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
      it->second.state = EXPORT_CANCELLED;
    }

    {
      // unpin bounds
      set<CDir*> bounds;
      cache->get_subtree_bounds(dir, bounds);
      for (set<CDir*>::iterator q = bounds.begin();
	   q != bounds.end();
	   ++q) {
	CDir *bd = *q;
	bd->put(CDir::PIN_EXPORTBOUND);
	bd->state_clear(CDir::STATE_EXPORTBOUND);
      }
      if (state == EXPORT_WARNING) {
	// notify bystanders
	export_notify_abort(dir, it->second, bounds);
	// process delayed expires
	cache->process_delayed_expire(dir);
      }
    }
    dir->unfreeze_tree();
    cache->try_subtree_merge(dir);
    // residual dirs were frozen as part of the prep; release them too
    for (auto bd : it->second.residual_dirs) {
      bd->unfreeze_tree();
      cache->try_subtree_merge(bd);
    }
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_EXPORTING:
    dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
    it->second.state = EXPORT_CANCELLING;
    export_reverse(dir, it->second);
    break;

  case EXPORT_LOGGINGFINISH:
  case EXPORT_NOTIFYING:
    dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
    // leave export_state, don't clean up now.
    break;
  case EXPORT_CANCELLING:
    break;

  default:
    ceph_abort();
  }

  // finish clean-up?
  if (it->second.state == EXPORT_CANCELLING ||
      it->second.state == EXPORT_CANCELLED) {
    MutationRef mut;
    mut.swap(it->second.mut);

    if (it->second.state == EXPORT_CANCELLED) {
      export_state.erase(it);
      dir->clear_exporting();
      // send pending import_maps?
      cache->maybe_send_pending_resolves();
    }

    // drop locks
    if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
      // early states hold an internal MDRequest; finish it unless a
      // slave reply is still outstanding
      MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
      assert(mdr);
      if (mdr->more()->waiting_on_slave.empty())
	mds->mdcache->request_finish(mdr);
    } else if (mut) {
      mds->locker->drop_locks(mut.get());
      mut->cleanup();
    }

    cache->show_subtrees();

    maybe_do_queued_export();
  }
}
405
// Final cleanup once a cancelled export's bystander notify_acks have
// all arrived: clear the EXPORTING flag, drop the auth_pin taken by
// export_notify_abort(), and flush any resolves held back by the export.
void Migrator::export_cancel_finish(CDir *dir)
{
  assert(dir->state_test(CDir::STATE_EXPORTING));
  dir->clear_exporting();

  // pinned by Migrator::export_notify_abort()
  dir->auth_unpin(this);
  // send pending import_maps?  (these need to go out when all exports have finished.)
  cache->maybe_send_pending_resolves();
}
416
417// ==========================================================
418// mds failure handling
419
// React to mds.who failing or stopping: abort/clean up every export and
// import that involved it (as peer or as bystander), faking any acks we
// would never receive from it so the state machines can advance.
void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
{
  dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;

  // check my exports

  // first add an extra auth_pin on any freezes, so that canceling a
  // nested freeze doesn't complete one further up the hierarchy and
  // confuse the shit out of us.  we'll remove it after canceling the
  // freeze.  this way no freeze completions run before we want them
  // to.
  list<CDir*> pinned_dirs;
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end();
       ++p) {
    if (p->second.state == EXPORT_FREEZING) {
      CDir *dir = p->first;
      dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
      dir->auth_pin(this);
      pinned_dirs.push_back(dir);
    }
  }

  map<CDir*,export_state_t>::iterator p = export_state.begin();
  while (p != export_state.end()) {
    // grab the successor first: the body may erase *p
    map<CDir*,export_state_t>::iterator next = p;
    ++next;
    CDir *dir = p->first;

    // abort exports:
    //  - that are going to the failed node
    //  - that aren't frozen yet (to avoid auth_pin deadlock)
    //  - they haven't prepped yet (they may need to discover bounds to do that)
    if ((p->second.peer == who &&
	 p->second.state != EXPORT_CANCELLING) ||
	p->second.state == EXPORT_LOCKING ||
	p->second.state == EXPORT_DISCOVERING ||
	p->second.state == EXPORT_FREEZING ||
	p->second.state == EXPORT_PREPPING) {
      // the guy i'm exporting to failed, or we're just freezing.
      dout(10) << "cleaning up export state (" << p->second.state << ")"
	       << get_export_statename(p->second.state) << " of " << *dir << dendl;
      export_try_cancel(dir);
    } else if (p->second.peer != who) {
      // bystander failed.
      if (p->second.warning_ack_waiting.erase(who)) {
	if (p->second.state == EXPORT_WARNING) {
	  p->second.notify_ack_waiting.erase(who);   // they won't get a notify either.
	  // exporter waiting for warning acks, let's fake theirs.
	  dout(10) << "faking export_warning_ack from mds." << who
		   << " on " << *dir << " to mds." << p->second.peer
		   << dendl;
	  if (p->second.warning_ack_waiting.empty())
	    export_go(dir);
	}
      }
      if (p->second.notify_ack_waiting.erase(who)) {
	// exporter is waiting for notify acks, fake it
	dout(10) << "faking export_notify_ack from mds." << who
		 << " on " << *dir << " to mds." << p->second.peer
		 << dendl;
	if (p->second.state == EXPORT_NOTIFYING) {
	  if (p->second.notify_ack_waiting.empty())
	    export_finish(dir);
	} else if (p->second.state == EXPORT_CANCELLING) {
	  if (p->second.notify_ack_waiting.empty()) {
	    export_state.erase(p);
	    export_cancel_finish(dir);
	  }
	}
      }
    }

    // next!
    p = next;
  }


  // check my imports
  map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
  while (q != import_state.end()) {
    map<dirfrag_t,import_state_t>::iterator next = q;
    ++next;
    dirfrag_t df = q->first;
    CInode *diri = mds->mdcache->get_inode(df.ino);
    CDir *dir = mds->mdcache->get_dirfrag(df);

    if (q->second.peer == who) {
      if (dir)
	dout(10) << "cleaning up import state (" << q->second.state << ")"
		 << get_import_statename(q->second.state) << " of " << *dir << dendl;
      else
	dout(10) << "cleaning up import state (" << q->second.state << ")"
		 << get_import_statename(q->second.state) << " of " << df << dendl;

      // unwind exactly as far as the import had progressed
      switch (q->second.state) {
      case IMPORT_DISCOVERING:
	dout(10) << "import state=discovering : clearing state" << dendl;
	import_reverse_discovering(df);
	break;

      case IMPORT_DISCOVERED:
	assert(diri);
	dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
	import_reverse_discovered(df, diri);
	break;

      case IMPORT_PREPPING:
	assert(dir);
	dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
	import_reverse_prepping(dir, q->second);
	break;

      case IMPORT_PREPPED:
	assert(dir);
	dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
	{
	  set<CDir*> bounds;
	  cache->get_subtree_bounds(dir, bounds);
	  import_remove_pins(dir, bounds);

	  // adjust auth back to the exporter
	  cache->adjust_subtree_auth(dir, q->second.peer);

	  // notify bystanders ; wait in aborting state
	  q->second.state = IMPORT_ABORTING;
	  import_notify_abort(dir, bounds);
	  assert(g_conf->mds_kill_import_at != 10);
	}
	break;

      case IMPORT_LOGGINGSTART:
	assert(dir);
	dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
	import_reverse(dir);
	break;

      case IMPORT_ACKING:
	assert(dir);
	// hrm.  make this an ambiguous import, and wait for exporter recovery to disambiguate
	dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
	{
	  set<CDir*> bounds;
	  cache->get_subtree_bounds(dir, bounds);
	  cache->add_ambiguous_import(dir, bounds);
	}
	break;

      case IMPORT_FINISHING:
	assert(dir);
	dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
	import_finish(dir, true);
	break;

      case IMPORT_ABORTING:
	assert(dir);
	dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
	break;
      }
    } else {
      // the failed rank was only a bystander of this import
      auto bystanders_entry = q->second.bystanders.find(who);
      if (bystanders_entry != q->second.bystanders.end()) {
	q->second.bystanders.erase(bystanders_entry);
	if (q->second.state == IMPORT_ABORTING) {
	  assert(dir);
	  dout(10) << "faking export_notify_ack from mds." << who
		   << " on aborting import " << *dir << " from mds." << q->second.peer
		   << dendl;
	  if (q->second.bystanders.empty())
	    import_reverse_unfreeze(dir);
	}
      }
    }

    // next!
    q = next;
  }

  // drop the temporary freeze-protecting auth_pins taken above
  while (!pinned_dirs.empty()) {
    CDir *dir = pinned_dirs.front();
    dout(10) << "removing temp auth_pin on " << *dir << dendl;
    dir->auth_unpin(this);
    pinned_dirs.pop_front();
  }
}
605
606
607
608void Migrator::show_importing()
609{
610 dout(10) << "show_importing" << dendl;
611 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
612 p != import_state.end();
613 ++p) {
614 CDir *dir = mds->mdcache->get_dirfrag(p->first);
615 if (dir) {
616 dout(10) << " importing from " << p->second.peer
617 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
618 << " " << p->first << " " << *dir << dendl;
619 } else {
620 dout(10) << " importing from " << p->second.peer
621 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
622 << " " << p->first << dendl;
623 }
624 }
625}
626
627void Migrator::show_exporting()
628{
629 dout(10) << "show_exporting" << dendl;
630 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
631 p != export_state.end();
632 ++p)
633 dout(10) << " exporting to " << p->second.peer
634 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
635 << " " << p->first->dirfrag() << " " << *p->first << dendl;
636}
637
638
639
// Debug-only consistency audit of the import/export state machines
// against subtree auth in the cache.  Only runs when mds debug level
// gathers at >= 5; all checks are asserts.
void Migrator::audit()
{
  if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
    return;  // hrm.

  // import_state
  show_importing();
  for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
       p != import_state.end();
       ++p) {
    if (p->second.state == IMPORT_DISCOVERING)
      continue;
    if (p->second.state == IMPORT_DISCOVERED) {
      // only the inode is pinned at this point; no dirfrag yet
      CInode *in = cache->get_inode(p->first.ino);
      assert(in);
      continue;
    }
    CDir *dir = cache->get_dirfrag(p->first);
    assert(dir);
    if (p->second.state == IMPORT_PREPPING)
      continue;
    if (p->second.state == IMPORT_ABORTING) {
      // auth already handed back to the exporter
      assert(!dir->is_ambiguous_dir_auth());
      assert(dir->get_dir_auth().first != mds->get_nodeid());
      continue;
    }
    // mid-import: auth must be ambiguous and include us
    assert(dir->is_ambiguous_dir_auth());
    assert(dir->authority().first  == mds->get_nodeid() ||
	   dir->authority().second == mds->get_nodeid());
  }

  // export_state
  show_exporting();
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end();
       ++p) {
    CDir *dir = p->first;
    if (p->second.state == EXPORT_LOCKING ||
	p->second.state == EXPORT_DISCOVERING ||
	p->second.state == EXPORT_FREEZING ||
	p->second.state == EXPORT_CANCELLING)
      continue;
    // mid-export: auth must be ambiguous and include us
    assert(dir->is_ambiguous_dir_auth());
    assert(dir->authority().first  == mds->get_nodeid() ||
	   dir->authority().second == mds->get_nodeid());
  }

  // ambiguous+me subtrees should be importing|exporting

  // write me
}
691
692
693
694
695
696// ==========================================================
697// EXPORT
698
699void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
700{
701 // enqueue
702 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
703 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
704
705 maybe_do_queued_export();
706}
707
// Drain the export queue while fewer than a handful of exports are in
// flight.  The function-local 'running' flag guards against reentrancy:
// export_dir() can indirectly call back into this function, and we do
// not want nested drains.
void Migrator::maybe_do_queued_export()
{
  static bool running;
  if (running)
    return;
  running = true;
  // cap concurrent exports (export_state entries) at 5
  while (!export_queue.empty() &&
	 export_state.size() <= 4) {
    dirfrag_t df = export_queue.front().first;
    mds_rank_t dest = export_queue.front().second;
    export_queue.pop_front();

    // the dirfrag may have been trimmed or lost auth since it was queued
    CDir *dir = mds->mdcache->get_dirfrag(df);
    if (!dir) continue;
    if (!dir->is_auth()) continue;

    dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;

    export_dir(dir, dest);
  }
  running = false;
}
730
731
732
733
// Callback: continue the export once the subtree freeze completes.
// A negative r means the freeze was cancelled, in which case we do
// nothing (export_try_cancel handled the cleanup).
class C_MDC_ExportFreeze : public MigratorContext {
  CDir *ex;   // dir i'm exporting
  uint64_t tid;   // export tid, to detect a stale callback
public:
  C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
	MigratorContext(m), ex(e), tid(t) {
          assert(ex != NULL);
        }
  void finish(int r) override {
    if (r >= 0)
      mig->export_frozen(ex, tid);
  }
};
747
748
749void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
750{
751 // path
752 vector<CDentry*> trace;
753 cache->make_trace(trace, dir->inode);
754 for (vector<CDentry*>::iterator it = trace.begin();
755 it != trace.end();
756 ++it)
757 locks.insert(&(*it)->lock);
758
759 // prevent scatter gather race
760 locks.insert(&dir->get_inode()->dirfragtreelock);
761
762 // bound dftlocks:
763 // NOTE: We need to take an rdlock on bounding dirfrags during
764 // migration for a rather irritating reason: when we export the
765 // bound inode, we need to send scatterlock state for the dirfrags
766 // as well, so that the new auth also gets the correct info. If we
767 // race with a refragment, this info is useless, as we can't
768 // redivvy it up. And it's needed for the scatterlocks to work
769 // properly: when the auth is in a sync/lock state it keeps each
770 // dirfrag's portion in the local (auth OR replica) dirfrag.
771 set<CDir*> wouldbe_bounds;
772 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
773 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
774 locks.insert(&(*p)->get_inode()->dirfragtreelock);
775}
776
777
// Callback: retry dispatch_export_dir() after waiting (e.g. for a new
// MDSMap epoch or for the dir to become stable).  'count' tracks how
// many MDSMap epochs we have waited for the dest to become an export
// target.
class C_M_ExportDirWait : public MigratorContext {
  MDRequestRef mdr;
  int count;
public:
  C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
   : MigratorContext(m), mdr(mdr), count(count) {}
  void finish(int r) override {
    mig->dispatch_export_dir(mdr, count);
  }
};
788
789
/** export_dir(dir, dest)
 * public method to initiate an export.
 * will fail if the directory is freezing, frozen, unpinnable, or root.
 *
 * On success, registers an export_state entry in EXPORT_LOCKING and
 * dispatches an internal MDRequest; each early-return guard logs why
 * the export was refused.
 */
void Migrator::export_dir(CDir *dir, mds_rank_t dest)
{
  dout(7) << "export_dir " << *dir << " to " << dest << dendl;
  assert(dir->is_auth());
  assert(dest != mds->get_nodeid());

  if (!(mds->is_active() || mds->is_stopping())) {
    dout(7) << "i'm not active, no exports for now" << dendl;
    return;
  }
  if (mds->mdcache->is_readonly()) {
    dout(7) << "read-only FS, no exports for now" << dendl;
    return;
  }
  if (!mds->mdsmap->is_active(dest)) {
    dout(7) << "dest not active, no exports for now" << dendl;
    return;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "cluster degraded, no exports for now" << dendl;
    return;
  }
  if (dir->inode->is_system()) {
    dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
    //ceph_abort();
    return;
  }

  // stray contents may only migrate to the dest's own mdsdir; other
  // dirs must not be export-pinned elsewhere
  CDir* parent_dir = dir->inode->get_projected_parent_dir();
  if (parent_dir && parent_dir->inode->is_stray()) {
    if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
      dout(7) << "i won't export anything in stray" << dendl;
      return;
    }
  } else {
    if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
      dout(7) << "dir is export pinned" << dendl;
      return;
    }
  }

  if (dir->is_frozen() ||
      dir->is_freezing()) {
    dout(7) << " can't export, freezing|frozen.  wait for other exports to finish first." << dendl;
    return;
  }
  if (dir->state_test(CDir::STATE_EXPORTING)) {
    dout(7) << "already exporting" << dendl;
    return;
  }

  // test hook: occasionally carve out a random nested dirfrag as an
  // aux subtree so the export has a bound to exercise
  if (g_conf->mds_thrash_exports) {
    // create random subtree bound (which will not be exported)
    list<CDir*> ls;
    for (auto p = dir->begin(); p != dir->end(); ++p) {
      auto dn = p->second;
      CDentry::linkage_t *dnl= dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	if (in->is_dir())
	  in->get_nested_dirfrags(ls);
      }
    }
    if (ls.size() > 0) {
      int n = rand() % ls.size();
      auto p = ls.begin();
      while (n--) ++p;
      CDir *bd = *p;
      if (!(bd->is_frozen() || bd->is_freezing())) {
	assert(bd->is_auth());
	dir->state_set(CDir::STATE_AUXSUBTREE);
	mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
	dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
      }
    }
  }

  mds->hit_export_target(ceph_clock_now(), dest, -1);

  dir->auth_pin(this);   // released on cancel or completion
  dir->mark_exporting();

  MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
  mdr->more()->export_dir = dir;

  assert(export_state.count(dir) == 0);
  export_state_t& stat = export_state[dir];
  stat.state = EXPORT_LOCKING;
  stat.peer = dest;
  stat.tid = mdr->reqid.tid;
  stat.mut = mdr;

  return mds->mdcache->dispatch_request(mdr);
}
888
// Second phase of starting an export (runs as an internal MDRequest):
// verify the export is still valid, wait for the dest to become an
// export target (up to ~3 MDSMap epochs, tracked by 'count'), acquire
// the lock set, then send MExportDirDiscover and begin freezing the
// subtree.
void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
{
  dout(7) << "dispatch_export_dir " << *mdr << dendl;

  CDir *dir = mdr->more()->export_dir;
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
    // export must have aborted.
    dout(7) << "export must have aborted " << *mdr << dendl;
    mds->mdcache->request_finish(mdr);
    return;
  }
  assert(it->second.state == EXPORT_LOCKING);

  mds_rank_t dest = it->second.peer;

  if (!mds->is_export_target(dest)) {
    dout(7) << "dest is not yet an export target" << dendl;
    if (count > 3) {
      dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
      export_try_cancel(dir);
      return;
    }

    // release locks/pins while we wait for a newer MDSMap
    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();

    mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
    return;
  }

  if (!dir->inode->get_parent_dn()) {
    dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
    dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
    return;
  }

  if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
    dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
    export_try_cancel(dir);
    return;
  }

  // locks?
  set<SimpleLock*> rdlocks;
  set<SimpleLock*> xlocks;
  set<SimpleLock*> wrlocks;
  get_export_lock_set(dir, rdlocks);
  // If auth MDS of the subtree root inode is neither the exporter MDS
  // nor the importer MDS and it gathers subtree root's fragstat/neststat
  // while the subtree is exporting. It's possible that the exporter MDS
  // and the importer MDS both are auth MDS of the subtree root or both
  // are not auth MDS of the subtree root at the time they receive the
  // lock messages. So the auth MDS of the subtree root inode may get no
  // or duplicated fragstat/neststat for the subtree root dirfrag.
  wrlocks.insert(&dir->get_inode()->filelock);
  wrlocks.insert(&dir->get_inode()->nestlock);
  if (dir->get_inode()->is_auth()) {
    dir->get_inode()->filelock.set_scatter_wanted();
    dir->get_inode()->nestlock.set_scatter_wanted();
  }

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
    if (mdr->aborted)
      export_try_cancel(dir);
    return;
  }

  assert(g_conf->mds_kill_export_at != 1);
  it->second.state = EXPORT_DISCOVERING;

  // send ExportDirDiscover (ask target)
  filepath path;
  dir->inode->make_path(path);
  MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
							mds->get_nodeid(),
							it->second.tid);
  mds->send_message_mds(discover, dest);
  assert(g_conf->mds_kill_export_at != 2);

  it->second.last_cum_auth_pins_change = ceph_clock_now();

  // start the freeze, but hold it up with an auth_pin.
  dir->freeze_tree();
  assert(dir->is_freezing_tree());
  dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
}
976
/*
 * called on receipt of MExportDirDiscoverAck
 * the importer now has the directory's _inode_ in memory, and pinned.
 *
 * This function DOES put the passed message before returning
 */
// On success: finish the internal MDRequest (releasing its locks to
// avoid deadlock) and move to EXPORT_FREEZING by dropping the auth_pin
// that was holding up the freeze.  On failure: cancel the export
// without notifying the peer (it already declined).
void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  mds_rank_t dest(m->get_source().num());
  utime_t now = ceph_clock_now();
  assert(dir);

  dout(7) << "export_discover_ack from " << m->get_source()
	  << " on " << *dir << dendl;

  mds->hit_export_target(now, dest, -1);

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.tid != m->get_tid() ||
      it->second.peer != dest) {
    // stale ack for an export that was since cancelled/replaced
    dout(7) << "must have aborted" << dendl;
  } else {
    assert(it->second.state == EXPORT_DISCOVERING);

    if (m->is_success()) {
      // release locks to avoid deadlock
      MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
      assert(mdr);
      mds->mdcache->request_finish(mdr);
      it->second.mut.reset();
      // freeze the subtree
      it->second.state = EXPORT_FREEZING;
      dir->auth_unpin(this);
      assert(g_conf->mds_kill_export_at != 3);

    } else {
      dout(7) << "peer failed to discover (not active?), canceling" << dendl;
      export_try_cancel(dir, false);
    }
  }

  m->put();  // done
}
1022
1023class C_M_ExportSessionsFlushed : public MigratorContext {
1024 CDir *dir;
1025 uint64_t tid;
1026public:
1027 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1028 : MigratorContext(m), dir(d), tid(t) {
1029 assert(dir != NULL);
1030 }
1031 void finish(int r) override {
1032 mig->export_sessions_flushed(dir, tid);
1033 }
1034};
1035
// Called once all client sessions touched by this export have been flushed
// (see the MDS_RANK_NONE placeholder inserted in export_frozen).  Removes
// the placeholder and, if all bystander warning acks have also arrived,
// starts the actual export.
1036void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
1037{
1038  dout(7) << "export_sessions_flushed " << *dir << dendl;
1039
  // Stale-callback guard: entry gone, cancelling, or tid mismatch means
  // this export was aborted after the flush was scheduled.
1040  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1041  if (it == export_state.end() ||
1042      it->second.state == EXPORT_CANCELLING ||
1043      it->second.tid != tid) {
1044    // export must have aborted.
1045    dout(7) << "export must have aborted on " << dir << dendl;
1046    return;
1047  }
1048
  // MDS_RANK_NONE stands in for "session flush pending" in the warning set.
1049  assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
1050  assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
1051  it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
1052  if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
1053    export_go(dir);     // start export.
1054}
1055
// Called once the subtree rooted at `dir` is fully frozen (WAIT_FROZEN
// waiter from export_dir).  Grabs the remaining locks, sizes/bounds the
// export, builds and sends MExportDirPrep (with replica traces so the
// importer can instantiate the spanning tree), and flushes the client
// sessions named by exported caps.  If locks can't be taken the export is
// unwound and cancelled here.
1056void Migrator::export_frozen(CDir *dir, uint64_t tid)
1057{
1058  dout(7) << "export_frozen on " << *dir << dendl;
1059
1060  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1061  if (it == export_state.end() || it->second.tid != tid) {
1062    dout(7) << "export must have aborted" << dendl;
1063    return;
1064  }
1065
1066  assert(it->second.state == EXPORT_FREEZING);
1067  assert(dir->is_frozen_tree_root());
1068  assert(dir->get_cum_auth_pins() == 0);
1069
1070  CInode *diri = dir->get_inode();
1071
1072  // ok, try to grab all my locks.
1073  set<SimpleLock*> rdlocks;
1074  get_export_lock_set(dir, rdlocks);
  // All-or-nothing, non-blocking lock check; if anything is unavailable we
  // give up rather than wait while holding the tree frozen.
1075  if ((diri->is_auth() && diri->is_frozen()) ||
1076      !mds->locker->can_rdlock_set(rdlocks) ||
1077      !diri->filelock.can_wrlock(-1) ||
1078      !diri->nestlock.can_wrlock(-1)) {
1079    dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1080	    << *dir << dendl;
7c673cae
FG
1081    // .. unwind ..
1082    dir->unfreeze_tree();
224ce89b 1083    cache->try_subtree_merge(dir);
7c673cae
FG
1084
    // Tell the importer we are not proceeding, and drop our export record.
1085    mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
7c673cae 1086    export_state.erase(it);
224ce89b 1087
1adf2230 1088    dir->clear_exporting();
224ce89b 1089    cache->maybe_send_pending_resolves();
7c673cae
FG
1090    return;
1091  }
1092
  // Take the locks for real, pinned by a mutation that lives for the
  // duration of the export.
1093  it->second.mut = new MutationImpl();
1094  if (diri->is_auth())
1095    it->second.mut->auth_pin(diri);
1096  mds->locker->rdlock_take_set(rdlocks, it->second.mut);
1097  mds->locker->wrlock_force(&diri->filelock, it->second.mut);
1098  mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
1099
1100  cache->show_subtrees();
1101
224ce89b
WB
1102  // CDir::_freeze_tree() should have forced it into subtree.
1103  assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
b32b8144
FG
1104
  // Size-limit the export (may create extra bounds, see check_export_size)
  // and collect the clients whose sessions must be flushed before warning.
1105  set<client_t> export_client_set;
1106  check_export_size(dir, it->second, export_client_set);
1107
7c673cae 1108  // note the bounds.
7c673cae
FG
1109  set<CDir*> bounds;
1110  cache->get_subtree_bounds(dir, bounds);
1111
1112  // generate prep message, log entry.
1113  MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
1114
1115  // include list of bystanders
181888fb
FG
1116  for (const auto &p : dir->get_replicas()) {
1117    if (p.first != it->second.peer) {
1118      dout(10) << "bystander mds." << p.first << dendl;
1119      prep->add_bystander(p.first);
7c673cae
FG
1120    }
1121  }
1122
1123  // include base dirfrag
1124  cache->replicate_dir(dir, it->second.peer, prep->basedir);
1125
1126  /*
1127   * include spanning tree for all nested exports.
1128   * these need to be on the destination _before_ the final export so that
1129   * dir_auth updates on any nested exports are properly absorbed.
1130   * this includes inodes and dirfrags included in the subtree, but
1131   * only the inodes at the bounds.
1132   *
1133   * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1134   */
1135  set<inodeno_t> inodes_added;
1136  set<dirfrag_t> dirfrags_added;
1137
1138  // check bounds
1139  for (set<CDir*>::iterator p = bounds.begin();
1140       p != bounds.end();
1141       ++p) {
1142    CDir *bound = *p;
1143
1144    // pin it.
b32b8144 1145    assert(bound->state_test(CDir::STATE_EXPORTBOUND));
7c673cae
FG
1146
1147    dout(7) << "  export bound " << *bound << dendl;
1148    prep->add_bound( bound->dirfrag() );
1149
    // Walk from the bound up toward the export root, prepending replicas so
    // the trace ends up ordered root-to-bound (see format comment above).
1150    // trace to bound
1151    bufferlist tracebl;
1152    CDir *cur = bound;
b32b8144 1153
7c673cae 1154    char start = '-';
b32b8144
FG
    // Bounds manufactured by check_export_size() also need their dirfrag
    // replicated, since the importer has never seen them.
1155    if (it->second.residual_dirs.count(bound)) {
1156      start = 'f';
1157      cache->replicate_dir(bound, it->second.peer, tracebl);
1158      dout(7) << "  added " << *bound << dendl;
1159    }
1160
7c673cae
FG
1161    while (1) {
1162      // don't repeat inodes
1163      if (inodes_added.count(cur->inode->ino()))
1164	break;
1165      inodes_added.insert(cur->inode->ino());
1166
1167      // prepend dentry + inode
1168      assert(cur->inode->is_auth());
1169      bufferlist bl;
1170      cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
1171      dout(7) << "  added " << *cur->inode->parent << dendl;
1172      cache->replicate_inode(cur->inode, it->second.peer, bl,
1173			     mds->mdsmap->get_up_features());
1174      dout(7) << "  added " << *cur->inode << dendl;
1175      bl.claim_append(tracebl);
1176      tracebl.claim(bl);
1177
1178      cur = cur->get_parent_dir();
1179
1180      // don't repeat dirfrags
1181      if (dirfrags_added.count(cur->dirfrag()) ||
1182	  cur == dir) {
1183	start = 'd';  // start with dentry
1184	break;
1185      }
1186      dirfrags_added.insert(cur->dirfrag());
1187
1188      // prepend dir
1189      cache->replicate_dir(cur, it->second.peer, bl);
1190      dout(7) << "  added " << *cur << dendl;
1191      bl.claim_append(tracebl);
1192      tracebl.claim(bl);
1193
1194      start = 'f';  // start with dirfrag
1195    }
1196    bufferlist final_bl;
1197    dirfrag_t df = cur->dirfrag();
1198    ::encode(df, final_bl);
1199    ::encode(start, final_bl);
1200    final_bl.claim_append(tracebl);
1201    prep->add_trace(final_bl);
1202  }
1203
1204  // send.
1205  it->second.state = EXPORT_PREPPING;
1206  mds->send_message_mds(prep, it->second.peer);
1207  assert (g_conf->mds_kill_export_at != 4);
1208
1209  // make sure any new instantiations of caps are flushed out
1210  assert(it->second.warning_ack_waiting.empty());
1211
7c673cae
FG
  // MDS_RANK_NONE in warning_ack_waiting marks the pending session flush;
  // C_M_ExportSessionsFlushed clears it when the gather completes.
1212  MDSGatherBuilder gather(g_ceph_context);
1213  mds->server->flush_client_sessions(export_client_set, gather);
1214  if (gather.has_subs()) {
1215    it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
1216    gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
1217    gather.activate();
1218  }
1219}
1220
b32b8144 1221void Migrator::check_export_size(CDir *dir, export_state_t& stat, set<client_t>& client_set)
7c673cae 1222{
b32b8144
FG
1223 const unsigned frag_size = 800;
1224 const unsigned inode_size = 1000;
1225 const unsigned cap_size = 80;
1226 const unsigned link_size = 10;
1227 const unsigned null_size = 1;
1228
1229 uint64_t max_size = g_conf->get_val<uint64_t>("mds_max_export_size");
1230 uint64_t approx_size = 0;
1231
7c673cae
FG
1232 list<CDir*> dfs;
1233 dfs.push_back(dir);
1234 while (!dfs.empty()) {
1235 CDir *dir = dfs.front();
1236 dfs.pop_front();
b32b8144
FG
1237
1238 approx_size += frag_size;
94b18763
FG
1239 for (auto &p : *dir) {
1240 CDentry *dn = p.second;
b32b8144
FG
1241 if (dn->get_linkage()->is_null()) {
1242 approx_size += null_size;
1243 continue;
1244 }
1245 if (dn->get_linkage()->is_remote()) {
1246 approx_size += link_size;
7c673cae 1247 continue;
b32b8144
FG
1248 }
1249
1250 approx_size += inode_size;
7c673cae
FG
1251 CInode *in = dn->get_linkage()->get_inode();
1252 if (in->is_dir()) {
1253 // directory?
1254 list<CDir*> ls;
1255 in->get_dirfrags(ls);
b32b8144
FG
1256 for (auto q : ls) {
1257 if (q->is_subtree_root()) {
1258 q->state_set(CDir::STATE_EXPORTBOUND);
1259 q->get(CDir::PIN_EXPORTBOUND);
1260 } else {
7c673cae 1261 // include nested dirfrag
b32b8144
FG
1262 assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
1263 dfs.push_front(q);
7c673cae
FG
1264 }
1265 }
1266 }
1267 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1268 q != in->client_caps.end();
b32b8144
FG
1269 ++q) {
1270 approx_size += cap_size;
7c673cae 1271 client_set.insert(q->first);
b32b8144 1272 }
7c673cae 1273 }
b32b8144
FG
1274
1275 if (approx_size >= max_size)
1276 break;
1277 }
1278
1279 while (!dfs.empty()) {
1280 CDir *dir = dfs.front();
1281 dfs.pop_front();
1282
1283 dout(7) << "check_export_size: creating bound " << *dir << dendl;
1284 assert(dir->is_auth());
1285 dir->state_set(CDir::STATE_EXPORTBOUND);
1286 dir->get(CDir::PIN_EXPORTBOUND);
1287
1288 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
1289 // Another choice here is finishing all WAIT_UNFREEZE contexts and keeping
1290 // the newly created subtree unfreeze.
1291 dir->_freeze_tree();
1292
1293 stat.residual_dirs.insert(dir);
7c673cae
FG
1294 }
1295}
1296
1297void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1298{
1299 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1300 q != in->client_caps.end();
1301 ++q)
1302 client_set.insert(q->first);
1303}
1304
1305/* This function DOES put the passed message before returning*/
// Handle the importer's reply to MExportDirPrep.  On success, warn every
// other (bystander) replica of the pending authority change via
// MExportDirNotify, then wait for their acks before exporting; on failure,
// cancel.  This function DOES put the passed message before returning.
1306void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
1307{
1308  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1309  mds_rank_t dest(m->get_source().num());
1310  utime_t now = ceph_clock_now();
1311  assert(dir);
1312
1313  dout(7) << "export_prep_ack " << *dir << dendl;
1314
1315  mds->hit_export_target(now, dest, -1);
1316
  // Stale-message guard: tid and peer must match the live export entry.
1317  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1318  if (it == export_state.end() ||
1319      it->second.tid != m->get_tid() ||
1320      it->second.peer != mds_rank_t(m->get_source().num())) {
1321    // export must have aborted.
1322    dout(7) << "export must have aborted" << dendl;
1323    m->put();
1324    return;
1325  }
1326  assert(it->second.state == EXPORT_PREPPING);
1327
1328  if (!m->is_success()) {
c07f9fc5 1329    dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
7c673cae
FG
1330    export_try_cancel(dir, false);
1331    m->put();
1332    return;
1333  }
1334
1335  assert (g_conf->mds_kill_export_at != 5);
1336  // send warnings
1337  set<CDir*> bounds;
1338  cache->get_subtree_bounds(dir, bounds);
1339
  // Only a pending session-flush placeholder (MDS_RANK_NONE) may be in the
  // warning set at this point; no notifies have been issued yet.
1340  assert(it->second.warning_ack_waiting.empty() ||
1341	 (it->second.warning_ack_waiting.size() == 1 &&
1342	  it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
1343  assert(it->second.notify_ack_waiting.empty());
1344
181888fb
FG
  // Warn each bystander replica (everyone but the importer) that auth is
  // becoming ambiguous: (me, unknown) -> eventually (me, peer).
1345  for (const auto &p : dir->get_replicas()) {
1346    if (p.first == it->second.peer) continue;
7c673cae 1347    if (mds->is_cluster_degraded() &&
181888fb 1348	!mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
7c673cae 1349      continue;  // only if active
181888fb
FG
1350    it->second.warning_ack_waiting.insert(p.first);
1351    it->second.notify_ack_waiting.insert(p.first);  // we'll eventually get a notifyack, too!
7c673cae
FG
1352
1353    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
1354						    mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1355						    mds_authority_t(mds->get_nodeid(),it->second.peer));
1356    for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
1357      notify->get_bounds().push_back((*q)->dirfrag());
181888fb 1358    mds->send_message_mds(notify, p.first);
7c673cae
FG
1359
1360  }
1361
1362  it->second.state = EXPORT_WARNING;
1363
1364  assert(g_conf->mds_kill_export_at != 6);
1365  // nobody to warn?
1366  if (it->second.warning_ack_waiting.empty())
1367    export_go(dir);  // start export.
1368
1369  // done.
1370  m->put();
1371}
1372
1373
1374class C_M_ExportGo : public MigratorContext {
1375 CDir *dir;
1376 uint64_t tid;
1377public:
1378 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1379 MigratorContext(m), dir(d), tid(t) {
1380 assert(dir != NULL);
1381 }
1382 void finish(int r) override {
1383 mig->export_go_synced(dir, tid);
1384 }
1385};
1386
1387void Migrator::export_go(CDir *dir)
1388{
b32b8144
FG
1389 auto it = export_state.find(dir);
1390 assert(it != export_state.end());
1391 dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl;
7c673cae
FG
1392
1393 // first sync log to flush out e.g. any cap imports
b32b8144 1394 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid));
7c673cae
FG
1395 mds->mdlog->flush();
1396}
1397
// Journal is safe: actually perform the export.  Marks the subtree's auth
// ambiguous (me, dest), encodes the whole subtree plus bounds into an
// MExportDir message, and sends it to the importer.
1398void Migrator::export_go_synced(CDir *dir, uint64_t tid)
1399{
  // Stale-callback guard (export may have been cancelled while the log
  // flush was in flight).
1400  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1401  if (it == export_state.end() ||
1402      it->second.state == EXPORT_CANCELLING ||
1403      it->second.tid != tid) {
1404    // export must have aborted.
1405    dout(7) << "export must have aborted on " << dir << dendl;
1406    return;
1407  }
1408  assert(it->second.state == EXPORT_WARNING);
1409  mds_rank_t dest = it->second.peer;
1410
1411  dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
1412
1413  cache->show_subtrees();
1414
1415  it->second.state = EXPORT_EXPORTING;
1416  assert(g_conf->mds_kill_export_at != 7);
1417
1418  assert(dir->is_frozen_tree_root());
1419  assert(dir->get_cum_auth_pins() == 0);
1420
1421  // set ambiguous auth
1422  cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
1423
1424  // take away the popularity we're sending.
1425  utime_t now = ceph_clock_now();
1426  mds->balancer->subtract_export(dir, now);
1427
1428  // fill export message with cache data
1429  MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
1430  map<client_t,entity_inst_t> exported_client_map;
1431  uint64_t num_exported_inodes = encode_export_dir(req->export_data,
1432					      dir,   // recur start point
1433					      exported_client_map,
1434					      now);
1435  ::encode(exported_client_map, req->client_map,
1436	   mds->mdsmap->get_up_features());
1437
1438  // add bounds to message
1439  set<CDir*> bounds;
1440  cache->get_subtree_bounds(dir, bounds);
1441  for (set<CDir*>::iterator p = bounds.begin();
1442       p != bounds.end();
1443       ++p)
1444    req->add_export((*p)->dirfrag());
1445
1446  // send
1447  mds->send_message_mds(req, dest);
1448  assert(g_conf->mds_kill_export_at != 8);
1449
  // +1 accounts for the base dirfrag itself in the target's load metric.
1450  mds->hit_export_target(now, dest, num_exported_inodes+1);
1451
1452  // stats
1453  if (mds->logger) mds->logger->inc(l_mds_exported);
1454  if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
1455
1456  cache->show_subtrees();
1457}
1458
1459
1460/** encode_export_inode
1461 * update our local state for this inode to export.
1462 * encode relevant state to be sent over the wire.
1463 * used by: encode_export_dir, file_rename (if foreign)
1464 *
1465 * FIXME: the separation between CInode.encode_export and these methods
1466 * is pretty arbitrary and dumb.
1467 */
1468void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
1469				   map<client_t,entity_inst_t>& exported_client_map)
1470{
1471  dout(7) << "encode_export_inode " << *in << dendl;
1472  assert(!in->is_replica(mds->get_nodeid()));
1473
1474  // relax locks?
  // If nobody else has a replica, drop lock states to their default
  // single-auth forms before they go over the wire.
1475  if (!in->is_replicated()) {
1476    in->replicate_relax_locks();
1477    dout(20) << " did replicate_relax_locks, now " << *in << dendl;
1478  }
1479
  // Wire format: ino, snapid (last), then the inode's own export blob.
  // Order must match the importer's decode exactly.
1480  ::encode(in->inode.ino, enc_state);
1481  ::encode(in->last, enc_state);
1482  in->encode_export(enc_state);
1483
1484  // caps
  // auth_cap=true: we are handing over the authoritative cap set.
1485  encode_export_inode_caps(in, true, enc_state, exported_client_map);
1486}
1487
// Encode the client capabilities of `in` for migration and record the
// owning clients in exported_client_map.  When auth_cap is set, also
// encodes mds_caps_wanted and pins the inode with EXPORTINGCAPS until
// finish_export_inode_caps() runs.
1488void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
1489					map<client_t,entity_inst_t>& exported_client_map)
1490{
1491  dout(20) << "encode_export_inode_caps " << *in << dendl;
1492
1493  // encode caps
1494  map<client_t,Capability::Export> cap_map;
1495  in->export_client_caps(cap_map);
1496  ::encode(cap_map, bl);
1497  if (auth_cap) {
1498    ::encode(in->get_mds_caps_wanted(), bl);
1499
    // Pin until the importer confirms the caps; released in
    // finish_export_inode_caps().
1500    in->state_set(CInode::STATE_EXPORTINGCAPS);
1501    in->get(CInode::PIN_EXPORTINGCAPS);
1502  }
1503
1504  // make note of clients named by exported capabilities
1505  for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1506       it != in->client_caps.end();
1507       ++it)
1508    exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
1509}
1510
// Cap migration has been confirmed by the importer: unpin EXPORTINGCAPS,
// tell each client (via MClientCaps CEPH_CAP_OP_EXPORT) where its caps
// moved, and drop the local cap records.  `peer_imported` carries the
// importer-side cap ids decoded from the MExportDirAck.
1511void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1512					map<client_t,Capability::Import>& peer_imported)
1513{
1514  dout(20) << "finish_export_inode_caps " << *in << dendl;
1515
1516  in->state_clear(CInode::STATE_EXPORTINGCAPS);
1517  in->put(CInode::PIN_EXPORTINGCAPS);
1518
1519  // tell (all) clients about migrating caps..
1520  for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1521       it != in->client_caps.end();
1522       ++it) {
1523    Capability *cap = it->second;
1524    dout(7) << "finish_export_inode_caps telling client." << it->first
1525	    << " exported caps on " << *in << dendl;
1526    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1527				     cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
1528
1529    map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
1530    assert(q != peer_imported.end());
28e407b8
AA
    // cap_id == 0 means the importer did not create a cap for this client,
    // so point the client at no peer (-1).
1531    m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
1532		    (q->second.cap_id > 0 ? peer : -1), 0);
7c673cae
FG
1533    mds->send_message_client_counted(m, it->first);
1534  }
1535  in->clear_client_caps_after_export();
1536  mds->locker->eval(in, CEPH_CAP_LOCKS);
1537}
1538
// Flip a single exported inode from auth to replica state: clear dirty
// bits, reset lock states, drop auth flags/pins, and hand cap ownership to
// the importer.  Woken waiters are appended to `finished`.
1539void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
1540				   map<client_t,Capability::Import>& peer_imported,
1541				   list<MDSInternalContextBase*>& finished)
1542{
1543  dout(12) << "finish_export_inode " << *in << dendl;
1544
1545  // clean
1546  if (in->is_dirty())
1547    in->mark_clean();
1548
1549  // clear/unpin cached_by (we're no longer the authority)
1550  in->clear_replica_map();
1551
1552  // twiddle lock states for auth -> replica transition
1553  in->authlock.export_twiddle();
1554  in->linklock.export_twiddle();
1555  in->dirfragtreelock.export_twiddle();
1556  in->filelock.export_twiddle();
1557  in->nestlock.export_twiddle();
1558  in->xattrlock.export_twiddle();
1559  in->snaplock.export_twiddle();
1560  in->flocklock.export_twiddle();
1561  in->policylock.export_twiddle();
1562
1563  // mark auth
1564  assert(in->is_auth());
1565  in->state_clear(CInode::STATE_AUTH);
1566  in->replica_nonce = CInode::EXPORT_NONCE;
1567
1568  in->clear_dirty_rstat();
1569
1570  // no more auth subtree? clear scatter dirty
1571  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
1572    in->clear_scatter_dirty();
1573
1574  in->item_open_file.remove_myself();
1575
1576  in->clear_dirty_parent();
1577
1578  in->clear_file_locks();
1579
1580  // waiters
1581  in->take_waiting(CInode::WAIT_ANY_MASK, finished);
1582
1583  in->finish_export(now);
1584
  // Notify clients and drop local caps now that the importer owns them.
1585  finish_export_inode_caps(in, peer, peer_imported);
7c673cae
FG
1586}
1587
// Recursively encode the dirfrag `dir` and everything beneath it (down to
// the export bounds) into `exportbl`.  Returns the number of dentries
// encoded.  Wire layout per frag: dirfrag_t, dir export blob, dentry
// count, then per dentry: name, snapid, dentry state, and a one-byte tag
// 'N' (null) / 'L' (remote link) / 'I' (primary inode + its export blob).
1588uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
1589				CDir *dir,
1590				map<client_t,entity_inst_t>& exported_client_map,
1591				utime_t now)
1592{
1593  uint64_t num_exported = 0;
1594
1595  dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
1596
  // The frozen subtree must have no uncommitted projections left.
1597  assert(dir->get_projected_version() == dir->get_version());
1598
1599#ifdef MDS_VERIFY_FRAGSTAT
1600  if (dir->is_complete())
1601    dir->verify_fragstat();
1602#endif
1603
1604  // dir
1605  dirfrag_t df = dir->dirfrag();
1606  ::encode(df, exportbl);
1607  dir->encode_export(exportbl);
1608
1609  __u32 nden = dir->items.size();
1610  ::encode(nden, exportbl);
1611
1612  // dentries
1613  list<CDir*> subdirs;
94b18763
FG
1614  for (auto &p : *dir) {
1615    CDentry *dn = p.second;
7c673cae
FG
1616    CInode *in = dn->get_linkage()->get_inode();
1617
1618    if (!dn->is_replicated())
1619      dn->lock.replicate_relax();
1620
1621    num_exported++;
1622
1623    // -- dentry
1624    dout(7) << "encode_export_dir exporting " << *dn << dendl;
1625
1626    // dn name
94b18763 1627    ::encode(dn->get_name(), exportbl);
7c673cae
FG
1628    ::encode(dn->last, exportbl);
1629
1630    // state
1631    dn->encode_export(exportbl);
1632
1633    // points to...
1634
1635    // null dentry?
1636    if (dn->get_linkage()->is_null()) {
1637      exportbl.append("N", 1);  // null dentry
1638      continue;
1639    }
1640
1641    if (dn->get_linkage()->is_remote()) {
1642      // remote link
1643      exportbl.append("L", 1);  // remote link
1644
1645      inodeno_t ino = dn->get_linkage()->get_remote_ino();
1646      unsigned char d_type = dn->get_linkage()->get_remote_d_type();
1647      ::encode(ino, exportbl);
1648      ::encode(d_type, exportbl);
1649      continue;
1650    }
1651
1652    // primary link
1653    // -- inode
1654    exportbl.append("I", 1);    // inode dentry
1655
1656    encode_export_inode(in, exportbl, exported_client_map);  // encode, and (update state for) export
1657
1658    // directory?
    // Queue nested (non-bound) dirfrags; bounds stay behind on this MDS.
1659    list<CDir*> dfs;
1660    in->get_dirfrags(dfs);
1661    for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
1662      CDir *t = *p;
1663      if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
1664	// include nested dirfrag
1665	assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
b32b8144 1666	subdirs.push_front(t);  // it's ours, recurse (later)
7c673cae
FG
1667      }
1668    }
1669  }
1670
1671  // subdirs
94b18763
FG
1672  for (auto &dir : subdirs)
1673    num_exported += encode_export_dir(exportbl, dir, exported_client_map, now);
7c673cae
FG
1674
1675  return num_exported;
1676}
1677
// Recursively flip an exported dirfrag (and everything under it) from auth
// to replica state after the importer has acked: drop replica maps, clear
// AUTH/dirty flags, hand off caps via finish_export_inode, collect woken
// waiters in `finished`, and count processed dentries in *num_dentries.
1678void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
1679				 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
1680				 list<MDSInternalContextBase*>& finished, int *num_dentries)
1681{
1682  dout(10) << "finish_export_dir " << *dir << dendl;
1683
1684  // release open_by
1685  dir->clear_replica_map();
1686
1687  // mark
1688  assert(dir->is_auth());
1689  dir->state_clear(CDir::STATE_AUTH);
1690  dir->remove_bloom();
1691  dir->replica_nonce = CDir::EXPORT_NONCE;
1692
1693  if (dir->is_dirty())
1694    dir->mark_clean();
1695
1696  // suck up all waiters
1697  dir->take_waiting(CDir::WAIT_ANY_MASK, finished);    // all dir waiters
1698
1699  // pop
1700  dir->finish_export(now);
1701
1702  // dentries
1703  list<CDir*> subdirs;
94b18763
FG
1704  for (auto &p : *dir) {
1705    CDentry *dn = p.second;
7c673cae
FG
1706    CInode *in = dn->get_linkage()->get_inode();
1707
1708    // dentry
1709    dn->finish_export();
1710
1711    // inode?
1712    if (dn->get_linkage()->is_primary()) {
1713      finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);
1714
1715      // subdirs?
1716      in->get_nested_dirfrags(subdirs);
1717    }
1718
    // Exported entries are prime eviction candidates now.
1719    cache->touch_dentry_bottom(dn);  // move dentry to tail of LRU
1720    ++(*num_dentries);
1721  }
1722
1723  // subdirs
1724  for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
1725    finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
1726}
1727
1728class C_MDS_ExportFinishLogged : public MigratorLogContext {
1729 CDir *dir;
1730public:
1731 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1732 void finish(int r) override {
1733 mig->export_logged_finish(dir);
1734 }
1735};
1736
1737
1738/*
1739 * i should get an export_ack from the export target.
1740 *
1741 * This function DOES put the passed message before returning
1742 */
1743void Migrator::handle_export_ack(MExportDirAck *m)
1744{
1745 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1746 mds_rank_t dest(m->get_source().num());
1747 utime_t now = ceph_clock_now();
1748 assert(dir);
1749 assert(dir->is_frozen_tree_root()); // i'm exporting!
1750
1751 // yay!
1752 dout(7) << "handle_export_ack " << *dir << dendl;
1753
1754 mds->hit_export_target(now, dest, -1);
1755
1756 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1757 assert(it != export_state.end());
1758 assert(it->second.state == EXPORT_EXPORTING);
1759 assert(it->second.tid == m->get_tid());
1760
1761 bufferlist::iterator bp = m->imported_caps.begin();
1762 ::decode(it->second.peer_imported, bp);
1763
1764 it->second.state = EXPORT_LOGGINGFINISH;
1765 assert (g_conf->mds_kill_export_at != 9);
1766 set<CDir*> bounds;
1767 cache->get_subtree_bounds(dir, bounds);
1768
7c673cae
FG
1769 // log completion.
1770 // include export bounds, to ensure they're in the journal.
31f18b77 1771 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
7c673cae
FG
1772 mds->mdlog->start_entry(le);
1773
1774 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
31f18b77 1775 le->metablob.add_dir(dir, false);
7c673cae
FG
1776 for (set<CDir*>::iterator p = bounds.begin();
1777 p != bounds.end();
1778 ++p) {
1779 CDir *bound = *p;
1780 le->get_bounds().insert(bound->dirfrag());
1781 le->metablob.add_dir_context(bound);
1782 le->metablob.add_dir(bound, false);
1783 }
1784
31f18b77
FG
1785 // list us second, them first.
1786 // this keeps authority().first in sync with subtree auth state in the journal.
1787 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1788
7c673cae
FG
1789 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1790 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1791 mds->mdlog->flush();
1792 assert (g_conf->mds_kill_export_at != 10);
1793
1794 m->put();
1795}
1796
b32b8144 1797void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
7c673cae
FG
1798{
1799 dout(7) << "export_notify_abort " << *dir << dendl;
1800
7c673cae
FG
1801 assert(stat.state == EXPORT_CANCELLING);
1802
1803 if (stat.notify_ack_waiting.empty()) {
1804 stat.state = EXPORT_CANCELLED;
1805 return;
1806 }
1807
1808 dir->auth_pin(this);
1809
1810 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1811 p != stat.notify_ack_waiting.end();
1812 ++p) {
b32b8144
FG
1813 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1814 pair<int,int>(mds->get_nodeid(), stat.peer),
1815 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
7c673cae
FG
1816 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1817 notify->get_bounds().push_back((*i)->dirfrag());
1818 mds->send_message_mds(notify, *p);
1819 }
1820}
1821
1822/*
1823 * this happens if the dest fails after i send the export data but before it is acked
1824 * that is, we don't know they safely received and logged it, so we reverse our changes
1825 * and go on.
1826 */
b32b8144 1827void Migrator::export_reverse(CDir *dir, export_state_t& stat)
7c673cae
FG
1828{
1829 dout(7) << "export_reverse " << *dir << dendl;
1830
1831 set<CInode*> to_eval;
1832
1833 set<CDir*> bounds;
1834 cache->get_subtree_bounds(dir, bounds);
1835
1836 // remove exporting pins
1837 list<CDir*> rq;
1838 rq.push_back(dir);
1839 while (!rq.empty()) {
1840 CDir *t = rq.front();
1841 rq.pop_front();
1842 t->abort_export();
94b18763
FG
1843 for (auto &p : *t) {
1844 CDentry *dn = p.second;
1845 dn->abort_export();
1846 if (!dn->get_linkage()->is_primary())
7c673cae 1847 continue;
94b18763 1848 CInode *in = dn->get_linkage()->get_inode();
7c673cae
FG
1849 in->abort_export();
1850 if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
1851 in->state_clear(CInode::STATE_EVALSTALECAPS);
1852 to_eval.insert(in);
1853 }
1854 if (in->is_dir())
1855 in->get_nested_dirfrags(rq);
1856 }
1857 }
1858
1859 // unpin bounds
b32b8144 1860 for (auto bd : bounds) {
7c673cae
FG
1861 bd->put(CDir::PIN_EXPORTBOUND);
1862 bd->state_clear(CDir::STATE_EXPORTBOUND);
1863 }
1864
7c673cae 1865 // notify bystanders
b32b8144 1866 export_notify_abort(dir, stat, bounds);
7c673cae 1867
224ce89b
WB
1868 // unfreeze tree, with possible subtree merge.
1869 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
1870
7c673cae
FG
1871 // process delayed expires
1872 cache->process_delayed_expire(dir);
224ce89b 1873
7c673cae 1874 dir->unfreeze_tree();
224ce89b 1875 cache->try_subtree_merge(dir);
b32b8144
FG
1876 for (auto bd : stat.residual_dirs) {
1877 bd->unfreeze_tree();
1878 cache->try_subtree_merge(bd);
1879 }
7c673cae
FG
1880
1881 // revoke/resume stale caps
1882 for (auto in : to_eval) {
1883 bool need_issue = false;
1884 for (auto& p : in->get_client_caps()) {
1885 Capability *cap = p.second;
1886 if (cap->is_stale()) {
1887 mds->locker->revoke_stale_caps(cap);
1888 } else {
1889 need_issue = true;
1890 }
1891 }
1892 if (need_issue &&
1893 (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
1894 mds->locker->issue_caps(in);
1895 }
1896
1897 cache->show_cache();
1898}
1899
1900
1901/*
1902 * once i get the ack, and logged the EExportFinish(true),
1903 * send notifies (if any), otherwise go straight to finish.
1904 *
1905 */
1906void Migrator::export_logged_finish(CDir *dir)
1907{
1908 dout(7) << "export_logged_finish " << *dir << dendl;
1909
1910 export_state_t& stat = export_state[dir];
1911
1912 // send notifies
1913 set<CDir*> bounds;
1914 cache->get_subtree_bounds(dir, bounds);
1915
1916 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1917 p != stat.notify_ack_waiting.end();
1918 ++p) {
1919 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1920 pair<int,int>(mds->get_nodeid(), stat.peer),
1921 pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
1922
1923 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1924 notify->get_bounds().push_back((*i)->dirfrag());
1925
1926 mds->send_message_mds(notify, *p);
1927 }
1928
1929 // wait for notifyacks
1930 stat.state = EXPORT_NOTIFYING;
1931 assert (g_conf->mds_kill_export_at != 11);
1932
1933 // no notifies to wait for?
1934 if (stat.notify_ack_waiting.empty()) {
1935 export_finish(dir); // skip notify/notify_ack stage.
1936 } else {
1937 // notify peer to send cap import messages to clients
1938 if (!mds->is_cluster_degraded() ||
1939 mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
1940 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
1941 } else {
1942 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
1943 }
1944 }
1945}
1946
1947/*
1948 * warning:
1949 * i'll get an ack from each bystander.
1950 * when i get them all, do the export.
1951 * notify:
1952 * i'll get an ack from each bystander.
1953 * when i get them all, unfreeze and send the finish.
1954 *
1955 * This function DOES put the passed message before returning
1956 */
// Handle a bystander's MExportDirNotifyAck.  The same message type acks
// three different notifies, disambiguated by our current state: warning
// acks (EXPORT_WARNING -> export_go), final notify acks (EXPORT_NOTIFYING
// -> export_finish), cancel acks (EXPORT_CANCELLING -> cancel finish), and
// — when we are the importer aborting an import — abort acks
// (IMPORT_ABORTING -> import_reverse_unfreeze).
// This function DOES put the passed message before returning.
1957void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
1958{
1959  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1960  mds_rank_t dest(m->get_source().num());
1961  utime_t now = ceph_clock_now();
1962  assert(dir);
1963  mds_rank_t from = mds_rank_t(m->get_source().num());
1964
1965  mds->hit_export_target(now, dest, -1);
1966
1967  auto export_state_entry = export_state.find(dir);
1968  if (export_state_entry != export_state.end()) {
1969    export_state_t& stat = export_state_entry->second;
    // erase() returns the number removed, so each branch both checks that
    // this rank was actually awaited and consumes its ack in one step.
1970    if (stat.state == EXPORT_WARNING &&
1971	stat.warning_ack_waiting.erase(from)) {
1972      // exporting. process warning.
1973      dout(7) << "handle_export_notify_ack from " << m->get_source()
1974	      << ": exporting, processing warning on " << *dir << dendl;
1975      if (stat.warning_ack_waiting.empty())
1976	export_go(dir);     // start export.
1977    } else if (stat.state == EXPORT_NOTIFYING &&
1978	       stat.notify_ack_waiting.erase(from)) {
1979      // exporting. process notify.
1980      dout(7) << "handle_export_notify_ack from " << m->get_source()
1981	      << ": exporting, processing notify on " << *dir << dendl;
1982      if (stat.notify_ack_waiting.empty())
1983	export_finish(dir);
1984    } else if (stat.state == EXPORT_CANCELLING &&
1985	       m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
1986	       stat.notify_ack_waiting.erase(from)) {
1987      dout(7) << "handle_export_notify_ack from " << m->get_source()
1988	      << ": cancelling export, processing notify on " << *dir << dendl;
1989      if (stat.notify_ack_waiting.empty()) {
1990	export_state.erase(export_state_entry);
1991	export_cancel_finish(dir);
1992      }
1993    }
1994  }
1995  else {
    // Not one of our exports: this may be the ack for an import-abort
    // notify sent while reversing an import of this dirfrag.
1996    auto import_state_entry = import_state.find(dir->dirfrag());
1997    if (import_state_entry != import_state.end()) {
1998      import_state_t& stat = import_state_entry->second;
1999      if (stat.state == IMPORT_ABORTING) {
2000	// reversing import
2001	dout(7) << "handle_export_notify_ack from " << m->get_source()
2002		<< ": aborting import on " << *dir << dendl;
2003	assert(stat.bystanders.count(from));
2004	stat.bystanders.erase(from);
2005	if (stat.bystanders.empty())
2006	  import_reverse_unfreeze(dir);
2007      }
2008    }
2009  }
2010
2011  m->put();
2012}
2013
// Final stage of a successful export: tell the new auth to commit,
// strip our local auth state for the subtree, unpin/unfreeze it, and
// clean up the export_state entry.  Safe to call even if the peer has
// since failed (the commit message is simply skipped).
void Migrator::export_finish(CDir *dir)
{
  dout(5) << "export_finish " << *dir << dendl;

  assert (g_conf->mds_kill_export_at != 12);
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end()) {
    dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
    return;
  }

  // send finish/commit to new auth
  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
    mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
  } else {
    dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
  }
  assert(g_conf->mds_kill_export_at != 13);

  // finish export (adjust local cache state)
  int num_dentries = 0;
  list<MDSInternalContextBase*> finished;
  finish_export_dir(dir, ceph_clock_now(), it->second.peer,
		    it->second.peer_imported, finished, &num_dentries);

  assert(!dir->is_auth());
  cache->adjust_subtree_auth(dir, it->second.peer);

  // unpin bounds
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bd = *p;
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  if (dir->state_test(CDir::STATE_AUXSUBTREE))
    dir->state_clear(CDir::STATE_AUXSUBTREE);

  // discard delayed expires
  cache->discard_delayed_expire(dir);

  dout(7) << "export_finish unfreezing" << dendl;

  // unfreeze tree, with possible subtree merge.
  // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // residual dirfrags could not go with this export; re-queue them
  // (at the front) for export to the same peer, and unfreeze them too.
  for (auto bd : it->second.residual_dirs) {
    export_queue.push_front(pair<dirfrag_t,mds_rank_t>(bd->dirfrag(), it->second.peer));
    bd->take_waiting(CDir::WAIT_ANY_MASK, finished);
    bd->unfreeze_tree();
    cache->try_subtree_merge(bd);
  }

  // no more auth subtree? clear scatter dirty
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  MutationRef mut = it->second.mut;
  // remove from exporting list, clean up state
  export_state.erase(it);
  dir->clear_exporting();

  cache->show_subtrees();
  audit();

  cache->trim(num_dentries); // try trimming exported dentries

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // drop locks, unpin path
  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // kick off the next queued export, if any
  maybe_do_queued_export();
}
2105
2106
2107
2108
2109
2110
2111
2112
2113// ==========================================================
2114// IMPORT
2115
// Importer-side handler for MExportDirDiscover: make sure the base inode
// of the proposed export is in our cache (discovering it if necessary),
// record IMPORT_DISCOVERING/IMPORT_DISCOVERED state, and ack back to the
// exporter.  The message may be re-dispatched (via C_MDS_RetryMessage)
// while waiting on the root or on path_traverse; m->started guards
// against re-initializing state on retry.
// This function DOES put the passed message before returning (except on
// the retry paths, which keep it alive for re-dispatch).
void Migrator::handle_export_discover(MExportDirDiscover *m)
{
  mds_rank_t from = m->get_source_mds();
  assert(from != mds->get_nodeid());

  dout(7) << "handle_export_discover on " << m->get_path() << dendl;

  // note import state
  dirfrag_t df = m->get_dirfrag();

  if (!mds->is_active()) {
    dout(7) << " not active, send NACK " << dendl;
    mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
    m->put();
    return;
  }

  // only start discovering on this message once.
  import_state_t *p_state;
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (!m->started) {
    assert(it == import_state.end());
    m->started = true;
    p_state = &import_state[df];
    p_state->state = IMPORT_DISCOVERING;
    p_state->peer = from;
    p_state->tid = m->get_tid();
  } else {
    // am i retrying after ancient path_traverse results?
    if (it == import_state.end() ||
	it->second.peer != from ||
	it->second.tid != m->get_tid()) {
      dout(7) << " dropping obsolete message" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_DISCOVERING);
    p_state = &it->second;
  }

  if (!mds->mdcache->is_open()) {
    dout(5) << " waiting for root" << dendl;
    // retry once the root is open; message NOT put
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  assert (g_conf->mds_kill_import_at != 1);

  // do we have it?
  CInode *in = cache->get_inode(m->get_dirfrag().ino);
  if (!in) {
    // must discover it!
    filepath fpath(m->get_path());
    vector<CDentry*> trace;
    MDRequestRef null_ref;
    // path_traverse with MDS_TRAVERSE_DISCOVER re-dispatches m itself
    // when r > 0 (discovery in progress)
    int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
    if (r > 0) return;
    if (r < 0) {
      dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
      ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
    }

    ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
  }

  // yay
  dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;

  p_state->state = IMPORT_DISCOVERED;

  // pin inode in the cache (for now)
  assert(in->is_dir());
  in->get(CInode::PIN_IMPORTING);

  // reply
  dout(7) << " sending export_discover_ack on " << *in << dendl;
  mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), p_state->peer);
  m->put();
  assert (g_conf->mds_kill_import_at != 2);
}
2196
// Abort an import that is still in IMPORT_DISCOVERING: nothing has been
// pinned yet, so just forget the import attempt.
void Migrator::import_reverse_discovering(dirfrag_t df)
{
  import_state.erase(df);
}
2201
// Abort an import in IMPORT_DISCOVERED: drop the PIN_IMPORTING ref taken
// on the base inode in handle_export_discover(), then forget the attempt.
void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
{
  // unpin base
  diri->put(CInode::PIN_IMPORTING);
  import_state.erase(df);
}
2208
// Abort an import in IMPORT_PREPPING: resolve the recorded bound dirfrags
// to CDirs, drop all pins taken so far, and finish the reversal.
void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
{
  set<CDir*> bounds;
  cache->map_dirfrag_set(stat.bound_ls, bounds);
  import_remove_pins(dir, bounds);
  import_reverse_final(dir);
}
2216
/* Importer-side handler for MExportDirCancel: the exporter aborted, so
 * unwind whatever import stage we had reached for this dirfrag.
 * This function DOES put the passed message before returning*/
void Migrator::handle_export_cancel(MExportDirCancel *m)
{
  dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
  dirfrag_t df = m->get_dirfrag();
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (it == import_state.end()) {
    assert(0 == "got export_cancel in weird state");
  } else if (it->second.state == IMPORT_DISCOVERING) {
    import_reverse_discovering(df);
  } else if (it->second.state == IMPORT_DISCOVERED) {
    CInode *in = cache->get_inode(df.ino);
    assert(in);
    import_reverse_discovered(df, in);
  } else if (it->second.state == IMPORT_PREPPING) {
    CDir *dir = mds->mdcache->get_dirfrag(df);
    assert(dir);
    import_reverse_prepping(dir, it->second);
  } else if (it->second.state == IMPORT_PREPPED) {
    CDir *dir = mds->mdcache->get_dirfrag(df);
    assert(dir);
    set<CDir*> bounds;
    cache->get_subtree_bounds(dir, bounds);
    import_remove_pins(dir, bounds);
    // adjust auth back to the exporter
    cache->adjust_subtree_auth(dir, it->second.peer);
    // the subtree was frozen in handle_export_prep; unfreeze and finish
    import_reverse_unfreeze(dir);
  } else {
    // later states (PREPPED < state) should see MExportDir instead
    assert(0 == "got export_cancel in weird state");
  }
  m->put();
}
2249
/* Importer-side handler for MExportDirPrep.
 *
 * Two-pass protocol: on the first pass (!m->did_assim()) we replicate the
 * base dirfrag, record IMPORT_PREPPING state, and assimilate the traces
 * for the export bounds; on subsequent passes (re-dispatched after
 * opening remote bound dirfrags) we skip assimilation.  Once every bound
 * dirfrag is open and pinned, we note ambiguous auth, freeze the subtree,
 * and ack (success or failure) back to the exporter.
 *
 * This function DOES put the passed message before returning (except on
 * the open_remote_dirfrag retry path, which keeps it alive).
 */
void Migrator::handle_export_prep(MExportDirPrep *m)
{
  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  assert(oldauth != mds->get_nodeid());

  CDir *dir;
  CInode *diri;
  list<MDSInternalContextBase*> finished;

  // assimilate root dir.
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  if (!m->did_assim()) {
    // first pass: state was set up by handle_export_discover
    assert(it != import_state.end());
    assert(it->second.state == IMPORT_DISCOVERED);
    assert(it->second.peer == oldauth);
    diri = cache->get_inode(m->get_dirfrag().ino);
    assert(diri);
    bufferlist::iterator p = m->basedir.begin();
    dir = cache->add_replica_dir(p, diri, oldauth, finished);
    dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
  } else {
    if (it == import_state.end() ||
	it->second.peer != oldauth ||
	it->second.tid != m->get_tid()) {
      dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_PREPPING);
    assert(it->second.peer == oldauth);

    dir = cache->get_dirfrag(m->get_dirfrag());
    assert(dir);
    dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
    diri = dir->get_inode();
  }
  assert(dir->is_auth() == false);

  cache->show_subtrees();

  // build import bound map
  map<inodeno_t, fragset_t> import_bound_fragset;
  for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
       p != m->get_bounds().end();
       ++p) {
    dout(10) << " bound " << *p << dendl;
    import_bound_fragset[p->ino].insert(p->frag);
  }

  // assimilate contents?
  if (!m->did_assim()) {
    dout(7) << "doing assim on " << *dir << dendl;
    m->mark_assim(); // only do this the first time!

    // change import state
    it->second.state = IMPORT_PREPPING;
    it->second.bound_ls = m->get_bounds();
    it->second.bystanders = m->get_bystanders();
    assert(g_conf->mds_kill_import_at != 3);

    // bystander list
    dout(7) << "bystanders are " << it->second.bystanders << dendl;

    // move pin to dir
    diri->put(CInode::PIN_IMPORTING);
    dir->get(CDir::PIN_IMPORTING);
    dir->state_set(CDir::STATE_IMPORTING);

    // assimilate traces to exports
    // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
    for (list<bufferlist>::iterator p = m->traces.begin();
	 p != m->traces.end();
	 ++p) {
      bufferlist::iterator q = p->begin();
      dirfrag_t df;
      ::decode(df, q);
      char start;
      ::decode(start, q);
      dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;

      CDir *cur = 0;
      if (start == 'd') {
	// trace starts at a dirfrag we already have
	cur = cache->get_dirfrag(df);
	assert(cur);
	dout(10) << " had " << *cur << dendl;
      } else if (start == 'f') {
	// trace starts at an inode we have; replicate its dirfrag
	CInode *in = cache->get_inode(df.ino);
	assert(in);
	dout(10) << " had " << *in << dendl;
	cur = cache->add_replica_dir(q, in, oldauth, finished);
	dout(10) << " added " << *cur << dendl;
      } else if (start == '-') {
	// nothing
      } else
	assert(0 == "unrecognized start char");

      // replicate alternating (dentry, inode[, dir]) records until the
      // trace buffer is exhausted
      while (!q.end()) {
	CDentry *dn = cache->add_replica_dentry(q, cur, finished);
	dout(10) << " added " << *dn << dendl;
	CInode *in = cache->add_replica_inode(q, dn, finished);
	dout(10) << " added " << *in << dendl;
	if (q.end())
	  break;
	cur = cache->add_replica_dir(q, in, oldauth, finished);
	dout(10) << " added " << *cur << dendl;
      }
    }

    // make bound sticky
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
	 p != import_bound_fragset.end();
	 ++p) {
      CInode *in = cache->get_inode(p->first);
      assert(in);
      in->get_stickydirs();
      dout(7) << " set stickydirs on bound inode " << *in << dendl;
    }

  } else {
    dout(7) << " not doing assim on " << *dir << dendl;
  }

  if (!finished.empty())
    mds->queue_waiters(finished);


  bool success = true;
  if (mds->is_active()) {
    // open all bounds
    set<CDir*> import_bounds;
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
	 p != import_bound_fragset.end();
	 ++p) {
      CInode *in = cache->get_inode(p->first);
      assert(in);

      // map fragset into a frag_t list, based on the inode fragtree
      list<frag_t> fglist;
      for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
	in->dirfragtree.get_leaves_under(*q, fglist);
      dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;

      for (list<frag_t>::iterator q = fglist.begin();
	   q != fglist.end();
	   ++q) {
	CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
	if (!bound) {
	  dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
	  // retry this handler once the remote dirfrag is open;
	  // message NOT put
	  cache->open_remote_dirfrag(in, *q,
				     new C_MDS_RetryMessage(mds, m));
	  return;
	}

	if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
	  dout(7) << " pinning import bound " << *bound << dendl;
	  bound->get(CDir::PIN_IMPORTBOUND);
	  bound->state_set(CDir::STATE_IMPORTBOUND);
	} else {
	  dout(7) << " already pinned import bound " << *bound << dendl;
	}
	import_bounds.insert(bound);
      }
    }

    dout(7) << " all ready, noting auth and freezing import region" << dendl;

    if (!mds->mdcache->is_readonly() &&
	dir->get_inode()->filelock.can_wrlock(-1) &&
	dir->get_inode()->nestlock.can_wrlock(-1)) {
      it->second.mut = new MutationImpl();
      // force some locks. hacky.
      mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
      mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);

      // note that i am an ambiguous auth for this subtree.
      // specify bounds, since the exporter explicitly defines the region.
      cache->adjust_bounded_subtree_auth(dir, import_bounds,
					 pair<int,int>(oldauth, mds->get_nodeid()));
      cache->verify_subtree_bounds(dir, import_bounds);
      // freeze.
      dir->_freeze_tree();
      // note new state
      it->second.state = IMPORT_PREPPED;
    } else {
      dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
      success = false;
    }
  } else {
    dout(7) << " not active, failing. " << *dir << dendl;
    success = false;
  }

  if (!success)
    import_reverse_prepping(dir, it->second);

  // ok!
  dout(7) << " sending export_prep_ack on " << *dir << dendl;
  mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());

  assert(g_conf->mds_kill_import_at != 4);
  // done
  m->put();
}
2454
2455
2456
2457
// Journal-completion context for EImportStart: once the import-start
// event is durable, hand the sessions force-opened during
// handle_export_dir back to import_logged_start.
class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
  dirfrag_t df;
  CDir *dir;
  mds_rank_t from;          // exporter (old auth) rank
public:
  // sessions force-opened before journaling; filled in by
  // prepare_force_open_sessions() in handle_export_dir
  map<client_t,pair<Session*,uint64_t> > imported_session_map;

  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
    MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
  }
  void finish(int r) override {
    mig->import_logged_start(df, dir, from, imported_session_map);
  }
};
2472
/* Importer-side handler for MExportDir: decode the exported metadata into
 * our cache, journal an EImportStart (with client map and bounds), note
 * ambiguous auth with ourselves listed first, and move to
 * IMPORT_LOGGINGSTART until the journal entry commits.
 * This function DOES put the passed message before returning*/
void Migrator::handle_export_dir(MExportDir *m)
{
  assert (g_conf->mds_kill_import_at != 5);
  CDir *dir = cache->get_dirfrag(m->dirfrag);
  assert(dir);

  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;

  assert(!dir->is_auth());

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_PREPPED);
  assert(it->second.tid == m->get_tid());
  assert(it->second.peer == oldauth);

  utime_t now = ceph_clock_now();

  // the imported dirfrag must be a leaf of the inode's fragtree
  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());

  cache->show_subtrees();

  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);

  // start the journal entry
  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
  mds->mdlog->start_entry(le);

  le->metablob.add_dir_context(dir);

  // adjust auth (list us _first_)
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);

  // new client sessions, open these after we journal
  // include imported sessions in EImportStart
  bufferlist::iterator cmp = m->client_map.begin();
  map<client_t,entity_inst_t> client_map;
  decode(client_map, cmp);
  assert(cmp.end());
  le->cmapv = mds->server->prepare_force_open_sessions(client_map, onlogged->imported_session_map);
  encode(client_map, le->client_map, mds->mdsmap->get_up_features());

  // decode the exported dirfrags/dentries/inodes into our cache
  bufferlist::iterator blp = m->export_data.begin();
  int num_imported_inodes = 0;
  while (!blp.end()) {
    num_imported_inodes +=
      decode_import_dir(blp,
			oldauth,
			dir,                 // import root
			le,
			mds->mdlog->get_current_segment(),
			it->second.peer_exports,
			it->second.updated_scatterlocks,
			now);
  }
  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;

  // include bounds in EImportStart
  set<CDir*> import_bounds;
  for (vector<dirfrag_t>::iterator p = m->bounds.begin();
       p != m->bounds.end();
       ++p) {
    CDir *bd = cache->get_dirfrag(*p);
    assert(bd);
    le->metablob.add_dir(bd, false); // note that parent metadata is already in the event
    import_bounds.insert(bd);
  }
  cache->verify_subtree_bounds(dir, import_bounds);

  // adjust popularity
  mds->balancer->add_import(dir, now);

  dout(7) << "handle_export_dir did " << *dir << dendl;

  // note state
  it->second.state = IMPORT_LOGGINGSTART;
  assert (g_conf->mds_kill_import_at != 6);

  // log it
  mds->mdlog->submit_entry(le, onlogged);
  mds->mdlog->flush();

  // some stats
  if (mds->logger) {
    mds->logger->inc(l_mds_imported);
    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
  }

  m->put();
}
2566
2567
/*
 * this is an import helper
 *  called by import_finish, and import_reverse and friends.
 *
 * Drops the pins/state flags taken during the import: the PIN_IMPORTING
 * ref on the root dirfrag, the stickydirs refs on the bounding inodes,
 * and (depending on how far the import got) the IMPORTBOUND pins on the
 * bound dirfrags.
 */
void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
{
  import_state_t& stat = import_state[dir->dirfrag()];
  // root
  dir->put(CDir::PIN_IMPORTING);
  dir->state_clear(CDir::STATE_IMPORTING);

  // bounding inodes
  // (bound_ls may list multiple frags of the same inode; 'did'
  //  ensures each inode's stickydirs ref is dropped only once)
  set<inodeno_t> did;
  for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
       p != stat.bound_ls.end();
       ++p) {
    if (did.count(p->ino))
      continue;
    did.insert(p->ino);
    CInode *in = cache->get_inode(p->ino);
    assert(in);
    in->put_stickydirs();
  }

  if (stat.state == IMPORT_PREPPING) {
    // prepping may have pinned only some of the bounds so far
    for (auto bd : bounds) {
      if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
	bd->put(CDir::PIN_IMPORTBOUND);
	bd->state_clear(CDir::STATE_IMPORTBOUND);
      }
    }
  } else if (stat.state >= IMPORT_PREPPED) {
    // bounding dirfrags
    for (auto bd : bounds) {
      assert(bd->state_test(CDir::STATE_IMPORTBOUND));
      bd->put(CDir::PIN_IMPORTBOUND);
      bd->state_clear(CDir::STATE_IMPORTBOUND);
    }
  }
}
2608
2609
2610/*
2611 * note: this does teh full work of reversing and import and cleaning up
2612 * state.
2613 * called by both handle_mds_failure and by handle_resolve (if we are
2614 * a survivor coping with an exporter failure+recovery).
2615 */
2616void Migrator::import_reverse(CDir *dir)
2617{
2618 dout(7) << "import_reverse " << *dir << dendl;
2619
2620 import_state_t& stat = import_state[dir->dirfrag()];
2621 stat.state = IMPORT_ABORTING;
2622
2623 set<CDir*> bounds;
2624 cache->get_subtree_bounds(dir, bounds);
2625
2626 // remove pins
2627 import_remove_pins(dir, bounds);
2628
2629 // update auth, with possible subtree merge.
2630 assert(dir->is_subtree_root());
2631 if (mds->is_resolve())
2632 cache->trim_non_auth_subtree(dir);
2633
2634 cache->adjust_subtree_auth(dir, stat.peer);
2635
2636 C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
2637 if (!dir->get_inode()->is_auth() &&
2638 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2639 dir->get_inode()->clear_scatter_dirty();
2640 // wake up scatter_nudge waiters
2641 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2642 }
2643
2644 int num_dentries = 0;
2645 // adjust auth bits.
2646 list<CDir*> q;
2647 q.push_back(dir);
2648 while (!q.empty()) {
2649 CDir *cur = q.front();
2650 q.pop_front();
2651
2652 // dir
2653 assert(cur->is_auth());
2654 cur->state_clear(CDir::STATE_AUTH);
2655 cur->remove_bloom();
2656 cur->clear_replica_map();
2657 cur->set_replica_nonce(CDir::EXPORT_NONCE);
2658 if (cur->is_dirty())
2659 cur->mark_clean();
2660
94b18763
FG
2661 for (auto &p : *cur) {
2662 CDentry *dn = p.second;
7c673cae
FG
2663
2664 // dentry
2665 dn->state_clear(CDentry::STATE_AUTH);
2666 dn->clear_replica_map();
2667 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2668 if (dn->is_dirty())
2669 dn->mark_clean();
2670
2671 // inode?
2672 if (dn->get_linkage()->is_primary()) {
2673 CInode *in = dn->get_linkage()->get_inode();
2674 in->state_clear(CDentry::STATE_AUTH);
2675 in->clear_replica_map();
2676 in->set_replica_nonce(CInode::EXPORT_NONCE);
2677 if (in->is_dirty())
2678 in->mark_clean();
2679 in->clear_dirty_rstat();
2680 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2681 in->clear_scatter_dirty();
2682 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2683 }
2684
2685 in->clear_dirty_parent();
2686
2687 in->authlock.clear_gather();
2688 in->linklock.clear_gather();
2689 in->dirfragtreelock.clear_gather();
2690 in->filelock.clear_gather();
2691
2692 in->clear_file_locks();
2693
2694 // non-bounding dir?
2695 list<CDir*> dfs;
2696 in->get_dirfrags(dfs);
2697 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2698 if (bounds.count(*p) == 0)
2699 q.push_back(*p);
2700 }
2701
2702 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2703 ++num_dentries;
2704 }
2705 }
2706
2707 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
2708
2709 if (stat.state == IMPORT_ACKING) {
2710 // remove imported caps
2711 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
28e407b8
AA
2712 p != stat.peer_exports.end();
2713 ++p) {
7c673cae
FG
2714 CInode *in = p->first;
2715 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
28e407b8
AA
2716 q != p->second.end();
2717 ++q) {
7c673cae 2718 Capability *cap = in->get_client_cap(q->first);
28e407b8
AA
2719 if (!cap) {
2720 assert(!stat.session_map.count(q->first));
2721 continue;
2722 }
7c673cae
FG
2723 if (cap->is_importing())
2724 in->remove_client_cap(q->first);
2725 }
2726 in->put(CInode::PIN_IMPORTINGCAPS);
2727 }
28e407b8
AA
2728 for (auto& p : stat.session_map) {
2729 Session *session = p.second.first;
7c673cae
FG
2730 session->dec_importing();
2731 }
2732 }
2733
2734 // log our failure
2735 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2736
181888fb 2737 cache->trim(num_dentries); // try trimming dentries
7c673cae
FG
2738
2739 // notify bystanders; wait in aborting state
2740 import_notify_abort(dir, bounds);
2741}
2742
2743void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2744{
2745 dout(7) << "import_notify_finish " << *dir << dendl;
2746
2747 import_state_t& stat = import_state[dir->dirfrag()];
2748 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2749 p != stat.bystanders.end();
2750 ++p) {
2751 MExportDirNotify *notify =
2752 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
2753 pair<int,int>(stat.peer, mds->get_nodeid()),
2754 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
2755 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2756 notify->get_bounds().push_back((*i)->dirfrag());
2757 mds->send_message_mds(notify, *p);
2758 }
2759}
2760
// Tell every live bystander MDS that this import is being aborted
// (auth reverts to the old peer).  Bystanders that have themselves
// failed are dropped from the set; if no bystanders remain we can
// finish the reversal immediately instead of waiting for acks in
// IMPORT_ABORTING (see handle_export_notify_ack).
void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_abort " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end(); ) {
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
      // this can happen if both exporter and bystander fail in the same mdsmap epoch
      // (erase-then-advance keeps the iterator valid)
      stat.bystanders.erase(p++);
      continue;
    }
    MExportDirNotify *notify =
      new MExportDirNotify(dir->dirfrag(), stat.tid, true,
			   mds_authority_t(stat.peer, mds->get_nodeid()),
			   mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
    ++p;
  }
  if (stat.bystanders.empty()) {
    dout(7) << "no bystanders, finishing reverse now" << dendl;
    import_reverse_unfreeze(dir);
  } else {
    assert (g_conf->mds_kill_import_at != 10);
  }
}
2790
// Last unfreeze step of a reversed import: drop delayed expires, thaw
// the tree (possibly merging subtrees), and clean up the import state.
void Migrator::import_reverse_unfreeze(CDir *dir)
{
  dout(7) << "import_reverse_unfreeze " << *dir << dendl;
  // auth was already adjusted back to the exporter by the caller
  assert(!dir->is_auth());
  cache->discard_delayed_expire(dir);
  dir->unfreeze_tree();
  if (dir->is_subtree_root())
    cache->try_subtree_merge(dir);
  import_reverse_final(dir);
}
2801
// Final cleanup shared by all import-reversal paths: erase the
// import_state entry, kick pending resolves, and drop any locks held by
// the import's mutation.
void Migrator::import_reverse_final(CDir *dir)
{
  dout(7) << "import_reverse_final " << *dir << dendl;

  // clean up
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());

  // grab the mutation before erasing the state entry that owns it
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  cache->show_subtrees();
  //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
}
2824
2825
2826
2827
// Called once the EImportStart journal entry is durable: finish opening
// the imported client sessions, merge the imported caps, move to
// IMPORT_ACKING, and ack back to the old auth.  If the import was
// aborted while we were journaling, just finish the session bookkeeping
// and bail.
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
				   map<client_t,pair<Session*,uint64_t> >& imported_session_map)
{
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  if (it == import_state.end() ||
      it->second.state != IMPORT_LOGGINGSTART) {
    dout(7) << "import " << df << " must have aborted" << dendl;
    mds->server->finish_force_open_sessions(imported_session_map);
    return;
  }

  dout(7) << "import_logged " << *dir << dendl;

  // note state
  it->second.state = IMPORT_ACKING;

  assert (g_conf->mds_kill_import_at != 7);

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_session_map, false);

  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
       p != it->second.peer_exports.end();
       ++p) {
    // parameter 'peer' is NONE, delay sending cap import messages to client
    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
			     p->second, imported_caps[p->first->ino()]);
  }

  // remember the sessions so import_finish/import_reverse can settle them
  it->second.session_map.swap(imported_session_map);

  // send notify's etc.
  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;

  // test surviving observer of a failed migration that did not complete
  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);

  MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
  ::encode(imported_caps, ack->imported_caps);

  mds->send_message_mds(ack, from);
  assert (g_conf->mds_kill_import_at != 8);

  cache->show_subtrees();
}
2874
/* Importer-side handler for MExportDirFinish (the exporter's commit):
 * validates the import state and delegates to import_finish.
 * This function DOES put the passed message before returning*/
void Migrator::handle_export_finish(MExportDirFinish *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  assert(dir);
  dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  assert(it != import_state.end());
  assert(it->second.tid == m->get_tid());

  // notify=false: the exporter drives bystander notification itself
  import_finish(dir, false, m->is_last());

  m->put();
}
2890
/*
 * Finalize an import on the importing side.
 *
 * dir:    root of the imported subtree.
 * notify: if true, tell bystanders via import_notify_finish() before
 *         dropping pins (used by non-message-driven callers — TODO confirm).
 * last:   false means only the first of the exporter's finish messages has
 *         arrived; we move IMPORT_ACKING -> IMPORT_FINISHING and wait for
 *         the last one before tearing down the import state.
 */
void Migrator::import_finish(CDir *dir, bool notify, bool last)
{
  dout(7) << "import_finish on " << *dir << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);

  // First pass only (state still ACKING): claim full authority over the
  // subtree; on the second pass (FINISHING) this was already done.
  if (it->second.state == IMPORT_ACKING) {
    assert(dir->is_auth());
    cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
  }

  // log finish
  assert(g_conf->mds_kill_import_at != 9);

  if (it->second.state == IMPORT_ACKING) {
    // Hand the imported caps to their clients: merge each exported cap into
    // the local Capability and queue a cap-import message per client.
    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
        p != it->second.peer_exports.end();
        ++p) {
      CInode *in = p->first;
      assert(in->is_auth());
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
          q != p->second.end();
          ++q) {
        // Clients without a local session were skipped at import time.
        auto r = it->second.session_map.find(q->first);
        if (r == it->second.session_map.end())
          continue;

        Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        assert(cap);
        cap->merge(q->second, true);
        cap->clear_importing();
        mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
                                    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
      }
      p->second.clear();
      in->replica_caps_wanted = 0;
    }
    // Drop the per-session importing refs taken when the import started.
    for (auto& p : it->second.session_map) {
      Session *session = p.second.first;
      session->dec_importing();
    }
  }

  // Not the exporter's last finish message yet: park in FINISHING and wait.
  if (!last) {
    assert(it->second.state == IMPORT_ACKING);
    it->second.state = IMPORT_FINISHING;
    return;
  }

  // remove pins
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  if (notify)
    import_notify_finish(dir, bounds);

  import_remove_pins(dir, bounds);

  // Move peer_exports aside before erasing the import_state entry; the caps
  // are re-evaluated (and unpinned) below, after the state is gone.
  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  it->second.peer_exports.swap(peer_exports);

  // clear import state (we're done!)
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));

  // process delayed expires
  cache->process_delayed_expire(dir);

  // unfreeze tree, with possible subtree merge.
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  cache->show_subtrees();
  //audit();  // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // re-eval imported caps
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
       p != peer_exports.end();
       ++p) {
    if (p->first->is_auth())
      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
    p->first->put(CInode::PIN_IMPORTINGCAPS);
  }

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // did i just import mydir?
  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
    cache->populate_mydir();

  // is it empty?
  if (dir->get_num_head_items() == 0 &&
      !dir->inode->is_auth()) {
    // reexport!
    export_empty_import(dir);
  }
}
2999
3000
/*
 * Decode one primary inode from an import stream and link it under `dn`.
 *
 * dn:                   dentry the inode hangs off.
 * blp:                  decode cursor into the import bufferlist.
 * oldauth:              the exporting rank; it becomes a replica of us.
 * ls:                   log segment to attach decoded dirty state to.
 * peer_exports:         out: per-inode client cap exports noted for later
 *                       action (see decode_import_inode_caps()).
 * updated_scatterlocks: out: scatterlocks found dirty; journaled only when
 *                       the import actually finishes.
 */
void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
                                   mds_rank_t oldauth, LogSegment *ls,
                                   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
                                   list<ScatterLock*>& updated_scatterlocks)
{
  dout(15) << "decode_import_inode on " << *dn << dendl;

  inodeno_t ino;
  snapid_t last;
  ::decode(ino, blp);
  ::decode(last, blp);

  // Reuse a cached CInode for (ino, last) if present; otherwise create one
  // and remember to register it with the cache below.
  bool added = false;
  CInode *in = cache->get_inode(ino, last);
  if (!in) {
    in = new CInode(mds->mdcache, true, 1, last);
    added = true;
  }

  // state after link -- or not! -sage
  in->decode_import(blp, ls);  // cap imports are noted for later action

  // caps
  decode_import_inode_caps(in, true, blp, peer_exports);

  // link before state -- or not! -sage
  if (dn->get_linkage()->get_inode() != in) {
    assert(!dn->get_linkage()->get_inode());
    dn->dir->link_primary_inode(dn, in);
  }

  // keep the parent dir's subdir popularity LRU up to date
  if (in->is_dir())
    dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);

  // add inode?
  if (added) {
    cache->add_inode(in);
    dout(10) << "added " << *in << dendl;
  } else {
    dout(10) << " had " << *in << dendl;
  }

  if (in->inode.is_dirty_rstat())
    in->mark_dirty_rstat();

  // clear if dirtyscattered, since we're going to journal this
  //  but not until we _actually_ finish the import...
  if (in->filelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->filelock);
    mds->locker->mark_updated_scatterlock(&in->filelock);
  }

  if (in->dirfragtreelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->dirfragtreelock);
    mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
  }

  // adjust replica list
  //assert(!in->is_replica(oldauth));  // not true on failed export
  in->add_replica(oldauth, CInode::EXPORT_NONCE);
  if (in->is_replica(mds->get_nodeid()))
    in->remove_replica(mds->get_nodeid());
}
3064
3065void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
3066 bufferlist::iterator &blp,
3067 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
3068{
3069 map<client_t,Capability::Export> cap_map;
3070 ::decode(cap_map, blp);
3071 if (auth_cap)
3072 ::decode(in->get_mds_caps_wanted(), blp);
3073 if (!cap_map.empty() ||
b32b8144 3074 (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
7c673cae
FG
3075 peer_exports[in].swap(cap_map);
3076 in->get(CInode::PIN_IMPORTINGCAPS);
3077 }
3078}
3079
/*
 * Instantiate/merge imported client capabilities on inode `in`.
 *
 * peer:        rank of the exporting mds; when peer < 0 caps are only
 *              marked importing and no cap-import messages are queued —
 *              TODO confirm which callers pass peer < 0 (rejoin path?).
 * auth_cap:    true for auth caps (subtree migration), false for the plain
 *              cap-export path (see handle_export_caps()).
 * session_map: client -> (Session, serial) of locally known sessions.
 * export_map:  per-client cap state encoded by the exporter.
 * import_map:  out: per-client import info returned to the exporter so it
 *              can notify clients / drop its own caps.
 */
void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
                                        const map<client_t,pair<Session*,uint64_t> >& session_map,
                                        const map<client_t,Capability::Export> &export_map,
                                        map<client_t,Capability::Import> &import_map)
{
  for (auto& it : export_map) {
    dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;

    // No session for this client: record an empty import entry (so the
    // exporter still learns about the client) and skip the cap work.
    auto p = session_map.find(it.first);
    if (p == session_map.end()) {
      dout(10) << " no session for client." << it.first << dendl;
      (void)import_map[it.first];
      continue;
    }

    Session *session = p->second.first;

    Capability *cap = in->get_client_cap(it.first);
    if (!cap) {
      cap = in->add_client_cap(it.first, session);
      if (peer < 0)
        cap->mark_importing();
    }

    // Always ask exporter mds to send cap export messages for auth caps.
    // For non-auth caps, ask exporter mds to send cap export messages to
    // clients who haven't opened sessions. The cap export messages will
    // make clients open sessions.
    if (auth_cap || session->connection == nullptr) {
      Capability::Import& im = import_map[it.first];
      im.cap_id = cap->get_cap_id();
      im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
      im.issue_seq = cap->get_last_seq() + 1;
    }

    // With a known exporter, merge the exported state and queue the
    // cap-import message to the client.
    if (peer >= 0) {
      cap->merge(it.second, auth_cap);
      mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
                                  it.second.seq, it.second.mseq - 1, peer,
                                  auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
    }
  }

  if (peer >= 0) {
    in->replica_caps_wanted = 0;
    in->put(CInode::PIN_IMPORTINGCAPS);  // pinned in decode_import_inode_caps()
  }
}
3128
/*
 * Decode one dirfrag (plus its dentries and any primary inodes) from an
 * import stream.
 *
 * blp:          decode cursor into the import bufferlist.
 * oldauth:      exporting rank; becomes a replica of each decoded object.
 * import_root:  root dir of the whole import; collects UNFREEZE waiters.
 * le:           journal entry to record imported dirs/dentries in (may be
 *               NULL, in which case nothing is journaled and inline inodes
 *               are not expected — see the assert on icode 'I').
 * ls:           log segment for decoded dirty state.
 * peer_exports, updated_scatterlocks: accumulated per-inode state, see
 *               decode_import_inode().
 * now:          timestamp used when decoding dir state.
 *
 * Returns the number of dentries decoded.
 */
int Migrator::decode_import_dir(bufferlist::iterator& blp,
                                mds_rank_t oldauth,
                                CDir *import_root,
                                EImportStart *le,
                                LogSegment *ls,
                                map<CInode*,map<client_t,Capability::Export> >& peer_exports,
                                list<ScatterLock*>& updated_scatterlocks, utime_t now)
{
  // set up dir
  dirfrag_t df;
  ::decode(df, blp);

  // the dir's inode must already be in cache (decoded earlier in the stream)
  CInode *diri = cache->get_inode(df.ino);
  assert(diri);
  CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
  assert(dir);

  dout(7) << "decode_import_dir " << *dir << dendl;

  // assimilate state
  dir->decode_import(blp, now, ls);

  // adjust replica list
  //assert(!dir->is_replica(oldauth));  // not true on failed export
  dir->add_replica(oldauth, CDir::EXPORT_NONCE);
  if (dir->is_replica(mds->get_nodeid()))
    dir->remove_replica(mds->get_nodeid());

  // add to journal entry
  if (le)
    le->metablob.add_import_dir(dir);

  int num_imported = 0;

  // take all waiters on this dir
  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
  // a replica's presense in my cache implies/forces it's presense in authority's.
  list<MDSInternalContextBase*> waiters;
  dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
  for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
       it != waiters.end();
       ++it)
    import_root->add_waiter(CDir::WAIT_UNFREEZE, *it);  // UNFREEZE will get kicked both on success or failure

  dout(15) << "doing contents" << dendl;

  // contents
  __u32 nden;
  ::decode(nden, blp);

  for (; nden>0; nden--) {
    num_imported++;

    // dentry
    string dname;
    snapid_t last;
    ::decode(dname, blp);
    ::decode(last, blp);

    // reuse an existing dentry for this (name, snap) or create a null one
    CDentry *dn = dir->lookup_exact_snap(dname, last);
    if (!dn)
      dn = dir->add_null_dentry(dname, 1, last);

    dn->decode_import(blp, ls);

    dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
    if (dn->is_replica(mds->get_nodeid()))
      dn->remove_replica(mds->get_nodeid());

    // dentry lock in unreadable state can block path traverse
    if (dn->lock.get_state() != LOCK_SYNC)
      mds->locker->try_eval(&dn->lock, NULL);

    dout(15) << "decode_import_dir got " << *dn << dendl;

    // points to...
    char icode;
    ::decode(icode, blp);

    if (icode == 'N') {
      // null dentry
      assert(dn->get_linkage()->is_null());

      // fall thru
    }
    else if (icode == 'L') {
      // remote link
      inodeno_t ino;
      unsigned char d_type;
      ::decode(ino, blp);
      ::decode(d_type, blp);
      if (dn->get_linkage()->is_remote()) {
        assert(dn->get_linkage()->get_remote_ino() == ino);
      } else {
        dir->link_remote_inode(dn, ino, d_type);
      }
    }
    else if (icode == 'I') {
      // inode
      assert(le);
      decode_import_inode(dn, blp, oldauth, ls,
                          peer_exports, updated_scatterlocks);
    }

    // add dentry to journal entry
    if (le)
      le->metablob.add_import_dentry(dn);
  }

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  // the imported dir may now satisfy an export pin on its inode
  dir->inode->maybe_export_pin();

  dout(7) << "decode_import_dir done " << *dir << dendl;
  return num_imported;
}
3249
3250
3251
3252
3253
3254// authority bystander
3255
3256/* This function DOES put the passed message before returning*/
3257void Migrator::handle_export_notify(MExportDirNotify *m)
3258{
3259 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
3260 m->put();
3261 return;
3262 }
3263
3264 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3265
3266 mds_rank_t from = mds_rank_t(m->get_source().num());
3267 mds_authority_t old_auth = m->get_old_auth();
3268 mds_authority_t new_auth = m->get_new_auth();
3269
3270 if (!dir) {
3271 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3272 << " on missing dir " << m->get_dirfrag() << dendl;
3273 } else if (dir->authority() != old_auth) {
3274 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3275 << " != " << old_auth << " -> " << new_auth
3276 << " on " << *dir << dendl;
3277 } else {
3278 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3279 << " on " << *dir << dendl;
3280 // adjust auth
3281 set<CDir*> have;
3282 cache->map_dirfrag_set(m->get_bounds(), have);
3283 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3284
3285 // induce a merge?
3286 cache->try_subtree_merge(dir);
3287 }
3288
3289 // send ack
3290 if (m->wants_ack()) {
3291 mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
3292 } else {
3293 // aborted. no ack.
3294 dout(7) << "handle_export_notify no ack requested" << dendl;
3295 }
3296
3297 m->put();
3298}
3299
3300/** cap exports **/
3301void Migrator::export_caps(CInode *in)
3302{
3303 mds_rank_t dest = in->authority().first;
3304 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3305
3306 assert(in->is_any_caps());
3307 assert(!in->is_auth());
3308 assert(!in->is_ambiguous_auth());
3309 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3310
3311 MExportCaps *ex = new MExportCaps;
3312 ex->ino = in->ino();
3313
3314 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3315
3316 mds->send_message_mds(ex, dest);
3317}
3318
1adf2230
AA
/* This function DOES put the passed message before returning*/
/*
 * Handle the importer's ack for caps we sent with export_caps(): for every
 * cap the importer accepted (and whose cap_id still matches ours — the
 * local cap may have changed since we exported), tell the client its cap
 * has moved to the other mds and drop our local Capability. Then refresh
 * our file caps from the auth and re-evaluate the inode's lock state.
 */
void Migrator::handle_export_caps_ack(MExportCapsAck *ack)
{
  mds_rank_t from = ack->get_source().num();
  CInode *in = cache->get_inode(ack->ino);
  if (in) {
    assert(!in->is_auth());

    dout(10) << "handle_export_caps_ack " << *ack << " from "
             << ack->get_source() << " on " << *in << dendl;

    // importer's per-client import info, plus the cap ids it saw from us
    map<client_t,Capability::Import> imported_caps;
    map<client_t,uint64_t> caps_ids;
    auto blp = ack->cap_bl.begin();
    ::decode(imported_caps, blp);
    ::decode(caps_ids, blp);

    for (auto& it : imported_caps) {
      // skip clients whose local cap is gone or was reissued meanwhile
      Capability *cap = in->get_client_cap(it.first);
      if (!cap || cap->get_cap_id() != caps_ids.at(it.first))
        continue;

      dout(7) << __func__ << " telling client." << it.first
              << " exported caps on " << *in << dendl;
      MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
                                       cap->get_cap_id(), cap->get_mseq(),
                                       mds->get_osd_epoch_barrier());
      m->set_cap_peer(it.second.cap_id, it.second.issue_seq, it.second.mseq, from, 0);
      mds->send_message_client_counted(m, it.first);

      in->remove_client_cap(it.first);
    }

    mds->locker->request_inode_file_caps(in);
    mds->locker->try_eval(in, CEPH_CAP_LOCKS);
  }

  ack->put();
}
3358
7c673cae
FG
3359void Migrator::handle_gather_caps(MGatherCaps *m)
3360{
3361 CInode *in = cache->get_inode(m->ino);
7c673cae
FG
3362 if (!in)
3363 goto out;
3364
3365 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
1adf2230
AA
3366 << " on " << *in << dendl;
3367
7c673cae
FG
3368 if (in->is_any_caps() &&
3369 !in->is_auth() &&
3370 !in->is_ambiguous_auth() &&
3371 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3372 export_caps(in);
3373
3374out:
3375 m->put();
3376}
3377
/*
 * Log-completion context for handle_export_caps(): once the ESessions
 * journal entry commits, hand the decoded cap exports (and the sessions
 * force-opened for them) to Migrator::logged_import_caps().
 */
class C_M_LoggedImportCaps : public MigratorLogContext {
  CInode *in;       // inode whose caps are being imported
  mds_rank_t from;  // rank that exported the caps
public:
  // client -> (session, serial) filled by prepare_force_open_sessions()
  map<client_t,pair<Session*,uint64_t> > imported_session_map;
  // decoded cap exports keyed by inode (filled by decode_import_inode_caps)
  map<CInode*, map<client_t,Capability::Export> > peer_exports;

  C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
  void finish(int r) override {
    mig->logged_import_caps(in, from, imported_session_map, peer_exports);
  }
};
3390
/* This function DOES put the passed message before returning*/
/*
 * Auth side of the plain cap-export path: a replica mds is handing us the
 * client caps it holds for one of our inodes. We force-open sessions for
 * any unknown clients, journal them (ESessions), and finish the import in
 * logged_import_caps() once the journal entry commits.
 */
void Migrator::handle_export_caps(MExportCaps *ex)
{
  dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
  CInode *in = cache->get_inode(ex->ino);

  assert(in);
  assert(in->is_auth());

  // FIXME
  // If we can't auth-pin right now the message is simply dropped (hence the
  // FIXME): the exporter gets no ack — TODO confirm the exporter retries.
  if (!in->can_auth_pin()) {
    ex->put();
    return;
  }

  in->auth_pin(this);  // released in logged_import_caps()

  // take the client map out of the message; it is journaled below
  map<client_t,entity_inst_t> client_map;
  client_map.swap(ex->client_map);

  C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
      this, in, mds_rank_t(ex->get_source().num()));

  // Force-open sessions for clients we don't know yet; the resulting
  // session map is consumed by the completion context after journaling.
  version_t pv = mds->server->prepare_force_open_sessions(client_map,
                                                          finish->imported_session_map);
  // decode new caps
  bufferlist::iterator blp = ex->cap_bl.begin();
  decode_import_inode_caps(in, false, blp, finish->peer_exports);
  assert(!finish->peer_exports.empty());   // thus, inode is pinned.

  // journal open client sessions
  ESessions *le = new ESessions(pv, client_map);
  mds->mdlog->start_submit_entry(le, finish);
  mds->mdlog->flush();

  ex->put();
}
3429
3430
/*
 * Completion of handle_export_caps(), run after the ESessions entry
 * commits: finish opening the sessions, instantiate the imported caps on
 * the inode, and ack back to the exporter so it can drop its own caps.
 */
void Migrator::logged_import_caps(CInode *in,
                                  mds_rank_t from,
                                  map<client_t,pair<Session*,uint64_t> >& imported_session_map,
                                  map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  dout(10) << "logged_import_caps on " << *in << dendl;
  // see export_go() vs export_go_synced()
  assert(in->is_auth());

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_session_map);

  auto it = peer_exports.find(in);
  assert(it != peer_exports.end());

  // clients will release caps from the exporter when they receive the cap import message.
  map<client_t,Capability::Import> imported_caps;
  finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
  mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // Ack with per-client import info plus the exporter-side cap ids, so the
  // exporter can match them up and drop its local caps.
  if (!imported_caps.empty()) {
    MExportCapsAck *ack = new MExportCapsAck(in->ino());
    map<client_t,uint64_t> peer_caps_ids;
    for (auto &p : imported_caps )
      peer_caps_ids[p.first] = it->second.at(p.first).cap_id;

    ::encode(imported_caps, ack->cap_bl);
    ::encode(peer_caps_ids, ack->cap_bl);
    mds->send_message_mds(ack, from);
  }

  in->auth_unpin(this);  // pinned in handle_export_caps()
}
28e407b8
AA
3464
3465void Migrator::handle_conf_change(const struct md_config_t *conf,
3466 const std::set <std::string> &changed,
3467 const MDSMap &mds_map)
3468{
3469 if (changed.count("mds_inject_migrator_session_race")) {
3470 inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
3471 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
3472 }
1adf2230
AA
3473
3474 if (changed.count("mds_inject_migrator_message_loss")) {
3475 inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
3476 dout(0) << "mds_inject_migrator_message_loss is " << inject_message_loss << dendl;
3477 }
28e407b8 3478}