// ceph 12.2.12 -- ceph/src/mds/Migrator.cc (imported via the git.proxmox.com mirror)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "MDSRank.h"
16 #include "MDCache.h"
17 #include "CInode.h"
18 #include "CDir.h"
19 #include "CDentry.h"
20 #include "Migrator.h"
21 #include "Locker.h"
22 #include "Server.h"
23
24 #include "MDBalancer.h"
25 #include "MDLog.h"
26 #include "MDSMap.h"
27 #include "Mutation.h"
28
29 #include "include/filepath.h"
30 #include "common/likely.h"
31
32 #include "events/EExport.h"
33 #include "events/EImportStart.h"
34 #include "events/EImportFinish.h"
35 #include "events/ESessions.h"
36
37 #include "msg/Messenger.h"
38
39 #include "messages/MClientCaps.h"
40
41 #include "messages/MExportDirDiscover.h"
42 #include "messages/MExportDirDiscoverAck.h"
43 #include "messages/MExportDirCancel.h"
44 #include "messages/MExportDirPrep.h"
45 #include "messages/MExportDirPrepAck.h"
46 #include "messages/MExportDir.h"
47 #include "messages/MExportDirAck.h"
48 #include "messages/MExportDirNotify.h"
49 #include "messages/MExportDirNotifyAck.h"
50 #include "messages/MExportDirFinish.h"
51
52 #include "messages/MExportCaps.h"
53 #include "messages/MExportCapsAck.h"
54 #include "messages/MGatherCaps.h"
55
56
57 /*
58 * this is what the dir->dir_auth values look like
59 *
60 * dir_auth authbits
61 * export
62 * me me - before
63 * me, me me - still me, but preparing for export
64 * me, them me - send MExportDir (peer is preparing)
65 * them, me me - journaled EExport
66 * them them - done
67 *
68 * import:
69 * them them - before
70 * me, them me - journaled EImportStart
71 * me me - done
72 *
73 * which implies:
74 * - auth bit is set if i am listed as first _or_ second dir_auth.
75 */
76
77 #include "common/config.h"
78
79
80 #define dout_context g_ceph_context
81 #define dout_subsys ceph_subsys_mds
82 #undef dout_prefix
83 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
84
85
86 class MigratorContext : public MDSInternalContextBase {
87 protected:
88 Migrator *mig;
89 MDSRank *get_mds() override {
90 return mig->mds;
91 }
92 public:
93 explicit MigratorContext(Migrator *mig_) : mig(mig_) {
94 assert(mig != NULL);
95 }
96 };
97
98 class MigratorLogContext : public MDSLogContextBase {
99 protected:
100 Migrator *mig;
101 MDSRank *get_mds() override {
102 return mig->mds;
103 }
104 public:
105 explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
106 assert(mig != NULL);
107 }
108 };
109
/* This function DOES put the passed message before returning*/
// Top-level message dispatch for the migrator: routes each migration
// protocol message to its handler.  All handlers consume (put) the
// message.
void Migrator::dispatch(Message *m)
{
  // test hook: drop the message on the floor when its type (relative
  // to MDS_PORT_MIGRATOR) matches the configured injection value.
  if (unlikely(inject_message_loss)) {
    if (inject_message_loss == m->get_type() - MDS_PORT_MIGRATOR) {
      dout(0) << "inject message loss " << *m << dendl;
      m->put();
      return;
    }
  }

  switch (m->get_type()) {
    // import
  case MSG_MDS_EXPORTDIRDISCOVER:
    handle_export_discover(static_cast<MExportDirDiscover*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREP:
    handle_export_prep(static_cast<MExportDirPrep*>(m));
    break;
  case MSG_MDS_EXPORTDIR:
    // test hook: defer handling until some client session connects, to
    // exercise races between session establishment and import.
    if (unlikely(inject_session_race)) {
      dout(0) << "waiting for inject_session_race" << dendl;
      mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
    } else {
      handle_export_dir(static_cast<MExportDir*>(m));
    }
    break;
  case MSG_MDS_EXPORTDIRFINISH:
    handle_export_finish(static_cast<MExportDirFinish*>(m));
    break;
  case MSG_MDS_EXPORTDIRCANCEL:
    handle_export_cancel(static_cast<MExportDirCancel*>(m));
    break;

    // export
  case MSG_MDS_EXPORTDIRDISCOVERACK:
    handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRPREPACK:
    handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRACK:
    handle_export_ack(static_cast<MExportDirAck*>(m));
    break;
  case MSG_MDS_EXPORTDIRNOTIFYACK:
    handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
    break;

    // export 3rd party (dir_auth adjustments)
  case MSG_MDS_EXPORTDIRNOTIFY:
    handle_export_notify(static_cast<MExportDirNotify*>(m));
    break;

    // caps
  case MSG_MDS_EXPORTCAPS:
    handle_export_caps(static_cast<MExportCaps*>(m));
    break;
  case MSG_MDS_EXPORTCAPSACK:
    handle_export_caps_ack(static_cast<MExportCapsAck*>(m));
    break;
  case MSG_MDS_GATHERCAPS:
    handle_gather_caps(static_cast<MGatherCaps*>(m));
    break;

  default:
    derr << "migrator unknown message " << m->get_type() << dendl;
    assert(0 == "migrator unknown message");
  }
}
179
180
181 class C_MDC_EmptyImport : public MigratorContext {
182 CDir *dir;
183 public:
184 C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
185 void finish(int r) override {
186 mig->export_empty_import(dir);
187 }
188 };
189
190
191 void Migrator::export_empty_import(CDir *dir)
192 {
193 dout(7) << "export_empty_import " << *dir << dendl;
194 assert(dir->is_subtree_root());
195
196 if (dir->inode->is_auth()) {
197 dout(7) << " inode is auth" << dendl;
198 return;
199 }
200 if (!dir->is_auth()) {
201 dout(7) << " not auth" << dendl;
202 return;
203 }
204 if (dir->is_freezing() || dir->is_frozen()) {
205 dout(7) << " freezing or frozen" << dendl;
206 return;
207 }
208 if (dir->get_num_head_items() > 0) {
209 dout(7) << " not actually empty" << dendl;
210 return;
211 }
212 if (dir->inode->is_root()) {
213 dout(7) << " root" << dendl;
214 return;
215 }
216
217 mds_rank_t dest = dir->inode->authority().first;
218 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
219
220 dout(7) << " really empty, exporting to " << dest << dendl;
221 assert (dest != mds->get_nodeid());
222
223 dout(7) << "exporting to mds." << dest
224 << " empty import " << *dir << dendl;
225 export_dir( dir, dest );
226 }
227
// Scan exports that are still trying to freeze their subtree
// (DISCOVERING/FREEZING) and cancel any that have made no auth-pin
// progress for mds_freeze_tree_timeout seconds while a remote waiter
// (or a freezing parent) could be deadlocked against us.
void Migrator::find_stale_export_freeze()
{
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_freeze_tree_timeout;


  /*
   * We could have situations like:
   *
   * - mds.0 authpins an item in subtree A
   * - mds.0 sends request to mds.1 to authpin an item in subtree B
   * - mds.0 freezes subtree A
   * - mds.1 authpins an item in subtree B
   * - mds.1 sends request to mds.0 to authpin an item in subtree A
   * - mds.1 freezes subtree B
   * - mds.1 receives the remote authpin request from mds.0
   *   (wait because subtree B is freezing)
   * - mds.0 receives the remote authpin request from mds.1
   *   (wait because subtree A is freezing)
   *
   *
   * - client request authpins items in subtree B
   * - freeze subtree B
   * - import subtree A which is parent of subtree B
   *   (authpins parent inode of subtree B, see CDir::set_dir_auth())
   * - freeze subtree A
   * - client request tries authpinning items in subtree A
   *   (wait because subtree A is freezing)
   */
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end(); ) {
    CDir* dir = p->first;
    export_state_t& stat = p->second;
    ++p;  // advance before acting: export_try_cancel() may erase this entry
    if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
      continue;
    // auth-pin count changed since the last scan: still making progress
    if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
      stat.last_cum_auth_pins = dir->get_cum_auth_pins();
      stat.last_cum_auth_pins_change = now;
      continue;
    }
    if (stat.last_cum_auth_pins_change >= cutoff)
      continue;
    // stalled past the timeout; cancel only if somebody is (or could
    // be) blocked on this freeze
    if (stat.num_remote_waiters > 0 ||
	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      export_try_cancel(dir);
    }
  }
}
278
// Abort an in-flight export of 'dir', undoing exactly what the current
// state has already done (auth pins, freeze, bound pins, bystander
// warnings).  If notify_peer is set and the peer is reachable an
// MExportDirCancel is sent so the importer cleans up too.  Once the
// export is journaled (LOGGINGFINISH/NOTIFYING) the destination's
// failure is irrelevant and we leave the state in place.
void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
{
  dout(10) << "export_try_cancel " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  assert(it != export_state.end());

  int state = it->second.state;
  switch (state) {
  case EXPORT_LOCKING:
    dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
    num_locking_exports--;
    it->second.state = EXPORT_CANCELLED;
    dir->auth_unpin(this);
    break;
  case EXPORT_DISCOVERING:
    dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    dir->auth_unpin(this);
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_FREEZING:
    dout(10) << "export state=freezing : canceling freeze" << dendl;
    it->second.state = EXPORT_CANCELLED;
    dir->unfreeze_tree();  // cancel the freeze
    if (dir->is_subtree_root())
      cache->try_subtree_merge(dir);
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

    // NOTE: state order reversal, warning comes after prepping
  case EXPORT_WARNING:
    dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
    // stay in CANCELLING until bystanders ack the abort notifies
    it->second.state = EXPORT_CANCELLING;
    // fall-thru

  case EXPORT_PREPPING:
    if (state != EXPORT_WARNING) {
      dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
      it->second.state = EXPORT_CANCELLED;
    }

    {
      // unpin bounds
      set<CDir*> bounds;
      cache->get_subtree_bounds(dir, bounds);
      for (set<CDir*>::iterator q = bounds.begin();
	   q != bounds.end();
	   ++q) {
	CDir *bd = *q;
	bd->put(CDir::PIN_EXPORTBOUND);
	bd->state_clear(CDir::STATE_EXPORTBOUND);
      }
      if (state == EXPORT_WARNING) {
	// notify bystanders
	export_notify_abort(dir, it->second, bounds);
	// process delayed expires
	cache->process_delayed_expire(dir);
      }
    }
    dir->unfreeze_tree();
    cache->try_subtree_merge(dir);
    if (notify_peer &&
	(!mds->is_cluster_degraded() ||
	 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
      mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
    break;

  case EXPORT_EXPORTING:
    dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
    it->second.state = EXPORT_CANCELLING;
    export_reverse(dir, it->second);
    break;

  case EXPORT_LOGGINGFINISH:
  case EXPORT_NOTIFYING:
    dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
    // leave export_state, don't clean up now.
    break;
  case EXPORT_CANCELLING:
    break;

  default:
    ceph_abort();
  }

  // finish clean-up?
  if (it->second.state == EXPORT_CANCELLING ||
      it->second.state == EXPORT_CANCELLED) {
    MutationRef mut;
    mut.swap(it->second.mut);

    if (it->second.state == EXPORT_CANCELLED) {
      export_cancel_finish(it);
    }

    // drop locks
    if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
      // early states hold their locks through an internal MDRequest;
      // killing it releases them
      MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
      assert(mdr);
      mds->mdcache->request_kill(mdr);
    } else if (mut) {
      mds->locker->drop_locks(mut.get());
      mut->cleanup();
    }

    cache->show_subtrees();

    maybe_do_queued_export();
  }
}
398
399 void Migrator::export_cancel_finish(export_state_iterator& it)
400 {
401 CDir *dir = it->first;
402 bool unpin = (it->second.state == EXPORT_CANCELLING);
403 auto parent = std::move(it->second.parent);
404
405 total_exporting_size -= it->second.approx_size;
406 export_state.erase(it);
407
408 assert(dir->state_test(CDir::STATE_EXPORTING));
409 dir->clear_exporting();
410
411 if (unpin) {
412 // pinned by Migrator::export_notify_abort()
413 dir->auth_unpin(this);
414 }
415 // send pending import_maps? (these need to go out when all exports have finished.)
416 cache->maybe_send_pending_resolves();
417
418 if (parent)
419 child_export_finish(parent, false);
420 }
421
422 // ==========================================================
423 // mds failure handling
424
// React to mds.who failing or stopping: cancel or repair any of our
// exports and imports that involve it, either as the direct peer or as
// a bystander whose warning/notify acks we were waiting for.
void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
{
  dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;

  // check my exports

  // first add an extra auth_pin on any freezes, so that canceling a
  // nested freeze doesn't complete one further up the hierarchy and
  // confuse the shit out of us.  we'll remove it after canceling the
  // freeze.  this way no freeze completions run before we want them
  // to.
  list<CDir*> pinned_dirs;
  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
       p != export_state.end();
       ++p) {
    if (p->second.state == EXPORT_FREEZING) {
      CDir *dir = p->first;
      dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
      dir->auth_pin(this);
      pinned_dirs.push_back(dir);
    }
  }

  map<CDir*,export_state_t>::iterator p = export_state.begin();
  while (p != export_state.end()) {
    map<CDir*,export_state_t>::iterator next = p;
    ++next;  // grab next first; cancellation below may erase *p
    CDir *dir = p->first;

    // abort exports:
    //  - that are going to the failed node
    //  - that aren't frozen yet (to avoid auth_pin deadlock)
    //  - they haven't prepped yet (they may need to discover bounds to do that)
    if ((p->second.peer == who &&
	 p->second.state != EXPORT_CANCELLING) ||
	p->second.state == EXPORT_LOCKING ||
	p->second.state == EXPORT_DISCOVERING ||
	p->second.state == EXPORT_FREEZING ||
	p->second.state == EXPORT_PREPPING) {
      // the guy i'm exporting to failed, or we're just freezing.
      dout(10) << "cleaning up export state (" << p->second.state << ")"
	       << get_export_statename(p->second.state) << " of " << *dir << dendl;
      export_try_cancel(dir);
    } else if (p->second.peer != who) {
      // bystander failed.
      if (p->second.warning_ack_waiting.erase(who)) {
	if (p->second.state == EXPORT_WARNING) {
	  p->second.notify_ack_waiting.erase(who);  // they won't get a notify either.
	  // exporter waiting for warning acks, let's fake theirs.
	  dout(10) << "faking export_warning_ack from mds." << who
		   << " on " << *dir << " to mds." << p->second.peer
		   << dendl;
	  if (p->second.warning_ack_waiting.empty())
	    export_go(dir);
	}
      }
      if (p->second.notify_ack_waiting.erase(who)) {
	// exporter is waiting for notify acks, fake it
	dout(10) << "faking export_notify_ack from mds." << who
		 << " on " << *dir << " to mds." << p->second.peer
		 << dendl;
	if (p->second.state == EXPORT_NOTIFYING) {
	  if (p->second.notify_ack_waiting.empty())
	    export_finish(dir);
	} else if (p->second.state == EXPORT_CANCELLING) {
	  if (p->second.notify_ack_waiting.empty()) {
	    export_cancel_finish(p);
	  }
	}
      }
    }

    // next!
    p = next;
  }


  // check my imports
  map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
  while (q != import_state.end()) {
    map<dirfrag_t,import_state_t>::iterator next = q;
    ++next;  // grab next first; cleanup below may erase *q
    dirfrag_t df = q->first;
    CInode *diri = mds->mdcache->get_inode(df.ino);
    CDir *dir = mds->mdcache->get_dirfrag(df);

    if (q->second.peer == who) {
      if (dir)
	dout(10) << "cleaning up import state (" << q->second.state << ")"
		 << get_import_statename(q->second.state) << " of " << *dir << dendl;
      else
	dout(10) << "cleaning up import state (" << q->second.state << ")"
		 << get_import_statename(q->second.state) << " of " << df << dendl;

      // each phase undoes only what has been done so far
      switch (q->second.state) {
      case IMPORT_DISCOVERING:
	dout(10) << "import state=discovering : clearing state" << dendl;
	import_reverse_discovering(df);
	break;

      case IMPORT_DISCOVERED:
	assert(diri);
	dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
	import_reverse_discovered(df, diri);
	break;

      case IMPORT_PREPPING:
	assert(dir);
	dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
	import_reverse_prepping(dir, q->second);
	break;

      case IMPORT_PREPPED:
	assert(dir);
	dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
	{
	  set<CDir*> bounds;
	  cache->get_subtree_bounds(dir, bounds);
	  import_remove_pins(dir, bounds);

	  // adjust auth back to the exporter
	  cache->adjust_subtree_auth(dir, q->second.peer);

	  // notify bystanders ; wait in aborting state
	  q->second.state = IMPORT_ABORTING;
	  import_notify_abort(dir, bounds);
	  assert(g_conf->mds_kill_import_at != 10);
	}
	break;

      case IMPORT_LOGGINGSTART:
	assert(dir);
	dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
	import_reverse(dir);
	break;

      case IMPORT_ACKING:
	assert(dir);
	// hrm.  make this an ambiguous import, and wait for exporter recovery to disambiguate
	dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
	{
	  set<CDir*> bounds;
	  cache->get_subtree_bounds(dir, bounds);
	  cache->add_ambiguous_import(dir, bounds);
	}
	break;

      case IMPORT_FINISHING:
	assert(dir);
	dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
	import_finish(dir, true);
	break;

      case IMPORT_ABORTING:
	assert(dir);
	dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
	break;
      }
    } else {
      // failed rank was only a bystander of this import
      auto bystanders_entry = q->second.bystanders.find(who);
      if (bystanders_entry != q->second.bystanders.end()) {
	q->second.bystanders.erase(bystanders_entry);
	if (q->second.state == IMPORT_ABORTING) {
	  assert(dir);
	  dout(10) << "faking export_notify_ack from mds." << who
		   << " on aborting import " << *dir << " from mds." << q->second.peer
		   << dendl;
	  if (q->second.bystanders.empty())
	    import_reverse_unfreeze(dir);
	}
      }
    }

    // next!
    q = next;
  }

  // drop the temporary freeze pins added above
  while (!pinned_dirs.empty()) {
    CDir *dir = pinned_dirs.front();
    dout(10) << "removing temp auth_pin on " << *dir << dendl;
    dir->auth_unpin(this);
    pinned_dirs.pop_front();
  }
}
609
610
611
612 void Migrator::show_importing()
613 {
614 dout(10) << "show_importing" << dendl;
615 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
616 p != import_state.end();
617 ++p) {
618 CDir *dir = mds->mdcache->get_dirfrag(p->first);
619 if (dir) {
620 dout(10) << " importing from " << p->second.peer
621 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
622 << " " << p->first << " " << *dir << dendl;
623 } else {
624 dout(10) << " importing from " << p->second.peer
625 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
626 << " " << p->first << dendl;
627 }
628 }
629 }
630
631 void Migrator::show_exporting()
632 {
633 dout(10) << "show_exporting" << dendl;
634 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
635 p != export_state.end();
636 ++p)
637 dout(10) << " exporting to " << p->second.peer
638 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
639 << " " << p->first->dirfrag() << " " << *p->first << dendl;
640 }
641
642
643
644 void Migrator::audit()
645 {
646 if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
647 return; // hrm.
648
649 // import_state
650 show_importing();
651 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
652 p != import_state.end();
653 ++p) {
654 if (p->second.state == IMPORT_DISCOVERING)
655 continue;
656 if (p->second.state == IMPORT_DISCOVERED) {
657 CInode *in = cache->get_inode(p->first.ino);
658 assert(in);
659 continue;
660 }
661 CDir *dir = cache->get_dirfrag(p->first);
662 assert(dir);
663 if (p->second.state == IMPORT_PREPPING)
664 continue;
665 if (p->second.state == IMPORT_ABORTING) {
666 assert(!dir->is_ambiguous_dir_auth());
667 assert(dir->get_dir_auth().first != mds->get_nodeid());
668 continue;
669 }
670 assert(dir->is_ambiguous_dir_auth());
671 assert(dir->authority().first == mds->get_nodeid() ||
672 dir->authority().second == mds->get_nodeid());
673 }
674
675 // export_state
676 show_exporting();
677 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
678 p != export_state.end();
679 ++p) {
680 CDir *dir = p->first;
681 if (p->second.state == EXPORT_LOCKING ||
682 p->second.state == EXPORT_DISCOVERING ||
683 p->second.state == EXPORT_FREEZING ||
684 p->second.state == EXPORT_CANCELLING)
685 continue;
686 assert(dir->is_ambiguous_dir_auth());
687 assert(dir->authority().first == mds->get_nodeid() ||
688 dir->authority().second == mds->get_nodeid());
689 }
690
691 // ambiguous+me subtrees should be importing|exporting
692
693 // write me
694 }
695
696
697
698
699
700 // ==========================================================
701 // EXPORT
702
703 void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
704 {
705 // enqueue
706 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
707 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
708
709 maybe_do_queued_export();
710 }
711
712 void Migrator::maybe_do_queued_export()
713 {
714 static bool running;
715 if (running)
716 return;
717 running = true;
718
719 uint64_t max_total_size = max_export_size * 2;
720
721 while (!export_queue.empty() &&
722 max_total_size > total_exporting_size &&
723 max_total_size - total_exporting_size >=
724 max_export_size * (num_locking_exports + 1)) {
725
726 dirfrag_t df = export_queue.front().first;
727 mds_rank_t dest = export_queue.front().second;
728 export_queue.pop_front();
729
730 CDir *dir = mds->mdcache->get_dirfrag(df);
731 if (!dir) continue;
732 if (!dir->is_auth()) continue;
733
734 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
735
736 export_dir(dir, dest);
737 }
738
739 running = false;
740 }
741
742
743
744
745 class C_MDC_ExportFreeze : public MigratorContext {
746 CDir *ex; // dir i'm exporting
747 uint64_t tid;
748 public:
749 C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
750 MigratorContext(m), ex(e), tid(t) {
751 assert(ex != NULL);
752 }
753 void finish(int r) override {
754 if (r >= 0)
755 mig->export_frozen(ex, tid);
756 }
757 };
758
759
760 void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
761 {
762 // path
763 vector<CDentry*> trace;
764 cache->make_trace(trace, dir->inode);
765 for (vector<CDentry*>::iterator it = trace.begin();
766 it != trace.end();
767 ++it)
768 locks.insert(&(*it)->lock);
769
770 // prevent scatter gather race
771 locks.insert(&dir->get_inode()->dirfragtreelock);
772
773 // bound dftlocks:
774 // NOTE: We need to take an rdlock on bounding dirfrags during
775 // migration for a rather irritating reason: when we export the
776 // bound inode, we need to send scatterlock state for the dirfrags
777 // as well, so that the new auth also gets the correct info. If we
778 // race with a refragment, this info is useless, as we can't
779 // redivvy it up. And it's needed for the scatterlocks to work
780 // properly: when the auth is in a sync/lock state it keeps each
781 // dirfrag's portion in the local (auth OR replica) dirfrag.
782 set<CDir*> wouldbe_bounds;
783 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
784 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
785 locks.insert(&(*p)->get_inode()->dirfragtreelock);
786 }
787
788
/** export_dir(dir, dest)
 * public method to initiate an export.
 * will fail if the directory is freezing, frozen, unpinnable, or root.
 */
void Migrator::export_dir(CDir *dir, mds_rank_t dest)
{
  dout(7) << "export_dir " << *dir << " to " << dest << dendl;
  assert(dir->is_auth());
  assert(dest != mds->get_nodeid());

  // preconditions on us, the destination, and the cluster
  if (!(mds->is_active() || mds->is_stopping())) {
    dout(7) << "i'm not active, no exports for now" << dendl;
    return;
  }
  if (mds->mdcache->is_readonly()) {
    dout(7) << "read-only FS, no exports for now" << dendl;
    return;
  }
  if (!mds->mdsmap->is_active(dest)) {
    dout(7) << "dest not active, no exports for now" << dendl;
    return;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "cluster degraded, no exports for now" << dendl;
    return;
  }
  if (dir->inode->is_system()) {
    dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
    //ceph_abort();
    return;
  }

  // stray subdirs may only migrate into the destination's own mdsdir;
  // everything else is subject to export pins
  CDir* parent_dir = dir->inode->get_projected_parent_dir();
  if (parent_dir && parent_dir->inode->is_stray()) {
    if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
      dout(7) << "i won't export anything in stray" << dendl;
      return;
    }
  } else {
    if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
      dout(7) << "dir is export pinned" << dendl;
      return;
    }
  }

  if (dir->is_frozen() ||
      dir->is_freezing()) {
    dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
    return;
  }
  if (dir->state_test(CDir::STATE_EXPORTING)) {
    dout(7) << "already exporting" << dendl;
    return;
  }

  if (g_conf->mds_thrash_exports) {
    // create random subtree bound (which will not be exported)
    list<CDir*> ls;
    for (auto p = dir->begin(); p != dir->end(); ++p) {
      auto dn = p->second;
      CDentry::linkage_t *dnl= dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	if (in->is_dir())
	  in->get_nested_dirfrags(ls);
      }
    }
    if (ls.size() > 0) {
      // pick one nested dirfrag at random
      int n = rand() % ls.size();
      auto p = ls.begin();
      while (n--) ++p;
      CDir *bd = *p;
      if (!(bd->is_frozen() || bd->is_freezing())) {
	assert(bd->is_auth());
	// NOTE(review): the flag/auth adjustment below is applied to
	// 'dir' while the log message names 'bd'; this matches
	// upstream, but verify the intent before changing it.
	dir->state_set(CDir::STATE_AUXSUBTREE);
	mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
	dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
      }
    }
  }

  mds->hit_export_target(ceph_clock_now(), dest, -1);

  dir->auth_pin(this);
  dir->mark_exporting();

  // locks are acquired asynchronously through an internal MDRequest;
  // see dispatch_export_dir()
  MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
  mdr->more()->export_dir = dir;

  assert(export_state.count(dir) == 0);
  export_state_t& stat = export_state[dir];
  num_locking_exports++;
  stat.state = EXPORT_LOCKING;
  stat.peer = dest;
  stat.tid = mdr->reqid.tid;
  stat.mut = mdr;

  mds->mdcache->dispatch_request(mdr);
}
888
/*
 * check if directory is too large to be export in whole. If it is,
 * choose some subdirs, whose total size is suitable.
 */
// Iterative depth-first walk over 'dir' and its nested dirfrags,
// accumulating rough size estimates.  Candidate (dirfrag, size) pairs
// are appended to 'results'.  If nothing is selected and either no
// subdir was skipped or null_okay is false, the whole dir is returned
// as a single candidate.
void Migrator::maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
				  vector<pair<CDir*, size_t> >& results)
{
  // rough per-object byte estimates used to approximate export cost
  static const unsigned frag_size = 800;
  static const unsigned inode_size = 1000;
  static const unsigned cap_size = 80;
  static const unsigned remote_size = 10;
  static const unsigned null_size = 1;

  // state for depth-first search
  struct LevelData {
    CDir *dir;
    CDir::dentry_key_map::iterator iter;   // resume point within 'dir'
    size_t dirfrag_size = frag_size;       // accumulated estimate for this dirfrag alone
    size_t subdirs_size = 0;               // total estimate of mergeable child subdirs
    bool complete = true;                  // false if any nested dirfrag had to be skipped
    vector<CDir*> siblings;                // dirfrags still to visit at this level
    vector<pair<CDir*, size_t> > subdirs;  // candidate results gathered below this level
    LevelData(const LevelData&) = default;
    LevelData(CDir *d) :
      dir(d), iter(d->begin()) {}
  };

  vector<LevelData> stack;
  stack.emplace_back(dir);

  size_t found_size = 0;
  size_t skipped_size = 0;

  for (;;) {
    auto& data = stack.back();
    CDir *cur = data.dir;
    auto& it = data.iter;
    auto& dirfrag_size = data.dirfrag_size;

    // walk the dentries of the current dirfrag, accumulating estimates
    while(it != cur->end()) {
      CDentry *dn = it->second;
      ++it;

      dirfrag_size += dn->name.size();
      if (dn->get_linkage()->is_null()) {
	dirfrag_size += null_size;
	continue;
      }
      if (dn->get_linkage()->is_remote()) {
	dirfrag_size += remote_size;
	continue;
      }

      CInode *in = dn->get_linkage()->get_inode();
      dirfrag_size += inode_size;
      dirfrag_size += in->get_client_caps().size() * cap_size;

      if (in->is_dir()) {
	vector<CDir*> ls;
	in->get_nested_dirfrags(ls);
	std::reverse(ls.begin(), ls.end());

	// drop nested dirfrags that cannot be exported right now
	bool complete = true;
	for (auto p = ls.begin(); p != ls.end(); ) {
	  if ((*p)->state_test(CDir::STATE_EXPORTING) ||
	      (*p)->is_freezing_dir() || (*p)->is_frozen_dir()) {
	    complete = false;
	    p = ls.erase(p);
	  } else {
	    ++p;
	  }
	}
	if (!complete) {
	  // skip exporting dir's ancestors. because they can't get
	  // frozen (exporting dir's parent inode is auth pinned).
	  for (auto p = stack.rbegin(); p < stack.rend(); ++p) {
	    if (!p->complete)
	      break;
	    p->complete = false;
	  }
	}
	if (!ls.empty()) {
	  // descend into one nested dirfrag; the rest become siblings
	  stack.emplace_back(ls.back());
	  ls.pop_back();
	  stack.back().siblings.swap(ls);
	  break;
	}
      }
    }
    // did above loop push new dirfrag into the stack?
    if (stack.back().dir != cur)
      continue;

    // current dirfrag fully scanned: fold its size into the parent level
    if (data.complete) {
      auto cur_size = data.subdirs_size + dirfrag_size;
      // we can do nothing with large dirfrag
      if (cur_size >= max_size && found_size * 2 > max_size)
	break;

      found_size += dirfrag_size;

      if (stack.size() > 1) {
	auto& parent = stack[stack.size() - 2];
	parent.subdirs.emplace_back(cur, cur_size);
	parent.subdirs_size += cur_size;
      }
    } else {
      // can't merge current dirfrag to its parent if there is skipped subdir
      results.insert(results.end(), data.subdirs.begin(), data.subdirs.end());
      skipped_size += dirfrag_size;
    }

    vector<CDir*> ls;
    ls.swap(data.siblings);

    stack.pop_back();
    if (stack.empty())
      break;

    if (found_size >= max_size)
      break;

    // next dirfrag
    if (!ls.empty()) {
      stack.emplace_back(ls.back());
      ls.pop_back();
      stack.back().siblings.swap(ls);
    }
  }

  // flush candidates still sitting on the (possibly truncated) stack
  for (auto& p : stack)
    results.insert(results.end(), p.subdirs.begin(), p.subdirs.end());

  // nothing selected: export the whole dir, unless the caller accepts
  // an empty result when some subdirs had to be skipped
  if (results.empty() && (!skipped_size || !null_okay))
    results.emplace_back(dir, found_size + skipped_size);
}
1025
1026 class C_M_ExportDirWait : public MigratorContext {
1027 MDRequestRef mdr;
1028 int count;
1029 public:
1030 C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
1031 : MigratorContext(m), mdr(mdr), count(count) {}
1032 void finish(int r) override {
1033 mig->dispatch_export_dir(mdr, count);
1034 }
1035 };
1036
/*
 * Drive an export request through the EXPORT_LOCKING stage: verify the
 * export is still live, wait (with bounded retries) for the destination
 * to become an export target, acquire the needed locks, and then either
 * proceed to EXPORT_DISCOVERING for the whole subtree or split it into
 * several smaller child exports.
 *
 * 'count' tracks how many MDSMap epochs we have already waited for the
 * destination to appear as an export target.
 */
void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
{
  CDir *dir = mdr->more()->export_dir;
  dout(7) << "dispatch_export_dir " << *mdr << " " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
    // export must have aborted.
    dout(7) << "export must have aborted " << *mdr << dendl;
    assert(mdr->killed || mdr->aborted);
    if (mdr->aborted) {
      mdr->aborted = false;
      mds->mdcache->request_kill(mdr);
    }
    return;
  }
  assert(it->second.state == EXPORT_LOCKING);

  mds_rank_t dest = it->second.peer;

  if (!mds->is_export_target(dest)) {
    dout(7) << "dest is not yet an export target" << dendl;
    if (count > 3) {
      // give up: the destination never appeared in the export-target set
      dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
      export_try_cancel(dir);
      return;
    }

    // drop everything we hold before sleeping on the next map epoch
    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();

    mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
    return;
  }

  if (!dir->inode->get_parent_dn()) {
    // base inode is not yet linked into the namespace; wait for it
    dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
    dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
    return;
  }

  if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
    dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
    export_try_cancel(dir);
    return;
  }

  // locks?
  set<SimpleLock*> rdlocks;
  set<SimpleLock*> xlocks;
  set<SimpleLock*> wrlocks;
  get_export_lock_set(dir, rdlocks);
  // If auth MDS of the subtree root inode is neither the exporter MDS
  // nor the importer MDS and it gathers subtree root's fragstat/neststat
  // while the subtree is exporting. It's possible that the exporter MDS
  // and the importer MDS both are auth MDS of the subtree root or both
  // are not auth MDS of the subtree root at the time they receive the
  // lock messages. So the auth MDS of the subtree root inode may get no
  // or duplicated fragstat/neststat for the subtree root dirfrag.
  wrlocks.insert(&dir->get_inode()->filelock);
  wrlocks.insert(&dir->get_inode()->nestlock);
  if (dir->get_inode()->is_auth()) {
    dir->get_inode()->filelock.set_scatter_wanted();
    dir->get_inode()->nestlock.set_scatter_wanted();
  }

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
    if (mdr->aborted)
      export_try_cancel(dir);
    return;
  }

  assert(g_conf->mds_kill_export_at != 1);

  auto parent = it->second.parent;

  // decide whether the subtree fits in one export or must be split
  vector<pair<CDir*, size_t> > results;
  maybe_split_export(dir, max_export_size, (bool)parent, results);

  if (results.size() == 1 && results.front().first == dir) {
    // the whole subtree fits: move to EXPORT_DISCOVERING
    num_locking_exports--;
    it->second.state = EXPORT_DISCOVERING;
    // send ExportDirDiscover (ask target)
    filepath path;
    dir->inode->make_path(path);
    MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
							  mds->get_nodeid(), it->second.tid);
    mds->send_message_mds(discover, dest);
    assert(g_conf->mds_kill_export_at != 2);

    it->second.last_cum_auth_pins_change = ceph_clock_now();
    it->second.approx_size = results.front().second;
    it->second.orig_size = it->second.approx_size;
    total_exporting_size += it->second.approx_size;

    // start the freeze, but hold it up with an auth_pin.
    dir->freeze_tree();
    assert(dir->is_freezing_tree());
    dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
    return;
  }

  // splitting: attach the child exports to a shared export_base_t so the
  // original subtree can be requeued once all children complete
  if (parent) {
    parent->pending_children += results.size();
  } else {
    parent = std::make_shared<export_base_t>(dir->dirfrag(), dest,
					     results.size(), export_queue_gen);
  }

  if (results.empty()) {
    dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
	    << parent->dirfrag << dendl;
    parent->restart = true;
  } else {
    dout(7) << "subtree is too large, splitting it into: " << dendl;
  }

  // start one new internal export request per sub-piece
  for (auto& p : results) {
    CDir *sub = p.first;
    assert(sub != dir);
    dout(7) << " sub " << *sub << dendl;

    sub->auth_pin(this);
    sub->mark_exporting();

    MDRequestRef _mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
    _mdr->more()->export_dir = sub;

    assert(export_state.count(sub) == 0);
    auto& stat = export_state[sub];
    num_locking_exports++;
    stat.state = EXPORT_LOCKING;
    stat.peer = dest;
    stat.tid = _mdr->reqid.tid;
    stat.mut = _mdr;
    stat.parent = parent;
    mds->mdcache->dispatch_request(_mdr);
  }

  // cancel the original one
  export_try_cancel(dir);
}
1179
/*
 * Abort an export that is still in the DISCOVERING/FREEZING stage and
 * arrange for it to be retried: a top-level export is requeued directly,
 * while a child of a split export notifies its parent so the original
 * subtree can be requeued once all children have finished.
 */
void Migrator::restart_export_dir(CDir *dir, uint64_t tid)
{
  auto it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != tid)
    return;  // export already gone, or a different attempt
  if (it->second.state != EXPORT_DISCOVERING &&
      it->second.state != EXPORT_FREEZING)
    return;  // too far along to restart from here

  dout(7) << "restart_export_dir " << *dir << dendl;

  // detach from the parent split-export (if any) before cancelling,
  // so the cancel path does not see it
  std::shared_ptr<export_base_t> parent;
  parent.swap(it->second.parent);
  if (!parent)
    export_queue.emplace_front(dir->dirfrag(), it->second.peer);

  export_try_cancel(dir);

  // count this child as finished (success => parent marked for restart)
  if (parent)
    child_export_finish(parent, true);
}
1201
1202 class C_MDC_RestartExportDir : public MigratorContext {
1203 CDir *dir;
1204 uint64_t tid;
1205 public:
1206 C_MDC_RestartExportDir(Migrator *m, CDir *d, uint64_t t) :
1207 MigratorContext(m), dir(d), tid(t) {}
1208 void finish(int r) override {
1209 mig->restart_export_dir(dir, tid);
1210 }
1211 };
1212
1213 bool Migrator::adjust_export_size(export_state_t &stat, CDir *dir)
1214 {
1215 if (dir->state_test(CDir::STATE_EXPORTING) ||
1216 dir->is_freezing_dir() || dir->is_frozen_dir())
1217 return false;
1218
1219 if (stat.approx_size >= max_export_size &&
1220 stat.approx_size >= stat.orig_size * 2)
1221 return false;
1222
1223 vector<pair<CDir*, size_t> > results;
1224 maybe_split_export(dir, max_export_size, true, results);
1225 if (results.size() == 1 && results.front().first == dir) {
1226 auto size = results.front().second;
1227 stat.approx_size += size;
1228 total_exporting_size += size;
1229 return true;
1230 }
1231
1232 return false;
1233 }
1234
/*
 * Called after a rename moves inode 'diri' out of 'olddir'.  If the new
 * location lies inside a subtree currently freezing for export, try to
 * absorb the renamed inode's dirfrags into that export; if any of them
 * cannot be absorbed, restart the export instead.
 */
void Migrator::adjust_export_after_rename(CInode* diri, CDir *olddir)
{
  CDir *newdir = diri->get_parent_dir();
  if (newdir == olddir)
    return;  // no effective move

  CDir *freezing_dir = newdir->get_freezing_tree_root();
  CDir *old_freezing_dir = olddir->get_freezing_tree_root();
  if (!freezing_dir || freezing_dir == old_freezing_dir)
    return;  // destination is not inside a (different) freezing export

  dout(7) << "adjust_export_after_rename " << *diri << dendl;

  auto &stat = export_state.at(freezing_dir);
  assert(stat.state == EXPORT_DISCOVERING ||
	 stat.state == EXPORT_FREEZING);

  // thrash testing: sometimes force a restart instead of adjusting
  if (g_conf->mds_thrash_exports) {
    if (rand() % 3 == 0) {
      mds->queue_waiter_front(new C_MDC_RestartExportDir(this, freezing_dir, stat.tid));
      return;
    }
  }

  // absorb each of the renamed inode's dirfrags; on the first failure,
  // restart the whole export
  vector<CDir*> ls;
  diri->get_nested_dirfrags(ls);
  for (auto d : ls) {
    if (!adjust_export_size(stat, d)) {
      mds->queue_waiter_front(new C_MDC_RestartExportDir(this, freezing_dir, stat.tid));
      return;
    }
  }
}
1268
1269 void Migrator::child_export_finish(std::shared_ptr<export_base_t>& parent, bool success)
1270 {
1271 if (success)
1272 parent->restart = true;
1273 if (--parent->pending_children == 0) {
1274 if (parent->restart &&
1275 parent->export_queue_gen == export_queue_gen) {
1276 CDir *origin = mds->mdcache->get_dirfrag(parent->dirfrag);
1277 if (origin && origin->is_auth()) {
1278 dout(7) << "child_export_finish requeue " << *origin << dendl;
1279 export_queue.emplace_front(origin->dirfrag(), parent->dest);
1280 }
1281 }
1282 }
1283 }
1284
1285 /*
1286 * called on receipt of MExportDirDiscoverAck
1287 * the importer now has the directory's _inode_ in memory, and pinned.
1288 *
1289 * This function DOES put the passed message before returning
1290 */
void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  mds_rank_t dest(m->get_source().num());
  utime_t now = ceph_clock_now();
  assert(dir);

  dout(7) << "export_discover_ack from " << m->get_source()
	  << " on " << *dir << dendl;

  mds->hit_export_target(now, dest, -1);

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.tid != m->get_tid() ||
      it->second.peer != dest) {
    // stale ack: the export was cancelled or restarted meanwhile
    dout(7) << "must have aborted" << dendl;
  } else {
    assert(it->second.state == EXPORT_DISCOVERING);

    if (m->is_success()) {
      // release locks to avoid deadlock
      MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
      assert(mdr);
      mds->mdcache->request_finish(mdr);
      it->second.mut.reset();
      // freeze the subtree
      it->second.state = EXPORT_FREEZING;
      // drop the auth_pin that was holding up the freeze started earlier
      dir->auth_unpin(this);
      assert(g_conf->mds_kill_export_at != 3);

    } else {
      dout(7) << "peer failed to discover (not active?), canceling" << dendl;
      export_try_cancel(dir, false);
    }
  }

  m->put(); // done
}
1330
1331 class C_M_ExportSessionsFlushed : public MigratorContext {
1332 CDir *dir;
1333 uint64_t tid;
1334 public:
1335 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1336 : MigratorContext(m), dir(d), tid(t) {
1337 assert(dir != NULL);
1338 }
1339 void finish(int r) override {
1340 mig->export_sessions_flushed(dir, tid);
1341 }
1342 };
1343
/*
 * Callback once all client sessions touching the exported subtree have
 * been flushed.  Removes the MDS_RANK_NONE placeholder from
 * warning_ack_waiting and, if all warning acks are now in, starts the
 * actual export.
 */
void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
{
  dout(7) << "export_sessions_flushed " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.state == EXPORT_CANCELLING ||
      it->second.tid != tid) {
    // export must have aborted.
    dout(7) << "export must have aborted on " << dir << dendl;
    return;
  }

  assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
  // MDS_RANK_NONE is the placeholder inserted for the session-flush gather
  assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
  it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
  if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
    export_go(dir);     // start export.
}
1363
/*
 * The subtree is now frozen.  Re-verify we can take all export locks,
 * then build and send the MExportDirPrep message: the base dirfrag, the
 * list of bystander replicas, and a spanning-tree trace to each export
 * bound so the importer can instantiate the bound inodes/dirfrags before
 * the bulk export arrives.  Finally kick off a flush of all client
 * sessions that hold caps inside the subtree.
 */
void Migrator::export_frozen(CDir *dir, uint64_t tid)
{
  dout(7) << "export_frozen on " << *dir << dendl;

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() || it->second.tid != tid) {
    dout(7) << "export must have aborted" << dendl;
    return;
  }

  assert(it->second.state == EXPORT_FREEZING);
  assert(dir->is_frozen_tree_root());
  assert(dir->get_cum_auth_pins() == 0);

  CInode *diri = dir->get_inode();

  // ok, try to grab all my locks.
  set<SimpleLock*> rdlocks;
  get_export_lock_set(dir, rdlocks);
  if ((diri->is_auth() && diri->is_frozen()) ||
      !mds->locker->can_rdlock_set(rdlocks) ||
      !diri->filelock.can_wrlock(-1) ||
      !diri->nestlock.can_wrlock(-1)) {
    dout(7) << "export_dir couldn't acquire all needed locks, failing. "
	    << *dir << dendl;
    export_try_cancel(dir);
    return;
  }

  // hold the locks on a fresh mutation for the rest of the export
  it->second.mut = new MutationImpl();
  if (diri->is_auth())
    it->second.mut->auth_pin(diri);
  mds->locker->rdlock_take_set(rdlocks, it->second.mut);
  mds->locker->wrlock_force(&diri->filelock, it->second.mut);
  mds->locker->wrlock_force(&diri->nestlock, it->second.mut);

  cache->show_subtrees();

  // CDir::_freeze_tree() should have forced it into subtree.
  assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
  // note the bounds.
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // generate prep message, log entry.
  MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);

  // include list of bystanders
  for (const auto &p : dir->get_replicas()) {
    if (p.first != it->second.peer) {
      dout(10) << "bystander mds." << p.first << dendl;
      prep->add_bystander(p.first);
    }
  }

  // include base dirfrag
  cache->replicate_dir(dir, it->second.peer, prep->basedir);

  /*
   * include spanning tree for all nested exports.
   * these need to be on the destination _before_ the final export so that
   * dir_auth updates on any nested exports are properly absorbed.
   * this includes inodes and dirfrags included in the subtree, but
   * only the inodes at the bounds.
   *
   * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
   */
  set<inodeno_t> inodes_added;
  set<dirfrag_t> dirfrags_added;

  // check bounds
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bound = *p;

    // pin it.
    bound->get(CDir::PIN_EXPORTBOUND);
    bound->state_set(CDir::STATE_EXPORTBOUND);

    dout(7) << " export bound " << *bound << dendl;
    prep->add_bound( bound->dirfrag() );

    // trace to bound: walk from the bound up toward 'dir', prepending
    // replicated dentry+inode (and dirfrag) records as we go
    bufferlist tracebl;
    CDir *cur = bound;

    char start = '-';
    while (1) {
      // don't repeat inodes
      if (inodes_added.count(cur->inode->ino()))
	break;
      inodes_added.insert(cur->inode->ino());

      // prepend dentry + inode
      assert(cur->inode->is_auth());
      bufferlist bl;
      cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
      dout(7) << "  added " << *cur->inode->parent << dendl;
      cache->replicate_inode(cur->inode, it->second.peer, bl,
			     mds->mdsmap->get_up_features());
      dout(7) << "  added " << *cur->inode << dendl;
      bl.claim_append(tracebl);
      tracebl.claim(bl);

      cur = cur->get_parent_dir();

      // don't repeat dirfrags
      if (dirfrags_added.count(cur->dirfrag()) ||
	  cur == dir) {
	start = 'd';  // start with dentry
	break;
      }
      dirfrags_added.insert(cur->dirfrag());

      // prepend dir
      cache->replicate_dir(cur, it->second.peer, bl);
      dout(7) << "  added " << *cur << dendl;
      bl.claim_append(tracebl);
      tracebl.claim(bl);

      start = 'f';  // start with dirfrag
    }
    // final trace = starting dirfrag + start marker + collected records
    bufferlist final_bl;
    dirfrag_t df = cur->dirfrag();
    ::encode(df, final_bl);
    ::encode(start, final_bl);
    final_bl.claim_append(tracebl);
    prep->add_trace(final_bl);
  }

  // send.
  it->second.state = EXPORT_PREPPING;
  mds->send_message_mds(prep, it->second.peer);
  assert (g_conf->mds_kill_export_at != 4);

  // make sure any new instantiations of caps are flushed out
  assert(it->second.warning_ack_waiting.empty());

  set<client_t> export_client_set;
  get_export_client_set(dir, export_client_set);

  MDSGatherBuilder gather(g_ceph_context);
  mds->server->flush_client_sessions(export_client_set, gather);
  if (gather.has_subs()) {
    // MDS_RANK_NONE marks the pending session flush in warning_ack_waiting
    it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
    gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
    gather.activate();
  }
}
1514
1515 void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
1516 {
1517 deque<CDir*> dfs;
1518 dfs.push_back(dir);
1519 while (!dfs.empty()) {
1520 CDir *dir = dfs.front();
1521 dfs.pop_front();
1522 for (auto& p : *dir) {
1523 CDentry *dn = p.second;
1524 if (!dn->get_linkage()->is_primary())
1525 continue;
1526 CInode *in = dn->get_linkage()->get_inode();
1527 if (in->is_dir()) {
1528 // directory?
1529 vector<CDir*> ls;
1530 in->get_dirfrags(ls);
1531 for (auto& q : ls) {
1532 if (!q->state_test(CDir::STATE_EXPORTBOUND)) {
1533 // include nested dirfrag
1534 assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
1535 dfs.push_back(q); // it's ours, recurse (later)
1536 }
1537 }
1538 }
1539 for (auto& q : in->get_client_caps()) {
1540 client_set.insert(q.first);
1541 }
1542 }
1543 }
1544 }
1545
1546 void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1547 {
1548 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1549 q != in->client_caps.end();
1550 ++q)
1551 client_set.insert(q->first);
1552 }
1553
1554 /* This function DOES put the passed message before returning*/
/* This function DOES put the passed message before returning*/
/*
 * Importer acknowledged the prep message.  On success, warn every
 * bystander replica that the subtree's authority is about to change,
 * then (once all warnings are acked) start the actual export.
 */
void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
{
  CDir *dir = cache->get_dirfrag(m->get_dirfrag());
  mds_rank_t dest(m->get_source().num());
  utime_t now = ceph_clock_now();
  assert(dir);

  dout(7) << "export_prep_ack " << *dir << dendl;

  mds->hit_export_target(now, dest, -1);

  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.tid != m->get_tid() ||
      it->second.peer != mds_rank_t(m->get_source().num())) {
    // export must have aborted.
    dout(7) << "export must have aborted" << dendl;
    m->put();
    return;
  }
  assert(it->second.state == EXPORT_PREPPING);

  if (!m->is_success()) {
    dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
    export_try_cancel(dir, false);
    m->put();
    return;
  }

  assert (g_conf->mds_kill_export_at != 5);
  // send warnings
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // only the session-flush placeholder may still be pending
  assert(it->second.warning_ack_waiting.empty() ||
	 (it->second.warning_ack_waiting.size() == 1 &&
	  it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
  assert(it->second.notify_ack_waiting.empty());

  for (const auto &p : dir->get_replicas()) {
    if (p.first == it->second.peer) continue;
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
      continue;  // only if active
    it->second.warning_ack_waiting.insert(p.first);
    it->second.notify_ack_waiting.insert(p.first);  // we'll eventually get a notifyack, too!

    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
						    mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
						    mds_authority_t(mds->get_nodeid(),it->second.peer));
    for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
      notify->get_bounds().push_back((*q)->dirfrag());
    mds->send_message_mds(notify, p.first);

  }

  it->second.state = EXPORT_WARNING;

  assert(g_conf->mds_kill_export_at != 6);
  // nobody to warn?
  if (it->second.warning_ack_waiting.empty())
    export_go(dir);  // start export.

  // done.
  m->put();
}
1621
1622
1623 class C_M_ExportGo : public MigratorContext {
1624 CDir *dir;
1625 uint64_t tid;
1626 public:
1627 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1628 MigratorContext(m), dir(d), tid(t) {
1629 assert(dir != NULL);
1630 }
1631 void finish(int r) override {
1632 mig->export_go_synced(dir, tid);
1633 }
1634 };
1635
1636 void Migrator::export_go(CDir *dir)
1637 {
1638 auto it = export_state.find(dir);
1639 assert(it != export_state.end());
1640 dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl;
1641
1642 // first sync log to flush out e.g. any cap imports
1643 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid));
1644 mds->mdlog->flush();
1645 }
1646
/*
 * Journal is safe: mark the subtree ambiguously authoritative, encode
 * the whole subtree (plus its bounds) into an MExportDir message, and
 * send it to the importer.  Moves the export to EXPORT_EXPORTING.
 */
void Migrator::export_go_synced(CDir *dir, uint64_t tid)
{
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end() ||
      it->second.state == EXPORT_CANCELLING ||
      it->second.tid != tid) {
    // export must have aborted.
    dout(7) << "export must have aborted on " << dir << dendl;
    return;
  }
  assert(it->second.state == EXPORT_WARNING);
  mds_rank_t dest = it->second.peer;

  dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;

  cache->show_subtrees();

  it->second.state = EXPORT_EXPORTING;
  assert(g_conf->mds_kill_export_at != 7);

  assert(dir->is_frozen_tree_root());
  assert(dir->get_cum_auth_pins() == 0);

  // set ambiguous auth
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);

  // take away the popularity we're sending.
  utime_t now = ceph_clock_now();
  mds->balancer->subtract_export(dir, now);

  // fill export message with cache data
  MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
  map<client_t,entity_inst_t> exported_client_map;
  uint64_t num_exported_inodes = encode_export_dir(req->export_data,
						   dir,   // recur start point
						   exported_client_map,
						   now);
  ::encode(exported_client_map, req->client_map,
	   mds->mdsmap->get_up_features());

  // add bounds to message
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p)
    req->add_export((*p)->dirfrag());

  // send
  mds->send_message_mds(req, dest);
  assert(g_conf->mds_kill_export_at != 8);

  mds->hit_export_target(now, dest, num_exported_inodes+1);

  // stats
  if (mds->logger) mds->logger->inc(l_mds_exported);
  if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);

  cache->show_subtrees();
}
1707
1708
1709 /** encode_export_inode
1710 * update our local state for this inode to export.
1711 * encode relevant state to be sent over the wire.
1712 * used by: encode_export_dir, file_rename (if foreign)
1713 *
1714 * FIXME: the separation between CInode.encode_export and these methods
1715 * is pretty arbitrary and dumb.
1716 */
void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
				   map<client_t,entity_inst_t>& exported_client_map)
{
  dout(7) << "encode_export_inode " << *in << dendl;
  assert(!in->is_replica(mds->get_nodeid()));

  // relax locks?
  if (!in->is_replicated()) {
    in->replicate_relax_locks();
    dout(20) << " did replicate_relax_locks, now " << *in << dendl;
  }

  // identity (ino + snap 'last'), then full inode export state
  ::encode(in->inode.ino, enc_state);
  ::encode(in->last, enc_state);
  in->encode_export(enc_state);

  // caps
  encode_export_inode_caps(in, true, enc_state, exported_client_map);
}
1736
1737 void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
1738 map<client_t,entity_inst_t>& exported_client_map)
1739 {
1740 dout(20) << "encode_export_inode_caps " << *in << dendl;
1741
1742 // encode caps
1743 map<client_t,Capability::Export> cap_map;
1744 in->export_client_caps(cap_map);
1745 ::encode(cap_map, bl);
1746 if (auth_cap) {
1747 ::encode(in->get_mds_caps_wanted(), bl);
1748
1749 in->state_set(CInode::STATE_EXPORTINGCAPS);
1750 in->get(CInode::PIN_EXPORTINGCAPS);
1751 }
1752
1753 // make note of clients named by exported capabilities
1754 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1755 it != in->client_caps.end();
1756 ++it)
1757 exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
1758 }
1759
1760 void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1761 map<client_t,Capability::Import>& peer_imported)
1762 {
1763 dout(20) << "finish_export_inode_caps " << *in << dendl;
1764
1765 in->state_clear(CInode::STATE_EXPORTINGCAPS);
1766 in->put(CInode::PIN_EXPORTINGCAPS);
1767
1768 // tell (all) clients about migrating caps..
1769 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1770 it != in->client_caps.end();
1771 ++it) {
1772 Capability *cap = it->second;
1773 dout(7) << "finish_export_inode_caps telling client." << it->first
1774 << " exported caps on " << *in << dendl;
1775 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1776 cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
1777
1778 map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
1779 assert(q != peer_imported.end());
1780 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
1781 (q->second.cap_id > 0 ? peer : -1), 0);
1782 mds->send_message_client_counted(m, it->first);
1783 }
1784 in->clear_client_caps_after_export();
1785 mds->locker->eval(in, CEPH_CAP_LOCKS);
1786 }
1787
/*
 * Transition one exported inode from auth to replica state: drop dirty
 * state, twiddle all lock states, clear the auth flag, and hand caps
 * over to the peer.  Waiters on the inode are collected into 'finished'.
 */
void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
				   map<client_t,Capability::Import>& peer_imported,
				   list<MDSInternalContextBase*>& finished)
{
  dout(12) << "finish_export_inode " << *in << dendl;

  // clean
  if (in->is_dirty())
    in->mark_clean();

  // clear/unpin cached_by (we're no longer the authority)
  in->clear_replica_map();

  // twiddle lock states for auth -> replica transition
  in->authlock.export_twiddle();
  in->linklock.export_twiddle();
  in->dirfragtreelock.export_twiddle();
  in->filelock.export_twiddle();
  in->nestlock.export_twiddle();
  in->xattrlock.export_twiddle();
  in->snaplock.export_twiddle();
  in->flocklock.export_twiddle();
  in->policylock.export_twiddle();

  // mark auth
  assert(in->is_auth());
  in->state_clear(CInode::STATE_AUTH);
  in->replica_nonce = CInode::EXPORT_NONCE;

  in->clear_dirty_rstat();

  // no more auth subtree? clear scatter dirty
  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
    in->clear_scatter_dirty();

  in->item_open_file.remove_myself();

  in->clear_dirty_parent();

  in->clear_file_locks();

  // waiters
  in->take_waiting(CInode::WAIT_ANY_MASK, finished);

  in->finish_export(now);

  finish_export_inode_caps(in, peer, peer_imported);
}
1836
/*
 * Recursively encode a dirfrag and everything below it (stopping at
 * export bounds) into 'exportbl'.  Wire layout per dirfrag:
 *   dirfrag_t, dir export state, dentry count, then per dentry:
 *   name, snap 'last', dentry state, and a one-byte linkage tag
 *   ('N' null, 'L' remote link + ino/d_type, 'I' primary + inode).
 * Returns the number of dentries encoded (all nesting levels).
 */
uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
				     CDir *dir,
				     map<client_t,entity_inst_t>& exported_client_map,
				     utime_t now)
{
  uint64_t num_exported = 0;

  dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;

  assert(dir->get_projected_version() == dir->get_version());

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  // dir
  dirfrag_t df = dir->dirfrag();
  ::encode(df, exportbl);
  dir->encode_export(exportbl);

  __u32 nden = dir->items.size();
  ::encode(nden, exportbl);

  // dentries
  list<CDir*> subdirs;
  for (auto &p : *dir) {
    CDentry *dn = p.second;
    CInode *in = dn->get_linkage()->get_inode();

    if (!dn->is_replicated())
      dn->lock.replicate_relax();

    num_exported++;

    // -- dentry
    dout(7) << "encode_export_dir exporting " << *dn << dendl;

    // dn name
    ::encode(dn->get_name(), exportbl);
    ::encode(dn->last, exportbl);

    // state
    dn->encode_export(exportbl);

    // points to...

    // null dentry?
    if (dn->get_linkage()->is_null()) {
      exportbl.append("N", 1);  // null dentry
      continue;
    }

    if (dn->get_linkage()->is_remote()) {
      // remote link
      exportbl.append("L", 1);  // remote link

      inodeno_t ino = dn->get_linkage()->get_remote_ino();
      unsigned char d_type = dn->get_linkage()->get_remote_d_type();
      ::encode(ino, exportbl);
      ::encode(d_type, exportbl);
      continue;
    }

    // primary link
    // -- inode
    exportbl.append("I", 1);    // inode dentry

    encode_export_inode(in, exportbl, exported_client_map);  // encode, and (update state for) export

    // directory?
    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
      CDir *t = *p;
      if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
	// include nested dirfrag
	assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
	subdirs.push_front(t);  // it's ours, recurse (later)
      }
    }
  }

  // subdirs
  for (auto &dir : subdirs)
    num_exported += encode_export_dir(exportbl, dir, exported_client_map, now);

  return num_exported;
}
1926
/*
 * Recursively transition an exported dirfrag (and everything below it)
 * from auth to replica state, collecting waiters into 'finished' and
 * counting exported dentries into *num_dentries.
 */
void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
				 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
				 list<MDSInternalContextBase*>& finished, int *num_dentries)
{
  dout(10) << "finish_export_dir " << *dir << dendl;

  // release open_by
  dir->clear_replica_map();

  // mark
  assert(dir->is_auth());
  dir->state_clear(CDir::STATE_AUTH);
  dir->remove_bloom();
  dir->replica_nonce = CDir::EXPORT_NONCE;

  if (dir->is_dirty())
    dir->mark_clean();

  // suck up all waiters
  dir->take_waiting(CDir::WAIT_ANY_MASK, finished);  // all dir waiters

  // pop
  dir->finish_export(now);

  // dentries
  list<CDir*> subdirs;
  for (auto &p : *dir) {
    CDentry *dn = p.second;
    CInode *in = dn->get_linkage()->get_inode();

    // dentry
    dn->finish_export();

    // inode?
    if (dn->get_linkage()->is_primary()) {
      finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);

      // subdirs?
      in->get_nested_dirfrags(subdirs);
    }

    cache->touch_dentry_bottom(dn);  // move dentry to tail of LRU
    ++(*num_dentries);
  }

  // subdirs
  for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
    finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
}
1976
1977 class C_MDS_ExportFinishLogged : public MigratorLogContext {
1978 CDir *dir;
1979 public:
1980 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1981 void finish(int r) override {
1982 mig->export_logged_finish(dir);
1983 }
1984 };
1985
1986
1987 /*
1988 * i should get an export_ack from the export target.
1989 *
1990 * This function DOES put the passed message before returning
1991 */
1992 void Migrator::handle_export_ack(MExportDirAck *m)
1993 {
1994 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1995 mds_rank_t dest(m->get_source().num());
1996 utime_t now = ceph_clock_now();
1997 assert(dir);
1998 assert(dir->is_frozen_tree_root()); // i'm exporting!
1999
2000 // yay!
2001 dout(7) << "handle_export_ack " << *dir << dendl;
2002
2003 mds->hit_export_target(now, dest, -1);
2004
2005 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
2006 assert(it != export_state.end());
2007 assert(it->second.state == EXPORT_EXPORTING);
2008 assert(it->second.tid == m->get_tid());
2009
2010 bufferlist::iterator bp = m->imported_caps.begin();
2011 ::decode(it->second.peer_imported, bp);
2012
2013 it->second.state = EXPORT_LOGGINGFINISH;
2014 assert (g_conf->mds_kill_export_at != 9);
2015 set<CDir*> bounds;
2016 cache->get_subtree_bounds(dir, bounds);
2017
2018 // log completion.
2019 // include export bounds, to ensure they're in the journal.
2020 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
2021 mds->mdlog->start_entry(le);
2022
2023 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2024 le->metablob.add_dir(dir, false);
2025 for (set<CDir*>::iterator p = bounds.begin();
2026 p != bounds.end();
2027 ++p) {
2028 CDir *bound = *p;
2029 le->get_bounds().insert(bound->dirfrag());
2030 le->metablob.add_dir_context(bound);
2031 le->metablob.add_dir(bound, false);
2032 }
2033
2034 // list us second, them first.
2035 // this keeps authority().first in sync with subtree auth state in the journal.
2036 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
2037
2038 // log export completion, then finish (unfreeze, trigger finish context, etc.)
2039 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
2040 mds->mdlog->flush();
2041 assert (g_conf->mds_kill_export_at != 10);
2042
2043 m->put();
2044 }
2045
2046 void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
2047 {
2048 dout(7) << "export_notify_abort " << *dir << dendl;
2049
2050 assert(stat.state == EXPORT_CANCELLING);
2051
2052 if (stat.notify_ack_waiting.empty()) {
2053 stat.state = EXPORT_CANCELLED;
2054 return;
2055 }
2056
2057 dir->auth_pin(this);
2058
2059 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
2060 p != stat.notify_ack_waiting.end();
2061 ++p) {
2062 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
2063 pair<int,int>(mds->get_nodeid(), stat.peer),
2064 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
2065 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2066 notify->get_bounds().push_back((*i)->dirfrag());
2067 mds->send_message_mds(notify, *p);
2068 }
2069 }
2070
/*
 * this happens if the dest fails after i send the export data but before it is acked
 * that is, we don't know they safely received and logged it, so we reverse our changes
 * and go on.
 */
/*
 * Roll back a partially-completed export: the export data was sent but
 * never acked, so strip the exporting pins from the whole subtree,
 * notify bystanders of the abort, reclaim sole auth, unfreeze, and
 * re-evaluate any caps that went stale while the export was in flight.
 */
void Migrator::export_reverse(CDir *dir, export_state_t& stat)
{
  dout(7) << "export_reverse " << *dir << dendl;

  // inodes whose stale caps must be re-evaluated once we are auth again
  set<CInode*> to_eval;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove exporting pins: breadth-first walk of every dirfrag in the
  // subtree via the work queue 'rq'
  list<CDir*> rq;
  rq.push_back(dir);
  while (!rq.empty()) {
    CDir *t = rq.front();
    rq.pop_front();
    t->abort_export();
    for (auto &p : *t) {
      CDentry *dn = p.second;
      dn->abort_export();
      if (!dn->get_linkage()->is_primary())
        continue;
      CInode *in = dn->get_linkage()->get_inode();
      in->abort_export();
      if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
        // caps were flagged stale during the export; remember the inode
        // so we can issue/revoke below, after auth is restored
        in->state_clear(CInode::STATE_EVALSTALECAPS);
        to_eval.insert(in);
      }
      if (in->is_dir())
        in->get_nested_dirfrags(rq);  // descend into child dirfrags
    }
  }

  // unpin bounds
  for (auto bd : bounds) {
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  // notify bystanders
  export_notify_abort(dir, stat, bounds);

  // unfreeze tree, with possible subtree merge.
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());

  // process delayed expires
  cache->process_delayed_expire(dir);

  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // revoke/resume stale caps
  for (auto in : to_eval) {
    bool need_issue = false;
    for (auto& p : in->get_client_caps()) {
      Capability *cap = p.second;
      if (!cap->is_stale()) {
        // at least one cap is non-stale, so an issue/eval pass is needed
        need_issue = true;
        break;
      }
    }
    // eval() may itself issue caps; only issue directly if it did not run
    if (need_issue &&
        (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
      mds->locker->issue_caps(in);
  }

  cache->show_cache();
}
2143
2144
2145 /*
2146 * once i get the ack, and logged the EExportFinish(true),
2147 * send notifies (if any), otherwise go straight to finish.
2148 *
2149 */
/*
 * The EExport event is durable: tell each bystander the new auth
 * ((me, peer) -> (peer, unknown)), then wait in EXPORT_NOTIFYING for
 * their acks — or, if there are no bystanders, finish immediately.
 */
void Migrator::export_logged_finish(CDir *dir)
{
  dout(7) << "export_logged_finish " << *dir << dendl;

  export_state_t& stat = export_state[dir];

  // send notifies
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
       p != stat.notify_ack_waiting.end();
       ++p) {
    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
                                                    pair<int,int>(mds->get_nodeid(), stat.peer),
                                                    pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));

    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());

    mds->send_message_mds(notify, *p);
  }

  // wait for notifyacks
  stat.state = EXPORT_NOTIFYING;
  assert (g_conf->mds_kill_export_at != 11);

  // no notifies to wait for?
  if (stat.notify_ack_waiting.empty()) {
    export_finish(dir);  // skip notify/notify_ack stage.
  } else {
    // notify peer to send cap import messages to clients
    if (!mds->is_cluster_degraded() ||
        mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
      // 'false' = not the final finish; the real commit comes in export_finish()
      mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
    } else {
      dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
    }
  }
}
2190
2191 /*
2192 * warning:
2193 * i'll get an ack from each bystander.
2194 * when i get them all, do the export.
2195 * notify:
2196 * i'll get an ack from each bystander.
2197 * when i get them all, unfreeze and send the finish.
2198 *
2199 * This function DOES put the passed message before returning
2200 */
2201 void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
2202 {
2203 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
2204 mds_rank_t dest(m->get_source().num());
2205 utime_t now = ceph_clock_now();
2206 assert(dir);
2207 mds_rank_t from = mds_rank_t(m->get_source().num());
2208
2209 mds->hit_export_target(now, dest, -1);
2210
2211 auto export_state_entry = export_state.find(dir);
2212 if (export_state_entry != export_state.end()) {
2213 export_state_t& stat = export_state_entry->second;
2214 if (stat.state == EXPORT_WARNING &&
2215 stat.warning_ack_waiting.erase(from)) {
2216 // exporting. process warning.
2217 dout(7) << "handle_export_notify_ack from " << m->get_source()
2218 << ": exporting, processing warning on " << *dir << dendl;
2219 if (stat.warning_ack_waiting.empty())
2220 export_go(dir); // start export.
2221 } else if (stat.state == EXPORT_NOTIFYING &&
2222 stat.notify_ack_waiting.erase(from)) {
2223 // exporting. process notify.
2224 dout(7) << "handle_export_notify_ack from " << m->get_source()
2225 << ": exporting, processing notify on " << *dir << dendl;
2226 if (stat.notify_ack_waiting.empty())
2227 export_finish(dir);
2228 } else if (stat.state == EXPORT_CANCELLING &&
2229 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
2230 stat.notify_ack_waiting.erase(from)) {
2231 dout(7) << "handle_export_notify_ack from " << m->get_source()
2232 << ": cancelling export, processing notify on " << *dir << dendl;
2233 if (stat.notify_ack_waiting.empty()) {
2234 export_cancel_finish(export_state_entry);
2235 }
2236 }
2237 }
2238 else {
2239 auto import_state_entry = import_state.find(dir->dirfrag());
2240 if (import_state_entry != import_state.end()) {
2241 import_state_t& stat = import_state_entry->second;
2242 if (stat.state == IMPORT_ABORTING) {
2243 // reversing import
2244 dout(7) << "handle_export_notify_ack from " << m->get_source()
2245 << ": aborting import on " << *dir << dendl;
2246 assert(stat.bystanders.count(from));
2247 stat.bystanders.erase(from);
2248 if (stat.bystanders.empty())
2249 import_reverse_unfreeze(dir);
2250 }
2251 }
2252 }
2253
2254 m->put();
2255 }
2256
/*
 * Final stage of a successful export: send the final MExportDirFinish
 * to the new auth (if still up), strip export state from the subtree,
 * hand over auth, unfreeze, and tear down the export_state entry.  The
 * export was already journaled, so it succeeds even if the target can
 * no longer be notified.
 */
void Migrator::export_finish(CDir *dir)
{
  dout(5) << "export_finish " << *dir << dendl;

  assert (g_conf->mds_kill_export_at != 12);
  map<CDir*,export_state_t>::iterator it = export_state.find(dir);
  if (it == export_state.end()) {
    dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
    return;
  }

  // send finish/commit to new auth
  if (!mds->is_cluster_degraded() ||
      mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
    // 'true' = this is the last/commit finish
    mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
  } else {
    dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
  }
  assert(g_conf->mds_kill_export_at != 13);

  // finish export (adjust local cache state)
  int num_dentries = 0;
  list<MDSInternalContextBase*> finished;
  finish_export_dir(dir, ceph_clock_now(), it->second.peer,
                    it->second.peer_imported, finished, &num_dentries);

  assert(!dir->is_auth());
  cache->adjust_subtree_auth(dir, it->second.peer);

  // unpin bounds
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bd = *p;
    bd->put(CDir::PIN_EXPORTBOUND);
    bd->state_clear(CDir::STATE_EXPORTBOUND);
  }

  if (dir->state_test(CDir::STATE_AUXSUBTREE))
    dir->state_clear(CDir::STATE_AUXSUBTREE);

  // discard delayed expires
  cache->discard_delayed_expire(dir);

  dout(7) << "export_finish unfreezing" << dendl;

  // unfreeze tree, with possible subtree merge.
  // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  // no more auth subtree? clear scatter dirty
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  // stash these before export_state.erase() invalidates 'it'
  MutationRef mut = std::move(it->second.mut);
  auto parent = std::move(it->second.parent);
  // remove from exporting list, clean up state
  total_exporting_size -= it->second.approx_size;
  export_state.erase(it);

  assert(dir->state_test(CDir::STATE_EXPORTING));
  dir->clear_exporting();

  cache->show_subtrees();
  audit();

  cache->trim(num_dentries); // try trimming exported dentries

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // drop locks, unpin path
  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // if this export was a child of a larger migration, tell the parent
  if (parent)
    child_export_finish(parent, true);

  maybe_do_queued_export();
}
2349
2350
2351
2352
2353
2354
2355
2356
2357 // ==========================================================
2358 // IMPORT
2359
/*
 * Importer side, step 1: the exporter asks whether we can take this
 * subtree.  Record (or resume) IMPORT_DISCOVERING state, make sure the
 * base inode is in our cache (discovering it if necessary), pin it, and
 * ack.  NACKs if we are not active.  Puts (or requeues) the message.
 */
void Migrator::handle_export_discover(MExportDirDiscover *m)
{
  mds_rank_t from = m->get_source_mds();
  assert(from != mds->get_nodeid());

  dout(7) << "handle_export_discover on " << m->get_path() << dendl;

  // note import state
  dirfrag_t df = m->get_dirfrag();

  if (!mds->is_active()) {
    dout(7) << " not active, send NACK " << dendl;
    mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
    m->put();
    return;
  }

  // only start discovering on this message once.
  import_state_t *p_state;
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
  if (!m->started) {
    // first delivery: create the import_state entry
    assert(it == import_state.end());
    m->started = true;
    p_state = &import_state[df];
    p_state->state = IMPORT_DISCOVERING;
    p_state->peer = from;
    p_state->tid = m->get_tid();
  } else {
    // am i retrying after ancient path_traverse results?
    if (it == import_state.end() ||
        it->second.peer != from ||
        it->second.tid != m->get_tid()) {
      dout(7) << " dropping obsolete message" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_DISCOVERING);
    p_state = &it->second;
  }

  if (!mds->mdcache->is_open()) {
    // retry once the root is open; message is requeued, not put
    dout(5) << " waiting for root" << dendl;
    mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
    return;
  }

  assert (g_conf->mds_kill_import_at != 1);

  // do we have it?
  CInode *in = cache->get_inode(m->get_dirfrag().ino);
  if (!in) {
    // must discover it!
    filepath fpath(m->get_path());
    vector<CDentry*> trace;
    MDRequestRef null_ref;
    int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
    if (r > 0) return;  // discovery in progress; path_traverse retries us
    if (r < 0) {
      dout(7) << "handle_export_discover failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
      ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
    }

    ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
  }

  // yay
  dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;

  p_state->state = IMPORT_DISCOVERED;

  // pin inode in the cache (for now)
  assert(in->is_dir());
  in->get(CInode::PIN_IMPORTING);

  // reply
  dout(7) << " sending export_discover_ack on " << *in << dendl;
  mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), p_state->peer);
  m->put();
  assert (g_conf->mds_kill_import_at != 2);
}
2440
// Abort an import still in IMPORT_DISCOVERING: nothing has been pinned
// yet, so just drop the state entry.
void Migrator::import_reverse_discovering(dirfrag_t df)
{
  import_state.erase(df);
}
2445
// Abort an import in IMPORT_DISCOVERED: drop the pin taken on the base
// inode by handle_export_discover(), then drop the state entry.
void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
{
  // unpin base
  diri->put(CInode::PIN_IMPORTING);
  import_state.erase(df);
}
2452
2453 void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
2454 {
2455 set<CDir*> bounds;
2456 cache->map_dirfrag_set(stat.bound_ls, bounds);
2457 import_remove_pins(dir, bounds);
2458 import_reverse_final(dir);
2459 }
2460
2461 /* This function DOES put the passed message before returning*/
2462 void Migrator::handle_export_cancel(MExportDirCancel *m)
2463 {
2464 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2465 dirfrag_t df = m->get_dirfrag();
2466 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2467 if (it == import_state.end()) {
2468 assert(0 == "got export_cancel in weird state");
2469 } else if (it->second.state == IMPORT_DISCOVERING) {
2470 import_reverse_discovering(df);
2471 } else if (it->second.state == IMPORT_DISCOVERED) {
2472 CInode *in = cache->get_inode(df.ino);
2473 assert(in);
2474 import_reverse_discovered(df, in);
2475 } else if (it->second.state == IMPORT_PREPPING) {
2476 CDir *dir = mds->mdcache->get_dirfrag(df);
2477 assert(dir);
2478 import_reverse_prepping(dir, it->second);
2479 } else if (it->second.state == IMPORT_PREPPED) {
2480 CDir *dir = mds->mdcache->get_dirfrag(df);
2481 assert(dir);
2482 set<CDir*> bounds;
2483 cache->get_subtree_bounds(dir, bounds);
2484 import_remove_pins(dir, bounds);
2485 // adjust auth back to the exportor
2486 cache->adjust_subtree_auth(dir, it->second.peer);
2487 import_reverse_unfreeze(dir);
2488 } else {
2489 assert(0 == "got export_cancel in weird state");
2490 }
2491 m->put();
2492 }
2493
2494 /* This function DOES put the passed message before returning*/
/*
 * Importer side, step 2: the exporter has sent the subtree skeleton
 * (base dir, bound list, replica traces).  On the first pass we
 * assimilate the replicas and mark bound inodes sticky; on every pass
 * we (re)try to open all bound dirfrags — possibly requeueing ourselves
 * while remote bounds are fetched.  Once all bounds are present we
 * force-wrlock the base inode's file/nest locks, note ambiguous auth
 * (exporter first), freeze the region, and ack.
 */
void Migrator::handle_export_prep(MExportDirPrep *m)
{
  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  assert(oldauth != mds->get_nodeid());

  CDir *dir;
  CInode *diri;
  list<MDSInternalContextBase*> finished;

  // assimilate root dir.
  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
  if (!m->did_assim()) {
    // first pass: decode the base dir replica sent by the exporter
    assert(it != import_state.end());
    assert(it->second.state == IMPORT_DISCOVERED);
    assert(it->second.peer == oldauth);
    diri = cache->get_inode(m->get_dirfrag().ino);
    assert(diri);
    bufferlist::iterator p = m->basedir.begin();
    dir = cache->add_replica_dir(p, diri, oldauth, finished);
    dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
  } else {
    // retry pass: make sure the state entry still matches this message
    if (it == import_state.end() ||
        it->second.peer != oldauth ||
        it->second.tid != m->get_tid()) {
      dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
      m->put();
      return;
    }
    assert(it->second.state == IMPORT_PREPPING);
    assert(it->second.peer == oldauth);

    dir = cache->get_dirfrag(m->get_dirfrag());
    assert(dir);
    dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
    diri = dir->get_inode();
  }
  assert(dir->is_auth() == false);

  cache->show_subtrees();

  // build import bound map
  map<inodeno_t, fragset_t> import_bound_fragset;
  for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
       p != m->get_bounds().end();
       ++p) {
    dout(10) << " bound " << *p << dendl;
    import_bound_fragset[p->ino].insert(p->frag);
  }

  // assimilate contents?
  if (!m->did_assim()) {
    dout(7) << "doing assim on " << *dir << dendl;
    m->mark_assim();  // only do this the first time!

    // change import state
    it->second.state = IMPORT_PREPPING;
    it->second.bound_ls = m->get_bounds();
    it->second.bystanders = m->get_bystanders();
    assert(g_conf->mds_kill_import_at != 3);

    // bystander list
    dout(7) << "bystanders are " << it->second.bystanders << dendl;

    // move pin to dir
    diri->put(CInode::PIN_IMPORTING);
    dir->get(CDir::PIN_IMPORTING);
    dir->state_set(CDir::STATE_IMPORTING);

    // assimilate traces to exports
    // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
    for (list<bufferlist>::iterator p = m->traces.begin();
         p != m->traces.end();
         ++p) {
      bufferlist::iterator q = p->begin();
      dirfrag_t df;
      ::decode(df, q);
      char start;
      ::decode(start, q);
      dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;

      // 'd' = we already have the dirfrag, 'f' = decode it from the
      // trace, '-' = trace starts directly at a dentry
      CDir *cur = 0;
      if (start == 'd') {
        cur = cache->get_dirfrag(df);
        assert(cur);
        dout(10) << " had " << *cur << dendl;
      } else if (start == 'f') {
        CInode *in = cache->get_inode(df.ino);
        assert(in);
        dout(10) << " had " << *in << dendl;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      } else if (start == '-') {
        // nothing
      } else
        assert(0 == "unrecognized start char");

      // walk the (dir dentry inode)* chain, adding replicas as we go
      while (!q.end()) {
        CDentry *dn = cache->add_replica_dentry(q, cur, finished);
        dout(10) << " added " << *dn << dendl;
        CInode *in = cache->add_replica_inode(q, dn, finished);
        dout(10) << " added " << *in << dendl;
        if (q.end())
          break;
        cur = cache->add_replica_dir(q, in, oldauth, finished);
        dout(10) << " added " << *cur << dendl;
      }
    }

    // make bound sticky
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
         p != import_bound_fragset.end();
         ++p) {
      CInode *in = cache->get_inode(p->first);
      assert(in);
      in->get_stickydirs();
      dout(7) << " set stickydirs on bound inode " << *in << dendl;
    }

  } else {
    dout(7) << " not doing assim on " << *dir << dendl;
  }

  if (!finished.empty())
    mds->queue_waiters(finished);


  bool success = true;
  if (mds->is_active()) {
    // open all bounds
    set<CDir*> import_bounds;
    for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
         p != import_bound_fragset.end();
         ++p) {
      CInode *in = cache->get_inode(p->first);
      assert(in);

      // map fragset into a frag_t list, based on the inode fragtree
      list<frag_t> fglist;
      for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        in->dirfragtree.get_leaves_under(*q, fglist);
      dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;

      for (list<frag_t>::iterator q = fglist.begin();
           q != fglist.end();
           ++q) {
        CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
        if (!bound) {
          // fetch the missing bound and retry this whole handler;
          // message is requeued, not put
          dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
          cache->open_remote_dirfrag(in, *q,
                                     new C_MDS_RetryMessage(mds, m));
          return;
        }

        if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
          dout(7) << " pinning import bound " << *bound << dendl;
          bound->get(CDir::PIN_IMPORTBOUND);
          bound->state_set(CDir::STATE_IMPORTBOUND);
        } else {
          dout(7) << " already pinned import bound " << *bound << dendl;
        }
        import_bounds.insert(bound);
      }
    }

    dout(7) << " all ready, noting auth and freezing import region" << dendl;

    if (!mds->mdcache->is_readonly() &&
        diri->filelock.can_wrlock(-1) &&
        diri->nestlock.can_wrlock(-1)) {
      it->second.mut = new MutationImpl();
      // force some locks. hacky.
      mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
      mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);

      // note that i am an ambiguous auth for this subtree.
      // specify bounds, since the exporter explicitly defines the region.
      cache->adjust_bounded_subtree_auth(dir, import_bounds,
                                         pair<int,int>(oldauth, mds->get_nodeid()));
      cache->verify_subtree_bounds(dir, import_bounds);
      // freeze.
      dir->_freeze_tree();
      // note new state
      it->second.state = IMPORT_PREPPED;
    } else {
      dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
      success = false;
    }
  } else {
    dout(7) << " not active, failing. " << *dir << dendl;
    success = false;
  }

  if (!success)
    import_reverse_prepping(dir, it->second);

  // ok!
  dout(7) << " sending export_prep_ack on " << *dir << dendl;
  mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());

  assert(g_conf->mds_kill_import_at != 4);
  // done
  m->put();
}
2698
2699
2700
2701
// Journal-completion context for EImportStart: once the event is
// durable, continue the import via import_logged_start().
class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
  dirfrag_t df;     // dirfrag being imported
  CDir *dir;        // import root
  mds_rank_t from;  // exporting mds rank
public:
  // sessions forced open for imported client caps; populated by
  // handle_export_dir() before the log entry is submitted
  map<client_t,pair<Session*,uint64_t> > imported_session_map;

  C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
    MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
  }
  void finish(int r) override {
    mig->import_logged_start(df, dir, from, imported_session_map);
  }
};
2716
2717 /* This function DOES put the passed message before returning*/
/*
 * Importer side, step 3: the exporter has sent the actual metadata
 * (dirfrags, dentries, inodes, caps) plus the client map.  Decode it
 * into our cache, journal an EImportStart (including bounds and the
 * imported client map), note ambiguous auth (us first), and flush the
 * log; import_logged_start() continues once the event is durable.
 */
void Migrator::handle_export_dir(MExportDir *m)
{
  assert (g_conf->mds_kill_import_at != 5);
  CDir *dir = cache->get_dirfrag(m->dirfrag);
  assert(dir);

  mds_rank_t oldauth = mds_rank_t(m->get_source().num());
  dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;

  assert(!dir->is_auth());

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_PREPPED);
  assert(it->second.tid == m->get_tid());
  assert(it->second.peer == oldauth);

  utime_t now = ceph_clock_now();

  // make sure the imported dirfrag is a leaf of our fragtree
  if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
    dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());

  cache->show_subtrees();

  C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);

  // start the journal entry
  EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
  mds->mdlog->start_entry(le);

  le->metablob.add_dir_context(dir);

  // adjust auth (list us _first_)
  cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);

  // new client sessions, open these after we journal
  // include imported sessions in EImportStart
  bufferlist::iterator cmp = m->client_map.begin();
  map<client_t,entity_inst_t> client_map;
  decode(client_map, cmp);
  assert(cmp.end());
  le->cmapv = mds->server->prepare_force_open_sessions(client_map, onlogged->imported_session_map);
  encode(client_map, le->client_map, mds->mdsmap->get_up_features());

  // decode the exported metadata, one dirfrag at a time
  bufferlist::iterator blp = m->export_data.begin();
  int num_imported_inodes = 0;
  while (!blp.end()) {
    num_imported_inodes +=
      decode_import_dir(blp,
                        oldauth,
                        dir,  // import root
                        le,
                        mds->mdlog->get_current_segment(),
                        it->second.peer_exports,
                        it->second.updated_scatterlocks,
                        now);
  }
  dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;

  // include bounds in EImportStart
  set<CDir*> import_bounds;
  for (vector<dirfrag_t>::iterator p = m->bounds.begin();
       p != m->bounds.end();
       ++p) {
    CDir *bd = cache->get_dirfrag(*p);
    assert(bd);
    le->metablob.add_dir(bd, false);  // note that parent metadata is already in the event
    import_bounds.insert(bd);
  }
  cache->verify_subtree_bounds(dir, import_bounds);

  // adjust popularity
  mds->balancer->add_import(dir, now);

  dout(7) << "handle_export_dir did " << *dir << dendl;

  // note state
  it->second.state = IMPORT_LOGGINGSTART;
  assert (g_conf->mds_kill_import_at != 6);

  // log it
  mds->mdlog->submit_entry(le, onlogged);
  mds->mdlog->flush();

  // some stats
  if (mds->logger) {
    mds->logger->inc(l_mds_imported);
    mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
  }

  m->put();
}
2810
2811
2812 /*
2813 * this is an import helper
2814 * called by import_finish, and import_reverse and friends.
2815 */
2816 void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2817 {
2818 import_state_t& stat = import_state[dir->dirfrag()];
2819 // root
2820 dir->put(CDir::PIN_IMPORTING);
2821 dir->state_clear(CDir::STATE_IMPORTING);
2822
2823 // bounding inodes
2824 set<inodeno_t> did;
2825 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2826 p != stat.bound_ls.end();
2827 ++p) {
2828 if (did.count(p->ino))
2829 continue;
2830 did.insert(p->ino);
2831 CInode *in = cache->get_inode(p->ino);
2832 assert(in);
2833 in->put_stickydirs();
2834 }
2835
2836 if (stat.state == IMPORT_PREPPING) {
2837 for (auto bd : bounds) {
2838 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2839 bd->put(CDir::PIN_IMPORTBOUND);
2840 bd->state_clear(CDir::STATE_IMPORTBOUND);
2841 }
2842 }
2843 } else if (stat.state >= IMPORT_PREPPED) {
2844 // bounding dirfrags
2845 for (auto bd : bounds) {
2846 assert(bd->state_test(CDir::STATE_IMPORTBOUND));
2847 bd->put(CDir::PIN_IMPORTBOUND);
2848 bd->state_clear(CDir::STATE_IMPORTBOUND);
2849 }
2850 }
2851 }
2852
// Context that, when fired, requeues a batch of collected waiters at
// the front of the MDS waiter queue so they run right after this
// context itself.
class C_MDC_QueueContexts : public MigratorContext {
public:
  list<MDSInternalContextBase*> contexts;  // waiters gathered by the caller
  C_MDC_QueueContexts(Migrator *m) : MigratorContext(m) {}
  void finish(int r) override {
    // execute contexts immediately after 'this' context
    get_mds()->queue_waiters_front(contexts);
  }
};
2862
/*
 * note: this does the full work of reversing an import and cleaning up
 * state.
 * called by both handle_mds_failure and by handle_resolve (if we are
 * a survivor coping with an exporter failure+recovery).
 */
void Migrator::import_reverse(CDir *dir)
{
  dout(7) << "import_reverse " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  stat.state = IMPORT_ABORTING;

  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  // remove pins
  import_remove_pins(dir, bounds);

  // update auth, with possible subtree merge.
  assert(dir->is_subtree_root());
  if (mds->is_resolve())
    cache->trim_non_auth_subtree(dir);

  cache->adjust_subtree_auth(dir, stat.peer);

  // collect waiters to requeue once the tree is unfrozen (see below)
  auto fin = new C_MDC_QueueContexts(this);
  if (!dir->get_inode()->is_auth() &&
      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dir->get_inode()->clear_scatter_dirty();
    // wake up scatter_nudge waiters
    dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
  }

  int num_dentries = 0;
  // adjust auth bits: breadth-first walk of the subtree via 'q',
  // stripping AUTH state, replicas, nonces and dirty flags from every
  // dirfrag, dentry and primary inode we imported
  list<CDir*> q;
  q.push_back(dir);
  while (!q.empty()) {
    CDir *cur = q.front();
    q.pop_front();

    // dir
    assert(cur->is_auth());
    cur->state_clear(CDir::STATE_AUTH);
    cur->remove_bloom();
    cur->clear_replica_map();
    cur->set_replica_nonce(CDir::EXPORT_NONCE);
    if (cur->is_dirty())
      cur->mark_clean();

    for (auto &p : *cur) {
      CDentry *dn = p.second;

      // dentry
      dn->state_clear(CDentry::STATE_AUTH);
      dn->clear_replica_map();
      dn->set_replica_nonce(CDentry::EXPORT_NONCE);
      if (dn->is_dirty())
        dn->mark_clean();

      // inode?
      if (dn->get_linkage()->is_primary()) {
        CInode *in = dn->get_linkage()->get_inode();
        in->state_clear(CDentry::STATE_AUTH);
        in->clear_replica_map();
        in->set_replica_nonce(CInode::EXPORT_NONCE);
        if (in->is_dirty())
          in->mark_clean();
        in->clear_dirty_rstat();
        if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
          in->clear_scatter_dirty();
          in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
        }

        in->clear_dirty_parent();

        // drop any lock gather state accumulated while we were auth
        in->authlock.clear_gather();
        in->linklock.clear_gather();
        in->dirfragtreelock.clear_gather();
        in->filelock.clear_gather();

        in->clear_file_locks();

        // non-bounding dir?
        list<CDir*> dfs;
        in->get_dirfrags(dfs);
        for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
          if (bounds.count(*p) == 0)
            q.push_back(*p);
      }

      cache->touch_dentry_bottom(dn);  // move dentry to tail of LRU
      ++num_dentries;
    }
  }

  dir->add_waiter(CDir::WAIT_UNFREEZE, fin);

  // NOTE(review): stat.state was unconditionally set to IMPORT_ABORTING
  // at the top of this function and nothing visible here resets it, so
  // this branch appears unreachable as written — confirm whether the
  // pre-abort state should be captured before the overwrite.
  if (stat.state == IMPORT_ACKING) {
    // remove imported caps
    for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
         p != stat.peer_exports.end();
         ++p) {
      CInode *in = p->first;
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
           q != p->second.end();
           ++q) {
        Capability *cap = in->get_client_cap(q->first);
        if (!cap) {
          assert(!stat.session_map.count(q->first));
          continue;
        }
        if (cap->is_importing())
          in->remove_client_cap(q->first);
      }
      in->put(CInode::PIN_IMPORTINGCAPS);
    }
    for (auto& p : stat.session_map) {
      Session *session = p.second.first;
      session->dec_importing();
    }
  }

  // log our failure
  mds->mdlog->start_submit_entry(new EImportFinish(dir, false));  // log failure

  cache->trim(num_dentries); // try trimming dentries

  // notify bystanders; wait in aborting state
  import_notify_abort(dir, bounds);
}
2995
2996 void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2997 {
2998 dout(7) << "import_notify_finish " << *dir << dendl;
2999
3000 import_state_t& stat = import_state[dir->dirfrag()];
3001 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
3002 p != stat.bystanders.end();
3003 ++p) {
3004 MExportDirNotify *notify =
3005 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
3006 pair<int,int>(stat.peer, mds->get_nodeid()),
3007 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
3008 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
3009 notify->get_bounds().push_back((*i)->dirfrag());
3010 mds->send_message_mds(notify, *p);
3011 }
3012 }
3013
/*
 * Tell bystanders we are aborting the import: auth reverts to the
 * exporter alone.  Bystanders that are down in the current map are
 * dropped from the wait set instead of being messaged.  If no live
 * bystanders remain, unfreeze and finish the reversal immediately;
 * otherwise stay in IMPORT_ABORTING until their notify acks arrive
 * (handled in handle_export_notify_ack).
 */
void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
{
  dout(7) << "import_notify_abort " << *dir << dendl;

  import_state_t& stat = import_state[dir->dirfrag()];
  // note: erase-while-iterating; 'p' is only advanced after a
  // successful send, and post-incremented through erase() otherwise
  for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
       p != stat.bystanders.end(); ) {
    if (mds->is_cluster_degraded() &&
        !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
      // this can happen if both exporter and bystander fail in the same mdsmap epoch
      stat.bystanders.erase(p++);
      continue;
    }
    MExportDirNotify *notify =
      new MExportDirNotify(dir->dirfrag(), stat.tid, true,
                           mds_authority_t(stat.peer, mds->get_nodeid()),
                           mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
      notify->get_bounds().push_back((*i)->dirfrag());
    mds->send_message_mds(notify, *p);
    ++p;
  }
  if (stat.bystanders.empty()) {
    dout(7) << "no bystanders, finishing reverse now" << dendl;
    import_reverse_unfreeze(dir);
  } else {
    assert (g_conf->mds_kill_import_at != 10);
  }
}
3043
// Unfreeze the reverted import subtree (possibly merging it back into a
// parent subtree) and proceed to the final cleanup.
void Migrator::import_reverse_unfreeze(CDir *dir)
{
  dout(7) << "import_reverse_unfreeze " << *dir << dendl;
  assert(!dir->is_auth());
  // cache expires delayed while the tree was frozen are no longer wanted
  cache->discard_delayed_expire(dir);
  dir->unfreeze_tree();
  if (dir->is_subtree_root())
    cache->try_subtree_merge(dir);
  import_reverse_final(dir);
}
3054
// Final cleanup for a reversed (failed) import: drop the import_state
// entry, release any locks held by the import's mutation, and kick any
// resolve messages that were waiting on this import.
void Migrator::import_reverse_final(CDir *dir)
{
  dout(7) << "import_reverse_final " << *dir << dendl;

  // clean up
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());

  // take a local ref to the mutation before erase() destroys the entry,
  // so we can still drop its locks below
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  cache->show_subtrees();
  //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
}
3077
3078
3079
3080
// Called once EImportStart has been journaled.  If the import is still
// alive, advance to IMPORT_ACKING: finish force-opening the client
// sessions, merge the exporter's caps into our inodes (without telling
// the clients yet -- peer is NONE), and send MExportDirAck back to the
// old auth mds with the imported cap map.
//
// NOTE(review): the state lookup uses dir->dirfrag() while the abort
// message prints 'df' -- presumably these always match; confirm with callers.
void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
				   map<client_t,pair<Session*,uint64_t> >& imported_session_map)
{
  map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
  if (it == import_state.end() ||
      it->second.state != IMPORT_LOGGINGSTART) {
    // import aborted while we were journaling; just tidy up the sessions
    // we had started force-opening
    dout(7) << "import " << df << " must have aborted" << dendl;
    mds->server->finish_force_open_sessions(imported_session_map);
    return;
  }

  dout(7) << "import_logged " << *dir << dendl;

  // note state
  it->second.state = IMPORT_ACKING;

  assert (g_conf->mds_kill_import_at != 7);

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_session_map, false);

  map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
       p != it->second.peer_exports.end();
       ++p) {
    // parameter 'peer' is NONE, delay sending cap import messages to client
    finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
			     p->second, imported_caps[p->first->ino()]);
  }

  // stash the session map; import_finish() uses it to complete the caps
  it->second.session_map.swap(imported_session_map);

  // send notify's etc.
  dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;

  // test surviving observer of a failed migration that did not complete
  //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);

  MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
  ::encode(imported_caps, ack->imported_caps);

  mds->send_message_mds(ack, from);
  assert (g_conf->mds_kill_import_at != 8);

  cache->show_subtrees();
}
3127
3128 /* This function DOES put the passed message before returning*/
3129 void Migrator::handle_export_finish(MExportDirFinish *m)
3130 {
3131 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3132 assert(dir);
3133 dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;
3134
3135 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
3136 assert(it != import_state.end());
3137 assert(it->second.tid == m->get_tid());
3138
3139 import_finish(dir, false, m->is_last());
3140
3141 m->put();
3142 }
3143
// Complete an import.  Reached either from handle_export_finish()
// (notify=false) or other callers with notify=true.  The first call in
// IMPORT_ACKING claims subtree auth and completes the client caps; if
// 'last' is false we only advance to IMPORT_FINISHING and return,
// leaving the heavy cleanup (journal EImportFinish, unpin, unfreeze)
// for the final call.
void Migrator::import_finish(CDir *dir, bool notify, bool last)
{
  dout(7) << "import_finish on " << *dir << dendl;

  map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
  assert(it != import_state.end());
  assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);

  if (it->second.state == IMPORT_ACKING) {
    // we are now unambiguously auth for the subtree
    assert(dir->is_auth());
    cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
  }

  // log finish
  assert(g_conf->mds_kill_import_at != 9);

  if (it->second.state == IMPORT_ACKING) {
    // merge the exporter's caps into ours and finally tell the clients
    // (do_cap_import) -- this was deferred in import_logged_start()
    for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
	 p != it->second.peer_exports.end();
	 ++p) {
      CInode *in = p->first;
      assert(in->is_auth());
      for (map<client_t,Capability::Export>::iterator q = p->second.begin();
	   q != p->second.end();
	   ++q) {
	auto r = it->second.session_map.find(q->first);
	if (r == it->second.session_map.end())
	  continue;  // client has no session here; skip its cap

	Session *session = r->second.first;
	Capability *cap = in->get_client_cap(q->first);
	assert(cap);
	cap->merge(q->second, true);
	cap->clear_importing();
	mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
				    q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
      }
      p->second.clear();
      in->replica_caps_wanted = 0;
    }
    // balance the inc_importing done when sessions were force-opened
    for (auto& p : it->second.session_map) {
      Session *session = p.second.first;
      session->dec_importing();
    }
  }

  if (!last) {
    // first of two finish steps: just note that we're waiting for the
    // final MExportDirFinish
    assert(it->second.state == IMPORT_ACKING);
    it->second.state = IMPORT_FINISHING;
    return;
  }

  // remove pins
  set<CDir*> bounds;
  cache->get_subtree_bounds(dir, bounds);

  if (notify)
    import_notify_finish(dir, bounds);

  import_remove_pins(dir, bounds);

  // move peer_exports out before erasing the state entry, so we can
  // still re-eval / unpin the inodes below
  map<CInode*, map<client_t,Capability::Export> > peer_exports;
  it->second.peer_exports.swap(peer_exports);

  // clear import state (we're done!)
  MutationRef mut = it->second.mut;
  import_state.erase(it);

  mds->mdlog->start_submit_entry(new EImportFinish(dir, true));

  // process delayed expires
  cache->process_delayed_expire(dir);

  // unfreeze tree, with possible subtree merge.
  dir->unfreeze_tree();
  cache->try_subtree_merge(dir);

  cache->show_subtrees();
  //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)

  if (mut) {
    mds->locker->drop_locks(mut.get());
    mut->cleanup();
  }

  // re-eval imported caps
  for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
       p != peer_exports.end();
       ++p) {
    if (p->first->is_auth())
      mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
    p->first->put(CInode::PIN_IMPORTINGCAPS);  // taken in decode_import_inode_caps()
  }

  // send pending import_maps?
  mds->mdcache->maybe_send_pending_resolves();

  // did i just import mydir?
  if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
    cache->populate_mydir();

  // is it empty?  re-export an empty import whose inode isn't ours
  if (dir->get_num_head_items() == 0 &&
      !dir->inode->is_auth()) {
    // reexport!
    export_empty_import(dir);
  }
}
3252
3253
// Decode one primary inode from the import stream and link it under dn.
// Wire order is: ino, last snapid, inode state, cap exports.  Cap data
// is only recorded in peer_exports here; the actual cap import happens
// later (finish_import_inode_caps).  Dirty scatterlocks are queued in
// updated_scatterlocks so they are journaled when the import finishes.
void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
				   mds_rank_t oldauth, LogSegment *ls,
				   map<CInode*, map<client_t,Capability::Export> >& peer_exports,
				   list<ScatterLock*>& updated_scatterlocks)
{
  dout(15) << "decode_import_inode on " << *dn << dendl;

  inodeno_t ino;
  snapid_t last;
  ::decode(ino, blp);
  ::decode(last, blp);

  bool added = false;
  CInode *in = cache->get_inode(ino, last);
  if (!in) {
    // not in cache yet (or a failed export left a stale copy) -- make a
    // fresh auth inode
    in = new CInode(mds->mdcache, true, 1, last);
    added = true;
  }

  // state after link -- or not! -sage
  in->decode_import(blp, ls); // cap imports are noted for later action

  // caps
  decode_import_inode_caps(in, true, blp, peer_exports);

  // link before state -- or not! -sage
  if (dn->get_linkage()->get_inode() != in) {
    assert(!dn->get_linkage()->get_inode());
    dn->dir->link_primary_inode(dn, in);
  }

  if (in->is_dir())
    dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);

  // add inode?
  if (added) {
    cache->add_inode(in);
    dout(10) << "added " << *in << dendl;
  } else {
    dout(10) << " had " << *in << dendl;
  }

  if (in->inode.is_dirty_rstat())
    in->mark_dirty_rstat();

  // clear if dirtyscattered, since we're going to journal this
  // but not until we _actually_ finish the import...
  if (in->filelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->filelock);
    mds->locker->mark_updated_scatterlock(&in->filelock);
  }

  if (in->dirfragtreelock.is_dirty()) {
    updated_scatterlocks.push_back(&in->dirfragtreelock);
    mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
  }

  // adjust replica list
  //assert(!in->is_replica(oldauth)); // not true on failed export
  in->add_replica(oldauth, CInode::EXPORT_NONCE);
  if (in->is_replica(mds->get_nodeid()))
    in->remove_replica(mds->get_nodeid());
}
3317
// Decode the per-client cap export map for one inode.  For the auth cap
// we also decode the aggregated mds_caps_wanted.  If any caps (or
// non-trivial wanted bits) came across, stash them in peer_exports and
// pin the inode until finish_import_inode_caps() / import_finish()
// releases it.
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
					bufferlist::iterator &blp,
					map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  map<client_t,Capability::Export> cap_map;
  ::decode(cap_map, blp);
  if (auth_cap)
    ::decode(in->get_mds_caps_wanted(), blp);
  if (!cap_map.empty() ||
      (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
    peer_exports[in].swap(cap_map);
    in->get(CInode::PIN_IMPORTINGCAPS);  // dropped when caps are finished
  }
}
3332
// Turn the decoded cap exports for one inode into real Capability
// objects.  Two modes:
//  - peer >= 0 (cap-only import): merge each cap and immediately send
//    the client its import message, then drop the IMPORTINGCAPS pin.
//  - peer == MDS_RANK_NONE (subtree import): caps are created but
//    marked importing; merging and client notification are deferred to
//    import_finish().
// import_map collects the Import records sent back to the exporter.
void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
					const map<client_t,pair<Session*,uint64_t> >& session_map,
					const map<client_t,Capability::Export> &export_map,
					map<client_t,Capability::Import> &import_map)
{
  for (auto& it : export_map) {
    dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;

    auto p = session_map.find(it.first);
    if (p == session_map.end()) {
      dout(10) << " no session for client." << it.first << dendl;
      // record an empty (default) Import so the exporter still hears
      // about this client
      (void)import_map[it.first];
      continue;
    }

    Session *session = p->second.first;

    Capability *cap = in->get_client_cap(it.first);
    if (!cap) {
      cap = in->add_client_cap(it.first, session);
      if (peer < 0)
	cap->mark_importing();  // deferred path; cleared in import_finish()
    }

    // Always ask exporter mds to send cap export messages for auth caps.
    // For non-auth caps, ask exporter mds to send cap export messages to
    // clients who haven't opened sessions. The cap export messages will
    // make clients open sessions.
    if (auth_cap || session->connection == nullptr) {
      Capability::Import& im = import_map[it.first];
      im.cap_id = cap->get_cap_id();
      im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
      im.issue_seq = cap->get_last_seq() + 1;
    }

    if (peer >= 0) {
      // immediate path: merge and tell the client now
      cap->merge(it.second, auth_cap);
      mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
				  it.second.seq, it.second.mseq - 1, peer,
				  auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
    }
  }

  if (peer >= 0) {
    in->replica_caps_wanted = 0;
    in->put(CInode::PIN_IMPORTINGCAPS);  // taken in decode_import_inode_caps()
  }
}
3381
// Decode one dirfrag (and its dentries/inodes) from the import stream.
// Wire order: dirfrag_t, dir state, dentry count, then per dentry the
// name/snapid/lock state and an icode ('N' null, 'L' remote link,
// 'I' primary inode).  Adds everything to the EImportStart metablob (if
// le is non-null) and returns the number of dentries imported.
int Migrator::decode_import_dir(bufferlist::iterator& blp,
				mds_rank_t oldauth,
				CDir *import_root,
				EImportStart *le,
				LogSegment *ls,
				map<CInode*,map<client_t,Capability::Export> >& peer_exports,
				list<ScatterLock*>& updated_scatterlocks, utime_t now)
{
  // set up dir
  dirfrag_t df;
  ::decode(df, blp);

  // the inode must already be in cache (decoded earlier in the stream
  // or replicated beforehand)
  CInode *diri = cache->get_inode(df.ino);
  assert(diri);
  CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
  assert(dir);

  dout(7) << "decode_import_dir " << *dir << dendl;

  // assimilate state
  dir->decode_import(blp, now, ls);

  // adjust replica list
  //assert(!dir->is_replica(oldauth)); // not true on failed export
  dir->add_replica(oldauth, CDir::EXPORT_NONCE);
  if (dir->is_replica(mds->get_nodeid()))
    dir->remove_replica(mds->get_nodeid());

  // add to journal entry
  if (le)
    le->metablob.add_import_dir(dir);

  int num_imported = 0;

  // take all waiters on this dir
  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
  // a replica's presense in my cache implies/forces it's presense in authority's.
  list<MDSInternalContextBase*> waiters;

  // re-home waiters onto the import root's UNFREEZE list, which fires
  // whether the import succeeds or is reversed
  dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
  for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
       it != waiters.end();
       ++it)
    import_root->add_waiter(CDir::WAIT_UNFREEZE, *it);  // UNFREEZE will get kicked both on success or failure

  dout(15) << "doing contents" << dendl;

  // contents
  __u32 nden;
  ::decode(nden, blp);

  for (; nden>0; nden--) {
    num_imported++;

    // dentry
    string dname;
    snapid_t last;
    ::decode(dname, blp);
    ::decode(last, blp);

    CDentry *dn = dir->lookup_exact_snap(dname, last);
    if (!dn)
      dn = dir->add_null_dentry(dname, 1, last);

    dn->decode_import(blp, ls);

    dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
    if (dn->is_replica(mds->get_nodeid()))
      dn->remove_replica(mds->get_nodeid());

    // dentry lock in unreadable state can block path traverse
    if (dn->lock.get_state() != LOCK_SYNC)
      mds->locker->try_eval(&dn->lock, NULL);

    dout(15) << "decode_import_dir got " << *dn << dendl;

    // points to...
    char icode;
    ::decode(icode, blp);

    if (icode == 'N') {
      // null dentry
      assert(dn->get_linkage()->is_null());

      // fall thru
    }
    else if (icode == 'L') {
      // remote link
      inodeno_t ino;
      unsigned char d_type;
      ::decode(ino, blp);
      ::decode(d_type, blp);
      if (dn->get_linkage()->is_remote()) {
	// already linked (e.g. after a failed export); must agree
	assert(dn->get_linkage()->get_remote_ino() == ino);
      } else {
	dir->link_remote_inode(dn, ino, d_type);
      }
    }
    else if (icode == 'I') {
      // inode
      assert(le);
      decode_import_inode(dn, blp, oldauth, ls,
			  peer_exports, updated_scatterlocks);
    }

    // add dentry to journal entry
    if (le)
      le->metablob.add_import_dentry(dn);
  }

#ifdef MDS_VERIFY_FRAGSTAT
  if (dir->is_complete())
    dir->verify_fragstat();
#endif

  dir->inode->maybe_export_pin();

  dout(7) << "decode_import_dir done " << *dir << dendl;
  return num_imported;
}
3502
3503
3504
3505
3506
3507 // authority bystander
3508
3509 /* This function DOES put the passed message before returning*/
// Bystander path: another pair of MDSs is migrating a subtree we
// replicate.  Update our notion of the dirfrag's authority (old_auth ->
// new_auth) if it matches what we have, and ack if asked.
void Migrator::handle_export_notify(MExportDirNotify *m)
{
  // ignore until this mds is in a state where cache messages make sense
  if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
    m->put();
    return;
  }

  CDir *dir = cache->get_dirfrag(m->get_dirfrag());

  mds_rank_t from = mds_rank_t(m->get_source().num());
  mds_authority_t old_auth = m->get_old_auth();
  mds_authority_t new_auth = m->get_new_auth();

  if (!dir) {
    // we don't replicate this dirfrag; nothing to adjust
    dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
	    << " on missing dir " << m->get_dirfrag() << dendl;
  } else if (dir->authority() != old_auth) {
    // stale/overlapping notify; our view already moved on
    dout(7) << "handle_export_notify old_auth was " << dir->authority()
	    << " != " << old_auth << " -> " << new_auth
	    << " on " << *dir << dendl;
  } else {
    dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
	    << " on " << *dir << dendl;
    // adjust auth
    set<CDir*> have;
    cache->map_dirfrag_set(m->get_bounds(), have);
    cache->adjust_bounded_subtree_auth(dir, have, new_auth);

    // induce a merge?
    cache->try_subtree_merge(dir);
  }

  // send ack
  if (m->wants_ack()) {
    mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
  } else {
    // aborted. no ack.
    dout(7) << "handle_export_notify no ack requested" << dendl;
  }

  m->put();
}
3552
3553 /** cap exports **/
3554 void Migrator::export_caps(CInode *in)
3555 {
3556 mds_rank_t dest = in->authority().first;
3557 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3558
3559 assert(in->is_any_caps());
3560 assert(!in->is_auth());
3561 assert(!in->is_ambiguous_auth());
3562 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3563
3564 MExportCaps *ex = new MExportCaps;
3565 ex->ino = in->ino();
3566
3567 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3568
3569 mds->send_message_mds(ex, dest);
3570 }
3571
3572 /* This function DOES put the passed message before returning*/
// The auth mds has accepted the caps we exported (export_caps).  For
// each cap it imported, tell the client to switch over (CEPH_CAP_OP_EXPORT
// with the peer's cap id/seqs) and drop our local copy.
void Migrator::handle_export_caps_ack(MExportCapsAck *ack)
{
  mds_rank_t from = ack->get_source().num();
  CInode *in = cache->get_inode(ack->ino);
  if (in) {
    assert(!in->is_auth());

    dout(10) << "handle_export_caps_ack " << *ack << " from "
	     << ack->get_source() << " on " << *in << dendl;

    map<client_t,Capability::Import> imported_caps;
    map<client_t,uint64_t> caps_ids;
    auto blp = ack->cap_bl.begin();
    ::decode(imported_caps, blp);
    ::decode(caps_ids, blp);

    for (auto& it : imported_caps) {
      Capability *cap = in->get_client_cap(it.first);
      // skip caps that changed (or vanished) since we exported them
      if (!cap || cap->get_cap_id() != caps_ids.at(it.first))
	continue;

      dout(7) << __func__ << " telling client." << it.first
	      << " exported caps on " << *in << dendl;
      MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
				       cap->get_cap_id(), cap->get_mseq(),
				       mds->get_osd_epoch_barrier());
      m->set_cap_peer(it.second.cap_id, it.second.issue_seq, it.second.mseq, from, 0);
      mds->send_message_client_counted(m, it.first);

      in->remove_client_cap(it.first);
    }

    // let the auth know what we still want, and re-evaluate lock state
    mds->locker->request_inode_file_caps(in);
    mds->locker->try_eval(in, CEPH_CAP_LOCKS);
  }

  ack->put();
}
3611
3612 void Migrator::handle_gather_caps(MGatherCaps *m)
3613 {
3614 CInode *in = cache->get_inode(m->ino);
3615 if (!in)
3616 goto out;
3617
3618 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
3619 << " on " << *in << dendl;
3620
3621 if (in->is_any_caps() &&
3622 !in->is_auth() &&
3623 !in->is_ambiguous_auth() &&
3624 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3625 export_caps(in);
3626
3627 out:
3628 m->put();
3629 }
3630
// Log-completion context for handle_export_caps(): once the ESessions
// entry is journaled, hand the decoded cap exports and the force-opened
// sessions to logged_import_caps().
class C_M_LoggedImportCaps : public MigratorLogContext {
  CInode *in;        // inode receiving the caps (we are auth)
  mds_rank_t from;   // mds that exported the caps
public:
  // filled in by handle_export_caps() before the log entry is submitted
  map<client_t,pair<Session*,uint64_t> > imported_session_map;
  map<CInode*, map<client_t,Capability::Export> > peer_exports;

  C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
  void finish(int r) override {
    mig->logged_import_caps(in, from, imported_session_map, peer_exports);
  }
};
3643
3644 /* This function DOES put the passed message before returning*/
// A replica mds is handing us (the auth) its client caps for an inode.
// Prepare sessions for the cap-holding clients, decode the caps into
// peer_exports, journal the new sessions (ESessions), and finish in
// logged_import_caps() once that entry commits.
void Migrator::handle_export_caps(MExportCaps *ex)
{
  dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
  CInode *in = cache->get_inode(ex->ino);

  assert(in);
  assert(in->is_auth());

  // FIXME
  if (!in->can_auth_pin()) {
    // drop the message; presumably the exporter retries -- TODO confirm
    ex->put();
    return;
  }

  // pinned until logged_import_caps() completes
  in->auth_pin(this);

  map<client_t,entity_inst_t> client_map;
  client_map.swap(ex->client_map);

  C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
      this, in, mds_rank_t(ex->get_source().num()));

  version_t pv = mds->server->prepare_force_open_sessions(client_map,
							  finish->imported_session_map);
  // decode new caps
  bufferlist::iterator blp = ex->cap_bl.begin();
  decode_import_inode_caps(in, false, blp, finish->peer_exports);
  assert(!finish->peer_exports.empty());   // thus, inode is pinned.

  // journal open client sessions

  ESessions *le = new ESessions(pv, client_map);
  mds->mdlog->start_submit_entry(le, finish);
  mds->mdlog->flush();

  ex->put();
}
3682
3683
// Completion of a cap-only import (see handle_export_caps): the
// ESessions entry is journaled, so finish opening the sessions, turn
// the exported caps into real ones (telling the clients immediately --
// peer is 'from'), and ack back to the exporter so it can drop its
// copies.
void Migrator::logged_import_caps(CInode *in,
				  mds_rank_t from,
				  map<client_t,pair<Session*,uint64_t> >& imported_session_map,
				  map<CInode*, map<client_t,Capability::Export> >& peer_exports)
{
  dout(10) << "logged_import_caps on " << *in << dendl;
  // see export_go() vs export_go_synced()
  assert(in->is_auth());

  // force open client sessions and finish cap import
  mds->server->finish_force_open_sessions(imported_session_map);

  auto it = peer_exports.find(in);
  assert(it != peer_exports.end());

  // clients will release caps from the exporter when they receive the cap import message.
  map<client_t,Capability::Import> imported_caps;
  finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
  mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  if (!imported_caps.empty()) {
    // tell the exporter which cap ids we imported so it can validate
    // before dropping its own (see handle_export_caps_ack)
    MExportCapsAck *ack = new MExportCapsAck(in->ino());
    map<client_t,uint64_t> peer_caps_ids;
    for (auto &p : imported_caps )
      peer_caps_ids[p.first] = it->second.at(p.first).cap_id;

    ::encode(imported_caps, ack->cap_bl);
    ::encode(peer_caps_ids, ack->cap_bl);
    mds->send_message_mds(ack, from);
  }

  // matches the auth_pin taken in handle_export_caps()
  in->auth_unpin(this);
}
3717
3718 Migrator::Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
3719 max_export_size = g_conf->get_val<uint64_t>("mds_max_export_size");
3720 inject_session_race = g_conf->get_val<bool>("mds_inject_migrator_session_race");
3721 }
3722
3723 void Migrator::handle_conf_change(const struct md_config_t *conf,
3724 const std::set <std::string> &changed,
3725 const MDSMap &mds_map)
3726 {
3727 if (changed.count("mds_max_export_size"))
3728 max_export_size = conf->get_val<uint64_t>("mds_max_export_size");
3729
3730 if (changed.count("mds_inject_migrator_session_race")) {
3731 inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
3732 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
3733 }
3734
3735 if (changed.count("mds_inject_migrator_message_loss")) {
3736 inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
3737 dout(0) << "mds_inject_migrator_message_loss is " << inject_message_loss << dendl;
3738 }
3739 }