]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/Migrator.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mds / Migrator.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MDSRank.h"
16#include "MDCache.h"
17#include "CInode.h"
18#include "CDir.h"
19#include "CDentry.h"
20#include "Migrator.h"
21#include "Locker.h"
22#include "Server.h"
23
24#include "MDBalancer.h"
25#include "MDLog.h"
26#include "MDSMap.h"
27#include "Mutation.h"
28
29#include "include/filepath.h"
28e407b8 30#include "common/likely.h"
7c673cae
FG
31
32#include "events/EExport.h"
33#include "events/EImportStart.h"
34#include "events/EImportFinish.h"
35#include "events/ESessions.h"
36
37#include "msg/Messenger.h"
38
39#include "messages/MClientCaps.h"
40
7c673cae
FG
/*
 * this is what the dir->dir_auth values look like
 *
 *   dir_auth   authbits
 * export:
 *   me         me       - before
 *   me, me     me       - still me, but preparing for export
 *   me, them   me       - send MExportDir (peer is preparing)
 *   them, me   me       - journaled EExport
 *   them       them     - done
 *
 * import:
 *   them       them     - before
 *   me, them   me       - journaled EImportStart
 *   me         me       - done
 *
 * which implies:
 *  - auth bit is set if i am listed as first _or_ second dir_auth.
 */
60
61#include "common/config.h"
62
63
64#define dout_context g_ceph_context
65#define dout_subsys ceph_subsys_mds
66#undef dout_prefix
67#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
68
69
11fdf7f2 70class MigratorContext : public MDSContext {
7c673cae
FG
71protected:
72 Migrator *mig;
73 MDSRank *get_mds() override {
74 return mig->mds;
75 }
76public:
77 explicit MigratorContext(Migrator *mig_) : mig(mig_) {
11fdf7f2 78 ceph_assert(mig != NULL);
7c673cae
FG
79 }
80};
81
82class MigratorLogContext : public MDSLogContextBase {
83protected:
84 Migrator *mig;
85 MDSRank *get_mds() override {
86 return mig->mds;
87 }
88public:
89 explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
11fdf7f2 90 ceph_assert(mig != NULL);
7c673cae
FG
91 }
92};
93
11fdf7f2 94void Migrator::dispatch(const Message::const_ref &m)
7c673cae
FG
95{
96 switch (m->get_type()) {
97 // import
98 case MSG_MDS_EXPORTDIRDISCOVER:
11fdf7f2 99 handle_export_discover(MExportDirDiscover::msgref_cast(m));
7c673cae
FG
100 break;
101 case MSG_MDS_EXPORTDIRPREP:
11fdf7f2 102 handle_export_prep(MExportDirPrep::msgref_cast(m));
7c673cae
FG
103 break;
104 case MSG_MDS_EXPORTDIR:
28e407b8
AA
105 if (unlikely(inject_session_race)) {
106 dout(0) << "waiting for inject_session_race" << dendl;
107 mds->wait_for_any_client_connection(new C_MDS_RetryMessage(mds, m));
108 } else {
11fdf7f2 109 handle_export_dir(MExportDir::msgref_cast(m));
28e407b8 110 }
7c673cae
FG
111 break;
112 case MSG_MDS_EXPORTDIRFINISH:
11fdf7f2 113 handle_export_finish(MExportDirFinish::msgref_cast(m));
7c673cae
FG
114 break;
115 case MSG_MDS_EXPORTDIRCANCEL:
11fdf7f2 116 handle_export_cancel(MExportDirCancel::msgref_cast(m));
7c673cae
FG
117 break;
118
119 // export
120 case MSG_MDS_EXPORTDIRDISCOVERACK:
11fdf7f2 121 handle_export_discover_ack(MExportDirDiscoverAck::msgref_cast(m));
7c673cae
FG
122 break;
123 case MSG_MDS_EXPORTDIRPREPACK:
11fdf7f2 124 handle_export_prep_ack(MExportDirPrepAck::msgref_cast(m));
7c673cae
FG
125 break;
126 case MSG_MDS_EXPORTDIRACK:
11fdf7f2 127 handle_export_ack(MExportDirAck::msgref_cast(m));
7c673cae
FG
128 break;
129 case MSG_MDS_EXPORTDIRNOTIFYACK:
11fdf7f2
TL
130 handle_export_notify_ack(MExportDirNotifyAck::msgref_cast(m));
131 break;
7c673cae
FG
132
133 // export 3rd party (dir_auth adjustments)
134 case MSG_MDS_EXPORTDIRNOTIFY:
11fdf7f2 135 handle_export_notify(MExportDirNotify::msgref_cast(m));
7c673cae
FG
136 break;
137
138 // caps
139 case MSG_MDS_EXPORTCAPS:
11fdf7f2 140 handle_export_caps(MExportCaps::msgref_cast(m));
7c673cae 141 break;
1adf2230 142 case MSG_MDS_EXPORTCAPSACK:
11fdf7f2 143 handle_export_caps_ack(MExportCapsAck::msgref_cast(m));
1adf2230 144 break;
7c673cae 145 case MSG_MDS_GATHERCAPS:
11fdf7f2 146 handle_gather_caps(MGatherCaps::msgref_cast(m));
7c673cae
FG
147 break;
148
149 default:
150 derr << "migrator unknown message " << m->get_type() << dendl;
11fdf7f2 151 ceph_abort_msg("migrator unknown message");
7c673cae
FG
152 }
153}
154
155
156class C_MDC_EmptyImport : public MigratorContext {
157 CDir *dir;
158public:
159 C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
160 void finish(int r) override {
161 mig->export_empty_import(dir);
162 }
163};
164
165
166void Migrator::export_empty_import(CDir *dir)
167{
168 dout(7) << "export_empty_import " << *dir << dendl;
11fdf7f2 169 ceph_assert(dir->is_subtree_root());
7c673cae
FG
170
171 if (dir->inode->is_auth()) {
172 dout(7) << " inode is auth" << dendl;
173 return;
174 }
175 if (!dir->is_auth()) {
176 dout(7) << " not auth" << dendl;
177 return;
178 }
179 if (dir->is_freezing() || dir->is_frozen()) {
180 dout(7) << " freezing or frozen" << dendl;
181 return;
182 }
183 if (dir->get_num_head_items() > 0) {
184 dout(7) << " not actually empty" << dendl;
185 return;
186 }
187 if (dir->inode->is_root()) {
188 dout(7) << " root" << dendl;
189 return;
190 }
191
192 mds_rank_t dest = dir->inode->authority().first;
193 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
194
195 dout(7) << " really empty, exporting to " << dest << dendl;
196 assert (dest != mds->get_nodeid());
197
198 dout(7) << "exporting to mds." << dest
199 << " empty import " << *dir << dendl;
200 export_dir( dir, dest );
201}
202
203void Migrator::find_stale_export_freeze()
204{
205 utime_t now = ceph_clock_now();
206 utime_t cutoff = now;
11fdf7f2 207 cutoff -= g_conf()->mds_freeze_tree_timeout;
7c673cae
FG
208
209
210 /*
211 * We could have situations like:
212 *
213 * - mds.0 authpins an item in subtree A
214 * - mds.0 sends request to mds.1 to authpin an item in subtree B
215 * - mds.0 freezes subtree A
216 * - mds.1 authpins an item in subtree B
217 * - mds.1 sends request to mds.0 to authpin an item in subtree A
218 * - mds.1 freezes subtree B
219 * - mds.1 receives the remote authpin request from mds.0
220 * (wait because subtree B is freezing)
221 * - mds.0 receives the remote authpin request from mds.1
222 * (wait because subtree A is freezing)
223 *
224 *
225 * - client request authpins items in subtree B
226 * - freeze subtree B
227 * - import subtree A which is parent of subtree B
228 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
229 * - freeze subtree A
230 * - client request tries authpinning items in subtree A
231 * (wait because subtree A is freezing)
232 */
233 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
234 p != export_state.end(); ) {
235 CDir* dir = p->first;
236 export_state_t& stat = p->second;
237 ++p;
238 if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
239 continue;
11fdf7f2
TL
240 ceph_assert(dir->freeze_tree_state);
241 if (stat.last_cum_auth_pins != dir->freeze_tree_state->auth_pins) {
242 stat.last_cum_auth_pins = dir->freeze_tree_state->auth_pins;
7c673cae
FG
243 stat.last_cum_auth_pins_change = now;
244 continue;
245 }
246 if (stat.last_cum_auth_pins_change >= cutoff)
247 continue;
248 if (stat.num_remote_waiters > 0 ||
249 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
250 export_try_cancel(dir);
251 }
252 }
253}
254
255void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
256{
257 dout(10) << "export_try_cancel " << *dir << dendl;
258
259 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
11fdf7f2 260 ceph_assert(it != export_state.end());
7c673cae
FG
261
262 int state = it->second.state;
263 switch (state) {
264 case EXPORT_LOCKING:
265 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
91327a77 266 num_locking_exports--;
7c673cae
FG
267 it->second.state = EXPORT_CANCELLED;
268 dir->auth_unpin(this);
269 break;
270 case EXPORT_DISCOVERING:
271 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
272 it->second.state = EXPORT_CANCELLED;
273 dir->unfreeze_tree(); // cancel the freeze
274 dir->auth_unpin(this);
275 if (notify_peer &&
276 (!mds->is_cluster_degraded() ||
277 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
11fdf7f2 278 mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
7c673cae
FG
279 break;
280
281 case EXPORT_FREEZING:
282 dout(10) << "export state=freezing : canceling freeze" << dendl;
283 it->second.state = EXPORT_CANCELLED;
284 dir->unfreeze_tree(); // cancel the freeze
224ce89b
WB
285 if (dir->is_subtree_root())
286 cache->try_subtree_merge(dir);
7c673cae
FG
287 if (notify_peer &&
288 (!mds->is_cluster_degraded() ||
289 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
11fdf7f2 290 mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
7c673cae
FG
291 break;
292
293 // NOTE: state order reversal, warning comes after prepping
294 case EXPORT_WARNING:
295 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
296 it->second.state = EXPORT_CANCELLING;
297 // fall-thru
298
299 case EXPORT_PREPPING:
300 if (state != EXPORT_WARNING) {
301 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
302 it->second.state = EXPORT_CANCELLED;
303 }
304
305 {
306 // unpin bounds
307 set<CDir*> bounds;
308 cache->get_subtree_bounds(dir, bounds);
309 for (set<CDir*>::iterator q = bounds.begin();
310 q != bounds.end();
311 ++q) {
312 CDir *bd = *q;
313 bd->put(CDir::PIN_EXPORTBOUND);
314 bd->state_clear(CDir::STATE_EXPORTBOUND);
315 }
316 if (state == EXPORT_WARNING) {
317 // notify bystanders
b32b8144 318 export_notify_abort(dir, it->second, bounds);
7c673cae
FG
319 // process delayed expires
320 cache->process_delayed_expire(dir);
321 }
322 }
323 dir->unfreeze_tree();
7c673cae
FG
324 cache->try_subtree_merge(dir);
325 if (notify_peer &&
326 (!mds->is_cluster_degraded() ||
327 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
11fdf7f2 328 mds->send_message_mds(MExportDirCancel::create(dir->dirfrag(), it->second.tid), it->second.peer);
7c673cae
FG
329 break;
330
331 case EXPORT_EXPORTING:
332 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
333 it->second.state = EXPORT_CANCELLING;
b32b8144 334 export_reverse(dir, it->second);
7c673cae
FG
335 break;
336
337 case EXPORT_LOGGINGFINISH:
338 case EXPORT_NOTIFYING:
339 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
340 // leave export_state, don't clean up now.
341 break;
342 case EXPORT_CANCELLING:
343 break;
344
345 default:
346 ceph_abort();
347 }
348
349 // finish clean-up?
350 if (it->second.state == EXPORT_CANCELLING ||
351 it->second.state == EXPORT_CANCELLED) {
352 MutationRef mut;
353 mut.swap(it->second.mut);
354
355 if (it->second.state == EXPORT_CANCELLED) {
91327a77 356 export_cancel_finish(it);
7c673cae
FG
357 }
358
359 // drop locks
360 if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
361 MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
11fdf7f2 362 ceph_assert(mdr);
91327a77 363 mds->mdcache->request_kill(mdr);
7c673cae
FG
364 } else if (mut) {
365 mds->locker->drop_locks(mut.get());
366 mut->cleanup();
367 }
368
369 cache->show_subtrees();
370
371 maybe_do_queued_export();
372 }
373}
374
91327a77 375void Migrator::export_cancel_finish(export_state_iterator& it)
7c673cae 376{
91327a77
AA
377 CDir *dir = it->first;
378 bool unpin = (it->second.state == EXPORT_CANCELLING);
379 auto parent = std::move(it->second.parent);
380
381 total_exporting_size -= it->second.approx_size;
382 export_state.erase(it);
383
11fdf7f2 384 ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
1adf2230 385 dir->clear_exporting();
7c673cae 386
91327a77
AA
387 if (unpin) {
388 // pinned by Migrator::export_notify_abort()
389 dir->auth_unpin(this);
390 }
7c673cae
FG
391 // send pending import_maps? (these need to go out when all exports have finished.)
392 cache->maybe_send_pending_resolves();
91327a77
AA
393
394 if (parent)
395 child_export_finish(parent, false);
7c673cae
FG
396}
397
398// ==========================================================
399// mds failure handling
400
401void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
402{
403 dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;
404
405 // check my exports
406
407 // first add an extra auth_pin on any freezes, so that canceling a
408 // nested freeze doesn't complete one further up the hierarchy and
409 // confuse the shit out of us. we'll remove it after canceling the
410 // freeze. this way no freeze completions run before we want them
411 // to.
412 list<CDir*> pinned_dirs;
413 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
414 p != export_state.end();
415 ++p) {
416 if (p->second.state == EXPORT_FREEZING) {
417 CDir *dir = p->first;
418 dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
419 dir->auth_pin(this);
420 pinned_dirs.push_back(dir);
421 }
422 }
423
424 map<CDir*,export_state_t>::iterator p = export_state.begin();
425 while (p != export_state.end()) {
426 map<CDir*,export_state_t>::iterator next = p;
427 ++next;
428 CDir *dir = p->first;
429
430 // abort exports:
431 // - that are going to the failed node
432 // - that aren't frozen yet (to avoid auth_pin deadlock)
433 // - they havne't prepped yet (they may need to discover bounds to do that)
434 if ((p->second.peer == who &&
435 p->second.state != EXPORT_CANCELLING) ||
436 p->second.state == EXPORT_LOCKING ||
437 p->second.state == EXPORT_DISCOVERING ||
438 p->second.state == EXPORT_FREEZING ||
439 p->second.state == EXPORT_PREPPING) {
440 // the guy i'm exporting to failed, or we're just freezing.
441 dout(10) << "cleaning up export state (" << p->second.state << ")"
442 << get_export_statename(p->second.state) << " of " << *dir << dendl;
443 export_try_cancel(dir);
444 } else if (p->second.peer != who) {
445 // bystander failed.
446 if (p->second.warning_ack_waiting.erase(who)) {
447 if (p->second.state == EXPORT_WARNING) {
448 p->second.notify_ack_waiting.erase(who); // they won't get a notify either.
449 // exporter waiting for warning acks, let's fake theirs.
450 dout(10) << "faking export_warning_ack from mds." << who
451 << " on " << *dir << " to mds." << p->second.peer
452 << dendl;
453 if (p->second.warning_ack_waiting.empty())
454 export_go(dir);
455 }
456 }
457 if (p->second.notify_ack_waiting.erase(who)) {
458 // exporter is waiting for notify acks, fake it
459 dout(10) << "faking export_notify_ack from mds." << who
460 << " on " << *dir << " to mds." << p->second.peer
461 << dendl;
462 if (p->second.state == EXPORT_NOTIFYING) {
463 if (p->second.notify_ack_waiting.empty())
464 export_finish(dir);
465 } else if (p->second.state == EXPORT_CANCELLING) {
466 if (p->second.notify_ack_waiting.empty()) {
91327a77 467 export_cancel_finish(p);
7c673cae
FG
468 }
469 }
470 }
471 }
472
473 // next!
474 p = next;
475 }
476
477
478 // check my imports
479 map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
480 while (q != import_state.end()) {
481 map<dirfrag_t,import_state_t>::iterator next = q;
482 ++next;
483 dirfrag_t df = q->first;
484 CInode *diri = mds->mdcache->get_inode(df.ino);
485 CDir *dir = mds->mdcache->get_dirfrag(df);
486
487 if (q->second.peer == who) {
488 if (dir)
489 dout(10) << "cleaning up import state (" << q->second.state << ")"
490 << get_import_statename(q->second.state) << " of " << *dir << dendl;
491 else
492 dout(10) << "cleaning up import state (" << q->second.state << ")"
493 << get_import_statename(q->second.state) << " of " << df << dendl;
494
495 switch (q->second.state) {
496 case IMPORT_DISCOVERING:
497 dout(10) << "import state=discovering : clearing state" << dendl;
498 import_reverse_discovering(df);
499 break;
500
501 case IMPORT_DISCOVERED:
11fdf7f2 502 ceph_assert(diri);
7c673cae
FG
503 dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
504 import_reverse_discovered(df, diri);
505 break;
506
507 case IMPORT_PREPPING:
11fdf7f2 508 ceph_assert(dir);
7c673cae 509 dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
b32b8144 510 import_reverse_prepping(dir, q->second);
7c673cae
FG
511 break;
512
513 case IMPORT_PREPPED:
11fdf7f2 514 ceph_assert(dir);
7c673cae
FG
515 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
516 {
517 set<CDir*> bounds;
518 cache->get_subtree_bounds(dir, bounds);
519 import_remove_pins(dir, bounds);
520
521 // adjust auth back to the exporter
522 cache->adjust_subtree_auth(dir, q->second.peer);
7c673cae
FG
523
524 // notify bystanders ; wait in aborting state
b32b8144 525 q->second.state = IMPORT_ABORTING;
7c673cae 526 import_notify_abort(dir, bounds);
11fdf7f2 527 ceph_assert(g_conf()->mds_kill_import_at != 10);
7c673cae
FG
528 }
529 break;
530
531 case IMPORT_LOGGINGSTART:
11fdf7f2 532 ceph_assert(dir);
7c673cae
FG
533 dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
534 import_reverse(dir);
535 break;
536
537 case IMPORT_ACKING:
11fdf7f2 538 ceph_assert(dir);
7c673cae
FG
539 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
540 dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
541 {
542 set<CDir*> bounds;
543 cache->get_subtree_bounds(dir, bounds);
544 cache->add_ambiguous_import(dir, bounds);
545 }
546 break;
547
548 case IMPORT_FINISHING:
11fdf7f2 549 ceph_assert(dir);
7c673cae
FG
550 dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
551 import_finish(dir, true);
552 break;
553
554 case IMPORT_ABORTING:
11fdf7f2 555 ceph_assert(dir);
7c673cae
FG
556 dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
557 break;
558 }
559 } else {
560 auto bystanders_entry = q->second.bystanders.find(who);
561 if (bystanders_entry != q->second.bystanders.end()) {
562 q->second.bystanders.erase(bystanders_entry);
563 if (q->second.state == IMPORT_ABORTING) {
11fdf7f2 564 ceph_assert(dir);
7c673cae
FG
565 dout(10) << "faking export_notify_ack from mds." << who
566 << " on aborting import " << *dir << " from mds." << q->second.peer
567 << dendl;
224ce89b 568 if (q->second.bystanders.empty())
7c673cae 569 import_reverse_unfreeze(dir);
7c673cae
FG
570 }
571 }
572 }
573
574 // next!
575 q = next;
576 }
577
578 while (!pinned_dirs.empty()) {
579 CDir *dir = pinned_dirs.front();
580 dout(10) << "removing temp auth_pin on " << *dir << dendl;
581 dir->auth_unpin(this);
582 pinned_dirs.pop_front();
583 }
584}
585
586
587
588void Migrator::show_importing()
589{
590 dout(10) << "show_importing" << dendl;
591 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
592 p != import_state.end();
593 ++p) {
594 CDir *dir = mds->mdcache->get_dirfrag(p->first);
595 if (dir) {
596 dout(10) << " importing from " << p->second.peer
597 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
598 << " " << p->first << " " << *dir << dendl;
599 } else {
600 dout(10) << " importing from " << p->second.peer
601 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
602 << " " << p->first << dendl;
603 }
604 }
605}
606
607void Migrator::show_exporting()
608{
609 dout(10) << "show_exporting" << dendl;
610 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
611 p != export_state.end();
612 ++p)
613 dout(10) << " exporting to " << p->second.peer
614 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
615 << " " << p->first->dirfrag() << " " << *p->first << dendl;
616}
617
618
619
620void Migrator::audit()
621{
11fdf7f2 622 if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 5>())
7c673cae
FG
623 return; // hrm.
624
625 // import_state
626 show_importing();
627 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
628 p != import_state.end();
629 ++p) {
630 if (p->second.state == IMPORT_DISCOVERING)
631 continue;
632 if (p->second.state == IMPORT_DISCOVERED) {
633 CInode *in = cache->get_inode(p->first.ino);
11fdf7f2 634 ceph_assert(in);
7c673cae
FG
635 continue;
636 }
637 CDir *dir = cache->get_dirfrag(p->first);
11fdf7f2 638 ceph_assert(dir);
7c673cae
FG
639 if (p->second.state == IMPORT_PREPPING)
640 continue;
641 if (p->second.state == IMPORT_ABORTING) {
11fdf7f2
TL
642 ceph_assert(!dir->is_ambiguous_dir_auth());
643 ceph_assert(dir->get_dir_auth().first != mds->get_nodeid());
7c673cae
FG
644 continue;
645 }
11fdf7f2
TL
646 ceph_assert(dir->is_ambiguous_dir_auth());
647 ceph_assert(dir->authority().first == mds->get_nodeid() ||
7c673cae
FG
648 dir->authority().second == mds->get_nodeid());
649 }
650
651 // export_state
652 show_exporting();
653 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
654 p != export_state.end();
655 ++p) {
656 CDir *dir = p->first;
657 if (p->second.state == EXPORT_LOCKING ||
658 p->second.state == EXPORT_DISCOVERING ||
659 p->second.state == EXPORT_FREEZING ||
660 p->second.state == EXPORT_CANCELLING)
661 continue;
11fdf7f2
TL
662 ceph_assert(dir->is_ambiguous_dir_auth());
663 ceph_assert(dir->authority().first == mds->get_nodeid() ||
7c673cae
FG
664 dir->authority().second == mds->get_nodeid());
665 }
666
667 // ambiguous+me subtrees should be importing|exporting
668
669 // write me
670}
671
672
673
674
675
676// ==========================================================
677// EXPORT
678
679void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
680{
681 // enqueue
682 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
683 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
684
685 maybe_do_queued_export();
686}
687
688void Migrator::maybe_do_queued_export()
689{
690 static bool running;
691 if (running)
692 return;
693 running = true;
91327a77
AA
694
695 uint64_t max_total_size = max_export_size * 2;
696
7c673cae 697 while (!export_queue.empty() &&
91327a77
AA
698 max_total_size > total_exporting_size &&
699 max_total_size - total_exporting_size >=
700 max_export_size * (num_locking_exports + 1)) {
701
7c673cae
FG
702 dirfrag_t df = export_queue.front().first;
703 mds_rank_t dest = export_queue.front().second;
704 export_queue.pop_front();
705
706 CDir *dir = mds->mdcache->get_dirfrag(df);
707 if (!dir) continue;
708 if (!dir->is_auth()) continue;
709
710 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
711
712 export_dir(dir, dest);
713 }
91327a77 714
7c673cae
FG
715 running = false;
716}
717
718
719
720
721class C_MDC_ExportFreeze : public MigratorContext {
722 CDir *ex; // dir i'm exporting
723 uint64_t tid;
724public:
725 C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
726 MigratorContext(m), ex(e), tid(t) {
11fdf7f2 727 ceph_assert(ex != NULL);
7c673cae
FG
728 }
729 void finish(int r) override {
730 if (r >= 0)
731 mig->export_frozen(ex, tid);
732 }
733};
734
735
11fdf7f2 736void Migrator::get_export_lock_set(CDir *dir, MutationImpl::LockOpVec& lov)
7c673cae
FG
737{
738 // path
739 vector<CDentry*> trace;
740 cache->make_trace(trace, dir->inode);
11fdf7f2
TL
741
742 set<CDir*> wouldbe_bounds;
743 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
744
745 lov.reserve(trace.size() + wouldbe_bounds.size() + 8);
746
747 for (auto& dn : trace)
748 lov.add_rdlock(&dn->lock);
7c673cae
FG
749
750 // prevent scatter gather race
11fdf7f2 751 lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
7c673cae
FG
752
753 // bound dftlocks:
754 // NOTE: We need to take an rdlock on bounding dirfrags during
755 // migration for a rather irritating reason: when we export the
756 // bound inode, we need to send scatterlock state for the dirfrags
757 // as well, so that the new auth also gets the correct info. If we
758 // race with a refragment, this info is useless, as we can't
759 // redivvy it up. And it's needed for the scatterlocks to work
760 // properly: when the auth is in a sync/lock state it keeps each
761 // dirfrag's portion in the local (auth OR replica) dirfrag.
11fdf7f2
TL
762 for (auto& dir : wouldbe_bounds)
763 lov.add_rdlock(&dir->get_inode()->dirfragtreelock);
764
765 // above code may add duplicated locks
766 lov.sort_and_merge();
7c673cae
FG
767}
768
769
7c673cae
FG
770/** export_dir(dir, dest)
771 * public method to initiate an export.
772 * will fail if the directory is freezing, frozen, unpinnable, or root.
773 */
774void Migrator::export_dir(CDir *dir, mds_rank_t dest)
775{
776 dout(7) << "export_dir " << *dir << " to " << dest << dendl;
11fdf7f2
TL
777 ceph_assert(dir->is_auth());
778 ceph_assert(dest != mds->get_nodeid());
7c673cae 779
181888fb
FG
780 if (!(mds->is_active() || mds->is_stopping())) {
781 dout(7) << "i'm not active, no exports for now" << dendl;
782 return;
783 }
7c673cae
FG
784 if (mds->mdcache->is_readonly()) {
785 dout(7) << "read-only FS, no exports for now" << dendl;
786 return;
787 }
31f18b77
FG
788 if (!mds->mdsmap->is_active(dest)) {
789 dout(7) << "dest not active, no exports for now" << dendl;
790 return;
791 }
7c673cae
FG
792 if (mds->is_cluster_degraded()) {
793 dout(7) << "cluster degraded, no exports for now" << dendl;
794 return;
795 }
796 if (dir->inode->is_system()) {
797 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
798 //ceph_abort();
799 return;
800 }
801
b32b8144
FG
802 CDir* parent_dir = dir->inode->get_projected_parent_dir();
803 if (parent_dir && parent_dir->inode->is_stray()) {
804 if (parent_dir->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
805 dout(7) << "i won't export anything in stray" << dendl;
806 return;
807 }
808 } else {
809 if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
810 dout(7) << "dir is export pinned" << dendl;
811 return;
812 }
7c673cae
FG
813 }
814
815 if (dir->is_frozen() ||
816 dir->is_freezing()) {
817 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
818 return;
819 }
820 if (dir->state_test(CDir::STATE_EXPORTING)) {
821 dout(7) << "already exporting" << dendl;
822 return;
823 }
824
11fdf7f2 825 if (g_conf()->mds_thrash_exports) {
7c673cae
FG
826 // create random subtree bound (which will not be exported)
827 list<CDir*> ls;
828 for (auto p = dir->begin(); p != dir->end(); ++p) {
829 auto dn = p->second;
830 CDentry::linkage_t *dnl= dn->get_linkage();
831 if (dnl->is_primary()) {
832 CInode *in = dnl->get_inode();
833 if (in->is_dir())
834 in->get_nested_dirfrags(ls);
835 }
836 }
837 if (ls.size() > 0) {
838 int n = rand() % ls.size();
839 auto p = ls.begin();
840 while (n--) ++p;
841 CDir *bd = *p;
842 if (!(bd->is_frozen() || bd->is_freezing())) {
11fdf7f2 843 ceph_assert(bd->is_auth());
7c673cae
FG
844 dir->state_set(CDir::STATE_AUXSUBTREE);
845 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
846 dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
847 }
848 }
849 }
850
11fdf7f2 851 mds->hit_export_target(dest, -1);
7c673cae
FG
852
853 dir->auth_pin(this);
1adf2230 854 dir->mark_exporting();
7c673cae
FG
855
856 MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
857 mdr->more()->export_dir = dir;
858
11fdf7f2 859 ceph_assert(export_state.count(dir) == 0);
7c673cae 860 export_state_t& stat = export_state[dir];
91327a77 861 num_locking_exports++;
7c673cae
FG
862 stat.state = EXPORT_LOCKING;
863 stat.peer = dest;
864 stat.tid = mdr->reqid.tid;
865 stat.mut = mdr;
866
91327a77 867 mds->mdcache->dispatch_request(mdr);
7c673cae
FG
868}
869
91327a77
AA
870/*
871 * check if directory is too large to be export in whole. If it is,
872 * choose some subdirs, whose total size is suitable.
873 */
874void Migrator::maybe_split_export(CDir* dir, uint64_t max_size, bool null_okay,
875 vector<pair<CDir*, size_t> >& results)
7c673cae 876{
91327a77
AA
877 static const unsigned frag_size = 800;
878 static const unsigned inode_size = 1000;
879 static const unsigned cap_size = 80;
880 static const unsigned remote_size = 10;
881 static const unsigned null_size = 1;
882
883 // state for depth-first search
884 struct LevelData {
885 CDir *dir;
886 CDir::dentry_key_map::iterator iter;
887 size_t dirfrag_size = frag_size;
888 size_t subdirs_size = 0;
889 bool complete = true;
890 vector<CDir*> siblings;
891 vector<pair<CDir*, size_t> > subdirs;
892 LevelData(const LevelData&) = default;
893 LevelData(CDir *d) :
894 dir(d), iter(d->begin()) {}
895 };
896
897 vector<LevelData> stack;
898 stack.emplace_back(dir);
899
900 size_t found_size = 0;
901 size_t skipped_size = 0;
902
903 for (;;) {
904 auto& data = stack.back();
905 CDir *cur = data.dir;
906 auto& it = data.iter;
907 auto& dirfrag_size = data.dirfrag_size;
908
909 while(it != cur->end()) {
910 CDentry *dn = it->second;
911 ++it;
912
913 dirfrag_size += dn->name.size();
914 if (dn->get_linkage()->is_null()) {
915 dirfrag_size += null_size;
916 continue;
917 }
918 if (dn->get_linkage()->is_remote()) {
919 dirfrag_size += remote_size;
920 continue;
921 }
922
923 CInode *in = dn->get_linkage()->get_inode();
924 dirfrag_size += inode_size;
925 dirfrag_size += in->get_client_caps().size() * cap_size;
926
927 if (in->is_dir()) {
928 vector<CDir*> ls;
929 in->get_nested_dirfrags(ls);
930 std::reverse(ls.begin(), ls.end());
931
932 bool complete = true;
933 for (auto p = ls.begin(); p != ls.end(); ) {
934 if ((*p)->state_test(CDir::STATE_EXPORTING) ||
935 (*p)->is_freezing_dir() || (*p)->is_frozen_dir()) {
936 complete = false;
937 p = ls.erase(p);
938 } else {
939 ++p;
940 }
941 }
942 if (!complete) {
943 // skip exporting dir's ancestors. because they can't get
944 // frozen (exporting dir's parent inode is auth pinned).
945 for (auto p = stack.rbegin(); p < stack.rend(); ++p) {
946 if (!p->complete)
947 break;
948 p->complete = false;
949 }
950 }
951 if (!ls.empty()) {
952 stack.emplace_back(ls.back());
953 ls.pop_back();
954 stack.back().siblings.swap(ls);
955 break;
956 }
957 }
958 }
959 // did above loop push new dirfrag into the stack?
960 if (stack.back().dir != cur)
961 continue;
962
963 if (data.complete) {
964 auto cur_size = data.subdirs_size + dirfrag_size;
965 // we can do nothing with large dirfrag
966 if (cur_size >= max_size && found_size * 2 > max_size)
967 break;
7c673cae 968
91327a77
AA
969 found_size += dirfrag_size;
970
971 if (stack.size() > 1) {
972 auto& parent = stack[stack.size() - 2];
973 parent.subdirs.emplace_back(cur, cur_size);
974 parent.subdirs_size += cur_size;
975 }
976 } else {
977 // can't merge current dirfrag to its parent if there is skipped subdir
978 results.insert(results.end(), data.subdirs.begin(), data.subdirs.end());
979 skipped_size += dirfrag_size;
980 }
981
982 vector<CDir*> ls;
983 ls.swap(data.siblings);
984
985 stack.pop_back();
986 if (stack.empty())
987 break;
988
989 if (found_size >= max_size)
990 break;
991
992 // next dirfrag
993 if (!ls.empty()) {
994 stack.emplace_back(ls.back());
995 ls.pop_back();
996 stack.back().siblings.swap(ls);
997 }
998 }
999
1000 for (auto& p : stack)
1001 results.insert(results.end(), p.subdirs.begin(), p.subdirs.end());
1002
1003 if (results.empty() && (!skipped_size || !null_okay))
1004 results.emplace_back(dir, found_size + skipped_size);
1005}
1006
1007class C_M_ExportDirWait : public MigratorContext {
1008 MDRequestRef mdr;
1009 int count;
1010public:
1011 C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
1012 : MigratorContext(m), mdr(mdr), count(count) {}
1013 void finish(int r) override {
1014 mig->dispatch_export_dir(mdr, count);
1015 }
1016};
1017
1018void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
1019{
7c673cae 1020 CDir *dir = mdr->more()->export_dir;
91327a77
AA
1021 dout(7) << "dispatch_export_dir " << *mdr << " " << *dir << dendl;
1022
7c673cae
FG
1023 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1024 if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
1025 // export must have aborted.
1026 dout(7) << "export must have aborted " << *mdr << dendl;
11fdf7f2 1027 ceph_assert(mdr->killed || mdr->aborted);
91327a77
AA
1028 if (mdr->aborted) {
1029 mdr->aborted = false;
1030 mds->mdcache->request_kill(mdr);
1031 }
7c673cae
FG
1032 return;
1033 }
11fdf7f2 1034 ceph_assert(it->second.state == EXPORT_LOCKING);
7c673cae
FG
1035
1036 mds_rank_t dest = it->second.peer;
1037
1038 if (!mds->is_export_target(dest)) {
1039 dout(7) << "dest is not yet an export target" << dendl;
1040 if (count > 3) {
1041 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
1042 export_try_cancel(dir);
1043 return;
1044 }
224ce89b
WB
1045
1046 mds->locker->drop_locks(mdr.get());
1047 mdr->drop_local_auth_pins();
1048
31f18b77 1049 mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
7c673cae
FG
1050 return;
1051 }
1052
1053 if (!dir->inode->get_parent_dn()) {
1054 dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
31f18b77 1055 dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
7c673cae
FG
1056 return;
1057 }
1058
1059 if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
1060 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
1061 export_try_cancel(dir);
1062 return;
1063 }
1064
1065 // locks?
11fdf7f2
TL
1066 MutationImpl::LockOpVec lov;
1067 get_export_lock_set(dir, lov);
7c673cae
FG
1068 // If auth MDS of the subtree root inode is neither the exporter MDS
1069 // nor the importer MDS and it gathers subtree root's fragstat/neststat
1070 // while the subtree is exporting. It's possible that the exporter MDS
1071 // and the importer MDS both are auth MDS of the subtree root or both
1072 // are not auth MDS of the subtree root at the time they receive the
1073 // lock messages. So the auth MDS of the subtree root inode may get no
1074 // or duplicated fragstat/neststat for the subtree root dirfrag.
11fdf7f2
TL
1075 lov.add_wrlock(&dir->get_inode()->filelock);
1076 lov.add_wrlock(&dir->get_inode()->nestlock);
7c673cae
FG
1077 if (dir->get_inode()->is_auth()) {
1078 dir->get_inode()->filelock.set_scatter_wanted();
1079 dir->get_inode()->nestlock.set_scatter_wanted();
1080 }
1081
11fdf7f2 1082 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
7c673cae
FG
1083 if (mdr->aborted)
1084 export_try_cancel(dir);
1085 return;
1086 }
1087
11fdf7f2 1088 ceph_assert(g_conf()->mds_kill_export_at != 1);
7c673cae 1089
91327a77
AA
1090 auto parent = it->second.parent;
1091
1092 vector<pair<CDir*, size_t> > results;
1093 maybe_split_export(dir, max_export_size, (bool)parent, results);
1094
1095 if (results.size() == 1 && results.front().first == dir) {
1096 num_locking_exports--;
1097 it->second.state = EXPORT_DISCOVERING;
1098 // send ExportDirDiscover (ask target)
1099 filepath path;
1100 dir->inode->make_path(path);
11fdf7f2
TL
1101 auto discover = MExportDirDiscover::create(dir->dirfrag(), path,
1102 mds->get_nodeid(), it->second.tid);
91327a77 1103 mds->send_message_mds(discover, dest);
11fdf7f2 1104 ceph_assert(g_conf()->mds_kill_export_at != 2);
91327a77
AA
1105
1106 it->second.last_cum_auth_pins_change = ceph_clock_now();
1107 it->second.approx_size = results.front().second;
91327a77
AA
1108 total_exporting_size += it->second.approx_size;
1109
1110 // start the freeze, but hold it up with an auth_pin.
1111 dir->freeze_tree();
11fdf7f2 1112 ceph_assert(dir->is_freezing_tree());
91327a77
AA
1113 dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
1114 return;
1115 }
1116
1117 if (parent) {
1118 parent->pending_children += results.size();
1119 } else {
1120 parent = std::make_shared<export_base_t>(dir->dirfrag(), dest,
1121 results.size(), export_queue_gen);
1122 }
1123
1124 if (results.empty()) {
1125 dout(7) << "subtree's children all are under exporting, retry rest parts of parent export "
1126 << parent->dirfrag << dendl;
1127 parent->restart = true;
1128 } else {
1129 dout(7) << "subtree is too large, splitting it into: " << dendl;
1130 }
1131
1132 for (auto& p : results) {
1133 CDir *sub = p.first;
11fdf7f2 1134 ceph_assert(sub != dir);
91327a77
AA
1135 dout(7) << " sub " << *sub << dendl;
1136
1137 sub->auth_pin(this);
1138 sub->mark_exporting();
1139
1140 MDRequestRef _mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
1141 _mdr->more()->export_dir = sub;
1142
11fdf7f2 1143 ceph_assert(export_state.count(sub) == 0);
91327a77
AA
1144 auto& stat = export_state[sub];
1145 num_locking_exports++;
1146 stat.state = EXPORT_LOCKING;
1147 stat.peer = dest;
1148 stat.tid = _mdr->reqid.tid;
1149 stat.mut = _mdr;
1150 stat.parent = parent;
1151 mds->mdcache->dispatch_request(_mdr);
1152 }
1153
1154 // cancel the original one
1155 export_try_cancel(dir);
1156}
1157
91327a77
AA
1158void Migrator::child_export_finish(std::shared_ptr<export_base_t>& parent, bool success)
1159{
1160 if (success)
1161 parent->restart = true;
1162 if (--parent->pending_children == 0) {
1163 if (parent->restart &&
1164 parent->export_queue_gen == export_queue_gen) {
1165 CDir *origin = mds->mdcache->get_dirfrag(parent->dirfrag);
1166 if (origin && origin->is_auth()) {
1167 dout(7) << "child_export_finish requeue " << *origin << dendl;
1168 export_queue.emplace_front(origin->dirfrag(), parent->dest);
1169 }
1170 }
1171 }
7c673cae
FG
1172}
1173
1174/*
1175 * called on receipt of MExportDirDiscoverAck
1176 * the importer now has the directory's _inode_ in memory, and pinned.
7c673cae 1177 */
11fdf7f2 1178void Migrator::handle_export_discover_ack(const MExportDirDiscoverAck::const_ref &m)
7c673cae
FG
1179{
1180 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1181 mds_rank_t dest(m->get_source().num());
11fdf7f2 1182 ceph_assert(dir);
7c673cae
FG
1183
1184 dout(7) << "export_discover_ack from " << m->get_source()
1185 << " on " << *dir << dendl;
1186
11fdf7f2 1187 mds->hit_export_target(dest, -1);
7c673cae
FG
1188
1189 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1190 if (it == export_state.end() ||
1191 it->second.tid != m->get_tid() ||
1192 it->second.peer != dest) {
1193 dout(7) << "must have aborted" << dendl;
1194 } else {
11fdf7f2 1195 ceph_assert(it->second.state == EXPORT_DISCOVERING);
c07f9fc5
FG
1196
1197 if (m->is_success()) {
1198 // release locks to avoid deadlock
1199 MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
11fdf7f2 1200 ceph_assert(mdr);
c07f9fc5
FG
1201 mds->mdcache->request_finish(mdr);
1202 it->second.mut.reset();
1203 // freeze the subtree
1204 it->second.state = EXPORT_FREEZING;
1205 dir->auth_unpin(this);
11fdf7f2 1206 ceph_assert(g_conf()->mds_kill_export_at != 3);
c07f9fc5
FG
1207
1208 } else {
1209 dout(7) << "peer failed to discover (not active?), canceling" << dendl;
1210 export_try_cancel(dir, false);
1211 }
7c673cae 1212 }
7c673cae
FG
1213}
1214
1215class C_M_ExportSessionsFlushed : public MigratorContext {
1216 CDir *dir;
1217 uint64_t tid;
1218public:
1219 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1220 : MigratorContext(m), dir(d), tid(t) {
11fdf7f2 1221 ceph_assert(dir != NULL);
7c673cae
FG
1222 }
1223 void finish(int r) override {
1224 mig->export_sessions_flushed(dir, tid);
1225 }
1226};
1227
1228void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
1229{
1230 dout(7) << "export_sessions_flushed " << *dir << dendl;
1231
1232 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1233 if (it == export_state.end() ||
1234 it->second.state == EXPORT_CANCELLING ||
1235 it->second.tid != tid) {
1236 // export must have aborted.
1237 dout(7) << "export must have aborted on " << dir << dendl;
1238 return;
1239 }
1240
11fdf7f2
TL
1241 ceph_assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
1242 ceph_assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
7c673cae
FG
1243 it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
1244 if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
1245 export_go(dir); // start export.
1246}
1247
1248void Migrator::export_frozen(CDir *dir, uint64_t tid)
1249{
1250 dout(7) << "export_frozen on " << *dir << dendl;
1251
1252 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1253 if (it == export_state.end() || it->second.tid != tid) {
1254 dout(7) << "export must have aborted" << dendl;
1255 return;
1256 }
1257
11fdf7f2
TL
1258 ceph_assert(it->second.state == EXPORT_FREEZING);
1259 ceph_assert(dir->is_frozen_tree_root());
7c673cae
FG
1260
1261 CInode *diri = dir->get_inode();
1262
1263 // ok, try to grab all my locks.
11fdf7f2
TL
1264 MutationImpl::LockOpVec lov;
1265 get_export_lock_set(dir, lov);
7c673cae 1266 if ((diri->is_auth() && diri->is_frozen()) ||
11fdf7f2 1267 !mds->locker->can_rdlock_set(lov) ||
7c673cae
FG
1268 !diri->filelock.can_wrlock(-1) ||
1269 !diri->nestlock.can_wrlock(-1)) {
1270 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1271 << *dir << dendl;
91327a77 1272 export_try_cancel(dir);
7c673cae
FG
1273 return;
1274 }
1275
1276 it->second.mut = new MutationImpl();
1277 if (diri->is_auth())
1278 it->second.mut->auth_pin(diri);
11fdf7f2 1279 mds->locker->rdlock_take_set(lov, it->second.mut);
7c673cae
FG
1280 mds->locker->wrlock_force(&diri->filelock, it->second.mut);
1281 mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
1282
1283 cache->show_subtrees();
1284
224ce89b 1285 // CDir::_freeze_tree() should have forced it into subtree.
11fdf7f2 1286 ceph_assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
7c673cae 1287 // note the bounds.
7c673cae
FG
1288 set<CDir*> bounds;
1289 cache->get_subtree_bounds(dir, bounds);
1290
1291 // generate prep message, log entry.
11fdf7f2 1292 auto prep = MExportDirPrep::create(dir->dirfrag(), it->second.tid);
7c673cae
FG
1293
1294 // include list of bystanders
181888fb
FG
1295 for (const auto &p : dir->get_replicas()) {
1296 if (p.first != it->second.peer) {
1297 dout(10) << "bystander mds." << p.first << dendl;
1298 prep->add_bystander(p.first);
7c673cae
FG
1299 }
1300 }
1301
1302 // include base dirfrag
1303 cache->replicate_dir(dir, it->second.peer, prep->basedir);
1304
1305 /*
1306 * include spanning tree for all nested exports.
1307 * these need to be on the destination _before_ the final export so that
1308 * dir_auth updates on any nested exports are properly absorbed.
1309 * this includes inodes and dirfrags included in the subtree, but
1310 * only the inodes at the bounds.
1311 *
1312 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1313 */
1314 set<inodeno_t> inodes_added;
1315 set<dirfrag_t> dirfrags_added;
1316
1317 // check bounds
1318 for (set<CDir*>::iterator p = bounds.begin();
1319 p != bounds.end();
1320 ++p) {
1321 CDir *bound = *p;
1322
1323 // pin it.
91327a77
AA
1324 bound->get(CDir::PIN_EXPORTBOUND);
1325 bound->state_set(CDir::STATE_EXPORTBOUND);
7c673cae
FG
1326
1327 dout(7) << " export bound " << *bound << dendl;
1328 prep->add_bound( bound->dirfrag() );
1329
1330 // trace to bound
1331 bufferlist tracebl;
1332 CDir *cur = bound;
b32b8144 1333
7c673cae
FG
1334 char start = '-';
1335 while (1) {
1336 // don't repeat inodes
1337 if (inodes_added.count(cur->inode->ino()))
1338 break;
1339 inodes_added.insert(cur->inode->ino());
1340
1341 // prepend dentry + inode
11fdf7f2 1342 ceph_assert(cur->inode->is_auth());
7c673cae
FG
1343 bufferlist bl;
1344 cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
1345 dout(7) << " added " << *cur->inode->parent << dendl;
1346 cache->replicate_inode(cur->inode, it->second.peer, bl,
1347 mds->mdsmap->get_up_features());
1348 dout(7) << " added " << *cur->inode << dendl;
1349 bl.claim_append(tracebl);
1350 tracebl.claim(bl);
1351
1352 cur = cur->get_parent_dir();
1353
1354 // don't repeat dirfrags
1355 if (dirfrags_added.count(cur->dirfrag()) ||
1356 cur == dir) {
1357 start = 'd'; // start with dentry
1358 break;
1359 }
1360 dirfrags_added.insert(cur->dirfrag());
1361
1362 // prepend dir
1363 cache->replicate_dir(cur, it->second.peer, bl);
1364 dout(7) << " added " << *cur << dendl;
1365 bl.claim_append(tracebl);
1366 tracebl.claim(bl);
1367
1368 start = 'f'; // start with dirfrag
1369 }
1370 bufferlist final_bl;
1371 dirfrag_t df = cur->dirfrag();
11fdf7f2
TL
1372 encode(df, final_bl);
1373 encode(start, final_bl);
7c673cae
FG
1374 final_bl.claim_append(tracebl);
1375 prep->add_trace(final_bl);
1376 }
1377
1378 // send.
1379 it->second.state = EXPORT_PREPPING;
1380 mds->send_message_mds(prep, it->second.peer);
11fdf7f2 1381 assert (g_conf()->mds_kill_export_at != 4);
7c673cae
FG
1382
1383 // make sure any new instantiations of caps are flushed out
11fdf7f2 1384 ceph_assert(it->second.warning_ack_waiting.empty());
7c673cae 1385
91327a77
AA
1386 set<client_t> export_client_set;
1387 get_export_client_set(dir, export_client_set);
1388
7c673cae
FG
1389 MDSGatherBuilder gather(g_ceph_context);
1390 mds->server->flush_client_sessions(export_client_set, gather);
1391 if (gather.has_subs()) {
1392 it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
1393 gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
1394 gather.activate();
1395 }
1396}
1397
91327a77 1398void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
7c673cae 1399{
91327a77 1400 deque<CDir*> dfs;
7c673cae
FG
1401 dfs.push_back(dir);
1402 while (!dfs.empty()) {
1403 CDir *dir = dfs.front();
1404 dfs.pop_front();
91327a77 1405 for (auto& p : *dir) {
94b18763 1406 CDentry *dn = p.second;
91327a77 1407 if (!dn->get_linkage()->is_primary())
7c673cae
FG
1408 continue;
1409 CInode *in = dn->get_linkage()->get_inode();
1410 if (in->is_dir()) {
1411 // directory?
91327a77 1412 vector<CDir*> ls;
7c673cae 1413 in->get_dirfrags(ls);
91327a77
AA
1414 for (auto& q : ls) {
1415 if (!q->state_test(CDir::STATE_EXPORTBOUND)) {
7c673cae 1416 // include nested dirfrag
11fdf7f2 1417 ceph_assert(q->get_dir_auth().first == CDIR_AUTH_PARENT);
91327a77 1418 dfs.push_back(q); // it's ours, recurse (later)
7c673cae
FG
1419 }
1420 }
1421 }
91327a77
AA
1422 for (auto& q : in->get_client_caps()) {
1423 client_set.insert(q.first);
b32b8144 1424 }
7c673cae
FG
1425 }
1426 }
1427}
1428
1429void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1430{
11fdf7f2
TL
1431 for (const auto &p : in->get_client_caps()) {
1432 client_set.insert(p.first);
1433 }
7c673cae
FG
1434}
1435
11fdf7f2 1436void Migrator::handle_export_prep_ack(const MExportDirPrepAck::const_ref &m)
7c673cae
FG
1437{
1438 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1439 mds_rank_t dest(m->get_source().num());
11fdf7f2 1440 ceph_assert(dir);
7c673cae
FG
1441
1442 dout(7) << "export_prep_ack " << *dir << dendl;
1443
11fdf7f2 1444 mds->hit_export_target(dest, -1);
7c673cae
FG
1445
1446 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1447 if (it == export_state.end() ||
1448 it->second.tid != m->get_tid() ||
1449 it->second.peer != mds_rank_t(m->get_source().num())) {
1450 // export must have aborted.
1451 dout(7) << "export must have aborted" << dendl;
7c673cae
FG
1452 return;
1453 }
11fdf7f2 1454 ceph_assert(it->second.state == EXPORT_PREPPING);
7c673cae
FG
1455
1456 if (!m->is_success()) {
c07f9fc5 1457 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
7c673cae 1458 export_try_cancel(dir, false);
7c673cae
FG
1459 return;
1460 }
1461
11fdf7f2 1462 assert (g_conf()->mds_kill_export_at != 5);
7c673cae
FG
1463 // send warnings
1464 set<CDir*> bounds;
1465 cache->get_subtree_bounds(dir, bounds);
1466
11fdf7f2 1467 ceph_assert(it->second.warning_ack_waiting.empty() ||
7c673cae
FG
1468 (it->second.warning_ack_waiting.size() == 1 &&
1469 it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
11fdf7f2 1470 ceph_assert(it->second.notify_ack_waiting.empty());
7c673cae 1471
181888fb
FG
1472 for (const auto &p : dir->get_replicas()) {
1473 if (p.first == it->second.peer) continue;
7c673cae 1474 if (mds->is_cluster_degraded() &&
181888fb 1475 !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
7c673cae 1476 continue; // only if active
181888fb
FG
1477 it->second.warning_ack_waiting.insert(p.first);
1478 it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too!
7c673cae 1479
11fdf7f2
TL
1480 auto notify = MExportDirNotify::create(dir->dirfrag(), it->second.tid, true,
1481 mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1482 mds_authority_t(mds->get_nodeid(),it->second.peer));
1483 for (auto &cdir : bounds) {
1484 notify->get_bounds().push_back(cdir->dirfrag());
1485 }
181888fb 1486 mds->send_message_mds(notify, p.first);
7c673cae
FG
1487
1488 }
1489
1490 it->second.state = EXPORT_WARNING;
1491
11fdf7f2 1492 ceph_assert(g_conf()->mds_kill_export_at != 6);
7c673cae
FG
1493 // nobody to warn?
1494 if (it->second.warning_ack_waiting.empty())
1495 export_go(dir); // start export.
7c673cae
FG
1496}
1497
1498
1499class C_M_ExportGo : public MigratorContext {
1500 CDir *dir;
1501 uint64_t tid;
1502public:
1503 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1504 MigratorContext(m), dir(d), tid(t) {
11fdf7f2 1505 ceph_assert(dir != NULL);
7c673cae
FG
1506 }
1507 void finish(int r) override {
1508 mig->export_go_synced(dir, tid);
1509 }
1510};
1511
1512void Migrator::export_go(CDir *dir)
1513{
b32b8144 1514 auto it = export_state.find(dir);
11fdf7f2 1515 ceph_assert(it != export_state.end());
b32b8144 1516 dout(7) << "export_go " << *dir << " to " << it->second.peer << dendl;
7c673cae
FG
1517
1518 // first sync log to flush out e.g. any cap imports
b32b8144 1519 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, it->second.tid));
7c673cae
FG
1520 mds->mdlog->flush();
1521}
1522
1523void Migrator::export_go_synced(CDir *dir, uint64_t tid)
1524{
1525 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1526 if (it == export_state.end() ||
1527 it->second.state == EXPORT_CANCELLING ||
1528 it->second.tid != tid) {
1529 // export must have aborted.
1530 dout(7) << "export must have aborted on " << dir << dendl;
1531 return;
1532 }
11fdf7f2 1533 ceph_assert(it->second.state == EXPORT_WARNING);
7c673cae
FG
1534 mds_rank_t dest = it->second.peer;
1535
1536 dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
1537
1538 cache->show_subtrees();
1539
1540 it->second.state = EXPORT_EXPORTING;
11fdf7f2 1541 ceph_assert(g_conf()->mds_kill_export_at != 7);
7c673cae 1542
11fdf7f2 1543 ceph_assert(dir->is_frozen_tree_root());
7c673cae
FG
1544
1545 // set ambiguous auth
1546 cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
1547
1548 // take away the popularity we're sending.
11fdf7f2 1549 mds->balancer->subtract_export(dir);
7c673cae
FG
1550
1551 // fill export message with cache data
11fdf7f2 1552 auto req = MExportDir::create(dir->dirfrag(), it->second.tid);
7c673cae 1553 map<client_t,entity_inst_t> exported_client_map;
11fdf7f2 1554 map<client_t,client_metadata_t> exported_client_metadata_map;
7c673cae
FG
1555 uint64_t num_exported_inodes = encode_export_dir(req->export_data,
1556 dir, // recur start point
1557 exported_client_map,
11fdf7f2
TL
1558 exported_client_metadata_map);
1559 encode(exported_client_map, req->client_map, mds->mdsmap->get_up_features());
1560 encode(exported_client_metadata_map, req->client_map);
7c673cae
FG
1561
1562 // add bounds to message
1563 set<CDir*> bounds;
1564 cache->get_subtree_bounds(dir, bounds);
1565 for (set<CDir*>::iterator p = bounds.begin();
1566 p != bounds.end();
1567 ++p)
1568 req->add_export((*p)->dirfrag());
1569
1570 // send
1571 mds->send_message_mds(req, dest);
11fdf7f2 1572 ceph_assert(g_conf()->mds_kill_export_at != 8);
7c673cae 1573
11fdf7f2 1574 mds->hit_export_target(dest, num_exported_inodes+1);
7c673cae
FG
1575
1576 // stats
1577 if (mds->logger) mds->logger->inc(l_mds_exported);
1578 if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
1579
1580 cache->show_subtrees();
1581}
1582
1583
1584/** encode_export_inode
1585 * update our local state for this inode to export.
1586 * encode relevant state to be sent over the wire.
1587 * used by: encode_export_dir, file_rename (if foreign)
1588 *
1589 * FIXME: the separation between CInode.encode_export and these methods
1590 * is pretty arbitrary and dumb.
1591 */
1592void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
11fdf7f2
TL
1593 map<client_t,entity_inst_t>& exported_client_map,
1594 map<client_t,client_metadata_t>& exported_client_metadata_map)
7c673cae
FG
1595{
1596 dout(7) << "encode_export_inode " << *in << dendl;
11fdf7f2 1597 ceph_assert(!in->is_replica(mds->get_nodeid()));
7c673cae 1598
11fdf7f2
TL
1599 encode(in->inode.ino, enc_state);
1600 encode(in->last, enc_state);
7c673cae
FG
1601 in->encode_export(enc_state);
1602
1603 // caps
11fdf7f2 1604 encode_export_inode_caps(in, true, enc_state, exported_client_map, exported_client_metadata_map);
7c673cae
FG
1605}
1606
1607void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
11fdf7f2
TL
1608 map<client_t,entity_inst_t>& exported_client_map,
1609 map<client_t,client_metadata_t>& exported_client_metadata_map)
7c673cae
FG
1610{
1611 dout(20) << "encode_export_inode_caps " << *in << dendl;
1612
1613 // encode caps
1614 map<client_t,Capability::Export> cap_map;
1615 in->export_client_caps(cap_map);
11fdf7f2 1616 encode(cap_map, bl);
7c673cae 1617 if (auth_cap) {
11fdf7f2 1618 encode(in->get_mds_caps_wanted(), bl);
7c673cae
FG
1619
1620 in->state_set(CInode::STATE_EXPORTINGCAPS);
1621 in->get(CInode::PIN_EXPORTINGCAPS);
1622 }
1623
1624 // make note of clients named by exported capabilities
11fdf7f2
TL
1625 for (const auto &p : in->get_client_caps()) {
1626 if (exported_client_map.count(p.first))
1627 continue;
1628 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
1629 exported_client_map[p.first] = session->info.inst;
1630 exported_client_metadata_map[p.first] = session->info.client_metadata;
1631 }
7c673cae
FG
1632}
1633
1634void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1635 map<client_t,Capability::Import>& peer_imported)
1636{
1637 dout(20) << "finish_export_inode_caps " << *in << dendl;
1638
1639 in->state_clear(CInode::STATE_EXPORTINGCAPS);
1640 in->put(CInode::PIN_EXPORTINGCAPS);
1641
1642 // tell (all) clients about migrating caps..
11fdf7f2
TL
1643 for (const auto &p : in->get_client_caps()) {
1644 const Capability *cap = &p.second;
1645 dout(7) << "finish_export_inode_caps telling client." << p.first
7c673cae 1646 << " exported caps on " << *in << dendl;
11fdf7f2 1647 auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0,
7c673cae
FG
1648 cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
1649
11fdf7f2
TL
1650 map<client_t,Capability::Import>::iterator q = peer_imported.find(p.first);
1651 ceph_assert(q != peer_imported.end());
28e407b8
AA
1652 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
1653 (q->second.cap_id > 0 ? peer : -1), 0);
11fdf7f2 1654 mds->send_message_client_counted(m, p.first);
7c673cae
FG
1655 }
1656 in->clear_client_caps_after_export();
1657 mds->locker->eval(in, CEPH_CAP_LOCKS);
1658}
1659
11fdf7f2 1660void Migrator::finish_export_inode(CInode *in, mds_rank_t peer,
7c673cae 1661 map<client_t,Capability::Import>& peer_imported,
11fdf7f2 1662 MDSContext::vec& finished)
7c673cae
FG
1663{
1664 dout(12) << "finish_export_inode " << *in << dendl;
1665
1666 // clean
1667 if (in->is_dirty())
1668 in->mark_clean();
1669
1670 // clear/unpin cached_by (we're no longer the authority)
1671 in->clear_replica_map();
1672
1673 // twiddle lock states for auth -> replica transition
1674 in->authlock.export_twiddle();
1675 in->linklock.export_twiddle();
1676 in->dirfragtreelock.export_twiddle();
1677 in->filelock.export_twiddle();
1678 in->nestlock.export_twiddle();
1679 in->xattrlock.export_twiddle();
1680 in->snaplock.export_twiddle();
1681 in->flocklock.export_twiddle();
1682 in->policylock.export_twiddle();
1683
1684 // mark auth
11fdf7f2 1685 ceph_assert(in->is_auth());
7c673cae
FG
1686 in->state_clear(CInode::STATE_AUTH);
1687 in->replica_nonce = CInode::EXPORT_NONCE;
1688
1689 in->clear_dirty_rstat();
1690
1691 // no more auth subtree? clear scatter dirty
1692 if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
1693 in->clear_scatter_dirty();
1694
7c673cae
FG
1695 in->clear_dirty_parent();
1696
1697 in->clear_file_locks();
1698
1699 // waiters
1700 in->take_waiting(CInode::WAIT_ANY_MASK, finished);
1701
11fdf7f2 1702 in->finish_export();
7c673cae
FG
1703
1704 finish_export_inode_caps(in, peer, peer_imported);
7c673cae
FG
1705}
1706
1707uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
1708 CDir *dir,
1709 map<client_t,entity_inst_t>& exported_client_map,
11fdf7f2 1710 map<client_t,client_metadata_t>& exported_client_metadata_map)
7c673cae
FG
1711{
1712 uint64_t num_exported = 0;
1713
1714 dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
1715
11fdf7f2 1716 ceph_assert(dir->get_projected_version() == dir->get_version());
7c673cae
FG
1717
1718#ifdef MDS_VERIFY_FRAGSTAT
1719 if (dir->is_complete())
1720 dir->verify_fragstat();
1721#endif
1722
1723 // dir
1724 dirfrag_t df = dir->dirfrag();
11fdf7f2 1725 encode(df, exportbl);
7c673cae
FG
1726 dir->encode_export(exportbl);
1727
1728 __u32 nden = dir->items.size();
11fdf7f2 1729 encode(nden, exportbl);
7c673cae
FG
1730
1731 // dentries
1732 list<CDir*> subdirs;
94b18763
FG
1733 for (auto &p : *dir) {
1734 CDentry *dn = p.second;
7c673cae 1735 CInode *in = dn->get_linkage()->get_inode();
7c673cae
FG
1736
1737 num_exported++;
1738
1739 // -- dentry
1740 dout(7) << "encode_export_dir exporting " << *dn << dendl;
1741
1742 // dn name
11fdf7f2
TL
1743 encode(dn->get_name(), exportbl);
1744 encode(dn->last, exportbl);
7c673cae
FG
1745
1746 // state
1747 dn->encode_export(exportbl);
1748
1749 // points to...
1750
1751 // null dentry?
1752 if (dn->get_linkage()->is_null()) {
1753 exportbl.append("N", 1); // null dentry
1754 continue;
1755 }
1756
1757 if (dn->get_linkage()->is_remote()) {
1758 // remote link
1759 exportbl.append("L", 1); // remote link
1760
1761 inodeno_t ino = dn->get_linkage()->get_remote_ino();
1762 unsigned char d_type = dn->get_linkage()->get_remote_d_type();
11fdf7f2
TL
1763 encode(ino, exportbl);
1764 encode(d_type, exportbl);
7c673cae
FG
1765 continue;
1766 }
1767
1768 // primary link
1769 // -- inode
1770 exportbl.append("I", 1); // inode dentry
1771
11fdf7f2 1772 encode_export_inode(in, exportbl, exported_client_map, exported_client_metadata_map); // encode, and (update state for) export
7c673cae
FG
1773
1774 // directory?
1775 list<CDir*> dfs;
1776 in->get_dirfrags(dfs);
1777 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
1778 CDir *t = *p;
1779 if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
1780 // include nested dirfrag
11fdf7f2 1781 ceph_assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
b32b8144 1782 subdirs.push_front(t); // it's ours, recurse (later)
7c673cae
FG
1783 }
1784 }
1785 }
1786
1787 // subdirs
94b18763 1788 for (auto &dir : subdirs)
11fdf7f2 1789 num_exported += encode_export_dir(exportbl, dir, exported_client_map, exported_client_metadata_map);
7c673cae
FG
1790
1791 return num_exported;
1792}
1793
11fdf7f2 1794void Migrator::finish_export_dir(CDir *dir, mds_rank_t peer,
7c673cae 1795 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
11fdf7f2 1796 MDSContext::vec& finished, int *num_dentries)
7c673cae
FG
1797{
1798 dout(10) << "finish_export_dir " << *dir << dendl;
1799
1800 // release open_by
1801 dir->clear_replica_map();
1802
1803 // mark
11fdf7f2 1804 ceph_assert(dir->is_auth());
7c673cae
FG
1805 dir->state_clear(CDir::STATE_AUTH);
1806 dir->remove_bloom();
1807 dir->replica_nonce = CDir::EXPORT_NONCE;
1808
1809 if (dir->is_dirty())
1810 dir->mark_clean();
1811
1812 // suck up all waiters
1813 dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters
1814
1815 // pop
11fdf7f2 1816 dir->finish_export();
7c673cae
FG
1817
1818 // dentries
1819 list<CDir*> subdirs;
94b18763
FG
1820 for (auto &p : *dir) {
1821 CDentry *dn = p.second;
7c673cae
FG
1822 CInode *in = dn->get_linkage()->get_inode();
1823
1824 // dentry
1825 dn->finish_export();
1826
1827 // inode?
1828 if (dn->get_linkage()->is_primary()) {
11fdf7f2 1829 finish_export_inode(in, peer, peer_imported[in->ino()], finished);
7c673cae
FG
1830
1831 // subdirs?
1832 in->get_nested_dirfrags(subdirs);
1833 }
1834
1835 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
1836 ++(*num_dentries);
1837 }
1838
1839 // subdirs
1840 for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
11fdf7f2 1841 finish_export_dir(*it, peer, peer_imported, finished, num_dentries);
7c673cae
FG
1842}
1843
1844class C_MDS_ExportFinishLogged : public MigratorLogContext {
1845 CDir *dir;
1846public:
1847 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1848 void finish(int r) override {
1849 mig->export_logged_finish(dir);
1850 }
1851};
1852
1853
1854/*
1855 * i should get an export_ack from the export target.
7c673cae 1856 */
11fdf7f2 1857void Migrator::handle_export_ack(const MExportDirAck::const_ref &m)
7c673cae
FG
1858{
1859 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1860 mds_rank_t dest(m->get_source().num());
11fdf7f2
TL
1861 ceph_assert(dir);
1862 ceph_assert(dir->is_frozen_tree_root()); // i'm exporting!
7c673cae
FG
1863
1864 // yay!
1865 dout(7) << "handle_export_ack " << *dir << dendl;
1866
11fdf7f2 1867 mds->hit_export_target(dest, -1);
7c673cae
FG
1868
1869 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
11fdf7f2
TL
1870 ceph_assert(it != export_state.end());
1871 ceph_assert(it->second.state == EXPORT_EXPORTING);
1872 ceph_assert(it->second.tid == m->get_tid());
7c673cae 1873
11fdf7f2
TL
1874 auto bp = m->imported_caps.cbegin();
1875 decode(it->second.peer_imported, bp);
7c673cae
FG
1876
1877 it->second.state = EXPORT_LOGGINGFINISH;
11fdf7f2 1878 assert (g_conf()->mds_kill_export_at != 9);
7c673cae
FG
1879 set<CDir*> bounds;
1880 cache->get_subtree_bounds(dir, bounds);
1881
7c673cae
FG
1882 // log completion.
1883 // include export bounds, to ensure they're in the journal.
31f18b77 1884 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
7c673cae
FG
1885 mds->mdlog->start_entry(le);
1886
1887 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
31f18b77 1888 le->metablob.add_dir(dir, false);
7c673cae
FG
1889 for (set<CDir*>::iterator p = bounds.begin();
1890 p != bounds.end();
1891 ++p) {
1892 CDir *bound = *p;
1893 le->get_bounds().insert(bound->dirfrag());
1894 le->metablob.add_dir_context(bound);
1895 le->metablob.add_dir(bound, false);
1896 }
1897
31f18b77
FG
1898 // list us second, them first.
1899 // this keeps authority().first in sync with subtree auth state in the journal.
1900 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1901
7c673cae
FG
1902 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1903 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1904 mds->mdlog->flush();
11fdf7f2 1905 assert (g_conf()->mds_kill_export_at != 10);
7c673cae
FG
1906}
1907
b32b8144 1908void Migrator::export_notify_abort(CDir *dir, export_state_t& stat, set<CDir*>& bounds)
7c673cae
FG
1909{
1910 dout(7) << "export_notify_abort " << *dir << dendl;
1911
11fdf7f2 1912 ceph_assert(stat.state == EXPORT_CANCELLING);
7c673cae
FG
1913
1914 if (stat.notify_ack_waiting.empty()) {
1915 stat.state = EXPORT_CANCELLED;
1916 return;
1917 }
1918
1919 dir->auth_pin(this);
1920
1921 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1922 p != stat.notify_ack_waiting.end();
1923 ++p) {
11fdf7f2
TL
1924 auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
1925 pair<int,int>(mds->get_nodeid(), stat.peer),
1926 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
7c673cae
FG
1927 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1928 notify->get_bounds().push_back((*i)->dirfrag());
1929 mds->send_message_mds(notify, *p);
1930 }
1931}
1932
/*
 * this happens if the dest fails after i send the export data but before it is acked
 * that is, we don't know they safely received and logged it, so we reverse our changes
 * and go on.
 */
b32b8144 1938void Migrator::export_reverse(CDir *dir, export_state_t& stat)
7c673cae
FG
1939{
1940 dout(7) << "export_reverse " << *dir << dendl;
1941
1942 set<CInode*> to_eval;
1943
1944 set<CDir*> bounds;
1945 cache->get_subtree_bounds(dir, bounds);
1946
1947 // remove exporting pins
1948 list<CDir*> rq;
1949 rq.push_back(dir);
1950 while (!rq.empty()) {
1951 CDir *t = rq.front();
1952 rq.pop_front();
1953 t->abort_export();
94b18763
FG
1954 for (auto &p : *t) {
1955 CDentry *dn = p.second;
1956 dn->abort_export();
1957 if (!dn->get_linkage()->is_primary())
7c673cae 1958 continue;
94b18763 1959 CInode *in = dn->get_linkage()->get_inode();
7c673cae
FG
1960 in->abort_export();
1961 if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
1962 in->state_clear(CInode::STATE_EVALSTALECAPS);
1963 to_eval.insert(in);
1964 }
1965 if (in->is_dir())
1966 in->get_nested_dirfrags(rq);
1967 }
1968 }
1969
1970 // unpin bounds
b32b8144 1971 for (auto bd : bounds) {
7c673cae
FG
1972 bd->put(CDir::PIN_EXPORTBOUND);
1973 bd->state_clear(CDir::STATE_EXPORTBOUND);
1974 }
1975
7c673cae 1976 // notify bystanders
b32b8144 1977 export_notify_abort(dir, stat, bounds);
7c673cae 1978
224ce89b
WB
1979 // unfreeze tree, with possible subtree merge.
1980 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
1981
7c673cae
FG
1982 // process delayed expires
1983 cache->process_delayed_expire(dir);
224ce89b 1984
7c673cae 1985 dir->unfreeze_tree();
224ce89b 1986 cache->try_subtree_merge(dir);
7c673cae
FG
1987
1988 // revoke/resume stale caps
1989 for (auto in : to_eval) {
1990 bool need_issue = false;
11fdf7f2
TL
1991 for (auto &p : in->client_caps) {
1992 Capability *cap = &p.second;
a8e16298 1993 if (!cap->is_stale()) {
7c673cae 1994 need_issue = true;
a8e16298 1995 break;
7c673cae
FG
1996 }
1997 }
1998 if (need_issue &&
1999 (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
2000 mds->locker->issue_caps(in);
2001 }
2002
2003 cache->show_cache();
2004}
2005
2006
2007/*
2008 * once i get the ack, and logged the EExportFinish(true),
2009 * send notifies (if any), otherwise go straight to finish.
2010 *
2011 */
2012void Migrator::export_logged_finish(CDir *dir)
2013{
2014 dout(7) << "export_logged_finish " << *dir << dendl;
2015
2016 export_state_t& stat = export_state[dir];
2017
2018 // send notifies
2019 set<CDir*> bounds;
2020 cache->get_subtree_bounds(dir, bounds);
2021
2022 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
2023 p != stat.notify_ack_waiting.end();
2024 ++p) {
11fdf7f2
TL
2025 auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
2026 pair<int,int>(mds->get_nodeid(), stat.peer),
2027 pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
7c673cae
FG
2028
2029 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2030 notify->get_bounds().push_back((*i)->dirfrag());
2031
2032 mds->send_message_mds(notify, *p);
2033 }
2034
2035 // wait for notifyacks
2036 stat.state = EXPORT_NOTIFYING;
11fdf7f2 2037 assert (g_conf()->mds_kill_export_at != 11);
7c673cae
FG
2038
2039 // no notifies to wait for?
2040 if (stat.notify_ack_waiting.empty()) {
2041 export_finish(dir); // skip notify/notify_ack stage.
2042 } else {
2043 // notify peer to send cap import messages to clients
2044 if (!mds->is_cluster_degraded() ||
2045 mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
11fdf7f2 2046 mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), false, stat.tid), stat.peer);
7c673cae
FG
2047 } else {
2048 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
2049 }
2050 }
2051}
2052
2053/*
2054 * warning:
2055 * i'll get an ack from each bystander.
2056 * when i get them all, do the export.
2057 * notify:
2058 * i'll get an ack from each bystander.
2059 * when i get them all, unfreeze and send the finish.
7c673cae 2060 */
11fdf7f2 2061void Migrator::handle_export_notify_ack(const MExportDirNotifyAck::const_ref &m)
7c673cae
FG
2062{
2063 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
2064 mds_rank_t dest(m->get_source().num());
11fdf7f2 2065 ceph_assert(dir);
7c673cae
FG
2066 mds_rank_t from = mds_rank_t(m->get_source().num());
2067
11fdf7f2 2068 mds->hit_export_target(dest, -1);
7c673cae
FG
2069
2070 auto export_state_entry = export_state.find(dir);
2071 if (export_state_entry != export_state.end()) {
2072 export_state_t& stat = export_state_entry->second;
2073 if (stat.state == EXPORT_WARNING &&
2074 stat.warning_ack_waiting.erase(from)) {
2075 // exporting. process warning.
2076 dout(7) << "handle_export_notify_ack from " << m->get_source()
2077 << ": exporting, processing warning on " << *dir << dendl;
2078 if (stat.warning_ack_waiting.empty())
2079 export_go(dir); // start export.
2080 } else if (stat.state == EXPORT_NOTIFYING &&
2081 stat.notify_ack_waiting.erase(from)) {
2082 // exporting. process notify.
2083 dout(7) << "handle_export_notify_ack from " << m->get_source()
2084 << ": exporting, processing notify on " << *dir << dendl;
2085 if (stat.notify_ack_waiting.empty())
2086 export_finish(dir);
2087 } else if (stat.state == EXPORT_CANCELLING &&
2088 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
2089 stat.notify_ack_waiting.erase(from)) {
2090 dout(7) << "handle_export_notify_ack from " << m->get_source()
2091 << ": cancelling export, processing notify on " << *dir << dendl;
2092 if (stat.notify_ack_waiting.empty()) {
91327a77 2093 export_cancel_finish(export_state_entry);
7c673cae
FG
2094 }
2095 }
2096 }
2097 else {
2098 auto import_state_entry = import_state.find(dir->dirfrag());
2099 if (import_state_entry != import_state.end()) {
2100 import_state_t& stat = import_state_entry->second;
2101 if (stat.state == IMPORT_ABORTING) {
2102 // reversing import
2103 dout(7) << "handle_export_notify_ack from " << m->get_source()
2104 << ": aborting import on " << *dir << dendl;
11fdf7f2 2105 ceph_assert(stat.bystanders.count(from));
7c673cae
FG
2106 stat.bystanders.erase(from);
2107 if (stat.bystanders.empty())
2108 import_reverse_unfreeze(dir);
2109 }
2110 }
2111 }
7c673cae
FG
2112}
2113
2114void Migrator::export_finish(CDir *dir)
2115{
2116 dout(5) << "export_finish " << *dir << dendl;
2117
11fdf7f2 2118 assert (g_conf()->mds_kill_export_at != 12);
7c673cae
FG
2119 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
2120 if (it == export_state.end()) {
2121 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
2122 return;
2123 }
2124
2125 // send finish/commit to new auth
2126 if (!mds->is_cluster_degraded() ||
2127 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
11fdf7f2 2128 mds->send_message_mds(MExportDirFinish::create(dir->dirfrag(), true, it->second.tid), it->second.peer);
7c673cae
FG
2129 } else {
2130 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
2131 }
11fdf7f2 2132 ceph_assert(g_conf()->mds_kill_export_at != 13);
7c673cae
FG
2133
2134 // finish export (adjust local cache state)
2135 int num_dentries = 0;
11fdf7f2
TL
2136 MDSContext::vec finished;
2137 finish_export_dir(dir, it->second.peer,
224ce89b
WB
2138 it->second.peer_imported, finished, &num_dentries);
2139
11fdf7f2 2140 ceph_assert(!dir->is_auth());
224ce89b
WB
2141 cache->adjust_subtree_auth(dir, it->second.peer);
2142
7c673cae
FG
2143 // unpin bounds
2144 set<CDir*> bounds;
2145 cache->get_subtree_bounds(dir, bounds);
2146 for (set<CDir*>::iterator p = bounds.begin();
2147 p != bounds.end();
2148 ++p) {
2149 CDir *bd = *p;
2150 bd->put(CDir::PIN_EXPORTBOUND);
2151 bd->state_clear(CDir::STATE_EXPORTBOUND);
2152 }
2153
2154 if (dir->state_test(CDir::STATE_AUXSUBTREE))
2155 dir->state_clear(CDir::STATE_AUXSUBTREE);
2156
224ce89b
WB
2157 // discard delayed expires
2158 cache->discard_delayed_expire(dir);
2159
2160 dout(7) << "export_finish unfreezing" << dendl;
2161
2162 // unfreeze tree, with possible subtree merge.
7c673cae 2163 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
224ce89b 2164 dir->unfreeze_tree();
7c673cae
FG
2165 cache->try_subtree_merge(dir);
2166
2167 // no more auth subtree? clear scatter dirty
2168 if (!dir->get_inode()->is_auth() &&
2169 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2170 dir->get_inode()->clear_scatter_dirty();
2171 // wake up scatter_nudge waiters
224ce89b 2172 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
7c673cae
FG
2173 }
2174
224ce89b
WB
2175 if (!finished.empty())
2176 mds->queue_waiters(finished);
7c673cae 2177
91327a77
AA
2178 MutationRef mut = std::move(it->second.mut);
2179 auto parent = std::move(it->second.parent);
7c673cae 2180 // remove from exporting list, clean up state
91327a77 2181 total_exporting_size -= it->second.approx_size;
7c673cae 2182 export_state.erase(it);
91327a77 2183
11fdf7f2 2184 ceph_assert(dir->state_test(CDir::STATE_EXPORTING));
1adf2230 2185 dir->clear_exporting();
7c673cae
FG
2186
2187 cache->show_subtrees();
2188 audit();
2189
181888fb 2190 cache->trim(num_dentries); // try trimming exported dentries
7c673cae
FG
2191
2192 // send pending import_maps?
2193 mds->mdcache->maybe_send_pending_resolves();
2194
2195 // drop locks, unpin path
2196 if (mut) {
2197 mds->locker->drop_locks(mut.get());
2198 mut->cleanup();
2199 }
91327a77
AA
2200
2201 if (parent)
2202 child_export_finish(parent, true);
2203
7c673cae
FG
2204 maybe_do_queued_export();
2205}
2206
2207
2208
11fdf7f2
TL
2209class C_MDS_ExportDiscover : public MigratorContext {
2210public:
2211 C_MDS_ExportDiscover(Migrator *mig, const MExportDirDiscover::const_ref& m) : MigratorContext(mig), m(m) {}
2212 void finish(int r) override {
2213 mig->handle_export_discover(m, true);
2214 }
2215private:
2216 MExportDirDiscover::const_ref m;
2217};
7c673cae 2218
11fdf7f2
TL
2219class C_MDS_ExportDiscoverFactory : public MDSContextFactory {
2220public:
2221 C_MDS_ExportDiscoverFactory(Migrator *mig, MExportDirDiscover::const_ref m) : mig(mig), m(m) {}
2222 MDSContext *build() {
2223 return new C_MDS_ExportDiscover(mig, m);
2224 }
2225private:
2226 Migrator *mig;
2227 MExportDirDiscover::const_ref m;
2228};
7c673cae
FG
2229
2230// ==========================================================
2231// IMPORT
2232
11fdf7f2 2233void Migrator::handle_export_discover(const MExportDirDiscover::const_ref &m, bool started)
7c673cae
FG
2234{
2235 mds_rank_t from = m->get_source_mds();
11fdf7f2 2236 ceph_assert(from != mds->get_nodeid());
7c673cae
FG
2237
2238 dout(7) << "handle_export_discover on " << m->get_path() << dendl;
2239
2240 // note import state
2241 dirfrag_t df = m->get_dirfrag();
c07f9fc5
FG
2242
2243 if (!mds->is_active()) {
2244 dout(7) << " not active, send NACK " << dendl;
11fdf7f2 2245 mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid(), false), from);
c07f9fc5
FG
2246 return;
2247 }
2248
7c673cae 2249 // only start discovering on this message once.
b32b8144 2250 import_state_t *p_state;
7c673cae 2251 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
11fdf7f2
TL
2252 if (!started) {
2253 ceph_assert(it == import_state.end());
b32b8144
FG
2254 p_state = &import_state[df];
2255 p_state->state = IMPORT_DISCOVERING;
2256 p_state->peer = from;
2257 p_state->tid = m->get_tid();
7c673cae
FG
2258 } else {
2259 // am i retrying after ancient path_traverse results?
2260 if (it == import_state.end() ||
2261 it->second.peer != from ||
2262 it->second.tid != m->get_tid()) {
2263 dout(7) << " dropping obsolete message" << dendl;
7c673cae
FG
2264 return;
2265 }
11fdf7f2 2266 ceph_assert(it->second.state == IMPORT_DISCOVERING);
b32b8144 2267 p_state = &it->second;
7c673cae
FG
2268 }
2269
11fdf7f2 2270 C_MDS_ExportDiscoverFactory cf(this, m);
7c673cae
FG
2271 if (!mds->mdcache->is_open()) {
2272 dout(5) << " waiting for root" << dendl;
11fdf7f2 2273 mds->mdcache->wait_for_open(cf.build());
7c673cae
FG
2274 return;
2275 }
2276
11fdf7f2 2277 assert (g_conf()->mds_kill_import_at != 1);
7c673cae
FG
2278
2279 // do we have it?
2280 CInode *in = cache->get_inode(m->get_dirfrag().ino);
2281 if (!in) {
2282 // must discover it!
2283 filepath fpath(m->get_path());
2284 vector<CDentry*> trace;
2285 MDRequestRef null_ref;
11fdf7f2 2286 int r = cache->path_traverse(null_ref, cf, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
7c673cae
FG
2287 if (r > 0) return;
2288 if (r < 0) {
91327a77 2289 dout(7) << "handle_export_discover failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
7c673cae
FG
2290 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2291 }
2292
2293 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2294 }
2295
2296 // yay
2297 dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
2298
b32b8144 2299 p_state->state = IMPORT_DISCOVERED;
7c673cae
FG
2300
2301 // pin inode in the cache (for now)
11fdf7f2 2302 ceph_assert(in->is_dir());
7c673cae
FG
2303 in->get(CInode::PIN_IMPORTING);
2304
2305 // reply
2306 dout(7) << " sending export_discover_ack on " << *in << dendl;
11fdf7f2
TL
2307 mds->send_message_mds(MExportDirDiscoverAck::create(df, m->get_tid()), p_state->peer);
2308 assert (g_conf()->mds_kill_import_at != 2);
7c673cae
FG
2309}
2310
2311void Migrator::import_reverse_discovering(dirfrag_t df)
2312{
2313 import_state.erase(df);
2314}
2315
2316void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
2317{
2318 // unpin base
2319 diri->put(CInode::PIN_IMPORTING);
2320 import_state.erase(df);
2321}
2322
b32b8144 2323void Migrator::import_reverse_prepping(CDir *dir, import_state_t& stat)
7c673cae
FG
2324{
2325 set<CDir*> bounds;
b32b8144 2326 cache->map_dirfrag_set(stat.bound_ls, bounds);
7c673cae
FG
2327 import_remove_pins(dir, bounds);
2328 import_reverse_final(dir);
2329}
2330
11fdf7f2 2331void Migrator::handle_export_cancel(const MExportDirCancel::const_ref &m)
7c673cae
FG
2332{
2333 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2334 dirfrag_t df = m->get_dirfrag();
2335 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2336 if (it == import_state.end()) {
11fdf7f2 2337 ceph_abort_msg("got export_cancel in weird state");
7c673cae
FG
2338 } else if (it->second.state == IMPORT_DISCOVERING) {
2339 import_reverse_discovering(df);
2340 } else if (it->second.state == IMPORT_DISCOVERED) {
2341 CInode *in = cache->get_inode(df.ino);
11fdf7f2 2342 ceph_assert(in);
7c673cae
FG
2343 import_reverse_discovered(df, in);
2344 } else if (it->second.state == IMPORT_PREPPING) {
2345 CDir *dir = mds->mdcache->get_dirfrag(df);
11fdf7f2 2346 ceph_assert(dir);
b32b8144 2347 import_reverse_prepping(dir, it->second);
7c673cae
FG
2348 } else if (it->second.state == IMPORT_PREPPED) {
2349 CDir *dir = mds->mdcache->get_dirfrag(df);
11fdf7f2 2350 ceph_assert(dir);
7c673cae
FG
2351 set<CDir*> bounds;
2352 cache->get_subtree_bounds(dir, bounds);
2353 import_remove_pins(dir, bounds);
2354 // adjust auth back to the exportor
2355 cache->adjust_subtree_auth(dir, it->second.peer);
7c673cae
FG
2356 import_reverse_unfreeze(dir);
2357 } else {
11fdf7f2 2358 ceph_abort_msg("got export_cancel in weird state");
7c673cae 2359 }
7c673cae
FG
2360}
2361
11fdf7f2
TL
2362class C_MDS_ExportPrep : public MigratorContext {
2363public:
2364 C_MDS_ExportPrep(Migrator *mig, const MExportDirPrep::const_ref& m) : MigratorContext(mig), m(m) {}
2365 void finish(int r) override {
2366 mig->handle_export_prep(m, true);
2367 }
2368private:
2369 MExportDirPrep::const_ref m;
2370};
2371
2372class C_MDS_ExportPrepFactory : public MDSContextFactory {
2373public:
2374 C_MDS_ExportPrepFactory(Migrator *mig, MExportDirPrep::const_ref m) : mig(mig), m(m) {}
2375 MDSContext *build() {
2376 return new C_MDS_ExportPrep(mig, m);
2377 }
2378private:
2379 Migrator *mig;
2380 MExportDirPrep::const_ref m;
2381};
2382
2383void Migrator::handle_export_prep(const MExportDirPrep::const_ref &m, bool did_assim)
7c673cae
FG
2384{
2385 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
11fdf7f2 2386 ceph_assert(oldauth != mds->get_nodeid());
7c673cae
FG
2387
2388 CDir *dir;
2389 CInode *diri;
11fdf7f2 2390 MDSContext::vec finished;
7c673cae
FG
2391
2392 // assimilate root dir.
2393 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
11fdf7f2
TL
2394 if (!did_assim) {
2395 ceph_assert(it != import_state.end());
2396 ceph_assert(it->second.state == IMPORT_DISCOVERED);
2397 ceph_assert(it->second.peer == oldauth);
7c673cae 2398 diri = cache->get_inode(m->get_dirfrag().ino);
11fdf7f2
TL
2399 ceph_assert(diri);
2400 auto p = m->basedir.cbegin();
7c673cae
FG
2401 dir = cache->add_replica_dir(p, diri, oldauth, finished);
2402 dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
2403 } else {
2404 if (it == import_state.end() ||
2405 it->second.peer != oldauth ||
2406 it->second.tid != m->get_tid()) {
2407 dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
7c673cae
FG
2408 return;
2409 }
11fdf7f2
TL
2410 ceph_assert(it->second.state == IMPORT_PREPPING);
2411 ceph_assert(it->second.peer == oldauth);
7c673cae
FG
2412
2413 dir = cache->get_dirfrag(m->get_dirfrag());
11fdf7f2 2414 ceph_assert(dir);
7c673cae
FG
2415 dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
2416 diri = dir->get_inode();
2417 }
11fdf7f2 2418 ceph_assert(dir->is_auth() == false);
7c673cae
FG
2419
2420 cache->show_subtrees();
2421
2422 // build import bound map
2423 map<inodeno_t, fragset_t> import_bound_fragset;
11fdf7f2
TL
2424 for (const auto &bound : m->get_bounds()) {
2425 dout(10) << " bound " << bound << dendl;
2426 import_bound_fragset[bound.ino].insert(bound.frag);
7c673cae
FG
2427 }
2428
2429 // assimilate contents?
11fdf7f2 2430 if (!did_assim) {
7c673cae 2431 dout(7) << "doing assim on " << *dir << dendl;
7c673cae
FG
2432
2433 // change import state
2434 it->second.state = IMPORT_PREPPING;
2435 it->second.bound_ls = m->get_bounds();
2436 it->second.bystanders = m->get_bystanders();
11fdf7f2 2437 ceph_assert(g_conf()->mds_kill_import_at != 3);
7c673cae
FG
2438
2439 // bystander list
2440 dout(7) << "bystanders are " << it->second.bystanders << dendl;
2441
2442 // move pin to dir
2443 diri->put(CInode::PIN_IMPORTING);
2444 dir->get(CDir::PIN_IMPORTING);
2445 dir->state_set(CDir::STATE_IMPORTING);
2446
2447 // assimilate traces to exports
2448 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
11fdf7f2
TL
2449 for (const auto &bl : m->traces) {
2450 auto q = bl.cbegin();
7c673cae 2451 dirfrag_t df;
11fdf7f2 2452 decode(df, q);
7c673cae 2453 char start;
11fdf7f2
TL
2454 decode(start, q);
2455 dout(10) << " trace from " << df << " start " << start << " len " << bl.length() << dendl;
7c673cae
FG
2456
2457 CDir *cur = 0;
2458 if (start == 'd') {
2459 cur = cache->get_dirfrag(df);
11fdf7f2 2460 ceph_assert(cur);
7c673cae
FG
2461 dout(10) << " had " << *cur << dendl;
2462 } else if (start == 'f') {
2463 CInode *in = cache->get_inode(df.ino);
11fdf7f2 2464 ceph_assert(in);
7c673cae
FG
2465 dout(10) << " had " << *in << dendl;
2466 cur = cache->add_replica_dir(q, in, oldauth, finished);
2467 dout(10) << " added " << *cur << dendl;
2468 } else if (start == '-') {
2469 // nothing
2470 } else
11fdf7f2 2471 ceph_abort_msg("unrecognized start char");
7c673cae 2472
b32b8144 2473 while (!q.end()) {
7c673cae
FG
2474 CDentry *dn = cache->add_replica_dentry(q, cur, finished);
2475 dout(10) << " added " << *dn << dendl;
2476 CInode *in = cache->add_replica_inode(q, dn, finished);
2477 dout(10) << " added " << *in << dendl;
2478 if (q.end())
2479 break;
2480 cur = cache->add_replica_dir(q, in, oldauth, finished);
2481 dout(10) << " added " << *cur << dendl;
2482 }
2483 }
2484
2485 // make bound sticky
2486 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2487 p != import_bound_fragset.end();
2488 ++p) {
2489 CInode *in = cache->get_inode(p->first);
11fdf7f2 2490 ceph_assert(in);
7c673cae
FG
2491 in->get_stickydirs();
2492 dout(7) << " set stickydirs on bound inode " << *in << dendl;
2493 }
2494
2495 } else {
2496 dout(7) << " not doing assim on " << *dir << dendl;
2497 }
2498
11fdf7f2
TL
2499 C_MDS_ExportPrepFactory cf(this, m);
2500
7c673cae
FG
2501 if (!finished.empty())
2502 mds->queue_waiters(finished);
2503
2504
c07f9fc5
FG
2505 bool success = true;
2506 if (mds->is_active()) {
2507 // open all bounds
2508 set<CDir*> import_bounds;
2509 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2510 p != import_bound_fragset.end();
2511 ++p) {
2512 CInode *in = cache->get_inode(p->first);
11fdf7f2 2513 ceph_assert(in);
7c673cae 2514
c07f9fc5 2515 // map fragset into a frag_t list, based on the inode fragtree
11fdf7f2
TL
2516 frag_vec_t leaves;
2517 for (const auto& frag : p->second) {
2518 in->dirfragtree.get_leaves_under(frag, leaves);
2519 }
2520 dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << leaves << dendl;
c07f9fc5 2521
11fdf7f2
TL
2522 for (const auto& leaf : leaves) {
2523 CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, leaf));
c07f9fc5 2524 if (!bound) {
11fdf7f2
TL
2525 dout(7) << " opening bounding dirfrag " << leaf << " on " << *in << dendl;
2526 cache->open_remote_dirfrag(in, leaf, cf.build());
c07f9fc5
FG
2527 return;
2528 }
7c673cae 2529
c07f9fc5
FG
2530 if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
2531 dout(7) << " pinning import bound " << *bound << dendl;
2532 bound->get(CDir::PIN_IMPORTBOUND);
2533 bound->state_set(CDir::STATE_IMPORTBOUND);
2534 } else {
2535 dout(7) << " already pinned import bound " << *bound << dendl;
2536 }
2537 import_bounds.insert(bound);
7c673cae 2538 }
7c673cae 2539 }
7c673cae 2540
c07f9fc5
FG
2541 dout(7) << " all ready, noting auth and freezing import region" << dendl;
2542
2543 if (!mds->mdcache->is_readonly() &&
91327a77
AA
2544 diri->filelock.can_wrlock(-1) &&
2545 diri->nestlock.can_wrlock(-1)) {
c07f9fc5
FG
2546 it->second.mut = new MutationImpl();
2547 // force some locks. hacky.
2548 mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
2549 mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
2550
2551 // note that i am an ambiguous auth for this subtree.
2552 // specify bounds, since the exporter explicitly defines the region.
2553 cache->adjust_bounded_subtree_auth(dir, import_bounds,
2554 pair<int,int>(oldauth, mds->get_nodeid()));
2555 cache->verify_subtree_bounds(dir, import_bounds);
2556 // freeze.
2557 dir->_freeze_tree();
2558 // note new state
2559 it->second.state = IMPORT_PREPPED;
2560 } else {
2561 dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
2562 success = false;
2563 }
7c673cae 2564 } else {
c07f9fc5 2565 dout(7) << " not active, failing. " << *dir << dendl;
7c673cae 2566 success = false;
7c673cae
FG
2567 }
2568
c07f9fc5 2569 if (!success)
b32b8144 2570 import_reverse_prepping(dir, it->second);
c07f9fc5 2571
7c673cae
FG
2572 // ok!
2573 dout(7) << " sending export_prep_ack on " << *dir << dendl;
11fdf7f2 2574 mds->send_message(MExportDirPrepAck::create(dir->dirfrag(), success, m->get_tid()), m->get_connection());
7c673cae 2575
11fdf7f2 2576 ceph_assert(g_conf()->mds_kill_import_at != 4);
7c673cae
FG
2577}
2578
2579
2580
2581
2582class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
2583 dirfrag_t df;
2584 CDir *dir;
2585 mds_rank_t from;
2586public:
28e407b8 2587 map<client_t,pair<Session*,uint64_t> > imported_session_map;
7c673cae
FG
2588
2589 C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
2590 MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
2591 }
2592 void finish(int r) override {
28e407b8 2593 mig->import_logged_start(df, dir, from, imported_session_map);
7c673cae
FG
2594 }
2595};
2596
11fdf7f2 2597void Migrator::handle_export_dir(const MExportDir::const_ref &m)
7c673cae 2598{
11fdf7f2 2599 assert (g_conf()->mds_kill_import_at != 5);
7c673cae 2600 CDir *dir = cache->get_dirfrag(m->dirfrag);
11fdf7f2 2601 ceph_assert(dir);
31f18b77
FG
2602
2603 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2604 dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
2605
11fdf7f2
TL
2606 ceph_assert(!dir->is_auth());
2607 ceph_assert(dir->freeze_tree_state);
7c673cae
FG
2608
2609 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
11fdf7f2
TL
2610 ceph_assert(it != import_state.end());
2611 ceph_assert(it->second.state == IMPORT_PREPPED);
2612 ceph_assert(it->second.tid == m->get_tid());
2613 ceph_assert(it->second.peer == oldauth);
7c673cae
FG
2614
2615 if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
2616 dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
2617
2618 cache->show_subtrees();
2619
31f18b77 2620 C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
7c673cae
FG
2621
2622 // start the journal entry
31f18b77 2623 EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
7c673cae
FG
2624 mds->mdlog->start_entry(le);
2625
2626 le->metablob.add_dir_context(dir);
2627
2628 // adjust auth (list us _first_)
2629 cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
2630
2631 // new client sessions, open these after we journal
2632 // include imported sessions in EImportStart
11fdf7f2 2633 auto cmp = m->client_map.cbegin();
28e407b8 2634 map<client_t,entity_inst_t> client_map;
11fdf7f2 2635 map<client_t,client_metadata_t> client_metadata_map;
28e407b8 2636 decode(client_map, cmp);
11fdf7f2
TL
2637 decode(client_metadata_map, cmp);
2638 ceph_assert(cmp.end());
2639 le->cmapv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
2640 onlogged->imported_session_map);
28e407b8 2641 encode(client_map, le->client_map, mds->mdsmap->get_up_features());
11fdf7f2 2642 encode(client_metadata_map, le->client_map);
7c673cae 2643
11fdf7f2 2644 auto blp = m->export_data.cbegin();
7c673cae
FG
2645 int num_imported_inodes = 0;
2646 while (!blp.end()) {
2647 num_imported_inodes +=
2648 decode_import_dir(blp,
2649 oldauth,
2650 dir, // import root
2651 le,
2652 mds->mdlog->get_current_segment(),
2653 it->second.peer_exports,
11fdf7f2 2654 it->second.updated_scatterlocks);
7c673cae
FG
2655 }
2656 dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
2657
2658 // include bounds in EImportStart
2659 set<CDir*> import_bounds;
11fdf7f2
TL
2660 for (const auto &bound : m->bounds) {
2661 CDir *bd = cache->get_dirfrag(bound);
2662 ceph_assert(bd);
7c673cae
FG
2663 le->metablob.add_dir(bd, false); // note that parent metadata is already in the event
2664 import_bounds.insert(bd);
2665 }
2666 cache->verify_subtree_bounds(dir, import_bounds);
2667
2668 // adjust popularity
11fdf7f2 2669 mds->balancer->add_import(dir);
7c673cae
FG
2670
2671 dout(7) << "handle_export_dir did " << *dir << dendl;
2672
2673 // note state
2674 it->second.state = IMPORT_LOGGINGSTART;
11fdf7f2 2675 assert (g_conf()->mds_kill_import_at != 6);
7c673cae
FG
2676
2677 // log it
2678 mds->mdlog->submit_entry(le, onlogged);
2679 mds->mdlog->flush();
2680
2681 // some stats
2682 if (mds->logger) {
2683 mds->logger->inc(l_mds_imported);
2684 mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
2685 }
7c673cae
FG
2686}
2687
2688
2689/*
2690 * this is an import helper
2691 * called by import_finish, and import_reverse and friends.
2692 */
2693void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2694{
2695 import_state_t& stat = import_state[dir->dirfrag()];
2696 // root
2697 dir->put(CDir::PIN_IMPORTING);
2698 dir->state_clear(CDir::STATE_IMPORTING);
2699
2700 // bounding inodes
2701 set<inodeno_t> did;
2702 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2703 p != stat.bound_ls.end();
2704 ++p) {
2705 if (did.count(p->ino))
2706 continue;
2707 did.insert(p->ino);
2708 CInode *in = cache->get_inode(p->ino);
11fdf7f2 2709 ceph_assert(in);
7c673cae
FG
2710 in->put_stickydirs();
2711 }
2712
2713 if (stat.state == IMPORT_PREPPING) {
2714 for (auto bd : bounds) {
2715 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2716 bd->put(CDir::PIN_IMPORTBOUND);
2717 bd->state_clear(CDir::STATE_IMPORTBOUND);
2718 }
2719 }
2720 } else if (stat.state >= IMPORT_PREPPED) {
2721 // bounding dirfrags
2722 for (auto bd : bounds) {
11fdf7f2 2723 ceph_assert(bd->state_test(CDir::STATE_IMPORTBOUND));
7c673cae
FG
2724 bd->put(CDir::PIN_IMPORTBOUND);
2725 bd->state_clear(CDir::STATE_IMPORTBOUND);
2726 }
2727 }
2728}
2729
91327a77
AA
2730class C_MDC_QueueContexts : public MigratorContext {
2731public:
11fdf7f2 2732 MDSContext::vec contexts;
91327a77
AA
2733 C_MDC_QueueContexts(Migrator *m) : MigratorContext(m) {}
2734 void finish(int r) override {
2735 // execute contexts immediately after 'this' context
2736 get_mds()->queue_waiters_front(contexts);
2737 }
2738};
7c673cae
FG
2739
/*
 * note: this does the full work of reversing an import and cleaning up
 * state.
 * called by both handle_mds_failure and by handle_resolve (if we are
 * a survivor coping with an exporter failure+recovery).
 */
2746void Migrator::import_reverse(CDir *dir)
2747{
2748 dout(7) << "import_reverse " << *dir << dendl;
2749
2750 import_state_t& stat = import_state[dir->dirfrag()];
2751 stat.state = IMPORT_ABORTING;
2752
2753 set<CDir*> bounds;
2754 cache->get_subtree_bounds(dir, bounds);
2755
2756 // remove pins
2757 import_remove_pins(dir, bounds);
2758
2759 // update auth, with possible subtree merge.
11fdf7f2 2760 ceph_assert(dir->is_subtree_root());
7c673cae
FG
2761 if (mds->is_resolve())
2762 cache->trim_non_auth_subtree(dir);
2763
2764 cache->adjust_subtree_auth(dir, stat.peer);
2765
91327a77 2766 auto fin = new C_MDC_QueueContexts(this);
7c673cae
FG
2767 if (!dir->get_inode()->is_auth() &&
2768 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2769 dir->get_inode()->clear_scatter_dirty();
2770 // wake up scatter_nudge waiters
2771 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2772 }
2773
2774 int num_dentries = 0;
2775 // adjust auth bits.
2776 list<CDir*> q;
2777 q.push_back(dir);
2778 while (!q.empty()) {
2779 CDir *cur = q.front();
2780 q.pop_front();
2781
2782 // dir
11fdf7f2 2783 cur->abort_import();
7c673cae 2784
94b18763
FG
2785 for (auto &p : *cur) {
2786 CDentry *dn = p.second;
7c673cae
FG
2787
2788 // dentry
2789 dn->state_clear(CDentry::STATE_AUTH);
2790 dn->clear_replica_map();
2791 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2792 if (dn->is_dirty())
2793 dn->mark_clean();
2794
2795 // inode?
2796 if (dn->get_linkage()->is_primary()) {
2797 CInode *in = dn->get_linkage()->get_inode();
2798 in->state_clear(CDentry::STATE_AUTH);
2799 in->clear_replica_map();
2800 in->set_replica_nonce(CInode::EXPORT_NONCE);
2801 if (in->is_dirty())
2802 in->mark_clean();
2803 in->clear_dirty_rstat();
2804 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2805 in->clear_scatter_dirty();
2806 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2807 }
2808
2809 in->clear_dirty_parent();
2810
2811 in->authlock.clear_gather();
2812 in->linklock.clear_gather();
2813 in->dirfragtreelock.clear_gather();
2814 in->filelock.clear_gather();
2815
2816 in->clear_file_locks();
2817
2818 // non-bounding dir?
2819 list<CDir*> dfs;
2820 in->get_dirfrags(dfs);
2821 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2822 if (bounds.count(*p) == 0)
2823 q.push_back(*p);
2824 }
2825
2826 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2827 ++num_dentries;
2828 }
2829 }
2830
2831 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
2832
2833 if (stat.state == IMPORT_ACKING) {
2834 // remove imported caps
2835 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
28e407b8
AA
2836 p != stat.peer_exports.end();
2837 ++p) {
7c673cae
FG
2838 CInode *in = p->first;
2839 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
28e407b8
AA
2840 q != p->second.end();
2841 ++q) {
7c673cae 2842 Capability *cap = in->get_client_cap(q->first);
28e407b8 2843 if (!cap) {
11fdf7f2 2844 ceph_assert(!stat.session_map.count(q->first));
28e407b8
AA
2845 continue;
2846 }
7c673cae
FG
2847 if (cap->is_importing())
2848 in->remove_client_cap(q->first);
2849 }
2850 in->put(CInode::PIN_IMPORTINGCAPS);
2851 }
28e407b8
AA
2852 for (auto& p : stat.session_map) {
2853 Session *session = p.second.first;
7c673cae
FG
2854 session->dec_importing();
2855 }
2856 }
2857
2858 // log our failure
2859 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2860
181888fb 2861 cache->trim(num_dentries); // try trimming dentries
7c673cae
FG
2862
2863 // notify bystanders; wait in aborting state
2864 import_notify_abort(dir, bounds);
2865}
2866
2867void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2868{
2869 dout(7) << "import_notify_finish " << *dir << dendl;
2870
2871 import_state_t& stat = import_state[dir->dirfrag()];
2872 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2873 p != stat.bystanders.end();
2874 ++p) {
11fdf7f2
TL
2875 auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, false,
2876 pair<int,int>(stat.peer, mds->get_nodeid()),
2877 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
7c673cae
FG
2878 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2879 notify->get_bounds().push_back((*i)->dirfrag());
2880 mds->send_message_mds(notify, *p);
2881 }
2882}
2883
2884void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
2885{
2886 dout(7) << "import_notify_abort " << *dir << dendl;
2887
2888 import_state_t& stat = import_state[dir->dirfrag()];
2889 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2890 p != stat.bystanders.end(); ) {
2891 if (mds->is_cluster_degraded() &&
2892 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
2893 // this can happen if both exporter and bystander fail in the same mdsmap epoch
2894 stat.bystanders.erase(p++);
2895 continue;
2896 }
11fdf7f2
TL
2897 auto notify = MExportDirNotify::create(dir->dirfrag(), stat.tid, true,
2898 mds_authority_t(stat.peer, mds->get_nodeid()),
2899 mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
7c673cae
FG
2900 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2901 notify->get_bounds().push_back((*i)->dirfrag());
2902 mds->send_message_mds(notify, *p);
2903 ++p;
2904 }
2905 if (stat.bystanders.empty()) {
2906 dout(7) << "no bystanders, finishing reverse now" << dendl;
2907 import_reverse_unfreeze(dir);
2908 } else {
11fdf7f2 2909 assert (g_conf()->mds_kill_import_at != 10);
7c673cae
FG
2910 }
2911}
2912
2913void Migrator::import_reverse_unfreeze(CDir *dir)
2914{
7c673cae 2915 dout(7) << "import_reverse_unfreeze " << *dir << dendl;
11fdf7f2 2916 ceph_assert(!dir->is_auth());
7c673cae 2917 cache->discard_delayed_expire(dir);
224ce89b
WB
2918 dir->unfreeze_tree();
2919 if (dir->is_subtree_root())
2920 cache->try_subtree_merge(dir);
7c673cae
FG
2921 import_reverse_final(dir);
2922}
2923
2924void Migrator::import_reverse_final(CDir *dir)
2925{
2926 dout(7) << "import_reverse_final " << *dir << dendl;
2927
2928 // clean up
2929 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
11fdf7f2 2930 ceph_assert(it != import_state.end());
7c673cae
FG
2931
2932 MutationRef mut = it->second.mut;
2933 import_state.erase(it);
2934
2935 // send pending import_maps?
2936 mds->mdcache->maybe_send_pending_resolves();
2937
2938 if (mut) {
2939 mds->locker->drop_locks(mut.get());
2940 mut->cleanup();
2941 }
2942
2943 cache->show_subtrees();
2944 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2945}
2946
2947
2948
2949
2950void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
28e407b8 2951 map<client_t,pair<Session*,uint64_t> >& imported_session_map)
7c673cae
FG
2952{
2953 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2954 if (it == import_state.end() ||
2955 it->second.state != IMPORT_LOGGINGSTART) {
2956 dout(7) << "import " << df << " must have aborted" << dendl;
28e407b8 2957 mds->server->finish_force_open_sessions(imported_session_map);
7c673cae
FG
2958 return;
2959 }
2960
2961 dout(7) << "import_logged " << *dir << dendl;
2962
2963 // note state
2964 it->second.state = IMPORT_ACKING;
2965
11fdf7f2 2966 assert (g_conf()->mds_kill_import_at != 7);
7c673cae
FG
2967
2968 // force open client sessions and finish cap import
28e407b8 2969 mds->server->finish_force_open_sessions(imported_session_map, false);
7c673cae
FG
2970
2971 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
2972 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2973 p != it->second.peer_exports.end();
2974 ++p) {
2975 // parameter 'peer' is NONE, delay sending cap import messages to client
28e407b8
AA
2976 finish_import_inode_caps(p->first, MDS_RANK_NONE, true, imported_session_map,
2977 p->second, imported_caps[p->first->ino()]);
7c673cae 2978 }
28e407b8
AA
2979
2980 it->second.session_map.swap(imported_session_map);
7c673cae
FG
2981
2982 // send notify's etc.
2983 dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
2984
2985 // test surviving observer of a failed migration that did not complete
2986 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
2987
11fdf7f2
TL
2988 auto ack = MExportDirAck::create(dir->dirfrag(), it->second.tid);
2989 encode(imported_caps, ack->imported_caps);
7c673cae
FG
2990
2991 mds->send_message_mds(ack, from);
11fdf7f2 2992 assert (g_conf()->mds_kill_import_at != 8);
7c673cae
FG
2993
2994 cache->show_subtrees();
2995}
2996
11fdf7f2 2997void Migrator::handle_export_finish(const MExportDirFinish::const_ref &m)
7c673cae
FG
2998{
2999 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
11fdf7f2 3000 ceph_assert(dir);
7c673cae
FG
3001 dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;
3002
3003 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
11fdf7f2
TL
3004 ceph_assert(it != import_state.end());
3005 ceph_assert(it->second.tid == m->get_tid());
7c673cae
FG
3006
3007 import_finish(dir, false, m->is_last());
7c673cae
FG
3008}
3009
3010void Migrator::import_finish(CDir *dir, bool notify, bool last)
3011{
3012 dout(7) << "import_finish on " << *dir << dendl;
3013
3014 map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
11fdf7f2
TL
3015 ceph_assert(it != import_state.end());
3016 ceph_assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
7c673cae 3017
224ce89b 3018 if (it->second.state == IMPORT_ACKING) {
11fdf7f2 3019 ceph_assert(dir->is_auth());
224ce89b
WB
3020 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
3021 }
3022
7c673cae 3023 // log finish
11fdf7f2 3024 ceph_assert(g_conf()->mds_kill_import_at != 9);
7c673cae
FG
3025
3026 if (it->second.state == IMPORT_ACKING) {
3027 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
3028 p != it->second.peer_exports.end();
3029 ++p) {
3030 CInode *in = p->first;
11fdf7f2 3031 ceph_assert(in->is_auth());
7c673cae
FG
3032 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
3033 q != p->second.end();
3034 ++q) {
28e407b8
AA
3035 auto r = it->second.session_map.find(q->first);
3036 if (r == it->second.session_map.end())
3037 continue;
3038
3039 Session *session = r->second.first;
7c673cae 3040 Capability *cap = in->get_client_cap(q->first);
11fdf7f2 3041 ceph_assert(cap);
7c673cae
FG
3042 cap->merge(q->second, true);
3043 cap->clear_importing();
3044 mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
3045 q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
3046 }
3047 p->second.clear();
3048 in->replica_caps_wanted = 0;
3049 }
28e407b8
AA
3050 for (auto& p : it->second.session_map) {
3051 Session *session = p.second.first;
7c673cae
FG
3052 session->dec_importing();
3053 }
3054 }
3055
3056 if (!last) {
11fdf7f2 3057 ceph_assert(it->second.state == IMPORT_ACKING);
7c673cae
FG
3058 it->second.state = IMPORT_FINISHING;
3059 return;
3060 }
3061
3062 // remove pins
3063 set<CDir*> bounds;
3064 cache->get_subtree_bounds(dir, bounds);
3065
3066 if (notify)
3067 import_notify_finish(dir, bounds);
3068
3069 import_remove_pins(dir, bounds);
3070
3071 map<CInode*, map<client_t,Capability::Export> > peer_exports;
3072 it->second.peer_exports.swap(peer_exports);
3073
3074 // clear import state (we're done!)
3075 MutationRef mut = it->second.mut;
3076 import_state.erase(it);
3077
7c673cae
FG
3078 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3079
7c673cae
FG
3080 // process delayed expires
3081 cache->process_delayed_expire(dir);
3082
224ce89b 3083 // unfreeze tree, with possible subtree merge.
7c673cae 3084 dir->unfreeze_tree();
224ce89b
WB
3085 cache->try_subtree_merge(dir);
3086
7c673cae
FG
3087 cache->show_subtrees();
3088 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
3089
3090 if (mut) {
3091 mds->locker->drop_locks(mut.get());
3092 mut->cleanup();
3093 }
3094
3095 // re-eval imported caps
3096 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
3097 p != peer_exports.end();
3098 ++p) {
3099 if (p->first->is_auth())
3100 mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
3101 p->first->put(CInode::PIN_IMPORTINGCAPS);
3102 }
3103
3104 // send pending import_maps?
3105 mds->mdcache->maybe_send_pending_resolves();
3106
3107 // did i just import mydir?
3108 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
3109 cache->populate_mydir();
3110
3111 // is it empty?
3112 if (dir->get_num_head_items() == 0 &&
3113 !dir->inode->is_auth()) {
3114 // reexport!
3115 export_empty_import(dir);
3116 }
3117}
3118
3119
11fdf7f2 3120void Migrator::decode_import_inode(CDentry *dn, bufferlist::const_iterator& blp,
7c673cae
FG
3121 mds_rank_t oldauth, LogSegment *ls,
3122 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
3123 list<ScatterLock*>& updated_scatterlocks)
3124{
3125 dout(15) << "decode_import_inode on " << *dn << dendl;
3126
3127 inodeno_t ino;
3128 snapid_t last;
11fdf7f2
TL
3129 decode(ino, blp);
3130 decode(last, blp);
7c673cae
FG
3131
3132 bool added = false;
3133 CInode *in = cache->get_inode(ino, last);
3134 if (!in) {
3135 in = new CInode(mds->mdcache, true, 1, last);
3136 added = true;
3137 }
3138
3139 // state after link -- or not! -sage
3140 in->decode_import(blp, ls); // cap imports are noted for later action
3141
3142 // caps
3143 decode_import_inode_caps(in, true, blp, peer_exports);
3144
3145 // link before state -- or not! -sage
3146 if (dn->get_linkage()->get_inode() != in) {
11fdf7f2 3147 ceph_assert(!dn->get_linkage()->get_inode());
7c673cae
FG
3148 dn->dir->link_primary_inode(dn, in);
3149 }
28e407b8
AA
3150
3151 if (in->is_dir())
3152 dn->dir->pop_lru_subdirs.push_back(&in->item_pop_lru);
7c673cae
FG
3153
3154 // add inode?
3155 if (added) {
3156 cache->add_inode(in);
3157 dout(10) << "added " << *in << dendl;
3158 } else {
3159 dout(10) << " had " << *in << dendl;
3160 }
3161
3162 if (in->inode.is_dirty_rstat())
3163 in->mark_dirty_rstat();
3164
3165 // clear if dirtyscattered, since we're going to journal this
3166 // but not until we _actually_ finish the import...
3167 if (in->filelock.is_dirty()) {
3168 updated_scatterlocks.push_back(&in->filelock);
3169 mds->locker->mark_updated_scatterlock(&in->filelock);
3170 }
3171
3172 if (in->dirfragtreelock.is_dirty()) {
3173 updated_scatterlocks.push_back(&in->dirfragtreelock);
3174 mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
3175 }
3176
3177 // adjust replica list
3178 //assert(!in->is_replica(oldauth)); // not true on failed export
3179 in->add_replica(oldauth, CInode::EXPORT_NONCE);
3180 if (in->is_replica(mds->get_nodeid()))
3181 in->remove_replica(mds->get_nodeid());
11fdf7f2
TL
3182
3183 if (in->snaplock.is_stable() &&
3184 in->snaplock.get_state() != LOCK_SYNC)
3185 mds->locker->try_eval(&in->snaplock, NULL);
7c673cae
FG
3186}
3187
3188void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
11fdf7f2 3189 bufferlist::const_iterator &blp,
7c673cae
FG
3190 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
3191{
3192 map<client_t,Capability::Export> cap_map;
11fdf7f2
TL
3193 decode(cap_map, blp);
3194 if (auth_cap) {
3195 mempool::mds_co::compact_map<int32_t,int32_t> mds_wanted;
3196 decode(mds_wanted, blp);
3197 mds_wanted.erase(mds->get_nodeid());
3198 in->set_mds_caps_wanted(mds_wanted);
3199 }
7c673cae 3200 if (!cap_map.empty() ||
b32b8144 3201 (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
7c673cae
FG
3202 peer_exports[in].swap(cap_map);
3203 in->get(CInode::PIN_IMPORTINGCAPS);
3204 }
3205}
3206
3207void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
28e407b8
AA
3208 const map<client_t,pair<Session*,uint64_t> >& session_map,
3209 const map<client_t,Capability::Export> &export_map,
7c673cae
FG
3210 map<client_t,Capability::Import> &import_map)
3211{
28e407b8
AA
3212 for (auto& it : export_map) {
3213 dout(10) << "finish_import_inode_caps for client." << it.first << " on " << *in << dendl;
3214
3215 auto p = session_map.find(it.first);
3216 if (p == session_map.end()) {
3217 dout(10) << " no session for client." << it.first << dendl;
3218 (void)import_map[it.first];
3219 continue;
3220 }
7c673cae 3221
28e407b8
AA
3222 Session *session = p->second.first;
3223
3224 Capability *cap = in->get_client_cap(it.first);
7c673cae 3225 if (!cap) {
28e407b8 3226 cap = in->add_client_cap(it.first, session);
7c673cae
FG
3227 if (peer < 0)
3228 cap->mark_importing();
3229 }
3230
1adf2230
AA
3231 // Always ask exporter mds to send cap export messages for auth caps.
3232 // For non-auth caps, ask exporter mds to send cap export messages to
3233 // clients who haven't opened sessions. The cap export messages will
3234 // make clients open sessions.
11fdf7f2 3235 if (auth_cap || !session->get_connection()) {
1adf2230
AA
3236 Capability::Import& im = import_map[it.first];
3237 im.cap_id = cap->get_cap_id();
3238 im.mseq = auth_cap ? it.second.mseq : cap->get_mseq();
3239 im.issue_seq = cap->get_last_seq() + 1;
3240 }
7c673cae
FG
3241
3242 if (peer >= 0) {
28e407b8
AA
3243 cap->merge(it.second, auth_cap);
3244 mds->mdcache->do_cap_import(session, in, cap, it.second.cap_id,
3245 it.second.seq, it.second.mseq - 1, peer,
7c673cae
FG
3246 auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
3247 }
3248 }
3249
3250 if (peer >= 0) {
3251 in->replica_caps_wanted = 0;
3252 in->put(CInode::PIN_IMPORTINGCAPS);
3253 }
3254}
3255
11fdf7f2 3256int Migrator::decode_import_dir(bufferlist::const_iterator& blp,
7c673cae
FG
3257 mds_rank_t oldauth,
3258 CDir *import_root,
3259 EImportStart *le,
3260 LogSegment *ls,
3261 map<CInode*,map<client_t,Capability::Export> >& peer_exports,
11fdf7f2 3262 list<ScatterLock*>& updated_scatterlocks)
7c673cae
FG
3263{
3264 // set up dir
3265 dirfrag_t df;
11fdf7f2 3266 decode(df, blp);
7c673cae
FG
3267
3268 CInode *diri = cache->get_inode(df.ino);
11fdf7f2 3269 ceph_assert(diri);
7c673cae 3270 CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
11fdf7f2 3271 ceph_assert(dir);
7c673cae
FG
3272
3273 dout(7) << "decode_import_dir " << *dir << dendl;
3274
11fdf7f2
TL
3275 if (!dir->freeze_tree_state) {
3276 ceph_assert(dir->get_version() == 0);
3277 dir->freeze_tree_state = import_root->freeze_tree_state;
3278 }
3279
7c673cae 3280 // assimilate state
11fdf7f2 3281 dir->decode_import(blp, ls);
7c673cae
FG
3282
3283 // adjust replica list
3284 //assert(!dir->is_replica(oldauth)); // not true on failed export
3285 dir->add_replica(oldauth, CDir::EXPORT_NONCE);
3286 if (dir->is_replica(mds->get_nodeid()))
3287 dir->remove_replica(mds->get_nodeid());
3288
3289 // add to journal entry
3290 if (le)
3291 le->metablob.add_import_dir(dir);
3292
3293 int num_imported = 0;
3294
3295 // take all waiters on this dir
3296 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3297 // a replica's presense in my cache implies/forces it's presense in authority's.
11fdf7f2 3298 MDSContext::vec waiters;
7c673cae 3299 dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
11fdf7f2
TL
3300 for (auto c : waiters)
3301 dir->add_waiter(CDir::WAIT_UNFREEZE, c); // UNFREEZE will get kicked both on success or failure
7c673cae
FG
3302
3303 dout(15) << "doing contents" << dendl;
3304
3305 // contents
3306 __u32 nden;
11fdf7f2 3307 decode(nden, blp);
7c673cae
FG
3308
3309 for (; nden>0; nden--) {
3310 num_imported++;
3311
3312 // dentry
3313 string dname;
3314 snapid_t last;
11fdf7f2
TL
3315 decode(dname, blp);
3316 decode(last, blp);
7c673cae
FG
3317
3318 CDentry *dn = dir->lookup_exact_snap(dname, last);
3319 if (!dn)
3320 dn = dir->add_null_dentry(dname, 1, last);
3321
3322 dn->decode_import(blp, ls);
3323
3324 dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
3325 if (dn->is_replica(mds->get_nodeid()))
3326 dn->remove_replica(mds->get_nodeid());
3327
3328 // dentry lock in unreadable state can block path traverse
3329 if (dn->lock.get_state() != LOCK_SYNC)
3330 mds->locker->try_eval(&dn->lock, NULL);
3331
3332 dout(15) << "decode_import_dir got " << *dn << dendl;
3333
3334 // points to...
3335 char icode;
11fdf7f2 3336 decode(icode, blp);
7c673cae
FG
3337
3338 if (icode == 'N') {
3339 // null dentry
11fdf7f2 3340 ceph_assert(dn->get_linkage()->is_null());
7c673cae
FG
3341
3342 // fall thru
3343 }
3344 else if (icode == 'L') {
3345 // remote link
3346 inodeno_t ino;
3347 unsigned char d_type;
11fdf7f2
TL
3348 decode(ino, blp);
3349 decode(d_type, blp);
7c673cae 3350 if (dn->get_linkage()->is_remote()) {
11fdf7f2 3351 ceph_assert(dn->get_linkage()->get_remote_ino() == ino);
7c673cae
FG
3352 } else {
3353 dir->link_remote_inode(dn, ino, d_type);
3354 }
3355 }
3356 else if (icode == 'I') {
3357 // inode
11fdf7f2 3358 ceph_assert(le);
7c673cae
FG
3359 decode_import_inode(dn, blp, oldauth, ls,
3360 peer_exports, updated_scatterlocks);
3361 }
3362
3363 // add dentry to journal entry
3364 if (le)
3365 le->metablob.add_import_dentry(dn);
3366 }
3367
3368#ifdef MDS_VERIFY_FRAGSTAT
3369 if (dir->is_complete())
3370 dir->verify_fragstat();
3371#endif
3372
3373 dir->inode->maybe_export_pin();
3374
3375 dout(7) << "decode_import_dir done " << *dir << dendl;
3376 return num_imported;
3377}
3378
3379
3380
3381
3382
3383// authority bystander
3384
11fdf7f2 3385void Migrator::handle_export_notify(const MExportDirNotify::const_ref &m)
7c673cae
FG
3386{
3387 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
7c673cae
FG
3388 return;
3389 }
3390
3391 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3392
3393 mds_rank_t from = mds_rank_t(m->get_source().num());
3394 mds_authority_t old_auth = m->get_old_auth();
3395 mds_authority_t new_auth = m->get_new_auth();
3396
3397 if (!dir) {
3398 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3399 << " on missing dir " << m->get_dirfrag() << dendl;
3400 } else if (dir->authority() != old_auth) {
3401 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3402 << " != " << old_auth << " -> " << new_auth
3403 << " on " << *dir << dendl;
3404 } else {
3405 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3406 << " on " << *dir << dendl;
3407 // adjust auth
3408 set<CDir*> have;
3409 cache->map_dirfrag_set(m->get_bounds(), have);
3410 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3411
3412 // induce a merge?
3413 cache->try_subtree_merge(dir);
3414 }
3415
3416 // send ack
3417 if (m->wants_ack()) {
11fdf7f2 3418 mds->send_message_mds(MExportDirNotifyAck::create(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
7c673cae
FG
3419 } else {
3420 // aborted. no ack.
3421 dout(7) << "handle_export_notify no ack requested" << dendl;
3422 }
7c673cae
FG
3423}
3424
3425/** cap exports **/
3426void Migrator::export_caps(CInode *in)
3427{
3428 mds_rank_t dest = in->authority().first;
3429 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
3430
11fdf7f2
TL
3431 ceph_assert(in->is_any_caps());
3432 ceph_assert(!in->is_auth());
3433 ceph_assert(!in->is_ambiguous_auth());
3434 ceph_assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
7c673cae 3435
11fdf7f2 3436 auto ex = MExportCaps::create();
7c673cae
FG
3437 ex->ino = in->ino();
3438
11fdf7f2 3439 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map, ex->client_metadata_map);
7c673cae
FG
3440
3441 mds->send_message_mds(ex, dest);
3442}
3443
11fdf7f2 3444void Migrator::handle_export_caps_ack(const MExportCapsAck::const_ref &ack)
1adf2230
AA
3445{
3446 mds_rank_t from = ack->get_source().num();
3447 CInode *in = cache->get_inode(ack->ino);
3448 if (in) {
11fdf7f2 3449 ceph_assert(!in->is_auth());
1adf2230
AA
3450
3451 dout(10) << "handle_export_caps_ack " << *ack << " from "
3452 << ack->get_source() << " on " << *in << dendl;
3453
3454 map<client_t,Capability::Import> imported_caps;
3455 map<client_t,uint64_t> caps_ids;
11fdf7f2
TL
3456 auto blp = ack->cap_bl.cbegin();
3457 decode(imported_caps, blp);
3458 decode(caps_ids, blp);
1adf2230
AA
3459
3460 for (auto& it : imported_caps) {
3461 Capability *cap = in->get_client_cap(it.first);
3462 if (!cap || cap->get_cap_id() != caps_ids.at(it.first))
3463 continue;
3464
3465 dout(7) << __func__ << " telling client." << it.first
3466 << " exported caps on " << *in << dendl;
11fdf7f2 3467 auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1adf2230
AA
3468 cap->get_cap_id(), cap->get_mseq(),
3469 mds->get_osd_epoch_barrier());
3470 m->set_cap_peer(it.second.cap_id, it.second.issue_seq, it.second.mseq, from, 0);
3471 mds->send_message_client_counted(m, it.first);
3472
3473 in->remove_client_cap(it.first);
3474 }
3475
3476 mds->locker->request_inode_file_caps(in);
3477 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
3478 }
1adf2230
AA
3479}
3480
11fdf7f2 3481void Migrator::handle_gather_caps(const MGatherCaps::const_ref &m)
7c673cae
FG
3482{
3483 CInode *in = cache->get_inode(m->ino);
7c673cae 3484 if (!in)
11fdf7f2 3485 return;
7c673cae
FG
3486
3487 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
1adf2230
AA
3488 << " on " << *in << dendl;
3489
7c673cae
FG
3490 if (in->is_any_caps() &&
3491 !in->is_auth() &&
3492 !in->is_ambiguous_auth() &&
3493 !in->state_test(CInode::STATE_EXPORTINGCAPS))
3494 export_caps(in);
7c673cae
FG
3495}
3496
3497class C_M_LoggedImportCaps : public MigratorLogContext {
3498 CInode *in;
3499 mds_rank_t from;
3500public:
28e407b8 3501 map<client_t,pair<Session*,uint64_t> > imported_session_map;
7c673cae 3502 map<CInode*, map<client_t,Capability::Export> > peer_exports;
7c673cae
FG
3503
3504 C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
3505 void finish(int r) override {
28e407b8 3506 mig->logged_import_caps(in, from, imported_session_map, peer_exports);
7c673cae
FG
3507 }
3508};
3509
11fdf7f2 3510void Migrator::handle_export_caps(const MExportCaps::const_ref &ex)
7c673cae
FG
3511{
3512 dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
3513 CInode *in = cache->get_inode(ex->ino);
3514
11fdf7f2
TL
3515 ceph_assert(in);
3516 ceph_assert(in->is_auth());
7c673cae
FG
3517
3518 // FIXME
28e407b8 3519 if (!in->can_auth_pin()) {
7c673cae 3520 return;
28e407b8
AA
3521 }
3522
181888fb 3523 in->auth_pin(this);
7c673cae 3524
11fdf7f2
TL
3525 map<client_t,entity_inst_t> client_map{ex->client_map};
3526 map<client_t,client_metadata_t> client_metadata_map{ex->client_metadata_map};
28e407b8 3527
7c673cae
FG
3528 C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
3529 this, in, mds_rank_t(ex->get_source().num()));
7c673cae 3530
11fdf7f2 3531 version_t pv = mds->server->prepare_force_open_sessions(client_map, client_metadata_map,
28e407b8 3532 finish->imported_session_map);
7c673cae 3533 // decode new caps
11fdf7f2 3534 auto blp = ex->cap_bl.cbegin();
7c673cae 3535 decode_import_inode_caps(in, false, blp, finish->peer_exports);
11fdf7f2 3536 ceph_assert(!finish->peer_exports.empty()); // thus, inode is pinned.
7c673cae
FG
3537
3538 // journal open client sessions
11fdf7f2
TL
3539 ESessions *le = new ESessions(pv, std::move(client_map),
3540 std::move(client_metadata_map));
7c673cae
FG
3541 mds->mdlog->start_submit_entry(le, finish);
3542 mds->mdlog->flush();
7c673cae
FG
3543}
3544
3545
3546void Migrator::logged_import_caps(CInode *in,
3547 mds_rank_t from,
28e407b8
AA
3548 map<client_t,pair<Session*,uint64_t> >& imported_session_map,
3549 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
7c673cae
FG
3550{
3551 dout(10) << "logged_import_caps on " << *in << dendl;
3552 // see export_go() vs export_go_synced()
11fdf7f2 3553 ceph_assert(in->is_auth());
7c673cae
FG
3554
3555 // force open client sessions and finish cap import
28e407b8 3556 mds->server->finish_force_open_sessions(imported_session_map);
7c673cae 3557
28e407b8 3558 auto it = peer_exports.find(in);
11fdf7f2 3559 ceph_assert(it != peer_exports.end());
28e407b8 3560
7c673cae 3561 // clients will release caps from the exporter when they receive the cap import message.
1adf2230 3562 map<client_t,Capability::Import> imported_caps;
28e407b8 3563 finish_import_inode_caps(in, from, false, imported_session_map, it->second, imported_caps);
7c673cae 3564 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
1adf2230
AA
3565
3566 if (!imported_caps.empty()) {
11fdf7f2 3567 auto ack = MExportCapsAck::create(in->ino());
1adf2230
AA
3568 map<client_t,uint64_t> peer_caps_ids;
3569 for (auto &p : imported_caps )
3570 peer_caps_ids[p.first] = it->second.at(p.first).cap_id;
3571
11fdf7f2
TL
3572 encode(imported_caps, ack->cap_bl);
3573 encode(peer_caps_ids, ack->cap_bl);
1adf2230
AA
3574 mds->send_message_mds(ack, from);
3575 }
3576
181888fb 3577 in->auth_unpin(this);
7c673cae 3578}
28e407b8 3579
91327a77 3580Migrator::Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
11fdf7f2
TL
3581 max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size");
3582 inject_session_race = g_conf().get_val<bool>("mds_inject_migrator_session_race");
91327a77
AA
3583}
3584
11fdf7f2 3585void Migrator::handle_conf_change(const ConfigProxy& conf,
28e407b8
AA
3586 const std::set <std::string> &changed,
3587 const MDSMap &mds_map)
3588{
91327a77 3589 if (changed.count("mds_max_export_size"))
11fdf7f2 3590 max_export_size = g_conf().get_val<Option::size_t>("mds_max_export_size");
28e407b8 3591 if (changed.count("mds_inject_migrator_session_race")) {
11fdf7f2 3592 inject_session_race = conf.get_val<bool>("mds_inject_migrator_session_race");
28e407b8
AA
3593 dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
3594 }
3595}