1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <map>
21
22 #include "MDCache.h"
23 #include "MDSRank.h"
24 #include "Server.h"
25 #include "Locker.h"
26 #include "MDLog.h"
27 #include "MDBalancer.h"
28 #include "Migrator.h"
29 #include "ScrubStack.h"
30
31 #include "SnapClient.h"
32
33 #include "MDSMap.h"
34
35 #include "CInode.h"
36 #include "CDir.h"
37
38 #include "Mutation.h"
39
40 #include "include/ceph_fs.h"
41 #include "include/filepath.h"
42
43 #include "msg/Message.h"
44 #include "msg/Messenger.h"
45
46 #include "common/errno.h"
47 #include "common/safe_io.h"
48 #include "common/perf_counters.h"
49 #include "common/MemoryModel.h"
50 #include "osdc/Journaler.h"
51 #include "osdc/Filer.h"
52
53 #include "events/ESubtreeMap.h"
54 #include "events/EUpdate.h"
55 #include "events/ESlaveUpdate.h"
56 #include "events/EImportFinish.h"
57 #include "events/EFragment.h"
58 #include "events/ECommitted.h"
59 #include "events/ESessions.h"
60
61 #include "messages/MGenericMessage.h"
62
63 #include "messages/MMDSResolve.h"
64 #include "messages/MMDSResolveAck.h"
65 #include "messages/MMDSCacheRejoin.h"
66
67 #include "messages/MDiscover.h"
68 #include "messages/MDiscoverReply.h"
69
70 //#include "messages/MInodeUpdate.h"
71 #include "messages/MDirUpdate.h"
72 #include "messages/MCacheExpire.h"
73
74 #include "messages/MInodeFileCaps.h"
75
76 #include "messages/MLock.h"
77 #include "messages/MDentryLink.h"
78 #include "messages/MDentryUnlink.h"
79
80 #include "messages/MMDSFindIno.h"
81 #include "messages/MMDSFindInoReply.h"
82
83 #include "messages/MMDSOpenIno.h"
84 #include "messages/MMDSOpenInoReply.h"
85
86 #include "messages/MClientRequest.h"
87 #include "messages/MClientCaps.h"
88 #include "messages/MClientSnap.h"
89 #include "messages/MClientQuota.h"
90
91 #include "messages/MMDSSlaveRequest.h"
92
93 #include "messages/MMDSFragmentNotify.h"
94
95 #include "messages/MGatherCaps.h"
96
97 #include "InoTable.h"
98
99 #include "common/Timer.h"
100
101 #include "perfglue/heap_profiler.h"
102
103 using namespace std;
104
105 #include "common/config.h"
106 #include "include/assert.h"
107
108 #define dout_context g_ceph_context
109 #define dout_subsys ceph_subsys_mds
110 #undef dout_prefix
111 #define dout_prefix _prefix(_dout, mds)
112 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
113 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
114 }
115
116 set<int> SimpleLock::empty_gather_set;
117
118
119 /**
120 * All non-I/O contexts that require a reference
121 * to an MDCache instance descend from this.
122 */
123 class MDCacheContext : public virtual MDSInternalContextBase {
124 protected:
125 MDCache *mdcache;
126 MDSRank *get_mds() override
127 {
128 assert(mdcache != NULL);
129 return mdcache->mds;
130 }
131 public:
132 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
133 };
134
135
136 /**
137 * Only for contexts called back from an I/O completion
138 *
139 * Note: duplication of members wrt MDCacheContext, because
140  * it's the lesser of two evils compared with introducing
141 * yet another piece of (multiple) inheritance.
142 */
143 class MDCacheIOContext : public virtual MDSIOContextBase {
144 protected:
145 MDCache *mdcache;
146 MDSRank *get_mds() override
147 {
148 assert(mdcache != NULL);
149 return mdcache->mds;
150 }
151 public:
152 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
153 };
154
155 class MDCacheLogContext : public virtual MDSLogContextBase {
156 protected:
157 MDCache *mdcache;
158 MDSRank *get_mds() override
159 {
160 assert(mdcache != NULL);
161 return mdcache->mds;
162 }
163 public:
164 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
165 };
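/*
 * The three context flavours above exist for the same reason: internal,
 * I/O-completion and journal callbacks all need to reach the owning MDSRank
 * through the cache pointer. For a concrete in-file example, see
 * C_MDC_CreateSystemFile below, which derives from MDCacheLogContext and is
 * handed to MDLog::submit_entry(), so its finish() runs after the
 * corresponding journal entry is flushed.
 */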
166
167 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
168 mds(m),
169 filer(m->objecter, m->finisher),
170 exceeded_size_limit(false),
171 recovery_queue(m),
172 stray_manager(m, purge_queue_)
173 {
174 migrator.reset(new Migrator(mds, this));
175 root = NULL;
176 myin = NULL;
177 readonly = false;
178
179 stray_index = 0;
180 for (int i = 0; i < NUM_STRAY; ++i) {
181 strays[i] = NULL;
182 }
183
184 num_inodes_with_caps = 0;
185
186 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
187 (g_conf->mds_dir_max_commit_size << 20) :
188 (0.9 *(g_conf->osd_max_write_size << 20));
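  // The two sizes above are configured in MB; << 20 converts to bytes. When
  // mds_dir_max_commit_size is unset we fall back to 90% of the OSD max write
  // size, presumably so a single dirfrag commit fits in one OSD write.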
189
190 discover_last_tid = 0;
191 open_ino_last_tid = 0;
192 find_ino_peer_last_tid = 0;
193
194 last_cap_id = 0;
195
196 client_lease_durations[0] = 5.0;
197 client_lease_durations[1] = 30.0;
198 client_lease_durations[2] = 300.0;
199
200 resolves_pending = false;
201 rejoins_pending = false;
202 cap_imports_num_opening = 0;
203
204 opening_root = open = false;
205 lru.lru_set_max(g_conf->mds_cache_size);
206 lru.lru_set_midpoint(g_conf->mds_cache_mid);
207
208 decayrate.set_halflife(g_conf->mds_decay_halflife);
209
210 did_shutdown_log_cap = false;
211 }
212
213 MDCache::~MDCache()
214 {
215 if (logger) {
216 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
217 }
218 }
219
220
221
222 void MDCache::log_stat()
223 {
224 mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
225 mds->logger->set(l_mds_inodes, lru.lru_get_size());
226 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
227 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
228 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
229 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
230 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
231 mds->logger->set(l_mds_caps, Capability::count());
232 }
233
234
235 //
236
237 bool MDCache::shutdown()
238 {
239 if (lru.lru_get_size() > 0) {
240 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
241 //show_cache();
242 show_subtrees();
243 //dump();
244 }
245 return true;
246 }
247
248
249 // ====================================================================
250 // some inode functions
251
252 void MDCache::add_inode(CInode *in)
253 {
254 // add to lru, inode map
255 assert(inode_map.count(in->vino()) == 0); // should be no dup inos!
256 inode_map[ in->vino() ] = in;
257
258 if (in->ino() < MDS_INO_SYSTEM_BASE) {
259 if (in->ino() == MDS_INO_ROOT)
260 root = in;
261 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
262 myin = in;
263 else if (in->is_stray()) {
264 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
265 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
266 }
267 }
268 if (in->is_base())
269 base_inodes.insert(in);
270 }
271
272 if (CInode::count() >
273 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
274 exceeded_size_limit = true;
275 }
276 }
277
278 void MDCache::remove_inode(CInode *o)
279 {
280 dout(14) << "remove_inode " << *o << dendl;
281
282 if (o->get_parent_dn()) {
283 // FIXME: multiple parents?
284 CDentry *dn = o->get_parent_dn();
285 assert(!dn->is_dirty());
286 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
287 }
288
289 if (o->is_dirty())
290 o->mark_clean();
291 if (o->is_dirty_parent())
292 o->clear_dirty_parent();
293
294 o->clear_scatter_dirty();
295
296 o->item_open_file.remove_myself();
297
298 export_pin_queue.erase(o);
299
300 // remove from inode map
301 inode_map.erase(o->vino());
302
303 if (o->ino() < MDS_INO_SYSTEM_BASE) {
304 if (o == root) root = 0;
305 if (o == myin) myin = 0;
306 if (o->is_stray()) {
307 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
308 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
309 }
310 }
311 if (o->is_base())
312 base_inodes.erase(o);
313 }
314
315 // delete it
316 assert(o->get_num_ref() == 0);
317 delete o;
318 }
319
320 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
321 {
322 file_layout_t result = file_layout_t::get_default();
323 result.pool_id = mdsmap.get_first_data_pool();
324 return result;
325 }
326
327 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
328 {
329 file_layout_t result = file_layout_t::get_default();
330 result.pool_id = mdsmap.get_metadata_pool();
331 if (g_conf->mds_log_segment_size > 0) {
332 result.object_size = g_conf->mds_log_segment_size;
333 result.stripe_unit = g_conf->mds_log_segment_size;
334 }
335 return result;
336 }
337
338 void MDCache::init_layouts()
339 {
340 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
341 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
342 }
343
344 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
345 int mode) const
346 {
347 in->inode.ino = ino;
348 in->inode.version = 1;
349 in->inode.xattr_version = 1;
350 in->inode.mode = 0500 | mode;
351 in->inode.size = 0;
352 in->inode.ctime =
353 in->inode.mtime =
354 in->inode.btime = ceph_clock_now();
355 in->inode.nlink = 1;
356 in->inode.truncate_size = -1ull;
357 in->inode.change_attr = 0;
358 in->inode.export_pin = MDS_RANK_NONE;
359
360 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
361 if (in->inode.is_dir()) {
362 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
363 ++in->inode.rstat.rsubdirs;
364 } else {
365 in->inode.layout = default_file_layout;
366 ++in->inode.rstat.rfiles;
367 }
368 in->inode.accounted_rstat = in->inode.rstat;
369
370 if (in->is_base()) {
371 if (in->is_root())
372 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
373 else
374 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
375 in->open_snaprealm(); // empty snaprealm
376 assert(!in->snaprealm->parent); // created its own
377 in->snaprealm->srnode.seq = 1;
378 }
379 }
380
381 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
382 {
383 dout(0) << "creating system inode with ino:" << ino << dendl;
384 CInode *in = new CInode(this);
385 create_unlinked_system_inode(in, ino, mode);
386 add_inode(in);
387 return in;
388 }
389
390 CInode *MDCache::create_root_inode()
391 {
392 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
393 i->inode.uid = g_conf->mds_root_ino_uid;
394 i->inode.gid = g_conf->mds_root_ino_gid;
395 i->inode.layout = default_file_layout;
396 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
397 return i;
398 }
399
400 void MDCache::create_empty_hierarchy(MDSGather *gather)
401 {
402 // create root dir
403 CInode *root = create_root_inode();
404
405 // force empty root dir
406 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
407 adjust_subtree_auth(rootdir, mds->get_nodeid());
408 rootdir->dir_rep = CDir::REP_ALL; //NONE;
409
410 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
411 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
412
413 root->inode.dirstat = rootdir->fnode.fragstat;
414 root->inode.rstat = rootdir->fnode.rstat;
415 ++root->inode.rstat.rsubdirs;
416 root->inode.accounted_rstat = root->inode.rstat;
417
418 rootdir->mark_complete();
419 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
420 rootdir->commit(0, gather->new_sub());
421
422 root->store(gather->new_sub());
423 }
424
425 void MDCache::create_mydir_hierarchy(MDSGather *gather)
426 {
427 // create mds dir
428 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
429
430 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
431 adjust_subtree_auth(mydir, mds->get_nodeid());
432
433 LogSegment *ls = mds->mdlog->get_current_segment();
434
435 // stray dir
436 for (int i = 0; i < NUM_STRAY; ++i) {
437 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
438 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
439 stringstream name;
440 name << "stray" << i;
441 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
442 sdn->_mark_dirty(mds->mdlog->get_current_segment());
443
444 stray->inode.dirstat = straydir->fnode.fragstat;
445
446 mydir->fnode.rstat.add(stray->inode.rstat);
447 mydir->fnode.fragstat.nsubdirs++;
448 // save them
449 straydir->mark_complete();
450 straydir->mark_dirty(straydir->pre_dirty(), ls);
451 straydir->commit(0, gather->new_sub());
452 stray->_mark_dirty_parent(ls, true);
453 stray->store_backtrace(gather->new_sub());
454 }
455
456 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
457 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
458
459 myin->inode.dirstat = mydir->fnode.fragstat;
460 myin->inode.rstat = mydir->fnode.rstat;
461 ++myin->inode.rstat.rsubdirs;
462 myin->inode.accounted_rstat = myin->inode.rstat;
463
464 mydir->mark_complete();
465 mydir->mark_dirty(mydir->pre_dirty(), ls);
466 mydir->commit(0, gather->new_sub());
467
468 myin->store(gather->new_sub());
469 }
470
471 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
472 MutationRef mut;
473 CDentry *dn;
474 version_t dpv;
475 MDSInternalContextBase *fin;
476 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
477 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
478 void finish(int r) override {
479 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
480 }
481 };
482
483 void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
484 {
485 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
486 CDentry *dn = dir->add_null_dentry(name);
487
488 dn->push_projected_linkage(in);
489 version_t dpv = dn->pre_dirty();
490
491 CDir *mdir = 0;
492 if (in->inode.is_dir()) {
493 in->inode.rstat.rsubdirs = 1;
494
495 mdir = in->get_or_open_dirfrag(this, frag_t());
496 mdir->mark_complete();
497 mdir->pre_dirty();
498 } else
499 in->inode.rstat.rfiles = 1;
500 in->inode.version = dn->pre_dirty();
501
502 SnapRealm *realm = dir->get_inode()->find_snaprealm();
503 dn->first = in->first = realm->get_newest_seq() + 1;
504
505 MutationRef mut(new MutationImpl());
506
507 // force some locks. hacky.
508 mds->locker->wrlock_force(&dir->inode->filelock, mut);
509 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
510
511 mut->ls = mds->mdlog->get_current_segment();
512 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
513 mds->mdlog->start_entry(le);
514
515 if (!in->is_mdsdir()) {
516 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
517 le->metablob.add_primary_dentry(dn, in, true);
518 } else {
519 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
520 journal_dirty_inode(mut.get(), &le->metablob, in);
521 dn->push_projected_linkage(in->ino(), in->d_type());
522 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
523 le->metablob.add_root(true, in);
524 }
525 if (mdir)
526 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
527
528 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
529 mds->mdlog->flush();
530 }
531
532 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
533 {
534 dout(10) << "_create_system_file_finish " << *dn << dendl;
535
536 dn->pop_projected_linkage();
537 dn->mark_dirty(dpv, mut->ls);
538
539 CInode *in = dn->get_linkage()->get_inode();
540 in->inode.version--;
541 in->mark_dirty(in->inode.version + 1, mut->ls);
542
543 if (in->inode.is_dir()) {
544 CDir *dir = in->get_dirfrag(frag_t());
545 assert(dir);
546 dir->mark_dirty(1, mut->ls);
547 dir->mark_new(mut->ls);
548 }
549
550 mut->apply();
551 mds->locker->drop_locks(mut.get());
552 mut->cleanup();
553
554 fin->complete(0);
555
556 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
557 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
558 }
559
560
561
562 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
563 MDCache *cache;
564 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
565 void finish(int r) override {
566 if (r < 0) {
567 // If we can't open root, something disastrous has happened: mark
568 // this rank damaged for operator intervention. Note that
569 // it is not okay to call suicide() here because we are in
570 // a Finisher callback.
571 cache->mds->damaged();
572 ceph_abort(); // damaged should never return
573 } else {
574 cache->open_root();
575 }
576 }
577 };
578
579 void MDCache::open_root_inode(MDSInternalContextBase *c)
580 {
581 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
582 CInode *in;
583 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
584 in->fetch(c);
585 } else {
586 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
587 }
588 }
589
590 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
591 {
592 MDSGatherBuilder gather(g_ceph_context);
593
594 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
595 in->fetch(gather.new_sub());
596
597 gather.set_finisher(c);
598 gather.activate();
599 }
600
601 void MDCache::open_root()
602 {
603 dout(10) << "open_root" << dendl;
604
605 if (!root) {
606 open_root_inode(new C_MDS_RetryOpenRoot(this));
607 return;
608 }
609 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
610 assert(root->is_auth());
611 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
612 assert(rootdir);
613 if (!rootdir->is_subtree_root())
614 adjust_subtree_auth(rootdir, mds->get_nodeid());
615 if (!rootdir->is_complete()) {
616 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
617 return;
618 }
619 } else {
620 assert(!root->is_auth());
621 CDir *rootdir = root->get_dirfrag(frag_t());
622 if (!rootdir) {
623 discover_dir_frag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
624 return;
625 }
626 }
627
628 if (!myin) {
629 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
630 in->fetch(new C_MDS_RetryOpenRoot(this));
631 return;
632 }
633 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
634 assert(mydir);
635 adjust_subtree_auth(mydir, mds->get_nodeid());
636
637 populate_mydir();
638 }
639
640 void MDCache::populate_mydir()
641 {
642 assert(myin);
643 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
644 assert(mydir);
645
646 dout(10) << "populate_mydir " << *mydir << dendl;
647
648 if (!mydir->is_complete()) {
649 mydir->fetch(new C_MDS_RetryOpenRoot(this));
650 return;
651 }
652
653 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
654     // This dirfrag is missing; we will recreate it. We must mark it dirty
655     // before dirtying any of the strays we create within it.
656 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
657 "recreating it now";
658 LogSegment *ls = mds->mdlog->get_current_segment();
659 mydir->state_clear(CDir::STATE_BADFRAG);
660 mydir->mark_complete();
661 mydir->mark_dirty(mydir->pre_dirty(), ls);
662 }
663
664 // open or create stray
665 uint64_t num_strays = 0;
666 for (int i = 0; i < NUM_STRAY; ++i) {
667 stringstream name;
668 name << "stray" << i;
669 CDentry *straydn = mydir->lookup(name.str());
670
671 // allow for older fs's with stray instead of stray0
672 if (straydn == NULL && i == 0)
673 straydn = mydir->lookup("stray");
674
675 if (!straydn || !straydn->get_linkage()->get_inode()) {
676 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
677 new C_MDS_RetryOpenRoot(this));
678 return;
679 }
680 assert(straydn);
681 assert(strays[i]);
682 // we make multiple passes through this method; make sure we only pin each stray once.
683 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
684 strays[i]->get(CInode::PIN_STRAY);
685 strays[i]->state_set(CInode::STATE_STRAYPINNED);
686 strays[i]->get_stickydirs();
687 }
688 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
689
690 // open all frags
691 list<frag_t> ls;
692 strays[i]->dirfragtree.get_leaves(ls);
693 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
694 frag_t fg = *p;
695 CDir *dir = strays[i]->get_dirfrag(fg);
696 if (!dir) {
697 dir = strays[i]->get_or_open_dirfrag(this, fg);
698 }
699
700 // DamageTable applies special handling to strays: it will
701 // have damaged() us out if one is damaged.
702 assert(!dir->state_test(CDir::STATE_BADFRAG));
703
704 if (dir->get_version() == 0) {
705 dir->fetch(new C_MDS_RetryOpenRoot(this));
706 return;
707 }
708
709 if (dir->get_frag_size() > 0)
710 num_strays += dir->get_frag_size();
711 }
712 }
713
714 stray_manager.set_num_strays(num_strays);
715
716 // okay!
717 dout(10) << "populate_mydir done" << dendl;
718 assert(!open);
719 open = true;
720 mds->queue_waiters(waiting_for_open);
721
722 scan_stray_dir();
723 }
724
725 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
726 {
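  // The mdsdir inode number encodes its owning rank (MDS_INO_MDSDIR_OFFSET +
  // rank), so masking with (MAX_MDS-1) appears to recover the rank whose
  // mdsdir we need to discover.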
727 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
728 }
729
730 CDir *MDCache::get_stray_dir(CInode *in)
731 {
732 string straydname;
733 in->name_stray_dentry(straydname);
734
735 CInode *strayi = get_stray();
736 assert(strayi);
737 frag_t fg = strayi->pick_dirfrag(straydname);
738 CDir *straydir = strayi->get_dirfrag(fg);
739 assert(straydir);
740 return straydir;
741 }
742
743 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
744 {
745 CDir *straydir = get_stray_dir(in);
746 string straydname;
747 in->name_stray_dentry(straydname);
748 CDentry *straydn = straydir->lookup(straydname);
749 if (!straydn) {
750 straydn = straydir->add_null_dentry(straydname);
751 straydn->mark_new();
752 } else {
753 assert(straydn->get_projected_linkage()->is_null());
754 }
755
756 straydn->state_set(CDentry::STATE_STRAY);
757 return straydn;
758 }
759
760
761
762 MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
763 {
764 // inode?
765 if (info.ino)
766 return get_inode(info.ino, info.snapid);
767
768 // dir or dentry.
769 CDir *dir = get_dirfrag(info.dirfrag);
770 if (!dir) return 0;
771
772 if (info.dname.length())
773 return dir->lookup(info.dname, info.snapid);
774 else
775 return dir;
776 }
777
778
779
780
781 // ====================================================================
782 // subtree management
783
784 void MDCache::list_subtrees(list<CDir*>& ls)
785 {
786 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
787 p != subtrees.end();
788 ++p)
789 ls.push_back(p->first);
790 }
791
792 /*
793 * adjust the dir_auth of a subtree.
794  * merge with parent and/or child subtrees, if it is appropriate.
795 * merge can ONLY happen if both parent and child have unambiguous auth.
796 */
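/*
 * For orientation: 'subtrees' maps each subtree root dirfrag to the set of
 * dirfrags that bound it from below, i.e. the points where nested subtrees
 * begin. Illustrative shape only (real keys/values are CDir pointers):
 *
 *   subtrees[ dirfrag("/") ]     = { dirfrag("/home") }  // nested subtree
 *   subtrees[ dirfrag("/home") ] = { }                   // no bounds of its own
 *
 * Making 'dir' a new subtree root therefore means creating its (empty) bounds
 * set, stealing any of the old root's bounds that now fall beneath 'dir', and
 * recording 'dir' itself as a bound of the old root -- which is what the
 * function below does.
 */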
797 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool do_eval)
798 {
799 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
800 << " on " << *dir << dendl;
801
802 if (mds->is_any_replay() || mds->is_resolve())
803 do_eval = false;
804
805 show_subtrees();
806
807 CDir *root;
808 if (dir->inode->is_base()) {
809 root = dir; // bootstrap hack.
810 if (subtrees.count(root) == 0) {
811 subtrees[root];
812 root->get(CDir::PIN_SUBTREE);
813 }
814 } else {
815 root = get_subtree_root(dir); // subtree root
816 }
817 assert(root);
818 assert(subtrees.count(root));
819 dout(7) << " current root is " << *root << dendl;
820
821 if (root == dir) {
822 // i am already a subtree.
823 dir->set_dir_auth(auth);
824 } else {
825 // i am a new subtree.
826 dout(10) << " new subtree at " << *dir << dendl;
827 assert(subtrees.count(dir) == 0);
828 subtrees[dir]; // create empty subtree bounds list for me.
829 dir->get(CDir::PIN_SUBTREE);
830
831 // set dir_auth
832 dir->set_dir_auth(auth);
833
834 // move items nested beneath me, under me.
835 set<CDir*>::iterator p = subtrees[root].begin();
836 while (p != subtrees[root].end()) {
837 set<CDir*>::iterator next = p;
838 ++next;
839 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
840 // move under me
841 dout(10) << " claiming child bound " << **p << dendl;
842 subtrees[dir].insert(*p);
843 subtrees[root].erase(p);
844 }
845 p = next;
846 }
847
848 // i am a bound of the parent subtree.
849 subtrees[root].insert(dir);
850
851 // i am now the subtree root.
852 root = dir;
853
854 // adjust recursive pop counters
855 if (dir->is_auth()) {
856 utime_t now = ceph_clock_now();
857 CDir *p = dir->get_parent_dir();
858 while (p) {
859 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
860 if (p->is_subtree_root()) break;
861 p = p->inode->get_parent_dir();
862 }
863 }
864
865 if (do_eval)
866 eval_subtree_root(dir->get_inode());
867 }
868
869 show_subtrees();
870 }
871
872
873 void MDCache::try_subtree_merge(CDir *dir)
874 {
875 dout(7) << "try_subtree_merge " << *dir << dendl;
876 assert(subtrees.count(dir));
877 set<CDir*> oldbounds = subtrees[dir];
878
879 // try merge at my root
880 try_subtree_merge_at(dir);
881
882 // try merge at my old bounds
883 for (set<CDir*>::iterator p = oldbounds.begin();
884 p != oldbounds.end();
885 ++p)
886 try_subtree_merge_at(*p);
887 }
888
889 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
890 CInode *in;
891 MutationRef mut;
892 public:
893 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
894 void finish(int r) override {
895 mdcache->subtree_merge_writebehind_finish(in, mut);
896 }
897 };
898
899 void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
900 {
901 dout(10) << "try_subtree_merge_at " << *dir << dendl;
902 assert(subtrees.count(dir));
903
904 if (mds->is_any_replay() || mds->is_resolve())
905 do_eval = false;
906
907 // merge with parent?
908 CDir *parent = dir;
909 if (!dir->inode->is_base())
910 parent = get_subtree_root(dir->get_parent_dir());
911
912 if (parent != dir && // we have a parent,
913 parent->dir_auth == dir->dir_auth && // auth matches,
914 dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous,
915 !dir->state_test(CDir::STATE_EXPORTBOUND) && // not an exportbound,
916 !dir->state_test(CDir::STATE_AUXSUBTREE)) { // not aux subtree
917 // merge with parent.
918 dout(10) << " subtree merge at " << *dir << dendl;
919 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
920
921 // move our bounds under the parent
922 for (set<CDir*>::iterator p = subtrees[dir].begin();
923 p != subtrees[dir].end();
924 ++p)
925 subtrees[parent].insert(*p);
926
927 // we are no longer a subtree or bound
928 dir->put(CDir::PIN_SUBTREE);
929 subtrees.erase(dir);
930 subtrees[parent].erase(dir);
931
932 // adjust popularity?
933 if (dir->is_auth()) {
934 utime_t now = ceph_clock_now();
935 CDir *p = dir->get_parent_dir();
936 while (p) {
937 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
938 if (p->is_subtree_root()) break;
939 p = p->inode->get_parent_dir();
940 }
941 }
942
943 if (do_eval)
944 eval_subtree_root(dir->get_inode());
945 }
946
947 show_subtrees(15);
948 }
949
950 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
951 {
952 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
953 in->pop_and_dirty_projected_inode(mut->ls);
954
955 mut->apply();
956 mds->locker->drop_locks(mut.get());
957 mut->cleanup();
958
959 in->auth_unpin(this);
960 }
961
962 void MDCache::eval_subtree_root(CInode *diri)
963 {
964 // evaluate subtree inode filelock?
965 // (we should scatter the filelock on subtree bounds)
966 if (diri->is_auth())
967 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
968 }
969
970
971 void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
972 {
973 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
974 << " on " << *dir
975 << " bounds " << bounds
976 << dendl;
977
978 show_subtrees();
979
980 CDir *root;
981 if (dir->ino() == MDS_INO_ROOT) {
982 root = dir; // bootstrap hack.
983 if (subtrees.count(root) == 0) {
984 subtrees[root];
985 root->get(CDir::PIN_SUBTREE);
986 }
987 } else {
988 root = get_subtree_root(dir); // subtree root
989 }
990 assert(root);
991 assert(subtrees.count(root));
992 dout(7) << " current root is " << *root << dendl;
993
994 mds_authority_t oldauth = dir->authority();
995
996 if (root == dir) {
997 // i am already a subtree.
998 dir->set_dir_auth(auth);
999 } else {
1000 // i am a new subtree.
1001 dout(10) << " new subtree at " << *dir << dendl;
1002 assert(subtrees.count(dir) == 0);
1003 subtrees[dir]; // create empty subtree bounds list for me.
1004 dir->get(CDir::PIN_SUBTREE);
1005
1006 // set dir_auth
1007 dir->set_dir_auth(auth);
1008
1009 // move items nested beneath me, under me.
1010 set<CDir*>::iterator p = subtrees[root].begin();
1011 while (p != subtrees[root].end()) {
1012 set<CDir*>::iterator next = p;
1013 ++next;
1014 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1015 // move under me
1016 dout(10) << " claiming child bound " << **p << dendl;
1017 subtrees[dir].insert(*p);
1018 subtrees[root].erase(p);
1019 }
1020 p = next;
1021 }
1022
1023 // i am a bound of the parent subtree.
1024 subtrees[root].insert(dir);
1025
1026 // i am now the subtree root.
1027 root = dir;
1028 }
1029
1030 // verify/adjust bounds.
1031 // - these may be new, or
1032 // - beneath existing ambiguous bounds (which will be collapsed),
1033 // - but NOT beneath unambiguous bounds.
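  // (Roughly: a reported bound may already be a subtree root here, or it may
  //  sit beneath one of our existing ambiguous subtrees, in which case the
  //  intervening subtree roots get "swallowed" into this one below.)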
1034 for (set<CDir*>::iterator p = bounds.begin();
1035 p != bounds.end();
1036 ++p) {
1037 CDir *bound = *p;
1038
1039 // new bound?
1040 if (subtrees[dir].count(bound) == 0) {
1041 if (get_subtree_root(bound) == dir) {
1042 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1043 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1044 }
1045 else {
1046 dout(10) << " want bound " << *bound << dendl;
1047 CDir *t = get_subtree_root(bound->get_parent_dir());
1048 if (subtrees[t].count(bound) == 0) {
1049 assert(t != dir);
1050 dout(10) << " new bound " << *bound << dendl;
1051 adjust_subtree_auth(bound, t->authority());
1052 }
1053 // make sure it's nested beneath ambiguous subtree(s)
1054 while (1) {
1055 while (subtrees[dir].count(t) == 0)
1056 t = get_subtree_root(t->get_parent_dir());
1057 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1058 adjust_subtree_auth(t, auth);
1059 try_subtree_merge_at(t);
1060 t = get_subtree_root(bound->get_parent_dir());
1061 if (t == dir) break;
1062 }
1063 }
1064 }
1065 else {
1066 dout(10) << " already have bound " << *bound << dendl;
1067 }
1068 }
1069 // merge stray bounds?
1070 while (!subtrees[dir].empty()) {
1071 set<CDir*> copy = subtrees[dir];
1072 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1073 if (bounds.count(*p) == 0) {
1074 CDir *stray = *p;
1075 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1076 adjust_subtree_auth(stray, auth);
1077 try_subtree_merge_at(stray);
1078 }
1079 }
1080 // swallowing subtree may add new subtree bounds
1081 if (copy == subtrees[dir])
1082 break;
1083 }
1084
1085 // bound should now match.
1086 verify_subtree_bounds(dir, bounds);
1087
1088 show_subtrees();
1089 }
1090
1091
1092 /*
1093 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1094 * fragmentation as necessary to get an equivalent bounding set. That is, only
1095 * split if one of our frags spans the provided bounding set. Never merge.
1096 */
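/*
 * Illustrative example (hypothetical frags): if a requested bound is frag 10*
 * but the cached fragtree for that inode only has the root frag *, we split *
 * far enough to expose 10*; if the cached tree is already split at least that
 * far, the matching leaf frags are returned as-is. Frags are never merged to
 * match the request.
 */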
1097 void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1098 {
1099 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1100
1101 // sort by ino
1102 map<inodeno_t, fragset_t> byino;
1103 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1104 byino[p->ino].insert(p->frag);
1105 dout(10) << " by ino: " << byino << dendl;
1106
1107 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1108 CInode *diri = get_inode(p->first);
1109 if (!diri)
1110 continue;
1111 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1112
1113 fragtree_t tmpdft;
1114 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1115 tmpdft.force_to_leaf(g_ceph_context, *q);
1116
1117 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1118 frag_t fg = *q;
1119 list<frag_t> fgls;
1120 diri->dirfragtree.get_leaves_under(fg, fgls);
1121 if (fgls.empty()) {
1122 bool all = true;
1123 frag_t approx_fg = diri->dirfragtree[fg.value()];
1124 list<frag_t> ls;
1125 tmpdft.get_leaves_under(approx_fg, ls);
1126 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1127 if (p->second.get().count(*r) == 0) {
1128 // not bound, so the resolve message is from auth MDS of the dirfrag
1129 force_dir_fragment(diri, *r);
1130 all = false;
1131 }
1132 }
1133 if (all)
1134 fgls.push_back(approx_fg);
1135 else
1136 diri->dirfragtree.get_leaves_under(fg, fgls);
1137 }
1138 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1139 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1140 CDir *dir = diri->get_dirfrag(*r);
1141 if (dir)
1142 bounds.insert(dir);
1143 }
1144 }
1145 }
1146 }
1147
1148 void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1149 {
1150 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1151 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1152
1153 set<CDir*> bounds;
1154 get_force_dirfrag_bound_set(bound_dfs, bounds);
1155 adjust_bounded_subtree_auth(dir, bounds, auth);
1156 }
1157
1158 void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1159 {
1160 dout(10) << "map_dirfrag_set " << dfs << dendl;
1161
1162 // group by inode
1163 map<inodeno_t, fragset_t> ino_fragset;
1164 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1165 ino_fragset[p->ino].insert(p->frag);
1166
1167 // get frags
1168 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1169 p != ino_fragset.end();
1170 ++p) {
1171 CInode *in = get_inode(p->first);
1172 if (!in)
1173 continue;
1174
1175 list<frag_t> fglist;
1176 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1177 in->dirfragtree.get_leaves_under(*q, fglist);
1178
1179 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1180 << " on " << *in << dendl;
1181
1182 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1183 CDir *dir = in->get_dirfrag(*q);
1184 if (dir)
1185 result.insert(dir);
1186 }
1187 }
1188 }
1189
1190
1191
1192 CDir *MDCache::get_subtree_root(CDir *dir)
1193 {
1194 // find the underlying dir that delegates (or is about to delegate) auth
1195 while (true) {
1196 if (dir->is_subtree_root())
1197 return dir;
1198 dir = dir->get_inode()->get_parent_dir();
1199 if (!dir)
1200 return 0; // none
1201 }
1202 }
1203
1204 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1205 {
1206 // find the underlying dir that delegates (or is about to delegate) auth
1207 while (true) {
1208 if (dir->is_subtree_root())
1209 return dir;
1210 dir = dir->get_inode()->get_projected_parent_dir();
1211 if (!dir)
1212 return 0; // none
1213 }
1214 }
1215
1216 void MDCache::remove_subtree(CDir *dir)
1217 {
1218 dout(10) << "remove_subtree " << *dir << dendl;
1219 assert(subtrees.count(dir));
1220 assert(subtrees[dir].empty());
1221 subtrees.erase(dir);
1222 dir->put(CDir::PIN_SUBTREE);
1223 if (dir->get_parent_dir()) {
1224 CDir *p = get_subtree_root(dir->get_parent_dir());
1225 assert(subtrees[p].count(dir));
1226 subtrees[p].erase(dir);
1227 }
1228 }
1229
1230 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1231 {
1232 assert(subtrees.count(dir));
1233 bounds = subtrees[dir];
1234 }
1235
1236 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1237 {
1238 if (subtrees.count(dir)) {
1239 // just copy them, dir is a subtree.
1240 get_subtree_bounds(dir, bounds);
1241 } else {
1242 // find them
1243 CDir *root = get_subtree_root(dir);
1244 for (set<CDir*>::iterator p = subtrees[root].begin();
1245 p != subtrees[root].end();
1246 ++p) {
1247 CDir *t = *p;
1248 while (t != root) {
1249 t = t->get_parent_dir();
1250 assert(t);
1251 if (t == dir) {
1252 bounds.insert(*p);
1253 continue;
1254 }
1255 }
1256 }
1257 }
1258 }
1259
1260 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1261 {
1262 // for debugging only.
1263 assert(subtrees.count(dir));
1264 if (bounds != subtrees[dir]) {
1265 dout(0) << "verify_subtree_bounds failed" << dendl;
1266 set<CDir*> b = bounds;
1267 for (auto &cd : subtrees[dir]) {
1268 if (bounds.count(cd)) {
1269 b.erase(cd);
1270 continue;
1271 }
1272 dout(0) << " missing bound " << *cd << dendl;
1273 }
1274 for (const auto &cd : b)
1275 dout(0) << " extra bound " << *cd << dendl;
1276 }
1277 assert(bounds == subtrees[dir]);
1278 }
1279
1280 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1281 {
1282 // for debugging only.
1283 assert(subtrees.count(dir));
1284
1285 // make sure that any bounds i do have are properly noted as such.
1286 int failed = 0;
1287 for (const auto &fg : bounds) {
1288 CDir *bd = get_dirfrag(fg);
1289 if (!bd) continue;
1290 if (subtrees[dir].count(bd) == 0) {
1291 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1292 failed++;
1293 }
1294 }
1295 assert(failed == 0);
1296 }
1297
1298 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1299 {
1300 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1301 << " to " << *newdir << dendl;
1302 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1303 }
1304
1305 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir,
1306 bool pop, bool imported)
1307 {
1308 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1309
1310 //show_subtrees();
1311
1312 CDir *newdir = diri->get_parent_dir();
1313
1314 if (pop) {
1315 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1316 assert(p != projected_subtree_renames.end());
1317 assert(!p->second.empty());
1318 assert(p->second.front().first == olddir);
1319 assert(p->second.front().second == newdir);
1320 p->second.pop_front();
1321 if (p->second.empty())
1322 projected_subtree_renames.erase(p);
1323 }
1324
1325 // adjust subtree
1326 list<CDir*> dfls;
1327 // make sure subtree dirfrags are at the front of the list
1328 diri->get_subtree_dirfrags(dfls);
1329 diri->get_nested_dirfrags(dfls);
1330 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1331 CDir *dir = *p;
1332
1333 dout(10) << "dirfrag " << *dir << dendl;
1334 CDir *oldparent = get_subtree_root(olddir);
1335 dout(10) << " old parent " << *oldparent << dendl;
1336 CDir *newparent = get_subtree_root(newdir);
1337 dout(10) << " new parent " << *newparent << dendl;
1338
1339 if (oldparent == newparent) {
1340 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1341 continue;
1342 }
1343
1344 if (dir->is_subtree_root()) {
1345 // children are fine. change parent.
1346 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1347 assert(subtrees[oldparent].count(dir));
1348 subtrees[oldparent].erase(dir);
1349 assert(subtrees.count(newparent));
1350 subtrees[newparent].insert(dir);
1351 try_subtree_merge_at(dir, !imported);
1352 } else {
1353 // mid-subtree.
1354
1355 // see if any old bounds move to the new parent.
1356 list<CDir*> tomove;
1357 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1358 p != subtrees[oldparent].end();
1359 ++p) {
1360 CDir *bound = *p;
1361 CDir *broot = get_subtree_root(bound->get_parent_dir());
1362 if (broot != oldparent) {
1363 assert(broot == newparent);
1364 tomove.push_back(bound);
1365 }
1366 }
1367 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1368 CDir *bound = *p;
1369 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1370 subtrees[oldparent].erase(bound);
1371 subtrees[newparent].insert(bound);
1372 }
1373
1374 // did auth change?
1375 if (oldparent->authority() != newparent->authority()) {
1376 adjust_subtree_auth(dir, oldparent->authority(), !imported); // caller is responsible for *diri.
1377 try_subtree_merge_at(dir, !imported);
1378 }
1379 }
1380 }
1381
1382 show_subtrees();
1383 }
1384
1385
1386 void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1387 {
1388 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1389 p != subtrees.end();
1390 ++p) {
1391 CDir *root = p->first;
1392 if (root->is_full_dir_auth())
1393 s.insert(root);
1394 }
1395 }
1396 void MDCache::get_auth_subtrees(set<CDir*>& s)
1397 {
1398 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1399 p != subtrees.end();
1400 ++p) {
1401 CDir *root = p->first;
1402 if (root->is_auth())
1403 s.insert(root);
1404 }
1405 }
1406
1407
1408 // count.
1409
1410 int MDCache::num_subtrees()
1411 {
1412 return subtrees.size();
1413 }
1414
1415 int MDCache::num_subtrees_fullauth()
1416 {
1417 int n = 0;
1418 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1419 p != subtrees.end();
1420 ++p) {
1421 CDir *root = p->first;
1422 if (root->is_full_dir_auth())
1423 n++;
1424 }
1425 return n;
1426 }
1427
1428 int MDCache::num_subtrees_fullnonauth()
1429 {
1430 int n = 0;
1431 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1432 p != subtrees.end();
1433 ++p) {
1434 CDir *root = p->first;
1435 if (root->is_full_dir_nonauth())
1436 n++;
1437 }
1438 return n;
1439 }
1440
1441
1442
1443 // ===================================
1444 // journal and snap/cow helpers
1445
1446
1447 /*
1448 * find first inode in cache that follows given snapid. otherwise, return current.
1449 */
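/*
 * Illustrative example: with realm snaps {5, 10} and follows = 4, we probe the
 * cache for a snapped inode whose last is 5, then one whose last is 10, and
 * return the first hit; if neither is cached we return the head inode we were
 * given.
 */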
1450 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1451 {
1452 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1453 assert(in->last == CEPH_NOSNAP);
1454
1455 SnapRealm *realm = in->find_snaprealm();
1456 const set<snapid_t>& snaps = realm->get_snaps();
1457 dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
1458 dout(10) << " snaps " << snaps << dendl;
1459
1460 if (snaps.empty())
1461 return in;
1462
1463 for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows); // first item > follows
1464 p != snaps.end();
1465 ++p) {
1466 CInode *t = get_inode(in->ino(), *p);
1467 if (t) {
1468 in = t;
1469 dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl;
1470 break;
1471 }
1472 }
1473 return in;
1474 }
1475
1476
1477 /*
1478 * note: i'm currently cheating wrt dirty and inode.version on cow
1479 * items. instead of doing a full dir predirty, i just take the
1480 * original item's version, and set the dirty flag (via
1481  * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1482 * means a special case in the dir commit clean sweep assertions.
1483 * bah.
1484 */
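/*
 * (cow_inode below clones the previous projected inode/xattrs into a new
 *  snapped CInode covering the pre-snapshot range, then bumps in->first to
 *  last+1 so the head inode only covers what comes after the snapshot.)
 */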
1485 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1486 {
1487 assert(last >= in->first);
1488
1489 SnapRealm *realm = in->find_snaprealm();
1490 const set<snapid_t>& snaps = realm->get_snaps();
1491
1492 // make sure snap inode's last match existing snapshots.
1493 // MDCache::pick_inode_snap() requires this.
1494 snapid_t last_snap = last;
1495 if (snaps.count(last) == 0) {
1496 set<snapid_t>::const_iterator p = snaps.upper_bound(last);
1497 if (p != snaps.begin()) {
1498 --p;
1499 if (*p >= in->first)
1500 last_snap = *p;
1501 }
1502 }
1503
1504 CInode *oldin = new CInode(this, true, in->first, last_snap);
1505 oldin->inode = *in->get_previous_projected_inode();
1506 oldin->symlink = in->symlink;
1507 oldin->xattrs = *in->get_previous_projected_xattrs();
1508 oldin->inode.trim_client_ranges(last);
1509
1510 if (in->first < in->oldest_snap)
1511 in->oldest_snap = in->first;
1512
1513 in->first = last+1;
1514
1515 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1516 add_inode(oldin);
1517
1518 if (in->last != CEPH_NOSNAP) {
1519 CInode *head_in = get_inode(in->ino());
1520 assert(head_in);
1521 if (head_in->split_need_snapflush(oldin, in)) {
1522 oldin->client_snap_caps = in->client_snap_caps;
1523 for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
1524 p != in->client_snap_caps.end();
1525 ++p) {
1526 SimpleLock *lock = oldin->get_lock(p->first);
1527 assert(lock);
1528 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
1529 oldin->auth_pin(lock);
1530 lock->set_state(LOCK_SNAP_SYNC); // gathering
1531 lock->get_wrlock(true);
1532 }
1533 }
1534 }
1535 return oldin;
1536 }
1537
1538 // clone caps?
1539 for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
1540 p != in->client_caps.end();
1541 ++p) {
1542 client_t client = p->first;
1543 Capability *cap = p->second;
1544 int issued = cap->issued();
1545 if ((issued & CEPH_CAP_ANY_WR) &&
1546 cap->client_follows < last) {
1547 // note in oldin
1548 for (int i = 0; i < num_cinode_locks; i++) {
1549 if (issued & cinode_lock_info[i].wr_caps) {
1550 int lockid = cinode_lock_info[i].lock;
1551 SimpleLock *lock = oldin->get_lock(lockid);
1552 assert(lock);
1553 oldin->client_snap_caps[lockid].insert(client);
1554 oldin->auth_pin(lock);
1555 lock->set_state(LOCK_SNAP_SYNC); // gathering
1556 lock->get_wrlock(true);
1557 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1558 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1559 }
1560 }
1561 cap->client_follows = last;
1562
1563 // we need snapflushes for any intervening snaps
1564 dout(10) << " snaps " << snaps << dendl;
1565 for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
1566 q != snaps.end() && *q <= last;
1567 ++q) {
1568 in->add_need_snapflush(oldin, *q, client);
1569 }
1570 } else {
1571 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1572 }
1573 }
1574
1575 return oldin;
1576 }
1577
1578 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1579 CDentry *dn, snapid_t follows,
1580 CInode **pcow_inode, CDentry::linkage_t *dnl)
1581 {
1582 if (!dn) {
1583 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1584 return;
1585 }
1586 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1587 assert(dn->is_auth());
1588
1589 // nothing to cow on a null dentry, fix caller
1590 if (!dnl)
1591 dnl = dn->get_projected_linkage();
1592 assert(!dnl->is_null());
1593
1594 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1595 // multiversion inode.
1596 CInode *in = dnl->get_inode();
1597 SnapRealm *realm = NULL;
1598
1599 if (in->get_projected_parent_dn() != dn) {
1600 assert(follows == CEPH_NOSNAP);
1601 realm = dn->dir->inode->find_snaprealm();
1602 snapid_t dir_follows = realm->get_newest_snap();
1603
1604 if (dir_follows+1 > dn->first) {
1605 snapid_t oldfirst = dn->first;
1606 dn->first = dir_follows+1;
1607 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1608 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
1609 oldfirst, dir_follows);
1610 olddn->pre_dirty();
1611 dout(10) << " olddn " << *olddn << dendl;
1612 metablob->add_remote_dentry(olddn, true);
1613 mut->add_cow_dentry(olddn);
1614 // FIXME: adjust link count here? hmm.
1615
1616 if (dir_follows+1 > in->first)
1617 in->cow_old_inode(dir_follows, false);
1618 }
1619 }
1620
1621 if (in->snaprealm) {
1622 realm = in->snaprealm;
1623 follows = realm->get_newest_seq();
1624 } else
1625 follows = dir_follows;
1626 } else {
1627 realm = in->find_snaprealm();
1628 if (follows == CEPH_NOSNAP)
1629 follows = realm->get_newest_seq();
1630 }
1631
1632 // already cloned?
1633 if (follows < in->first) {
1634 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1635 return;
1636 }
1637
1638 if (!realm->has_snaps_in_range(in->first, follows)) {
1639 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1640 in->first = follows + 1;
1641 return;
1642 }
1643
1644 in->cow_old_inode(follows, false);
1645
1646 } else {
1647 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1648 if (follows == CEPH_NOSNAP)
1649 follows = realm->get_newest_seq();
1650
1651 // already cloned?
1652 if (follows < dn->first) {
1653 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1654 return;
1655 }
1656
1657 // update dn.first before adding old dentry to cdir's map
1658 snapid_t oldfirst = dn->first;
1659 dn->first = follows+1;
1660
1661 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1662
1663 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1664 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1665 if (in)
1666 in->first = follows+1;
1667 return;
1668 }
1669
1670 dout(10) << " dn " << *dn << dendl;
1671 if (in) {
1672 CInode *oldin = cow_inode(in, follows);
1673 mut->add_cow_inode(oldin);
1674 if (pcow_inode)
1675 *pcow_inode = oldin;
1676 CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
1677 oldin->inode.version = olddn->pre_dirty();
1678 dout(10) << " olddn " << *olddn << dendl;
1679 bool need_snapflush = !oldin->client_snap_caps.empty();
1680 if (need_snapflush)
1681 mut->ls->open_files.push_back(&oldin->item_open_file);
1682 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1683 mut->add_cow_dentry(olddn);
1684 } else {
1685 assert(dnl->is_remote());
1686 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
1687 oldfirst, follows);
1688 olddn->pre_dirty();
1689 dout(10) << " olddn " << *olddn << dendl;
1690 metablob->add_remote_dentry(olddn, true);
1691 mut->add_cow_dentry(olddn);
1692 }
1693 }
1694 }
1695
1696
1697 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1698 CInode *in, snapid_t follows,
1699 CInode **pcow_inode)
1700 {
1701 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1702 CDentry *dn = in->get_projected_parent_dn();
1703 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1704 }
1705
1706 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1707 {
1708 if (in->is_base()) {
1709 metablob->add_root(true, in, in->get_projected_inode());
1710 } else {
1711 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1712 follows = in->first - 1;
1713 CDentry *dn = in->get_projected_parent_dn();
1714 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1715 journal_cow_dentry(mut, metablob, dn, follows);
1716 if (in->get_projected_inode()->is_backtrace_updated()) {
1717 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1718 in->get_previous_projected_inode()->layout.pool_id;
1719 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1720 } else {
1721 metablob->add_primary_dentry(dn, in, true);
1722 }
1723 }
1724 }
1725
1726
1727
1728 // nested ---------------------------------------------------------------
1729
1730 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1731 int linkunlink, SnapRealm *prealm)
1732 {
1733 CDentry *parentdn = cur->get_projected_parent_dn();
1734 inode_t *curi = cur->get_projected_inode();
1735
1736 if (cur->first > first)
1737 first = cur->first;
1738
1739 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1740 << " " << *cur << dendl;
1741 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1742 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1743
1744 /*
1745 * FIXME. this incompletely propagates rstats to _old_ parents
1746 * (i.e. shortly after a directory rename). but we need full
1747 * blown hard link backpointers to make this work properly...
1748 */
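  /*
   * (The quantity actually pushed into the parent frag is the difference
   *  between the inode's rstat and its accounted_rstat -- see the 'delta'
   *  computation in _project_rstat_inode_to_frag() below.)
   */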
1749 snapid_t floor = parentdn->first;
1750 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1751
1752 if (!prealm)
1753 prealm = parent->inode->find_snaprealm();
1754 const set<snapid_t> snaps = prealm->get_snaps();
1755
1756 if (cur->last != CEPH_NOSNAP) {
1757 assert(cur->dirty_old_rstats.empty());
1758 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1759 if (q == snaps.end() || *q > cur->last)
1760 return;
1761 }
1762
1763 if (cur->last >= floor) {
1764 bool update = true;
1765 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1766 // rename src inode is not projected in the slave rename prep case. so we should
1767       // avoid updating the inode.
1768 assert(linkunlink < 0);
1769 assert(cur->is_frozen_inode());
1770 update = false;
1771 }
1772 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1773 linkunlink, update);
1774 }
1775
1776 if (g_conf->mds_snap_rstat) {
1777 for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
1778 p != cur->dirty_old_rstats.end();
1779 ++p) {
1780 old_inode_t& old = cur->old_inodes[*p];
1781 snapid_t ofirst = MAX(old.first, floor);
1782 set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
1783 if (q == snaps.end() || *q > *p)
1784 continue;
1785 if (*p >= floor)
1786 _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
1787 }
1788 }
1789 cur->dirty_old_rstats.clear();
1790 }
1791
1792
1793 void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
1794 CDir *parent, int linkunlink, bool update_inode)
1795 {
1796 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1797 dout(20) << " inode rstat " << inode.rstat << dendl;
1798 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1799 nest_info_t delta;
1800 if (linkunlink == 0) {
1801 delta.add(inode.rstat);
1802 delta.sub(inode.accounted_rstat);
1803 } else if (linkunlink < 0) {
1804 delta.sub(inode.accounted_rstat);
1805 } else {
1806 delta.add(inode.rstat);
1807 }
1808 dout(20) << " delta " << delta << dendl;
1809
1810 if (update_inode)
1811 inode.accounted_rstat = inode.rstat;
1812
1813 while (last >= ofirst) {
1814 /*
1815 * pick fnode version to update. at each iteration, we want to
1816 * pick a segment ending in 'last' to update. split as necessary
1817 * to make that work. then, adjust first up so that we only
1818 * update one segment at a time. then loop to cover the whole
1819 * [ofirst,last] interval.
1820 */
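    /*
     * Illustrative walk-through (hypothetical snapids, mds_snap_rstat on):
     * with [ofirst,last] = [4,CEPH_NOSNAP] and parent->first = 6, the first
     * pass updates the head segment, then sets last = first-1 = 5, and the
     * next pass falls into the dirty_old_rstat branches to cover [4,5].
     */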
1821 nest_info_t *prstat;
1822 snapid_t first;
1823 fnode_t *pf = parent->get_projected_fnode();
1824 if (last == CEPH_NOSNAP) {
1825 if (g_conf->mds_snap_rstat)
1826 first = MAX(ofirst, parent->first);
1827 else
1828 first = parent->first;
1829 prstat = &pf->rstat;
1830 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1831
1832 if (first > parent->first &&
1833 !(pf->rstat == pf->accounted_rstat)) {
1834 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1835 << parent->first << "," << (first-1) << "] "
1836 << " " << *prstat << "/" << pf->accounted_rstat
1837 << dendl;
1838 parent->dirty_old_rstat[first-1].first = parent->first;
1839 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1840 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1841 }
1842 parent->first = first;
1843 } else if (!g_conf->mds_snap_rstat) {
1844 // drop snapshots' rstats
1845 break;
1846 } else if (last >= parent->first) {
1847 first = parent->first;
1848 parent->dirty_old_rstat[last].first = first;
1849 parent->dirty_old_rstat[last].rstat = pf->rstat;
1850 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1851 prstat = &parent->dirty_old_rstat[last].rstat;
1852 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1853 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1854 } else {
1855 // be careful, dirty_old_rstat is a _sparse_ map.
1856 // sorry, this is ugly.
1857 first = ofirst;
1858
1859 // find any intersection with last
1860 compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
1861 if (p == parent->dirty_old_rstat.end()) {
1862 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1863 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1864 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1865 first = parent->dirty_old_rstat.rbegin()->first+1;
1866 }
1867 } else {
1868 // *p last is >= last
1869 if (p->second.first <= last) {
1870 // *p intersects [first,last]
1871 if (p->second.first < first) {
1872 dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
1873 parent->dirty_old_rstat[first-1] = p->second;
1874 p->second.first = first;
1875 }
1876 if (p->second.first > first)
1877 first = p->second.first;
1878 if (last < p->first) {
1879 dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
1880 parent->dirty_old_rstat[last] = p->second;
1881 p->second.first = last+1;
1882 }
1883 } else {
1884 // *p is to the _right_ of [first,last]
1885 p = parent->dirty_old_rstat.lower_bound(first);
1886 // new *p last is >= first
1887 if (p->second.first <= last && // new *p isn't also to the right, and
1888 p->first >= first) { // it intersects our first bit,
1889 dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
1890 first = p->first+1;
1891 }
1892 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1893 }
1894 }
1895 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1896 parent->dirty_old_rstat[last].first = first;
1897 prstat = &parent->dirty_old_rstat[last].rstat;
1898 }
1899
1900 // apply
1901 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1902 assert(last >= first);
1903 prstat->add(delta);
1904 if (update_inode)
1905 inode.accounted_rstat = inode.rstat;
1906 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1907
1908 last = first-1;
1909 }
1910 }
1911
1912 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1913 snapid_t ofirst, snapid_t last,
1914 CInode *pin, bool cow_head)
1915 {
1916 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1917 dout(20) << " frag rstat " << rstat << dendl;
1918 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1919 nest_info_t delta = rstat;
1920 delta.sub(accounted_rstat);
1921 dout(20) << " delta " << delta << dendl;
1922
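// walk [ofirst,last] from newest to oldest: the head segment goes into the
// projected inode (cow'ing an old_inode if the range was split), snapped
// segments go into (possibly split) old_inodes entries, and each touched
// old_inode is remembered in dirty_old_rstats.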
1923 while (last >= ofirst) {
1924 inode_t *pi;
1925 snapid_t first;
1926 if (last == pin->last) {
1927 pi = pin->get_projected_inode();
1928 first = MAX(ofirst, pin->first);
1929 if (first > pin->first) {
1930 old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
1931 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1932 }
1933 } else {
1934 if (last >= pin->first) {
1935 first = pin->first;
1936 pin->cow_old_inode(last, cow_head);
1937 } else {
1938 // our life is easier here because old_inodes is not sparse
1939 // (although it may not begin at snapid 1)
1940 compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
1941 if (p == pin->old_inodes.end()) {
1942 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1943 break;
1944 }
1945 first = p->second.first;
1946 if (first > last) {
1947 dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
1948 //assert(p == pin->old_inodes.begin());
1949 break;
1950 }
1951 if (p->first > last) {
1952 dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
1953 << (last+1) << "," << p->first << "]" << dendl;
1954 pin->old_inodes[last] = p->second;
1955 p->second.first = last+1;
1956 pin->dirty_old_rstats.insert(p->first);
1957 }
1958 }
1959 if (first < ofirst) {
1960 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1961 << first << "," << ofirst-1 << "]" << dendl;
1962 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1963 pin->dirty_old_rstats.insert(ofirst-1);
1964 pin->old_inodes[last].first = first = ofirst;
1965 }
1966 pi = &pin->old_inodes[last].inode;
1967 pin->dirty_old_rstats.insert(last);
1968 }
1969 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1970 pi->rstat.add(delta);
1971 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1972
1973 last = first-1;
1974 }
1975 }
1976
1977 void MDCache::broadcast_quota_to_client(CInode *in)
1978 {
1979 if (!in->is_auth() || in->is_frozen())
1980 return;
1981
1982 inode_t *i = in->get_projected_inode();
1983
1984 if (!i->quota.is_enable())
1985 return;
1986
1987 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1988 it != in->client_caps.end();
1989 ++it) {
1990 Session *session = mds->get_session(it->first);
1991 if (!session || !session->connection ||
1992 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
1993 continue;
1994
1995 Capability *cap = it->second;
1996 if (cap->last_rbytes == i->rstat.rbytes &&
1997 cap->last_rsize == i->rstat.rsize())
1998 continue;
1999
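// heuristic for when to push a fresh quota/rstat snapshot to this client:
// always once usage crosses the threshold (rsize() >= max_files, or rbytes
// above 7/8 of max_bytes), and otherwise whenever usage has moved by more
// than ~1/16 of the client's last-reported headroom since the previous report.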
2000 if (i->quota.max_files > 0) {
2001 if (i->rstat.rsize() >= i->quota.max_files)
2002 goto update;
2003
2004 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2005 abs(cap->last_rsize - i->rstat.rsize()))
2006 goto update;
2007 }
2008
2009 if (i->quota.max_bytes > 0) {
2010 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2011 goto update;
2012
2013 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2014 abs(cap->last_rbytes - i->rstat.rbytes))
2015 goto update;
2016 }
2017
2018 continue;
2019
2020 update:
2021 cap->last_rsize = i->rstat.rsize();
2022 cap->last_rbytes = i->rstat.rbytes;
2023
2024 MClientQuota *msg = new MClientQuota();
2025 msg->ino = in->ino();
2026 msg->rstat = i->rstat;
2027 msg->quota = i->quota;
2028 mds->send_message_client_counted(msg, session->connection);
2029 }
2030 for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
2031 it != in->replicas_end();
2032 ++it) {
2033 MGatherCaps *msg = new MGatherCaps;
2034 msg->ino = in->ino();
2035 mds->send_message_mds(msg, it->first);
2036 }
2037 }
2038
2039 /*
2040 * NOTE: we _have_ to delay the scatter if we are called during a
2041 * rejoin, because we can't twiddle locks between when the
2042 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2043 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2044 * (no requests), and a survivor acks immediately. _except_ that
2045 * during rejoin_(weak|strong) processing, we may complete a lock
2046 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2047 * scatterlock state in that case or the lock states will get out of
2048 * sync between the auth and replica.
2049 *
2050 * the simple solution is to never do the scatter here. instead, put
2051 * the scatterlock on a list if it isn't already wrlockable. this is
2052 * probably the best plan anyway, since we avoid too many
2053 * scatters/locks under normal usage.
2054 */
2055 /*
2056 * some notes on dirlock/nestlock scatterlock semantics:
2057 *
2058 * the fragstat (dirlock) will never be updated without
2059 * dirlock+nestlock wrlock held by the caller.
2060 *
2061 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2062 * data is pushed up the tree. this could be changed with some
2063 * restructuring here, but in its current form we ensure that the
2064 * fragstat+rstat _always_ reflect an accurate summation over the dir
2065 * frag, which is nice. and, we only need to track frags that need to
2066 * be nudged (and not inodes with pending rstat changes that need to
2067 * be pushed into the frag). a consequence of this is that the
2068 * accounted_rstat on scatterlock sync may not match our current
2069 * rstat. this is normal and expected.
2070 */
2071 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2072 CInode *in, CDir *parent,
2073 int flags, int linkunlink,
2074 snapid_t cfollows)
2075 {
2076 bool primary_dn = flags & PREDIRTY_PRIMARY;
2077 bool do_parent_mtime = flags & PREDIRTY_DIR;
2078 bool shallow = flags & PREDIRTY_SHALLOW;
2079
2080 assert(mds->mdlog->entry_is_open());
2081
2082 // make sure stamp is set
2083 if (mut->get_mds_stamp() == utime_t())
2084 mut->set_mds_stamp(ceph_clock_now());
2085
2086 if (in->is_base())
2087 return;
2088
2089 dout(10) << "predirty_journal_parents"
2090 << (do_parent_mtime ? " do_parent_mtime":"")
2091 << " linkunlink=" << linkunlink
2092 << (primary_dn ? " primary_dn":" remote_dn")
2093 << (shallow ? " SHALLOW":"")
2094 << " follows " << cfollows
2095 << " " << *in << dendl;
2096
2097 if (!parent) {
2098 assert(primary_dn);
2099 parent = in->get_projected_parent_dn()->get_dir();
2100 }
2101
2102 if (flags == 0 && linkunlink == 0) {
2103 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2104 blob->add_dir_context(parent);
2105 return;
2106 }
2107
2108 // build list of inodes to wrlock, dirty, and update
2109 list<CInode*> lsi;
2110 CInode *cur = in;
2111 CDentry *parentdn = NULL;
2112 bool first = true;
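// each pass of the loop below projects one level up the tree:
//  1) inode -> dirfrag: fold this inode's rstat delta into the parent
//     dirfrag's projected fnode (project_rstat_inode_to_frag), and
//  2) dirfrag -> parent inode: fold the dirfrag's fragstat/rstat deltas into
//     the parent inode's projected inode (project_rstat_frag_to_inode),
// stopping at a base inode, a non-auth or un-authpinnable ancestor, an
// unwritable nestlock/versionlock, or a too-recent dirstat propagation.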
2113 while (parent) {
2114 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2115 assert(parent->is_auth());
2116
2117 // opportunistically adjust parent dirfrag
2118 CInode *pin = parent->get_inode();
2119
2120 // inode -> dirfrag
2121 mut->auth_pin(parent);
2122 mut->add_projected_fnode(parent);
2123
2124 fnode_t *pf = parent->project_fnode();
2125 pf->version = parent->pre_dirty();
2126
2127 if (do_parent_mtime || linkunlink) {
2128 assert(mut->wrlocks.count(&pin->filelock));
2129 assert(mut->wrlocks.count(&pin->nestlock));
2130 assert(cfollows == CEPH_NOSNAP);
2131
2132 // update stale fragstat/rstat?
2133 parent->resync_accounted_fragstat();
2134 parent->resync_accounted_rstat();
2135
2136 if (do_parent_mtime) {
2137 pf->fragstat.mtime = mut->get_op_stamp();
2138 pf->fragstat.change_attr++;
2139 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2140 if (pf->fragstat.mtime > pf->rstat.rctime) {
2141 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2142 pf->rstat.rctime = pf->fragstat.mtime;
2143 } else {
2144 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2145 }
2146 }
2147 if (linkunlink) {
2148 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2149 if (in->is_dir()) {
2150 pf->fragstat.nsubdirs += linkunlink;
2151 //pf->rstat.rsubdirs += linkunlink;
2152 } else {
2153 pf->fragstat.nfiles += linkunlink;
2154 //pf->rstat.rfiles += linkunlink;
2155 }
2156 }
2157 }
2158
2159 // rstat
2160 if (!primary_dn) {
2161 // don't update parent this pass
2162 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2163 pin->versionlock.can_wrlock())) {
2164 dout(20) << " unwritable parent nestlock " << pin->nestlock
2165 << ", marking dirty rstat on " << *cur << dendl;
2166 cur->mark_dirty_rstat();
2167 } else {
2168 // if we don't hold a wrlock reference on this nestlock, take one,
2169 // because we are about to write into the dirfrag fnode and that needs
2170 // to commit before the lock can cycle.
2171 if (linkunlink) {
2172 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2173 }
2174
2175 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2176 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2177 mds->locker->wrlock_force(&pin->nestlock, mut);
2178 }
2179
2180 // now we can project the inode rstat diff into the dirfrag
2181 SnapRealm *prealm = pin->find_snaprealm();
2182
2183 snapid_t follows = cfollows;
2184 if (follows == CEPH_NOSNAP)
2185 follows = prealm->get_newest_seq();
2186
2187 snapid_t first = follows+1;
2188
2189 // first, if the frag is stale, bring it back in sync.
2190 parent->resync_accounted_rstat();
2191
2192 // now push inode rstats into frag
2193 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2194 cur->clear_dirty_rstat();
2195 }
2196
2197 bool stop = false;
2198 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2199 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2200 stop = true;
2201 }
2202
2203 // delay propagating until later?
2204 if (!stop && !first &&
2205 g_conf->mds_dirstat_min_interval > 0) {
2206 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2207 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2208 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2209 << " < " << g_conf->mds_dirstat_min_interval
2210 << ", stopping" << dendl;
2211 stop = true;
2212 } else {
2213 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2214 }
2215 }
2216
2217 // can cast only because i'm passing nowait=true in the sole user
2218 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2219 if (!stop &&
2220 mut->wrlocks.count(&pin->nestlock) == 0 &&
2221 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2222 //true
2223 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2224 )) { // ** do not initiate.. see above comment **
2225 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2226 << " on " << *pin << dendl;
2227 stop = true;
2228 }
2229 if (stop) {
2230 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2231 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2232 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2233 mut->add_updated_lock(&pin->nestlock);
2234 if (do_parent_mtime || linkunlink) {
2235 mds->locker->mark_updated_scatterlock(&pin->filelock);
2236 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2237 mut->add_updated_lock(&pin->filelock);
2238 }
2239 break;
2240 }
2241 if (!mut->wrlocks.count(&pin->versionlock))
2242 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2243
2244 assert(mut->wrlocks.count(&pin->nestlock) ||
2245 mut->is_slave());
2246
2247 pin->last_dirstat_prop = mut->get_mds_stamp();
2248
2249 // dirfrag -> diri
2250 mut->auth_pin(pin);
2251 mut->add_projected_inode(pin);
2252 lsi.push_front(pin);
2253
2254 pin->pre_cow_old_inode(); // avoid cow mayhem!
2255
2256 inode_t *pi = pin->project_inode();
2257 pi->version = pin->pre_dirty();
2258
2259 // dirstat
2260 if (do_parent_mtime || linkunlink) {
2261 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2262 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2263 bool touched_mtime = false, touched_chattr = false;
2264 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2265 pf->accounted_fragstat = pf->fragstat;
2266 if (touched_mtime)
2267 pi->mtime = pi->ctime = pi->dirstat.mtime;
2268 if (touched_chattr)
2269 pi->change_attr = pi->dirstat.change_attr;
2270 dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
2271
2272 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2273 if (pi->dirstat.size() < 0)
2274 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2275 if (pi->dirstat.size() != pf->fragstat.size()) {
2276 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2277 << parent->dirfrag() << ", inode has " << pi->dirstat
2278 << ", dirfrag has " << pf->fragstat;
2279
2280 // trust the dirfrag for now
2281 pi->dirstat = pf->fragstat;
2282
2283 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2284 }
2285 }
2286 }
2287
2288 /*
2289 * the rule here is to follow the _oldest_ parent with dirty rstat
2290 * data. if we don't propagate all data, we add ourselves to the
2291 * nudge list. that way all rstat data will (eventually) get
2292 * pushed up the tree.
2293 *
2294 * actually, no. for now, silently drop rstats for old parents. we need
2295 * hard link backpointers to do the above properly.
2296 */
2297
2298 // stop?
2299 if (pin->is_base())
2300 break;
2301 parentdn = pin->get_projected_parent_dn();
2302 assert(parentdn);
2303
2304 // rstat
2305 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2306
2307 // first, if the frag is stale, bring it back in sync.
2308 parent->resync_accounted_rstat();
2309
2310 if (g_conf->mds_snap_rstat) {
2311 for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
2312 p != parent->dirty_old_rstat.end();
2313 ++p)
2314 project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
2315 p->first, pin, true);//false);
2316 }
2317 parent->dirty_old_rstat.clear();
2318 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2319
2320 pf->accounted_rstat = pf->rstat;
2321
2322 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2323 if (pi->rstat.rbytes != pf->rstat.rbytes) {
2324 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2325 << parent->dirfrag() << ", inode has " << pi->rstat
2326 << ", dirfrag has " << pf->rstat;
2327
2328 // trust the dirfrag for now
2329 pi->rstat = pf->rstat;
2330
2331 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2332 }
2333 }
2334
2335 parent->check_rstats();
2336 broadcast_quota_to_client(pin);
2337 // next parent!
2338 cur = pin;
2339 parent = parentdn->get_dir();
2340 linkunlink = 0;
2341 do_parent_mtime = false;
2342 primary_dn = true;
2343 first = false;
2344 }
2345
2346 // now, stick it in the blob
2347 assert(parent);
2348 assert(parent->is_auth());
2349 blob->add_dir_context(parent);
2350 blob->add_dir(parent, true);
2351 for (list<CInode*>::iterator p = lsi.begin();
2352 p != lsi.end();
2353 ++p) {
2354 CInode *cur = *p;
2355 journal_dirty_inode(mut.get(), blob, cur);
2356 }
2357
2358 }
2359
2360
2361
2362
2363
2364 // ===================================
2365 // slave requests
2366
2367
2368 /*
2369 * some handlers for master requests with slaves. we need to make
2370 * sure slaves journal commits before we forget we mastered them and
2371 * remove them from the uncommitted_masters map (used during recovery
2372 * to commit|abort slaves).
2373 */
2374 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2375 metareqid_t reqid;
2376 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2377 void finish(int r) override {
2378 mdcache->_logged_master_commit(reqid);
2379 }
2380 };
2381
2382 void MDCache::log_master_commit(metareqid_t reqid)
2383 {
2384 dout(10) << "log_master_commit " << reqid << dendl;
2385 uncommitted_masters[reqid].committing = true;
2386 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2387 new C_MDC_CommittedMaster(this, reqid));
2388 }
2389
2390 void MDCache::_logged_master_commit(metareqid_t reqid)
2391 {
2392 dout(10) << "_logged_master_commit " << reqid << dendl;
2393 assert(uncommitted_masters.count(reqid));
2394 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2395 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2396 uncommitted_masters.erase(reqid);
2397 }
2398
2399 // while active...
2400
2401 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2402 {
2403 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2404 assert(uncommitted_masters.count(r));
2405 uncommitted_masters[r].slaves.erase(from);
2406 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2407 log_master_commit(r);
2408 }
2409
2410 void MDCache::logged_master_update(metareqid_t reqid)
2411 {
2412 dout(10) << "logged_master_update " << reqid << dendl;
2413 assert(uncommitted_masters.count(reqid));
2414 uncommitted_masters[reqid].safe = true;
2415 if (pending_masters.count(reqid)) {
2416 pending_masters.erase(reqid);
2417 if (pending_masters.empty())
2418 process_delayed_resolve();
2419 }
2420 }
2421
2422 /*
2423 * Master may crash after receiving all slaves' commit acks, but before journalling
2424 * the final commit. Slaves may crash after journalling the slave commit, but before
2425 * sending commit ack to the master. Commit masters with no uncommitted slave when
2426 * resolve finishes.
2427 */
2428 void MDCache::finish_committed_masters()
2429 {
2430 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2431 p != uncommitted_masters.end();
2432 ++p) {
2433 p->second.recovering = false;
2434 if (!p->second.committing && p->second.slaves.empty()) {
2435 dout(10) << "finish_committed_masters " << p->first << dendl;
2436 log_master_commit(p->first);
2437 }
2438 }
2439 }
2440
2441 /*
2442 * at end of resolve... we must journal a commit|abort for all slave
2443 * updates, before moving on.
2444 *
2445 * this is so that the master can safely journal ECommitted on ops it
2446 * masters when it reaches up:active (all other recovering nodes must
2447 * complete resolve before that happens).
2448 */
2449 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2450 mds_rank_t from;
2451 metareqid_t reqid;
2452 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2453 void finish(int r) override {
2454 mdcache->_logged_slave_commit(from, reqid);
2455 }
2456 };
2457
2458 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2459 {
2460 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2461
2462 // send a message
2463 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2464 mds->send_message_mds(req, from);
2465 }
2466
2467
2468
2469
2470
2471
2472 // ====================================================================
2473 // import map, recovery
2474
2475 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2476 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2477 {
2478 if (subtrees.count(oldparent)) {
2479 vector<dirfrag_t>& v = subtrees[oldparent];
2480 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2481 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2482 if (*it == df) {
2483 v.erase(it);
2484 break;
2485 }
2486 }
2487 if (subtrees.count(newparent)) {
2488 vector<dirfrag_t>& v = subtrees[newparent];
2489 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2490 v.push_back(df);
2491 }
2492 }
2493
2494 ESubtreeMap *MDCache::create_subtree_map()
2495 {
2496 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2497 << num_subtrees_fullauth() << " fullauth"
2498 << dendl;
2499
2500 show_subtrees();
2501
2502 ESubtreeMap *le = new ESubtreeMap();
2503 mds->mdlog->_start_entry(le);
2504
2505 map<dirfrag_t, CDir*> dirs_to_add;
2506
2507 if (myin) {
2508 CDir* mydir = myin->get_dirfrag(frag_t());
2509 dirs_to_add[mydir->dirfrag()] = mydir;
2510 }
2511
2512 // include all auth subtrees, and their bounds.
2513 // and a spanning tree to tie it to the root.
2514 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2515 p != subtrees.end();
2516 ++p) {
2517 CDir *dir = p->first;
2518
2519 // journal subtree as "ours" if we are
2520 // me, -2
2521 // me, me
2522 // me, !me (may be importing and ambiguous!)
2523
2524 // so not
2525 // !me, *
2526 if (dir->get_dir_auth().first != mds->get_nodeid())
2527 continue;
2528
2529 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2530 my_ambiguous_imports.count(dir->dirfrag())) {
2531 dout(15) << " ambig subtree " << *dir << dendl;
2532 le->ambiguous_subtrees.insert(dir->dirfrag());
2533 } else {
2534 dout(15) << " subtree " << *dir << dendl;
2535 }
2536
2537 dirs_to_add[dir->dirfrag()] = dir;
2538 le->subtrees[dir->dirfrag()].clear();
2539
2540
2541 // bounds
2542 for (set<CDir*>::iterator q = p->second.begin();
2543 q != p->second.end();
2544 ++q) {
2545 CDir *bound = *q;
2546 dout(15) << " subtree bound " << *bound << dendl;
2547 dirs_to_add[bound->dirfrag()] = bound;
2548 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2549 }
2550 }
2551
2552 // apply projected renames
2553 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2554 p != projected_subtree_renames.end();
2555 ++p) {
2556 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2557 CInode *diri = p->first;
2558 CDir *olddir = q->first;
2559 CDir *newdir = q->second;
2560 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2561
2562 list<CDir*> dfls;
2563 diri->get_dirfrags(dfls);
2564 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2565 CDir *dir = *p;
2566 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2567 CDir *oldparent = get_projected_subtree_root(olddir);
2568 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2569 CDir *newparent = get_projected_subtree_root(newdir);
2570 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2571
2572 if (oldparent == newparent) {
2573 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2574 << oldparent->dirfrag() << dendl;
2575 continue;
2576 }
2577
2578 if (dir->is_subtree_root()) {
2579 if (le->subtrees.count(newparent->dirfrag()) &&
2580 oldparent->get_dir_auth() != newparent->get_dir_auth())
2581 dirs_to_add[dir->dirfrag()] = dir;
2582 // children are fine. change parent.
2583 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2584 le->subtrees);
2585 } else {
2586 // mid-subtree.
2587
2588 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2589 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2590 // if oldparent is auth, subtree is mine; include it.
2591 if (le->subtrees.count(oldparent->dirfrag())) {
2592 dirs_to_add[dir->dirfrag()] = dir;
2593 le->subtrees[dir->dirfrag()].clear();
2594 }
2595 // if newparent is auth, subtree is a new bound
2596 if (le->subtrees.count(newparent->dirfrag())) {
2597 dirs_to_add[dir->dirfrag()] = dir;
2598 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2599 }
2600 newparent = dir;
2601 }
2602
2603 // see if any old bounds move to the new parent.
2604 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2605 p != subtrees[oldparent].end();
2606 ++p) {
2607 CDir *bound = *p;
2608 if (dir->contains(bound->get_parent_dir()))
2609 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2610 le->subtrees);
2611 }
2612 }
2613 }
2614 }
2615 }
2616
2617 // simplify the journaled map. our in memory map may have more
2618 // subtrees than needed due to migrations that are just getting
2619 // started or just completing. but on replay, the "live" map will
2620 // be simple and we can do a straight comparison.
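// e.g. (hypothetical) if we journal subtree A with bound B and also journal
// B unambiguously with bound C, B is swallowed into A and A's bound list
// becomes {C}.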
2621 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2622 if (le->ambiguous_subtrees.count(p->first))
2623 continue;
2624 unsigned i = 0;
2625 while (i < p->second.size()) {
2626 dirfrag_t b = p->second[i];
2627 if (le->subtrees.count(b) &&
2628 le->ambiguous_subtrees.count(b) == 0) {
2629 vector<dirfrag_t>& bb = le->subtrees[b];
2630 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2631 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2632 p->second.push_back(*r);
2633 dirs_to_add.erase(b);
2634 le->subtrees.erase(b);
2635 p->second.erase(p->second.begin() + i);
2636 } else {
2637 ++i;
2638 }
2639 }
2640 }
2641
2642 for (auto p : dirs_to_add) {
2643 CDir *dir = p.second;
2644 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2645 le->metablob.add_dir(dir, false);
2646 }
2647
2648 dout(15) << " subtrees " << le->subtrees << dendl;
2649 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2650
2651 //le->metablob.print(cout);
2652 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2653 return le;
2654 }
2655
2656 void MDCache::dump_resolve_status(Formatter *f) const
2657 {
2658 f->open_object_section("resolve_status");
2659 f->dump_stream("resolve_gather") << resolve_gather;
2660 f->dump_stream("resolve_ack_gather") << resolve_gather;
2661 f->close_section();
2662 }
2663
2664 void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2665 {
2666 dout(10) << "resolve_start" << dendl;
2667 assert(!resolve_done);
2668 resolve_done.reset(resolve_done_);
2669
2670 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2671 // if we don't have the root dir, adjust it to UNKNOWN. during
2672 // resolve we want mds0 to explicitly claim the portion of it that
2673 // it owns, so that anything beyond its bounds gets left as
2674 // unknown.
2675 CDir *rootdir = root->get_dirfrag(frag_t());
2676 if (rootdir)
2677 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2678 }
2679 resolve_gather = recovery_set;
2680 }
2681
2682 void MDCache::send_resolves()
2683 {
2684 send_slave_resolves();
2685 if (!resolve_ack_gather.empty()) {
2686 dout(10) << "send_resolves still waiting for resolve ack from ("
2687 << resolve_ack_gather << ")" << dendl;
2688 return;
2689 }
2690 if (!need_resolve_rollback.empty()) {
2691 dout(10) << "send_resolves still waiting for rollback to commit on ("
2692 << need_resolve_rollback << ")" << dendl;
2693 return;
2694 }
2695 send_subtree_resolves();
2696 }
2697
2698 void MDCache::send_slave_resolves()
2699 {
2700 dout(10) << "send_slave_resolves" << dendl;
2701
2702 map<mds_rank_t, MMDSResolve*> resolves;
2703
2704 if (mds->is_resolve()) {
2705 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2706 p != uncommitted_slave_updates.end();
2707 ++p) {
2708 resolves[p->first] = new MMDSResolve;
2709 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2710 q != p->second.end();
2711 ++q) {
2712 dout(10) << " including uncommitted " << q->first << dendl;
2713 resolves[p->first]->add_slave_request(q->first, false);
2714 }
2715 }
2716 } else {
2717 set<mds_rank_t> resolve_set;
2718 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2719 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2720 p != active_requests.end();
2721 ++p) {
2722 MDRequestRef& mdr = p->second;
2723 if (!mdr->is_slave())
2724 continue;
2725 if (!mdr->slave_did_prepare() && !mdr->committing) {
2726 continue;
2727 }
2728 mds_rank_t master = mdr->slave_to_mds;
2729 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2730 dout(10) << " including uncommitted " << *mdr << dendl;
2731 if (!resolves.count(master))
2732 resolves[master] = new MMDSResolve;
2733 if (!mdr->committing &&
2734 mdr->has_more() && mdr->more()->is_inode_exporter) {
2735 // re-send cap exports
2736 CInode *in = mdr->more()->rename_inode;
2737 map<client_t, Capability::Export> cap_map;
2738 in->export_client_caps(cap_map);
2739 bufferlist bl;
2740 ::encode(in->ino(), bl);
2741 ::encode(cap_map, bl);
2742 resolves[master]->add_slave_request(p->first, bl);
2743 } else {
2744 resolves[master]->add_slave_request(p->first, mdr->committing);
2745 }
2746 }
2747 }
2748 }
2749
2750 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2751 p != resolves.end();
2752 ++p) {
2753 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2754 mds->send_message_mds(p->second, p->first);
2755 resolve_ack_gather.insert(p->first);
2756 }
2757 }
2758
2759 void MDCache::send_subtree_resolves()
2760 {
2761 dout(10) << "send_subtree_resolves" << dendl;
2762
2763 if (migrator->is_exporting() || migrator->is_importing()) {
2764 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2765 migrator->show_importing();
2766 migrator->show_exporting();
2767 resolves_pending = true;
2768 return; // not now
2769 }
2770
2771 map<mds_rank_t, MMDSResolve*> resolves;
2772 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2773 p != recovery_set.end();
2774 ++p) {
2775 if (*p == mds->get_nodeid())
2776 continue;
2777 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2778 resolves[*p] = new MMDSResolve;
2779 }
2780
2781 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2782 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2783
2784 // known
2785 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2786 p != subtrees.end();
2787 ++p) {
2788 CDir *dir = p->first;
2789
2790 // only our subtrees
2791 if (dir->authority().first != mds->get_nodeid())
2792 continue;
2793
2794 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2795 continue; // we'll add it below
2796
2797 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2798 // ambiguous (mid-import)
2799 set<CDir*> bounds;
2800 get_subtree_bounds(dir, bounds);
2801 vector<dirfrag_t> dfls;
2802 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2803 dfls.push_back((*q)->dirfrag());
2804
2805 my_ambig_imports[dir->dirfrag()] = dfls;
2806 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2807 } else {
2808 // not ambiguous.
2809 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2810 q != resolves.end();
2811 ++q)
2812 resolves[q->first]->add_subtree(dir->dirfrag());
2813 // bounds too
2814 vector<dirfrag_t> dfls;
2815 for (set<CDir*>::iterator q = subtrees[dir].begin();
2816 q != subtrees[dir].end();
2817 ++q) {
2818 CDir *bound = *q;
2819 dfls.push_back(bound->dirfrag());
2820 }
2821
2822 my_subtrees[dir->dirfrag()] = dfls;
2823 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2824 }
2825 }
2826
2827 // ambiguous
2828 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2829 p != my_ambiguous_imports.end();
2830 ++p) {
2831 my_ambig_imports[p->first] = p->second;
2832 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2833 }
2834
2835 // simplify the claimed subtree.
2836 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2837 unsigned i = 0;
2838 while (i < p->second.size()) {
2839 dirfrag_t b = p->second[i];
2840 if (my_subtrees.count(b)) {
2841 vector<dirfrag_t>& bb = my_subtrees[b];
2842 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2843 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2844 p->second.push_back(*r);
2845 my_subtrees.erase(b);
2846 p->second.erase(p->second.begin() + i);
2847 } else {
2848 ++i;
2849 }
2850 }
2851 }
2852
2853 // send
2854 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2855 p != resolves.end();
2856 ++p) {
2857 MMDSResolve* m = p->second;
2858 m->subtrees = my_subtrees;
2859 m->ambiguous_imports = my_ambig_imports;
2860 dout(10) << "sending subtee resolve to mds." << p->first << dendl;
2861 mds->send_message_mds(m, p->first);
2862 }
2863 resolves_pending = false;
2864 }
2865
2866 void MDCache::handle_mds_failure(mds_rank_t who)
2867 {
2868 dout(7) << "handle_mds_failure mds." << who << dendl;
2869
2870 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2871
2872 resolve_gather.insert(who);
2873 discard_delayed_resolve(who);
2874 ambiguous_slave_updates.erase(who);
2875
2876 rejoin_gather.insert(who);
2877 rejoin_sent.erase(who); // i need to send another
2878 rejoin_ack_gather.erase(who); // i'll need/get another.
2879
2880 dout(10) << " resolve_gather " << resolve_gather << dendl;
2881 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2882 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2883 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2884 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2885
2886
2887 // tell the migrator too.
2888 migrator->handle_mds_failure_or_stop(who);
2889
2890 // clean up any requests slave to/from this node
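// cases handled below:
//  - we are a slave of the failed master: keep a prepared update around for
//    resolve, or abort/finish it if nothing was journaled yet;
//  - we are a slave and the failed rank was the rename srcdn's auth: stop
//    waiting for its notify ack, and mark the update ambiguous if the master
//    survives;
//  - we are the master and the failed rank was a witness or slave: drop or
//    delay its prepare and retry the request once it recovers.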
2891 list<MDRequestRef> finish;
2892 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2893 p != active_requests.end();
2894 ++p) {
2895 MDRequestRef& mdr = p->second;
2896 // slave to the failed node?
2897 if (mdr->slave_to_mds == who) {
2898 if (mdr->slave_did_prepare()) {
2899 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2900 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2901 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2902
2903 if (!mdr->more()->waiting_on_slave.empty()) {
2904 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2905 // will rollback, no need to wait
2906 if (mdr->slave_request) {
2907 mdr->slave_request->put();
2908 mdr->slave_request = 0;
2909 }
2910 mdr->more()->waiting_on_slave.clear();
2911 }
2912 } else if (!mdr->committing) {
2913 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2914 if (mdr->slave_request || mdr->slave_rolling_back())
2915 mdr->aborted = true;
2916 else
2917 finish.push_back(mdr);
2918 }
2919 }
2920
2921 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2922 if (mdr->more()->waiting_on_slave.count(who)) {
2923 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2924 dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
2925 << who << dendl;
2926 mdr->more()->waiting_on_slave.erase(who);
2927 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2928 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2929 }
2930
2931 if (mdr->more()->srcdn_auth_mds == who &&
2932 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2933 // rename srcdn's auth mds failed, resolve even if I'm a survivor.
2934 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2935 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2936 }
2937 }
2938
2939 // failed node is slave?
2940 if (mdr->is_master() && !mdr->committing) {
2941 if (mdr->more()->srcdn_auth_mds == who) {
2942 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2943 << who << " to recover" << dendl;
2944 assert(mdr->more()->witnessed.count(who) == 0);
2945 if (mdr->more()->is_ambiguous_auth)
2946 mdr->clear_ambiguous_auth();
2947 // rename srcdn's auth mds failed, all witnesses will rollback
2948 mdr->more()->witnessed.clear();
2949 pending_masters.erase(p->first);
2950 }
2951
2952 if (mdr->more()->witnessed.count(who)) {
2953 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2954 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2955 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2956 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2957 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2958 // until either the request is committing or the slave also fails.
2959 assert(mdr->more()->waiting_on_slave.size() == 1);
2960 pending_masters.insert(p->first);
2961 } else {
2962 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2963 << who << dendl;
2964 if (srcdn_auth >= 0)
2965 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2966
2967 // discard this peer's prepare (if any)
2968 mdr->more()->witnessed.erase(who);
2969 }
2970 }
2971
2972 if (mdr->more()->waiting_on_slave.count(who)) {
2973 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2974 << " to recover" << dendl;
2975 // retry request when peer recovers
2976 mdr->more()->waiting_on_slave.erase(who);
2977 if (mdr->more()->waiting_on_slave.empty())
2978 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2979 }
2980
2981 if (mdr->locking && mdr->locking_target_mds == who)
2982 mdr->finish_locking(mdr->locking);
2983 }
2984 }
2985
2986 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2987 p != uncommitted_masters.end();
2988 ++p) {
2989 // The failed MDS may have already committed the slave update
2990 if (p->second.slaves.count(who)) {
2991 p->second.recovering = true;
2992 p->second.slaves.erase(who);
2993 }
2994 }
2995
2996 while (!finish.empty()) {
2997 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
2998 request_finish(finish.front());
2999 finish.pop_front();
3000 }
3001
3002 kick_find_ino_peers(who);
3003 kick_open_ino_peers(who);
3004
3005 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3006 p != fragments.end(); ) {
3007 dirfrag_t df = p->first;
3008 fragment_info_t& info = p->second;
3009 ++p;
3010 if (info.is_fragmenting())
3011 continue;
3012 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3013 list<CDir*> dirs;
3014 info.dirs.swap(dirs);
3015 fragments.erase(df);
3016 fragment_unmark_unfreeze_dirs(dirs);
3017 }
3018
3019 // MDCache::shutdown_export_strays() always exports strays to mds.0
3020 if (who == mds_rank_t(0))
3021 shutdown_exported_strays.clear();
3022
3023 show_subtrees();
3024 }
3025
3026 /*
3027 * handle_mds_recovery - called on another node's transition
3028 * from resolve -> active.
3029 */
3030 void MDCache::handle_mds_recovery(mds_rank_t who)
3031 {
3032 dout(7) << "handle_mds_recovery mds." << who << dendl;
3033
3034 // exclude all discover waiters. kick_discovers() will do the job
3035 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3036 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3037
3038 list<MDSInternalContextBase*> waiters;
3039
3040 // wake up any waiters in their subtrees
3041 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3042 p != subtrees.end();
3043 ++p) {
3044 CDir *dir = p->first;
3045
3046 if (dir->authority().first != who ||
3047 dir->authority().second == mds->get_nodeid())
3048 continue;
3049 assert(!dir->is_auth());
3050
3051 // wake any waiters
3052 list<CDir*> q;
3053 q.push_back(dir);
3054
3055 while (!q.empty()) {
3056 CDir *d = q.front();
3057 q.pop_front();
3058 d->take_waiting(d_mask, waiters);
3059
3060 // inode waiters too
3061 for (CDir::map_t::iterator p = d->items.begin();
3062 p != d->items.end();
3063 ++p) {
3064 CDentry *dn = p->second;
3065 CDentry::linkage_t *dnl = dn->get_linkage();
3066 if (dnl->is_primary()) {
3067 dnl->get_inode()->take_waiting(i_mask, waiters);
3068
3069 // recurse?
3070 list<CDir*> ls;
3071 dnl->get_inode()->get_dirfrags(ls);
3072 for (list<CDir*>::iterator p = ls.begin();
3073 p != ls.end();
3074 ++p) {
3075 CDir *subdir = *p;
3076 if (!subdir->is_subtree_root())
3077 q.push_back(subdir);
3078 }
3079 }
3080 }
3081 }
3082 }
3083
3084 kick_open_ino_peers(who);
3085 kick_find_ino_peers(who);
3086
3087 // queue them up.
3088 mds->queue_waiters(waiters);
3089 }
3090
3091 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3092 {
3093 dout(7) << "set_recovery_set " << s << dendl;
3094 recovery_set = s;
3095 }
3096
3097
3098 /*
3099 * during resolve state, we share resolves to determine who
3100 * is authoritative for which trees. we expect to get a resolve
3101 * from _everyone_ in the recovery_set (the mds cluster at the time of
3102 * the first failure).
3103 *
3104 * This function puts the passed message before returning
3105 */
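// a resolve from a peer carries three things: its uncommitted slave requests
// (answered right away with an MMDSResolveAck), the subtrees it claims as
// auth (used to adjust our dir_auth and settle our own ambiguous imports),
// and its still-ambiguous imports (stashed in other_ambiguous_imports for
// disambiguate_other_imports()).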
3106 void MDCache::handle_resolve(MMDSResolve *m)
3107 {
3108 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3109 mds_rank_t from = mds_rank_t(m->get_source().num());
3110
3111 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3112 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3113 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3114 return;
3115 }
3116 // wait until we reach the resolve stage!
3117 m->put();
3118 return;
3119 }
3120
3121 discard_delayed_resolve(from);
3122
3123 // ambiguous slave requests?
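// for each slave request the sender still has prepared: reply COMMIT if we
// still have the request in uncommitted_masters (we journaled the update),
// otherwise ABORT.  a survivor defers this until its own in-flight master
// updates are safely journaled (pending_masters).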
3124 if (!m->slave_requests.empty()) {
3125 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3126 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3127 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3128 assert(!p->second.committing);
3129 pending_masters.insert(p->first);
3130 }
3131 }
3132
3133 if (!pending_masters.empty()) {
3134 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3135 delayed_resolve[from] = m;
3136 return;
3137 }
3138 }
3139
3140 MMDSResolveAck *ack = new MMDSResolveAck;
3141 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3142 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3143 // COMMIT
3144 if (p->second.committing) {
3145 // already committing, waiting for the OP_COMMITTED slave reply
3146 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3147 } else {
3148 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3149 ack->add_commit(p->first);
3150 }
3151 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3152
3153 if (p->second.inode_caps.length() > 0) {
3154 // slave wants to export caps (rename)
3155 assert(mds->is_resolve());
3156
3157 inodeno_t ino;
3158 map<client_t,Capability::Export> cap_exports;
3159 bufferlist::iterator q = p->second.inode_caps.begin();
3160 ::decode(ino, q);
3161 ::decode(cap_exports, q);
3162
3163 assert(get_inode(ino));
3164
3165 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3166 q != cap_exports.end();
3167 ++q) {
3168 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3169 im.cap_id = ++last_cap_id; // assign a new cap ID
3170 im.issue_seq = 1;
3171 im.mseq = q->second.mseq;
3172 }
3173
3174 // will process these caps in rejoin stage
3175 rejoin_slave_exports[ino].first = from;
3176 rejoin_slave_exports[ino].second.swap(cap_exports);
3177
3178 // send information of imported caps back to slave
3179 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3180 }
3181 } else {
3182 // ABORT
3183 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3184 assert(!p->second.committing);
3185 ack->add_abort(p->first);
3186 }
3187 }
3188 mds->send_message(ack, m->get_connection());
3189 m->put();
3190 return;
3191 }
3192
3193 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3194 dout(10) << "delay processing subtree resolve" << dendl;
3195 delayed_resolve[from] = m;
3196 return;
3197 }
3198
3199 bool survivor = false;
3200 // am i a surviving ambiguous importer?
3201 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3202 survivor = true;
3203 // check for any import success/failure (from this node)
3204 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3205 while (p != my_ambiguous_imports.end()) {
3206 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3207 ++next;
3208 CDir *dir = get_dirfrag(p->first);
3209 assert(dir);
3210 dout(10) << "checking ambiguous import " << *dir << dendl;
3211 if (migrator->is_importing(dir->dirfrag()) &&
3212 migrator->get_import_peer(dir->dirfrag()) == from) {
3213 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3214
3215 // check if sender claims the subtree
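// (the sender claims dir iff one of its claimed subtrees contains dir
// and dir is not cut off by one of that subtree's bounds)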
3216 bool claimed_by_sender = false;
3217 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3218 q != m->subtrees.end();
3219 ++q) {
3220 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3221 CDir *base = get_force_dirfrag(q->first, false);
3222 if (!base || !base->contains(dir))
3223 continue; // base is not dir or an ancestor of dir, so it clearly doesn't claim dir.
3224
3225 bool inside = true;
3226 set<CDir*> bounds;
3227 get_force_dirfrag_bound_set(q->second, bounds);
3228 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3229 CDir *bound = *p;
3230 if (bound->contains(dir)) {
3231 inside = false; // nope, bound is dir or parent of dir, not inside.
3232 break;
3233 }
3234 }
3235 if (inside)
3236 claimed_by_sender = true;
3237 }
3238
3239 my_ambiguous_imports.erase(p); // no longer ambiguous.
3240 if (claimed_by_sender) {
3241 dout(7) << "ambiguous import failed on " << *dir << dendl;
3242 migrator->import_reverse(dir);
3243 } else {
3244 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3245 migrator->import_finish(dir, true);
3246 }
3247 }
3248 p = next;
3249 }
3250 }
3251
3252 // update my dir_auth values
3253 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3254 // migrations between other nodes)
3255 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3256 pi != m->subtrees.end();
3257 ++pi) {
3258 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3259 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3260 if (!dir)
3261 continue;
3262 adjust_bounded_subtree_auth(dir, pi->second, from);
3263 try_subtree_merge(dir);
3264 }
3265
3266 show_subtrees();
3267
3268 // note ambiguous imports too
3269 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3270 pi != m->ambiguous_imports.end();
3271 ++pi) {
3272 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3273 other_ambiguous_imports[from][pi->first].swap( pi->second );
3274 }
3275
3276 // did i get them all?
3277 resolve_gather.erase(from);
3278
3279 maybe_resolve_finish();
3280
3281 m->put();
3282 }
3283
3284 void MDCache::process_delayed_resolve()
3285 {
3286 dout(10) << "process_delayed_resolve" << dendl;
3287 map<mds_rank_t, MMDSResolve*> tmp;
3288 tmp.swap(delayed_resolve);
3289 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3290 handle_resolve(p->second);
3291 }
3292
3293 void MDCache::discard_delayed_resolve(mds_rank_t who)
3294 {
3295 if (delayed_resolve.count(who)) {
3296 delayed_resolve[who]->put();
3297 delayed_resolve.erase(who);
3298 }
3299 }
3300
3301 void MDCache::maybe_resolve_finish()
3302 {
3303 assert(resolve_ack_gather.empty());
3304 assert(need_resolve_rollback.empty());
3305
3306 if (!resolve_gather.empty()) {
3307 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3308 << resolve_gather << ")" << dendl;
3309 return;
3310 }
3311
3312 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3313 disambiguate_my_imports();
3314 finish_committed_masters();
3315
3316 if (resolve_done) {
3317 assert(mds->is_resolve());
3318 trim_unlinked_inodes();
3319 recalc_auth_bits(false);
3320 resolve_done.release()->complete(0);
3321 } else {
3322 maybe_send_pending_rejoins();
3323 }
3324 }
3325
3326 /* This function puts the passed message before returning */
3327 void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3328 {
3329 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3330 mds_rank_t from = mds_rank_t(ack->get_source().num());
3331
3332 if (!resolve_ack_gather.count(from) ||
3333 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3334 ack->put();
3335 return;
3336 }
3337
3338 if (ambiguous_slave_updates.count(from)) {
3339 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3340 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3341 }
3342
3343 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3344 p != ack->commit.end();
3345 ++p) {
3346 dout(10) << " commit on slave " << p->first << dendl;
3347
3348 if (ambiguous_slave_updates.count(from)) {
3349 remove_ambiguous_slave_update(p->first, from);
3350 continue;
3351 }
3352
3353 if (mds->is_resolve()) {
3354 // replay
3355 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3356 assert(su);
3357
3358 // log commit
3359 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3360 ESlaveUpdate::OP_COMMIT, su->origop),
3361 new C_MDC_SlaveCommit(this, from, p->first));
3362 mds->mdlog->flush();
3363
3364 finish_uncommitted_slave_update(p->first, from);
3365 } else {
3366 MDRequestRef mdr = request_get(p->first);
3367 // information about master imported caps
3368 if (p->second.length() > 0)
3369 mdr->more()->inode_import.claim(p->second);
3370
3371 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3372 request_finish(mdr);
3373 }
3374 }
3375
3376 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3377 p != ack->abort.end();
3378 ++p) {
3379 dout(10) << " abort on slave " << *p << dendl;
3380
3381 if (mds->is_resolve()) {
3382 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3383 assert(su);
3384
3385 // perform rollback (and journal a rollback entry)
3386 // note: this will hold up the resolve a bit, until the rollback entries journal.
3387 MDRequestRef null_ref;
3388 switch (su->origop) {
3389 case ESlaveUpdate::LINK:
3390 mds->server->do_link_rollback(su->rollback, from, null_ref);
3391 break;
3392 case ESlaveUpdate::RENAME:
3393 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3394 break;
3395 case ESlaveUpdate::RMDIR:
3396 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3397 break;
3398 default:
3399 ceph_abort();
3400 }
3401 } else {
3402 MDRequestRef mdr = request_get(*p);
3403 mdr->aborted = true;
3404 if (mdr->slave_request) {
3405 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3406 add_rollback(*p, from);
3407 } else {
3408 request_finish(mdr);
3409 }
3410 }
3411 }
3412
3413 if (!ambiguous_slave_updates.count(from))
3414 resolve_ack_gather.erase(from);
3415 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3416 send_subtree_resolves();
3417 process_delayed_resolve();
3418 }
3419
3420 ack->put();
3421 }
3422
3423 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3424 {
3425 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3426 uncommitted_slave_updates[master][reqid] = su;
3427 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3428 uncommitted_slave_rename_olddir[*p]++;
3429 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3430 uncommitted_slave_unlink[*p]++;
3431 }
3432
3433 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3434 {
3435 assert(uncommitted_slave_updates[master].count(reqid));
3436 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3437
3438 uncommitted_slave_updates[master].erase(reqid);
3439 if (uncommitted_slave_updates[master].empty())
3440 uncommitted_slave_updates.erase(master);
3441 // discard the non-auth subtree we renamed out of
3442 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3443 CInode *diri = *p;
3444 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3445 assert(it != uncommitted_slave_rename_olddir.end());
3446 it->second--;
3447 if (it->second == 0) {
3448 uncommitted_slave_rename_olddir.erase(it);
3449 list<CDir*> ls;
3450 diri->get_dirfrags(ls);
3451 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3452 CDir *root = get_subtree_root(*q);
3453 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3454 try_trim_non_auth_subtree(root);
3455 if (*q != root)
3456 break;
3457 }
3458 }
3459 } else
3460 assert(it->second > 0);
3461 }
3462 // remove the inodes that were unlinked by slave update
3463 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3464 CInode *in = *p;
3465 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3466 assert(it != uncommitted_slave_unlink.end());
3467 it->second--;
3468 if (it->second == 0) {
3469 uncommitted_slave_unlink.erase(it);
3470 if (!in->get_projected_parent_dn())
3471 mds->mdcache->remove_inode_recursive(in);
3472 } else
3473 assert(it->second > 0);
3474 }
3475 delete su;
3476 }
3477
3478 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3479 {
3480
3481 MDSlaveUpdate* su = NULL;
3482 if (uncommitted_slave_updates.count(master) &&
3483 uncommitted_slave_updates[master].count(reqid)) {
3484 su = uncommitted_slave_updates[master][reqid];
3485 assert(su);
3486 }
3487 return su;
3488 }
3489
3490 void MDCache::finish_rollback(metareqid_t reqid) {
3491 assert(need_resolve_rollback.count(reqid));
3492 if (mds->is_resolve())
3493 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3494 need_resolve_rollback.erase(reqid);
3495 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3496 send_subtree_resolves();
3497 process_delayed_resolve();
3498 }
3499 }
3500
3501 void MDCache::disambiguate_other_imports()
3502 {
3503 dout(10) << "disambiguate_other_imports" << dendl;
3504
3505 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3506 // other nodes' ambiguous imports
3507 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3508 p != other_ambiguous_imports.end();
3509 ++p) {
3510 mds_rank_t who = p->first;
3511 dout(10) << "ambiguous imports for mds." << who << dendl;
3512
3513 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3514 q != p->second.end();
3515 ++q) {
3516 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3517 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3518 CDir *dir = get_force_dirfrag(q->first, recovering);
3519 if (!dir) continue;
3520
3521 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3522 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3523 dout(10) << " mds." << who << " did import " << *dir << dendl;
3524 adjust_bounded_subtree_auth(dir, q->second, who);
3525 try_subtree_merge(dir);
3526 } else {
3527 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3528 }
3529 }
3530 }
3531 other_ambiguous_imports.clear();
3532 }
3533
3534 void MDCache::disambiguate_my_imports()
3535 {
3536 dout(10) << "disambiguate_my_imports" << dendl;
3537
3538 if (!mds->is_resolve()) {
3539 assert(my_ambiguous_imports.empty());
3540 return;
3541 }
3542
3543 disambiguate_other_imports();
3544
3545 // my ambiguous imports
3546 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3547 while (!my_ambiguous_imports.empty()) {
3548 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3549
3550 CDir *dir = get_dirfrag(q->first);
3551 assert(dir);
3552
3553 if (dir->authority() != me_ambig) {
3554 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3555 cancel_ambiguous_import(dir);
3556
3557 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3558
3559 // subtree may have been swallowed by another node claiming dir
3560 // as their own.
3561 CDir *root = get_subtree_root(dir);
3562 if (root != dir)
3563 dout(10) << " subtree root is " << *root << dendl;
3564 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3565 try_trim_non_auth_subtree(root);
3566 } else {
3567 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3568 finish_ambiguous_import(q->first);
3569 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3570 }
3571 }
3572 assert(my_ambiguous_imports.empty());
3573 mds->mdlog->flush();
3574
3575 // verify all my subtrees are unambiguous!
3576 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3577 p != subtrees.end();
3578 ++p) {
3579 CDir *dir = p->first;
3580 if (dir->is_ambiguous_dir_auth()) {
3581 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3582 }
3583 assert(!dir->is_ambiguous_dir_auth());
3584 }
3585
3586 show_subtrees();
3587 }
3588
3589
3590 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3591 {
3592 assert(my_ambiguous_imports.count(base) == 0);
3593 my_ambiguous_imports[base] = bounds;
3594 }
3595
3596
3597 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3598 {
3599 // make a list
3600 vector<dirfrag_t> binos;
3601 for (set<CDir*>::iterator p = bounds.begin();
3602 p != bounds.end();
3603 ++p)
3604 binos.push_back((*p)->dirfrag());
3605
3606 // note: this can get called twice if the exporter fails during recovery
3607 if (my_ambiguous_imports.count(base->dirfrag()))
3608 my_ambiguous_imports.erase(base->dirfrag());
3609
3610 add_ambiguous_import(base->dirfrag(), binos);
3611 }
3612
3613 void MDCache::cancel_ambiguous_import(CDir *dir)
3614 {
3615 dirfrag_t df = dir->dirfrag();
3616 assert(my_ambiguous_imports.count(df));
3617 dout(10) << "cancel_ambiguous_import " << df
3618 << " bounds " << my_ambiguous_imports[df]
3619 << " " << *dir
3620 << dendl;
3621 my_ambiguous_imports.erase(df);
3622 }
3623
3624 void MDCache::finish_ambiguous_import(dirfrag_t df)
3625 {
3626 assert(my_ambiguous_imports.count(df));
3627 vector<dirfrag_t> bounds;
3628 bounds.swap(my_ambiguous_imports[df]);
3629 my_ambiguous_imports.erase(df);
3630
3631 dout(10) << "finish_ambiguous_import " << df
3632 << " bounds " << bounds
3633 << dendl;
3634 CDir *dir = get_dirfrag(df);
3635 assert(dir);
3636
3637 // adjust dir_auth, import maps
3638 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3639 try_subtree_merge(dir);
3640 }
3641
3642 void MDCache::remove_inode_recursive(CInode *in)
3643 {
3644 dout(10) << "remove_inode_recursive " << *in << dendl;
3645 list<CDir*> ls;
3646 in->get_dirfrags(ls);
3647 list<CDir*>::iterator p = ls.begin();
3648 while (p != ls.end()) {
3649 CDir *subdir = *p++;
3650
3651 dout(10) << " removing dirfrag " << subdir << dendl;
3652 CDir::map_t::iterator q = subdir->items.begin();
3653 while (q != subdir->items.end()) {
3654 CDentry *dn = q->second;
3655 ++q;
3656 CDentry::linkage_t *dnl = dn->get_linkage();
3657 if (dnl->is_primary()) {
3658 CInode *tin = dnl->get_inode();
3659 subdir->unlink_inode(dn);
3660 remove_inode_recursive(tin);
3661 }
3662 subdir->remove_dentry(dn);
3663 }
3664
3665 if (subdir->is_subtree_root())
3666 remove_subtree(subdir);
3667 in->close_dirfrag(subdir->dirfrag().frag);
3668 }
3669 remove_inode(in);
3670 }
3671
3672 bool MDCache::expire_recursive(
3673 CInode *in,
3674 map<mds_rank_t, MCacheExpire*>& expiremap)
3675 {
3676 assert(!in->is_auth());
3677
3678 dout(10) << __func__ << ":" << *in << dendl;
3679
3680 // Recurse into any dirfrags beneath this inode
3681 list<CDir*> ls;
3682 in->get_dirfrags(ls);
3683 for (auto subdir : ls) {
3684 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3685 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3686 return true;
3687 }
3688
3689 for (auto &it : subdir->items) {
3690 CDentry *dn = it.second;
3691 CDentry::linkage_t *dnl = dn->get_linkage();
3692 if (dnl->is_primary()) {
3693 CInode *tin = dnl->get_inode();
3694
3695 /* Remote strays with linkage (i.e. hardlinks) should not be
3696 * expired, because they may be the target of
3697 * a rename() as the owning MDS shuts down */
3698 if (!tin->is_stray() && tin->inode.nlink) {
3699 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3700 return true;
3701 }
3702
3703 const bool abort = expire_recursive(tin, expiremap);
3704 if (abort) {
3705 return true;
3706 }
3707 }
3708 if (dn->lru_is_expireable()) {
3709 trim_dentry(dn, expiremap);
3710 } else {
3711 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3712 return true;
3713 }
3714 }
3715 }
3716
3717 return false;
3718 }
3719
3720 void MDCache::trim_unlinked_inodes()
3721 {
3722 dout(7) << "trim_unlinked_inodes" << dendl;
3723 list<CInode*> q;
3724 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
3725 p != inode_map.end();
3726 ++p) {
3727 CInode *in = p->second;
3728 if (in->get_parent_dn() == NULL && !in->is_base()) {
3729 dout(7) << " will trim from " << *in << dendl;
3730 q.push_back(in);
3731 }
3732 }
3733 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3734 remove_inode_recursive(*p);
3735 }
3736
3737 /** recalc_auth_bits()
3738 * once subtree auth is disambiguated, we need to adjust all the
3739 * auth and dirty bits in our cache before moving on.
3740 */
3741 void MDCache::recalc_auth_bits(bool replay)
3742 {
3743 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3744
3745 if (root) {
3746 root->inode_auth.first = mds->mdsmap->get_root();
3747 bool auth = mds->get_nodeid() == root->inode_auth.first;
3748 if (auth) {
3749 root->state_set(CInode::STATE_AUTH);
3750 } else {
3751 root->state_clear(CInode::STATE_AUTH);
3752 if (!replay)
3753 root->state_set(CInode::STATE_REJOINING);
3754 }
3755 }
3756
3757 set<CInode*> subtree_inodes;
3758 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3759 p != subtrees.end();
3760 ++p) {
3761 if (p->first->dir_auth.first == mds->get_nodeid())
3762 subtree_inodes.insert(p->first->inode);
3763 }
3764
3765 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3766 p != subtrees.end();
3767 ++p) {
3768 if (p->first->inode->is_mdsdir()) {
3769 CInode *in = p->first->inode;
3770 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3771 if (auth) {
3772 in->state_set(CInode::STATE_AUTH);
3773 } else {
3774 in->state_clear(CInode::STATE_AUTH);
3775 if (!replay)
3776 in->state_set(CInode::STATE_REJOINING);
3777 }
3778 }
3779
3780 list<CDir*> dfq; // dirfrag queue
3781 dfq.push_back(p->first);
3782
3783 bool auth = p->first->authority().first == mds->get_nodeid();
3784 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3785
3786 while (!dfq.empty()) {
3787 CDir *dir = dfq.front();
3788 dfq.pop_front();
3789
3790 // dir
3791 if (auth) {
3792 dir->state_set(CDir::STATE_AUTH);
3793 } else {
3794 dir->state_clear(CDir::STATE_AUTH);
3795 if (!replay) {
3796 // close empty non-auth dirfrag
3797 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3798 dir->inode->close_dirfrag(dir->get_frag());
3799 continue;
3800 }
3801 dir->state_set(CDir::STATE_REJOINING);
3802 dir->state_clear(CDir::STATE_COMPLETE);
3803 if (dir->is_dirty())
3804 dir->mark_clean();
3805 }
3806 }
3807
3808 // dentries in this dir
3809 for (CDir::map_t::iterator q = dir->items.begin();
3810 q != dir->items.end();
3811 ++q) {
3812 // dn
3813 CDentry *dn = q->second;
3814 CDentry::linkage_t *dnl = dn->get_linkage();
3815 if (auth) {
3816 dn->state_set(CDentry::STATE_AUTH);
3817 } else {
3818 dn->state_clear(CDentry::STATE_AUTH);
3819 if (!replay) {
3820 dn->state_set(CDentry::STATE_REJOINING);
3821 if (dn->is_dirty())
3822 dn->mark_clean();
3823 }
3824 }
3825
3826 if (dnl->is_primary()) {
3827 // inode
3828 CInode *in = dnl->get_inode();
3829 if (auth) {
3830 in->state_set(CInode::STATE_AUTH);
3831 } else {
3832 in->state_clear(CInode::STATE_AUTH);
3833 if (!replay) {
3834 in->state_set(CInode::STATE_REJOINING);
3835 if (in->is_dirty())
3836 in->mark_clean();
3837 if (in->is_dirty_parent())
3838 in->clear_dirty_parent();
3839 // avoid touching scatterlocks for our subtree roots!
3840 if (subtree_inodes.count(in) == 0)
3841 in->clear_scatter_dirty();
3842 }
3843 }
3844 // recurse?
3845 if (in->is_dir())
3846 in->get_nested_dirfrags(dfq);
3847 }
3848 }
3849 }
3850 }
3851
3852 show_subtrees();
3853 show_cache();
3854 }
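// --- Illustrative sketch (not part of MDCache.cc) --------------------------
// recalc_auth_bits() above walks each subtree breadth-first from its root
// dirfrag and stamps the AUTH (or REJOINING) state on every dir, dentry and
// inode it reaches. The traversal shape, stripped of MDS types, looks roughly
// like the sketch below; Node and stamp_subtree are hypothetical stand-ins,
// not Ceph classes.
#include <deque>
#include <vector>

struct Node {
  bool auth = false;
  std::vector<Node*> children;   // stands in for nested dirfrags / primary-linked inodes
};

// Stamp 'auth' on every node reachable from 'subtree_root', breadth-first,
// mirroring the dfq (dirfrag queue) loop in recalc_auth_bits().
inline void stamp_subtree(Node* subtree_root, bool auth) {
  std::deque<Node*> q;
  q.push_back(subtree_root);
  while (!q.empty()) {
    Node* n = q.front();
    q.pop_front();
    n->auth = auth;
    for (Node* child : n->children)
      q.push_back(child);
  }
}
// ----------------------------------------------------------------------------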
3855
3856
3857
3858 // ===========================================================================
3859 // REJOIN
3860
3861 /*
3862 * notes on scatterlock recovery:
3863 *
3864 * - recovering inode replica sends scatterlock data for any subtree
3865 * roots (the only ones that are possibly dirty).
3866 *
3867 * - surviving auth incorporates any provided scatterlock data. any
3868 * pending gathers are then finished, as with the other lock types.
3869 *
3870 * that takes care of surviving auth + (recovering replica)*.
3871 *
3872 * - surviving replica sends strong_inode, which includes current
3873 * scatterlock state, AND any dirty scatterlock data. this
3874 * provides the recovering auth with everything it might need.
3875 *
3876 * - recovering auth must pick initial scatterlock state based on
3877 * (weak|strong) rejoins.
3878 * - always assimilate scatterlock data (it can't hurt)
3879 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3880 * - include base inode in ack for all inodes that saw scatterlock content
3881 *
3882 * also, for scatter gather,
3883 *
3884 * - auth increments {frag,r}stat.version on completion of any gather.
3885 *
3886 * - auth incorporates changes in a gather _only_ if the version
3887 * matches.
3888 *
3889 * - replica discards changes any time the scatterlock syncs, and
3890 * after recovery.
3891 */
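// --- Illustrative sketch (not part of MDCache.cc) --------------------------
// Two of the rules in the notes above, reduced to plain code so the decisions
// are explicit: (a) a recovering auth picks the initial scatterlock state from
// the replica states reported in rejoins, and (b) gathered frag/rstat changes
// are only folded in when their version matches. Everything here is a
// simplified stand-in; ReplicaState, ScatterDelta and the helper names are
// invented for this sketch.
#include <cstdint>
#include <vector>

enum class ReplicaState { SYNC, SCATTER };

// Rule: any surviving replica still in SCATTER -> start in SCATTER,
// otherwise start in SYNC.
inline ReplicaState pick_initial_scatter_state(const std::vector<ReplicaState>& replicas) {
  for (ReplicaState s : replicas)
    if (s == ReplicaState::SCATTER)
      return ReplicaState::SCATTER;
  return ReplicaState::SYNC;
}

struct ScatterDelta {
  uint64_t version;   // version the replica accumulated against
  long     delta;     // simplified stand-in for frag/rstat changes
};

// Rule: the auth bumps its version on completion of each gather and only
// incorporates a replica's changes when the versions agree; stale deltas are
// discarded, as the notes describe for replicas after a sync or recovery.
inline bool incorporate_gather(uint64_t auth_version, const ScatterDelta& d, long& accumulated) {
  if (d.version != auth_version)
    return false;           // replica was behind; drop its changes
  accumulated += d.delta;
  return true;
}
// ----------------------------------------------------------------------------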
3892
3893 void MDCache::dump_rejoin_status(Formatter *f) const
3894 {
3895 f->open_object_section("rejoin_status");
3896 f->dump_stream("rejoin_gather") << rejoin_gather;
3897 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3898 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3899 f->close_section();
3900 }
3901
3902 void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3903 {
3904 dout(10) << "rejoin_start" << dendl;
3905 assert(!rejoin_done);
3906 rejoin_done.reset(rejoin_done_);
3907
3908 rejoin_gather = recovery_set;
3909 // need to finish opening cap inodes before sending cache rejoins
3910 rejoin_gather.insert(mds->get_nodeid());
3911 process_imported_caps();
3912 }
3913
3914 /*
3915 * rejoin phase!
3916 *
3917 * this initiates rejoin. it should be called before we get any
3918 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3919 *
3920 * we start out by sending rejoins to everyone in the recovery set.
3921 *
3922 * if we are rejoining, send for all regions in our cache.
3923 * if we are active|stopping, send only to nodes that are rejoining.
3924 */
3925 void MDCache::rejoin_send_rejoins()
3926 {
3927 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3928
3929 if (rejoin_gather.count(mds->get_nodeid())) {
3930 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3931 rejoins_pending = true;
3932 return;
3933 }
3934 if (!resolve_gather.empty()) {
3935 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3936 << resolve_gather << ")" << dendl;
3937 rejoins_pending = true;
3938 return;
3939 }
3940
3941 assert(!migrator->is_importing());
3942 assert(!migrator->is_exporting());
3943
3944 if (!mds->is_rejoin()) {
3945 disambiguate_other_imports();
3946 }
3947
3948 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3949
3950
3951 // if i am rejoining, send a rejoin to everyone.
3952 // otherwise, just send to others who are rejoining.
3953 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3954 p != recovery_set.end();
3955 ++p) {
3956 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3957 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3958 if (mds->is_rejoin())
3959 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3960 else if (mds->mdsmap->is_rejoin(*p))
3961 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3962 }
3963
3964 if (mds->is_rejoin()) {
3965 map<client_t, set<mds_rank_t> > client_exports;
3966 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3967 assert(cap_export_targets.count(p->first));
3968 mds_rank_t target = cap_export_targets[p->first];
3969 if (rejoins.count(target) == 0)
3970 continue;
3971 rejoins[target]->cap_exports[p->first] = p->second;
3972 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3973 client_exports[q->first].insert(target);
3974 }
3975 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3976 p != client_exports.end();
3977 ++p) {
3978 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3979 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3980 rejoins[*q]->client_map[p->first] = inst;
3981 }
3982 }
3983
3984
3985 // check all subtrees
3986 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
3987 p != subtrees.end();
3988 ++p) {
3989 CDir *dir = p->first;
3990 assert(dir->is_subtree_root());
3991 if (dir->is_ambiguous_dir_auth()) {
3992 // exporter is recovering, importer is survivor.
3993 assert(rejoins.count(dir->authority().first));
3994 assert(!rejoins.count(dir->authority().second));
3995 continue;
3996 }
3997
3998 // my subtree?
3999 if (dir->is_auth())
4000 continue; // skip my own regions!
4001
4002 mds_rank_t auth = dir->get_dir_auth().first;
4003 assert(auth >= 0);
4004 if (rejoins.count(auth) == 0)
4005 continue; // don't care about this node's subtrees
4006
4007 rejoin_walk(dir, rejoins[auth]);
4008 }
4009
4010 // rejoin root inodes, too
4011 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4012 p != rejoins.end();
4013 ++p) {
4014 if (mds->is_rejoin()) {
4015 // weak
4016 if (p->first == 0 && root) {
4017 p->second->add_weak_inode(root->vino());
4018 if (root->is_dirty_scattered()) {
4019 dout(10) << " sending scatterlock state on root " << *root << dendl;
4020 p->second->add_scatterlock_state(root);
4021 }
4022 }
4023       if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4024 	// 'in' is guaranteed non-null inside this block
4025 	p->second->add_weak_inode(in->vino());
4026       }
4027 } else {
4028 // strong
4029 if (p->first == 0 && root) {
4030 p->second->add_strong_inode(root->vino(),
4031 root->get_replica_nonce(),
4032 root->get_caps_wanted(),
4033 root->filelock.get_state(),
4034 root->nestlock.get_state(),
4035 root->dirfragtreelock.get_state());
4036 root->state_set(CInode::STATE_REJOINING);
4037 if (root->is_dirty_scattered()) {
4038 dout(10) << " sending scatterlock state on root " << *root << dendl;
4039 p->second->add_scatterlock_state(root);
4040 }
4041 }
4042
4043 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4044 p->second->add_strong_inode(in->vino(),
4045 in->get_replica_nonce(),
4046 in->get_caps_wanted(),
4047 in->filelock.get_state(),
4048 in->nestlock.get_state(),
4049 in->dirfragtreelock.get_state());
4050 in->state_set(CInode::STATE_REJOINING);
4051 }
4052 }
4053 }
4054
4055 if (!mds->is_rejoin()) {
4056 // i am survivor. send strong rejoin.
4057 // note request remote_auth_pins, xlocks
4058 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4059 p != active_requests.end();
4060 ++p) {
4061 MDRequestRef& mdr = p->second;
4062 if (mdr->is_slave())
4063 continue;
4064 // auth pins
4065 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4066 q != mdr->remote_auth_pins.end();
4067 ++q) {
4068 if (!q->first->is_auth()) {
4069 assert(q->second == q->first->authority().first);
4070 if (rejoins.count(q->second) == 0) continue;
4071 MMDSCacheRejoin *rejoin = rejoins[q->second];
4072
4073 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4074 MDSCacheObjectInfo i;
4075 q->first->set_object_info(i);
4076 if (i.ino)
4077 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4078 else
4079 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4080
4081 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4082 mdr->more()->rename_inode == q->first)
4083 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4084 mdr->reqid, mdr->attempt);
4085 }
4086 }
4087 // xlocks
4088 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4089 q != mdr->xlocks.end();
4090 ++q) {
4091 if (!(*q)->get_parent()->is_auth()) {
4092 mds_rank_t who = (*q)->get_parent()->authority().first;
4093 if (rejoins.count(who) == 0) continue;
4094 MMDSCacheRejoin *rejoin = rejoins[who];
4095
4096 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4097 MDSCacheObjectInfo i;
4098 (*q)->get_parent()->set_object_info(i);
4099 if (i.ino)
4100 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4101 mdr->reqid, mdr->attempt);
4102 else
4103 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4104 mdr->reqid, mdr->attempt);
4105 }
4106 }
4107 // remote wrlocks
4108 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4109 q != mdr->remote_wrlocks.end();
4110 ++q) {
4111 mds_rank_t who = q->second;
4112 if (rejoins.count(who) == 0) continue;
4113 MMDSCacheRejoin *rejoin = rejoins[who];
4114
4115 dout(15) << " " << *mdr << " wrlock on " << q->second
4116 << " " << q->first->get_parent() << dendl;
4117 MDSCacheObjectInfo i;
4118 q->first->get_parent()->set_object_info(i);
4119 assert(i.ino);
4120 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4121 mdr->reqid, mdr->attempt);
4122 }
4123 }
4124 }
4125
4126 // send the messages
4127 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4128 p != rejoins.end();
4129 ++p) {
4130 assert(rejoin_sent.count(p->first) == 0);
4131 assert(rejoin_ack_gather.count(p->first) == 0);
4132 rejoin_sent.insert(p->first);
4133 rejoin_ack_gather.insert(p->first);
4134 mds->send_message_mds(p->second, p->first);
4135 }
4136 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4137 rejoins_pending = false;
4138
4139 // nothing?
4140 if (mds->is_rejoin() && rejoins.empty()) {
4141 dout(10) << "nothing to rejoin" << dendl;
4142 rejoin_gather_finish();
4143 }
4144 }
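// --- Illustrative sketch (not part of MDCache.cc) --------------------------
// The dispatch rule at the top of rejoin_send_rejoins() can be stated on its
// own: a rejoining MDS sends a WEAK rejoin to every other rank in the recovery
// set, while a surviving MDS sends a STRONG rejoin only to ranks that are
// themselves rejoining, and nothing is resent to a peer already in
// rejoin_sent. A compact restatement of that rule; choose_rejoin_op is a
// made-up helper, not Ceph API.
enum class RejoinOp { NONE, WEAK, STRONG };

inline RejoinOp choose_rejoin_op(bool i_am_rejoining, bool peer_is_rejoining, bool already_sent) {
  if (already_sent)        return RejoinOp::NONE;    // rejoin_sent already has this peer
  if (i_am_rejoining)      return RejoinOp::WEAK;    // recovering: weak rejoin to everyone
  if (peer_is_rejoining)   return RejoinOp::STRONG;  // survivor: strong rejoin to rejoiners only
  return RejoinOp::NONE;                             // survivor-to-survivor: nothing to send
}
// ----------------------------------------------------------------------------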
4145
4146
4147 /**
4148 * rejoin_walk - build rejoin declarations for a subtree
4149 *
4150 * @param dir subtree root
4151 * @param rejoin rejoin message
4152 *
4153 * from a rejoining node:
4154 * weak dirfrag
4155 * weak dentries (w/ connectivity)
4156 *
4157 * from a surviving node:
4158 * strong dirfrag
4159 * strong dentries (no connectivity!)
4160 * strong inodes
4161 */
4162 void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4163 {
4164 dout(10) << "rejoin_walk " << *dir << dendl;
4165
4166 list<CDir*> nested; // finish this dir, then do nested items
4167
4168 if (mds->is_rejoin()) {
4169 // WEAK
4170 rejoin->add_weak_dirfrag(dir->dirfrag());
4171 for (CDir::map_t::iterator p = dir->items.begin();
4172 p != dir->items.end();
4173 ++p) {
4174 CDentry *dn = p->second;
4175 CDentry::linkage_t *dnl = dn->get_linkage();
4176 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4177 assert(dnl->is_primary());
4178 CInode *in = dnl->get_inode();
4179 assert(dnl->get_inode()->is_dir());
4180 rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
4181 in->get_nested_dirfrags(nested);
4182 if (in->is_dirty_scattered()) {
4183 dout(10) << " sending scatterlock state on " << *in << dendl;
4184 rejoin->add_scatterlock_state(in);
4185 }
4186 }
4187 } else {
4188 // STRONG
4189 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4190 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4191 dir->state_set(CDir::STATE_REJOINING);
4192
4193 for (CDir::map_t::iterator p = dir->items.begin();
4194 p != dir->items.end();
4195 ++p) {
4196 CDentry *dn = p->second;
4197 CDentry::linkage_t *dnl = dn->get_linkage();
4198 dout(15) << " add_strong_dentry " << *dn << dendl;
4199 rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4200 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4201 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4202 dnl->is_remote() ? dnl->get_remote_d_type():0,
4203 dn->get_replica_nonce(),
4204 dn->lock.get_state());
4205 dn->state_set(CDentry::STATE_REJOINING);
4206 if (dnl->is_primary()) {
4207 CInode *in = dnl->get_inode();
4208 dout(15) << " add_strong_inode " << *in << dendl;
4209 rejoin->add_strong_inode(in->vino(),
4210 in->get_replica_nonce(),
4211 in->get_caps_wanted(),
4212 in->filelock.get_state(),
4213 in->nestlock.get_state(),
4214 in->dirfragtreelock.get_state());
4215 in->state_set(CInode::STATE_REJOINING);
4216 in->get_nested_dirfrags(nested);
4217 if (in->is_dirty_scattered()) {
4218 dout(10) << " sending scatterlock state on " << *in << dendl;
4219 rejoin->add_scatterlock_state(in);
4220 }
4221 }
4222 }
4223 }
4224
4225 // recurse into nested dirs
4226 for (list<CDir*>::iterator p = nested.begin();
4227 p != nested.end();
4228 ++p)
4229 rejoin_walk(*p, rejoin);
4230 }
4231
4232
4233 /*
4234 * i got a rejoin.
4235 * - reply with the lockstate
4236 *
4237 * if i am active|stopping,
4238 * - remove source from replica list for everything not referenced here.
4239 * This function puts the passed message before returning.
4240 */
4241 void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4242 {
4243 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4244 << " (" << m->get_payload().length() << " bytes)"
4245 << dendl;
4246
4247 switch (m->op) {
4248 case MMDSCacheRejoin::OP_WEAK:
4249 handle_cache_rejoin_weak(m);
4250 break;
4251 case MMDSCacheRejoin::OP_STRONG:
4252 handle_cache_rejoin_strong(m);
4253 break;
4254 case MMDSCacheRejoin::OP_ACK:
4255 handle_cache_rejoin_ack(m);
4256 break;
4257
4258 default:
4259 ceph_abort();
4260 }
4261 m->put();
4262 }
4263
4264
4265 /*
4266 * handle_cache_rejoin_weak
4267 *
4268 * the sender
4269 * - is recovering from their journal.
4270 * - may have incorrect (out of date) inode contents
4271 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4272 *
4273 * if the sender didn't trim_non_auth(), they
4274 * - may have incorrect (out of date) dentry/inode linkage
4275 * - may have deleted/purged inodes
4276 * and i may have to go to disk to get accurate inode contents. yuck.
4277 * This function DOES NOT put the passed message before returning
4278 */
4279 void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4280 {
4281 mds_rank_t from = mds_rank_t(weak->get_source().num());
4282
4283 // possible response(s)
4284 MMDSCacheRejoin *ack = 0; // if survivor
4285 set<vinodeno_t> acked_inodes; // if survivor
4286 set<SimpleLock *> gather_locks; // if survivor
4287 bool survivor = false; // am i a survivor?
4288
4289 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4290 survivor = true;
4291 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
4292 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4293
4294 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4295
4296 // check cap exports
4297 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4298 CInode *in = get_inode(p->first);
4299 assert(!in || in->is_auth());
4300 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4301 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4302 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4303 Capability::Import& im = imported_caps[p->first][q->first];
4304 if (cap) {
4305 im.cap_id = cap->get_cap_id();
4306 im.issue_seq = cap->get_last_seq();
4307 im.mseq = cap->get_mseq();
4308 } else {
4309 // all are zero
4310 }
4311 }
4312 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4313 }
4314
4315 ::encode(imported_caps, ack->imported_caps);
4316 } else {
4317 assert(mds->is_rejoin());
4318
4319 // we may have already received a strong rejoin from the sender.
4320 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4321 assert(gather_locks.empty());
4322
4323 // check cap exports.
4324 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4325
4326 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4327 CInode *in = get_inode(p->first);
4328 assert(in && in->is_auth());
4329 // note
4330 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4331 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4332 cap_imports[p->first][q->first][from] = q->second;
4333 }
4334 }
4335 }
4336
4337 // assimilate any potentially dirty scatterlock state
4338 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4339 p != weak->inode_scatterlocks.end();
4340 ++p) {
4341 CInode *in = get_inode(p->first);
4342 assert(in);
4343 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4344 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4345 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4346 if (!survivor)
4347 rejoin_potential_updated_scatterlocks.insert(in);
4348 }
4349
4350 // recovering peer may send incorrect dirfrags here. we need to
4351 // infer which dirfrag they meant. the ack will include a
4352 // strong_dirfrag that will set them straight on the fragmentation.
4353
4354 // walk weak map
4355 set<CDir*> dirs_to_share;
4356 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4357 p != weak->weak_dirfrags.end();
4358 ++p) {
4359 CInode *diri = get_inode(p->ino);
4360 if (!diri)
4361 dout(0) << " missing dir ino " << p->ino << dendl;
4362 assert(diri);
4363
4364 list<frag_t> ls;
4365 if (diri->dirfragtree.is_leaf(p->frag)) {
4366 ls.push_back(p->frag);
4367 } else {
4368 diri->dirfragtree.get_leaves_under(p->frag, ls);
4369 if (ls.empty())
4370 ls.push_back(diri->dirfragtree[p->frag.value()]);
4371 }
4372 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4373 frag_t fg = *q;
4374 CDir *dir = diri->get_dirfrag(fg);
4375 if (!dir) {
4376 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4377 continue;
4378 }
4379 assert(dir);
4380 if (dirs_to_share.count(dir)) {
4381 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4382 } else {
4383 dirs_to_share.insert(dir);
4384 unsigned nonce = dir->add_replica(from);
4385 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4386 if (ack) {
4387 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4388 ack->add_dirfrag_base(dir);
4389 }
4390 }
4391 }
4392 }
4393
4394 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4395 p != weak->weak.end();
4396 ++p) {
4397 CInode *diri = get_inode(p->first);
4398 if (!diri)
4399 dout(0) << " missing dir ino " << p->first << dendl;
4400 assert(diri);
4401
4402 // weak dentries
4403 CDir *dir = 0;
4404 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4405 q != p->second.end();
4406 ++q) {
4407 // locate proper dirfrag.
4408 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4409 frag_t fg = diri->pick_dirfrag(q->first.name);
4410 if (!dir || dir->get_frag() != fg) {
4411 dir = diri->get_dirfrag(fg);
4412 if (!dir)
4413 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4414 assert(dir);
4415 assert(dirs_to_share.count(dir));
4416 }
4417
4418 // and dentry
4419 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4420 assert(dn);
4421 CDentry::linkage_t *dnl = dn->get_linkage();
4422 assert(dnl->is_primary());
4423
4424 if (survivor && dn->is_replica(from))
4425 dentry_remove_replica(dn, from, gather_locks);
4426 unsigned dnonce = dn->add_replica(from);
4427 dout(10) << " have " << *dn << dendl;
4428 if (ack)
4429 ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4430 dnl->get_inode()->ino(), inodeno_t(0), 0,
4431 dnonce, dn->lock.get_replica_state());
4432
4433 // inode
4434 CInode *in = dnl->get_inode();
4435 assert(in);
4436
4437 if (survivor && in->is_replica(from))
4438 inode_remove_replica(in, from, true, gather_locks);
4439 unsigned inonce = in->add_replica(from);
4440 dout(10) << " have " << *in << dendl;
4441
4442 // scatter the dirlock, just in case?
4443 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4444 in->filelock.set_state(LOCK_MIX);
4445
4446 if (ack) {
4447 acked_inodes.insert(in->vino());
4448 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4449 bufferlist bl;
4450 in->_encode_locks_state_for_rejoin(bl, from);
4451 ack->add_inode_locks(in, inonce, bl);
4452 }
4453 }
4454 }
4455
4456 // weak base inodes? (root, stray, etc.)
4457 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4458 p != weak->weak_inodes.end();
4459 ++p) {
4460 CInode *in = get_inode(*p);
4461 assert(in); // hmm fixme wrt stray?
4462 if (survivor && in->is_replica(from))
4463 inode_remove_replica(in, from, true, gather_locks);
4464 unsigned inonce = in->add_replica(from);
4465 dout(10) << " have base " << *in << dendl;
4466
4467 if (ack) {
4468 acked_inodes.insert(in->vino());
4469 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4470 bufferlist bl;
4471 in->_encode_locks_state_for_rejoin(bl, from);
4472 ack->add_inode_locks(in, inonce, bl);
4473 }
4474 }
4475
4476 assert(rejoin_gather.count(from));
4477 rejoin_gather.erase(from);
4478 if (survivor) {
4479 // survivor. do everything now.
4480 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4481 p != weak->inode_scatterlocks.end();
4482 ++p) {
4483 CInode *in = get_inode(p->first);
4484 assert(in);
4485 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4486 acked_inodes.insert(in->vino());
4487 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4488 }
4489
4490 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4491 mds->send_message(ack, weak->get_connection());
4492
4493 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4494 if (!(*p)->is_stable())
4495 mds->locker->eval_gather(*p);
4496 }
4497 } else {
4498 // done?
4499 if (rejoin_gather.empty()) {
4500 rejoin_gather_finish();
4501 } else {
4502 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4503 }
4504 }
4505 }
4506
4507 class C_MDC_RejoinGatherFinish : public MDCacheContext {
4508 public:
4509 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4510 void finish(int r) override {
4511 mdcache->rejoin_gather_finish();
4512 }
4513 };
4514
4515 /*
4516 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4517 *
4518 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4519 * ack, the replica does not exist, and we can remove it from our replica maps.
4520 */
4521 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4522 set<vinodeno_t>& acked_inodes,
4523 set<SimpleLock *>& gather_locks)
4524 {
4525 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4526
4527 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
4528 p != inode_map.end();
4529 ++p) {
4530 CInode *in = p->second;
4531
4532 // inode?
4533 if (in->is_auth() &&
4534 in->is_replica(from) &&
4535 (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
4536 inode_remove_replica(in, from, false, gather_locks);
4537 dout(10) << " rem " << *in << dendl;
4538 }
4539
4540 if (!in->is_dir()) continue;
4541
4542 list<CDir*> dfs;
4543 in->get_dirfrags(dfs);
4544 for (list<CDir*>::iterator p = dfs.begin();
4545 p != dfs.end();
4546 ++p) {
4547 CDir *dir = *p;
4548
4549 if (dir->is_auth() &&
4550 dir->is_replica(from) &&
4551 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4552 dir->remove_replica(from);
4553 dout(10) << " rem " << *dir << dendl;
4554 }
4555
4556 // dentries
4557 for (CDir::map_t::iterator p = dir->items.begin();
4558 p != dir->items.end();
4559 ++p) {
4560 CDentry *dn = p->second;
4561
4562 if (dn->is_replica(from) &&
4563 (ack == NULL ||
4564 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4565 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
4566 dentry_remove_replica(dn, from, gather_locks);
4567 dout(10) << " rem " << *dn << dendl;
4568 }
4569 }
4570 }
4571 }
4572 }
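// --- Illustrative sketch (not part of MDCache.cc) --------------------------
// The scouring loop above applies one rule to every cached object: if we are
// auth, the peer holds a replica, and the ack (when there is one) does not
// mention that object, the peer must have dropped it, so we remove the peer
// from the object's replica map. The rule in isolation over plain integer ids;
// scour_replicas is an invented helper, not Ceph API.
#include <map>
#include <set>

// 'replicated_to' maps object id -> set of ranks holding a replica.
// 'acked' is the set of object ids the peer mentioned in its ack; the
// "no ack at all" case is modelled by have_ack = false, which scours everything.
inline void scour_replicas(std::map<int, std::set<int>>& replicated_to,
                           int peer,
                           const std::set<int>& acked,
                           bool have_ack) {
  for (auto& p : replicated_to) {
    if (p.second.count(peer) &&
        (!have_ack || acked.count(p.first) == 0))
      p.second.erase(peer);   // peer no longer holds a replica of this object
  }
}
// ----------------------------------------------------------------------------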
4573
4574
4575 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4576 {
4577 CInode *in = new CInode(this, true, 1, last);
4578 in->inode.ino = ino;
4579 in->state_set(CInode::STATE_REJOINUNDEF);
4580 add_inode(in);
4581 rejoin_undef_inodes.insert(in);
4582 dout(10) << " invented " << *in << dendl;
4583 return in;
4584 }
4585
4586 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4587 {
4588 CInode *in = get_inode(df.ino);
4589 if (!in)
4590 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4591 if (!in->is_dir()) {
4592 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4593 in->inode.mode = S_IFDIR;
4594 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4595 }
4596 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4597 dir->state_set(CDir::STATE_REJOINUNDEF);
4598 rejoin_undef_dirfrags.insert(dir);
4599 dout(10) << " invented " << *dir << dendl;
4600 return dir;
4601 }
4602
4603 /* This function DOES NOT put the passed message before returning */
4604 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4605 {
4606 mds_rank_t from = mds_rank_t(strong->get_source().num());
4607
4608 // only a recovering node will get a strong rejoin.
4609 assert(mds->is_rejoin());
4610
4611 // assimilate any potentially dirty scatterlock state
4612 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4613 p != strong->inode_scatterlocks.end();
4614 ++p) {
4615 CInode *in = get_inode(p->first);
4616 assert(in);
4617 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4618 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4619 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4620 rejoin_potential_updated_scatterlocks.insert(in);
4621 }
4622
4623 rejoin_unlinked_inodes[from].clear();
4624
4625 // surviving peer may send incorrect dirfrag here (maybe they didn't
4626 // get the fragment notify, or maybe we rolled back?). we need to
4627 // infer the right frag and get them with the program. somehow.
4628 // we don't normally send ACK.. so we'll need to bundle this with
4629 // MISSING or something.
4630
4631 // strong dirfrags/dentries.
4632 // also process auth_pins, xlocks.
4633 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4634 p != strong->strong_dirfrags.end();
4635 ++p) {
4636 CInode *diri = get_inode(p->first.ino);
4637 if (!diri)
4638 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4639 CDir *dir = diri->get_dirfrag(p->first.frag);
4640 bool refragged = false;
4641 if (dir) {
4642 dout(10) << " have " << *dir << dendl;
4643 } else {
4644 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4645 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4646 else if (diri->dirfragtree.is_leaf(p->first.frag))
4647 dir = rejoin_invent_dirfrag(p->first);
4648 }
4649 if (dir) {
4650 dir->add_replica(from, p->second.nonce);
4651 dir->dir_rep = p->second.dir_rep;
4652 } else {
4653 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4654 list<frag_t> ls;
4655 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4656 if (ls.empty())
4657 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4658 dout(10) << " maps to frag(s) " << ls << dendl;
4659 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4660 CDir *dir = diri->get_dirfrag(*q);
4661 if (!dir)
4662 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4663 else
4664 dout(10) << " have(approx) " << *dir << dendl;
4665 dir->add_replica(from, p->second.nonce);
4666 dir->dir_rep = p->second.dir_rep;
4667 }
4668 refragged = true;
4669 }
4670
4671 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4672 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4673 q != dmap.end();
4674 ++q) {
4675 CDentry *dn;
4676 if (!refragged)
4677 dn = dir->lookup(q->first.name, q->first.snapid);
4678 else {
4679 frag_t fg = diri->pick_dirfrag(q->first.name);
4680 dir = diri->get_dirfrag(fg);
4681 assert(dir);
4682 dn = dir->lookup(q->first.name, q->first.snapid);
4683 }
4684 if (!dn) {
4685 if (q->second.is_remote()) {
4686 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4687 q->second.first, q->first.snapid);
4688 } else if (q->second.is_null()) {
4689 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4690 } else {
4691 CInode *in = get_inode(q->second.ino, q->first.snapid);
4692 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4693 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4694 }
4695 dout(10) << " invented " << *dn << dendl;
4696 }
4697 CDentry::linkage_t *dnl = dn->get_linkage();
4698
4699 // dn auth_pin?
4700 if (strong->authpinned_dentries.count(p->first) &&
4701 strong->authpinned_dentries[p->first].count(q->first)) {
4702 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4703 r != strong->authpinned_dentries[p->first][q->first].end();
4704 ++r) {
4705 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4706
4707 // get/create slave mdrequest
4708 MDRequestRef mdr;
4709 if (have_request(r->reqid))
4710 mdr = request_get(r->reqid);
4711 else
4712 mdr = request_start_slave(r->reqid, r->attempt, strong);
4713 mdr->auth_pin(dn);
4714 }
4715 }
4716
4717 // dn xlock?
4718 if (strong->xlocked_dentries.count(p->first) &&
4719 strong->xlocked_dentries[p->first].count(q->first)) {
4720 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4721 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4722 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4723 assert(mdr->is_auth_pinned(dn));
4724 if (!mdr->xlocks.count(&dn->versionlock)) {
4725 assert(dn->versionlock.can_xlock_local());
4726 dn->versionlock.get_xlock(mdr, mdr->get_client());
4727 mdr->xlocks.insert(&dn->versionlock);
4728 mdr->locks.insert(&dn->versionlock);
4729 }
4730 if (dn->lock.is_stable())
4731 dn->auth_pin(&dn->lock);
4732 dn->lock.set_state(LOCK_XLOCK);
4733 dn->lock.get_xlock(mdr, mdr->get_client());
4734 mdr->xlocks.insert(&dn->lock);
4735 mdr->locks.insert(&dn->lock);
4736 }
4737
4738 dn->add_replica(from, q->second.nonce);
4739 dout(10) << " have " << *dn << dendl;
4740
4741 if (dnl->is_primary()) {
4742 if (q->second.is_primary()) {
4743 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4744 // the survivor missed MDentryUnlink+MDentryLink messages ?
4745 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4746 CInode *in = get_inode(q->second.ino, q->first.snapid);
4747 assert(in);
4748 assert(in->get_parent_dn());
4749 rejoin_unlinked_inodes[from].insert(in);
4750 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4751 }
4752 } else {
4753 // the survivor missed MDentryLink message ?
4754 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4755 dout(7) << " sender doesn't have primay dentry" << dendl;
4756 }
4757 } else {
4758 if (q->second.is_primary()) {
4759 // the survivor missed MDentryUnlink message ?
4760 CInode *in = get_inode(q->second.ino, q->first.snapid);
4761 assert(in);
4762 assert(in->get_parent_dn());
4763 rejoin_unlinked_inodes[from].insert(in);
4764 dout(7) << " sender has primary dentry but we don't" << dendl;
4765 }
4766 }
4767 }
4768 }
4769
4770 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4771 p != strong->strong_inodes.end();
4772 ++p) {
4773 CInode *in = get_inode(p->first);
4774 assert(in);
4775 in->add_replica(from, p->second.nonce);
4776 dout(10) << " have " << *in << dendl;
4777
4778 MMDSCacheRejoin::inode_strong &is = p->second;
4779
4780 // caps_wanted
4781 if (is.caps_wanted) {
4782 in->mds_caps_wanted[from] = is.caps_wanted;
4783 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4784 << " on " << *in << dendl;
4785 }
4786
4787 // scatterlocks?
4788 // infer state from replica state:
4789 // * go to MIX if they might have wrlocks
4790 // * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling the filelock)
4791 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4792 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4793 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4794
4795 // auth pin?
4796 if (strong->authpinned_inodes.count(in->vino())) {
4797 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4798 r != strong->authpinned_inodes[in->vino()].end();
4799 ++r) {
4800 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4801
4802 // get/create slave mdrequest
4803 MDRequestRef mdr;
4804 if (have_request(r->reqid))
4805 mdr = request_get(r->reqid);
4806 else
4807 mdr = request_start_slave(r->reqid, r->attempt, strong);
4808 if (strong->frozen_authpin_inodes.count(in->vino())) {
4809 assert(!in->get_num_auth_pins());
4810 mdr->freeze_auth_pin(in);
4811 } else {
4812 assert(!in->is_frozen_auth_pin());
4813 }
4814 mdr->auth_pin(in);
4815 }
4816 }
4817 // xlock(s)?
4818 if (strong->xlocked_inodes.count(in->vino())) {
4819 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4820 q != strong->xlocked_inodes[in->vino()].end();
4821 ++q) {
4822 SimpleLock *lock = in->get_lock(q->first);
4823 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4824 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4825 assert(mdr->is_auth_pinned(in));
4826 if (!mdr->xlocks.count(&in->versionlock)) {
4827 assert(in->versionlock.can_xlock_local());
4828 in->versionlock.get_xlock(mdr, mdr->get_client());
4829 mdr->xlocks.insert(&in->versionlock);
4830 mdr->locks.insert(&in->versionlock);
4831 }
4832 if (lock->is_stable())
4833 in->auth_pin(lock);
4834 lock->set_state(LOCK_XLOCK);
4835 if (lock == &in->filelock)
4836 in->loner_cap = -1;
4837 lock->get_xlock(mdr, mdr->get_client());
4838 mdr->xlocks.insert(lock);
4839 mdr->locks.insert(lock);
4840 }
4841 }
4842 }
4843 // wrlock(s)?
4844 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4845 p != strong->wrlocked_inodes.end();
4846 ++p) {
4847 CInode *in = get_inode(p->first);
4848 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4849 q != p->second.end();
4850 ++q) {
4851 SimpleLock *lock = in->get_lock(q->first);
4852 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4853 r != q->second.end();
4854 ++r) {
4855 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4856 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4857 if (in->is_auth())
4858 assert(mdr->is_auth_pinned(in));
4859 lock->set_state(LOCK_MIX);
4860 if (lock == &in->filelock)
4861 in->loner_cap = -1;
4862 lock->get_wrlock(true);
4863 mdr->wrlocks.insert(lock);
4864 mdr->locks.insert(lock);
4865 }
4866 }
4867 }
4868
4869 // done?
4870 assert(rejoin_gather.count(from));
4871 rejoin_gather.erase(from);
4872 if (rejoin_gather.empty()) {
4873 rejoin_gather_finish();
4874 } else {
4875 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4876 }
4877 }
4878
4879 /* This function DOES NOT put the passed message before returning */
4880 void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4881 {
4882 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4883 mds_rank_t from = mds_rank_t(ack->get_source().num());
4884
4885 // for sending cache expire message
4886 set<CInode*> isolated_inodes;
4887 set<CInode*> refragged_inodes;
4888
4889 // dirs
4890 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4891 p != ack->strong_dirfrags.end();
4892 ++p) {
4893 // we may have had incorrect dir fragmentation; refragment based
4894 // on what the auth tells us.
4895 CDir *dir = get_dirfrag(p->first);
4896 if (!dir) {
4897 dir = get_force_dirfrag(p->first, false);
4898 if (dir)
4899 refragged_inodes.insert(dir->get_inode());
4900 }
4901 if (!dir) {
4902 CInode *diri = get_inode(p->first.ino);
4903 if (!diri) {
4904 // barebones inode; the full inode loop below will clean up.
4905 diri = new CInode(this, false);
4906 diri->inode.ino = p->first.ino;
4907 diri->inode.mode = S_IFDIR;
4908 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4909 add_inode(diri);
4910 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4911 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4912 dout(10) << " add inode " << *diri << dendl;
4913 } else {
4914 diri->inode_auth = CDIR_AUTH_DEFAULT;
4915 isolated_inodes.insert(diri);
4916 dout(10) << " unconnected dirfrag " << p->first << dendl;
4917 }
4918 }
4919 // barebones dirfrag; the full dirfrag loop below will clean up.
4920 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4921 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4922 (dir->authority() != CDIR_AUTH_UNDEF &&
4923 dir->authority().first != from))
4924 adjust_subtree_auth(dir, from);
4925 dout(10) << " add dirfrag " << *dir << dendl;
4926 }
4927
4928 dir->set_replica_nonce(p->second.nonce);
4929 dir->state_clear(CDir::STATE_REJOINING);
4930 dout(10) << " got " << *dir << dendl;
4931
4932 // dentries
4933 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4934 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4935 q != dmap.end();
4936 ++q) {
4937 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4938 if(!dn)
4939 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4940
4941 CDentry::linkage_t *dnl = dn->get_linkage();
4942
4943 assert(dn->last == q->first.snapid);
4944 if (dn->first != q->second.first) {
4945 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4946 dn->first = q->second.first;
4947 }
4948
4949 // may have bad linkage if we missed dentry link/unlink messages
4950 if (dnl->is_primary()) {
4951 CInode *in = dnl->get_inode();
4952 if (!q->second.is_primary() ||
4953 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4954 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4955 dir->unlink_inode(dn);
4956 }
4957 } else if (dnl->is_remote()) {
4958 if (!q->second.is_remote() ||
4959 q->second.remote_ino != dnl->get_remote_ino() ||
4960 q->second.remote_d_type != dnl->get_remote_d_type()) {
4961 dout(10) << " had bad linkage for " << *dn << dendl;
4962 dir->unlink_inode(dn);
4963 }
4964 } else {
4965 if (!q->second.is_null())
4966 dout(10) << " had bad linkage for " << *dn << dendl;
4967 }
4968
4969 // hmm, did we have the proper linkage here?
4970 if (dnl->is_null() && !q->second.is_null()) {
4971 if (q->second.is_remote()) {
4972 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4973 } else {
4974 CInode *in = get_inode(q->second.ino, q->first.snapid);
4975 if (!in) {
4976 // barebones inode; assume it's dir, the full inode loop below will clean up.
4977 in = new CInode(this, false, q->second.first, q->first.snapid);
4978 in->inode.ino = q->second.ino;
4979 in->inode.mode = S_IFDIR;
4980 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4981 add_inode(in);
4982 dout(10) << " add inode " << *in << dendl;
4983 } else if (in->get_parent_dn()) {
4984 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
4985 << ", unlinking " << *in << dendl;
4986 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
4987 }
4988 dn->dir->link_primary_inode(dn, in);
4989 isolated_inodes.erase(in);
4990 }
4991 }
4992
4993 dn->set_replica_nonce(q->second.nonce);
4994 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
4995 dn->state_clear(CDentry::STATE_REJOINING);
4996 dout(10) << " got " << *dn << dendl;
4997 }
4998 }
4999
5000 for (set<CInode*>::iterator p = refragged_inodes.begin();
5001 p != refragged_inodes.end();
5002 ++p) {
5003 list<CDir*> ls;
5004 (*p)->get_nested_dirfrags(ls);
5005 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5006 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5007 continue;
5008 assert((*q)->get_num_any() == 0);
5009 (*p)->close_dirfrag((*q)->get_frag());
5010 }
5011 }
5012
5013 // full dirfrags
5014 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5015 p != ack->dirfrag_bases.end();
5016 ++p) {
5017 CDir *dir = get_dirfrag(p->first);
5018 assert(dir);
5019 bufferlist::iterator q = p->second.begin();
5020 dir->_decode_base(q);
5021 dout(10) << " got dir replica " << *dir << dendl;
5022 }
5023
5024 // full inodes
5025 bufferlist::iterator p = ack->inode_base.begin();
5026 while (!p.end()) {
5027 inodeno_t ino;
5028 snapid_t last;
5029 bufferlist basebl;
5030 ::decode(ino, p);
5031 ::decode(last, p);
5032 ::decode(basebl, p);
5033 CInode *in = get_inode(ino, last);
5034 assert(in);
5035 bufferlist::iterator q = basebl.begin();
5036 in->_decode_base(q);
5037 dout(10) << " got inode base " << *in << dendl;
5038 }
5039
5040 // inodes
5041 p = ack->inode_locks.begin();
5042 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5043 while (!p.end()) {
5044 inodeno_t ino;
5045 snapid_t last;
5046 __u32 nonce;
5047 bufferlist lockbl;
5048 ::decode(ino, p);
5049 ::decode(last, p);
5050 ::decode(nonce, p);
5051 ::decode(lockbl, p);
5052
5053 CInode *in = get_inode(ino, last);
5054 assert(in);
5055 in->set_replica_nonce(nonce);
5056 bufferlist::iterator q = lockbl.begin();
5057 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
5058 in->state_clear(CInode::STATE_REJOINING);
5059 dout(10) << " got inode locks " << *in << dendl;
5060 }
5061
5062 // FIXME: This can happen if an entire subtree, together with the inode the subtree root
5063 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5064 assert(isolated_inodes.empty());
5065
5066 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5067 bufferlist::iterator bp = ack->imported_caps.begin();
5068 ::decode(peer_imported, bp);
5069
5070 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5071 p != peer_imported.end();
5072 ++p) {
5073 assert(cap_exports.count(p->first));
5074 assert(cap_export_targets.count(p->first));
5075 assert(cap_export_targets[p->first] == from);
5076 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5077 q != p->second.end();
5078 ++q) {
5079 assert(cap_exports[p->first].count(q->first));
5080
5081 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5082 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5083 assert(session);
5084
5085 // mark client caps stale.
5086 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5087 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5088 mds->get_osd_epoch_barrier());
5089 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5090 (q->second.cap_id > 0 ? from : -1), 0);
5091 mds->send_message_client_counted(m, session);
5092
5093 cap_exports[p->first].erase(q->first);
5094 }
5095 assert(cap_exports[p->first].empty());
5096 }
5097
5098 // done?
5099 assert(rejoin_ack_gather.count(from));
5100 rejoin_ack_gather.erase(from);
5101 if (mds->is_rejoin()) {
5102
5103 if (rejoin_gather.empty()) {
5104 // eval unstable scatter locks after all wrlocks are rejoined.
5105 while (!rejoin_eval_locks.empty()) {
5106 SimpleLock *lock = rejoin_eval_locks.front();
5107 rejoin_eval_locks.pop_front();
5108 if (!lock->is_stable())
5109 mds->locker->eval_gather(lock);
5110 }
5111 }
5112
5113 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5114 rejoin_ack_gather.empty()) {
5115 // finally, kickstart past snap parent opens
5116 open_snap_parents();
5117 } else {
5118 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5119 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5120 }
5121 } else {
5122 // survivor.
5123 mds->queue_waiters(rejoin_waiters);
5124 }
5125 }
5126
5127 /**
5128 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5129 *
5130 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5131 * messages that clean these guys up...
5132 */
5133 void MDCache::rejoin_trim_undef_inodes()
5134 {
5135 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5136
5137 while (!rejoin_undef_inodes.empty()) {
5138 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5139 CInode *in = *p;
5140 rejoin_undef_inodes.erase(p);
5141
5142 in->clear_replica_map();
5143
5144 // close out dirfrags
5145 if (in->is_dir()) {
5146 list<CDir*> dfls;
5147 in->get_dirfrags(dfls);
5148 for (list<CDir*>::iterator p = dfls.begin();
5149 p != dfls.end();
5150 ++p) {
5151 CDir *dir = *p;
5152 dir->clear_replica_map();
5153
5154 for (CDir::map_t::iterator p = dir->items.begin();
5155 p != dir->items.end();
5156 ++p) {
5157 CDentry *dn = p->second;
5158 dn->clear_replica_map();
5159
5160 dout(10) << " trimming " << *dn << dendl;
5161 dir->remove_dentry(dn);
5162 }
5163
5164 dout(10) << " trimming " << *dir << dendl;
5165 in->close_dirfrag(dir->dirfrag().frag);
5166 }
5167 }
5168
5169 CDentry *dn = in->get_parent_dn();
5170 if (dn) {
5171 dn->clear_replica_map();
5172 dout(10) << " trimming " << *dn << dendl;
5173 dn->dir->remove_dentry(dn);
5174 } else {
5175 dout(10) << " trimming " << *in << dendl;
5176 remove_inode(in);
5177 }
5178 }
5179
5180 assert(rejoin_undef_inodes.empty());
5181 }
5182
5183 void MDCache::rejoin_gather_finish()
5184 {
5185 dout(10) << "rejoin_gather_finish" << dendl;
5186 assert(mds->is_rejoin());
5187
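  // each of these helpers returns true if it kicked off asynchronous work
  // (dirfrag fetches, open_ino lookups or session opens); their completions
  // eventually re-enter rejoin_gather_finish().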
5188 if (open_undef_inodes_dirfrags())
5189 return;
5190
5191 if (process_imported_caps())
5192 return;
5193
5194 choose_lock_states_and_reconnect_caps();
5195
5196 identify_files_to_recover();
5197 rejoin_send_acks();
5198
5199 // signal completion of fetches, rejoin_gather_finish, etc.
5200 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5201 rejoin_ack_gather.erase(mds->get_nodeid());
5202
5203 // did we already get our acks too?
5204 if (rejoin_ack_gather.empty()) {
5205 // finally, kickstart past snap parent opens
5206 open_snap_parents();
5207 }
5208 }
5209
5210 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5211 inodeno_t ino;
5212 public:
5213 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5214 void finish(int r) override {
5215 mdcache->rejoin_open_ino_finish(ino, r);
5216 }
5217 };
5218
5219 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5220 {
5221 dout(10) << "rejoin_open_ino_finish ino " << ino << " ret " << ret << dendl;
5222
5223 if (ret < 0) {
5224 cap_imports_missing.insert(ino);
5225 } else if (ret == mds->get_nodeid()) {
5226 assert(get_inode(ino));
5227 } else {
5228 auto p = cap_imports.find(ino);
5229 assert(p != cap_imports.end());
5230 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5231 assert(q->second.count(MDS_RANK_NONE));
5232 assert(q->second.size() == 1);
5233 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5234 }
5235 cap_imports.erase(p);
5236 }
5237
5238 assert(cap_imports_num_opening > 0);
5239 cap_imports_num_opening--;
5240
5241 if (cap_imports_num_opening == 0) {
5242 if (rejoin_gather.empty())
5243 rejoin_gather_finish();
5244 else if (rejoin_gather.count(mds->get_nodeid()))
5245 process_imported_caps();
5246 }
5247 }
5248
5249 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5250 public:
5251 map<client_t,entity_inst_t> client_map;
5252 map<client_t,uint64_t> sseqmap;
5253
5254 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5255 MDCacheLogContext(c), client_map(cm) {}
5256 void finish(int r) override {
5257 assert(r == 0);
5258 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5259 }
5260 };
5261
5262 void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5263 map<client_t,uint64_t>& sseqmap)
5264 {
5265 dout(10) << "rejoin_open_sessions_finish" << dendl;
5266 mds->server->finish_force_open_sessions(client_map, sseqmap);
5267 if (rejoin_gather.empty())
5268 rejoin_gather_finish();
5269 }
5270
5271 bool MDCache::process_imported_caps()
5272 {
5273 dout(10) << "process_imported_caps" << dendl;
5274
5275 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5276 CInode *in = get_inode(p->first);
5277 if (in) {
5278 assert(in->is_auth());
5279 cap_imports_missing.erase(p->first);
5280 continue;
5281 }
5282 if (cap_imports_missing.count(p->first) > 0)
5283 continue;
5284
5285 cap_imports_num_opening++;
5286 dout(10) << " opening missing ino " << p->first << dendl;
5287 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5288 }
5289
5290 if (cap_imports_num_opening > 0)
5291 return true;
5292
5293 // called from rejoin_gather_finish()?
5294 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5295 // are the sessions for all imported caps open?
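    // if any session is missing, journal an ESessions entry that force-opens all
    // of them; rejoin continues from rejoin_open_sessions_finish() once the log
    // entry is safe.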
5296 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5297 p != rejoin_client_map.end();
5298 ++p) {
5299 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5300 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5301 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5302 ESessions *le = new ESessions(pv, rejoin_client_map);
5303 mds->mdlog->start_submit_entry(le, finish);
5304 mds->mdlog->flush();
5305 rejoin_client_map.clear();
5306 return true;
5307 }
5308 }
5309 rejoin_client_map.clear();
5310
5311 // process caps that were exported by slave rename
5312 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5313 p != rejoin_slave_exports.end();
5314 ++p) {
5315 CInode *in = get_inode(p->first);
5316 assert(in);
5317 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5318 q != p->second.second.end();
5319 ++q) {
5320 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5321 assert(session);
5322
5323 Capability *cap = in->get_client_cap(q->first);
5324 if (!cap)
5325 cap = in->add_client_cap(q->first, session);
5326 cap->merge(q->second, true);
5327
5328 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5329 assert(cap->get_last_seq() == im.issue_seq);
5330 assert(cap->get_mseq() == im.mseq);
5331 cap->set_cap_id(im.cap_id);
5332 // send cap import because we assigned a new cap ID
5333 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5334 p->second.first, CEPH_CAP_FLAG_AUTH);
5335 }
5336 }
5337 rejoin_slave_exports.clear();
5338 rejoin_imported_caps.clear();
5339
5340 // process cap imports
5341 // ino -> client -> frommds -> capex
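    // (judging from the loop below, cap_imports maps inodeno_t -> client_t ->
    //  exporting mds_rank_t -> cap_reconnect_t; a rank of MDS_RANK_NONE appears
    //  to mean the cap was reconnected directly by the client, so no cap-import
    //  message is sent for it)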
5342 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5343 CInode *in = get_inode(p->first);
5344 if (!in) {
5345 dout(10) << " still missing ino " << p->first
5346 << ", will try again after replayed client requests" << dendl;
5347 ++p;
5348 continue;
5349 }
5350 assert(in->is_auth());
5351 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5352 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5353 assert(session);
5354 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5355 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5356 add_reconnected_cap(q->first, in->ino(), r->second);
5357 if (r->first >= 0) {
5358 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5359 cap->inc_mseq();
5360 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5361
5362 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5363 im.cap_id = cap->get_cap_id();
5364 im.issue_seq = cap->get_last_seq();
5365 im.mseq = cap->get_mseq();
5366 }
5367 }
5368 }
5369 cap_imports.erase(p++); // remove and move on
5370 }
5371 } else {
5372 trim_non_auth();
5373
5374 rejoin_gather.erase(mds->get_nodeid());
5375 maybe_send_pending_rejoins();
5376
5377 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5378 rejoin_gather_finish();
5379 }
5380 return false;
5381 }
5382
5383 void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5384 {
5385 // are this realm's parents fully open?
5386 if (realm->have_past_parents_open()) {
5387 dout(10) << " have past snap parents for realm " << *realm
5388 << " on " << *realm->inode << dendl;
5389 if (reconnect) {
5390 // finish off client snaprealm reconnects?
5391 auto p = reconnected_snaprealms.find(realm->inode->ino());
5392 if (p != reconnected_snaprealms.end()) {
5393 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5394 finish_snaprealm_reconnect(q->first, realm, q->second);
5395 reconnected_snaprealms.erase(p);
5396 }
5397 }
5398 } else {
5399 if (!missing_snap_parents.count(realm->inode)) {
5400 dout(10) << " MISSING past snap parents for realm " << *realm
5401 << " on " << *realm->inode << dendl;
5402 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5403 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5404 } else {
5405 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5406 << " on " << *realm->inode << dendl;
5407 }
5408 }
5409 }
5410
5411 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5412 client_t client, snapid_t snap_follows)
5413 {
5414 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5415
5416 const set<snapid_t>& snaps = realm->get_snaps();
5417 snapid_t follows = snap_follows;
5418
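  // walk the chain of past (snapshotted) inodes below head_in, oldest first:
  // pick_inode_snap() returns the snapped inode covering the snap just after
  // 'follows', and we stop once it hands back the head inode itself.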
5419 while (true) {
5420 CInode *in = pick_inode_snap(head_in, follows);
5421 if (in == head_in)
5422 break;
5423 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5424
5425 /* TODO: we can check the reconnected/flushing caps to find
5426 * which locks need gathering */
5427 for (int i = 0; i < num_cinode_locks; i++) {
5428 int lockid = cinode_lock_info[i].lock;
5429 SimpleLock *lock = in->get_lock(lockid);
5430 assert(lock);
5431 in->client_snap_caps[lockid].insert(client);
5432 in->auth_pin(lock);
5433 lock->set_state(LOCK_SNAP_SYNC);
5434 lock->get_wrlock(true);
5435 }
5436
5437 for (auto p = snaps.lower_bound(in->first);
5438 p != snaps.end() && *p <= in->last;
5439 ++p) {
5440 head_in->add_need_snapflush(in, *p, client);
5441 }
5442
5443 follows = in->last;
5444 }
5445 }
5446
5447 /*
5448 * choose lock states based on reconnected caps
5449 */
5450 void MDCache::choose_lock_states_and_reconnect_caps()
5451 {
5452 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5453
5454 map<client_t,MClientSnap*> splits;
5455
5456 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator i = inode_map.begin();
5457 i != inode_map.end();
5458 ++i) {
5459 CInode *in = i->second;
5460
5461 if (in->last != CEPH_NOSNAP)
5462 continue;
5463
5464 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5465 in->mark_dirty_rstat();
5466
5467 auto p = reconnected_caps.find(in->ino());
5468
5469 int dirty_caps = 0;
5470 if (p != reconnected_caps.end()) {
5471 for (const auto &it : p->second)
5472 dirty_caps |= it.second.dirty_caps;
5473 }
5474 in->choose_lock_states(dirty_caps);
5475 dout(15) << " chose lock states on " << *in << dendl;
5476
5477 SnapRealm *realm = in->find_snaprealm();
5478
5479 check_realm_past_parents(realm, realm == in->snaprealm);
5480
5481 if (p != reconnected_caps.end()) {
5482 bool missing_snap_parent = false;
5483 // also, make sure client's cap is in the correct snaprealm.
5484 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5485 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5486 if (realm->have_past_parents_open()) {
5487 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5488 } else {
5489 missing_snap_parent = true;
5490 }
5491 }
5492
5493 if (q->second.realm_ino == realm->inode->ino()) {
5494 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5495 } else {
5496 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5497 << " != " << realm->inode->ino() << dendl;
5498 if (realm->have_past_parents_open()) {
5499 // ok, include in a split message _now_.
5500 prepare_realm_split(realm, q->first, in->ino(), splits);
5501 } else {
5502 // send the split later.
5503 missing_snap_parent = true;
5504 }
5505 }
5506 }
5507 if (missing_snap_parent)
5508 missing_snap_parents[realm->inode].insert(in);
5509 }
5510 }
5511
5512 send_snaps(splits);
5513 }
5514
5515 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5516 map<client_t,MClientSnap*>& splits)
5517 {
5518 MClientSnap *snap;
5519 if (splits.count(client) == 0) {
5520 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5521 snap->head.split = realm->inode->ino();
5522 realm->build_snap_trace(snap->bl);
5523
5524 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5525 p != realm->open_children.end();
5526 ++p)
5527 snap->split_realms.push_back((*p)->inode->ino());
5528
5529 } else
5530 snap = splits[client];
5531 snap->split_inos.push_back(ino);
5532 }
5533
5534 void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5535 {
5536 dout(10) << "send_snaps" << dendl;
5537
5538 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5539 p != splits.end();
5540 ++p) {
5541 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5542 if (session) {
5543 dout(10) << " client." << p->first
5544 << " split " << p->second->head.split
5545 << " inos " << p->second->split_inos
5546 << dendl;
5547 mds->send_message_client_counted(p->second, session);
5548 } else {
5549 dout(10) << " no session for client." << p->first << dendl;
5550 p->second->put();
5551 }
5552 }
5553 splits.clear();
5554 }
5555
5556
5557 /*
5558 * remove any items from logsegment open_file lists that don't have
5559 * any caps
5560 */
5561 void MDCache::clean_open_file_lists()
5562 {
5563 dout(10) << "clean_open_file_lists" << dendl;
5564
5565 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5566 p != mds->mdlog->segments.end();
5567 ++p) {
5568 LogSegment *ls = p->second;
5569
5570 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5571 while (!q.end()) {
5572 CInode *in = *q;
5573 ++q;
5574 if (in->last == CEPH_NOSNAP) {
5575 if (!in->is_any_caps_wanted()) {
5576 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5577 in->item_open_file.remove_myself();
5578 }
5579 } else { // in->last != CEPH_NOSNAP
5580 if (in->client_snap_caps.empty()) {
5581 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5582 in->item_open_file.remove_myself();
5583 }
5584 }
5585 }
5586 }
5587 }
5588
5589
5590
5591 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5592 {
5593 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5594 << " on " << *in << dendl;
5595 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5596 if (!session) {
5597 dout(10) << " no session for client." << client << dendl;
5598 return NULL;
5599 }
5600
5601 Capability *cap = in->reconnect_cap(client, icr, session);
5602
5603 if (frommds >= 0) {
5604 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5605 cap->inc_mseq();
5606 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5607 }
5608
5609 return cap;
5610 }
5611
5612 void MDCache::export_remaining_imported_caps()
5613 {
5614 dout(10) << "export_remaining_imported_caps" << dendl;
5615
5616 stringstream warn_str;
5617
5618 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5619 warn_str << " ino " << p->first << "\n";
5620 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5621 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5622 if (session) {
5623 // mark client caps stale.
5624 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5625 stale->set_cap_peer(0, 0, 0, -1, 0);
5626 mds->send_message_client_counted(stale, q->first);
5627 }
5628 }
5629
5630 mds->heartbeat_reset();
5631 }
5632
5633 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5634 p != cap_reconnect_waiters.end();
5635 ++p)
5636 mds->queue_waiters(p->second);
5637
5638 cap_imports.clear();
5639 cap_reconnect_waiters.clear();
5640
5641 if (warn_str.peek() != EOF) {
5642 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5643 mds->clog->warn(warn_str);
5644 }
5645 }
5646
5647 void MDCache::try_reconnect_cap(CInode *in, Session *session)
5648 {
5649 client_t client = session->info.get_client();
5650 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5651 if (rc) {
5652 in->reconnect_cap(client, *rc, session);
5653 dout(10) << "try_reconnect_cap client." << client
5654 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5655 << " issue " << ccap_string(rc->capinfo.issued)
5656 << " on " << *in << dendl;
5657 remove_replay_cap_reconnect(in->ino(), client);
5658
5659 if (in->is_replicated()) {
5660 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5661 } else {
5662 int dirty_caps = 0;
5663 auto p = reconnected_caps.find(in->ino());
5664 if (p != reconnected_caps.end()) {
5665 auto q = p->second.find(client);
5666 if (q != p->second.end())
5667 dirty_caps = q->second.dirty_caps;
5668 }
5669 in->choose_lock_states(dirty_caps);
5670 dout(15) << " chose lock states on " << *in << dendl;
5671 }
5672
5673 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5674 cap_reconnect_waiters.find(in->ino());
5675 if (it != cap_reconnect_waiters.end()) {
5676 mds->queue_waiters(it->second);
5677 cap_reconnect_waiters.erase(it);
5678 }
5679 }
5680 }
5681
5682
5683
5684 // -------
5685 // cap imports and delayed snap parent opens
5686
5687 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5688 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5689 int peer, int p_flags)
5690 {
5691 client_t client = session->info.inst.name.num();
5692 SnapRealm *realm = in->find_snaprealm();
5693 if (realm->have_past_parents_open()) {
5694 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5695 if (cap->get_last_seq() == 0) // reconnected cap
5696 cap->inc_last_seq();
5697 cap->set_last_issue();
5698 cap->set_last_issue_stamp(ceph_clock_now());
5699 cap->clear_new();
5700 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5701 in->ino(),
5702 realm->inode->ino(),
5703 cap->get_cap_id(), cap->get_last_seq(),
5704 cap->pending(), cap->wanted(), 0,
5705 cap->get_mseq(), mds->get_osd_epoch_barrier());
5706 in->encode_cap_message(reap, cap);
5707 realm->build_snap_trace(reap->snapbl);
5708 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5709 mds->send_message_client_counted(reap, session);
5710 } else {
5711 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5712 << cap->get_mseq() << " on " << *in << dendl;
5713 in->auth_pin(this);
5714 cap->inc_suppress();
5715 delayed_imported_caps[client].insert(in);
5716 missing_snap_parents[in].size();
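    // (as in check_realm_past_parents(): the .size() call only serves to
    //  default-construct the map entry, registering 'in' for open_snap_parents())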
5717 }
5718 }
5719
5720 void MDCache::do_delayed_cap_imports()
5721 {
5722 dout(10) << "do_delayed_cap_imports" << dendl;
5723
5724 assert(delayed_imported_caps.empty());
5725 }
5726
5727 struct C_MDC_OpenSnapParents : public MDCacheContext {
5728 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5729 void finish(int r) override {
5730 mdcache->open_snap_parents();
5731 }
5732 };
5733
5734 void MDCache::open_snap_parents()
5735 {
5736 dout(10) << "open_snap_parents" << dendl;
5737
5738 map<client_t,MClientSnap*> splits;
5739 MDSGatherBuilder gather(g_ceph_context);
5740
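  // walk the inodes whose past snap parents were missing: entries whose parents
  // are now open are finished here; the rest add a sub to the gather, and we
  // re-run open_snap_parents() when the gather completes.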
5741 auto p = missing_snap_parents.begin();
5742 while (p != missing_snap_parents.end()) {
5743 CInode *in = p->first;
5744 assert(in->snaprealm);
5745 if (in->snaprealm->open_parents(gather.new_sub())) {
5746 dout(10) << " past parents now open on " << *in << dendl;
5747
5748 for (CInode *child : p->second) {
5749 auto q = reconnected_caps.find(child->ino());
5750 assert(q != reconnected_caps.end());
5751 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5752 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5753 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5754 }
5755 // make sure client's cap is in the correct snaprealm.
5756 if (r->second.realm_ino != in->ino()) {
5757 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5758 }
5759 }
5760 }
5761
5762 missing_snap_parents.erase(p++);
5763
5764 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5765
5766 // finish off client snaprealm reconnects?
5767 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5768 if (q != reconnected_snaprealms.end()) {
5769 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5770 r != q->second.end();
5771 ++r)
5772 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5773 reconnected_snaprealms.erase(q);
5774 }
5775 } else {
5776 dout(10) << " opening past parents on " << *in << dendl;
5777 ++p;
5778 }
5779 }
5780
5781 send_snaps(splits);
5782
5783 if (gather.has_subs()) {
5784 dout(10) << "open_snap_parents - waiting for "
5785 << gather.num_subs_remaining() << dendl;
5786 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5787 gather.activate();
5788 } else {
5789 if (!reconnected_snaprealms.empty()) {
5790 stringstream warn_str;
5791 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5792 p != reconnected_snaprealms.end();
5793 ++p) {
5794 warn_str << " unconnected snaprealm " << p->first << "\n";
5795 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5796 q != p->second.end();
5797 ++q)
5798 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5799 }
5800 mds->clog->warn() << "open_snap_parents has:";
5801 mds->clog->warn(warn_str);
5802 }
5803 assert(rejoin_waiters.empty());
5804 assert(missing_snap_parents.empty());
5805 dout(10) << "open_snap_parents - all open" << dendl;
5806 do_delayed_cap_imports();
5807
5808 assert(rejoin_done);
5809 rejoin_done.release()->complete(0);
5810 reconnected_caps.clear();
5811 }
5812 }
5813
5814 bool MDCache::open_undef_inodes_dirfrags()
5815 {
5816 dout(10) << "open_undef_inodes_dirfrags "
5817 << rejoin_undef_inodes.size() << " inodes "
5818 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5819
5820 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5821
5822 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5823 p != rejoin_undef_inodes.end();
5824 ++p) {
5825 CInode *in = *p;
5826 assert(!in->is_base());
5827 fetch_queue.insert(in->get_parent_dir());
5828 }
5829
5830 if (fetch_queue.empty())
5831 return false;
5832
5833 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5834 for (set<CDir*>::iterator p = fetch_queue.begin();
5835 p != fetch_queue.end();
5836 ++p) {
5837 CDir *dir = *p;
5838 CInode *diri = dir->get_inode();
5839 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5840 continue;
5841 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5842 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5843 dir->fetch(gather.new_sub());
5844 }
5845 assert(gather.has_subs());
5846 gather.activate();
5847 return true;
5848 }
5849
5850 void MDCache::opened_undef_inode(CInode *in) {
5851 dout(10) << "opened_undef_inode " << *in << dendl;
5852 rejoin_undef_inodes.erase(in);
5853 if (in->is_dir()) {
5854 // FIXME: re-hash dentries if necessary
5855 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5856 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5857 CDir *dir = in->get_dirfrag(frag_t());
5858 assert(dir);
5859 rejoin_undef_dirfrags.erase(dir);
5860 in->force_dirfrags();
5861 list<CDir*> ls;
5862 in->get_dirfrags(ls);
5863 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5864 rejoin_undef_dirfrags.insert(*p);
5865 }
5866 }
5867 }
5868
5869 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5870 {
5871 if (seq < realm->get_newest_seq()) {
5872 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5873 << realm->get_newest_seq()
5874 << " on " << *realm << dendl;
5875 // send an update
5876 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5877 if (session) {
5878 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5879 realm->build_snap_trace(snap->bl);
5880 mds->send_message_client_counted(snap, session);
5881 } else {
5882 dout(10) << " ...or not, no session for this client!" << dendl;
5883 }
5884 } else {
5885 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5886 << " on " << *realm << dendl;
5887 }
5888 }
5889
5890
5891
5892 void MDCache::rejoin_send_acks()
5893 {
5894 dout(7) << "rejoin_send_acks" << dendl;
5895
5896 // replicate stray
5897 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5898 p != rejoin_unlinked_inodes.end();
5899 ++p) {
5900 for (set<CInode*>::iterator q = p->second.begin();
5901 q != p->second.end();
5902 ++q) {
5903 CInode *in = *q;
5904 dout(7) << " unlinked inode " << *in << dendl;
5905 // inode expired
5906 if (!in->is_replica(p->first))
5907 continue;
5908 while (1) {
5909 CDentry *dn = in->get_parent_dn();
5910 if (dn->is_replica(p->first))
5911 break;
5912 dn->add_replica(p->first);
5913 CDir *dir = dn->get_dir();
5914 if (dir->is_replica(p->first))
5915 break;
5916 dir->add_replica(p->first);
5917 in = dir->get_inode();
5918 if (in->is_replica(p->first))
5919 break;
5920 if (in->is_base())
5921 break;
5922 }
5923 }
5924 }
5925 rejoin_unlinked_inodes.clear();
5926
5927 // send acks to everyone in the recovery set
5928 map<mds_rank_t,MMDSCacheRejoin*> ack;
5929 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5930 p != recovery_set.end();
5931 ++p)
5932 ack[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5933
5934 // walk subtrees
5935 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5936 p != subtrees.end();
5937 ++p) {
5938 CDir *dir = p->first;
5939 if (!dir->is_auth())
5940 continue;
5941 dout(10) << "subtree " << *dir << dendl;
5942
5943 // auth items in this subtree
5944 list<CDir*> dq;
5945 dq.push_back(dir);
5946
5947 while (!dq.empty()) {
5948 CDir *dir = dq.front();
5949 dq.pop_front();
5950
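      // for each replica of this dirfrag/dentry/inode, re-send its strong state
      // with a freshly bumped replica nonce (++r->second)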
5951 // dir
5952 for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
5953 r != dir->replicas_end();
5954 ++r) {
5955 ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
5956 ack[r->first]->add_dirfrag_base(dir);
5957 }
5958
5959 for (CDir::map_t::iterator q = dir->items.begin();
5960 q != dir->items.end();
5961 ++q) {
5962 CDentry *dn = q->second;
5963 CDentry::linkage_t *dnl = dn->get_linkage();
5964
5965 // inode
5966 CInode *in = NULL;
5967 if (dnl->is_primary())
5968 in = dnl->get_inode();
5969
5970 // dentry
5971 for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
5972 r != dn->replicas_end();
5973 ++r) {
5974 ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
5975 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
5976 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
5977 dnl->is_remote() ? dnl->get_remote_d_type():0,
5978 ++r->second,
5979 dn->lock.get_replica_state());
5980 // peer missed MDentrylink message ?
5981 if (in && !in->is_replica(r->first))
5982 in->add_replica(r->first);
5983 }
5984
5985 if (!in)
5986 continue;
5987
5988 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
5989 r != in->replicas_end();
5990 ++r) {
5991 ack[r->first]->add_inode_base(in, mds->mdsmap->get_up_features());
5992 bufferlist bl;
5993 in->_encode_locks_state_for_rejoin(bl, r->first);
5994 ack[r->first]->add_inode_locks(in, ++r->second, bl);
5995 }
5996
5997 // subdirs in this subtree?
5998 in->get_nested_dirfrags(dq);
5999 }
6000 }
6001 }
6002
6003 // base inodes too
6004 if (root && root->is_auth())
6005 for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
6006 r != root->replicas_end();
6007 ++r) {
6008 ack[r->first]->add_inode_base(root, mds->mdsmap->get_up_features());
6009 bufferlist bl;
6010 root->_encode_locks_state_for_rejoin(bl, r->first);
6011 ack[r->first]->add_inode_locks(root, ++r->second, bl);
6012 }
6013 if (myin)
6014 for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
6015 r != myin->replicas_end();
6016 ++r) {
6017 ack[r->first]->add_inode_base(myin, mds->mdsmap->get_up_features());
6018 bufferlist bl;
6019 myin->_encode_locks_state_for_rejoin(bl, r->first);
6020 ack[r->first]->add_inode_locks(myin, ++r->second, bl);
6021 }
6022
6023 // include inode base for any inodes whose scatterlocks may have updated
6024 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6025 p != rejoin_potential_updated_scatterlocks.end();
6026 ++p) {
6027 CInode *in = *p;
6028 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
6029 r != in->replicas_end();
6030 ++r)
6031 ack[r->first]->add_inode_base(in, mds->mdsmap->get_up_features());
6032 }
6033
6034 // send acks
6035 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = ack.begin();
6036 p != ack.end();
6037 ++p) {
6038 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6039 mds->send_message_mds(p->second, p->first);
6040 }
6041
6042 rejoin_imported_caps.clear();
6043 }
6044
6045
6046 void MDCache::reissue_all_caps()
6047 {
6048 dout(10) << "reissue_all_caps" << dendl;
6049
6050 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6051 p != inode_map.end();
6052 ++p) {
6053 CInode *in = p->second;
6054 if (in->is_head() && in->is_any_caps()) {
6055 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6056 mds->locker->issue_caps(in);
6057 }
6058 }
6059 }
6060
6061
6062 // ===============================================================================
6063
6064 struct C_MDC_QueuedCow : public MDCacheContext {
6065 CInode *in;
6066 MutationRef mut;
6067 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6068 MDCacheContext(mdc), in(i), mut(m) {}
6069 void finish(int r) override {
6070 mdcache->_queued_file_recover_cow(in, mut);
6071 }
6072 };
6073
6074
6075 void MDCache::queue_file_recover(CInode *in)
6076 {
6077 dout(10) << "queue_file_recover " << *in << dendl;
6078 assert(in->is_auth());
6079
6080 // cow?
6081 /*
6082 SnapRealm *realm = in->find_snaprealm();
6083 set<snapid_t> s = realm->get_snaps();
6084 while (!s.empty() && *s.begin() < in->first)
6085 s.erase(s.begin());
6086 while (!s.empty() && *s.rbegin() > in->last)
6087 s.erase(*s.rbegin());
6088 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6089 if (s.size() > 1) {
6090 inode_t *pi = in->project_inode();
6091 pi->version = in->pre_dirty();
6092
6093 auto mut(std::make_shared<MutationImpl>());
6094 mut->ls = mds->mdlog->get_current_segment();
6095 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6096 mds->mdlog->start_entry(le);
6097 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6098
6099 s.erase(*s.begin());
6100 while (!s.empty()) {
6101 snapid_t snapid = *s.begin();
6102 CInode *cow_inode = 0;
6103 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6104 assert(cow_inode);
6105 recovery_queue.enqueue(cow_inode);
6106 s.erase(*s.begin());
6107 }
6108
6109 in->parent->first = in->first;
6110 le->metablob.add_primary_dentry(in->parent, in, true);
6111 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6112 mds->mdlog->flush();
6113 }
6114 */
6115
6116 recovery_queue.enqueue(in);
6117 }
6118
6119 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6120 {
6121 in->pop_and_dirty_projected_inode(mut->ls);
6122 mut->apply();
6123 mds->locker->drop_locks(mut.get());
6124 mut->cleanup();
6125 }
6126
6127
6128 /*
6129 * called after recovery to recover file sizes for files that were previously
6130 * opened for write; that is, those where max_size > size.
6131 */
6132 void MDCache::identify_files_to_recover()
6133 {
6134 dout(10) << "identify_files_to_recover" << dendl;
6135 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6136 p != inode_map.end();
6137 ++p) {
6138 CInode *in = p->second;
6139 if (!in->is_auth())
6140 continue;
6141
6142 if (in->last != CEPH_NOSNAP)
6143 continue;
6144
6145 // Only normal files need file size recovery
6146 if (!in->is_file()) {
6147 continue;
6148 }
6149
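    // recover the file if any client has a writeable range recorded but no
    // longer holds a cap on the inode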
6150 bool recover = false;
6151 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6152 p != in->inode.client_ranges.end();
6153 ++p) {
6154 Capability *cap = in->get_client_cap(p->first);
6155 if (!cap) {
6156 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6157 recover = true;
6158 break;
6159 }
6160 }
6161
6162 if (recover) {
6163 if (in->filelock.is_stable()) {
6164 in->auth_pin(&in->filelock);
6165 } else {
6166 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6167 }
6168 in->filelock.set_state(LOCK_PRE_SCAN);
6169 rejoin_recover_q.push_back(in);
6170 } else {
6171 rejoin_check_q.push_back(in);
6172 }
6173 }
6174 }
6175
6176 void MDCache::start_files_to_recover()
6177 {
6178 for (CInode *in : rejoin_check_q) {
6179 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6180 mds->locker->issue_caps(in);
6181 mds->locker->check_inode_max_size(in);
6182 }
6183 rejoin_check_q.clear();
6184 for (CInode *in : rejoin_recover_q) {
6185 mds->locker->file_recover(&in->filelock);
6186 }
6187 if (!rejoin_recover_q.empty()) {
6188 rejoin_recover_q.clear();
6189 do_file_recover();
6190 }
6191 }
6192
6193 void MDCache::do_file_recover()
6194 {
6195 recovery_queue.advance();
6196 }
6197
6198 // ===============================================================================
6199
6200
6201 // ----------------------------
6202 // truncate
6203
6204 class C_MDC_RetryTruncate : public MDCacheContext {
6205 CInode *in;
6206 LogSegment *ls;
6207 public:
6208 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6209 MDCacheContext(c), in(i), ls(l) {}
6210 void finish(int r) override {
6211 mdcache->_truncate_inode(in, ls);
6212 }
6213 };
6214
6215 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6216 {
6217 inode_t *pi = in->get_projected_inode();
6218 dout(10) << "truncate_inode "
6219 << pi->truncate_from << " -> " << pi->truncate_size
6220 << " on " << *in
6221 << dendl;
6222
6223 ls->truncating_inodes.insert(in);
6224 in->get(CInode::PIN_TRUNCATING);
6225 in->auth_pin(this);
6226
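  // if clients still need to flush snapped, buffered data, defer the actual
  // truncate until the snap flush completes (C_MDC_RetryTruncate re-enters
  // _truncate_inode)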
6227 if (!in->client_need_snapflush.empty() &&
6228 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6229 assert(in->filelock.is_xlocked());
6230 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6231 mds->locker->issue_caps(in);
6232 return;
6233 }
6234
6235 _truncate_inode(in, ls);
6236 }
6237
6238 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6239 CInode *in;
6240 LogSegment *ls;
6241 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6242 MDCacheIOContext(c), in(i), ls(l) {}
6243 void finish(int r) override {
6244 assert(r == 0 || r == -ENOENT);
6245 mdcache->truncate_inode_finish(in, ls);
6246 }
6247 };
6248
6249 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6250 {
6251 inode_t *pi = &in->inode;
6252 dout(10) << "_truncate_inode "
6253 << pi->truncate_from << " -> " << pi->truncate_size
6254 << " on " << *in << dendl;
6255
6256 assert(pi->is_truncating());
6257 assert(pi->truncate_size < (1ULL << 63));
6258 assert(pi->truncate_from < (1ULL << 63));
6259 assert(pi->truncate_size < pi->truncate_from);
6260
6261
6262 SnapRealm *realm = in->find_snaprealm();
6263 SnapContext nullsnap;
6264 const SnapContext *snapc;
6265 if (realm) {
6266 dout(10) << " realm " << *realm << dendl;
6267 snapc = &realm->get_snap_context();
6268 } else {
6269 dout(10) << " NO realm, using null context" << dendl;
6270 snapc = &nullsnap;
6271 assert(in->last == CEPH_NOSNAP);
6272 }
6273 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
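  // trim the file data in [truncate_size, truncate_from) from the data pool;
  // the completion bounces through the finisher into truncate_inode_finish()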
6274 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6275 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6276 pi->truncate_seq, ceph::real_time::min(), 0,
6277 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6278 mds->finisher));
6279 }
6280
6281 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6282 CInode *in;
6283 MutationRef mut;
6284 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6285 MDCacheLogContext(m), in(i), mut(mu) {}
6286 void finish(int r) override {
6287 mdcache->truncate_inode_logged(in, mut);
6288 }
6289 };
6290
6291 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6292 {
6293 dout(10) << "truncate_inode_finish " << *in << dendl;
6294
6295 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6296 assert(p != ls->truncating_inodes.end());
6297 ls->truncating_inodes.erase(p);
6298
6299 // update
6300 inode_t *pi = in->project_inode();
6301 pi->version = in->pre_dirty();
6302 pi->truncate_from = 0;
6303 pi->truncate_pending--;
6304
6305 MutationRef mut(new MutationImpl());
6306 mut->ls = mds->mdlog->get_current_segment();
6307 mut->add_projected_inode(in);
6308
6309 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6310 mds->mdlog->start_entry(le);
6311 CDentry *dn = in->get_projected_parent_dn();
6312 le->metablob.add_dir_context(dn->get_dir());
6313 le->metablob.add_primary_dentry(dn, in, true);
6314 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6315
6316 journal_dirty_inode(mut.get(), &le->metablob, in);
6317 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6318
6319 // flush immediately if there are readers/writers waiting
6320 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6321 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6322 mds->mdlog->flush();
6323 }
6324
6325 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6326 {
6327 dout(10) << "truncate_inode_logged " << *in << dendl;
6328 mut->apply();
6329 mds->locker->drop_locks(mut.get());
6330 mut->cleanup();
6331
6332 in->put(CInode::PIN_TRUNCATING);
6333 in->auth_unpin(this);
6334
6335 list<MDSInternalContextBase*> waiters;
6336 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6337 mds->queue_waiters(waiters);
6338 }
6339
6340
6341 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6342 {
6343 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6344 << ls->seq << "/" << ls->offset << dendl;
6345 ls->truncating_inodes.insert(in);
6346 in->get(CInode::PIN_TRUNCATING);
6347 }
6348
6349 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6350 {
6351 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6352 << ls->seq << "/" << ls->offset << dendl;
6353 // if we have the logseg the truncate started in, it must be in our list.
6354 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6355 assert(p != ls->truncating_inodes.end());
6356 ls->truncating_inodes.erase(p);
6357 in->put(CInode::PIN_TRUNCATING);
6358 }
6359
6360 void MDCache::start_recovered_truncates()
6361 {
6362 dout(10) << "start_recovered_truncates" << dendl;
6363 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6364 p != mds->mdlog->segments.end();
6365 ++p) {
6366 LogSegment *ls = p->second;
6367 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6368 q != ls->truncating_inodes.end();
6369 ++q) {
6370 CInode *in = *q;
6371 in->auth_pin(this);
6372
6373 if (!in->client_need_snapflush.empty() &&
6374 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6375 assert(in->filelock.is_stable());
6376 in->filelock.set_state(LOCK_XLOCKDONE);
6377 in->auth_pin(&in->filelock);
6378 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6379 // start_files_to_recover will revoke caps
6380 continue;
6381 }
6382 _truncate_inode(in, ls);
6383 }
6384 }
6385 }
6386
6387
6388
6389
6390
6391
6392 // ================================================================================
6393 // cache trimming
6394
6395
6396 /*
6397 * note: only called while MDS is active or stopping... NOT during recovery.
6398 * however, we may expire a replica whose authority is recovering.
6399 *
6400 */
6401 bool MDCache::trim(int max, int count)
6402 {
6403 // trim LRU
6404 if (count > 0) {
6405 max = lru.lru_get_size() - count;
6406 if (max <= 0)
6407 max = 1;
6408 } else if (max < 0) {
6409 max = g_conf->mds_cache_size;
6410 if (max <= 0)
6411 return false;
6412 }
6413 dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl;
6414
6415 // process delayed eval_stray()
6416 stray_manager.advance_delayed();
6417
6418 map<mds_rank_t, MCacheExpire*> expiremap;
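  // per-rank batches of cache-expire notices; filled in by trim_dentry/
  // trim_dirfrag/trim_inode and sent via send_expire_messages() below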
6419 bool is_standby_replay = mds->is_standby_replay();
6420 int unexpirable = 0;
6421 list<CDentry*> unexpirables;
6422
6423 // trim dentries from the LRU: only enough to satisfy `max`,
6424 // unless we see null dentries at the bottom of the LRU,
6425 // in which case trim all those.
6426 bool trimming_nulls = true;
6427 while (trimming_nulls || lru.lru_get_size() + unexpirable > (unsigned)max) {
6428 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6429 if (!dn) {
6430 break;
6431 }
6432 if (!dn->get_linkage()->is_null()) {
6433 trimming_nulls = false;
6434 if (lru.lru_get_size() + unexpirable < (unsigned)max) {
6435 unexpirables.push_back(dn);
6436 break;
6437 }
6438 }
6439 if ((is_standby_replay && dn->get_linkage()->inode &&
6440 dn->get_linkage()->inode->item_open_file.is_on_list()) ||
6441 trim_dentry(dn, expiremap)) {
6442 unexpirables.push_back(dn);
6443 ++unexpirable;
6444 }
6445 }
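  // put back anything we could not (or chose not to) expire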
6446 for(list<CDentry*>::iterator i = unexpirables.begin();
6447 i != unexpirables.end();
6448 ++i)
6449 lru.lru_insert_mid(*i);
6450
6451 // trim non-auth, non-bound subtrees
6452 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6453 p != subtrees.end();) {
6454 CDir *dir = p->first;
6455 ++p;
6456 if (!dir->is_auth() && !dir->get_inode()->is_auth()) {
6457 // don't trim subtree root if its auth MDS is recovering.
6458 // This simplifies the cache rejoin code.
6459 if (dir->is_subtree_root() &&
6460 rejoin_ack_gather.count(dir->get_dir_auth().first))
6461 continue;
6462 if (dir->get_num_ref() == 1) // subtree pin
6463 trim_dirfrag(dir, 0, expiremap);
6464 }
6465 }
6466
6467 // trim root?
6468 if (max == 0 && root) {
6469 list<CDir*> ls;
6470 root->get_dirfrags(ls);
6471 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6472 CDir *dir = *p;
6473 if (dir->get_num_ref() == 1) // subtree pin
6474 trim_dirfrag(dir, 0, expiremap);
6475 }
6476 if (root->get_num_ref() == 0)
6477 trim_inode(0, root, 0, expiremap);
6478 }
6479
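  // for any other rank that is stopping, try to expire its ~mdsdir subtree from
  // our cache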
6480 std::set<mds_rank_t> stopping;
6481 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6482 stopping.erase(mds->get_nodeid());
6483 for (auto rank : stopping) {
6484 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6485 if (!mdsdir_in)
6486 continue;
6487
6488 if (expiremap.count(rank) == 0) {
6489 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6490 }
6491
6492 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;
6493
6494 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6495 if (!aborted) {
6496 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6497 list<CDir*> ls;
6498 mdsdir_in->get_dirfrags(ls);
6499 for (auto dir : ls) {
6500 if (dir->get_num_ref() == 1) // subtree pin
6501 trim_dirfrag(dir, dir, expiremap);
6502 }
6503 if (mdsdir_in->get_num_ref() == 0)
6504 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6505 } else {
6506 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6507 }
6508 }
6509
6510 // Other rank's base inodes (when I'm stopping)
6511 if (max == 0) {
6512 for (set<CInode*>::iterator p = base_inodes.begin();
6513 p != base_inodes.end(); ++p) {
6514 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6515 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6516 if ((*p)->get_num_ref() == 0) {
6517 trim_inode(NULL, *p, NULL, expiremap);
6518 }
6519 }
6520 }
6521 }
6522
6523 // send any expire messages
6524 send_expire_messages(expiremap);
6525
6526 return true;
6527 }
6528
6529 void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6530 {
6531 // send expires
6532 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6533 it != expiremap.end();
6534 ++it) {
6535 if (mds->is_cluster_degraded() &&
6536 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6537 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6538 rejoin_sent.count(it->first) == 0))) {
6539 it->second->put();
6540 continue;
6541 }
6542 dout(7) << "sending cache_expire to " << it->first << dendl;
6543 mds->send_message_mds(it->second, it->first);
6544 }
6545 }
6546
6547
6548 bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6549 {
6550 dout(12) << "trim_dentry " << *dn << dendl;
6551
6552 CDentry::linkage_t *dnl = dn->get_linkage();
6553
6554 CDir *dir = dn->get_dir();
6555 assert(dir);
6556
6557 CDir *con = get_subtree_root(dir);
6558 if (con)
6559 dout(12) << " in container " << *con << dendl;
6560 else {
6561 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6562 assert(dn->is_auth());
6563 }
6564
6565 // If a replica dentry is not readable, it's likely we will receive an
6566 // MDentryLink/MDentryUnlink message soon (it's possible we first
6567 // receive an MDentryUnlink message, then an MDentryLink message).
6568 // An MDentryLink message only replicates an inode, so we should
6569 // avoid trimming the inode's parent dentry, because
6570 // unconnected replicas are problematic for subtree migration.
6571 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6572 !dn->get_dir()->get_inode()->is_stray())
6573 return true;
6574
6575 // adjust the dir state
6576 // NOTE: we can safely remove a clean, null dentry without affecting
6577 // directory completeness.
6578 // (check this _before_ we unlink the inode, below!)
6579 bool clear_complete = false;
6580 if (!(dnl->is_null() && dn->is_clean()))
6581 clear_complete = true;
6582
6583 // unlink the dentry
6584 if (dnl->is_remote()) {
6585 // just unlink.
6586 dir->unlink_inode(dn);
6587 } else if (dnl->is_primary()) {
6588 // expire the inode, too.
6589 CInode *in = dnl->get_inode();
6590 assert(in);
6591 if (trim_inode(dn, in, con, expiremap))
6592 return true; // purging stray instead of trimming
6593 } else {
6594 assert(dnl->is_null());
6595 }
6596
6597 if (!dn->is_auth()) {
6598 // notify dentry authority.
6599 mds_authority_t auth = dn->authority();
6600
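    // notify both members of the authority pair (if they differ); skip ourselves,
    // and don't send expires while the containing subtree is being imported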
6601 for (int p=0; p<2; p++) {
6602 mds_rank_t a = auth.first;
6603 if (p) a = auth.second;
6604 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6605 if (mds->get_nodeid() == auth.second &&
6606 con->is_importing()) break; // don't send any expire while importing.
6607 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6608
6609 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6610 assert(a != mds->get_nodeid());
6611 if (expiremap.count(a) == 0)
6612 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6613 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
6614 }
6615 }
6616
6617 // remove dentry
6618 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6619 dir->add_to_bloom(dn);
6620 dir->remove_dentry(dn);
6621
6622 if (clear_complete)
6623 dir->state_clear(CDir::STATE_COMPLETE);
6624
6625 // reexport?
6626 if (dir->get_num_head_items() == 0 && dir->is_subtree_root())
6627 migrator->export_empty_import(dir);
6628
6629 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6630 return false;
6631 }
6632
6633
6634 void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6635 {
6636 dout(15) << "trim_dirfrag " << *dir << dendl;
6637
6638 if (dir->is_subtree_root()) {
6639 assert(!dir->is_auth() ||
6640 (!dir->is_replicated() && dir->inode->is_base()));
6641 remove_subtree(dir); // remove from subtree map
6642 }
6643 assert(dir->get_num_ref() == 0);
6644
6645 CInode *in = dir->get_inode();
6646
6647 if (!dir->is_auth()) {
6648 mds_authority_t auth = dir->authority();
6649
6650 // was this an auth delegation? (if so, slightly modified container)
6651 dirfrag_t condf;
6652 if (dir->is_subtree_root()) {
6653 dout(12) << " subtree root, container is " << *dir << dendl;
6654 con = dir;
6655 condf = dir->dirfrag();
6656 } else {
6657 condf = con->dirfrag();
6658 }
6659
6660 for (int p=0; p<2; p++) {
6661 mds_rank_t a = auth.first;
6662 if (p) a = auth.second;
6663 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6664 if (mds->get_nodeid() == auth.second &&
6665 con->is_importing()) break; // don't send any expire while importing.
6666 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6667
6668 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6669 assert(a != mds->get_nodeid());
6670 if (expiremap.count(a) == 0)
6671 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6672 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6673 }
6674 }
6675
6676 in->close_dirfrag(dir->dirfrag().frag);
6677 }
6678
6679 /**
6680 * Try trimming an inode from the cache
6681 *
6682 * @return true if the inode is still in cache, else false if it was trimmed
6683 */
6684 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6685 {
6686 dout(15) << "trim_inode " << *in << dendl;
6687 assert(in->get_num_ref() == 0);
6688
6689 if (in->is_dir()) {
6690 // If replica inode's dirfragtreelock is not readable, it's likely
6691 // some dirfrags of the inode are being fragmented and we will receive
6692 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6693 // dirfrags, so we should avoid trimming these dirfrags' parent inode,
6694 // because unconnected replicas are problematic for
6695 // subtree migration.
6696 //
6697 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6698 return true;
6699
6700 // DIR
6701 list<CDir*> dfls;
6702 in->get_dirfrags(dfls);
6703 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6704 CDir *dir = *p;
6705 assert(!dir->is_subtree_root());
6706 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6707 }
6708 }
6709
6710 // INODE
6711 if (in->is_auth()) {
6712 // eval stray after closing dirfrags
6713 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6714 maybe_eval_stray(in);
6715 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6716 return true;
6717 }
6718 } else {
6719 mds_authority_t auth = in->authority();
6720
6721 dirfrag_t df;
6722 if (con)
6723 df = con->dirfrag();
6724 else
6725 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6726
6727 for (int p=0; p<2; p++) {
6728 mds_rank_t a = auth.first;
6729 if (p) a = auth.second;
6730 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6731 if (con && mds->get_nodeid() == auth.second &&
6732 con->is_importing()) break; // don't send any expire while importing.
6733 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6734
6735 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6736 assert(a != mds->get_nodeid());
6737 if (expiremap.count(a) == 0)
6738 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6739 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6740 }
6741 }
6742
6743 /*
6744 if (in->is_auth()) {
6745 if (in->hack_accessed)
6746 mds->logger->inc("outt");
6747 else {
6748 mds->logger->inc("outut");
6749 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6750 }
6751 }
6752 */
6753
6754 // unlink
6755 if (dn)
6756 dn->get_dir()->unlink_inode(dn);
6757 remove_inode(in);
6758 return false;
6759 }
6760
6761
6762 /**
6763 * trim_non_auth - remove any non-auth items from our cache
6764 *
6765 * this reduces the amount of non-auth metadata in our cache, reducing the
6766 * load incurred by the rejoin phase.
6767 *
6768 * the only non-auth items that remain are those that are needed to
6769 * attach our own subtrees to the root.
6770 *
6771 * when we are done, all dentries will be in the top bit of the lru.
6772 *
6773 * why we have to do this:
6774 * we may not have accurate linkage for non-auth items, which means we may not
6775 * know which subtree an item falls into, and cannot be sure to declare it to the
6776 * correct authority.
6777 */
6778 void MDCache::trim_non_auth()
6779 {
6780 dout(7) << "trim_non_auth" << dendl;
6781
6782 // temporarily pin all subtree roots
6783 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6784 p != subtrees.end();
6785 ++p)
6786 p->first->get(CDir::PIN_SUBTREETEMP);
6787
6788 // note first auth item we see.
6789 // when we see it the second time, stop.
6790 CDentry *first_auth = 0;
6791
6792 // trim non-auth items from the lru
6793 while (lru.lru_get_size() > 0) {
6794 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6795 if (!dn) break;
6796 CDentry::linkage_t *dnl = dn->get_linkage();
6797
6798 if (dn->is_auth()) {
6799 // add back into lru (at the top)
6800 lru.lru_insert_top(dn);
6801
6802 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6803 dn->unlink_remote(dnl);
6804
6805 if (!first_auth) {
6806 first_auth = dn;
6807 } else {
6808 if (first_auth == dn)
6809 break;
6810 }
6811 } else {
6812 // non-auth. expire.
6813 CDir *dir = dn->get_dir();
6814 assert(dir);
6815
6816 // unlink the dentry
6817 dout(10) << " removing " << *dn << dendl;
6818 if (dnl->is_remote()) {
6819 dir->unlink_inode(dn);
6820 }
6821 else if (dnl->is_primary()) {
6822 CInode *in = dnl->get_inode();
6823 dout(10) << " removing " << *in << dendl;
6824 list<CDir*> ls;
6825 in->get_dirfrags(ls);
6826 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6827 CDir *subdir = *p;
6828 assert(!subdir->is_subtree_root());
6829 in->close_dirfrag(subdir->dirfrag().frag);
6830 }
6831 dir->unlink_inode(dn);
6832 remove_inode(in);
6833 }
6834 else {
6835 assert(dnl->is_null());
6836 }
6837
6838 assert(!dir->has_bloom());
6839 dir->remove_dentry(dn);
6840 // adjust the dir state
6841 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6842 // close empty non-auth dirfrag
6843 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6844 dir->inode->close_dirfrag(dir->get_frag());
6845 }
6846 }
6847
6848 // move everything in the pintail to the top bit of the lru.
6849 lru.lru_touch_entire_pintail();
6850
6851 // unpin all subtrees
6852 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6853 p != subtrees.end();
6854 ++p)
6855 p->first->put(CDir::PIN_SUBTREETEMP);
6856
6857 if (lru.lru_get_size() == 0) {
6858 // root, stray, etc.?
6859 ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6860 while (p != inode_map.end()) {
6861 ceph::unordered_map<vinodeno_t,CInode*>::iterator next = p;
6862 ++next;
6863 CInode *in = p->second;
6864 if (!in->is_auth()) {
6865 list<CDir*> ls;
6866 in->get_dirfrags(ls);
6867 for (list<CDir*>::iterator p = ls.begin();
6868 p != ls.end();
6869 ++p) {
6870 dout(10) << " removing " << **p << dendl;
6871 assert((*p)->get_num_ref() == 1); // SUBTREE
6872 remove_subtree((*p));
6873 in->close_dirfrag((*p)->dirfrag().frag);
6874 }
6875 dout(10) << " removing " << *in << dendl;
6876 assert(!in->get_parent_dn());
6877 assert(in->get_num_ref() == 0);
6878 remove_inode(in);
6879 }
6880 p = next;
6881 }
6882 }
6883
6884 show_subtrees();
6885 }
6886
6887 /**
6888 * Recursively trim the subtree rooted at directory to remove all
6889 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6890 * of those links. This is used to clear invalid data out of the cache.
6891 * Note that it doesn't clear the passed-in directory, since that's not
6892 * always safe.
6893 */
6894 bool MDCache::trim_non_auth_subtree(CDir *dir)
6895 {
6896 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6897
6898 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6899
6900 CDir::map_t::iterator j = dir->begin();
6901 CDir::map_t::iterator i = j;
6902 while (j != dir->end()) {
6903 i = j++;
6904 CDentry *dn = i->second;
6905 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6906 CDentry::linkage_t *dnl = dn->get_linkage();
6907 if (dnl->is_primary()) { // check for subdirectories, etc
6908 CInode *in = dnl->get_inode();
6909 bool keep_inode = false;
6910 if (in->is_dir()) {
6911 list<CDir*> subdirs;
6912 in->get_dirfrags(subdirs);
6913 for (list<CDir*>::iterator subdir = subdirs.begin();
6914 subdir != subdirs.end();
6915 ++subdir) {
6916 if ((*subdir)->is_subtree_root()) {
6917 keep_inode = true;
6918 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
6919 } else {
6920 if (trim_non_auth_subtree(*subdir))
6921 keep_inode = true;
6922 else {
6923 in->close_dirfrag((*subdir)->get_frag());
6924 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
6925 }
6926 }
6927 }
6928
6929 }
6930 if (!keep_inode) { // remove it!
6931 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
6932 dir->unlink_inode(dn);
6933 remove_inode(in);
6934 assert(!dir->has_bloom());
6935 dir->remove_dentry(dn);
6936 } else {
6937 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
6938 dn->state_clear(CDentry::STATE_AUTH);
6939 in->state_clear(CInode::STATE_AUTH);
6940 }
6941 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
6942 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
6943 } else { // just remove it
6944 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
6945 if (dnl->is_remote())
6946 dir->unlink_inode(dn);
6947 dir->remove_dentry(dn);
6948 }
6949 }
6950 dir->state_clear(CDir::STATE_AUTH);
6951 /**
6952 * We've now checked all our children and deleted those that need it.
6953 * Now return to caller, and tell them if *we're* a keeper.
6954 */
6955 return keep_dir || dir->get_num_any();
6956 }
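/*
 * A minimal, hypothetical caller sketch for the trim above (illustrative
 * only; it simply mirrors the pattern try_trim_non_auth_subtree() uses
 * below).  The return value is a "keep me" flag: true when this dirfrag,
 * or something beneath it, must stay in cache.
 *
 *   if (!trim_non_auth_subtree(dir)) {
 *     // nothing under 'dir' needs to survive; the caller may drop it
 *     remove_subtree(dir);
 *     dir->mark_clean();
 *     dir->get_inode()->close_dirfrag(dir->get_frag());
 *   }
 */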
6957
6958 /*
6959 * during replay, when we determine a subtree is no longer ours, we
6960 * try to trim it from our cache. because subtrees must be connected
6961 * to the root, the fact that we can trim this tree may mean that our
6962 * children or parents can also be trimmed.
6963 */
6964 void MDCache::try_trim_non_auth_subtree(CDir *dir)
6965 {
6966 dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
6967
6968 // can we now trim child subtrees?
6969 set<CDir*> bounds;
6970 get_subtree_bounds(dir, bounds);
6971 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
6972 CDir *bd = *p;
6973 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
6974 bd->get_num_any() == 0 && // and empty
6975 can_trim_non_auth_dirfrag(bd)) {
6976 CInode *bi = bd->get_inode();
6977 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
6978 remove_subtree(bd);
6979 bd->mark_clean();
6980 bi->close_dirfrag(bd->get_frag());
6981 }
6982 }
6983
6984 if (trim_non_auth_subtree(dir)) {
6985 // keep
6986 try_subtree_merge(dir);
6987 } else {
6988 // can we trim this subtree (and possibly our ancestors) too?
6989 while (true) {
6990 CInode *diri = dir->get_inode();
6991 if (diri->is_base()) {
6992 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
6993 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
6994 remove_subtree(dir);
6995 dir->mark_clean();
6996 diri->close_dirfrag(dir->get_frag());
6997
6998 dout(10) << " removing " << *diri << dendl;
6999 assert(!diri->get_parent_dn());
7000 assert(diri->get_num_ref() == 0);
7001 remove_inode(diri);
7002 }
7003 break;
7004 }
7005
7006 CDir *psub = get_subtree_root(diri->get_parent_dir());
7007 dout(10) << " parent subtree is " << *psub << dendl;
7008 if (psub->get_dir_auth().first == mds->get_nodeid())
7009 break; // we are auth, keep.
7010
7011 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7012 remove_subtree(dir);
7013 dir->mark_clean();
7014 diri->close_dirfrag(dir->get_frag());
7015
7016 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7017 if (trim_non_auth_subtree(psub))
7018 break;
7019 dir = psub;
7020 }
7021 }
7022
7023 show_subtrees();
7024 }
7025
7026 void MDCache::standby_trim_segment(LogSegment *ls)
7027 {
7028 ls->new_dirfrags.clear_list();
7029 ls->open_files.clear_list();
7030
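// Note (assumption): each of the "clean" calls below (mark_clean(),
// clear_dirty_parent(), remove_dirty()) also unlinks the object from the
// segment's dirty list, which is what lets these while(!empty()) loops
// terminate.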
7031 while (!ls->dirty_dirfrags.empty()) {
7032 CDir *dir = ls->dirty_dirfrags.front();
7033 dir->mark_clean();
7034 }
7035 while (!ls->dirty_inodes.empty()) {
7036 CInode *in = ls->dirty_inodes.front();
7037 in->mark_clean();
7038 }
7039 while (!ls->dirty_dentries.empty()) {
7040 CDentry *dn = ls->dirty_dentries.front();
7041 dn->mark_clean();
7042 }
7043 while (!ls->dirty_parent_inodes.empty()) {
7044 CInode *in = ls->dirty_parent_inodes.front();
7045 in->clear_dirty_parent();
7046 }
7047 while (!ls->dirty_dirfrag_dir.empty()) {
7048 CInode *in = ls->dirty_dirfrag_dir.front();
7049 in->filelock.remove_dirty();
7050 }
7051 while (!ls->dirty_dirfrag_nest.empty()) {
7052 CInode *in = ls->dirty_dirfrag_nest.front();
7053 in->nestlock.remove_dirty();
7054 }
7055 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7056 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7057 in->dirfragtreelock.remove_dirty();
7058 }
7059 }
7060
7061 /* This function DOES put the passed message before returning */
7062 void MDCache::handle_cache_expire(MCacheExpire *m)
7063 {
7064 mds_rank_t from = mds_rank_t(m->get_from());
7065
7066 dout(7) << "cache_expire from mds." << from << dendl;
7067
7068 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7069 m->put();
7070 return;
7071 }
7072
7073 set<SimpleLock *> gather_locks;
7074 // loop over realms
7075 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7076 p != m->realms.end();
7077 ++p) {
7078 // check container?
7079 if (p->first.ino > 0) {
7080 CInode *expired_inode = get_inode(p->first.ino);
7081 assert(expired_inode); // we had better have this.
7082 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7083 assert(parent_dir);
7084
7085 int export_state = -1;
7086 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7087 export_state = migrator->get_export_state(parent_dir);
7088 assert(export_state >= 0);
7089 }
7090
7091 if (!parent_dir->is_auth() ||
7092 (export_state != -1 &&
7093 ((export_state == Migrator::EXPORT_WARNING &&
7094 migrator->export_has_warned(parent_dir,from)) ||
7095 export_state == Migrator::EXPORT_EXPORTING ||
7096 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7097 (export_state == Migrator::EXPORT_NOTIFYING &&
7098 !migrator->export_has_notified(parent_dir,from))))) {
7099
7100 // not auth.
7101 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7102 assert(parent_dir->is_frozen_tree_root());
7103
7104 // make a message container
7105 if (delayed_expire[parent_dir].count(from) == 0)
7106 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7107
7108 // merge these expires into it
7109 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7110 continue;
7111 }
7112 assert(export_state <= Migrator::EXPORT_PREPPING ||
7113 (export_state == Migrator::EXPORT_WARNING &&
7114 !migrator->export_has_warned(parent_dir, from)));
7115
7116 dout(7) << "expires for " << *parent_dir << dendl;
7117 } else {
7118 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7119 }
7120
7121 // INODES
7122 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7123 it != p->second.inodes.end();
7124 ++it) {
7125 CInode *in = get_inode(it->first);
7126 unsigned nonce = it->second;
7127
7128 if (!in) {
7129 dout(0) << " inode expire on " << it->first << " from " << from
7130 << ", don't have it" << dendl;
7131 assert(in);
7132 }
7133 assert(in->is_auth());
7134 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7135
7136 // check nonce
7137 if (nonce == in->get_replica_nonce(from)) {
7138 // remove from our cached_by
7139 dout(7) << " inode expire on " << *in << " from mds." << from
7140 << " cached_by was " << in->get_replicas() << dendl;
7141 inode_remove_replica(in, from, false, gather_locks);
7142 }
7143 else {
7144 // this is an old nonce, ignore expire.
7145 dout(7) << " inode expire on " << *in << " from mds." << from
7146 << " with old nonce " << nonce
7147 << " (current " << in->get_replica_nonce(from) << "), dropping"
7148 << dendl;
7149 }
7150 }
7151
7152 // DIRS
7153 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7154 it != p->second.dirs.end();
7155 ++it) {
7156 CDir *dir = get_dirfrag(it->first);
7157 unsigned nonce = it->second;
7158
7159 if (!dir) {
7160 CInode *diri = get_inode(it->first.ino);
7161 if (diri) {
7162 if (mds->is_rejoin() &&
7163 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7164 !diri->is_replica(from)) {
7165 list<CDir*> ls;
7166 diri->get_nested_dirfrags(ls);
7167 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7168 << " while rejoining, inode isn't replicated" << dendl;
7169 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7170 dir = *q;
7171 if (dir->is_replica(from)) {
7172 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7173 dir->remove_replica(from);
7174 }
7175 }
7176 continue;
7177 }
7178 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7179 if (other) {
7180 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7181 << " have " << *other << ", mismatched frags, dropping" << dendl;
7182 continue;
7183 }
7184 }
7185 dout(0) << " dir expire on " << it->first << " from " << from
7186 << ", don't have it" << dendl;
7187 assert(dir);
7188 }
7189 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7190
7191 assert(dir->is_auth());
7192
7193 // check nonce
7194 if (nonce == dir->get_replica_nonce(from)) {
7195 // remove from our cached_by
7196 dout(7) << " dir expire on " << *dir << " from mds." << from
7197 << " replicas was " << dir->replica_map << dendl;
7198 dir->remove_replica(from);
7199 }
7200 else {
7201 // this is an old nonce, ignore expire.
7202 dout(7) << " dir expire on " << *dir << " from mds." << from
7203 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7204 << "), dropping" << dendl;
7205 }
7206 }
7207
7208 // DENTRIES
7209 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7210 pd != p->second.dentries.end();
7211 ++pd) {
7212 dout(10) << " dn expires in dir " << pd->first << dendl;
7213 CInode *diri = get_inode(pd->first.ino);
7214 assert(diri);
7215 CDir *dir = diri->get_dirfrag(pd->first.frag);
7216
7217 if (!dir) {
7218 dout(0) << " dn expires on " << pd->first << " from " << from
7219 << ", must have refragmented" << dendl;
7220 } else {
7221 assert(dir->is_auth());
7222 }
7223
7224 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7225 p != pd->second.end();
7226 ++p) {
7227 unsigned nonce = p->second;
7228 CDentry *dn;
7229
7230 if (dir) {
7231 dn = dir->lookup(p->first.first, p->first.second);
7232 } else {
7233 // which dirfrag for this dentry?
7234 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7235 assert(dir);
7236 assert(dir->is_auth());
7237 dn = dir->lookup(p->first.first, p->first.second);
7238 }
7239
7240 if (!dn) {
7241 if (dir)
7242 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7243 else
7244 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7245 }
7246 assert(dn);
7247
7248 if (nonce == dn->get_replica_nonce(from)) {
7249 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7250 dentry_remove_replica(dn, from, gather_locks);
7251 }
7252 else {
7253 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7254 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7255 << "), dropping" << dendl;
7256 }
7257 }
7258 }
7259 }
7260
7261 // done
7262 m->put();
7263
7264 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7265 if (!(*p)->is_stable())
7266 mds->locker->eval_gather(*p);
7267 }
7268 }
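/*
 * An illustrative interleaving (hypothetical ranks) showing why the nonce
 * checks above matter:
 *   1. mds.B replicates an inode from us with nonce 1, later decides to
 *      drop it and sends an expire carrying nonce 1.
 *   2. Before that expire arrives, we replicate the inode to mds.B again,
 *      bumping its replica nonce to 2.
 *   3. The stale expire arrives; 1 != get_replica_nonce(B) == 2, so it is
 *      ignored and the newer replica registration survives.
 */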
7269
7270 void MDCache::process_delayed_expire(CDir *dir)
7271 {
7272 dout(7) << "process_delayed_expire on " << *dir << dendl;
7273 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7274 p != delayed_expire[dir].end();
7275 ++p)
7276 handle_cache_expire(p->second);
7277 delayed_expire.erase(dir);
7278 }
7279
7280 void MDCache::discard_delayed_expire(CDir *dir)
7281 {
7282 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7283 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7284 p != delayed_expire[dir].end();
7285 ++p)
7286 p->second->put();
7287 delayed_expire.erase(dir);
7288 }
7289
7290 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7291 set<SimpleLock *>& gather_locks)
7292 {
7293 in->remove_replica(from);
7294 in->mds_caps_wanted.erase(from);
7295
7296 // note: this code calls _eval more often than it needs to!
7297 // fix lock
7298 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7299 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7300 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7301 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7302 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7303 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7304
7305 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
7306 // don't remove the recovering mds from the lock's gathering list,
7307 // because it may hold rejoined wrlocks.
7308 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7309 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7310 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7311 }
7312
7313 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7314 {
7315 dn->remove_replica(from);
7316
7317 // fix lock
7318 if (dn->lock.remove_replica(from))
7319 gather_locks.insert(&dn->lock);
7320
7321 // Replicated strays might now be eligible for purge
7322 CDentry::linkage_t *dnl = dn->get_linkage();
7323 if (dnl->is_primary()) {
7324 maybe_eval_stray(dnl->get_inode());
7325 }
7326 }
7327
7328 void MDCache::trim_client_leases()
7329 {
7330 utime_t now = ceph_clock_now();
7331
7332 dout(10) << "trim_client_leases" << dendl;
7333
7334 for (int pool=0; pool<client_lease_pools; pool++) {
7335 int before = client_leases[pool].size();
7336 if (client_leases[pool].empty())
7337 continue;
7338
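// Note (assumption): the per-pool lease list is kept ordered by increasing
// ttl, so the loop below can stop at the first lease that has not expired.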
7339 while (!client_leases[pool].empty()) {
7340 ClientLease *r = client_leases[pool].front();
7341 if (r->ttl > now) break;
7342 CDentry *dn = static_cast<CDentry*>(r->parent);
7343 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7344 dn->remove_client_lease(r, mds->locker);
7345 }
7346 int after = client_leases[pool].size();
7347 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7348 << (before-after) << " leases, " << after << " left" << dendl;
7349 }
7350 }
7351
7352
7353 void MDCache::check_memory_usage()
7354 {
7355 static MemoryModel mm(g_ceph_context);
7356 static MemoryModel::snap last;
7357 mm.sample(&last);
7358 static MemoryModel::snap baseline = last;
7359
7360 // check client caps
7361 assert(CInode::count() == inode_map.size());
7362 float caps_per_inode = 0.0;
7363 if (CInode::count())
7364 caps_per_inode = (float)Capability::count() / (float)CInode::count();
7365
7366 dout(2) << "check_memory_usage"
7367 << " total " << last.get_total()
7368 << ", rss " << last.get_rss()
7369 << ", heap " << last.get_heap()
7370 << ", baseline " << baseline.get_heap()
7371 << ", buffers " << (buffer::get_total_alloc() >> 10)
7372 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7373 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7374 << dendl;
7375
7376 mds->mlogger->set(l_mdm_rss, last.get_rss());
7377 mds->mlogger->set(l_mdm_heap, last.get_heap());
7378
7379 if (num_inodes_with_caps > g_conf->mds_cache_size) {
7380 float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
7381 if (ratio < 1.0) {
7382 last_recall_state = ceph_clock_now();
7383 mds->server->recall_client_state(ratio);
7384 }
7385 }
7386
7387 // If the cache size had exceeded its limit, but we're back in bounds
7388 // now, free any unused pool memory so that our memory usage isn't
7389 // permanently bloated.
7390 if (exceeded_size_limit
7391 && CInode::count() <=
7392 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
7393 // Only do this once we are back in bounds: otherwise the releases would
7394 // slow down whatever process caused us to exceed bounds to begin with
7395 if (ceph_using_tcmalloc()) {
7396 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7397 << dendl;
7398 ceph_heap_release_free_memory();
7399 }
7400 exceeded_size_limit = false;
7401 }
7402 }
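/*
 * A worked example of the recall ratio above, with made-up numbers: if
 * mds_cache_size = 100000 and 120000 inodes currently carry client caps,
 * then ratio = 100000 * 0.9 / 120000 = 0.75, i.e. recall_client_state()
 * asks clients to shrink their cap counts to roughly 75% of what they
 * hold now.
 */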
7403
7404
7405
7406 // =========================================================================================
7407 // shutdown
7408
7409 class C_MDC_ShutdownCheck : public MDCacheContext {
7410 public:
7411 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7412 void finish(int) override {
7413 mdcache->shutdown_check();
7414 }
7415 };
7416
7417 void MDCache::shutdown_check()
7418 {
7419 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7420
7421 // cache
7422 char old_val[32] = { 0 };
7423 char *o = old_val;
7424 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7425 g_conf->set_val("debug_mds", "10");
7426 g_conf->apply_changes(NULL);
7427 show_cache();
7428 g_conf->set_val("debug_mds", old_val);
7429 g_conf->apply_changes(NULL);
7430 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7431
7432 // this
7433 dout(0) << "lru size now " << lru.lru_get_size() << dendl;
7434 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7435
7436
7437 if (mds->objecter->is_active()) {
7438 dout(0) << "objecter still active" << dendl;
7439 mds->objecter->dump_active();
7440 }
7441 }
7442
7443
7444 void MDCache::shutdown_start()
7445 {
7446 dout(2) << "shutdown_start" << dendl;
7447
7448 if (g_conf->mds_shutdown_check)
7449 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7450
7451 // g_conf->debug_mds = 10;
7452 }
7453
7454
7455
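/*
 * A hedged summary of one shutdown pass, as read from the code below:
 * export strays to mds.0, unpin our stray dirs, trim the cache, hand
 * non-mdsdir auth subtrees back to their authority (falling back to
 * mds.0), close client sessions, drop the mydir subtree, trim and then
 * cap the journal, write the now-empty journal head, wait for the
 * objecter to drain, and finally report success once the LRU is empty.
 * Any stage that is not yet finished returns false so the pass is retried
 * later.
 */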
7456 bool MDCache::shutdown_pass()
7457 {
7458 dout(7) << "shutdown_pass" << dendl;
7459
7460 if (mds->is_stopped()) {
7461 dout(7) << " already shut down" << dendl;
7462 show_cache();
7463 show_subtrees();
7464 return true;
7465 }
7466
7467 // empty stray dir
7468 if (!shutdown_export_strays()) {
7469 dout(7) << "waiting for strays to migrate" << dendl;
7470 return false;
7471 }
7472
7473 // drop our reference to our stray dir inode
7474 for (int i = 0; i < NUM_STRAY; ++i) {
7475 if (strays[i] &&
7476 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7477 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7478 strays[i]->put(CInode::PIN_STRAY);
7479 strays[i]->put_stickydirs();
7480 }
7481 }
7482
7483 // trim cache
7484 trim(0);
7485 dout(5) << "lru size now " << lru.lru_get_size() << dendl;
7486
7487 // SUBTREES
7488 int num_auth_subtree = 0;
7489 if (!subtrees.empty() &&
7490 mds->get_nodeid() != 0 &&
7491 migrator->get_export_queue_size() == 0) {
7492 dout(7) << "looking for subtrees to export to mds0" << dendl;
7493 list<CDir*> ls;
7494 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7495 it != subtrees.end();
7496 ++it) {
7497 CDir *dir = it->first;
7498 if (dir->get_inode()->is_mdsdir())
7499 continue;
7500 if (dir->is_auth()) {
7501 num_auth_subtree++;
7502 if (dir->is_frozen() ||
7503 dir->is_freezing() ||
7504 dir->is_ambiguous_dir_auth() ||
7505 dir->state_test(CDir::STATE_EXPORTING))
7506 continue;
7507 ls.push_back(dir);
7508 }
7509 }
7510 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7511 CDir *dir = *p;
7512 mds_rank_t dest = dir->get_inode()->authority().first;
7513 if (dest > 0 && !mds->mdsmap->is_active(dest))
7514 dest = 0;
7515 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7516 migrator->export_dir_nicely(dir, dest);
7517 }
7518 }
7519
7520 if (num_auth_subtree > 0) {
7521 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7522 show_subtrees();
7523 return false;
7524 }
7525
7526 // close out any sessions (and open files!) before we try to trim the log, etc.
7527 if (mds->sessionmap.have_unclosed_sessions()) {
7528 if (!mds->server->terminating_sessions)
7529 mds->server->terminate_sessions();
7530 return false;
7531 }
7532
7533 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7534 if (mydir && !mydir->is_subtree_root())
7535 mydir = NULL;
7536
7537 // subtrees map not empty yet?
7538 if (subtrees.size() > (mydir ? 1 : 0)) {
7539 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7540 show_subtrees();
7541 migrator->show_importing();
7542 migrator->show_exporting();
7543 if (!migrator->is_importing() && !migrator->is_exporting())
7544 show_cache();
7545 return false;
7546 }
7547 assert(!migrator->is_exporting());
7548 assert(!migrator->is_importing());
7549
7550 // make mydir subtree go away
7551 if (mydir) {
7552 adjust_subtree_auth(mydir, CDIR_AUTH_UNKNOWN);
7553 remove_subtree(mydir);
7554 }
7555 assert(subtrees.empty());
7556
7557 // Still replicas of mydir?
7558 if ((mydir != NULL) && mydir->inode->is_replicated()) {
7559 // We do this because otherwise acks to locks could come in after
7560 // we cap the log.
7561 dout(7) << "waiting for mydir replicas to release: " << *mydir << dendl;
7562 return false;
7563 }
7564
7565 // flush what we can from the log
7566 mds->mdlog->trim(0);
7567 if (mds->mdlog->get_num_segments() > 1) {
7568 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7569 return false;
7570 }
7571
7572 // (only do this once!)
7573 if (!mds->mdlog->is_capped()) {
7574 dout(7) << "capping the log" << dendl;
7575 mds->mdlog->cap();
7576 mds->mdlog->trim();
7577 }
7578
7579 if (!mds->mdlog->empty()) {
7580 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7581 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7582 return false;
7583 }
7584
7585 if (!did_shutdown_log_cap) {
7586 // flush journal header
7587 dout(7) << "writing header for (now-empty) journal" << dendl;
7588 assert(mds->mdlog->empty());
7589 mds->mdlog->write_head(0);
7590 // NOTE: filer active checker below will block us until this completes.
7591 did_shutdown_log_cap = true;
7592 return false;
7593 }
7594
7595 // filer active?
7596 if (mds->objecter->is_active()) {
7597 dout(7) << "objecter still active" << dendl;
7598 mds->objecter->dump_active();
7599 return false;
7600 }
7601
7602 // trim what we can from the cache
7603 if (lru.lru_get_size() > 0) {
7604 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl;
7605 show_cache();
7606 //dump();
7607 return false;
7608 }
7609
7610 // done!
7611 dout(2) << "shutdown done." << dendl;
7612 return true;
7613 }
7614
7615 bool MDCache::shutdown_export_strays()
7616 {
7617 if (mds->get_nodeid() == 0)
7618 return true;
7619
7620 dout(10) << "shutdown_export_strays" << dendl;
7621
7622 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7623
7624 bool done = true;
7625
7626 list<CDir*> dfs;
7627 for (int i = 0; i < NUM_STRAY; ++i) {
7628 if (!strays[i]) {
7629 continue;
7630 }
7631 strays[i]->get_dirfrags(dfs);
7632 }
7633
7634 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7635 dfs_i != dfs.end(); ++dfs_i)
7636 {
7637 CDir *dir = *dfs_i;
7638
7639 if (!dir->is_complete()) {
7640 dir->fetch(0);
7641 done = false;
7642 if (!mds0_active)
7643 break;
7644 }
7645
7646 for (CDir::map_t::iterator p = dir->items.begin();
7647 p != dir->items.end();
7648 ++p) {
7649 CDentry *dn = p->second;
7650 CDentry::linkage_t *dnl = dn->get_linkage();
7651 if (dnl->is_null())
7652 continue;
7653 done = false;
7654 if (!mds0_active)
7655 break;
7656
7657 if (dn->state_test(CDentry::STATE_PURGING)) {
7658 // Don't try to migrate anything that is actually
7659 // being purged right now
7660 continue;
7661 }
7662
7663 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7664 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7665 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7666 } else {
7667 dout(10) << "already exporting " << *dn << dendl;
7668 }
7669 }
7670 }
7671
7672 return done;
7673 }
7674
7675 // ========= messaging ==============
7676
7677 /* This function DOES put the passed message before returning */
7678 void MDCache::dispatch(Message *m)
7679 {
7680 switch (m->get_type()) {
7681
7682 // RESOLVE
7683 case MSG_MDS_RESOLVE:
7684 handle_resolve(static_cast<MMDSResolve*>(m));
7685 break;
7686 case MSG_MDS_RESOLVEACK:
7687 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7688 break;
7689
7690 // REJOIN
7691 case MSG_MDS_CACHEREJOIN:
7692 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7693 break;
7694
7695 case MSG_MDS_DISCOVER:
7696 handle_discover(static_cast<MDiscover*>(m));
7697 break;
7698 case MSG_MDS_DISCOVERREPLY:
7699 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7700 break;
7701
7702 case MSG_MDS_DIRUPDATE:
7703 handle_dir_update(static_cast<MDirUpdate*>(m));
7704 break;
7705
7706 case MSG_MDS_CACHEEXPIRE:
7707 handle_cache_expire(static_cast<MCacheExpire*>(m));
7708 break;
7709
7710 case MSG_MDS_DENTRYLINK:
7711 handle_dentry_link(static_cast<MDentryLink*>(m));
7712 break;
7713 case MSG_MDS_DENTRYUNLINK:
7714 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7715 break;
7716
7717 case MSG_MDS_FRAGMENTNOTIFY:
7718 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7719 break;
7720
7721 case MSG_MDS_FINDINO:
7722 handle_find_ino(static_cast<MMDSFindIno *>(m));
7723 break;
7724 case MSG_MDS_FINDINOREPLY:
7725 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7726 break;
7727
7728 case MSG_MDS_OPENINO:
7729 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7730 break;
7731 case MSG_MDS_OPENINOREPLY:
7732 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7733 break;
7734
7735 default:
7736 derr << "cache unknown message " << m->get_type() << dendl;
7737 assert(0 == "cache unknown message");
7738 }
7739 }
7740
7741 MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7742 {
7743 if (mdr) {
7744 dout(20) << "_get_waiter retryrequest" << dendl;
7745 return new C_MDS_RetryRequest(this, mdr);
7746 } else if (req) {
7747 dout(20) << "_get_waiter retrymessage" << dendl;
7748 return new C_MDS_RetryMessage(mds, req);
7749 } else {
7750 return fin;
7751 }
7752 }
7753
7754 int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7755 const filepath& path, // what
7756 vector<CDentry*> *pdnvec, // result
7757 CInode **pin,
7758 int onfail)
7759 {
7760 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7761 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7762 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7763
7764 assert(mdr || req || fin);
7765 assert(!forward || mdr || req); // forward requires a request
7766
7767 snapid_t snapid = CEPH_NOSNAP;
7768 if (mdr)
7769 mdr->snapid = snapid;
7770
7771 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7772
7773 if (mds->logger) mds->logger->inc(l_mds_traverse);
7774
7775 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7776 CInode *cur = get_inode(path.get_ino());
7777 if (cur == NULL) {
7778 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7779 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7780 else {
7781 //ceph_abort(); // hrm.. broken
7782 return -ESTALE;
7783 }
7784 return 1;
7785 }
7786 if (cur->state_test(CInode::STATE_PURGING))
7787 return -ESTALE;
7788
7789 // make sure snaprealm are open...
7790 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7791 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7792 return 1;
7793 }
7794
7795 // start trace
7796 if (pdnvec)
7797 pdnvec->clear();
7798 if (pin)
7799 *pin = cur;
7800
7801 unsigned depth = 0;
7802 while (depth < path.depth()) {
7803 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7804 << "' snapid " << snapid << dendl;
7805
7806 if (!cur->is_dir()) {
7807 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7808 return -ENOTDIR;
7809 }
7810
7811 // walk into snapdir?
7812 if (path[depth].length() == 0) {
7813 dout(10) << "traverse: snapdir" << dendl;
7814 if (!mdr)
7815 return -EINVAL;
7816 snapid = CEPH_SNAPDIR;
7817 mdr->snapid = snapid;
7818 depth++;
7819 continue;
7820 }
7821 // walk thru snapdir?
7822 if (snapid == CEPH_SNAPDIR) {
7823 if (!mdr)
7824 return -EINVAL;
7825 SnapRealm *realm = cur->find_snaprealm();
7826 snapid = realm->resolve_snapname(path[depth], cur->ino());
7827 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7828 if (!snapid)
7829 return -ENOENT;
7830 mdr->snapid = snapid;
7831 depth++;
7832 continue;
7833 }
7834
7835 // open dir
7836 frag_t fg = cur->pick_dirfrag(path[depth]);
7837 CDir *curdir = cur->get_dirfrag(fg);
7838 if (!curdir) {
7839 if (cur->is_auth()) {
7840 // parent dir frozen_dir?
7841 if (cur->is_frozen()) {
7842 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7843 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7844 return 1;
7845 }
7846 curdir = cur->get_or_open_dirfrag(this, fg);
7847 } else {
7848 // discover?
7849 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7850 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7851 null_okay);
7852 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7853 return 1;
7854 }
7855 }
7856 assert(curdir);
7857
7858 #ifdef MDS_VERIFY_FRAGSTAT
7859 if (curdir->is_complete())
7860 curdir->verify_fragstat();
7861 #endif
7862
7863 // frozen?
7864 /*
7865 if (curdir->is_frozen()) {
7866 // doh!
7867 // FIXME: traverse is allowed?
7868 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7869 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7870 if (onfinish) delete onfinish;
7871 return 1;
7872 }
7873 */
7874
7875 // Before doing dirfrag->dn lookup, compare with DamageTable's
7876 // record of which dentries were unreadable
7877 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7878 dout(4) << "traverse: stopped lookup at damaged dentry "
7879 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7880 return -EIO;
7881 }
7882
7883 // dentry
7884 CDentry *dn = curdir->lookup(path[depth], snapid);
7885 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7886
7887 // null and last_bit and xlocked by me?
7888 if (dnl && dnl->is_null() && null_okay) {
7889 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7890 if (pdnvec)
7891 pdnvec->push_back(dn);
7892 if (pin)
7893 *pin = 0;
7894 break; // done!
7895 }
7896
7897 if (dnl &&
7898 dn->lock.is_xlocked() &&
7899 dn->lock.get_xlock_by() != mdr &&
7900 !dn->lock.can_read(client) &&
7901 (dnl->is_null() || forward)) {
7902 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7903 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7904 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7905 mds->mdlog->flush();
7906 return 1;
7907 }
7908
7909 // can we conclude ENOENT?
7910 if (dnl && dnl->is_null()) {
7911 if (dn->lock.can_read(client) ||
7912 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
7913 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
7914 if (pdnvec) {
7915 if (depth == path.depth() - 1)
7916 pdnvec->push_back(dn);
7917 else
7918 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
7919 }
7920 return -ENOENT;
7921 } else {
7922 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
7923 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7924 return 1;
7925 }
7926 }
7927
7928 if (dnl && !dnl->is_null()) {
7929 CInode *in = dnl->get_inode();
7930
7931 // do we have inode?
7932 if (!in) {
7933 assert(dnl->is_remote());
7934 // do i have it?
7935 in = get_inode(dnl->get_remote_ino());
7936 if (in) {
7937 dout(7) << "linking in remote in " << *in << dendl;
7938 dn->link_remote(dnl, in);
7939 } else {
7940 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
7941 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7942 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
7943 dout(4) << "traverse: remote dentry points to damaged ino "
7944 << *dn << dendl;
7945 return -EIO;
7946 }
7947 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
7948 (null_okay && depth == path.depth() - 1));
7949 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
7950 return 1;
7951 }
7952 }
7953
7954 cur = in;
7955 // make sure snaprealm are open...
7956 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7957 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7958 return 1;
7959 }
7960
7961 // add to trace, continue.
7962 touch_inode(cur);
7963 if (pdnvec)
7964 pdnvec->push_back(dn);
7965 if (pin)
7966 *pin = cur;
7967 depth++;
7968 continue;
7969 }
7970
7971
7972 // MISS. dentry doesn't exist.
7973 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
7974
7975 if (curdir->is_auth()) {
7976 // dentry is mine.
7977 if (curdir->is_complete() ||
7978 (snapid == CEPH_NOSNAP &&
7979 curdir->has_bloom() &&
7980 !curdir->is_in_bloom(path[depth]))){
7981 // file not found
7982 if (pdnvec) {
7983 // instantiate a null dn?
7984 if (depth < path.depth()-1){
7985 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
7986 dn = NULL;
7987 } else if (dn) {
7988 ceph_abort(); // should have fallen out in ->is_null() check above
7989 } else if (curdir->is_frozen()) {
7990 dout(20) << " not adding null to frozen dir " << dendl;
7991 } else if (snapid < CEPH_MAXSNAP) {
7992 dout(20) << " not adding null for snapid " << snapid << dendl;
7993 } else {
7994 // create a null dentry
7995 dn = curdir->add_null_dentry(path[depth]);
7996 dout(20) << " added null " << *dn << dendl;
7997 }
7998 if (dn)
7999 pdnvec->push_back(dn);
8000 else
8001 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8002 }
8003 return -ENOENT;
8004 } else {
8005
8006 // Check DamageTable for missing fragments before trying to fetch
8007 // this
8008 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8009 dout(4) << "traverse: damaged dirfrag " << *curdir
8010 << ", blocking fetch" << dendl;
8011 return -EIO;
8012 }
8013
8014 // directory isn't complete; reload
8015 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8016 touch_inode(cur);
8017 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8018 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8019 return 1;
8020 }
8021 } else {
8022 // dirfrag/dentry is not mine.
8023 mds_authority_t dauth = curdir->authority();
8024
8025 if (forward &&
8026 snapid && mdr && mdr->client_request &&
8027 (int)depth < mdr->client_request->get_num_fwd()) {
8028 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8029 << " < fwd " << mdr->client_request->get_num_fwd()
8030 << ", discovering instead of forwarding" << dendl;
8031 discover = true;
8032 }
8033
8034 if ((discover || null_okay)) {
8035 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8036 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8037 null_okay);
8038 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8039 return 1;
8040 }
8041 if (forward) {
8042 // forward
8043 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8044
8045 if (curdir->is_ambiguous_auth()) {
8046 // wait
8047 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8048 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8049 return 1;
8050 }
8051
8052 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8053
8054 if (mdr)
8055 request_forward(mdr, dauth.first);
8056 else
8057 mds->forward_message_mds(req, dauth.first);
8058
8059 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8060 assert(fin == NULL);
8061 return 2;
8062 }
8063 }
8064
8065 ceph_abort(); // i shouldn't get here
8066 }
8067
8068 // success.
8069 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8070 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8071 if (mdr)
8072 assert(mdr->snapid == snapid);
8073 return 0;
8074 }
8075
8076 CInode *MDCache::cache_traverse(const filepath& fp)
8077 {
8078 dout(10) << "cache_traverse " << fp << dendl;
8079
8080 CInode *in;
8081 if (fp.get_ino())
8082 in = get_inode(fp.get_ino());
8083 else
8084 in = root;
8085 if (!in)
8086 return NULL;
8087
8088 for (unsigned i = 0; i < fp.depth(); i++) {
8089 const string& dname = fp[i];
8090 frag_t fg = in->pick_dirfrag(dname);
8091 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8092 CDir *curdir = in->get_dirfrag(fg);
8093 if (!curdir)
8094 return NULL;
8095 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8096 if (!dn)
8097 return NULL;
8098 in = dn->get_linkage()->get_inode();
8099 if (!in)
8100 return NULL;
8101 }
8102 dout(10) << " got " << *in << dendl;
8103 return in;
8104 }
8105
8106
8107 /**
8108 * open_remote_dirfrag -- open up a remote dirfrag
8109 *
8110 * @param diri base inode
8111 * @param approxfg approximate fragment.
8112 * @param fin completion callback
8113 */
8114 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8115 {
8116 dout(10) << "open_remote_dir on " << *diri << dendl;
8117
8118 assert(diri->is_dir());
8119 assert(!diri->is_auth());
8120 assert(diri->get_dirfrag(approxfg) == 0);
8121
8122 mds_rank_t auth = diri->authority().first;
8123
8124 if (!mds->is_cluster_degraded() ||
8125 mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
8126 discover_dir_frag(diri, approxfg, fin);
8127 } else {
8128 // mds is down or recovering. forge a replica!
8129 forge_replica_dir(diri, approxfg, auth);
8130 if (fin)
8131 mds->queue_waiter(fin);
8132 }
8133 }
8134
8135
8136 /**
8137 * get_dentry_inode - get or open inode
8138 *
8139 * @param dn the dentry
8140 * @param mdr current request
8141 *
8142 * will return inode for primary, or link up/open up remote link's inode as necessary.
8143 * If it's not available right now, puts mdr on wait list and returns null.
8144 */
8145 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8146 {
8147 CDentry::linkage_t *dnl;
8148 if (projected)
8149 dnl = dn->get_projected_linkage();
8150 else
8151 dnl = dn->get_linkage();
8152
8153 assert(!dnl->is_null());
8154
8155 if (dnl->is_primary())
8156 return dnl->inode;
8157
8158 assert(dnl->is_remote());
8159 CInode *in = get_inode(dnl->get_remote_ino());
8160 if (in) {
8161 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8162 dn->link_remote(dnl, in);
8163 return in;
8164 } else {
8165 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8166 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8167 return 0;
8168 }
8169 }
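/*
 * Hypothetical caller sketch for get_dentry_inode() (illustrative only,
 * mirroring the contract described in the comment above):
 *
 *   CInode *in = mdcache->get_dentry_inode(dn, mdr, true);  // projected
 *   if (!in)
 *     return;  // mdr is parked behind open_remote_dentry(); it will retry
 *   // ... use 'in' ...
 */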
8170
8171 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8172 CDentry *dn;
8173 inodeno_t ino;
8174 MDSInternalContextBase *onfinish;
8175 bool want_xlocked;
8176 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8177 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {}
8178 void finish(int r) override {
8179 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8180 }
8181 };
8182
8183 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8184 {
8185 dout(10) << "open_remote_dentry " << *dn << dendl;
8186 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8187 inodeno_t ino = dnl->get_remote_ino();
8188 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8189 open_ino(ino, pool,
8190 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8191 }
8192
8193 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8194 bool want_xlocked, int r)
8195 {
8196 if (r < 0) {
8197 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8198 dn->state_set(CDentry::STATE_BADREMOTEINO);
8199
8200 std::string path;
8201 CDir *dir = dn->get_dir();
8202 if (dir) {
8203 dir->get_inode()->make_path_string(path);
8204 path = path + "/" + dn->get_name();
8205 }
8206
8207 bool fatal = mds->damage_table.notify_remote_damaged(
8208 dn->get_projected_linkage()->get_remote_ino(), path);
8209 if (fatal) {
8210 mds->damaged();
8211 ceph_abort(); // unreachable, damaged() respawns us
8212 }
8213 }
8214 fin->complete(r < 0 ? r : 0);
8215 }
8216
8217
8218 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8219 {
8220 // empty trace if we're a base inode
8221 if (in->is_base())
8222 return;
8223
8224 CInode *parent = in->get_parent_inode();
8225 assert(parent);
8226 make_trace(trace, parent);
8227
8228 CDentry *dn = in->get_parent_dn();
8229 dout(15) << "make_trace adding " << *dn << dendl;
8230 trace.push_back(dn);
8231 }
8232
8233
8234 // -------------------------------------------------------------------------------
8235 // Open inode by inode number
8236
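/*
 * A hedged overview of the open-by-ino machinery below, as read from the
 * code: (1) fetch the inode's backtrace object from its pool
 * (_open_ino_backtrace_fetched); (2) walk the recorded ancestors locally,
 * fetching or discovering dirfrags as needed (open_ino_traverse_dir);
 * (3) if that is not enough, ask other ranks with MMDSOpenIno and follow
 * their hints (do_open_ino_peer / handle_open_ino_reply).  When the search
 * ends, open_ino_finish() completes the queued waiters with the owning
 * rank or a negative error.
 */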
8237 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8238 inodeno_t ino;
8239 public:
8240 bufferlist bl;
8241 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8242 MDCacheIOContext(c), ino(i) {}
8243 void finish(int r) override {
8244 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8245 }
8246 };
8247
8248 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8249 inodeno_t ino;
8250 MMDSOpenIno *msg;
8251 bool parent;
8252 public:
8253 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8254 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8255 void finish(int r) override {
8256 if (r < 0 && !parent)
8257 r = -EAGAIN;
8258 if (msg) {
8259 mdcache->handle_open_ino(msg, r);
8260 return;
8261 }
8262 assert(mdcache->opening_inodes.count(ino));
8263 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8264 }
8265 };
8266
8267 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8268 inodeno_t ino;
8269 public:
8270 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8271 void finish(int r) override {
8272 mdcache->_open_ino_parent_opened(ino, r);
8273 }
8274 };
8275
8276 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8277 {
8278 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8279
8280 assert(opening_inodes.count(ino));
8281 open_ino_info_t& info = opening_inodes[ino];
8282
8283 CInode *in = get_inode(ino);
8284 if (in) {
8285 dout(10) << " found cached " << *in << dendl;
8286 open_ino_finish(ino, info, in->authority().first);
8287 return;
8288 }
8289
8290 inode_backtrace_t backtrace;
8291 if (err == 0) {
8292 try {
8293 ::decode(backtrace, bl);
8294 } catch (const buffer::error &decode_exc) {
8295 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8296 << std::dec << ": " << decode_exc << dendl;
8297 open_ino_finish(ino, info, -EIO);
8298 return;
8299 }
8300 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8301 dout(10) << " old object in pool " << info.pool
8302 << ", retrying pool " << backtrace.pool << dendl;
8303 info.pool = backtrace.pool;
8304 C_IO_MDC_OpenInoBacktraceFetched *fin =
8305 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8306 fetch_backtrace(ino, info.pool, fin->bl,
8307 new C_OnFinisher(fin, mds->finisher));
8308 return;
8309 }
8310 } else if (err == -ENOENT) {
8311 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8312 if (info.pool != meta_pool) {
8313 dout(10) << " no object in pool " << info.pool
8314 << ", retrying pool " << meta_pool << dendl;
8315 info.pool = meta_pool;
8316 C_IO_MDC_OpenInoBacktraceFetched *fin =
8317 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8318 fetch_backtrace(ino, info.pool, fin->bl,
8319 new C_OnFinisher(fin, mds->finisher));
8320 return;
8321 }
8322 err = 0; // backtrace.ancestors.empty() is checked below
8323 }
8324
8325 if (err == 0) {
8326 if (backtrace.ancestors.empty()) {
8327 dout(10) << " got empty backtrace " << dendl;
8328 err = -EIO;
8329 } else if (!info.ancestors.empty()) {
8330 if (info.ancestors[0] == backtrace.ancestors[0]) {
8331 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8332 err = -EINVAL;
8333 } else {
8334 info.last_err = 0;
8335 }
8336 }
8337 }
8338 if (err) {
8339 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8340 if (info.last_err)
8341 err = info.last_err;
8342 open_ino_finish(ino, info, err);
8343 return;
8344 }
8345
8346 dout(10) << " got backtrace " << backtrace << dendl;
8347 info.ancestors = backtrace.ancestors;
8348
8349 _open_ino_traverse_dir(ino, info, 0);
8350 }
8351
8352 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8353 {
8354 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8355
8356 assert(opening_inodes.count(ino));
8357 open_ino_info_t& info = opening_inodes[ino];
8358
8359 CInode *in = get_inode(ino);
8360 if (in) {
8361 dout(10) << " found cached " << *in << dendl;
8362 open_ino_finish(ino, info, in->authority().first);
8363 return;
8364 }
8365
8366 if (ret == mds->get_nodeid()) {
8367 _open_ino_traverse_dir(ino, info, 0);
8368 } else {
8369 if (ret >= 0) {
8370 mds_rank_t checked_rank = mds_rank_t(ret);
8371 info.check_peers = true;
8372 info.auth_hint = checked_rank;
8373 info.checked.erase(checked_rank);
8374 }
8375 do_open_ino(ino, info, ret);
8376 }
8377 }
8378
8379 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8380 {
8381 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8382
8383 CInode *in = get_inode(ino);
8384 if (in) {
8385 dout(10) << " found cached " << *in << dendl;
8386 open_ino_finish(ino, info, in->authority().first);
8387 return;
8388 }
8389
8390 if (ret) {
8391 do_open_ino(ino, info, ret);
8392 return;
8393 }
8394
8395 mds_rank_t hint = info.auth_hint;
8396 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8397 info.discover, info.want_xlocked, &hint);
8398 if (ret > 0)
8399 return;
8400 if (hint != mds->get_nodeid())
8401 info.auth_hint = hint;
8402 do_open_ino(ino, info, ret);
8403 }
8404
8405 void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8406 {
8407 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8408 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8409 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8410 }
8411
8412 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8413 vector<inode_backpointer_t>& ancestors,
8414 bool discover, bool want_xlocked, mds_rank_t *hint)
8415 {
8416 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8417 int err = 0;
8418 for (unsigned i = 0; i < ancestors.size(); i++) {
8419 CInode *diri = get_inode(ancestors[i].dirino);
8420
8421 if (!diri) {
8422 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8423 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8424 return 1;
8425 }
8426 continue;
8427 }
8428
8429 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8430 CDir *dir = diri->get_parent_dir();
8431 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8432 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8433 dir = dir->get_inode()->get_parent_dir();
8434 _open_ino_fetch_dir(ino, m, dir, i == 0);
8435 return 1;
8436 }
8437
8438 if (!diri->is_dir()) {
8439 dout(10) << " " << *diri << " is not dir" << dendl;
8440 if (i == 0)
8441 err = -ENOTDIR;
8442 break;
8443 }
8444
8445 string &name = ancestors[i].dname;
8446 frag_t fg = diri->pick_dirfrag(name);
8447 CDir *dir = diri->get_dirfrag(fg);
8448 if (!dir) {
8449 if (diri->is_auth()) {
8450 if (diri->is_frozen()) {
8451 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8452 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8453 return 1;
8454 }
8455 dir = diri->get_or_open_dirfrag(this, fg);
8456 } else if (discover) {
8457 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8458 return 1;
8459 }
8460 }
8461 if (dir) {
8462 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8463 CDentry *dn = dir->lookup(name);
8464 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8465 if (dir->is_auth()) {
8466 if (dnl && dnl->is_primary() &&
8467 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8468 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8469 _open_ino_fetch_dir(ino, m, dir, i == 0);
8470 return 1;
8471 }
8472
8473 if (!dnl && !dir->is_complete() &&
8474 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8475 dout(10) << " fetching incomplete " << *dir << dendl;
8476 _open_ino_fetch_dir(ino, m, dir, i == 0);
8477 return 1;
8478 }
8479
8480 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8481 if (i == 0)
8482 err = -ENOENT;
8483 } else if (discover) {
8484 if (!dnl) {
8485 filepath path(name, 0);
8486 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8487 (i == 0 && want_xlocked));
8488 return 1;
8489 }
8490 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8491 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8492 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8493 return 1;
8494 }
8495 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8496 if (i == 0)
8497 err = -ENOENT;
8498 }
8499 }
8500 if (hint && i == 0)
8501 *hint = dir ? dir->authority().first : diri->authority().first;
8502 break;
8503 }
8504 return err;
8505 }
8506
8507 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8508 {
8509 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8510
8511 list<MDSInternalContextBase*> waiters;
8512 waiters.swap(info.waiters);
8513 opening_inodes.erase(ino);
8514 finish_contexts(g_ceph_context, waiters, ret);
8515 }
8516
8517 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8518 {
8519 if (err < 0 && err != -EAGAIN) {
8520 info.checked.clear();
8521 info.checked.insert(mds->get_nodeid());
8522 info.checking = MDS_RANK_NONE;
8523 info.check_peers = true;
8524 info.fetch_backtrace = true;
8525 if (info.discover) {
8526 info.discover = false;
8527 info.ancestors.clear();
8528 }
8529 if (err != -ENOENT && err != -ENOTDIR)
8530 info.last_err = err;
8531 }
8532
8533 if (info.check_peers) {
8534 info.check_peers = false;
8535 info.checking = MDS_RANK_NONE;
8536 do_open_ino_peer(ino, info);
8537 } else if (info.fetch_backtrace) {
8538 info.check_peers = true;
8539 info.fetch_backtrace = false;
8540 info.checking = mds->get_nodeid();
8541 info.checked.clear();
8542 info.checked.insert(mds->get_nodeid());
8543 C_IO_MDC_OpenInoBacktraceFetched *fin =
8544 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8545 fetch_backtrace(ino, info.pool, fin->bl,
8546 new C_OnFinisher(fin, mds->finisher));
8547 } else {
8548 assert(!info.ancestors.empty());
8549 info.checking = mds->get_nodeid();
8550 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8551 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8552 }
8553 }
8554
8555 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8556 {
8557 set<mds_rank_t> all, active;
8558 mds->mdsmap->get_mds_set(all);
8559 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8560 if (mds->get_state() == MDSMap::STATE_REJOIN)
8561 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8562
8563 dout(10) << "do_open_ino_peer " << ino << " active " << active
8564 << " all " << all << " checked " << info.checked << dendl;
8565
8566 mds_rank_t peer = MDS_RANK_NONE;
8567 if (info.auth_hint >= 0) {
8568 if (active.count(info.auth_hint)) {
8569 peer = info.auth_hint;
8570 info.auth_hint = MDS_RANK_NONE;
8571 }
8572 } else {
8573 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8574 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8575 peer = *p;
8576 break;
8577 }
8578 }
8579 if (peer < 0) {
8580 if (all.size() > active.size() && all != info.checked) {
8581 dout(10) << " waiting for more peers to be active" << dendl;
8582 } else {
8583 dout(10) << " all MDS peers have been checked " << dendl;
8584 do_open_ino(ino, info, 0);
8585 }
8586 } else {
8587 info.checking = peer;
8588 vector<inode_backpointer_t> *pa = NULL;
8589 // got backtrace from peer or backtrace just fetched
8590 if (info.discover || !info.fetch_backtrace)
8591 pa = &info.ancestors;
8592 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8593 }
8594 }
8595
8596 void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8597 {
8598 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8599 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8600 m->put();
8601 return;
8602 }
8603
8604 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8605
8606 inodeno_t ino = m->ino;
8607 MMDSOpenInoReply *reply;
8608 CInode *in = get_inode(ino);
8609 if (in) {
8610 dout(10) << " have " << *in << dendl;
8611 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8612 if (in->is_auth()) {
8613 touch_inode(in);
8614 while (1) {
8615 CDentry *pdn = in->get_parent_dn();
8616 if (!pdn)
8617 break;
8618 CInode *diri = pdn->get_dir()->get_inode();
8619 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
8620 in->inode.version));
8621 in = diri;
8622 }
8623 } else {
8624 reply->hint = in->authority().first;
8625 }
8626 } else if (err < 0) {
8627 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8628 } else {
8629 mds_rank_t hint = MDS_RANK_NONE;
8630 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8631 if (ret > 0)
8632 return;
8633 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8634 }
8635 m->get_connection()->send_message(reply);
8636 m->put();
8637 }
8638
8639 void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8640 {
8641 dout(10) << "handle_open_ino_reply " << *m << dendl;
8642
8643 inodeno_t ino = m->ino;
8644 mds_rank_t from = mds_rank_t(m->get_source().num());
8645 auto it = opening_inodes.find(ino);
8646 if (it != opening_inodes.end() && it->second.checking == from) {
8647 open_ino_info_t& info = it->second;
8648 info.checking = MDS_RANK_NONE;
8649 info.checked.insert(from);
8650
8651 CInode *in = get_inode(ino);
8652 if (in) {
8653 dout(10) << " found cached " << *in << dendl;
8654 open_ino_finish(ino, info, in->authority().first);
8655 } else if (!m->ancestors.empty()) {
8656 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8657 if (!info.want_replica) {
8658 open_ino_finish(ino, info, from);
8659 m->put();
8660 return;
8661 }
8662
8663 info.ancestors = m->ancestors;
8664 info.auth_hint = from;
8665 info.checking = mds->get_nodeid();
8666 info.discover = true;
8667 _open_ino_traverse_dir(ino, info, 0);
8668 } else if (m->error) {
8669 dout(10) << " error " << m->error << " from mds." << from << dendl;
8670 do_open_ino(ino, info, m->error);
8671 } else {
8672 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8673 info.auth_hint = m->hint;
8674 info.checked.erase(m->hint);
8675 }
8676 do_open_ino_peer(ino, info);
8677 }
8678 }
8679 m->put();
8680 }
8681
8682 void MDCache::kick_open_ino_peers(mds_rank_t who)
8683 {
8684 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8685
8686 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8687 p != opening_inodes.end();
8688 ++p) {
8689 open_ino_info_t& info = p->second;
8690 if (info.checking == who) {
8691 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8692 info.checking = MDS_RANK_NONE;
8693 do_open_ino_peer(p->first, info);
8694 } else if (info.checking == MDS_RANK_NONE) {
8695 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8696 do_open_ino_peer(p->first, info);
8697 }
8698 }
8699 }
8700
8701 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8702 bool want_replica, bool want_xlocked)
8703 {
8704 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8705 << want_replica << dendl;
8706
8707 if (opening_inodes.count(ino)) {
8708 open_ino_info_t& info = opening_inodes[ino];
8709 if (want_replica) {
8710 info.want_replica = true;
8711 if (want_xlocked && !info.want_xlocked) {
8712 if (!info.ancestors.empty()) {
8713 CInode *diri = get_inode(info.ancestors[0].dirino);
8714 if (diri) {
8715 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8716 CDir *dir = diri->get_dirfrag(fg);
8717 if (dir && !dir->is_auth()) {
8718 filepath path(info.ancestors[0].dname, 0);
8719 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8720 }
8721 }
8722 }
8723 info.want_xlocked = true;
8724 }
8725 }
8726 info.waiters.push_back(fin);
8727 } else {
8728 open_ino_info_t& info = opening_inodes[ino];
8729 info.checked.insert(mds->get_nodeid());
8730 info.want_replica = want_replica;
8731 info.want_xlocked = want_xlocked;
8732 info.tid = ++open_ino_last_tid;
8733 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8734 info.waiters.push_back(fin);
8735 do_open_ino(ino, info, 0);
8736 }
8737 }
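
// Typical use, as a rough sketch (hypothetical call site inside MDCache with an
// active MDRequestRef mdr; argument values are illustrative only): pull an inode
// known only by number into the cache, then retry the pending request.
//
//   open_ino(ino, pool /* or -1 to fall back to the default file pool */,
//            new C_MDS_RetryRequest(this, mdr),
//            false /* want_replica */, false /* want_xlocked */);
//
// The context is queued on info.waiters and kicked once the lookup succeeds or
// finally fails (see open_ino_finish() elsewhere in this file).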
8738
8739 /* ---------------------------- */
8740
8741 /*
8742 * search for a given inode on MDS peers. optionally start with the given node.
8743
8744
8745 TODO
8746 - recover from mds node failure / recovery
8747 - traverse path
8748
8749 */
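// A minimal caller sketch (the waiter class is hypothetical, modelled on the
// MDCacheContext helpers used elsewhere in this file):
//
//   struct C_FoundIno : public MDCacheContext {
//     C_FoundIno(MDCache *c) : MDCacheContext(c) {}
//     void finish(int r) override {
//       // completed once the ino has been traversed into our cache, or with
//       // -ESTALE if no clientreplay/active/stopping peer knows it.
//     }
//   };
//   find_ino_peers(ino, new C_FoundIno(this), MDS_RANK_NONE /* no hint */);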
8750 void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8751 {
8752 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8753 assert(!have_inode(ino));
8754
8755 ceph_tid_t tid = ++find_ino_peer_last_tid;
8756 find_ino_peer_info_t& fip = find_ino_peer[tid];
8757 fip.ino = ino;
8758 fip.tid = tid;
8759 fip.fin = c;
8760 fip.hint = hint;
8761 fip.checked.insert(mds->get_nodeid());
8762 _do_find_ino_peer(fip);
8763 }
8764
8765 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8766 {
8767 set<mds_rank_t> all, active;
8768 mds->mdsmap->get_mds_set(all);
8769 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8770
8771 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8772 << " active " << active << " all " << all
8773 << " checked " << fip.checked
8774 << dendl;
8775
8776 mds_rank_t m = MDS_RANK_NONE;
8777 if (fip.hint >= 0) {
8778 m = fip.hint;
8779 fip.hint = MDS_RANK_NONE;
8780 } else {
8781 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8782 if (*p != mds->get_nodeid() &&
8783 fip.checked.count(*p) == 0) {
8784 m = *p;
8785 break;
8786 }
8787 }
8788 if (m == MDS_RANK_NONE) {
8789 if (all.size() > active.size()) {
8790 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8791 } else {
8792 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8793 fip.fin->complete(-ESTALE);
8794 find_ino_peer.erase(fip.tid);
8795 }
8796 } else {
8797 fip.checking = m;
8798 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8799 }
8800 }
8801
8802 void MDCache::handle_find_ino(MMDSFindIno *m)
8803 {
8804 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8805 m->put();
8806 return;
8807 }
8808
8809 dout(10) << "handle_find_ino " << *m << dendl;
8810 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8811 CInode *in = get_inode(m->ino);
8812 if (in) {
8813 in->make_path(r->path);
8814 dout(10) << " have " << r->path << " " << *in << dendl;
8815 }
8816 m->get_connection()->send_message(r);
8817 m->put();
8818 }
8819
8820
8821 void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8822 {
8823 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8824 if (p != find_ino_peer.end()) {
8825 dout(10) << "handle_find_ino_reply " << *m << dendl;
8826 find_ino_peer_info_t& fip = p->second;
8827
8828 // success?
8829 if (get_inode(fip.ino)) {
8830 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8831 mds->queue_waiter(fip.fin);
8832 find_ino_peer.erase(p);
8833 m->put();
8834 return;
8835 }
8836
8837 mds_rank_t from = mds_rank_t(m->get_source().num());
8838 if (fip.checking == from)
8839 fip.checking = MDS_RANK_NONE;
8840 fip.checked.insert(from);
8841
8842 if (!m->path.empty()) {
8843 // we got a path!
8844 vector<CDentry*> trace;
8845 MDRequestRef null_ref;
8846 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8847 if (r > 0)
8848 return;
8849 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8850 << ", retrying" << dendl;
8851 fip.checked.clear();
8852 _do_find_ino_peer(fip);
8853 } else {
8854 // nope, continue.
8855 _do_find_ino_peer(fip);
8856 }
8857 } else {
8858 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8859 }
8860 m->put();
8861 }
8862
8863 void MDCache::kick_find_ino_peers(mds_rank_t who)
8864 {
8865 // find_ino_peers requests we should move on from
8866 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8867 p != find_ino_peer.end();
8868 ++p) {
8869 find_ino_peer_info_t& fip = p->second;
8870 if (fip.checking == who) {
8871 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8872 fip.checking = MDS_RANK_NONE;
8873 _do_find_ino_peer(fip);
8874 } else if (fip.checking == MDS_RANK_NONE) {
8875 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8876 _do_find_ino_peer(fip);
8877 }
8878 }
8879 }
8880
8881 /* ---------------------------- */
8882
8883 int MDCache::get_num_client_requests()
8884 {
8885 int count = 0;
8886 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8887 p != active_requests.end();
8888 ++p) {
8889 MDRequestRef& mdr = p->second;
8890 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8891 count++;
8892 }
8893 return count;
8894 }
8895
8896 /* This function takes over the reference to the passed Message */
8897 MDRequestRef MDCache::request_start(MClientRequest *req)
8898 {
8899 // did we win a forward race against a slave?
8900 if (active_requests.count(req->get_reqid())) {
8901 MDRequestRef& mdr = active_requests[req->get_reqid()];
8902 assert(mdr);
8903 if (mdr->is_slave()) {
8904 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8905 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8906 } else {
8907 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8908 req->put();
8909 }
8910 return MDRequestRef();
8911 }
8912
8913 // register new client request
8914 MDRequestImpl::Params params;
8915 params.reqid = req->get_reqid();
8916 params.attempt = req->get_num_fwd();
8917 params.client_req = req;
8918 params.initiated = req->get_recv_stamp();
8919 params.throttled = req->get_throttle_stamp();
8920 params.all_read = req->get_recv_complete_stamp();
8921 params.dispatched = req->get_dispatch_stamp();
8922
8923 MDRequestRef mdr =
8924 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8925 active_requests[params.reqid] = mdr;
8926 mdr->set_op_stamp(req->get_stamp());
8927 dout(7) << "request_start " << *mdr << dendl;
8928 return mdr;
8929 }
8930
8931 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
8932 {
8933 int by = m->get_source().num();
8934 MDRequestImpl::Params params;
8935 params.reqid = ri;
8936 params.attempt = attempt;
8937 params.triggering_slave_req = m;
8938 params.slave_to = by;
8939 params.initiated = m->get_recv_stamp();
8940 params.throttled = m->get_throttle_stamp();
8941 params.all_read = m->get_recv_complete_stamp();
8942 params.dispatched = m->get_dispatch_stamp();
8943 MDRequestRef mdr =
8944 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8945 assert(active_requests.count(mdr->reqid) == 0);
8946 active_requests[mdr->reqid] = mdr;
8947 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
8948 return mdr;
8949 }
8950
8951 MDRequestRef MDCache::request_start_internal(int op)
8952 {
8953 MDRequestImpl::Params params;
8954 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
8955 params.reqid.tid = mds->issue_tid();
8956 params.initiated = ceph_clock_now();
8957 params.internal_op = op;
8958 MDRequestRef mdr =
8959 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8960
8961 assert(active_requests.count(mdr->reqid) == 0);
8962 active_requests[mdr->reqid] = mdr;
8963 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
8964 return mdr;
8965 }
8966
8967 MDRequestRef MDCache::request_get(metareqid_t rid)
8968 {
8969 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
8970 assert(p != active_requests.end());
8971 dout(7) << "request_get " << rid << " " << *p->second << dendl;
8972 return p->second;
8973 }
8974
8975 void MDCache::request_finish(MDRequestRef& mdr)
8976 {
8977 dout(7) << "request_finish " << *mdr << dendl;
8978 mdr->mark_event("finishing request");
8979
8980 // slave finisher?
8981 if (mdr->has_more() && mdr->more()->slave_commit) {
8982 Context *fin = mdr->more()->slave_commit;
8983 mdr->more()->slave_commit = 0;
8984 int ret;
8985 if (mdr->aborted) {
8986 mdr->aborted = false;
8987 ret = -1;
8988 mdr->more()->slave_rolling_back = true;
8989 } else {
8990 ret = 0;
8991 mdr->committing = true;
8992 }
8993 fin->complete(ret); // this must re-call request_finish.
8994 return;
8995 }
8996
8997 request_cleanup(mdr);
8998 }
8999
9000
9001 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9002 {
9003 mdr->mark_event("forwarding request");
9004 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9005 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9006 << *mdr->client_request << dendl;
9007 mds->forward_message_mds(mdr->client_request, who);
9008 mdr->client_request = 0;
9009 if (mds->logger) mds->logger->inc(l_mds_forward);
9010 } else if (mdr->internal_op >= 0) {
9011 dout(10) << "request_forward on internal op; cancelling" << dendl;
9012 mdr->internal_op_finish->complete(-EXDEV);
9013 } else {
9014 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9015 << " was from mds" << dendl;
9016 }
9017 request_cleanup(mdr);
9018 }
9019
9020
9021 void MDCache::dispatch_request(MDRequestRef& mdr)
9022 {
9023 if (mdr->client_request) {
9024 mds->server->dispatch_client_request(mdr);
9025 } else if (mdr->slave_request) {
9026 mds->server->dispatch_slave_request(mdr);
9027 } else {
9028 switch (mdr->internal_op) {
9029 case CEPH_MDS_OP_FRAGMENTDIR:
9030 dispatch_fragment_dir(mdr);
9031 break;
9032 case CEPH_MDS_OP_EXPORTDIR:
9033 migrator->dispatch_export_dir(mdr, 0);
9034 break;
9035 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9036 enqueue_scrub_work(mdr);
9037 break;
9038 case CEPH_MDS_OP_FLUSH:
9039 flush_dentry_work(mdr);
9040 break;
9041 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9042 repair_dirfrag_stats_work(mdr);
9043 break;
9044 case CEPH_MDS_OP_REPAIR_INODESTATS:
9045 repair_inode_stats_work(mdr);
9046 break;
9047 default:
9048 ceph_abort();
9049 }
9050 }
9051 }
9052
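// How internal ops are driven, as a rough sketch (hypothetical call site; any
// op-specific state that must be attached to the request is omitted): create an
// internal request with one of the CEPH_MDS_OP_* values handled above, then feed
// it through dispatch_request(), which routes it to the matching handler.
//
//   MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
//   // ... fill in op-specific fields on mdr ...
//   dispatch_request(mdr);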
9053
9054 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9055 {
9056 if (!mdr->has_more())
9057 return;
9058
9059 // clean up slaves
9060 // (will implicitly drop remote dn pins)
9061 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9062 p != mdr->more()->slaves.end();
9063 ++p) {
9064 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9065 MMDSSlaveRequest::OP_FINISH);
9066
9067 if (mdr->killed && !mdr->committing) {
9068 r->mark_abort();
9069 } else if (mdr->more()->srcdn_auth_mds == *p &&
9070 mdr->more()->inode_import.length() > 0) {
9071 // information about rename imported caps
9072 r->inode_export.claim(mdr->more()->inode_import);
9073 }
9074
9075 mds->send_message_mds(r, *p);
9076 }
9077
9078 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9079 * implicitly. Note that we don't call the finishers -- there shouldn't
9080 * be any on a remote lock and the request finish wakes up all
9081 * the waiters anyway! */
9082 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9083 while (p != mdr->xlocks.end()) {
9084 if ((*p)->get_parent()->is_auth())
9085 ++p;
9086 else {
9087 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9088 << " on " << *(*p)->get_parent() << dendl;
9089 (*p)->put_xlock();
9090 mdr->locks.erase(*p);
9091 mdr->xlocks.erase(p++);
9092 }
9093 }
9094
9095 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9096 while (q != mdr->remote_wrlocks.end()) {
9097 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9098 << " on mds." << q->second
9099 << " on " << *(q->first)->get_parent() << dendl;
9100 mdr->locks.erase(q->first);
9101 mdr->remote_wrlocks.erase(q++);
9102 }
9103
9104 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9105 * leaving them in can cause double-notifies as
9106 * this function can get called more than once */
9107 }
9108
9109 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9110 {
9111 request_drop_foreign_locks(mdr);
9112 mds->locker->drop_non_rdlocks(mdr.get());
9113 }
9114
9115 void MDCache::request_drop_locks(MDRequestRef& mdr)
9116 {
9117 request_drop_foreign_locks(mdr);
9118 mds->locker->drop_locks(mdr.get());
9119 }
9120
9121 void MDCache::request_cleanup(MDRequestRef& mdr)
9122 {
9123 dout(15) << "request_cleanup " << *mdr << dendl;
9124
9125 if (mdr->has_more()) {
9126 if (mdr->more()->is_ambiguous_auth)
9127 mdr->clear_ambiguous_auth();
9128 if (!mdr->more()->waiting_for_finish.empty())
9129 mds->queue_waiters(mdr->more()->waiting_for_finish);
9130 }
9131
9132 request_drop_locks(mdr);
9133
9134 // drop (local) auth pins
9135 mdr->drop_local_auth_pins();
9136
9137 // drop stickydirs
9138 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9139 p != mdr->stickydirs.end();
9140 ++p)
9141 (*p)->put_stickydirs();
9142
9143 mds->locker->kick_cap_releases(mdr);
9144
9145 // drop cache pins
9146 mdr->drop_pins();
9147
9148 // remove from session
9149 mdr->item_session_request.remove_myself();
9150
9151 // remove from map
9152 active_requests.erase(mdr->reqid);
9153
9154 if (mds->logger)
9155 log_stat();
9156
9157 mdr->mark_event("cleaned up request");
9158 }
9159
9160 void MDCache::request_kill(MDRequestRef& mdr)
9161 {
9162 // rolling back slave requests is tricky. just let the request proceed.
9163 if (mdr->done_locking && mdr->has_more() &&
9164 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9165 dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
9166
9167 assert(mdr->used_prealloc_ino == 0);
9168 assert(mdr->prealloc_inos.empty());
9169
9170 mdr->session = NULL;
9171 mdr->item_session_request.remove_myself();
9172 return;
9173 }
9174
9175 mdr->killed = true;
9176 mdr->mark_event("killing request");
9177
9178 if (mdr->committing) {
9179 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9180 } else {
9181 dout(10) << "request_kill " << *mdr << dendl;
9182 request_cleanup(mdr);
9183 }
9184 }
9185
9186 // -------------------------------------------------------------------------------
9187 // SNAPREALMS
9188
9189 struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9190 MDRequestRef mdr;
9191 MutationRef mut;
9192 CInode *in;
9193 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9194 MutationRef& mu, CInode *i) :
9195 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9196 void finish(int r) override {
9197 mdcache->_snaprealm_create_finish(mdr, mut, in);
9198 }
9199 };
9200
9201 void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9202 {
9203 dout(10) << "snaprealm_create " << *in << dendl;
9204 assert(!in->snaprealm);
9205
9206 // allocate an id..
9207 if (!mdr->more()->stid) {
9208 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9209 new C_MDS_RetryRequest(this, mdr));
9210 return;
9211 }
9212
9213 MutationRef mut(new MutationImpl());
9214 mut->ls = mds->mdlog->get_current_segment();
9215 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9216 mds->mdlog->start_entry(le);
9217
9218 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9219
9220 inode_t *pi = in->project_inode();
9221 pi->version = in->pre_dirty();
9222 pi->rstat.rsnaprealms++;
9223
9224 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9225 snapid_t seq;
9226 ::decode(seq, p);
9227
9228 sr_t *newsnap = in->project_snaprealm(seq);
9229 newsnap->seq = seq;
9230 newsnap->last_created = seq;
9231
9232 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9233 journal_cow_inode(mut, &le->metablob, in);
9234 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9235
9236 mds->server->submit_mdlog_entry(le,
9237 new C_MDC_snaprealm_create_finish(this, mdr,
9238 mut, in),
9239 mdr, __func__);
9240 mds->mdlog->flush();
9241 }
9242
9243
9244 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9245 {
9246 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9247
9248 vector<inodeno_t> split_inos;
9249 vector<inodeno_t> split_realms;
9250
9251 if (snapop == CEPH_SNAP_OP_SPLIT) {
9252 // notify clients of update|split
9253 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9254 !p.end(); ++p)
9255 split_inos.push_back((*p)->ino());
9256
9257 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9258 p != in->snaprealm->open_children.end();
9259 ++p)
9260 split_realms.push_back((*p)->inode->ino());
9261 }
9262
9263 bufferlist snapbl;
9264 in->snaprealm->build_snap_trace(snapbl);
9265
9266 set<SnapRealm*> past_children;
9267 map<client_t, MClientSnap*> updates;
9268 list<SnapRealm*> q;
9269 q.push_back(in->snaprealm);
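// Breadth-first walk over this realm and all of its open children: invalidate
// each realm's cached snaps and queue at most one MClientSnap per client that
// holds caps in any affected realm (skipped entirely if nosend).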
9270 while (!q.empty()) {
9271 SnapRealm *realm = q.front();
9272 q.pop_front();
9273
9274 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9275 realm->invalidate_cached_snaps();
9276
9277 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9278 p != realm->client_caps.end();
9279 ++p) {
9280 assert(!p->second->empty());
9281 if (!nosend && updates.count(p->first) == 0) {
9282 MClientSnap *update = new MClientSnap(snapop);
9283 update->head.split = in->ino();
9284 update->split_inos = split_inos;
9285 update->split_realms = split_realms;
9286 update->bl = snapbl;
9287 updates[p->first] = update;
9288 }
9289 }
9290
9291 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9292 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9293 p != realm->open_past_children.end();
9294 ++p)
9295 past_children.insert(*p);
9296 }
9297
9298 // notify for active children, too.
9299 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9300 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9301 p != realm->open_children.end();
9302 ++p)
9303 q.push_back(*p);
9304 }
9305
9306 if (!nosend)
9307 send_snaps(updates);
9308
9309 // notify past children and their descendants if we update/delete old snapshots
9310 for (set<SnapRealm*>::iterator p = past_children.begin();
9311 p != past_children.end();
9312 ++p)
9313 q.push_back(*p);
9314
9315 while (!q.empty()) {
9316 SnapRealm *realm = q.front();
9317 q.pop_front();
9318
9319 realm->invalidate_cached_snaps();
9320
9321 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9322 p != realm->open_children.end();
9323 ++p) {
9324 if (past_children.count(*p) == 0)
9325 q.push_back(*p);
9326 }
9327
9328 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9329 p != realm->open_past_children.end();
9330 ++p) {
9331 if (past_children.count(*p) == 0) {
9332 q.push_back(*p);
9333 past_children.insert(*p);
9334 }
9335 }
9336 }
9337
9338 if (snapop == CEPH_SNAP_OP_DESTROY) {
9339 // eval stray inodes if we delete a snapshot from their past ancestor snaprealm
9340 for (set<SnapRealm*>::iterator p = past_children.begin();
9341 p != past_children.end();
9342 ++p)
9343 maybe_eval_stray((*p)->inode, true);
9344 }
9345 }
9346
9347 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9348 {
9349 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9350
9351 // apply
9352 in->pop_and_dirty_projected_inode(mut->ls);
9353 mut->apply();
9354 mds->locker->drop_locks(mut.get());
9355 mut->cleanup();
9356
9357 // tell table we've committed
9358 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9359
9360 // create
9361 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9362 snapid_t seq;
9363 ::decode(seq, p);
9364
9365 in->open_snaprealm();
9366 in->snaprealm->srnode.seq = seq;
9367 in->snaprealm->srnode.created = seq;
9368 bool ok = in->snaprealm->_open_parents(NULL);
9369 assert(ok);
9370
9371 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9372
9373 /*
9374 static int count = 5;
9375 if (--count == 0)
9376 ceph_abort(); // hack test test **********
9377 */
9378
9379 // done.
9380 mdr->more()->stid = 0; // caller will likely need to reuse this
9381 dispatch_request(mdr);
9382 }
9383
9384
9385 // -------------------------------------------------------------------------------
9386 // STRAYS
9387
9388 struct C_MDC_RetryScanStray : public MDCacheContext {
9389 dirfrag_t next;
9390 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9391 void finish(int r) override {
9392 mdcache->scan_stray_dir(next);
9393 }
9394 };
9395
9396 void MDCache::scan_stray_dir(dirfrag_t next)
9397 {
9398 dout(10) << "scan_stray_dir " << next << dendl;
9399
9400 list<CDir*> ls;
9401 for (int i = 0; i < NUM_STRAY; ++i) {
9402 if (strays[i]->ino() < next.ino)
9403 continue;
9404 strays[i]->get_dirfrags(ls);
9405 }
9406
9407 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9408 CDir *dir = *p;
9409 if (dir->dirfrag() < next)
9410 continue;
9411 if (!dir->is_complete()) {
9412 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9413 return;
9414 }
9415 for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
9416 CDentry *dn = q->second;
9417 dn->state_set(CDentry::STATE_STRAY);
9418 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9419 if (dnl->is_primary()) {
9420 CInode *in = dnl->get_inode();
9421 if (in->inode.nlink == 0)
9422 in->state_set(CInode::STATE_ORPHAN);
9423 maybe_eval_stray(in);
9424 }
9425 }
9426 }
9427 }
9428
9429 /**
9430 * If a remote dentry refers to an inode whose primary
9431 * dentry is a stray, then evaluate the inode for purging if
9432 * we have the auth copy, or migrate the stray to us if we
9433 * do not.
9434 */
9435 void MDCache::eval_remote(CDentry *remote_dn)
9436 {
9437 assert(remote_dn);
9438 dout(10) << __func__ << " " << *remote_dn << dendl;
9439
9440 CDentry::linkage_t *dnl = remote_dn->get_projected_linkage();
9441 assert(dnl->is_remote());
9442 CInode *in = dnl->get_inode();
9443
9444 if (!in) {
9445 dout(20) << __func__ << ": no inode, cannot evaluate" << dendl;
9446 return;
9447 }
9448
9449 if (remote_dn->last != CEPH_NOSNAP) {
9450 dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl;
9451 return;
9452 }
9453
9454 // refers to stray?
9455 CDentry *primary_dn = in->get_projected_parent_dn();
9456 assert(primary_dn != NULL);
9457 if (primary_dn->get_dir()->get_inode()->is_stray()) {
9458 if (in->is_auth()) {
9459 dout(20) << __func__ << ": have auth for inode, evaluating" << dendl;
9460
9461 stray_manager.eval_remote_stray(primary_dn, remote_dn);
9462 } else {
9463 dout(20) << __func__ << ": do not have auth for inode, migrating " << dendl;
9464 /*
9465 * Inodes get filed into a stray dentry when a client unlinks
9466 * the primary DN for them. However, that doesn't mean there
9467 * isn't a remote DN still in the world. The remote DN just
9468 * ends up pointing at a stray. Strays can pretty much live
9469 * forever in this scenario.
9470 *
9471 * Therefore, we have a special behaviour here: migrate a stray
9472 * to <me> when <I> handle a client request with a trace referring
9473 * to a stray inode on another MDS.
9474 */
9475 stray_manager.migrate_stray(primary_dn, mds->get_nodeid());
9476 }
9477 } else {
9478 dout(20) << __func__ << ": inode's primary dn not stray" << dendl;
9479 }
9480 }
9481
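// The backtrace lives in the "parent" xattr of the inode's first object
// (CInode::get_object_name(ino, frag_t(), "")) in the given pool; this just
// issues the getxattr and hands the raw bufferlist to the caller, who typically
// decodes it into an inode_backtrace_t.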
9482 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9483 {
9484 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9485 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9486 }
9487
9488
9489
9490
9491
9492 // ========================================================================================
9493 // DISCOVER
9494 /*
9495
9496 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9497 to the parent metadata object in the cache (pinning it).
9498
9499 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9500
9501 */
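// A rough usage sketch (hypothetical values; `in` and `onfinish` stand in for a
// cached base CInode and an MDSInternalContextBase waiter; the retry pattern
// mirrors the C_MDC_RetryDiscoverPath helpers below): ask the authority of the
// base inode for the dentry "foo" beneath it.
//
//   filepath want("foo", 0);
//   discover_path(in, CEPH_NOSNAP, want, onfinish, false /* want_xlocked */);
//
// The waiter is registered on the base object (pinning it) and is woken by
// handle_discover_reply() once the trace has been added to the cache.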
9502
9503 void MDCache::_send_discover(discover_info_t& d)
9504 {
9505 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9506 d.want_base_dir, d.want_xlocked);
9507 dis->set_tid(d.tid);
9508 mds->send_message_mds(dis, d.mds);
9509 }
9510
9511 void MDCache::discover_base_ino(inodeno_t want_ino,
9512 MDSInternalContextBase *onfinish,
9513 mds_rank_t from)
9514 {
9515 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9516 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9517 discover_info_t& d = _create_discover(from);
9518 d.ino = want_ino;
9519 _send_discover(d);
9520 }
9521 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9522 }
9523
9524
9525 void MDCache::discover_dir_frag(CInode *base,
9526 frag_t approx_fg,
9527 MDSInternalContextBase *onfinish,
9528 mds_rank_t from)
9529 {
9530 if (from < 0)
9531 from = base->authority().first;
9532
9533 dirfrag_t df(base->ino(), approx_fg);
9534 dout(7) << "discover_dir_frag " << df
9535 << " from mds." << from << dendl;
9536
9537 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9538 discover_info_t& d = _create_discover(from);
9539 d.pin_base(base);
9540 d.ino = base->ino();
9541 d.frag = approx_fg;
9542 d.want_base_dir = true;
9543 _send_discover(d);
9544 }
9545
9546 if (onfinish)
9547 base->add_dir_waiter(approx_fg, onfinish);
9548 }
9549
9550 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9551 CInode *base;
9552 snapid_t snapid;
9553 filepath path;
9554 mds_rank_t from;
9555 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9556 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9557 void finish(int r) override {
9558 mdcache->discover_path(base, snapid, path, 0, from);
9559 }
9560 };
9561
9562 void MDCache::discover_path(CInode *base,
9563 snapid_t snap,
9564 filepath want_path,
9565 MDSInternalContextBase *onfinish,
9566 bool want_xlocked,
9567 mds_rank_t from)
9568 {
9569 if (from < 0)
9570 from = base->authority().first;
9571
9572 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9573 << (want_xlocked ? " want_xlocked":"")
9574 << dendl;
9575
9576 if (base->is_ambiguous_auth()) {
9577 dout(10) << " waiting for single auth on " << *base << dendl;
9578 if (!onfinish)
9579 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9580 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9581 return;
9582 } else if (from == mds->get_nodeid()) {
9583 list<MDSInternalContextBase*> finished;
9584 base->take_waiting(CInode::WAIT_DIR, finished);
9585 mds->queue_waiters(finished);
9586 return;
9587 }
9588
9589 frag_t fg = base->pick_dirfrag(want_path[0]);
9590 if ((want_xlocked && want_path.depth() == 1) ||
9591 !base->is_waiting_for_dir(fg) || !onfinish) {
9592 discover_info_t& d = _create_discover(from);
9593 d.ino = base->ino();
9594 d.pin_base(base);
9595 d.frag = fg;
9596 d.snap = snap;
9597 d.want_path = want_path;
9598 d.want_base_dir = true;
9599 d.want_xlocked = want_xlocked;
9600 _send_discover(d);
9601 }
9602
9603 // register + wait
9604 if (onfinish)
9605 base->add_dir_waiter(fg, onfinish);
9606 }
9607
9608 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9609 CDir *base;
9610 snapid_t snapid;
9611 filepath path;
9612 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9613 MDCacheContext(c), base(b), snapid(s), path(p) {}
9614 void finish(int r) override {
9615 mdcache->discover_path(base, snapid, path, 0);
9616 }
9617 };
9618
9619 void MDCache::discover_path(CDir *base,
9620 snapid_t snap,
9621 filepath want_path,
9622 MDSInternalContextBase *onfinish,
9623 bool want_xlocked)
9624 {
9625 mds_rank_t from = base->authority().first;
9626
9627 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9628 << (want_xlocked ? " want_xlocked":"")
9629 << dendl;
9630
9631 if (base->is_ambiguous_auth()) {
9632 dout(7) << " waiting for single auth on " << *base << dendl;
9633 if (!onfinish)
9634 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9635 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9636 return;
9637 } else if (from == mds->get_nodeid()) {
9638 list<MDSInternalContextBase*> finished;
9639 base->take_sub_waiting(finished);
9640 mds->queue_waiters(finished);
9641 return;
9642 }
9643
9644 if ((want_xlocked && want_path.depth() == 1) ||
9645 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9646 discover_info_t& d = _create_discover(from);
9647 d.ino = base->ino();
9648 d.pin_base(base);
9649 d.frag = base->get_frag();
9650 d.snap = snap;
9651 d.want_path = want_path;
9652 d.want_base_dir = false;
9653 d.want_xlocked = want_xlocked;
9654 _send_discover(d);
9655 }
9656
9657 // register + wait
9658 if (onfinish)
9659 base->add_dentry_waiter(want_path[0], snap, onfinish);
9660 }
9661
9662 void MDCache::kick_discovers(mds_rank_t who)
9663 {
9664 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9665 p != discovers.end();
9666 ++p) {
9667 if (p->second.mds != who)
9668 continue;
9669 _send_discover(p->second);
9670 }
9671 }
9672
9673
9674 /* This function DOES put the passed message before returning */
9675 void MDCache::handle_discover(MDiscover *dis)
9676 {
9677 mds_rank_t whoami = mds->get_nodeid();
9678 mds_rank_t from = mds_rank_t(dis->get_source().num());
9679
9680 assert(from != whoami);
9681
9682 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9683 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9684 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
9685 dis->put();
9686 return;
9687 }
9688
9689 // proceed if the requester is in the REJOIN stage; the request is from parallel_fetch().
9690 // delay processing requests from survivors because we may not yet have chosen lock states.
9691 if (!mds->mdsmap->is_rejoin(from)) {
9692 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9693 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9694 return;
9695 }
9696 }
9697
9698
9699 CInode *cur = 0;
9700 MDiscoverReply *reply = new MDiscoverReply(dis);
9701
9702 snapid_t snapid = dis->get_snapid();
9703
9704 // get started.
9705 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9706 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9707 // wants root
9708 dout(7) << "handle_discover from mds." << from
9709 << " wants base + " << dis->get_want().get_path()
9710 << " snap " << snapid
9711 << dendl;
9712
9713 cur = get_inode(dis->get_base_ino());
9714 assert(cur);
9715
9716 // add root
9717 reply->starts_with = MDiscoverReply::INODE;
9718 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9719 dout(10) << "added base " << *cur << dendl;
9720 }
9721 else {
9722 // there's a base inode
9723 cur = get_inode(dis->get_base_ino(), snapid);
9724 if (!cur && snapid != CEPH_NOSNAP) {
9725 cur = get_inode(dis->get_base_ino());
9726 if (cur && !cur->is_multiversion())
9727 cur = NULL; // nope!
9728 }
9729
9730 if (!cur) {
9731 dout(7) << "handle_discover mds." << from
9732 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9733 << dendl;
9734 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9735 reply->set_error_dentry(dis->get_dentry(0));
9736 reply->set_flag_error_dir();
9737 } else if (dis->wants_base_dir()) {
9738 dout(7) << "handle_discover mds." << from
9739 << " wants basedir+" << dis->get_want().get_path()
9740 << " has " << *cur
9741 << dendl;
9742 } else {
9743 dout(7) << "handle_discover mds." << from
9744 << " wants " << dis->get_want().get_path()
9745 << " has " << *cur
9746 << dendl;
9747 }
9748 }
9749
9750 assert(reply);
9751
9752 // add content
9753 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9754 for (unsigned i = 0;
9755 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9756 i++) {
9757
9758 // -- figure out the dir
9759
9760 // is *cur even a dir at all?
9761 if (!cur->is_dir()) {
9762 dout(7) << *cur << " not a dir" << dendl;
9763 reply->set_flag_error_dir();
9764 break;
9765 }
9766
9767 // pick frag
9768 frag_t fg;
9769 if (dis->get_want().depth()) {
9770 // dentry specifies
9771 fg = cur->pick_dirfrag(dis->get_dentry(i));
9772 } else {
9773 // requester explicitly specified the frag
9774 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9775 fg = dis->get_base_dir_frag();
9776 if (!cur->dirfragtree.is_leaf(fg))
9777 fg = cur->dirfragtree[fg.value()];
9778 }
9779 CDir *curdir = cur->get_dirfrag(fg);
9780
9781 if ((!curdir && !cur->is_auth()) ||
9782 (curdir && !curdir->is_auth())) {
9783
9784 /* before:
9785 * ONLY set flag if empty!!
9786 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9787 * resulting in duplicate discovers in flight,
9788 * which can wreak havoc when discovering rename srcdn (which may move)
9789 */
9790
9791 if (reply->is_empty()) {
9792 // only hint if empty.
9793 // someday this could be better, but right now the waiter logic isn't smart enough.
9794
9795 // hint
9796 if (curdir) {
9797 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9798 reply->set_dir_auth_hint(curdir->authority().first);
9799 } else {
9800 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9801 << *cur << dendl;
9802 reply->set_dir_auth_hint(cur->authority().first);
9803 }
9804
9805 // note error dentry, if any
9806 // NOTE: important, as it allows requester to issue an equivalent discover
9807 // to whomever we hint at.
9808 if (dis->get_want().depth() > i)
9809 reply->set_error_dentry(dis->get_dentry(i));
9810 }
9811
9812 break;
9813 }
9814
9815 if (!curdir) { // open dir?
9816 if (cur->is_frozen()) {
9817 if (!reply->is_empty()) {
9818 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9819 break;
9820 }
9821 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9822 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9823 reply->put();
9824 return;
9825 }
9826 curdir = cur->get_or_open_dirfrag(this, fg);
9827 } else if (curdir->is_frozen_tree() ||
9828 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9829 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9830 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9831 reply->set_flag_error_dir();
9832 break;
9833 }
9834 if (!reply->is_empty()) {
9835 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9836 break;
9837 }
9838 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9839 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9840 reply->put();
9841 return;
9842 }
9843
9844 // add dir
9845 if (curdir->get_version() == 0) {
9846 // fetch newly opened dir
9847 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9848 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9849 // make sure the base frag is correct, though, in case there was a refragment since the
9850 // original request was sent.
9851 reply->set_base_dir_frag(curdir->get_frag());
9852 } else {
9853 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9854 if (!reply->trace.length())
9855 reply->starts_with = MDiscoverReply::DIR;
9856 replicate_dir(curdir, from, reply->trace);
9857 dout(7) << "handle_discover added dir " << *curdir << dendl;
9858 }
9859
9860 // lookup
9861 CDentry *dn = 0;
9862 if (curdir->get_version() == 0) {
9863 // fetch newly opened dir
9864 } else if (dis->get_want().depth() > 0) {
9865 // lookup dentry
9866 dn = curdir->lookup(dis->get_dentry(i), snapid);
9867 } else
9868 break; // done!
9869
9870 // incomplete dir?
9871 if (!dn) {
9872 if (!curdir->is_complete()) {
9873 // readdir
9874 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9875 if (reply->is_empty()) {
9876 // fetch and wait
9877 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9878 dis->wants_base_dir() && curdir->get_version() == 0);
9879 reply->put();
9880 return;
9881 } else {
9882 // initiate fetch, but send what we have so far
9883 curdir->fetch(0);
9884 break;
9885 }
9886 }
9887
9888 // send null dentry
9889 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9890 << *curdir << dendl;
9891 dn = curdir->add_null_dentry(dis->get_dentry(i));
9892 }
9893 assert(dn);
9894
9895 CDentry::linkage_t *dnl = dn->get_linkage();
9896
9897 // xlocked dentry?
9898 // ...always block on non-tail items (they are unrelated)
9899 // ...allow xlocked tail discovery _only_ if explicitly requested
9900 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9901 if (dn->lock.is_xlocked()) {
9902 // is this the last (tail) item in the discover traversal?
9903 if (tailitem && dis->wants_xlocked()) {
9904 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9905 } else if (reply->is_empty()) {
9906 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9907 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9908 reply->put();
9909 return;
9910 } else {
9911 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9912 break;
9913 }
9914 }
9915
9916 // frozen inode?
9917 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9918 if (tailitem && dis->wants_xlocked()) {
9919 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9920 } else if (reply->is_empty()) {
9921 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9922 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9923 reply->put();
9924 return;
9925 } else {
9926 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9927 break;
9928 }
9929 }
9930
9931 // add dentry
9932 if (!reply->trace.length())
9933 reply->starts_with = MDiscoverReply::DENTRY;
9934 replicate_dentry(dn, from, reply->trace);
9935 dout(7) << "handle_discover added dentry " << *dn << dendl;
9936
9937 if (!dnl->is_primary()) break; // stop on null or remote link.
9938
9939 // add inode
9940 CInode *next = dnl->get_inode();
9941 assert(next->is_auth());
9942
9943 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
9944 dout(7) << "handle_discover added inode " << *next << dendl;
9945
9946 // descend, keep going.
9947 cur = next;
9948 continue;
9949 }
9950
9951 // how did we do?
9952 assert(!reply->is_empty());
9953 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
9954 mds->send_message(reply, dis->get_connection());
9955
9956 dis->put();
9957 }
9958
9959 /* This function DOES put the passed message before returning */
9960 void MDCache::handle_discover_reply(MDiscoverReply *m)
9961 {
9962 /*
9963 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
9964 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
9965 m->put();
9966 return;
9967 }
9968 */
9969 dout(7) << "discover_reply " << *m << dendl;
9970 if (m->is_flag_error_dir())
9971 dout(7) << " flag error, dir" << dendl;
9972 if (m->is_flag_error_dn())
9973 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
9974
9975 list<MDSInternalContextBase*> finished, error;
9976 mds_rank_t from = mds_rank_t(m->get_source().num());
9977
9978 // starting point
9979 CInode *cur = get_inode(m->get_base_ino());
9980 bufferlist::iterator p = m->trace.begin();
9981
9982 int next = m->starts_with;
9983
9984 // decrement discover counters
9985 if (m->get_tid()) {
9986 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
9987 if (p != discovers.end()) {
9988 dout(10) << " found tid " << m->get_tid() << dendl;
9989 discovers.erase(p);
9990 } else {
9991 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
9992 }
9993 }
9994
9995 // discover may start with an inode
9996 if (!p.end() && next == MDiscoverReply::INODE) {
9997 cur = add_replica_inode(p, NULL, finished);
9998 dout(7) << "discover_reply got base inode " << *cur << dendl;
9999 assert(cur->is_base());
10000
10001 next = MDiscoverReply::DIR;
10002
10003 // take waiters?
10004 if (cur->is_base() &&
10005 waiting_for_base_ino[from].count(cur->ino())) {
10006 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10007 waiting_for_base_ino[from].erase(cur->ino());
10008 }
10009 }
10010 assert(cur);
10011
10012 // loop over discover results.
10013 // indexes follow each ([[dir] dentry] inode)
10014 // can start, end with any type.
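// e.g. a reply that walks two path components typically decodes as:
//   dir, dentry, inode, dir, dentry, inode
// (illustrative only; a reply may start mid-pattern for a base inode or stop
// early at a null/remote dentry or an error)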
10015 while (!p.end()) {
10016 // dir
10017 frag_t fg;
10018 CDir *curdir = 0;
10019 if (next == MDiscoverReply::DIR) {
10020 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10021 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10022 assert(m->get_wanted_base_dir());
10023 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10024 }
10025 } else {
10026 // note: this can only happen on our first pass through this loop.
10027 if (p.end() && m->is_flag_error_dn()) {
10028 fg = cur->pick_dirfrag(m->get_error_dentry());
10029 curdir = cur->get_dirfrag(fg);
10030 } else
10031 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10032 }
10033
10034 if (p.end())
10035 break;
10036
10037 // dentry
10038 CDentry *dn = add_replica_dentry(p, curdir, finished);
10039
10040 if (p.end())
10041 break;
10042
10043 // inode
10044 cur = add_replica_inode(p, dn, finished);
10045
10046 next = MDiscoverReply::DIR;
10047 }
10048
10049 // dir error?
10050 // or dir_auth hint?
10051 if (m->is_flag_error_dir() && !cur->is_dir()) {
10052 // not a dir.
10053 cur->take_waiting(CInode::WAIT_DIR, error);
10054 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10055 mds_rank_t who = m->get_dir_auth_hint();
10056 if (who == mds->get_nodeid()) who = -1;
10057 if (who >= 0)
10058 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10059
10060 frag_t fg = m->get_base_dir_frag();
10061 CDir *dir = cur->get_dirfrag(fg);
10062
10063 if (m->get_wanted_base_dir()) {
10064 if (cur->is_waiting_for_dir(fg)) {
10065 if (cur->is_auth())
10066 cur->take_waiting(CInode::WAIT_DIR, finished);
10067 else if (dir || !cur->dirfragtree.is_leaf(fg))
10068 cur->take_dir_waiting(fg, finished);
10069 else
10070 discover_dir_frag(cur, fg, 0, who);
10071 } else
10072 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10073 }
10074
10075 // try again?
10076 if (m->get_error_dentry().length()) {
10077 // wanted a dentry
10078 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10079 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10080 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10081 m->get_wanted_snapid(), finished);
10082 } else {
10083 filepath relpath(m->get_error_dentry(), 0);
10084 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10085 }
10086 } else
10087 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10088 << m->get_error_dentry() << dendl;
10089 }
10090 }
10091
10092 // waiters
10093 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10094 mds->queue_waiters(finished);
10095
10096 // done
10097 m->put();
10098 }
10099
10100
10101
10102 // ----------------------------
10103 // REPLICAS
10104
10105 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10106 list<MDSInternalContextBase*>& finished)
10107 {
10108 dirfrag_t df;
10109 ::decode(df, p);
10110
10111 assert(diri->ino() == df.ino);
10112
10113 // add it (_replica_)
10114 CDir *dir = diri->get_dirfrag(df.frag);
10115
10116 if (dir) {
10117 // had replica. update w/ new nonce.
10118 dir->decode_replica(p);
10119 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10120 } else {
10121 // force frag to leaf in the diri tree
10122 if (!diri->dirfragtree.is_leaf(df.frag)) {
10123 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10124 << diri->dirfragtree << dendl;
10125 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10126 }
10127
10128 // add replica.
10129 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10130 dir->decode_replica(p);
10131
10132 // is this a dir_auth delegation boundary?
10133 if (from != diri->authority().first ||
10134 diri->is_ambiguous_auth() ||
10135 diri->is_base())
10136 adjust_subtree_auth(dir, from);
10137
10138 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10139
10140 // get waiters
10141 diri->take_dir_waiting(df.frag, finished);
10142 }
10143
10144 return dir;
10145 }
10146
10147 CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, mds_rank_t from)
10148 {
10149 assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN);
10150
10151 // forge a replica.
10152 CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) );
10153
10154 // i'm assuming this is a subtree root.
10155 adjust_subtree_auth(dir, from);
10156
10157 dout(7) << "forge_replica_dir added " << *dir << " while mds." << from << " is down" << dendl;
10158
10159 return dir;
10160 }
10161
10162 CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10163 {
10164 string name;
10165 snapid_t last;
10166 ::decode(name, p);
10167 ::decode(last, p);
10168
10169 CDentry *dn = dir->lookup(name, last);
10170
10171 // have it?
10172 if (dn) {
10173 dn->decode_replica(p, false);
10174 dout(7) << "add_replica_dentry had " << *dn << dendl;
10175 } else {
10176 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10177 dn->decode_replica(p, true);
10178 dout(7) << "add_replica_dentry added " << *dn << dendl;
10179 }
10180
10181 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10182
10183 return dn;
10184 }
10185
10186 CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10187 {
10188 inodeno_t ino;
10189 snapid_t last;
10190 ::decode(ino, p);
10191 ::decode(last, p);
10192 CInode *in = get_inode(ino, last);
10193 if (!in) {
10194 in = new CInode(this, false, 1, last);
10195 in->decode_replica(p, true);
10196 add_inode(in);
10197 if (in->ino() == MDS_INO_ROOT)
10198 in->inode_auth.first = 0;
10199 else if (in->is_mdsdir())
10200 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10201 dout(10) << "add_replica_inode added " << *in << dendl;
10202 if (dn) {
10203 assert(dn->get_linkage()->is_null());
10204 dn->dir->link_primary_inode(dn, in);
10205 }
10206 } else {
10207 in->decode_replica(p, false);
10208 dout(10) << "add_replica_inode had " << *in << dendl;
10209 }
10210
10211 if (dn) {
10212 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10213 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10214 }
10215
10216 return in;
10217 }
10218
10219
10220 void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10221 {
10222 uint64_t features = mds->mdsmap->get_up_features();
10223 replicate_inode(get_myin(), who, bl, features);
10224 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10225 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10226 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10227 replicate_dir(straydn->get_dir(), who, bl);
10228 replicate_dentry(straydn, who, bl);
10229 }
10230
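// This must decode the six objects encoded by replicate_stray() (above) in
// exactly that order: mdsdir inode, mdsdir dirfrag, stray-directory dentry,
// stray-directory inode, stray dirfrag, and finally the stray dentry itself.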
10231 CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10232 {
10233 list<MDSInternalContextBase*> finished;
10234 bufferlist::iterator p = bl.begin();
10235
10236 CInode *mdsin = add_replica_inode(p, NULL, finished);
10237 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10238 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10239 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10240 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10241 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10242 if (!finished.empty())
10243 mds->queue_waiters(finished);
10244
10245 return straydn;
10246 }
10247
10248
10249 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10250 {
10251 // this is an FYI, re: replication
10252
10253 set<mds_rank_t> who;
10254 if (bcast) {
10255 mds->get_mds_map()->get_active_mds_set(who);
10256 } else {
10257 for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
10258 p != dir->replicas_end();
10259 ++p)
10260 who.insert(p->first);
10261 }
10262
10263 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10264
10265 filepath path;
10266 dir->inode->make_path(path);
10267
10268 mds_rank_t whoami = mds->get_nodeid();
10269 for (set<mds_rank_t>::iterator it = who.begin();
10270 it != who.end();
10271 ++it) {
10272 if (*it == whoami) continue;
10273 //if (*it == except) continue;
10274 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10275
10276 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10277 dir->dirfrag(),
10278 dir->dir_rep,
10279 dir->dir_rep_by,
10280 path,
10281 bcast),
10282 *it);
10283 }
10284
10285 return 0;
10286 }
10287
10288 /* This function DOES put the passed message before returning */
10289 void MDCache::handle_dir_update(MDirUpdate *m)
10290 {
10291 CDir *dir = get_dirfrag(m->get_dirfrag());
10292 if (!dir) {
10293 dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << dendl;
10294
10295 // discover it?
10296 if (m->should_discover()) {
10297 // only try once!
10298 // this is key to avoid a fragtree update race, among other things.
10299 m->tried_discover();
10300 vector<CDentry*> trace;
10301 CInode *in;
10302 filepath path = m->get_path();
10303 dout(5) << "trying discover on dir_update for " << path << dendl;
10304 MDRequestRef null_ref;
10305 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10306 if (r > 0)
10307 return;
10308 assert(r == 0);
10309 open_remote_dirfrag(in, m->get_dirfrag().frag,
10310 new C_MDS_RetryMessage(mds, m));
10311 return;
10312 }
10313
10314 m->put();
10315 return;
10316 }
10317
10318 // update
10319 dout(5) << "dir_update on " << *dir << dendl;
10320 dir->dir_rep = m->get_dir_rep();
10321 dir->dir_rep_by = m->get_dir_rep_by();
10322
10323 // done
10324 m->put();
10325 }
10326
10327
10328
10329
10330
10331 // LINK
10332
10333 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10334 {
10335 dout(7) << "send_dentry_link " << *dn << dendl;
10336
10337 CDir *subtree = get_subtree_root(dn->get_dir());
10338 for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
10339 p != dn->replicas_end();
10340 ++p) {
10341 // don't tell (rename) witnesses; they already know
10342 if (mdr.get() && mdr->more()->witnessed.count(p->first))
10343 continue;
10344 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
10345 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
10346 rejoin_gather.count(p->first)))
10347 continue;
10348 CDentry::linkage_t *dnl = dn->get_linkage();
10349 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10350 dn->name, dnl->is_primary());
10351 if (dnl->is_primary()) {
10352 dout(10) << " primary " << *dnl->get_inode() << dendl;
10353 replicate_inode(dnl->get_inode(), p->first, m->bl,
10354 mds->mdsmap->get_up_features());
10355 } else if (dnl->is_remote()) {
10356 inodeno_t ino = dnl->get_remote_ino();
10357 __u8 d_type = dnl->get_remote_d_type();
10358 dout(10) << " remote " << ino << " " << d_type << dendl;
10359 ::encode(ino, m->bl);
10360 ::encode(d_type, m->bl);
10361 } else
10362 ceph_abort(); // aie, bad caller!
10363 mds->send_message_mds(m, p->first);
10364 }
10365 }
10366
10367 /* This function DOES put the passed message before returning */
10368 void MDCache::handle_dentry_link(MDentryLink *m)
10369 {
10370
10371 CDentry *dn = NULL;
10372 CDir *dir = get_dirfrag(m->get_dirfrag());
10373 if (!dir) {
10374 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10375 } else {
10376 dn = dir->lookup(m->get_dn());
10377 if (!dn) {
10378 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10379 } else {
10380 dout(7) << "handle_dentry_link on " << *dn << dendl;
10381 CDentry::linkage_t *dnl = dn->get_linkage();
10382
10383 assert(!dn->is_auth());
10384 assert(dnl->is_null());
10385 }
10386 }
10387
10388 bufferlist::iterator p = m->bl.begin();
10389 list<MDSInternalContextBase*> finished;
10390 if (dn) {
10391 if (m->get_is_primary()) {
10392 // primary link.
10393 add_replica_inode(p, dn, finished);
10394 } else {
10395 // remote link, easy enough.
10396 inodeno_t ino;
10397 __u8 d_type;
10398 ::decode(ino, p);
10399 ::decode(d_type, p);
10400 dir->link_remote_inode(dn, ino, d_type);
10401 }
10402 } else {
10403 ceph_abort();
10404 }
10405
10406 if (!finished.empty())
10407 mds->queue_waiters(finished);
10408
10409 m->put();
10410 return;
10411 }
10412
10413
10414 // UNLINK
10415
10416 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10417 {
10418 dout(10) << "send_dentry_unlink " << *dn << dendl;
10419 // share unlink news with replicas
10420 set<mds_rank_t> replicas;
10421 dn->list_replicas(replicas);
10422 if (straydn)
10423 straydn->list_replicas(replicas);
10424 for (set<mds_rank_t>::iterator it = replicas.begin();
10425 it != replicas.end();
10426 ++it) {
10427 // don't tell (rmdir) witnesses; they already know
10428 if (mdr.get() && mdr->more()->witnessed.count(*it))
10429 continue;
10430
10431 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10432 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10433 rejoin_gather.count(*it)))
10434 continue;
10435
10436 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
10437 if (straydn)
10438 replicate_stray(straydn, *it, unlink->straybl);
10439 mds->send_message_mds(unlink, *it);
10440 }
10441 }
10442
10443 /* This function DOES put the passed message before returning */
10444 void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10445 {
10446 // straydn
10447 CDentry *straydn = NULL;
10448 if (m->straybl.length())
10449 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10450
10451 CDir *dir = get_dirfrag(m->get_dirfrag());
10452 if (!dir) {
10453 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10454 } else {
10455 CDentry *dn = dir->lookup(m->get_dn());
10456 if (!dn) {
10457 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10458 } else {
10459 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10460 CDentry::linkage_t *dnl = dn->get_linkage();
10461
10462 // open inode?
10463 if (dnl->is_primary()) {
10464 CInode *in = dnl->get_inode();
10465 dn->dir->unlink_inode(dn);
10466 assert(straydn);
10467 straydn->dir->link_primary_inode(straydn, in);
10468
10469 // in->first is lazily updated on replica; drag it forward so
10470 // that we always keep it in sync with the dentry
10471 assert(straydn->first >= in->first);
10472 in->first = straydn->first;
10473
10474 // update subtree map?
10475 if (in->is_dir())
10476 adjust_subtree_after_rename(in, dir, false);
10477
10478 // send caps to auth (if we're not already)
10479 if (in->is_any_caps() &&
10480 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10481 migrator->export_caps(in);
10482
10483 touch_dentry_bottom(straydn); // move stray to end of lru
10484 straydn = NULL;
10485 } else {
10486 assert(!straydn);
10487 assert(dnl->is_remote());
10488 dn->dir->unlink_inode(dn);
10489 }
10490 assert(dnl->is_null());
10491
10492 // move to bottom of lru
10493 touch_dentry_bottom(dn);
10494 }
10495 }
10496
10497 // race with trim_dentry()
10498 if (straydn) {
10499 assert(straydn->get_num_ref() == 0);
10500 assert(straydn->get_linkage()->is_null());
10501 map<mds_rank_t, MCacheExpire*> expiremap;
10502 trim_dentry(straydn, expiremap);
10503 send_expire_messages(expiremap);
10504 }
10505
10506 m->put();
10507 return;
10508 }
10509
10510
10511
10512
10513
10514
10515 // ===================================================================
10516
10517
10518
10519 // ===================================================================
10520 // FRAGMENT
10521
10522
10523 /**
10524 * adjust_dir_fragments -- adjust fragmentation for a directory
10525 *
10526 * @param diri directory inode
10527 * @param basefrag base fragment
10528 * @param bits bit adjustment. positive for split, negative for merge.
10529 */
10530 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10531 list<CDir*>& resultfrags,
10532 list<MDSInternalContextBase*>& waiters,
10533 bool replay)
10534 {
10535 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10536 << " on " << *diri << dendl;
10537
10538 list<CDir*> srcfrags;
10539 diri->get_dirfrags_under(basefrag, srcfrags);
10540
10541 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10542 }
10543
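// Ensure a CDir for fragment fg of diri exists in cache, either by splitting
// an existing ancestor dirfrag down to fg or by merging the dirfrags under fg.
// Returns NULL if neither is possible.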
10544 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10545 {
10546 CDir *dir = diri->get_dirfrag(fg);
10547 if (dir)
10548 return dir;
10549
10550 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10551
10552 list<CDir*> src, result;
10553 list<MDSInternalContextBase*> waiters;
10554
10555 // split a parent?
10556 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10557 while (1) {
10558 CDir *pdir = diri->get_dirfrag(parent);
10559 if (pdir) {
10560 int split = fg.bits() - parent.bits();
10561 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10562 src.push_back(pdir);
10563 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10564 dir = diri->get_dirfrag(fg);
10565 if (dir) {
10566 dout(10) << "force_dir_fragment result " << *dir << dendl;
10567 break;
10568 }
10569 }
10570 if (parent == frag_t())
10571 break;
10572 frag_t last = parent;
10573 parent = parent.parent();
10574 dout(10) << " " << last << " parent is " << parent << dendl;
10575 }
10576
10577 if (!dir) {
10578 // hoover up things under fg?
10579 diri->get_dirfrags_under(fg, src);
10580 if (src.empty()) {
10581 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10582 } else {
10583 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10584 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10585 dir = result.front();
10586 dout(10) << "force_dir_fragment result " << *dir << dendl;
10587 }
10588 }
10589 if (!replay)
10590 mds->queue_waiters(waiters);
10591 return dir;
10592 }
10593
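// Core refragmentation: force the fragtree to a leaf at basefrag, then split
// the single source dirfrag into 2^bits children (bits > 0) or merge the
// source dirfrags back into basefrag (bits <= 0), fixing up the subtree map
// whenever the affected dirfrags are subtree roots.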
10594 void MDCache::adjust_dir_fragments(CInode *diri,
10595 list<CDir*>& srcfrags,
10596 frag_t basefrag, int bits,
10597 list<CDir*>& resultfrags,
10598 list<MDSInternalContextBase*>& waiters,
10599 bool replay)
10600 {
10601 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10602 << " srcfrags " << srcfrags
10603 << " on " << *diri << dendl;
10604
10605 // adjust fragtree
10606 // yuck. we may have discovered the inode while it was being fragmented.
10607 if (!diri->dirfragtree.is_leaf(basefrag))
10608 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10609
10610 if (bits > 0)
10611 diri->dirfragtree.split(basefrag, bits);
10612 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10613
10614 if (srcfrags.empty())
10615 return;
10616
10617 // split
10618 CDir *parent_dir = diri->get_parent_dir();
10619 CDir *parent_subtree = 0;
10620 if (parent_dir)
10621 parent_subtree = get_subtree_root(parent_dir);
10622
10623 if (bits > 0) {
10624 // SPLIT
10625 assert(srcfrags.size() == 1);
10626 CDir *dir = srcfrags.front();
10627
10628 dir->split(bits, resultfrags, waiters, replay);
10629
10630 // did i change the subtree map?
10631 if (dir->is_subtree_root()) {
10632 // new frags are now separate subtrees
10633 for (list<CDir*>::iterator p = resultfrags.begin();
10634 p != resultfrags.end();
10635 ++p)
10636 subtrees[*p].clear(); // new frag is now its own subtree
10637
10638 // was i a bound?
10639 if (parent_subtree) {
10640 assert(subtrees[parent_subtree].count(dir));
10641 subtrees[parent_subtree].erase(dir);
10642 for (list<CDir*>::iterator p = resultfrags.begin();
10643 p != resultfrags.end();
10644 ++p) {
10645 assert((*p)->is_subtree_root());
10646 subtrees[parent_subtree].insert(*p);
10647 }
10648 }
10649
10650 // adjust my bounds.
10651 set<CDir*> bounds;
10652 bounds.swap(subtrees[dir]);
10653 subtrees.erase(dir);
10654 for (set<CDir*>::iterator p = bounds.begin();
10655 p != bounds.end();
10656 ++p) {
10657 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10658 subtrees[frag].insert(*p);
10659 }
10660
10661 show_subtrees(10);
10662
10663 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10664 dir->dir_auth = CDIR_AUTH_DEFAULT;
10665 }
10666
10667 diri->close_dirfrag(dir->get_frag());
10668
10669 } else {
10670 // MERGE
10671
10672 // are my constituent bits subtrees? if so, i will be too.
10673 // (it's all or none, actually.)
10674 bool was_subtree = false;
10675 set<CDir*> new_bounds;
10676 for (list<CDir*>::iterator p = srcfrags.begin(); p != srcfrags.end(); ++p) {
10677 CDir *dir = *p;
10678 if (dir->is_subtree_root()) {
10679 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
10680 was_subtree = true;
10681 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10682 set<CDir*>::iterator r = q->second.begin();
10683 while (r != subtrees[dir].end()) {
10684 new_bounds.insert(*r);
10685 subtrees[dir].erase(r++);
10686 }
10687 subtrees.erase(q);
10688
10689 // remove myself as my parent's bound
10690 if (parent_subtree)
10691 subtrees[parent_subtree].erase(dir);
10692 }
10693 }
10694
10695 // merge
10696 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10697 f->merge(srcfrags, waiters, replay);
10698 diri->add_dirfrag(f);
10699
10700 if (was_subtree) {
10701 assert(f->is_subtree_root());
10702 subtrees[f].swap(new_bounds);
10703 if (parent_subtree)
10704 subtrees[parent_subtree].insert(f);
10705
10706 show_subtrees(10);
10707 }
10708
10709 resultfrags.push_back(f);
10710 }
10711 }
10712
10713
10714 class C_MDC_FragmentFrozen : public MDSInternalContext {
10715 MDCache *mdcache;
10716 MDRequestRef mdr;
10717 public:
10718 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10719 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10720 void finish(int r) override {
10721 mdcache->fragment_frozen(mdr, r);
10722 }
10723 };
10724
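// Common preconditions for split/merge: refuse if the fs is read-only, the
// cluster is degraded, the directory is a stray/mdsdir/.ceph, a scrub is in
// progress, or any target dirfrag is non-auth, bad, already fragmenting, or
// freezing/frozen.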
10725 bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10726 {
10727 if (is_readonly()) {
10728 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10729 return false;
10730 }
10731 if (mds->is_cluster_degraded()) {
10732 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10733 return false;
10734 }
10735 if (diri->get_parent_dir() &&
10736 diri->get_parent_dir()->get_inode()->is_stray()) {
10737 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10738 return false;
10739 }
10740 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10741 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10742 return false;
10743 }
10744
10745 if (diri->scrub_is_in_progress()) {
10746 dout(7) << "can_fragment: scrub in progress" << dendl;
10747 return false;
10748 }
10749
10750 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10751 CDir *dir = *p;
10752 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10753 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10754 return false;
10755 }
10756 if (!dir->is_auth()) {
10757 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10758 return false;
10759 }
10760 if (dir->is_bad()) {
10761 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10762 return false;
10763 }
10764 if (dir->is_frozen() ||
10765 dir->is_freezing()) {
10766 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10767 return false;
10768 }
10769 }
10770
10771 return true;
10772 }
10773
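// Entry point for splitting a single dirfrag by 'bits': record a
// fragment_info_t, freeze the dirfrag and kick off the mark+complete pass.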
10774 void MDCache::split_dir(CDir *dir, int bits)
10775 {
10776 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10777 assert(dir->is_auth());
10778 CInode *diri = dir->inode;
10779
10780 list<CDir*> dirs;
10781 dirs.push_back(dir);
10782
10783 if (!can_fragment(diri, dirs)) {
10784 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10785 return;
10786 }
10787
10788 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10789 mdr->more()->fragment_base = dir->dirfrag();
10790
10791 assert(fragments.count(dir->dirfrag()) == 0);
10792 fragment_info_t& info = fragments[dir->dirfrag()];
10793 info.mdr = mdr;
10794 info.dirs.push_back(dir);
10795 info.bits = bits;
10796 info.last_cum_auth_pins_change = ceph_clock_now();
10797
10798 fragment_freeze_dirs(dirs);
10799 // initial mark+complete pass
10800 fragment_mark_and_complete(mdr);
10801 }
10802
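// Entry point for merging all dirfrags under 'frag' back into one; the bit
// count is taken from the first source frag and stored negated in
// fragment_info_t.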
10803 void MDCache::merge_dir(CInode *diri, frag_t frag)
10804 {
10805 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10806
10807 list<CDir*> dirs;
10808 if (!diri->get_dirfrags_under(frag, dirs)) {
10809 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10810 return;
10811 }
10812
10813 if (diri->dirfragtree.is_leaf(frag)) {
10814 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10815 return;
10816 }
10817
10818 if (!can_fragment(diri, dirs))
10819 return;
10820
10821 CDir *first = dirs.front();
10822 int bits = first->get_frag().bits() - frag.bits();
10823 dout(10) << " we are merging by " << bits << " bits" << dendl;
10824
10825 dirfrag_t basedirfrag(diri->ino(), frag);
10826 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10827 mdr->more()->fragment_base = basedirfrag;
10828
10829 assert(fragments.count(basedirfrag) == 0);
10830 fragment_info_t& info = fragments[basedirfrag];
10831 info.mdr = mdr;
10832 info.dirs = dirs;
10833 info.bits = -bits;
10834 info.last_cum_auth_pins_change = ceph_clock_now();
10835
10836 fragment_freeze_dirs(dirs);
10837 // initial mark+complete pass
10838 fragment_mark_and_complete(mdr);
10839 }
10840
10841 void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10842 {
10843 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10844 CDir *dir = *p;
10845 dir->auth_pin(dir); // until we mark and complete them
10846 dir->state_set(CDir::STATE_FRAGMENTING);
10847 dir->freeze_dir();
10848 assert(dir->is_freezing_dir());
10849 }
10850 }
10851
10852 class C_MDC_FragmentMarking : public MDCacheContext {
10853 MDRequestRef mdr;
10854 public:
10855 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10856 void finish(int r) override {
10857 mdcache->fragment_mark_and_complete(mdr);
10858 }
10859 };
10860
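// Make every source dirfrag complete in memory (fetching, or committing brand
// new frags first), pin each dentry with PIN_FRAGMENTING, then wait for the
// freezes to finish before proceeding to fragment_frozen().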
10861 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10862 {
10863 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10864 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10865 if (it == fragments.end() || it->second.mdr != mdr) {
10866 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10867 request_finish(mdr);
10868 return;
10869 }
10870
10871 fragment_info_t& info = it->second;
10872 CInode *diri = info.dirs.front()->get_inode();
10873 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10874
10875 MDSGatherBuilder gather(g_ceph_context);
10876
10877 for (list<CDir*>::iterator p = info.dirs.begin();
10878 p != info.dirs.end();
10879 ++p) {
10880 CDir *dir = *p;
10881
10882 bool ready = true;
10883 if (!dir->is_complete()) {
10884 dout(15) << " fetching incomplete " << *dir << dendl;
10885 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10886 ready = false;
10887 } else if (dir->get_frag() == frag_t()) {
10888 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
10889 // the operation. To avoid CDir::fetch() complaining about missing object,
10890 // we commit new dirfrag first.
10891 if (dir->state_test(CDir::STATE_CREATING)) {
10892 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10893 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10894 ready = false;
10895 } else if (dir->is_new()) {
10896 dout(15) << " committing new " << *dir << dendl;
10897 assert(dir->is_dirty());
10898 dir->commit(0, gather.new_sub(), true);
10899 ready = false;
10900 }
10901 }
10902 if (!ready)
10903 continue;
10904
10905 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10906 dout(15) << " marking " << *dir << dendl;
10907 for (CDir::map_t::iterator p = dir->items.begin();
10908 p != dir->items.end();
10909 ++p) {
10910 CDentry *dn = p->second;
10911 dn->get(CDentry::PIN_FRAGMENTING);
10912 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
10913 dn->state_set(CDentry::STATE_FRAGMENTING);
10914 }
10915 dir->state_set(CDir::STATE_DNPINNEDFRAG);
10916 dir->auth_unpin(dir);
10917 } else {
10918 dout(15) << " already marked " << *dir << dendl;
10919 }
10920 }
10921 if (gather.has_subs()) {
10922 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
10923 gather.activate();
10924 return;
10925 }
10926
10927 for (list<CDir*>::iterator p = info.dirs.begin();
10928 p != info.dirs.end();
10929 ++p) {
10930 CDir *dir = *p;
10931 if (!dir->is_frozen_dir()) {
10932 assert(dir->is_freezing_dir());
10933 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
10934 }
10935 }
10936 if (gather.has_subs()) {
10937 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
10938 gather.activate();
10939 // flush log so that request auth_pins are retired
10940 mds->mdlog->flush();
10941 return;
10942 }
10943
10944 fragment_frozen(mdr, 0);
10945 }
10946
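// Abort/cleanup path: clear STATE_FRAGMENTING, unpin any dentries pinned while
// marking (or drop the dir auth_pin if marking never happened) and unfreeze.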
10947 void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
10948 {
10949 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
10950 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10951 CDir *dir = *p;
10952 dout(10) << " frag " << *dir << dendl;
10953
10954 assert(dir->state_test(CDir::STATE_FRAGMENTING));
10955 dir->state_clear(CDir::STATE_FRAGMENTING);
10956
10957 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10958 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
10959
10960 for (CDir::map_t::iterator p = dir->items.begin();
10961 p != dir->items.end();
10962 ++p) {
10963 CDentry *dn = p->second;
10964 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
10965 dn->state_clear(CDentry::STATE_FRAGMENTING);
10966 dn->put(CDentry::PIN_FRAGMENTING);
10967 }
10968 } else {
10969 dir->auth_unpin(dir);
10970 }
10971
10972 dir->unfreeze_dir();
10973 }
10974 }
10975
10976 bool MDCache::fragment_are_all_frozen(CDir *dir)
10977 {
10978 assert(dir->is_frozen_dir());
10979 map<dirfrag_t,fragment_info_t>::iterator p;
10980 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
10981 p != fragments.end() && p->first.ino == dir->ino();
10982 ++p) {
10983 if (p->first.frag.contains(dir->get_frag()))
10984 return p->second.all_frozen;
10985 }
10986 ceph_abort();
10987 return false;
10988 }
10989
10990 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
10991 {
10992 map<dirfrag_t,fragment_info_t>::iterator p;
10993 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
10994 p != fragments.end() && p->first.ino == dir->ino();
10995 ++p) {
10996 if (p->first.frag.contains(dir->get_frag())) {
10997 p->second.num_remote_waiters++;
10998 return;
10999 }
11000 }
11001 ceph_abort();
11002 }
11003
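// Cancel fragment operations whose freeze has stalled: if the cumulative auth
// pin count has not changed for mds_freeze_tree_timeout and a remote waiter
// exists (or the parent dir is freezing), unmark/unfreeze and drop the op.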
11004 void MDCache::find_stale_fragment_freeze()
11005 {
11006 dout(10) << "find_stale_fragment_freeze" << dendl;
11007 // see comment in Migrator::find_stale_export_freeze()
11008 utime_t now = ceph_clock_now();
11009 utime_t cutoff = now;
11010 cutoff -= g_conf->mds_freeze_tree_timeout;
11011
11012 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11013 p != fragments.end(); ) {
11014 dirfrag_t df = p->first;
11015 fragment_info_t& info = p->second;
11016 ++p;
11017 if (info.all_frozen)
11018 continue;
11019 CDir *dir;
11020 int total_auth_pins = 0;
11021 for (list<CDir*>::iterator q = info.dirs.begin();
11022 q != info.dirs.end();
11023 ++q) {
11024 dir = *q;
11025 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11026 total_auth_pins = -1;
11027 break;
11028 }
11029 if (dir->is_frozen_dir())
11030 continue;
11031 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11032 }
11033 if (total_auth_pins < 0)
11034 continue;
11035 if (info.last_cum_auth_pins != total_auth_pins) {
11036 info.last_cum_auth_pins = total_auth_pins;
11037 info.last_cum_auth_pins_change = now;
11038 continue;
11039 }
11040 if (info.last_cum_auth_pins_change >= cutoff)
11041 continue;
11042 dir = info.dirs.front();
11043 if (info.num_remote_waiters > 0 ||
11044 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11045 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11046 list<CDir*> dirs;
11047 info.dirs.swap(dirs);
11048 fragments.erase(df);
11049 fragment_unmark_unfreeze_dirs(dirs);
11050 }
11051 }
11052 }
11053
11054 class C_MDC_FragmentPrep : public MDCacheLogContext {
11055 MDRequestRef mdr;
11056 public:
11057 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11058 void finish(int r) override {
11059 mdcache->_fragment_logged(mdr);
11060 }
11061 };
11062
11063 class C_MDC_FragmentStore : public MDCacheContext {
11064 MDRequestRef mdr;
11065 public:
11066 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11067 void finish(int r) override {
11068 mdcache->_fragment_stored(mdr);
11069 }
11070 };
11071
11072 class C_MDC_FragmentCommit : public MDCacheLogContext {
11073 dirfrag_t basedirfrag;
11074 list<CDir*> resultfrags;
11075 public:
11076 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11077 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11078 void finish(int r) override {
11079 mdcache->_fragment_committed(basedirfrag, resultfrags);
11080 }
11081 };
11082
11083 class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11084 dirfrag_t basedirfrag;
11085 list<CDir*> resultfrags;
11086 public:
11087 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11088 MDCacheIOContext(m), basedirfrag(f) {
11089 resultfrags.swap(l);
11090 }
11091 void finish(int r) override {
11092 assert(r == 0 || r == -ENOENT);
11093 mdcache->_fragment_finish(basedirfrag, resultfrags);
11094 }
11095 };
11096
11097 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11098 {
11099 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11100 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11101 if (it == fragments.end() || it->second.mdr != mdr) {
11102 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11103 request_finish(mdr);
11104 return;
11105 }
11106
11107 assert(r == 0);
11108 fragment_info_t& info = it->second;
11109 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11110 << " on " << info.dirs.front()->get_inode() << dendl;
11111
11112 info.all_frozen = true;
11113 dispatch_fragment_dir(mdr);
11114 }
11115
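// All dirfrags are frozen: take wrlocks on dirfragtreelock/nestlock/filelock,
// journal EFragment OP_PREPARE with per-frag rollback data, apply the
// refragmentation in cache and record it as an uncommitted fragment.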
11116 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11117 {
11118 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11119 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11120 if (it == fragments.end() || it->second.mdr != mdr) {
11121 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11122 request_finish(mdr);
11123 return;
11124 }
11125
11126 fragment_info_t& info = it->second;
11127 CInode *diri = info.dirs.front()->get_inode();
11128
11129 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11130 << " on " << *diri << dendl;
11131 if (!mdr->aborted) {
11132 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11133 wrlocks.insert(&diri->dirfragtreelock);
11134 // prevent a racing gather on any other scatterlocks too
11135 wrlocks.insert(&diri->nestlock);
11136 wrlocks.insert(&diri->filelock);
11137 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11138 if (!mdr->aborted)
11139 return;
11140 }
11141
11142 if (mdr->aborted) {
11143 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11144 << info.dirs.front()->dirfrag() << dendl;
11145 if (info.bits > 0)
11146 mds->balancer->queue_split(info.dirs.front(), false);
11147 else
11148 mds->balancer->queue_merge(info.dirs.front());
11149 fragment_unmark_unfreeze_dirs(info.dirs);
11150 fragments.erase(it);
11151 request_finish(mdr);
11152 return;
11153 }
11154
11155 mdr->ls = mds->mdlog->get_current_segment();
11156 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11157 mds->mdlog->start_entry(le);
11158
11159 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11160 CDir *dir = *p;
11161 dirfrag_rollback rollback;
11162 rollback.fnode = dir->fnode;
11163 le->add_orig_frag(dir->get_frag(), &rollback);
11164 }
11165
11166 // refragment
11167 list<MDSInternalContextBase*> waiters;
11168 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11169 info.resultfrags, waiters, false);
11170 if (g_conf->mds_debug_frag)
11171 diri->verify_dirfrags();
11172 mds->queue_waiters(waiters);
11173
11174 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11175 assert(!diri->dirfragtree.is_leaf(*p));
11176
11177 le->metablob.add_dir_context(*info.resultfrags.begin());
11178 for (list<CDir*>::iterator p = info.resultfrags.begin();
11179 p != info.resultfrags.end();
11180 ++p) {
11181 if (diri->is_auth()) {
11182 le->metablob.add_fragmented_dir(*p, false, false);
11183 } else {
11184 (*p)->state_set(CDir::STATE_DIRTYDFT);
11185 le->metablob.add_fragmented_dir(*p, false, true);
11186 }
11187 }
11188
11189 // dft lock
11190 if (diri->is_auth()) {
11191 // journal dirfragtree
11192 inode_t *pi = diri->project_inode();
11193 pi->version = diri->pre_dirty();
11194 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11195 } else {
11196 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11197 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11198 mdr->add_updated_lock(&diri->dirfragtreelock);
11199 }
11200
11201 /*
11202 // filelock
11203 mds->locker->mark_updated_scatterlock(&diri->filelock);
11204 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11205 mut->add_updated_lock(&diri->filelock);
11206
11207 // dirlock
11208 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11209 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11210 mut->add_updated_lock(&diri->nestlock);
11211 */
11212
11213 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11214 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11215 mdr, __func__);
11216 mds->mdlog->flush();
11217 }
11218
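// OP_PREPARE has been journaled: apply the projected inode/scatterlock state,
// then store each resulting dirfrag (auth-pinned and marked FRAGMENTING).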
11219 void MDCache::_fragment_logged(MDRequestRef& mdr)
11220 {
11221 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11222 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11223 assert(it != fragments.end());
11224 fragment_info_t &info = it->second;
11225 CInode *diri = info.resultfrags.front()->get_inode();
11226
11227 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11228 << " on " << *diri << dendl;
11229
11230 if (diri->is_auth())
11231 diri->pop_and_dirty_projected_inode(mdr->ls);
11232
11233 mdr->apply(); // mark scatterlock
11234
11235 // store resulting frags
11236 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11237
11238 for (list<CDir*>::iterator p = info.resultfrags.begin();
11239 p != info.resultfrags.end();
11240 ++p) {
11241 CDir *dir = *p;
11242 dout(10) << " storing result frag " << *dir << dendl;
11243
11244 // freeze and store them too
11245 dir->auth_pin(this);
11246 dir->state_set(CDir::STATE_FRAGMENTING);
11247 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11248 }
11249
11250 gather.activate();
11251 }
11252
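// Resulting dirfrags are stored: replicate them to peers via
// MMDSFragmentNotify, journal EFragment OP_COMMIT, then drop locks, unpin the
// dentries and unfreeze the result frags.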
11253 void MDCache::_fragment_stored(MDRequestRef& mdr)
11254 {
11255 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11256 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11257 assert(it != fragments.end());
11258 fragment_info_t &info = it->second;
11259 CInode *diri = info.resultfrags.front()->get_inode();
11260
11261 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11262 << " on " << *diri << dendl;
11263
11264 // tell peers
11265 CDir *first = *info.resultfrags.begin();
11266 for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
11267 p != first->replicas_end();
11268 ++p) {
11269 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
11270 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
11271 rejoin_gather.count(p->first)))
11272 continue;
11273
11274 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11275
11276 // freshly replicate new dirs to peers
11277 for (list<CDir*>::iterator q = info.resultfrags.begin();
11278 q != info.resultfrags.end();
11279 ++q)
11280 replicate_dir(*q, p->first, notify->basebl);
11281
11282 mds->send_message_mds(notify, p->first);
11283 }
11284
11285 // journal commit
11286 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11287 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11288 info.resultfrags));
11289
11290 mds->locker->drop_locks(mdr.get());
11291
11292 // unfreeze resulting frags
11293 for (list<CDir*>::iterator p = info.resultfrags.begin();
11294 p != info.resultfrags.end();
11295 ++p) {
11296 CDir *dir = *p;
11297 dout(10) << " result frag " << *dir << dendl;
11298
11299 for (CDir::map_t::iterator p = dir->items.begin();
11300 p != dir->items.end();
11301 ++p) {
11302 CDentry *dn = p->second;
11303 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11304 dn->state_clear(CDentry::STATE_FRAGMENTING);
11305 dn->put(CDentry::PIN_FRAGMENTING);
11306 }
11307
11308 // unfreeze
11309 dir->unfreeze_dir();
11310 }
11311
11312 fragments.erase(it);
11313 request_finish(mdr);
11314 }
11315
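// OP_COMMIT is durable: remove the old dirfrag objects from the metadata pool
// (the base/backtrace object is truncated instead of removed); the gathered
// completion runs _fragment_finish().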
11316 void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11317 {
11318 dout(10) << "fragment_committed " << basedirfrag << dendl;
11319 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11320 assert(it != uncommitted_fragments.end());
11321 ufragment &uf = it->second;
11322
11323 // remove old frags
11324 C_GatherBuilder gather(
11325 g_ceph_context,
11326 new C_OnFinisher(
11327 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11328 mds->finisher));
11329
11330 SnapContext nullsnapc;
11331 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11332 for (list<frag_t>::iterator p = uf.old_frags.begin();
11333 p != uf.old_frags.end();
11334 ++p) {
11335 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11336 ObjectOperation op;
11337 if (*p == frag_t()) {
11338 // backtrace object
11339 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11340 op.truncate(0);
11341 op.omap_clear();
11342 } else {
11343 dout(10) << " removing orphan dirfrag " << oid << dendl;
11344 op.remove();
11345 }
11346 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11347 ceph::real_clock::now(),
11348 0, gather.new_sub());
11349 }
11350
11351 assert(gather.has_subs());
11352 gather.activate();
11353 }
11354
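// Old dirfrag objects are gone: unmark and auth_unpin the result frags, let
// the balancer re-split them if they grew while frozen, bump the split/merge
// counters and journal EFragment OP_FINISH.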
11355 void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11356 {
11357 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11358 << resultfrags.size() << dendl;
11359 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11360 assert(it != uncommitted_fragments.end());
11361 ufragment &uf = it->second;
11362
11363 // unmark & auth_unpin
11364 for (const auto &dir : resultfrags) {
11365 dir->state_clear(CDir::STATE_FRAGMENTING);
11366 dir->auth_unpin(this);
11367
11368 // In case the resulting fragments are beyond the split size,
11369 // we might need to split them again right away (they could
11370 // have been taking inserts between unfreezing and getting
11371 // here)
11372 mds->balancer->maybe_fragment(dir, false);
11373 }
11374
11375 if (mds->logger) {
11376 if (resultfrags.size() > 1) {
11377 mds->logger->inc(l_mds_dir_split);
11378 } else {
11379 mds->logger->inc(l_mds_dir_merge);
11380 }
11381 }
11382
11383 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11384 mds->mdlog->start_submit_entry(le);
11385
11386 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11387 }
11388
11389 /* This function DOES put the passed message before returning */
11390 void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11391 {
11392 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11393
11394 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11395 notify->put();
11396 return;
11397 }
11398
11399 CInode *diri = get_inode(notify->get_ino());
11400 if (diri) {
11401 frag_t base = notify->get_basefrag();
11402 int bits = notify->get_bits();
11403
11404 /*
11405 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11406 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11407 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11408 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11409 notify->put();
11410 return;
11411 }
11412 */
11413
11414 // refragment
11415 list<MDSInternalContextBase*> waiters;
11416 list<CDir*> resultfrags;
11417 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11418 if (g_conf->mds_debug_frag)
11419 diri->verify_dirfrags();
11420
11421 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11422 diri->take_dir_waiting((*p)->get_frag(), waiters);
11423
11424 // add values for the new replica dirs
11425 bufferlist::iterator p = notify->basebl.begin();
11426 while (!p.end())
11427 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11428
11429 mds->queue_waiters(waiters);
11430 } else {
11431 ceph_abort();
11432 }
11433
11434 notify->put();
11435 }
11436
11437 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11438 LogSegment *ls, bufferlist *rollback)
11439 {
11440 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11441 assert(!uncommitted_fragments.count(basedirfrag));
11442 ufragment& uf = uncommitted_fragments[basedirfrag];
11443 uf.old_frags = old_frags;
11444 uf.bits = bits;
11445 uf.ls = ls;
11446 ls->uncommitted_fragments.insert(basedirfrag);
11447 if (rollback)
11448 uf.rollback.swap(*rollback);
11449 }
11450
11451 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11452 {
11453 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11454 << " op " << EFragment::op_name(op) << dendl;
11455 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11456 if (it != uncommitted_fragments.end()) {
11457 ufragment& uf = it->second;
11458 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11459 uf.committed = true;
11460 } else {
11461 uf.ls->uncommitted_fragments.erase(basedirfrag);
11462 mds->queue_waiters(uf.waiters);
11463 uncommitted_fragments.erase(it);
11464 }
11465 }
11466 }
11467
11468 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11469 {
11470 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11471 << " old_frags (" << old_frags << ")" << dendl;
11472 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11473 if (it != uncommitted_fragments.end()) {
11474 ufragment& uf = it->second;
11475 if (!uf.old_frags.empty()) {
11476 uf.old_frags.swap(old_frags);
11477 uf.committed = true;
11478 } else {
11479 uf.ls->uncommitted_fragments.erase(basedirfrag);
11480 uncommitted_fragments.erase(it);
11481 }
11482 }
11483 }
11484
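// Undo prepared-but-uncommitted fragment operations: for entries whose commit
// was already journaled just redo the old-object cleanup; otherwise restore
// the original frags from the rollback data (or by merging, for old-format
// entries), journal EFragment OP_ROLLBACK and finish via _fragment_committed().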
11485 void MDCache::rollback_uncommitted_fragments()
11486 {
11487 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11488 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11489 p != uncommitted_fragments.end();
11490 ++p) {
11491 ufragment &uf = p->second;
11492 CInode *diri = get_inode(p->first.ino);
11493 assert(diri);
11494
11495 if (uf.committed) {
11496 list<CDir*> frags;
11497 diri->get_dirfrags_under(p->first.frag, frags);
11498 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11499 CDir *dir = *q;
11500 dir->auth_pin(this);
11501 dir->state_set(CDir::STATE_FRAGMENTING);
11502 }
11503 _fragment_committed(p->first, frags);
11504 continue;
11505 }
11506
11507 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11508
11509 LogSegment *ls = mds->mdlog->get_current_segment();
11510 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11511 mds->mdlog->start_entry(le);
11512 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11513
11514 list<frag_t> old_frags;
11515 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11516
11517 list<CDir*> resultfrags;
11518 if (uf.old_frags.empty()) {
11519 // created by old format EFragment
11520 list<MDSInternalContextBase*> waiters;
11521 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11522 } else {
11523 bufferlist::iterator bp = uf.rollback.begin();
11524 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11525 CDir *dir = force_dir_fragment(diri, *q);
11526 resultfrags.push_back(dir);
11527
11528 dirfrag_rollback rollback;
11529 ::decode(rollback, bp);
11530
11531 dir->set_version(rollback.fnode.version);
11532 dir->fnode = rollback.fnode;
11533
11534 dir->_mark_dirty(ls);
11535
11536 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11537 dout(10) << " dirty nestinfo on " << *dir << dendl;
11538 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11539 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11540 }
11541 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11542 dout(10) << " dirty fragstat on " << *dir << dendl;
11543 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11544 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11545 }
11546
11547 le->add_orig_frag(dir->get_frag());
11548 le->metablob.add_dir_context(dir);
11549 if (diri_auth) {
11550 le->metablob.add_fragmented_dir(dir, true, false);
11551 } else {
11552 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11553 dir->state_set(CDir::STATE_DIRTYDFT);
11554 le->metablob.add_fragmented_dir(dir, true, true);
11555 }
11556 }
11557 }
11558
11559 if (diri_auth) {
11560 diri->project_inode()->version = diri->pre_dirty();
11561 diri->pop_and_dirty_projected_inode(ls); // hacky
11562 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11563 } else {
11564 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11565 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11566 }
11567
11568 if (g_conf->mds_debug_frag)
11569 diri->verify_dirfrags();
11570
11571 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11572 assert(!diri->dirfragtree.is_leaf(*q));
11573
11574 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11575 CDir *dir = *q;
11576 dir->auth_pin(this);
11577 dir->state_set(CDir::STATE_FRAGMENTING);
11578 }
11579
11580 mds->mdlog->submit_entry(le);
11581
11582 uf.old_frags.swap(old_frags);
11583 _fragment_committed(p->first, resultfrags);
11584 }
11585 }
11586
11587 void MDCache::force_readonly()
11588 {
11589 if (is_readonly())
11590 return;
11591
11592 dout(1) << "force file system read-only" << dendl;
11593 mds->clog->warn() << "force file system read-only";
11594
11595 set_readonly();
11596
11597 mds->server->force_clients_readonly();
11598
11599 // revoke write caps
11600 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
11601 p != inode_map.end();
11602 ++p) {
11603 CInode *in = p->second;
11604 if (in->is_head())
11605 mds->locker->eval(in, CEPH_CAP_LOCKS);
11606 }
11607
11608 mds->mdlog->flush();
11609 }
11610
11611
11612 // ==============================================================
11613 // debug crap
11614
11615 void MDCache::show_subtrees(int dbl)
11616 {
11617 if (g_conf->mds_thrash_exports)
11618 dbl += 15;
11619
11620 //dout(10) << "show_subtrees" << dendl;
11621
11622 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11623 return; // i won't print anything.
11624
11625 if (subtrees.empty()) {
11626 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11627 return;
11628 }
11629
11630 // root frags
11631 list<CDir*> basefrags;
11632 for (set<CInode*>::iterator p = base_inodes.begin();
11633 p != base_inodes.end();
11634 ++p)
11635 (*p)->get_dirfrags(basefrags);
11636 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11637 dout(15) << "show_subtrees" << dendl;
11638
11639 // queue stuff
11640 list<pair<CDir*,int> > q;
11641 string indent;
11642 set<CDir*> seen;
11643
11644 // calc max depth
11645 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11646 q.push_back(pair<CDir*,int>(*p, 0));
11647
11648 set<CDir*> subtrees_seen;
11649
11650 int depth = 0;
11651 while (!q.empty()) {
11652 CDir *dir = q.front().first;
11653 int d = q.front().second;
11654 q.pop_front();
11655
11656 if (subtrees.count(dir) == 0) continue;
11657
11658 subtrees_seen.insert(dir);
11659
11660 if (d > depth) depth = d;
11661
11662 // sanity check
11663 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11664 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11665 assert(seen.count(dir) == 0);
11666 seen.insert(dir);
11667
11668 // nested items?
11669 if (!subtrees[dir].empty()) {
11670 for (set<CDir*>::iterator p = subtrees[dir].begin();
11671 p != subtrees[dir].end();
11672 ++p) {
11673 //dout(25) << " saw sub " << **p << dendl;
11674 q.push_front(pair<CDir*,int>(*p, d+1));
11675 }
11676 }
11677 }
11678
11679
11680 // print tree
11681 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11682 q.push_back(pair<CDir*,int>(*p, 0));
11683
11684 while (!q.empty()) {
11685 CDir *dir = q.front().first;
11686 int d = q.front().second;
11687 q.pop_front();
11688
11689 if (subtrees.count(dir) == 0) continue;
11690
11691 // adjust indenter
11692 while ((unsigned)d < indent.size())
11693 indent.resize(d);
11694
11695 // pad
11696 string pad = "______________________________________";
11697 pad.resize(depth*2+1-indent.size());
11698 if (!subtrees[dir].empty())
11699 pad[0] = '.'; // parent
11700
11701
11702 string auth;
11703 if (dir->is_auth())
11704 auth = "auth ";
11705 else
11706 auth = " rep ";
11707
11708 char s[10];
11709 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11710 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11711 else
11712 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11713
11714 // print
11715 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11716
11717 if (dir->ino() == MDS_INO_ROOT)
11718 assert(dir->inode == root);
11719 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11720 assert(dir->inode == myin);
11721 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11722 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11723
11724 // nested items?
11725 if (!subtrees[dir].empty()) {
11726 // more at my level?
11727 if (!q.empty() && q.front().second == d)
11728 indent += "| ";
11729 else
11730 indent += " ";
11731
11732 for (set<CDir*>::iterator p = subtrees[dir].begin();
11733 p != subtrees[dir].end();
11734 ++p)
11735 q.push_front(pair<CDir*,int>(*p, d+2));
11736 }
11737 }
11738
11739 // verify there isn't stray crap in subtree map
11740 int lost = 0;
11741 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11742 p != subtrees.end();
11743 ++p) {
11744 if (subtrees_seen.count(p->first)) continue;
11745 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11746 lost++;
11747 }
11748 assert(lost == 0);
11749 }
11750
11751
11752 void MDCache::show_cache()
11753 {
11754 dout(7) << "show_cache" << dendl;
11755
11756 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11757 it != inode_map.end();
11758 ++it) {
11759 // unlinked?
11760 if (!it->second->parent)
11761 dout(7) << " unlinked " << *it->second << dendl;
11762
11763 // dirfrags?
11764 list<CDir*> dfs;
11765 it->second->get_dirfrags(dfs);
11766 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11767 CDir *dir = *p;
11768 dout(7) << " dirfrag " << *dir << dendl;
11769
11770 for (CDir::map_t::iterator p = dir->items.begin();
11771 p != dir->items.end();
11772 ++p) {
11773 CDentry *dn = p->second;
11774 dout(7) << " dentry " << *dn << dendl;
11775 CDentry::linkage_t *dnl = dn->get_linkage();
11776 if (dnl->is_primary() && dnl->get_inode())
11777 dout(7) << " inode " << *dnl->get_inode() << dendl;
11778 }
11779 }
11780 }
11781 }
11782
11783 void MDCache::dump_cache(std::string const &file_name)
11784 {
11785 dump_cache(file_name.c_str(), NULL);
11786 }
11787
11788 void MDCache::dump_cache(Formatter *f)
11789 {
11790 dump_cache(NULL, f);
11791 }
11792
11793 void MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
11794 {
11795 dump_cache(NULL, f, dump_root, depth);
11796 }
11797
11798 /**
11799 * Dump the metadata cache, either to a Formatter, if
11800 * provided, else to a plain text file.
11801 */
11802 void MDCache::dump_cache(const char *fn, Formatter *f,
11803 const string& dump_root, int depth)
11804 {
11805 int r = 0;
11806 int fd = -1;
11807
11808 if (f) {
11809 f->open_array_section("inodes");
11810 } else {
11811 char deffn[200];
11812 if (!fn) {
11813 snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11814 fn = deffn;
11815 }
11816
11817 dout(1) << "dump_cache to " << fn << dendl;
11818
11819 fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
11820 if (fd < 0) {
11821 derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
11822 return;
11823 }
11824 }
11825
11826 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11827 it != inode_map.end();
11828 ++it) {
11829 CInode *in = it->second;
11830
11831 if (!dump_root.empty()) {
11832 string ipath;
11833 if (in->is_root())
11834 ipath = "/";
11835 else
11836 in->make_path_string(ipath);
11837
11838 if (dump_root.length() > ipath.length() ||
11839 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11840 continue;
11841
11842 if (depth >= 0 &&
11843 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11844 continue;
11845 }
11846
11847 if (f) {
11848 f->open_object_section("inode");
11849 in->dump(f);
11850 } else {
11851 ostringstream ss;
11852 ss << *in << std::endl;
11853 std::string s = ss.str();
11854 r = safe_write(fd, s.c_str(), s.length());
11855 if (r < 0) {
11856 goto out;
11857 }
11858 }
11859
11860 list<CDir*> dfs;
11861 in->get_dirfrags(dfs);
11862 if (f) {
11863 f->open_array_section("dirfrags");
11864 }
11865 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11866 CDir *dir = *p;
11867 if (f) {
11868 f->open_object_section("dir");
11869 dir->dump(f);
11870 } else {
11871 ostringstream tt;
11872 tt << " " << *dir << std::endl;
11873 string t = tt.str();
11874 r = safe_write(fd, t.c_str(), t.length());
11875 if (r < 0) {
11876 goto out;
11877 }
11878 }
11879
11880 if (f) {
11881 f->open_array_section("dentries");
11882 }
11883 for (CDir::map_t::iterator q = dir->items.begin();
11884 q != dir->items.end();
11885 ++q) {
11886 CDentry *dn = q->second;
11887 if (f) {
11888 f->open_object_section("dentry");
11889 dn->dump(f);
11890 f->close_section();
11891 } else {
11892 ostringstream uu;
11893 uu << " " << *dn << std::endl;
11894 string u = uu.str();
11895 r = safe_write(fd, u.c_str(), u.length());
11896 if (r < 0) {
11897 goto out;
11898 }
11899 }
11900 }
11901 if (f) {
11902 f->close_section(); //dentries
11903 }
11904 dir->check_rstats();
11905 if (f) {
11906 f->close_section(); //dir
11907 }
11908 }
11909 if (f) {
11910 f->close_section(); // dirfrags
11911 }
11912
11913 if (f) {
11914 f->close_section(); // inode
11915 }
11916 }
11917
11918 out:
11919 if (f) {
11920 f->close_section(); // inodes
11921 } else {
11922 ::close(fd);
11923 }
11924 }
11925
11926
11927
11928 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
11929 : MDSInternalContext(c->mds), cache(c), mdr(r)
11930 {}
11931
11932 void C_MDS_RetryRequest::finish(int r)
11933 {
11934 mdr->retry++;
11935 cache->dispatch_request(mdr);
11936 }
11937
11938
11939 class C_MDS_EnqueueScrub : public Context
11940 {
11941 Formatter *formatter;
11942 Context *on_finish;
11943 public:
11944 ScrubHeaderRef header;
11945 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
11946 formatter(f), on_finish(fin), header(nullptr) {}
11947
11948 Context *take_finisher() {
11949 Context *fin = on_finish;
11950 on_finish = NULL;
11951 return fin;
11952 }
11953
11954 void finish(int r) override {
11955 if (r < 0) { // we failed the lookup or something; dump ourselves
11956 formatter->open_object_section("results");
11957 formatter->dump_int("return_code", r);
11958 formatter->close_section(); // results
11959 }
11960 if (on_finish)
11961 on_finish->complete(r);
11962 }
11963 };
11964
11965 void MDCache::enqueue_scrub(
11966 const string& path,
11967 const std::string &tag,
11968 bool force, bool recursive, bool repair,
11969 Formatter *f, Context *fin)
11970 {
11971 dout(10) << __func__ << " " << path << dendl;
11972 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
11973 filepath fp(path.c_str());
11974 mdr->set_filepath(fp);
11975
11976 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
11977 cs->header = std::make_shared<ScrubHeader>(
11978 tag, force, recursive, repair, f);
11979
11980 mdr->internal_op_finish = cs;
11981 enqueue_scrub_work(mdr);
11982 }
11983
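// Resolve the path, take rdlocks and hand the inode to the ScrubStack:
// non-recursive scrubs go on top with the completion attached, recursive
// scrubs go on the bottom without blocking the caller.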
11984 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
11985 {
11986 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11987 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
11988 if (NULL == in)
11989 return;
11990
11991 // TODO: Remove this restriction
11992 assert(in->is_auth());
11993
11994 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
11995 if (!locked)
11996 return;
11997
11998 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
11999 ScrubHeaderRef &header = cs->header;
12000
12001 // Cannot scrub same dentry twice at same time
12002 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12003 mds->server->respond_to_request(mdr, -EBUSY);
12004 return;
12005 } else {
12006 in->scrub_info();
12007 }
12008
12009 header->set_origin(in);
12010
12011 // only set completion context for non-recursive scrub, because we don't
12012 // want to block asok caller on long running scrub
12013 if (!header->get_recursive()) {
12014 Context *fin = cs->take_finisher();
12015 mds->scrubstack->enqueue_inode_top(in, header,
12016 new MDSInternalContextWrapper(mds, fin));
12017 } else
12018 mds->scrubstack->enqueue_inode_bottom(in, header, NULL);
12019
12020 mds->server->respond_to_request(mdr, 0);
12021 return;
12022 }
12023
12024 struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12025 MDRequestRef mdr;
12026 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12027 MDCacheLogContext(c), mdr(m) {}
12028 void finish(int r) override {
12029 mdr->apply();
12030 get_mds()->server->respond_to_request(mdr, r);
12031 }
12032 };
12033
12034 void MDCache::repair_dirfrag_stats(CDir *dir)
12035 {
12036 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12037 mdr->pin(dir);
12038 mdr->internal_op_private = dir;
12039 mdr->internal_op_finish = new C_MDSInternalNoop;
12040 repair_dirfrag_stats_work(mdr);
12041 }
12042
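// Recompute fragstat/rstat for one dirfrag from its head dentries; if the
// stored fnode disagrees, project a corrected fnode, dirty the scatterlocks
// and journal an EUpdate.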
12043 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12044 {
12045 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12046 dout(10) << __func__ << " " << *dir << dendl;
12047
12048 if (!dir->is_auth()) {
12049 mds->server->respond_to_request(mdr, -ESTALE);
12050 return;
12051 }
12052
12053 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12054 mds->locker->drop_locks(mdr.get());
12055 mdr->drop_local_auth_pins();
12056 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12057 return;
12058 }
12059
12060 mdr->auth_pin(dir);
12061
12062 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12063 CInode *diri = dir->inode;
12064 rdlocks.insert(&diri->dirfragtreelock);
12065 wrlocks.insert(&diri->nestlock);
12066 wrlocks.insert(&diri->filelock);
12067 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12068 return;
12069
12070 if (!dir->is_complete()) {
12071 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12072 return;
12073 }
12074
12075 frag_info_t frag_info;
12076 nest_info_t nest_info;
12077 for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
12078 CDentry *dn = it->second;
12079 if (dn->last != CEPH_NOSNAP)
12080 continue;
12081 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12082 if (dnl->is_primary()) {
12083 CInode *in = dnl->get_inode();
12084 nest_info.add(in->get_projected_inode()->accounted_rstat);
12085 if (in->is_dir())
12086 frag_info.nsubdirs++;
12087 else
12088 frag_info.nfiles++;
12089 } else if (dnl->is_remote())
12090 frag_info.nfiles++;
12091 }
12092
12093 fnode_t *pf = dir->get_projected_fnode();
12094 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12095 bool good_rstat = nest_info.same_sums(pf->rstat);
12096 if (good_fragstat && good_rstat) {
12097 dout(10) << __func__ << " no corruption found" << dendl;
12098 mds->server->respond_to_request(mdr, 0);
12099 return;
12100 }
12101
12102 pf = dir->project_fnode();
12103 pf->version = dir->pre_dirty();
12104 mdr->add_projected_fnode(dir);
12105
12106 mdr->ls = mds->mdlog->get_current_segment();
12107 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12108 mds->mdlog->start_entry(le);
12109
12110 if (!good_fragstat) {
12111 if (pf->fragstat.mtime > frag_info.mtime)
12112 frag_info.mtime = pf->fragstat.mtime;
12113 if (pf->fragstat.change_attr > frag_info.change_attr)
12114 frag_info.change_attr = pf->fragstat.change_attr;
12115 pf->fragstat = frag_info;
12116 mds->locker->mark_updated_scatterlock(&diri->filelock);
12117 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12118 mdr->add_updated_lock(&diri->filelock);
12119 }
12120
12121 if (!good_rstat) {
12122 if (pf->rstat.rctime > nest_info.rctime)
12123 nest_info.rctime = pf->rstat.rctime;
12124 pf->rstat = nest_info;
12125 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12126 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12127 mdr->add_updated_lock(&diri->nestlock);
12128 }
12129
12130 le->metablob.add_dir_context(dir);
12131 le->metablob.add_dir(dir, true);
12132
12133 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12134 }
12135
12136 void MDCache::repair_inode_stats(CInode *diri)
12137 {
12138 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12139 mdr->pin(diri);
12140 mdr->internal_op_private = diri;
12141 mdr->internal_op_finish = new C_MDSInternalNoop;
12142 repair_inode_stats_work(mdr);
12143 }
12144
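// Fetch every dirfrag and dirty the filelock/nestlock scatterlocks so the
// scatter-gather process rewrites dirstat/rstat; then re-take rdlocks and
// check the result against the accounted sums.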
12145 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12146 {
12147 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12148 dout(10) << __func__ << " " << *diri << dendl;
12149
12150 if (!diri->is_auth()) {
12151 mds->server->respond_to_request(mdr, -ESTALE);
12152 return;
12153 }
12154 if (!diri->is_dir()) {
12155 mds->server->respond_to_request(mdr, -ENOTDIR);
12156 return;
12157 }
12158
12159 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12160 std::list<frag_t> frags;
12161
12162 if (mdr->ls) // already marked filelock/nestlock dirty ?
12163 goto do_rdlocks;
12164
12165 rdlocks.insert(&diri->dirfragtreelock);
12166 wrlocks.insert(&diri->nestlock);
12167 wrlocks.insert(&diri->filelock);
12168 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12169 return;
12170
12171 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12172 // the scatter-gather process, which will fix any fragstat/rstat errors.
12173 diri->dirfragtree.get_leaves(frags);
12174 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12175 CDir *dir = diri->get_dirfrag(*p);
12176 if (!dir) {
12177 assert(mdr->is_auth_pinned(diri));
12178 dir = diri->get_or_open_dirfrag(this, *p);
12179 }
12180 if (dir->get_version() == 0) {
12181 assert(dir->is_auth());
12182 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12183 return;
12184 }
12185 }
12186
12187 diri->state_set(CInode::STATE_REPAIRSTATS);
12188 mdr->ls = mds->mdlog->get_current_segment();
12189 mds->locker->mark_updated_scatterlock(&diri->filelock);
12190 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12191 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12192 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12193
12194 mds->locker->drop_locks(mdr.get());
12195
12196 do_rdlocks:
12197 // force the scatter-gather process
12198 rdlocks.insert(&diri->dirfragtreelock);
12199 rdlocks.insert(&diri->nestlock);
12200 rdlocks.insert(&diri->filelock);
12201 wrlocks.clear();
12202 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12203 return;
12204
12205 diri->state_clear(CInode::STATE_REPAIRSTATS);
12206
12207 frag_info_t dir_info;
12208 nest_info_t nest_info;
12209 nest_info.rsubdirs++; // it gets one to account for self
12210
12211 diri->dirfragtree.get_leaves(frags);
12212 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12213 CDir *dir = diri->get_dirfrag(*p);
12214 assert(dir);
12215 assert(dir->get_version() > 0);
12216 dir_info.add(dir->fnode.accounted_fragstat);
12217 nest_info.add(dir->fnode.accounted_rstat);
12218 }
12219
12220 if (!dir_info.same_sums(diri->inode.dirstat) ||
12221 !nest_info.same_sums(diri->inode.rstat)) {
12222 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12223 << *diri << dendl;
12224 }
12225
12226 mds->server->respond_to_request(mdr, 0);
12227 }
12228
12229 void MDCache::flush_dentry(const string& path, Context *fin)
12230 {
12231 if (is_readonly()) {
12232 dout(10) << __func__ << ": read-only FS" << dendl;
12233 fin->complete(-EROFS);
12234 return;
12235 }
12236 dout(10) << "flush_dentry " << path << dendl;
12237 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12238 filepath fp(path.c_str());
12239 mdr->set_filepath(fp);
12240 mdr->internal_op_finish = fin;
12241 flush_dentry_work(mdr);
12242 }
12243
12244 class C_FinishIOMDR : public MDSInternalContextBase {
12245 protected:
12246 MDSRank *mds;
12247 MDRequestRef mdr;
12248 MDSRank *get_mds() override { return mds; }
12249 public:
12250 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12251 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12252 };
12253
12254 void MDCache::flush_dentry_work(MDRequestRef& mdr)
12255 {
12256 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12257 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12258 if (NULL == in)
12259 return;
12260
12261 // TODO: Is this necessary? Fix it if so
12262 assert(in->is_auth());
12263 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12264 if (!locked)
12265 return;
12266 in->flush(new C_FinishIOMDR(mds, mdr));
12267 }
12268
12269
12270 /**
12271 * Initialize performance counters with global perfcounter
12272 * collection.
12273 */
12274 void MDCache::register_perfcounters()
12275 {
12276 PerfCountersBuilder pcb(g_ceph_context,
12277 "mds_cache", l_mdc_first, l_mdc_last);
12278
12279 /* Stray/purge statistics */
12280 pcb.add_u64(l_mdc_num_strays, "num_strays",
12281 "Stray dentries", "stry");
12282 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12283 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12284
12285 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12286 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12287 "Stray dentries enqueued for purge");
12288 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12289 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12290
12291
12292 /* Recovery queue statistics */
12293 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12294 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
12295 "Files waiting for recovery", "recy");
12296 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12297 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12298 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
12299 "File recoveries completed", "recd");
12300
12301 logger.reset(pcb.create_perf_counters());
12302 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12303 recovery_queue.set_logger(logger.get());
12304 stray_manager.set_logger(logger.get());
12305 }
12306
12307 void MDCache::activate_stray_manager()
12308 {
12309 if (open) {
12310 stray_manager.activate();
12311 } else {
12312 wait_for_open(
12313 new MDSInternalContextWrapper(mds,
12314 new FunctionContext([this](int r){
12315 stray_manager.activate();
12316 })
12317 )
12318 );
12319 }
12320 }
12321
12322 /**
12323 * Call this when putting references to an inode/dentry or
12324 * when attempting to trim it.
12325 *
12326 * If this inode is no longer linked by anyone, and this MDS
12327 * rank holds the primary dentry, and that dentry is in a stray
12328 * directory, then give up the dentry to the StrayManager, never
12329 * to be seen again by MDCache.
12330 *
12331 * @param delay if true, then purgeable inodes are stashed til
12332 * the next trim(), rather than being purged right
12333 * away.
12334 */
12335 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
12336 if (in->inode.nlink > 0 || in->is_base() || is_readonly() || mds->is_standby_replay())
12337 return;
12338 CDentry *dn = in->get_projected_parent_dn();
12339
12340 if (dn->state_test(CDentry::STATE_PURGING)) {
12341 /* We have already entered the purging process, no need
12342 * to re-evaluate me ! */
12343 return;
12344 }
12345
12346 if (dn->get_projected_linkage()->is_primary() &&
12347 dn->get_dir()->get_inode()->is_stray()) {
12348 stray_manager.eval_stray(dn, delay);
12349 }
12350 }
12351