ceph/src/mds/MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <map>
21
22 #include "MDCache.h"
23 #include "MDSRank.h"
24 #include "Server.h"
25 #include "Locker.h"
26 #include "MDLog.h"
27 #include "MDBalancer.h"
28 #include "Migrator.h"
29 #include "ScrubStack.h"
30
31 #include "SnapClient.h"
32
33 #include "MDSMap.h"
34
35 #include "CInode.h"
36 #include "CDir.h"
37
38 #include "Mutation.h"
39
40 #include "include/ceph_fs.h"
41 #include "include/filepath.h"
42 #include "include/util.h"
43
44 #include "msg/Message.h"
45 #include "msg/Messenger.h"
46
47 #include "common/MemoryModel.h"
48 #include "common/errno.h"
49 #include "common/perf_counters.h"
50 #include "common/safe_io.h"
51
52 #include "osdc/Journaler.h"
53 #include "osdc/Filer.h"
54
55 #include "events/ESubtreeMap.h"
56 #include "events/EUpdate.h"
57 #include "events/ESlaveUpdate.h"
58 #include "events/EImportFinish.h"
59 #include "events/EFragment.h"
60 #include "events/ECommitted.h"
61 #include "events/ESessions.h"
62
63 #include "messages/MGenericMessage.h"
64
65 #include "messages/MMDSResolve.h"
66 #include "messages/MMDSResolveAck.h"
67 #include "messages/MMDSCacheRejoin.h"
68
69 #include "messages/MDiscover.h"
70 #include "messages/MDiscoverReply.h"
71
72 //#include "messages/MInodeUpdate.h"
73 #include "messages/MDirUpdate.h"
74 #include "messages/MCacheExpire.h"
75
76 #include "messages/MInodeFileCaps.h"
77
78 #include "messages/MLock.h"
79 #include "messages/MDentryLink.h"
80 #include "messages/MDentryUnlink.h"
81
82 #include "messages/MMDSFindIno.h"
83 #include "messages/MMDSFindInoReply.h"
84
85 #include "messages/MMDSOpenIno.h"
86 #include "messages/MMDSOpenInoReply.h"
87
88 #include "messages/MClientRequest.h"
89 #include "messages/MClientCaps.h"
90 #include "messages/MClientSnap.h"
91 #include "messages/MClientQuota.h"
92
93 #include "messages/MMDSSlaveRequest.h"
94
95 #include "messages/MMDSFragmentNotify.h"
96
97 #include "messages/MGatherCaps.h"
98
99 #include "InoTable.h"
100
101 #include "common/Timer.h"
102
103 #include "perfglue/heap_profiler.h"
104
105 using namespace std;
106
107 #include "common/config.h"
108 #include "include/assert.h"
109
110 #define dout_context g_ceph_context
111 #define dout_subsys ceph_subsys_mds
112 #undef dout_prefix
113 #define dout_prefix _prefix(_dout, mds)
114 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
115 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
116 }
117
118 set<int> SimpleLock::empty_gather_set;
119
120
121 /**
122 * All non-I/O contexts that require a reference
123 * to an MDCache instance descend from this.
124 */
125 class MDCacheContext : public virtual MDSInternalContextBase {
126 protected:
127 MDCache *mdcache;
128 MDSRank *get_mds() override
129 {
130 assert(mdcache != NULL);
131 return mdcache->mds;
132 }
133 public:
134 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
135 };
136
137
138 /**
139 * Only for contexts called back from an I/O completion
140 *
141 * Note: duplication of members wrt MDCacheContext, because
142 * it's the lesser of two evils compared with introducing
143 * yet another piece of (multiple) inheritance.
144 */
145 class MDCacheIOContext : public virtual MDSIOContextBase {
146 protected:
147 MDCache *mdcache;
148 MDSRank *get_mds() override
149 {
150 assert(mdcache != NULL);
151 return mdcache->mds;
152 }
153 public:
154 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
155 };
156
157 class MDCacheLogContext : public virtual MDSLogContextBase {
158 protected:
159 MDCache *mdcache;
160 MDSRank *get_mds() override
161 {
162 assert(mdcache != NULL);
163 return mdcache->mds;
164 }
165 public:
166 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
167 };
168
169 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
170 mds(m),
171 filer(m->objecter, m->finisher),
172 exceeded_size_limit(false),
173 recovery_queue(m),
174 stray_manager(m, purge_queue_)
175 {
176 migrator.reset(new Migrator(mds, this));
177 root = NULL;
178 myin = NULL;
179 readonly = false;
180
181 stray_index = 0;
182 for (int i = 0; i < NUM_STRAY; ++i) {
183 strays[i] = NULL;
184 }
185
186 num_shadow_inodes = 0;
187 num_inodes_with_caps = 0;
188
189 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
190 (g_conf->mds_dir_max_commit_size << 20) :
191 (0.9 *(g_conf->osd_max_write_size << 20));
192
193 discover_last_tid = 0;
194 open_ino_last_tid = 0;
195 find_ino_peer_last_tid = 0;
196
197 last_cap_id = 0;
198
199 client_lease_durations[0] = 5.0;
200 client_lease_durations[1] = 30.0;
201 client_lease_durations[2] = 300.0;
202
203 resolves_pending = false;
204 rejoins_pending = false;
205 cap_imports_num_opening = 0;
206
207 opening_root = open = false;
208 lru.lru_set_midpoint(cache_mid());
209
210 bottom_lru.lru_set_midpoint(0);
211
212 decayrate.set_halflife(g_conf->mds_decay_halflife);
213
214 did_shutdown_log_cap = false;
215 }
216
217 MDCache::~MDCache()
218 {
219 if (logger) {
220 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
221 }
222 }
223
224
225
226 void MDCache::log_stat()
227 {
228 mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
229 mds->logger->set(l_mds_inodes, lru.lru_get_size());
230 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
231 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
232 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
233 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
234 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
235 mds->logger->set(l_mds_caps, Capability::count());
236 }
237
238
239 //
240
241 bool MDCache::shutdown()
242 {
243 if (lru.lru_get_size() > 0) {
244 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
245 //show_cache();
246 show_subtrees();
247 //dump();
248 }
249 return true;
250 }
251
252
253 // ====================================================================
254 // some inode functions
255
256 void MDCache::add_inode(CInode *in)
257 {
258 // add to lru, inode map
259 if (in->last == CEPH_NOSNAP) {
260 auto &p = inode_map[in->ino()];
261 assert(!p); // should be no dup inos!
262 p = in;
263 } else {
264 auto &p = snap_inode_map[in->vino()];
265 assert(!p); // should be no dup inos!
266 p = in;
267 }
268
269 if (in->ino() < MDS_INO_SYSTEM_BASE) {
270 if (in->ino() == MDS_INO_ROOT)
271 root = in;
272 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
273 myin = in;
274 else if (in->is_stray()) {
275 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
276 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
277 }
278 }
279 if (in->is_base())
280 base_inodes.insert(in);
281 }
282
283 if (cache_toofull()) {
284 exceeded_size_limit = true;
285 }
286 }
287
288 void MDCache::remove_inode(CInode *o)
289 {
290 dout(14) << "remove_inode " << *o << dendl;
291
292 if (o->get_parent_dn()) {
293 // FIXME: multiple parents?
294 CDentry *dn = o->get_parent_dn();
295 assert(!dn->is_dirty());
296 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
297 }
298
299 if (o->is_dirty())
300 o->mark_clean();
301 if (o->is_dirty_parent())
302 o->clear_dirty_parent();
303
304 o->clear_scatter_dirty();
305
306 o->item_open_file.remove_myself();
307
308 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
309 export_pin_queue.erase(o);
310
311 // remove from inode map
312 if (o->last == CEPH_NOSNAP)
313 inode_map.erase(o->ino());
314 else
315 snap_inode_map.erase(o->vino());
316
317 if (o->ino() < MDS_INO_SYSTEM_BASE) {
318 if (o == root) root = 0;
319 if (o == myin) myin = 0;
320 if (o->is_stray()) {
321 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
322 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
323 }
324 }
325 if (o->is_base())
326 base_inodes.erase(o);
327 }
328
329 // delete it
330 assert(o->get_num_ref() == 0);
331 delete o;
332 }
333
334 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
335 {
336 file_layout_t result = file_layout_t::get_default();
337 result.pool_id = mdsmap.get_first_data_pool();
338 return result;
339 }
340
341 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
342 {
343 file_layout_t result = file_layout_t::get_default();
344 result.pool_id = mdsmap.get_metadata_pool();
345 if (g_conf->mds_log_segment_size > 0) {
346 result.object_size = g_conf->mds_log_segment_size;
347 result.stripe_unit = g_conf->mds_log_segment_size;
348 }
349 return result;
350 }
351
352 void MDCache::init_layouts()
353 {
354 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
355 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
356 }
357
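// Fill in a freshly allocated system inode (root, mdsdir or stray dir):
// set its ino, mode, timestamps and initial rstat, and for base inodes
// establish inode_auth and an empty snaprealm. The caller is expected to
// add it to the cache (see create_system_inode below).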
358 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
359 int mode) const
360 {
361 in->inode.ino = ino;
362 in->inode.version = 1;
363 in->inode.xattr_version = 1;
364 in->inode.mode = 0500 | mode;
365 in->inode.size = 0;
366 in->inode.ctime =
367 in->inode.mtime =
368 in->inode.btime = ceph_clock_now();
369 in->inode.nlink = 1;
370 in->inode.truncate_size = -1ull;
371 in->inode.change_attr = 0;
372 in->inode.export_pin = MDS_RANK_NONE;
373
374 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
375 if (in->inode.is_dir()) {
376 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
377 ++in->inode.rstat.rsubdirs;
378 } else {
379 in->inode.layout = default_file_layout;
380 ++in->inode.rstat.rfiles;
381 }
382 in->inode.accounted_rstat = in->inode.rstat;
383
384 if (in->is_base()) {
385 if (in->is_root())
386 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
387 else
388 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
389 in->open_snaprealm(); // empty snaprealm
390 assert(!in->snaprealm->parent); // created its own
391 in->snaprealm->srnode.seq = 1;
392 }
393 }
394
395 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
396 {
397 dout(0) << "creating system inode with ino:" << ino << dendl;
398 CInode *in = new CInode(this);
399 create_unlinked_system_inode(in, ino, mode);
400 add_inode(in);
401 return in;
402 }
403
404 CInode *MDCache::create_root_inode()
405 {
406 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
407 i->inode.uid = g_conf->mds_root_ino_uid;
408 i->inode.gid = g_conf->mds_root_ino_gid;
409 i->inode.layout = default_file_layout;
410 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
411 return i;
412 }
413
414 void MDCache::create_empty_hierarchy(MDSGather *gather)
415 {
416 // create root dir
417 CInode *root = create_root_inode();
418
419 // force empty root dir
420 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
421 adjust_subtree_auth(rootdir, mds->get_nodeid());
422 rootdir->dir_rep = CDir::REP_ALL; //NONE;
423
424 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
425 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
426
427 root->inode.dirstat = rootdir->fnode.fragstat;
428 root->inode.rstat = rootdir->fnode.rstat;
429 ++root->inode.rstat.rsubdirs;
430 root->inode.accounted_rstat = root->inode.rstat;
431
432 rootdir->mark_complete();
433 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
434 rootdir->commit(0, gather->new_sub());
435
436 root->store(gather->new_sub());
437 }
438
439 void MDCache::create_mydir_hierarchy(MDSGather *gather)
440 {
441 // create mds dir
442 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
443
444 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
445 adjust_subtree_auth(mydir, mds->get_nodeid());
446
447 LogSegment *ls = mds->mdlog->get_current_segment();
448
449 // stray dir
450 for (int i = 0; i < NUM_STRAY; ++i) {
451 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
452 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
453 stringstream name;
454 name << "stray" << i;
455 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
456 sdn->_mark_dirty(mds->mdlog->get_current_segment());
457
458 stray->inode.dirstat = straydir->fnode.fragstat;
459
460 mydir->fnode.rstat.add(stray->inode.rstat);
461 mydir->fnode.fragstat.nsubdirs++;
462 // save them
463 straydir->mark_complete();
464 straydir->mark_dirty(straydir->pre_dirty(), ls);
465 straydir->commit(0, gather->new_sub());
466 stray->_mark_dirty_parent(ls, true);
467 stray->store_backtrace(gather->new_sub());
468 }
469
470 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
471 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
472
473 myin->inode.dirstat = mydir->fnode.fragstat;
474 myin->inode.rstat = mydir->fnode.rstat;
475 ++myin->inode.rstat.rsubdirs;
476 myin->inode.accounted_rstat = myin->inode.rstat;
477
478 mydir->mark_complete();
479 mydir->mark_dirty(mydir->pre_dirty(), ls);
480 mydir->commit(0, gather->new_sub());
481
482 myin->store(gather->new_sub());
483 }
484
485 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
486 MutationRef mut;
487 CDentry *dn;
488 version_t dpv;
489 MDSInternalContextBase *fin;
490 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
491 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
492 void finish(int r) override {
493 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
494 }
495 };
496
497 void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
498 {
499 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
500 CDentry *dn = dir->add_null_dentry(name);
501
502 dn->push_projected_linkage(in);
503 version_t dpv = dn->pre_dirty();
504
505 CDir *mdir = 0;
506 if (in->inode.is_dir()) {
507 in->inode.rstat.rsubdirs = 1;
508
509 mdir = in->get_or_open_dirfrag(this, frag_t());
510 mdir->mark_complete();
511 mdir->pre_dirty();
512 } else
513 in->inode.rstat.rfiles = 1;
514 in->inode.version = dn->pre_dirty();
515
516 SnapRealm *realm = dir->get_inode()->find_snaprealm();
517 dn->first = in->first = realm->get_newest_seq() + 1;
518
519 MutationRef mut(new MutationImpl());
520
521 // force some locks. hacky.
522 mds->locker->wrlock_force(&dir->inode->filelock, mut);
523 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
524
525 mut->ls = mds->mdlog->get_current_segment();
526 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
527 mds->mdlog->start_entry(le);
528
529 if (!in->is_mdsdir()) {
530 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
531 le->metablob.add_primary_dentry(dn, in, true);
532 } else {
533 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
534 journal_dirty_inode(mut.get(), &le->metablob, in);
535 dn->push_projected_linkage(in->ino(), in->d_type());
536 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
537 le->metablob.add_root(true, in);
538 }
539 if (mdir)
540 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
541
542 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
543 mds->mdlog->flush();
544 }
545
546 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
547 {
548 dout(10) << "_create_system_file_finish " << *dn << dendl;
549
550 dn->pop_projected_linkage();
551 dn->mark_dirty(dpv, mut->ls);
552
553 CInode *in = dn->get_linkage()->get_inode();
554 in->inode.version--;
555 in->mark_dirty(in->inode.version + 1, mut->ls);
556
557 if (in->inode.is_dir()) {
558 CDir *dir = in->get_dirfrag(frag_t());
559 assert(dir);
560 dir->mark_dirty(1, mut->ls);
561 dir->mark_new(mut->ls);
562 }
563
564 mut->apply();
565 mds->locker->drop_locks(mut.get());
566 mut->cleanup();
567
568 fin->complete(0);
569
570 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
571 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
572 }
573
574
575
576 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
577 MDCache *cache;
578 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
579 void finish(int r) override {
580 if (r < 0) {
581 // If we can't open root, something disastrous has happened: mark
582 // this rank damaged for operator intervention. Note that
583 // it is not okay to call suicide() here because we are in
584 // a Finisher callback.
585 cache->mds->damaged();
586 ceph_abort(); // damaged should never return
587 } else {
588 cache->open_root();
589 }
590 }
591 };
592
593 void MDCache::open_root_inode(MDSInternalContextBase *c)
594 {
595 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
596 CInode *in;
597 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
598 in->fetch(c);
599 } else {
600 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
601 }
602 }
603
604 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
605 {
606 MDSGatherBuilder gather(g_ceph_context);
607
608 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
609 in->fetch(gather.new_sub());
610
611 gather.set_finisher(c);
612 gather.activate();
613 }
614
615 void MDCache::open_root()
616 {
617 dout(10) << "open_root" << dendl;
618
619 if (!root) {
620 open_root_inode(new C_MDS_RetryOpenRoot(this));
621 return;
622 }
623 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
624 assert(root->is_auth());
625 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
626 assert(rootdir);
627 if (!rootdir->is_subtree_root())
628 adjust_subtree_auth(rootdir, mds->get_nodeid());
629 if (!rootdir->is_complete()) {
630 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
631 return;
632 }
633 } else {
634 assert(!root->is_auth());
635 CDir *rootdir = root->get_dirfrag(frag_t());
636 if (!rootdir) {
637 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
638 return;
639 }
640 }
641
642 if (!myin) {
643 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
644 in->fetch(new C_MDS_RetryOpenRoot(this));
645 return;
646 }
647 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
648 assert(mydir);
649 adjust_subtree_auth(mydir, mds->get_nodeid());
650
651 populate_mydir();
652 }
653
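// Called repeatedly (each retry re-enters via C_MDS_RetryOpenRoot) until
// mydir is complete and every stray directory exists, is pinned and has
// all of its dirfrags fetched; only then is the cache marked open and the
// waiting_for_open waiters are queued.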
654 void MDCache::populate_mydir()
655 {
656 assert(myin);
657 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
658 assert(mydir);
659
660 dout(10) << "populate_mydir " << *mydir << dendl;
661
662 if (!mydir->is_complete()) {
663 mydir->fetch(new C_MDS_RetryOpenRoot(this));
664 return;
665 }
666
667 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
668 // A missing dirfrag, we will recreate it. Before that, we must dirty
669 // it before dirtying any of the strays we create within it.
670 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
671 "recreating it now";
672 LogSegment *ls = mds->mdlog->get_current_segment();
673 mydir->state_clear(CDir::STATE_BADFRAG);
674 mydir->mark_complete();
675 mydir->mark_dirty(mydir->pre_dirty(), ls);
676 }
677
678 // open or create stray
679 uint64_t num_strays = 0;
680 for (int i = 0; i < NUM_STRAY; ++i) {
681 stringstream name;
682 name << "stray" << i;
683 CDentry *straydn = mydir->lookup(name.str());
684
685 // allow for older fs's with stray instead of stray0
686 if (straydn == NULL && i == 0)
687 straydn = mydir->lookup("stray");
688
689 if (!straydn || !straydn->get_linkage()->get_inode()) {
690 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
691 new C_MDS_RetryOpenRoot(this));
692 return;
693 }
694 assert(straydn);
695 assert(strays[i]);
696 // we make multiple passes through this method; make sure we only pin each stray once.
697 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
698 strays[i]->get(CInode::PIN_STRAY);
699 strays[i]->state_set(CInode::STATE_STRAYPINNED);
700 strays[i]->get_stickydirs();
701 }
702 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
703
704 // open all frags
705 list<frag_t> ls;
706 strays[i]->dirfragtree.get_leaves(ls);
707 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
708 frag_t fg = *p;
709 CDir *dir = strays[i]->get_dirfrag(fg);
710 if (!dir) {
711 dir = strays[i]->get_or_open_dirfrag(this, fg);
712 }
713
714 // DamageTable applies special handling to strays: it will
715 // have damaged() us out if one is damaged.
716 assert(!dir->state_test(CDir::STATE_BADFRAG));
717
718 if (dir->get_version() == 0) {
719 dir->fetch(new C_MDS_RetryOpenRoot(this));
720 return;
721 }
722
723 if (dir->get_frag_size() > 0)
724 num_strays += dir->get_frag_size();
725 }
726 }
727
728 stray_manager.set_num_strays(num_strays);
729
730 // okay!
731 dout(10) << "populate_mydir done" << dendl;
732 assert(!open);
733 open = true;
734 mds->queue_waiters(waiting_for_open);
735
736 scan_stray_dir();
737 }
738
739 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
740 {
741 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
742 }
743
744 CDir *MDCache::get_stray_dir(CInode *in)
745 {
746 string straydname;
747 in->name_stray_dentry(straydname);
748
749 CInode *strayi = get_stray();
750 assert(strayi);
751 frag_t fg = strayi->pick_dirfrag(straydname);
752 CDir *straydir = strayi->get_dirfrag(fg);
753 assert(straydir);
754 return straydir;
755 }
756
757 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
758 {
759 CDir *straydir = get_stray_dir(in);
760 string straydname;
761 in->name_stray_dentry(straydname);
762 CDentry *straydn = straydir->lookup(straydname);
763 if (!straydn) {
764 straydn = straydir->add_null_dentry(straydname);
765 straydn->mark_new();
766 } else {
767 assert(straydn->get_projected_linkage()->is_null());
768 }
769
770 straydn->state_set(CDentry::STATE_STRAY);
771 return straydn;
772 }
773
774
775
776 MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
777 {
778 // inode?
779 if (info.ino)
780 return get_inode(info.ino, info.snapid);
781
782 // dir or dentry.
783 CDir *dir = get_dirfrag(info.dirfrag);
784 if (!dir) return 0;
785
786 if (info.dname.length())
787 return dir->lookup(info.dname, info.snapid);
788 else
789 return dir;
790 }
791
792
793
794
795 // ====================================================================
796 // subtree management
797
798 void MDCache::list_subtrees(list<CDir*>& ls)
799 {
800 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
801 p != subtrees.end();
802 ++p)
803 ls.push_back(p->first);
804 }
805
806 /*
807 * adjust the dir_auth of a subtree.
808 * merge with parent and/or child subtrees, if it is appropriate.
809 * merge can ONLY happen if both parent and child have unambiguous auth.
810 */
811 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
812 {
813 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
814 << " on " << *dir << dendl;
815
816 show_subtrees();
817
818 CDir *root;
819 if (dir->inode->is_base()) {
820 root = dir; // bootstrap hack.
821 if (subtrees.count(root) == 0) {
822 subtrees[root];
823 root->get(CDir::PIN_SUBTREE);
824 }
825 } else {
826 root = get_subtree_root(dir); // subtree root
827 }
828 assert(root);
829 assert(subtrees.count(root));
830 dout(7) << " current root is " << *root << dendl;
831
832 if (root == dir) {
833 // i am already a subtree.
834 dir->set_dir_auth(auth);
835 } else {
836 // i am a new subtree.
837 dout(10) << " new subtree at " << *dir << dendl;
838 assert(subtrees.count(dir) == 0);
839 subtrees[dir]; // create empty subtree bounds list for me.
840 dir->get(CDir::PIN_SUBTREE);
841
842 // set dir_auth
843 dir->set_dir_auth(auth);
844
845 // move items nested beneath me, under me.
846 set<CDir*>::iterator p = subtrees[root].begin();
847 while (p != subtrees[root].end()) {
848 set<CDir*>::iterator next = p;
849 ++next;
850 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
851 // move under me
852 dout(10) << " claiming child bound " << **p << dendl;
853 subtrees[dir].insert(*p);
854 subtrees[root].erase(p);
855 }
856 p = next;
857 }
858
859 // i am a bound of the parent subtree.
860 subtrees[root].insert(dir);
861
862 // i am now the subtree root.
863 root = dir;
864
865 // adjust recursive pop counters
866 if (dir->is_auth()) {
867 utime_t now = ceph_clock_now();
868 CDir *p = dir->get_parent_dir();
869 while (p) {
870 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
871 if (p->is_subtree_root()) break;
872 p = p->inode->get_parent_dir();
873 }
874 }
875 }
876
877 show_subtrees();
878 }
879
880
881 void MDCache::try_subtree_merge(CDir *dir)
882 {
883 dout(7) << "try_subtree_merge " << *dir << dendl;
884 // record my old bounds
885 auto oldbounds = subtrees.at(dir);
886
887 set<CInode*> to_eval;
888 // try merge at my root
889 try_subtree_merge_at(dir, &to_eval);
890
891 // try merge at my old bounds
892 for (auto bound : oldbounds)
893 try_subtree_merge_at(bound, &to_eval);
894
895 if (!(mds->is_any_replay() || mds->is_resolve())) {
896 for(auto in : to_eval)
897 eval_subtree_root(in);
898 }
899 }
900
901 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
902 CInode *in;
903 MutationRef mut;
904 public:
905 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
906 void finish(int r) override {
907 mdcache->subtree_merge_writebehind_finish(in, mut);
908 }
909 };
910
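// Merge the subtree rooted at 'dir' into its parent subtree when both have
// the same unambiguous dir_auth and 'dir' is neither an export bound nor an
// aux subtree: its bounds are handed to the parent, the subtree pin is
// dropped, and auth popularity counters are folded back into the ancestors.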
911 void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
912 {
913 dout(10) << "try_subtree_merge_at " << *dir << dendl;
914
915 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
916 dir->state_test(CDir::STATE_EXPORTBOUND) ||
917 dir->state_test(CDir::STATE_AUXSUBTREE))
918 return;
919
920 auto it = subtrees.find(dir);
921 assert(it != subtrees.end());
922
923 // merge with parent?
924 CDir *parent = dir;
925 if (!dir->inode->is_base())
926 parent = get_subtree_root(dir->get_parent_dir());
927
928 if (parent != dir && // we have a parent,
929 parent->dir_auth == dir->dir_auth) { // auth matches,
930 // merge with parent.
931 dout(10) << " subtree merge at " << *dir << dendl;
932 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
933
934 // move our bounds under the parent
935 subtrees[parent].insert(it->second.begin(), it->second.end());
936
937 // we are no longer a subtree or bound
938 dir->put(CDir::PIN_SUBTREE);
939 subtrees.erase(it);
940 subtrees[parent].erase(dir);
941
942 // adjust popularity?
943 if (dir->is_auth()) {
944 utime_t now = ceph_clock_now();
945 CDir *p = dir->get_parent_dir();
946 while (p) {
947 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
948 if (p->is_subtree_root()) break;
949 p = p->inode->get_parent_dir();
950 }
951 }
952
953 if (to_eval && dir->get_inode()->is_auth())
954 to_eval->insert(dir->get_inode());
955
956 show_subtrees(15);
957 }
958 }
959
960 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
961 {
962 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
963 in->pop_and_dirty_projected_inode(mut->ls);
964
965 mut->apply();
966 mds->locker->drop_locks(mut.get());
967 mut->cleanup();
968
969 in->auth_unpin(this);
970 }
971
972 void MDCache::eval_subtree_root(CInode *diri)
973 {
974 // evaluate subtree inode filelock?
975 // (we should scatter the filelock on subtree bounds)
976 assert(diri->is_auth());
977 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
978 }
979
980
981 void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
982 {
983 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
984 << " on " << *dir
985 << " bounds " << bounds
986 << dendl;
987
988 show_subtrees();
989
990 CDir *root;
991 if (dir->ino() == MDS_INO_ROOT) {
992 root = dir; // bootstrap hack.
993 if (subtrees.count(root) == 0) {
994 subtrees[root];
995 root->get(CDir::PIN_SUBTREE);
996 }
997 } else {
998 root = get_subtree_root(dir); // subtree root
999 }
1000 assert(root);
1001 assert(subtrees.count(root));
1002 dout(7) << " current root is " << *root << dendl;
1003
1004 mds_authority_t oldauth = dir->authority();
1005
1006 if (root == dir) {
1007 // i am already a subtree.
1008 dir->set_dir_auth(auth);
1009 } else {
1010 // i am a new subtree.
1011 dout(10) << " new subtree at " << *dir << dendl;
1012 assert(subtrees.count(dir) == 0);
1013 subtrees[dir]; // create empty subtree bounds list for me.
1014 dir->get(CDir::PIN_SUBTREE);
1015
1016 // set dir_auth
1017 dir->set_dir_auth(auth);
1018
1019 // move items nested beneath me, under me.
1020 set<CDir*>::iterator p = subtrees[root].begin();
1021 while (p != subtrees[root].end()) {
1022 set<CDir*>::iterator next = p;
1023 ++next;
1024 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1025 // move under me
1026 dout(10) << " claiming child bound " << **p << dendl;
1027 subtrees[dir].insert(*p);
1028 subtrees[root].erase(p);
1029 }
1030 p = next;
1031 }
1032
1033 // i am a bound of the parent subtree.
1034 subtrees[root].insert(dir);
1035
1036 // i am now the subtree root.
1037 root = dir;
1038 }
1039
1040 set<CInode*> to_eval;
1041
1042 // verify/adjust bounds.
1043 // - these may be new, or
1044 // - beneath existing ambiguous bounds (which will be collapsed),
1045 // - but NOT beneath unambiguous bounds.
1046 for (set<CDir*>::iterator p = bounds.begin();
1047 p != bounds.end();
1048 ++p) {
1049 CDir *bound = *p;
1050
1051 // new bound?
1052 if (subtrees[dir].count(bound) == 0) {
1053 if (get_subtree_root(bound) == dir) {
1054 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1055 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1056 }
1057 else {
1058 dout(10) << " want bound " << *bound << dendl;
1059 CDir *t = get_subtree_root(bound->get_parent_dir());
1060 if (subtrees[t].count(bound) == 0) {
1061 assert(t != dir);
1062 dout(10) << " new bound " << *bound << dendl;
1063 adjust_subtree_auth(bound, t->authority());
1064 }
1065 // make sure it's nested beneath ambiguous subtree(s)
1066 while (1) {
1067 while (subtrees[dir].count(t) == 0)
1068 t = get_subtree_root(t->get_parent_dir());
1069 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1070 adjust_subtree_auth(t, auth);
1071 try_subtree_merge_at(t, &to_eval);
1072 t = get_subtree_root(bound->get_parent_dir());
1073 if (t == dir) break;
1074 }
1075 }
1076 }
1077 else {
1078 dout(10) << " already have bound " << *bound << dendl;
1079 }
1080 }
1081 // merge stray bounds?
1082 while (!subtrees[dir].empty()) {
1083 set<CDir*> copy = subtrees[dir];
1084 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1085 if (bounds.count(*p) == 0) {
1086 CDir *stray = *p;
1087 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1088 adjust_subtree_auth(stray, auth);
1089 try_subtree_merge_at(stray, &to_eval);
1090 }
1091 }
1092 // swallowing subtree may add new subtree bounds
1093 if (copy == subtrees[dir])
1094 break;
1095 }
1096
1097 // bound should now match.
1098 verify_subtree_bounds(dir, bounds);
1099
1100 show_subtrees();
1101
1102 if (!(mds->is_any_replay() || mds->is_resolve())) {
1103 for(auto in : to_eval)
1104 eval_subtree_root(in);
1105 }
1106 }
1107
1108
1109 /*
1110 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1111 * fragmentation as necessary to get an equivalent bounding set. That is, only
1112 * split if one of our frags spans the provided bounding set. Never merge.
1113 */
1114 void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1115 {
1116 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1117
1118 // sort by ino
1119 map<inodeno_t, fragset_t> byino;
1120 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1121 byino[p->ino].insert(p->frag);
1122 dout(10) << " by ino: " << byino << dendl;
1123
1124 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1125 CInode *diri = get_inode(p->first);
1126 if (!diri)
1127 continue;
1128 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1129
1130 fragtree_t tmpdft;
1131 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1132 tmpdft.force_to_leaf(g_ceph_context, *q);
1133
1134 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1135 frag_t fg = *q;
1136 list<frag_t> fgls;
1137 diri->dirfragtree.get_leaves_under(fg, fgls);
1138 if (fgls.empty()) {
1139 bool all = true;
1140 frag_t approx_fg = diri->dirfragtree[fg.value()];
1141 list<frag_t> ls;
1142 tmpdft.get_leaves_under(approx_fg, ls);
1143 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1144 if (p->second.get().count(*r) == 0) {
1145 // not bound, so the resolve message is from auth MDS of the dirfrag
1146 force_dir_fragment(diri, *r);
1147 all = false;
1148 }
1149 }
1150 if (all)
1151 fgls.push_back(approx_fg);
1152 else
1153 diri->dirfragtree.get_leaves_under(fg, fgls);
1154 }
1155 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1156 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1157 CDir *dir = diri->get_dirfrag(*r);
1158 if (dir)
1159 bounds.insert(dir);
1160 }
1161 }
1162 }
1163 }
1164
1165 void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1166 {
1167 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1168 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1169
1170 set<CDir*> bounds;
1171 get_force_dirfrag_bound_set(bound_dfs, bounds);
1172 adjust_bounded_subtree_auth(dir, bounds, auth);
1173 }
1174
1175 void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1176 {
1177 dout(10) << "map_dirfrag_set " << dfs << dendl;
1178
1179 // group by inode
1180 map<inodeno_t, fragset_t> ino_fragset;
1181 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1182 ino_fragset[p->ino].insert(p->frag);
1183
1184 // get frags
1185 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1186 p != ino_fragset.end();
1187 ++p) {
1188 CInode *in = get_inode(p->first);
1189 if (!in)
1190 continue;
1191
1192 list<frag_t> fglist;
1193 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1194 in->dirfragtree.get_leaves_under(*q, fglist);
1195
1196 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1197 << " on " << *in << dendl;
1198
1199 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1200 CDir *dir = in->get_dirfrag(*q);
1201 if (dir)
1202 result.insert(dir);
1203 }
1204 }
1205 }
1206
1207
1208
1209 CDir *MDCache::get_subtree_root(CDir *dir)
1210 {
1211 // find the underlying dir that delegates (or is about to delegate) auth
1212 while (true) {
1213 if (dir->is_subtree_root())
1214 return dir;
1215 dir = dir->get_inode()->get_parent_dir();
1216 if (!dir)
1217 return 0; // none
1218 }
1219 }
1220
1221 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1222 {
1223 // find the underlying dir that delegates (or is about to delegate) auth
1224 while (true) {
1225 if (dir->is_subtree_root())
1226 return dir;
1227 dir = dir->get_inode()->get_projected_parent_dir();
1228 if (!dir)
1229 return 0; // none
1230 }
1231 }
1232
1233 void MDCache::remove_subtree(CDir *dir)
1234 {
1235 dout(10) << "remove_subtree " << *dir << dendl;
1236 assert(subtrees.count(dir));
1237 assert(subtrees[dir].empty());
1238 subtrees.erase(dir);
1239 dir->put(CDir::PIN_SUBTREE);
1240 if (dir->get_parent_dir()) {
1241 CDir *p = get_subtree_root(dir->get_parent_dir());
1242 assert(subtrees[p].count(dir));
1243 subtrees[p].erase(dir);
1244 }
1245 }
1246
1247 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1248 {
1249 assert(subtrees.count(dir));
1250 bounds = subtrees[dir];
1251 }
1252
1253 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1254 {
1255 if (subtrees.count(dir)) {
1256 // just copy them, dir is a subtree.
1257 get_subtree_bounds(dir, bounds);
1258 } else {
1259 // find them
1260 CDir *root = get_subtree_root(dir);
1261 for (set<CDir*>::iterator p = subtrees[root].begin();
1262 p != subtrees[root].end();
1263 ++p) {
1264 CDir *t = *p;
1265 while (t != root) {
1266 t = t->get_parent_dir();
1267 assert(t);
1268 if (t == dir) {
1269 bounds.insert(*p);
1270 continue;
1271 }
1272 }
1273 }
1274 }
1275 }
1276
1277 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1278 {
1279 // for debugging only.
1280 assert(subtrees.count(dir));
1281 if (bounds != subtrees[dir]) {
1282 dout(0) << "verify_subtree_bounds failed" << dendl;
1283 set<CDir*> b = bounds;
1284 for (auto &cd : subtrees[dir]) {
1285 if (bounds.count(cd)) {
1286 b.erase(cd);
1287 continue;
1288 }
1289 dout(0) << " missing bound " << *cd << dendl;
1290 }
1291 for (const auto &cd : b)
1292 dout(0) << " extra bound " << *cd << dendl;
1293 }
1294 assert(bounds == subtrees[dir]);
1295 }
1296
1297 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1298 {
1299 // for debugging only.
1300 assert(subtrees.count(dir));
1301
1302 // make sure that any bounds i do have are properly noted as such.
1303 int failed = 0;
1304 for (const auto &fg : bounds) {
1305 CDir *bd = get_dirfrag(fg);
1306 if (!bd) continue;
1307 if (subtrees[dir].count(bd) == 0) {
1308 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1309 failed++;
1310 }
1311 }
1312 assert(failed == 0);
1313 }
1314
1315 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1316 {
1317 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1318 << " to " << *newdir << dendl;
1319 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1320 }
1321
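// After 'diri' has been renamed out of 'olddir', walk its dirfrags: frags
// that are subtree roots are re-parented from the old parent subtree to the
// new one; mid-subtree frags have any bounds that now fall under the new
// parent moved across, and if the authority changed they are made subtree
// roots that keep the old parent's authority.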
1322 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1323 {
1324 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1325
1326 //show_subtrees();
1327
1328 CDir *newdir = diri->get_parent_dir();
1329
1330 if (pop) {
1331 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1332 assert(p != projected_subtree_renames.end());
1333 assert(!p->second.empty());
1334 assert(p->second.front().first == olddir);
1335 assert(p->second.front().second == newdir);
1336 p->second.pop_front();
1337 if (p->second.empty())
1338 projected_subtree_renames.erase(p);
1339 }
1340
1341 // adjust subtree
1342 list<CDir*> dfls;
1343 // make sure subtree dirfrags are at the front of the list
1344 diri->get_subtree_dirfrags(dfls);
1345 diri->get_nested_dirfrags(dfls);
1346 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1347 CDir *dir = *p;
1348
1349 dout(10) << "dirfrag " << *dir << dendl;
1350 CDir *oldparent = get_subtree_root(olddir);
1351 dout(10) << " old parent " << *oldparent << dendl;
1352 CDir *newparent = get_subtree_root(newdir);
1353 dout(10) << " new parent " << *newparent << dendl;
1354
1355 if (oldparent == newparent) {
1356 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1357 continue;
1358 }
1359
1360 if (dir->is_subtree_root()) {
1361 // children are fine. change parent.
1362 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1363 assert(subtrees[oldparent].count(dir));
1364 subtrees[oldparent].erase(dir);
1365 assert(subtrees.count(newparent));
1366 subtrees[newparent].insert(dir);
1367 // caller is responsible for 'eval diri'
1368 try_subtree_merge_at(dir, NULL);
1369 } else {
1370 // mid-subtree.
1371
1372 // see if any old bounds move to the new parent.
1373 list<CDir*> tomove;
1374 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1375 p != subtrees[oldparent].end();
1376 ++p) {
1377 CDir *bound = *p;
1378 CDir *broot = get_subtree_root(bound->get_parent_dir());
1379 if (broot != oldparent) {
1380 assert(broot == newparent);
1381 tomove.push_back(bound);
1382 }
1383 }
1384 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1385 CDir *bound = *p;
1386 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1387 subtrees[oldparent].erase(bound);
1388 subtrees[newparent].insert(bound);
1389 }
1390
1391 // did auth change?
1392 if (oldparent->authority() != newparent->authority()) {
1393 adjust_subtree_auth(dir, oldparent->authority());
1394 // caller is responsible for 'eval diri'
1395 try_subtree_merge_at(dir, NULL);
1396 }
1397 }
1398 }
1399
1400 show_subtrees();
1401 }
1402
1403
1404 void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1405 {
1406 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1407 p != subtrees.end();
1408 ++p) {
1409 CDir *root = p->first;
1410 if (root->is_full_dir_auth())
1411 s.insert(root);
1412 }
1413 }
1414 void MDCache::get_auth_subtrees(set<CDir*>& s)
1415 {
1416 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1417 p != subtrees.end();
1418 ++p) {
1419 CDir *root = p->first;
1420 if (root->is_auth())
1421 s.insert(root);
1422 }
1423 }
1424
1425
1426 // count.
1427
1428 int MDCache::num_subtrees()
1429 {
1430 return subtrees.size();
1431 }
1432
1433 int MDCache::num_subtrees_fullauth()
1434 {
1435 int n = 0;
1436 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1437 p != subtrees.end();
1438 ++p) {
1439 CDir *root = p->first;
1440 if (root->is_full_dir_auth())
1441 n++;
1442 }
1443 return n;
1444 }
1445
1446 int MDCache::num_subtrees_fullnonauth()
1447 {
1448 int n = 0;
1449 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1450 p != subtrees.end();
1451 ++p) {
1452 CDir *root = p->first;
1453 if (root->is_full_dir_nonauth())
1454 n++;
1455 }
1456 return n;
1457 }
1458
1459
1460
1461 // ===================================
1462 // journal and snap/cow helpers
1463
1464
1465 /*
1466 * find first inode in cache that follows given snapid. otherwise, return current.
1467 */
1468 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1469 {
1470 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1471 assert(in->last == CEPH_NOSNAP);
1472
1473 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1474 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1475 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1476 in = p->second;
1477 }
1478
1479 return in;
1480 }
1481
1482
1483 /*
1484 * note: i'm currently cheating wrt dirty and inode.version on cow
1485 * items. instead of doing a full dir predirty, i just take the
1486 * original item's version, and set the dirty flag (via
1487 * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1488 * means a special case in the dir commit clean sweep assertions.
1489 * bah.
1490 */
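// Clone 'in' into a new snapshotted inode covering [in->first, last], using
// the previous projected inode/xattrs, then advance in->first to last+1.
// Clients holding writeable caps that predate 'last' get the corresponding
// locks put in LOCK_SNAP_SYNC so the pending snap data can be flushed
// (snapflush).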
1491 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1492 {
1493 assert(last >= in->first);
1494
1495 CInode *oldin = new CInode(this, true, in->first, last);
1496 oldin->inode = *in->get_previous_projected_inode();
1497 oldin->symlink = in->symlink;
1498 oldin->xattrs = *in->get_previous_projected_xattrs();
1499 oldin->inode.trim_client_ranges(last);
1500
1501 if (in->first < in->oldest_snap)
1502 in->oldest_snap = in->first;
1503
1504 in->first = last+1;
1505
1506 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1507 add_inode(oldin);
1508
1509 if (in->last != CEPH_NOSNAP) {
1510 CInode *head_in = get_inode(in->ino());
1511 assert(head_in);
1512 if (head_in->split_need_snapflush(oldin, in)) {
1513 oldin->client_snap_caps = in->client_snap_caps;
1514 for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
1515 p != in->client_snap_caps.end();
1516 ++p) {
1517 SimpleLock *lock = oldin->get_lock(p->first);
1518 assert(lock);
1519 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
1520 oldin->auth_pin(lock);
1521 lock->set_state(LOCK_SNAP_SYNC); // gathering
1522 lock->get_wrlock(true);
1523 }
1524 }
1525 }
1526 return oldin;
1527 }
1528
1529 if (!in->client_caps.empty()) {
1530 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1531 // clone caps?
1532 for (auto p : in->client_caps) {
1533 client_t client = p.first;
1534 Capability *cap = p.second;
1535 int issued = cap->issued();
1536 if ((issued & CEPH_CAP_ANY_WR) &&
1537 cap->client_follows < last) {
1538 // note in oldin
1539 for (int i = 0; i < num_cinode_locks; i++) {
1540 if (issued & cinode_lock_info[i].wr_caps) {
1541 int lockid = cinode_lock_info[i].lock;
1542 SimpleLock *lock = oldin->get_lock(lockid);
1543 assert(lock);
1544 oldin->client_snap_caps[lockid].insert(client);
1545 oldin->auth_pin(lock);
1546 lock->set_state(LOCK_SNAP_SYNC); // gathering
1547 lock->get_wrlock(true);
1548 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1549 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1550 }
1551 }
1552 cap->client_follows = last;
1553
1554 // we need snapflushes for any intervening snaps
1555 dout(10) << " snaps " << snaps << dendl;
1556 for (auto q = snaps.lower_bound(oldin->first);
1557 q != snaps.end() && *q <= last;
1558 ++q) {
1559 in->add_need_snapflush(oldin, *q, client);
1560 }
1561 } else {
1562 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1563 }
1564 }
1565 }
1566 return oldin;
1567 }
1568
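// Ensure the dentry (and its primary inode, if any) is copied on write before
// a journaled update: if the relevant snaprealm has snapshots in the interval
// being overwritten, an old dentry/inode covering that interval is created,
// pre-dirtied, added to the metablob and recorded in the mutation; otherwise
// only dn->first/in->first are advanced.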
1569 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1570 CDentry *dn, snapid_t follows,
1571 CInode **pcow_inode, CDentry::linkage_t *dnl)
1572 {
1573 if (!dn) {
1574 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1575 return;
1576 }
1577 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1578 assert(dn->is_auth());
1579
1580 // nothing to cow on a null dentry, fix caller
1581 if (!dnl)
1582 dnl = dn->get_projected_linkage();
1583 assert(!dnl->is_null());
1584
1585 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1586 // multiversion inode.
1587 CInode *in = dnl->get_inode();
1588 SnapRealm *realm = NULL;
1589
1590 if (in->get_projected_parent_dn() != dn) {
1591 assert(follows == CEPH_NOSNAP);
1592 realm = dn->dir->inode->find_snaprealm();
1593 snapid_t dir_follows = realm->get_newest_snap();
1594
1595 if (dir_follows+1 > dn->first) {
1596 snapid_t oldfirst = dn->first;
1597 dn->first = dir_follows+1;
1598 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1599 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
1600 oldfirst, dir_follows);
1601 olddn->pre_dirty();
1602 dout(10) << " olddn " << *olddn << dendl;
1603 metablob->add_remote_dentry(olddn, true);
1604 mut->add_cow_dentry(olddn);
1605 // FIXME: adjust link count here? hmm.
1606
1607 if (dir_follows+1 > in->first)
1608 in->cow_old_inode(dir_follows, false);
1609 }
1610 }
1611
1612 if (in->snaprealm) {
1613 realm = in->snaprealm;
1614 follows = realm->get_newest_seq();
1615 } else
1616 follows = dir_follows;
1617 } else {
1618 realm = in->find_snaprealm();
1619 if (follows == CEPH_NOSNAP)
1620 follows = realm->get_newest_seq();
1621 }
1622
1623 // already cloned?
1624 if (follows < in->first) {
1625 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1626 return;
1627 }
1628
1629 if (!realm->has_snaps_in_range(in->first, follows)) {
1630 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1631 in->first = follows + 1;
1632 return;
1633 }
1634
1635 in->cow_old_inode(follows, false);
1636
1637 } else {
1638 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1639 if (follows == CEPH_NOSNAP)
1640 follows = realm->get_newest_seq();
1641
1642 // already cloned?
1643 if (follows < dn->first) {
1644 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1645 return;
1646 }
1647
1648 // update dn.first before adding old dentry to cdir's map
1649 snapid_t oldfirst = dn->first;
1650 dn->first = follows+1;
1651
1652 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1653
1654 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1655 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1656 if (in)
1657 in->first = follows+1;
1658 return;
1659 }
1660
1661 dout(10) << " dn " << *dn << dendl;
1662 if (in) {
1663 CInode *oldin = cow_inode(in, follows);
1664 mut->add_cow_inode(oldin);
1665 if (pcow_inode)
1666 *pcow_inode = oldin;
1667 CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, oldin->last);
1668 oldin->inode.version = olddn->pre_dirty();
1669 dout(10) << " olddn " << *olddn << dendl;
1670 bool need_snapflush = !oldin->client_snap_caps.empty();
1671 if (need_snapflush)
1672 mut->ls->open_files.push_back(&oldin->item_open_file);
1673 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1674 mut->add_cow_dentry(olddn);
1675 } else {
1676 assert(dnl->is_remote());
1677 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
1678 oldfirst, follows);
1679 olddn->pre_dirty();
1680 dout(10) << " olddn " << *olddn << dendl;
1681 metablob->add_remote_dentry(olddn, true);
1682 mut->add_cow_dentry(olddn);
1683 }
1684 }
1685 }
1686
1687
1688 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1689 CInode *in, snapid_t follows,
1690 CInode **pcow_inode)
1691 {
1692 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1693 CDentry *dn = in->get_projected_parent_dn();
1694 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1695 }
1696
1697 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1698 {
1699 if (in->is_base()) {
1700 metablob->add_root(true, in, in->get_projected_inode());
1701 } else {
1702 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1703 follows = in->first - 1;
1704 CDentry *dn = in->get_projected_parent_dn();
1705 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1706 journal_cow_dentry(mut, metablob, dn, follows);
1707 if (in->get_projected_inode()->is_backtrace_updated()) {
1708 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1709 in->get_previous_projected_inode()->layout.pool_id;
1710 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1711 } else {
1712 metablob->add_primary_dentry(dn, in, true);
1713 }
1714 }
1715 }
1716
1717
1718
1719 // nested ---------------------------------------------------------------
1720
1721 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1722 int linkunlink, SnapRealm *prealm)
1723 {
1724 CDentry *parentdn = cur->get_projected_parent_dn();
1725 inode_t *curi = cur->get_projected_inode();
1726
1727 if (cur->first > first)
1728 first = cur->first;
1729
1730 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1731 << " " << *cur << dendl;
1732 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1733 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1734
1735 /*
1736 * FIXME. this incompletely propagates rstats to _old_ parents
1737 * (i.e. shortly after a directory rename). but we need full
1738 * blown hard link backpointers to make this work properly...
1739 */
1740 snapid_t floor = parentdn->first;
1741 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1742
1743 if (!prealm)
1744 prealm = parent->inode->find_snaprealm();
1745 const set<snapid_t> snaps = prealm->get_snaps();
1746
1747 if (cur->last != CEPH_NOSNAP) {
1748 assert(cur->dirty_old_rstats.empty());
1749 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1750 if (q == snaps.end() || *q > cur->last)
1751 return;
1752 }
1753
1754 if (cur->last >= floor) {
1755 bool update = true;
1756 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1757 // rename src inode is not projected in the slave rename prep case. so we should
1758 // avoid updating the inode.
1759 assert(linkunlink < 0);
1760 assert(cur->is_frozen_inode());
1761 update = false;
1762 }
1763 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1764 linkunlink, update);
1765 }
1766
1767 if (g_conf->mds_snap_rstat) {
1768 for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
1769 p != cur->dirty_old_rstats.end();
1770 ++p) {
1771 old_inode_t& old = cur->old_inodes[*p];
1772 snapid_t ofirst = MAX(old.first, floor);
1773 set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
1774 if (q == snaps.end() || *q > *p)
1775 continue;
1776 if (*p >= floor)
1777 _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
1778 }
1779 }
1780 cur->dirty_old_rstats.clear();
1781 }
1782
1783
1784 void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
1785 CDir *parent, int linkunlink, bool update_inode)
1786 {
1787 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1788 dout(20) << " inode rstat " << inode.rstat << dendl;
1789 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1790 nest_info_t delta;
1791 if (linkunlink == 0) {
1792 delta.add(inode.rstat);
1793 delta.sub(inode.accounted_rstat);
1794 } else if (linkunlink < 0) {
1795 delta.sub(inode.accounted_rstat);
1796 } else {
1797 delta.add(inode.rstat);
1798 }
1799 dout(20) << " delta " << delta << dendl;
1800
1801 if (update_inode)
1802 inode.accounted_rstat = inode.rstat;
1803
1804 while (last >= ofirst) {
1805 /*
1806 * pick fnode version to update. at each iteration, we want to
1807 * pick a segment ending in 'last' to update. split as necessary
1808 * to make that work. then, adjust first up so that we only
1809 * update one segment at a time. then loop to cover the whole
1810 * [ofirst,last] interval.
1811 */
1812 nest_info_t *prstat;
1813 snapid_t first;
1814 fnode_t *pf = parent->get_projected_fnode();
1815 if (last == CEPH_NOSNAP) {
1816 if (g_conf->mds_snap_rstat)
1817 first = MAX(ofirst, parent->first);
1818 else
1819 first = parent->first;
1820 prstat = &pf->rstat;
1821 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1822
1823 if (first > parent->first &&
1824 !(pf->rstat == pf->accounted_rstat)) {
1825 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1826 << parent->first << "," << (first-1) << "] "
1827 << " " << *prstat << "/" << pf->accounted_rstat
1828 << dendl;
1829 parent->dirty_old_rstat[first-1].first = parent->first;
1830 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1831 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1832 }
1833 parent->first = first;
1834 } else if (!g_conf->mds_snap_rstat) {
1835 // drop snapshots' rstats
1836 break;
1837 } else if (last >= parent->first) {
1838 first = parent->first;
1839 parent->dirty_old_rstat[last].first = first;
1840 parent->dirty_old_rstat[last].rstat = pf->rstat;
1841 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1842 prstat = &parent->dirty_old_rstat[last].rstat;
1843 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1844 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1845 } else {
1846 // be careful, dirty_old_rstat is a _sparse_ map.
1847 // sorry, this is ugly.
1848 first = ofirst;
1849
1850 // find any intersection with last
1851 compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
1852 if (p == parent->dirty_old_rstat.end()) {
1853 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1854 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1855 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1856 first = parent->dirty_old_rstat.rbegin()->first+1;
1857 }
1858 } else {
1859 // *p last is >= last
1860 if (p->second.first <= last) {
1861 // *p intersects [first,last]
1862 if (p->second.first < first) {
1863 dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
1864 parent->dirty_old_rstat[first-1] = p->second;
1865 p->second.first = first;
1866 }
1867 if (p->second.first > first)
1868 first = p->second.first;
1869 if (last < p->first) {
1870 dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
1871 parent->dirty_old_rstat[last] = p->second;
1872 p->second.first = last+1;
1873 }
1874 } else {
1875 // *p is to the _right_ of [first,last]
1876 p = parent->dirty_old_rstat.lower_bound(first);
1877 // new *p last is >= first
1878 if (p->second.first <= last && // new *p isn't also to the right, and
1879 p->first >= first) { // it intersects our first bit,
1880 dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
1881 first = p->first+1;
1882 }
1883 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1884 }
1885 }
1886 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1887 parent->dirty_old_rstat[last].first = first;
1888 prstat = &parent->dirty_old_rstat[last].rstat;
1889 }
1890
1891 // apply
1892 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1893 assert(last >= first);
1894 prstat->add(delta);
1895 if (update_inode)
1896 inode.accounted_rstat = inode.rstat;
1897 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1898
1899 last = first-1;
1900 }
1901 }
1902
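/*
 * Push a dirfrag's rstat delta (rstat minus accounted_rstat) up into the
 * parent inode, again walking [ofirst,last] newest-to-oldest and cow'ing
 * or splitting old_inodes so that each snapshot segment receives its
 * share of the delta.
 */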
1903 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1904 snapid_t ofirst, snapid_t last,
1905 CInode *pin, bool cow_head)
1906 {
1907 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1908 dout(20) << " frag rstat " << rstat << dendl;
1909 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1910 nest_info_t delta = rstat;
1911 delta.sub(accounted_rstat);
1912 dout(20) << " delta " << delta << dendl;
1913
1914 while (last >= ofirst) {
1915 inode_t *pi;
1916 snapid_t first;
1917 if (last == pin->last) {
1918 pi = pin->get_projected_inode();
1919 first = MAX(ofirst, pin->first);
1920 if (first > pin->first) {
1921 old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
1922 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1923 }
1924 } else {
1925 if (last >= pin->first) {
1926 first = pin->first;
1927 pin->cow_old_inode(last, cow_head);
1928 } else {
1929 // our life is easier here because old_inodes is not sparse
1930 // (although it may not begin at snapid 1)
1931 compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
1932 if (p == pin->old_inodes.end()) {
1933 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1934 break;
1935 }
1936 first = p->second.first;
1937 if (first > last) {
1938 dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
1939 //assert(p == pin->old_inodes.begin());
1940 break;
1941 }
1942 if (p->first > last) {
1943 dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
1944 << (last+1) << "," << p->first << "]" << dendl;
1945 pin->old_inodes[last] = p->second;
1946 p->second.first = last+1;
1947 pin->dirty_old_rstats.insert(p->first);
1948 }
1949 }
1950 if (first < ofirst) {
1951 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1952 << first << "," << ofirst-1 << "]" << dendl;
1953 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1954 pin->dirty_old_rstats.insert(ofirst-1);
1955 pin->old_inodes[last].first = first = ofirst;
1956 }
1957 pi = &pin->old_inodes[last].inode;
1958 pin->dirty_old_rstats.insert(last);
1959 }
1960 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1961 pi->rstat.add(delta);
1962 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1963
1964 last = first-1;
1965 }
1966 }
1967
1968 void MDCache::broadcast_quota_to_client(CInode *in)
1969 {
1970 if (!in->is_auth() || in->is_frozen())
1971 return;
1972
1973 inode_t *i = in->get_projected_inode();
1974
1975 if (!i->quota.is_enable())
1976 return;
1977
1978 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1979 it != in->client_caps.end();
1980 ++it) {
1981 Session *session = mds->get_session(it->first);
1982 if (!session || !session->connection ||
1983 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
1984 continue;
1985
1986 Capability *cap = it->second;
1987 if (cap->last_rbytes == i->rstat.rbytes &&
1988 cap->last_rsize == i->rstat.rsize())
1989 continue;
1990
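    // send a fresh MClientQuota when usage reaches max_files, crosses 7/8
    // of max_bytes, or has moved by more than ~1/16 of the headroom that
    // remained at the last update; otherwise leave this client alone.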
1991 if (i->quota.max_files > 0) {
1992 if (i->rstat.rsize() >= i->quota.max_files)
1993 goto update;
1994
1995 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
1996 abs(cap->last_rsize - i->rstat.rsize()))
1997 goto update;
1998 }
1999
2000 if (i->quota.max_bytes > 0) {
2001 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2002 goto update;
2003
2004 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2005 abs(cap->last_rbytes - i->rstat.rbytes))
2006 goto update;
2007 }
2008
2009 continue;
2010
2011 update:
2012 cap->last_rsize = i->rstat.rsize();
2013 cap->last_rbytes = i->rstat.rbytes;
2014
2015 MClientQuota *msg = new MClientQuota();
2016 msg->ino = in->ino();
2017 msg->rstat = i->rstat;
2018 msg->quota = i->quota;
2019 mds->send_message_client_counted(msg, session->connection);
2020 }
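  // also ask each replica mds to return its client caps for this inode to
  // us (the auth); only the auth sends quota updates to clients.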
2021 for (const auto &it : in->get_replicas()) {
2022 MGatherCaps *msg = new MGatherCaps;
2023 msg->ino = in->ino();
2024 mds->send_message_mds(msg, it.first);
2025 }
2026 }
2027
2028 /*
2029 * NOTE: we _have_ to delay the scatter if we are called during a
2030 * rejoin, because we can't twiddle locks between when the
2031 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2032 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2033 * (no requests), and a survivor acks immediately. _except_ that
2034 * during rejoin_(weak|strong) processing, we may complete a lock
2035 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2036 * scatterlock state in that case or the lock states will get out of
2037 * sync between the auth and replica.
2038 *
2039 * the simple solution is to never do the scatter here. instead, put
2040 * the scatterlock on a list if it isn't already wrlockable. this is
2041 * probably the best plan anyway, since we avoid too many
2042 * scatters/locks under normal usage.
2043 */
2044 /*
2045 * some notes on dirlock/nestlock scatterlock semantics:
2046 *
2047 * the fragstat (dirlock) will never be updated without
2048 * dirlock+nestlock wrlock held by the caller.
2049 *
2050 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2051 * data is pushed up the tree. this could be changed with some
2052 * restructuring here, but in its current form we ensure that the
2053 * fragstat+rstat _always_ reflect an accurate summation over the dir
2054 * frag, which is nice. and, we only need to track frags that need to
2055 * be nudged (and not inodes with pending rstat changes that need to
2056 * be pushed into the frag). a consequence of this is that the
2057 * accounted_rstat on scatterlock sync may not match our current
2058 * rstat. this is normal and expected.
2059 */
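/*
 * predirty_journal_parents: walk from the dentry's parent dirfrag up
 * toward the root, projecting fragstat/rstat deltas into each ancestor
 * and recording the projected fnodes/inodes in the mutation.  The walk
 * stops at a non-auth (or un-authpinnable) ancestor, an unwritable
 * nestlock/versionlock, the mds_dirstat_min_interval throttle, or a base
 * inode; the projected dirfrag and the dirtied inodes are then journaled
 * into the EMetaBlob.
 */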
2060 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2061 CInode *in, CDir *parent,
2062 int flags, int linkunlink,
2063 snapid_t cfollows)
2064 {
2065 bool primary_dn = flags & PREDIRTY_PRIMARY;
2066 bool do_parent_mtime = flags & PREDIRTY_DIR;
2067 bool shallow = flags & PREDIRTY_SHALLOW;
2068
2069 assert(mds->mdlog->entry_is_open());
2070
2071 // make sure stamp is set
2072 if (mut->get_mds_stamp() == utime_t())
2073 mut->set_mds_stamp(ceph_clock_now());
2074
2075 if (in->is_base())
2076 return;
2077
2078 dout(10) << "predirty_journal_parents"
2079 << (do_parent_mtime ? " do_parent_mtime":"")
2080 << " linkunlink=" << linkunlink
2081 << (primary_dn ? " primary_dn":" remote_dn")
2082 << (shallow ? " SHALLOW":"")
2083 << " follows " << cfollows
2084 << " " << *in << dendl;
2085
2086 if (!parent) {
2087 assert(primary_dn);
2088 parent = in->get_projected_parent_dn()->get_dir();
2089 }
2090
2091 if (flags == 0 && linkunlink == 0) {
2092 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2093 blob->add_dir_context(parent);
2094 return;
2095 }
2096
2097 // build list of inodes to wrlock, dirty, and update
2098 list<CInode*> lsi;
2099 CInode *cur = in;
2100 CDentry *parentdn = NULL;
2101 bool first = true;
2102 while (parent) {
2103 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2104 assert(parent->is_auth());
2105
2106 // opportunistically adjust parent dirfrag
2107 CInode *pin = parent->get_inode();
2108
2109 // inode -> dirfrag
2110 mut->auth_pin(parent);
2111 mut->add_projected_fnode(parent);
2112
2113 fnode_t *pf = parent->project_fnode();
2114 pf->version = parent->pre_dirty();
2115
2116 if (do_parent_mtime || linkunlink) {
2117 assert(mut->wrlocks.count(&pin->filelock));
2118 assert(mut->wrlocks.count(&pin->nestlock));
2119 assert(cfollows == CEPH_NOSNAP);
2120
2121 // update stale fragstat/rstat?
2122 parent->resync_accounted_fragstat();
2123 parent->resync_accounted_rstat();
2124
2125 if (do_parent_mtime) {
2126 pf->fragstat.mtime = mut->get_op_stamp();
2127 pf->fragstat.change_attr++;
2128 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2129 if (pf->fragstat.mtime > pf->rstat.rctime) {
2130 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2131 pf->rstat.rctime = pf->fragstat.mtime;
2132 } else {
2133 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2134 }
2135 }
2136 if (linkunlink) {
2137 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2138 if (in->is_dir()) {
2139 pf->fragstat.nsubdirs += linkunlink;
2140 //pf->rstat.rsubdirs += linkunlink;
2141 } else {
2142 pf->fragstat.nfiles += linkunlink;
2143 //pf->rstat.rfiles += linkunlink;
2144 }
2145 }
2146 }
2147
2148 // rstat
2149 if (!primary_dn) {
2150 // don't update parent this pass
2151 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2152 pin->versionlock.can_wrlock())) {
2153 dout(20) << " unwritable parent nestlock " << pin->nestlock
2154 << ", marking dirty rstat on " << *cur << dendl;
2155 cur->mark_dirty_rstat();
2156 } else {
2157 // if we don't hold a wrlock reference on this nestlock, take one,
2158 // because we are about to write into the dirfrag fnode and that needs
2159 // to commit before the lock can cycle.
2160 if (linkunlink) {
2161 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2162 }
2163
2164 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2165 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2166 mds->locker->wrlock_force(&pin->nestlock, mut);
2167 }
2168
2169 // now we can project the inode rstat diff into the dirfrag
2170 SnapRealm *prealm = pin->find_snaprealm();
2171
2172 snapid_t follows = cfollows;
2173 if (follows == CEPH_NOSNAP)
2174 follows = prealm->get_newest_seq();
2175
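      // attribute the delta only to snapids after 'follows'; older
      // snapshots keep their already-accounted values.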
2176 snapid_t first = follows+1;
2177
2178 // first, if the frag is stale, bring it back in sync.
2179 parent->resync_accounted_rstat();
2180
2181 // now push inode rstats into frag
2182 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2183 cur->clear_dirty_rstat();
2184 }
2185
2186 bool stop = false;
2187 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2188 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2189 stop = true;
2190 }
2191
2192 // delay propagating until later?
2193 if (!stop && !first &&
2194 g_conf->mds_dirstat_min_interval > 0) {
2195 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2196 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2197 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2198 << " < " << g_conf->mds_dirstat_min_interval
2199 << ", stopping" << dendl;
2200 stop = true;
2201 } else {
2202 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2203 }
2204 }
2205
2206 // can cast only because i'm passing nowait=true in the sole user
2207 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2208 if (!stop &&
2209 mut->wrlocks.count(&pin->nestlock) == 0 &&
2210 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2211 //true
2212 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2213 )) { // ** do not initiate.. see above comment **
2214 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2215 << " on " << *pin << dendl;
2216 stop = true;
2217 }
2218 if (stop) {
2219 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2220 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2221 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2222 mut->add_updated_lock(&pin->nestlock);
2223 if (do_parent_mtime || linkunlink) {
2224 mds->locker->mark_updated_scatterlock(&pin->filelock);
2225 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2226 mut->add_updated_lock(&pin->filelock);
2227 }
2228 break;
2229 }
2230 if (!mut->wrlocks.count(&pin->versionlock))
2231 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2232
2233 assert(mut->wrlocks.count(&pin->nestlock) ||
2234 mut->is_slave());
2235
2236 pin->last_dirstat_prop = mut->get_mds_stamp();
2237
2238 // dirfrag -> diri
2239 mut->auth_pin(pin);
2240 mut->add_projected_inode(pin);
2241 lsi.push_front(pin);
2242
2243 pin->pre_cow_old_inode(); // avoid cow mayhem!
2244
2245 inode_t *pi = pin->project_inode();
2246 pi->version = pin->pre_dirty();
2247
2248 // dirstat
2249 if (do_parent_mtime || linkunlink) {
2250 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2251 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2252 bool touched_mtime = false, touched_chattr = false;
2253 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2254 pf->accounted_fragstat = pf->fragstat;
2255 if (touched_mtime)
2256 pi->mtime = pi->ctime = pi->dirstat.mtime;
2257 if (touched_chattr)
2258 pi->change_attr = pi->dirstat.change_attr;
2259 dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
2260
2261 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2262 if (pi->dirstat.size() < 0)
2263 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2264 if (pi->dirstat.size() != pf->fragstat.size()) {
2265 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2266 << parent->dirfrag() << ", inode has " << pi->dirstat
2267 << ", dirfrag has " << pf->fragstat;
2268
2269 // trust the dirfrag for now
2270 pi->dirstat = pf->fragstat;
2271
2272 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2273 }
2274 }
2275 }
2276
2277 /*
2278 * the rule here is to follow the _oldest_ parent with dirty rstat
2279 * data. if we don't propagate all data, we add ourselves to the
2280 * nudge list. that way all rstat data will (eventually) get
2281 * pushed up the tree.
2282 *
2283 * actually, no. for now, silently drop rstats for old parents. we need
2284 * hard link backpointers to do the above properly.
2285 */
2286
2287 // stop?
2288 if (pin->is_base())
2289 break;
2290 parentdn = pin->get_projected_parent_dn();
2291 assert(parentdn);
2292
2293 // rstat
2294 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2295
2296 // first, if the frag is stale, bring it back in sync.
2297 parent->resync_accounted_rstat();
2298
2299 if (g_conf->mds_snap_rstat) {
2300 for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
2301 p != parent->dirty_old_rstat.end();
2302 ++p)
2303 project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
2304 p->first, pin, true);//false);
2305 }
2306 parent->dirty_old_rstat.clear();
2307 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2308
2309 pf->accounted_rstat = pf->rstat;
2310
2311 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2312 if (pi->rstat.rbytes != pf->rstat.rbytes) {
2313 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2314 << parent->dirfrag() << ", inode has " << pi->rstat
2315 << ", dirfrag has " << pf->rstat;
2316
2317 // trust the dirfrag for now
2318 pi->rstat = pf->rstat;
2319
2320 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2321 }
2322 }
2323
2324 parent->check_rstats();
2325 broadcast_quota_to_client(pin);
2326 // next parent!
2327 cur = pin;
2328 parent = parentdn->get_dir();
2329 linkunlink = 0;
2330 do_parent_mtime = false;
2331 primary_dn = true;
2332 first = false;
2333 }
2334
2335 // now, stick it in the blob
2336 assert(parent);
2337 assert(parent->is_auth());
2338 blob->add_dir_context(parent);
2339 blob->add_dir(parent, true);
2340 for (list<CInode*>::iterator p = lsi.begin();
2341 p != lsi.end();
2342 ++p) {
2343 CInode *cur = *p;
2344 journal_dirty_inode(mut.get(), blob, cur);
2345 }
2346
2347 }
2348
2349
2350
2351
2352
2353 // ===================================
2354 // slave requests
2355
2356
2357 /*
2358 * some handlers for master requests with slaves. we need to make
2359 * sure slaves journal commits before we forget we mastered them and
2360 * remove them from the uncommitted_masters map (used during recovery
2361 * to commit|abort slaves).
2362 */
2363 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2364 metareqid_t reqid;
2365 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2366 void finish(int r) override {
2367 mdcache->_logged_master_commit(reqid);
2368 }
2369 };
2370
2371 void MDCache::log_master_commit(metareqid_t reqid)
2372 {
2373 dout(10) << "log_master_commit " << reqid << dendl;
2374 uncommitted_masters[reqid].committing = true;
2375 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2376 new C_MDC_CommittedMaster(this, reqid));
2377 }
2378
2379 void MDCache::_logged_master_commit(metareqid_t reqid)
2380 {
2381 dout(10) << "_logged_master_commit " << reqid << dendl;
2382 assert(uncommitted_masters.count(reqid));
2383 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2384 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2385 uncommitted_masters.erase(reqid);
2386 }
2387
2388 // while active...
2389
2390 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2391 {
2392 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2393 assert(uncommitted_masters.count(r));
2394 uncommitted_masters[r].slaves.erase(from);
2395 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2396 log_master_commit(r);
2397 }
2398
2399 void MDCache::logged_master_update(metareqid_t reqid)
2400 {
2401 dout(10) << "logged_master_update " << reqid << dendl;
2402 assert(uncommitted_masters.count(reqid));
2403 uncommitted_masters[reqid].safe = true;
2404 if (pending_masters.count(reqid)) {
2405 pending_masters.erase(reqid);
2406 if (pending_masters.empty())
2407 process_delayed_resolve();
2408 }
2409 }
2410
2411 /*
2412 * Master may crash after receiving all slaves' commit acks, but before journalling
2413 * the final commit. Slaves may crash after journalling the slave commit, but before
2414 * sending commit ack to the master. Commit masters with no uncommitted slave when
2415 * resolve finishes.
2416 */
2417 void MDCache::finish_committed_masters()
2418 {
2419 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2420 p != uncommitted_masters.end();
2421 ++p) {
2422 p->second.recovering = false;
2423 if (!p->second.committing && p->second.slaves.empty()) {
2424 dout(10) << "finish_committed_masters " << p->first << dendl;
2425 log_master_commit(p->first);
2426 }
2427 }
2428 }
2429
2430 /*
2431 * at end of resolve... we must journal a commit|abort for all slave
2432 * updates, before moving on.
2433 *
2434 * this is so that the master can safely journal ECommitted on ops it
2435 * masters when it reaches up:active (all other recovering nodes must
2436 * complete resolve before that happens).
2437 */
2438 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2439 mds_rank_t from;
2440 metareqid_t reqid;
2441 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2442 void finish(int r) override {
2443 mdcache->_logged_slave_commit(from, reqid);
2444 }
2445 };
2446
2447 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2448 {
2449 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2450
2451 // send a message
2452 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2453 mds->send_message_mds(req, from);
2454 }
2455
2456
2457
2458
2459
2460
2461 // ====================================================================
2462 // import map, recovery
2463
2464 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2465 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2466 {
2467 if (subtrees.count(oldparent)) {
2468 vector<dirfrag_t>& v = subtrees[oldparent];
2469 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2470 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2471 if (*it == df) {
2472 v.erase(it);
2473 break;
2474 }
2475 }
2476 if (subtrees.count(newparent)) {
2477 vector<dirfrag_t>& v = subtrees[newparent];
2478 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2479 v.push_back(df);
2480 }
2481 }
2482
2483 ESubtreeMap *MDCache::create_subtree_map()
2484 {
2485 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2486 << num_subtrees_fullauth() << " fullauth"
2487 << dendl;
2488
2489 show_subtrees();
2490
2491 ESubtreeMap *le = new ESubtreeMap();
2492 mds->mdlog->_start_entry(le);
2493
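  // every dirfrag referenced by the subtree map is collected here so that
  // its dentry trail and dirfrag can be journaled below (add_dir_context /
  // add_dir), letting replay rebuild the map.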
2494 map<dirfrag_t, CDir*> dirs_to_add;
2495
2496 if (myin) {
2497 CDir* mydir = myin->get_dirfrag(frag_t());
2498 dirs_to_add[mydir->dirfrag()] = mydir;
2499 }
2500
2501 // include all auth subtrees, and their bounds.
2502 // and a spanning tree to tie it to the root.
2503 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2504 p != subtrees.end();
2505 ++p) {
2506 CDir *dir = p->first;
2507
2508 // journal subtree as "ours" if we are
2509 // me, -2
2510 // me, me
2511 // me, !me (may be importing and ambiguous!)
2512
2513 // so not
2514 // !me, *
2515 if (dir->get_dir_auth().first != mds->get_nodeid())
2516 continue;
2517
2518 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2519 my_ambiguous_imports.count(dir->dirfrag())) {
2520 dout(15) << " ambig subtree " << *dir << dendl;
2521 le->ambiguous_subtrees.insert(dir->dirfrag());
2522 } else {
2523 dout(15) << " subtree " << *dir << dendl;
2524 }
2525
2526 dirs_to_add[dir->dirfrag()] = dir;
2527 le->subtrees[dir->dirfrag()].clear();
2528
2529
2530 // bounds
2531 for (set<CDir*>::iterator q = p->second.begin();
2532 q != p->second.end();
2533 ++q) {
2534 CDir *bound = *q;
2535 dout(15) << " subtree bound " << *bound << dendl;
2536 dirs_to_add[bound->dirfrag()] = bound;
2537 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2538 }
2539 }
2540
2541 // apply projected renames
2542 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2543 p != projected_subtree_renames.end();
2544 ++p) {
2545 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2546 CInode *diri = p->first;
2547 CDir *olddir = q->first;
2548 CDir *newdir = q->second;
2549 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2550
2551 list<CDir*> dfls;
2552 diri->get_dirfrags(dfls);
2553 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2554 CDir *dir = *p;
2555 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2556 CDir *oldparent = get_projected_subtree_root(olddir);
2557 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2558 CDir *newparent = get_projected_subtree_root(newdir);
2559 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2560
2561 if (oldparent == newparent) {
2562 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2563 << oldparent->dirfrag() << dendl;
2564 continue;
2565 }
2566
2567 if (dir->is_subtree_root()) {
2568 if (le->subtrees.count(newparent->dirfrag()) &&
2569 oldparent->get_dir_auth() != newparent->get_dir_auth())
2570 dirs_to_add[dir->dirfrag()] = dir;
2571 // children are fine. change parent.
2572 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2573 le->subtrees);
2574 } else {
2575 // mid-subtree.
2576
2577 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2578 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2579 // if oldparent is auth, subtree is mine; include it.
2580 if (le->subtrees.count(oldparent->dirfrag())) {
2581 dirs_to_add[dir->dirfrag()] = dir;
2582 le->subtrees[dir->dirfrag()].clear();
2583 }
2584 // if newparent is auth, subtree is a new bound
2585 if (le->subtrees.count(newparent->dirfrag())) {
2586 dirs_to_add[dir->dirfrag()] = dir;
2587 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2588 }
2589 newparent = dir;
2590 }
2591
2592 // see if any old bounds move to the new parent.
2593 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2594 p != subtrees[oldparent].end();
2595 ++p) {
2596 CDir *bound = *p;
2597 if (dir->contains(bound->get_parent_dir()))
2598 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2599 le->subtrees);
2600 }
2601 }
2602 }
2603 }
2604 }
2605
2606 // simplify the journaled map. our in memory map may have more
2607 // subtrees than needed due to migrations that are just getting
2608 // started or just completing. but on replay, the "live" map will
2609 // be simple and we can do a straight comparison.
2610 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2611 if (le->ambiguous_subtrees.count(p->first))
2612 continue;
2613 unsigned i = 0;
2614 while (i < p->second.size()) {
2615 dirfrag_t b = p->second[i];
2616 if (le->subtrees.count(b) &&
2617 le->ambiguous_subtrees.count(b) == 0) {
2618 vector<dirfrag_t>& bb = le->subtrees[b];
2619 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2620 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2621 p->second.push_back(*r);
2622 dirs_to_add.erase(b);
2623 le->subtrees.erase(b);
2624 p->second.erase(p->second.begin() + i);
2625 } else {
2626 ++i;
2627 }
2628 }
2629 }
2630
2631 for (auto p : dirs_to_add) {
2632 CDir *dir = p.second;
2633 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2634 le->metablob.add_dir(dir, false);
2635 }
2636
2637 dout(15) << " subtrees " << le->subtrees << dendl;
2638 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2639
2640 //le->metablob.print(cout);
2641 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2642 return le;
2643 }
2644
2645 void MDCache::dump_resolve_status(Formatter *f) const
2646 {
2647 f->open_object_section("resolve_status");
2648 f->dump_stream("resolve_gather") << resolve_gather;
2649 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2650 f->close_section();
2651 }
2652
2653 void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2654 {
2655 dout(10) << "resolve_start" << dendl;
2656 assert(!resolve_done);
2657 resolve_done.reset(resolve_done_);
2658
2659 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2660 // if we don't have the root dir, adjust it to UNKNOWN. during
2661 // resolve we want mds0 to explicitly claim the portion of it that
2662 // it owns, so that anything beyond its bounds gets left as
2663 // unknown.
2664 CDir *rootdir = root->get_dirfrag(frag_t());
2665 if (rootdir)
2666 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2667 }
2668 resolve_gather = recovery_set;
2669 }
2670
2671 void MDCache::send_resolves()
2672 {
2673 send_slave_resolves();
2674 if (!resolve_ack_gather.empty()) {
2675 dout(10) << "send_resolves still waiting for resolve ack from ("
2676 << resolve_ack_gather << ")" << dendl;
2677 return;
2678 }
2679 if (!need_resolve_rollback.empty()) {
2680 dout(10) << "send_resolves still waiting for rollback to commit on ("
2681 << need_resolve_rollback << ")" << dendl;
2682 return;
2683 }
2684 send_subtree_resolves();
2685 }
2686
2687 void MDCache::send_slave_resolves()
2688 {
2689 dout(10) << "send_slave_resolves" << dendl;
2690
2691 map<mds_rank_t, MMDSResolve*> resolves;
2692
2693 if (mds->is_resolve()) {
2694 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2695 p != uncommitted_slave_updates.end();
2696 ++p) {
2697 resolves[p->first] = new MMDSResolve;
2698 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2699 q != p->second.end();
2700 ++q) {
2701 dout(10) << " including uncommitted " << q->first << dendl;
2702 resolves[p->first]->add_slave_request(q->first, false);
2703 }
2704 }
2705 } else {
2706 set<mds_rank_t> resolve_set;
2707 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2708 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2709 p != active_requests.end();
2710 ++p) {
2711 MDRequestRef& mdr = p->second;
2712 if (!mdr->is_slave())
2713 continue;
2714 if (!mdr->slave_did_prepare() && !mdr->committing) {
2715 continue;
2716 }
2717 mds_rank_t master = mdr->slave_to_mds;
2718 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2719 dout(10) << " including uncommitted " << *mdr << dendl;
2720 if (!resolves.count(master))
2721 resolves[master] = new MMDSResolve;
2722 if (!mdr->committing &&
2723 mdr->has_more() && mdr->more()->is_inode_exporter) {
2724 // re-send cap exports
2725 CInode *in = mdr->more()->rename_inode;
2726 map<client_t, Capability::Export> cap_map;
2727 in->export_client_caps(cap_map);
2728 bufferlist bl;
2729 ::encode(in->ino(), bl);
2730 ::encode(cap_map, bl);
2731 resolves[master]->add_slave_request(p->first, bl);
2732 } else {
2733 resolves[master]->add_slave_request(p->first, mdr->committing);
2734 }
2735 }
2736 }
2737 }
2738
2739 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2740 p != resolves.end();
2741 ++p) {
2742 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2743 mds->send_message_mds(p->second, p->first);
2744 resolve_ack_gather.insert(p->first);
2745 }
2746 }
2747
2748 void MDCache::send_subtree_resolves()
2749 {
2750 dout(10) << "send_subtree_resolves" << dendl;
2751
2752 if (migrator->is_exporting() || migrator->is_importing()) {
2753 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2754 migrator->show_importing();
2755 migrator->show_exporting();
2756 resolves_pending = true;
2757 return; // not now
2758 }
2759
2760 map<mds_rank_t, MMDSResolve*> resolves;
2761 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2762 p != recovery_set.end();
2763 ++p) {
2764 if (*p == mds->get_nodeid())
2765 continue;
2766 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2767 resolves[*p] = new MMDSResolve;
2768 }
2769
2770 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2771 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2772
2773 // known
2774 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2775 p != subtrees.end();
2776 ++p) {
2777 CDir *dir = p->first;
2778
2779 // only our subtrees
2780 if (dir->authority().first != mds->get_nodeid())
2781 continue;
2782
2783 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2784 continue; // we'll add it below
2785
2786 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2787 // ambiguous (mid-import)
2788 set<CDir*> bounds;
2789 get_subtree_bounds(dir, bounds);
2790 vector<dirfrag_t> dfls;
2791 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2792 dfls.push_back((*q)->dirfrag());
2793
2794 my_ambig_imports[dir->dirfrag()] = dfls;
2795 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2796 } else {
2797 // not ambiguous.
2798 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2799 q != resolves.end();
2800 ++q)
2801 resolves[q->first]->add_subtree(dir->dirfrag());
2802 // bounds too
2803 vector<dirfrag_t> dfls;
2804 for (set<CDir*>::iterator q = subtrees[dir].begin();
2805 q != subtrees[dir].end();
2806 ++q) {
2807 CDir *bound = *q;
2808 dfls.push_back(bound->dirfrag());
2809 }
2810
2811 my_subtrees[dir->dirfrag()] = dfls;
2812 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2813 }
2814 }
2815
2816 // ambiguous
2817 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2818 p != my_ambiguous_imports.end();
2819 ++p) {
2820 my_ambig_imports[p->first] = p->second;
2821 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2822 }
2823
2824 // simplify the claimed subtree.
2825 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2826 unsigned i = 0;
2827 while (i < p->second.size()) {
2828 dirfrag_t b = p->second[i];
2829 if (my_subtrees.count(b)) {
2830 vector<dirfrag_t>& bb = my_subtrees[b];
2831 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2832 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2833 p->second.push_back(*r);
2834 my_subtrees.erase(b);
2835 p->second.erase(p->second.begin() + i);
2836 } else {
2837 ++i;
2838 }
2839 }
2840 }
2841
2842 // send
2843 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2844 p != resolves.end();
2845 ++p) {
2846 MMDSResolve* m = p->second;
2847 m->subtrees = my_subtrees;
2848 m->ambiguous_imports = my_ambig_imports;
2849 dout(10) << "sending subtee resolve to mds." << p->first << dendl;
2850 mds->send_message_mds(m, p->first);
2851 }
2852 resolves_pending = false;
2853 }
2854
2855 void MDCache::handle_mds_failure(mds_rank_t who)
2856 {
2857 dout(7) << "handle_mds_failure mds." << who << dendl;
2858
2859 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2860
2861 resolve_gather.insert(who);
2862 discard_delayed_resolve(who);
2863 ambiguous_slave_updates.erase(who);
2864
2865 rejoin_gather.insert(who);
2866 rejoin_sent.erase(who); // i need to send another
2867 rejoin_ack_sent.erase(who); // i need to send another
2868 rejoin_ack_gather.erase(who); // i'll need/get another.
2869
2870 dout(10) << " resolve_gather " << resolve_gather << dendl;
2871 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2872 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2873 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2874 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2875
2876
2877 // tell the migrator too.
2878 migrator->handle_mds_failure_or_stop(who);
2879
2880 // tell the balancer too.
2881 mds->balancer->handle_mds_failure(who);
2882
2883 // clean up any requests slave to/from this node
2884 list<MDRequestRef> finish;
2885 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2886 p != active_requests.end();
2887 ++p) {
2888 MDRequestRef& mdr = p->second;
2889 // slave to the failed node?
2890 if (mdr->slave_to_mds == who) {
2891 if (mdr->slave_did_prepare()) {
2892 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2893 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2894 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2895
2896 if (!mdr->more()->waiting_on_slave.empty()) {
2897 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2898 // will rollback, no need to wait
2899 if (mdr->slave_request) {
2900 mdr->slave_request->put();
2901 mdr->slave_request = 0;
2902 }
2903 mdr->more()->waiting_on_slave.clear();
2904 }
2905 } else if (!mdr->committing) {
2906 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2907 if (mdr->slave_request || mdr->slave_rolling_back())
2908 mdr->aborted = true;
2909 else
2910 finish.push_back(mdr);
2911 }
2912 }
2913
2914 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2915 if (mdr->more()->waiting_on_slave.count(who)) {
2916 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2917 dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
2918 << who << dendl;
2919 mdr->more()->waiting_on_slave.erase(who);
2920 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2921 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2922 }
2923
2924 if (mdr->more()->srcdn_auth_mds == who &&
2925 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2926 // rename srcdn's auth mds failed, resolve even though I'm a survivor.
2927 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2928 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2929 }
2930 } else if (mdr->slave_request) {
2931 MMDSSlaveRequest *slave_req = mdr->slave_request;
2932 // FIXME: Slave rename request can arrive after we notice mds failure.
2933 // This can cause mds to crash (does not affect integrity of FS).
2934 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2935 slave_req->srcdn_auth == who)
2936 slave_req->mark_interrupted();
2937 }
2938
2939 // failed node is slave?
2940 if (mdr->is_master() && !mdr->committing) {
2941 if (mdr->more()->srcdn_auth_mds == who) {
2942 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2943 << who << " to recover" << dendl;
2944 assert(mdr->more()->witnessed.count(who) == 0);
2945 if (mdr->more()->is_ambiguous_auth)
2946 mdr->clear_ambiguous_auth();
2947 // rename srcdn's auth mds failed, all witnesses will rollback
2948 mdr->more()->witnessed.clear();
2949 pending_masters.erase(p->first);
2950 }
2951
2952 if (mdr->more()->witnessed.count(who)) {
2953 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2954 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2955 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2956 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2957 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2958 // until either the request is committing or the slave also fails.
2959 assert(mdr->more()->waiting_on_slave.size() == 1);
2960 pending_masters.insert(p->first);
2961 } else {
2962 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2963 << who << " to recover" << dendl;
2964 if (srcdn_auth >= 0)
2965 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2966
2967 // discard this peer's prepare (if any)
2968 mdr->more()->witnessed.erase(who);
2969 }
2970 }
2971
2972 if (mdr->more()->waiting_on_slave.count(who)) {
2973 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2974 << " to recover" << dendl;
2975 // retry request when peer recovers
2976 mdr->more()->waiting_on_slave.erase(who);
2977 if (mdr->more()->waiting_on_slave.empty())
2978 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2979 }
2980
2981 if (mdr->locking && mdr->locking_target_mds == who)
2982 mdr->finish_locking(mdr->locking);
2983 }
2984 }
2985
2986 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2987 p != uncommitted_masters.end();
2988 ++p) {
2989 // The failed MDS may have already committed the slave update
2990 if (p->second.slaves.count(who)) {
2991 p->second.recovering = true;
2992 p->second.slaves.erase(who);
2993 }
2994 }
2995
2996 while (!finish.empty()) {
2997 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
2998 request_finish(finish.front());
2999 finish.pop_front();
3000 }
3001
3002 kick_find_ino_peers(who);
3003 kick_open_ino_peers(who);
3004
3005 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3006 p != fragments.end(); ) {
3007 dirfrag_t df = p->first;
3008 fragment_info_t& info = p->second;
3009 ++p;
3010 if (info.is_fragmenting())
3011 continue;
3012 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3013 list<CDir*> dirs;
3014 info.dirs.swap(dirs);
3015 fragments.erase(df);
3016 fragment_unmark_unfreeze_dirs(dirs);
3017 }
3018
3019 // MDCache::shutdown_export_strays() always exports strays to mds.0
3020 if (who == mds_rank_t(0))
3021 shutdown_exported_strays.clear();
3022
3023 show_subtrees();
3024 }
3025
3026 /*
3027 * handle_mds_recovery - called on another node's transition
3028 * from resolve -> active.
3029 */
3030 void MDCache::handle_mds_recovery(mds_rank_t who)
3031 {
3032 dout(7) << "handle_mds_recovery mds." << who << dendl;
3033
3034 // exclude all discover waiters. kick_discovers() will do the job
3035 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3036 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3037
3038 list<MDSInternalContextBase*> waiters;
3039
3040 // wake up any waiters in their subtrees
3041 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3042 p != subtrees.end();
3043 ++p) {
3044 CDir *dir = p->first;
3045
3046 if (dir->authority().first != who ||
3047 dir->authority().second == mds->get_nodeid())
3048 continue;
3049 assert(!dir->is_auth());
3050
3051 // wake any waiters
3052 list<CDir*> q;
3053 q.push_back(dir);
3054
3055 while (!q.empty()) {
3056 CDir *d = q.front();
3057 q.pop_front();
3058 d->take_waiting(d_mask, waiters);
3059
3060 // inode waiters too
3061 for (CDir::map_t::iterator p = d->items.begin();
3062 p != d->items.end();
3063 ++p) {
3064 CDentry *dn = p->second;
3065 CDentry::linkage_t *dnl = dn->get_linkage();
3066 if (dnl->is_primary()) {
3067 dnl->get_inode()->take_waiting(i_mask, waiters);
3068
3069 // recurse?
3070 list<CDir*> ls;
3071 dnl->get_inode()->get_dirfrags(ls);
3072 for (list<CDir*>::iterator p = ls.begin();
3073 p != ls.end();
3074 ++p) {
3075 CDir *subdir = *p;
3076 if (!subdir->is_subtree_root())
3077 q.push_back(subdir);
3078 }
3079 }
3080 }
3081 }
3082 }
3083
3084 kick_open_ino_peers(who);
3085 kick_find_ino_peers(who);
3086
3087 // queue them up.
3088 mds->queue_waiters(waiters);
3089 }
3090
3091 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3092 {
3093 dout(7) << "set_recovery_set " << s << dendl;
3094 recovery_set = s;
3095 }
3096
3097
3098 /*
3099 * during resolve state, we share resolves to determine who
3100 * is authoritative for which trees. we expect to get a resolve
3101 * from _everyone_ in the recovery_set (the mds cluster at the time of
3102 * the first failure).
3103 *
3104 * This function puts the passed message before returning
3105 */
3106 void MDCache::handle_resolve(MMDSResolve *m)
3107 {
3108 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3109 mds_rank_t from = mds_rank_t(m->get_source().num());
3110
3111 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3112 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3113 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3114 return;
3115 }
3116 // wait until we reach the resolve stage!
3117 m->put();
3118 return;
3119 }
3120
3121 discard_delayed_resolve(from);
3122
3123 // ambiguous slave requests?
3124 if (!m->slave_requests.empty()) {
3125 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
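      // as a survivor, don't ack (commit|abort) a slave request whose master
      // update hasn't been safely journaled yet; note it in pending_masters
      // and let logged_master_update() replay this resolve via
      // process_delayed_resolve().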
3126 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3127 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3128 assert(!p->second.committing);
3129 pending_masters.insert(p->first);
3130 }
3131 }
3132
3133 if (!pending_masters.empty()) {
3134 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3135 delayed_resolve[from] = m;
3136 return;
3137 }
3138 }
3139
3140 MMDSResolveAck *ack = new MMDSResolveAck;
3141 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3142 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3143 // COMMIT
3144 if (p->second.committing) {
3145 // already committing, waiting for the OP_COMMITTED slave reply
3146 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3147 } else {
3148 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3149 ack->add_commit(p->first);
3150 }
3151 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3152
3153 if (p->second.inode_caps.length() > 0) {
3154 // slave wants to export caps (rename)
3155 assert(mds->is_resolve());
3156
3157 inodeno_t ino;
3158 map<client_t,Capability::Export> cap_exports;
3159 bufferlist::iterator q = p->second.inode_caps.begin();
3160 ::decode(ino, q);
3161 ::decode(cap_exports, q);
3162
3163 assert(get_inode(ino));
3164
3165 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3166 q != cap_exports.end();
3167 ++q) {
3168 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3169 im.cap_id = ++last_cap_id; // assign a new cap ID
3170 im.issue_seq = 1;
3171 im.mseq = q->second.mseq;
3172 }
3173
3174 // will process these caps in rejoin stage
3175 rejoin_slave_exports[ino].first = from;
3176 rejoin_slave_exports[ino].second.swap(cap_exports);
3177
3178 // send information of imported caps back to slave
3179 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3180 }
3181 } else {
3182 // ABORT
3183 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3184 assert(!p->second.committing);
3185 ack->add_abort(p->first);
3186 }
3187 }
3188 mds->send_message(ack, m->get_connection());
3189 m->put();
3190 return;
3191 }
3192
3193 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3194 dout(10) << "delay processing subtree resolve" << dendl;
3195 delayed_resolve[from] = m;
3196 return;
3197 }
3198
3199 bool survivor = false;
3200 // am i a surviving ambiguous importer?
3201 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3202 survivor = true;
3203 // check for any import success/failure (from this node)
3204 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3205 while (p != my_ambiguous_imports.end()) {
3206 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3207 ++next;
3208 CDir *dir = get_dirfrag(p->first);
3209 assert(dir);
3210 dout(10) << "checking ambiguous import " << *dir << dendl;
3211 if (migrator->is_importing(dir->dirfrag()) &&
3212 migrator->get_import_peer(dir->dirfrag()) == from) {
3213 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3214
3215 // check if sender claims the subtree
3216 bool claimed_by_sender = false;
3217 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3218 q != m->subtrees.end();
3219 ++q) {
3220 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3221 CDir *base = get_force_dirfrag(q->first, false);
3222 if (!base || !base->contains(dir))
3223 continue; // base is not dir or an ancestor of dir, so it clearly doesn't claim dir.
3224
3225 bool inside = true;
3226 set<CDir*> bounds;
3227 get_force_dirfrag_bound_set(q->second, bounds);
3228 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3229 CDir *bound = *p;
3230 if (bound->contains(dir)) {
3231 inside = false; // nope, bound is dir or parent of dir, not inside.
3232 break;
3233 }
3234 }
3235 if (inside)
3236 claimed_by_sender = true;
3237 }
3238
3239 my_ambiguous_imports.erase(p); // no longer ambiguous.
3240 if (claimed_by_sender) {
3241 dout(7) << "ambiguous import failed on " << *dir << dendl;
3242 migrator->import_reverse(dir);
3243 } else {
3244 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3245 migrator->import_finish(dir, true);
3246 }
3247 }
3248 p = next;
3249 }
3250 }
3251
3252 // update my dir_auth values
3253 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3254 // migrations between other nodes)
3255 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3256 pi != m->subtrees.end();
3257 ++pi) {
3258 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3259 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3260 if (!dir)
3261 continue;
3262 adjust_bounded_subtree_auth(dir, pi->second, from);
3263 try_subtree_merge(dir);
3264 }
3265
3266 show_subtrees();
3267
3268 // note ambiguous imports too
3269 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3270 pi != m->ambiguous_imports.end();
3271 ++pi) {
3272 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3273 other_ambiguous_imports[from][pi->first].swap( pi->second );
3274 }
3275
3276 // did i get them all?
3277 resolve_gather.erase(from);
3278
3279 maybe_resolve_finish();
3280
3281 m->put();
3282 }
3283
3284 void MDCache::process_delayed_resolve()
3285 {
3286 dout(10) << "process_delayed_resolve" << dendl;
3287 map<mds_rank_t, MMDSResolve*> tmp;
3288 tmp.swap(delayed_resolve);
3289 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3290 handle_resolve(p->second);
3291 }
3292
3293 void MDCache::discard_delayed_resolve(mds_rank_t who)
3294 {
3295 if (delayed_resolve.count(who)) {
3296 delayed_resolve[who]->put();
3297 delayed_resolve.erase(who);
3298 }
3299 }
3300
3301 void MDCache::maybe_resolve_finish()
3302 {
3303 assert(resolve_ack_gather.empty());
3304 assert(need_resolve_rollback.empty());
3305
3306 if (!resolve_gather.empty()) {
3307 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3308 << resolve_gather << ")" << dendl;
3309 return;
3310 }
3311
3312 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3313 disambiguate_my_imports();
3314 finish_committed_masters();
3315
3316 if (resolve_done) {
3317 assert(mds->is_resolve());
3318 trim_unlinked_inodes();
3319 recalc_auth_bits(false);
3320 resolve_done.release()->complete(0);
3321 } else {
3322 maybe_send_pending_rejoins();
3323 }
3324 }
3325
3326 /* This function puts the passed message before returning */
3327 void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3328 {
3329 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3330 mds_rank_t from = mds_rank_t(ack->get_source().num());
3331
3332 if (!resolve_ack_gather.count(from) ||
3333 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3334 ack->put();
3335 return;
3336 }
3337
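  // if we still have ambiguous slave updates for this rank (e.g. a rename
  // whose srcdn auth mds failed), keep 'from' in resolve_ack_gather until
  // they are all resolved; see the check at the end of this function.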
3338 if (ambiguous_slave_updates.count(from)) {
3339 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3340 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3341 }
3342
3343 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3344 p != ack->commit.end();
3345 ++p) {
3346 dout(10) << " commit on slave " << p->first << dendl;
3347
3348 if (ambiguous_slave_updates.count(from)) {
3349 remove_ambiguous_slave_update(p->first, from);
3350 continue;
3351 }
3352
3353 if (mds->is_resolve()) {
3354 // replay
3355 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3356 assert(su);
3357
3358 // log commit
3359 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3360 ESlaveUpdate::OP_COMMIT, su->origop),
3361 new C_MDC_SlaveCommit(this, from, p->first));
3362 mds->mdlog->flush();
3363
3364 finish_uncommitted_slave_update(p->first, from);
3365 } else {
3366 MDRequestRef mdr = request_get(p->first);
3367 // information about master imported caps
3368 if (p->second.length() > 0)
3369 mdr->more()->inode_import.claim(p->second);
3370
3371 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3372 request_finish(mdr);
3373 }
3374 }
3375
3376 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3377 p != ack->abort.end();
3378 ++p) {
3379 dout(10) << " abort on slave " << *p << dendl;
3380
3381 if (mds->is_resolve()) {
3382 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3383 assert(su);
3384
3385 // perform rollback (and journal a rollback entry)
3386 // note: this will hold up the resolve a bit, until the rollback entries journal.
3387 MDRequestRef null_ref;
3388 switch (su->origop) {
3389 case ESlaveUpdate::LINK:
3390 mds->server->do_link_rollback(su->rollback, from, null_ref);
3391 break;
3392 case ESlaveUpdate::RENAME:
3393 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3394 break;
3395 case ESlaveUpdate::RMDIR:
3396 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3397 break;
3398 default:
3399 ceph_abort();
3400 }
3401 } else {
3402 MDRequestRef mdr = request_get(*p);
3403 mdr->aborted = true;
3404 if (mdr->slave_request) {
3405 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3406 add_rollback(*p, from);
3407 } else {
3408 request_finish(mdr);
3409 }
3410 }
3411 }
3412
3413 if (!ambiguous_slave_updates.count(from))
3414 resolve_ack_gather.erase(from);
3415 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3416 send_subtree_resolves();
3417 process_delayed_resolve();
3418 }
3419
3420 ack->put();
3421 }
3422
3423 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3424 {
3425 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3426 uncommitted_slave_updates[master][reqid] = su;
3427 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3428 uncommitted_slave_rename_olddir[*p]++;
3429 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3430 uncommitted_slave_unlink[*p]++;
3431 }
3432
3433 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3434 {
3435 assert(uncommitted_slave_updates[master].count(reqid));
3436 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3437
3438 uncommitted_slave_updates[master].erase(reqid);
3439 if (uncommitted_slave_updates[master].empty())
3440 uncommitted_slave_updates.erase(master);
3441 // discard the non-auth subtree we renamed out of
3442 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3443 CInode *diri = *p;
3444 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3445 assert(it != uncommitted_slave_rename_olddir.end());
3446 it->second--;
3447 if (it->second == 0) {
3448 uncommitted_slave_rename_olddir.erase(it);
3449 list<CDir*> ls;
3450 diri->get_dirfrags(ls);
3451 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3452 CDir *root = get_subtree_root(*q);
3453 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3454 try_trim_non_auth_subtree(root);
3455 if (*q != root)
3456 break;
3457 }
3458 }
3459 } else
3460 assert(it->second > 0);
3461 }
3462 // remove the inodes that were unlinked by slave updates
3463 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3464 CInode *in = *p;
3465 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3466 assert(it != uncommitted_slave_unlink.end());
3467 it->second--;
3468 if (it->second == 0) {
3469 uncommitted_slave_unlink.erase(it);
3470 if (!in->get_projected_parent_dn())
3471 mds->mdcache->remove_inode_recursive(in);
3472 } else
3473 assert(it->second > 0);
3474 }
3475 delete su;
3476 }
3477
3478 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3479 {
3480
3481 MDSlaveUpdate* su = NULL;
3482 if (uncommitted_slave_updates.count(master) &&
3483 uncommitted_slave_updates[master].count(reqid)) {
3484 su = uncommitted_slave_updates[master][reqid];
3485 assert(su);
3486 }
3487 return su;
3488 }
3489
3490 void MDCache::finish_rollback(metareqid_t reqid) {
3491 assert(need_resolve_rollback.count(reqid));
3492 if (mds->is_resolve())
3493 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3494 need_resolve_rollback.erase(reqid);
3495 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3496 send_subtree_resolves();
3497 process_delayed_resolve();
3498 }
3499 }
3500
3501 void MDCache::disambiguate_other_imports()
3502 {
3503 dout(10) << "disambiguate_other_imports" << dendl;
3504
3505 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3506 // other nodes' ambiguous imports
3507 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3508 p != other_ambiguous_imports.end();
3509 ++p) {
3510 mds_rank_t who = p->first;
3511 dout(10) << "ambiguous imports for mds." << who << dendl;
3512
3513 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3514 q != p->second.end();
3515 ++q) {
3516 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3517 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3518 CDir *dir = get_force_dirfrag(q->first, recovering);
3519 if (!dir) continue;
3520
3521 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3522 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3523 dout(10) << " mds." << who << " did import " << *dir << dendl;
3524 adjust_bounded_subtree_auth(dir, q->second, who);
3525 try_subtree_merge(dir);
3526 } else {
3527 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3528 }
3529 }
3530 }
3531 other_ambiguous_imports.clear();
3532 }
3533
3534 void MDCache::disambiguate_my_imports()
3535 {
3536 dout(10) << "disambiguate_my_imports" << dendl;
3537
3538 if (!mds->is_resolve()) {
3539 assert(my_ambiguous_imports.empty());
3540 return;
3541 }
3542
3543 disambiguate_other_imports();
3544
3545 // my ambiguous imports
3546 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3547 while (!my_ambiguous_imports.empty()) {
3548 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3549
3550 CDir *dir = get_dirfrag(q->first);
3551 assert(dir);
3552
3553 if (dir->authority() != me_ambig) {
3554 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3555 cancel_ambiguous_import(dir);
3556
3557 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3558
3559 // subtree may have been swallowed by another node claiming dir
3560 // as their own.
3561 CDir *root = get_subtree_root(dir);
3562 if (root != dir)
3563 dout(10) << " subtree root is " << *root << dendl;
3564 assert(root->dir_auth.first != mds->get_nodeid()); // not us!
3565 try_trim_non_auth_subtree(root);
3566 } else {
3567 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3568 finish_ambiguous_import(q->first);
3569 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3570 }
3571 }
3572 assert(my_ambiguous_imports.empty());
3573 mds->mdlog->flush();
3574
3575 // verify all my subtrees are unambiguous!
3576 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3577 p != subtrees.end();
3578 ++p) {
3579 CDir *dir = p->first;
3580 if (dir->is_ambiguous_dir_auth()) {
3581 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3582 }
3583 assert(!dir->is_ambiguous_dir_auth());
3584 }
3585
3586 show_subtrees();
3587 }
3588
3589
3590 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3591 {
3592 assert(my_ambiguous_imports.count(base) == 0);
3593 my_ambiguous_imports[base] = bounds;
3594 }
3595
3596
3597 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3598 {
3599 // make a list
3600 vector<dirfrag_t> binos;
3601 for (set<CDir*>::iterator p = bounds.begin();
3602 p != bounds.end();
3603 ++p)
3604 binos.push_back((*p)->dirfrag());
3605
3606 // note: this can get called twice if the exporter fails during recovery
3607 if (my_ambiguous_imports.count(base->dirfrag()))
3608 my_ambiguous_imports.erase(base->dirfrag());
3609
3610 add_ambiguous_import(base->dirfrag(), binos);
3611 }
3612
3613 void MDCache::cancel_ambiguous_import(CDir *dir)
3614 {
3615 dirfrag_t df = dir->dirfrag();
3616 assert(my_ambiguous_imports.count(df));
3617 dout(10) << "cancel_ambiguous_import " << df
3618 << " bounds " << my_ambiguous_imports[df]
3619 << " " << *dir
3620 << dendl;
3621 my_ambiguous_imports.erase(df);
3622 }
3623
3624 void MDCache::finish_ambiguous_import(dirfrag_t df)
3625 {
3626 assert(my_ambiguous_imports.count(df));
3627 vector<dirfrag_t> bounds;
3628 bounds.swap(my_ambiguous_imports[df]);
3629 my_ambiguous_imports.erase(df);
3630
3631 dout(10) << "finish_ambiguous_import " << df
3632 << " bounds " << bounds
3633 << dendl;
3634 CDir *dir = get_dirfrag(df);
3635 assert(dir);
3636
3637 // adjust dir_auth, import maps
3638 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3639 try_subtree_merge(dir);
3640 }
3641
3642 void MDCache::remove_inode_recursive(CInode *in)
3643 {
3644 dout(10) << "remove_inode_recursive " << *in << dendl;
3645 list<CDir*> ls;
3646 in->get_dirfrags(ls);
3647 list<CDir*>::iterator p = ls.begin();
3648 while (p != ls.end()) {
3649 CDir *subdir = *p++;
3650
3651 dout(10) << " removing dirfrag " << subdir << dendl;
3652 CDir::map_t::iterator q = subdir->items.begin();
3653 while (q != subdir->items.end()) {
3654 CDentry *dn = q->second;
3655 ++q;
3656 CDentry::linkage_t *dnl = dn->get_linkage();
3657 if (dnl->is_primary()) {
3658 CInode *tin = dnl->get_inode();
3659 subdir->unlink_inode(dn, false);
3660 remove_inode_recursive(tin);
3661 }
3662 subdir->remove_dentry(dn);
3663 }
3664
3665 if (subdir->is_subtree_root())
3666 remove_subtree(subdir);
3667 in->close_dirfrag(subdir->dirfrag().frag);
3668 }
3669 remove_inode(in);
3670 }
3671
3672 bool MDCache::expire_recursive(
3673 CInode *in,
3674 map<mds_rank_t, MCacheExpire*>& expiremap)
3675 {
3676 assert(!in->is_auth());
3677
3678 dout(10) << __func__ << ":" << *in << dendl;
3679
3680 // Recurse into any dirfrags beneath this inode
3681 list<CDir*> ls;
3682 in->get_dirfrags(ls);
3683 for (auto subdir : ls) {
3684 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3685 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3686 return true;
3687 }
3688
3689 for (auto &it : subdir->items) {
3690 CDentry *dn = it.second;
3691 CDentry::linkage_t *dnl = dn->get_linkage();
3692 if (dnl->is_primary()) {
3693 CInode *tin = dnl->get_inode();
3694
3695 /* Remote strays with linkage (i.e. hardlinks) should not be
3696 * expired, because they may be the target of
3697 * a rename() as the owning MDS shuts down */
3698 if (!tin->is_stray() && tin->inode.nlink) {
3699 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3700 return true;
3701 }
3702
3703 const bool abort = expire_recursive(tin, expiremap);
3704 if (abort) {
3705 return true;
3706 }
3707 }
3708 if (dn->lru_is_expireable()) {
3709 trim_dentry(dn, expiremap);
3710 } else {
3711 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3712 return true;
3713 }
3714 }
3715 }
3716
3717 return false;
3718 }
3719
3720 void MDCache::trim_unlinked_inodes()
3721 {
3722 dout(7) << "trim_unlinked_inodes" << dendl;
3723 list<CInode*> q;
3724 for (auto p : inode_map) {
3725 CInode *in = p.second;
3726 if (in->get_parent_dn() == NULL && !in->is_base()) {
3727 dout(7) << " will trim from " << *in << dendl;
3728 q.push_back(in);
3729 }
3730 }
3731 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3732 remove_inode_recursive(*p);
3733 }
3734
3735 /** recalc_auth_bits()
3736 * once subtree auth is disambiguated, we need to adjust all the
3737 * auth and dirty bits in our cache before moving on.
3738 */
3739 void MDCache::recalc_auth_bits(bool replay)
3740 {
3741 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3742
3743 if (root) {
3744 root->inode_auth.first = mds->mdsmap->get_root();
3745 bool auth = mds->get_nodeid() == root->inode_auth.first;
3746 if (auth) {
3747 root->state_set(CInode::STATE_AUTH);
3748 } else {
3749 root->state_clear(CInode::STATE_AUTH);
3750 if (!replay)
3751 root->state_set(CInode::STATE_REJOINING);
3752 }
3753 }
3754
3755 set<CInode*> subtree_inodes;
3756 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3757 p != subtrees.end();
3758 ++p) {
3759 if (p->first->dir_auth.first == mds->get_nodeid())
3760 subtree_inodes.insert(p->first->inode);
3761 }
3762
3763 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3764 p != subtrees.end();
3765 ++p) {
3766 if (p->first->inode->is_mdsdir()) {
3767 CInode *in = p->first->inode;
3768 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3769 if (auth) {
3770 in->state_set(CInode::STATE_AUTH);
3771 } else {
3772 in->state_clear(CInode::STATE_AUTH);
3773 if (!replay)
3774 in->state_set(CInode::STATE_REJOINING);
3775 }
3776 }
3777
3778 list<CDir*> dfq; // dirfrag queue
3779 dfq.push_back(p->first);
3780
3781 bool auth = p->first->authority().first == mds->get_nodeid();
3782 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3783
3784 while (!dfq.empty()) {
3785 CDir *dir = dfq.front();
3786 dfq.pop_front();
3787
3788 // dir
3789 if (auth) {
3790 dir->state_set(CDir::STATE_AUTH);
3791 } else {
3792 dir->state_clear(CDir::STATE_AUTH);
3793 if (!replay) {
3794 // close empty non-auth dirfrag
3795 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3796 dir->inode->close_dirfrag(dir->get_frag());
3797 continue;
3798 }
3799 dir->state_set(CDir::STATE_REJOINING);
3800 dir->state_clear(CDir::STATE_COMPLETE);
3801 if (dir->is_dirty())
3802 dir->mark_clean();
3803 }
3804 }
3805
3806 // dentries in this dir
3807 for (CDir::map_t::iterator q = dir->items.begin();
3808 q != dir->items.end();
3809 ++q) {
3810 // dn
3811 CDentry *dn = q->second;
3812 CDentry::linkage_t *dnl = dn->get_linkage();
3813 if (auth) {
3814 dn->state_set(CDentry::STATE_AUTH);
3815 } else {
3816 dn->state_clear(CDentry::STATE_AUTH);
3817 if (!replay) {
3818 dn->state_set(CDentry::STATE_REJOINING);
3819 if (dn->is_dirty())
3820 dn->mark_clean();
3821 }
3822 }
3823
3824 if (dnl->is_primary()) {
3825 // inode
3826 CInode *in = dnl->get_inode();
3827 if (auth) {
3828 in->state_set(CInode::STATE_AUTH);
3829 } else {
3830 in->state_clear(CInode::STATE_AUTH);
3831 if (!replay) {
3832 in->state_set(CInode::STATE_REJOINING);
3833 if (in->is_dirty())
3834 in->mark_clean();
3835 if (in->is_dirty_parent())
3836 in->clear_dirty_parent();
3837 // avoid touching scatterlocks for our subtree roots!
3838 if (subtree_inodes.count(in) == 0)
3839 in->clear_scatter_dirty();
3840 }
3841 }
3842 // recurse?
3843 if (in->is_dir())
3844 in->get_nested_dirfrags(dfq);
3845 }
3846 }
3847 }
3848 }
3849
3850 show_subtrees();
3851 show_cache();
3852 }
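
// Illustrative sketch (not part of the original source): the per-object rule
// recalc_auth_bits() applies while walking each subtree, shown here for the
// inode case only; dirfrags and dentries follow the same pattern with their
// own AUTH/REJOINING state bits. The helper name is hypothetical.
#if 0
static void recalc_inode_auth_bit(CInode *in, bool subtree_is_mine, bool replay)
{
  if (subtree_is_mine) {
    in->state_set(CInode::STATE_AUTH);
  } else {
    in->state_clear(CInode::STATE_AUTH);
    if (!replay) {
      // non-auth replicas are marked rejoining and must not stay dirty
      in->state_set(CInode::STATE_REJOINING);
      if (in->is_dirty())
        in->mark_clean();
    }
  }
}
#endif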
3853
3854
3855
3856 // ===========================================================================
3857 // REJOIN
3858
3859 /*
3860 * notes on scatterlock recovery:
3861 *
3862 * - recovering inode replica sends scatterlock data for any subtree
3863 * roots (the only ones that are possibly dirty).
3864 *
3865 * - surviving auth incorporates any provided scatterlock data. any
3866 * pending gathers are then finished, as with the other lock types.
3867 *
3868 * that takes care of surviving auth + (recovering replica)*.
3869 *
3870 * - surviving replica sends strong_inode, which includes current
3871 * scatterlock state, AND any dirty scatterlock data. this
3872 * provides the recovering auth with everything it might need.
3873 *
3874 * - recovering auth must pick initial scatterlock state based on
3875 * (weak|strong) rejoins.
3876 * - always assimilate scatterlock data (it can't hurt)
3877 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3878 * - include base inode in ack for all inodes that saw scatterlock content
3879 *
3880 * also, for scatter gather,
3881 *
3882 * - auth increments {frag,r}stat.version on completion of any gather.
3883 *
3884 * - auth incorporates changes in a gather _only_ if the version
3885 * matches.
3886 *
3887 * - replica discards changes any time the scatterlock syncs, and
3888 * after recovery.
3889 */
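
// Illustrative sketch (not part of the original source): the scatter-gather
// version rule from the notes above, reduced to its core check. The names
// here are hypothetical; the real bookkeeping lives in the scatterlock code
// and in CInode's fragstat/rstat handling.
#if 0
static bool fold_in_gathered_stats(version_t auth_accounted_version,
                                   version_t gathered_version)
{
  // the auth only incorporates gathered {frag,r}stat deltas whose version
  // matches its current one; the version is bumped when a gather completes,
  // so anything tagged with an older version is stale and gets dropped.
  return gathered_version == auth_accounted_version;
}
#endif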
3890
3891 void MDCache::dump_rejoin_status(Formatter *f) const
3892 {
3893 f->open_object_section("rejoin_status");
3894 f->dump_stream("rejoin_gather") << rejoin_gather;
3895 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3896 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3897 f->close_section();
3898 }
3899
3900 void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3901 {
3902 dout(10) << "rejoin_start" << dendl;
3903 assert(!rejoin_done);
3904 rejoin_done.reset(rejoin_done_);
3905
3906 rejoin_gather = recovery_set;
3907 // need to finish opening cap inodes before sending cache rejoins
3908 rejoin_gather.insert(mds->get_nodeid());
3909 process_imported_caps();
3910 }
3911
3912 /*
3913 * rejoin phase!
3914 *
3915 * this initiates rejoin. it should be called before we get any
3916 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3917 *
3918 * we start out by sending rejoins to everyone in the recovery set.
3919 *
3920 * if we are rejoining, send for all regions in our cache.
3921 * if we are active|stopping, send only to nodes that are rejoining.
3922 */
3923 void MDCache::rejoin_send_rejoins()
3924 {
3925 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3926
3927 if (rejoin_gather.count(mds->get_nodeid())) {
3928 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3929 rejoins_pending = true;
3930 return;
3931 }
3932 if (!resolve_gather.empty()) {
3933 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3934 << resolve_gather << ")" << dendl;
3935 rejoins_pending = true;
3936 return;
3937 }
3938
3939 assert(!migrator->is_importing());
3940 assert(!migrator->is_exporting());
3941
3942 if (!mds->is_rejoin()) {
3943 disambiguate_other_imports();
3944 }
3945
3946 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3947
3948
3949 // if i am rejoining, send a rejoin to everyone.
3950 // otherwise, just send to others who are rejoining.
3951 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3952 p != recovery_set.end();
3953 ++p) {
3954 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3955 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3956 if (mds->is_rejoin())
3957 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3958 else if (mds->mdsmap->is_rejoin(*p))
3959 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3960 }
3961
3962 if (mds->is_rejoin()) {
3963 map<client_t, set<mds_rank_t> > client_exports;
3964 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3965 assert(cap_export_targets.count(p->first));
3966 mds_rank_t target = cap_export_targets[p->first];
3967 if (rejoins.count(target) == 0)
3968 continue;
3969 rejoins[target]->cap_exports[p->first] = p->second;
3970 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3971 client_exports[q->first].insert(target);
3972 }
3973 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3974 p != client_exports.end();
3975 ++p) {
3976 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3977 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3978 rejoins[*q]->client_map[p->first] = inst;
3979 }
3980 }
3981
3982
3983 // check all subtrees
3984 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
3985 p != subtrees.end();
3986 ++p) {
3987 CDir *dir = p->first;
3988 assert(dir->is_subtree_root());
3989 if (dir->is_ambiguous_dir_auth()) {
3990 // exporter is recovering, importer is survivor.
3991 assert(rejoins.count(dir->authority().first));
3992 assert(!rejoins.count(dir->authority().second));
3993 continue;
3994 }
3995
3996 // my subtree?
3997 if (dir->is_auth())
3998 continue; // skip my own regions!
3999
4000 mds_rank_t auth = dir->get_dir_auth().first;
4001 assert(auth >= 0);
4002 if (rejoins.count(auth) == 0)
4003 continue; // don't care about this node's subtrees
4004
4005 rejoin_walk(dir, rejoins[auth]);
4006 }
4007
4008 // rejoin root inodes, too
4009 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4010 p != rejoins.end();
4011 ++p) {
4012 if (mds->is_rejoin()) {
4013 // weak
4014 if (p->first == 0 && root) {
4015 p->second->add_weak_inode(root->vino());
4016 if (root->is_dirty_scattered()) {
4017 dout(10) << " sending scatterlock state on root " << *root << dendl;
4018 p->second->add_scatterlock_state(root);
4019 }
4020 }
4021 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4022 // in is guaranteed non-null inside this block; no need to re-check
4023 p->second->add_weak_inode(in->vino());
4024 }
4025 } else {
4026 // strong
4027 if (p->first == 0 && root) {
4028 p->second->add_strong_inode(root->vino(),
4029 root->get_replica_nonce(),
4030 root->get_caps_wanted(),
4031 root->filelock.get_state(),
4032 root->nestlock.get_state(),
4033 root->dirfragtreelock.get_state());
4034 root->state_set(CInode::STATE_REJOINING);
4035 if (root->is_dirty_scattered()) {
4036 dout(10) << " sending scatterlock state on root " << *root << dendl;
4037 p->second->add_scatterlock_state(root);
4038 }
4039 }
4040
4041 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4042 p->second->add_strong_inode(in->vino(),
4043 in->get_replica_nonce(),
4044 in->get_caps_wanted(),
4045 in->filelock.get_state(),
4046 in->nestlock.get_state(),
4047 in->dirfragtreelock.get_state());
4048 in->state_set(CInode::STATE_REJOINING);
4049 }
4050 }
4051 }
4052
4053 if (!mds->is_rejoin()) {
4054 // i am survivor. send strong rejoin.
4055 // note request remote_auth_pins, xlocks
4056 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4057 p != active_requests.end();
4058 ++p) {
4059 MDRequestRef& mdr = p->second;
4060 if (mdr->is_slave())
4061 continue;
4062 // auth pins
4063 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4064 q != mdr->remote_auth_pins.end();
4065 ++q) {
4066 if (!q->first->is_auth()) {
4067 assert(q->second == q->first->authority().first);
4068 if (rejoins.count(q->second) == 0) continue;
4069 MMDSCacheRejoin *rejoin = rejoins[q->second];
4070
4071 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4072 MDSCacheObjectInfo i;
4073 q->first->set_object_info(i);
4074 if (i.ino)
4075 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4076 else
4077 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4078
4079 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4080 mdr->more()->rename_inode == q->first)
4081 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4082 mdr->reqid, mdr->attempt);
4083 }
4084 }
4085 // xlocks
4086 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4087 q != mdr->xlocks.end();
4088 ++q) {
4089 if (!(*q)->get_parent()->is_auth()) {
4090 mds_rank_t who = (*q)->get_parent()->authority().first;
4091 if (rejoins.count(who) == 0) continue;
4092 MMDSCacheRejoin *rejoin = rejoins[who];
4093
4094 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4095 MDSCacheObjectInfo i;
4096 (*q)->get_parent()->set_object_info(i);
4097 if (i.ino)
4098 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4099 mdr->reqid, mdr->attempt);
4100 else
4101 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4102 mdr->reqid, mdr->attempt);
4103 }
4104 }
4105 // remote wrlocks
4106 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4107 q != mdr->remote_wrlocks.end();
4108 ++q) {
4109 mds_rank_t who = q->second;
4110 if (rejoins.count(who) == 0) continue;
4111 MMDSCacheRejoin *rejoin = rejoins[who];
4112
4113 dout(15) << " " << *mdr << " wrlock on " << q->second
4114 << " " << q->first->get_parent() << dendl;
4115 MDSCacheObjectInfo i;
4116 q->first->get_parent()->set_object_info(i);
4117 assert(i.ino);
4118 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4119 mdr->reqid, mdr->attempt);
4120 }
4121 }
4122 }
4123
4124 // send the messages
4125 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4126 p != rejoins.end();
4127 ++p) {
4128 assert(rejoin_sent.count(p->first) == 0);
4129 assert(rejoin_ack_gather.count(p->first) == 0);
4130 rejoin_sent.insert(p->first);
4131 rejoin_ack_gather.insert(p->first);
4132 mds->send_message_mds(p->second, p->first);
4133 }
4134 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4135 rejoins_pending = false;
4136
4137 // nothing?
4138 if (mds->is_rejoin() && rejoins.empty()) {
4139 dout(10) << "nothing to rejoin" << dendl;
4140 rejoin_gather_finish();
4141 }
4142 }
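
// Illustrative sketch (not part of the original source): the per-peer
// decision made at the top of rejoin_send_rejoins(), as a hypothetical
// helper. A recovering rank sends weak rejoins to everyone in the recovery
// set; a survivor sends strong rejoins, and only to ranks that are
// themselves rejoining.
#if 0
enum class rejoin_kind { none, weak, strong };

static rejoin_kind pick_rejoin_op(bool i_am_rejoining, bool peer_is_rejoining)
{
  if (i_am_rejoining)
    return rejoin_kind::weak;     // MMDSCacheRejoin::OP_WEAK
  if (peer_is_rejoining)
    return rejoin_kind::strong;   // MMDSCacheRejoin::OP_STRONG
  return rejoin_kind::none;       // nothing to send to this peer
}
#endif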
4143
4144
4145 /**
4146 * rejoin_walk - build rejoin declarations for a subtree
4147 *
4148 * @param dir subtree root
4149 * @param rejoin rejoin message
4150 *
4151 * from a rejoining node:
4152 * weak dirfrag
4153 * weak dentries (w/ connectivity)
4154 *
4155 * from a surviving node:
4156 * strong dirfrag
4157 * strong dentries (no connectivity!)
4158 * strong inodes
4159 */
4160 void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4161 {
4162 dout(10) << "rejoin_walk " << *dir << dendl;
4163
4164 list<CDir*> nested; // finish this dir, then do nested items
4165
4166 if (mds->is_rejoin()) {
4167 // WEAK
4168 rejoin->add_weak_dirfrag(dir->dirfrag());
4169 for (CDir::map_t::iterator p = dir->items.begin();
4170 p != dir->items.end();
4171 ++p) {
4172 CDentry *dn = p->second;
4173 CDentry::linkage_t *dnl = dn->get_linkage();
4174 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4175 assert(dnl->is_primary());
4176 CInode *in = dnl->get_inode();
4177 assert(dnl->get_inode()->is_dir());
4178 rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
4179 in->get_nested_dirfrags(nested);
4180 if (in->is_dirty_scattered()) {
4181 dout(10) << " sending scatterlock state on " << *in << dendl;
4182 rejoin->add_scatterlock_state(in);
4183 }
4184 }
4185 } else {
4186 // STRONG
4187 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4188 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4189 dir->state_set(CDir::STATE_REJOINING);
4190
4191 for (CDir::map_t::iterator p = dir->items.begin();
4192 p != dir->items.end();
4193 ++p) {
4194 CDentry *dn = p->second;
4195 CDentry::linkage_t *dnl = dn->get_linkage();
4196 dout(15) << " add_strong_dentry " << *dn << dendl;
4197 rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4198 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4199 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4200 dnl->is_remote() ? dnl->get_remote_d_type():0,
4201 dn->get_replica_nonce(),
4202 dn->lock.get_state());
4203 dn->state_set(CDentry::STATE_REJOINING);
4204 if (dnl->is_primary()) {
4205 CInode *in = dnl->get_inode();
4206 dout(15) << " add_strong_inode " << *in << dendl;
4207 rejoin->add_strong_inode(in->vino(),
4208 in->get_replica_nonce(),
4209 in->get_caps_wanted(),
4210 in->filelock.get_state(),
4211 in->nestlock.get_state(),
4212 in->dirfragtreelock.get_state());
4213 in->state_set(CInode::STATE_REJOINING);
4214 in->get_nested_dirfrags(nested);
4215 if (in->is_dirty_scattered()) {
4216 dout(10) << " sending scatterlock state on " << *in << dendl;
4217 rejoin->add_scatterlock_state(in);
4218 }
4219 }
4220 }
4221 }
4222
4223 // recurse into nested dirs
4224 for (list<CDir*>::iterator p = nested.begin();
4225 p != nested.end();
4226 ++p)
4227 rejoin_walk(*p, rejoin);
4228 }
4229
4230
4231 /*
4232 * i got a rejoin.
4233 * - reply with the lockstate
4234 *
4235 * if i am active|stopping,
4236 * - remove source from replica list for everything not referenced here.
4237 * This function puts the passed message before returning.
4238 */
4239 void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4240 {
4241 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4242 << " (" << m->get_payload().length() << " bytes)"
4243 << dendl;
4244
4245 switch (m->op) {
4246 case MMDSCacheRejoin::OP_WEAK:
4247 handle_cache_rejoin_weak(m);
4248 break;
4249 case MMDSCacheRejoin::OP_STRONG:
4250 handle_cache_rejoin_strong(m);
4251 break;
4252 case MMDSCacheRejoin::OP_ACK:
4253 handle_cache_rejoin_ack(m);
4254 break;
4255
4256 default:
4257 ceph_abort();
4258 }
4259 m->put();
4260 }
4261
4262
4263 /*
4264 * handle_cache_rejoin_weak
4265 *
4266 * the sender
4267 * - is recovering from their journal.
4268 * - may have incorrect (out of date) inode contents
4269 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4270 *
4271 * if the sender didn't trim_non_auth(), they
4272 * - may have incorrect (out of date) dentry/inode linkage
4273 * - may have deleted/purged inodes
4274 * and i may have to go to disk to get accurate inode contents. yuck.
4275 * This function DOES NOT put the passed message before returning
4276 */
4277 void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4278 {
4279 mds_rank_t from = mds_rank_t(weak->get_source().num());
4280
4281 // possible response(s)
4282 MMDSCacheRejoin *ack = 0; // if survivor
4283 set<vinodeno_t> acked_inodes; // if survivor
4284 set<SimpleLock *> gather_locks; // if survivor
4285 bool survivor = false; // am i a survivor?
4286
4287 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4288 survivor = true;
4289 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
4290 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4291
4292 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4293
4294 // check cap exports
4295 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4296 CInode *in = get_inode(p->first);
4297 assert(!in || in->is_auth());
4298 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4299 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4300 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4301 Capability::Import& im = imported_caps[p->first][q->first];
4302 if (cap) {
4303 im.cap_id = cap->get_cap_id();
4304 im.issue_seq = cap->get_last_seq();
4305 im.mseq = cap->get_mseq();
4306 } else {
4307 // all are zero
4308 }
4309 }
4310 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4311 }
4312
4313 ::encode(imported_caps, ack->imported_caps);
4314 } else {
4315 assert(mds->is_rejoin());
4316
4317 // we may have already received a strong rejoin from the sender.
4318 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4319 assert(gather_locks.empty());
4320
4321 // check cap exports.
4322 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4323
4324 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4325 CInode *in = get_inode(p->first);
4326 assert(!in || in->is_auth());
4327 // note
4328 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4329 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4330 cap_imports[p->first][q->first][from] = q->second;
4331 }
4332 }
4333 }
4334
4335 // assimilate any potentially dirty scatterlock state
4336 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4337 p != weak->inode_scatterlocks.end();
4338 ++p) {
4339 CInode *in = get_inode(p->first);
4340 assert(in);
4341 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4342 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4343 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4344 if (!survivor)
4345 rejoin_potential_updated_scatterlocks.insert(in);
4346 }
4347
4348 // recovering peer may send incorrect dirfrags here. we need to
4349 // infer which dirfrag they meant. the ack will include a
4350 // strong_dirfrag that will set them straight on the fragmentation.
4351
4352 // walk weak map
4353 set<CDir*> dirs_to_share;
4354 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4355 p != weak->weak_dirfrags.end();
4356 ++p) {
4357 CInode *diri = get_inode(p->ino);
4358 if (!diri)
4359 dout(0) << " missing dir ino " << p->ino << dendl;
4360 assert(diri);
4361
4362 list<frag_t> ls;
4363 if (diri->dirfragtree.is_leaf(p->frag)) {
4364 ls.push_back(p->frag);
4365 } else {
4366 diri->dirfragtree.get_leaves_under(p->frag, ls);
4367 if (ls.empty())
4368 ls.push_back(diri->dirfragtree[p->frag.value()]);
4369 }
4370 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4371 frag_t fg = *q;
4372 CDir *dir = diri->get_dirfrag(fg);
4373 if (!dir) {
4374 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4375 continue;
4376 }
4377 assert(dir);
4378 if (dirs_to_share.count(dir)) {
4379 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4380 } else {
4381 dirs_to_share.insert(dir);
4382 unsigned nonce = dir->add_replica(from);
4383 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4384 if (ack) {
4385 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4386 ack->add_dirfrag_base(dir);
4387 }
4388 }
4389 }
4390 }
4391
4392 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4393 p != weak->weak.end();
4394 ++p) {
4395 CInode *diri = get_inode(p->first);
4396 if (!diri)
4397 dout(0) << " missing dir ino " << p->first << dendl;
4398 assert(diri);
4399
4400 // weak dentries
4401 CDir *dir = 0;
4402 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4403 q != p->second.end();
4404 ++q) {
4405 // locate proper dirfrag.
4406 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4407 frag_t fg = diri->pick_dirfrag(q->first.name);
4408 if (!dir || dir->get_frag() != fg) {
4409 dir = diri->get_dirfrag(fg);
4410 if (!dir)
4411 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4412 assert(dir);
4413 assert(dirs_to_share.count(dir));
4414 }
4415
4416 // and dentry
4417 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4418 assert(dn);
4419 CDentry::linkage_t *dnl = dn->get_linkage();
4420 assert(dnl->is_primary());
4421
4422 if (survivor && dn->is_replica(from))
4423 dentry_remove_replica(dn, from, gather_locks);
4424 unsigned dnonce = dn->add_replica(from);
4425 dout(10) << " have " << *dn << dendl;
4426 if (ack)
4427 ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4428 dnl->get_inode()->ino(), inodeno_t(0), 0,
4429 dnonce, dn->lock.get_replica_state());
4430
4431 // inode
4432 CInode *in = dnl->get_inode();
4433 assert(in);
4434
4435 if (survivor && in->is_replica(from))
4436 inode_remove_replica(in, from, true, gather_locks);
4437 unsigned inonce = in->add_replica(from);
4438 dout(10) << " have " << *in << dendl;
4439
4440 // scatter the dirlock, just in case?
4441 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4442 in->filelock.set_state(LOCK_MIX);
4443
4444 if (ack) {
4445 acked_inodes.insert(in->vino());
4446 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4447 bufferlist bl;
4448 in->_encode_locks_state_for_rejoin(bl, from);
4449 ack->add_inode_locks(in, inonce, bl);
4450 }
4451 }
4452 }
4453
4454 // weak base inodes? (root, stray, etc.)
4455 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4456 p != weak->weak_inodes.end();
4457 ++p) {
4458 CInode *in = get_inode(*p);
4459 assert(in); // hmm fixme wrt stray?
4460 if (survivor && in->is_replica(from))
4461 inode_remove_replica(in, from, true, gather_locks);
4462 unsigned inonce = in->add_replica(from);
4463 dout(10) << " have base " << *in << dendl;
4464
4465 if (ack) {
4466 acked_inodes.insert(in->vino());
4467 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4468 bufferlist bl;
4469 in->_encode_locks_state_for_rejoin(bl, from);
4470 ack->add_inode_locks(in, inonce, bl);
4471 }
4472 }
4473
4474 assert(rejoin_gather.count(from));
4475 rejoin_gather.erase(from);
4476 if (survivor) {
4477 // survivor. do everything now.
4478 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4479 p != weak->inode_scatterlocks.end();
4480 ++p) {
4481 CInode *in = get_inode(p->first);
4482 assert(in);
4483 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4484 acked_inodes.insert(in->vino());
4485 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4486 }
4487
4488 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4489 mds->send_message(ack, weak->get_connection());
4490
4491 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4492 if (!(*p)->is_stable())
4493 mds->locker->eval_gather(*p);
4494 }
4495 } else {
4496 // done?
4497 if (rejoin_gather.empty()) {
4498 rejoin_gather_finish();
4499 } else {
4500 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4501 }
4502 }
4503 }
4504
4505 class C_MDC_RejoinGatherFinish : public MDCacheContext {
4506 public:
4507 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4508 void finish(int r) override {
4509 mdcache->rejoin_gather_finish();
4510 }
4511 };
4512
4513 /*
4514 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4515 *
4516 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4517 * ack, the replica dne, and we can remove it from our replica maps.
4518 */
4519 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4520 set<vinodeno_t>& acked_inodes,
4521 set<SimpleLock *>& gather_locks)
4522 {
4523 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4524
4525 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4526 // inode?
4527 if (in->is_auth() &&
4528 in->is_replica(from) &&
4529 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4530 inode_remove_replica(in, from, false, gather_locks);
4531 dout(10) << " rem " << *in << dendl;
4532 }
4533
4534 if (!in->is_dir())
4535 return;
4536
4537 list<CDir*> dfs;
4538 in->get_dirfrags(dfs);
4539 for (list<CDir*>::iterator p = dfs.begin();
4540 p != dfs.end();
4541 ++p) {
4542 CDir *dir = *p;
4543 if (!dir->is_auth())
4544 continue;
4545
4546 if (dir->is_replica(from) &&
4547 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4548 dir->remove_replica(from);
4549 dout(10) << " rem " << *dir << dendl;
4550 }
4551
4552 // dentries
4553 for (CDir::map_t::iterator p = dir->items.begin();
4554 p != dir->items.end();
4555 ++p) {
4556 CDentry *dn = p->second;
4557
4558 if (dn->is_replica(from) &&
4559 (ack == NULL ||
4560 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4561 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
4562 dentry_remove_replica(dn, from, gather_locks);
4563 dout(10) << " rem " << *dn << dendl;
4564 }
4565 }
4566 }
4567 };
4568
4569 for (auto p : inode_map)
4570 scour_func(p.second);
4571 for (auto p : snap_inode_map)
4572 scour_func(p.second);
4573 }
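
// Illustrative sketch (not part of the original source): the common shape of
// the three checks inside scour_func above, as one hypothetical predicate.
// 'mentioned_in_ack' stands in for the per-type lookups (acked_inodes,
// ack->strong_dirfrags, ack->strong_dentries).
#if 0
static bool should_drop_replica(bool object_is_auth, bool replica_of_from,
                                bool have_ack, bool mentioned_in_ack)
{
  // we only scour objects we are auth for; a replica entry for 'from'
  // survives only if the ack (when there is one) explicitly validated it.
  return object_is_auth && replica_of_from && (!have_ack || !mentioned_in_ack);
}
#endif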
4574
4575
4576 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4577 {
4578 CInode *in = new CInode(this, true, 1, last);
4579 in->inode.ino = ino;
4580 in->state_set(CInode::STATE_REJOINUNDEF);
4581 add_inode(in);
4582 rejoin_undef_inodes.insert(in);
4583 dout(10) << " invented " << *in << dendl;
4584 return in;
4585 }
4586
4587 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4588 {
4589 CInode *in = get_inode(df.ino);
4590 if (!in)
4591 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4592 if (!in->is_dir()) {
4593 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4594 in->inode.mode = S_IFDIR;
4595 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4596 }
4597 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4598 dir->state_set(CDir::STATE_REJOINUNDEF);
4599 rejoin_undef_dirfrags.insert(dir);
4600 dout(10) << " invented " << *dir << dendl;
4601 return dir;
4602 }
4603
4604 /* This function DOES NOT put the passed message before returning */
4605 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4606 {
4607 mds_rank_t from = mds_rank_t(strong->get_source().num());
4608
4609 // only a recovering node will get a strong rejoin.
4610 assert(mds->is_rejoin());
4611
4612 // assimilate any potentially dirty scatterlock state
4613 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4614 p != strong->inode_scatterlocks.end();
4615 ++p) {
4616 CInode *in = get_inode(p->first);
4617 assert(in);
4618 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4619 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4620 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4621 rejoin_potential_updated_scatterlocks.insert(in);
4622 }
4623
4624 rejoin_unlinked_inodes[from].clear();
4625
4626 // surviving peer may send incorrect dirfrag here (maybe they didn't
4627 // get the fragment notify, or maybe we rolled back?). we need to
4628 // infer the right frag and get them with the program. somehow.
4629 // we don't normally send ACK.. so we'll need to bundle this with
4630 // MISSING or something.
4631
4632 // strong dirfrags/dentries.
4633 // also process auth_pins, xlocks.
4634 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4635 p != strong->strong_dirfrags.end();
4636 ++p) {
4637 CInode *diri = get_inode(p->first.ino);
4638 if (!diri)
4639 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4640 CDir *dir = diri->get_dirfrag(p->first.frag);
4641 bool refragged = false;
4642 if (dir) {
4643 dout(10) << " have " << *dir << dendl;
4644 } else {
4645 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4646 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4647 else if (diri->dirfragtree.is_leaf(p->first.frag))
4648 dir = rejoin_invent_dirfrag(p->first);
4649 }
4650 if (dir) {
4651 dir->add_replica(from, p->second.nonce);
4652 dir->dir_rep = p->second.dir_rep;
4653 } else {
4654 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4655 list<frag_t> ls;
4656 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4657 if (ls.empty())
4658 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4659 dout(10) << " maps to frag(s) " << ls << dendl;
4660 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4661 CDir *dir = diri->get_dirfrag(*q);
4662 if (!dir)
4663 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4664 else
4665 dout(10) << " have(approx) " << *dir << dendl;
4666 dir->add_replica(from, p->second.nonce);
4667 dir->dir_rep = p->second.dir_rep;
4668 }
4669 refragged = true;
4670 }
4671
4672 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4673 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4674 q != dmap.end();
4675 ++q) {
4676 CDentry *dn;
4677 if (!refragged)
4678 dn = dir->lookup(q->first.name, q->first.snapid);
4679 else {
4680 frag_t fg = diri->pick_dirfrag(q->first.name);
4681 dir = diri->get_dirfrag(fg);
4682 assert(dir);
4683 dn = dir->lookup(q->first.name, q->first.snapid);
4684 }
4685 if (!dn) {
4686 if (q->second.is_remote()) {
4687 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4688 q->second.first, q->first.snapid);
4689 } else if (q->second.is_null()) {
4690 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4691 } else {
4692 CInode *in = get_inode(q->second.ino, q->first.snapid);
4693 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4694 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4695 }
4696 dout(10) << " invented " << *dn << dendl;
4697 }
4698 CDentry::linkage_t *dnl = dn->get_linkage();
4699
4700 // dn auth_pin?
4701 if (strong->authpinned_dentries.count(p->first) &&
4702 strong->authpinned_dentries[p->first].count(q->first)) {
4703 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4704 r != strong->authpinned_dentries[p->first][q->first].end();
4705 ++r) {
4706 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4707
4708 // get/create slave mdrequest
4709 MDRequestRef mdr;
4710 if (have_request(r->reqid))
4711 mdr = request_get(r->reqid);
4712 else
4713 mdr = request_start_slave(r->reqid, r->attempt, strong);
4714 mdr->auth_pin(dn);
4715 }
4716 }
4717
4718 // dn xlock?
4719 if (strong->xlocked_dentries.count(p->first) &&
4720 strong->xlocked_dentries[p->first].count(q->first)) {
4721 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4722 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4723 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4724 assert(mdr->is_auth_pinned(dn));
4725 if (!mdr->xlocks.count(&dn->versionlock)) {
4726 assert(dn->versionlock.can_xlock_local());
4727 dn->versionlock.get_xlock(mdr, mdr->get_client());
4728 mdr->xlocks.insert(&dn->versionlock);
4729 mdr->locks.insert(&dn->versionlock);
4730 }
4731 if (dn->lock.is_stable())
4732 dn->auth_pin(&dn->lock);
4733 dn->lock.set_state(LOCK_XLOCK);
4734 dn->lock.get_xlock(mdr, mdr->get_client());
4735 mdr->xlocks.insert(&dn->lock);
4736 mdr->locks.insert(&dn->lock);
4737 }
4738
4739 dn->add_replica(from, q->second.nonce);
4740 dout(10) << " have " << *dn << dendl;
4741
4742 if (dnl->is_primary()) {
4743 if (q->second.is_primary()) {
4744 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4745 // the survivor missed MDentryUnlink+MDentryLink messages ?
4746 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4747 CInode *in = get_inode(q->second.ino, q->first.snapid);
4748 assert(in);
4749 assert(in->get_parent_dn());
4750 rejoin_unlinked_inodes[from].insert(in);
4751 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4752 }
4753 } else {
4754 // the survivor missed MDentryLink message ?
4755 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4756 dout(7) << " sender doesn't have primay dentry" << dendl;
4757 }
4758 } else {
4759 if (q->second.is_primary()) {
4760 // the survivor missed MDentryUnlink message ?
4761 CInode *in = get_inode(q->second.ino, q->first.snapid);
4762 assert(in);
4763 assert(in->get_parent_dn());
4764 rejoin_unlinked_inodes[from].insert(in);
4765 dout(7) << " sender has primary dentry but we don't" << dendl;
4766 }
4767 }
4768 }
4769 }
4770
4771 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4772 p != strong->strong_inodes.end();
4773 ++p) {
4774 CInode *in = get_inode(p->first);
4775 assert(in);
4776 in->add_replica(from, p->second.nonce);
4777 dout(10) << " have " << *in << dendl;
4778
4779 MMDSCacheRejoin::inode_strong &is = p->second;
4780
4781 // caps_wanted
4782 if (is.caps_wanted) {
4783 in->mds_caps_wanted[from] = is.caps_wanted;
4784 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4785 << " on " << *in << dendl;
4786 }
4787
4788 // scatterlocks?
4789 // infer state from replica state:
4790 // * go to MIX if they might have wrlocks
4791 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4792 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4793 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4794 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4795
4796 // auth pin?
4797 if (strong->authpinned_inodes.count(in->vino())) {
4798 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4799 r != strong->authpinned_inodes[in->vino()].end();
4800 ++r) {
4801 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4802
4803 // get/create slave mdrequest
4804 MDRequestRef mdr;
4805 if (have_request(r->reqid))
4806 mdr = request_get(r->reqid);
4807 else
4808 mdr = request_start_slave(r->reqid, r->attempt, strong);
4809 if (strong->frozen_authpin_inodes.count(in->vino())) {
4810 assert(!in->get_num_auth_pins());
4811 mdr->freeze_auth_pin(in);
4812 } else {
4813 assert(!in->is_frozen_auth_pin());
4814 }
4815 mdr->auth_pin(in);
4816 }
4817 }
4818 // xlock(s)?
4819 if (strong->xlocked_inodes.count(in->vino())) {
4820 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4821 q != strong->xlocked_inodes[in->vino()].end();
4822 ++q) {
4823 SimpleLock *lock = in->get_lock(q->first);
4824 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4825 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4826 assert(mdr->is_auth_pinned(in));
4827 if (!mdr->xlocks.count(&in->versionlock)) {
4828 assert(in->versionlock.can_xlock_local());
4829 in->versionlock.get_xlock(mdr, mdr->get_client());
4830 mdr->xlocks.insert(&in->versionlock);
4831 mdr->locks.insert(&in->versionlock);
4832 }
4833 if (lock->is_stable())
4834 in->auth_pin(lock);
4835 lock->set_state(LOCK_XLOCK);
4836 if (lock == &in->filelock)
4837 in->loner_cap = -1;
4838 lock->get_xlock(mdr, mdr->get_client());
4839 mdr->xlocks.insert(lock);
4840 mdr->locks.insert(lock);
4841 }
4842 }
4843 }
4844 // wrlock(s)?
4845 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4846 p != strong->wrlocked_inodes.end();
4847 ++p) {
4848 CInode *in = get_inode(p->first);
4849 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4850 q != p->second.end();
4851 ++q) {
4852 SimpleLock *lock = in->get_lock(q->first);
4853 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4854 r != q->second.end();
4855 ++r) {
4856 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4857 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4858 if (in->is_auth())
4859 assert(mdr->is_auth_pinned(in));
4860 lock->set_state(LOCK_MIX);
4861 if (lock == &in->filelock)
4862 in->loner_cap = -1;
4863 lock->get_wrlock(true);
4864 mdr->wrlocks.insert(lock);
4865 mdr->locks.insert(lock);
4866 }
4867 }
4868 }
4869
4870 // done?
4871 assert(rejoin_gather.count(from));
4872 rejoin_gather.erase(from);
4873 if (rejoin_gather.empty()) {
4874 rejoin_gather_finish();
4875 } else {
4876 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4877 }
4878 }
4879
4880 /* This function DOES NOT put the passed message before returning */
4881 void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4882 {
4883 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4884 mds_rank_t from = mds_rank_t(ack->get_source().num());
4885
4886 assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4887 bool survivor = !mds->is_rejoin();
4888
4889 // for sending cache expire message
4890 set<CInode*> isolated_inodes;
4891 set<CInode*> refragged_inodes;
4892
4893 // dirs
4894 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4895 p != ack->strong_dirfrags.end();
4896 ++p) {
4897 // we may have had incorrect dir fragmentation; refragment based
4898 // on what the auth tells us.
4899 CDir *dir = get_dirfrag(p->first);
4900 if (!dir) {
4901 dir = get_force_dirfrag(p->first, false);
4902 if (dir)
4903 refragged_inodes.insert(dir->get_inode());
4904 }
4905 if (!dir) {
4906 CInode *diri = get_inode(p->first.ino);
4907 if (!diri) {
4908 // barebones inode; the full inode loop below will clean up.
4909 diri = new CInode(this, false);
4910 diri->inode.ino = p->first.ino;
4911 diri->inode.mode = S_IFDIR;
4912 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4913 add_inode(diri);
4914 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4915 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4916 dout(10) << " add inode " << *diri << dendl;
4917 } else {
4918 diri->inode_auth = CDIR_AUTH_DEFAULT;
4919 isolated_inodes.insert(diri);
4920 dout(10) << " unconnected dirfrag " << p->first << dendl;
4921 }
4922 }
4923 // barebones dirfrag; the full dirfrag loop below will clean up.
4924 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4925 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4926 (dir->authority() != CDIR_AUTH_UNDEF &&
4927 dir->authority().first != from))
4928 adjust_subtree_auth(dir, from);
4929 dout(10) << " add dirfrag " << *dir << dendl;
4930 }
4931
4932 dir->set_replica_nonce(p->second.nonce);
4933 dir->state_clear(CDir::STATE_REJOINING);
4934 dout(10) << " got " << *dir << dendl;
4935
4936 // dentries
4937 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4938 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4939 q != dmap.end();
4940 ++q) {
4941 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4942 if(!dn)
4943 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4944
4945 CDentry::linkage_t *dnl = dn->get_linkage();
4946
4947 assert(dn->last == q->first.snapid);
4948 if (dn->first != q->second.first) {
4949 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4950 dn->first = q->second.first;
4951 }
4952
4953 // may have bad linkage if we missed dentry link/unlink messages
4954 if (dnl->is_primary()) {
4955 CInode *in = dnl->get_inode();
4956 if (!q->second.is_primary() ||
4957 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4958 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4959 dir->unlink_inode(dn);
4960 }
4961 } else if (dnl->is_remote()) {
4962 if (!q->second.is_remote() ||
4963 q->second.remote_ino != dnl->get_remote_ino() ||
4964 q->second.remote_d_type != dnl->get_remote_d_type()) {
4965 dout(10) << " had bad linkage for " << *dn << dendl;
4966 dir->unlink_inode(dn);
4967 }
4968 } else {
4969 if (!q->second.is_null())
4970 dout(10) << " had bad linkage for " << *dn << dendl;
4971 }
4972
4973 // hmm, did we have the proper linkage here?
4974 if (dnl->is_null() && !q->second.is_null()) {
4975 if (q->second.is_remote()) {
4976 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4977 } else {
4978 CInode *in = get_inode(q->second.ino, q->first.snapid);
4979 if (!in) {
4980 // barebones inode; assume it's dir, the full inode loop below will clean up.
4981 in = new CInode(this, false, q->second.first, q->first.snapid);
4982 in->inode.ino = q->second.ino;
4983 in->inode.mode = S_IFDIR;
4984 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4985 add_inode(in);
4986 dout(10) << " add inode " << *in << dendl;
4987 } else if (in->get_parent_dn()) {
4988 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
4989 << ", unlinking " << *in << dendl;
4990 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
4991 }
4992 dn->dir->link_primary_inode(dn, in);
4993 isolated_inodes.erase(in);
4994 }
4995 }
4996
4997 dn->set_replica_nonce(q->second.nonce);
4998 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
4999 dn->state_clear(CDentry::STATE_REJOINING);
5000 dout(10) << " got " << *dn << dendl;
5001 }
5002 }
5003
5004 for (set<CInode*>::iterator p = refragged_inodes.begin();
5005 p != refragged_inodes.end();
5006 ++p) {
5007 list<CDir*> ls;
5008 (*p)->get_nested_dirfrags(ls);
5009 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5010 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5011 continue;
5012 assert((*q)->get_num_any() == 0);
5013 (*p)->close_dirfrag((*q)->get_frag());
5014 }
5015 }
5016
5017 // full dirfrags
5018 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5019 p != ack->dirfrag_bases.end();
5020 ++p) {
5021 CDir *dir = get_dirfrag(p->first);
5022 assert(dir);
5023 bufferlist::iterator q = p->second.begin();
5024 dir->_decode_base(q);
5025 dout(10) << " got dir replica " << *dir << dendl;
5026 }
5027
5028 // full inodes
5029 bufferlist::iterator p = ack->inode_base.begin();
5030 while (!p.end()) {
5031 inodeno_t ino;
5032 snapid_t last;
5033 bufferlist basebl;
5034 ::decode(ino, p);
5035 ::decode(last, p);
5036 ::decode(basebl, p);
5037 CInode *in = get_inode(ino, last);
5038 assert(in);
5039 bufferlist::iterator q = basebl.begin();
5040 in->_decode_base(q);
5041 dout(10) << " got inode base " << *in << dendl;
5042 }
5043
5044 // inodes
5045 p = ack->inode_locks.begin();
5046 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5047 while (!p.end()) {
5048 inodeno_t ino;
5049 snapid_t last;
5050 __u32 nonce;
5051 bufferlist lockbl;
5052 ::decode(ino, p);
5053 ::decode(last, p);
5054 ::decode(nonce, p);
5055 ::decode(lockbl, p);
5056
5057 CInode *in = get_inode(ino, last);
5058 assert(in);
5059 in->set_replica_nonce(nonce);
5060 bufferlist::iterator q = lockbl.begin();
5061 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5062 in->state_clear(CInode::STATE_REJOINING);
5063 dout(10) << " got inode locks " << *in << dendl;
5064 }
5065
5066 // FIXME: This can happen if the entire subtree, together with the inode the subtree
5067 // root belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5068 assert(isolated_inodes.empty());
5069
5070 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5071 bufferlist::iterator bp = ack->imported_caps.begin();
5072 ::decode(peer_imported, bp);
5073
5074 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5075 p != peer_imported.end();
5076 ++p) {
5077 assert(cap_exports.count(p->first));
5078 assert(cap_export_targets.count(p->first));
5079 assert(cap_export_targets[p->first] == from);
5080 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5081 q != p->second.end();
5082 ++q) {
5083 assert(cap_exports[p->first].count(q->first));
5084
5085 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5086 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5087 assert(session);
5088
5089 // mark client caps stale.
5090 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5091 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5092 mds->get_osd_epoch_barrier());
5093 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5094 (q->second.cap_id > 0 ? from : -1), 0);
5095 mds->send_message_client_counted(m, session);
5096
5097 cap_exports[p->first].erase(q->first);
5098 }
5099 assert(cap_exports[p->first].empty());
5100 }
5101
5102 // done?
5103 assert(rejoin_ack_gather.count(from));
5104 rejoin_ack_gather.erase(from);
5105 if (!survivor) {
5106
5107 if (rejoin_gather.empty()) {
5108 // eval unstable scatter locks after all wrlocks are rejoined.
5109 while (!rejoin_eval_locks.empty()) {
5110 SimpleLock *lock = rejoin_eval_locks.front();
5111 rejoin_eval_locks.pop_front();
5112 if (!lock->is_stable())
5113 mds->locker->eval_gather(lock);
5114 }
5115 }
5116
5117 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5118 rejoin_ack_gather.empty()) {
5119 // finally, kickstart past snap parent opens
5120 open_snap_parents();
5121 } else {
5122 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5123 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5124 }
5125 } else {
5126 // survivor.
5127 mds->queue_waiters(rejoin_waiters);
5128 }
5129 }
5130
5131 /**
5132 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5133 *
5134 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5135 * messages that clean these guys up...
5136 */
5137 void MDCache::rejoin_trim_undef_inodes()
5138 {
5139 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5140
5141 while (!rejoin_undef_inodes.empty()) {
5142 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5143 CInode *in = *p;
5144 rejoin_undef_inodes.erase(p);
5145
5146 in->clear_replica_map();
5147
5148 // close out dirfrags
5149 if (in->is_dir()) {
5150 list<CDir*> dfls;
5151 in->get_dirfrags(dfls);
5152 for (list<CDir*>::iterator p = dfls.begin();
5153 p != dfls.end();
5154 ++p) {
5155 CDir *dir = *p;
5156 dir->clear_replica_map();
5157
5158 for (CDir::map_t::iterator p = dir->items.begin();
5159 p != dir->items.end();
5160 ++p) {
5161 CDentry *dn = p->second;
5162 dn->clear_replica_map();
5163
5164 dout(10) << " trimming " << *dn << dendl;
5165 dir->remove_dentry(dn);
5166 }
5167
5168 dout(10) << " trimming " << *dir << dendl;
5169 in->close_dirfrag(dir->dirfrag().frag);
5170 }
5171 }
5172
5173 CDentry *dn = in->get_parent_dn();
5174 if (dn) {
5175 dn->clear_replica_map();
5176 dout(10) << " trimming " << *dn << dendl;
5177 dn->dir->remove_dentry(dn);
5178 } else {
5179 dout(10) << " trimming " << *in << dendl;
5180 remove_inode(in);
5181 }
5182 }
5183
5184 assert(rejoin_undef_inodes.empty());
5185 }
5186
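// Called once all cache rejoin messages have been gathered: fetch any
// still-undefined inodes/dirfrags, process reconnected caps, choose lock
// states, and send rejoin acks to the other recovering ranks.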
5187 void MDCache::rejoin_gather_finish()
5188 {
5189 dout(10) << "rejoin_gather_finish" << dendl;
5190 assert(mds->is_rejoin());
5191
5192 if (open_undef_inodes_dirfrags())
5193 return;
5194
5195 if (process_imported_caps())
5196 return;
5197
5198 choose_lock_states_and_reconnect_caps();
5199
5200 identify_files_to_recover();
5201 rejoin_send_acks();
5202
5203 // signal completion of fetches, rejoin_gather_finish, etc.
5204 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5205 rejoin_ack_gather.erase(mds->get_nodeid());
5206
5207 // did we already get our acks too?
5208 if (rejoin_ack_gather.empty()) {
5209 // finally, kickstart past snap parent opens
5210 open_snap_parents();
5211 }
5212 }
5213
5214 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5215 inodeno_t ino;
5216 public:
5217 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5218 void finish(int r) override {
5219 mdcache->rejoin_open_ino_finish(ino, r);
5220 }
5221 };
5222
5223 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5224 {
5225 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5226
5227 if (ret < 0) {
5228 cap_imports_missing.insert(ino);
5229 } else if (ret == mds->get_nodeid()) {
5230 assert(get_inode(ino));
5231 } else {
5232 auto p = cap_imports.find(ino);
5233 assert(p != cap_imports.end());
5234 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5235 assert(q->second.count(MDS_RANK_NONE));
5236 assert(q->second.size() == 1);
5237 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5238 }
5239 cap_imports.erase(p);
5240 }
5241
5242 assert(cap_imports_num_opening > 0);
5243 cap_imports_num_opening--;
5244
5245 if (cap_imports_num_opening == 0) {
5246 if (rejoin_gather.empty())
5247 rejoin_gather_finish();
5248 else if (rejoin_gather.count(mds->get_nodeid()))
5249 process_imported_caps();
5250 }
5251 }
5252
5253 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5254 public:
5255 map<client_t,entity_inst_t> client_map;
5256 map<client_t,uint64_t> sseqmap;
5257
5258 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5259 MDCacheLogContext(c), client_map(cm) {}
5260 void finish(int r) override {
5261 assert(r == 0);
5262 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5263 }
5264 };
5265
5266 void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5267 map<client_t,uint64_t>& sseqmap)
5268 {
5269 dout(10) << "rejoin_open_sessions_finish" << dendl;
5270 mds->server->finish_force_open_sessions(client_map, sseqmap);
5271 if (rejoin_gather.empty())
5272 rejoin_gather_finish();
5273 }
5274
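// Handle caps reported by reconnecting clients. Returns true if we still
// need to wait (missing inodes being opened, or client sessions being
// journaled open); returns false once all imported caps are processed.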
5275 bool MDCache::process_imported_caps()
5276 {
5277 dout(10) << "process_imported_caps" << dendl;
5278
5279 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5280 CInode *in = get_inode(p->first);
5281 if (in) {
5282 assert(in->is_auth());
5283 cap_imports_missing.erase(p->first);
5284 continue;
5285 }
5286 if (cap_imports_missing.count(p->first) > 0)
5287 continue;
5288
5289 cap_imports_num_opening++;
5290 dout(10) << " opening missing ino " << p->first << dendl;
5291 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5292 }
5293
5294 if (cap_imports_num_opening > 0)
5295 return true;
5296
5297 // called by rejoin_gather_finish() ?
5298 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5299 // are sessions for all imported caps open?
5300 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5301 p != rejoin_client_map.end();
5302 ++p) {
5303 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5304 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5305 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5306 ESessions *le = new ESessions(pv, rejoin_client_map);
5307 mds->mdlog->start_submit_entry(le, finish);
5308 mds->mdlog->flush();
5309 rejoin_client_map.clear();
5310 return true;
5311 }
5312 }
5313 rejoin_client_map.clear();
5314
5315 // process caps that were exported by slave rename
5316 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5317 p != rejoin_slave_exports.end();
5318 ++p) {
5319 CInode *in = get_inode(p->first);
5320 assert(in);
5321 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5322 q != p->second.second.end();
5323 ++q) {
5324 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5325 assert(session);
5326
5327 Capability *cap = in->get_client_cap(q->first);
5328 if (!cap)
5329 cap = in->add_client_cap(q->first, session);
5330 cap->merge(q->second, true);
5331
5332 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5333 assert(cap->get_last_seq() == im.issue_seq);
5334 assert(cap->get_mseq() == im.mseq);
5335 cap->set_cap_id(im.cap_id);
5336 // send cap import because we assigned a new cap ID
5337 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5338 p->second.first, CEPH_CAP_FLAG_AUTH);
5339 }
5340 }
5341 rejoin_slave_exports.clear();
5342 rejoin_imported_caps.clear();
5343
5344 // process cap imports
5345 // ino -> client -> frommds -> capex
5346 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5347 CInode *in = get_inode(p->first);
5348 if (!in) {
5349 dout(10) << " still missing ino " << p->first
5350 << ", will try again after replayed client requests" << dendl;
5351 ++p;
5352 continue;
5353 }
5354 assert(in->is_auth());
5355 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5356 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5357 assert(session);
5358 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5359 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5360 add_reconnected_cap(q->first, in->ino(), r->second);
5361 if (r->first >= 0) {
5362 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5363 cap->inc_mseq();
5364 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5365
5366 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5367 im.cap_id = cap->get_cap_id();
5368 im.issue_seq = cap->get_last_seq();
5369 im.mseq = cap->get_mseq();
5370 }
5371 }
5372 }
5373 cap_imports.erase(p++); // remove and move on
5374 }
5375 } else {
5376 trim_non_auth();
5377
5378 rejoin_gather.erase(mds->get_nodeid());
5379 maybe_send_pending_rejoins();
5380
5381 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5382 rejoin_gather_finish();
5383 }
5384 return false;
5385 }
5386
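// Check whether this realm's past parents are open. If so (and we are
// reconnecting), finish any pending client snaprealm reconnects; otherwise
// pin the realm inode and record it in missing_snap_parents for later.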
5387 void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5388 {
5389 // are this realm's parents fully open?
5390 if (realm->have_past_parents_open()) {
5391 dout(10) << " have past snap parents for realm " << *realm
5392 << " on " << *realm->inode << dendl;
5393 if (reconnect) {
5394 // finish off client snaprealm reconnects?
5395 auto p = reconnected_snaprealms.find(realm->inode->ino());
5396 if (p != reconnected_snaprealms.end()) {
5397 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5398 finish_snaprealm_reconnect(q->first, realm, q->second);
5399 reconnected_snaprealms.erase(p);
5400 }
5401 }
5402 } else {
5403 if (!missing_snap_parents.count(realm->inode)) {
5404 dout(10) << " MISSING past snap parents for realm " << *realm
5405 << " on " << *realm->inode << dendl;
5406 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5407 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5408 } else {
5409 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5410 << " on " << *realm->inode << dendl;
5411 }
5412 }
5413 }
5414
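// A client reconnected a cap whose snap_follows predates the head inode's
// first snap: walk the intervening snapshotted inodes, put their locks in
// LOCK_SNAP_SYNC, and record the pending snapflushes on the head inode.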
5415 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5416 client_t client, snapid_t snap_follows)
5417 {
5418 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5419
5420 const set<snapid_t>& snaps = realm->get_snaps();
5421 snapid_t follows = snap_follows;
5422
5423 while (true) {
5424 CInode *in = pick_inode_snap(head_in, follows);
5425 if (in == head_in)
5426 break;
5427 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5428
5429 /* TODO: we can check the reconnected/flushing caps to find
5430 * which locks need gathering */
5431 for (int i = 0; i < num_cinode_locks; i++) {
5432 int lockid = cinode_lock_info[i].lock;
5433 SimpleLock *lock = in->get_lock(lockid);
5434 assert(lock);
5435 in->client_snap_caps[lockid].insert(client);
5436 in->auth_pin(lock);
5437 lock->set_state(LOCK_SNAP_SYNC);
5438 lock->get_wrlock(true);
5439 }
5440
5441 for (auto p = snaps.lower_bound(in->first);
5442 p != snaps.end() && *p <= in->last;
5443 ++p) {
5444 head_in->add_need_snapflush(in, *p, client);
5445 }
5446
5447 follows = in->last;
5448 }
5449 }
5450
5451 /*
5452 * choose lock states based on reconnected caps
5453 */
5454 void MDCache::choose_lock_states_and_reconnect_caps()
5455 {
5456 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5457
5458 map<client_t,MClientSnap*> splits;
5459
5460 for (auto i : inode_map) {
5461 CInode *in = i.second;
5462
5463 if (in->last != CEPH_NOSNAP)
5464 continue;
5465
5466 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5467 in->mark_dirty_rstat();
5468
5469 int dirty_caps = 0;
5470 auto p = reconnected_caps.find(in->ino());
5471 if (p != reconnected_caps.end()) {
5472 for (const auto &it : p->second)
5473 dirty_caps |= it.second.dirty_caps;
5474 }
5475 in->choose_lock_states(dirty_caps);
5476 dout(15) << " chose lock states on " << *in << dendl;
5477
5478 SnapRealm *realm = in->find_snaprealm();
5479
5480 check_realm_past_parents(realm, realm == in->snaprealm);
5481
5482 if (p != reconnected_caps.end()) {
5483 bool missing_snap_parent = false;
5484 // also, make sure client's cap is in the correct snaprealm.
5485 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5486 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5487 if (realm->have_past_parents_open()) {
5488 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5489 } else {
5490 missing_snap_parent = true;
5491 }
5492 }
5493
5494 if (q->second.realm_ino == realm->inode->ino()) {
5495 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5496 } else {
5497 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5498 << " != " << realm->inode->ino() << dendl;
5499 if (realm->have_past_parents_open()) {
5500 // ok, include in a split message _now_.
5501 prepare_realm_split(realm, q->first, in->ino(), splits);
5502 } else {
5503 // send the split later.
5504 missing_snap_parent = true;
5505 }
5506 }
5507 }
5508 if (missing_snap_parent)
5509 missing_snap_parents[realm->inode].insert(in);
5510 }
5511 }
5512
5513 send_snaps(splits);
5514 }
5515
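// Build (or reuse) a CEPH_SNAP_OP_SPLIT message for this client, including
// the realm's snap trace and child realms, and add the inode to the split.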
5516 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5517 map<client_t,MClientSnap*>& splits)
5518 {
5519 MClientSnap *snap;
5520 if (splits.count(client) == 0) {
5521 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5522 snap->head.split = realm->inode->ino();
5523 realm->build_snap_trace(snap->bl);
5524
5525 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5526 p != realm->open_children.end();
5527 ++p)
5528 snap->split_realms.push_back((*p)->inode->ino());
5529
5530 } else
5531 snap = splits[client];
5532 snap->split_inos.push_back(ino);
5533 }
5534
5535 void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5536 {
5537 dout(10) << "send_snaps" << dendl;
5538
5539 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5540 p != splits.end();
5541 ++p) {
5542 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5543 if (session) {
5544 dout(10) << " client." << p->first
5545 << " split " << p->second->head.split
5546 << " inos " << p->second->split_inos
5547 << dendl;
5548 mds->send_message_client_counted(p->second, session);
5549 } else {
5550 dout(10) << " no session for client." << p->first << dendl;
5551 p->second->put();
5552 }
5553 }
5554 splits.clear();
5555 }
5556
5557
5558 /*
5559 * remove any items from logsegment open_file lists that don't have
5560 * any caps
5561 */
5562 void MDCache::clean_open_file_lists()
5563 {
5564 dout(10) << "clean_open_file_lists" << dendl;
5565
5566 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5567 p != mds->mdlog->segments.end();
5568 ++p) {
5569 LogSegment *ls = p->second;
5570
5571 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5572 while (!q.end()) {
5573 CInode *in = *q;
5574 ++q;
5575 if (in->last == CEPH_NOSNAP) {
5576 if (!in->is_any_caps_wanted()) {
5577 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5578 in->item_open_file.remove_myself();
5579 }
5580 } else {
5581 if (in->client_snap_caps.empty()) {
5582 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5583 in->item_open_file.remove_myself();
5584 }
5585 }
5586 }
5587 }
5588 }
5589
5590
5591
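// Re-establish a client cap on this inode from a cap_reconnect_t, sending a
// cap import to the client if the cap was previously held on another mds.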
5592 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5593 {
5594 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5595 << " on " << *in << dendl;
5596 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5597 if (!session) {
5598 dout(10) << " no session for client." << client << dendl;
5599 return NULL;
5600 }
5601
5602 Capability *cap = in->reconnect_cap(client, icr, session);
5603
5604 if (frommds >= 0) {
5605 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5606 cap->inc_mseq();
5607 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5608 }
5609
5610 return cap;
5611 }
5612
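// Give up on reconnected caps whose inodes could never be found: tell the
// clients their caps were exported to nobody (peer -1), wake any reconnect
// waiters, and log a warning listing the missing inodes.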
5613 void MDCache::export_remaining_imported_caps()
5614 {
5615 dout(10) << "export_remaining_imported_caps" << dendl;
5616
5617 stringstream warn_str;
5618
5619 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5620 warn_str << " ino " << p->first << "\n";
5621 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5622 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5623 if (session) {
5624 // mark client caps stale.
5625 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5626 stale->set_cap_peer(0, 0, 0, -1, 0);
5627 mds->send_message_client_counted(stale, q->first);
5628 }
5629 }
5630
5631 mds->heartbeat_reset();
5632 }
5633
5634 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5635 p != cap_reconnect_waiters.end();
5636 ++p)
5637 mds->queue_waiters(p->second);
5638
5639 cap_imports.clear();
5640 cap_reconnect_waiters.clear();
5641
5642 if (warn_str.peek() != EOF) {
5643 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5644 mds->clog->warn(warn_str);
5645 }
5646 }
5647
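// If the client sent a cap reconnect for this inode during replay, apply it
// now: reconnect the cap, choose lock states (or re-eval if replicated), and
// wake anyone waiting on the reconnect.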
5648 void MDCache::try_reconnect_cap(CInode *in, Session *session)
5649 {
5650 client_t client = session->info.get_client();
5651 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5652 if (rc) {
5653 in->reconnect_cap(client, *rc, session);
5654 dout(10) << "try_reconnect_cap client." << client
5655 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5656 << " issue " << ccap_string(rc->capinfo.issued)
5657 << " on " << *in << dendl;
5658 remove_replay_cap_reconnect(in->ino(), client);
5659
5660 if (in->is_replicated()) {
5661 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5662 } else {
5663 int dirty_caps = 0;
5664 auto p = reconnected_caps.find(in->ino());
5665 if (p != reconnected_caps.end()) {
5666 auto q = p->second.find(client);
5667 if (q != p->second.end())
5668 dirty_caps = q->second.dirty_caps;
5669 }
5670 in->choose_lock_states(dirty_caps);
5671 dout(15) << " chose lock states on " << *in << dendl;
5672 }
5673
5674 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5675 cap_reconnect_waiters.find(in->ino());
5676 if (it != cap_reconnect_waiters.end()) {
5677 mds->queue_waiters(it->second);
5678 cap_reconnect_waiters.erase(it);
5679 }
5680 }
5681 }
5682
5683
5684
5685 // -------
5686 // cap imports and delayed snap parent opens
5687
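// Send a cap IMPORT to the client, including the realm's snap trace. If the
// realm's past snap parents aren't open yet, delay: auth-pin the inode,
// suppress the cap, and queue it in delayed_imported_caps.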
5688 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5689 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5690 int peer, int p_flags)
5691 {
5692 client_t client = session->info.inst.name.num();
5693 SnapRealm *realm = in->find_snaprealm();
5694 if (realm->have_past_parents_open()) {
5695 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5696 if (cap->get_last_seq() == 0) // reconnected cap
5697 cap->inc_last_seq();
5698 cap->set_last_issue();
5699 cap->set_last_issue_stamp(ceph_clock_now());
5700 cap->clear_new();
5701 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5702 in->ino(),
5703 realm->inode->ino(),
5704 cap->get_cap_id(), cap->get_last_seq(),
5705 cap->pending(), cap->wanted(), 0,
5706 cap->get_mseq(), mds->get_osd_epoch_barrier());
5707 in->encode_cap_message(reap, cap);
5708 realm->build_snap_trace(reap->snapbl);
5709 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5710 mds->send_message_client_counted(reap, session);
5711 } else {
5712 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5713 << cap->get_mseq() << " on " << *in << dendl;
5714 in->auth_pin(this);
5715 cap->inc_suppress();
5716 delayed_imported_caps[client].insert(in);
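    // touch the map entry so open_snap_parents() will open this inode's past parents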
5717 missing_snap_parents[in].size();
5718 }
5719 }
5720
5721 void MDCache::do_delayed_cap_imports()
5722 {
5723 dout(10) << "do_delayed_cap_imports" << dendl;
5724
5725 assert(delayed_imported_caps.empty());
5726 }
5727
5728 struct C_MDC_OpenSnapParents : public MDCacheContext {
5729 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5730 void finish(int r) override {
5731 mdcache->open_snap_parents();
5732 }
5733 };
5734
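// Try to open the past snap parents recorded in missing_snap_parents. For
// realms that become fully open, rebuild needed snapflushes, queue realm
// splits, and finish snaprealm reconnects. Once everything is open, run the
// delayed cap imports and complete the rejoin (rejoin_done).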
5735 void MDCache::open_snap_parents()
5736 {
5737 dout(10) << "open_snap_parents" << dendl;
5738
5739 map<client_t,MClientSnap*> splits;
5740 MDSGatherBuilder gather(g_ceph_context);
5741
5742 auto p = missing_snap_parents.begin();
5743 while (p != missing_snap_parents.end()) {
5744 CInode *in = p->first;
5745 assert(in->snaprealm);
5746 if (in->snaprealm->open_parents(gather.new_sub())) {
5747 dout(10) << " past parents now open on " << *in << dendl;
5748
5749 for (CInode *child : p->second) {
5750 auto q = reconnected_caps.find(child->ino());
5751 assert(q != reconnected_caps.end());
5752 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5753 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5754 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5755 }
5756 // make sure client's cap is in the correct snaprealm.
5757 if (r->second.realm_ino != in->ino()) {
5758 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5759 }
5760 }
5761 }
5762
5763 missing_snap_parents.erase(p++);
5764
5765 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5766
5767 // finish off client snaprealm reconnects?
5768 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5769 if (q != reconnected_snaprealms.end()) {
5770 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5771 r != q->second.end();
5772 ++r)
5773 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5774 reconnected_snaprealms.erase(q);
5775 }
5776 } else {
5777 dout(10) << " opening past parents on " << *in << dendl;
5778 ++p;
5779 }
5780 }
5781
5782 send_snaps(splits);
5783
5784 if (gather.has_subs()) {
5785 dout(10) << "open_snap_parents - waiting for "
5786 << gather.num_subs_remaining() << dendl;
5787 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5788 gather.activate();
5789 } else {
5790 if (!reconnected_snaprealms.empty()) {
5791 stringstream warn_str;
5792 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5793 p != reconnected_snaprealms.end();
5794 ++p) {
5795 warn_str << " unconnected snaprealm " << p->first << "\n";
5796 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5797 q != p->second.end();
5798 ++q)
5799 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5800 }
5801 mds->clog->warn() << "open_snap_parents has:";
5802 mds->clog->warn(warn_str);
5803 }
5804 assert(rejoin_waiters.empty());
5805 assert(missing_snap_parents.empty());
5806 dout(10) << "open_snap_parents - all open" << dendl;
5807 do_delayed_cap_imports();
5808
5809 assert(rejoin_done);
5810 rejoin_done.release()->complete(0);
5811 reconnected_caps.clear();
5812 }
5813 }
5814
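// Fetch the dirfrags needed to define REJOINUNDEF inodes/dirfrags. Returns
// true if fetches were submitted (rejoin_gather_finish will be retried via
// C_MDC_RejoinGatherFinish), false if there is nothing to fetch.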
5815 bool MDCache::open_undef_inodes_dirfrags()
5816 {
5817 dout(10) << "open_undef_inodes_dirfrags "
5818 << rejoin_undef_inodes.size() << " inodes "
5819 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5820
5821 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5822
5823 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5824 p != rejoin_undef_inodes.end();
5825 ++p) {
5826 CInode *in = *p;
5827 assert(!in->is_base());
5828 fetch_queue.insert(in->get_parent_dir());
5829 }
5830
5831 if (fetch_queue.empty())
5832 return false;
5833
5834 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5835 for (set<CDir*>::iterator p = fetch_queue.begin();
5836 p != fetch_queue.end();
5837 ++p) {
5838 CDir *dir = *p;
5839 CInode *diri = dir->get_inode();
5840 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5841 continue;
5842 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5843 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5844 dir->fetch(gather.new_sub());
5845 }
5846 assert(gather.has_subs());
5847 gather.activate();
5848 return true;
5849 }
5850
5851 void MDCache::opened_undef_inode(CInode *in) {
5852 dout(10) << "opened_undef_inode " << *in << dendl;
5853 rejoin_undef_inodes.erase(in);
5854 if (in->is_dir()) {
5855 // FIXME: re-hash dentries if necessary
5856 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5857 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5858 CDir *dir = in->get_dirfrag(frag_t());
5859 assert(dir);
5860 rejoin_undef_dirfrags.erase(dir);
5861 in->force_dirfrags();
5862 list<CDir*> ls;
5863 in->get_dirfrags(ls);
5864 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5865 rejoin_undef_dirfrags.insert(*p);
5866 }
5867 }
5868 }
5869
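// If the client's reconnected snaprealm seq is stale, push it an updated
// snap trace; otherwise there is nothing to do.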
5870 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5871 {
5872 if (seq < realm->get_newest_seq()) {
5873 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5874 << realm->get_newest_seq()
5875 << " on " << *realm << dendl;
5876 // send an update
5877 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5878 if (session) {
5879 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5880 realm->build_snap_trace(snap->bl);
5881 mds->send_message_client_counted(snap, session);
5882 } else {
5883 dout(10) << " ...or not, no session for this client!" << dendl;
5884 }
5885 } else {
5886 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5887 << " on " << *realm << dendl;
5888 }
5889 }
5890
5891
5892
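// Send OP_ACK cache rejoin messages to the recovery set: replicate stray
// (unlinked) inodes' ancestry, then walk our auth subtrees adding strong
// dirfrags/dentries/inodes, lock states, and the caps we imported.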
5893 void MDCache::rejoin_send_acks()
5894 {
5895 dout(7) << "rejoin_send_acks" << dendl;
5896
5897 // replicate stray
5898 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5899 p != rejoin_unlinked_inodes.end();
5900 ++p) {
5901 for (set<CInode*>::iterator q = p->second.begin();
5902 q != p->second.end();
5903 ++q) {
5904 CInode *in = *q;
5905 dout(7) << " unlinked inode " << *in << dendl;
5906 // inode expired
5907 if (!in->is_replica(p->first))
5908 continue;
5909 while (1) {
5910 CDentry *dn = in->get_parent_dn();
5911 if (dn->is_replica(p->first))
5912 break;
5913 dn->add_replica(p->first);
5914 CDir *dir = dn->get_dir();
5915 if (dir->is_replica(p->first))
5916 break;
5917 dir->add_replica(p->first);
5918 in = dir->get_inode();
5919 if (in->is_replica(p->first))
5920 break;
5921 in->add_replica(p->first);
5922 if (in->is_base())
5923 break;
5924 }
5925 }
5926 }
5927 rejoin_unlinked_inodes.clear();
5928
5929 // send acks to everyone in the recovery set
5930 map<mds_rank_t,MMDSCacheRejoin*> acks;
5931 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5932 p != recovery_set.end();
5933 ++p) {
5934 if (rejoin_ack_sent.count(*p))
5935 continue;
5936 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5937 }
5938
5939 rejoin_ack_sent = recovery_set;
5940
5941 // walk subtrees
5942 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5943 p != subtrees.end();
5944 ++p) {
5945 CDir *dir = p->first;
5946 if (!dir->is_auth())
5947 continue;
5948 dout(10) << "subtree " << *dir << dendl;
5949
5950 // auth items in this subtree
5951 list<CDir*> dq;
5952 dq.push_back(dir);
5953
5954 while (!dq.empty()) {
5955 CDir *dir = dq.front();
5956 dq.pop_front();
5957
5958 // dir
5959 for (auto &r : dir->get_replicas()) {
5960 auto it = acks.find(r.first);
5961 if (it == acks.end())
5962 continue;
5963 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
5964 it->second->add_dirfrag_base(dir);
5965 }
5966
5967 for (CDir::map_t::iterator q = dir->items.begin();
5968 q != dir->items.end();
5969 ++q) {
5970 CDentry *dn = q->second;
5971 CDentry::linkage_t *dnl = dn->get_linkage();
5972
5973 // inode
5974 CInode *in = NULL;
5975 if (dnl->is_primary())
5976 in = dnl->get_inode();
5977
5978 // dentry
5979 for (auto &r : dn->get_replicas()) {
5980 auto it = acks.find(r.first);
5981 if (it == acks.end())
5982 continue;
5983 it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
5984 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
5985 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
5986 dnl->is_remote() ? dnl->get_remote_d_type():0,
5987 ++r.second,
5988 dn->lock.get_replica_state());
5989 // peer missed MDentrylink message ?
5990 if (in && !in->is_replica(r.first))
5991 in->add_replica(r.first);
5992 }
5993
5994 if (!in)
5995 continue;
5996
5997 for (auto &r : in->get_replicas()) {
5998 auto it = acks.find(r.first);
5999 if (it == acks.end())
6000 continue;
6001 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6002 bufferlist bl;
6003 in->_encode_locks_state_for_rejoin(bl, r.first);
6004 it->second->add_inode_locks(in, ++r.second, bl);
6005 }
6006
6007 // subdirs in this subtree?
6008 in->get_nested_dirfrags(dq);
6009 }
6010 }
6011 }
6012
6013 // base inodes too
6014 if (root && root->is_auth())
6015 for (auto &r : root->get_replicas()) {
6016 auto it = acks.find(r.first);
6017 if (it == acks.end())
6018 continue;
6019 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
6020 bufferlist bl;
6021 root->_encode_locks_state_for_rejoin(bl, r.first);
6022 it->second->add_inode_locks(root, ++r.second, bl);
6023 }
6024 if (myin)
6025 for (auto &r : myin->get_replicas()) {
6026 auto it = acks.find(r.first);
6027 if (it == acks.end())
6028 continue;
6029 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
6030 bufferlist bl;
6031 myin->_encode_locks_state_for_rejoin(bl, r.first);
6032 it->second->add_inode_locks(myin, ++r.second, bl);
6033 }
6034
6035 // include inode base for any inodes whose scatterlocks may have updated
6036 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6037 p != rejoin_potential_updated_scatterlocks.end();
6038 ++p) {
6039 CInode *in = *p;
6040 for (const auto &r : in->get_replicas()) {
6041 auto it = acks.find(r.first);
6042 if (it == acks.end())
6043 continue;
6044 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6045 }
6046 }
6047
6048 // send acks
6049 for (auto p = acks.begin(); p != acks.end(); ++p) {
6050 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6051 mds->send_message_mds(p->second, p->first);
6052 }
6053
6054 rejoin_imported_caps.clear();
6055 }
6056
6057 class C_MDC_ReIssueCaps : public MDCacheContext {
6058 CInode *in;
6059 public:
6060 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6061 MDCacheContext(mdc), in(i)
6062 {
6063 in->get(CInode::PIN_PTRWAITER);
6064 }
6065 void finish(int r) override {
6066 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6067 mdcache->mds->locker->issue_caps(in);
6068 in->put(CInode::PIN_PTRWAITER);
6069 }
6070 };
6071
6072 void MDCache::reissue_all_caps()
6073 {
6074 dout(10) << "reissue_all_caps" << dendl;
6075
6076 for (auto p : inode_map) {
6077 CInode *in = p.second;
6078 if (in->is_head() && in->is_any_caps()) {
6079 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6080 if (in->is_frozen_inode()) {
6081 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6082 continue;
6083 }
6084 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6085 mds->locker->issue_caps(in);
6086 }
6087 }
6088 }
6089
6090
6091 // ===============================================================================
6092
6093 struct C_MDC_QueuedCow : public MDCacheContext {
6094 CInode *in;
6095 MutationRef mut;
6096 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6097 MDCacheContext(mdc), in(i), mut(m) {}
6098 void finish(int r) override {
6099 mdcache->_queued_file_recover_cow(in, mut);
6100 }
6101 };
6102
6103
6104 void MDCache::queue_file_recover(CInode *in)
6105 {
6106 dout(10) << "queue_file_recover " << *in << dendl;
6107 assert(in->is_auth());
6108
6109 // cow?
6110 /*
6111 SnapRealm *realm = in->find_snaprealm();
6112 set<snapid_t> s = realm->get_snaps();
6113 while (!s.empty() && *s.begin() < in->first)
6114 s.erase(s.begin());
6115 while (!s.empty() && *s.rbegin() > in->last)
6116 s.erase(*s.rbegin());
6117 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6118 if (s.size() > 1) {
6119 inode_t *pi = in->project_inode();
6120 pi->version = in->pre_dirty();
6121
6122 auto mut(std::make_shared<MutationImpl>());
6123 mut->ls = mds->mdlog->get_current_segment();
6124 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6125 mds->mdlog->start_entry(le);
6126 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6127
6128 s.erase(*s.begin());
6129 while (!s.empty()) {
6130 snapid_t snapid = *s.begin();
6131 CInode *cow_inode = 0;
6132 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6133 assert(cow_inode);
6134 recovery_queue.enqueue(cow_inode);
6135 s.erase(*s.begin());
6136 }
6137
6138 in->parent->first = in->first;
6139 le->metablob.add_primary_dentry(in->parent, in, true);
6140 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6141 mds->mdlog->flush();
6142 }
6143 */
6144
6145 recovery_queue.enqueue(in);
6146 }
6147
6148 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6149 {
6150 in->pop_and_dirty_projected_inode(mut->ls);
6151 mut->apply();
6152 mds->locker->drop_locks(mut.get());
6153 mut->cleanup();
6154 }
6155
6156
6157 /*
6158 * called after recovery to recover file sizes for previously opened (for write)
6159 * files. that is, those where max_size > size.
6160 */
6161 void MDCache::identify_files_to_recover()
6162 {
6163 dout(10) << "identify_files_to_recover" << dendl;
6164 for (auto p : inode_map) {
6165 CInode *in = p.second;
6166 if (!in->is_auth())
6167 continue;
6168
6169 if (in->last != CEPH_NOSNAP)
6170 continue;
6171
6172 // Only normal files need file size recovery
6173 if (!in->is_file()) {
6174 continue;
6175 }
6176
6177 bool recover = false;
6178 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6179 p != in->inode.client_ranges.end();
6180 ++p) {
6181 Capability *cap = in->get_client_cap(p->first);
6182 if (!cap) {
6183 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6184 recover = true;
6185 break;
6186 }
6187 }
6188
6189 if (recover) {
6190 if (in->filelock.is_stable()) {
6191 in->auth_pin(&in->filelock);
6192 } else {
6193 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6194 }
6195 in->filelock.set_state(LOCK_PRE_SCAN);
6196 rejoin_recover_q.push_back(in);
6197 } else {
6198 rejoin_check_q.push_back(in);
6199 }
6200 }
6201 }
6202
6203 void MDCache::start_files_to_recover()
6204 {
6205 for (CInode *in : rejoin_check_q) {
6206 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6207 mds->locker->issue_caps(in);
6208 mds->locker->check_inode_max_size(in);
6209 }
6210 rejoin_check_q.clear();
6211 for (CInode *in : rejoin_recover_q) {
6212 mds->locker->file_recover(&in->filelock);
6213 }
6214 if (!rejoin_recover_q.empty()) {
6215 rejoin_recover_q.clear();
6216 do_file_recover();
6217 }
6218 }
6219
6220 void MDCache::do_file_recover()
6221 {
6222 recovery_queue.advance();
6223 }
6224
6225 // ===============================================================================
6226
6227
6228 // ----------------------------
6229 // truncate
6230
6231 class C_MDC_RetryTruncate : public MDCacheContext {
6232 CInode *in;
6233 LogSegment *ls;
6234 public:
6235 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6236 MDCacheContext(c), in(i), ls(l) {}
6237 void finish(int r) override {
6238 mdcache->_truncate_inode(in, ls);
6239 }
6240 };
6241
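// Begin truncating an inode's data objects for the range
// [truncate_size, truncate_from). If clients still need to flush snapped
// buffered data, wait for the snap flush before issuing the truncate.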
6242 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6243 {
6244 inode_t *pi = in->get_projected_inode();
6245 dout(10) << "truncate_inode "
6246 << pi->truncate_from << " -> " << pi->truncate_size
6247 << " on " << *in
6248 << dendl;
6249
6250 ls->truncating_inodes.insert(in);
6251 in->get(CInode::PIN_TRUNCATING);
6252 in->auth_pin(this);
6253
6254 if (!in->client_need_snapflush.empty() &&
6255 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6256 assert(in->filelock.is_xlocked());
6257 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6258 mds->locker->issue_caps(in);
6259 return;
6260 }
6261
6262 _truncate_inode(in, ls);
6263 }
6264
6265 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6266 CInode *in;
6267 LogSegment *ls;
6268 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6269 MDCacheIOContext(c), in(i), ls(l) {}
6270 void finish(int r) override {
6271 assert(r == 0 || r == -ENOENT);
6272 mdcache->truncate_inode_finish(in, ls);
6273 }
6274 };
6275
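// Issue the actual object truncation via the Filer, using the snap context
// from the inode's snaprealm (or a null context if there is none).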
6276 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6277 {
6278 inode_t *pi = &in->inode;
6279 dout(10) << "_truncate_inode "
6280 << pi->truncate_from << " -> " << pi->truncate_size
6281 << " on " << *in << dendl;
6282
6283 assert(pi->is_truncating());
6284 assert(pi->truncate_size < (1ULL << 63));
6285 assert(pi->truncate_from < (1ULL << 63));
6286 assert(pi->truncate_size < pi->truncate_from);
6287
6288
6289 SnapRealm *realm = in->find_snaprealm();
6290 SnapContext nullsnap;
6291 const SnapContext *snapc;
6292 if (realm) {
6293 dout(10) << " realm " << *realm << dendl;
6294 snapc = &realm->get_snap_context();
6295 } else {
6296 dout(10) << " NO realm, using null context" << dendl;
6297 snapc = &nullsnap;
6298 assert(in->last == CEPH_NOSNAP);
6299 }
6300 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6301 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6302 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6303 pi->truncate_seq, ceph::real_time::min(), 0,
6304 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6305 mds->finisher));
6306 }
6307
6308 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6309 CInode *in;
6310 MutationRef mut;
6311 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6312 MDCacheLogContext(m), in(i), mut(mu) {}
6313 void finish(int r) override {
6314 mdcache->truncate_inode_logged(in, mut);
6315 }
6316 };
6317
6318 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6319 {
6320 dout(10) << "truncate_inode_finish " << *in << dendl;
6321
6322 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6323 assert(p != ls->truncating_inodes.end());
6324 ls->truncating_inodes.erase(p);
6325
6326 // update
6327 inode_t *pi = in->project_inode();
6328 pi->version = in->pre_dirty();
6329 pi->truncate_from = 0;
6330 pi->truncate_pending--;
6331
6332 MutationRef mut(new MutationImpl());
6333 mut->ls = mds->mdlog->get_current_segment();
6334 mut->add_projected_inode(in);
6335
6336 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6337 mds->mdlog->start_entry(le);
6338 CDentry *dn = in->get_projected_parent_dn();
6339 le->metablob.add_dir_context(dn->get_dir());
6340 le->metablob.add_primary_dentry(dn, in, true);
6341 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6342
6343 journal_dirty_inode(mut.get(), &le->metablob, in);
6344 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6345
6346 // flush immediately if there are readers/writers waiting
6347 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6348 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6349 mds->mdlog->flush();
6350 }
6351
6352 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6353 {
6354 dout(10) << "truncate_inode_logged " << *in << dendl;
6355 mut->apply();
6356 mds->locker->drop_locks(mut.get());
6357 mut->cleanup();
6358
6359 in->put(CInode::PIN_TRUNCATING);
6360 in->auth_unpin(this);
6361
6362 list<MDSInternalContextBase*> waiters;
6363 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6364 mds->queue_waiters(waiters);
6365 }
6366
6367
6368 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6369 {
6370 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6371 << ls->seq << "/" << ls->offset << dendl;
6372 ls->truncating_inodes.insert(in);
6373 in->get(CInode::PIN_TRUNCATING);
6374 }
6375
6376 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6377 {
6378 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6379 << ls->seq << "/" << ls->offset << dendl;
6380 // if we have the logseg the truncate started in, it must be in our list.
6381 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6382 assert(p != ls->truncating_inodes.end());
6383 ls->truncating_inodes.erase(p);
6384 in->put(CInode::PIN_TRUNCATING);
6385 }
6386
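// After recovery, restart truncates recorded in journal segments; defer any
// inode that still has snapped buffered data to flush.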
6387 void MDCache::start_recovered_truncates()
6388 {
6389 dout(10) << "start_recovered_truncates" << dendl;
6390 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6391 p != mds->mdlog->segments.end();
6392 ++p) {
6393 LogSegment *ls = p->second;
6394 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6395 q != ls->truncating_inodes.end();
6396 ++q) {
6397 CInode *in = *q;
6398 in->auth_pin(this);
6399
6400 if (!in->client_need_snapflush.empty() &&
6401 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6402 assert(in->filelock.is_stable());
6403 in->filelock.set_state(LOCK_XLOCKDONE);
6404 in->auth_pin(&in->filelock);
6405 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6406 // start_files_to_recover will revoke caps
6407 continue;
6408 }
6409 _truncate_inode(in, ls);
6410 }
6411 }
6412 }
6413
6414
6415
6416
6417
6418
6419 // ================================================================================
6420 // cache trimming
6421
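// Drain the bottom LRU, then expire dentries from the main LRU until 'count'
// have been trimmed and the cache is no longer over its limit; dentries that
// can't be expired yet are re-inserted at the LRU midpoint.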
6422 void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6423 {
6424 bool is_standby_replay = mds->is_standby_replay();
6425 std::vector<CDentry *> unexpirables;
6426 uint64_t trimmed = 0;
6427
6428 dout(7) << "trim_lru trimming " << count
6429 << " items from LRU"
6430 << " size=" << lru.lru_get_size()
6431 << " mid=" << lru.lru_get_top()
6432 << " pintail=" << lru.lru_get_pintail()
6433 << " pinned=" << lru.lru_get_num_pinned()
6434 << dendl;
6435
6436 for (;;) {
6437 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6438 if (!dn)
6439 break;
6440 if (trim_dentry(dn, expiremap)) {
6441 unexpirables.push_back(dn);
6442 } else {
6443 trimmed++;
6444 }
6445 }
6446
6447 for (auto &dn : unexpirables) {
6448 bottom_lru.lru_insert_mid(dn);
6449 }
6450 unexpirables.clear();
6451
6452 // trim dentries from the LRU until 'count' have been trimmed and the cache is no longer too full
6453 while (cache_toofull() || count > 0) {
6454 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6455 if (!dn) {
6456 break;
6457 }
6458 if ((is_standby_replay && dn->get_linkage()->inode &&
6459 dn->get_linkage()->inode->item_open_file.is_on_list())) {
6460 unexpirables.push_back(dn);
6461 } else if (trim_dentry(dn, expiremap)) {
6462 unexpirables.push_back(dn);
6463 } else {
6464 trimmed++;
6465 if (count > 0) count--;
6466 }
6467 }
6468
6469 for (auto &dn : unexpirables) {
6470 lru.lru_insert_mid(dn);
6471 }
6472 unexpirables.clear();
6473
6474 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6475 }
6476
6477 /*
6478 * note: only called while MDS is active or stopping... NOT during recovery.
6479 * however, we may expire a replica whose authority is recovering.
6480 *
6481 * @param count is number of dentries to try to expire
6482 */
6483 bool MDCache::trim(uint64_t count)
6484 {
6485 uint64_t used = cache_size();
6486 uint64_t limit = cache_limit_memory();
6487 map<mds_rank_t, MCacheExpire*> expiremap;
6488
6489 dout(7) << "trim bytes_used=" << bytes2str(used)
6490 << " limit=" << bytes2str(limit)
6491 << " reservation=" << cache_reservation()
6492 << "% count=" << count << dendl;
6493
6494 // process delayed eval_stray()
6495 stray_manager.advance_delayed();
6496
6497 trim_lru(count, expiremap);
6498
6499 // trim non-auth, non-bound subtrees
6500 for (auto p = subtrees.begin(); p != subtrees.end();) {
6501 CDir *dir = p->first;
6502 ++p;
6503 CInode *diri = dir->get_inode();
6504 if (dir->is_auth()) {
6505 if (!diri->is_auth() && !diri->is_base() &&
6506 dir->get_num_head_items() == 0) {
6507 if (dir->state_test(CDir::STATE_EXPORTING) ||
6508 !(mds->is_active() || mds->is_stopping()) ||
6509 dir->is_freezing() || dir->is_frozen())
6510 continue;
6511
6512 migrator->export_empty_import(dir);
6513 }
6514 } else {
6515 if (!diri->is_auth()) {
6516 if (dir->get_num_ref() > 1) // only subtree pin
6517 continue;
6518 list<CDir*> ls;
6519 diri->get_subtree_dirfrags(ls);
6520 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6521 continue;
6522
6523 // don't trim a subtree root if its auth MDS is recovering.
6524 // This simplifies the cache rejoin code.
6525 if (dir->is_subtree_root() &&
6526 rejoin_ack_gather.count(dir->get_dir_auth().first))
6527 continue;
6528 trim_dirfrag(dir, 0, expiremap);
6529 }
6530 }
6531 }
6532
6533 // trim root?
6534 if (mds->is_stopping() && root) {
6535 list<CDir*> ls;
6536 root->get_dirfrags(ls);
6537 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6538 CDir *dir = *p;
6539 if (dir->get_num_ref() == 1) // subtree pin
6540 trim_dirfrag(dir, 0, expiremap);
6541 }
6542 if (root->get_num_ref() == 0)
6543 trim_inode(0, root, 0, expiremap);
6544 }
6545
6546 std::set<mds_rank_t> stopping;
6547 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6548 stopping.erase(mds->get_nodeid());
6549 for (auto rank : stopping) {
6550 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6551 if (!mdsdir_in)
6552 continue;
6553
6554 if (expiremap.count(rank) == 0) {
6555 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6556 }
6557
6558 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6559
6560 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6561 if (!aborted) {
6562 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6563 list<CDir*> ls;
6564 mdsdir_in->get_dirfrags(ls);
6565 for (auto dir : ls) {
6566 if (dir->get_num_ref() == 1) // subtree pin
6567 trim_dirfrag(dir, dir, expiremap);
6568 }
6569 if (mdsdir_in->get_num_ref() == 0)
6570 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6571 } else {
6572 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6573 }
6574 }
6575
6576 // Other rank's base inodes (when I'm stopping)
6577 if (mds->is_stopping()) {
6578 for (set<CInode*>::iterator p = base_inodes.begin();
6579 p != base_inodes.end(); ++p) {
6580 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6581 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6582 if ((*p)->get_num_ref() == 0) {
6583 trim_inode(NULL, *p, NULL, expiremap);
6584 }
6585 }
6586 }
6587 }
6588
6589 // send any expire messages
6590 send_expire_messages(expiremap);
6591
6592 return true;
6593 }
6594
6595 void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6596 {
6597 // send expires
6598 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6599 it != expiremap.end();
6600 ++it) {
6601 if (mds->is_cluster_degraded() &&
6602 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6603 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6604 rejoin_sent.count(it->first) == 0))) {
6605 it->second->put();
6606 continue;
6607 }
6608 dout(7) << "sending cache_expire to " << it->first << dendl;
6609 mds->send_message_mds(it->second, it->first);
6610 }
6611 }
6612
6613
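// Try to expire a single dentry (and its primary inode, if any), notifying
// the dentry's authority when we hold a replica. Returns true if the dentry
// must be kept (caller re-inserts it into the LRU), false if it was trimmed.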
6614 bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6615 {
6616 dout(12) << "trim_dentry " << *dn << dendl;
6617
6618 CDentry::linkage_t *dnl = dn->get_linkage();
6619
6620 CDir *dir = dn->get_dir();
6621 assert(dir);
6622
6623 CDir *con = get_subtree_root(dir);
6624 if (con)
6625 dout(12) << " in container " << *con << dendl;
6626 else {
6627 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6628 assert(dn->is_auth());
6629 }
6630
6631 // If a replica dentry is not readable, it's likely we will receive an
6632 // MDentryLink/MDentryUnlink message soon (it's possible we first
6633 // receive an MDentryUnlink message, then an MDentryLink message).
6634 // An MDentryLink message only replicates the inode, so we should
6635 // avoid trimming the inode's parent dentry, because unconnected
6636 // replicas are problematic for subtree migration.
6637 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6638 !dn->get_dir()->get_inode()->is_stray())
6639 return true;
6640
6641 // adjust the dir state
6642 // NOTE: we can safely remove a clean, null dentry without affecting
6643 // directory completeness.
6644 // (check this _before_ we unlink the inode, below!)
6645 bool clear_complete = false;
6646 if (!(dnl->is_null() && dn->is_clean()))
6647 clear_complete = true;
6648
6649 // unlink the dentry
6650 if (dnl->is_remote()) {
6651 // just unlink.
6652 dir->unlink_inode(dn, false);
6653 } else if (dnl->is_primary()) {
6654 // expire the inode, too.
6655 CInode *in = dnl->get_inode();
6656 assert(in);
6657 if (trim_inode(dn, in, con, expiremap))
6658 return true; // purging stray instead of trimming
6659 } else {
6660 assert(dnl->is_null());
6661 }
6662
6663 if (!dn->is_auth()) {
6664 // notify dentry authority.
6665 mds_authority_t auth = dn->authority();
6666
6667 for (int p=0; p<2; p++) {
6668 mds_rank_t a = auth.first;
6669 if (p) a = auth.second;
6670 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6671 if (mds->get_nodeid() == auth.second &&
6672 con->is_importing()) break; // don't send any expire while importing.
6673 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6674
6675 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6676 assert(a != mds->get_nodeid());
6677 if (expiremap.count(a) == 0)
6678 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6679 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
6680 }
6681 }
6682
6683 // remove dentry
6684 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6685 dir->add_to_bloom(dn);
6686 dir->remove_dentry(dn);
6687
6688 if (clear_complete)
6689 dir->state_clear(CDir::STATE_COMPLETE);
6690
6691 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6692 return false;
6693 }
6694
6695
6696 void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6697 {
6698 dout(15) << "trim_dirfrag " << *dir << dendl;
6699
6700 if (dir->is_subtree_root()) {
6701 assert(!dir->is_auth() ||
6702 (!dir->is_replicated() && dir->inode->is_base()));
6703 remove_subtree(dir); // remove from subtree map
6704 }
6705 assert(dir->get_num_ref() == 0);
6706
6707 CInode *in = dir->get_inode();
6708
6709 if (!dir->is_auth()) {
6710 mds_authority_t auth = dir->authority();
6711
6712 // was this an auth delegation? (if so, slightly modified container)
6713 dirfrag_t condf;
6714 if (dir->is_subtree_root()) {
6715 dout(12) << " subtree root, container is " << *dir << dendl;
6716 con = dir;
6717 condf = dir->dirfrag();
6718 } else {
6719 condf = con->dirfrag();
6720 }
6721
6722 for (int p=0; p<2; p++) {
6723 mds_rank_t a = auth.first;
6724 if (p) a = auth.second;
6725 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6726 if (mds->get_nodeid() == auth.second &&
6727 con->is_importing()) break; // don't send any expire while importing.
6728 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6729
6730 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6731 assert(a != mds->get_nodeid());
6732 if (expiremap.count(a) == 0)
6733 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6734 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6735 }
6736 }
6737
6738 in->close_dirfrag(dir->dirfrag().frag);
6739 }
6740
6741 /**
6742 * Try trimming an inode from the cache
6743 *
6744 * @return true if the inode is still in cache, else false if it was trimmed
6745 */
6746 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6747 {
6748 dout(15) << "trim_inode " << *in << dendl;
6749 assert(in->get_num_ref() == 0);
6750
6751 if (in->is_dir()) {
6752 // If replica inode's dirfragtreelock is not readable, it's likely
6753 // some dirfrags of the inode are being fragmented and we will receive
6754 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6755 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6756 // This is because unconnected replicas are problematic for
6757 // subtree migration.
6758 //
6759 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6760 return true;
6761
6762 // DIR
6763 list<CDir*> dfls;
6764 in->get_dirfrags(dfls);
6765 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6766 CDir *dir = *p;
6767 assert(!dir->is_subtree_root());
6768 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6769 }
6770 }
6771
6772 // INODE
6773 if (in->is_auth()) {
6774 // eval stray after closing dirfrags
6775 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6776 maybe_eval_stray(in);
6777 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6778 return true;
6779 }
6780 } else {
6781 mds_authority_t auth = in->authority();
6782
6783 dirfrag_t df;
6784 if (con)
6785 df = con->dirfrag();
6786 else
6787 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6788
6789 for (int p=0; p<2; p++) {
6790 mds_rank_t a = auth.first;
6791 if (p) a = auth.second;
6792 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6793 if (con && mds->get_nodeid() == auth.second &&
6794 con->is_importing()) break; // don't send any expire while importing.
6795 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6796
6797 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6798 assert(a != mds->get_nodeid());
6799 if (expiremap.count(a) == 0)
6800 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6801 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6802 }
6803 }
6804
6805 /*
6806 if (in->is_auth()) {
6807 if (in->hack_accessed)
6808 mds->logger->inc("outt");
6809 else {
6810 mds->logger->inc("outut");
6811 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6812 }
6813 }
6814 */
6815
6816 // unlink
6817 if (dn)
6818 dn->get_dir()->unlink_inode(dn, false);
6819 remove_inode(in);
6820 return false;
6821 }
6822
6823
6824 /**
6825 * trim_non_auth - remove any non-auth items from our cache
6826 *
6827 * this reduces the amount of non-auth metadata in our cache, reducing the
6828 * load incurred by the rejoin phase.
6829 *
6830 * the only non-auth items that remain are those that are needed to
6831 * attach our own subtrees to the root.
6832 *
6833 * when we are done, all dentries will be in the top bit of the lru.
6834 *
6835 * why we have to do this:
6836 * we may not have accurate linkage for non-auth items, which means we will
6837 * not know which subtree an item falls into and cannot be sure to declare
6838 * it to the correct authority.
6839 */
6840 void MDCache::trim_non_auth()
6841 {
6842 dout(7) << "trim_non_auth" << dendl;
6843
6844 // temporarily pin all subtree roots
6845 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6846 p != subtrees.end();
6847 ++p)
6848 p->first->get(CDir::PIN_SUBTREETEMP);
6849
6850 list<CDentry*> auth_list;
6851
6852 // trim non-auth items from the lru
6853 for (;;) {
6854 CDentry *dn = NULL;
6855 if (bottom_lru.lru_get_size() > 0)
6856 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6857 if (!dn && lru.lru_get_size() > 0)
6858 dn = static_cast<CDentry*>(lru.lru_expire());
6859 if (!dn)
6860 break;
6861
6862 CDentry::linkage_t *dnl = dn->get_linkage();
6863
6864 if (dn->is_auth()) {
6865 // add back into lru (at the top)
6866 auth_list.push_back(dn);
6867
6868 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6869 dn->unlink_remote(dnl);
6870 } else {
6871 // non-auth. expire.
6872 CDir *dir = dn->get_dir();
6873 assert(dir);
6874
6875 // unlink the dentry
6876 dout(10) << " removing " << *dn << dendl;
6877 if (dnl->is_remote()) {
6878 dir->unlink_inode(dn, false);
6879 }
6880 else if (dnl->is_primary()) {
6881 CInode *in = dnl->get_inode();
6882 dout(10) << " removing " << *in << dendl;
6883 list<CDir*> ls;
6884 in->get_dirfrags(ls);
6885 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6886 CDir *subdir = *p;
6887 assert(!subdir->is_subtree_root());
6888 in->close_dirfrag(subdir->dirfrag().frag);
6889 }
6890 dir->unlink_inode(dn, false);
6891 remove_inode(in);
6892 }
6893 else {
6894 assert(dnl->is_null());
6895 }
6896
6897 assert(!dir->has_bloom());
6898 dir->remove_dentry(dn);
6899 // adjust the dir state
6900 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6901 // close empty non-auth dirfrag
6902 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6903 dir->inode->close_dirfrag(dir->get_frag());
6904 }
6905 }
6906
6907 for (auto dn : auth_list) {
6908 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6909 bottom_lru.lru_insert_mid(dn);
6910 else
6911 lru.lru_insert_top(dn);
6912 }
6913
6914 // move everything in the pintail to the top bit of the lru.
6915 lru.lru_touch_entire_pintail();
6916
6917 // unpin all subtrees
6918 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6919 p != subtrees.end();
6920 ++p)
6921 p->first->put(CDir::PIN_SUBTREETEMP);
6922
6923 if (lru.lru_get_size() == 0 &&
6924 bottom_lru.lru_get_size() == 0) {
6925 // root, stray, etc.?
6926 auto p = inode_map.begin();
6927 while (p != inode_map.end()) {
6928 CInode *in = p->second;
6929 ++p;
6930 if (!in->is_auth()) {
6931 list<CDir*> ls;
6932 in->get_dirfrags(ls);
6933 for (list<CDir*>::iterator p = ls.begin();
6934 p != ls.end();
6935 ++p) {
6936 dout(10) << " removing " << **p << dendl;
6937 assert((*p)->get_num_ref() == 1); // SUBTREE
6938 remove_subtree((*p));
6939 in->close_dirfrag((*p)->dirfrag().frag);
6940 }
6941 dout(10) << " removing " << *in << dendl;
6942 assert(!in->get_parent_dn());
6943 assert(in->get_num_ref() == 0);
6944 remove_inode(in);
6945 }
6946 }
6947 }
6948
6949 show_subtrees();
6950 }
6951
6952 /**
6953 * Recursively trim the subtree rooted at directory to remove all
6954 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6955 * of those links. This is used to clear invalid data out of the cache.
6956 * Note that it doesn't clear the passed-in directory, since that's not
6957 * always safe.
6958 */
6959 bool MDCache::trim_non_auth_subtree(CDir *dir)
6960 {
6961 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6962
6963 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6964
6965 CDir::map_t::iterator j = dir->begin();
6966 CDir::map_t::iterator i = j;
6967 while (j != dir->end()) {
6968 i = j++;
6969 CDentry *dn = i->second;
6970 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6971 CDentry::linkage_t *dnl = dn->get_linkage();
6972 if (dnl->is_primary()) { // check for subdirectories, etc
6973 CInode *in = dnl->get_inode();
6974 bool keep_inode = false;
6975 if (in->is_dir()) {
6976 list<CDir*> subdirs;
6977 in->get_dirfrags(subdirs);
6978 for (list<CDir*>::iterator subdir = subdirs.begin();
6979 subdir != subdirs.end();
6980 ++subdir) {
6981 if ((*subdir)->is_subtree_root()) {
6982 keep_inode = true;
6983 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
6984 } else {
6985 if (trim_non_auth_subtree(*subdir))
6986 keep_inode = true;
6987 else {
6988 in->close_dirfrag((*subdir)->get_frag());
6989 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
6990 }
6991 }
6992 }
6993
6994 }
6995 if (!keep_inode) { // remove it!
6996 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
6997 dir->unlink_inode(dn, false);
6998 remove_inode(in);
6999 assert(!dir->has_bloom());
7000 dir->remove_dentry(dn);
7001 } else {
7002 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
7003 dn->state_clear(CDentry::STATE_AUTH);
7004 in->state_clear(CInode::STATE_AUTH);
7005 }
7006 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7007 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
7008 } else { // just remove it
7009 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7010 if (dnl->is_remote())
7011 dir->unlink_inode(dn, false);
7012 dir->remove_dentry(dn);
7013 }
7014 }
7015 dir->state_clear(CDir::STATE_AUTH);
7016 /**
7017 * We've now checked all our children and deleted those that need it.
7018 * Now return to caller, and tell them if *we're* a keeper.
7019 */
7020 return keep_dir || dir->get_num_any();
7021 }
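
/*
 * Standalone sketch (not part of MDCache.cc) of the recursive keep/trim
 * contract implemented by trim_non_auth_subtree() above: each call prunes the
 * removable children and then reports back whether *this* node still has to be
 * kept, mirroring the "return keep_dir || dir->get_num_any()" line. Node and
 * its 'pinned' flag are hypothetical stand-ins for CDir/CInode and the
 * subtree-root case that is never trimmed.
 */
#include <vector>
#include <memory>
#include <algorithm>
#include <iostream>

struct Node {
  bool pinned = false;                        // e.g. a subtree root we may never drop
  std::vector<std::unique_ptr<Node>> children;
};

// Returns true if 'n' must be kept by the caller.
bool trim_sketch(Node &n)
{
  auto &kids = n.children;
  kids.erase(std::remove_if(kids.begin(), kids.end(),
                            [](const std::unique_ptr<Node> &c) {
                              // remove the child only if it is not pinned and
                              // nothing underneath it needs to survive
                              return !c->pinned && !trim_sketch(*c);
                            }),
             kids.end());
  return n.pinned || !kids.empty();
}

int main() {
  Node root;
  root.children.push_back(std::make_unique<Node>());   // removable leaf
  auto keeper = std::make_unique<Node>();
  keeper->pinned = true;                                // behaves like a subtree root
  root.children.push_back(std::move(keeper));
  bool keep = trim_sketch(root);
  std::cout << "keep root: " << keep                    // keep root: 1
            << ", children left: " << root.children.size() << "\n";  // children left: 1
}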
7022
7023 /*
7024 * during replay, when we determine a subtree is no longer ours, we
7025 * try to trim it from our cache. because subtrees must be connected
7026 * to the root, the fact that we can trim this tree may mean that our
7027 * children or parents can also be trimmed.
7028 */
7029 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7030 {
7031 dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
7032
7033 // can we now trim child subtrees?
7034 set<CDir*> bounds;
7035 get_subtree_bounds(dir, bounds);
7036 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7037 CDir *bd = *p;
7038 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7039 bd->get_num_any() == 0 && // and empty
7040 can_trim_non_auth_dirfrag(bd)) {
7041 CInode *bi = bd->get_inode();
7042 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7043 remove_subtree(bd);
7044 bd->mark_clean();
7045 bi->close_dirfrag(bd->get_frag());
7046 }
7047 }
7048
7049 if (trim_non_auth_subtree(dir)) {
7050 // keep
7051 try_subtree_merge(dir);
7052 } else {
7053 // can we trim this subtree (and possibly our ancestors) too?
7054 while (true) {
7055 CInode *diri = dir->get_inode();
7056 if (diri->is_base()) {
7057 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7058 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7059 remove_subtree(dir);
7060 dir->mark_clean();
7061 diri->close_dirfrag(dir->get_frag());
7062
7063 dout(10) << " removing " << *diri << dendl;
7064 assert(!diri->get_parent_dn());
7065 assert(diri->get_num_ref() == 0);
7066 remove_inode(diri);
7067 }
7068 break;
7069 }
7070
7071 CDir *psub = get_subtree_root(diri->get_parent_dir());
7072 dout(10) << " parent subtree is " << *psub << dendl;
7073 if (psub->get_dir_auth().first == mds->get_nodeid())
7074 break; // we are auth, keep.
7075
7076 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7077 remove_subtree(dir);
7078 dir->mark_clean();
7079 diri->close_dirfrag(dir->get_frag());
7080
7081 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7082 if (trim_non_auth_subtree(psub))
7083 break;
7084 dir = psub;
7085 }
7086 }
7087
7088 show_subtrees();
7089 }
7090
7091 void MDCache::standby_trim_segment(LogSegment *ls)
7092 {
7093 ls->new_dirfrags.clear_list();
7094 ls->open_files.clear_list();
7095
7096 while (!ls->dirty_dirfrags.empty()) {
7097 CDir *dir = ls->dirty_dirfrags.front();
7098 dir->mark_clean();
7099 }
7100 while (!ls->dirty_inodes.empty()) {
7101 CInode *in = ls->dirty_inodes.front();
7102 in->mark_clean();
7103 }
7104 while (!ls->dirty_dentries.empty()) {
7105 CDentry *dn = ls->dirty_dentries.front();
7106 dn->mark_clean();
7107 }
7108 while (!ls->dirty_parent_inodes.empty()) {
7109 CInode *in = ls->dirty_parent_inodes.front();
7110 in->clear_dirty_parent();
7111 }
7112 while (!ls->dirty_dirfrag_dir.empty()) {
7113 CInode *in = ls->dirty_dirfrag_dir.front();
7114 in->filelock.remove_dirty();
7115 }
7116 while (!ls->dirty_dirfrag_nest.empty()) {
7117 CInode *in = ls->dirty_dirfrag_nest.front();
7118 in->nestlock.remove_dirty();
7119 }
7120 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7121 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7122 in->dirfragtreelock.remove_dirty();
7123 }
7124 }
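
/*
 * Standalone sketch (not part of MDCache.cc) of why the loops in
 * standby_trim_segment() above terminate: each "mark clean" style call is
 * expected to unlink the object from the segment's dirty list, so repeatedly
 * cleaning front() drains the list. DirtyList and Item are hypothetical
 * stand-ins for the intrusive list members kept by LogSegment.
 */
#include <list>
#include <cassert>

struct Item;

struct DirtyList {
  std::list<Item*> items;
  bool empty() const { return items.empty(); }
  Item *front() const { return items.front(); }
};

struct Item {
  DirtyList *on_list = nullptr;
  void mark_clean() {
    // the crucial part: cleaning removes us from the dirty list,
    // otherwise the drain loop below would never make progress
    if (on_list) {
      on_list->items.remove(this);
      on_list = nullptr;
    }
  }
};

int main() {
  DirtyList dirty;
  Item a, b;
  a.on_list = &dirty;
  b.on_list = &dirty;
  dirty.items = {&a, &b};

  // same shape as standby_trim_segment(): clean the front until the list drains
  while (!dirty.empty())
    dirty.front()->mark_clean();

  assert(dirty.empty());
}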
7125
7126 /* This function DOES put the passed message before returning */
7127 void MDCache::handle_cache_expire(MCacheExpire *m)
7128 {
7129 mds_rank_t from = mds_rank_t(m->get_from());
7130
7131 dout(7) << "cache_expire from mds." << from << dendl;
7132
7133 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7134 m->put();
7135 return;
7136 }
7137
7138 set<SimpleLock *> gather_locks;
7139 // loop over realms
7140 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7141 p != m->realms.end();
7142 ++p) {
7143 // check container?
7144 if (p->first.ino > 0) {
7145 CInode *expired_inode = get_inode(p->first.ino);
7146 assert(expired_inode); // we had better have this.
7147 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7148 assert(parent_dir);
7149
7150 int export_state = -1;
7151 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7152 export_state = migrator->get_export_state(parent_dir);
7153 assert(export_state >= 0);
7154 }
7155
7156 if (!parent_dir->is_auth() ||
7157 (export_state != -1 &&
7158 ((export_state == Migrator::EXPORT_WARNING &&
7159 migrator->export_has_warned(parent_dir,from)) ||
7160 export_state == Migrator::EXPORT_EXPORTING ||
7161 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7162 (export_state == Migrator::EXPORT_NOTIFYING &&
7163 !migrator->export_has_notified(parent_dir,from))))) {
7164
7165 // not auth.
7166 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7167 assert(parent_dir->is_frozen_tree_root());
7168
7169 // make a message container
7170 if (delayed_expire[parent_dir].count(from) == 0)
7171 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7172
7173 // merge these expires into it
7174 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7175 continue;
7176 }
7177 assert(export_state <= Migrator::EXPORT_PREPPING ||
7178 (export_state == Migrator::EXPORT_WARNING &&
7179 !migrator->export_has_warned(parent_dir, from)));
7180
7181 dout(7) << "expires for " << *parent_dir << dendl;
7182 } else {
7183 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7184 }
7185
7186 // INODES
7187 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7188 it != p->second.inodes.end();
7189 ++it) {
7190 CInode *in = get_inode(it->first);
7191 unsigned nonce = it->second;
7192
7193 if (!in) {
7194 dout(0) << " inode expire on " << it->first << " from " << from
7195 << ", don't have it" << dendl;
7196 assert(in);
7197 }
7198 assert(in->is_auth());
7199 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7200
7201 // check nonce
7202 if (nonce == in->get_replica_nonce(from)) {
7203 // remove from our cached_by
7204 dout(7) << " inode expire on " << *in << " from mds." << from
7205 << " cached_by was " << in->get_replicas() << dendl;
7206 inode_remove_replica(in, from, false, gather_locks);
7207 }
7208 else {
7209 // this is an old nonce, ignore expire.
7210 dout(7) << " inode expire on " << *in << " from mds." << from
7211 << " with old nonce " << nonce
7212 << " (current " << in->get_replica_nonce(from) << "), dropping"
7213 << dendl;
7214 }
7215 }
7216
7217 // DIRS
7218 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7219 it != p->second.dirs.end();
7220 ++it) {
7221 CDir *dir = get_dirfrag(it->first);
7222 unsigned nonce = it->second;
7223
7224 if (!dir) {
7225 CInode *diri = get_inode(it->first.ino);
7226 if (diri) {
7227 if (mds->is_rejoin() &&
7228 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7229 !diri->is_replica(from)) {
7230 list<CDir*> ls;
7231 diri->get_nested_dirfrags(ls);
7232 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7233 << " while rejoining, inode isn't replicated" << dendl;
7234 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7235 dir = *q;
7236 if (dir->is_replica(from)) {
7237 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7238 dir->remove_replica(from);
7239 }
7240 }
7241 continue;
7242 }
7243 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7244 if (other) {
7245 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7246 << " have " << *other << ", mismatched frags, dropping" << dendl;
7247 continue;
7248 }
7249 }
7250 dout(0) << " dir expire on " << it->first << " from " << from
7251 << ", don't have it" << dendl;
7252 assert(dir);
7253 }
7254 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7255
7256 assert(dir->is_auth());
7257
7258 // check nonce
7259 if (nonce == dir->get_replica_nonce(from)) {
7260 // remove from our cached_by
7261 dout(7) << " dir expire on " << *dir << " from mds." << from
7262 << " replicas was " << dir->get_replicas() << dendl;
7263 dir->remove_replica(from);
7264 }
7265 else {
7266 // this is an old nonce, ignore expire.
7267 dout(7) << " dir expire on " << *dir << " from mds." << from
7268 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7269 << "), dropping" << dendl;
7270 }
7271 }
7272
7273 // DENTRIES
7274 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7275 pd != p->second.dentries.end();
7276 ++pd) {
7277 dout(10) << " dn expires in dir " << pd->first << dendl;
7278 CInode *diri = get_inode(pd->first.ino);
7279 assert(diri);
7280 CDir *dir = diri->get_dirfrag(pd->first.frag);
7281
7282 if (!dir) {
7283 dout(0) << " dn expires on " << pd->first << " from " << from
7284 << ", must have refragmented" << dendl;
7285 } else {
7286 assert(dir->is_auth());
7287 }
7288
7289 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7290 p != pd->second.end();
7291 ++p) {
7292 unsigned nonce = p->second;
7293 CDentry *dn;
7294
7295 if (dir) {
7296 dn = dir->lookup(p->first.first, p->first.second);
7297 } else {
7298 // which dirfrag for this dentry?
7299 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7300 assert(dir);
7301 assert(dir->is_auth());
7302 dn = dir->lookup(p->first.first, p->first.second);
7303 }
7304
7305 if (!dn) {
7306 if (dir)
7307 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7308 else
7309 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7310 }
7311 assert(dn);
7312
7313 if (nonce == dn->get_replica_nonce(from)) {
7314 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7315 dentry_remove_replica(dn, from, gather_locks);
7316 }
7317 else {
7318 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7319 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7320 << "), dropping" << dendl;
7321 }
7322 }
7323 }
7324 }
7325
7326 // done
7327 m->put();
7328
7329 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7330 if (!(*p)->is_stable())
7331 mds->locker->eval_gather(*p);
7332 }
7333 }
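
/*
 * Standalone sketch (not part of MDCache.cc) of the nonce check used above for
 * inode, dirfrag and dentry expires: an expire is honoured only if it carries
 * the replica nonce we currently have on record for that peer; otherwise the
 * peer was handed a newer replica after it sent the expire, and the message is
 * stale and dropped. ReplicaMap is a hypothetical stand-in for the per-object
 * replica tracking on CInode/CDir/CDentry.
 */
#include <map>
#include <iostream>

struct ReplicaMap {
  std::map<int, unsigned> nonce_by_rank;      // rank -> nonce handed out with the replica

  unsigned add_replica(int rank) {            // re-replication bumps the nonce
    return ++nonce_by_rank[rank];
  }

  // returns true if the expire was accepted and the replica removed
  bool handle_expire(int rank, unsigned nonce) {
    auto it = nonce_by_rank.find(rank);
    if (it == nonce_by_rank.end() || it->second != nonce)
      return false;                           // old nonce: ignore the expire
    nonce_by_rank.erase(it);
    return true;
  }
};

int main() {
  ReplicaMap obj;
  unsigned n1 = obj.add_replica(3);           // mds.3 gets a replica, nonce 1
  unsigned n2 = obj.add_replica(3);           // replicated again before its expire arrives
  std::cout << obj.handle_expire(3, n1)       // 0: stale expire, dropped
            << obj.handle_expire(3, n2)       // 1: current nonce, replica removed
            << "\n";
}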
7334
7335 void MDCache::process_delayed_expire(CDir *dir)
7336 {
7337 dout(7) << "process_delayed_expire on " << *dir << dendl;
7338 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7339 p != delayed_expire[dir].end();
7340 ++p)
7341 handle_cache_expire(p->second);
7342 delayed_expire.erase(dir);
7343 }
7344
7345 void MDCache::discard_delayed_expire(CDir *dir)
7346 {
7347 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7348 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7349 p != delayed_expire[dir].end();
7350 ++p)
7351 p->second->put();
7352 delayed_expire.erase(dir);
7353 }
7354
7355 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7356 set<SimpleLock *>& gather_locks)
7357 {
7358 in->remove_replica(from);
7359 in->mds_caps_wanted.erase(from);
7360
7361 // note: this code calls _eval more often than it needs to!
7362 // fix lock
7363 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7364 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7365 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7366 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7367 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7368 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7369
7370 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
7371 // don't remove the recovering mds from the lock's gathering list, because
7372 // it may hold rejoined wrlocks.
7373 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7374 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7375 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7376 }
7377
7378 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7379 {
7380 dn->remove_replica(from);
7381
7382 // fix lock
7383 if (dn->lock.remove_replica(from))
7384 gather_locks.insert(&dn->lock);
7385
7386 // Replicated strays might now be eligible for purge
7387 CDentry::linkage_t *dnl = dn->get_linkage();
7388 if (dnl->is_primary()) {
7389 maybe_eval_stray(dnl->get_inode());
7390 }
7391 }
7392
7393 void MDCache::trim_client_leases()
7394 {
7395 utime_t now = ceph_clock_now();
7396
7397 dout(10) << "trim_client_leases" << dendl;
7398
7399 for (int pool=0; pool<client_lease_pools; pool++) {
7400 int before = client_leases[pool].size();
7401 if (client_leases[pool].empty())
7402 continue;
7403
7404 while (!client_leases[pool].empty()) {
7405 ClientLease *r = client_leases[pool].front();
7406 if (r->ttl > now) break;
7407 CDentry *dn = static_cast<CDentry*>(r->parent);
7408 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7409 dn->remove_client_lease(r, mds->locker);
7410 }
7411 int after = client_leases[pool].size();
7412 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7413 << (before-after) << " leases, " << after << " left" << dendl;
7414 }
7415 }
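
/*
 * Standalone sketch (not part of MDCache.cc) of the trimming loop above: the
 * leases in each pool are assumed to be kept in ttl order, so expired entries
 * can be popped from the front and the loop can stop at the first lease that
 * is still live. Lease is a hypothetical stand-in for ClientLease.
 */
#include <list>
#include <iostream>

struct Lease {
  int client;
  double ttl;          // absolute expiry time
};

// removes expired leases, returns how many were trimmed
int trim_leases(std::list<Lease> &leases, double now)
{
  int trimmed = 0;
  while (!leases.empty()) {
    if (leases.front().ttl > now)
      break;                                  // everything behind it is newer; stop here
    std::cout << "expiring lease of client." << leases.front().client << "\n";
    leases.pop_front();
    ++trimmed;
  }
  return trimmed;
}

int main() {
  std::list<Lease> pool = {{1, 10.0}, {2, 20.0}, {3, 30.0}};
  int trimmed = trim_leases(pool, 25.0);
  std::cout << trimmed << " trimmed, " << pool.size() << " left\n";   // 2 trimmed, 1 left
}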
7416
7417
7418 void MDCache::check_memory_usage()
7419 {
7420 static MemoryModel mm(g_ceph_context);
7421 static MemoryModel::snap last;
7422 mm.sample(&last);
7423 static MemoryModel::snap baseline = last;
7424
7425 // check client caps
7426 assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
7427 double caps_per_inode = 0.0;
7428 if (CInode::count())
7429 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7430
7431 dout(2) << "check_memory_usage"
7432 << " total " << last.get_total()
7433 << ", rss " << last.get_rss()
7434 << ", heap " << last.get_heap()
7435 << ", baseline " << baseline.get_heap()
7436 << ", buffers " << (buffer::get_total_alloc() >> 10)
7437 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7438 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7439 << dendl;
7440
7441 mds->update_mlogger();
7442 mds->mlogger->set(l_mdm_rss, last.get_rss());
7443 mds->mlogger->set(l_mdm_heap, last.get_heap());
7444
7445 if (cache_toofull()) {
7446 last_recall_state = ceph_clock_now();
7447 mds->server->recall_client_state();
7448 }
7449
7450 // If the cache size had exceeded its limit, but we're back in bounds
7451 // now, free any unused pool memory so that our memory usage isn't
7452 // permanently bloated.
7453 if (exceeded_size_limit && !cache_toofull()) {
7454 // Only do this once we are back in bounds: otherwise the releases would
7455 // slow down whatever process caused us to exceed bounds to begin with
7456 if (ceph_using_tcmalloc()) {
7457 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7458 << dendl;
7459 ceph_heap_release_free_memory();
7460 }
7461 exceeded_size_limit = false;
7462 }
7463 }
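
/*
 * Standalone sketch (not part of MDCache.cc) of the hysteresis at the end of
 * check_memory_usage(): remember that the cache overflowed, and only perform
 * the (possibly expensive) heap release once usage has dropped back under the
 * limit, so the release never competes with whatever pushed us over. The
 * release_fn parameter is a hypothetical stand-in for
 * ceph_heap_release_free_memory().
 */
#include <functional>
#include <iostream>

struct CacheLimiter {
  unsigned long limit;
  bool exceeded;

  void check(unsigned long usage, const std::function<void()> &release_fn) {
    if (usage > limit) {
      exceeded = true;                        // note the overflow, but don't release yet
    } else if (exceeded) {
      release_fn();                           // back in bounds: release exactly once
      exceeded = false;
    }
  }
};

int main() {
  CacheLimiter lim{100, false};
  auto release = [] { std::cout << "releasing unused heap\n"; };
  lim.check(150, release);                    // over the limit: nothing released
  lim.check(140, release);                    // still over: nothing released
  lim.check(80, release);                     // back under: release fires once
  lim.check(70, release);                     // already released: no output
}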
7464
7465
7466
7467 // =========================================================================================
7468 // shutdown
7469
7470 class C_MDC_ShutdownCheck : public MDCacheContext {
7471 public:
7472 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7473 void finish(int) override {
7474 mdcache->shutdown_check();
7475 }
7476 };
7477
7478 void MDCache::shutdown_check()
7479 {
7480 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7481
7482 // cache
7483 char old_val[32] = { 0 };
7484 char *o = old_val;
7485 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7486 g_conf->set_val("debug_mds", "10");
7487 g_conf->apply_changes(NULL);
7488 show_cache();
7489 g_conf->set_val("debug_mds", old_val);
7490 g_conf->apply_changes(NULL);
7491 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7492
7493 // this
7494 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7495 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7496
7497
7498 if (mds->objecter->is_active()) {
7499 dout(0) << "objecter still active" << dendl;
7500 mds->objecter->dump_active();
7501 }
7502 }
7503
7504
7505 void MDCache::shutdown_start()
7506 {
7507 dout(2) << "shutdown_start" << dendl;
7508
7509 if (g_conf->mds_shutdown_check)
7510 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7511
7512 // g_conf->debug_mds = 10;
7513 }
7514
7515
7516
7517 bool MDCache::shutdown_pass()
7518 {
7519 dout(7) << "shutdown_pass" << dendl;
7520
7521 if (mds->is_stopped()) {
7522 dout(7) << " already shut down" << dendl;
7523 show_cache();
7524 show_subtrees();
7525 return true;
7526 }
7527
7528 // empty stray dir
7529 if (!shutdown_export_strays()) {
7530 dout(7) << "waiting for strays to migrate" << dendl;
7531 return false;
7532 }
7533
7534 // drop our reference to our stray dir inode
7535 for (int i = 0; i < NUM_STRAY; ++i) {
7536 if (strays[i] &&
7537 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7538 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7539 strays[i]->put(CInode::PIN_STRAY);
7540 strays[i]->put_stickydirs();
7541 }
7542 }
7543
7544 // trim cache
7545 trim(UINT64_MAX);
7546 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7547
7548 // SUBTREES
7549 int num_auth_subtree = 0;
7550 if (!subtrees.empty() &&
7551 mds->get_nodeid() != 0 &&
7552 migrator->get_export_queue_size() == 0) {
7553 dout(7) << "looking for subtrees to export to mds0" << dendl;
7554 list<CDir*> ls;
7555 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7556 it != subtrees.end();
7557 ++it) {
7558 CDir *dir = it->first;
7559 if (dir->get_inode()->is_mdsdir())
7560 continue;
7561 if (dir->is_auth()) {
7562 num_auth_subtree++;
7563 if (dir->is_frozen() ||
7564 dir->is_freezing() ||
7565 dir->is_ambiguous_dir_auth() ||
7566 dir->state_test(CDir::STATE_EXPORTING))
7567 continue;
7568 ls.push_back(dir);
7569 }
7570 }
7571 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7572 CDir *dir = *p;
7573 mds_rank_t dest = dir->get_inode()->authority().first;
7574 if (dest > 0 && !mds->mdsmap->is_active(dest))
7575 dest = 0;
7576 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7577 migrator->export_dir_nicely(dir, dest);
7578 }
7579 }
7580
7581 if (num_auth_subtree > 0) {
7582 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7583 show_subtrees();
7584 return false;
7585 }
7586
7587 // close out any sessions (and open files!) before we try to trim the log, etc.
7588 if (mds->sessionmap.have_unclosed_sessions()) {
7589 if (!mds->server->terminating_sessions)
7590 mds->server->terminate_sessions();
7591 return false;
7592 }
7593
7594 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7595 if (mydir && !mydir->is_subtree_root())
7596 mydir = NULL;
7597
7598 // subtrees map not empty yet?
7599 if (subtrees.size() > (mydir ? 1 : 0)) {
7600 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7601 show_subtrees();
7602 migrator->show_importing();
7603 migrator->show_exporting();
7604 if (!migrator->is_importing() && !migrator->is_exporting())
7605 show_cache();
7606 return false;
7607 }
7608 assert(!migrator->is_exporting());
7609 assert(!migrator->is_importing());
7610
7611 // flush what we can from the log
7612 mds->mdlog->trim(0);
7613 if (mds->mdlog->get_num_segments() > 1) {
7614 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7615 return false;
7616 }
7617
7618 if ((myin && myin->is_auth_pinned()) ||
7619 (mydir && mydir->is_auth_pinned())) {
7620 dout(7) << "still have auth pinned objects" << dendl;
7621 return false;
7622 }
7623
7624 // (only do this once!)
7625 if (!mds->mdlog->is_capped()) {
7626 dout(7) << "capping the log" << dendl;
7627 mds->mdlog->cap();
7628 mds->mdlog->trim();
7629 }
7630
7631 if (!mds->mdlog->empty()) {
7632 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7633 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7634 return false;
7635 }
7636
7637 if (!did_shutdown_log_cap) {
7638 // flush journal header
7639 dout(7) << "writing header for (now-empty) journal" << dendl;
7640 assert(mds->mdlog->empty());
7641 mds->mdlog->write_head(0);
7642 // NOTE: filer active checker below will block us until this completes.
7643 did_shutdown_log_cap = true;
7644 return false;
7645 }
7646
7647 // filer active?
7648 if (mds->objecter->is_active()) {
7649 dout(7) << "objecter still active" << dendl;
7650 mds->objecter->dump_active();
7651 return false;
7652 }
7653
7654 // trim what we can from the cache
7655 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7656 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7657 show_cache();
7658 //dump();
7659 return false;
7660 }
7661
7662 // make mydir subtree go away
7663 if (mydir) {
7664 if (mydir->get_num_ref() > 1) { // subtree pin
7665 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7666 show_cache();
7667 return false;
7668 }
7669
7670 remove_subtree(mydir);
7671 myin->close_dirfrag(mydir->get_frag());
7672 }
7673 assert(subtrees.empty());
7674
7675 if (myin)
7676 remove_inode(myin);
7677
7678 // done!
7679 dout(2) << "shutdown done." << dendl;
7680 return true;
7681 }
7682
7683 bool MDCache::shutdown_export_strays()
7684 {
7685 if (mds->get_nodeid() == 0)
7686 return true;
7687
7688 dout(10) << "shutdown_export_strays" << dendl;
7689
7690 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7691
7692 bool done = true;
7693
7694 list<CDir*> dfs;
7695 for (int i = 0; i < NUM_STRAY; ++i) {
7696 if (!strays[i]) {
7697 continue;
7698 }
7699 strays[i]->get_dirfrags(dfs);
7700 }
7701
7702 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7703 dfs_i != dfs.end(); ++dfs_i)
7704 {
7705 CDir *dir = *dfs_i;
7706
7707 if (!dir->is_complete()) {
7708 dir->fetch(0);
7709 done = false;
7710 if (!mds0_active)
7711 break;
7712 }
7713
7714 for (CDir::map_t::iterator p = dir->items.begin();
7715 p != dir->items.end();
7716 ++p) {
7717 CDentry *dn = p->second;
7718 CDentry::linkage_t *dnl = dn->get_linkage();
7719 if (dnl->is_null())
7720 continue;
7721 done = false;
7722 if (!mds0_active)
7723 break;
7724
7725 if (dn->state_test(CDentry::STATE_PURGING)) {
7726 // Don't try to migrate anything that is actually
7727 // being purged right now
7728 continue;
7729 }
7730
7731 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7732 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7733 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7734 } else {
7735 dout(10) << "already exporting " << *dn << dendl;
7736 }
7737 }
7738 }
7739
7740 return done;
7741 }
7742
7743 // ========= messaging ==============
7744
7745 /* This function DOES put the passed message before returning */
7746 void MDCache::dispatch(Message *m)
7747 {
7748 switch (m->get_type()) {
7749
7750 // RESOLVE
7751 case MSG_MDS_RESOLVE:
7752 handle_resolve(static_cast<MMDSResolve*>(m));
7753 break;
7754 case MSG_MDS_RESOLVEACK:
7755 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7756 break;
7757
7758 // REJOIN
7759 case MSG_MDS_CACHEREJOIN:
7760 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7761 break;
7762
7763 case MSG_MDS_DISCOVER:
7764 handle_discover(static_cast<MDiscover*>(m));
7765 break;
7766 case MSG_MDS_DISCOVERREPLY:
7767 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7768 break;
7769
7770 case MSG_MDS_DIRUPDATE:
7771 handle_dir_update(static_cast<MDirUpdate*>(m));
7772 break;
7773
7774 case MSG_MDS_CACHEEXPIRE:
7775 handle_cache_expire(static_cast<MCacheExpire*>(m));
7776 break;
7777
7778 case MSG_MDS_DENTRYLINK:
7779 handle_dentry_link(static_cast<MDentryLink*>(m));
7780 break;
7781 case MSG_MDS_DENTRYUNLINK:
7782 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7783 break;
7784
7785 case MSG_MDS_FRAGMENTNOTIFY:
7786 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7787 break;
7788
7789 case MSG_MDS_FINDINO:
7790 handle_find_ino(static_cast<MMDSFindIno *>(m));
7791 break;
7792 case MSG_MDS_FINDINOREPLY:
7793 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7794 break;
7795
7796 case MSG_MDS_OPENINO:
7797 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7798 break;
7799 case MSG_MDS_OPENINOREPLY:
7800 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7801 break;
7802
7803 default:
7804 derr << "cache unknown message " << m->get_type() << dendl;
7805 assert(0 == "cache unknown message");
7806 }
7807 }
7808
7809 MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7810 {
7811 if (mdr) {
7812 dout(20) << "_get_waiter retryrequest" << dendl;
7813 return new C_MDS_RetryRequest(this, mdr);
7814 } else if (req) {
7815 dout(20) << "_get_waiter retrymessage" << dendl;
7816 return new C_MDS_RetryMessage(mds, req);
7817 } else {
7818 return fin;
7819 }
7820 }
7821
7822 int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7823 const filepath& path, // what
7824 vector<CDentry*> *pdnvec, // result
7825 CInode **pin,
7826 int onfail)
7827 {
7828 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7829 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7830 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7831
7832 assert(mdr || req || fin);
7833 assert(!forward || mdr || req); // forward requires a request
7834
7835 snapid_t snapid = CEPH_NOSNAP;
7836 if (mdr)
7837 mdr->snapid = snapid;
7838
7839 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7840
7841 if (mds->logger) mds->logger->inc(l_mds_traverse);
7842
7843 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7844 CInode *cur = get_inode(path.get_ino());
7845 if (cur == NULL) {
7846 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7847 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7848 else {
7849 //ceph_abort(); // hrm.. broken
7850 return -ESTALE;
7851 }
7852 return 1;
7853 }
7854 if (cur->state_test(CInode::STATE_PURGING))
7855 return -ESTALE;
7856
7857 // make sure snaprealms are open...
7858 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7859 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7860 return 1;
7861 }
7862
7863 // start trace
7864 if (pdnvec)
7865 pdnvec->clear();
7866 if (pin)
7867 *pin = cur;
7868
7869 unsigned depth = 0;
7870 while (depth < path.depth()) {
7871 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7872 << "' snapid " << snapid << dendl;
7873
7874 if (!cur->is_dir()) {
7875 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7876 return -ENOTDIR;
7877 }
7878
7879 // walk into snapdir?
7880 if (path[depth].length() == 0) {
7881 dout(10) << "traverse: snapdir" << dendl;
7882 if (!mdr)
7883 return -EINVAL;
7884 snapid = CEPH_SNAPDIR;
7885 mdr->snapid = snapid;
7886 depth++;
7887 continue;
7888 }
7889 // walk thru snapdir?
7890 if (snapid == CEPH_SNAPDIR) {
7891 if (!mdr)
7892 return -EINVAL;
7893 SnapRealm *realm = cur->find_snaprealm();
7894 snapid = realm->resolve_snapname(path[depth], cur->ino());
7895 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7896 if (!snapid)
7897 return -ENOENT;
7898 mdr->snapid = snapid;
7899 depth++;
7900 continue;
7901 }
7902
7903 // open dir
7904 frag_t fg = cur->pick_dirfrag(path[depth]);
7905 CDir *curdir = cur->get_dirfrag(fg);
7906 if (!curdir) {
7907 if (cur->is_auth()) {
7908 // parent dir frozen_dir?
7909 if (cur->is_frozen()) {
7910 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7911 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7912 return 1;
7913 }
7914 curdir = cur->get_or_open_dirfrag(this, fg);
7915 } else {
7916 // discover?
7917 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7918 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7919 null_okay);
7920 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7921 return 1;
7922 }
7923 }
7924 assert(curdir);
7925
7926 #ifdef MDS_VERIFY_FRAGSTAT
7927 if (curdir->is_complete())
7928 curdir->verify_fragstat();
7929 #endif
7930
7931 // frozen?
7932 /*
7933 if (curdir->is_frozen()) {
7934 // doh!
7935 // FIXME: traverse is allowed?
7936 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7937 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7938 if (onfinish) delete onfinish;
7939 return 1;
7940 }
7941 */
7942
7943 // Before doing dirfrag->dn lookup, compare with DamageTable's
7944 // record of which dentries were unreadable
7945 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7946 dout(4) << "traverse: stopped lookup at damaged dentry "
7947 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7948 return -EIO;
7949 }
7950
7951 // dentry
7952 CDentry *dn = curdir->lookup(path[depth], snapid);
7953 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7954
7955 // null and last_bit and xlocked by me?
7956 if (dnl && dnl->is_null() && null_okay) {
7957 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7958 if (pdnvec)
7959 pdnvec->push_back(dn);
7960 if (pin)
7961 *pin = 0;
7962 break; // done!
7963 }
7964
7965 if (dnl &&
7966 dn->lock.is_xlocked() &&
7967 dn->lock.get_xlock_by() != mdr &&
7968 !dn->lock.can_read(client) &&
7969 (dnl->is_null() || forward)) {
7970 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7971 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7972 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7973 mds->mdlog->flush();
7974 return 1;
7975 }
7976
7977 // can we conclude ENOENT?
7978 if (dnl && dnl->is_null()) {
7979 if (dn->lock.can_read(client) ||
7980 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
7981 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
7982 if (pdnvec) {
7983 if (depth == path.depth() - 1)
7984 pdnvec->push_back(dn);
7985 else
7986 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
7987 }
7988 return -ENOENT;
7989 } else {
7990 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
7991 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7992 return 1;
7993 }
7994 }
7995
7996 if (dnl && !dnl->is_null()) {
7997 CInode *in = dnl->get_inode();
7998
7999 // do we have inode?
8000 if (!in) {
8001 assert(dnl->is_remote());
8002 // do i have it?
8003 in = get_inode(dnl->get_remote_ino());
8004 if (in) {
8005 dout(7) << "linking in remote in " << *in << dendl;
8006 dn->link_remote(dnl, in);
8007 } else {
8008 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8009 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8010 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8011 dout(4) << "traverse: remote dentry points to damaged ino "
8012 << *dn << dendl;
8013 return -EIO;
8014 }
8015 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8016 (null_okay && depth == path.depth() - 1));
8017 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8018 return 1;
8019 }
8020 }
8021
8022 cur = in;
8023 // make sure snaprealms are open...
8024 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8025 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8026 return 1;
8027 }
8028
8029 // add to trace, continue.
8030 touch_inode(cur);
8031 if (pdnvec)
8032 pdnvec->push_back(dn);
8033 if (pin)
8034 *pin = cur;
8035 depth++;
8036 continue;
8037 }
8038
8039
8040 // MISS. dentry doesn't exist.
8041 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8042
8043 if (curdir->is_auth()) {
8044 // dentry is mine.
8045 if (curdir->is_complete() ||
8046 (snapid == CEPH_NOSNAP &&
8047 curdir->has_bloom() &&
8048 !curdir->is_in_bloom(path[depth]))){
8049 // file not found
8050 if (pdnvec) {
8051 // instantiate a null dn?
8052 if (depth < path.depth()-1){
8053 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8054 dn = NULL;
8055 } else if (dn) {
8056 ceph_abort(); // should have fallen out in ->is_null() check above
8057 } else if (curdir->is_frozen()) {
8058 dout(20) << " not adding null to frozen dir " << dendl;
8059 } else if (snapid < CEPH_MAXSNAP) {
8060 dout(20) << " not adding null for snapid " << snapid << dendl;
8061 } else {
8062 // create a null dentry
8063 dn = curdir->add_null_dentry(path[depth]);
8064 dout(20) << " added null " << *dn << dendl;
8065 }
8066 if (dn)
8067 pdnvec->push_back(dn);
8068 else
8069 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8070 }
8071 return -ENOENT;
8072 } else {
8073
8074 // Check DamageTable for missing fragments before trying to fetch
8075 // this
8076 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8077 dout(4) << "traverse: damaged dirfrag " << *curdir
8078 << ", blocking fetch" << dendl;
8079 return -EIO;
8080 }
8081
8082 // directory isn't complete; reload
8083 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8084 touch_inode(cur);
8085 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8086 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8087 return 1;
8088 }
8089 } else {
8090 // dirfrag/dentry is not mine.
8091 mds_authority_t dauth = curdir->authority();
8092
8093 if (forward &&
8094 snapid && mdr && mdr->client_request &&
8095 (int)depth < mdr->client_request->get_num_fwd()) {
8096 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8097 << " < fwd " << mdr->client_request->get_num_fwd()
8098 << ", discovering instead of forwarding" << dendl;
8099 discover = true;
8100 }
8101
8102 if ((discover || null_okay)) {
8103 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8104 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8105 null_okay);
8106 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8107 return 1;
8108 }
8109 if (forward) {
8110 // forward
8111 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8112
8113 if (curdir->is_ambiguous_auth()) {
8114 // wait
8115 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8116 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8117 return 1;
8118 }
8119
8120 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8121
8122 if (mdr)
8123 request_forward(mdr, dauth.first);
8124 else
8125 mds->forward_message_mds(req, dauth.first);
8126
8127 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8128 assert(fin == NULL);
8129 return 2;
8130 }
8131 }
8132
8133 ceph_abort(); // i shouldn't get here
8134 }
8135
8136 // success.
8137 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8138 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8139 if (mdr)
8140 assert(mdr->snapid == snapid);
8141 return 0;
8142 }
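
/*
 * Standalone sketch (not part of MDCache.cc) of how a caller is expected to
 * react to path_traverse()'s return values, as far as they can be read off the
 * code above: r < 0 is an error to report, r == 0 means the trace/inode
 * results are filled in, r == 1 means a waiter was queued and the caller
 * should simply return (it will be retried), and r == 2 means the request was
 * forwarded to another MDS. traverse_stub and handle_result are hypothetical;
 * they only mimic the return-code contract.
 */
#include <cerrno>
#include <iostream>

int traverse_stub(int scripted_result) { return scripted_result; }

void handle_result(int r)
{
  if (r > 0) {
    // 1: waiting on a fetch/discover/lock; 2: forwarded to another rank
    std::cout << "not done yet (r=" << r << "), caller returns and waits\n";
    return;
  }
  if (r < 0) {
    std::cout << "reply with error " << r << "\n";    // e.g. -ENOENT, -EIO
    return;
  }
  std::cout << "traversal complete, trace and inode are valid\n";
}

int main() {
  for (int r : {1, 2, -ENOENT, 0})
    handle_result(traverse_stub(r));
}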
8143
8144 CInode *MDCache::cache_traverse(const filepath& fp)
8145 {
8146 dout(10) << "cache_traverse " << fp << dendl;
8147
8148 CInode *in;
8149 if (fp.get_ino())
8150 in = get_inode(fp.get_ino());
8151 else
8152 in = root;
8153 if (!in)
8154 return NULL;
8155
8156 for (unsigned i = 0; i < fp.depth(); i++) {
8157 const string& dname = fp[i];
8158 frag_t fg = in->pick_dirfrag(dname);
8159 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8160 CDir *curdir = in->get_dirfrag(fg);
8161 if (!curdir)
8162 return NULL;
8163 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8164 if (!dn)
8165 return NULL;
8166 in = dn->get_linkage()->get_inode();
8167 if (!in)
8168 return NULL;
8169 }
8170 dout(10) << " got " << *in << dendl;
8171 return in;
8172 }
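
/*
 * Standalone sketch (not part of MDCache.cc) of the cache-only lookup
 * semantics of cache_traverse() above: walk the path one component at a time
 * against what is already in memory and give up (return nullptr) at the first
 * missing component, never fetching or discovering anything. ToyInode is a
 * hypothetical stand-in for the CInode/CDir/CDentry chain.
 */
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <iostream>

struct ToyInode {
  std::map<std::string, std::unique_ptr<ToyInode>> children;   // only what is "already cached"
};

const ToyInode *cache_traverse_sketch(const ToyInode &root,
                                      const std::vector<std::string> &path)
{
  const ToyInode *cur = &root;
  for (const auto &name : path) {
    auto it = cur->children.find(name);
    if (it == cur->children.end())
      return nullptr;                         // not cached: no I/O, just fail
    cur = it->second.get();
  }
  return cur;
}

int main() {
  ToyInode root;
  root.children["usr"] = std::make_unique<ToyInode>();
  root.children["usr"]->children["bin"] = std::make_unique<ToyInode>();
  std::cout << (cache_traverse_sketch(root, {"usr", "bin"}) != nullptr) << "\n";  // 1
  std::cout << (cache_traverse_sketch(root, {"usr", "lib"}) != nullptr) << "\n";  // 0
}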
8173
8174
8175 /**
8176 * open_remote_dirfrag -- open up a remote dirfrag
8177 *
8178 * @param diri base inode
8179 * @param approxfg approximate fragment.
8180 * @param fin completion callback
8181 */
8182 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8183 {
8184 dout(10) << "open_remote_dir on " << *diri << dendl;
8185 assert(diri->is_dir());
8186 assert(!diri->is_auth());
8187 assert(diri->get_dirfrag(approxfg) == 0);
8188
8189 discover_dir_frag(diri, approxfg, fin);
8190 }
8191
8192
8193 /**
8194 * get_dentry_inode - get or open inode
8195 *
8196 * @param dn the dentry
8197 * @param mdr current request
8198 *
8199 * Returns the inode for a primary linkage, or links up / opens the remote link's inode as necessary.
8200 * If it's not available right now, puts mdr on wait list and returns null.
8201 */
8202 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8203 {
8204 CDentry::linkage_t *dnl;
8205 if (projected)
8206 dnl = dn->get_projected_linkage();
8207 else
8208 dnl = dn->get_linkage();
8209
8210 assert(!dnl->is_null());
8211
8212 if (dnl->is_primary())
8213 return dnl->inode;
8214
8215 assert(dnl->is_remote());
8216 CInode *in = get_inode(dnl->get_remote_ino());
8217 if (in) {
8218 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8219 dn->link_remote(dnl, in);
8220 return in;
8221 } else {
8222 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8223 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8224 return 0;
8225 }
8226 }
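
/*
 * Standalone sketch (not part of MDCache.cc) of the decision ladder in
 * get_dentry_inode() above: a primary linkage already carries its inode, a
 * remote linkage is resolved against the inode table and linked in if the
 * target is cached, and a missing remote target kicks off an asynchronous open
 * and returns nullptr so the caller waits and retries. ToyDentry and ToyCache
 * are hypothetical stand-ins for CDentry and MDCache.
 */
#include <functional>
#include <map>
#include <iostream>

struct ToyInode { unsigned long ino; };

struct ToyDentry {
  ToyInode *primary = nullptr;     // set => primary linkage
  unsigned long remote_ino = 0;    // set => remote linkage
};

struct ToyCache {
  std::map<unsigned long, ToyInode> inodes;   // inodes currently in cache

  ToyInode *get_dentry_inode(ToyDentry &dn, const std::function<void()> &open_remote) {
    if (dn.primary)
      return dn.primary;                      // primary: the inode is right there
    auto it = inodes.find(dn.remote_ino);
    if (it != inodes.end()) {
      dn.primary = &it->second;               // remote target already cached: link it in
      return dn.primary;
    }
    open_remote();                            // not cached: fetch asynchronously
    return nullptr;                           // caller queues itself and retries later
  }
};

int main() {
  ToyCache cache;
  cache.inodes[42] = ToyInode{42};
  ToyDentry hit, miss;
  hit.remote_ino = 42;
  miss.remote_ino = 7;
  auto open_remote = [] { std::cout << "opening remote inode asynchronously\n"; };
  std::cout << (cache.get_dentry_inode(hit, open_remote) != nullptr) << "\n";   // 1
  std::cout << (cache.get_dentry_inode(miss, open_remote) != nullptr) << "\n";  // async note, then 0
}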
8227
8228 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8229 CDentry *dn;
8230 inodeno_t ino;
8231 MDSInternalContextBase *onfinish;
8232 bool want_xlocked;
8233 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8234 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8235 dn->get(MDSCacheObject::PIN_PTRWAITER);
8236 }
8237 void finish(int r) override {
8238 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8239 dn->put(MDSCacheObject::PIN_PTRWAITER);
8240 }
8241 };
8242
8243 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8244 {
8245 dout(10) << "open_remote_dentry " << *dn << dendl;
8246 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8247 inodeno_t ino = dnl->get_remote_ino();
8248 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8249 open_ino(ino, pool,
8250 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8251 }
8252
8253 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8254 bool want_xlocked, int r)
8255 {
8256 if (r < 0) {
8257 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8258 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8259 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8260 dn->state_set(CDentry::STATE_BADREMOTEINO);
8261
8262 std::string path;
8263 CDir *dir = dn->get_dir();
8264 if (dir) {
8265 dir->get_inode()->make_path_string(path);
8266 path = path + "/" + dn->get_name();
8267 }
8268
8269 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8270 if (fatal) {
8271 mds->damaged();
8272 ceph_abort(); // unreachable, damaged() respawns us
8273 }
8274 } else {
8275 r = 0;
8276 }
8277 }
8278 fin->complete(r < 0 ? r : 0);
8279 }
8280
8281
8282 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8283 {
8284 // empty trace if we're a base inode
8285 if (in->is_base())
8286 return;
8287
8288 CInode *parent = in->get_parent_inode();
8289 assert(parent);
8290 make_trace(trace, parent);
8291
8292 CDentry *dn = in->get_parent_dn();
8293 dout(15) << "make_trace adding " << *dn << dendl;
8294 trace.push_back(dn);
8295 }
8296
8297
8298 // -------------------------------------------------------------------------------
8299 // Open inode by inode number
8300
8301 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8302 inodeno_t ino;
8303 public:
8304 bufferlist bl;
8305 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8306 MDCacheIOContext(c), ino(i) {}
8307 void finish(int r) override {
8308 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8309 }
8310 };
8311
8312 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8313 inodeno_t ino;
8314 MMDSOpenIno *msg;
8315 bool parent;
8316 public:
8317 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8318 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8319 void finish(int r) override {
8320 if (r < 0 && !parent)
8321 r = -EAGAIN;
8322 if (msg) {
8323 mdcache->handle_open_ino(msg, r);
8324 return;
8325 }
8326 assert(mdcache->opening_inodes.count(ino));
8327 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8328 }
8329 };
8330
8331 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8332 inodeno_t ino;
8333 public:
8334 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8335 void finish(int r) override {
8336 mdcache->_open_ino_parent_opened(ino, r);
8337 }
8338 };
8339
8340 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8341 {
8342 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8343
8344 assert(opening_inodes.count(ino));
8345 open_ino_info_t& info = opening_inodes[ino];
8346
8347 CInode *in = get_inode(ino);
8348 if (in) {
8349 dout(10) << " found cached " << *in << dendl;
8350 open_ino_finish(ino, info, in->authority().first);
8351 return;
8352 }
8353
8354 inode_backtrace_t backtrace;
8355 if (err == 0) {
8356 try {
8357 ::decode(backtrace, bl);
8358 } catch (const buffer::error &decode_exc) {
8359 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8360 << std::dec << ": " << decode_exc << dendl;
8361 open_ino_finish(ino, info, -EIO);
8362 return;
8363 }
8364 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8365 dout(10) << " old object in pool " << info.pool
8366 << ", retrying pool " << backtrace.pool << dendl;
8367 info.pool = backtrace.pool;
8368 C_IO_MDC_OpenInoBacktraceFetched *fin =
8369 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8370 fetch_backtrace(ino, info.pool, fin->bl,
8371 new C_OnFinisher(fin, mds->finisher));
8372 return;
8373 }
8374 } else if (err == -ENOENT) {
8375 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8376 if (info.pool != meta_pool) {
8377 dout(10) << " no object in pool " << info.pool
8378 << ", retrying pool " << meta_pool << dendl;
8379 info.pool = meta_pool;
8380 C_IO_MDC_OpenInoBacktraceFetched *fin =
8381 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8382 fetch_backtrace(ino, info.pool, fin->bl,
8383 new C_OnFinisher(fin, mds->finisher));
8384 return;
8385 }
8386 err = 0; // backtrace.ancestors.empty() is checked below
8387 }
8388
8389 if (err == 0) {
8390 if (backtrace.ancestors.empty()) {
8391 dout(10) << " got empty backtrace " << dendl;
8392 err = -EIO;
8393 } else if (!info.ancestors.empty()) {
8394 if (info.ancestors[0] == backtrace.ancestors[0]) {
8395 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8396 err = -EINVAL;
8397 } else {
8398 info.last_err = 0;
8399 }
8400 }
8401 }
8402 if (err) {
8403 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8404 if (info.last_err)
8405 err = info.last_err;
8406 open_ino_finish(ino, info, err);
8407 return;
8408 }
8409
8410 dout(10) << " got backtrace " << backtrace << dendl;
8411 info.ancestors = backtrace.ancestors;
8412
8413 _open_ino_traverse_dir(ino, info, 0);
8414 }
8415
8416 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8417 {
8418 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8419
8420 assert(opening_inodes.count(ino));
8421 open_ino_info_t& info = opening_inodes[ino];
8422
8423 CInode *in = get_inode(ino);
8424 if (in) {
8425 dout(10) << " found cached " << *in << dendl;
8426 open_ino_finish(ino, info, in->authority().first);
8427 return;
8428 }
8429
8430 if (ret == mds->get_nodeid()) {
8431 _open_ino_traverse_dir(ino, info, 0);
8432 } else {
8433 if (ret >= 0) {
8434 mds_rank_t checked_rank = mds_rank_t(ret);
8435 info.check_peers = true;
8436 info.auth_hint = checked_rank;
8437 info.checked.erase(checked_rank);
8438 }
8439 do_open_ino(ino, info, ret);
8440 }
8441 }
8442
8443 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8444 {
8445 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8446
8447 CInode *in = get_inode(ino);
8448 if (in) {
8449 dout(10) << " found cached " << *in << dendl;
8450 open_ino_finish(ino, info, in->authority().first);
8451 return;
8452 }
8453
8454 if (ret) {
8455 do_open_ino(ino, info, ret);
8456 return;
8457 }
8458
8459 mds_rank_t hint = info.auth_hint;
8460 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8461 info.discover, info.want_xlocked, &hint);
8462 if (ret > 0)
8463 return;
8464 if (hint != mds->get_nodeid())
8465 info.auth_hint = hint;
8466 do_open_ino(ino, info, ret);
8467 }
8468
8469 void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8470 {
8471 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8472 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8473 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8474 }
8475
8476 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8477 vector<inode_backpointer_t>& ancestors,
8478 bool discover, bool want_xlocked, mds_rank_t *hint)
8479 {
8480 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8481 int err = 0;
8482 for (unsigned i = 0; i < ancestors.size(); i++) {
8483 CInode *diri = get_inode(ancestors[i].dirino);
8484
8485 if (!diri) {
8486 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8487 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8488 return 1;
8489 }
8490 continue;
8491 }
8492
8493 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8494 CDir *dir = diri->get_parent_dir();
8495 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8496 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8497 dir = dir->get_inode()->get_parent_dir();
8498 _open_ino_fetch_dir(ino, m, dir, i == 0);
8499 return 1;
8500 }
8501
8502 if (!diri->is_dir()) {
8503 dout(10) << " " << *diri << " is not dir" << dendl;
8504 if (i == 0)
8505 err = -ENOTDIR;
8506 break;
8507 }
8508
8509 string &name = ancestors[i].dname;
8510 frag_t fg = diri->pick_dirfrag(name);
8511 CDir *dir = diri->get_dirfrag(fg);
8512 if (!dir) {
8513 if (diri->is_auth()) {
8514 if (diri->is_frozen()) {
8515 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8516 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8517 return 1;
8518 }
8519 dir = diri->get_or_open_dirfrag(this, fg);
8520 } else if (discover) {
8521 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8522 return 1;
8523 }
8524 }
8525 if (dir) {
8526 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8527 CDentry *dn = dir->lookup(name);
8528 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8529 if (dir->is_auth()) {
8530 if (dnl && dnl->is_primary() &&
8531 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8532 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8533 _open_ino_fetch_dir(ino, m, dir, i == 0);
8534 return 1;
8535 }
8536
8537 if (!dnl && !dir->is_complete() &&
8538 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8539 dout(10) << " fetching incomplete " << *dir << dendl;
8540 _open_ino_fetch_dir(ino, m, dir, i == 0);
8541 return 1;
8542 }
8543
8544 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8545 if (i == 0)
8546 err = -ENOENT;
8547 } else if (discover) {
8548 if (!dnl) {
8549 filepath path(name, 0);
8550 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8551 (i == 0 && want_xlocked));
8552 return 1;
8553 }
8554 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8555 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8556 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8557 return 1;
8558 }
8559 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8560 if (i == 0)
8561 err = -ENOENT;
8562 }
8563 }
8564 if (hint && i == 0)
8565 *hint = dir ? dir->authority().first : diri->authority().first;
8566 break;
8567 }
8568 return err;
8569 }
8570
8571 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8572 {
8573 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8574
8575 list<MDSInternalContextBase*> waiters;
8576 waiters.swap(info.waiters);
8577 opening_inodes.erase(ino);
8578 finish_contexts(g_ceph_context, waiters, ret);
8579 }
8580
8581 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8582 {
8583 if (err < 0 && err != -EAGAIN) {
8584 info.checked.clear();
8585 info.checking = MDS_RANK_NONE;
8586 info.check_peers = true;
8587 info.fetch_backtrace = true;
8588 if (info.discover) {
8589 info.discover = false;
8590 info.ancestors.clear();
8591 }
8592 if (err != -ENOENT && err != -ENOTDIR)
8593 info.last_err = err;
8594 }
8595
8596 if (info.check_peers || info.discover) {
8597 if (info.discover) {
8598 // got backtrace from peer, but failed to find inode. re-check peers
8599 info.discover = false;
8600 info.ancestors.clear();
8601 info.checked.clear();
8602 }
8603 info.check_peers = false;
8604 info.checking = MDS_RANK_NONE;
8605 do_open_ino_peer(ino, info);
8606 } else if (info.fetch_backtrace) {
8607 info.check_peers = true;
8608 info.fetch_backtrace = false;
8609 info.checking = mds->get_nodeid();
8610 info.checked.clear();
8611 C_IO_MDC_OpenInoBacktraceFetched *fin =
8612 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8613 fetch_backtrace(ino, info.pool, fin->bl,
8614 new C_OnFinisher(fin, mds->finisher));
8615 } else {
8616 assert(!info.ancestors.empty());
8617 info.checking = mds->get_nodeid();
8618 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8619 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8620 }
8621 }
8622
8623 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8624 {
8625 set<mds_rank_t> all, active;
8626 mds->mdsmap->get_mds_set(all);
8627 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8628 if (mds->get_state() == MDSMap::STATE_REJOIN)
8629 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8630
8631 dout(10) << "do_open_ino_peer " << ino << " active " << active
8632 << " all " << all << " checked " << info.checked << dendl;
8633
8634 mds_rank_t peer = MDS_RANK_NONE;
8635 if (info.auth_hint >= 0) {
8636 if (active.count(info.auth_hint)) {
8637 peer = info.auth_hint;
8638 info.auth_hint = MDS_RANK_NONE;
8639 }
8640 } else {
8641 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8642 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8643 peer = *p;
8644 break;
8645 }
8646 }
8647 if (peer < 0) {
8648 all.erase(mds->get_nodeid());
8649 if (all != info.checked) {
8650 dout(10) << " waiting for more peers to be active" << dendl;
8651 } else {
8652 dout(10) << " all MDS peers have been checked " << dendl;
8653 do_open_ino(ino, info, 0);
8654 }
8655 } else {
8656 info.checking = peer;
8657 vector<inode_backpointer_t> *pa = NULL;
8658 // got backtrace from peer or backtrace just fetched
8659 if (info.discover || !info.fetch_backtrace)
8660 pa = &info.ancestors;
8661 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8662 }
8663 }
8664
8665 void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8666 {
8667 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8668 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8669 m->put();
8670 return;
8671 }
8672
8673 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8674
8675 inodeno_t ino = m->ino;
8676 MMDSOpenInoReply *reply;
8677 CInode *in = get_inode(ino);
8678 if (in) {
8679 dout(10) << " have " << *in << dendl;
8680 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8681 if (in->is_auth()) {
8682 touch_inode(in);
8683 while (1) {
8684 CDentry *pdn = in->get_parent_dn();
8685 if (!pdn)
8686 break;
8687 CInode *diri = pdn->get_dir()->get_inode();
8688 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
8689 in->inode.version));
8690 in = diri;
8691 }
8692 } else {
8693 reply->hint = in->authority().first;
8694 }
8695 } else if (err < 0) {
8696 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8697 } else {
8698 mds_rank_t hint = MDS_RANK_NONE;
8699 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8700 if (ret > 0)
8701 return;
8702 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8703 }
8704 m->get_connection()->send_message(reply);
8705 m->put();
8706 }
8707
8708 void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8709 {
8710 dout(10) << "handle_open_ino_reply " << *m << dendl;
8711
8712 inodeno_t ino = m->ino;
8713 mds_rank_t from = mds_rank_t(m->get_source().num());
8714 auto it = opening_inodes.find(ino);
8715 if (it != opening_inodes.end() && it->second.checking == from) {
8716 open_ino_info_t& info = it->second;
8717 info.checking = MDS_RANK_NONE;
8718 info.checked.insert(from);
8719
8720 CInode *in = get_inode(ino);
8721 if (in) {
8722 dout(10) << " found cached " << *in << dendl;
8723 open_ino_finish(ino, info, in->authority().first);
8724 } else if (!m->ancestors.empty()) {
8725 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8726 if (!info.want_replica) {
8727 open_ino_finish(ino, info, from);
8728 m->put();
8729 return;
8730 }
8731
8732 info.ancestors = m->ancestors;
8733 info.auth_hint = from;
8734 info.checking = mds->get_nodeid();
8735 info.discover = true;
8736 _open_ino_traverse_dir(ino, info, 0);
8737 } else if (m->error) {
8738 dout(10) << " error " << m->error << " from mds." << from << dendl;
8739 do_open_ino(ino, info, m->error);
8740 } else {
8741 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8742 info.auth_hint = m->hint;
8743 info.checked.erase(m->hint);
8744 }
8745 do_open_ino_peer(ino, info);
8746 }
8747 }
8748 m->put();
8749 }
8750
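/*
 * kick_open_ino_peers -- re-drive in-flight open_ino queries after the peer
 * set changes (e.g. mds 'who' failed or a new rank became usable): anything
 * that was checking 'who', or was idle waiting for more peers, is retried.
 */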
8751 void MDCache::kick_open_ino_peers(mds_rank_t who)
8752 {
8753 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8754
8755 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8756 p != opening_inodes.end();
8757 ++p) {
8758 open_ino_info_t& info = p->second;
8759 if (info.checking == who) {
8760 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8761 info.checking = MDS_RANK_NONE;
8762 do_open_ino_peer(p->first, info);
8763 } else if (info.checking == MDS_RANK_NONE) {
8764 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8765 do_open_ino_peer(p->first, info);
8766 }
8767 }
8768 }
8769
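/*
 * open_ino -- load the inode 'ino' into cache, fetching its backtrace from
 * 'pool' (or the default file layout's pool if pool < 0) when it is not
 * already known.  Concurrent requests for the same ino are coalesced onto a
 * single open_ino_info_t; 'fin' is queued and completed once the inode is
 * open (or with an error).
 *
 * Illustrative caller sketch (hypothetical, for documentation only; the
 * real call sites live elsewhere in the MDS):
 *
 *   // open_ino(ino, mds->mdsmap->get_first_data_pool(),
 *   //          new C_MDS_RetryRequest(this, mdr),
 *   //          false,   // want_replica
 *   //          false);  // want_xlocked
 */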
8770 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8771 bool want_replica, bool want_xlocked)
8772 {
8773 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8774 << want_replica << dendl;
8775
8776 if (opening_inodes.count(ino)) {
8777 open_ino_info_t& info = opening_inodes[ino];
8778 if (want_replica) {
8779 info.want_replica = true;
8780 if (want_xlocked && !info.want_xlocked) {
8781 if (!info.ancestors.empty()) {
8782 CInode *diri = get_inode(info.ancestors[0].dirino);
8783 if (diri) {
8784 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8785 CDir *dir = diri->get_dirfrag(fg);
8786 if (dir && !dir->is_auth()) {
8787 filepath path(info.ancestors[0].dname, 0);
8788 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8789 }
8790 }
8791 }
8792 info.want_xlocked = true;
8793 }
8794 }
8795 info.waiters.push_back(fin);
8796 } else {
8797 open_ino_info_t& info = opening_inodes[ino];
8798 info.want_replica = want_replica;
8799 info.want_xlocked = want_xlocked;
8800 info.tid = ++open_ino_last_tid;
8801 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8802 info.waiters.push_back(fin);
8803 do_open_ino(ino, info, 0);
8804 }
8805 }
8806
8807 /* ---------------------------- */
8808
8809 /*
8810 * search for a given inode on MDS peers. optionally start with the given
8811 * node.
8812 *
8813 * TODO:
8814 * - recover from MDS node failure / recovery
8815 * - traverse path
8816 *
8817 */
8818 void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8819 {
8820 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8821 CInode *in = get_inode(ino);
8822 if (in && in->state_test(CInode::STATE_PURGING)) {
8823 c->complete(-ESTALE);
8824 return;
8825 }
8826 assert(!in);
8827
8828 ceph_tid_t tid = ++find_ino_peer_last_tid;
8829 find_ino_peer_info_t& fip = find_ino_peer[tid];
8830 fip.ino = ino;
8831 fip.tid = tid;
8832 fip.fin = c;
8833 fip.hint = hint;
8834 _do_find_ino_peer(fip);
8835 }
8836
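/*
 * _do_find_ino_peer -- pick the next peer to ask (the hint first, then any
 * active rank we have not checked) and send MMDSFindIno.  If every peer has
 * been checked and none had the inode, complete the waiter with -ESTALE.
 */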
8837 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8838 {
8839 set<mds_rank_t> all, active;
8840 mds->mdsmap->get_mds_set(all);
8841 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8842
8843 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8844 << " active " << active << " all " << all
8845 << " checked " << fip.checked
8846 << dendl;
8847
8848 mds_rank_t m = MDS_RANK_NONE;
8849 if (fip.hint >= 0) {
8850 m = fip.hint;
8851 fip.hint = MDS_RANK_NONE;
8852 } else {
8853 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8854 if (*p != mds->get_nodeid() &&
8855 fip.checked.count(*p) == 0) {
8856 m = *p;
8857 break;
8858 }
8859 }
8860 if (m == MDS_RANK_NONE) {
8861 all.erase(mds->get_nodeid());
8862 if (all != fip.checked) {
8863 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8864 } else {
8865 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8866 fip.fin->complete(-ESTALE);
8867 find_ino_peer.erase(fip.tid);
8868 }
8869 } else {
8870 fip.checking = m;
8871 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8872 }
8873 }
8874
8875 void MDCache::handle_find_ino(MMDSFindIno *m)
8876 {
8877 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8878 m->put();
8879 return;
8880 }
8881
8882 dout(10) << "handle_find_ino " << *m << dendl;
8883 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8884 CInode *in = get_inode(m->ino);
8885 if (in) {
8886 in->make_path(r->path);
8887 dout(10) << " have " << r->path << " " << *in << dendl;
8888 }
8889 m->get_connection()->send_message(r);
8890 m->put();
8891 }
8892
8893
8894 void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8895 {
8896 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8897 if (p != find_ino_peer.end()) {
8898 dout(10) << "handle_find_ino_reply " << *m << dendl;
8899 find_ino_peer_info_t& fip = p->second;
8900
8901 // success?
8902 if (get_inode(fip.ino)) {
8903 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8904 mds->queue_waiter(fip.fin);
8905 find_ino_peer.erase(p);
8906 m->put();
8907 return;
8908 }
8909
8910 mds_rank_t from = mds_rank_t(m->get_source().num());
8911 if (fip.checking == from)
8912 fip.checking = MDS_RANK_NONE;
8913 fip.checked.insert(from);
8914
8915 if (!m->path.empty()) {
8916 // we got a path!
8917 vector<CDentry*> trace;
8918 MDRequestRef null_ref;
8919 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8920 if (r > 0)
8921 return;
8922 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8923 << ", retrying" << dendl;
8924 fip.checked.clear();
8925 _do_find_ino_peer(fip);
8926 } else {
8927 // nope, continue.
8928 _do_find_ino_peer(fip);
8929 }
8930 } else {
8931 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8932 }
8933 m->put();
8934 }
8935
8936 void MDCache::kick_find_ino_peers(mds_rank_t who)
8937 {
8938 // find_ino_peers requests we should move on from
8939 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8940 p != find_ino_peer.end();
8941 ++p) {
8942 find_ino_peer_info_t& fip = p->second;
8943 if (fip.checking == who) {
8944 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8945 fip.checking = MDS_RANK_NONE;
8946 _do_find_ino_peer(fip);
8947 } else if (fip.checking == MDS_RANK_NONE) {
8948 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8949 _do_find_ino_peer(fip);
8950 }
8951 }
8952 }
8953
8954 /* ---------------------------- */
8955
8956 int MDCache::get_num_client_requests()
8957 {
8958 int count = 0;
8959 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8960 p != active_requests.end();
8961 ++p) {
8962 MDRequestRef& mdr = p->second;
8963 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8964 count++;
8965 }
8966 return count;
8967 }
8968
8969 /* This function takes over the reference to the passed Message */
8970 MDRequestRef MDCache::request_start(MClientRequest *req)
8971 {
8972 // did we win a forward race against a slave?
8973 if (active_requests.count(req->get_reqid())) {
8974 MDRequestRef& mdr = active_requests[req->get_reqid()];
8975 assert(mdr);
8976 if (mdr->is_slave()) {
8977 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8978 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8979 } else {
8980 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8981 req->put();
8982 }
8983 return MDRequestRef();
8984 }
8985
8986 // register new client request
8987 MDRequestImpl::Params params;
8988 params.reqid = req->get_reqid();
8989 params.attempt = req->get_num_fwd();
8990 params.client_req = req;
8991 params.initiated = req->get_recv_stamp();
8992 params.throttled = req->get_throttle_stamp();
8993 params.all_read = req->get_recv_complete_stamp();
8994 params.dispatched = req->get_dispatch_stamp();
8995
8996 MDRequestRef mdr =
8997 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8998 active_requests[params.reqid] = mdr;
8999 mdr->set_op_stamp(req->get_stamp());
9000 dout(7) << "request_start " << *mdr << dendl;
9001 return mdr;
9002 }
9003
9004 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9005 {
9006 int by = m->get_source().num();
9007 MDRequestImpl::Params params;
9008 params.reqid = ri;
9009 params.attempt = attempt;
9010 params.triggering_slave_req = m;
9011 params.slave_to = by;
9012 params.initiated = m->get_recv_stamp();
9013 params.throttled = m->get_throttle_stamp();
9014 params.all_read = m->get_recv_complete_stamp();
9015 params.dispatched = m->get_dispatch_stamp();
9016 MDRequestRef mdr =
9017 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9018 assert(active_requests.count(mdr->reqid) == 0);
9019 active_requests[mdr->reqid] = mdr;
9020 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9021 return mdr;
9022 }
9023
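/*
 * request_start_internal -- create an MDRequest for an MDS-initiated
 * operation (no client or slave message attached).  The reqid is our own
 * MDS entity name plus a fresh tid, so it cannot collide with client or
 * slave requests.
 *
 * Hypothetical sketch of how such a request is typically driven (for
 * documentation only; actual call sites set op-specific state first):
 *
 *   // MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
 *   // ... fill in op-specific fields on mdr ...
 *   // dispatch_request(mdr);
 */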
9024 MDRequestRef MDCache::request_start_internal(int op)
9025 {
9026 MDRequestImpl::Params params;
9027 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9028 params.reqid.tid = mds->issue_tid();
9029 params.initiated = ceph_clock_now();
9030 params.internal_op = op;
9031 MDRequestRef mdr =
9032 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9033
9034 assert(active_requests.count(mdr->reqid) == 0);
9035 active_requests[mdr->reqid] = mdr;
9036 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9037 return mdr;
9038 }
9039
9040 MDRequestRef MDCache::request_get(metareqid_t rid)
9041 {
9042 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9043 assert(p != active_requests.end());
9044 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9045 return p->second;
9046 }
9047
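/*
 * request_finish -- complete a request.  A slave with a pending commit (or
 * rollback, if aborted) runs that context first, which re-enters
 * request_finish once done; otherwise bump the per-op counters for internal
 * ops and clean the request up.
 */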
9048 void MDCache::request_finish(MDRequestRef& mdr)
9049 {
9050 dout(7) << "request_finish " << *mdr << dendl;
9051 mdr->mark_event("finishing request");
9052
9053 // slave finisher?
9054 if (mdr->has_more() && mdr->more()->slave_commit) {
9055 Context *fin = mdr->more()->slave_commit;
9056 mdr->more()->slave_commit = 0;
9057 int ret;
9058 if (mdr->aborted) {
9059 mdr->aborted = false;
9060 ret = -1;
9061 mdr->more()->slave_rolling_back = true;
9062 } else {
9063 ret = 0;
9064 mdr->committing = true;
9065 }
9066 fin->complete(ret); // this must re-call request_finish.
9067 return;
9068 }
9069
9070 switch(mdr->internal_op) {
9071 case CEPH_MDS_OP_FRAGMENTDIR:
9072 logger->inc(l_mdss_ireq_fragmentdir);
9073 break;
9074 case CEPH_MDS_OP_EXPORTDIR:
9075 logger->inc(l_mdss_ireq_exportdir);
9076 break;
9077 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9078 logger->inc(l_mdss_ireq_enqueue_scrub);
9079 break;
9080 case CEPH_MDS_OP_FLUSH:
9081 logger->inc(l_mdss_ireq_flush);
9082 break;
9083 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9084 logger->inc(l_mdss_ireq_fragstats);
9085 break;
9086 case CEPH_MDS_OP_REPAIR_INODESTATS:
9087 logger->inc(l_mdss_ireq_inodestats);
9088 break;
9089 }
9090
9091 request_cleanup(mdr);
9092 }
9093
9094
9095 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9096 {
9097 mdr->mark_event("forwarding request");
9098 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9099 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9100 << *mdr->client_request << dendl;
9101 mds->forward_message_mds(mdr->client_request, who);
9102 mdr->client_request = 0;
9103 if (mds->logger) mds->logger->inc(l_mds_forward);
9104 } else if (mdr->internal_op >= 0) {
9105 dout(10) << "request_forward on internal op; cancelling" << dendl;
9106 mdr->internal_op_finish->complete(-EXDEV);
9107 } else {
9108 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9109 << " was from mds" << dendl;
9110 }
9111 request_cleanup(mdr);
9112 }
9113
9114
9115 void MDCache::dispatch_request(MDRequestRef& mdr)
9116 {
9117 if (mdr->client_request) {
9118 mds->server->dispatch_client_request(mdr);
9119 } else if (mdr->slave_request) {
9120 mds->server->dispatch_slave_request(mdr);
9121 } else {
9122 switch (mdr->internal_op) {
9123 case CEPH_MDS_OP_FRAGMENTDIR:
9124 dispatch_fragment_dir(mdr);
9125 break;
9126 case CEPH_MDS_OP_EXPORTDIR:
9127 migrator->dispatch_export_dir(mdr, 0);
9128 break;
9129 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9130 enqueue_scrub_work(mdr);
9131 break;
9132 case CEPH_MDS_OP_FLUSH:
9133 flush_dentry_work(mdr);
9134 break;
9135 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9136 repair_dirfrag_stats_work(mdr);
9137 break;
9138 case CEPH_MDS_OP_REPAIR_INODESTATS:
9139 repair_inode_stats_work(mdr);
9140 break;
9141 default:
9142 ceph_abort();
9143 }
9144 }
9145 }
9146
9147
9148 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9149 {
9150 if (!mdr->has_more())
9151 return;
9152
9153 // clean up slaves
9154 // (will implicitly drop remote dn pins)
9155 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9156 p != mdr->more()->slaves.end();
9157 ++p) {
9158 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9159 MMDSSlaveRequest::OP_FINISH);
9160
9161 if (mdr->killed && !mdr->committing) {
9162 r->mark_abort();
9163 } else if (mdr->more()->srcdn_auth_mds == *p &&
9164 mdr->more()->inode_import.length() > 0) {
9165 // information about rename imported caps
9166 r->inode_export.claim(mdr->more()->inode_import);
9167 }
9168
9169 mds->send_message_mds(r, *p);
9170 }
9171
9172 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9173 * implicitly. Note that we don't call the finishers -- there shouldn't
9174 * be any on a remote lock and the request finish wakes up all
9175 * the waiters anyway! */
9176 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9177 while (p != mdr->xlocks.end()) {
9178 if ((*p)->get_parent()->is_auth())
9179 ++p;
9180 else {
9181 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9182 << " on " << *(*p)->get_parent() << dendl;
9183 (*p)->put_xlock();
9184 mdr->locks.erase(*p);
9185 mdr->xlocks.erase(p++);
9186 }
9187 }
9188
9189 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9190 while (q != mdr->remote_wrlocks.end()) {
9191 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9192 << " on mds." << q->second
9193 << " on " << *(q->first)->get_parent() << dendl;
9194 mdr->locks.erase(q->first);
9195 mdr->remote_wrlocks.erase(q++);
9196 }
9197
9198 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9199 * leaving them in can cause double-notifies as
9200 * this function can get called more than once */
9201 }
9202
9203 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9204 {
9205 request_drop_foreign_locks(mdr);
9206 mds->locker->drop_non_rdlocks(mdr.get());
9207 }
9208
9209 void MDCache::request_drop_locks(MDRequestRef& mdr)
9210 {
9211 request_drop_foreign_locks(mdr);
9212 mds->locker->drop_locks(mdr.get());
9213 }
9214
9215 void MDCache::request_cleanup(MDRequestRef& mdr)
9216 {
9217 dout(15) << "request_cleanup " << *mdr << dendl;
9218
9219 if (mdr->has_more()) {
9220 if (mdr->more()->is_ambiguous_auth)
9221 mdr->clear_ambiguous_auth();
9222 if (!mdr->more()->waiting_for_finish.empty())
9223 mds->queue_waiters(mdr->more()->waiting_for_finish);
9224 }
9225
9226 request_drop_locks(mdr);
9227
9228 // drop (local) auth pins
9229 mdr->drop_local_auth_pins();
9230
9231 // drop stickydirs
9232 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9233 p != mdr->stickydirs.end();
9234 ++p)
9235 (*p)->put_stickydirs();
9236
9237 mds->locker->kick_cap_releases(mdr);
9238
9239 // drop cache pins
9240 mdr->drop_pins();
9241
9242 // remove from session
9243 mdr->item_session_request.remove_myself();
9244
9245 // remove from map
9246 active_requests.erase(mdr->reqid);
9247
9248 if (mds->logger)
9249 log_stat();
9250
9251 mdr->mark_event("cleaned up request");
9252 }
9253
9254 void MDCache::request_kill(MDRequestRef& mdr)
9255 {
9256 // rolling back slave requests is tricky; just let the request proceed.
9257 if (mdr->done_locking && mdr->has_more() &&
9258 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9259 dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
9260
9261 assert(mdr->used_prealloc_ino == 0);
9262 assert(mdr->prealloc_inos.empty());
9263
9264 mdr->session = NULL;
9265 mdr->item_session_request.remove_myself();
9266 return;
9267 }
9268
9269 mdr->killed = true;
9270 mdr->mark_event("killing request");
9271
9272 if (mdr->committing) {
9273 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9274 } else {
9275 dout(10) << "request_kill " << *mdr << dendl;
9276 request_cleanup(mdr);
9277 }
9278 }
9279
9280 // -------------------------------------------------------------------------------
9281 // SNAPREALMS
9282
9283 struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9284 MDRequestRef mdr;
9285 MutationRef mut;
9286 CInode *in;
9287 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9288 MutationRef& mu, CInode *i) :
9289 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9290 void finish(int r) override {
9291 mdcache->_snaprealm_create_finish(mdr, mut, in);
9292 }
9293 };
9294
9295 void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9296 {
9297 dout(10) << "snaprealm_create " << *in << dendl;
9298 assert(!in->snaprealm);
9299
9300 // allocate an id..
9301 if (!mdr->more()->stid) {
9302 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9303 new C_MDS_RetryRequest(this, mdr));
9304 return;
9305 }
9306
9307 MutationRef mut(new MutationImpl());
9308 mut->ls = mds->mdlog->get_current_segment();
9309 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9310 mds->mdlog->start_entry(le);
9311
9312 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9313
9314 inode_t *pi = in->project_inode();
9315 pi->version = in->pre_dirty();
9316 pi->rstat.rsnaprealms++;
9317
9318 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9319 snapid_t seq;
9320 ::decode(seq, p);
9321
9322 sr_t *newsnap = in->project_snaprealm(seq);
9323 newsnap->seq = seq;
9324 newsnap->last_created = seq;
9325
9326 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9327 journal_cow_inode(mut, &le->metablob, in);
9328 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9329
9330 mds->server->submit_mdlog_entry(le,
9331 new C_MDC_snaprealm_create_finish(this, mdr,
9332 mut, in),
9333 mdr, __func__);
9334 mds->mdlog->flush();
9335 }
9336
9337
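/*
 * do_realm_invalidate_and_update_notify -- walk the snaprealm tree rooted at
 * in->snaprealm, invalidating cached snap sets and (unless nosend) queueing
 * one MClientSnap per client that holds caps in each realm.  For SPLIT the
 * message also lists the inodes and child realms being split off; for
 * UPDATE/DESTROY the open past children (and their descendants) are
 * invalidated too, and DESTROY re-evaluates any affected stray inodes.
 */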
9338 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9339 {
9340 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9341
9342 vector<inodeno_t> split_inos;
9343 vector<inodeno_t> split_realms;
9344
9345 if (snapop == CEPH_SNAP_OP_SPLIT) {
9346 // notify clients of update|split
9347 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9348 !p.end(); ++p)
9349 split_inos.push_back((*p)->ino());
9350
9351 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9352 p != in->snaprealm->open_children.end();
9353 ++p)
9354 split_realms.push_back((*p)->inode->ino());
9355 }
9356
9357 bufferlist snapbl;
9358 in->snaprealm->build_snap_trace(snapbl);
9359
9360 set<SnapRealm*> past_children;
9361 map<client_t, MClientSnap*> updates;
9362 list<SnapRealm*> q;
9363 q.push_back(in->snaprealm);
9364 while (!q.empty()) {
9365 SnapRealm *realm = q.front();
9366 q.pop_front();
9367
9368 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9369 realm->invalidate_cached_snaps();
9370
9371 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9372 p != realm->client_caps.end();
9373 ++p) {
9374 assert(!p->second->empty());
9375 if (!nosend && updates.count(p->first) == 0) {
9376 MClientSnap *update = new MClientSnap(snapop);
9377 update->head.split = in->ino();
9378 update->split_inos = split_inos;
9379 update->split_realms = split_realms;
9380 update->bl = snapbl;
9381 updates[p->first] = update;
9382 }
9383 }
9384
9385 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9386 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9387 p != realm->open_past_children.end();
9388 ++p)
9389 past_children.insert(*p);
9390 }
9391
9392 // notify for active children, too.
9393 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9394 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9395 p != realm->open_children.end();
9396 ++p)
9397 q.push_back(*p);
9398 }
9399
9400 if (!nosend)
9401 send_snaps(updates);
9402
9403 // notify past children and their descendants if we update/delete old snapshots
9404 for (set<SnapRealm*>::iterator p = past_children.begin();
9405 p != past_children.end();
9406 ++p)
9407 q.push_back(*p);
9408
9409 while (!q.empty()) {
9410 SnapRealm *realm = q.front();
9411 q.pop_front();
9412
9413 realm->invalidate_cached_snaps();
9414
9415 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9416 p != realm->open_children.end();
9417 ++p) {
9418 if (past_children.count(*p) == 0)
9419 q.push_back(*p);
9420 }
9421
9422 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9423 p != realm->open_past_children.end();
9424 ++p) {
9425 if (past_children.count(*p) == 0) {
9426 q.push_back(*p);
9427 past_children.insert(*p);
9428 }
9429 }
9430 }
9431
9432 if (snapop == CEPH_SNAP_OP_DESTROY) {
9433 // eval stray inodes if we delete a snapshot from their past ancestor snaprealm
9434 for (set<SnapRealm*>::iterator p = past_children.begin();
9435 p != past_children.end();
9436 ++p)
9437 maybe_eval_stray((*p)->inode, true);
9438 }
9439 }
9440
9441 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9442 {
9443 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9444
9445 // apply
9446 in->pop_and_dirty_projected_inode(mut->ls);
9447 mut->apply();
9448 mds->locker->drop_locks(mut.get());
9449 mut->cleanup();
9450
9451 // tell table we've committed
9452 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9453
9454 // create
9455 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9456 snapid_t seq;
9457 ::decode(seq, p);
9458
9459 in->open_snaprealm();
9460 in->snaprealm->srnode.seq = seq;
9461 in->snaprealm->srnode.created = seq;
9462 bool ok = in->snaprealm->_open_parents(NULL);
9463 assert(ok);
9464
9465 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9466
9467 /*
9468 static int count = 5;
9469 if (--count == 0)
9470 ceph_abort(); // hack test test **********
9471 */
9472
9473 // done.
9474 mdr->more()->stid = 0; // caller will likely need to reuse this
9475 dispatch_request(mdr);
9476 }
9477
9478
9479 // -------------------------------------------------------------------------------
9480 // STRAYS
9481
9482 struct C_MDC_RetryScanStray : public MDCacheContext {
9483 dirfrag_t next;
9484 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9485 void finish(int r) override {
9486 mdcache->scan_stray_dir(next);
9487 }
9488 };
9489
9490 void MDCache::scan_stray_dir(dirfrag_t next)
9491 {
9492 dout(10) << "scan_stray_dir " << next << dendl;
9493
9494 list<CDir*> ls;
9495 for (int i = 0; i < NUM_STRAY; ++i) {
9496 if (strays[i]->ino() < next.ino)
9497 continue;
9498 strays[i]->get_dirfrags(ls);
9499 }
9500
9501 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9502 CDir *dir = *p;
9503 if (dir->dirfrag() < next)
9504 continue;
9505 if (!dir->is_complete()) {
9506 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9507 return;
9508 }
9509 for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
9510 CDentry *dn = q->second;
9511 dn->state_set(CDentry::STATE_STRAY);
9512 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9513 if (dnl->is_primary()) {
9514 CInode *in = dnl->get_inode();
9515 if (in->inode.nlink == 0)
9516 in->state_set(CInode::STATE_ORPHAN);
9517 maybe_eval_stray(in);
9518 }
9519 }
9520 }
9521 }
9522
9523 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9524 {
9525 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9526 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9527 }
9528
9529
9530
9531
9532
9533 // ========================================================================================
9534 // DISCOVER
9535 /*
9536
9537 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9538 to the parent metadata object in the cache (pinning it).
9539
9540 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9541
9542 */
9543
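/*
 * Rough shape of a discover round trip, as implemented below:
 *
 *   discover_base_ino() / discover_dir_frag() / discover_path()
 *     - _create_discover() fills in a discover_info_t and _send_discover()
 *       sends MDiscover to the auth mds; the caller's waiter is parked on
 *       the base inode/dirfrag (or in waiting_for_base_ino for base inos).
 *   handle_discover()
 *     - the auth builds an MDiscoverReply whose trace is a sequence of
 *       ([[dir] dentry] inode) segments via replicate_dir /
 *       replicate_dentry / replicate_inode.
 *   handle_discover_reply()
 *     - decodes the trace with add_replica_dir / add_replica_dentry /
 *       add_replica_inode and wakes the parked waiters.
 */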
9544 void MDCache::_send_discover(discover_info_t& d)
9545 {
9546 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9547 d.want_base_dir, d.want_xlocked);
9548 dis->set_tid(d.tid);
9549 mds->send_message_mds(dis, d.mds);
9550 }
9551
9552 void MDCache::discover_base_ino(inodeno_t want_ino,
9553 MDSInternalContextBase *onfinish,
9554 mds_rank_t from)
9555 {
9556 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9557 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9558 discover_info_t& d = _create_discover(from);
9559 d.ino = want_ino;
9560 _send_discover(d);
9561 }
9562 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9563 }
9564
9565
9566 void MDCache::discover_dir_frag(CInode *base,
9567 frag_t approx_fg,
9568 MDSInternalContextBase *onfinish,
9569 mds_rank_t from)
9570 {
9571 if (from < 0)
9572 from = base->authority().first;
9573
9574 dirfrag_t df(base->ino(), approx_fg);
9575 dout(7) << "discover_dir_frag " << df
9576 << " from mds." << from << dendl;
9577
9578 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9579 discover_info_t& d = _create_discover(from);
9580 d.pin_base(base);
9581 d.ino = base->ino();
9582 d.frag = approx_fg;
9583 d.want_base_dir = true;
9584 _send_discover(d);
9585 }
9586
9587 if (onfinish)
9588 base->add_dir_waiter(approx_fg, onfinish);
9589 }
9590
9591 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9592 CInode *base;
9593 snapid_t snapid;
9594 filepath path;
9595 mds_rank_t from;
9596 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9597 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9598 void finish(int r) override {
9599 mdcache->discover_path(base, snapid, path, 0, from);
9600 }
9601 };
9602
9603 void MDCache::discover_path(CInode *base,
9604 snapid_t snap,
9605 filepath want_path,
9606 MDSInternalContextBase *onfinish,
9607 bool want_xlocked,
9608 mds_rank_t from)
9609 {
9610 if (from < 0)
9611 from = base->authority().first;
9612
9613 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9614 << (want_xlocked ? " want_xlocked":"")
9615 << dendl;
9616
9617 if (base->is_ambiguous_auth()) {
9618 dout(10) << " waiting for single auth on " << *base << dendl;
9619 if (!onfinish)
9620 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9621 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9622 return;
9623 } else if (from == mds->get_nodeid()) {
9624 list<MDSInternalContextBase*> finished;
9625 base->take_waiting(CInode::WAIT_DIR, finished);
9626 mds->queue_waiters(finished);
9627 return;
9628 }
9629
9630 frag_t fg = base->pick_dirfrag(want_path[0]);
9631 if ((want_xlocked && want_path.depth() == 1) ||
9632 !base->is_waiting_for_dir(fg) || !onfinish) {
9633 discover_info_t& d = _create_discover(from);
9634 d.ino = base->ino();
9635 d.pin_base(base);
9636 d.frag = fg;
9637 d.snap = snap;
9638 d.want_path = want_path;
9639 d.want_base_dir = true;
9640 d.want_xlocked = want_xlocked;
9641 _send_discover(d);
9642 }
9643
9644 // register + wait
9645 if (onfinish)
9646 base->add_dir_waiter(fg, onfinish);
9647 }
9648
9649 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9650 CDir *base;
9651 snapid_t snapid;
9652 filepath path;
9653 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9654 MDCacheContext(c), base(b), snapid(s), path(p) {}
9655 void finish(int r) override {
9656 mdcache->discover_path(base, snapid, path, 0);
9657 }
9658 };
9659
9660 void MDCache::discover_path(CDir *base,
9661 snapid_t snap,
9662 filepath want_path,
9663 MDSInternalContextBase *onfinish,
9664 bool want_xlocked)
9665 {
9666 mds_rank_t from = base->authority().first;
9667
9668 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9669 << (want_xlocked ? " want_xlocked":"")
9670 << dendl;
9671
9672 if (base->is_ambiguous_auth()) {
9673 dout(7) << " waiting for single auth on " << *base << dendl;
9674 if (!onfinish)
9675 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9676 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9677 return;
9678 } else if (from == mds->get_nodeid()) {
9679 list<MDSInternalContextBase*> finished;
9680 base->take_sub_waiting(finished);
9681 mds->queue_waiters(finished);
9682 return;
9683 }
9684
9685 if ((want_xlocked && want_path.depth() == 1) ||
9686 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9687 discover_info_t& d = _create_discover(from);
9688 d.ino = base->ino();
9689 d.pin_base(base->inode);
9690 d.frag = base->get_frag();
9691 d.snap = snap;
9692 d.want_path = want_path;
9693 d.want_base_dir = false;
9694 d.want_xlocked = want_xlocked;
9695 _send_discover(d);
9696 }
9697
9698 // register + wait
9699 if (onfinish)
9700 base->add_dentry_waiter(want_path[0], snap, onfinish);
9701 }
9702
9703 void MDCache::kick_discovers(mds_rank_t who)
9704 {
9705 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9706 p != discovers.end();
9707 ++p) {
9708 if (p->second.mds != who)
9709 continue;
9710 _send_discover(p->second);
9711 }
9712 }
9713
9714
9715 /* This function DOES put the passed message before returning */
9716 void MDCache::handle_discover(MDiscover *dis)
9717 {
9718 mds_rank_t whoami = mds->get_nodeid();
9719 mds_rank_t from = mds_rank_t(dis->get_source().num());
9720
9721 assert(from != whoami);
9722
9723 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9724 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9725 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9726 dis->put();
9727 return;
9728 }
9729
9730 // proceed if the requester is in the REJOIN stage; the request is from parallel_fetch().
9731 // delay processing requests from survivors because we may not have chosen lock states yet.
9732 if (!mds->mdsmap->is_rejoin(from)) {
9733 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9734 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9735 return;
9736 }
9737 }
9738
9739
9740 CInode *cur = 0;
9741 MDiscoverReply *reply = new MDiscoverReply(dis);
9742
9743 snapid_t snapid = dis->get_snapid();
9744
9745 // get started.
9746 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9747 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9748 // wants root
9749 dout(7) << "handle_discover from mds." << from
9750 << " wants base + " << dis->get_want().get_path()
9751 << " snap " << snapid
9752 << dendl;
9753
9754 cur = get_inode(dis->get_base_ino());
9755 assert(cur);
9756
9757 // add root
9758 reply->starts_with = MDiscoverReply::INODE;
9759 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9760 dout(10) << "added base " << *cur << dendl;
9761 }
9762 else {
9763 // there's a base inode
9764 cur = get_inode(dis->get_base_ino(), snapid);
9765 if (!cur && snapid != CEPH_NOSNAP) {
9766 cur = get_inode(dis->get_base_ino());
9767 if (cur && !cur->is_multiversion())
9768 cur = NULL; // nope!
9769 }
9770
9771 if (!cur) {
9772 dout(7) << "handle_discover mds." << from
9773 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9774 << dendl;
9775 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9776 reply->set_error_dentry(dis->get_dentry(0));
9777 reply->set_flag_error_dir();
9778 } else if (dis->wants_base_dir()) {
9779 dout(7) << "handle_discover mds." << from
9780 << " wants basedir+" << dis->get_want().get_path()
9781 << " has " << *cur
9782 << dendl;
9783 } else {
9784 dout(7) << "handle_discover mds." << from
9785 << " wants " << dis->get_want().get_path()
9786 << " has " << *cur
9787 << dendl;
9788 }
9789 }
9790
9791 assert(reply);
9792
9793 // add content
9794 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9795 for (unsigned i = 0;
9796 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9797 i++) {
9798
9799 // -- figure out the dir
9800
9801 // is *cur even a dir at all?
9802 if (!cur->is_dir()) {
9803 dout(7) << *cur << " not a dir" << dendl;
9804 reply->set_flag_error_dir();
9805 break;
9806 }
9807
9808 // pick frag
9809 frag_t fg;
9810 if (dis->get_want().depth()) {
9811 // dentry specifies
9812 fg = cur->pick_dirfrag(dis->get_dentry(i));
9813 } else {
9814 // requester explicitly specified the frag
9815 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9816 fg = dis->get_base_dir_frag();
9817 if (!cur->dirfragtree.is_leaf(fg))
9818 fg = cur->dirfragtree[fg.value()];
9819 }
9820 CDir *curdir = cur->get_dirfrag(fg);
9821
9822 if ((!curdir && !cur->is_auth()) ||
9823 (curdir && !curdir->is_auth())) {
9824
9825 /* before:
9826 * ONLY set flag if empty!!
9827 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9828 * resulting in duplicate discovers in flight,
9829 * which can wreak havoc when discovering rename srcdn (which may move)
9830 */
9831
9832 if (reply->is_empty()) {
9833 // only hint if empty.
9834 // someday this could be better, but right now the waiter logic isn't smart enough.
9835
9836 // hint
9837 if (curdir) {
9838 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9839 reply->set_dir_auth_hint(curdir->authority().first);
9840 } else {
9841 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9842 << *cur << dendl;
9843 reply->set_dir_auth_hint(cur->authority().first);
9844 }
9845
9846 // note error dentry, if any
9847 // NOTE: important, as it allows requester to issue an equivalent discover
9848 // to whomever we hint at.
9849 if (dis->get_want().depth() > i)
9850 reply->set_error_dentry(dis->get_dentry(i));
9851 }
9852
9853 break;
9854 }
9855
9856 if (!curdir) { // open dir?
9857 if (cur->is_frozen()) {
9858 if (!reply->is_empty()) {
9859 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9860 break;
9861 }
9862 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9863 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9864 reply->put();
9865 return;
9866 }
9867 curdir = cur->get_or_open_dirfrag(this, fg);
9868 } else if (curdir->is_frozen_tree() ||
9869 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9870 if (!reply->is_empty()) {
9871 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9872 break;
9873 }
9874 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9875 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9876 reply->set_flag_error_dir();
9877 break;
9878 }
9879 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9880 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9881 reply->put();
9882 return;
9883 }
9884
9885 // add dir
9886 if (curdir->get_version() == 0) {
9887 // fetch newly opened dir
9888 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9889 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9890 // make sure the base frag is correct, though, in case there was a refragment since the
9891 // original request was sent.
9892 reply->set_base_dir_frag(curdir->get_frag());
9893 } else {
9894 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9895 if (!reply->trace.length())
9896 reply->starts_with = MDiscoverReply::DIR;
9897 replicate_dir(curdir, from, reply->trace);
9898 dout(7) << "handle_discover added dir " << *curdir << dendl;
9899 }
9900
9901 // lookup
9902 CDentry *dn = 0;
9903 if (curdir->get_version() == 0) {
9904 // fetch newly opened dir
9905 assert(!curdir->has_bloom());
9906 } else if (dis->get_want().depth() > 0) {
9907 // lookup dentry
9908 dn = curdir->lookup(dis->get_dentry(i), snapid);
9909 } else
9910 break; // done!
9911
9912 // incomplete dir?
9913 if (!dn) {
9914 if (!curdir->is_complete() &&
9915 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9916 // readdir
9917 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9918 if (reply->is_empty()) {
9919 // fetch and wait
9920 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9921 dis->wants_base_dir() && curdir->get_version() == 0);
9922 reply->put();
9923 return;
9924 } else {
9925 // initiate fetch, but send what we have so far
9926 curdir->fetch(0);
9927 break;
9928 }
9929 }
9930
9931 // send null dentry
9932 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9933 << *curdir << dendl;
9934 dn = curdir->add_null_dentry(dis->get_dentry(i));
9935 }
9936 assert(dn);
9937
9938 // don't add replica to purging dentry/inode
9939 if (dn->state_test(CDentry::STATE_PURGING)) {
9940 if (reply->is_empty())
9941 reply->set_flag_error_dn(dis->get_dentry(i));
9942 break;
9943 }
9944
9945 CDentry::linkage_t *dnl = dn->get_linkage();
9946
9947 // xlocked dentry?
9948 // ...always block on non-tail items (they are unrelated)
9949 // ...allow xlocked tail discovery _only_ if explicitly requested
9950 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9951 if (dn->lock.is_xlocked()) {
9952 // is this the last (tail) item in the discover traversal?
9953 if (tailitem && dis->wants_xlocked()) {
9954 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9955 } else if (reply->is_empty()) {
9956 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9957 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9958 reply->put();
9959 return;
9960 } else {
9961 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9962 break;
9963 }
9964 }
9965
9966 // frozen inode?
9967 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9968 if (tailitem && dis->wants_xlocked()) {
9969 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9970 } else if (reply->is_empty()) {
9971 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9972 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9973 reply->put();
9974 return;
9975 } else {
9976 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9977 break;
9978 }
9979 }
9980
9981 // add dentry
9982 if (!reply->trace.length())
9983 reply->starts_with = MDiscoverReply::DENTRY;
9984 replicate_dentry(dn, from, reply->trace);
9985 dout(7) << "handle_discover added dentry " << *dn << dendl;
9986
9987 if (!dnl->is_primary()) break; // stop on null or remote link.
9988
9989 // add inode
9990 CInode *next = dnl->get_inode();
9991 assert(next->is_auth());
9992
9993 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
9994 dout(7) << "handle_discover added inode " << *next << dendl;
9995
9996 // descend, keep going.
9997 cur = next;
9998 continue;
9999 }
10000
10001 // how did we do?
10002 assert(!reply->is_empty());
10003 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10004 mds->send_message(reply, dis->get_connection());
10005
10006 dis->put();
10007 }
10008
10009 /* This function DOES put the passed message before returning */
10010 void MDCache::handle_discover_reply(MDiscoverReply *m)
10011 {
10012 /*
10013 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10014 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10015 m->put();
10016 return;
10017 }
10018 */
10019 dout(7) << "discover_reply " << *m << dendl;
10020 if (m->is_flag_error_dir())
10021 dout(7) << " flag error, dir" << dendl;
10022 if (m->is_flag_error_dn())
10023 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10024
10025 list<MDSInternalContextBase*> finished, error;
10026 mds_rank_t from = mds_rank_t(m->get_source().num());
10027
10028 // starting point
10029 CInode *cur = get_inode(m->get_base_ino());
10030 bufferlist::iterator p = m->trace.begin();
10031
10032 int next = m->starts_with;
10033
10034 // decrement discover counters
10035 if (m->get_tid()) {
10036 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10037 if (p != discovers.end()) {
10038 dout(10) << " found tid " << m->get_tid() << dendl;
10039 discovers.erase(p);
10040 } else {
10041 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10042 }
10043 }
10044
10045 // discover may start with an inode
10046 if (!p.end() && next == MDiscoverReply::INODE) {
10047 cur = add_replica_inode(p, NULL, finished);
10048 dout(7) << "discover_reply got base inode " << *cur << dendl;
10049 assert(cur->is_base());
10050
10051 next = MDiscoverReply::DIR;
10052
10053 // take waiters?
10054 if (cur->is_base() &&
10055 waiting_for_base_ino[from].count(cur->ino())) {
10056 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10057 waiting_for_base_ino[from].erase(cur->ino());
10058 }
10059 }
10060 assert(cur);
10061
10062 // loop over discover results.
10063 // indexes follow each ([[dir] dentry] inode)
10064 // can start, end with any type.
10065 while (!p.end()) {
10066 // dir
10067 frag_t fg;
10068 CDir *curdir = 0;
10069 if (next == MDiscoverReply::DIR) {
10070 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10071 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10072 assert(m->get_wanted_base_dir());
10073 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10074 }
10075 } else {
10076 // note: this can only happen on our first pass around this loop.
10077 if (p.end() && m->is_flag_error_dn()) {
10078 fg = cur->pick_dirfrag(m->get_error_dentry());
10079 curdir = cur->get_dirfrag(fg);
10080 } else
10081 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10082 }
10083
10084 if (p.end())
10085 break;
10086
10087 // dentry
10088 CDentry *dn = add_replica_dentry(p, curdir, finished);
10089
10090 if (p.end())
10091 break;
10092
10093 // inode
10094 cur = add_replica_inode(p, dn, finished);
10095
10096 next = MDiscoverReply::DIR;
10097 }
10098
10099 // dir error?
10100 // or dir_auth hint?
10101 if (m->is_flag_error_dir() && !cur->is_dir()) {
10102 // not a dir.
10103 cur->take_waiting(CInode::WAIT_DIR, error);
10104 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10105 mds_rank_t who = m->get_dir_auth_hint();
10106 if (who == mds->get_nodeid()) who = -1;
10107 if (who >= 0)
10108 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10109
10110
10111 if (m->get_wanted_base_dir()) {
10112 frag_t fg = m->get_base_dir_frag();
10113 CDir *dir = cur->get_dirfrag(fg);
10114
10115 if (cur->is_waiting_for_dir(fg)) {
10116 if (cur->is_auth())
10117 cur->take_waiting(CInode::WAIT_DIR, finished);
10118 else if (dir || !cur->dirfragtree.is_leaf(fg))
10119 cur->take_dir_waiting(fg, finished);
10120 else
10121 discover_dir_frag(cur, fg, 0, who);
10122 } else
10123 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10124 }
10125
10126 // try again?
10127 if (m->get_error_dentry().length()) {
10128 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10129 CDir *dir = cur->get_dirfrag(fg);
10130 // wanted a dentry
10131 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10132 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10133 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10134 m->get_wanted_snapid(), finished);
10135 } else {
10136 filepath relpath(m->get_error_dentry(), 0);
10137 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10138 }
10139 } else
10140 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10141 << m->get_error_dentry() << dendl;
10142 }
10143 } else if (m->is_flag_error_dn()) {
10144 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10145 CDir *dir = cur->get_dirfrag(fg);
10146 if (dir) {
10147 if (dir->is_auth()) {
10148 dir->take_sub_waiting(finished);
10149 } else {
10150 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10151 m->get_wanted_snapid(), error);
10152 }
10153 }
10154 }
10155
10156 // waiters
10157 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10158 mds->queue_waiters(finished);
10159
10160 // done
10161 m->put();
10162 }
10163
10164
10165
10166 // ----------------------------
10167 // REPLICAS
10168
10169
10170 void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10171 {
10172 dirfrag_t df = dir->dirfrag();
10173 ::encode(df, bl);
10174 dir->encode_replica(to, bl);
10175 }
10176
10177 void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10178 {
10179 ::encode(dn->name, bl);
10180 ::encode(dn->last, bl);
10181 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10182 }
10183
10184 void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10185 uint64_t features)
10186 {
10187 ::encode(in->inode.ino, bl); // bleh, minor asymmetry here
10188 ::encode(in->last, bl);
10189 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10190 }
10191
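/*
 * add_replica_dir -- decode one replicated dirfrag from a discover (or
 * similar) trace.  If we already have the CDir, just refresh its replica
 * state and nonce; otherwise force the frag to a leaf of our local
 * fragtree, instantiate a non-auth CDir, and, when this is a dir_auth
 * delegation boundary, record it as a subtree root.
 */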
10192 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10193 list<MDSInternalContextBase*>& finished)
10194 {
10195 dirfrag_t df;
10196 ::decode(df, p);
10197
10198 assert(diri->ino() == df.ino);
10199
10200 // add it (_replica_)
10201 CDir *dir = diri->get_dirfrag(df.frag);
10202
10203 if (dir) {
10204 // had replica. update w/ new nonce.
10205 dir->decode_replica(p);
10206 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10207 } else {
10208 // force frag to leaf in the diri tree
10209 if (!diri->dirfragtree.is_leaf(df.frag)) {
10210 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10211 << diri->dirfragtree << dendl;
10212 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10213 }
10214
10215 // add replica.
10216 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10217 dir->decode_replica(p);
10218
10219 // is this a dir_auth delegation boundary?
10220 if (from != diri->authority().first ||
10221 diri->is_ambiguous_auth() ||
10222 diri->is_base())
10223 adjust_subtree_auth(dir, from);
10224
10225 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10226
10227 // get waiters
10228 diri->take_dir_waiting(df.frag, finished);
10229 }
10230
10231 return dir;
10232 }
10233
10234 CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10235 {
10236 string name;
10237 snapid_t last;
10238 ::decode(name, p);
10239 ::decode(last, p);
10240
10241 CDentry *dn = dir->lookup(name, last);
10242
10243 // have it?
10244 if (dn) {
10245 dn->decode_replica(p, false);
10246 dout(7) << "add_replica_dentry had " << *dn << dendl;
10247 } else {
10248 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10249 dn->decode_replica(p, true);
10250 dout(7) << "add_replica_dentry added " << *dn << dendl;
10251 }
10252
10253 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10254
10255 return dn;
10256 }
10257
10258 CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10259 {
10260 inodeno_t ino;
10261 snapid_t last;
10262 ::decode(ino, p);
10263 ::decode(last, p);
10264 CInode *in = get_inode(ino, last);
10265 if (!in) {
10266 in = new CInode(this, false, 1, last);
10267 in->decode_replica(p, true);
10268 add_inode(in);
10269 if (in->ino() == MDS_INO_ROOT)
10270 in->inode_auth.first = 0;
10271 else if (in->is_mdsdir())
10272 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10273 dout(10) << "add_replica_inode added " << *in << dendl;
10274 if (dn) {
10275 assert(dn->get_linkage()->is_null());
10276 dn->dir->link_primary_inode(dn, in);
10277 }
10278 } else {
10279 in->decode_replica(p, false);
10280 dout(10) << "add_replica_inode had " << *in << dendl;
10281 }
10282
10283 if (dn) {
10284 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10285 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10286 }
10287
10288 return in;
10289 }
10290
10291
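/*
 * replicate_stray / add_replica_stray -- bundle (and unpack) the full chain
 * a peer needs to instantiate a replica of a stray dentry: our mdsdir
 * inode, the mdsdir dirfrag, the stray directory's dentry, the stray
 * directory inode, the stray dirfrag, and finally the stray dentry itself.
 * The decode in add_replica_stray must match this order exactly.
 */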
10292 void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10293 {
10294 uint64_t features = mds->mdsmap->get_up_features();
10295 replicate_inode(get_myin(), who, bl, features);
10296 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10297 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10298 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10299 replicate_dir(straydn->get_dir(), who, bl);
10300 replicate_dentry(straydn, who, bl);
10301 }
10302
10303 CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10304 {
10305 list<MDSInternalContextBase*> finished;
10306 bufferlist::iterator p = bl.begin();
10307
10308 CInode *mdsin = add_replica_inode(p, NULL, finished);
10309 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10310 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10311 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10312 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10313 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10314 if (!finished.empty())
10315 mds->queue_waiters(finished);
10316
10317 return straydn;
10318 }
10319
10320
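/*
 * send_dir_updates -- FYI about a dirfrag's replication policy: send
 * MDirUpdate either to all active MDSs (bcast) or only to the current
 * replicas.  handle_dir_update applies dir_rep/dir_rep_by on the replica,
 * or, at most once (to avoid a fragtree update race), discovers the
 * dirfrag if we do not have it.
 */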
10321 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10322 {
10323 // this is an FYI, re: replication
10324
10325 set<mds_rank_t> who;
10326 if (bcast) {
10327 mds->get_mds_map()->get_active_mds_set(who);
10328 } else {
10329 for (const auto &p : dir->get_replicas()) {
10330 who.insert(p.first);
10331 }
10332 }
10333
10334 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10335
10336 filepath path;
10337 dir->inode->make_path(path);
10338
10339 mds_rank_t whoami = mds->get_nodeid();
10340 for (set<mds_rank_t>::iterator it = who.begin();
10341 it != who.end();
10342 ++it) {
10343 if (*it == whoami) continue;
10344 //if (*it == except) continue;
10345 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10346
10347 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10348 dir->dirfrag(),
10349 dir->dir_rep,
10350 dir->dir_rep_by,
10351 path,
10352 bcast),
10353 *it);
10354 }
10355
10356 return 0;
10357 }
10358
10359 /* This function DOES put the passed message before returning */
10360 void MDCache::handle_dir_update(MDirUpdate *m)
10361 {
10362 dirfrag_t df = m->get_dirfrag();
10363 CDir *dir = get_dirfrag(df);
10364 if (!dir) {
10365 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10366
10367 // discover it?
10368 if (m->should_discover()) {
10369 // only try once!
10370 // this is key to avoid a fragtree update race, among other things.
10371 m->inc_tried_discover();
10372 vector<CDentry*> trace;
10373 CInode *in;
10374 filepath path = m->get_path();
10375 dout(5) << "trying discover on dir_update for " << path << dendl;
10376 MDRequestRef null_ref;
10377 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10378 if (r > 0)
10379 return;
10380 if (r == 0 &&
10381 in->ino() == df.ino &&
10382 in->get_approx_dirfrag(df.frag) == NULL) {
10383 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10384 return;
10385 }
10386 }
10387
10388 m->put();
10389 return;
10390 }
10391
10392 if (!m->has_tried_discover()) {
10393 // Update if it already exists. Otherwise it got updated by discover reply.
10394 dout(5) << "dir_update on " << *dir << dendl;
10395 dir->dir_rep = m->get_dir_rep();
10396 dir->dir_rep_by = m->get_dir_rep_by();
10397 }
10398
10399 // done
10400 m->put();
10401 }
10402
10403
10404
10405
10406
10407 // LINK
10408
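/*
 * send_dentry_link -- tell every replica of 'dn' that it is now linked,
 * skipping rename witnesses (they already know) and ranks still rejoining.
 * A primary link carries a full replicated inode; a remote link carries
 * just the ino and d_type.
 */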
10409 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10410 {
10411 dout(7) << "send_dentry_link " << *dn << dendl;
10412
10413 CDir *subtree = get_subtree_root(dn->get_dir());
10414 for (const auto &p : dn->get_replicas()) {
10415 // don't tell (rename) witnesses; they already know
10416 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10417 continue;
10418 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10419 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10420 rejoin_gather.count(p.first)))
10421 continue;
10422 CDentry::linkage_t *dnl = dn->get_linkage();
10423 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10424 dn->name, dnl->is_primary());
10425 if (dnl->is_primary()) {
10426 dout(10) << " primary " << *dnl->get_inode() << dendl;
10427 replicate_inode(dnl->get_inode(), p.first, m->bl,
10428 mds->mdsmap->get_up_features());
10429 } else if (dnl->is_remote()) {
10430 inodeno_t ino = dnl->get_remote_ino();
10431 __u8 d_type = dnl->get_remote_d_type();
10432 dout(10) << " remote " << ino << " " << d_type << dendl;
10433 ::encode(ino, m->bl);
10434 ::encode(d_type, m->bl);
10435 } else
10436 ceph_abort(); // aie, bad caller!
10437 mds->send_message_mds(m, p.first);
10438 }
10439 }
10440
10441 /* This function DOES put the passed message before returning */
10442 void MDCache::handle_dentry_link(MDentryLink *m)
10443 {
10444
10445 CDentry *dn = NULL;
10446 CDir *dir = get_dirfrag(m->get_dirfrag());
10447 if (!dir) {
10448 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10449 } else {
10450 dn = dir->lookup(m->get_dn());
10451 if (!dn) {
10452 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10453 } else {
10454 dout(7) << "handle_dentry_link on " << *dn << dendl;
10455 CDentry::linkage_t *dnl = dn->get_linkage();
10456
10457 assert(!dn->is_auth());
10458 assert(dnl->is_null());
10459 }
10460 }
10461
10462 bufferlist::iterator p = m->bl.begin();
10463 list<MDSInternalContextBase*> finished;
10464 if (dn) {
10465 if (m->get_is_primary()) {
10466 // primary link.
10467 add_replica_inode(p, dn, finished);
10468 } else {
10469 // remote link, easy enough.
10470 inodeno_t ino;
10471 __u8 d_type;
10472 ::decode(ino, p);
10473 ::decode(d_type, p);
10474 dir->link_remote_inode(dn, ino, d_type);
10475 }
10476 } else {
10477 ceph_abort();
10478 }
10479
10480 if (!finished.empty())
10481 mds->queue_waiters(finished);
10482
10483 m->put();
10484 return;
10485 }
10486
10487
10488 // UNLINK
10489
10490 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10491 {
10492 dout(10) << "send_dentry_unlink " << *dn << dendl;
10493 // share unlink news with replicas
10494 set<mds_rank_t> replicas;
10495 dn->list_replicas(replicas);
10496 if (straydn)
10497 straydn->list_replicas(replicas);
10498 for (set<mds_rank_t>::iterator it = replicas.begin();
10499 it != replicas.end();
10500 ++it) {
10501 // don't tell (rmdir) witnesses; they already know
10502 if (mdr.get() && mdr->more()->witnessed.count(*it))
10503 continue;
10504
10505 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10506 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10507 rejoin_gather.count(*it)))
10508 continue;
10509
10510 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
10511 if (straydn)
10512 replicate_stray(straydn, *it, unlink->straybl);
10513 mds->send_message_mds(unlink, *it);
10514 }
10515 }
10516
10517 /* This function DOES put the passed message before returning */
10518 void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10519 {
10520 // straydn
10521 CDentry *straydn = NULL;
10522 if (m->straybl.length())
10523 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10524
10525 CDir *dir = get_dirfrag(m->get_dirfrag());
10526 if (!dir) {
10527 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10528 } else {
10529 CDentry *dn = dir->lookup(m->get_dn());
10530 if (!dn) {
10531 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10532 } else {
10533 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10534 CDentry::linkage_t *dnl = dn->get_linkage();
10535
10536 // open inode?
10537 if (dnl->is_primary()) {
10538 CInode *in = dnl->get_inode();
10539 dn->dir->unlink_inode(dn);
10540 assert(straydn);
10541 straydn->dir->link_primary_inode(straydn, in);
10542
10543 // in->first is lazily updated on replica; drag it forward so
10544 // that we always keep it in sync with the dentry
10545 assert(straydn->first >= in->first);
10546 in->first = straydn->first;
10547
10548 // update subtree map?
10549 if (in->is_dir())
10550 adjust_subtree_after_rename(in, dir, false);
10551
10552 // send caps to auth (if we're not already)
10553 if (in->is_any_caps() &&
10554 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10555 migrator->export_caps(in);
10556
10557 straydn = NULL;
10558 } else {
10559 assert(!straydn);
10560 assert(dnl->is_remote());
10561 dn->dir->unlink_inode(dn);
10562 }
10563 assert(dnl->is_null());
10564 }
10565 }
10566
10567 // race with trim_dentry()
10568 if (straydn) {
10569 assert(straydn->get_num_ref() == 0);
10570 assert(straydn->get_linkage()->is_null());
10571 map<mds_rank_t, MCacheExpire*> expiremap;
10572 trim_dentry(straydn, expiremap);
10573 send_expire_messages(expiremap);
10574 }
10575
10576 m->put();
10577 return;
10578 }
10579
10580
10581
10582
10583
10584
10585 // ===================================================================
10586
10587
10588
10589 // ===================================================================
10590 // FRAGMENT
10591
10592
10593 /**
10594 * adjust_dir_fragments -- adjust fragmentation for a directory
10595 *
10596 * @param diri directory inode
10597 * @param basefrag base fragment
10598 * @param bits bit adjustment. positive for split, negative for merge.
10599 */
10600 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10601 list<CDir*>& resultfrags,
10602 list<MDSInternalContextBase*>& waiters,
10603 bool replay)
10604 {
10605 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10606 << " on " << *diri << dendl;
10607
10608 list<CDir*> srcfrags;
10609 diri->get_dirfrags_under(basefrag, srcfrags);
10610
10611 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10612 }
10613
10614 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10615 {
10616 CDir *dir = diri->get_dirfrag(fg);
10617 if (dir)
10618 return dir;
10619
10620 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10621
10622 list<CDir*> src, result;
10623 list<MDSInternalContextBase*> waiters;
10624
10625 // split a parent?
10626 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10627 while (1) {
10628 CDir *pdir = diri->get_dirfrag(parent);
10629 if (pdir) {
10630 int split = fg.bits() - parent.bits();
10631 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10632 src.push_back(pdir);
10633 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10634 dir = diri->get_dirfrag(fg);
10635 if (dir) {
10636 dout(10) << "force_dir_fragment result " << *dir << dendl;
10637 break;
10638 }
10639 }
10640 if (parent == frag_t())
10641 break;
10642 frag_t last = parent;
10643 parent = parent.parent();
10644 dout(10) << " " << last << " parent is " << parent << dendl;
10645 }
10646
10647 if (!dir) {
10648 // hoover up things under fg?
10649 diri->get_dirfrags_under(fg, src);
10650 if (src.empty()) {
10651 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10652 } else {
10653 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10654 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10655 dir = result.front();
10656 dout(10) << "force_dir_fragment result " << *dir << dendl;
10657 }
10658 }
10659 if (!replay)
10660 mds->queue_waiters(waiters);
10661 return dir;
10662 }
10663
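// The worker overload below also patches the subtree map: when splitting a
// subtree root, each resulting frag becomes its own subtree (replacing the old
// frag as a bound of the parent subtree) and the old bounds are re-homed under
// their new roots; when merging, if any source frag is a subtree root they are
// all normalized to (aux) subtree roots first, their bounds are collected, and
// the merged CDir takes them over.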
10664 void MDCache::adjust_dir_fragments(CInode *diri,
10665 list<CDir*>& srcfrags,
10666 frag_t basefrag, int bits,
10667 list<CDir*>& resultfrags,
10668 list<MDSInternalContextBase*>& waiters,
10669 bool replay)
10670 {
10671 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10672 << " srcfrags " << srcfrags
10673 << " on " << *diri << dendl;
10674
10675 // adjust fragtree
10676 // yuck. we may have discovered the inode while it was being fragmented.
10677 if (!diri->dirfragtree.is_leaf(basefrag))
10678 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10679
10680 if (bits > 0)
10681 diri->dirfragtree.split(basefrag, bits);
10682 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10683
10684 if (srcfrags.empty())
10685 return;
10686
10687 // split
10688 CDir *parent_dir = diri->get_parent_dir();
10689 CDir *parent_subtree = 0;
10690 if (parent_dir)
10691 parent_subtree = get_subtree_root(parent_dir);
10692
10693 if (bits > 0) {
10694 // SPLIT
10695 assert(srcfrags.size() == 1);
10696 CDir *dir = srcfrags.front();
10697
10698 dir->split(bits, resultfrags, waiters, replay);
10699
10700 // did i change the subtree map?
10701 if (dir->is_subtree_root()) {
10702 // new frags are now separate subtrees
10703 for (list<CDir*>::iterator p = resultfrags.begin();
10704 p != resultfrags.end();
10705 ++p)
10706 subtrees[*p].clear(); // new frag is now its own subtree
10707
10708 // was i a bound?
10709 if (parent_subtree) {
10710 assert(subtrees[parent_subtree].count(dir));
10711 subtrees[parent_subtree].erase(dir);
10712 for (list<CDir*>::iterator p = resultfrags.begin();
10713 p != resultfrags.end();
10714 ++p) {
10715 assert((*p)->is_subtree_root());
10716 subtrees[parent_subtree].insert(*p);
10717 }
10718 }
10719
10720 // adjust my bounds.
10721 set<CDir*> bounds;
10722 bounds.swap(subtrees[dir]);
10723 subtrees.erase(dir);
10724 for (set<CDir*>::iterator p = bounds.begin();
10725 p != bounds.end();
10726 ++p) {
10727 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10728 subtrees[frag].insert(*p);
10729 }
10730
10731 show_subtrees(10);
10732
10733 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10734 dir->dir_auth = CDIR_AUTH_DEFAULT;
10735 }
10736
10737 diri->close_dirfrag(dir->get_frag());
10738
10739 } else {
10740 // MERGE
10741
10742 // are my constituent bits subtrees? if so, i will be too.
10743 // (it's all or none, actually.)
10744 bool any_subtree = false;
10745 for (CDir *dir : srcfrags) {
10746 if (dir->is_subtree_root()) {
10747 any_subtree = true;
10748 break;
10749 }
10750 }
10751 set<CDir*> new_bounds;
10752 if (any_subtree) {
10753 for (CDir *dir : srcfrags) {
10754 // this simplifies the code that finds subtrees underneath the dirfrag
10755 if (!dir->is_subtree_root()) {
10756 dir->state_set(CDir::STATE_AUXSUBTREE);
10757 adjust_subtree_auth(dir, mds->get_nodeid());
10758 }
10759 }
10760
10761 for (CDir *dir : srcfrags) {
10762 assert(dir->is_subtree_root());
10763 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
10764 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10765 set<CDir*>::iterator r = q->second.begin();
10766 while (r != subtrees[dir].end()) {
10767 new_bounds.insert(*r);
10768 subtrees[dir].erase(r++);
10769 }
10770 subtrees.erase(q);
10771
10772 // remove myself as my parent's bound
10773 if (parent_subtree)
10774 subtrees[parent_subtree].erase(dir);
10775 }
10776 }
10777
10778 // merge
10779 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10780 f->merge(srcfrags, waiters, replay);
10781
10782 if (any_subtree) {
10783 assert(f->is_subtree_root());
10784 subtrees[f].swap(new_bounds);
10785 if (parent_subtree)
10786 subtrees[parent_subtree].insert(f);
10787
10788 show_subtrees(10);
10789 }
10790
10791 resultfrags.push_back(f);
10792 }
10793 }
10794
10795
10796 class C_MDC_FragmentFrozen : public MDSInternalContext {
10797 MDCache *mdcache;
10798 MDRequestRef mdr;
10799 public:
10800 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10801 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10802 void finish(int r) override {
10803 mdcache->fragment_frozen(mdr, r);
10804 }
10805 };
10806
10807 bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10808 {
10809 if (is_readonly()) {
10810 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10811 return false;
10812 }
10813 if (mds->is_cluster_degraded()) {
10814 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10815 return false;
10816 }
10817 if (diri->get_parent_dir() &&
10818 diri->get_parent_dir()->get_inode()->is_stray()) {
10819 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10820 return false;
10821 }
10822 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10823 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10824 return false;
10825 }
10826
10827 if (diri->scrub_is_in_progress()) {
10828 dout(7) << "can_fragment: scrub in progress" << dendl;
10829 return false;
10830 }
10831
10832 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10833 CDir *dir = *p;
10834 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10835 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10836 return false;
10837 }
10838 if (!dir->is_auth()) {
10839 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10840 return false;
10841 }
10842 if (dir->is_bad()) {
10843 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10844 return false;
10845 }
10846 if (dir->is_frozen() ||
10847 dir->is_freezing()) {
10848 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10849 return false;
10850 }
10851 }
10852
10853 return true;
10854 }
10855
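// Fragmentation pipeline (auth side): split_dir()/merge_dir() freeze the
// source frags and start an internal FRAGMENTDIR request;
// fragment_mark_and_complete() fetches and pins every dentry;
// fragment_frozen() fires once the freeze completes and calls
// dispatch_fragment_dir(), which takes the scatterlocks and journals an
// EFragment OP_PREPARE; _fragment_logged() stores the resulting frags;
// _fragment_stored() notifies replicas and journals OP_COMMIT; and
// _fragment_committed() / _fragment_finish() delete the old dirfrag objects
// and journal OP_FINISH.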
10856 void MDCache::split_dir(CDir *dir, int bits)
10857 {
10858 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10859 assert(dir->is_auth());
10860 CInode *diri = dir->inode;
10861
10862 list<CDir*> dirs;
10863 dirs.push_back(dir);
10864
10865 if (!can_fragment(diri, dirs)) {
10866 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10867 return;
10868 }
10869
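// Cap the fragment tree depth: a dirfrag already described by, say, 23 bits
// can be split by at most one more bit before tripping this check.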
10870 if (dir->frag.bits() + bits > 24) {
10871 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10872 return;
10873 }
10874
10875 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10876 mdr->more()->fragment_base = dir->dirfrag();
10877
10878 assert(fragments.count(dir->dirfrag()) == 0);
10879 fragment_info_t& info = fragments[dir->dirfrag()];
10880 info.mdr = mdr;
10881 info.dirs.push_back(dir);
10882 info.bits = bits;
10883 info.last_cum_auth_pins_change = ceph_clock_now();
10884
10885 fragment_freeze_dirs(dirs);
10886 // initial mark+complete pass
10887 fragment_mark_and_complete(mdr);
10888 }
10889
10890 void MDCache::merge_dir(CInode *diri, frag_t frag)
10891 {
10892 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10893
10894 list<CDir*> dirs;
10895 if (!diri->get_dirfrags_under(frag, dirs)) {
10896 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10897 return;
10898 }
10899
10900 if (diri->dirfragtree.is_leaf(frag)) {
10901 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10902 return;
10903 }
10904
10905 if (!can_fragment(diri, dirs))
10906 return;
10907
10908 CDir *first = dirs.front();
10909 int bits = first->get_frag().bits() - frag.bits();
10910 dout(10) << " we are merging by " << bits << " bits" << dendl;
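// For example, merging the four 2-bit children of the root frag back into
// frag_t() gives bits = 2 here, recorded below as info.bits = -2.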
10911
10912 dirfrag_t basedirfrag(diri->ino(), frag);
10913 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10914 mdr->more()->fragment_base = basedirfrag;
10915
10916 assert(fragments.count(basedirfrag) == 0);
10917 fragment_info_t& info = fragments[basedirfrag];
10918 info.mdr = mdr;
10919 info.dirs = dirs;
10920 info.bits = -bits;
10921 info.last_cum_auth_pins_change = ceph_clock_now();
10922
10923 fragment_freeze_dirs(dirs);
10924 // initial mark+complete pass
10925 fragment_mark_and_complete(mdr);
10926 }
10927
10928 void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10929 {
10930 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10931 CDir *dir = *p;
10932 dir->auth_pin(dir); // until we mark and complete them
10933 dir->state_set(CDir::STATE_FRAGMENTING);
10934 dir->freeze_dir();
10935 assert(dir->is_freezing_dir());
10936 }
10937 }
10938
10939 class C_MDC_FragmentMarking : public MDCacheContext {
10940 MDRequestRef mdr;
10941 public:
10942 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10943 void finish(int r) override {
10944 mdcache->fragment_mark_and_complete(mdr);
10945 }
10946 };
10947
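// Mark every dentry in the source frags with STATE_FRAGMENTING (pinning it so
// it cannot be trimmed mid-operation), fetching incomplete dirfrags and
// committing brand-new ones first; once everything is marked, wait for the
// freeze to finish and hand off to fragment_frozen().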
10948 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10949 {
10950 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10951 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10952 if (it == fragments.end() || it->second.mdr != mdr) {
10953 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10954 request_finish(mdr);
10955 return;
10956 }
10957
10958 fragment_info_t& info = it->second;
10959 CInode *diri = info.dirs.front()->get_inode();
10960 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10961
10962 MDSGatherBuilder gather(g_ceph_context);
10963
10964 for (list<CDir*>::iterator p = info.dirs.begin();
10965 p != info.dirs.end();
10966 ++p) {
10967 CDir *dir = *p;
10968
10969 bool ready = true;
10970 if (!dir->is_complete()) {
10971 dout(15) << " fetching incomplete " << *dir << dendl;
10972 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10973 ready = false;
10974 } else if (dir->get_frag() == frag_t()) {
10975 // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
10976 // the operation. To avoid CDir::fetch() complaining about a missing object,
10977 // we commit the new dirfrag first.
10978 if (dir->state_test(CDir::STATE_CREATING)) {
10979 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10980 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10981 ready = false;
10982 } else if (dir->is_new()) {
10983 dout(15) << " committing new " << *dir << dendl;
10984 assert(dir->is_dirty());
10985 dir->commit(0, gather.new_sub(), true);
10986 ready = false;
10987 }
10988 }
10989 if (!ready)
10990 continue;
10991
10992 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10993 dout(15) << " marking " << *dir << dendl;
10994 for (CDir::map_t::iterator p = dir->items.begin();
10995 p != dir->items.end();
10996 ++p) {
10997 CDentry *dn = p->second;
10998 dn->get(CDentry::PIN_FRAGMENTING);
10999 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11000 dn->state_set(CDentry::STATE_FRAGMENTING);
11001 }
11002 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11003 dir->auth_unpin(dir);
11004 } else {
11005 dout(15) << " already marked " << *dir << dendl;
11006 }
11007 }
11008 if (gather.has_subs()) {
11009 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11010 gather.activate();
11011 return;
11012 }
11013
11014 for (list<CDir*>::iterator p = info.dirs.begin();
11015 p != info.dirs.end();
11016 ++p) {
11017 CDir *dir = *p;
11018 if (!dir->is_frozen_dir()) {
11019 assert(dir->is_freezing_dir());
11020 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11021 }
11022 }
11023 if (gather.has_subs()) {
11024 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11025 gather.activate();
11026 // flush log so that request auth_pins are retired
11027 mds->mdlog->flush();
11028 return;
11029 }
11030
11031 fragment_frozen(mdr, 0);
11032 }
11033
11034 void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11035 {
11036 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11037 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11038 CDir *dir = *p;
11039 dout(10) << " frag " << *dir << dendl;
11040
11041 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11042 dir->state_clear(CDir::STATE_FRAGMENTING);
11043
11044 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11045 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11046
11047 for (CDir::map_t::iterator p = dir->items.begin();
11048 p != dir->items.end();
11049 ++p) {
11050 CDentry *dn = p->second;
11051 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11052 dn->state_clear(CDentry::STATE_FRAGMENTING);
11053 dn->put(CDentry::PIN_FRAGMENTING);
11054 }
11055 } else {
11056 dir->auth_unpin(dir);
11057 }
11058
11059 dir->unfreeze_dir();
11060 }
11061 }
11062
11063 bool MDCache::fragment_are_all_frozen(CDir *dir)
11064 {
11065 assert(dir->is_frozen_dir());
11066 map<dirfrag_t,fragment_info_t>::iterator p;
11067 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11068 p != fragments.end() && p->first.ino == dir->ino();
11069 ++p) {
11070 if (p->first.frag.contains(dir->get_frag()))
11071 return p->second.all_frozen;
11072 }
11073 ceph_abort();
11074 return false;
11075 }
11076
11077 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11078 {
11079 map<dirfrag_t,fragment_info_t>::iterator p;
11080 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11081 p != fragments.end() && p->first.ino == dir->ino();
11082 ++p) {
11083 if (p->first.frag.contains(dir->get_frag())) {
11084 p->second.num_remote_waiters++;
11085 return;
11086 }
11087 }
11088 ceph_abort();
11089 }
11090
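// Abort fragment operations whose freeze has stalled: if the cumulative
// auth_pin count on the source frags has not changed for
// mds_freeze_tree_timeout and either a remote peer is waiting on the freeze or
// the parent directory is itself freezing, the attempt is cancelled and the
// dirs are unmarked and unfrozen.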
11091 void MDCache::find_stale_fragment_freeze()
11092 {
11093 dout(10) << "find_stale_fragment_freeze" << dendl;
11094 // see comment in Migrator::find_stale_export_freeze()
11095 utime_t now = ceph_clock_now();
11096 utime_t cutoff = now;
11097 cutoff -= g_conf->mds_freeze_tree_timeout;
11098
11099 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11100 p != fragments.end(); ) {
11101 dirfrag_t df = p->first;
11102 fragment_info_t& info = p->second;
11103 ++p;
11104 if (info.all_frozen)
11105 continue;
11106 CDir *dir;
11107 int total_auth_pins = 0;
11108 for (list<CDir*>::iterator q = info.dirs.begin();
11109 q != info.dirs.end();
11110 ++q) {
11111 dir = *q;
11112 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11113 total_auth_pins = -1;
11114 break;
11115 }
11116 if (dir->is_frozen_dir())
11117 continue;
11118 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11119 }
11120 if (total_auth_pins < 0)
11121 continue;
11122 if (info.last_cum_auth_pins != total_auth_pins) {
11123 info.last_cum_auth_pins = total_auth_pins;
11124 info.last_cum_auth_pins_change = now;
11125 continue;
11126 }
11127 if (info.last_cum_auth_pins_change >= cutoff)
11128 continue;
11129 dir = info.dirs.front();
11130 if (info.num_remote_waiters > 0 ||
11131 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11132 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11133 list<CDir*> dirs;
11134 info.dirs.swap(dirs);
11135 fragments.erase(df);
11136 fragment_unmark_unfreeze_dirs(dirs);
11137 }
11138 }
11139 }
11140
11141 class C_MDC_FragmentPrep : public MDCacheLogContext {
11142 MDRequestRef mdr;
11143 public:
11144 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11145 void finish(int r) override {
11146 mdcache->_fragment_logged(mdr);
11147 }
11148 };
11149
11150 class C_MDC_FragmentStore : public MDCacheContext {
11151 MDRequestRef mdr;
11152 public:
11153 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11154 void finish(int r) override {
11155 mdcache->_fragment_stored(mdr);
11156 }
11157 };
11158
11159 class C_MDC_FragmentCommit : public MDCacheLogContext {
11160 dirfrag_t basedirfrag;
11161 list<CDir*> resultfrags;
11162 public:
11163 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11164 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11165 void finish(int r) override {
11166 mdcache->_fragment_committed(basedirfrag, resultfrags);
11167 }
11168 };
11169
11170 class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11171 dirfrag_t basedirfrag;
11172 list<CDir*> resultfrags;
11173 public:
11174 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11175 MDCacheIOContext(m), basedirfrag(f) {
11176 resultfrags.swap(l);
11177 }
11178 void finish(int r) override {
11179 assert(r == 0 || r == -ENOENT);
11180 mdcache->_fragment_finish(basedirfrag, resultfrags);
11181 }
11182 };
11183
11184 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11185 {
11186 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11187 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11188 if (it == fragments.end() || it->second.mdr != mdr) {
11189 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11190 request_finish(mdr);
11191 return;
11192 }
11193
11194 assert(r == 0);
11195 fragment_info_t& info = it->second;
11196 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11197 << " on " << *info.dirs.front()->get_inode() << dendl;
11198
11199 info.all_frozen = true;
11200 dispatch_fragment_dir(mdr);
11201 }
11202
11203 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11204 {
11205 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11206 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11207 if (it == fragments.end() || it->second.mdr != mdr) {
11208 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11209 request_finish(mdr);
11210 return;
11211 }
11212
11213 fragment_info_t& info = it->second;
11214 CInode *diri = info.dirs.front()->get_inode();
11215
11216 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11217 << " on " << *diri << dendl;
11218 if (!mdr->aborted) {
11219 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11220 wrlocks.insert(&diri->dirfragtreelock);
11221 // prevent a racing gather on any other scatterlocks too
11222 wrlocks.insert(&diri->nestlock);
11223 wrlocks.insert(&diri->filelock);
11224 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11225 if (!mdr->aborted)
11226 return;
11227 }
11228
11229 if (mdr->aborted) {
11230 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11231 << info.dirs.front()->dirfrag() << dendl;
11232 if (info.bits > 0)
11233 mds->balancer->queue_split(info.dirs.front(), false);
11234 else
11235 mds->balancer->queue_merge(info.dirs.front());
11236 fragment_unmark_unfreeze_dirs(info.dirs);
11237 fragments.erase(it);
11238 request_finish(mdr);
11239 return;
11240 }
11241
11242 mdr->ls = mds->mdlog->get_current_segment();
11243 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11244 mds->mdlog->start_entry(le);
11245
11246 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11247 CDir *dir = *p;
11248 dirfrag_rollback rollback;
11249 rollback.fnode = dir->fnode;
11250 le->add_orig_frag(dir->get_frag(), &rollback);
11251 }
11252
11253 // refragment
11254 list<MDSInternalContextBase*> waiters;
11255 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11256 info.resultfrags, waiters, false);
11257 if (g_conf->mds_debug_frag)
11258 diri->verify_dirfrags();
11259 mds->queue_waiters(waiters);
11260
11261 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11262 assert(!diri->dirfragtree.is_leaf(*p));
11263
11264 le->metablob.add_dir_context(*info.resultfrags.begin());
11265 for (list<CDir*>::iterator p = info.resultfrags.begin();
11266 p != info.resultfrags.end();
11267 ++p) {
11268 if (diri->is_auth()) {
11269 le->metablob.add_fragmented_dir(*p, false, false);
11270 } else {
11271 (*p)->state_set(CDir::STATE_DIRTYDFT);
11272 le->metablob.add_fragmented_dir(*p, false, true);
11273 }
11274 }
11275
11276 // dft lock
11277 if (diri->is_auth()) {
11278 // journal dirfragtree
11279 inode_t *pi = diri->project_inode();
11280 pi->version = diri->pre_dirty();
11281 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11282 } else {
11283 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11284 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11285 mdr->add_updated_lock(&diri->dirfragtreelock);
11286 }
11287
11288 /*
11289 // filelock
11290 mds->locker->mark_updated_scatterlock(&diri->filelock);
11291 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11292 mut->add_updated_lock(&diri->filelock);
11293
11294 // dirlock
11295 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11296 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11297 mut->add_updated_lock(&diri->nestlock);
11298 */
11299
11300 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11301 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11302 mdr, __func__);
11303 mds->mdlog->flush();
11304 }
11305
11306 void MDCache::_fragment_logged(MDRequestRef& mdr)
11307 {
11308 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11309 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11310 assert(it != fragments.end());
11311 fragment_info_t &info = it->second;
11312 CInode *diri = info.resultfrags.front()->get_inode();
11313
11314 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11315 << " on " << *diri << dendl;
11316
11317 if (diri->is_auth())
11318 diri->pop_and_dirty_projected_inode(mdr->ls);
11319
11320 mdr->apply(); // mark scatterlock
11321
11322 // store resulting frags
11323 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11324
11325 for (list<CDir*>::iterator p = info.resultfrags.begin();
11326 p != info.resultfrags.end();
11327 ++p) {
11328 CDir *dir = *p;
11329 dout(10) << " storing result frag " << *dir << dendl;
11330
11331 // freeze and store them too
11332 dir->auth_pin(this);
11333 dir->state_set(CDir::STATE_FRAGMENTING);
11334 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11335 }
11336
11337 gather.activate();
11338 }
11339
11340 void MDCache::_fragment_stored(MDRequestRef& mdr)
11341 {
11342 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11343 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11344 assert(it != fragments.end());
11345 fragment_info_t &info = it->second;
11346 CInode *diri = info.resultfrags.front()->get_inode();
11347
11348 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11349 << " on " << *diri << dendl;
11350
11351 // tell peers
11352 CDir *first = *info.resultfrags.begin();
11353 for (const auto &p : first->get_replicas()) {
11354 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11355 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11356 rejoin_gather.count(p.first)))
11357 continue;
11358
11359 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11360
11361 // freshly replicate new dirs to peers
11362 for (list<CDir*>::iterator q = info.resultfrags.begin();
11363 q != info.resultfrags.end();
11364 ++q)
11365 replicate_dir(*q, p.first, notify->basebl);
11366
11367 mds->send_message_mds(notify, p.first);
11368 }
11369
11370 // journal commit
11371 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11372 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11373 info.resultfrags));
11374
11375 mds->locker->drop_locks(mdr.get());
11376
11377 // unfreeze resulting frags
11378 for (list<CDir*>::iterator p = info.resultfrags.begin();
11379 p != info.resultfrags.end();
11380 ++p) {
11381 CDir *dir = *p;
11382 dout(10) << " result frag " << *dir << dendl;
11383
11384 for (CDir::map_t::iterator p = dir->items.begin();
11385 p != dir->items.end();
11386 ++p) {
11387 CDentry *dn = p->second;
11388 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11389 dn->state_clear(CDentry::STATE_FRAGMENTING);
11390 dn->put(CDentry::PIN_FRAGMENTING);
11391 }
11392
11393 // unfreeze
11394 dir->unfreeze_dir();
11395 }
11396
11397 fragments.erase(it);
11398 request_finish(mdr);
11399 }
11400
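// After OP_COMMIT is journaled, the dirfrag objects for the pre-fragmentation
// frags are removed from the metadata pool (the base frag_t() object, which
// also carries the backtrace, is truncated and has its omap cleared instead);
// the gather completes into C_IO_MDC_FragmentFinish -> _fragment_finish().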
11401 void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11402 {
11403 dout(10) << "fragment_committed " << basedirfrag << dendl;
11404 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11405 assert(it != uncommitted_fragments.end());
11406 ufragment &uf = it->second;
11407
11408 // remove old frags
11409 C_GatherBuilder gather(
11410 g_ceph_context,
11411 new C_OnFinisher(
11412 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11413 mds->finisher));
11414
11415 SnapContext nullsnapc;
11416 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11417 for (list<frag_t>::iterator p = uf.old_frags.begin();
11418 p != uf.old_frags.end();
11419 ++p) {
11420 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11421 ObjectOperation op;
11422 if (*p == frag_t()) {
11423 // backtrace object
11424 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11425 op.truncate(0);
11426 op.omap_clear();
11427 } else {
11428 dout(10) << " removing orphan dirfrag " << oid << dendl;
11429 op.remove();
11430 }
11431 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11432 ceph::real_clock::now(),
11433 0, gather.new_sub());
11434 }
11435
11436 assert(gather.has_subs());
11437 gather.activate();
11438 }
11439
11440 void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11441 {
11442 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11443 << resultfrags.size() << dendl;
11444 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11445 assert(it != uncommitted_fragments.end());
11446 ufragment &uf = it->second;
11447
11448 // unmark & auth_unpin
11449 for (const auto &dir : resultfrags) {
11450 dir->state_clear(CDir::STATE_FRAGMENTING);
11451 dir->auth_unpin(this);
11452
11453 // In case the resulting fragments are beyond the split size,
11454 // we might need to split them again right away (they could
11455 // have been taking inserts between unfreezing and getting
11456 // here)
11457 mds->balancer->maybe_fragment(dir, false);
11458 }
11459
11460 if (mds->logger) {
11461 if (resultfrags.size() > 1) {
11462 mds->logger->inc(l_mds_dir_split);
11463 } else {
11464 mds->logger->inc(l_mds_dir_merge);
11465 }
11466 }
11467
11468 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11469 mds->mdlog->start_submit_entry(le);
11470
11471 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11472 }
11473
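// Replica side: a fragment notify from the auth MDS replays the same
// adjust_dir_fragments() locally, picks up any waiters parked on the old
// frags, and decodes the freshly replicated CDirs from notify->basebl.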
11474 /* This function DOES put the passed message before returning */
11475 void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11476 {
11477 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11478
11479 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11480 notify->put();
11481 return;
11482 }
11483
11484 CInode *diri = get_inode(notify->get_ino());
11485 if (diri) {
11486 frag_t base = notify->get_basefrag();
11487 int bits = notify->get_bits();
11488
11489 /*
11490 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11491 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11492 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11493 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11494 notify->put();
11495 return;
11496 }
11497 */
11498
11499 // refragment
11500 list<MDSInternalContextBase*> waiters;
11501 list<CDir*> resultfrags;
11502 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11503 if (g_conf->mds_debug_frag)
11504 diri->verify_dirfrags();
11505
11506 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11507 diri->take_dir_waiting((*p)->get_frag(), waiters);
11508
11509 // add replica state for the new dirs
11510 bufferlist::iterator p = notify->basebl.begin();
11511 while (!p.end())
11512 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11513
11514 mds->queue_waiters(waiters);
11515 } else {
11516 ceph_abort();
11517 }
11518
11519 notify->put();
11520 }
11521
11522 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11523 LogSegment *ls, bufferlist *rollback)
11524 {
11525 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11526 assert(!uncommitted_fragments.count(basedirfrag));
11527 ufragment& uf = uncommitted_fragments[basedirfrag];
11528 uf.old_frags = old_frags;
11529 uf.bits = bits;
11530 uf.ls = ls;
11531 ls->uncommitted_fragments.insert(basedirfrag);
11532 if (rollback)
11533 uf.rollback.swap(*rollback);
11534 }
11535
11536 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11537 {
11538 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11539 << " op " << EFragment::op_name(op) << dendl;
11540 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11541 if (it != uncommitted_fragments.end()) {
11542 ufragment& uf = it->second;
11543 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11544 uf.committed = true;
11545 } else {
11546 uf.ls->uncommitted_fragments.erase(basedirfrag);
11547 mds->queue_waiters(uf.waiters);
11548 uncommitted_fragments.erase(it);
11549 }
11550 }
11551 }
11552
11553 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11554 {
11555 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11556 << " old_frags (" << old_frags << ")" << dendl;
11557 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11558 if (it != uncommitted_fragments.end()) {
11559 ufragment& uf = it->second;
11560 if (!uf.old_frags.empty()) {
11561 uf.old_frags.swap(old_frags);
11562 uf.committed = true;
11563 } else {
11564 uf.ls->uncommitted_fragments.erase(basedirfrag);
11565 uncommitted_fragments.erase(it);
11566 }
11567 }
11568 }
11569
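// Fragment operations that never reached OP_FINISH are rolled back here:
// entries already committed only need their old dirfrag objects cleaned up,
// while uncommitted ones have their original frags re-created from the
// journaled dirfrag_rollback state (or simply re-merged, for old-format
// EFragment entries) and the rollback is journaled as OP_ROLLBACK.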
11570 void MDCache::rollback_uncommitted_fragments()
11571 {
11572 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11573 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11574 p != uncommitted_fragments.end();
11575 ++p) {
11576 ufragment &uf = p->second;
11577 CInode *diri = get_inode(p->first.ino);
11578 assert(diri);
11579
11580 if (uf.committed) {
11581 list<CDir*> frags;
11582 diri->get_dirfrags_under(p->first.frag, frags);
11583 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11584 CDir *dir = *q;
11585 dir->auth_pin(this);
11586 dir->state_set(CDir::STATE_FRAGMENTING);
11587 }
11588 _fragment_committed(p->first, frags);
11589 continue;
11590 }
11591
11592 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11593
11594 LogSegment *ls = mds->mdlog->get_current_segment();
11595 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11596 mds->mdlog->start_entry(le);
11597 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11598
11599 list<frag_t> old_frags;
11600 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11601
11602 list<CDir*> resultfrags;
11603 if (uf.old_frags.empty()) {
11604 // created by old format EFragment
11605 list<MDSInternalContextBase*> waiters;
11606 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11607 } else {
11608 bufferlist::iterator bp = uf.rollback.begin();
11609 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11610 CDir *dir = force_dir_fragment(diri, *q);
11611 resultfrags.push_back(dir);
11612
11613 dirfrag_rollback rollback;
11614 ::decode(rollback, bp);
11615
11616 dir->set_version(rollback.fnode.version);
11617 dir->fnode = rollback.fnode;
11618
11619 dir->_mark_dirty(ls);
11620
11621 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11622 dout(10) << " dirty nestinfo on " << *dir << dendl;
11623 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11624 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11625 }
11626 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11627 dout(10) << " dirty fragstat on " << *dir << dendl;
11628 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11629 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11630 }
11631
11632 le->add_orig_frag(dir->get_frag());
11633 le->metablob.add_dir_context(dir);
11634 if (diri_auth) {
11635 le->metablob.add_fragmented_dir(dir, true, false);
11636 } else {
11637 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11638 dir->state_set(CDir::STATE_DIRTYDFT);
11639 le->metablob.add_fragmented_dir(dir, true, true);
11640 }
11641 }
11642 }
11643
11644 if (diri_auth) {
11645 diri->project_inode()->version = diri->pre_dirty();
11646 diri->pop_and_dirty_projected_inode(ls); // hacky
11647 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11648 } else {
11649 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11650 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11651 }
11652
11653 if (g_conf->mds_debug_frag)
11654 diri->verify_dirfrags();
11655
11656 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11657 assert(!diri->dirfragtree.is_leaf(*q));
11658
11659 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11660 CDir *dir = *q;
11661 dir->auth_pin(this);
11662 dir->state_set(CDir::STATE_FRAGMENTING);
11663 }
11664
11665 mds->mdlog->submit_entry(le);
11666
11667 uf.old_frags.swap(old_frags);
11668 _fragment_committed(p->first, resultfrags);
11669 }
11670 }
11671
11672 void MDCache::force_readonly()
11673 {
11674 if (is_readonly())
11675 return;
11676
11677 dout(1) << "force file system read-only" << dendl;
11678 mds->clog->warn() << "force file system read-only";
11679
11680 set_readonly();
11681
11682 mds->server->force_clients_readonly();
11683
11684 // revoke write caps
11685 for (auto p : inode_map) {
11686 CInode *in = p.second;
11687 if (in->is_head())
11688 mds->locker->eval(in, CEPH_CAP_LOCKS);
11689 }
11690
11691 mds->mdlog->flush();
11692 }
11693
11694
11695 // ==============================================================
11696 // debug crap
11697
11698 void MDCache::show_subtrees(int dbl)
11699 {
11700 if (g_conf->mds_thrash_exports)
11701 dbl += 15;
11702
11703 //dout(10) << "show_subtrees" << dendl;
11704
11705 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11706 return; // i won't print anything.
11707
11708 if (subtrees.empty()) {
11709 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11710 return;
11711 }
11712
11713 // root frags
11714 list<CDir*> basefrags;
11715 for (set<CInode*>::iterator p = base_inodes.begin();
11716 p != base_inodes.end();
11717 ++p)
11718 (*p)->get_dirfrags(basefrags);
11719 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11720 dout(15) << "show_subtrees" << dendl;
11721
11722 // queue stuff
11723 list<pair<CDir*,int> > q;
11724 string indent;
11725 set<CDir*> seen;
11726
11727 // calc max depth
11728 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11729 q.push_back(pair<CDir*,int>(*p, 0));
11730
11731 set<CDir*> subtrees_seen;
11732
11733 int depth = 0;
11734 while (!q.empty()) {
11735 CDir *dir = q.front().first;
11736 int d = q.front().second;
11737 q.pop_front();
11738
11739 if (subtrees.count(dir) == 0) continue;
11740
11741 subtrees_seen.insert(dir);
11742
11743 if (d > depth) depth = d;
11744
11745 // sanity check
11746 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11747 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11748 assert(seen.count(dir) == 0);
11749 seen.insert(dir);
11750
11751 // nested items?
11752 if (!subtrees[dir].empty()) {
11753 for (set<CDir*>::iterator p = subtrees[dir].begin();
11754 p != subtrees[dir].end();
11755 ++p) {
11756 //dout(25) << " saw sub " << **p << dendl;
11757 q.push_front(pair<CDir*,int>(*p, d+1));
11758 }
11759 }
11760 }
11761
11762
11763 // print tree
11764 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11765 q.push_back(pair<CDir*,int>(*p, 0));
11766
11767 while (!q.empty()) {
11768 CDir *dir = q.front().first;
11769 int d = q.front().second;
11770 q.pop_front();
11771
11772 if (subtrees.count(dir) == 0) continue;
11773
11774 // adjust indenter
11775 while ((unsigned)d < indent.size())
11776 indent.resize(d);
11777
11778 // pad
11779 string pad = "______________________________________";
11780 pad.resize(depth*2+1-indent.size());
11781 if (!subtrees[dir].empty())
11782 pad[0] = '.'; // parent
11783
11784
11785 string auth;
11786 if (dir->is_auth())
11787 auth = "auth ";
11788 else
11789 auth = " rep ";
11790
11791 char s[10];
11792 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11793 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11794 else
11795 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11796
11797 // print
11798 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11799
11800 if (dir->ino() == MDS_INO_ROOT)
11801 assert(dir->inode == root);
11802 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11803 assert(dir->inode == myin);
11804 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11805 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11806
11807 // nested items?
11808 if (!subtrees[dir].empty()) {
11809 // more at my level?
11810 if (!q.empty() && q.front().second == d)
11811 indent += "| ";
11812 else
11813 indent += " ";
11814
11815 for (set<CDir*>::iterator p = subtrees[dir].begin();
11816 p != subtrees[dir].end();
11817 ++p)
11818 q.push_front(pair<CDir*,int>(*p, d+2));
11819 }
11820 }
11821
11822 // verify there isn't stray crap in subtree map
11823 int lost = 0;
11824 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11825 p != subtrees.end();
11826 ++p) {
11827 if (subtrees_seen.count(p->first)) continue;
11828 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11829 lost++;
11830 }
11831 assert(lost == 0);
11832 }
11833
11834 void MDCache::show_cache()
11835 {
11836 dout(7) << "show_cache" << dendl;
11837
11838 auto show_func = [this](CInode *in) {
11839 // unlinked?
11840 if (!in->parent)
11841 dout(7) << " unlinked " << *in << dendl;
11842
11843 // dirfrags?
11844 list<CDir*> dfs;
11845 in->get_dirfrags(dfs);
11846 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11847 CDir *dir = *p;
11848 dout(7) << " dirfrag " << *dir << dendl;
11849
11850 for (CDir::map_t::iterator p = dir->items.begin();
11851 p != dir->items.end();
11852 ++p) {
11853 CDentry *dn = p->second;
11854 dout(7) << " dentry " << *dn << dendl;
11855 CDentry::linkage_t *dnl = dn->get_linkage();
11856 if (dnl->is_primary() && dnl->get_inode())
11857 dout(7) << " inode " << *dnl->get_inode() << dendl;
11858 }
11859 }
11860 };
11861
11862 for (auto p : inode_map)
11863 show_func(p.second);
11864 for (auto p : snap_inode_map)
11865 show_func(p.second);
11866 }
11867
11868 int MDCache::cache_status(Formatter *f)
11869 {
11870 f->open_object_section("cache");
11871
11872 f->open_object_section("pool");
11873 mempool::get_pool(mempool::mds_co::id).dump(f);
11874 f->close_section();
11875
11876 f->close_section();
11877 return 0;
11878 }
11879
11880 int MDCache::dump_cache(std::string const &file_name)
11881 {
11882 return dump_cache(file_name.c_str(), NULL);
11883 }
11884
11885 int MDCache::dump_cache(Formatter *f)
11886 {
11887 return dump_cache(NULL, f);
11888 }
11889
11890 int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
11891 {
11892 return dump_cache(NULL, f, dump_root, depth);
11893 }
11894
11895 /**
11896 * Dump the metadata cache, either to a Formatter, if
11897 * provided, else to a plain text file.
11898 */
11899 int MDCache::dump_cache(const char *fn, Formatter *f,
11900 const string& dump_root, int depth)
11901 {
11902 int r = 0;
11903 int fd = -1;
11904
11905 if (f) {
11906 f->open_array_section("inodes");
11907 } else {
11908 char deffn[200];
11909 if (!fn) {
11910 snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11911 fn = deffn;
11912 }
11913
11914 dout(1) << "dump_cache to " << fn << dendl;
11915
11916 fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
11917 if (fd < 0) {
11918 derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
11919 return errno;
11920 }
11921 }
11922
11923 auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
11924 int r;
11925 if (!dump_root.empty()) {
11926 string ipath;
11927 if (in->is_root())
11928 ipath = "/";
11929 else
11930 in->make_path_string(ipath);
11931
11932 if (dump_root.length() > ipath.length() ||
11933 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11934 return 0;
11935
11936 if (depth >= 0 &&
11937 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11938 return 0;
11939 }
11940
11941 if (f) {
11942 f->open_object_section("inode");
11943 in->dump(f);
11944 } else {
11945 ostringstream ss;
11946 ss << *in << std::endl;
11947 std::string s = ss.str();
11948 r = safe_write(fd, s.c_str(), s.length());
11949 if (r < 0)
11950 return r;
11951 }
11952
11953 list<CDir*> dfs;
11954 in->get_dirfrags(dfs);
11955 if (f) {
11956 f->open_array_section("dirfrags");
11957 }
11958 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11959 CDir *dir = *p;
11960 if (f) {
11961 f->open_object_section("dir");
11962 dir->dump(f);
11963 } else {
11964 ostringstream tt;
11965 tt << " " << *dir << std::endl;
11966 string t = tt.str();
11967 r = safe_write(fd, t.c_str(), t.length());
11968 if (r < 0)
11969 return r;
11970 }
11971
11972 if (f) {
11973 f->open_array_section("dentries");
11974 }
11975 for (CDir::map_t::iterator q = dir->items.begin();
11976 q != dir->items.end();
11977 ++q) {
11978 CDentry *dn = q->second;
11979 if (f) {
11980 f->open_object_section("dentry");
11981 dn->dump(f);
11982 f->close_section();
11983 } else {
11984 ostringstream uu;
11985 uu << " " << *dn << std::endl;
11986 string u = uu.str();
11987 r = safe_write(fd, u.c_str(), u.length());
11988 if (r < 0)
11989 return r;
11990 }
11991 }
11992 if (f) {
11993 f->close_section(); //dentries
11994 }
11995 dir->check_rstats();
11996 if (f) {
11997 f->close_section(); //dir
11998 }
11999 }
12000 if (f) {
12001 f->close_section(); // dirfrags
12002 }
12003
12004 if (f) {
12005 f->close_section(); // inode
12006 }
12007 return 1;
12008 };
12009
12010 for (auto p : inode_map) {
12011 r = dump_func(p.second);
12012 if (r < 0)
12013 goto out;
12014 }
12015 for (auto p : snap_inode_map) {
12016 r = dump_func(p.second);
12017 if (r < 0)
12018 goto out;
12019 }
12020 r = 0;
12021
12022 out:
12023 if (f) {
12024 f->close_section(); // inodes
12025 } else {
12026 ::close(fd);
12027 }
12028 return r;
12029 }
12030
12031
12032
12033 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12034 : MDSInternalContext(c->mds), cache(c), mdr(r)
12035 {}
12036
12037 void C_MDS_RetryRequest::finish(int r)
12038 {
12039 mdr->retry++;
12040 cache->dispatch_request(mdr);
12041 }
12042
12043
12044 class C_MDS_EnqueueScrub : public Context
12045 {
12046 Formatter *formatter;
12047 Context *on_finish;
12048 public:
12049 ScrubHeaderRef header;
12050 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12051 formatter(f), on_finish(fin), header(nullptr) {}
12052
12053 Context *take_finisher() {
12054 Context *fin = on_finish;
12055 on_finish = NULL;
12056 return fin;
12057 }
12058
12059 void finish(int r) override {
12060 if (r < 0) { // we failed the lookup or something; dump ourselves
12061 formatter->open_object_section("results");
12062 formatter->dump_int("return_code", r);
12063 formatter->close_section(); // results
12064 }
12065 if (on_finish)
12066 on_finish->complete(r);
12067 }
12068 };
12069
12070 void MDCache::enqueue_scrub(
12071 const string& path,
12072 const std::string &tag,
12073 bool force, bool recursive, bool repair,
12074 Formatter *f, Context *fin)
12075 {
12076 dout(10) << __func__ << " " << path << dendl;
12077 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12078 filepath fp(path.c_str());
12079 mdr->set_filepath(fp);
12080
12081 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12082 cs->header = std::make_shared<ScrubHeader>(
12083 tag, force, recursive, repair, f);
12084
12085 mdr->internal_op_finish = cs;
12086 enqueue_scrub_work(mdr);
12087 }
12088
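// The actual enqueue: the path is resolved and rdlocked, the ScrubHeader is
// attached to the starting inode, and the inode is pushed onto the ScrubStack
// (top for a non-recursive scrub, bottom for a recursive one). The journal is
// flushed afterwards only if the scrub ended up repairing something.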
12089 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12090 {
12091 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12092 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12093 if (NULL == in)
12094 return;
12095
12096 // TODO: Remove this restriction
12097 assert(in->is_auth());
12098
12099 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12100 if (!locked)
12101 return;
12102
12103 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12104 ScrubHeaderRef &header = cs->header;
12105
12106 // Cannot scrub the same dentry twice at the same time
12107 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12108 mds->server->respond_to_request(mdr, -EBUSY);
12109 return;
12110 } else {
12111 in->scrub_info();
12112 }
12113
12114 header->set_origin(in);
12115
12116 Context *fin = nullptr;
12117 if (!header->get_recursive()) {
12118 fin = cs->take_finisher();
12119 }
12120
12121 // If the scrub did some repair, then flush the journal at the end of
12122 // the scrub. Otherwise, in the case of e.g. rewriting a backtrace,
12123 // the on-disk state will still look damaged.
12124 auto expiry_fin = new FunctionContext([this, header, fin](int r){
12125 if (header->get_repaired()) {
12126 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12127 mds->mdlog->start_new_segment();
12128 mds->mdlog->trim_all();
12129 if (fin) {
12130 MDSGatherBuilder expiry_gather(g_ceph_context);
12131 const std::set<LogSegment*> &expiring_segments = mds->mdlog->get_expiring_segments();
12132 for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
12133 i != expiring_segments.end(); ++i) {
12134 (*i)->wait_for_expiry(expiry_gather.new_sub());
12135 }
12136 expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12137 expiry_gather.activate();
12138 }
12139 } else {
12140 if (fin) {
12141 fin->complete(r);
12142 }
12143 }
12144 });
12145
12146 if (!header->get_recursive()) {
12147 mds->scrubstack->enqueue_inode_top(in, header,
12148 new MDSInternalContextWrapper(mds,
12149 expiry_fin));
12150 } else {
12151 mds->scrubstack->enqueue_inode_bottom(in, header,
12152 new MDSInternalContextWrapper(mds,
12153 expiry_fin));
12154 }
12155
12156 mds->server->respond_to_request(mdr, 0);
12157 return;
12158 }
12159
12160 struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12161 MDRequestRef mdr;
12162 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12163 MDCacheLogContext(c), mdr(m) {}
12164 void finish(int r) override {
12165 mdr->apply();
12166 get_mds()->server->respond_to_request(mdr, r);
12167 }
12168 };
12169
12170 void MDCache::repair_dirfrag_stats(CDir *dir)
12171 {
12172 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12173 mdr->pin(dir);
12174 mdr->internal_op_private = dir;
12175 mdr->internal_op_finish = new C_MDSInternalNoop;
12176 repair_dirfrag_stats_work(mdr);
12177 }
12178
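// Recompute fragstat/rstat for a single dirfrag from its in-memory dentries
// and, if they disagree with the stored fnode, journal a corrected version
// (keeping the newer mtime/change_attr/rctime) and dirty the corresponding
// scatterlocks so the parent inode's stats converge.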
12179 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12180 {
12181 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12182 dout(10) << __func__ << " " << *dir << dendl;
12183
12184 if (!dir->is_auth()) {
12185 mds->server->respond_to_request(mdr, -ESTALE);
12186 return;
12187 }
12188
12189 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12190 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12191
12192 mds->locker->drop_locks(mdr.get());
12193 mdr->drop_local_auth_pins();
12194 if (!mdr->remote_auth_pins.empty())
12195 mds->locker->notify_freeze_waiter(dir);
12196 return;
12197 }
12198
12199 mdr->auth_pin(dir);
12200
12201 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12202 CInode *diri = dir->inode;
12203 rdlocks.insert(&diri->dirfragtreelock);
12204 wrlocks.insert(&diri->nestlock);
12205 wrlocks.insert(&diri->filelock);
12206 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12207 return;
12208
12209 if (!dir->is_complete()) {
12210 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12211 return;
12212 }
12213
12214 frag_info_t frag_info;
12215 nest_info_t nest_info;
12216 for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
12217 CDentry *dn = it->second;
12218 if (dn->last != CEPH_NOSNAP)
12219 continue;
12220 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12221 if (dnl->is_primary()) {
12222 CInode *in = dnl->get_inode();
12223 nest_info.add(in->get_projected_inode()->accounted_rstat);
12224 if (in->is_dir())
12225 frag_info.nsubdirs++;
12226 else
12227 frag_info.nfiles++;
12228 } else if (dnl->is_remote())
12229 frag_info.nfiles++;
12230 }
12231
12232 fnode_t *pf = dir->get_projected_fnode();
12233 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12234 bool good_rstat = nest_info.same_sums(pf->rstat);
12235 if (good_fragstat && good_rstat) {
12236 dout(10) << __func__ << " no corruption found" << dendl;
12237 mds->server->respond_to_request(mdr, 0);
12238 return;
12239 }
12240
12241 pf = dir->project_fnode();
12242 pf->version = dir->pre_dirty();
12243 mdr->add_projected_fnode(dir);
12244
12245 mdr->ls = mds->mdlog->get_current_segment();
12246 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12247 mds->mdlog->start_entry(le);
12248
12249 if (!good_fragstat) {
12250 if (pf->fragstat.mtime > frag_info.mtime)
12251 frag_info.mtime = pf->fragstat.mtime;
12252 if (pf->fragstat.change_attr > frag_info.change_attr)
12253 frag_info.change_attr = pf->fragstat.change_attr;
12254 pf->fragstat = frag_info;
12255 mds->locker->mark_updated_scatterlock(&diri->filelock);
12256 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12257 mdr->add_updated_lock(&diri->filelock);
12258 }
12259
12260 if (!good_rstat) {
12261 if (pf->rstat.rctime > nest_info.rctime)
12262 nest_info.rctime = pf->rstat.rctime;
12263 pf->rstat = nest_info;
12264 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12265 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12266 mdr->add_updated_lock(&diri->nestlock);
12267 }
12268
12269 le->metablob.add_dir_context(dir);
12270 le->metablob.add_dir(dir, true);
12271
12272 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12273 }
12274
12275 void MDCache::repair_inode_stats(CInode *diri)
12276 {
12277 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12278 mdr->pin(diri);
12279 mdr->internal_op_private = diri;
12280 mdr->internal_op_finish = new C_MDSInternalNoop;
12281 repair_inode_stats_work(mdr);
12282 }
12283
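// Repair an inode's dirstat/rstat: first fetch every dirfrag and dirty the
// filelock/nestlock scatterlocks so a scatter-gather pass folds the per-frag
// accounted stats back into the inode, then re-take rdlocks to force that
// gather and verify the sums actually match (logging if they still do not).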
12284 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12285 {
12286 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12287 dout(10) << __func__ << " " << *diri << dendl;
12288
12289 if (!diri->is_auth()) {
12290 mds->server->respond_to_request(mdr, -ESTALE);
12291 return;
12292 }
12293 if (!diri->is_dir()) {
12294 mds->server->respond_to_request(mdr, -ENOTDIR);
12295 return;
12296 }
12297
12298 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12299 std::list<frag_t> frags;
12300
12301 if (mdr->ls) // already marked filelock/nestlock dirty ?
12302 goto do_rdlocks;
12303
12304 rdlocks.insert(&diri->dirfragtreelock);
12305 wrlocks.insert(&diri->nestlock);
12306 wrlocks.insert(&diri->filelock);
12307 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12308 return;
12309
12310 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12311 // the scatter-gather process, which will fix any fragstat/rstat errors.
12312 diri->dirfragtree.get_leaves(frags);
12313 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12314 CDir *dir = diri->get_dirfrag(*p);
12315 if (!dir) {
12316 assert(mdr->is_auth_pinned(diri));
12317 dir = diri->get_or_open_dirfrag(this, *p);
12318 }
12319 if (dir->get_version() == 0) {
12320 assert(dir->is_auth());
12321 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12322 return;
12323 }
12324 }
12325
12326 diri->state_set(CInode::STATE_REPAIRSTATS);
12327 mdr->ls = mds->mdlog->get_current_segment();
12328 mds->locker->mark_updated_scatterlock(&diri->filelock);
12329 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12330 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12331 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12332
12333 mds->locker->drop_locks(mdr.get());
12334
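// Re-taking the locks as rdlocks below forces both scatterlocks through a
// scatter-gather cycle, folding the (now dirty) dirfrag stats back into the
// inode's dirstat/rstat before they are verified.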
12335 do_rdlocks:
12336 // force the scatter-gather process
12337 rdlocks.insert(&diri->dirfragtreelock);
12338 rdlocks.insert(&diri->nestlock);
12339 rdlocks.insert(&diri->filelock);
12340 wrlocks.clear();
12341 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12342 return;
12343
12344 diri->state_clear(CInode::STATE_REPAIRSTATS);
12345
12346 frag_info_t dir_info;
12347 nest_info_t nest_info;
12348 nest_info.rsubdirs++; // the directory inode counts itself as one rsubdir
12349
12350 diri->dirfragtree.get_leaves(frags);
12351 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12352 CDir *dir = diri->get_dirfrag(*p);
12353 assert(dir);
12354 assert(dir->get_version() > 0);
12355 dir_info.add(dir->fnode.accounted_fragstat);
12356 nest_info.add(dir->fnode.accounted_rstat);
12357 }
12358
12359 if (!dir_info.same_sums(diri->inode.dirstat) ||
12360 !nest_info.same_sums(diri->inode.rstat)) {
12361 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12362 << *diri << dendl;
12363 }
12364
12365 mds->server->respond_to_request(mdr, 0);
12366 }
12367
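// A consolidated sketch of the check performed above, assuming a hypothetical
// helper name and already-fetched dirfrags: the inode's dirstat/rstat should
// equal the sum of its leaf dirfrags' accounted stats, with rstat counting
// the directory inode itself as one rsubdir.
static bool inode_stats_match_dirfrags(CInode *diri)
{
  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs++;  // the directory counts itself

  std::list<frag_t> frags;
  diri->dirfragtree.get_leaves(frags);
  for (std::list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
    CDir *dir = diri->get_dirfrag(*p);
    if (!dir || dir->get_version() == 0)
      return false;  // cannot judge without a fetched dirfrag
    dir_info.add(dir->fnode.accounted_fragstat);
    nest_info.add(dir->fnode.accounted_rstat);
  }
  return dir_info.same_sums(diri->inode.dirstat) &&
    nest_info.same_sums(diri->inode.rstat);
}
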
12368 void MDCache::flush_dentry(const string& path, Context *fin)
12369 {
12370 if (is_readonly()) {
12371 dout(10) << __func__ << ": read-only FS" << dendl;
12372 fin->complete(-EROFS);
12373 return;
12374 }
12375 dout(10) << "flush_dentry " << path << dendl;
12376 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12377 filepath fp(path.c_str());
12378 mdr->set_filepath(fp);
12379 mdr->internal_op_finish = fin;
12380 flush_dentry_work(mdr);
12381 }
12382
12383 class C_FinishIOMDR : public MDSInternalContextBase {
12384 protected:
12385 MDSRank *mds;
12386 MDRequestRef mdr;
12387 MDSRank *get_mds() override { return mds; }
12388 public:
12389 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12390 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12391 };
12392
12393 void MDCache::flush_dentry_work(MDRequestRef& mdr)
12394 {
12395 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12396 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12397 if (NULL == in)
12398 return;
12399
12400 // TODO: Is this necessary? Fix it if so
12401 assert(in->is_auth());
12402 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12403 if (!locked)
12404 return;
12405 in->flush(new C_FinishIOMDR(mds, mdr));
12406 }
12407
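// A minimal usage sketch, assuming a hypothetical caller and a throwaway
// completion: flush_dentry() resolves the path, takes the required locks in
// flush_dentry_work(), flushes the inode, and completes the Context with 0 or
// a negative errno (e.g. -EROFS on a read-only filesystem).
static void flush_dentry_example(MDCache *mdcache, const std::string& path)
{
  mdcache->flush_dentry(path, new FunctionContext([](int r) {
	// a real caller would propagate r instead of discarding it
	(void)r;
      }));
}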
12408
12409 /**
12410 * Initialize performance counters with global perfcounter
12411 * collection.
12412 */
12413 void MDCache::register_perfcounters()
12414 {
12415 PerfCountersBuilder pcb(g_ceph_context,
12416 "mds_cache", l_mdc_first, l_mdc_last);
12417
12418 /* Stray/purge statistics */
12419 pcb.add_u64(l_mdc_num_strays, "num_strays",
12420 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
12421 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12422 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12423
12424 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12425 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12426 "Stray dentries enqueued for purge");
12427 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12428 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12429
12430
12431 /* Recovery queue statistics */
12432 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12433 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
12434 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
12435 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12436 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12437 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
12438 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
12439
12440 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12441 "Internal Request type enqueue scrub");
12442 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12443 "Internal Request type export dir");
12444 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12445 "Internal Request type flush");
12446 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12447 "Internal Request type fragmentdir");
12448 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12449 "Internal Request type frag stats");
12450 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12451 "Internal Request type inode stats");
12452
12453 logger.reset(pcb.create_perf_counters());
12454 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12455 recovery_queue.set_logger(logger.get());
12456 stray_manager.set_logger(logger.get());
12457 }
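
// A brief illustrative note (hypothetical snippet, not actual call sites):
// gauges registered above with add_u64() are driven via set(), counters
// registered with add_u64_counter() via inc(), e.g.
//
//   logger->set(l_mdc_num_strays, n);   // gauge
//   logger->inc(l_mdc_strays_created);  // counter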
12458
12459 void MDCache::activate_stray_manager()
12460 {
12461 if (open) {
12462 stray_manager.activate();
12463 } else {
12464 wait_for_open(
12465 new MDSInternalContextWrapper(mds,
12466 new FunctionContext([this](int r){
12467 stray_manager.activate();
12468 })
12469 )
12470 );
12471 }
12472 }
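
// The same deferral pattern generalizes (sketch only; the lambda body is
// hypothetical): anything that must not run before the cache is open can be
// wrapped and queued with wait_for_open(), e.g.
//
//   wait_for_open(
//     new MDSInternalContextWrapper(mds,
//       new FunctionContext([this](int r) {
//         // safe to rely on an open cache here
//       })));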
12473
12474 /**
12475 * Call this when putting references to an inode/dentry or
12476 * when attempting to trim it.
12477 *
12478 * If this inode is no longer linked by anyone, and this MDS
12479 * rank holds the primary dentry, and that dentry is in a stray
12480 * directory, then give up the dentry to the StrayManager, never
12481 * to be seen again by MDCache.
12482 *
12483 * @param delay if true, then purgeable inodes are stashed until
12484 * the next trim(), rather than being purged right
12485 * away.
12486 */
12487 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
12488 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12489 mds->get_state() <= MDSMap::STATE_REJOIN)
12490 return;
12491
12492 CDentry *dn = in->get_projected_parent_dn();
12493
12494 if (dn->state_test(CDentry::STATE_PURGING)) {
12495 /* This dentry has already entered the purging process; no need
12496 * to re-evaluate it. */
12497 return;
12498 }
12499
12500 if (dn->get_projected_linkage()->is_primary() &&
12501 dn->get_dir()->get_inode()->is_stray()) {
12502 stray_manager.eval_stray(dn, delay);
12503 }
12504 }
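
// Typical invocations implied by the comment above (illustrative only, not an
// exhaustive list of call sites):
//
//   maybe_eval_stray(in, false);  // when putting a reference: purge eligible
//                                 // strays right away
//   maybe_eval_stray(in, true);   // while trimming: stash purgeable inodes
//                                 // until the next trim()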
12505
12506 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12507 dout(10) << __func__ << " " << *diri << dendl;
12508 assert(diri->get_projected_parent_dir()->inode->is_stray());
12509 list<CDir*> ls;
12510 diri->get_dirfrags(ls);
12511 for (auto p : ls) {
12512 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12513 p->try_remove_dentries_for_stray();
12514 }
12515 if (!diri->snaprealm) {
12516 if (diri->is_auth())
12517 diri->clear_dirty_rstat();
12518 diri->clear_scatter_dirty();
12519 }
12520 }
12521