1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <map>
21
22 #include "MDCache.h"
23 #include "MDSRank.h"
24 #include "Server.h"
25 #include "Locker.h"
26 #include "MDLog.h"
27 #include "MDBalancer.h"
28 #include "Migrator.h"
29 #include "ScrubStack.h"
30
31 #include "SnapClient.h"
32
33 #include "MDSMap.h"
34
35 #include "CInode.h"
36 #include "CDir.h"
37
38 #include "Mutation.h"
39
40 #include "include/ceph_fs.h"
41 #include "include/filepath.h"
42
43 #include "msg/Message.h"
44 #include "msg/Messenger.h"
45
46 #include "common/errno.h"
47 #include "common/safe_io.h"
48 #include "common/perf_counters.h"
49 #include "common/MemoryModel.h"
50 #include "osdc/Journaler.h"
51 #include "osdc/Filer.h"
52
53 #include "events/ESubtreeMap.h"
54 #include "events/EUpdate.h"
55 #include "events/ESlaveUpdate.h"
56 #include "events/EImportFinish.h"
57 #include "events/EFragment.h"
58 #include "events/ECommitted.h"
59 #include "events/ESessions.h"
60
61 #include "messages/MGenericMessage.h"
62
63 #include "messages/MMDSResolve.h"
64 #include "messages/MMDSResolveAck.h"
65 #include "messages/MMDSCacheRejoin.h"
66
67 #include "messages/MDiscover.h"
68 #include "messages/MDiscoverReply.h"
69
70 //#include "messages/MInodeUpdate.h"
71 #include "messages/MDirUpdate.h"
72 #include "messages/MCacheExpire.h"
73
74 #include "messages/MInodeFileCaps.h"
75
76 #include "messages/MLock.h"
77 #include "messages/MDentryLink.h"
78 #include "messages/MDentryUnlink.h"
79
80 #include "messages/MMDSFindIno.h"
81 #include "messages/MMDSFindInoReply.h"
82
83 #include "messages/MMDSOpenIno.h"
84 #include "messages/MMDSOpenInoReply.h"
85
86 #include "messages/MClientRequest.h"
87 #include "messages/MClientCaps.h"
88 #include "messages/MClientSnap.h"
89 #include "messages/MClientQuota.h"
90
91 #include "messages/MMDSSlaveRequest.h"
92
93 #include "messages/MMDSFragmentNotify.h"
94
95 #include "messages/MGatherCaps.h"
96
97 #include "InoTable.h"
98
99 #include "common/Timer.h"
100
101 #include "perfglue/heap_profiler.h"
102
103 using namespace std;
104
105 #include "common/config.h"
106 #include "include/assert.h"
107
108 #define dout_context g_ceph_context
109 #define dout_subsys ceph_subsys_mds
110 #undef dout_prefix
111 #define dout_prefix _prefix(_dout, mds)
112 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
113 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
114 }
115
116 set<int> SimpleLock::empty_gather_set;
117
118
119 /**
120 * All non-I/O contexts that require a reference
121 * to an MDCache instance descend from this.
122 */
123 class MDCacheContext : public virtual MDSInternalContextBase {
124 protected:
125 MDCache *mdcache;
126 MDSRank *get_mds() override
127 {
128 assert(mdcache != NULL);
129 return mdcache->mds;
130 }
131 public:
132 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
133 };
134
135
136 /**
137 * Only for contexts called back from an I/O completion
138 *
139 * Note: duplication of members wrt MDCacheContext, because
140  * it's the lesser of two evils compared with introducing
141 * yet another piece of (multiple) inheritance.
142 */
143 class MDCacheIOContext : public virtual MDSIOContextBase {
144 protected:
145 MDCache *mdcache;
146 MDSRank *get_mds() override
147 {
148 assert(mdcache != NULL);
149 return mdcache->mds;
150 }
151 public:
152 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
153 };
154
155 class MDCacheLogContext : public virtual MDSLogContextBase {
156 protected:
157 MDCache *mdcache;
158 MDSRank *get_mds() override
159 {
160 assert(mdcache != NULL);
161 return mdcache->mds;
162 }
163 public:
164 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
165 };
166
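// Note: the three context flavours above (MDCacheContext, MDCacheIOContext,
// MDCacheLogContext) differ only in the base class they inherit from; each
// simply routes its completion back to the owning MDSRank through the
// MDCache pointer, so plain callbacks, I/O completions and journal-flush
// completions can all be dispatched the same way.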
167 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
168 mds(m),
169 filer(m->objecter, m->finisher),
170 exceeded_size_limit(false),
171 recovery_queue(m),
172 stray_manager(m, purge_queue_)
173 {
174 migrator.reset(new Migrator(mds, this));
175 root = NULL;
176 myin = NULL;
177 readonly = false;
178
179 stray_index = 0;
180 for (int i = 0; i < NUM_STRAY; ++i) {
181 strays[i] = NULL;
182 }
183
184 num_inodes_with_caps = 0;
185
186 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
187 (g_conf->mds_dir_max_commit_size << 20) :
188 (0.9 *(g_conf->osd_max_write_size << 20));
189
190 discover_last_tid = 0;
191 open_ino_last_tid = 0;
192 find_ino_peer_last_tid = 0;
193
194 last_cap_id = 0;
195
196 client_lease_durations[0] = 5.0;
197 client_lease_durations[1] = 30.0;
198 client_lease_durations[2] = 300.0;
199
200 resolves_pending = false;
201 rejoins_pending = false;
202 cap_imports_num_opening = 0;
203
204 opening_root = open = false;
205 lru.lru_set_max(g_conf->mds_cache_size);
206 lru.lru_set_midpoint(g_conf->mds_cache_mid);
207
208 bottom_lru.lru_set_max(0);
209 bottom_lru.lru_set_midpoint(0);
210
211 decayrate.set_halflife(g_conf->mds_decay_halflife);
212
213 did_shutdown_log_cap = false;
214 }
215
216 MDCache::~MDCache()
217 {
218 if (logger) {
219 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
220 }
221 }
222
223
224
225 void MDCache::log_stat()
226 {
227 mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
228 mds->logger->set(l_mds_inodes, lru.lru_get_size());
229 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
230 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
231 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
232 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
233 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
234 mds->logger->set(l_mds_caps, Capability::count());
235 }
236
237
238 //
239
240 bool MDCache::shutdown()
241 {
242 if (lru.lru_get_size() > 0) {
243 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
244 //show_cache();
245 show_subtrees();
246 //dump();
247 }
248 return true;
249 }
250
251
252 // ====================================================================
253 // some inode functions
254
255 void MDCache::add_inode(CInode *in)
256 {
257 // add to lru, inode map
258 assert(inode_map.count(in->vino()) == 0); // should be no dup inos!
259 inode_map[ in->vino() ] = in;
260
261 if (in->ino() < MDS_INO_SYSTEM_BASE) {
262 if (in->ino() == MDS_INO_ROOT)
263 root = in;
264 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
265 myin = in;
266 else if (in->is_stray()) {
267 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
268 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
269 }
270 }
271 if (in->is_base())
272 base_inodes.insert(in);
273 }
274
275 if (CInode::count() >
276 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
277 exceeded_size_limit = true;
278 }
279 }
280
281 void MDCache::remove_inode(CInode *o)
282 {
283 dout(14) << "remove_inode " << *o << dendl;
284
285 if (o->get_parent_dn()) {
286 // FIXME: multiple parents?
287 CDentry *dn = o->get_parent_dn();
288 assert(!dn->is_dirty());
289 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
290 }
291
292 if (o->is_dirty())
293 o->mark_clean();
294 if (o->is_dirty_parent())
295 o->clear_dirty_parent();
296
297 o->clear_scatter_dirty();
298
299 o->item_open_file.remove_myself();
300
301 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
302 export_pin_queue.erase(o);
303
304 // remove from inode map
305 inode_map.erase(o->vino());
306
307 if (o->ino() < MDS_INO_SYSTEM_BASE) {
308 if (o == root) root = 0;
309 if (o == myin) myin = 0;
310 if (o->is_stray()) {
311 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
312 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
313 }
314 }
315 if (o->is_base())
316 base_inodes.erase(o);
317 }
318
319 // delete it
320 assert(o->get_num_ref() == 0);
321 delete o;
322 }
323
324 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
325 {
326 file_layout_t result = file_layout_t::get_default();
327 result.pool_id = mdsmap.get_first_data_pool();
328 return result;
329 }
330
331 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
332 {
333 file_layout_t result = file_layout_t::get_default();
334 result.pool_id = mdsmap.get_metadata_pool();
335 if (g_conf->mds_log_segment_size > 0) {
336 result.object_size = g_conf->mds_log_segment_size;
337 result.stripe_unit = g_conf->mds_log_segment_size;
338 }
339 return result;
340 }
341
342 void MDCache::init_layouts()
343 {
344 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
345 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
346 }
347
348 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
349 int mode) const
350 {
351 in->inode.ino = ino;
352 in->inode.version = 1;
353 in->inode.xattr_version = 1;
354 in->inode.mode = 0500 | mode;
355 in->inode.size = 0;
356 in->inode.ctime =
357 in->inode.mtime =
358 in->inode.btime = ceph_clock_now();
359 in->inode.nlink = 1;
360 in->inode.truncate_size = -1ull;
361 in->inode.change_attr = 0;
362 in->inode.export_pin = MDS_RANK_NONE;
363
364 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
365 if (in->inode.is_dir()) {
366 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
367 ++in->inode.rstat.rsubdirs;
368 } else {
369 in->inode.layout = default_file_layout;
370 ++in->inode.rstat.rfiles;
371 }
372 in->inode.accounted_rstat = in->inode.rstat;
373
374 if (in->is_base()) {
375 if (in->is_root())
376 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
377 else
378 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
379 in->open_snaprealm(); // empty snaprealm
380 assert(!in->snaprealm->parent); // created its own
381 in->snaprealm->srnode.seq = 1;
382 }
383 }
384
385 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
386 {
387 dout(0) << "creating system inode with ino:" << ino << dendl;
388 CInode *in = new CInode(this);
389 create_unlinked_system_inode(in, ino, mode);
390 add_inode(in);
391 return in;
392 }
393
394 CInode *MDCache::create_root_inode()
395 {
396 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
397 i->inode.uid = g_conf->mds_root_ino_uid;
398 i->inode.gid = g_conf->mds_root_ino_gid;
399 i->inode.layout = default_file_layout;
400 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
401 return i;
402 }
403
404 void MDCache::create_empty_hierarchy(MDSGather *gather)
405 {
406 // create root dir
407 CInode *root = create_root_inode();
408
409 // force empty root dir
410 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
411 adjust_subtree_auth(rootdir, mds->get_nodeid());
412 rootdir->dir_rep = CDir::REP_ALL; //NONE;
413
414 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
415 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
416
417 root->inode.dirstat = rootdir->fnode.fragstat;
418 root->inode.rstat = rootdir->fnode.rstat;
419 ++root->inode.rstat.rsubdirs;
420 root->inode.accounted_rstat = root->inode.rstat;
421
422 rootdir->mark_complete();
423 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
424 rootdir->commit(0, gather->new_sub());
425
426 root->store(gather->new_sub());
427 }
428
429 void MDCache::create_mydir_hierarchy(MDSGather *gather)
430 {
431 // create mds dir
432 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
433
434 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
435 adjust_subtree_auth(mydir, mds->get_nodeid());
436
437 LogSegment *ls = mds->mdlog->get_current_segment();
438
439 // stray dir
440 for (int i = 0; i < NUM_STRAY; ++i) {
441 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
442 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
443 stringstream name;
444 name << "stray" << i;
445 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
446 sdn->_mark_dirty(mds->mdlog->get_current_segment());
447
448 stray->inode.dirstat = straydir->fnode.fragstat;
449
450 mydir->fnode.rstat.add(stray->inode.rstat);
451 mydir->fnode.fragstat.nsubdirs++;
452 // save them
453 straydir->mark_complete();
454 straydir->mark_dirty(straydir->pre_dirty(), ls);
455 straydir->commit(0, gather->new_sub());
456 stray->_mark_dirty_parent(ls, true);
457 stray->store_backtrace(gather->new_sub());
458 }
459
460 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
461 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
462
463 myin->inode.dirstat = mydir->fnode.fragstat;
464 myin->inode.rstat = mydir->fnode.rstat;
465 ++myin->inode.rstat.rsubdirs;
466 myin->inode.accounted_rstat = myin->inode.rstat;
467
468 mydir->mark_complete();
469 mydir->mark_dirty(mydir->pre_dirty(), ls);
470 mydir->commit(0, gather->new_sub());
471
472 myin->store(gather->new_sub());
473 }
474
475 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
476 MutationRef mut;
477 CDentry *dn;
478 version_t dpv;
479 MDSInternalContextBase *fin;
480 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
481 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
482 void finish(int r) override {
483 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
484 }
485 };
486
487 void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
488 {
489 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
490 CDentry *dn = dir->add_null_dentry(name);
491
492 dn->push_projected_linkage(in);
493 version_t dpv = dn->pre_dirty();
494
495 CDir *mdir = 0;
496 if (in->inode.is_dir()) {
497 in->inode.rstat.rsubdirs = 1;
498
499 mdir = in->get_or_open_dirfrag(this, frag_t());
500 mdir->mark_complete();
501 mdir->pre_dirty();
502 } else
503 in->inode.rstat.rfiles = 1;
504 in->inode.version = dn->pre_dirty();
505
506 SnapRealm *realm = dir->get_inode()->find_snaprealm();
507 dn->first = in->first = realm->get_newest_seq() + 1;
508
509 MutationRef mut(new MutationImpl());
510
511 // force some locks. hacky.
512 mds->locker->wrlock_force(&dir->inode->filelock, mut);
513 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
514
515 mut->ls = mds->mdlog->get_current_segment();
516 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
517 mds->mdlog->start_entry(le);
518
519 if (!in->is_mdsdir()) {
520 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
521 le->metablob.add_primary_dentry(dn, in, true);
522 } else {
523 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
524 journal_dirty_inode(mut.get(), &le->metablob, in);
525 dn->push_projected_linkage(in->ino(), in->d_type());
526 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
527 le->metablob.add_root(true, in);
528 }
529 if (mdir)
530 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
531
532 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
533 mds->mdlog->flush();
534 }
535
536 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
537 {
538 dout(10) << "_create_system_file_finish " << *dn << dendl;
539
540 dn->pop_projected_linkage();
541 dn->mark_dirty(dpv, mut->ls);
542
543 CInode *in = dn->get_linkage()->get_inode();
544 in->inode.version--;
545 in->mark_dirty(in->inode.version + 1, mut->ls);
546
547 if (in->inode.is_dir()) {
548 CDir *dir = in->get_dirfrag(frag_t());
549 assert(dir);
550 dir->mark_dirty(1, mut->ls);
551 dir->mark_new(mut->ls);
552 }
553
554 mut->apply();
555 mds->locker->drop_locks(mut.get());
556 mut->cleanup();
557
558 fin->complete(0);
559
560 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
561 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
562 }
563
564
565
566 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
567 MDCache *cache;
568 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
569 void finish(int r) override {
570 if (r < 0) {
571 // If we can't open root, something disastrous has happened: mark
572 // this rank damaged for operator intervention. Note that
573 // it is not okay to call suicide() here because we are in
574 // a Finisher callback.
575 cache->mds->damaged();
576 ceph_abort(); // damaged should never return
577 } else {
578 cache->open_root();
579 }
580 }
581 };
582
583 void MDCache::open_root_inode(MDSInternalContextBase *c)
584 {
585 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
586 CInode *in;
587 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
588 in->fetch(c);
589 } else {
590 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
591 }
592 }
593
594 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
595 {
596 MDSGatherBuilder gather(g_ceph_context);
597
598 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
599 in->fetch(gather.new_sub());
600
601 gather.set_finisher(c);
602 gather.activate();
603 }
604
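// open_root() is re-entered (via C_MDS_RetryOpenRoot) until everything it
// needs is in cache: the root inode (fetched if we are the root's auth,
// discovered from the auth rank otherwise), the root dirfrag, and our own
// mdsdir inode; once those exist it claims authority over the mydir subtree
// and falls through to populate_mydir().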
605 void MDCache::open_root()
606 {
607 dout(10) << "open_root" << dendl;
608
609 if (!root) {
610 open_root_inode(new C_MDS_RetryOpenRoot(this));
611 return;
612 }
613 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
614 assert(root->is_auth());
615 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
616 assert(rootdir);
617 if (!rootdir->is_subtree_root())
618 adjust_subtree_auth(rootdir, mds->get_nodeid());
619 if (!rootdir->is_complete()) {
620 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
621 return;
622 }
623 } else {
624 assert(!root->is_auth());
625 CDir *rootdir = root->get_dirfrag(frag_t());
626 if (!rootdir) {
627 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
628 return;
629 }
630 }
631
632 if (!myin) {
633 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
634 in->fetch(new C_MDS_RetryOpenRoot(this));
635 return;
636 }
637 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
638 assert(mydir);
639 adjust_subtree_auth(mydir, mds->get_nodeid());
640
641 populate_mydir();
642 }
643
644 void MDCache::populate_mydir()
645 {
646 assert(myin);
647 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
648 assert(mydir);
649
650 dout(10) << "populate_mydir " << *mydir << dendl;
651
652 if (!mydir->is_complete()) {
653 mydir->fetch(new C_MDS_RetryOpenRoot(this));
654 return;
655 }
656
657 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
658     // A missing dirfrag: we will recreate it. It must be marked dirty
659     // before we dirty any of the strays we create within it.
660 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
661 "recreating it now";
662 LogSegment *ls = mds->mdlog->get_current_segment();
663 mydir->state_clear(CDir::STATE_BADFRAG);
664 mydir->mark_complete();
665 mydir->mark_dirty(mydir->pre_dirty(), ls);
666 }
667
668 // open or create stray
669 uint64_t num_strays = 0;
670 for (int i = 0; i < NUM_STRAY; ++i) {
671 stringstream name;
672 name << "stray" << i;
673 CDentry *straydn = mydir->lookup(name.str());
674
675 // allow for older fs's with stray instead of stray0
676 if (straydn == NULL && i == 0)
677 straydn = mydir->lookup("stray");
678
679 if (!straydn || !straydn->get_linkage()->get_inode()) {
680 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
681 new C_MDS_RetryOpenRoot(this));
682 return;
683 }
684 assert(straydn);
685 assert(strays[i]);
686 // we make multiple passes through this method; make sure we only pin each stray once.
687 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
688 strays[i]->get(CInode::PIN_STRAY);
689 strays[i]->state_set(CInode::STATE_STRAYPINNED);
690 strays[i]->get_stickydirs();
691 }
692 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
693
694 // open all frags
695 list<frag_t> ls;
696 strays[i]->dirfragtree.get_leaves(ls);
697 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
698 frag_t fg = *p;
699 CDir *dir = strays[i]->get_dirfrag(fg);
700 if (!dir) {
701 dir = strays[i]->get_or_open_dirfrag(this, fg);
702 }
703
704 // DamageTable applies special handling to strays: it will
705 // have damaged() us out if one is damaged.
706 assert(!dir->state_test(CDir::STATE_BADFRAG));
707
708 if (dir->get_version() == 0) {
709 dir->fetch(new C_MDS_RetryOpenRoot(this));
710 return;
711 }
712
713 if (dir->get_frag_size() > 0)
714 num_strays += dir->get_frag_size();
715 }
716 }
717
718 stray_manager.set_num_strays(num_strays);
719
720 // okay!
721 dout(10) << "populate_mydir done" << dendl;
722 assert(!open);
723 open = true;
724 mds->queue_waiters(waiting_for_open);
725
726 scan_stray_dir();
727 }
728
729 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
730 {
731 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
732 }
733
734 CDir *MDCache::get_stray_dir(CInode *in)
735 {
736 string straydname;
737 in->name_stray_dentry(straydname);
738
739 CInode *strayi = get_stray();
740 assert(strayi);
741 frag_t fg = strayi->pick_dirfrag(straydname);
742 CDir *straydir = strayi->get_dirfrag(fg);
743 assert(straydir);
744 return straydir;
745 }
746
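// Find (or create) the null dentry under which 'in' will be parked when it
// is unlinked: get_stray_dir() hashes the stray dentry name to pick a
// dirfrag of the current stray directory, and here we either add a fresh
// null dentry there or reuse an existing one whose projected linkage is null.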
747 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
748 {
749 CDir *straydir = get_stray_dir(in);
750 string straydname;
751 in->name_stray_dentry(straydname);
752 CDentry *straydn = straydir->lookup(straydname);
753 if (!straydn) {
754 straydn = straydir->add_null_dentry(straydname);
755 straydn->mark_new();
756 } else {
757 assert(straydn->get_projected_linkage()->is_null());
758 }
759
760 straydn->state_set(CDentry::STATE_STRAY);
761 return straydn;
762 }
763
764
765
766 MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
767 {
768 // inode?
769 if (info.ino)
770 return get_inode(info.ino, info.snapid);
771
772 // dir or dentry.
773 CDir *dir = get_dirfrag(info.dirfrag);
774 if (!dir) return 0;
775
776 if (info.dname.length())
777 return dir->lookup(info.dname, info.snapid);
778 else
779 return dir;
780 }
781
782
783
784
785 // ====================================================================
786 // subtree management
787
788 void MDCache::list_subtrees(list<CDir*>& ls)
789 {
790 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
791 p != subtrees.end();
792 ++p)
793 ls.push_back(p->first);
794 }
795
796 /*
797 * adjust the dir_auth of a subtree.
798  * merge with parent and/or child subtrees, if it is appropriate.
799 * merge can ONLY happen if both parent and child have unambiguous auth.
800 */
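// Sketch of the bookkeeping below: if 'dir' was not already a subtree root,
// it gains its own (initially empty) entry in the 'subtrees' map, any bounds
// of the enclosing subtree that actually live beneath 'dir' are re-parented
// under it, and 'dir' itself is recorded as a bound of the subtree it was
// carved out of.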
801 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
802 {
803 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
804 << " on " << *dir << dendl;
805
806 show_subtrees();
807
808 CDir *root;
809 if (dir->inode->is_base()) {
810 root = dir; // bootstrap hack.
811 if (subtrees.count(root) == 0) {
812 subtrees[root];
813 root->get(CDir::PIN_SUBTREE);
814 }
815 } else {
816 root = get_subtree_root(dir); // subtree root
817 }
818 assert(root);
819 assert(subtrees.count(root));
820 dout(7) << " current root is " << *root << dendl;
821
822 if (root == dir) {
823 // i am already a subtree.
824 dir->set_dir_auth(auth);
825 } else {
826 // i am a new subtree.
827 dout(10) << " new subtree at " << *dir << dendl;
828 assert(subtrees.count(dir) == 0);
829 subtrees[dir]; // create empty subtree bounds list for me.
830 dir->get(CDir::PIN_SUBTREE);
831
832 // set dir_auth
833 dir->set_dir_auth(auth);
834
835 // move items nested beneath me, under me.
836 set<CDir*>::iterator p = subtrees[root].begin();
837 while (p != subtrees[root].end()) {
838 set<CDir*>::iterator next = p;
839 ++next;
840 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
841 // move under me
842 dout(10) << " claiming child bound " << **p << dendl;
843 subtrees[dir].insert(*p);
844 subtrees[root].erase(p);
845 }
846 p = next;
847 }
848
849 // i am a bound of the parent subtree.
850 subtrees[root].insert(dir);
851
852 // i am now the subtree root.
853 root = dir;
854
855 // adjust recursive pop counters
856 if (dir->is_auth()) {
857 utime_t now = ceph_clock_now();
858 CDir *p = dir->get_parent_dir();
859 while (p) {
860 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
861 if (p->is_subtree_root()) break;
862 p = p->inode->get_parent_dir();
863 }
864 }
865 }
866
867 show_subtrees();
868 }
869
870
871 void MDCache::try_subtree_merge(CDir *dir)
872 {
873 dout(7) << "try_subtree_merge " << *dir << dendl;
874 assert(subtrees.count(dir));
875 set<CDir*> oldbounds = subtrees[dir];
876
877 set<CInode*> to_eval;
878 // try merge at my root
879 try_subtree_merge_at(dir, &to_eval);
880
881 // try merge at my old bounds
882 for (auto bound : oldbounds)
883 try_subtree_merge_at(bound, &to_eval);
884
885 if (!(mds->is_any_replay() || mds->is_resolve())) {
886 for(auto in : to_eval)
887 eval_subtree_root(in);
888 }
889 }
890
891 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
892 CInode *in;
893 MutationRef mut;
894 public:
895 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
896 void finish(int r) override {
897 mdcache->subtree_merge_writebehind_finish(in, mut);
898 }
899 };
900
901 void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
902 {
903 dout(10) << "try_subtree_merge_at " << *dir << dendl;
904 assert(subtrees.count(dir));
905
906 // merge with parent?
907 CDir *parent = dir;
908 if (!dir->inode->is_base())
909 parent = get_subtree_root(dir->get_parent_dir());
910
911 if (parent != dir && // we have a parent,
912 parent->dir_auth == dir->dir_auth && // auth matches,
913 dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous,
914 !dir->state_test(CDir::STATE_EXPORTBOUND) && // not an exportbound,
915 !dir->state_test(CDir::STATE_AUXSUBTREE)) { // not aux subtree
916 // merge with parent.
917 dout(10) << " subtree merge at " << *dir << dendl;
918 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
919
920 // move our bounds under the parent
921 for (set<CDir*>::iterator p = subtrees[dir].begin();
922 p != subtrees[dir].end();
923 ++p)
924 subtrees[parent].insert(*p);
925
926 // we are no longer a subtree or bound
927 dir->put(CDir::PIN_SUBTREE);
928 subtrees.erase(dir);
929 subtrees[parent].erase(dir);
930
931 // adjust popularity?
932 if (dir->is_auth()) {
933 utime_t now = ceph_clock_now();
934 CDir *p = dir->get_parent_dir();
935 while (p) {
936 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
937 if (p->is_subtree_root()) break;
938 p = p->inode->get_parent_dir();
939 }
940 }
941
942 if (to_eval && dir->get_inode()->is_auth())
943 to_eval->insert(dir->get_inode());
944 }
945
946 show_subtrees(15);
947 }
948
949 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
950 {
951 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
952 in->pop_and_dirty_projected_inode(mut->ls);
953
954 mut->apply();
955 mds->locker->drop_locks(mut.get());
956 mut->cleanup();
957
958 in->auth_unpin(this);
959 }
960
961 void MDCache::eval_subtree_root(CInode *diri)
962 {
963 // evaluate subtree inode filelock?
964 // (we should scatter the filelock on subtree bounds)
965 assert(diri->is_auth());
966 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
967 }
968
969
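// Like adjust_subtree_auth(), but the caller (typically subtree import or
// resolve handling) also supplies the dirfrags that must end up as the
// bounds of the new subtree: missing bounds are created as subtree roots,
// intervening ambiguous subtrees are swallowed, extra bounds are merged back
// in, and verify_subtree_bounds() asserts the final result matches.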
970 void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
971 {
972 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
973 << " on " << *dir
974 << " bounds " << bounds
975 << dendl;
976
977 show_subtrees();
978
979 CDir *root;
980 if (dir->ino() == MDS_INO_ROOT) {
981 root = dir; // bootstrap hack.
982 if (subtrees.count(root) == 0) {
983 subtrees[root];
984 root->get(CDir::PIN_SUBTREE);
985 }
986 } else {
987 root = get_subtree_root(dir); // subtree root
988 }
989 assert(root);
990 assert(subtrees.count(root));
991 dout(7) << " current root is " << *root << dendl;
992
993 mds_authority_t oldauth = dir->authority();
994
995 if (root == dir) {
996 // i am already a subtree.
997 dir->set_dir_auth(auth);
998 } else {
999 // i am a new subtree.
1000 dout(10) << " new subtree at " << *dir << dendl;
1001 assert(subtrees.count(dir) == 0);
1002 subtrees[dir]; // create empty subtree bounds list for me.
1003 dir->get(CDir::PIN_SUBTREE);
1004
1005 // set dir_auth
1006 dir->set_dir_auth(auth);
1007
1008 // move items nested beneath me, under me.
1009 set<CDir*>::iterator p = subtrees[root].begin();
1010 while (p != subtrees[root].end()) {
1011 set<CDir*>::iterator next = p;
1012 ++next;
1013 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1014 // move under me
1015 dout(10) << " claiming child bound " << **p << dendl;
1016 subtrees[dir].insert(*p);
1017 subtrees[root].erase(p);
1018 }
1019 p = next;
1020 }
1021
1022 // i am a bound of the parent subtree.
1023 subtrees[root].insert(dir);
1024
1025 // i am now the subtree root.
1026 root = dir;
1027 }
1028
1029 set<CInode*> to_eval;
1030
1031 // verify/adjust bounds.
1032 // - these may be new, or
1033 // - beneath existing ambiguous bounds (which will be collapsed),
1034 // - but NOT beneath unambiguous bounds.
1035 for (set<CDir*>::iterator p = bounds.begin();
1036 p != bounds.end();
1037 ++p) {
1038 CDir *bound = *p;
1039
1040 // new bound?
1041 if (subtrees[dir].count(bound) == 0) {
1042 if (get_subtree_root(bound) == dir) {
1043 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1044 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1045 }
1046 else {
1047 dout(10) << " want bound " << *bound << dendl;
1048 CDir *t = get_subtree_root(bound->get_parent_dir());
1049 if (subtrees[t].count(bound) == 0) {
1050 assert(t != dir);
1051 dout(10) << " new bound " << *bound << dendl;
1052 adjust_subtree_auth(bound, t->authority());
1053 }
1054 // make sure it's nested beneath ambiguous subtree(s)
1055 while (1) {
1056 while (subtrees[dir].count(t) == 0)
1057 t = get_subtree_root(t->get_parent_dir());
1058 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1059 adjust_subtree_auth(t, auth);
1060 try_subtree_merge_at(t, &to_eval);
1061 t = get_subtree_root(bound->get_parent_dir());
1062 if (t == dir) break;
1063 }
1064 }
1065 }
1066 else {
1067 dout(10) << " already have bound " << *bound << dendl;
1068 }
1069 }
1070 // merge stray bounds?
1071 while (!subtrees[dir].empty()) {
1072 set<CDir*> copy = subtrees[dir];
1073 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1074 if (bounds.count(*p) == 0) {
1075 CDir *stray = *p;
1076 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1077 adjust_subtree_auth(stray, auth);
1078 try_subtree_merge_at(stray, &to_eval);
1079 }
1080 }
1081 // swallowing subtree may add new subtree bounds
1082 if (copy == subtrees[dir])
1083 break;
1084 }
1085
1086 // bound should now match.
1087 verify_subtree_bounds(dir, bounds);
1088
1089 show_subtrees();
1090
1091 if (!(mds->is_any_replay() || mds->is_resolve())) {
1092 for(auto in : to_eval)
1093 eval_subtree_root(in);
1094 }
1095 }
1096
1097
1098 /*
1099 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1100 * fragmentation as necessary to get an equivalent bounding set. That is, only
1101 * split if one of our frags spans the provided bounding set. Never merge.
1102 */
1103 void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1104 {
1105 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1106
1107 // sort by ino
1108 map<inodeno_t, fragset_t> byino;
1109 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1110 byino[p->ino].insert(p->frag);
1111 dout(10) << " by ino: " << byino << dendl;
1112
1113 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1114 CInode *diri = get_inode(p->first);
1115 if (!diri)
1116 continue;
1117 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1118
1119 fragtree_t tmpdft;
1120 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1121 tmpdft.force_to_leaf(g_ceph_context, *q);
1122
1123 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1124 frag_t fg = *q;
1125 list<frag_t> fgls;
1126 diri->dirfragtree.get_leaves_under(fg, fgls);
1127 if (fgls.empty()) {
1128 bool all = true;
1129 frag_t approx_fg = diri->dirfragtree[fg.value()];
1130 list<frag_t> ls;
1131 tmpdft.get_leaves_under(approx_fg, ls);
1132 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1133 if (p->second.get().count(*r) == 0) {
1134 // not bound, so the resolve message is from auth MDS of the dirfrag
1135 force_dir_fragment(diri, *r);
1136 all = false;
1137 }
1138 }
1139 if (all)
1140 fgls.push_back(approx_fg);
1141 else
1142 diri->dirfragtree.get_leaves_under(fg, fgls);
1143 }
1144 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1145 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1146 CDir *dir = diri->get_dirfrag(*r);
1147 if (dir)
1148 bounds.insert(dir);
1149 }
1150 }
1151 }
1152 }
1153
1154 void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1155 {
1156 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1157 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1158
1159 set<CDir*> bounds;
1160 get_force_dirfrag_bound_set(bound_dfs, bounds);
1161 adjust_bounded_subtree_auth(dir, bounds, auth);
1162 }
1163
1164 void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1165 {
1166 dout(10) << "map_dirfrag_set " << dfs << dendl;
1167
1168 // group by inode
1169 map<inodeno_t, fragset_t> ino_fragset;
1170 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1171 ino_fragset[p->ino].insert(p->frag);
1172
1173 // get frags
1174 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1175 p != ino_fragset.end();
1176 ++p) {
1177 CInode *in = get_inode(p->first);
1178 if (!in)
1179 continue;
1180
1181 list<frag_t> fglist;
1182 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1183 in->dirfragtree.get_leaves_under(*q, fglist);
1184
1185 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1186 << " on " << *in << dendl;
1187
1188 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1189 CDir *dir = in->get_dirfrag(*q);
1190 if (dir)
1191 result.insert(dir);
1192 }
1193 }
1194 }
1195
1196
1197
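// Walk up the (plain or projected) parent chain until a dirfrag flagged as a
// subtree root is found; returns 0 if no ancestor is a subtree root.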
1198 CDir *MDCache::get_subtree_root(CDir *dir)
1199 {
1200 // find the underlying dir that delegates (or is about to delegate) auth
1201 while (true) {
1202 if (dir->is_subtree_root())
1203 return dir;
1204 dir = dir->get_inode()->get_parent_dir();
1205 if (!dir)
1206 return 0; // none
1207 }
1208 }
1209
1210 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1211 {
1212 // find the underlying dir that delegates (or is about to delegate) auth
1213 while (true) {
1214 if (dir->is_subtree_root())
1215 return dir;
1216 dir = dir->get_inode()->get_projected_parent_dir();
1217 if (!dir)
1218 return 0; // none
1219 }
1220 }
1221
1222 void MDCache::remove_subtree(CDir *dir)
1223 {
1224 dout(10) << "remove_subtree " << *dir << dendl;
1225 assert(subtrees.count(dir));
1226 assert(subtrees[dir].empty());
1227 subtrees.erase(dir);
1228 dir->put(CDir::PIN_SUBTREE);
1229 if (dir->get_parent_dir()) {
1230 CDir *p = get_subtree_root(dir->get_parent_dir());
1231 assert(subtrees[p].count(dir));
1232 subtrees[p].erase(dir);
1233 }
1234 }
1235
1236 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1237 {
1238 assert(subtrees.count(dir));
1239 bounds = subtrees[dir];
1240 }
1241
1242 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1243 {
1244 if (subtrees.count(dir)) {
1245 // just copy them, dir is a subtree.
1246 get_subtree_bounds(dir, bounds);
1247 } else {
1248 // find them
1249 CDir *root = get_subtree_root(dir);
1250 for (set<CDir*>::iterator p = subtrees[root].begin();
1251 p != subtrees[root].end();
1252 ++p) {
1253 CDir *t = *p;
1254 while (t != root) {
1255 t = t->get_parent_dir();
1256 assert(t);
1257 if (t == dir) {
1258 bounds.insert(*p);
1259 continue;
1260 }
1261 }
1262 }
1263 }
1264 }
1265
1266 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1267 {
1268 // for debugging only.
1269 assert(subtrees.count(dir));
1270 if (bounds != subtrees[dir]) {
1271 dout(0) << "verify_subtree_bounds failed" << dendl;
1272 set<CDir*> b = bounds;
1273 for (auto &cd : subtrees[dir]) {
1274 if (bounds.count(cd)) {
1275 b.erase(cd);
1276 continue;
1277 }
1278 dout(0) << " missing bound " << *cd << dendl;
1279 }
1280 for (const auto &cd : b)
1281 dout(0) << " extra bound " << *cd << dendl;
1282 }
1283 assert(bounds == subtrees[dir]);
1284 }
1285
1286 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1287 {
1288 // for debugging only.
1289 assert(subtrees.count(dir));
1290
1291 // make sure that any bounds i do have are properly noted as such.
1292 int failed = 0;
1293 for (const auto &fg : bounds) {
1294 CDir *bd = get_dirfrag(fg);
1295 if (!bd) continue;
1296 if (subtrees[dir].count(bd) == 0) {
1297 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1298 failed++;
1299 }
1300 }
1301 assert(failed == 0);
1302 }
1303
1304 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1305 {
1306 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1307 << " to " << *newdir << dendl;
1308 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1309 }
1310
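// After a directory inode has been renamed, its dirfrags may now live under
// a different subtree root: pop the matching projected_subtree_renames entry
// (when 'pop' is set), then for each dirfrag either move it between the old
// and new parents' bound sets (if it is itself a subtree root) or re-root it
// when the old and new parents' authorities differ.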
1311 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1312 {
1313 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1314
1315 //show_subtrees();
1316
1317 CDir *newdir = diri->get_parent_dir();
1318
1319 if (pop) {
1320 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1321 assert(p != projected_subtree_renames.end());
1322 assert(!p->second.empty());
1323 assert(p->second.front().first == olddir);
1324 assert(p->second.front().second == newdir);
1325 p->second.pop_front();
1326 if (p->second.empty())
1327 projected_subtree_renames.erase(p);
1328 }
1329
1330 // adjust subtree
1331 list<CDir*> dfls;
1332 // make sure subtree dirfrags are at the front of the list
1333 diri->get_subtree_dirfrags(dfls);
1334 diri->get_nested_dirfrags(dfls);
1335 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1336 CDir *dir = *p;
1337
1338 dout(10) << "dirfrag " << *dir << dendl;
1339 CDir *oldparent = get_subtree_root(olddir);
1340 dout(10) << " old parent " << *oldparent << dendl;
1341 CDir *newparent = get_subtree_root(newdir);
1342 dout(10) << " new parent " << *newparent << dendl;
1343
1344 if (oldparent == newparent) {
1345 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1346 continue;
1347 }
1348
1349 if (dir->is_subtree_root()) {
1350 // children are fine. change parent.
1351 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1352 assert(subtrees[oldparent].count(dir));
1353 subtrees[oldparent].erase(dir);
1354 assert(subtrees.count(newparent));
1355 subtrees[newparent].insert(dir);
1356 // caller is responsible for 'eval diri'
1357 try_subtree_merge_at(dir, NULL);
1358 } else {
1359 // mid-subtree.
1360
1361 // see if any old bounds move to the new parent.
1362 list<CDir*> tomove;
1363 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1364 p != subtrees[oldparent].end();
1365 ++p) {
1366 CDir *bound = *p;
1367 CDir *broot = get_subtree_root(bound->get_parent_dir());
1368 if (broot != oldparent) {
1369 assert(broot == newparent);
1370 tomove.push_back(bound);
1371 }
1372 }
1373 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1374 CDir *bound = *p;
1375 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1376 subtrees[oldparent].erase(bound);
1377 subtrees[newparent].insert(bound);
1378 }
1379
1380 // did auth change?
1381 if (oldparent->authority() != newparent->authority()) {
1382 adjust_subtree_auth(dir, oldparent->authority());
1383 // caller is responsible for 'eval diri'
1384 try_subtree_merge_at(dir, NULL);
1385 }
1386 }
1387 }
1388
1389 show_subtrees();
1390 }
1391
1392
1393 void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1394 {
1395 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1396 p != subtrees.end();
1397 ++p) {
1398 CDir *root = p->first;
1399 if (root->is_full_dir_auth())
1400 s.insert(root);
1401 }
1402 }
1403 void MDCache::get_auth_subtrees(set<CDir*>& s)
1404 {
1405 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1406 p != subtrees.end();
1407 ++p) {
1408 CDir *root = p->first;
1409 if (root->is_auth())
1410 s.insert(root);
1411 }
1412 }
1413
1414
1415 // count.
1416
1417 int MDCache::num_subtrees()
1418 {
1419 return subtrees.size();
1420 }
1421
1422 int MDCache::num_subtrees_fullauth()
1423 {
1424 int n = 0;
1425 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1426 p != subtrees.end();
1427 ++p) {
1428 CDir *root = p->first;
1429 if (root->is_full_dir_auth())
1430 n++;
1431 }
1432 return n;
1433 }
1434
1435 int MDCache::num_subtrees_fullnonauth()
1436 {
1437 int n = 0;
1438 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1439 p != subtrees.end();
1440 ++p) {
1441 CDir *root = p->first;
1442 if (root->is_full_dir_nonauth())
1443 n++;
1444 }
1445 return n;
1446 }
1447
1448
1449
1450 // ===================================
1451 // journal and snap/cow helpers
1452
1453
1454 /*
1455 * find first inode in cache that follows given snapid. otherwise, return current.
1456 */
1457 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1458 {
1459 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1460 assert(in->last == CEPH_NOSNAP);
1461
1462 SnapRealm *realm = in->find_snaprealm();
1463 const set<snapid_t>& snaps = realm->get_snaps();
1464 dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
1465 dout(10) << " snaps " << snaps << dendl;
1466
1467 if (snaps.empty())
1468 return in;
1469
1470 for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows); // first item > follows
1471 p != snaps.end();
1472 ++p) {
1473 CInode *t = get_inode(in->ino(), *p);
1474 if (t) {
1475 in = t;
1476 dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl;
1477 break;
1478 }
1479 }
1480 return in;
1481 }
1482
1483
1484 /*
1485 * note: i'm currently cheating wrt dirty and inode.version on cow
1486 * items. instead of doing a full dir predirty, i just take the
1487 * original item's version, and set the dirty flag (via
1488 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1489 * means a special case in the dir commit clean sweep assertions.
1490 * bah.
1491 */
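// cow_inode() clones 'in' into a new snapped CInode covering
// [in->first, last] (clamping 'last' to an existing snapid), bumps
// in->first past 'last', and for any client holding writable caps records
// the per-lock snapflush state that must be gathered before the old inode
// can be considered stable.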
1492 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1493 {
1494 assert(last >= in->first);
1495
1496 SnapRealm *realm = in->find_snaprealm();
1497 const set<snapid_t>& snaps = realm->get_snaps();
1498
1499 // make sure snap inode's last match existing snapshots.
1500 // MDCache::pick_inode_snap() requires this.
1501 snapid_t last_snap = last;
1502 if (snaps.count(last) == 0) {
1503 set<snapid_t>::const_iterator p = snaps.upper_bound(last);
1504 if (p != snaps.begin()) {
1505 --p;
1506 if (*p >= in->first)
1507 last_snap = *p;
1508 }
1509 }
1510
1511 CInode *oldin = new CInode(this, true, in->first, last_snap);
1512 oldin->inode = *in->get_previous_projected_inode();
1513 oldin->symlink = in->symlink;
1514 oldin->xattrs = *in->get_previous_projected_xattrs();
1515 oldin->inode.trim_client_ranges(last);
1516
1517 if (in->first < in->oldest_snap)
1518 in->oldest_snap = in->first;
1519
1520 in->first = last+1;
1521
1522 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1523 add_inode(oldin);
1524
1525 if (in->last != CEPH_NOSNAP) {
1526 CInode *head_in = get_inode(in->ino());
1527 assert(head_in);
1528 if (head_in->split_need_snapflush(oldin, in)) {
1529 oldin->client_snap_caps = in->client_snap_caps;
1530 for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
1531 p != in->client_snap_caps.end();
1532 ++p) {
1533 SimpleLock *lock = oldin->get_lock(p->first);
1534 assert(lock);
1535 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
1536 oldin->auth_pin(lock);
1537 lock->set_state(LOCK_SNAP_SYNC); // gathering
1538 lock->get_wrlock(true);
1539 }
1540 }
1541 }
1542 return oldin;
1543 }
1544
1545 // clone caps?
1546 for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
1547 p != in->client_caps.end();
1548 ++p) {
1549 client_t client = p->first;
1550 Capability *cap = p->second;
1551 int issued = cap->issued();
1552 if ((issued & CEPH_CAP_ANY_WR) &&
1553 cap->client_follows < last) {
1554 // note in oldin
1555 for (int i = 0; i < num_cinode_locks; i++) {
1556 if (issued & cinode_lock_info[i].wr_caps) {
1557 int lockid = cinode_lock_info[i].lock;
1558 SimpleLock *lock = oldin->get_lock(lockid);
1559 assert(lock);
1560 oldin->client_snap_caps[lockid].insert(client);
1561 oldin->auth_pin(lock);
1562 lock->set_state(LOCK_SNAP_SYNC); // gathering
1563 lock->get_wrlock(true);
1564 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1565 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1566 }
1567 }
1568 cap->client_follows = last;
1569
1570 // we need snapflushes for any intervening snaps
1571 dout(10) << " snaps " << snaps << dendl;
1572 for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
1573 q != snaps.end() && *q <= last;
1574 ++q) {
1575 in->add_need_snapflush(oldin, *q, client);
1576 }
1577 } else {
1578 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1579 }
1580 }
1581
1582 return oldin;
1583 }
1584
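// journal_cow_dentry() makes sure that, before the head dentry/inode is
// mutated, any snapshots in the covered range still have an old dentry (and
// cow'ed inode) to refer to: multiversion inodes get cow_old_inode(), other
// primary linkages get a cloned snapped inode via cow_inode(), and remote
// dentries just get an old remote dentry journalled alongside.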
1585 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1586 CDentry *dn, snapid_t follows,
1587 CInode **pcow_inode, CDentry::linkage_t *dnl)
1588 {
1589 if (!dn) {
1590 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1591 return;
1592 }
1593 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1594 assert(dn->is_auth());
1595
1596 // nothing to cow on a null dentry, fix caller
1597 if (!dnl)
1598 dnl = dn->get_projected_linkage();
1599 assert(!dnl->is_null());
1600
1601 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1602 // multiversion inode.
1603 CInode *in = dnl->get_inode();
1604 SnapRealm *realm = NULL;
1605
1606 if (in->get_projected_parent_dn() != dn) {
1607 assert(follows == CEPH_NOSNAP);
1608 realm = dn->dir->inode->find_snaprealm();
1609 snapid_t dir_follows = realm->get_newest_snap();
1610
1611 if (dir_follows+1 > dn->first) {
1612 snapid_t oldfirst = dn->first;
1613 dn->first = dir_follows+1;
1614 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1615 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
1616 oldfirst, dir_follows);
1617 olddn->pre_dirty();
1618 dout(10) << " olddn " << *olddn << dendl;
1619 metablob->add_remote_dentry(olddn, true);
1620 mut->add_cow_dentry(olddn);
1621 // FIXME: adjust link count here? hmm.
1622
1623 if (dir_follows+1 > in->first)
1624 in->cow_old_inode(dir_follows, false);
1625 }
1626 }
1627
1628 if (in->snaprealm) {
1629 realm = in->snaprealm;
1630 follows = realm->get_newest_seq();
1631 } else
1632 follows = dir_follows;
1633 } else {
1634 realm = in->find_snaprealm();
1635 if (follows == CEPH_NOSNAP)
1636 follows = realm->get_newest_seq();
1637 }
1638
1639 // already cloned?
1640 if (follows < in->first) {
1641 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1642 return;
1643 }
1644
1645 if (!realm->has_snaps_in_range(in->first, follows)) {
1646 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1647 in->first = follows + 1;
1648 return;
1649 }
1650
1651 in->cow_old_inode(follows, false);
1652
1653 } else {
1654 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1655 if (follows == CEPH_NOSNAP)
1656 follows = realm->get_newest_seq();
1657
1658 // already cloned?
1659 if (follows < dn->first) {
1660 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1661 return;
1662 }
1663
1664 // update dn.first before adding old dentry to cdir's map
1665 snapid_t oldfirst = dn->first;
1666 dn->first = follows+1;
1667
1668 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1669
1670 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1671 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1672 if (in)
1673 in->first = follows+1;
1674 return;
1675 }
1676
1677 dout(10) << " dn " << *dn << dendl;
1678 if (in) {
1679 CInode *oldin = cow_inode(in, follows);
1680 mut->add_cow_inode(oldin);
1681 if (pcow_inode)
1682 *pcow_inode = oldin;
1683 CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
1684 oldin->inode.version = olddn->pre_dirty();
1685 dout(10) << " olddn " << *olddn << dendl;
1686 bool need_snapflush = !oldin->client_snap_caps.empty();
1687 if (need_snapflush)
1688 mut->ls->open_files.push_back(&oldin->item_open_file);
1689 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1690 mut->add_cow_dentry(olddn);
1691 } else {
1692 assert(dnl->is_remote());
1693 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
1694 oldfirst, follows);
1695 olddn->pre_dirty();
1696 dout(10) << " olddn " << *olddn << dendl;
1697 metablob->add_remote_dentry(olddn, true);
1698 mut->add_cow_dentry(olddn);
1699 }
1700 }
1701 }
1702
1703
1704 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1705 CInode *in, snapid_t follows,
1706 CInode **pcow_inode)
1707 {
1708 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1709 CDentry *dn = in->get_projected_parent_dn();
1710 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1711 }
1712
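// journal_dirty_inode(): base inodes are journalled via add_root(); anything
// else is journalled through its projected primary dentry, cow'ing that
// dentry first if snapshots intervene, and marking the entry to also update
// the backtrace (and possibly the data pool) when the backtrace changed.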
1713 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1714 {
1715 if (in->is_base()) {
1716 metablob->add_root(true, in, in->get_projected_inode());
1717 } else {
1718 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1719 follows = in->first - 1;
1720 CDentry *dn = in->get_projected_parent_dn();
1721 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1722 journal_cow_dentry(mut, metablob, dn, follows);
1723 if (in->get_projected_inode()->is_backtrace_updated()) {
1724 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1725 in->get_previous_projected_inode()->layout.pool_id;
1726 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1727 } else {
1728 metablob->add_primary_dentry(dn, in, true);
1729 }
1730 }
1731 }
1732
1733
1734
1735 // nested ---------------------------------------------------------------
1736
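// Rstat propagation runs in two directions: project_rstat_inode_to_frag()
// pushes an inode's (rstat - accounted_rstat) delta into its parent
// dirfrag's projected fnode, splitting the update across snapshot ranges,
// while project_rstat_frag_to_inode() later pushes a dirfrag's delta up into
// the parent inode.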
1737 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1738 int linkunlink, SnapRealm *prealm)
1739 {
1740 CDentry *parentdn = cur->get_projected_parent_dn();
1741 inode_t *curi = cur->get_projected_inode();
1742
1743 if (cur->first > first)
1744 first = cur->first;
1745
1746 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1747 << " " << *cur << dendl;
1748 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1749 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1750
1751 /*
1752 * FIXME. this incompletely propagates rstats to _old_ parents
1753 * (i.e. shortly after a directory rename). but we need full
1754 * blown hard link backpointers to make this work properly...
1755 */
1756 snapid_t floor = parentdn->first;
1757 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1758
1759 if (!prealm)
1760 prealm = parent->inode->find_snaprealm();
1761 const set<snapid_t> snaps = prealm->get_snaps();
1762
1763 if (cur->last != CEPH_NOSNAP) {
1764 assert(cur->dirty_old_rstats.empty());
1765 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1766 if (q == snaps.end() || *q > cur->last)
1767 return;
1768 }
1769
1770 if (cur->last >= floor) {
1771 bool update = true;
1772 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1773 // rename src inode is not projected in the slave rename prep case. so we should
1774       // avoid updating the inode.
1775 assert(linkunlink < 0);
1776 assert(cur->is_frozen_inode());
1777 update = false;
1778 }
1779 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1780 linkunlink, update);
1781 }
1782
1783 if (g_conf->mds_snap_rstat) {
1784 for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
1785 p != cur->dirty_old_rstats.end();
1786 ++p) {
1787 old_inode_t& old = cur->old_inodes[*p];
1788 snapid_t ofirst = MAX(old.first, floor);
1789 set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
1790 if (q == snaps.end() || *q > *p)
1791 continue;
1792 if (*p >= floor)
1793 _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
1794 }
1795 }
1796 cur->dirty_old_rstats.clear();
1797 }
1798
1799
1800 void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
1801 CDir *parent, int linkunlink, bool update_inode)
1802 {
1803 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1804 dout(20) << " inode rstat " << inode.rstat << dendl;
1805 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1806 nest_info_t delta;
1807 if (linkunlink == 0) {
1808 delta.add(inode.rstat);
1809 delta.sub(inode.accounted_rstat);
1810 } else if (linkunlink < 0) {
1811 delta.sub(inode.accounted_rstat);
1812 } else {
1813 delta.add(inode.rstat);
1814 }
1815 dout(20) << " delta " << delta << dendl;
1816
1817 if (update_inode)
1818 inode.accounted_rstat = inode.rstat;
1819
1820 while (last >= ofirst) {
1821 /*
1822 * pick fnode version to update. at each iteration, we want to
1823 * pick a segment ending in 'last' to update. split as necessary
1824 * to make that work. then, adjust first up so that we only
1825 * update one segment at a time. then loop to cover the whole
1826 * [ofirst,last] interval.
1827 */
1828 nest_info_t *prstat;
1829 snapid_t first;
1830 fnode_t *pf = parent->get_projected_fnode();
1831 if (last == CEPH_NOSNAP) {
1832 if (g_conf->mds_snap_rstat)
1833 first = MAX(ofirst, parent->first);
1834 else
1835 first = parent->first;
1836 prstat = &pf->rstat;
1837 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1838
1839 if (first > parent->first &&
1840 !(pf->rstat == pf->accounted_rstat)) {
1841 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1842 << parent->first << "," << (first-1) << "] "
1843 << " " << *prstat << "/" << pf->accounted_rstat
1844 << dendl;
1845 parent->dirty_old_rstat[first-1].first = parent->first;
1846 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1847 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1848 }
1849 parent->first = first;
1850 } else if (!g_conf->mds_snap_rstat) {
1851 // drop snapshots' rstats
1852 break;
1853 } else if (last >= parent->first) {
1854 first = parent->first;
1855 parent->dirty_old_rstat[last].first = first;
1856 parent->dirty_old_rstat[last].rstat = pf->rstat;
1857 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1858 prstat = &parent->dirty_old_rstat[last].rstat;
1859 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1860 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1861 } else {
1862 // be careful, dirty_old_rstat is a _sparse_ map.
1863 // sorry, this is ugly.
1864 first = ofirst;
1865
1866 // find any intersection with last
1867 compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
1868 if (p == parent->dirty_old_rstat.end()) {
1869 dout(20) << " no dirty_old_rstat with last >= " << last << dendl;
1870 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1871 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1872 first = parent->dirty_old_rstat.rbegin()->first+1;
1873 }
1874 } else {
1875 // *p last is >= last
1876 if (p->second.first <= last) {
1877 // *p intersects [first,last]
1878 if (p->second.first < first) {
1879 dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
1880 parent->dirty_old_rstat[first-1] = p->second;
1881 p->second.first = first;
1882 }
1883 if (p->second.first > first)
1884 first = p->second.first;
1885 if (last < p->first) {
1886 dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
1887 parent->dirty_old_rstat[last] = p->second;
1888 p->second.first = last+1;
1889 }
1890 } else {
1891 // *p is to the _right_ of [first,last]
1892 p = parent->dirty_old_rstat.lower_bound(first);
1893 // new *p last is >= first
1894 if (p->second.first <= last && // new *p isn't also to the right, and
1895 p->first >= first) { // it intersects our first bit,
1896 dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
1897 first = p->first+1;
1898 }
1899 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1900 }
1901 }
1902 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1903 parent->dirty_old_rstat[last].first = first;
1904 prstat = &parent->dirty_old_rstat[last].rstat;
1905 }
1906
1907 // apply
1908 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1909 assert(last >= first);
1910 prstat->add(delta);
1911 if (update_inode)
1912 inode.accounted_rstat = inode.rstat;
1913 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1914
1915 last = first-1;
1916 }
1917 }
1918
1919 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1920 snapid_t ofirst, snapid_t last,
1921 CInode *pin, bool cow_head)
1922 {
1923 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1924 dout(20) << " frag rstat " << rstat << dendl;
1925 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1926 nest_info_t delta = rstat;
1927 delta.sub(accounted_rstat);
1928 dout(20) << " delta " << delta << dendl;
1929
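  // Walk the snap range [ofirst,last] newest-first.  Each pass picks the inode
  // version that covers 'last' (the projected head inode, or the matching
  // old_inode for a snapped range, splitting old_inodes where needed), applies
  // the delta to its rstat, then steps 'last' back past that segment.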
1930 while (last >= ofirst) {
1931 inode_t *pi;
1932 snapid_t first;
1933 if (last == pin->last) {
1934 pi = pin->get_projected_inode();
1935 first = MAX(ofirst, pin->first);
1936 if (first > pin->first) {
1937 old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
1938 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1939 }
1940 } else {
1941 if (last >= pin->first) {
1942 first = pin->first;
1943 pin->cow_old_inode(last, cow_head);
1944 } else {
1945 // our life is easier here because old_inodes is not sparse
1946 // (although it may not begin at snapid 1)
1947 compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
1948 if (p == pin->old_inodes.end()) {
1949 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1950 break;
1951 }
1952 first = p->second.first;
1953 if (first > last) {
1954 dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
1955 //assert(p == pin->old_inodes.begin());
1956 break;
1957 }
1958 if (p->first > last) {
1959 dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
1960 << (last+1) << "," << p->first << "]" << dendl;
1961 pin->old_inodes[last] = p->second;
1962 p->second.first = last+1;
1963 pin->dirty_old_rstats.insert(p->first);
1964 }
1965 }
1966 if (first < ofirst) {
1967 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1968 << first << "," << ofirst-1 << "]" << dendl;
1969 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1970 pin->dirty_old_rstats.insert(ofirst-1);
1971 pin->old_inodes[last].first = first = ofirst;
1972 }
1973 pi = &pin->old_inodes[last].inode;
1974 pin->dirty_old_rstats.insert(last);
1975 }
1976 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1977 pi->rstat.add(delta);
1978 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1979
1980 last = first-1;
1981 }
1982 }
1983
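/*
 * Push quota and usage (rstat) information to every client holding caps on
 * this inode, and nudge replica MDSs with MGatherCaps -- presumably so client
 * caps migrate back to the auth MDS, where the quota is enforced.  To limit
 * traffic, a client is only re-notified when its usage has moved noticeably
 * since the last message we sent it (see the thresholds below).
 */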
1984 void MDCache::broadcast_quota_to_client(CInode *in)
1985 {
1986 if (!in->is_auth() || in->is_frozen())
1987 return;
1988
1989 inode_t *i = in->get_projected_inode();
1990
1991 if (!i->quota.is_enable())
1992 return;
1993
1994 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1995 it != in->client_caps.end();
1996 ++it) {
1997 Session *session = mds->get_session(it->first);
1998 if (!session || !session->connection ||
1999 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
2000 continue;
2001
2002 Capability *cap = it->second;
2003 if (cap->last_rbytes == i->rstat.rbytes &&
2004 cap->last_rsize == i->rstat.rsize())
2005 continue;
2006
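    // Re-notify heuristics: send an update once usage reaches the file limit
    // (or comes within 1/8 of the byte limit), or once usage has drifted by
    // more than ~1/16 of the headroom this client saw in our last update.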
2007 if (i->quota.max_files > 0) {
2008 if (i->rstat.rsize() >= i->quota.max_files)
2009 goto update;
2010
2011 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2012 abs(cap->last_rsize - i->rstat.rsize()))
2013 goto update;
2014 }
2015
2016 if (i->quota.max_bytes > 0) {
2017 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2018 goto update;
2019
2020 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2021 abs(cap->last_rbytes - i->rstat.rbytes))
2022 goto update;
2023 }
2024
2025 continue;
2026
2027 update:
2028 cap->last_rsize = i->rstat.rsize();
2029 cap->last_rbytes = i->rstat.rbytes;
2030
2031 MClientQuota *msg = new MClientQuota();
2032 msg->ino = in->ino();
2033 msg->rstat = i->rstat;
2034 msg->quota = i->quota;
2035 mds->send_message_client_counted(msg, session->connection);
2036 }
2037 for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
2038 it != in->replicas_end();
2039 ++it) {
2040 MGatherCaps *msg = new MGatherCaps;
2041 msg->ino = in->ino();
2042 mds->send_message_mds(msg, it->first);
2043 }
2044 }
2045
2046 /*
2047 * NOTE: we _have_ to delay the scatter if we are called during a
2048 * rejoin, because we can't twiddle locks between when the
2049 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2050 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2051 * (no requests), and a survivor acks immediately. _except_ that
2052 * during rejoin_(weak|strong) processing, we may complete a lock
2053 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2054 * scatterlock state in that case or the lock states will get out of
2055 * sync between the auth and replica.
2056 *
2057 * the simple solution is to never do the scatter here. instead, put
2058 * the scatterlock on a list if it isn't already wrlockable. this is
2059 * probably the best plan anyway, since we avoid too many
2060 * scatters/locks under normal usage.
2061 */
2062 /*
2063 * some notes on dirlock/nestlock scatterlock semantics:
2064 *
2065 * the fragstat (dirlock) will never be updated without
2066 * dirlock+nestlock wrlock held by the caller.
2067 *
2068 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2069 * data is pushed up the tree. this could be changed with some
2070 * restructuring here, but in its current form we ensure that the
2071 * fragstat+rstat _always_ reflect an accurate summation over the dir
2072 * frag, which is nice. and, we only need to track frags that need to
2073 * be nudged (and not inodes with pending rstat changes that need to
2074 * be pushed into the frag). a consequence of this is that the
2075 * accounted_rstat on scatterlock sync may not match our current
2076 * rstat. this is normal and expected.
2077 */
2078 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2079 CInode *in, CDir *parent,
2080 int flags, int linkunlink,
2081 snapid_t cfollows)
2082 {
2083 bool primary_dn = flags & PREDIRTY_PRIMARY;
2084 bool do_parent_mtime = flags & PREDIRTY_DIR;
2085 bool shallow = flags & PREDIRTY_SHALLOW;
2086
2087 assert(mds->mdlog->entry_is_open());
2088
2089 // make sure stamp is set
2090 if (mut->get_mds_stamp() == utime_t())
2091 mut->set_mds_stamp(ceph_clock_now());
2092
2093 if (in->is_base())
2094 return;
2095
2096 dout(10) << "predirty_journal_parents"
2097 << (do_parent_mtime ? " do_parent_mtime":"")
2098 << " linkunlink=" << linkunlink
2099 << (primary_dn ? " primary_dn":" remote_dn")
2100 << (shallow ? " SHALLOW":"")
2101 << " follows " << cfollows
2102 << " " << *in << dendl;
2103
2104 if (!parent) {
2105 assert(primary_dn);
2106 parent = in->get_projected_parent_dn()->get_dir();
2107 }
2108
2109 if (flags == 0 && linkunlink == 0) {
2110 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2111 blob->add_dir_context(parent);
2112 return;
2113 }
2114
2115 // build list of inodes to wrlock, dirty, and update
2116 list<CInode*> lsi;
2117 CInode *cur = in;
2118 CDentry *parentdn = NULL;
2119 bool first = true;
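  // Walk from the immediate parent dirfrag up toward the root.  At each level
  // we project fragstat deltas into the parent dirfrag's fnode and rstat
  // deltas into the parent inode, stopping early if the parent isn't auth,
  // if its nestlock/versionlock can't be wrlocked (we just mark the
  // scatterlock dirty instead), or if we propagated here very recently
  // (mds_dirstat_min_interval).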
2120 while (parent) {
2121 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2122 assert(parent->is_auth());
2123
2124 // opportunistically adjust parent dirfrag
2125 CInode *pin = parent->get_inode();
2126
2127 // inode -> dirfrag
2128 mut->auth_pin(parent);
2129 mut->add_projected_fnode(parent);
2130
2131 fnode_t *pf = parent->project_fnode();
2132 pf->version = parent->pre_dirty();
2133
2134 if (do_parent_mtime || linkunlink) {
2135 assert(mut->wrlocks.count(&pin->filelock));
2136 assert(mut->wrlocks.count(&pin->nestlock));
2137 assert(cfollows == CEPH_NOSNAP);
2138
2139 // update stale fragstat/rstat?
2140 parent->resync_accounted_fragstat();
2141 parent->resync_accounted_rstat();
2142
2143 if (do_parent_mtime) {
2144 pf->fragstat.mtime = mut->get_op_stamp();
2145 pf->fragstat.change_attr++;
2146 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2147 if (pf->fragstat.mtime > pf->rstat.rctime) {
2148 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2149 pf->rstat.rctime = pf->fragstat.mtime;
2150 } else {
2151 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2152 }
2153 }
2154 if (linkunlink) {
2155 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2156 if (in->is_dir()) {
2157 pf->fragstat.nsubdirs += linkunlink;
2158 //pf->rstat.rsubdirs += linkunlink;
2159 } else {
2160 pf->fragstat.nfiles += linkunlink;
2161 //pf->rstat.rfiles += linkunlink;
2162 }
2163 }
2164 }
2165
2166 // rstat
2167 if (!primary_dn) {
2168 // don't update parent this pass
2169 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2170 pin->versionlock.can_wrlock())) {
2171 dout(20) << " unwritable parent nestlock " << pin->nestlock
2172 << ", marking dirty rstat on " << *cur << dendl;
2173 cur->mark_dirty_rstat();
2174 } else {
2175 // if we don't hold a wrlock reference on this nestlock, take one,
2176 // because we are about to write into the dirfrag fnode and that needs
2177 // to commit before the lock can cycle.
2178 if (linkunlink) {
2179 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2180 }
2181
2182 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2183 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2184 mds->locker->wrlock_force(&pin->nestlock, mut);
2185 }
2186
2187 // now we can project the inode rstat diff into the dirfrag
2188 SnapRealm *prealm = pin->find_snaprealm();
2189
2190 snapid_t follows = cfollows;
2191 if (follows == CEPH_NOSNAP)
2192 follows = prealm->get_newest_seq();
2193
2194 snapid_t first = follows+1;
2195
2196 // first, if the frag is stale, bring it back in sync.
2197 parent->resync_accounted_rstat();
2198
2199 // now push inode rstats into frag
2200 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2201 cur->clear_dirty_rstat();
2202 }
2203
2204 bool stop = false;
2205 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2206 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2207 stop = true;
2208 }
2209
2210 // delay propagating until later?
2211 if (!stop && !first &&
2212 g_conf->mds_dirstat_min_interval > 0) {
2213 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2214 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2215 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2216 << " < " << g_conf->mds_dirstat_min_interval
2217 << ", stopping" << dendl;
2218 stop = true;
2219 } else {
2220 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2221 }
2222 }
2223
2224 // can cast only because i'm passing nowait=true in the sole user
2225 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2226 if (!stop &&
2227 mut->wrlocks.count(&pin->nestlock) == 0 &&
2228 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2229 //true
2230 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2231 )) { // ** do not initiate.. see above comment **
2232 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2233 << " on " << *pin << dendl;
2234 stop = true;
2235 }
2236 if (stop) {
2237 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2238 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2239 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2240 mut->add_updated_lock(&pin->nestlock);
2241 if (do_parent_mtime || linkunlink) {
2242 mds->locker->mark_updated_scatterlock(&pin->filelock);
2243 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2244 mut->add_updated_lock(&pin->filelock);
2245 }
2246 break;
2247 }
2248 if (!mut->wrlocks.count(&pin->versionlock))
2249 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2250
2251 assert(mut->wrlocks.count(&pin->nestlock) ||
2252 mut->is_slave());
2253
2254 pin->last_dirstat_prop = mut->get_mds_stamp();
2255
2256 // dirfrag -> diri
2257 mut->auth_pin(pin);
2258 mut->add_projected_inode(pin);
2259 lsi.push_front(pin);
2260
2261 pin->pre_cow_old_inode(); // avoid cow mayhem!
2262
2263 inode_t *pi = pin->project_inode();
2264 pi->version = pin->pre_dirty();
2265
2266 // dirstat
2267 if (do_parent_mtime || linkunlink) {
2268 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2269 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2270 bool touched_mtime = false, touched_chattr = false;
2271 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2272 pf->accounted_fragstat = pf->fragstat;
2273 if (touched_mtime)
2274 pi->mtime = pi->ctime = pi->dirstat.mtime;
2275 if (touched_chattr)
2276 pi->change_attr = pi->dirstat.change_attr;
2277 dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
2278
2279 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2280 if (pi->dirstat.size() < 0)
2281 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2282 if (pi->dirstat.size() != pf->fragstat.size()) {
2283 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2284 << parent->dirfrag() << ", inode has " << pi->dirstat
2285 << ", dirfrag has " << pf->fragstat;
2286
2287 // trust the dirfrag for now
2288 pi->dirstat = pf->fragstat;
2289
2290 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2291 }
2292 }
2293 }
2294
2295 /*
2296 * the rule here is to follow the _oldest_ parent with dirty rstat
2297 * data. if we don't propagate all data, we add ourselves to the
2298 * nudge list. that way all rstat data will (eventually) get
2299 * pushed up the tree.
2300 *
2301 * actually, no. for now, silently drop rstats for old parents. we need
2302 * hard link backpointers to do the above properly.
2303 */
2304
2305 // stop?
2306 if (pin->is_base())
2307 break;
2308 parentdn = pin->get_projected_parent_dn();
2309 assert(parentdn);
2310
2311 // rstat
2312 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2313
2314 // first, if the frag is stale, bring it back in sync.
2315 parent->resync_accounted_rstat();
2316
2317 if (g_conf->mds_snap_rstat) {
2318 for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
2319 p != parent->dirty_old_rstat.end();
2320 ++p)
2321 project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
2322 p->first, pin, true);//false);
2323 }
2324 parent->dirty_old_rstat.clear();
2325 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2326
2327 pf->accounted_rstat = pf->rstat;
2328
2329 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2330 if (pi->rstat.rbytes != pf->rstat.rbytes) {
2331 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2332 << parent->dirfrag() << ", inode has " << pi->rstat
2333 << ", dirfrag has " << pf->rstat;
2334
2335 // trust the dirfrag for now
2336 pi->rstat = pf->rstat;
2337
2338 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2339 }
2340 }
2341
2342 parent->check_rstats();
2343 broadcast_quota_to_client(pin);
2344 // next parent!
2345 cur = pin;
2346 parent = parentdn->get_dir();
2347 linkunlink = 0;
2348 do_parent_mtime = false;
2349 primary_dn = true;
2350 first = false;
2351 }
2352
2353 // now, stick it in the blob
2354 assert(parent);
2355 assert(parent->is_auth());
2356 blob->add_dir_context(parent);
2357 blob->add_dir(parent, true);
2358 for (list<CInode*>::iterator p = lsi.begin();
2359 p != lsi.end();
2360 ++p) {
2361 CInode *cur = *p;
2362 journal_dirty_inode(mut.get(), blob, cur);
2363 }
2364
2365 }
2366
2367
2368
2369
2370
2371 // ===================================
2372 // slave requests
2373
2374
2375 /*
2376 * some handlers for master requests with slaves. we need to make
2377 * sure slaves journal commits before we forget we mastered them and
2378 * remove them from the uncommitted_masters map (used during recovery
2379 * to commit|abort slaves).
2380 */
2381 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2382 metareqid_t reqid;
2383 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2384 void finish(int r) override {
2385 mdcache->_logged_master_commit(reqid);
2386 }
2387 };
2388
2389 void MDCache::log_master_commit(metareqid_t reqid)
2390 {
2391 dout(10) << "log_master_commit " << reqid << dendl;
2392 uncommitted_masters[reqid].committing = true;
2393 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2394 new C_MDC_CommittedMaster(this, reqid));
2395 }
2396
2397 void MDCache::_logged_master_commit(metareqid_t reqid)
2398 {
2399 dout(10) << "_logged_master_commit " << reqid << dendl;
2400 assert(uncommitted_masters.count(reqid));
2401 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2402 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2403 uncommitted_masters.erase(reqid);
2404 }
2405
2406 // while active...
2407
2408 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2409 {
2410 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2411 assert(uncommitted_masters.count(r));
2412 uncommitted_masters[r].slaves.erase(from);
2413 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2414 log_master_commit(r);
2415 }
2416
2417 void MDCache::logged_master_update(metareqid_t reqid)
2418 {
2419 dout(10) << "logged_master_update " << reqid << dendl;
2420 assert(uncommitted_masters.count(reqid));
2421 uncommitted_masters[reqid].safe = true;
2422 if (pending_masters.count(reqid)) {
2423 pending_masters.erase(reqid);
2424 if (pending_masters.empty())
2425 process_delayed_resolve();
2426 }
2427 }
2428
2429 /*
2430 * Master may crash after receiving all slaves' commit acks, but before journalling
2431 * the final commit. Slaves may crash after journalling the slave commit, but before
2432 * sending commit ack to the master. Commit masters with no uncommitted slaves when
2433 * resolve finishes.
2434 */
2435 void MDCache::finish_committed_masters()
2436 {
2437 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2438 p != uncommitted_masters.end();
2439 ++p) {
2440 p->second.recovering = false;
2441 if (!p->second.committing && p->second.slaves.empty()) {
2442 dout(10) << "finish_committed_masters " << p->first << dendl;
2443 log_master_commit(p->first);
2444 }
2445 }
2446 }
2447
2448 /*
2449 * at end of resolve... we must journal a commit|abort for all slave
2450 * updates, before moving on.
2451 *
2452 * this is so that the master can safely journal ECommitted on ops it
2453 * masters when it reaches up:active (all other recovering nodes must
2454 * complete resolve before that happens).
2455 */
2456 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2457 mds_rank_t from;
2458 metareqid_t reqid;
2459 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2460 void finish(int r) override {
2461 mdcache->_logged_slave_commit(from, reqid);
2462 }
2463 };
2464
2465 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2466 {
2467 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2468
2469 // send a message
2470 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2471 mds->send_message_mds(req, from);
2472 }
2473
2474
2475
2476
2477
2478
2479 // ====================================================================
2480 // import map, recovery
2481
2482 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2483 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2484 {
2485 if (subtrees.count(oldparent)) {
2486 vector<dirfrag_t>& v = subtrees[oldparent];
2487 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2488 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2489 if (*it == df) {
2490 v.erase(it);
2491 break;
2492 }
2493 }
2494 if (subtrees.count(newparent)) {
2495 vector<dirfrag_t>& v = subtrees[newparent];
2496 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2497 v.push_back(df);
2498 }
2499 }
2500
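/*
 * Build the ESubtreeMap journal event: every subtree we are auth for, its
 * bounds, and a spanning set of dirfrags back to the root so the map can be
 * replayed on its own.  Ambiguous (mid-migration) subtrees are flagged,
 * projected subtree renames are applied, and the result is simplified so a
 * replayed map can be compared directly with the live subtree map.
 */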
2501 ESubtreeMap *MDCache::create_subtree_map()
2502 {
2503 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2504 << num_subtrees_fullauth() << " fullauth"
2505 << dendl;
2506
2507 show_subtrees();
2508
2509 ESubtreeMap *le = new ESubtreeMap();
2510 mds->mdlog->_start_entry(le);
2511
2512 map<dirfrag_t, CDir*> dirs_to_add;
2513
2514 if (myin) {
2515 CDir* mydir = myin->get_dirfrag(frag_t());
2516 dirs_to_add[mydir->dirfrag()] = mydir;
2517 }
2518
2519 // include all auth subtrees, and their bounds.
2520 // and a spanning tree to tie it to the root.
2521 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2522 p != subtrees.end();
2523 ++p) {
2524 CDir *dir = p->first;
2525
2526 // journal subtree as "ours" if we are
2527 // me, -2
2528 // me, me
2529 // me, !me (may be importing and ambiguous!)
2530
2531 // so not
2532 // !me, *
2533 if (dir->get_dir_auth().first != mds->get_nodeid())
2534 continue;
2535
2536 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2537 my_ambiguous_imports.count(dir->dirfrag())) {
2538 dout(15) << " ambig subtree " << *dir << dendl;
2539 le->ambiguous_subtrees.insert(dir->dirfrag());
2540 } else {
2541 dout(15) << " subtree " << *dir << dendl;
2542 }
2543
2544 dirs_to_add[dir->dirfrag()] = dir;
2545 le->subtrees[dir->dirfrag()].clear();
2546
2547
2548 // bounds
2549 for (set<CDir*>::iterator q = p->second.begin();
2550 q != p->second.end();
2551 ++q) {
2552 CDir *bound = *q;
2553 dout(15) << " subtree bound " << *bound << dendl;
2554 dirs_to_add[bound->dirfrag()] = bound;
2555 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2556 }
2557 }
2558
2559 // apply projected renames
2560 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2561 p != projected_subtree_renames.end();
2562 ++p) {
2563 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2564 CInode *diri = p->first;
2565 CDir *olddir = q->first;
2566 CDir *newdir = q->second;
2567 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2568
2569 list<CDir*> dfls;
2570 diri->get_dirfrags(dfls);
2571 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2572 CDir *dir = *p;
2573 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2574 CDir *oldparent = get_projected_subtree_root(olddir);
2575 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2576 CDir *newparent = get_projected_subtree_root(newdir);
2577 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2578
2579 if (oldparent == newparent) {
2580 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2581 << oldparent->dirfrag() << dendl;
2582 continue;
2583 }
2584
2585 if (dir->is_subtree_root()) {
2586 if (le->subtrees.count(newparent->dirfrag()) &&
2587 oldparent->get_dir_auth() != newparent->get_dir_auth())
2588 dirs_to_add[dir->dirfrag()] = dir;
2589 // children are fine. change parent.
2590 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2591 le->subtrees);
2592 } else {
2593 // mid-subtree.
2594
2595 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2596 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2597 // if oldparent is auth, subtree is mine; include it.
2598 if (le->subtrees.count(oldparent->dirfrag())) {
2599 dirs_to_add[dir->dirfrag()] = dir;
2600 le->subtrees[dir->dirfrag()].clear();
2601 }
2602 // if newparent is auth, subtree is a new bound
2603 if (le->subtrees.count(newparent->dirfrag())) {
2604 dirs_to_add[dir->dirfrag()] = dir;
2605 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2606 }
2607 newparent = dir;
2608 }
2609
2610 // see if any old bounds move to the new parent.
2611 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2612 p != subtrees[oldparent].end();
2613 ++p) {
2614 CDir *bound = *p;
2615 if (dir->contains(bound->get_parent_dir()))
2616 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2617 le->subtrees);
2618 }
2619 }
2620 }
2621 }
2622 }
2623
2624 // simplify the journaled map. our in memory map may have more
2625 // subtrees than needed due to migrations that are just getting
2626 // started or just completing. but on replay, the "live" map will
2627 // be simple and we can do a straight comparison.
2628 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2629 if (le->ambiguous_subtrees.count(p->first))
2630 continue;
2631 unsigned i = 0;
2632 while (i < p->second.size()) {
2633 dirfrag_t b = p->second[i];
2634 if (le->subtrees.count(b) &&
2635 le->ambiguous_subtrees.count(b) == 0) {
2636 vector<dirfrag_t>& bb = le->subtrees[b];
2637 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2638 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2639 p->second.push_back(*r);
2640 dirs_to_add.erase(b);
2641 le->subtrees.erase(b);
2642 p->second.erase(p->second.begin() + i);
2643 } else {
2644 ++i;
2645 }
2646 }
2647 }
2648
2649 for (auto p : dirs_to_add) {
2650 CDir *dir = p.second;
2651 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2652 le->metablob.add_dir(dir, false);
2653 }
2654
2655 dout(15) << " subtrees " << le->subtrees << dendl;
2656 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2657
2658 //le->metablob.print(cout);
2659 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2660 return le;
2661 }
2662
2663 void MDCache::dump_resolve_status(Formatter *f) const
2664 {
2665 f->open_object_section("resolve_status");
2666 f->dump_stream("resolve_gather") << resolve_gather;
2667 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2668 f->close_section();
2669 }
2670
2671 void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2672 {
2673 dout(10) << "resolve_start" << dendl;
2674 assert(!resolve_done);
2675 resolve_done.reset(resolve_done_);
2676
2677 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2678 // if we don't have the root dir, adjust it to UNKNOWN. during
2679 // resolve we want mds0 to explicitly claim the portion of it that
2680 // it owns, so that anything beyond its bounds gets left as
2681 // unknown.
2682 CDir *rootdir = root->get_dirfrag(frag_t());
2683 if (rootdir)
2684 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2685 }
2686 resolve_gather = recovery_set;
2687 }
2688
2689 void MDCache::send_resolves()
2690 {
2691 send_slave_resolves();
2692 if (!resolve_ack_gather.empty()) {
2693 dout(10) << "send_resolves still waiting for resolve ack from ("
2694 << resolve_ack_gather << ")" << dendl;
2695 return;
2696 }
2697 if (!need_resolve_rollback.empty()) {
2698 dout(10) << "send_resolves still waiting for rollback to commit on ("
2699 << need_resolve_rollback << ")" << dendl;
2700 return;
2701 }
2702 send_subtree_resolves();
2703 }
2704
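/*
 * Send slave resolves: tell each master which of its requests we hold
 * uncommitted slave updates for (or, as a survivor, which in-flight slave
 * requests have prepared/committing state), so the master can reply with a
 * commit or abort for each one.
 */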
2705 void MDCache::send_slave_resolves()
2706 {
2707 dout(10) << "send_slave_resolves" << dendl;
2708
2709 map<mds_rank_t, MMDSResolve*> resolves;
2710
2711 if (mds->is_resolve()) {
2712 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2713 p != uncommitted_slave_updates.end();
2714 ++p) {
2715 resolves[p->first] = new MMDSResolve;
2716 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2717 q != p->second.end();
2718 ++q) {
2719 dout(10) << " including uncommitted " << q->first << dendl;
2720 resolves[p->first]->add_slave_request(q->first, false);
2721 }
2722 }
2723 } else {
2724 set<mds_rank_t> resolve_set;
2725 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2726 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2727 p != active_requests.end();
2728 ++p) {
2729 MDRequestRef& mdr = p->second;
2730 if (!mdr->is_slave())
2731 continue;
2732 if (!mdr->slave_did_prepare() && !mdr->committing) {
2733 continue;
2734 }
2735 mds_rank_t master = mdr->slave_to_mds;
2736 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2737 dout(10) << " including uncommitted " << *mdr << dendl;
2738 if (!resolves.count(master))
2739 resolves[master] = new MMDSResolve;
2740 if (!mdr->committing &&
2741 mdr->has_more() && mdr->more()->is_inode_exporter) {
2742 // re-send cap exports
2743 CInode *in = mdr->more()->rename_inode;
2744 map<client_t, Capability::Export> cap_map;
2745 in->export_client_caps(cap_map);
2746 bufferlist bl;
2747 ::encode(in->ino(), bl);
2748 ::encode(cap_map, bl);
2749 resolves[master]->add_slave_request(p->first, bl);
2750 } else {
2751 resolves[master]->add_slave_request(p->first, mdr->committing);
2752 }
2753 }
2754 }
2755 }
2756
2757 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2758 p != resolves.end();
2759 ++p) {
2760 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2761 mds->send_message_mds(p->second, p->first);
2762 resolve_ack_gather.insert(p->first);
2763 }
2764 }
2765
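/*
 * Send subtree resolves: advertise the subtrees we claim, their bounds, and
 * our ambiguous imports to resolving peers (or to the whole recovery set if
 * we are resolving ourselves).  Deferred while any import/export is still in
 * progress.
 */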
2766 void MDCache::send_subtree_resolves()
2767 {
2768 dout(10) << "send_subtree_resolves" << dendl;
2769
2770 if (migrator->is_exporting() || migrator->is_importing()) {
2771 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2772 migrator->show_importing();
2773 migrator->show_exporting();
2774 resolves_pending = true;
2775 return; // not now
2776 }
2777
2778 map<mds_rank_t, MMDSResolve*> resolves;
2779 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2780 p != recovery_set.end();
2781 ++p) {
2782 if (*p == mds->get_nodeid())
2783 continue;
2784 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2785 resolves[*p] = new MMDSResolve;
2786 }
2787
2788 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2789 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2790
2791 // known
2792 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2793 p != subtrees.end();
2794 ++p) {
2795 CDir *dir = p->first;
2796
2797 // only our subtrees
2798 if (dir->authority().first != mds->get_nodeid())
2799 continue;
2800
2801 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2802 continue; // we'll add it below
2803
2804 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2805 // ambiguous (mid-import)
2806 set<CDir*> bounds;
2807 get_subtree_bounds(dir, bounds);
2808 vector<dirfrag_t> dfls;
2809 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2810 dfls.push_back((*q)->dirfrag());
2811
2812 my_ambig_imports[dir->dirfrag()] = dfls;
2813 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2814 } else {
2815 // not ambiguous.
2816 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2817 q != resolves.end();
2818 ++q)
2819 resolves[q->first]->add_subtree(dir->dirfrag());
2820 // bounds too
2821 vector<dirfrag_t> dfls;
2822 for (set<CDir*>::iterator q = subtrees[dir].begin();
2823 q != subtrees[dir].end();
2824 ++q) {
2825 CDir *bound = *q;
2826 dfls.push_back(bound->dirfrag());
2827 }
2828
2829 my_subtrees[dir->dirfrag()] = dfls;
2830 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2831 }
2832 }
2833
2834 // ambiguous
2835 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2836 p != my_ambiguous_imports.end();
2837 ++p) {
2838 my_ambig_imports[p->first] = p->second;
2839 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2840 }
2841
2842 // simplify the claimed subtree.
2843 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2844 unsigned i = 0;
2845 while (i < p->second.size()) {
2846 dirfrag_t b = p->second[i];
2847 if (my_subtrees.count(b)) {
2848 vector<dirfrag_t>& bb = my_subtrees[b];
2849 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2850 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2851 p->second.push_back(*r);
2852 my_subtrees.erase(b);
2853 p->second.erase(p->second.begin() + i);
2854 } else {
2855 ++i;
2856 }
2857 }
2858 }
2859
2860 // send
2861 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2862 p != resolves.end();
2863 ++p) {
2864 MMDSResolve* m = p->second;
2865 m->subtrees = my_subtrees;
2866 m->ambiguous_imports = my_ambig_imports;
2867 dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2868 mds->send_message_mds(m, p->first);
2869 }
2870 resolves_pending = false;
2871 }
2872
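/*
 * A peer MDS failed: add it back to the resolve/rejoin gather sets and walk
 * the active requests, rolling back or parking any master/slave state that
 * involved the failed rank (pending witnesses, waiting_on_slave entries,
 * uncommitted masters), then kick ino lookups and cancel not-yet-started
 * fragment operations.
 */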
2873 void MDCache::handle_mds_failure(mds_rank_t who)
2874 {
2875 dout(7) << "handle_mds_failure mds." << who << dendl;
2876
2877 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2878
2879 resolve_gather.insert(who);
2880 discard_delayed_resolve(who);
2881 ambiguous_slave_updates.erase(who);
2882
2883 rejoin_gather.insert(who);
2884 rejoin_sent.erase(who); // i need to send another
2885 rejoin_ack_sent.erase(who); // i need to send another
2886 rejoin_ack_gather.erase(who); // i'll need/get another.
2887
2888 dout(10) << " resolve_gather " << resolve_gather << dendl;
2889 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2890 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2891 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2892 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2893
2894
2895 // tell the migrator too.
2896 migrator->handle_mds_failure_or_stop(who);
2897
2898 // tell the balancer too.
2899 mds->balancer->handle_mds_failure(who);
2900
2901 // clean up any requests slave to/from this node
2902 list<MDRequestRef> finish;
2903 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2904 p != active_requests.end();
2905 ++p) {
2906 MDRequestRef& mdr = p->second;
2907 // slave to the failed node?
2908 if (mdr->slave_to_mds == who) {
2909 if (mdr->slave_did_prepare()) {
2910 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2911 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2912 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2913
2914 if (!mdr->more()->waiting_on_slave.empty()) {
2915 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2916 // will rollback, no need to wait
2917 if (mdr->slave_request) {
2918 mdr->slave_request->put();
2919 mdr->slave_request = 0;
2920 }
2921 mdr->more()->waiting_on_slave.clear();
2922 }
2923 } else if (!mdr->committing) {
2924 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2925 if (mdr->slave_request || mdr->slave_rolling_back())
2926 mdr->aborted = true;
2927 else
2928 finish.push_back(mdr);
2929 }
2930 }
2931
2932 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2933 if (mdr->more()->waiting_on_slave.count(who)) {
2934 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2935 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2936 << who << dendl;
2937 mdr->more()->waiting_on_slave.erase(who);
2938 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2939 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2940 }
2941
2942 if (mdr->more()->srcdn_auth_mds == who &&
2943 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2944 // rename srcdn's auth mds failed, resolve even if I'm a survivor.
2945 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2946 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2947 }
2948 } else if (mdr->slave_request) {
2949 MMDSSlaveRequest *slave_req = mdr->slave_request;
2950 // FIXME: Slave rename request can arrive after we notice mds failure.
2951 // This can cause mds to crash (does not affect integrity of FS).
2952 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2953 slave_req->srcdn_auth == who)
2954 slave_req->mark_interrupted();
2955 }
2956
2957 // failed node is slave?
2958 if (mdr->is_master() && !mdr->committing) {
2959 if (mdr->more()->srcdn_auth_mds == who) {
2960 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2961 << who << " to recover" << dendl;
2962 assert(mdr->more()->witnessed.count(who) == 0);
2963 if (mdr->more()->is_ambiguous_auth)
2964 mdr->clear_ambiguous_auth();
2965 // rename srcdn's auth mds failed, all witnesses will rollback
2966 mdr->more()->witnessed.clear();
2967 pending_masters.erase(p->first);
2968 }
2969
2970 if (mdr->more()->witnessed.count(who)) {
2971 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2972 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2973 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2974 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2975 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2976 // until either the request is committing or the slave also fails.
2977 assert(mdr->more()->waiting_on_slave.size() == 1);
2978 pending_masters.insert(p->first);
2979 } else {
2980 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2981 << who << dendl;
2982 if (srcdn_auth >= 0)
2983 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2984
2985 // discard this peer's prepare (if any)
2986 mdr->more()->witnessed.erase(who);
2987 }
2988 }
2989
2990 if (mdr->more()->waiting_on_slave.count(who)) {
2991 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2992 << " to recover" << dendl;
2993 // retry request when peer recovers
2994 mdr->more()->waiting_on_slave.erase(who);
2995 if (mdr->more()->waiting_on_slave.empty())
2996 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2997 }
2998
2999 if (mdr->locking && mdr->locking_target_mds == who)
3000 mdr->finish_locking(mdr->locking);
3001 }
3002 }
3003
3004 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3005 p != uncommitted_masters.end();
3006 ++p) {
3007 // The failed MDS may have already committed the slave update
3008 if (p->second.slaves.count(who)) {
3009 p->second.recovering = true;
3010 p->second.slaves.erase(who);
3011 }
3012 }
3013
3014 while (!finish.empty()) {
3015 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3016 request_finish(finish.front());
3017 finish.pop_front();
3018 }
3019
3020 kick_find_ino_peers(who);
3021 kick_open_ino_peers(who);
3022
3023 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3024 p != fragments.end(); ) {
3025 dirfrag_t df = p->first;
3026 fragment_info_t& info = p->second;
3027 ++p;
3028 if (info.is_fragmenting())
3029 continue;
3030 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3031 list<CDir*> dirs;
3032 info.dirs.swap(dirs);
3033 fragments.erase(df);
3034 fragment_unmark_unfreeze_dirs(dirs);
3035 }
3036
3037 // MDCache::shutdown_export_strays() always exports strays to mds.0
3038 if (who == mds_rank_t(0))
3039 shutdown_exported_strays.clear();
3040
3041 show_subtrees();
3042 }
3043
3044 /*
3045 * handle_mds_recovery - called on another node's transition
3046 * from resolve -> active.
3047 */
3048 void MDCache::handle_mds_recovery(mds_rank_t who)
3049 {
3050 dout(7) << "handle_mds_recovery mds." << who << dendl;
3051
3052 // exclude all discover waiters. kick_discovers() will do the job
3053 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3054 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3055
3056 list<MDSInternalContextBase*> waiters;
3057
3058 // wake up any waiters in their subtrees
3059 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3060 p != subtrees.end();
3061 ++p) {
3062 CDir *dir = p->first;
3063
3064 if (dir->authority().first != who ||
3065 dir->authority().second == mds->get_nodeid())
3066 continue;
3067 assert(!dir->is_auth());
3068
3069 // wake any waiters
3070 list<CDir*> q;
3071 q.push_back(dir);
3072
3073 while (!q.empty()) {
3074 CDir *d = q.front();
3075 q.pop_front();
3076 d->take_waiting(d_mask, waiters);
3077
3078 // inode waiters too
3079 for (CDir::map_t::iterator p = d->items.begin();
3080 p != d->items.end();
3081 ++p) {
3082 CDentry *dn = p->second;
3083 CDentry::linkage_t *dnl = dn->get_linkage();
3084 if (dnl->is_primary()) {
3085 dnl->get_inode()->take_waiting(i_mask, waiters);
3086
3087 // recurse?
3088 list<CDir*> ls;
3089 dnl->get_inode()->get_dirfrags(ls);
3090 for (list<CDir*>::iterator p = ls.begin();
3091 p != ls.end();
3092 ++p) {
3093 CDir *subdir = *p;
3094 if (!subdir->is_subtree_root())
3095 q.push_back(subdir);
3096 }
3097 }
3098 }
3099 }
3100 }
3101
3102 kick_open_ino_peers(who);
3103 kick_find_ino_peers(who);
3104
3105 // queue them up.
3106 mds->queue_waiters(waiters);
3107 }
3108
3109 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3110 {
3111 dout(7) << "set_recovery_set " << s << dendl;
3112 recovery_set = s;
3113 }
3114
3115
3116 /*
3117 * during resolve state, we share resolves to determine who
3118 * is authoritative for which trees. we expect to get a resolve
3119 * from _everyone_ in the recovery_set (the mds cluster at the time of
3120 * the first failure).
3121 *
3122 * This function puts the passed message before returning
3123 */
3124 void MDCache::handle_resolve(MMDSResolve *m)
3125 {
3126 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3127 mds_rank_t from = mds_rank_t(m->get_source().num());
3128
3129 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3130 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3131 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3132 return;
3133 }
3134 // wait until we reach the resolve stage!
3135 m->put();
3136 return;
3137 }
3138
3139 discard_delayed_resolve(from);
3140
3141 // ambiguous slave requests?
3142 if (!m->slave_requests.empty()) {
3143 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3144 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3145 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3146 assert(!p->second.committing);
3147 pending_masters.insert(p->first);
3148 }
3149 }
3150
3151 if (!pending_masters.empty()) {
3152 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3153 delayed_resolve[from] = m;
3154 return;
3155 }
3156 }
3157
3158 MMDSResolveAck *ack = new MMDSResolveAck;
3159 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3160 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3161 // COMMIT
3162 if (p->second.committing) {
3163 // already committing, waiting for the OP_COMMITTED slave reply
3164 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3165 } else {
3166 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3167 ack->add_commit(p->first);
3168 }
3169 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3170
3171 if (p->second.inode_caps.length() > 0) {
3172 // slave wants to export caps (rename)
3173 assert(mds->is_resolve());
3174
3175 inodeno_t ino;
3176 map<client_t,Capability::Export> cap_exports;
3177 bufferlist::iterator q = p->second.inode_caps.begin();
3178 ::decode(ino, q);
3179 ::decode(cap_exports, q);
3180
3181 assert(get_inode(ino));
3182
3183 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3184 q != cap_exports.end();
3185 ++q) {
3186 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3187 im.cap_id = ++last_cap_id; // assign a new cap ID
3188 im.issue_seq = 1;
3189 im.mseq = q->second.mseq;
3190 }
3191
3192 // will process these caps in rejoin stage
3193 rejoin_slave_exports[ino].first = from;
3194 rejoin_slave_exports[ino].second.swap(cap_exports);
3195
3196 // send information of imported caps back to slave
3197 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3198 }
3199 } else {
3200 // ABORT
3201 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3202 assert(!p->second.committing);
3203 ack->add_abort(p->first);
3204 }
3205 }
3206 mds->send_message(ack, m->get_connection());
3207 m->put();
3208 return;
3209 }
3210
3211 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3212 dout(10) << "delay processing subtree resolve" << dendl;
3213 delayed_resolve[from] = m;
3214 return;
3215 }
3216
3217 bool survivor = false;
3218 // am i a surviving ambiguous importer?
3219 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3220 survivor = true;
3221 // check for any import success/failure (from this node)
3222 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3223 while (p != my_ambiguous_imports.end()) {
3224 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3225 ++next;
3226 CDir *dir = get_dirfrag(p->first);
3227 assert(dir);
3228 dout(10) << "checking ambiguous import " << *dir << dendl;
3229 if (migrator->is_importing(dir->dirfrag()) &&
3230 migrator->get_import_peer(dir->dirfrag()) == from) {
3231 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3232
3233 // check if sender claims the subtree
3234 bool claimed_by_sender = false;
3235 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3236 q != m->subtrees.end();
3237 ++q) {
3238 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3239 CDir *base = get_force_dirfrag(q->first, false);
3240 if (!base || !base->contains(dir))
3241 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3242
3243 bool inside = true;
3244 set<CDir*> bounds;
3245 get_force_dirfrag_bound_set(q->second, bounds);
3246 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3247 CDir *bound = *p;
3248 if (bound->contains(dir)) {
3249 inside = false; // nope, bound is dir or parent of dir, not inside.
3250 break;
3251 }
3252 }
3253 if (inside)
3254 claimed_by_sender = true;
3255 }
3256
3257 my_ambiguous_imports.erase(p); // no longer ambiguous.
3258 if (claimed_by_sender) {
3259 dout(7) << "ambiguous import failed on " << *dir << dendl;
3260 migrator->import_reverse(dir);
3261 } else {
3262 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3263 migrator->import_finish(dir, true);
3264 }
3265 }
3266 p = next;
3267 }
3268 }
3269
3270 // update my dir_auth values
3271 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3272 // migrations between other nodes)
3273 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3274 pi != m->subtrees.end();
3275 ++pi) {
3276 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3277 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3278 if (!dir)
3279 continue;
3280 adjust_bounded_subtree_auth(dir, pi->second, from);
3281 try_subtree_merge(dir);
3282 }
3283
3284 show_subtrees();
3285
3286 // note ambiguous imports too
3287 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3288 pi != m->ambiguous_imports.end();
3289 ++pi) {
3290 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3291 other_ambiguous_imports[from][pi->first].swap( pi->second );
3292 }
3293
3294 // did i get them all?
3295 resolve_gather.erase(from);
3296
3297 maybe_resolve_finish();
3298
3299 m->put();
3300 }
3301
3302 void MDCache::process_delayed_resolve()
3303 {
3304 dout(10) << "process_delayed_resolve" << dendl;
3305 map<mds_rank_t, MMDSResolve*> tmp;
3306 tmp.swap(delayed_resolve);
3307 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3308 handle_resolve(p->second);
3309 }
3310
3311 void MDCache::discard_delayed_resolve(mds_rank_t who)
3312 {
3313 if (delayed_resolve.count(who)) {
3314 delayed_resolve[who]->put();
3315 delayed_resolve.erase(who);
3316 }
3317 }
3318
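/*
 * Once a resolve has arrived from every peer in the recovery set (and all
 * resolve acks and rollbacks are done), disambiguate our imports, commit any
 * masters with no outstanding slaves, and either complete the resolve stage
 * or move on to sending rejoins.
 */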
3319 void MDCache::maybe_resolve_finish()
3320 {
3321 assert(resolve_ack_gather.empty());
3322 assert(need_resolve_rollback.empty());
3323
3324 if (!resolve_gather.empty()) {
3325 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3326 << resolve_gather << ")" << dendl;
3327 return;
3328 }
3329
3330 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3331 disambiguate_my_imports();
3332 finish_committed_masters();
3333
3334 if (resolve_done) {
3335 assert(mds->is_resolve());
3336 trim_unlinked_inodes();
3337 recalc_auth_bits(false);
3338 resolve_done.release()->complete(0);
3339 } else {
3340 maybe_send_pending_rejoins();
3341 }
3342 }
3343
3344 /* This function puts the passed message before returning */
3345 void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3346 {
3347 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3348 mds_rank_t from = mds_rank_t(ack->get_source().num());
3349
3350 if (!resolve_ack_gather.count(from) ||
3351 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3352 ack->put();
3353 return;
3354 }
3355
3356 if (ambiguous_slave_updates.count(from)) {
3357 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3358 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3359 }
3360
3361 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3362 p != ack->commit.end();
3363 ++p) {
3364 dout(10) << " commit on slave " << p->first << dendl;
3365
3366 if (ambiguous_slave_updates.count(from)) {
3367 remove_ambiguous_slave_update(p->first, from);
3368 continue;
3369 }
3370
3371 if (mds->is_resolve()) {
3372 // replay
3373 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3374 assert(su);
3375
3376 // log commit
3377 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3378 ESlaveUpdate::OP_COMMIT, su->origop),
3379 new C_MDC_SlaveCommit(this, from, p->first));
3380 mds->mdlog->flush();
3381
3382 finish_uncommitted_slave_update(p->first, from);
3383 } else {
3384 MDRequestRef mdr = request_get(p->first);
3385 // information about master imported caps
3386 if (p->second.length() > 0)
3387 mdr->more()->inode_import.claim(p->second);
3388
3389 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3390 request_finish(mdr);
3391 }
3392 }
3393
3394 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3395 p != ack->abort.end();
3396 ++p) {
3397 dout(10) << " abort on slave " << *p << dendl;
3398
3399 if (mds->is_resolve()) {
3400 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3401 assert(su);
3402
3403 // perform rollback (and journal a rollback entry)
3404 // note: this will hold up the resolve a bit, until the rollback entries journal.
3405 MDRequestRef null_ref;
3406 switch (su->origop) {
3407 case ESlaveUpdate::LINK:
3408 mds->server->do_link_rollback(su->rollback, from, null_ref);
3409 break;
3410 case ESlaveUpdate::RENAME:
3411 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3412 break;
3413 case ESlaveUpdate::RMDIR:
3414 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3415 break;
3416 default:
3417 ceph_abort();
3418 }
3419 } else {
3420 MDRequestRef mdr = request_get(*p);
3421 mdr->aborted = true;
3422 if (mdr->slave_request) {
3423 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3424 add_rollback(*p, from);
3425 } else {
3426 request_finish(mdr);
3427 }
3428 }
3429 }
3430
3431 if (!ambiguous_slave_updates.count(from))
3432 resolve_ack_gather.erase(from);
3433 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3434 send_subtree_resolves();
3435 process_delayed_resolve();
3436 }
3437
3438 ack->put();
3439 }
3440
3441 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3442 {
3443 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3444 uncommitted_slave_updates[master][reqid] = su;
3445 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3446 uncommitted_slave_rename_olddir[*p]++;
3447 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3448 uncommitted_slave_unlink[*p]++;
3449 }
3450
3451 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3452 {
3453 assert(uncommitted_slave_updates[master].count(reqid));
3454 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3455
3456 uncommitted_slave_updates[master].erase(reqid);
3457 if (uncommitted_slave_updates[master].empty())
3458 uncommitted_slave_updates.erase(master);
3459 // discard the non-auth subtree we renamed out of
3460 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3461 CInode *diri = *p;
3462 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3463 assert(it != uncommitted_slave_rename_olddir.end());
3464 it->second--;
3465 if (it->second == 0) {
3466 uncommitted_slave_rename_olddir.erase(it);
3467 list<CDir*> ls;
3468 diri->get_dirfrags(ls);
3469 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3470 CDir *root = get_subtree_root(*q);
3471 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3472 try_trim_non_auth_subtree(root);
3473 if (*q != root)
3474 break;
3475 }
3476 }
3477 } else
3478 assert(it->second > 0);
3479 }
3480 // remove the inodes that were unlinked by slave update
3481 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3482 CInode *in = *p;
3483 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3484 assert(it != uncommitted_slave_unlink.end());
3485 it->second--;
3486 if (it->second == 0) {
3487 uncommitted_slave_unlink.erase(it);
3488 if (!in->get_projected_parent_dn())
3489 mds->mdcache->remove_inode_recursive(in);
3490 } else
3491 assert(it->second > 0);
3492 }
3493 delete su;
3494 }
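
// Illustrative sketch only -- not part of the MDS code. The bookkeeping above
// (uncommitted_slave_rename_olddir / uncommitted_slave_unlink) is a plain
// reference count keyed by CInode*: add_uncommitted_slave_update() increments,
// and finish_uncommitted_slave_update() decrements, erasing the entry and
// doing the deferred cleanup once the count drops to zero. A minimal
// standalone version of that counting pattern (hypothetical names) would be:
//
//   static void slave_update_refcount_sketch()
//   {
//     std::map<int, int> refs;             // object -> pending slave updates
//     auto track  = [&](int key) { refs[key]++; };
//     auto finish = [&](int key) {
//       auto it = refs.find(key);
//       assert(it != refs.end() && it->second > 0);
//       if (--it->second == 0)
//         refs.erase(it);                  // last reference: safe to trim now
//     };
//     track(1); track(1);                  // two uncommitted updates touch key 1
//     finish(1); finish(1);                // both resolve; entry disappears
//     assert(refs.empty());
//   }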
3495
3496 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3497 {
3498
3499 MDSlaveUpdate* su = NULL;
3500 if (uncommitted_slave_updates.count(master) &&
3501 uncommitted_slave_updates[master].count(reqid)) {
3502 su = uncommitted_slave_updates[master][reqid];
3503 assert(su);
3504 }
3505 return su;
3506 }
3507
3508 void MDCache::finish_rollback(metareqid_t reqid) {
3509 assert(need_resolve_rollback.count(reqid));
3510 if (mds->is_resolve())
3511 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3512 need_resolve_rollback.erase(reqid);
3513 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3514 send_subtree_resolves();
3515 process_delayed_resolve();
3516 }
3517 }
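
// Illustrative sketch only -- not part of the MDS code. The completion check
// above is the same one used after processing a resolve ack: the resolve
// phase advances only once *both* gathers have drained, i.e. no more acks are
// expected (resolve_ack_gather) and no rollbacks are still in flight
// (need_resolve_rollback). A minimal standalone sketch of that two-part
// barrier (hypothetical names):
//
//   struct resolve_barrier {
//     std::set<int> ack_gather;        // peers we still expect an ack from
//     std::set<int> pending_rollback;  // rollbacks not yet journaled
//     template <typename OnDone>
//     void maybe_finish(OnDone&& done) {
//       if (ack_gather.empty() && pending_rollback.empty())
//         done();                      // e.g. send resolves, process delayed work
//     }
//   };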
3518
3519 void MDCache::disambiguate_other_imports()
3520 {
3521 dout(10) << "disambiguate_other_imports" << dendl;
3522
3523 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3524 // other nodes' ambiguous imports
3525 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3526 p != other_ambiguous_imports.end();
3527 ++p) {
3528 mds_rank_t who = p->first;
3529 dout(10) << "ambiguous imports for mds." << who << dendl;
3530
3531 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3532 q != p->second.end();
3533 ++q) {
3534 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3535 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3536 CDir *dir = get_force_dirfrag(q->first, recovering);
3537 if (!dir) continue;
3538
3539 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3540 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3541 dout(10) << " mds." << who << " did import " << *dir << dendl;
3542 adjust_bounded_subtree_auth(dir, q->second, who);
3543 try_subtree_merge(dir);
3544 } else {
3545 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3546 }
3547 }
3548 }
3549 other_ambiguous_imports.clear();
3550 }
3551
3552 void MDCache::disambiguate_my_imports()
3553 {
3554 dout(10) << "disambiguate_my_imports" << dendl;
3555
3556 if (!mds->is_resolve()) {
3557 assert(my_ambiguous_imports.empty());
3558 return;
3559 }
3560
3561 disambiguate_other_imports();
3562
3563 // my ambiguous imports
3564 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3565 while (!my_ambiguous_imports.empty()) {
3566 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3567
3568 CDir *dir = get_dirfrag(q->first);
3569 assert(dir);
3570
3571 if (dir->authority() != me_ambig) {
3572 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3573 cancel_ambiguous_import(dir);
3574
3575 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3576
3577 // subtree may have been swallowed by another node claiming dir
3578 // as their own.
3579 CDir *root = get_subtree_root(dir);
3580 if (root != dir)
3581 dout(10) << " subtree root is " << *root << dendl;
3582       assert(root->dir_auth.first != mds->get_nodeid()); // not us!
3583 try_trim_non_auth_subtree(root);
3584 } else {
3585 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3586 finish_ambiguous_import(q->first);
3587 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3588 }
3589 }
3590 assert(my_ambiguous_imports.empty());
3591 mds->mdlog->flush();
3592
3593 // verify all my subtrees are unambiguous!
3594 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3595 p != subtrees.end();
3596 ++p) {
3597 CDir *dir = p->first;
3598 if (dir->is_ambiguous_dir_auth()) {
3599 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3600 }
3601 assert(!dir->is_ambiguous_dir_auth());
3602 }
3603
3604 show_subtrees();
3605 }
3606
3607
3608 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3609 {
3610 assert(my_ambiguous_imports.count(base) == 0);
3611 my_ambiguous_imports[base] = bounds;
3612 }
3613
3614
3615 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3616 {
3617 // make a list
3618 vector<dirfrag_t> binos;
3619 for (set<CDir*>::iterator p = bounds.begin();
3620 p != bounds.end();
3621 ++p)
3622 binos.push_back((*p)->dirfrag());
3623
3624 // note: this can get called twice if the exporter fails during recovery
3625 if (my_ambiguous_imports.count(base->dirfrag()))
3626 my_ambiguous_imports.erase(base->dirfrag());
3627
3628 add_ambiguous_import(base->dirfrag(), binos);
3629 }
3630
3631 void MDCache::cancel_ambiguous_import(CDir *dir)
3632 {
3633 dirfrag_t df = dir->dirfrag();
3634 assert(my_ambiguous_imports.count(df));
3635 dout(10) << "cancel_ambiguous_import " << df
3636 << " bounds " << my_ambiguous_imports[df]
3637 << " " << *dir
3638 << dendl;
3639 my_ambiguous_imports.erase(df);
3640 }
3641
3642 void MDCache::finish_ambiguous_import(dirfrag_t df)
3643 {
3644 assert(my_ambiguous_imports.count(df));
3645 vector<dirfrag_t> bounds;
3646 bounds.swap(my_ambiguous_imports[df]);
3647 my_ambiguous_imports.erase(df);
3648
3649 dout(10) << "finish_ambiguous_import " << df
3650 << " bounds " << bounds
3651 << dendl;
3652 CDir *dir = get_dirfrag(df);
3653 assert(dir);
3654
3655 // adjust dir_auth, import maps
3656 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3657 try_subtree_merge(dir);
3658 }
3659
3660 void MDCache::remove_inode_recursive(CInode *in)
3661 {
3662 dout(10) << "remove_inode_recursive " << *in << dendl;
3663 list<CDir*> ls;
3664 in->get_dirfrags(ls);
3665 list<CDir*>::iterator p = ls.begin();
3666 while (p != ls.end()) {
3667 CDir *subdir = *p++;
3668
3669 dout(10) << " removing dirfrag " << subdir << dendl;
3670 CDir::map_t::iterator q = subdir->items.begin();
3671 while (q != subdir->items.end()) {
3672 CDentry *dn = q->second;
3673 ++q;
3674 CDentry::linkage_t *dnl = dn->get_linkage();
3675 if (dnl->is_primary()) {
3676 CInode *tin = dnl->get_inode();
3677 subdir->unlink_inode(dn, false);
3678 remove_inode_recursive(tin);
3679 }
3680 subdir->remove_dentry(dn);
3681 }
3682
3683 if (subdir->is_subtree_root())
3684 remove_subtree(subdir);
3685 in->close_dirfrag(subdir->dirfrag().frag);
3686 }
3687 remove_inode(in);
3688 }
3689
3690 bool MDCache::expire_recursive(
3691 CInode *in,
3692 map<mds_rank_t, MCacheExpire*>& expiremap)
3693 {
3694 assert(!in->is_auth());
3695
3696 dout(10) << __func__ << ":" << *in << dendl;
3697
3698 // Recurse into any dirfrags beneath this inode
3699 list<CDir*> ls;
3700 in->get_dirfrags(ls);
3701 for (auto subdir : ls) {
3702 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3703 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3704 return true;
3705 }
3706
3707 for (auto &it : subdir->items) {
3708 CDentry *dn = it.second;
3709 CDentry::linkage_t *dnl = dn->get_linkage();
3710 if (dnl->is_primary()) {
3711 CInode *tin = dnl->get_inode();
3712
3713 /* Remote strays with linkage (i.e. hardlinks) should not be
3714 * expired, because they may be the target of
3715 * a rename() as the owning MDS shuts down */
3716 if (!tin->is_stray() && tin->inode.nlink) {
3717 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3718 return true;
3719 }
3720
3721 const bool abort = expire_recursive(tin, expiremap);
3722 if (abort) {
3723 return true;
3724 }
3725 }
3726 if (dn->lru_is_expireable()) {
3727 trim_dentry(dn, expiremap);
3728 } else {
3729 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3730 return true;
3731 }
3732 }
3733 }
3734
3735 return false;
3736 }
3737
3738 void MDCache::trim_unlinked_inodes()
3739 {
3740 dout(7) << "trim_unlinked_inodes" << dendl;
3741 list<CInode*> q;
3742 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
3743 p != inode_map.end();
3744 ++p) {
3745 CInode *in = p->second;
3746 if (in->get_parent_dn() == NULL && !in->is_base()) {
3747 dout(7) << " will trim from " << *in << dendl;
3748 q.push_back(in);
3749 }
3750 }
3751 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3752 remove_inode_recursive(*p);
3753 }
3754
3755 /** recalc_auth_bits()
3756 * once subtree auth is disambiguated, we need to adjust all the
3757 * auth and dirty bits in our cache before moving on.
3758 */
3759 void MDCache::recalc_auth_bits(bool replay)
3760 {
3761 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3762
3763 if (root) {
3764 root->inode_auth.first = mds->mdsmap->get_root();
3765 bool auth = mds->get_nodeid() == root->inode_auth.first;
3766 if (auth) {
3767 root->state_set(CInode::STATE_AUTH);
3768 } else {
3769 root->state_clear(CInode::STATE_AUTH);
3770 if (!replay)
3771 root->state_set(CInode::STATE_REJOINING);
3772 }
3773 }
3774
3775 set<CInode*> subtree_inodes;
3776 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3777 p != subtrees.end();
3778 ++p) {
3779 if (p->first->dir_auth.first == mds->get_nodeid())
3780 subtree_inodes.insert(p->first->inode);
3781 }
3782
3783 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3784 p != subtrees.end();
3785 ++p) {
3786 if (p->first->inode->is_mdsdir()) {
3787 CInode *in = p->first->inode;
3788 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3789 if (auth) {
3790 in->state_set(CInode::STATE_AUTH);
3791 } else {
3792 in->state_clear(CInode::STATE_AUTH);
3793 if (!replay)
3794 in->state_set(CInode::STATE_REJOINING);
3795 }
3796 }
3797
3798 list<CDir*> dfq; // dirfrag queue
3799 dfq.push_back(p->first);
3800
3801 bool auth = p->first->authority().first == mds->get_nodeid();
3802 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3803
3804 while (!dfq.empty()) {
3805 CDir *dir = dfq.front();
3806 dfq.pop_front();
3807
3808 // dir
3809 if (auth) {
3810 dir->state_set(CDir::STATE_AUTH);
3811 } else {
3812 dir->state_clear(CDir::STATE_AUTH);
3813 if (!replay) {
3814 // close empty non-auth dirfrag
3815 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3816 dir->inode->close_dirfrag(dir->get_frag());
3817 continue;
3818 }
3819 dir->state_set(CDir::STATE_REJOINING);
3820 dir->state_clear(CDir::STATE_COMPLETE);
3821 if (dir->is_dirty())
3822 dir->mark_clean();
3823 }
3824 }
3825
3826 // dentries in this dir
3827 for (CDir::map_t::iterator q = dir->items.begin();
3828 q != dir->items.end();
3829 ++q) {
3830 // dn
3831 CDentry *dn = q->second;
3832 CDentry::linkage_t *dnl = dn->get_linkage();
3833 if (auth) {
3834 dn->state_set(CDentry::STATE_AUTH);
3835 } else {
3836 dn->state_clear(CDentry::STATE_AUTH);
3837 if (!replay) {
3838 dn->state_set(CDentry::STATE_REJOINING);
3839 if (dn->is_dirty())
3840 dn->mark_clean();
3841 }
3842 }
3843
3844 if (dnl->is_primary()) {
3845 // inode
3846 CInode *in = dnl->get_inode();
3847 if (auth) {
3848 in->state_set(CInode::STATE_AUTH);
3849 } else {
3850 in->state_clear(CInode::STATE_AUTH);
3851 if (!replay) {
3852 in->state_set(CInode::STATE_REJOINING);
3853 if (in->is_dirty())
3854 in->mark_clean();
3855 if (in->is_dirty_parent())
3856 in->clear_dirty_parent();
3857 // avoid touching scatterlocks for our subtree roots!
3858 if (subtree_inodes.count(in) == 0)
3859 in->clear_scatter_dirty();
3860 }
3861 }
3862 // recurse?
3863 if (in->is_dir())
3864 in->get_nested_dirfrags(dfq);
3865 }
3866 }
3867 }
3868 }
3869
3870 show_subtrees();
3871 show_cache();
3872 }
3873
3874
3875
3876 // ===========================================================================
3877 // REJOIN
3878
3879 /*
3880 * notes on scatterlock recovery:
3881 *
3882 * - recovering inode replica sends scatterlock data for any subtree
3883 * roots (the only ones that are possibly dirty).
3884 *
3885 * - surviving auth incorporates any provided scatterlock data. any
3886 * pending gathers are then finished, as with the other lock types.
3887 *
3888 * that takes care of surviving auth + (recovering replica)*.
3889 *
3890 * - surviving replica sends strong_inode, which includes current
3891 * scatterlock state, AND any dirty scatterlock data. this
3892 * provides the recovering auth with everything it might need.
3893 *
3894 * - recovering auth must pick initial scatterlock state based on
3895 * (weak|strong) rejoins.
3896 * - always assimilate scatterlock data (it can't hurt)
3897 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3898 * - include base inode in ack for all inodes that saw scatterlock content
3899 *
3900 * also, for scatter gather,
3901 *
3902 * - auth increments {frag,r}stat.version on completion of any gather.
3903 *
3904 * - auth incorporates changes in a gather _only_ if the version
3905 * matches.
3906 *
3907 * - replica discards changes any time the scatterlock syncs, and
3908 * after recovery.
3909 */
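
// Illustrative sketch only -- not part of the MDS code. The versioning rule
// above ("auth incorporates changes in a gather _only_ if the version
// matches") amounts to tagging each gather round: a replica's contribution is
// applied only when it carries the current version, and the auth bumps the
// version when the gather completes, so late or stale contributions are
// simply dropped. A minimal standalone sketch (hypothetical names):
//
//   struct scatter_stat_sketch {
//     uint64_t version = 1;            // analogous to {frag,r}stat.version
//     int64_t value = 0;
//     // replica reply: (version it gathered against, delta it accumulated)
//     bool apply_reply(uint64_t reply_version, int64_t delta) {
//       if (reply_version != version)
//         return false;                // stale; discard
//       value += delta;
//       return true;
//     }
//     void gather_complete() { ++version; }   // auth increments on completion
//   };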
3910
3911 void MDCache::dump_rejoin_status(Formatter *f) const
3912 {
3913 f->open_object_section("rejoin_status");
3914 f->dump_stream("rejoin_gather") << rejoin_gather;
3915 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3916 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3917 f->close_section();
3918 }
3919
3920 void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3921 {
3922 dout(10) << "rejoin_start" << dendl;
3923 assert(!rejoin_done);
3924 rejoin_done.reset(rejoin_done_);
3925
3926 rejoin_gather = recovery_set;
3927   // need to finish opening cap inodes before sending cache rejoins
3928 rejoin_gather.insert(mds->get_nodeid());
3929 process_imported_caps();
3930 }
3931
3932 /*
3933 * rejoin phase!
3934 *
3935  * this initiates rejoin. it should be called before we get any
3936 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3937 *
3938 * we start out by sending rejoins to everyone in the recovery set.
3939 *
3940  * if we are rejoining, send for all regions in our cache.
3941  * if we are active|stopping, send only to nodes that are rejoining.
3942 */
3943 void MDCache::rejoin_send_rejoins()
3944 {
3945 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3946
3947 if (rejoin_gather.count(mds->get_nodeid())) {
3948 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3949 rejoins_pending = true;
3950 return;
3951 }
3952 if (!resolve_gather.empty()) {
3953 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3954 << resolve_gather << ")" << dendl;
3955 rejoins_pending = true;
3956 return;
3957 }
3958
3959 assert(!migrator->is_importing());
3960 assert(!migrator->is_exporting());
3961
3962 if (!mds->is_rejoin()) {
3963 disambiguate_other_imports();
3964 }
3965
3966 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3967
3968
3969 // if i am rejoining, send a rejoin to everyone.
3970 // otherwise, just send to others who are rejoining.
3971 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3972 p != recovery_set.end();
3973 ++p) {
3974 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3975 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3976 if (mds->is_rejoin())
3977 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3978 else if (mds->mdsmap->is_rejoin(*p))
3979 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3980 }
3981
3982 if (mds->is_rejoin()) {
3983 map<client_t, set<mds_rank_t> > client_exports;
3984 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3985 assert(cap_export_targets.count(p->first));
3986 mds_rank_t target = cap_export_targets[p->first];
3987 if (rejoins.count(target) == 0)
3988 continue;
3989 rejoins[target]->cap_exports[p->first] = p->second;
3990 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3991 client_exports[q->first].insert(target);
3992 }
3993 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3994 p != client_exports.end();
3995 ++p) {
3996 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3997 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3998 rejoins[*q]->client_map[p->first] = inst;
3999 }
4000 }
4001
4002
4003 // check all subtrees
4004 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4005 p != subtrees.end();
4006 ++p) {
4007 CDir *dir = p->first;
4008 assert(dir->is_subtree_root());
4009 if (dir->is_ambiguous_dir_auth()) {
4010 // exporter is recovering, importer is survivor.
4011 assert(rejoins.count(dir->authority().first));
4012 assert(!rejoins.count(dir->authority().second));
4013 continue;
4014 }
4015
4016 // my subtree?
4017 if (dir->is_auth())
4018 continue; // skip my own regions!
4019
4020 mds_rank_t auth = dir->get_dir_auth().first;
4021 assert(auth >= 0);
4022 if (rejoins.count(auth) == 0)
4023 continue; // don't care about this node's subtrees
4024
4025 rejoin_walk(dir, rejoins[auth]);
4026 }
4027
4028 // rejoin root inodes, too
4029 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4030 p != rejoins.end();
4031 ++p) {
4032 if (mds->is_rejoin()) {
4033 // weak
4034 if (p->first == 0 && root) {
4035 p->second->add_weak_inode(root->vino());
4036 if (root->is_dirty_scattered()) {
4037 dout(10) << " sending scatterlock state on root " << *root << dendl;
4038 p->second->add_scatterlock_state(root);
4039 }
4040 }
4041 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4042 if (in)
4043 p->second->add_weak_inode(in->vino());
4044 }
4045 } else {
4046 // strong
4047 if (p->first == 0 && root) {
4048 p->second->add_strong_inode(root->vino(),
4049 root->get_replica_nonce(),
4050 root->get_caps_wanted(),
4051 root->filelock.get_state(),
4052 root->nestlock.get_state(),
4053 root->dirfragtreelock.get_state());
4054 root->state_set(CInode::STATE_REJOINING);
4055 if (root->is_dirty_scattered()) {
4056 dout(10) << " sending scatterlock state on root " << *root << dendl;
4057 p->second->add_scatterlock_state(root);
4058 }
4059 }
4060
4061 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4062 p->second->add_strong_inode(in->vino(),
4063 in->get_replica_nonce(),
4064 in->get_caps_wanted(),
4065 in->filelock.get_state(),
4066 in->nestlock.get_state(),
4067 in->dirfragtreelock.get_state());
4068 in->state_set(CInode::STATE_REJOINING);
4069 }
4070 }
4071 }
4072
4073 if (!mds->is_rejoin()) {
4074 // i am survivor. send strong rejoin.
4075 // note request remote_auth_pins, xlocks
4076 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4077 p != active_requests.end();
4078 ++p) {
4079 MDRequestRef& mdr = p->second;
4080 if (mdr->is_slave())
4081 continue;
4082 // auth pins
4083 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4084 q != mdr->remote_auth_pins.end();
4085 ++q) {
4086 if (!q->first->is_auth()) {
4087 assert(q->second == q->first->authority().first);
4088 if (rejoins.count(q->second) == 0) continue;
4089 MMDSCacheRejoin *rejoin = rejoins[q->second];
4090
4091 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4092 MDSCacheObjectInfo i;
4093 q->first->set_object_info(i);
4094 if (i.ino)
4095 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4096 else
4097 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4098
4099 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4100 mdr->more()->rename_inode == q->first)
4101 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4102 mdr->reqid, mdr->attempt);
4103 }
4104 }
4105 // xlocks
4106 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4107 q != mdr->xlocks.end();
4108 ++q) {
4109 if (!(*q)->get_parent()->is_auth()) {
4110 mds_rank_t who = (*q)->get_parent()->authority().first;
4111 if (rejoins.count(who) == 0) continue;
4112 MMDSCacheRejoin *rejoin = rejoins[who];
4113
4114 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4115 MDSCacheObjectInfo i;
4116 (*q)->get_parent()->set_object_info(i);
4117 if (i.ino)
4118 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4119 mdr->reqid, mdr->attempt);
4120 else
4121 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4122 mdr->reqid, mdr->attempt);
4123 }
4124 }
4125 // remote wrlocks
4126 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4127 q != mdr->remote_wrlocks.end();
4128 ++q) {
4129 mds_rank_t who = q->second;
4130 if (rejoins.count(who) == 0) continue;
4131 MMDSCacheRejoin *rejoin = rejoins[who];
4132
4133 dout(15) << " " << *mdr << " wrlock on " << q->second
4134 << " " << q->first->get_parent() << dendl;
4135 MDSCacheObjectInfo i;
4136 q->first->get_parent()->set_object_info(i);
4137 assert(i.ino);
4138 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4139 mdr->reqid, mdr->attempt);
4140 }
4141 }
4142 }
4143
4144 // send the messages
4145 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4146 p != rejoins.end();
4147 ++p) {
4148 assert(rejoin_sent.count(p->first) == 0);
4149 assert(rejoin_ack_gather.count(p->first) == 0);
4150 rejoin_sent.insert(p->first);
4151 rejoin_ack_gather.insert(p->first);
4152 mds->send_message_mds(p->second, p->first);
4153 }
4154 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4155 rejoins_pending = false;
4156
4157 // nothing?
4158 if (mds->is_rejoin() && rejoins.empty()) {
4159 dout(10) << "nothing to rejoin" << dendl;
4160 rejoin_gather_finish();
4161 }
4162 }
4163
4164
4165 /**
4166 * rejoin_walk - build rejoin declarations for a subtree
4167 *
4168 * @param dir subtree root
4169 * @param rejoin rejoin message
4170 *
4171 * from a rejoining node:
4172 * weak dirfrag
4173 * weak dentries (w/ connectivity)
4174 *
4175 * from a surviving node:
4176 * strong dirfrag
4177 * strong dentries (no connectivity!)
4178 * strong inodes
4179 */
4180 void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4181 {
4182 dout(10) << "rejoin_walk " << *dir << dendl;
4183
4184 list<CDir*> nested; // finish this dir, then do nested items
4185
4186 if (mds->is_rejoin()) {
4187 // WEAK
4188 rejoin->add_weak_dirfrag(dir->dirfrag());
4189 for (CDir::map_t::iterator p = dir->items.begin();
4190 p != dir->items.end();
4191 ++p) {
4192 CDentry *dn = p->second;
4193 CDentry::linkage_t *dnl = dn->get_linkage();
4194 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4195 assert(dnl->is_primary());
4196 CInode *in = dnl->get_inode();
4197 assert(dnl->get_inode()->is_dir());
4198 rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
4199 in->get_nested_dirfrags(nested);
4200 if (in->is_dirty_scattered()) {
4201 dout(10) << " sending scatterlock state on " << *in << dendl;
4202 rejoin->add_scatterlock_state(in);
4203 }
4204 }
4205 } else {
4206 // STRONG
4207 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4208 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4209 dir->state_set(CDir::STATE_REJOINING);
4210
4211 for (CDir::map_t::iterator p = dir->items.begin();
4212 p != dir->items.end();
4213 ++p) {
4214 CDentry *dn = p->second;
4215 CDentry::linkage_t *dnl = dn->get_linkage();
4216 dout(15) << " add_strong_dentry " << *dn << dendl;
4217 rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4218 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4219 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4220 dnl->is_remote() ? dnl->get_remote_d_type():0,
4221 dn->get_replica_nonce(),
4222 dn->lock.get_state());
4223 dn->state_set(CDentry::STATE_REJOINING);
4224 if (dnl->is_primary()) {
4225 CInode *in = dnl->get_inode();
4226 dout(15) << " add_strong_inode " << *in << dendl;
4227 rejoin->add_strong_inode(in->vino(),
4228 in->get_replica_nonce(),
4229 in->get_caps_wanted(),
4230 in->filelock.get_state(),
4231 in->nestlock.get_state(),
4232 in->dirfragtreelock.get_state());
4233 in->state_set(CInode::STATE_REJOINING);
4234 in->get_nested_dirfrags(nested);
4235 if (in->is_dirty_scattered()) {
4236 dout(10) << " sending scatterlock state on " << *in << dendl;
4237 rejoin->add_scatterlock_state(in);
4238 }
4239 }
4240 }
4241 }
4242
4243 // recurse into nested dirs
4244 for (list<CDir*>::iterator p = nested.begin();
4245 p != nested.end();
4246 ++p)
4247 rejoin_walk(*p, rejoin);
4248 }
4249
4250
4251 /*
4252 * i got a rejoin.
4253 * - reply with the lockstate
4254 *
4255 * if i am active|stopping,
4256 * - remove source from replica list for everything not referenced here.
4257 * This function puts the passed message before returning.
4258 */
4259 void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4260 {
4261 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4262 << " (" << m->get_payload().length() << " bytes)"
4263 << dendl;
4264
4265 switch (m->op) {
4266 case MMDSCacheRejoin::OP_WEAK:
4267 handle_cache_rejoin_weak(m);
4268 break;
4269 case MMDSCacheRejoin::OP_STRONG:
4270 handle_cache_rejoin_strong(m);
4271 break;
4272 case MMDSCacheRejoin::OP_ACK:
4273 handle_cache_rejoin_ack(m);
4274 break;
4275
4276 default:
4277 ceph_abort();
4278 }
4279 m->put();
4280 }
4281
4282
4283 /*
4284 * handle_cache_rejoin_weak
4285 *
4286 * the sender
4287 * - is recovering from their journal.
4288 * - may have incorrect (out of date) inode contents
4289 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4290 *
4291 * if the sender didn't trim_non_auth(), they
4292 * - may have incorrect (out of date) dentry/inode linkage
4293 * - may have deleted/purged inodes
4294 * and i may have to go to disk to get accurate inode contents. yuck.
4295  * This function DOES NOT put the passed message before returning
4296 */
4297 void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4298 {
4299 mds_rank_t from = mds_rank_t(weak->get_source().num());
4300
4301 // possible response(s)
4302 MMDSCacheRejoin *ack = 0; // if survivor
4303 set<vinodeno_t> acked_inodes; // if survivor
4304 set<SimpleLock *> gather_locks; // if survivor
4305 bool survivor = false; // am i a survivor?
4306
4307 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4308 survivor = true;
4309     dout(10) << "i am a survivor, and will ack immediately" << dendl;
4310 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4311
4312 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4313
4314 // check cap exports
4315 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4316 CInode *in = get_inode(p->first);
4317 assert(!in || in->is_auth());
4318 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4319 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4320 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4321 Capability::Import& im = imported_caps[p->first][q->first];
4322 if (cap) {
4323 im.cap_id = cap->get_cap_id();
4324 im.issue_seq = cap->get_last_seq();
4325 im.mseq = cap->get_mseq();
4326 } else {
4327 // all are zero
4328 }
4329 }
4330 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4331 }
4332
4333 ::encode(imported_caps, ack->imported_caps);
4334 } else {
4335 assert(mds->is_rejoin());
4336
4337 // we may have already received a strong rejoin from the sender.
4338 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4339 assert(gather_locks.empty());
4340
4341 // check cap exports.
4342 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4343
4344 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4345 CInode *in = get_inode(p->first);
4346 assert(in && in->is_auth());
4347 // note
4348 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4349 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4350 cap_imports[p->first][q->first][from] = q->second;
4351 }
4352 }
4353 }
4354
4355 // assimilate any potentially dirty scatterlock state
4356 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4357 p != weak->inode_scatterlocks.end();
4358 ++p) {
4359 CInode *in = get_inode(p->first);
4360 assert(in);
4361 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4362 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4363 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4364 if (!survivor)
4365 rejoin_potential_updated_scatterlocks.insert(in);
4366 }
4367
4368 // recovering peer may send incorrect dirfrags here. we need to
4369 // infer which dirfrag they meant. the ack will include a
4370 // strong_dirfrag that will set them straight on the fragmentation.
4371
4372 // walk weak map
4373 set<CDir*> dirs_to_share;
4374 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4375 p != weak->weak_dirfrags.end();
4376 ++p) {
4377 CInode *diri = get_inode(p->ino);
4378 if (!diri)
4379 dout(0) << " missing dir ino " << p->ino << dendl;
4380 assert(diri);
4381
4382 list<frag_t> ls;
4383 if (diri->dirfragtree.is_leaf(p->frag)) {
4384 ls.push_back(p->frag);
4385 } else {
4386 diri->dirfragtree.get_leaves_under(p->frag, ls);
4387 if (ls.empty())
4388 ls.push_back(diri->dirfragtree[p->frag.value()]);
4389 }
4390 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4391 frag_t fg = *q;
4392 CDir *dir = diri->get_dirfrag(fg);
4393 if (!dir) {
4394 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4395 continue;
4396 }
4397 assert(dir);
4398 if (dirs_to_share.count(dir)) {
4399 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4400 } else {
4401 dirs_to_share.insert(dir);
4402 unsigned nonce = dir->add_replica(from);
4403 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4404 if (ack) {
4405 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4406 ack->add_dirfrag_base(dir);
4407 }
4408 }
4409 }
4410 }
4411
4412 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4413 p != weak->weak.end();
4414 ++p) {
4415 CInode *diri = get_inode(p->first);
4416 if (!diri)
4417 dout(0) << " missing dir ino " << p->first << dendl;
4418 assert(diri);
4419
4420 // weak dentries
4421 CDir *dir = 0;
4422 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4423 q != p->second.end();
4424 ++q) {
4425 // locate proper dirfrag.
4426 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4427 frag_t fg = diri->pick_dirfrag(q->first.name);
4428 if (!dir || dir->get_frag() != fg) {
4429 dir = diri->get_dirfrag(fg);
4430 if (!dir)
4431 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4432 assert(dir);
4433 assert(dirs_to_share.count(dir));
4434 }
4435
4436 // and dentry
4437 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4438 assert(dn);
4439 CDentry::linkage_t *dnl = dn->get_linkage();
4440 assert(dnl->is_primary());
4441
4442 if (survivor && dn->is_replica(from))
4443 dentry_remove_replica(dn, from, gather_locks);
4444 unsigned dnonce = dn->add_replica(from);
4445 dout(10) << " have " << *dn << dendl;
4446 if (ack)
4447 ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4448 dnl->get_inode()->ino(), inodeno_t(0), 0,
4449 dnonce, dn->lock.get_replica_state());
4450
4451 // inode
4452 CInode *in = dnl->get_inode();
4453 assert(in);
4454
4455 if (survivor && in->is_replica(from))
4456 inode_remove_replica(in, from, true, gather_locks);
4457 unsigned inonce = in->add_replica(from);
4458 dout(10) << " have " << *in << dendl;
4459
4460 // scatter the dirlock, just in case?
4461 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4462 in->filelock.set_state(LOCK_MIX);
4463
4464 if (ack) {
4465 acked_inodes.insert(in->vino());
4466 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4467 bufferlist bl;
4468 in->_encode_locks_state_for_rejoin(bl, from);
4469 ack->add_inode_locks(in, inonce, bl);
4470 }
4471 }
4472 }
4473
4474 // weak base inodes? (root, stray, etc.)
4475 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4476 p != weak->weak_inodes.end();
4477 ++p) {
4478 CInode *in = get_inode(*p);
4479 assert(in); // hmm fixme wrt stray?
4480 if (survivor && in->is_replica(from))
4481 inode_remove_replica(in, from, true, gather_locks);
4482 unsigned inonce = in->add_replica(from);
4483 dout(10) << " have base " << *in << dendl;
4484
4485 if (ack) {
4486 acked_inodes.insert(in->vino());
4487 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4488 bufferlist bl;
4489 in->_encode_locks_state_for_rejoin(bl, from);
4490 ack->add_inode_locks(in, inonce, bl);
4491 }
4492 }
4493
4494 assert(rejoin_gather.count(from));
4495 rejoin_gather.erase(from);
4496 if (survivor) {
4497 // survivor. do everything now.
4498 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4499 p != weak->inode_scatterlocks.end();
4500 ++p) {
4501 CInode *in = get_inode(p->first);
4502 assert(in);
4503 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4504 acked_inodes.insert(in->vino());
4505 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4506 }
4507
4508 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4509 mds->send_message(ack, weak->get_connection());
4510
4511 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4512 if (!(*p)->is_stable())
4513 mds->locker->eval_gather(*p);
4514 }
4515 } else {
4516 // done?
4517 if (rejoin_gather.empty()) {
4518 rejoin_gather_finish();
4519 } else {
4520 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4521 }
4522 }
4523 }
4524
4525 class C_MDC_RejoinGatherFinish : public MDCacheContext {
4526 public:
4527 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4528 void finish(int r) override {
4529 mdcache->rejoin_gather_finish();
4530 }
4531 };
4532
4533 /*
4534  * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4535 *
4536 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4537  * ack, the replica does not exist, and we can remove it from our replica maps.
4538 */
4539 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4540 set<vinodeno_t>& acked_inodes,
4541 set<SimpleLock *>& gather_locks)
4542 {
4543 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4544
4545 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
4546 p != inode_map.end();
4547 ++p) {
4548 CInode *in = p->second;
4549
4550 // inode?
4551 if (in->is_auth() &&
4552 in->is_replica(from) &&
4553 (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
4554 inode_remove_replica(in, from, false, gather_locks);
4555 dout(10) << " rem " << *in << dendl;
4556 }
4557
4558 if (!in->is_dir()) continue;
4559
4560 list<CDir*> dfs;
4561 in->get_dirfrags(dfs);
4562 for (list<CDir*>::iterator p = dfs.begin();
4563 p != dfs.end();
4564 ++p) {
4565 CDir *dir = *p;
4566
4567 if (dir->is_auth() &&
4568 dir->is_replica(from) &&
4569 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4570 dir->remove_replica(from);
4571 dout(10) << " rem " << *dir << dendl;
4572 }
4573
4574 // dentries
4575 for (CDir::map_t::iterator p = dir->items.begin();
4576 p != dir->items.end();
4577 ++p) {
4578 CDentry *dn = p->second;
4579
4580 if (dn->is_replica(from) &&
4581 (ack == NULL ||
4582 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4583 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
4584 dentry_remove_replica(dn, from, gather_locks);
4585 dout(10) << " rem " << *dn << dendl;
4586 }
4587 }
4588 }
4589 }
4590 }
4591
4592
4593 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4594 {
4595 CInode *in = new CInode(this, true, 1, last);
4596 in->inode.ino = ino;
4597 in->state_set(CInode::STATE_REJOINUNDEF);
4598 add_inode(in);
4599 rejoin_undef_inodes.insert(in);
4600 dout(10) << " invented " << *in << dendl;
4601 return in;
4602 }
4603
4604 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4605 {
4606 CInode *in = get_inode(df.ino);
4607 if (!in)
4608 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4609 if (!in->is_dir()) {
4610 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4611 in->inode.mode = S_IFDIR;
4612 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4613 }
4614 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4615 dir->state_set(CDir::STATE_REJOINUNDEF);
4616 rejoin_undef_dirfrags.insert(dir);
4617 dout(10) << " invented " << *dir << dendl;
4618 return dir;
4619 }
4620
4621 /* This function DOES NOT put the passed message before returning */
4622 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4623 {
4624 mds_rank_t from = mds_rank_t(strong->get_source().num());
4625
4626 // only a recovering node will get a strong rejoin.
4627 assert(mds->is_rejoin());
4628
4629 // assimilate any potentially dirty scatterlock state
4630 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4631 p != strong->inode_scatterlocks.end();
4632 ++p) {
4633 CInode *in = get_inode(p->first);
4634 assert(in);
4635 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4636 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4637 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4638 rejoin_potential_updated_scatterlocks.insert(in);
4639 }
4640
4641 rejoin_unlinked_inodes[from].clear();
4642
4643 // surviving peer may send incorrect dirfrag here (maybe they didn't
4644 // get the fragment notify, or maybe we rolled back?). we need to
4645 // infer the right frag and get them with the program. somehow.
4646 // we don't normally send ACK.. so we'll need to bundle this with
4647 // MISSING or something.
4648
4649 // strong dirfrags/dentries.
4650 // also process auth_pins, xlocks.
4651 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4652 p != strong->strong_dirfrags.end();
4653 ++p) {
4654 CInode *diri = get_inode(p->first.ino);
4655 if (!diri)
4656 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4657 CDir *dir = diri->get_dirfrag(p->first.frag);
4658 bool refragged = false;
4659 if (dir) {
4660 dout(10) << " have " << *dir << dendl;
4661 } else {
4662 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4663 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4664 else if (diri->dirfragtree.is_leaf(p->first.frag))
4665 dir = rejoin_invent_dirfrag(p->first);
4666 }
4667 if (dir) {
4668 dir->add_replica(from, p->second.nonce);
4669 dir->dir_rep = p->second.dir_rep;
4670 } else {
4671 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4672 list<frag_t> ls;
4673 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4674 if (ls.empty())
4675 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4676 dout(10) << " maps to frag(s) " << ls << dendl;
4677 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4678 CDir *dir = diri->get_dirfrag(*q);
4679 if (!dir)
4680 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4681 else
4682 dout(10) << " have(approx) " << *dir << dendl;
4683 dir->add_replica(from, p->second.nonce);
4684 dir->dir_rep = p->second.dir_rep;
4685 }
4686 refragged = true;
4687 }
4688
4689 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4690 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4691 q != dmap.end();
4692 ++q) {
4693 CDentry *dn;
4694 if (!refragged)
4695 dn = dir->lookup(q->first.name, q->first.snapid);
4696 else {
4697 frag_t fg = diri->pick_dirfrag(q->first.name);
4698 dir = diri->get_dirfrag(fg);
4699 assert(dir);
4700 dn = dir->lookup(q->first.name, q->first.snapid);
4701 }
4702 if (!dn) {
4703 if (q->second.is_remote()) {
4704 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4705 q->second.first, q->first.snapid);
4706 } else if (q->second.is_null()) {
4707 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4708 } else {
4709 CInode *in = get_inode(q->second.ino, q->first.snapid);
4710 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4711 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4712 }
4713 dout(10) << " invented " << *dn << dendl;
4714 }
4715 CDentry::linkage_t *dnl = dn->get_linkage();
4716
4717 // dn auth_pin?
4718 if (strong->authpinned_dentries.count(p->first) &&
4719 strong->authpinned_dentries[p->first].count(q->first)) {
4720 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4721 r != strong->authpinned_dentries[p->first][q->first].end();
4722 ++r) {
4723 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4724
4725 // get/create slave mdrequest
4726 MDRequestRef mdr;
4727 if (have_request(r->reqid))
4728 mdr = request_get(r->reqid);
4729 else
4730 mdr = request_start_slave(r->reqid, r->attempt, strong);
4731 mdr->auth_pin(dn);
4732 }
4733 }
4734
4735 // dn xlock?
4736 if (strong->xlocked_dentries.count(p->first) &&
4737 strong->xlocked_dentries[p->first].count(q->first)) {
4738 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4739 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4740 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4741 assert(mdr->is_auth_pinned(dn));
4742 if (!mdr->xlocks.count(&dn->versionlock)) {
4743 assert(dn->versionlock.can_xlock_local());
4744 dn->versionlock.get_xlock(mdr, mdr->get_client());
4745 mdr->xlocks.insert(&dn->versionlock);
4746 mdr->locks.insert(&dn->versionlock);
4747 }
4748 if (dn->lock.is_stable())
4749 dn->auth_pin(&dn->lock);
4750 dn->lock.set_state(LOCK_XLOCK);
4751 dn->lock.get_xlock(mdr, mdr->get_client());
4752 mdr->xlocks.insert(&dn->lock);
4753 mdr->locks.insert(&dn->lock);
4754 }
4755
4756 dn->add_replica(from, q->second.nonce);
4757 dout(10) << " have " << *dn << dendl;
4758
4759 if (dnl->is_primary()) {
4760 if (q->second.is_primary()) {
4761 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4762 // the survivor missed MDentryUnlink+MDentryLink messages ?
4763 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4764 CInode *in = get_inode(q->second.ino, q->first.snapid);
4765 assert(in);
4766 assert(in->get_parent_dn());
4767 rejoin_unlinked_inodes[from].insert(in);
4768 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4769 }
4770 } else {
4771 // the survivor missed MDentryLink message ?
4772 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4773 	  dout(7) << " sender doesn't have primary dentry" << dendl;
4774 }
4775 } else {
4776 if (q->second.is_primary()) {
4777 // the survivor missed MDentryUnlink message ?
4778 CInode *in = get_inode(q->second.ino, q->first.snapid);
4779 assert(in);
4780 assert(in->get_parent_dn());
4781 rejoin_unlinked_inodes[from].insert(in);
4782 dout(7) << " sender has primary dentry but we don't" << dendl;
4783 }
4784 }
4785 }
4786 }
4787
4788 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4789 p != strong->strong_inodes.end();
4790 ++p) {
4791 CInode *in = get_inode(p->first);
4792 assert(in);
4793 in->add_replica(from, p->second.nonce);
4794 dout(10) << " have " << *in << dendl;
4795
4796 MMDSCacheRejoin::inode_strong &is = p->second;
4797
4798 // caps_wanted
4799 if (is.caps_wanted) {
4800 in->mds_caps_wanted[from] = is.caps_wanted;
4801 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4802 << " on " << *in << dendl;
4803 }
4804
4805 // scatterlocks?
4806 // infer state from replica state:
4807 // * go to MIX if they might have wrlocks
4808     //  * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4809 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4810 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4811 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4812
4813 // auth pin?
4814 if (strong->authpinned_inodes.count(in->vino())) {
4815 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4816 r != strong->authpinned_inodes[in->vino()].end();
4817 ++r) {
4818 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4819
4820 // get/create slave mdrequest
4821 MDRequestRef mdr;
4822 if (have_request(r->reqid))
4823 mdr = request_get(r->reqid);
4824 else
4825 mdr = request_start_slave(r->reqid, r->attempt, strong);
4826 if (strong->frozen_authpin_inodes.count(in->vino())) {
4827 assert(!in->get_num_auth_pins());
4828 mdr->freeze_auth_pin(in);
4829 } else {
4830 assert(!in->is_frozen_auth_pin());
4831 }
4832 mdr->auth_pin(in);
4833 }
4834 }
4835 // xlock(s)?
4836 if (strong->xlocked_inodes.count(in->vino())) {
4837 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4838 q != strong->xlocked_inodes[in->vino()].end();
4839 ++q) {
4840 SimpleLock *lock = in->get_lock(q->first);
4841 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4842 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4843 assert(mdr->is_auth_pinned(in));
4844 if (!mdr->xlocks.count(&in->versionlock)) {
4845 assert(in->versionlock.can_xlock_local());
4846 in->versionlock.get_xlock(mdr, mdr->get_client());
4847 mdr->xlocks.insert(&in->versionlock);
4848 mdr->locks.insert(&in->versionlock);
4849 }
4850 if (lock->is_stable())
4851 in->auth_pin(lock);
4852 lock->set_state(LOCK_XLOCK);
4853 if (lock == &in->filelock)
4854 in->loner_cap = -1;
4855 lock->get_xlock(mdr, mdr->get_client());
4856 mdr->xlocks.insert(lock);
4857 mdr->locks.insert(lock);
4858 }
4859 }
4860 }
4861 // wrlock(s)?
4862 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4863 p != strong->wrlocked_inodes.end();
4864 ++p) {
4865 CInode *in = get_inode(p->first);
4866 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4867 q != p->second.end();
4868 ++q) {
4869 SimpleLock *lock = in->get_lock(q->first);
4870 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4871 r != q->second.end();
4872 ++r) {
4873 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4874 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4875 if (in->is_auth())
4876 assert(mdr->is_auth_pinned(in));
4877 lock->set_state(LOCK_MIX);
4878 if (lock == &in->filelock)
4879 in->loner_cap = -1;
4880 lock->get_wrlock(true);
4881 mdr->wrlocks.insert(lock);
4882 mdr->locks.insert(lock);
4883 }
4884 }
4885 }
4886
4887 // done?
4888 assert(rejoin_gather.count(from));
4889 rejoin_gather.erase(from);
4890 if (rejoin_gather.empty()) {
4891 rejoin_gather_finish();
4892 } else {
4893 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4894 }
4895 }
4896
4897 /* This function DOES NOT put the passed message before returning */
4898 void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4899 {
4900 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4901 mds_rank_t from = mds_rank_t(ack->get_source().num());
4902
4903 // for sending cache expire message
4904 set<CInode*> isolated_inodes;
4905 set<CInode*> refragged_inodes;
4906
4907 // dirs
4908 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4909 p != ack->strong_dirfrags.end();
4910 ++p) {
4911 // we may have had incorrect dir fragmentation; refragment based
4912     // on what the auth tells us.
4913 CDir *dir = get_dirfrag(p->first);
4914 if (!dir) {
4915 dir = get_force_dirfrag(p->first, false);
4916 if (dir)
4917 refragged_inodes.insert(dir->get_inode());
4918 }
4919 if (!dir) {
4920 CInode *diri = get_inode(p->first.ino);
4921 if (!diri) {
4922 // barebones inode; the full inode loop below will clean up.
4923 diri = new CInode(this, false);
4924 diri->inode.ino = p->first.ino;
4925 diri->inode.mode = S_IFDIR;
4926 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4927 add_inode(diri);
4928 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4929 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4930 dout(10) << " add inode " << *diri << dendl;
4931 } else {
4932 diri->inode_auth = CDIR_AUTH_DEFAULT;
4933 isolated_inodes.insert(diri);
4934 dout(10) << " unconnected dirfrag " << p->first << dendl;
4935 }
4936 }
4937 // barebones dirfrag; the full dirfrag loop below will clean up.
4938 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4939 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4940 (dir->authority() != CDIR_AUTH_UNDEF &&
4941 dir->authority().first != from))
4942 adjust_subtree_auth(dir, from);
4943 dout(10) << " add dirfrag " << *dir << dendl;
4944 }
4945
4946 dir->set_replica_nonce(p->second.nonce);
4947 dir->state_clear(CDir::STATE_REJOINING);
4948 dout(10) << " got " << *dir << dendl;
4949
4950 // dentries
4951 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4952 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4953 q != dmap.end();
4954 ++q) {
4955 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4956 if(!dn)
4957 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4958
4959 CDentry::linkage_t *dnl = dn->get_linkage();
4960
4961 assert(dn->last == q->first.snapid);
4962 if (dn->first != q->second.first) {
4963 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4964 dn->first = q->second.first;
4965 }
4966
4967 // may have bad linkage if we missed dentry link/unlink messages
4968 if (dnl->is_primary()) {
4969 CInode *in = dnl->get_inode();
4970 if (!q->second.is_primary() ||
4971 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4972 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4973 dir->unlink_inode(dn);
4974 }
4975 } else if (dnl->is_remote()) {
4976 if (!q->second.is_remote() ||
4977 q->second.remote_ino != dnl->get_remote_ino() ||
4978 q->second.remote_d_type != dnl->get_remote_d_type()) {
4979 dout(10) << " had bad linkage for " << *dn << dendl;
4980 dir->unlink_inode(dn);
4981 }
4982 } else {
4983 if (!q->second.is_null())
4984 dout(10) << " had bad linkage for " << *dn << dendl;
4985 }
4986
4987 // hmm, did we have the proper linkage here?
4988 if (dnl->is_null() && !q->second.is_null()) {
4989 if (q->second.is_remote()) {
4990 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4991 } else {
4992 CInode *in = get_inode(q->second.ino, q->first.snapid);
4993 if (!in) {
4994 // barebones inode; assume it's dir, the full inode loop below will clean up.
4995 in = new CInode(this, false, q->second.first, q->first.snapid);
4996 in->inode.ino = q->second.ino;
4997 in->inode.mode = S_IFDIR;
4998 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4999 add_inode(in);
5000 dout(10) << " add inode " << *in << dendl;
5001 } else if (in->get_parent_dn()) {
5002 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5003 << ", unlinking " << *in << dendl;
5004 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5005 }
5006 dn->dir->link_primary_inode(dn, in);
5007 isolated_inodes.erase(in);
5008 }
5009 }
5010
5011 dn->set_replica_nonce(q->second.nonce);
5012 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
5013 dn->state_clear(CDentry::STATE_REJOINING);
5014 dout(10) << " got " << *dn << dendl;
5015 }
5016 }
5017
5018 for (set<CInode*>::iterator p = refragged_inodes.begin();
5019 p != refragged_inodes.end();
5020 ++p) {
5021 list<CDir*> ls;
5022 (*p)->get_nested_dirfrags(ls);
5023 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5024 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5025 continue;
5026 assert((*q)->get_num_any() == 0);
5027 (*p)->close_dirfrag((*q)->get_frag());
5028 }
5029 }
5030
5031 // full dirfrags
5032 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5033 p != ack->dirfrag_bases.end();
5034 ++p) {
5035 CDir *dir = get_dirfrag(p->first);
5036 assert(dir);
5037 bufferlist::iterator q = p->second.begin();
5038 dir->_decode_base(q);
5039 dout(10) << " got dir replica " << *dir << dendl;
5040 }
5041
5042 // full inodes
5043 bufferlist::iterator p = ack->inode_base.begin();
5044 while (!p.end()) {
5045 inodeno_t ino;
5046 snapid_t last;
5047 bufferlist basebl;
5048 ::decode(ino, p);
5049 ::decode(last, p);
5050 ::decode(basebl, p);
5051 CInode *in = get_inode(ino, last);
5052 assert(in);
5053 bufferlist::iterator q = basebl.begin();
5054 in->_decode_base(q);
5055 dout(10) << " got inode base " << *in << dendl;
5056 }
5057
5058 // inodes
5059 p = ack->inode_locks.begin();
5060 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5061 while (!p.end()) {
5062 inodeno_t ino;
5063 snapid_t last;
5064 __u32 nonce;
5065 bufferlist lockbl;
5066 ::decode(ino, p);
5067 ::decode(last, p);
5068 ::decode(nonce, p);
5069 ::decode(lockbl, p);
5070
5071 CInode *in = get_inode(ino, last);
5072 assert(in);
5073 in->set_replica_nonce(nonce);
5074 bufferlist::iterator q = lockbl.begin();
5075 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
5076 in->state_clear(CInode::STATE_REJOINING);
5077 dout(10) << " got inode locks " << *in << dendl;
5078 }
5079
5080   // FIXME: This can happen if the entire subtree, together with the inode the subtree root
5081   // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5082 assert(isolated_inodes.empty());
5083
5084 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5085 bufferlist::iterator bp = ack->imported_caps.begin();
5086 ::decode(peer_imported, bp);
5087
5088 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5089 p != peer_imported.end();
5090 ++p) {
5091 assert(cap_exports.count(p->first));
5092 assert(cap_export_targets.count(p->first));
5093 assert(cap_export_targets[p->first] == from);
5094 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5095 q != p->second.end();
5096 ++q) {
5097 assert(cap_exports[p->first].count(q->first));
5098
5099 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5100 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5101 assert(session);
5102
5103 // mark client caps stale.
5104 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5105 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5106 mds->get_osd_epoch_barrier());
5107 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5108 (q->second.cap_id > 0 ? from : -1), 0);
5109 mds->send_message_client_counted(m, session);
5110
5111 cap_exports[p->first].erase(q->first);
5112 }
5113 assert(cap_exports[p->first].empty());
5114 }
5115
5116 // done?
5117 assert(rejoin_ack_gather.count(from));
5118 rejoin_ack_gather.erase(from);
5119 if (mds->is_rejoin()) {
5120
5121 if (rejoin_gather.empty()) {
5122 // eval unstable scatter locks after all wrlocks are rejoined.
5123 while (!rejoin_eval_locks.empty()) {
5124 SimpleLock *lock = rejoin_eval_locks.front();
5125 rejoin_eval_locks.pop_front();
5126 if (!lock->is_stable())
5127 mds->locker->eval_gather(lock);
5128 }
5129 }
5130
5131 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5132 rejoin_ack_gather.empty()) {
5133 // finally, kickstart past snap parent opens
5134 open_snap_parents();
5135 } else {
5136 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5137 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5138 }
5139 } else {
5140 // survivor.
5141 mds->queue_waiters(rejoin_waiters);
5142 }
5143 }
5144
5145 /**
5146 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5147 *
5148 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5149 * messages that clean these guys up...
5150 */
5151 void MDCache::rejoin_trim_undef_inodes()
5152 {
5153 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5154
5155 while (!rejoin_undef_inodes.empty()) {
5156 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5157 CInode *in = *p;
5158 rejoin_undef_inodes.erase(p);
5159
5160 in->clear_replica_map();
5161
5162 // close out dirfrags
5163 if (in->is_dir()) {
5164 list<CDir*> dfls;
5165 in->get_dirfrags(dfls);
5166 for (list<CDir*>::iterator p = dfls.begin();
5167 p != dfls.end();
5168 ++p) {
5169 CDir *dir = *p;
5170 dir->clear_replica_map();
5171
5172 for (CDir::map_t::iterator p = dir->items.begin();
5173 p != dir->items.end();
5174 ++p) {
5175 CDentry *dn = p->second;
5176 dn->clear_replica_map();
5177
5178 dout(10) << " trimming " << *dn << dendl;
5179 dir->remove_dentry(dn);
5180 }
5181
5182 dout(10) << " trimming " << *dir << dendl;
5183 in->close_dirfrag(dir->dirfrag().frag);
5184 }
5185 }
5186
5187 CDentry *dn = in->get_parent_dn();
5188 if (dn) {
5189 dn->clear_replica_map();
5190 dout(10) << " trimming " << *dn << dendl;
5191 dn->dir->remove_dentry(dn);
5192 } else {
5193 dout(10) << " trimming " << *in << dendl;
5194 remove_inode(in);
5195 }
5196 }
5197
5198 assert(rejoin_undef_inodes.empty());
5199 }
5200
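// rejoin_gather_finish() is invoked once the rejoin gather set has drained.
// It may return early and be re-driven later: open_undef_inodes_dirfrags()
// returns true while undef inodes/dirfrags are still being fetched, and
// process_imported_caps() returns true while missing inodes or client
// sessions are still being opened. Only after both complete do we choose
// lock states, identify files to recover, and send the rejoin acks.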
5201 void MDCache::rejoin_gather_finish()
5202 {
5203 dout(10) << "rejoin_gather_finish" << dendl;
5204 assert(mds->is_rejoin());
5205
5206 if (open_undef_inodes_dirfrags())
5207 return;
5208
5209 if (process_imported_caps())
5210 return;
5211
5212 choose_lock_states_and_reconnect_caps();
5213
5214 identify_files_to_recover();
5215 rejoin_send_acks();
5216
5217 // signal completion of fetches, rejoin_gather_finish, etc.
5218 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5219 rejoin_ack_gather.erase(mds->get_nodeid());
5220
5221 // did we already get our acks too?
5222 if (rejoin_ack_gather.empty()) {
5223 // finally, kickstart past snap parent opens
5224 open_snap_parents();
5225 }
5226 }
5227
5228 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5229 inodeno_t ino;
5230 public:
5231 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5232 void finish(int r) override {
5233 mdcache->rejoin_open_ino_finish(ino, r);
5234 }
5235 };
5236
5237 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5238 {
5239 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5240
5241 if (ret < 0) {
5242 cap_imports_missing.insert(ino);
5243 } else if (ret == mds->get_nodeid()) {
5244 assert(get_inode(ino));
5245 } else {
5246 auto p = cap_imports.find(ino);
5247 assert(p != cap_imports.end());
5248 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5249 assert(q->second.count(MDS_RANK_NONE));
5250 assert(q->second.size() == 1);
5251 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5252 }
5253 cap_imports.erase(p);
5254 }
5255
5256 assert(cap_imports_num_opening > 0);
5257 cap_imports_num_opening--;
5258
5259 if (cap_imports_num_opening == 0) {
5260 if (rejoin_gather.empty())
5261 rejoin_gather_finish();
5262 else if (rejoin_gather.count(mds->get_nodeid()))
5263 process_imported_caps();
5264 }
5265 }
5266
5267 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5268 public:
5269 map<client_t,entity_inst_t> client_map;
5270 map<client_t,uint64_t> sseqmap;
5271
5272 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5273 MDCacheLogContext(c), client_map(cm) {}
5274 void finish(int r) override {
5275 assert(r == 0);
5276 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5277 }
5278 };
5279
5280 void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5281 map<client_t,uint64_t>& sseqmap)
5282 {
5283 dout(10) << "rejoin_open_sessions_finish" << dendl;
5284 mds->server->finish_force_open_sessions(client_map, sseqmap);
5285 if (rejoin_gather.empty())
5286 rejoin_gather_finish();
5287 }
5288
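// process_imported_caps() is the re-entrant driver for cap reconnect: it
// opens inodes that reconnecting clients hold caps on but that are not in
// cache (via open_ino), and force-opens client sessions (journalling an
// ESessions event) when some are missing; in both cases it returns true and
// is re-run from the corresponding finish context. Once everything is
// available it attaches reconnected caps, plus caps exported to us by slave
// renames, to their inodes and returns false.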
5289 bool MDCache::process_imported_caps()
5290 {
5291 dout(10) << "process_imported_caps" << dendl;
5292
5293 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5294 CInode *in = get_inode(p->first);
5295 if (in) {
5296 assert(in->is_auth());
5297 cap_imports_missing.erase(p->first);
5298 continue;
5299 }
5300 if (cap_imports_missing.count(p->first) > 0)
5301 continue;
5302
5303 cap_imports_num_opening++;
5304 dout(10) << " opening missing ino " << p->first << dendl;
5305 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5306 }
5307
5308 if (cap_imports_num_opening > 0)
5309 return true;
5310
5311 // were we called by rejoin_gather_finish()?
5312 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5313 // are the sessions for imported caps all open?
5314 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5315 p != rejoin_client_map.end();
5316 ++p) {
5317 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5318 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5319 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5320 ESessions *le = new ESessions(pv, rejoin_client_map);
5321 mds->mdlog->start_submit_entry(le, finish);
5322 mds->mdlog->flush();
5323 rejoin_client_map.clear();
5324 return true;
5325 }
5326 }
5327 rejoin_client_map.clear();
5328
5329 // process caps that were exported by slave rename
5330 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5331 p != rejoin_slave_exports.end();
5332 ++p) {
5333 CInode *in = get_inode(p->first);
5334 assert(in);
5335 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5336 q != p->second.second.end();
5337 ++q) {
5338 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5339 assert(session);
5340
5341 Capability *cap = in->get_client_cap(q->first);
5342 if (!cap)
5343 cap = in->add_client_cap(q->first, session);
5344 cap->merge(q->second, true);
5345
5346 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5347 assert(cap->get_last_seq() == im.issue_seq);
5348 assert(cap->get_mseq() == im.mseq);
5349 cap->set_cap_id(im.cap_id);
5350 // send cap import because we assigned a new cap ID
5351 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5352 p->second.first, CEPH_CAP_FLAG_AUTH);
5353 }
5354 }
5355 rejoin_slave_exports.clear();
5356 rejoin_imported_caps.clear();
5357
5358 // process cap imports
5359 // ino -> client -> frommds -> capex
5360 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5361 CInode *in = get_inode(p->first);
5362 if (!in) {
5363 dout(10) << " still missing ino " << p->first
5364 << ", will try again after replayed client requests" << dendl;
5365 ++p;
5366 continue;
5367 }
5368 assert(in->is_auth());
5369 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5370 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5371 assert(session);
5372 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5373 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5374 add_reconnected_cap(q->first, in->ino(), r->second);
5375 if (r->first >= 0) {
5376 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5377 cap->inc_mseq();
5378 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5379
5380 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5381 im.cap_id = cap->get_cap_id();
5382 im.issue_seq = cap->get_last_seq();
5383 im.mseq = cap->get_mseq();
5384 }
5385 }
5386 }
5387 cap_imports.erase(p++); // remove and move on
5388 }
5389 } else {
5390 trim_non_auth();
5391
5392 rejoin_gather.erase(mds->get_nodeid());
5393 maybe_send_pending_rejoins();
5394
5395 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5396 rejoin_gather_finish();
5397 }
5398 return false;
5399 }
5400
5401 void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5402 {
5403 // are this realm's parents fully open?
5404 if (realm->have_past_parents_open()) {
5405 dout(10) << " have past snap parents for realm " << *realm
5406 << " on " << *realm->inode << dendl;
5407 if (reconnect) {
5408 // finish off client snaprealm reconnects?
5409 auto p = reconnected_snaprealms.find(realm->inode->ino());
5410 if (p != reconnected_snaprealms.end()) {
5411 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5412 finish_snaprealm_reconnect(q->first, realm, q->second);
5413 reconnected_snaprealms.erase(p);
5414 }
5415 }
5416 } else {
5417 if (!missing_snap_parents.count(realm->inode)) {
5418 dout(10) << " MISSING past snap parents for realm " << *realm
5419 << " on " << *realm->inode << dendl;
5420 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5421 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5422 } else {
5423 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5424 << " on " << *realm->inode << dendl;
5425 }
5426 }
5427 }
5428
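// rebuild_need_snapflush() walks the snapshotted (non-head) versions of an
// inode from snap_follows toward the head. For each one it puts the locks
// listed in cinode_lock_info into LOCK_SNAP_SYNC with a wrlock held, so the
// client's pending snapflush can be received and journalled before those
// locks are released.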
5429 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5430 client_t client, snapid_t snap_follows)
5431 {
5432 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5433
5434 const set<snapid_t>& snaps = realm->get_snaps();
5435 snapid_t follows = snap_follows;
5436
5437 while (true) {
5438 CInode *in = pick_inode_snap(head_in, follows);
5439 if (in == head_in)
5440 break;
5441 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5442
5443 /* TODO: we can check the reconnected/flushing caps to find
5444 * which locks need gathering */
5445 for (int i = 0; i < num_cinode_locks; i++) {
5446 int lockid = cinode_lock_info[i].lock;
5447 SimpleLock *lock = in->get_lock(lockid);
5448 assert(lock);
5449 in->client_snap_caps[lockid].insert(client);
5450 in->auth_pin(lock);
5451 lock->set_state(LOCK_SNAP_SYNC);
5452 lock->get_wrlock(true);
5453 }
5454
5455 for (auto p = snaps.lower_bound(in->first);
5456 p != snaps.end() && *p <= in->last;
5457 ++p) {
5458 head_in->add_need_snapflush(in, *p, client);
5459 }
5460
5461 follows = in->last;
5462 }
5463 }
5464
5465 /*
5466 * choose lock states based on reconnected caps
5467 */
5468 void MDCache::choose_lock_states_and_reconnect_caps()
5469 {
5470 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5471
5472 map<client_t,MClientSnap*> splits;
5473
5474 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator i = inode_map.begin();
5475 i != inode_map.end();
5476 ++i) {
5477 CInode *in = i->second;
5478
5479 if (in->last != CEPH_NOSNAP)
5480 continue;
5481
5482 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5483 in->mark_dirty_rstat();
5484
5485 auto p = reconnected_caps.find(in->ino());
5486
5487 int dirty_caps = 0;
5488 if (p != reconnected_caps.end()) {
5489 for (const auto &it : p->second)
5490 dirty_caps |= it.second.dirty_caps;
5491 }
5492 in->choose_lock_states(dirty_caps);
5493 dout(15) << " chose lock states on " << *in << dendl;
5494
5495 SnapRealm *realm = in->find_snaprealm();
5496
5497 check_realm_past_parents(realm, realm == in->snaprealm);
5498
5499 if (p != reconnected_caps.end()) {
5500 bool missing_snap_parent = false;
5501 // also, make sure client's cap is in the correct snaprealm.
5502 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5503 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5504 if (realm->have_past_parents_open()) {
5505 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5506 } else {
5507 missing_snap_parent = true;
5508 }
5509 }
5510
5511 if (q->second.realm_ino == realm->inode->ino()) {
5512 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5513 } else {
5514 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5515 << " != " << realm->inode->ino() << dendl;
5516 if (realm->have_past_parents_open()) {
5517 // ok, include in a split message _now_.
5518 prepare_realm_split(realm, q->first, in->ino(), splits);
5519 } else {
5520 // send the split later.
5521 missing_snap_parent = true;
5522 }
5523 }
5524 }
5525 if (missing_snap_parent)
5526 missing_snap_parents[realm->inode].insert(in);
5527 }
5528 }
5529
5530 send_snaps(splits);
5531 }
5532
5533 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5534 map<client_t,MClientSnap*>& splits)
5535 {
5536 MClientSnap *snap;
5537 if (splits.count(client) == 0) {
5538 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5539 snap->head.split = realm->inode->ino();
5540 realm->build_snap_trace(snap->bl);
5541
5542 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5543 p != realm->open_children.end();
5544 ++p)
5545 snap->split_realms.push_back((*p)->inode->ino());
5546
5547 } else
5548 snap = splits[client];
5549 snap->split_inos.push_back(ino);
5550 }
5551
5552 void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5553 {
5554 dout(10) << "send_snaps" << dendl;
5555
5556 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5557 p != splits.end();
5558 ++p) {
5559 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5560 if (session) {
5561 dout(10) << " client." << p->first
5562 << " split " << p->second->head.split
5563 << " inos " << p->second->split_inos
5564 << dendl;
5565 mds->send_message_client_counted(p->second, session);
5566 } else {
5567 dout(10) << " no session for client." << p->first << dendl;
5568 p->second->put();
5569 }
5570 }
5571 splits.clear();
5572 }
5573
5574
5575 /*
5576 * remove any items from logsegment open_file lists that don't have
5577 * any caps
5578 */
5579 void MDCache::clean_open_file_lists()
5580 {
5581 dout(10) << "clean_open_file_lists" << dendl;
5582
5583 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5584 p != mds->mdlog->segments.end();
5585 ++p) {
5586 LogSegment *ls = p->second;
5587
5588 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5589 while (!q.end()) {
5590 CInode *in = *q;
5591 ++q;
5592 if (in->last == CEPH_NOSNAP) {
5593 if (!in->is_any_caps_wanted()) {
5594 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5595 in->item_open_file.remove_myself();
5596 }
5597 } else if (in->last != CEPH_NOSNAP) {
5598 if (in->client_snap_caps.empty()) {
5599 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5600 in->item_open_file.remove_myself();
5601 }
5602 }
5603 }
5604 }
5605 }
5606
5607
5608
5609 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5610 {
5611 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5612 << " on " << *in << dendl;
5613 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5614 if (!session) {
5615 dout(10) << " no session for client." << client << dendl;
5616 return NULL;
5617 }
5618
5619 Capability *cap = in->reconnect_cap(client, icr, session);
5620
5621 if (frommds >= 0) {
5622 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5623 cap->inc_mseq();
5624 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5625 }
5626
5627 return cap;
5628 }
5629
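// export_remaining_imported_caps() is the cleanup path for cap imports that
// never found their inode: each affected client is sent a CEPH_CAP_OP_EXPORT
// (dropping the cap), any contexts in cap_reconnect_waiters are queued, and
// a cluster log warning lists the inodes involved.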
5630 void MDCache::export_remaining_imported_caps()
5631 {
5632 dout(10) << "export_remaining_imported_caps" << dendl;
5633
5634 stringstream warn_str;
5635
5636 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5637 warn_str << " ino " << p->first << "\n";
5638 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5639 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5640 if (session) {
5641 // mark client caps stale.
5642 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5643 stale->set_cap_peer(0, 0, 0, -1, 0);
5644 mds->send_message_client_counted(stale, q->first);
5645 }
5646 }
5647
5648 mds->heartbeat_reset();
5649 }
5650
5651 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5652 p != cap_reconnect_waiters.end();
5653 ++p)
5654 mds->queue_waiters(p->second);
5655
5656 cap_imports.clear();
5657 cap_reconnect_waiters.clear();
5658
5659 if (warn_str.peek() != EOF) {
5660 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5661 mds->clog->warn(warn_str);
5662 }
5663 }
5664
5665 void MDCache::try_reconnect_cap(CInode *in, Session *session)
5666 {
5667 client_t client = session->info.get_client();
5668 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5669 if (rc) {
5670 in->reconnect_cap(client, *rc, session);
5671 dout(10) << "try_reconnect_cap client." << client
5672 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5673 << " issue " << ccap_string(rc->capinfo.issued)
5674 << " on " << *in << dendl;
5675 remove_replay_cap_reconnect(in->ino(), client);
5676
5677 if (in->is_replicated()) {
5678 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5679 } else {
5680 int dirty_caps = 0;
5681 auto p = reconnected_caps.find(in->ino());
5682 if (p != reconnected_caps.end()) {
5683 auto q = p->second.find(client);
5684 if (q != p->second.end())
5685 dirty_caps = q->second.dirty_caps;
5686 }
5687 in->choose_lock_states(dirty_caps);
5688 dout(15) << " chose lock states on " << *in << dendl;
5689 }
5690
5691 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5692 cap_reconnect_waiters.find(in->ino());
5693 if (it != cap_reconnect_waiters.end()) {
5694 mds->queue_waiters(it->second);
5695 cap_reconnect_waiters.erase(it);
5696 }
5697 }
5698 }
5699
5700
5701
5702 // -------
5703 // cap imports and delayed snap parent opens
5704
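// do_cap_import() sends the client a CEPH_CAP_OP_IMPORT for a reconnected or
// migrated cap, including the realm's snap trace. If the snaprealm's past
// parents are not all open yet, the message cannot be built, so the inode is
// auth-pinned, the cap suppressed, and the inode queued in
// delayed_imported_caps until the past parents have been opened.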
5705 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5706 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5707 int peer, int p_flags)
5708 {
5709 client_t client = session->info.inst.name.num();
5710 SnapRealm *realm = in->find_snaprealm();
5711 if (realm->have_past_parents_open()) {
5712 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5713 if (cap->get_last_seq() == 0) // reconnected cap
5714 cap->inc_last_seq();
5715 cap->set_last_issue();
5716 cap->set_last_issue_stamp(ceph_clock_now());
5717 cap->clear_new();
5718 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5719 in->ino(),
5720 realm->inode->ino(),
5721 cap->get_cap_id(), cap->get_last_seq(),
5722 cap->pending(), cap->wanted(), 0,
5723 cap->get_mseq(), mds->get_osd_epoch_barrier());
5724 in->encode_cap_message(reap, cap);
5725 realm->build_snap_trace(reap->snapbl);
5726 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5727 mds->send_message_client_counted(reap, session);
5728 } else {
5729 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5730 << cap->get_mseq() << " on " << *in << dendl;
5731 in->auth_pin(this);
5732 cap->inc_suppress();
5733 delayed_imported_caps[client].insert(in);
5734 missing_snap_parents[in].size();
5735 }
5736 }
5737
5738 void MDCache::do_delayed_cap_imports()
5739 {
5740 dout(10) << "do_delayed_cap_imports" << dendl;
5741
5742 assert(delayed_imported_caps.empty());
5743 }
5744
5745 struct C_MDC_OpenSnapParents : public MDCacheContext {
5746 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5747 void finish(int r) override {
5748 mdcache->open_snap_parents();
5749 }
5750 };
5751
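// open_snap_parents() retries until every inode in missing_snap_parents has
// its past snap parents open. Realms still fetching register a sub in the
// MDSGatherBuilder, whose finisher simply re-runs this function; inodes that
// become ready get their pending snapflushes rebuilt, realm splits sent, and
// snaprealm reconnects finished. When nothing is left, rejoin_done completes.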
5752 void MDCache::open_snap_parents()
5753 {
5754 dout(10) << "open_snap_parents" << dendl;
5755
5756 map<client_t,MClientSnap*> splits;
5757 MDSGatherBuilder gather(g_ceph_context);
5758
5759 auto p = missing_snap_parents.begin();
5760 while (p != missing_snap_parents.end()) {
5761 CInode *in = p->first;
5762 assert(in->snaprealm);
5763 if (in->snaprealm->open_parents(gather.new_sub())) {
5764 dout(10) << " past parents now open on " << *in << dendl;
5765
5766 for (CInode *child : p->second) {
5767 auto q = reconnected_caps.find(child->ino());
5768 assert(q != reconnected_caps.end());
5769 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5770 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5771 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5772 }
5773 // make sure client's cap is in the correct snaprealm.
5774 if (r->second.realm_ino != in->ino()) {
5775 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5776 }
5777 }
5778 }
5779
5780 missing_snap_parents.erase(p++);
5781
5782 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5783
5784 // finish off client snaprealm reconnects?
5785 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5786 if (q != reconnected_snaprealms.end()) {
5787 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5788 r != q->second.end();
5789 ++r)
5790 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5791 reconnected_snaprealms.erase(q);
5792 }
5793 } else {
5794 dout(10) << " opening past parents on " << *in << dendl;
5795 ++p;
5796 }
5797 }
5798
5799 send_snaps(splits);
5800
5801 if (gather.has_subs()) {
5802 dout(10) << "open_snap_parents - waiting for "
5803 << gather.num_subs_remaining() << dendl;
5804 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5805 gather.activate();
5806 } else {
5807 if (!reconnected_snaprealms.empty()) {
5808 stringstream warn_str;
5809 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5810 p != reconnected_snaprealms.end();
5811 ++p) {
5812 warn_str << " unconnected snaprealm " << p->first << "\n";
5813 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5814 q != p->second.end();
5815 ++q)
5816 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5817 }
5818 mds->clog->warn() << "open_snap_parents has:";
5819 mds->clog->warn(warn_str);
5820 }
5821 assert(rejoin_waiters.empty());
5822 assert(missing_snap_parents.empty());
5823 dout(10) << "open_snap_parents - all open" << dendl;
5824 do_delayed_cap_imports();
5825
5826 assert(rejoin_done);
5827 rejoin_done.release()->complete(0);
5828 reconnected_caps.clear();
5829 }
5830 }
5831
5832 bool MDCache::open_undef_inodes_dirfrags()
5833 {
5834 dout(10) << "open_undef_inodes_dirfrags "
5835 << rejoin_undef_inodes.size() << " inodes "
5836 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5837
5838 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5839
5840 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5841 p != rejoin_undef_inodes.end();
5842 ++p) {
5843 CInode *in = *p;
5844 assert(!in->is_base());
5845 fetch_queue.insert(in->get_parent_dir());
5846 }
5847
5848 if (fetch_queue.empty())
5849 return false;
5850
5851 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5852 for (set<CDir*>::iterator p = fetch_queue.begin();
5853 p != fetch_queue.end();
5854 ++p) {
5855 CDir *dir = *p;
5856 CInode *diri = dir->get_inode();
5857 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5858 continue;
5859 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5860 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5861 dir->fetch(gather.new_sub());
5862 }
5863 assert(gather.has_subs());
5864 gather.activate();
5865 return true;
5866 }
5867
5868 void MDCache::opened_undef_inode(CInode *in) {
5869 dout(10) << "opened_undef_inode " << *in << dendl;
5870 rejoin_undef_inodes.erase(in);
5871 if (in->is_dir()) {
5872 // FIXME: re-hash dentries if necessary
5873 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5874 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5875 CDir *dir = in->get_dirfrag(frag_t());
5876 assert(dir);
5877 rejoin_undef_dirfrags.erase(dir);
5878 in->force_dirfrags();
5879 list<CDir*> ls;
5880 in->get_dirfrags(ls);
5881 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5882 rejoin_undef_dirfrags.insert(*p);
5883 }
5884 }
5885 }
5886
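// finish_snaprealm_reconnect() brings a reconnecting client's view of a
// snaprealm up to date: if the client's reported seq is older than the
// realm's newest seq, the client is sent a CEPH_SNAP_OP_UPDATE carrying the
// current snap trace; otherwise there is nothing to send.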
5887 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5888 {
5889 if (seq < realm->get_newest_seq()) {
5890 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5891 << realm->get_newest_seq()
5892 << " on " << *realm << dendl;
5893 // send an update
5894 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5895 if (session) {
5896 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5897 realm->build_snap_trace(snap->bl);
5898 mds->send_message_client_counted(snap, session);
5899 } else {
5900 dout(10) << " ...or not, no session for this client!" << dendl;
5901 }
5902 } else {
5903 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5904 << " on " << *realm << dendl;
5905 }
5906 }
5907
5908
5909
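// rejoin_send_acks() first re-replicates the ancestry of unlinked inodes to
// the ranks that still know them, then walks every auth subtree and, for
// each replica of a dirfrag, dentry or inode, adds strong state (with a
// fresh replica nonce and current lock state) to that rank's MMDSCacheRejoin
// ack. The caps we imported are encoded into each ack before it is sent.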
5910 void MDCache::rejoin_send_acks()
5911 {
5912 dout(7) << "rejoin_send_acks" << dendl;
5913
5914 // replicate stray
5915 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5916 p != rejoin_unlinked_inodes.end();
5917 ++p) {
5918 for (set<CInode*>::iterator q = p->second.begin();
5919 q != p->second.end();
5920 ++q) {
5921 CInode *in = *q;
5922 dout(7) << " unlinked inode " << *in << dendl;
5923 // inode expired
5924 if (!in->is_replica(p->first))
5925 continue;
5926 while (1) {
5927 CDentry *dn = in->get_parent_dn();
5928 if (dn->is_replica(p->first))
5929 break;
5930 dn->add_replica(p->first);
5931 CDir *dir = dn->get_dir();
5932 if (dir->is_replica(p->first))
5933 break;
5934 dir->add_replica(p->first);
5935 in = dir->get_inode();
5936 if (in->is_replica(p->first))
5937 break;
5938 in->add_replica(p->first);
5939 if (in->is_base())
5940 break;
5941 }
5942 }
5943 }
5944 rejoin_unlinked_inodes.clear();
5945
5946 // send acks to everyone in the recovery set
5947 map<mds_rank_t,MMDSCacheRejoin*> acks;
5948 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5949 p != recovery_set.end();
5950 ++p) {
5951 if (rejoin_ack_sent.count(*p))
5952 continue;
5953 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5954 }
5955
5956 rejoin_ack_sent = recovery_set;
5957
5958 // walk subtrees
5959 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5960 p != subtrees.end();
5961 ++p) {
5962 CDir *dir = p->first;
5963 if (!dir->is_auth())
5964 continue;
5965 dout(10) << "subtree " << *dir << dendl;
5966
5967 // auth items in this subtree
5968 list<CDir*> dq;
5969 dq.push_back(dir);
5970
5971 while (!dq.empty()) {
5972 CDir *dir = dq.front();
5973 dq.pop_front();
5974
5975 // dir
5976 for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
5977 r != dir->replicas_end();
5978 ++r) {
5979 auto it = acks.find(r->first);
5980 if (it == acks.end())
5981 continue;
5982 it->second->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
5983 it->second->add_dirfrag_base(dir);
5984 }
5985
5986 for (CDir::map_t::iterator q = dir->items.begin();
5987 q != dir->items.end();
5988 ++q) {
5989 CDentry *dn = q->second;
5990 CDentry::linkage_t *dnl = dn->get_linkage();
5991
5992 // inode
5993 CInode *in = NULL;
5994 if (dnl->is_primary())
5995 in = dnl->get_inode();
5996
5997 // dentry
5998 for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
5999 r != dn->replicas_end();
6000 ++r) {
6001 auto it = acks.find(r->first);
6002 if (it == acks.end())
6003 continue;
6004 it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
6005 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6006 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6007 dnl->is_remote() ? dnl->get_remote_d_type():0,
6008 ++r->second,
6009 dn->lock.get_replica_state());
6010 // peer missed the MDentryLink message?
6011 if (in && !in->is_replica(r->first))
6012 in->add_replica(r->first);
6013 }
6014
6015 if (!in)
6016 continue;
6017
6018 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
6019 r != in->replicas_end();
6020 ++r) {
6021 auto it = acks.find(r->first);
6022 if (it == acks.end())
6023 continue;
6024 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6025 bufferlist bl;
6026 in->_encode_locks_state_for_rejoin(bl, r->first);
6027 it->second->add_inode_locks(in, ++r->second, bl);
6028 }
6029
6030 // subdirs in this subtree?
6031 in->get_nested_dirfrags(dq);
6032 }
6033 }
6034 }
6035
6036 // base inodes too
6037 if (root && root->is_auth())
6038 for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
6039 r != root->replicas_end();
6040 ++r) {
6041 auto it = acks.find(r->first);
6042 if (it == acks.end())
6043 continue;
6044 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
6045 bufferlist bl;
6046 root->_encode_locks_state_for_rejoin(bl, r->first);
6047 it->second->add_inode_locks(root, ++r->second, bl);
6048 }
6049 if (myin)
6050 for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
6051 r != myin->replicas_end();
6052 ++r) {
6053 auto it = acks.find(r->first);
6054 if (it == acks.end())
6055 continue;
6056 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
6057 bufferlist bl;
6058 myin->_encode_locks_state_for_rejoin(bl, r->first);
6059 it->second->add_inode_locks(myin, ++r->second, bl);
6060 }
6061
6062 // include inode base for any inodes whose scatterlocks may have updated
6063 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6064 p != rejoin_potential_updated_scatterlocks.end();
6065 ++p) {
6066 CInode *in = *p;
6067 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
6068 r != in->replicas_end();
6069 ++r) {
6070 auto it = acks.find(r->first);
6071 if (it == acks.end())
6072 continue;
6073 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6074 }
6075 }
6076
6077 // send acks
6078 for (auto p = acks.begin(); p != acks.end(); ++p) {
6079 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6080 mds->send_message_mds(p->second, p->first);
6081 }
6082
6083 rejoin_imported_caps.clear();
6084 }
6085
6086 class C_MDC_ReIssueCaps : public MDCacheContext {
6087 CInode *in;
6088 public:
6089 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6090 MDCacheContext(mdc), in(i)
6091 {
6092 in->get(CInode::PIN_PTRWAITER);
6093 }
6094 void finish(int r) override {
6095 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6096 mdcache->mds->locker->issue_caps(in);
6097 in->put(CInode::PIN_PTRWAITER);
6098 }
6099 };
6100
6101 void MDCache::reissue_all_caps()
6102 {
6103 dout(10) << "reissue_all_caps" << dendl;
6104
6105 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6106 p != inode_map.end();
6107 ++p) {
6108 CInode *in = p->second;
6109 if (in->is_head() && in->is_any_caps()) {
6110 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6111 if (in->is_frozen_inode()) {
6112 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6113 continue;
6114 }
6115 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6116 mds->locker->issue_caps(in);
6117 }
6118 }
6119 }
6120
6121
6122 // ===============================================================================
6123
6124 struct C_MDC_QueuedCow : public MDCacheContext {
6125 CInode *in;
6126 MutationRef mut;
6127 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6128 MDCacheContext(mdc), in(i), mut(m) {}
6129 void finish(int r) override {
6130 mdcache->_queued_file_recover_cow(in, mut);
6131 }
6132 };
6133
6134
6135 void MDCache::queue_file_recover(CInode *in)
6136 {
6137 dout(10) << "queue_file_recover " << *in << dendl;
6138 assert(in->is_auth());
6139
6140 // cow?
6141 /*
6142 SnapRealm *realm = in->find_snaprealm();
6143 set<snapid_t> s = realm->get_snaps();
6144 while (!s.empty() && *s.begin() < in->first)
6145 s.erase(s.begin());
6146 while (!s.empty() && *s.rbegin() > in->last)
6147 s.erase(*s.rbegin());
6148 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6149 if (s.size() > 1) {
6150 inode_t *pi = in->project_inode();
6151 pi->version = in->pre_dirty();
6152
6153 auto mut(std::make_shared<MutationImpl>());
6154 mut->ls = mds->mdlog->get_current_segment();
6155 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6156 mds->mdlog->start_entry(le);
6157 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6158
6159 s.erase(*s.begin());
6160 while (!s.empty()) {
6161 snapid_t snapid = *s.begin();
6162 CInode *cow_inode = 0;
6163 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6164 assert(cow_inode);
6165 recovery_queue.enqueue(cow_inode);
6166 s.erase(*s.begin());
6167 }
6168
6169 in->parent->first = in->first;
6170 le->metablob.add_primary_dentry(in->parent, in, true);
6171 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6172 mds->mdlog->flush();
6173 }
6174 */
6175
6176 recovery_queue.enqueue(in);
6177 }
6178
6179 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6180 {
6181 in->pop_and_dirty_projected_inode(mut->ls);
6182 mut->apply();
6183 mds->locker->drop_locks(mut.get());
6184 mut->cleanup();
6185 }
6186
6187
6188 /*
6189 * called after recovery to recover file sizes for previously opened (for write)
6190 * files. that is, those where max_size > size.
6191 */
6192 void MDCache::identify_files_to_recover()
6193 {
6194 dout(10) << "identify_files_to_recover" << dendl;
6195 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6196 p != inode_map.end();
6197 ++p) {
6198 CInode *in = p->second;
6199 if (!in->is_auth())
6200 continue;
6201
6202 if (in->last != CEPH_NOSNAP)
6203 continue;
6204
6205 // Only normal files need file size recovery
6206 if (!in->is_file()) {
6207 continue;
6208 }
6209
6210 bool recover = false;
6211 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6212 p != in->inode.client_ranges.end();
6213 ++p) {
6214 Capability *cap = in->get_client_cap(p->first);
6215 if (!cap) {
6216 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6217 recover = true;
6218 break;
6219 }
6220 }
6221
6222 if (recover) {
6223 if (in->filelock.is_stable()) {
6224 in->auth_pin(&in->filelock);
6225 } else {
6226 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6227 }
6228 in->filelock.set_state(LOCK_PRE_SCAN);
6229 rejoin_recover_q.push_back(in);
6230 } else {
6231 rejoin_check_q.push_back(in);
6232 }
6233 }
6234 }
6235
6236 void MDCache::start_files_to_recover()
6237 {
6238 for (CInode *in : rejoin_check_q) {
6239 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6240 mds->locker->issue_caps(in);
6241 mds->locker->check_inode_max_size(in);
6242 }
6243 rejoin_check_q.clear();
6244 for (CInode *in : rejoin_recover_q) {
6245 mds->locker->file_recover(&in->filelock);
6246 }
6247 if (!rejoin_recover_q.empty()) {
6248 rejoin_recover_q.clear();
6249 do_file_recover();
6250 }
6251 }
6252
6253 void MDCache::do_file_recover()
6254 {
6255 recovery_queue.advance();
6256 }
6257
6258 // ===============================================================================
6259
6260
6261 // ----------------------------
6262 // truncate
6263
6264 class C_MDC_RetryTruncate : public MDCacheContext {
6265 CInode *in;
6266 LogSegment *ls;
6267 public:
6268 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6269 MDCacheContext(c), in(i), ls(l) {}
6270 void finish(int r) override {
6271 mdcache->_truncate_inode(in, ls);
6272 }
6273 };
6274
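// truncate_inode() records the inode in the log segment's truncating set,
// pins it, and then runs _truncate_inode(), which asks the Filer to trim the
// objects back from truncate_from down to truncate_size. If a client still
// has buffered data awaiting a snapflush (client_need_snapflush non-empty
// with Fb issued), the truncate is deferred via set_xlock_snap_sync().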
6275 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6276 {
6277 inode_t *pi = in->get_projected_inode();
6278 dout(10) << "truncate_inode "
6279 << pi->truncate_from << " -> " << pi->truncate_size
6280 << " on " << *in
6281 << dendl;
6282
6283 ls->truncating_inodes.insert(in);
6284 in->get(CInode::PIN_TRUNCATING);
6285 in->auth_pin(this);
6286
6287 if (!in->client_need_snapflush.empty() &&
6288 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6289 assert(in->filelock.is_xlocked());
6290 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6291 mds->locker->issue_caps(in);
6292 return;
6293 }
6294
6295 _truncate_inode(in, ls);
6296 }
6297
6298 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6299 CInode *in;
6300 LogSegment *ls;
6301 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6302 MDCacheIOContext(c), in(i), ls(l) {}
6303 void finish(int r) override {
6304 assert(r == 0 || r == -ENOENT);
6305 mdcache->truncate_inode_finish(in, ls);
6306 }
6307 };
6308
6309 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6310 {
6311 inode_t *pi = &in->inode;
6312 dout(10) << "_truncate_inode "
6313 << pi->truncate_from << " -> " << pi->truncate_size
6314 << " on " << *in << dendl;
6315
6316 assert(pi->is_truncating());
6317 assert(pi->truncate_size < (1ULL << 63));
6318 assert(pi->truncate_from < (1ULL << 63));
6319 assert(pi->truncate_size < pi->truncate_from);
6320
6321
6322 SnapRealm *realm = in->find_snaprealm();
6323 SnapContext nullsnap;
6324 const SnapContext *snapc;
6325 if (realm) {
6326 dout(10) << " realm " << *realm << dendl;
6327 snapc = &realm->get_snap_context();
6328 } else {
6329 dout(10) << " NO realm, using null context" << dendl;
6330 snapc = &nullsnap;
6331 assert(in->last == CEPH_NOSNAP);
6332 }
6333 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6334 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6335 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6336 pi->truncate_seq, ceph::real_time::min(), 0,
6337 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6338 mds->finisher));
6339 }
6340
6341 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6342 CInode *in;
6343 MutationRef mut;
6344 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6345 MDCacheLogContext(m), in(i), mut(mu) {}
6346 void finish(int r) override {
6347 mdcache->truncate_inode_logged(in, mut);
6348 }
6349 };
6350
6351 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6352 {
6353 dout(10) << "truncate_inode_finish " << *in << dendl;
6354
6355 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6356 assert(p != ls->truncating_inodes.end());
6357 ls->truncating_inodes.erase(p);
6358
6359 // update
6360 inode_t *pi = in->project_inode();
6361 pi->version = in->pre_dirty();
6362 pi->truncate_from = 0;
6363 pi->truncate_pending--;
6364
6365 MutationRef mut(new MutationImpl());
6366 mut->ls = mds->mdlog->get_current_segment();
6367 mut->add_projected_inode(in);
6368
6369 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6370 mds->mdlog->start_entry(le);
6371 CDentry *dn = in->get_projected_parent_dn();
6372 le->metablob.add_dir_context(dn->get_dir());
6373 le->metablob.add_primary_dentry(dn, in, true);
6374 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6375
6376 journal_dirty_inode(mut.get(), &le->metablob, in);
6377 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6378
6379 // flush immediately if there are readers/writers waiting
6380 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6381 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6382 mds->mdlog->flush();
6383 }
6384
6385 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6386 {
6387 dout(10) << "truncate_inode_logged " << *in << dendl;
6388 mut->apply();
6389 mds->locker->drop_locks(mut.get());
6390 mut->cleanup();
6391
6392 in->put(CInode::PIN_TRUNCATING);
6393 in->auth_unpin(this);
6394
6395 list<MDSInternalContextBase*> waiters;
6396 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6397 mds->queue_waiters(waiters);
6398 }
6399
6400
6401 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6402 {
6403 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6404 << ls->seq << "/" << ls->offset << dendl;
6405 ls->truncating_inodes.insert(in);
6406 in->get(CInode::PIN_TRUNCATING);
6407 }
6408
6409 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6410 {
6411 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6412 << ls->seq << "/" << ls->offset << dendl;
6413 // if we have the logseg the truncate started in, it must be in our list.
6414 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6415 assert(p != ls->truncating_inodes.end());
6416 ls->truncating_inodes.erase(p);
6417 in->put(CInode::PIN_TRUNCATING);
6418 }
6419
6420 void MDCache::start_recovered_truncates()
6421 {
6422 dout(10) << "start_recovered_truncates" << dendl;
6423 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6424 p != mds->mdlog->segments.end();
6425 ++p) {
6426 LogSegment *ls = p->second;
6427 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6428 q != ls->truncating_inodes.end();
6429 ++q) {
6430 CInode *in = *q;
6431 in->auth_pin(this);
6432
6433 if (!in->client_need_snapflush.empty() &&
6434 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6435 assert(in->filelock.is_stable());
6436 in->filelock.set_state(LOCK_XLOCKDONE);
6437 in->auth_pin(&in->filelock);
6438 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6439 // start_files_to_recover will revoke caps
6440 continue;
6441 }
6442 _truncate_inode(in, ls);
6443 }
6444 }
6445 }
6446
6447
6448
6449
6450
6451
6452 // ================================================================================
6453 // cache trimming
6454
6455
6456 /*
6457 * note: only called while MDS is active or stopping... NOT during recovery.
6458 * however, we may expire a replica whose authority is recovering.
6459 *
6460 */
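// Usage sketch (illustrative, not part of the original source; "mdcache"
// stands for any pointer to this MDCache): periodic maintenance passes a
// negative max so the target falls back to g_conf->mds_cache_size, while a
// caller that wants to drop roughly N dentries passes count=N and lets max
// be derived from the current LRU size:
//
//   mdcache->trim(-1, 0);     // shrink toward g_conf->mds_cache_size
//   mdcache->trim(0, 1000);   // expire roughly 1000 dentries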
6461 bool MDCache::trim(int max, int count)
6462 {
6463 // trim LRU
6464 if (count > 0) {
6465 max = lru.lru_get_size() - count;
6466 if (max <= 0)
6467 max = 1;
6468 } else if (max < 0) {
6469 max = g_conf->mds_cache_size;
6470 if (max <= 0)
6471 return false;
6472 }
6473 dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size()
6474 << "/" << bottom_lru.lru_get_size() << dendl;
6475
6476 // process delayed eval_stray()
6477 stray_manager.advance_delayed();
6478
6479 map<mds_rank_t, MCacheExpire*> expiremap;
6480 bool is_standby_replay = mds->is_standby_replay();
6481 int unexpirable = 0;
6482 list<CDentry*> unexpirables;
6483
6484 for (;;) {
6485 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6486 if (!dn)
6487 break;
6488 if (trim_dentry(dn, expiremap)) {
6489 unexpirables.push_back(dn);
6490 ++unexpirable;
6491 }
6492 }
6493
6494 for(auto dn : unexpirables)
6495 bottom_lru.lru_insert_mid(dn);
6496 unexpirables.clear();
6497
6498 // trim dentries from the LRU, but only enough to satisfy `max`
6499 while (lru.lru_get_size() + unexpirable > (unsigned)max) {
6500 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6501 if (!dn) {
6502 break;
6503 }
6504 if ((is_standby_replay && dn->get_linkage()->inode &&
6505 dn->get_linkage()->inode->item_open_file.is_on_list()) ||
6506 trim_dentry(dn, expiremap)) {
6507 unexpirables.push_back(dn);
6508 ++unexpirable;
6509 }
6510 }
6511 for(auto dn : unexpirables)
6512 lru.lru_insert_mid(dn);
6513 unexpirables.clear();
6514
6515 // trim non-auth, non-bound subtrees
6516 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6517 p != subtrees.end();) {
6518 CDir *dir = p->first;
6519 ++p;
6520 CInode *diri = dir->get_inode();
6521 if (dir->is_auth()) {
6522 if (!diri->is_auth() && !diri->is_base() &&
6523 dir->get_num_head_items() == 0) {
6524 if (dir->state_test(CDir::STATE_EXPORTING) ||
6525 dir->is_freezing() || dir->is_frozen())
6526 continue;
6527
6528 migrator->export_empty_import(dir);
6529 }
6530 } else {
6531 if (!diri->is_auth()) {
6532 if (dir->get_num_ref() > 1) // only subtree pin
6533 continue;
6534 list<CDir*> ls;
6535 diri->get_subtree_dirfrags(ls);
6536 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6537 continue;
6538
6539 // don't trim subtree root if its auth MDS is recovering.
6540 // This simplifies the cache rejoin code.
6541 if (dir->is_subtree_root() &&
6542 rejoin_ack_gather.count(dir->get_dir_auth().first))
6543 continue;
6544 trim_dirfrag(dir, 0, expiremap);
6545 }
6546 }
6547 }
6548
6549 // trim root?
6550 if (max == 0 && root) {
6551 list<CDir*> ls;
6552 root->get_dirfrags(ls);
6553 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6554 CDir *dir = *p;
6555 if (dir->get_num_ref() == 1) // subtree pin
6556 trim_dirfrag(dir, 0, expiremap);
6557 }
6558 if (root->get_num_ref() == 0)
6559 trim_inode(0, root, 0, expiremap);
6560 }
6561
6562 std::set<mds_rank_t> stopping;
6563 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6564 stopping.erase(mds->get_nodeid());
6565 for (auto rank : stopping) {
6566 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6567 if (!mdsdir_in)
6568 continue;
6569
6570 if (expiremap.count(rank) == 0) {
6571 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6572 }
6573
6574 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6575
6576 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6577 if (!aborted) {
6578 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6579 list<CDir*> ls;
6580 mdsdir_in->get_dirfrags(ls);
6581 for (auto dir : ls) {
6582 if (dir->get_num_ref() == 1) // subtree pin
6583 trim_dirfrag(dir, dir, expiremap);
6584 }
6585 if (mdsdir_in->get_num_ref() == 0)
6586 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6587 } else {
6588 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6589 }
6590 }
6591
6592 // Other rank's base inodes (when I'm stopping)
6593 if (max == 0) {
6594 for (set<CInode*>::iterator p = base_inodes.begin();
6595 p != base_inodes.end(); ++p) {
6596 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6597 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6598 if ((*p)->get_num_ref() == 0) {
6599 trim_inode(NULL, *p, NULL, expiremap);
6600 }
6601 }
6602 }
6603 }
6604
6605 // send any expire messages
6606 send_expire_messages(expiremap);
6607
6608 return true;
6609 }
6610
6611 void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6612 {
6613 // send expires
6614 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6615 it != expiremap.end();
6616 ++it) {
6617 if (mds->is_cluster_degraded() &&
6618 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6619 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6620 rejoin_sent.count(it->first) == 0))) {
6621 it->second->put();
6622 continue;
6623 }
6624 dout(7) << "sending cache_expire to " << it->first << dendl;
6625 mds->send_message_mds(it->second, it->first);
6626 }
6627 }
6628
6629
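// trim_dentry() returns true when the dentry must be kept (the caller puts
// it back on the LRU) and false when it was removed from its dirfrag. For
// non-auth dentries it also queues an expire notification to the dentry's
// authority in the per-rank MCacheExpire map.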
6630 bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6631 {
6632 dout(12) << "trim_dentry " << *dn << dendl;
6633
6634 CDentry::linkage_t *dnl = dn->get_linkage();
6635
6636 CDir *dir = dn->get_dir();
6637 assert(dir);
6638
6639 CDir *con = get_subtree_root(dir);
6640 if (con)
6641 dout(12) << " in container " << *con << dendl;
6642 else {
6643 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6644 assert(dn->is_auth());
6645 }
6646
6647 // If a replica dentry is not readable, it's likely we will receive an
6648 // MDentryLink/MDentryUnlink message soon (it's possible we first
6649 // receive an MDentryUnlink message, then an MDentryLink message).
6650 // An MDentryLink message only replicates an inode, so we should
6651 // avoid trimming the inode's parent dentry, because unconnected
6652 // replicas are problematic for subtree migration.
6653 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6654 !dn->get_dir()->get_inode()->is_stray())
6655 return true;
6656
6657 // adjust the dir state
6658 // NOTE: we can safely remove a clean, null dentry without affecting
6659 // directory completeness.
6660 // (check this _before_ we unlink the inode, below!)
6661 bool clear_complete = false;
6662 if (!(dnl->is_null() && dn->is_clean()))
6663 clear_complete = true;
6664
6665 // unlink the dentry
6666 if (dnl->is_remote()) {
6667 // just unlink.
6668 dir->unlink_inode(dn, false);
6669 } else if (dnl->is_primary()) {
6670 // expire the inode, too.
6671 CInode *in = dnl->get_inode();
6672 assert(in);
6673 if (trim_inode(dn, in, con, expiremap))
6674 return true; // purging stray instead of trimming
6675 } else {
6676 assert(dnl->is_null());
6677 }
6678
6679 if (!dn->is_auth()) {
6680 // notify dentry authority.
6681 mds_authority_t auth = dn->authority();
6682
6683 for (int p=0; p<2; p++) {
6684 mds_rank_t a = auth.first;
6685 if (p) a = auth.second;
6686 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6687 if (mds->get_nodeid() == auth.second &&
6688 con->is_importing()) break; // don't send any expire while importing.
6689 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6690
6691 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6692 assert(a != mds->get_nodeid());
6693 if (expiremap.count(a) == 0)
6694 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6695 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
6696 }
6697 }
6698
6699 // remove dentry
6700 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6701 dir->add_to_bloom(dn);
6702 dir->remove_dentry(dn);
6703
6704 if (clear_complete)
6705 dir->state_clear(CDir::STATE_COMPLETE);
6706
6707 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6708 return false;
6709 }
6710
6711
6712 void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6713 {
6714 dout(15) << "trim_dirfrag " << *dir << dendl;
6715
6716 if (dir->is_subtree_root()) {
6717 assert(!dir->is_auth() ||
6718 (!dir->is_replicated() && dir->inode->is_base()));
6719 remove_subtree(dir); // remove from subtree map
6720 }
6721 assert(dir->get_num_ref() == 0);
6722
6723 CInode *in = dir->get_inode();
6724
6725 if (!dir->is_auth()) {
6726 mds_authority_t auth = dir->authority();
6727
6728 // was this an auth delegation? (if so, slightly modified container)
6729 dirfrag_t condf;
6730 if (dir->is_subtree_root()) {
6731 dout(12) << " subtree root, container is " << *dir << dendl;
6732 con = dir;
6733 condf = dir->dirfrag();
6734 } else {
6735 condf = con->dirfrag();
6736 }
6737
6738 for (int p=0; p<2; p++) {
6739 mds_rank_t a = auth.first;
6740 if (p) a = auth.second;
6741 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6742 if (mds->get_nodeid() == auth.second &&
6743 con->is_importing()) break; // don't send any expire while importing.
6744 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6745
6746 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6747 assert(a != mds->get_nodeid());
6748 if (expiremap.count(a) == 0)
6749 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6750 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6751 }
6752 }
6753
6754 in->close_dirfrag(dir->dirfrag().frag);
6755 }
6756
6757 /**
6758 * Try trimming an inode from the cache
6759 *
6760 * @return true if the inode is still in cache, else false if it was trimmed
6761 */
6762 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6763 {
6764 dout(15) << "trim_inode " << *in << dendl;
6765 assert(in->get_num_ref() == 0);
6766
6767 if (in->is_dir()) {
6768 // If a replica inode's dirfragtreelock is not readable, it's likely
6769 // some dirfrags of the inode are being fragmented and we will receive
6770 // an MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6771 // dirfrags, so we should avoid trimming these dirfrags' parent inode,
6772 // because unconnected replicas are problematic for subtree
6773 // migration.
6774 //
6775 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6776 return true;
6777
6778 // DIR
6779 list<CDir*> dfls;
6780 in->get_dirfrags(dfls);
6781 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6782 CDir *dir = *p;
6783 assert(!dir->is_subtree_root());
6784 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6785 }
6786 }
6787
6788 // INODE
6789 if (in->is_auth()) {
6790 // eval stray after closing dirfrags
6791 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6792 maybe_eval_stray(in);
6793 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6794 return true;
6795 }
6796 } else {
6797 mds_authority_t auth = in->authority();
6798
6799 dirfrag_t df;
6800 if (con)
6801 df = con->dirfrag();
6802 else
6803 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6804
6805 for (int p=0; p<2; p++) {
6806 mds_rank_t a = auth.first;
6807 if (p) a = auth.second;
6808 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6809 if (con && mds->get_nodeid() == auth.second &&
6810 con->is_importing()) break; // don't send any expire while importing.
6811 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6812
6813 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6814 assert(a != mds->get_nodeid());
6815 if (expiremap.count(a) == 0)
6816 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6817 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6818 }
6819 }
6820
6821 /*
6822 if (in->is_auth()) {
6823 if (in->hack_accessed)
6824 mds->logger->inc("outt");
6825 else {
6826 mds->logger->inc("outut");
6827 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6828 }
6829 }
6830 */
6831
6832 // unlink
6833 if (dn)
6834 dn->get_dir()->unlink_inode(dn, false);
6835 remove_inode(in);
6836 return false;
6837 }
6838
6839
6840 /**
6841 * trim_non_auth - remove any non-auth items from our cache
6842 *
6843 * this reduces the amount of non-auth metadata in our cache, reducing the
6844 * load incurred by the rejoin phase.
6845 *
6846 * the only non-auth items that remain are those that are needed to
6847 * attach our own subtrees to the root.
6848 *
6849 * when we are done, all dentries will be in the top bit of the lru.
6850 *
6851 * why we have to do this:
6852 * we may not have accurate linkage for non-auth items, which means we may
6853 * not know which subtree an item falls into, and cannot be sure to declare
6854 * it to the correct authority.
6855 */
6856 void MDCache::trim_non_auth()
6857 {
6858 dout(7) << "trim_non_auth" << dendl;
6859
6860 // temporarily pin all subtree roots
6861 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6862 p != subtrees.end();
6863 ++p)
6864 p->first->get(CDir::PIN_SUBTREETEMP);
6865
6866 list<CDentry*> auth_list;
6867
6868 // trim non-auth items from the lru
6869 for (;;) {
6870 CDentry *dn = NULL;
6871 if (bottom_lru.lru_get_size() > 0)
6872 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6873 if (!dn && lru.lru_get_size() > 0)
6874 dn = static_cast<CDentry*>(lru.lru_expire());
6875 if (!dn)
6876 break;
6877
6878 CDentry::linkage_t *dnl = dn->get_linkage();
6879
6880 if (dn->is_auth()) {
6881 // add back into lru (at the top)
6882 auth_list.push_back(dn);
6883
6884 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6885 dn->unlink_remote(dnl);
6886 } else {
6887 // non-auth. expire.
6888 CDir *dir = dn->get_dir();
6889 assert(dir);
6890
6891 // unlink the dentry
6892 dout(10) << " removing " << *dn << dendl;
6893 if (dnl->is_remote()) {
6894 dir->unlink_inode(dn, false);
6895 }
6896 else if (dnl->is_primary()) {
6897 CInode *in = dnl->get_inode();
6898 dout(10) << " removing " << *in << dendl;
6899 list<CDir*> ls;
6900 in->get_dirfrags(ls);
6901 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6902 CDir *subdir = *p;
6903 assert(!subdir->is_subtree_root());
6904 in->close_dirfrag(subdir->dirfrag().frag);
6905 }
6906 dir->unlink_inode(dn, false);
6907 remove_inode(in);
6908 }
6909 else {
6910 assert(dnl->is_null());
6911 }
6912
6913 assert(!dir->has_bloom());
6914 dir->remove_dentry(dn);
6915 // adjust the dir state
6916 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6917 // close empty non-auth dirfrag
6918 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6919 dir->inode->close_dirfrag(dir->get_frag());
6920 }
6921 }
6922
6923 for (auto dn : auth_list) {
6924 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6925 bottom_lru.lru_insert_mid(dn);
6926 else
6927 lru.lru_insert_top(dn);
6928 }
6929
6930 // move everything in the pintail to the top bit of the lru.
6931 lru.lru_touch_entire_pintail();
6932
6933 // unpin all subtrees
6934 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6935 p != subtrees.end();
6936 ++p)
6937 p->first->put(CDir::PIN_SUBTREETEMP);
6938
6939 if (lru.lru_get_size() == 0 &&
6940 bottom_lru.lru_get_size() == 0) {
6941 // root, stray, etc.?
6942 ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6943 while (p != inode_map.end()) {
6944 ceph::unordered_map<vinodeno_t,CInode*>::iterator next = p;
6945 ++next;
6946 CInode *in = p->second;
6947 if (!in->is_auth()) {
6948 list<CDir*> ls;
6949 in->get_dirfrags(ls);
6950 for (list<CDir*>::iterator p = ls.begin();
6951 p != ls.end();
6952 ++p) {
6953 dout(10) << " removing " << **p << dendl;
6954 assert((*p)->get_num_ref() == 1); // SUBTREE
6955 remove_subtree((*p));
6956 in->close_dirfrag((*p)->dirfrag().frag);
6957 }
6958 dout(10) << " removing " << *in << dendl;
6959 assert(!in->get_parent_dn());
6960 assert(in->get_num_ref() == 0);
6961 remove_inode(in);
6962 }
6963 p = next;
6964 }
6965 }
6966
6967 show_subtrees();
6968 }
6969
6970 /**
6971 * Recursively trim the subtree rooted at directory to remove all
6972 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6973 * of those links. This is used to clear invalid data out of the cache.
6974 * Note that it doesn't clear the passed-in directory, since that's not
6975 * always safe.
6976 */
6977 bool MDCache::trim_non_auth_subtree(CDir *dir)
6978 {
6979 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6980
6981 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6982
6983 CDir::map_t::iterator j = dir->begin();
6984 CDir::map_t::iterator i = j;
6985 while (j != dir->end()) {
6986 i = j++;
6987 CDentry *dn = i->second;
6988 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6989 CDentry::linkage_t *dnl = dn->get_linkage();
6990 if (dnl->is_primary()) { // check for subdirectories, etc
6991 CInode *in = dnl->get_inode();
6992 bool keep_inode = false;
6993 if (in->is_dir()) {
6994 list<CDir*> subdirs;
6995 in->get_dirfrags(subdirs);
6996 for (list<CDir*>::iterator subdir = subdirs.begin();
6997 subdir != subdirs.end();
6998 ++subdir) {
6999 if ((*subdir)->is_subtree_root()) {
7000 keep_inode = true;
7001 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7002 } else {
7003 if (trim_non_auth_subtree(*subdir))
7004 keep_inode = true;
7005 else {
7006 in->close_dirfrag((*subdir)->get_frag());
7007 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7008 }
7009 }
7010 }
7011
7012 }
7013 if (!keep_inode) { // remove it!
7014 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
7015 dir->unlink_inode(dn, false);
7016 remove_inode(in);
7017 assert(!dir->has_bloom());
7018 dir->remove_dentry(dn);
7019 } else {
7020 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
7021 dn->state_clear(CDentry::STATE_AUTH);
7022 in->state_clear(CInode::STATE_AUTH);
7023 }
7024 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7025 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
7026 } else { // just remove it
7027 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7028 if (dnl->is_remote())
7029 dir->unlink_inode(dn, false);
7030 dir->remove_dentry(dn);
7031 }
7032 }
7033 dir->state_clear(CDir::STATE_AUTH);
7034 /**
7035 * We've now checked all our children and deleted those that need it.
7036 * Now return to caller, and tell them if *we're* a keeper.
7037 */
7038 return keep_dir || dir->get_num_any();
7039 }
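
/*
 * Illustrative walk-through of the recursion above (a sketch, not part of
 * the original logic): suppose a non-auth dir D holds two primary dentries,
 * "a" -> A and "b" -> B.  A has a dirfrag that is a subtree root bound to
 * another MDS, so A is kept but its STATE_AUTH bits are cleared; B has no
 * such descendant, so B and its dentry are unlinked and removed, and D loses
 * STATE_COMPLETE.  The call then returns true for D, because the kept
 * dentry "a" makes get_num_any() non-zero.
 */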
7040
7041 /*
7042 * during replay, when we determine a subtree is no longer ours, we
7043 * try to trim it from our cache. because subtrees must be connected
7044 * to the root, the fact that we can trim this tree may mean that our
7045 * children or parents can also be trimmed.
7046 */
7047 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7048 {
7049 dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
7050
7051 // can we now trim child subtrees?
7052 set<CDir*> bounds;
7053 get_subtree_bounds(dir, bounds);
7054 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7055 CDir *bd = *p;
7056 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7057 bd->get_num_any() == 0 && // and empty
7058 can_trim_non_auth_dirfrag(bd)) {
7059 CInode *bi = bd->get_inode();
7060 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7061 remove_subtree(bd);
7062 bd->mark_clean();
7063 bi->close_dirfrag(bd->get_frag());
7064 }
7065 }
7066
7067 if (trim_non_auth_subtree(dir)) {
7068 // keep
7069 try_subtree_merge(dir);
7070 } else {
7071 // can we trim this subtree (and possibly our ancestors) too?
7072 while (true) {
7073 CInode *diri = dir->get_inode();
7074 if (diri->is_base()) {
7075 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7076 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7077 remove_subtree(dir);
7078 dir->mark_clean();
7079 diri->close_dirfrag(dir->get_frag());
7080
7081 dout(10) << " removing " << *diri << dendl;
7082 assert(!diri->get_parent_dn());
7083 assert(diri->get_num_ref() == 0);
7084 remove_inode(diri);
7085 }
7086 break;
7087 }
7088
7089 CDir *psub = get_subtree_root(diri->get_parent_dir());
7090 dout(10) << " parent subtree is " << *psub << dendl;
7091 if (psub->get_dir_auth().first == mds->get_nodeid())
7092 break; // we are auth, keep.
7093
7094 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7095 remove_subtree(dir);
7096 dir->mark_clean();
7097 diri->close_dirfrag(dir->get_frag());
7098
7099 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7100 if (trim_non_auth_subtree(psub))
7101 break;
7102 dir = psub;
7103 }
7104 }
7105
7106 show_subtrees();
7107 }
7108
7109 void MDCache::standby_trim_segment(LogSegment *ls)
7110 {
7111 ls->new_dirfrags.clear_list();
7112 ls->open_files.clear_list();
7113
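  // Note: each loop below makes progress because mark_clean(),
  // clear_dirty_parent() and remove_dirty() unhook the object from the
  // corresponding per-segment dirty list; nothing is popped explicitly here.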
7114 while (!ls->dirty_dirfrags.empty()) {
7115 CDir *dir = ls->dirty_dirfrags.front();
7116 dir->mark_clean();
7117 }
7118 while (!ls->dirty_inodes.empty()) {
7119 CInode *in = ls->dirty_inodes.front();
7120 in->mark_clean();
7121 }
7122 while (!ls->dirty_dentries.empty()) {
7123 CDentry *dn = ls->dirty_dentries.front();
7124 dn->mark_clean();
7125 }
7126 while (!ls->dirty_parent_inodes.empty()) {
7127 CInode *in = ls->dirty_parent_inodes.front();
7128 in->clear_dirty_parent();
7129 }
7130 while (!ls->dirty_dirfrag_dir.empty()) {
7131 CInode *in = ls->dirty_dirfrag_dir.front();
7132 in->filelock.remove_dirty();
7133 }
7134 while (!ls->dirty_dirfrag_nest.empty()) {
7135 CInode *in = ls->dirty_dirfrag_nest.front();
7136 in->nestlock.remove_dirty();
7137 }
7138 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7139 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7140 in->dirfragtreelock.remove_dirty();
7141 }
7142 }
7143
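/*
 * A sketch of the replica-nonce check performed below (inferred from this
 * function only): each time an object is (re)replicated to a peer it gets a
 * fresh nonce.  An MCacheExpire carries the nonce the peer last saw; if we
 * have re-replicated the object in the meantime, our recorded nonce for that
 * peer is newer, the expire refers to a replica that no longer exists, and
 * the expire is dropped.  E.g. the peer expires an inode with nonce 1 while
 * a new replica with nonce 2 is in flight to it: the stale expire is ignored
 * and the peer keeps (and must later re-expire) the nonce-2 replica.
 */
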
7144 /* This function DOES put the passed message before returning */
7145 void MDCache::handle_cache_expire(MCacheExpire *m)
7146 {
7147 mds_rank_t from = mds_rank_t(m->get_from());
7148
7149 dout(7) << "cache_expire from mds." << from << dendl;
7150
7151 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7152 m->put();
7153 return;
7154 }
7155
7156 set<SimpleLock *> gather_locks;
7157 // loop over realms
7158 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7159 p != m->realms.end();
7160 ++p) {
7161 // check container?
7162 if (p->first.ino > 0) {
7163 CInode *expired_inode = get_inode(p->first.ino);
7164 assert(expired_inode); // we had better have this.
7165 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7166 assert(parent_dir);
7167
7168 int export_state = -1;
7169 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7170 export_state = migrator->get_export_state(parent_dir);
7171 assert(export_state >= 0);
7172 }
7173
7174 if (!parent_dir->is_auth() ||
7175 (export_state != -1 &&
7176 ((export_state == Migrator::EXPORT_WARNING &&
7177 migrator->export_has_warned(parent_dir,from)) ||
7178 export_state == Migrator::EXPORT_EXPORTING ||
7179 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7180 (export_state == Migrator::EXPORT_NOTIFYING &&
7181 !migrator->export_has_notified(parent_dir,from))))) {
7182
7183 // not auth.
7184 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7185 assert(parent_dir->is_frozen_tree_root());
7186
7187 // make a message container
7188 if (delayed_expire[parent_dir].count(from) == 0)
7189 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7190
7191 // merge these expires into it
7192 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7193 continue;
7194 }
7195 assert(export_state <= Migrator::EXPORT_PREPPING ||
7196 (export_state == Migrator::EXPORT_WARNING &&
7197 !migrator->export_has_warned(parent_dir, from)));
7198
7199 dout(7) << "expires for " << *parent_dir << dendl;
7200 } else {
7201 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7202 }
7203
7204 // INODES
7205 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7206 it != p->second.inodes.end();
7207 ++it) {
7208 CInode *in = get_inode(it->first);
7209 unsigned nonce = it->second;
7210
7211 if (!in) {
7212 dout(0) << " inode expire on " << it->first << " from " << from
7213 << ", don't have it" << dendl;
7214 assert(in);
7215 }
7216 assert(in->is_auth());
7217 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7218
7219 // check nonce
7220 if (nonce == in->get_replica_nonce(from)) {
7221 // remove from our cached_by
7222 dout(7) << " inode expire on " << *in << " from mds." << from
7223 << " cached_by was " << in->get_replicas() << dendl;
7224 inode_remove_replica(in, from, false, gather_locks);
7225 }
7226 else {
7227 // this is an old nonce, ignore expire.
7228 dout(7) << " inode expire on " << *in << " from mds." << from
7229 << " with old nonce " << nonce
7230 << " (current " << in->get_replica_nonce(from) << "), dropping"
7231 << dendl;
7232 }
7233 }
7234
7235 // DIRS
7236 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7237 it != p->second.dirs.end();
7238 ++it) {
7239 CDir *dir = get_dirfrag(it->first);
7240 unsigned nonce = it->second;
7241
7242 if (!dir) {
7243 CInode *diri = get_inode(it->first.ino);
7244 if (diri) {
7245 if (mds->is_rejoin() &&
7246 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7247 !diri->is_replica(from)) {
7248 list<CDir*> ls;
7249 diri->get_nested_dirfrags(ls);
7250 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7251 << " while rejoining, inode isn't replicated" << dendl;
7252 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7253 dir = *q;
7254 if (dir->is_replica(from)) {
7255 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7256 dir->remove_replica(from);
7257 }
7258 }
7259 continue;
7260 }
7261 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7262 if (other) {
7263 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7264 << " have " << *other << ", mismatched frags, dropping" << dendl;
7265 continue;
7266 }
7267 }
7268 dout(0) << " dir expire on " << it->first << " from " << from
7269 << ", don't have it" << dendl;
7270 assert(dir);
7271 }
7272 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7273
7274 assert(dir->is_auth());
7275
7276 // check nonce
7277 if (nonce == dir->get_replica_nonce(from)) {
7278 // remove from our cached_by
7279 dout(7) << " dir expire on " << *dir << " from mds." << from
7280 << " replicas was " << dir->replica_map << dendl;
7281 dir->remove_replica(from);
7282 }
7283 else {
7284 // this is an old nonce, ignore expire.
7285 dout(7) << " dir expire on " << *dir << " from mds." << from
7286 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7287 << "), dropping" << dendl;
7288 }
7289 }
7290
7291 // DENTRIES
7292 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7293 pd != p->second.dentries.end();
7294 ++pd) {
7295 dout(10) << " dn expires in dir " << pd->first << dendl;
7296 CInode *diri = get_inode(pd->first.ino);
7297 assert(diri);
7298 CDir *dir = diri->get_dirfrag(pd->first.frag);
7299
7300 if (!dir) {
7301 dout(0) << " dn expires on " << pd->first << " from " << from
7302 << ", must have refragmented" << dendl;
7303 } else {
7304 assert(dir->is_auth());
7305 }
7306
7307 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7308 p != pd->second.end();
7309 ++p) {
7310 unsigned nonce = p->second;
7311 CDentry *dn;
7312
7313 if (dir) {
7314 dn = dir->lookup(p->first.first, p->first.second);
7315 } else {
7316 // which dirfrag for this dentry?
7317 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7318 assert(dir);
7319 assert(dir->is_auth());
7320 dn = dir->lookup(p->first.first, p->first.second);
7321 }
7322
7323 if (!dn) {
7324 if (dir)
7325 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7326 else
7327 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7328 }
7329 assert(dn);
7330
7331 if (nonce == dn->get_replica_nonce(from)) {
7332 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7333 dentry_remove_replica(dn, from, gather_locks);
7334 }
7335 else {
7336 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7337 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7338 << "), dropping" << dendl;
7339 }
7340 }
7341 }
7342 }
7343
7344 // done
7345 m->put();
7346
7347 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7348 if (!(*p)->is_stable())
7349 mds->locker->eval_gather(*p);
7350 }
7351 }
7352
7353 void MDCache::process_delayed_expire(CDir *dir)
7354 {
7355 dout(7) << "process_delayed_expire on " << *dir << dendl;
7356 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7357 p != delayed_expire[dir].end();
7358 ++p)
7359 handle_cache_expire(p->second);
7360 delayed_expire.erase(dir);
7361 }
7362
7363 void MDCache::discard_delayed_expire(CDir *dir)
7364 {
7365 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7366 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7367 p != delayed_expire[dir].end();
7368 ++p)
7369 p->second->put();
7370 delayed_expire.erase(dir);
7371 }
7372
7373 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7374 set<SimpleLock *>& gather_locks)
7375 {
7376 in->remove_replica(from);
7377 in->mds_caps_wanted.erase(from);
7378
7379 // note: this code calls _eval more often than it needs to!
7380 // fix lock
7381 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7382 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7383 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7384 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7385 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7386 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7387
7388 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7389 // Don't remove the recovering mds from lock's gathering list because
7390 // it may hold rejoined wrlocks.
7391 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7392 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7393 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7394 }
7395
7396 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7397 {
7398 dn->remove_replica(from);
7399
7400 // fix lock
7401 if (dn->lock.remove_replica(from))
7402 gather_locks.insert(&dn->lock);
7403
7404 // Replicated strays might now be eligible for purge
7405 CDentry::linkage_t *dnl = dn->get_linkage();
7406 if (dnl->is_primary()) {
7407 maybe_eval_stray(dnl->get_inode());
7408 }
7409 }
7410
7411 void MDCache::trim_client_leases()
7412 {
7413 utime_t now = ceph_clock_now();
7414
7415 dout(10) << "trim_client_leases" << dendl;
7416
7417 for (int pool=0; pool<client_lease_pools; pool++) {
7418 int before = client_leases[pool].size();
7419 if (client_leases[pool].empty())
7420 continue;
7421
7422 while (!client_leases[pool].empty()) {
7423 ClientLease *r = client_leases[pool].front();
7424 if (r->ttl > now) break;
7425 CDentry *dn = static_cast<CDentry*>(r->parent);
7426 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7427 dn->remove_client_lease(r, mds->locker);
7428 }
7429 int after = client_leases[pool].size();
7430 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7431 << (before-after) << " leases, " << after << " left" << dendl;
7432 }
7433 }
7434
7435
7436 void MDCache::check_memory_usage()
7437 {
7438 static MemoryModel mm(g_ceph_context);
7439 static MemoryModel::snap last;
7440 mm.sample(&last);
7441 static MemoryModel::snap baseline = last;
7442
7443 // check client caps
7444 assert(CInode::count() == inode_map.size());
7445 float caps_per_inode = 0.0;
7446 if (CInode::count())
7447 caps_per_inode = (float)Capability::count() / (float)CInode::count();
7448
7449 dout(2) << "check_memory_usage"
7450 << " total " << last.get_total()
7451 << ", rss " << last.get_rss()
7452 << ", heap " << last.get_heap()
7453 << ", baseline " << baseline.get_heap()
7454 << ", buffers " << (buffer::get_total_alloc() >> 10)
7455 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7456 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7457 << dendl;
7458
7459 mds->update_mlogger();
7460 mds->mlogger->set(l_mdm_rss, last.get_rss());
7461 mds->mlogger->set(l_mdm_heap, last.get_heap());
7462
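  // Worked example of the recall ratio computed below: with
  // mds_cache_size = 100000 and num_inodes_with_caps = 125000,
  // ratio = 100000 * 0.9 / 125000 = 0.72, so recall_client_state(0.72) asks
  // clients to shrink toward roughly 72% of their current caps (assuming
  // recall_client_state scales each session's cap limit by the ratio).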
7463 if (num_inodes_with_caps > g_conf->mds_cache_size) {
7464 float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
7465 if (ratio < 1.0) {
7466 last_recall_state = ceph_clock_now();
7467 mds->server->recall_client_state(ratio);
7468 }
7469 }
7470
7471 // If the cache size had exceeded its limit, but we're back in bounds
7472 // now, free any unused pool memory so that our memory usage isn't
7473 // permanently bloated.
7474 if (exceeded_size_limit
7475 && CInode::count() <=
7476 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
7477 // Only do this once we are back in bounds: otherwise the releases would
7478 // slow down whatever process caused us to exceed bounds to begin with
7479 if (ceph_using_tcmalloc()) {
7480 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7481 << dendl;
7482 ceph_heap_release_free_memory();
7483 }
7484 exceeded_size_limit = false;
7485 }
7486 }
7487
7488
7489
7490 // =========================================================================================
7491 // shutdown
7492
7493 class C_MDC_ShutdownCheck : public MDCacheContext {
7494 public:
7495 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7496 void finish(int) override {
7497 mdcache->shutdown_check();
7498 }
7499 };
7500
7501 void MDCache::shutdown_check()
7502 {
7503 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7504
7505 // cache
7506 char old_val[32] = { 0 };
7507 char *o = old_val;
7508 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7509 g_conf->set_val("debug_mds", "10");
7510 g_conf->apply_changes(NULL);
7511 show_cache();
7512 g_conf->set_val("debug_mds", old_val);
7513 g_conf->apply_changes(NULL);
7514 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7515
7516 // this
7517 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7518 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7519
7520
7521 if (mds->objecter->is_active()) {
7522 dout(0) << "objecter still active" << dendl;
7523 mds->objecter->dump_active();
7524 }
7525 }
7526
7527
7528 void MDCache::shutdown_start()
7529 {
7530 dout(2) << "shutdown_start" << dendl;
7531
7532 if (g_conf->mds_shutdown_check)
7533 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7534
7535 // g_conf->debug_mds = 10;
7536 }
7537
7538
7539
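/*
 * Summary of the gates in the body below, in order; each "return false"
 * means "not finished, try again on a later pass": export strays to mds.0,
 * drop stray pins, trim the cache, hand auth subtrees back, close client
 * sessions, trim/cap/flush the journal, write the now-empty journal head,
 * wait for the objecter to go idle, empty both LRUs, and finally tear down
 * the mydir subtree and myin.
 */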
7540 bool MDCache::shutdown_pass()
7541 {
7542 dout(7) << "shutdown_pass" << dendl;
7543
7544 if (mds->is_stopped()) {
7545 dout(7) << " already shut down" << dendl;
7546 show_cache();
7547 show_subtrees();
7548 return true;
7549 }
7550
7551 // empty stray dir
7552 if (!shutdown_export_strays()) {
7553 dout(7) << "waiting for strays to migrate" << dendl;
7554 return false;
7555 }
7556
7557 // drop our reference to our stray dir inode
7558 for (int i = 0; i < NUM_STRAY; ++i) {
7559 if (strays[i] &&
7560 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7561 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7562 strays[i]->put(CInode::PIN_STRAY);
7563 strays[i]->put_stickydirs();
7564 }
7565 }
7566
7567 // trim cache
7568 trim(0);
7569 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7570
7571 // SUBTREES
7572 int num_auth_subtree = 0;
7573 if (!subtrees.empty() &&
7574 mds->get_nodeid() != 0 &&
7575 migrator->get_export_queue_size() == 0) {
7576 dout(7) << "looking for subtrees to export to mds0" << dendl;
7577 list<CDir*> ls;
7578 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7579 it != subtrees.end();
7580 ++it) {
7581 CDir *dir = it->first;
7582 if (dir->get_inode()->is_mdsdir())
7583 continue;
7584 if (dir->is_auth()) {
7585 num_auth_subtree++;
7586 if (dir->is_frozen() ||
7587 dir->is_freezing() ||
7588 dir->is_ambiguous_dir_auth() ||
7589 dir->state_test(CDir::STATE_EXPORTING))
7590 continue;
7591 ls.push_back(dir);
7592 }
7593 }
7594 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7595 CDir *dir = *p;
7596 mds_rank_t dest = dir->get_inode()->authority().first;
7597 if (dest > 0 && !mds->mdsmap->is_active(dest))
7598 dest = 0;
7599 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7600 migrator->export_dir_nicely(dir, dest);
7601 }
7602 }
7603
7604 if (num_auth_subtree > 0) {
7605 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7606 show_subtrees();
7607 return false;
7608 }
7609
7610 // close out any sessions (and open files!) before we try to trim the log, etc.
7611 if (mds->sessionmap.have_unclosed_sessions()) {
7612 if (!mds->server->terminating_sessions)
7613 mds->server->terminate_sessions();
7614 return false;
7615 }
7616
7617 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7618 if (mydir && !mydir->is_subtree_root())
7619 mydir = NULL;
7620
7621 // subtrees map not empty yet?
7622 if (subtrees.size() > (mydir ? 1 : 0)) {
7623 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7624 show_subtrees();
7625 migrator->show_importing();
7626 migrator->show_exporting();
7627 if (!migrator->is_importing() && !migrator->is_exporting())
7628 show_cache();
7629 return false;
7630 }
7631 assert(!migrator->is_exporting());
7632 assert(!migrator->is_importing());
7633
7634 if ((myin && myin->is_auth_pinned()) ||
7635 (mydir && mydir->is_auth_pinned())) {
7636 dout(7) << "still have auth pinned objects" << dendl;
7637 return false;
7638 }
7639
7640 // flush what we can from the log
7641 mds->mdlog->trim(0);
7642 if (mds->mdlog->get_num_segments() > 1) {
7643 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7644 return false;
7645 }
7646
7647 // (only do this once!)
7648 if (!mds->mdlog->is_capped()) {
7649 dout(7) << "capping the log" << dendl;
7650 mds->mdlog->cap();
7651 mds->mdlog->trim();
7652 }
7653
7654 if (!mds->mdlog->empty()) {
7655 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7656 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7657 return false;
7658 }
7659
7660 if (!did_shutdown_log_cap) {
7661 // flush journal header
7662 dout(7) << "writing header for (now-empty) journal" << dendl;
7663 assert(mds->mdlog->empty());
7664 mds->mdlog->write_head(0);
7665 // NOTE: filer active checker below will block us until this completes.
7666 did_shutdown_log_cap = true;
7667 return false;
7668 }
7669
7670 // filer active?
7671 if (mds->objecter->is_active()) {
7672 dout(7) << "objecter still active" << dendl;
7673 mds->objecter->dump_active();
7674 return false;
7675 }
7676
7677 // trim what we can from the cache
7678 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7679 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7680 show_cache();
7681 //dump();
7682 return false;
7683 }
7684
7685 // make mydir subtree go away
7686 if (mydir) {
7687 if (mydir->get_num_ref() > 1) { // subtree pin
7688 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7689 show_cache();
7690 return false;
7691 }
7692
7693 remove_subtree(mydir);
7694 myin->close_dirfrag(mydir->get_frag());
7695 }
7696 assert(subtrees.empty());
7697
7698 if (myin)
7699 remove_inode(myin);
7700
7701 // done!
7702 dout(2) << "shutdown done." << dendl;
7703 return true;
7704 }
7705
7706 bool MDCache::shutdown_export_strays()
7707 {
7708 if (mds->get_nodeid() == 0)
7709 return true;
7710
7711 dout(10) << "shutdown_export_strays" << dendl;
7712
7713 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7714
7715 bool done = true;
7716
7717 list<CDir*> dfs;
7718 for (int i = 0; i < NUM_STRAY; ++i) {
7719 if (!strays[i]) {
7720 continue;
7721 }
7722 strays[i]->get_dirfrags(dfs);
7723 }
7724
7725 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7726 dfs_i != dfs.end(); ++dfs_i)
7727 {
7728 CDir *dir = *dfs_i;
7729
7730 if (!dir->is_complete()) {
7731 dir->fetch(0);
7732 done = false;
7733 if (!mds0_active)
7734 break;
7735 }
7736
7737 for (CDir::map_t::iterator p = dir->items.begin();
7738 p != dir->items.end();
7739 ++p) {
7740 CDentry *dn = p->second;
7741 CDentry::linkage_t *dnl = dn->get_linkage();
7742 if (dnl->is_null())
7743 continue;
7744 done = false;
7745 if (!mds0_active)
7746 break;
7747
7748 if (dn->state_test(CDentry::STATE_PURGING)) {
7749 // Don't try to migrate anything that is actually
7750 // being purged right now
7751 continue;
7752 }
7753
7754 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7755 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7756 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7757 } else {
7758 dout(10) << "already exporting " << *dn << dendl;
7759 }
7760 }
7761 }
7762
7763 return done;
7764 }
7765
7766 // ========= messaging ==============
7767
7768 /* This function DOES put the passed message before returning */
7769 void MDCache::dispatch(Message *m)
7770 {
7771 switch (m->get_type()) {
7772
7773 // RESOLVE
7774 case MSG_MDS_RESOLVE:
7775 handle_resolve(static_cast<MMDSResolve*>(m));
7776 break;
7777 case MSG_MDS_RESOLVEACK:
7778 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7779 break;
7780
7781 // REJOIN
7782 case MSG_MDS_CACHEREJOIN:
7783 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7784 break;
7785
7786 case MSG_MDS_DISCOVER:
7787 handle_discover(static_cast<MDiscover*>(m));
7788 break;
7789 case MSG_MDS_DISCOVERREPLY:
7790 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7791 break;
7792
7793 case MSG_MDS_DIRUPDATE:
7794 handle_dir_update(static_cast<MDirUpdate*>(m));
7795 break;
7796
7797 case MSG_MDS_CACHEEXPIRE:
7798 handle_cache_expire(static_cast<MCacheExpire*>(m));
7799 break;
7800
7801 case MSG_MDS_DENTRYLINK:
7802 handle_dentry_link(static_cast<MDentryLink*>(m));
7803 break;
7804 case MSG_MDS_DENTRYUNLINK:
7805 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7806 break;
7807
7808 case MSG_MDS_FRAGMENTNOTIFY:
7809 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7810 break;
7811
7812 case MSG_MDS_FINDINO:
7813 handle_find_ino(static_cast<MMDSFindIno *>(m));
7814 break;
7815 case MSG_MDS_FINDINOREPLY:
7816 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7817 break;
7818
7819 case MSG_MDS_OPENINO:
7820 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7821 break;
7822 case MSG_MDS_OPENINOREPLY:
7823 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7824 break;
7825
7826 default:
7827 derr << "cache unknown message " << m->get_type() << dendl;
7828 assert(0 == "cache unknown message");
7829 }
7830 }
7831
7832 MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7833 {
7834 if (mdr) {
7835 dout(20) << "_get_waiter retryrequest" << dendl;
7836 return new C_MDS_RetryRequest(this, mdr);
7837 } else if (req) {
7838 dout(20) << "_get_waiter retrymessage" << dendl;
7839 return new C_MDS_RetryMessage(mds, req);
7840 } else {
7841 return fin;
7842 }
7843 }
7844
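/*
 * Return convention of path_traverse(), as used by the body below:
 *   0  - success: *pin and *pdnvec describe the whole path
 *   1  - in progress: a waiter/discover/fetch was queued and the caller
 *        will be retried
 *   2  - the request was forwarded to another MDS (forward mode only)
 *  <0  - error (-ENOENT, -ENOTDIR, -ESTALE, -EINVAL, -EIO)
 *
 * A hypothetical caller sketch (names outside this function are
 * placeholders, not actual call sites):
 *
 *   vector<CDentry*> trace;
 *   CInode *in = nullptr;
 *   int r = path_traverse(mdr, NULL, NULL, refpath, &trace, &in,
 *                         MDS_TRAVERSE_FORWARD);
 *   if (r > 0) return;                  // waiting or forwarded; retried later
 *   if (r < 0) { reply_error(mdr, r); return; }   // placeholder helper
 *   // r == 0: 'in' and 'trace' are valid
 */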
7845 int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7846 const filepath& path, // what
7847 vector<CDentry*> *pdnvec, // result
7848 CInode **pin,
7849 int onfail)
7850 {
7851 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7852 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7853 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7854
7855 assert(mdr || req || fin);
7856 assert(!forward || mdr || req); // forward requires a request
7857
7858 snapid_t snapid = CEPH_NOSNAP;
7859 if (mdr)
7860 mdr->snapid = snapid;
7861
7862 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7863
7864 if (mds->logger) mds->logger->inc(l_mds_traverse);
7865
7866 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7867 CInode *cur = get_inode(path.get_ino());
7868 if (cur == NULL) {
7869 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7870 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7871 else {
7872 //ceph_abort(); // hrm.. broken
7873 return -ESTALE;
7874 }
7875 return 1;
7876 }
7877 if (cur->state_test(CInode::STATE_PURGING))
7878 return -ESTALE;
7879
7880 // make sure snaprealm is open...
7881 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7882 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7883 return 1;
7884 }
7885
7886 // start trace
7887 if (pdnvec)
7888 pdnvec->clear();
7889 if (pin)
7890 *pin = cur;
7891
7892 unsigned depth = 0;
7893 while (depth < path.depth()) {
7894 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7895 << "' snapid " << snapid << dendl;
7896
7897 if (!cur->is_dir()) {
7898 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7899 return -ENOTDIR;
7900 }
7901
7902 // walk into snapdir?
7903 if (path[depth].length() == 0) {
7904 dout(10) << "traverse: snapdir" << dendl;
7905 if (!mdr)
7906 return -EINVAL;
7907 snapid = CEPH_SNAPDIR;
7908 mdr->snapid = snapid;
7909 depth++;
7910 continue;
7911 }
7912 // walk thru snapdir?
7913 if (snapid == CEPH_SNAPDIR) {
7914 if (!mdr)
7915 return -EINVAL;
7916 SnapRealm *realm = cur->find_snaprealm();
7917 snapid = realm->resolve_snapname(path[depth], cur->ino());
7918 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7919 if (!snapid)
7920 return -ENOENT;
7921 mdr->snapid = snapid;
7922 depth++;
7923 continue;
7924 }
7925
7926 // open dir
7927 frag_t fg = cur->pick_dirfrag(path[depth]);
7928 CDir *curdir = cur->get_dirfrag(fg);
7929 if (!curdir) {
7930 if (cur->is_auth()) {
7931 // parent dir frozen_dir?
7932 if (cur->is_frozen()) {
7933 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7934 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7935 return 1;
7936 }
7937 curdir = cur->get_or_open_dirfrag(this, fg);
7938 } else {
7939 // discover?
7940 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7941 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7942 null_okay);
7943 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7944 return 1;
7945 }
7946 }
7947 assert(curdir);
7948
7949 #ifdef MDS_VERIFY_FRAGSTAT
7950 if (curdir->is_complete())
7951 curdir->verify_fragstat();
7952 #endif
7953
7954 // frozen?
7955 /*
7956 if (curdir->is_frozen()) {
7957 // doh!
7958 // FIXME: traverse is allowed?
7959 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7960 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7961 if (onfinish) delete onfinish;
7962 return 1;
7963 }
7964 */
7965
7966 // Before doing dirfrag->dn lookup, compare with DamageTable's
7967 // record of which dentries were unreadable
7968 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7969 dout(4) << "traverse: stopped lookup at damaged dentry "
7970 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7971 return -EIO;
7972 }
7973
7974 // dentry
7975 CDentry *dn = curdir->lookup(path[depth], snapid);
7976 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7977
7978 // null and last_bit and xlocked by me?
7979 if (dnl && dnl->is_null() && null_okay) {
7980 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7981 if (pdnvec)
7982 pdnvec->push_back(dn);
7983 if (pin)
7984 *pin = 0;
7985 break; // done!
7986 }
7987
7988 if (dnl &&
7989 dn->lock.is_xlocked() &&
7990 dn->lock.get_xlock_by() != mdr &&
7991 !dn->lock.can_read(client) &&
7992 (dnl->is_null() || forward)) {
7993 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7994 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7995 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7996 mds->mdlog->flush();
7997 return 1;
7998 }
7999
8000 // can we conclude ENOENT?
8001 if (dnl && dnl->is_null()) {
8002 if (dn->lock.can_read(client) ||
8003 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8004 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8005 if (pdnvec) {
8006 if (depth == path.depth() - 1)
8007 pdnvec->push_back(dn);
8008 else
8009 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8010 }
8011 return -ENOENT;
8012 } else {
8013 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
8014 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8015 return 1;
8016 }
8017 }
8018
8019 if (dnl && !dnl->is_null()) {
8020 CInode *in = dnl->get_inode();
8021
8022 // do we have inode?
8023 if (!in) {
8024 assert(dnl->is_remote());
8025 // do i have it?
8026 in = get_inode(dnl->get_remote_ino());
8027 if (in) {
8028 dout(7) << "linking in remote in " << *in << dendl;
8029 dn->link_remote(dnl, in);
8030 } else {
8031 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8032 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8033 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8034 dout(4) << "traverse: remote dentry points to damaged ino "
8035 << *dn << dendl;
8036 return -EIO;
8037 }
8038 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8039 (null_okay && depth == path.depth() - 1));
8040 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8041 return 1;
8042 }
8043 }
8044
8045 cur = in;
8046 // make sure snaprealm is open...
8047 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8048 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8049 return 1;
8050 }
8051
8052 // add to trace, continue.
8053 touch_inode(cur);
8054 if (pdnvec)
8055 pdnvec->push_back(dn);
8056 if (pin)
8057 *pin = cur;
8058 depth++;
8059 continue;
8060 }
8061
8062
8063 // MISS. dentry doesn't exist.
8064 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8065
8066 if (curdir->is_auth()) {
8067 // dentry is mine.
8068 if (curdir->is_complete() ||
8069 (snapid == CEPH_NOSNAP &&
8070 curdir->has_bloom() &&
8071 !curdir->is_in_bloom(path[depth]))){
8072 // file not found
8073 if (pdnvec) {
8074 // instantiate a null dn?
8075 if (depth < path.depth()-1){
8076 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8077 dn = NULL;
8078 } else if (dn) {
8079 ceph_abort(); // should have fallen out in ->is_null() check above
8080 } else if (curdir->is_frozen()) {
8081 dout(20) << " not adding null to frozen dir " << dendl;
8082 } else if (snapid < CEPH_MAXSNAP) {
8083 dout(20) << " not adding null for snapid " << snapid << dendl;
8084 } else {
8085 // create a null dentry
8086 dn = curdir->add_null_dentry(path[depth]);
8087 dout(20) << " added null " << *dn << dendl;
8088 }
8089 if (dn)
8090 pdnvec->push_back(dn);
8091 else
8092 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8093 }
8094 return -ENOENT;
8095 } else {
8096
8097 // Check DamageTable for missing fragments before trying to fetch
8098 // this
8099 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8100 dout(4) << "traverse: damaged dirfrag " << *curdir
8101 << ", blocking fetch" << dendl;
8102 return -EIO;
8103 }
8104
8105 // directory isn't complete; reload
8106 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8107 touch_inode(cur);
8108 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8109 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8110 return 1;
8111 }
8112 } else {
8113 // dirfrag/dentry is not mine.
8114 mds_authority_t dauth = curdir->authority();
8115
8116 if (forward &&
8117 snapid && mdr && mdr->client_request &&
8118 (int)depth < mdr->client_request->get_num_fwd()) {
8119 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8120 << " < fwd " << mdr->client_request->get_num_fwd()
8121 << ", discovering instead of forwarding" << dendl;
8122 discover = true;
8123 }
8124
8125 if ((discover || null_okay)) {
8126 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8127 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8128 null_okay);
8129 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8130 return 1;
8131 }
8132 if (forward) {
8133 // forward
8134 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8135
8136 if (curdir->is_ambiguous_auth()) {
8137 // wait
8138 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8139 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8140 return 1;
8141 }
8142
8143 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8144
8145 if (mdr)
8146 request_forward(mdr, dauth.first);
8147 else
8148 mds->forward_message_mds(req, dauth.first);
8149
8150 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8151 assert(fin == NULL);
8152 return 2;
8153 }
8154 }
8155
8156 ceph_abort(); // i shouldn't get here
8157 }
8158
8159 // success.
8160 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8161 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8162 if (mdr)
8163 assert(mdr->snapid == snapid);
8164 return 0;
8165 }
8166
8167 CInode *MDCache::cache_traverse(const filepath& fp)
8168 {
8169 dout(10) << "cache_traverse " << fp << dendl;
8170
8171 CInode *in;
8172 if (fp.get_ino())
8173 in = get_inode(fp.get_ino());
8174 else
8175 in = root;
8176 if (!in)
8177 return NULL;
8178
8179 for (unsigned i = 0; i < fp.depth(); i++) {
8180 const string& dname = fp[i];
8181 frag_t fg = in->pick_dirfrag(dname);
8182 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8183 CDir *curdir = in->get_dirfrag(fg);
8184 if (!curdir)
8185 return NULL;
8186 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8187 if (!dn)
8188 return NULL;
8189 in = dn->get_linkage()->get_inode();
8190 if (!in)
8191 return NULL;
8192 }
8193 dout(10) << " got " << *in << dendl;
8194 return in;
8195 }
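
// Usage sketch (hypothetical caller; cache_traverse never fetches or
// discovers, so it only succeeds if every component is already in cache):
//
//   CInode *in = cache_traverse(filepath("a/b/c", CEPH_INO_ROOT));
//   if (!in)
//     dout(10) << "path not fully cached" << dendl;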
8196
8197
8198 /**
8199 * open_remote_dirfrag -- open up a remote dirfrag
8200 *
8201 * @param diri base inode
8202 * @param approxfg approximate fragment.
8203 * @param fin completion callback
8204 */
8205 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8206 {
8207 dout(10) << "open_remote_dir on " << *diri << dendl;
8208 assert(diri->is_dir());
8209 assert(!diri->is_auth());
8210 assert(diri->get_dirfrag(approxfg) == 0);
8211
8212 discover_dir_frag(diri, approxfg, fin);
8213 }
8214
8215
8216 /**
8217 * get_dentry_inode - get or open inode
8218 *
8219 * @param dn the dentry
8220 * @param mdr current request
8221 *
8222 * will return inode for primary, or link up/open up remote link's inode as necessary.
8223 * If it's not available right now, puts mdr on wait list and returns null.
8224 */
8225 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8226 {
8227 CDentry::linkage_t *dnl;
8228 if (projected)
8229 dnl = dn->get_projected_linkage();
8230 else
8231 dnl = dn->get_linkage();
8232
8233 assert(!dnl->is_null());
8234
8235 if (dnl->is_primary())
8236 return dnl->inode;
8237
8238 assert(dnl->is_remote());
8239 CInode *in = get_inode(dnl->get_remote_ino());
8240 if (in) {
8241 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8242 dn->link_remote(dnl, in);
8243 return in;
8244 } else {
8245 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8246 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8247 return 0;
8248 }
8249 }
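
// Usage sketch (hypothetical): a null return doubles as "wait", because
// open_remote_dentry() has already queued a retry of mdr:
//
//   CInode *in = get_dentry_inode(dn, mdr, true /* projected */);
//   if (!in)
//     return;   // mdr is retried once the remote inode has been opened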
8250
8251 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8252 CDentry *dn;
8253 inodeno_t ino;
8254 MDSInternalContextBase *onfinish;
8255 bool want_xlocked;
8256 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8257 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8258 dn->get(MDSCacheObject::PIN_PTRWAITER);
8259 }
8260 void finish(int r) override {
8261 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8262 dn->put(MDSCacheObject::PIN_PTRWAITER);
8263 }
8264 };
8265
8266 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8267 {
8268 dout(10) << "open_remote_dentry " << *dn << dendl;
8269 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8270 inodeno_t ino = dnl->get_remote_ino();
8271 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8272 open_ino(ino, pool,
8273 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8274 }
8275
8276 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8277 bool want_xlocked, int r)
8278 {
8279 if (r < 0) {
8280 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8281 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8282 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8283 dn->state_set(CDentry::STATE_BADREMOTEINO);
8284
8285 std::string path;
8286 CDir *dir = dn->get_dir();
8287 if (dir) {
8288 dir->get_inode()->make_path_string(path);
8289 path = path + "/" + dn->get_name();
8290 }
8291
8292 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8293 if (fatal) {
8294 mds->damaged();
8295 ceph_abort(); // unreachable, damaged() respawns us
8296 }
8297 } else {
8298 r = 0;
8299 }
8300 }
8301 fin->complete(r < 0 ? r : 0);
8302 }
8303
8304
8305 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8306 {
8307 // empty trace if we're a base inode
8308 if (in->is_base())
8309 return;
8310
8311 CInode *parent = in->get_parent_inode();
8312 assert(parent);
8313 make_trace(trace, parent);
8314
8315 CDentry *dn = in->get_parent_dn();
8316 dout(15) << "make_trace adding " << *dn << dendl;
8317 trace.push_back(dn);
8318 }
8319
8320
8321 // -------------------------------------------------------------------------------
8322 // Open inode by inode number
8323
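/*
 * Overview of the machinery below: the inode's backtrace object is fetched
 * from the pool recorded in open_ino_info_t (falling back to the metadata
 * pool), decoded into a vector<inode_backpointer_t> ordered nearest parent
 * first, and the ancestry is then re-walked from whatever we already have
 * cached.  For a file /a/b/c the ancestors would look roughly like
 *   [ {ino(b), "c", v}, {ino(a), "b", v}, {ino(/), "a", v} ]
 * If the local walk fails, peers are polled with MMDSOpenIno and the reply's
 * hint/ancestors restart the process (see do_open_ino / do_open_ino_peer).
 */
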
8324 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8325 inodeno_t ino;
8326 public:
8327 bufferlist bl;
8328 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8329 MDCacheIOContext(c), ino(i) {}
8330 void finish(int r) override {
8331 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8332 }
8333 };
8334
8335 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8336 inodeno_t ino;
8337 MMDSOpenIno *msg;
8338 bool parent;
8339 public:
8340 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8341 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8342 void finish(int r) override {
8343 if (r < 0 && !parent)
8344 r = -EAGAIN;
8345 if (msg) {
8346 mdcache->handle_open_ino(msg, r);
8347 return;
8348 }
8349 assert(mdcache->opening_inodes.count(ino));
8350 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8351 }
8352 };
8353
8354 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8355 inodeno_t ino;
8356 public:
8357 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8358 void finish(int r) override {
8359 mdcache->_open_ino_parent_opened(ino, r);
8360 }
8361 };
8362
8363 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8364 {
8365 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8366
8367 assert(opening_inodes.count(ino));
8368 open_ino_info_t& info = opening_inodes[ino];
8369
8370 CInode *in = get_inode(ino);
8371 if (in) {
8372 dout(10) << " found cached " << *in << dendl;
8373 open_ino_finish(ino, info, in->authority().first);
8374 return;
8375 }
8376
8377 inode_backtrace_t backtrace;
8378 if (err == 0) {
8379 try {
8380 ::decode(backtrace, bl);
8381 } catch (const buffer::error &decode_exc) {
8382 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8383 << std::dec << ": " << decode_exc << dendl;
8384 open_ino_finish(ino, info, -EIO);
8385 return;
8386 }
8387 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8388 dout(10) << " old object in pool " << info.pool
8389 << ", retrying pool " << backtrace.pool << dendl;
8390 info.pool = backtrace.pool;
8391 C_IO_MDC_OpenInoBacktraceFetched *fin =
8392 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8393 fetch_backtrace(ino, info.pool, fin->bl,
8394 new C_OnFinisher(fin, mds->finisher));
8395 return;
8396 }
8397 } else if (err == -ENOENT) {
8398 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8399 if (info.pool != meta_pool) {
8400 dout(10) << " no object in pool " << info.pool
8401 << ", retrying pool " << meta_pool << dendl;
8402 info.pool = meta_pool;
8403 C_IO_MDC_OpenInoBacktraceFetched *fin =
8404 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8405 fetch_backtrace(ino, info.pool, fin->bl,
8406 new C_OnFinisher(fin, mds->finisher));
8407 return;
8408 }
8409 err = 0; // backtrace.ancestors.empty() is checked below
8410 }
8411
8412 if (err == 0) {
8413 if (backtrace.ancestors.empty()) {
8414 dout(10) << " got empty backtrace " << dendl;
8415 err = -EIO;
8416 } else if (!info.ancestors.empty()) {
8417 if (info.ancestors[0] == backtrace.ancestors[0]) {
8418 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8419 err = -EINVAL;
8420 } else {
8421 info.last_err = 0;
8422 }
8423 }
8424 }
8425 if (err) {
8426 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8427 if (info.last_err)
8428 err = info.last_err;
8429 open_ino_finish(ino, info, err);
8430 return;
8431 }
8432
8433 dout(10) << " got backtrace " << backtrace << dendl;
8434 info.ancestors = backtrace.ancestors;
8435
8436 _open_ino_traverse_dir(ino, info, 0);
8437 }
8438
8439 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8440 {
8441 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8442
8443 assert(opening_inodes.count(ino));
8444 open_ino_info_t& info = opening_inodes[ino];
8445
8446 CInode *in = get_inode(ino);
8447 if (in) {
8448 dout(10) << " found cached " << *in << dendl;
8449 open_ino_finish(ino, info, in->authority().first);
8450 return;
8451 }
8452
8453 if (ret == mds->get_nodeid()) {
8454 _open_ino_traverse_dir(ino, info, 0);
8455 } else {
8456 if (ret >= 0) {
8457 mds_rank_t checked_rank = mds_rank_t(ret);
8458 info.check_peers = true;
8459 info.auth_hint = checked_rank;
8460 info.checked.erase(checked_rank);
8461 }
8462 do_open_ino(ino, info, ret);
8463 }
8464 }
8465
8466 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8467 {
8468 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8469
8470 CInode *in = get_inode(ino);
8471 if (in) {
8472 dout(10) << " found cached " << *in << dendl;
8473 open_ino_finish(ino, info, in->authority().first);
8474 return;
8475 }
8476
8477 if (ret) {
8478 do_open_ino(ino, info, ret);
8479 return;
8480 }
8481
8482 mds_rank_t hint = info.auth_hint;
8483 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8484 info.discover, info.want_xlocked, &hint);
8485 if (ret > 0)
8486 return;
8487 if (hint != mds->get_nodeid())
8488 info.auth_hint = hint;
8489 do_open_ino(ino, info, ret);
8490 }
8491
8492 void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8493 {
8494 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8495 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8496 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8497 }
8498
8499 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8500 vector<inode_backpointer_t>& ancestors,
8501 bool discover, bool want_xlocked, mds_rank_t *hint)
8502 {
8503 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8504 int err = 0;
8505 for (unsigned i = 0; i < ancestors.size(); i++) {
8506 CInode *diri = get_inode(ancestors[i].dirino);
8507
8508 if (!diri) {
8509 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8510 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8511 return 1;
8512 }
8513 continue;
8514 }
8515
8516 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8517 CDir *dir = diri->get_parent_dir();
8518 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8519 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8520 dir = dir->get_inode()->get_parent_dir();
8521 _open_ino_fetch_dir(ino, m, dir, i == 0);
8522 return 1;
8523 }
8524
8525 if (!diri->is_dir()) {
8526 dout(10) << " " << *diri << " is not dir" << dendl;
8527 if (i == 0)
8528 err = -ENOTDIR;
8529 break;
8530 }
8531
8532 string &name = ancestors[i].dname;
8533 frag_t fg = diri->pick_dirfrag(name);
8534 CDir *dir = diri->get_dirfrag(fg);
8535 if (!dir) {
8536 if (diri->is_auth()) {
8537 if (diri->is_frozen()) {
8538 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8539 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8540 return 1;
8541 }
8542 dir = diri->get_or_open_dirfrag(this, fg);
8543 } else if (discover) {
8544 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8545 return 1;
8546 }
8547 }
8548 if (dir) {
8549 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8550 CDentry *dn = dir->lookup(name);
8551 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8552 if (dir->is_auth()) {
8553 if (dnl && dnl->is_primary() &&
8554 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8555 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8556 _open_ino_fetch_dir(ino, m, dir, i == 0);
8557 return 1;
8558 }
8559
8560 if (!dnl && !dir->is_complete() &&
8561 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8562 dout(10) << " fetching incomplete " << *dir << dendl;
8563 _open_ino_fetch_dir(ino, m, dir, i == 0);
8564 return 1;
8565 }
8566
8567 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8568 if (i == 0)
8569 err = -ENOENT;
8570 } else if (discover) {
8571 if (!dnl) {
8572 filepath path(name, 0);
8573 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8574 (i == 0 && want_xlocked));
8575 return 1;
8576 }
8577 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8578 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8579 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8580 return 1;
8581 }
8582 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8583 if (i == 0)
8584 err = -ENOENT;
8585 }
8586 }
8587 if (hint && i == 0)
8588 *hint = dir ? dir->authority().first : diri->authority().first;
8589 break;
8590 }
8591 return err;
8592 }
8593
8594 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8595 {
8596 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8597
8598 list<MDSInternalContextBase*> waiters;
8599 waiters.swap(info.waiters);
8600 opening_inodes.erase(ino);
8601 finish_contexts(g_ceph_context, waiters, ret);
8602 }
8603
8604 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8605 {
8606 if (err < 0 && err != -EAGAIN) {
8607 info.checked.clear();
8608 info.checking = MDS_RANK_NONE;
8609 info.check_peers = true;
8610 info.fetch_backtrace = true;
8611 if (info.discover) {
8612 info.discover = false;
8613 info.ancestors.clear();
8614 }
8615 if (err != -ENOENT && err != -ENOTDIR)
8616 info.last_err = err;
8617 }
8618
8619 if (info.check_peers || info.discover) {
8620 if (info.discover) {
8621 // got backtrace from peer, but failed to find inode. re-check peers
8622 info.discover = false;
8623 info.ancestors.clear();
8624 info.checked.clear();
8625 }
8626 info.check_peers = false;
8627 info.checking = MDS_RANK_NONE;
8628 do_open_ino_peer(ino, info);
8629 } else if (info.fetch_backtrace) {
8630 info.check_peers = true;
8631 info.fetch_backtrace = false;
8632 info.checking = mds->get_nodeid();
8633 info.checked.clear();
8634 C_IO_MDC_OpenInoBacktraceFetched *fin =
8635 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8636 fetch_backtrace(ino, info.pool, fin->bl,
8637 new C_OnFinisher(fin, mds->finisher));
8638 } else {
8639 assert(!info.ancestors.empty());
8640 info.checking = mds->get_nodeid();
8641 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8642 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8643 }
8644 }
8645
8646 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8647 {
8648 set<mds_rank_t> all, active;
8649 mds->mdsmap->get_mds_set(all);
8650 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8651 if (mds->get_state() == MDSMap::STATE_REJOIN)
8652 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8653
8654 dout(10) << "do_open_ino_peer " << ino << " active " << active
8655 << " all " << all << " checked " << info.checked << dendl;
8656
8657 mds_rank_t peer = MDS_RANK_NONE;
8658 if (info.auth_hint >= 0) {
8659 if (active.count(info.auth_hint)) {
8660 peer = info.auth_hint;
8661 info.auth_hint = MDS_RANK_NONE;
8662 }
8663 } else {
8664 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8665 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8666 peer = *p;
8667 break;
8668 }
8669 }
8670 if (peer < 0) {
8671 all.erase(mds->get_nodeid());
8672 if (all != info.checked) {
8673 dout(10) << " waiting for more peers to be active" << dendl;
8674 } else {
8675 dout(10) << " all MDS peers have been checked " << dendl;
8676 do_open_ino(ino, info, 0);
8677 }
8678 } else {
8679 info.checking = peer;
8680 vector<inode_backpointer_t> *pa = NULL;
8681 // got backtrace from peer or backtrace just fetched
8682 if (info.discover || !info.fetch_backtrace)
8683 pa = &info.ancestors;
8684 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8685 }
8686 }
8687
8688 void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8689 {
8690 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8691 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8692 m->put();
8693 return;
8694 }
8695
8696 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8697
8698 inodeno_t ino = m->ino;
8699 MMDSOpenInoReply *reply;
8700 CInode *in = get_inode(ino);
8701 if (in) {
8702 dout(10) << " have " << *in << dendl;
8703 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8704 if (in->is_auth()) {
8705 touch_inode(in);
8706 while (1) {
8707 CDentry *pdn = in->get_parent_dn();
8708 if (!pdn)
8709 break;
8710 CInode *diri = pdn->get_dir()->get_inode();
8711 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
8712 in->inode.version));
8713 in = diri;
8714 }
8715 } else {
8716 reply->hint = in->authority().first;
8717 }
8718 } else if (err < 0) {
8719 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8720 } else {
8721 mds_rank_t hint = MDS_RANK_NONE;
8722 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8723 if (ret > 0)
8724 return;
8725 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8726 }
8727 m->get_connection()->send_message(reply);
8728 m->put();
8729 }
8730
8731 void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8732 {
8733 dout(10) << "handle_open_ino_reply " << *m << dendl;
8734
8735 inodeno_t ino = m->ino;
8736 mds_rank_t from = mds_rank_t(m->get_source().num());
8737 auto it = opening_inodes.find(ino);
8738 if (it != opening_inodes.end() && it->second.checking == from) {
8739 open_ino_info_t& info = it->second;
8740 info.checking = MDS_RANK_NONE;
8741 info.checked.insert(from);
8742
8743 CInode *in = get_inode(ino);
8744 if (in) {
8745 dout(10) << " found cached " << *in << dendl;
8746 open_ino_finish(ino, info, in->authority().first);
8747 } else if (!m->ancestors.empty()) {
8748 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8749 if (!info.want_replica) {
8750 open_ino_finish(ino, info, from);
8751 m->put();
8752 return;
8753 }
8754
8755 info.ancestors = m->ancestors;
8756 info.auth_hint = from;
8757 info.checking = mds->get_nodeid();
8758 info.discover = true;
8759 _open_ino_traverse_dir(ino, info, 0);
8760 } else if (m->error) {
8761 dout(10) << " error " << m->error << " from mds." << from << dendl;
8762 do_open_ino(ino, info, m->error);
8763 } else {
8764 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8765 info.auth_hint = m->hint;
8766 info.checked.erase(m->hint);
8767 }
8768 do_open_ino_peer(ino, info);
8769 }
8770 }
8771 m->put();
8772 }
8773
8774 void MDCache::kick_open_ino_peers(mds_rank_t who)
8775 {
8776 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8777
8778 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8779 p != opening_inodes.end();
8780 ++p) {
8781 open_ino_info_t& info = p->second;
8782 if (info.checking == who) {
8783 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8784 info.checking = MDS_RANK_NONE;
8785 do_open_ino_peer(p->first, info);
8786 } else if (info.checking == MDS_RANK_NONE) {
8787 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8788 do_open_ino_peer(p->first, info);
8789 }
8790 }
8791 }
8792
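// Entry point: locate and open inode 'ino'. If a lookup for this ino is
// already in flight we join its waiter list (possibly upgrading it to
// want_replica/want_xlocked); otherwise register a new open_ino_info_t and
// start do_open_ino().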
8793 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8794 bool want_replica, bool want_xlocked)
8795 {
8796 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8797 << want_replica << dendl;
8798
8799 if (opening_inodes.count(ino)) {
8800 open_ino_info_t& info = opening_inodes[ino];
8801 if (want_replica) {
8802 info.want_replica = true;
8803 if (want_xlocked && !info.want_xlocked) {
8804 if (!info.ancestors.empty()) {
8805 CInode *diri = get_inode(info.ancestors[0].dirino);
8806 if (diri) {
8807 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8808 CDir *dir = diri->get_dirfrag(fg);
8809 if (dir && !dir->is_auth()) {
8810 filepath path(info.ancestors[0].dname, 0);
8811 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8812 }
8813 }
8814 }
8815 info.want_xlocked = true;
8816 }
8817 }
8818 info.waiters.push_back(fin);
8819 } else {
8820 open_ino_info_t& info = opening_inodes[ino];
8821 info.want_replica = want_replica;
8822 info.want_xlocked = want_xlocked;
8823 info.tid = ++open_ino_last_tid;
8824 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8825 info.waiters.push_back(fin);
8826 do_open_ino(ino, info, 0);
8827 }
8828 }
8829
8830 /* ---------------------------- */
8831
8832 /*
8833 * search for a given inode on MDS peers. optionally start with the given node.
8834
8835
8836 TODO
8837 - recover from mds node failure, recovery
8838 - traverse path
8839
8840 */
8841 void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8842 {
8843 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8844 assert(!have_inode(ino));
8845
8846 ceph_tid_t tid = ++find_ino_peer_last_tid;
8847 find_ino_peer_info_t& fip = find_ino_peer[tid];
8848 fip.ino = ino;
8849 fip.tid = tid;
8850 fip.fin = c;
8851 fip.hint = hint;
8852 _do_find_ino_peer(fip);
8853 }
8854
8855 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8856 {
8857 set<mds_rank_t> all, active;
8858 mds->mdsmap->get_mds_set(all);
8859 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8860
8861 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8862 << " active " << active << " all " << all
8863 << " checked " << fip.checked
8864 << dendl;
8865
8866 mds_rank_t m = MDS_RANK_NONE;
8867 if (fip.hint >= 0) {
8868 m = fip.hint;
8869 fip.hint = MDS_RANK_NONE;
8870 } else {
8871 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8872 if (*p != mds->get_nodeid() &&
8873 fip.checked.count(*p) == 0) {
8874 m = *p;
8875 break;
8876 }
8877 }
8878 if (m == MDS_RANK_NONE) {
8879 all.erase(mds->get_nodeid());
8880 if (all != fip.checked) {
8881 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8882 } else {
8883 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8884 fip.fin->complete(-ESTALE);
8885 find_ino_peer.erase(fip.tid);
8886 }
8887 } else {
8888 fip.checking = m;
8889 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8890 }
8891 }
8892
8893 void MDCache::handle_find_ino(MMDSFindIno *m)
8894 {
8895 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8896 m->put();
8897 return;
8898 }
8899
8900 dout(10) << "handle_find_ino " << *m << dendl;
8901 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8902 CInode *in = get_inode(m->ino);
8903 if (in) {
8904 in->make_path(r->path);
8905 dout(10) << " have " << r->path << " " << *in << dendl;
8906 }
8907 m->get_connection()->send_message(r);
8908 m->put();
8909 }
8910
8911
8912 void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8913 {
8914 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8915 if (p != find_ino_peer.end()) {
8916 dout(10) << "handle_find_ino_reply " << *m << dendl;
8917 find_ino_peer_info_t& fip = p->second;
8918
8919 // success?
8920 if (get_inode(fip.ino)) {
8921 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8922 mds->queue_waiter(fip.fin);
8923 find_ino_peer.erase(p);
8924 m->put();
8925 return;
8926 }
8927
8928 mds_rank_t from = mds_rank_t(m->get_source().num());
8929 if (fip.checking == from)
8930 fip.checking = MDS_RANK_NONE;
8931 fip.checked.insert(from);
8932
8933 if (!m->path.empty()) {
8934 // we got a path!
8935 vector<CDentry*> trace;
8936 MDRequestRef null_ref;
8937 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8938 if (r > 0)
8939 return;
8940 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8941 << ", retrying" << dendl;
8942 fip.checked.clear();
8943 _do_find_ino_peer(fip);
8944 } else {
8945 // nope, continue.
8946 _do_find_ino_peer(fip);
8947 }
8948 } else {
8949 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8950 }
8951 m->put();
8952 }
8953
8954 void MDCache::kick_find_ino_peers(mds_rank_t who)
8955 {
8956 // find_ino_peers requests we should move on from
8957 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8958 p != find_ino_peer.end();
8959 ++p) {
8960 find_ino_peer_info_t& fip = p->second;
8961 if (fip.checking == who) {
8962 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8963 fip.checking = MDS_RANK_NONE;
8964 _do_find_ino_peer(fip);
8965 } else if (fip.checking == MDS_RANK_NONE) {
8966 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8967 _do_find_ino_peer(fip);
8968 }
8969 }
8970 }
8971
8972 /* ---------------------------- */
8973
8974 int MDCache::get_num_client_requests()
8975 {
8976 int count = 0;
8977 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8978 p != active_requests.end();
8979 ++p) {
8980 MDRequestRef& mdr = p->second;
8981 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8982 count++;
8983 }
8984 return count;
8985 }
8986
8987 /* This function takes over the reference to the passed Message */
8988 MDRequestRef MDCache::request_start(MClientRequest *req)
8989 {
8990 // did we win a forward race against a slave?
8991 if (active_requests.count(req->get_reqid())) {
8992 MDRequestRef& mdr = active_requests[req->get_reqid()];
8993 assert(mdr);
8994 if (mdr->is_slave()) {
8995 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8996 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8997 } else {
8998 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8999 req->put();
9000 }
9001 return MDRequestRef();
9002 }
9003
9004 // register new client request
9005 MDRequestImpl::Params params;
9006 params.reqid = req->get_reqid();
9007 params.attempt = req->get_num_fwd();
9008 params.client_req = req;
9009 params.initiated = req->get_recv_stamp();
9010 params.throttled = req->get_throttle_stamp();
9011 params.all_read = req->get_recv_complete_stamp();
9012 params.dispatched = req->get_dispatch_stamp();
9013
9014 MDRequestRef mdr =
9015 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9016 active_requests[params.reqid] = mdr;
9017 mdr->set_op_stamp(req->get_stamp());
9018 dout(7) << "request_start " << *mdr << dendl;
9019 return mdr;
9020 }
9021
9022 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9023 {
9024 int by = m->get_source().num();
9025 MDRequestImpl::Params params;
9026 params.reqid = ri;
9027 params.attempt = attempt;
9028 params.triggering_slave_req = m;
9029 params.slave_to = by;
9030 params.initiated = m->get_recv_stamp();
9031 params.throttled = m->get_throttle_stamp();
9032 params.all_read = m->get_recv_complete_stamp();
9033 params.dispatched = m->get_dispatch_stamp();
9034 MDRequestRef mdr =
9035 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9036 assert(active_requests.count(mdr->reqid) == 0);
9037 active_requests[mdr->reqid] = mdr;
9038 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9039 return mdr;
9040 }
9041
9042 MDRequestRef MDCache::request_start_internal(int op)
9043 {
9044 MDRequestImpl::Params params;
9045 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9046 params.reqid.tid = mds->issue_tid();
9047 params.initiated = ceph_clock_now();
9048 params.internal_op = op;
9049 MDRequestRef mdr =
9050 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9051
9052 assert(active_requests.count(mdr->reqid) == 0);
9053 active_requests[mdr->reqid] = mdr;
9054 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9055 return mdr;
9056 }
9057
9058 MDRequestRef MDCache::request_get(metareqid_t rid)
9059 {
9060 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9061 assert(p != active_requests.end());
9062 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9063 return p->second;
9064 }
9065
9066 void MDCache::request_finish(MDRequestRef& mdr)
9067 {
9068 dout(7) << "request_finish " << *mdr << dendl;
9069 mdr->mark_event("finishing request");
9070
9071 // slave finisher?
9072 if (mdr->has_more() && mdr->more()->slave_commit) {
9073 Context *fin = mdr->more()->slave_commit;
9074 mdr->more()->slave_commit = 0;
9075 int ret;
9076 if (mdr->aborted) {
9077 mdr->aborted = false;
9078 ret = -1;
9079 mdr->more()->slave_rolling_back = true;
9080 } else {
9081 ret = 0;
9082 mdr->committing = true;
9083 }
9084 fin->complete(ret); // this must re-call request_finish.
9085 return;
9086 }
9087
9088 switch(mdr->internal_op) {
9089 case CEPH_MDS_OP_FRAGMENTDIR:
9090 logger->inc(l_mdss_ireq_fragmentdir);
9091 break;
9092 case CEPH_MDS_OP_EXPORTDIR:
9093 logger->inc(l_mdss_ireq_exportdir);
9094 break;
9095 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9096 logger->inc(l_mdss_ireq_enqueue_scrub);
9097 break;
9098 case CEPH_MDS_OP_FLUSH:
9099 logger->inc(l_mdss_ireq_flush);
9100 break;
9101 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9102 logger->inc(l_mdss_ireq_fragstats);
9103 break;
9104 case CEPH_MDS_OP_REPAIR_INODESTATS:
9105 logger->inc(l_mdss_ireq_inodestats);
9106 break;
9107 }
9108
9109 request_cleanup(mdr);
9110 }
9111
9112
9113 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9114 {
9115 mdr->mark_event("forwarding request");
9116 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9117 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9118 << *mdr->client_request << dendl;
9119 mds->forward_message_mds(mdr->client_request, who);
9120 mdr->client_request = 0;
9121 if (mds->logger) mds->logger->inc(l_mds_forward);
9122 } else if (mdr->internal_op >= 0) {
9123 dout(10) << "request_forward on internal op; cancelling" << dendl;
9124 mdr->internal_op_finish->complete(-EXDEV);
9125 } else {
9126 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9127 << " was from mds" << dendl;
9128 }
9129 request_cleanup(mdr);
9130 }
9131
9132
9133 void MDCache::dispatch_request(MDRequestRef& mdr)
9134 {
9135 if (mdr->client_request) {
9136 mds->server->dispatch_client_request(mdr);
9137 } else if (mdr->slave_request) {
9138 mds->server->dispatch_slave_request(mdr);
9139 } else {
9140 switch (mdr->internal_op) {
9141 case CEPH_MDS_OP_FRAGMENTDIR:
9142 dispatch_fragment_dir(mdr);
9143 break;
9144 case CEPH_MDS_OP_EXPORTDIR:
9145 migrator->dispatch_export_dir(mdr, 0);
9146 break;
9147 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9148 enqueue_scrub_work(mdr);
9149 break;
9150 case CEPH_MDS_OP_FLUSH:
9151 flush_dentry_work(mdr);
9152 break;
9153 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9154 repair_dirfrag_stats_work(mdr);
9155 break;
9156 case CEPH_MDS_OP_REPAIR_INODESTATS:
9157 repair_inode_stats_work(mdr);
9158 break;
9159 default:
9160 ceph_abort();
9161 }
9162 }
9163 }
9164
9165
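// Send OP_FINISH to all slaves of this request (which implicitly releases the
// pins and locks they hold on our behalf), then forget any xlocks and remote
// wrlocks on objects whose auth is a remote MDS.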
9166 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9167 {
9168 if (!mdr->has_more())
9169 return;
9170
9171 // clean up slaves
9172 // (will implicitly drop remote dn pins)
9173 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9174 p != mdr->more()->slaves.end();
9175 ++p) {
9176 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9177 MMDSSlaveRequest::OP_FINISH);
9178
9179 if (mdr->killed && !mdr->committing) {
9180 r->mark_abort();
9181 } else if (mdr->more()->srcdn_auth_mds == *p &&
9182 mdr->more()->inode_import.length() > 0) {
9183 // information about rename imported caps
9184 r->inode_export.claim(mdr->more()->inode_import);
9185 }
9186
9187 mds->send_message_mds(r, *p);
9188 }
9189
9190 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9191 * implicitly. Note that we don't call the finishers -- there shouldn't
9192 * be any on a remote lock and the request finish wakes up all
9193 * the waiters anyway! */
9194 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9195 while (p != mdr->xlocks.end()) {
9196 if ((*p)->get_parent()->is_auth())
9197 ++p;
9198 else {
9199 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9200 << " on " << *(*p)->get_parent() << dendl;
9201 (*p)->put_xlock();
9202 mdr->locks.erase(*p);
9203 mdr->xlocks.erase(p++);
9204 }
9205 }
9206
9207 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9208 while (q != mdr->remote_wrlocks.end()) {
9209 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9210 << " on mds." << q->second
9211 << " on " << *(q->first)->get_parent() << dendl;
9212 mdr->locks.erase(q->first);
9213 mdr->remote_wrlocks.erase(q++);
9214 }
9215
9216 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9217 * leaving them in can cause double-notifies as
9218 * this function can get called more than once */
9219 }
9220
9221 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9222 {
9223 request_drop_foreign_locks(mdr);
9224 mds->locker->drop_non_rdlocks(mdr.get());
9225 }
9226
9227 void MDCache::request_drop_locks(MDRequestRef& mdr)
9228 {
9229 request_drop_foreign_locks(mdr);
9230 mds->locker->drop_locks(mdr.get());
9231 }
9232
9233 void MDCache::request_cleanup(MDRequestRef& mdr)
9234 {
9235 dout(15) << "request_cleanup " << *mdr << dendl;
9236
9237 if (mdr->has_more()) {
9238 if (mdr->more()->is_ambiguous_auth)
9239 mdr->clear_ambiguous_auth();
9240 if (!mdr->more()->waiting_for_finish.empty())
9241 mds->queue_waiters(mdr->more()->waiting_for_finish);
9242 }
9243
9244 request_drop_locks(mdr);
9245
9246 // drop (local) auth pins
9247 mdr->drop_local_auth_pins();
9248
9249 // drop stickydirs
9250 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9251 p != mdr->stickydirs.end();
9252 ++p)
9253 (*p)->put_stickydirs();
9254
9255 mds->locker->kick_cap_releases(mdr);
9256
9257 // drop cache pins
9258 mdr->drop_pins();
9259
9260 // remove from session
9261 mdr->item_session_request.remove_myself();
9262
9263 // remove from map
9264 active_requests.erase(mdr->reqid);
9265
9266 if (mds->logger)
9267 log_stat();
9268
9269 mdr->mark_event("cleaned up request");
9270 }
9271
9272 void MDCache::request_kill(MDRequestRef& mdr)
9273 {
9274 // rolling back slave requests is tricky. just let the request proceed.
9275 if (mdr->done_locking && mdr->has_more() &&
9276 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9277 dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
9278
9279 assert(mdr->used_prealloc_ino == 0);
9280 assert(mdr->prealloc_inos.empty());
9281
9282 mdr->session = NULL;
9283 mdr->item_session_request.remove_myself();
9284 return;
9285 }
9286
9287 mdr->killed = true;
9288 mdr->mark_event("killing request");
9289
9290 if (mdr->committing) {
9291 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9292 } else {
9293 dout(10) << "request_kill " << *mdr << dendl;
9294 request_cleanup(mdr);
9295 }
9296 }
9297
9298 // -------------------------------------------------------------------------------
9299 // SNAPREALMS
9300
9301 struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9302 MDRequestRef mdr;
9303 MutationRef mut;
9304 CInode *in;
9305 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9306 MutationRef& mu, CInode *i) :
9307 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9308 void finish(int r) override {
9309 mdcache->_snaprealm_create_finish(mdr, mut, in);
9310 }
9311 };
9312
9313 void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9314 {
9315 dout(10) << "snaprealm_create " << *in << dendl;
9316 assert(!in->snaprealm);
9317
9318 // allocate an id..
9319 if (!mdr->more()->stid) {
9320 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9321 new C_MDS_RetryRequest(this, mdr));
9322 return;
9323 }
9324
9325 MutationRef mut(new MutationImpl());
9326 mut->ls = mds->mdlog->get_current_segment();
9327 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9328 mds->mdlog->start_entry(le);
9329
9330 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9331
9332 inode_t *pi = in->project_inode();
9333 pi->version = in->pre_dirty();
9334 pi->rstat.rsnaprealms++;
9335
9336 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9337 snapid_t seq;
9338 ::decode(seq, p);
9339
9340 sr_t *newsnap = in->project_snaprealm(seq);
9341 newsnap->seq = seq;
9342 newsnap->last_created = seq;
9343
9344 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9345 journal_cow_inode(mut, &le->metablob, in);
9346 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9347
9348 mds->server->submit_mdlog_entry(le,
9349 new C_MDC_snaprealm_create_finish(this, mdr,
9350 mut, in),
9351 mdr, __func__);
9352 mds->mdlog->flush();
9353 }
9354
9355
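// Walk the snaprealm tree rooted at in->snaprealm, invalidating cached snaps
// in each realm and (unless nosend) collecting one MClientSnap update per
// client holding caps in an affected realm. For UPDATE/DESTROY ops, open past
// children and their descendants are invalidated as well.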
9356 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9357 {
9358 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9359
9360 vector<inodeno_t> split_inos;
9361 vector<inodeno_t> split_realms;
9362
9363 if (snapop == CEPH_SNAP_OP_SPLIT) {
9364 // notify clients of update|split
9365 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9366 !p.end(); ++p)
9367 split_inos.push_back((*p)->ino());
9368
9369 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9370 p != in->snaprealm->open_children.end();
9371 ++p)
9372 split_realms.push_back((*p)->inode->ino());
9373 }
9374
9375 bufferlist snapbl;
9376 in->snaprealm->build_snap_trace(snapbl);
9377
9378 set<SnapRealm*> past_children;
9379 map<client_t, MClientSnap*> updates;
9380 list<SnapRealm*> q;
9381 q.push_back(in->snaprealm);
9382 while (!q.empty()) {
9383 SnapRealm *realm = q.front();
9384 q.pop_front();
9385
9386 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9387 realm->invalidate_cached_snaps();
9388
9389 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9390 p != realm->client_caps.end();
9391 ++p) {
9392 assert(!p->second->empty());
9393 if (!nosend && updates.count(p->first) == 0) {
9394 MClientSnap *update = new MClientSnap(snapop);
9395 update->head.split = in->ino();
9396 update->split_inos = split_inos;
9397 update->split_realms = split_realms;
9398 update->bl = snapbl;
9399 updates[p->first] = update;
9400 }
9401 }
9402
9403 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9404 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9405 p != realm->open_past_children.end();
9406 ++p)
9407 past_children.insert(*p);
9408 }
9409
9410 // notify for active children, too.
9411 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9412 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9413 p != realm->open_children.end();
9414 ++p)
9415 q.push_back(*p);
9416 }
9417
9418 if (!nosend)
9419 send_snaps(updates);
9420
9421 // notify past children and their descendants if we update/delete old snapshots
9422 for (set<SnapRealm*>::iterator p = past_children.begin();
9423 p != past_children.end();
9424 ++p)
9425 q.push_back(*p);
9426
9427 while (!q.empty()) {
9428 SnapRealm *realm = q.front();
9429 q.pop_front();
9430
9431 realm->invalidate_cached_snaps();
9432
9433 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9434 p != realm->open_children.end();
9435 ++p) {
9436 if (past_children.count(*p) == 0)
9437 q.push_back(*p);
9438 }
9439
9440 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9441 p != realm->open_past_children.end();
9442 ++p) {
9443 if (past_children.count(*p) == 0) {
9444 q.push_back(*p);
9445 past_children.insert(*p);
9446 }
9447 }
9448 }
9449
9450 if (snapop == CEPH_SNAP_OP_DESTROY) {
9451 // eval stray inodes if we delete a snapshot from their past ancestor snaprealm
9452 for (set<SnapRealm*>::iterator p = past_children.begin();
9453 p != past_children.end();
9454 ++p)
9455 maybe_eval_stray((*p)->inode, true);
9456 }
9457 }
9458
9459 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9460 {
9461 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9462
9463 // apply
9464 in->pop_and_dirty_projected_inode(mut->ls);
9465 mut->apply();
9466 mds->locker->drop_locks(mut.get());
9467 mut->cleanup();
9468
9469 // tell table we've committed
9470 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9471
9472 // create
9473 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9474 snapid_t seq;
9475 ::decode(seq, p);
9476
9477 in->open_snaprealm();
9478 in->snaprealm->srnode.seq = seq;
9479 in->snaprealm->srnode.created = seq;
9480 bool ok = in->snaprealm->_open_parents(NULL);
9481 assert(ok);
9482
9483 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9484
9485 /*
9486 static int count = 5;
9487 if (--count == 0)
9488 ceph_abort(); // hack test test **********
9489 */
9490
9491 // done.
9492 mdr->more()->stid = 0; // caller will likely need to reuse this
9493 dispatch_request(mdr);
9494 }
9495
9496
9497 // -------------------------------------------------------------------------------
9498 // STRAYS
9499
9500 struct C_MDC_RetryScanStray : public MDCacheContext {
9501 dirfrag_t next;
9502 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9503 void finish(int r) override {
9504 mdcache->scan_stray_dir(next);
9505 }
9506 };
9507
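// Resume scanning the stray directories at dirfrag 'next'. Incomplete
// dirfrags are fetched first (and the scan retried via C_MDC_RetryScanStray);
// each loaded dentry is marked STATE_STRAY and its primary inode is
// considered for purging via maybe_eval_stray().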
9508 void MDCache::scan_stray_dir(dirfrag_t next)
9509 {
9510 dout(10) << "scan_stray_dir " << next << dendl;
9511
9512 list<CDir*> ls;
9513 for (int i = 0; i < NUM_STRAY; ++i) {
9514 if (strays[i]->ino() < next.ino)
9515 continue;
9516 strays[i]->get_dirfrags(ls);
9517 }
9518
9519 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9520 CDir *dir = *p;
9521 if (dir->dirfrag() < next)
9522 continue;
9523 if (!dir->is_complete()) {
9524 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9525 return;
9526 }
9527 for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
9528 CDentry *dn = q->second;
9529 dn->state_set(CDentry::STATE_STRAY);
9530 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9531 if (dnl->is_primary()) {
9532 CInode *in = dnl->get_inode();
9533 if (in->inode.nlink == 0)
9534 in->state_set(CInode::STATE_ORPHAN);
9535 maybe_eval_stray(in);
9536 }
9537 }
9538 }
9539 }
9540
9541 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9542 {
9543 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9544 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9545 }
9546
9547
9548
9549
9550
9551 // ========================================================================================
9552 // DISCOVER
9553 /*
9554
9555 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9556 to the parent metadata object in the cache (pinning it).
9557
9558 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
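- replies are handled by handle_discover_reply(), which walks the
([dir] dentry inode)* trace and instantiates the replicas it describes.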
9559
9560 */
9561
9562 void MDCache::_send_discover(discover_info_t& d)
9563 {
9564 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9565 d.want_base_dir, d.want_xlocked);
9566 dis->set_tid(d.tid);
9567 mds->send_message_mds(dis, d.mds);
9568 }
9569
9570 void MDCache::discover_base_ino(inodeno_t want_ino,
9571 MDSInternalContextBase *onfinish,
9572 mds_rank_t from)
9573 {
9574 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9575 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9576 discover_info_t& d = _create_discover(from);
9577 d.ino = want_ino;
9578 _send_discover(d);
9579 }
9580 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9581 }
9582
9583
9584 void MDCache::discover_dir_frag(CInode *base,
9585 frag_t approx_fg,
9586 MDSInternalContextBase *onfinish,
9587 mds_rank_t from)
9588 {
9589 if (from < 0)
9590 from = base->authority().first;
9591
9592 dirfrag_t df(base->ino(), approx_fg);
9593 dout(7) << "discover_dir_frag " << df
9594 << " from mds." << from << dendl;
9595
9596 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9597 discover_info_t& d = _create_discover(from);
9598 d.pin_base(base);
9599 d.ino = base->ino();
9600 d.frag = approx_fg;
9601 d.want_base_dir = true;
9602 _send_discover(d);
9603 }
9604
9605 if (onfinish)
9606 base->add_dir_waiter(approx_fg, onfinish);
9607 }
9608
9609 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9610 CInode *base;
9611 snapid_t snapid;
9612 filepath path;
9613 mds_rank_t from;
9614 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9615 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9616 void finish(int r) override {
9617 mdcache->discover_path(base, snapid, path, 0, from);
9618 }
9619 };
9620
9621 void MDCache::discover_path(CInode *base,
9622 snapid_t snap,
9623 filepath want_path,
9624 MDSInternalContextBase *onfinish,
9625 bool want_xlocked,
9626 mds_rank_t from)
9627 {
9628 if (from < 0)
9629 from = base->authority().first;
9630
9631 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9632 << (want_xlocked ? " want_xlocked":"")
9633 << dendl;
9634
9635 if (base->is_ambiguous_auth()) {
9636 dout(10) << " waiting for single auth on " << *base << dendl;
9637 if (!onfinish)
9638 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9639 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9640 return;
9641 } else if (from == mds->get_nodeid()) {
9642 list<MDSInternalContextBase*> finished;
9643 base->take_waiting(CInode::WAIT_DIR, finished);
9644 mds->queue_waiters(finished);
9645 return;
9646 }
9647
9648 frag_t fg = base->pick_dirfrag(want_path[0]);
9649 if ((want_xlocked && want_path.depth() == 1) ||
9650 !base->is_waiting_for_dir(fg) || !onfinish) {
9651 discover_info_t& d = _create_discover(from);
9652 d.ino = base->ino();
9653 d.pin_base(base);
9654 d.frag = fg;
9655 d.snap = snap;
9656 d.want_path = want_path;
9657 d.want_base_dir = true;
9658 d.want_xlocked = want_xlocked;
9659 _send_discover(d);
9660 }
9661
9662 // register + wait
9663 if (onfinish)
9664 base->add_dir_waiter(fg, onfinish);
9665 }
9666
9667 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9668 CDir *base;
9669 snapid_t snapid;
9670 filepath path;
9671 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9672 MDCacheContext(c), base(b), snapid(s), path(p) {}
9673 void finish(int r) override {
9674 mdcache->discover_path(base, snapid, path, 0);
9675 }
9676 };
9677
9678 void MDCache::discover_path(CDir *base,
9679 snapid_t snap,
9680 filepath want_path,
9681 MDSInternalContextBase *onfinish,
9682 bool want_xlocked)
9683 {
9684 mds_rank_t from = base->authority().first;
9685
9686 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9687 << (want_xlocked ? " want_xlocked":"")
9688 << dendl;
9689
9690 if (base->is_ambiguous_auth()) {
9691 dout(7) << " waiting for single auth on " << *base << dendl;
9692 if (!onfinish)
9693 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9694 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9695 return;
9696 } else if (from == mds->get_nodeid()) {
9697 list<MDSInternalContextBase*> finished;
9698 base->take_sub_waiting(finished);
9699 mds->queue_waiters(finished);
9700 return;
9701 }
9702
9703 if ((want_xlocked && want_path.depth() == 1) ||
9704 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9705 discover_info_t& d = _create_discover(from);
9706 d.ino = base->ino();
9707 d.pin_base(base->inode);
9708 d.frag = base->get_frag();
9709 d.snap = snap;
9710 d.want_path = want_path;
9711 d.want_base_dir = false;
9712 d.want_xlocked = want_xlocked;
9713 _send_discover(d);
9714 }
9715
9716 // register + wait
9717 if (onfinish)
9718 base->add_dentry_waiter(want_path[0], snap, onfinish);
9719 }
9720
9721 void MDCache::kick_discovers(mds_rank_t who)
9722 {
9723 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9724 p != discovers.end();
9725 ++p) {
9726 if (p->second.mds != who)
9727 continue;
9728 _send_discover(p->second);
9729 }
9730 }
9731
9732
9733 /* This function DOES put the passed message before returning */
9734 void MDCache::handle_discover(MDiscover *dis)
9735 {
9736 mds_rank_t whoami = mds->get_nodeid();
9737 mds_rank_t from = mds_rank_t(dis->get_source().num());
9738
9739 assert(from != whoami);
9740
9741 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9742 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9743 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9744 dis->put();
9745 return;
9746 }
9747
9748 // proceed if the requester is in the REJOIN stage; such requests come from parallel_fetch().
9749 // delay processing requests from survivors because we may not have chosen lock states yet.
9750 if (!mds->mdsmap->is_rejoin(from)) {
9751 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9752 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9753 return;
9754 }
9755 }
9756
9757
9758 CInode *cur = 0;
9759 MDiscoverReply *reply = new MDiscoverReply(dis);
9760
9761 snapid_t snapid = dis->get_snapid();
9762
9763 // get started.
9764 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9765 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9766 // wants root
9767 dout(7) << "handle_discover from mds." << from
9768 << " wants base + " << dis->get_want().get_path()
9769 << " snap " << snapid
9770 << dendl;
9771
9772 cur = get_inode(dis->get_base_ino());
9773 assert(cur);
9774
9775 // add root
9776 reply->starts_with = MDiscoverReply::INODE;
9777 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9778 dout(10) << "added base " << *cur << dendl;
9779 }
9780 else {
9781 // there's a base inode
9782 cur = get_inode(dis->get_base_ino(), snapid);
9783 if (!cur && snapid != CEPH_NOSNAP) {
9784 cur = get_inode(dis->get_base_ino());
9785 if (cur && !cur->is_multiversion())
9786 cur = NULL; // nope!
9787 }
9788
9789 if (!cur) {
9790 dout(7) << "handle_discover mds." << from
9791 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9792 << dendl;
9793 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9794 reply->set_error_dentry(dis->get_dentry(0));
9795 reply->set_flag_error_dir();
9796 } else if (dis->wants_base_dir()) {
9797 dout(7) << "handle_discover mds." << from
9798 << " wants basedir+" << dis->get_want().get_path()
9799 << " has " << *cur
9800 << dendl;
9801 } else {
9802 dout(7) << "handle_discover mds." << from
9803 << " wants " << dis->get_want().get_path()
9804 << " has " << *cur
9805 << dendl;
9806 }
9807 }
9808
9809 assert(reply);
9810
9811 // add content
9812 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9813 for (unsigned i = 0;
9814 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9815 i++) {
9816
9817 // -- figure out the dir
9818
9819 // is *cur even a dir at all?
9820 if (!cur->is_dir()) {
9821 dout(7) << *cur << " not a dir" << dendl;
9822 reply->set_flag_error_dir();
9823 break;
9824 }
9825
9826 // pick frag
9827 frag_t fg;
9828 if (dis->get_want().depth()) {
9829 // dentry specifies
9830 fg = cur->pick_dirfrag(dis->get_dentry(i));
9831 } else {
9832 // requester explicitly specified the frag
9833 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9834 fg = dis->get_base_dir_frag();
9835 if (!cur->dirfragtree.is_leaf(fg))
9836 fg = cur->dirfragtree[fg.value()];
9837 }
9838 CDir *curdir = cur->get_dirfrag(fg);
9839
9840 if ((!curdir && !cur->is_auth()) ||
9841 (curdir && !curdir->is_auth())) {
9842
9843 /* before:
9844 * ONLY set flag if empty!!
9845 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9846 * resulting in duplicate discovers in flight,
9847 * which can wreak havoc when discovering rename srcdn (which may move)
9848 */
9849
9850 if (reply->is_empty()) {
9851 // only hint if empty.
9852 // someday this could be better, but right now the waiter logic isn't smart enough.
9853
9854 // hint
9855 if (curdir) {
9856 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9857 reply->set_dir_auth_hint(curdir->authority().first);
9858 } else {
9859 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9860 << *cur << dendl;
9861 reply->set_dir_auth_hint(cur->authority().first);
9862 }
9863
9864 // note error dentry, if any
9865 // NOTE: important, as it allows requester to issue an equivalent discover
9866 // to whomever we hint at.
9867 if (dis->get_want().depth() > i)
9868 reply->set_error_dentry(dis->get_dentry(i));
9869 }
9870
9871 break;
9872 }
9873
9874 if (!curdir) { // open dir?
9875 if (cur->is_frozen()) {
9876 if (!reply->is_empty()) {
9877 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9878 break;
9879 }
9880 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9881 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9882 reply->put();
9883 return;
9884 }
9885 curdir = cur->get_or_open_dirfrag(this, fg);
9886 } else if (curdir->is_frozen_tree() ||
9887 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9888 if (!reply->is_empty()) {
9889 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9890 break;
9891 }
9892 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9893 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9894 reply->set_flag_error_dir();
9895 break;
9896 }
9897 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9898 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9899 reply->put();
9900 return;
9901 }
9902
9903 // add dir
9904 if (curdir->get_version() == 0) {
9905 // fetch newly opened dir
9906 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9907 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9908 // make sure the base frag is correct, though, in case there was a refragment since the
9909 // original request was sent.
9910 reply->set_base_dir_frag(curdir->get_frag());
9911 } else {
9912 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9913 if (!reply->trace.length())
9914 reply->starts_with = MDiscoverReply::DIR;
9915 replicate_dir(curdir, from, reply->trace);
9916 dout(7) << "handle_discover added dir " << *curdir << dendl;
9917 }
9918
9919 // lookup
9920 CDentry *dn = 0;
9921 if (curdir->get_version() == 0) {
9922 // fetch newly opened dir
9923 assert(!curdir->has_bloom());
9924 } else if (dis->get_want().depth() > 0) {
9925 // lookup dentry
9926 dn = curdir->lookup(dis->get_dentry(i), snapid);
9927 } else
9928 break; // done!
9929
9930 // incomplete dir?
9931 if (!dn) {
9932 if (!curdir->is_complete() &&
9933 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9934 // readdir
9935 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9936 if (reply->is_empty()) {
9937 // fetch and wait
9938 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9939 dis->wants_base_dir() && curdir->get_version() == 0);
9940 reply->put();
9941 return;
9942 } else {
9943 // initiate fetch, but send what we have so far
9944 curdir->fetch(0);
9945 break;
9946 }
9947 }
9948
9949 // send null dentry
9950 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9951 << *curdir << dendl;
9952 dn = curdir->add_null_dentry(dis->get_dentry(i));
9953 }
9954 assert(dn);
9955
9956 // don't add replica to purging dentry/inode
9957 if (dn->state_test(CDentry::STATE_PURGING)) {
9958 if (reply->is_empty())
9959 reply->set_flag_error_dn(dis->get_dentry(i));
9960 break;
9961 }
9962
9963 CDentry::linkage_t *dnl = dn->get_linkage();
9964
9965 // xlocked dentry?
9966 // ...always block on non-tail items (they are unrelated)
9967 // ...allow xlocked tail discovery _only_ if explicitly requested
9968 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9969 if (dn->lock.is_xlocked()) {
9970 // is this the last (tail) item in the discover traversal?
9971 if (tailitem && dis->wants_xlocked()) {
9972 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9973 } else if (reply->is_empty()) {
9974 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9975 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9976 reply->put();
9977 return;
9978 } else {
9979 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9980 break;
9981 }
9982 }
9983
9984 // frozen inode?
9985 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9986 if (tailitem && dis->wants_xlocked()) {
9987 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9988 } else if (reply->is_empty()) {
9989 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9990 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9991 reply->put();
9992 return;
9993 } else {
9994 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9995 break;
9996 }
9997 }
9998
9999 // add dentry
10000 if (!reply->trace.length())
10001 reply->starts_with = MDiscoverReply::DENTRY;
10002 replicate_dentry(dn, from, reply->trace);
10003 dout(7) << "handle_discover added dentry " << *dn << dendl;
10004
10005 if (!dnl->is_primary()) break; // stop on null or remote link.
10006
10007 // add inode
10008 CInode *next = dnl->get_inode();
10009 assert(next->is_auth());
10010
10011 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10012 dout(7) << "handle_discover added inode " << *next << dendl;
10013
10014 // descend, keep going.
10015 cur = next;
10016 continue;
10017 }
10018
10019 // how did we do?
10020 assert(!reply->is_empty());
10021 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10022 mds->send_message(reply, dis->get_connection());
10023
10024 dis->put();
10025 }
10026
10027 /* This function DOES put the passed message before returning */
10028 void MDCache::handle_discover_reply(MDiscoverReply *m)
10029 {
10030 /*
10031 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10032 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10033 m->put();
10034 return;
10035 }
10036 */
10037 dout(7) << "discover_reply " << *m << dendl;
10038 if (m->is_flag_error_dir())
10039 dout(7) << " flag error, dir" << dendl;
10040 if (m->is_flag_error_dn())
10041 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10042
10043 list<MDSInternalContextBase*> finished, error;
10044 mds_rank_t from = mds_rank_t(m->get_source().num());
10045
10046 // starting point
10047 CInode *cur = get_inode(m->get_base_ino());
10048 bufferlist::iterator p = m->trace.begin();
10049
10050 int next = m->starts_with;
10051
10052 // decrement discover counters
10053 if (m->get_tid()) {
10054 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10055 if (p != discovers.end()) {
10056 dout(10) << " found tid " << m->get_tid() << dendl;
10057 discovers.erase(p);
10058 } else {
10059 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10060 }
10061 }
10062
10063 // discover may start with an inode
10064 if (!p.end() && next == MDiscoverReply::INODE) {
10065 cur = add_replica_inode(p, NULL, finished);
10066 dout(7) << "discover_reply got base inode " << *cur << dendl;
10067 assert(cur->is_base());
10068
10069 next = MDiscoverReply::DIR;
10070
10071 // take waiters?
10072 if (cur->is_base() &&
10073 waiting_for_base_ino[from].count(cur->ino())) {
10074 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10075 waiting_for_base_ino[from].erase(cur->ino());
10076 }
10077 }
10078 assert(cur);
10079
10080 // loop over discover results.
10081 // indexes follow each ([[dir] dentry] inode)
10082 // can start, end with any type.
10083 while (!p.end()) {
10084 // dir
10085 frag_t fg;
10086 CDir *curdir = 0;
10087 if (next == MDiscoverReply::DIR) {
10088 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10089 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10090 assert(m->get_wanted_base_dir());
10091 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10092 }
10093 } else {
10094 // note: this can only happen on our first time around this loop.
10095 if (p.end() && m->is_flag_error_dn()) {
10096 fg = cur->pick_dirfrag(m->get_error_dentry());
10097 curdir = cur->get_dirfrag(fg);
10098 } else
10099 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10100 }
10101
10102 if (p.end())
10103 break;
10104
10105 // dentry
10106 CDentry *dn = add_replica_dentry(p, curdir, finished);
10107
10108 if (p.end())
10109 break;
10110
10111 // inode
10112 cur = add_replica_inode(p, dn, finished);
10113
10114 next = MDiscoverReply::DIR;
10115 }
10116
10117 // dir error?
10118 // or dir_auth hint?
10119 if (m->is_flag_error_dir() && !cur->is_dir()) {
10120 // not a dir.
10121 cur->take_waiting(CInode::WAIT_DIR, error);
10122 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10123 mds_rank_t who = m->get_dir_auth_hint();
10124 if (who == mds->get_nodeid()) who = -1;
10125 if (who >= 0)
10126 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10127
10128
10129 if (m->get_wanted_base_dir()) {
10130 frag_t fg = m->get_base_dir_frag();
10131 CDir *dir = cur->get_dirfrag(fg);
10132
10133 if (cur->is_waiting_for_dir(fg)) {
10134 if (cur->is_auth())
10135 cur->take_waiting(CInode::WAIT_DIR, finished);
10136 else if (dir || !cur->dirfragtree.is_leaf(fg))
10137 cur->take_dir_waiting(fg, finished);
10138 else
10139 discover_dir_frag(cur, fg, 0, who);
10140 } else
10141 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10142 }
10143
10144 // try again?
10145 if (m->get_error_dentry().length()) {
10146 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10147 CDir *dir = cur->get_dirfrag(fg);
10148 // wanted a dentry
10149 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10150 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10151 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10152 m->get_wanted_snapid(), finished);
10153 } else {
10154 filepath relpath(m->get_error_dentry(), 0);
10155 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10156 }
10157 } else
10158 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10159 << m->get_error_dentry() << dendl;
10160 }
10161 } else if (m->is_flag_error_dn()) {
10162 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10163 CDir *dir = cur->get_dirfrag(fg);
10164 if (dir) {
10165 if (dir->is_auth()) {
10166 dir->take_sub_waiting(finished);
10167 } else {
10168 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10169 m->get_wanted_snapid(), error);
10170 }
10171 }
10172 }
10173
10174 // waiters
10175 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10176 mds->queue_waiters(finished);
10177
10178 // done
10179 m->put();
10180 }
10181
10182
10183
10184 // ----------------------------
10185 // REPLICAS
10186
10187 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10188 list<MDSInternalContextBase*>& finished)
10189 {
10190 dirfrag_t df;
10191 ::decode(df, p);
10192
10193 assert(diri->ino() == df.ino);
10194
10195 // add it (_replica_)
10196 CDir *dir = diri->get_dirfrag(df.frag);
10197
10198 if (dir) {
10199 // had replica. update w/ new nonce.
10200 dir->decode_replica(p);
10201 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10202 } else {
10203 // force frag to leaf in the diri tree
10204 if (!diri->dirfragtree.is_leaf(df.frag)) {
10205 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10206 << diri->dirfragtree << dendl;
10207 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10208 }
10209
10210 // add replica.
10211 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10212 dir->decode_replica(p);
10213
10214 // is this a dir_auth delegation boundary?
10215 if (from != diri->authority().first ||
10216 diri->is_ambiguous_auth() ||
10217 diri->is_base())
10218 adjust_subtree_auth(dir, from);
10219
10220 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10221
10222 // get waiters
10223 diri->take_dir_waiting(df.frag, finished);
10224 }
10225
10226 return dir;
10227 }
10228
10229 CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10230 {
10231 string name;
10232 snapid_t last;
10233 ::decode(name, p);
10234 ::decode(last, p);
10235
10236 CDentry *dn = dir->lookup(name, last);
10237
10238 // have it?
10239 if (dn) {
10240 dn->decode_replica(p, false);
10241 dout(7) << "add_replica_dentry had " << *dn << dendl;
10242 } else {
10243 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10244 dn->decode_replica(p, true);
10245 dout(7) << "add_replica_dentry added " << *dn << dendl;
10246 }
10247
10248 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10249
10250 return dn;
10251 }
10252
10253 CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10254 {
10255 inodeno_t ino;
10256 snapid_t last;
10257 ::decode(ino, p);
10258 ::decode(last, p);
10259 CInode *in = get_inode(ino, last);
10260 if (!in) {
10261 in = new CInode(this, false, 1, last);
10262 in->decode_replica(p, true);
10263 add_inode(in);
10264 if (in->ino() == MDS_INO_ROOT)
10265 in->inode_auth.first = 0;
10266 else if (in->is_mdsdir())
10267 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10268 dout(10) << "add_replica_inode added " << *in << dendl;
10269 if (dn) {
10270 assert(dn->get_linkage()->is_null());
10271 dn->dir->link_primary_inode(dn, in);
10272 }
10273 } else {
10274 in->decode_replica(p, false);
10275 dout(10) << "add_replica_inode had " << *in << dendl;
10276 }
10277
10278 if (dn) {
10279 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10280 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10281 }
10282
10283 return in;
10284 }
10285
10286
10287 void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10288 {
10289 uint64_t features = mds->mdsmap->get_up_features();
10290 replicate_inode(get_myin(), who, bl, features);
10291 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10292 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10293 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10294 replicate_dir(straydn->get_dir(), who, bl);
10295 replicate_dentry(straydn, who, bl);
10296 }
10297
10298 CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10299 {
10300 list<MDSInternalContextBase*> finished;
10301 bufferlist::iterator p = bl.begin();
10302
10303 CInode *mdsin = add_replica_inode(p, NULL, finished);
10304 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10305 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10306 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10307 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10308 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10309 if (!finished.empty())
10310 mds->queue_waiters(finished);
10311
10312 return straydn;
10313 }
10314
10315
10316 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10317 {
10318 // this is an FYI, re: replication
10319
10320 set<mds_rank_t> who;
10321 if (bcast) {
10322 mds->get_mds_map()->get_active_mds_set(who);
10323 } else {
10324 for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
10325 p != dir->replicas_end();
10326 ++p)
10327 who.insert(p->first);
10328 }
10329
10330 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10331
10332 filepath path;
10333 dir->inode->make_path(path);
10334
10335 mds_rank_t whoami = mds->get_nodeid();
10336 for (set<mds_rank_t>::iterator it = who.begin();
10337 it != who.end();
10338 ++it) {
10339 if (*it == whoami) continue;
10340 //if (*it == except) continue;
10341 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10342
10343 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10344 dir->dirfrag(),
10345 dir->dir_rep,
10346 dir->dir_rep_by,
10347 path,
10348 bcast),
10349 *it);
10350 }
10351
10352 return 0;
10353 }
10354
10355 /* This function DOES put the passed message before returning */
10356 void MDCache::handle_dir_update(MDirUpdate *m)
10357 {
10358 dirfrag_t df = m->get_dirfrag();
10359 CDir *dir = get_dirfrag(df);
10360 if (!dir) {
10361 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10362
10363 // discover it?
10364 if (m->should_discover()) {
10365 // only try once!
10366 // this is key to avoid a fragtree update race, among other things.
10367 m->inc_tried_discover();
10368 vector<CDentry*> trace;
10369 CInode *in;
10370 filepath path = m->get_path();
10371 dout(5) << "trying discover on dir_update for " << path << dendl;
10372 MDRequestRef null_ref;
10373 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10374 if (r > 0)
10375 return;
10376 if (r == 0 &&
10377 in->ino() == df.ino &&
10378 in->get_approx_dirfrag(df.frag) == NULL) {
10379 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10380 return;
10381 }
10382 }
10383
10384 m->put();
10385 return;
10386 }
10387
10388 if (!m->has_tried_discover()) {
10389 // Update if it already exists. Otherwise it got updated by discover reply.
10390 dout(5) << "dir_update on " << *dir << dendl;
10391 dir->dir_rep = m->get_dir_rep();
10392 dir->dir_rep_by = m->get_dir_rep_by();
10393 }
10394
10395 // done
10396 m->put();
10397 }
10398
10399
10400
10401
10402
10403 // LINK
10404
10405 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10406 {
10407 dout(7) << "send_dentry_link " << *dn << dendl;
10408
10409 CDir *subtree = get_subtree_root(dn->get_dir());
10410 for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
10411 p != dn->replicas_end();
10412 ++p) {
10413 // don't tell (rename) witnesses; they already know
10414 if (mdr.get() && mdr->more()->witnessed.count(p->first))
10415 continue;
10416 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
10417 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
10418 rejoin_gather.count(p->first)))
10419 continue;
10420 CDentry::linkage_t *dnl = dn->get_linkage();
10421 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10422 dn->name, dnl->is_primary());
10423 if (dnl->is_primary()) {
10424 dout(10) << " primary " << *dnl->get_inode() << dendl;
10425 replicate_inode(dnl->get_inode(), p->first, m->bl,
10426 mds->mdsmap->get_up_features());
10427 } else if (dnl->is_remote()) {
10428 inodeno_t ino = dnl->get_remote_ino();
10429 __u8 d_type = dnl->get_remote_d_type();
10430 dout(10) << " remote " << ino << " " << d_type << dendl;
10431 ::encode(ino, m->bl);
10432 ::encode(d_type, m->bl);
10433 } else
10434 ceph_abort(); // aie, bad caller!
10435 mds->send_message_mds(m, p->first);
10436 }
10437 }
10438
10439 /* This function DOES put the passed message before returning */
10440 void MDCache::handle_dentry_link(MDentryLink *m)
10441 {
10442
10443 CDentry *dn = NULL;
10444 CDir *dir = get_dirfrag(m->get_dirfrag());
10445 if (!dir) {
10446 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10447 } else {
10448 dn = dir->lookup(m->get_dn());
10449 if (!dn) {
10450 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10451 } else {
10452 dout(7) << "handle_dentry_link on " << *dn << dendl;
10453 CDentry::linkage_t *dnl = dn->get_linkage();
10454
10455 assert(!dn->is_auth());
10456 assert(dnl->is_null());
10457 }
10458 }
10459
10460 bufferlist::iterator p = m->bl.begin();
10461 list<MDSInternalContextBase*> finished;
10462 if (dn) {
10463 if (m->get_is_primary()) {
10464 // primary link.
10465 add_replica_inode(p, dn, finished);
10466 } else {
10467 // remote link, easy enough.
10468 inodeno_t ino;
10469 __u8 d_type;
10470 ::decode(ino, p);
10471 ::decode(d_type, p);
10472 dir->link_remote_inode(dn, ino, d_type);
10473 }
10474 } else {
10475 ceph_abort();
10476 }
10477
10478 if (!finished.empty())
10479 mds->queue_waiters(finished);
10480
10481 m->put();
10482 return;
10483 }
10484
10485
10486 // UNLINK
10487
10488 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10489 {
10490 dout(10) << "send_dentry_unlink " << *dn << dendl;
10491 // share unlink news with replicas
10492 set<mds_rank_t> replicas;
10493 dn->list_replicas(replicas);
10494 if (straydn)
10495 straydn->list_replicas(replicas);
10496 for (set<mds_rank_t>::iterator it = replicas.begin();
10497 it != replicas.end();
10498 ++it) {
10499 // don't tell (rmdir) witnesses; they already know
10500 if (mdr.get() && mdr->more()->witnessed.count(*it))
10501 continue;
10502
10503 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10504 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10505 rejoin_gather.count(*it)))
10506 continue;
10507
10508 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
10509 if (straydn)
10510 replicate_stray(straydn, *it, unlink->straybl);
10511 mds->send_message_mds(unlink, *it);
10512 }
10513 }
10514
10515 /* This function DOES put the passed message before returning */
10516 void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10517 {
10518 // straydn
10519 CDentry *straydn = NULL;
10520 if (m->straybl.length())
10521 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10522
10523 CDir *dir = get_dirfrag(m->get_dirfrag());
10524 if (!dir) {
10525 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10526 } else {
10527 CDentry *dn = dir->lookup(m->get_dn());
10528 if (!dn) {
10529 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10530 } else {
10531 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10532 CDentry::linkage_t *dnl = dn->get_linkage();
10533
10534 // open inode?
10535 if (dnl->is_primary()) {
10536 CInode *in = dnl->get_inode();
10537 dn->dir->unlink_inode(dn);
10538 assert(straydn);
10539 straydn->dir->link_primary_inode(straydn, in);
10540
10541 // in->first is lazily updated on replica; drag it forward so
10542 // that we always keep it in sync with the dentry
10543 assert(straydn->first >= in->first);
10544 in->first = straydn->first;
10545
10546 // update subtree map?
10547 if (in->is_dir())
10548 adjust_subtree_after_rename(in, dir, false);
10549
10550 // send caps to auth (if we're not already)
10551 if (in->is_any_caps() &&
10552 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10553 migrator->export_caps(in);
10554
10555 straydn = NULL;
10556 } else {
10557 assert(!straydn);
10558 assert(dnl->is_remote());
10559 dn->dir->unlink_inode(dn);
10560 }
10561 assert(dnl->is_null());
10562 }
10563 }
10564
10565 // race with trim_dentry()
10566 if (straydn) {
10567 assert(straydn->get_num_ref() == 0);
10568 assert(straydn->get_linkage()->is_null());
10569 map<mds_rank_t, MCacheExpire*> expiremap;
10570 trim_dentry(straydn, expiremap);
10571 send_expire_messages(expiremap);
10572 }
10573
10574 m->put();
10575 return;
10576 }
10577
10578
10579
10580
10581
10582
10583 // ===================================================================
10584
10585
10586
10587 // ===================================================================
10588 // FRAGMENT
10589
10590
10591 /**
10592 * adjust_dir_fragments -- adjust fragmentation for a directory
10593 *
10594 * @param diri directory inode
10595 * @param basefrag base fragment
10596 * @param bits bit adjustment. positive for split, negative for merge.
10597 */
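// A rough sketch of the fragment arithmetic this relies on (illustrative
// only): frag_t() denotes the whole hash space "*".  Splitting it with
// bits = 2 yields the four leaves 00*, 01*, 10* and 11*; a negative bit
// adjustment merges such a family of leaves back into their common prefix.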
10598 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10599 list<CDir*>& resultfrags,
10600 list<MDSInternalContextBase*>& waiters,
10601 bool replay)
10602 {
10603 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10604 << " on " << *diri << dendl;
10605
10606 list<CDir*> srcfrags;
10607 diri->get_dirfrags_under(basefrag, srcfrags);
10608
10609 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10610 }
10611
10612 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10613 {
10614 CDir *dir = diri->get_dirfrag(fg);
10615 if (dir)
10616 return dir;
10617
10618 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10619
10620 list<CDir*> src, result;
10621 list<MDSInternalContextBase*> waiters;
10622
10623 // split a parent?
10624 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10625 while (1) {
10626 CDir *pdir = diri->get_dirfrag(parent);
10627 if (pdir) {
10628 int split = fg.bits() - parent.bits();
10629 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10630 src.push_back(pdir);
10631 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10632 dir = diri->get_dirfrag(fg);
10633 if (dir) {
10634 dout(10) << "force_dir_fragment result " << *dir << dendl;
10635 break;
10636 }
10637 }
10638 if (parent == frag_t())
10639 break;
10640 frag_t last = parent;
10641 parent = parent.parent();
10642 dout(10) << " " << last << " parent is " << parent << dendl;
10643 }
10644
10645 if (!dir) {
10646 // hoover up things under fg?
10647 diri->get_dirfrags_under(fg, src);
10648 if (src.empty()) {
10649 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10650 } else {
10651 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10652 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10653 dir = result.front();
10654 dout(10) << "force_dir_fragment result " << *dir << dendl;
10655 }
10656 }
10657 if (!replay)
10658 mds->queue_waiters(waiters);
10659 return dir;
10660 }
10661
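// This overload does the real work: it first forces the inode's fragtree to
// agree with basefrag, then either splits the single source dirfrag
// (bits > 0) or merges the source dirfrags into one CDir (bits <= 0),
// patching the subtree map so bounds and auth stay consistent.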
10662 void MDCache::adjust_dir_fragments(CInode *diri,
10663 list<CDir*>& srcfrags,
10664 frag_t basefrag, int bits,
10665 list<CDir*>& resultfrags,
10666 list<MDSInternalContextBase*>& waiters,
10667 bool replay)
10668 {
10669 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10670 << " srcfrags " << srcfrags
10671 << " on " << *diri << dendl;
10672
10673 // adjust fragtree
10674 // yuck. we may have discovered the inode while it was being fragmented.
10675 if (!diri->dirfragtree.is_leaf(basefrag))
10676 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10677
10678 if (bits > 0)
10679 diri->dirfragtree.split(basefrag, bits);
10680 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10681
10682 if (srcfrags.empty())
10683 return;
10684
10685 // split
10686 CDir *parent_dir = diri->get_parent_dir();
10687 CDir *parent_subtree = 0;
10688 if (parent_dir)
10689 parent_subtree = get_subtree_root(parent_dir);
10690
10691 if (bits > 0) {
10692 // SPLIT
10693 assert(srcfrags.size() == 1);
10694 CDir *dir = srcfrags.front();
10695
10696 dir->split(bits, resultfrags, waiters, replay);
10697
10698 // did i change the subtree map?
10699 if (dir->is_subtree_root()) {
10700 // new frags are now separate subtrees
10701 for (list<CDir*>::iterator p = resultfrags.begin();
10702 p != resultfrags.end();
10703 ++p)
10704 subtrees[*p].clear(); // new frag is now its own subtree
10705
10706 // was i a bound?
10707 if (parent_subtree) {
10708 assert(subtrees[parent_subtree].count(dir));
10709 subtrees[parent_subtree].erase(dir);
10710 for (list<CDir*>::iterator p = resultfrags.begin();
10711 p != resultfrags.end();
10712 ++p) {
10713 assert((*p)->is_subtree_root());
10714 subtrees[parent_subtree].insert(*p);
10715 }
10716 }
10717
10718 // adjust my bounds.
10719 set<CDir*> bounds;
10720 bounds.swap(subtrees[dir]);
10721 subtrees.erase(dir);
10722 for (set<CDir*>::iterator p = bounds.begin();
10723 p != bounds.end();
10724 ++p) {
10725 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10726 subtrees[frag].insert(*p);
10727 }
10728
10729 show_subtrees(10);
10730
10731 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10732 dir->dir_auth = CDIR_AUTH_DEFAULT;
10733 }
10734
10735 diri->close_dirfrag(dir->get_frag());
10736
10737 } else {
10738 // MERGE
10739
10740 // are my constituent bits subtrees? if so, i will be too.
10741 // (it's all or none, actually.)
10742 bool any_subtree = false;
10743 for (CDir *dir : srcfrags) {
10744 if (dir->is_subtree_root()) {
10745 any_subtree = true;
10746 break;
10747 }
10748 }
10749 set<CDir*> new_bounds;
10750 if (any_subtree) {
10751 for (CDir *dir : srcfrags) {
10752 // this simplifies the code that finds subtrees underneath the dirfrag
10753 if (!dir->is_subtree_root()) {
10754 dir->state_set(CDir::STATE_AUXSUBTREE);
10755 adjust_subtree_auth(dir, mds->get_nodeid());
10756 }
10757 }
10758
10759 for (CDir *dir : srcfrags) {
10760 assert(dir->is_subtree_root());
10761 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
10762 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10763 set<CDir*>::iterator r = q->second.begin();
10764 while (r != subtrees[dir].end()) {
10765 new_bounds.insert(*r);
10766 subtrees[dir].erase(r++);
10767 }
10768 subtrees.erase(q);
10769
10770 // remove myself as my parent's bound
10771 if (parent_subtree)
10772 subtrees[parent_subtree].erase(dir);
10773 }
10774 }
10775
10776 // merge
10777 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10778 f->merge(srcfrags, waiters, replay);
10779
10780 if (any_subtree) {
10781 assert(f->is_subtree_root());
10782 subtrees[f].swap(new_bounds);
10783 if (parent_subtree)
10784 subtrees[parent_subtree].insert(f);
10785
10786 show_subtrees(10);
10787 }
10788
10789 resultfrags.push_back(f);
10790 }
10791 }
10792
10793
10794 class C_MDC_FragmentFrozen : public MDSInternalContext {
10795 MDCache *mdcache;
10796 MDRequestRef mdr;
10797 public:
10798 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10799 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10800 void finish(int r) override {
10801 mdcache->fragment_frozen(mdr, r);
10802 }
10803 };
10804
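// can_fragment() is a pure predicate: it rejects fragmentation while the FS
// is read-only or the cluster is degraded, never touches strays, the mdsdir
// or .ceph, skips inodes being scrubbed, and requires every involved dirfrag
// to be auth, healthy, not already fragmenting and not freezing/frozen.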
10805 bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10806 {
10807 if (is_readonly()) {
10808 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10809 return false;
10810 }
10811 if (mds->is_cluster_degraded()) {
10812 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10813 return false;
10814 }
10815 if (diri->get_parent_dir() &&
10816 diri->get_parent_dir()->get_inode()->is_stray()) {
10817 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10818 return false;
10819 }
10820 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10821 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10822 return false;
10823 }
10824
10825 if (diri->scrub_is_in_progress()) {
10826 dout(7) << "can_fragment: scrub in progress" << dendl;
10827 return false;
10828 }
10829
10830 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10831 CDir *dir = *p;
10832 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10833 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10834 return false;
10835 }
10836 if (!dir->is_auth()) {
10837 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10838 return false;
10839 }
10840 if (dir->is_bad()) {
10841 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10842 return false;
10843 }
10844 if (dir->is_frozen() ||
10845 dir->is_freezing()) {
10846 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10847 return false;
10848 }
10849 }
10850
10851 return true;
10852 }
10853
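// Fragmentation pipeline (both split_dir and merge_dir feed into it):
//   freeze dirs -> fragment_mark_and_complete -> fragment_frozen ->
//   dispatch_fragment_dir (journal EFragment OP_PREPARE) ->
//   _fragment_logged (store new frags) -> _fragment_stored (notify peers,
//   journal OP_COMMIT) -> _fragment_committed (remove old frag objects) ->
//   _fragment_finish (journal OP_FINISH).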
10854 void MDCache::split_dir(CDir *dir, int bits)
10855 {
10856 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10857 assert(dir->is_auth());
10858 CInode *diri = dir->inode;
10859
10860 list<CDir*> dirs;
10861 dirs.push_back(dir);
10862
10863 if (!can_fragment(diri, dirs)) {
10864 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10865 return;
10866 }
10867
10868 if (dir->frag.bits() + bits > 24) {
10869 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10870 return;
10871 }
10872
10873 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10874 mdr->more()->fragment_base = dir->dirfrag();
10875
10876 assert(fragments.count(dir->dirfrag()) == 0);
10877 fragment_info_t& info = fragments[dir->dirfrag()];
10878 info.mdr = mdr;
10879 info.dirs.push_back(dir);
10880 info.bits = bits;
10881 info.last_cum_auth_pins_change = ceph_clock_now();
10882
10883 fragment_freeze_dirs(dirs);
10884 // initial mark+complete pass
10885 fragment_mark_and_complete(mdr);
10886 }
10887
10888 void MDCache::merge_dir(CInode *diri, frag_t frag)
10889 {
10890 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10891
10892 list<CDir*> dirs;
10893 if (!diri->get_dirfrags_under(frag, dirs)) {
10894 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10895 return;
10896 }
10897
10898 if (diri->dirfragtree.is_leaf(frag)) {
10899 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10900 return;
10901 }
10902
10903 if (!can_fragment(diri, dirs))
10904 return;
10905
10906 CDir *first = dirs.front();
10907 int bits = first->get_frag().bits() - frag.bits();
10908 dout(10) << " we are merging by " << bits << " bits" << dendl;
10909
10910 dirfrag_t basedirfrag(diri->ino(), frag);
10911 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10912 mdr->more()->fragment_base = basedirfrag;
10913
10914 assert(fragments.count(basedirfrag) == 0);
10915 fragment_info_t& info = fragments[basedirfrag];
10916 info.mdr = mdr;
10917 info.dirs = dirs;
10918 info.bits = -bits;
10919 info.last_cum_auth_pins_change = ceph_clock_now();
10920
10921 fragment_freeze_dirs(dirs);
10922 // initial mark+complete pass
10923 fragment_mark_and_complete(mdr);
10924 }
10925
10926 void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10927 {
10928 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10929 CDir *dir = *p;
10930 dir->auth_pin(dir); // until we mark and complete them
10931 dir->state_set(CDir::STATE_FRAGMENTING);
10932 dir->freeze_dir();
10933 assert(dir->is_freezing_dir());
10934 }
10935 }
10936
10937 class C_MDC_FragmentMarking : public MDCacheContext {
10938 MDRequestRef mdr;
10939 public:
10940 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10941 void finish(int r) override {
10942 mdcache->fragment_mark_and_complete(mdr);
10943 }
10944 };
10945
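// fragment_mark_and_complete() makes every source dirfrag safe to cut up:
// incomplete frags are fetched, brand-new frags are committed first (so a
// later rollback can refetch them), and each dentry is pinned with
// PIN_FRAGMENTING before we wait for the freeze to complete.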
10946 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10947 {
10948 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10949 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10950 if (it == fragments.end() || it->second.mdr != mdr) {
10951 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10952 request_finish(mdr);
10953 return;
10954 }
10955
10956 fragment_info_t& info = it->second;
10957 CInode *diri = info.dirs.front()->get_inode();
10958 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10959
10960 MDSGatherBuilder gather(g_ceph_context);
10961
10962 for (list<CDir*>::iterator p = info.dirs.begin();
10963 p != info.dirs.end();
10964 ++p) {
10965 CDir *dir = *p;
10966
10967 bool ready = true;
10968 if (!dir->is_complete()) {
10969 dout(15) << " fetching incomplete " << *dir << dendl;
10970 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10971 ready = false;
10972 } else if (dir->get_frag() == frag_t()) {
10973 // The COMPLETE flag gets lost if we fragment a new dirfrag, then roll back
10974 // the operation. To avoid CDir::fetch() complaining about a missing object,
10975 // we commit the new dirfrag first.
10976 if (dir->state_test(CDir::STATE_CREATING)) {
10977 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10978 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10979 ready = false;
10980 } else if (dir->is_new()) {
10981 dout(15) << " committing new " << *dir << dendl;
10982 assert(dir->is_dirty());
10983 dir->commit(0, gather.new_sub(), true);
10984 ready = false;
10985 }
10986 }
10987 if (!ready)
10988 continue;
10989
10990 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10991 dout(15) << " marking " << *dir << dendl;
10992 for (CDir::map_t::iterator p = dir->items.begin();
10993 p != dir->items.end();
10994 ++p) {
10995 CDentry *dn = p->second;
10996 dn->get(CDentry::PIN_FRAGMENTING);
10997 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
10998 dn->state_set(CDentry::STATE_FRAGMENTING);
10999 }
11000 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11001 dir->auth_unpin(dir);
11002 } else {
11003 dout(15) << " already marked " << *dir << dendl;
11004 }
11005 }
11006 if (gather.has_subs()) {
11007 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11008 gather.activate();
11009 return;
11010 }
11011
11012 for (list<CDir*>::iterator p = info.dirs.begin();
11013 p != info.dirs.end();
11014 ++p) {
11015 CDir *dir = *p;
11016 if (!dir->is_frozen_dir()) {
11017 assert(dir->is_freezing_dir());
11018 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11019 }
11020 }
11021 if (gather.has_subs()) {
11022 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11023 gather.activate();
11024 // flush log so that request auth_pins are retired
11025 mds->mdlog->flush();
11026 return;
11027 }
11028
11029 fragment_frozen(mdr, 0);
11030 }
11031
11032 void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11033 {
11034 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11035 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11036 CDir *dir = *p;
11037 dout(10) << " frag " << *dir << dendl;
11038
11039 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11040 dir->state_clear(CDir::STATE_FRAGMENTING);
11041
11042 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11043 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11044
11045 for (CDir::map_t::iterator p = dir->items.begin();
11046 p != dir->items.end();
11047 ++p) {
11048 CDentry *dn = p->second;
11049 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11050 dn->state_clear(CDentry::STATE_FRAGMENTING);
11051 dn->put(CDentry::PIN_FRAGMENTING);
11052 }
11053 } else {
11054 dir->auth_unpin(dir);
11055 }
11056
11057 dir->unfreeze_dir();
11058 }
11059 }
11060
11061 bool MDCache::fragment_are_all_frozen(CDir *dir)
11062 {
11063 assert(dir->is_frozen_dir());
11064 map<dirfrag_t,fragment_info_t>::iterator p;
11065 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11066 p != fragments.end() && p->first.ino == dir->ino();
11067 ++p) {
11068 if (p->first.frag.contains(dir->get_frag()))
11069 return p->second.all_frozen;
11070 }
11071 ceph_abort();
11072 return false;
11073 }
11074
11075 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11076 {
11077 map<dirfrag_t,fragment_info_t>::iterator p;
11078 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11079 p != fragments.end() && p->first.ino == dir->ino();
11080 ++p) {
11081 if (p->first.frag.contains(dir->get_frag())) {
11082 p->second.num_remote_waiters++;
11083 return;
11084 }
11085 }
11086 ceph_abort();
11087 }
11088
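// Called periodically (alongside Migrator::find_stale_export_freeze) to
// cancel fragment operations whose freeze has been stuck behind auth pins
// for longer than mds_freeze_tree_timeout.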
11089 void MDCache::find_stale_fragment_freeze()
11090 {
11091 dout(10) << "find_stale_fragment_freeze" << dendl;
11092 // see comment in Migrator::find_stale_export_freeze()
11093 utime_t now = ceph_clock_now();
11094 utime_t cutoff = now;
11095 cutoff -= g_conf->mds_freeze_tree_timeout;
11096
11097 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11098 p != fragments.end(); ) {
11099 dirfrag_t df = p->first;
11100 fragment_info_t& info = p->second;
11101 ++p;
11102 if (info.all_frozen)
11103 continue;
11104 CDir *dir;
11105 int total_auth_pins = 0;
11106 for (list<CDir*>::iterator q = info.dirs.begin();
11107 q != info.dirs.end();
11108 ++q) {
11109 dir = *q;
11110 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11111 total_auth_pins = -1;
11112 break;
11113 }
11114 if (dir->is_frozen_dir())
11115 continue;
11116 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11117 }
11118 if (total_auth_pins < 0)
11119 continue;
11120 if (info.last_cum_auth_pins != total_auth_pins) {
11121 info.last_cum_auth_pins = total_auth_pins;
11122 info.last_cum_auth_pins_change = now;
11123 continue;
11124 }
11125 if (info.last_cum_auth_pins_change >= cutoff)
11126 continue;
11127 dir = info.dirs.front();
11128 if (info.num_remote_waiters > 0 ||
11129 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11130 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11131 list<CDir*> dirs;
11132 info.dirs.swap(dirs);
11133 fragments.erase(df);
11134 fragment_unmark_unfreeze_dirs(dirs);
11135 }
11136 }
11137 }
11138
11139 class C_MDC_FragmentPrep : public MDCacheLogContext {
11140 MDRequestRef mdr;
11141 public:
11142 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11143 void finish(int r) override {
11144 mdcache->_fragment_logged(mdr);
11145 }
11146 };
11147
11148 class C_MDC_FragmentStore : public MDCacheContext {
11149 MDRequestRef mdr;
11150 public:
11151 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11152 void finish(int r) override {
11153 mdcache->_fragment_stored(mdr);
11154 }
11155 };
11156
11157 class C_MDC_FragmentCommit : public MDCacheLogContext {
11158 dirfrag_t basedirfrag;
11159 list<CDir*> resultfrags;
11160 public:
11161 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11162 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11163 void finish(int r) override {
11164 mdcache->_fragment_committed(basedirfrag, resultfrags);
11165 }
11166 };
11167
11168 class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11169 dirfrag_t basedirfrag;
11170 list<CDir*> resultfrags;
11171 public:
11172 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11173 MDCacheIOContext(m), basedirfrag(f) {
11174 resultfrags.swap(l);
11175 }
11176 void finish(int r) override {
11177 assert(r == 0 || r == -ENOENT);
11178 mdcache->_fragment_finish(basedirfrag, resultfrags);
11179 }
11180 };
11181
11182 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11183 {
11184 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11185 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11186 if (it == fragments.end() || it->second.mdr != mdr) {
11187 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11188 request_finish(mdr);
11189 return;
11190 }
11191
11192 assert(r == 0);
11193 fragment_info_t& info = it->second;
11194 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11195 << " on " << info.dirs.front()->get_inode() << dendl;
11196
11197 info.all_frozen = true;
11198 dispatch_fragment_dir(mdr);
11199 }
11200
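// dispatch_fragment_dir() takes the scatterlocks, journals OP_PREPARE with a
// per-frag rollback copy of each fnode, performs the in-memory refragment and
// records the operation in uncommitted_fragments so it can be rolled back or
// re-driven after a failure.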
11201 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11202 {
11203 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11204 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11205 if (it == fragments.end() || it->second.mdr != mdr) {
11206 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11207 request_finish(mdr);
11208 return;
11209 }
11210
11211 fragment_info_t& info = it->second;
11212 CInode *diri = info.dirs.front()->get_inode();
11213
11214 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11215 << " on " << *diri << dendl;
11216 if (!mdr->aborted) {
11217 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11218 wrlocks.insert(&diri->dirfragtreelock);
11219 // prevent a racing gather on any other scatterlocks too
11220 wrlocks.insert(&diri->nestlock);
11221 wrlocks.insert(&diri->filelock);
11222 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11223 if (!mdr->aborted)
11224 return;
11225 }
11226
11227 if (mdr->aborted) {
11228 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11229 << info.dirs.front()->dirfrag() << dendl;
11230 if (info.bits > 0)
11231 mds->balancer->queue_split(info.dirs.front(), false);
11232 else
11233 mds->balancer->queue_merge(info.dirs.front());
11234 fragment_unmark_unfreeze_dirs(info.dirs);
11235 fragments.erase(it);
11236 request_finish(mdr);
11237 return;
11238 }
11239
11240 mdr->ls = mds->mdlog->get_current_segment();
11241 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11242 mds->mdlog->start_entry(le);
11243
11244 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11245 CDir *dir = *p;
11246 dirfrag_rollback rollback;
11247 rollback.fnode = dir->fnode;
11248 le->add_orig_frag(dir->get_frag(), &rollback);
11249 }
11250
11251 // refragment
11252 list<MDSInternalContextBase*> waiters;
11253 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11254 info.resultfrags, waiters, false);
11255 if (g_conf->mds_debug_frag)
11256 diri->verify_dirfrags();
11257 mds->queue_waiters(waiters);
11258
11259 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11260 assert(!diri->dirfragtree.is_leaf(*p));
11261
11262 le->metablob.add_dir_context(*info.resultfrags.begin());
11263 for (list<CDir*>::iterator p = info.resultfrags.begin();
11264 p != info.resultfrags.end();
11265 ++p) {
11266 if (diri->is_auth()) {
11267 le->metablob.add_fragmented_dir(*p, false, false);
11268 } else {
11269 (*p)->state_set(CDir::STATE_DIRTYDFT);
11270 le->metablob.add_fragmented_dir(*p, false, true);
11271 }
11272 }
11273
11274 // dft lock
11275 if (diri->is_auth()) {
11276 // journal dirfragtree
11277 inode_t *pi = diri->project_inode();
11278 pi->version = diri->pre_dirty();
11279 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11280 } else {
11281 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11282 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11283 mdr->add_updated_lock(&diri->dirfragtreelock);
11284 }
11285
11286 /*
11287 // filelock
11288 mds->locker->mark_updated_scatterlock(&diri->filelock);
11289 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11290 mut->add_updated_lock(&diri->filelock);
11291
11292 // dirlock
11293 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11294 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11295 mut->add_updated_lock(&diri->nestlock);
11296 */
11297
11298 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11299 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11300 mdr, __func__);
11301 mds->mdlog->flush();
11302 }
11303
11304 void MDCache::_fragment_logged(MDRequestRef& mdr)
11305 {
11306 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11307 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11308 assert(it != fragments.end());
11309 fragment_info_t &info = it->second;
11310 CInode *diri = info.resultfrags.front()->get_inode();
11311
11312 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11313 << " on " << *diri << dendl;
11314
11315 if (diri->is_auth())
11316 diri->pop_and_dirty_projected_inode(mdr->ls);
11317
11318 mdr->apply(); // mark scatterlock
11319
11320 // store resulting frags
11321 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11322
11323 for (list<CDir*>::iterator p = info.resultfrags.begin();
11324 p != info.resultfrags.end();
11325 ++p) {
11326 CDir *dir = *p;
11327 dout(10) << " storing result frag " << *dir << dendl;
11328
11329 // freeze and store them too
11330 dir->auth_pin(this);
11331 dir->state_set(CDir::STATE_FRAGMENTING);
11332 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11333 }
11334
11335 gather.activate();
11336 }
11337
11338 void MDCache::_fragment_stored(MDRequestRef& mdr)
11339 {
11340 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11341 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11342 assert(it != fragments.end());
11343 fragment_info_t &info = it->second;
11344 CInode *diri = info.resultfrags.front()->get_inode();
11345
11346 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11347 << " on " << *diri << dendl;
11348
11349 // tell peers
11350 CDir *first = *info.resultfrags.begin();
11351 for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
11352 p != first->replicas_end();
11353 ++p) {
11354 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
11355 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
11356 rejoin_gather.count(p->first)))
11357 continue;
11358
11359 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11360
11361 // freshly replicate new dirs to peers
11362 for (list<CDir*>::iterator q = info.resultfrags.begin();
11363 q != info.resultfrags.end();
11364 ++q)
11365 replicate_dir(*q, p->first, notify->basebl);
11366
11367 mds->send_message_mds(notify, p->first);
11368 }
11369
11370 // journal commit
11371 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11372 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11373 info.resultfrags));
11374
11375 mds->locker->drop_locks(mdr.get());
11376
11377 // unfreeze resulting frags
11378 for (list<CDir*>::iterator p = info.resultfrags.begin();
11379 p != info.resultfrags.end();
11380 ++p) {
11381 CDir *dir = *p;
11382 dout(10) << " result frag " << *dir << dendl;
11383
11384 for (CDir::map_t::iterator p = dir->items.begin();
11385 p != dir->items.end();
11386 ++p) {
11387 CDentry *dn = p->second;
11388 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11389 dn->state_clear(CDentry::STATE_FRAGMENTING);
11390 dn->put(CDentry::PIN_FRAGMENTING);
11391 }
11392
11393 // unfreeze
11394 dir->unfreeze_dir();
11395 }
11396
11397 fragments.erase(it);
11398 request_finish(mdr);
11399 }
11400
11401 void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11402 {
11403 dout(10) << "fragment_committed " << basedirfrag << dendl;
11404 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11405 assert(it != uncommitted_fragments.end());
11406 ufragment &uf = it->second;
11407
11408 // remove old frags
11409 C_GatherBuilder gather(
11410 g_ceph_context,
11411 new C_OnFinisher(
11412 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11413 mds->finisher));
11414
11415 SnapContext nullsnapc;
11416 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11417 for (list<frag_t>::iterator p = uf.old_frags.begin();
11418 p != uf.old_frags.end();
11419 ++p) {
11420 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11421 ObjectOperation op;
11422 if (*p == frag_t()) {
11423 // backtrace object
11424 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11425 op.truncate(0);
11426 op.omap_clear();
11427 } else {
11428 dout(10) << " removing orphan dirfrag " << oid << dendl;
11429 op.remove();
11430 }
11431 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11432 ceph::real_clock::now(),
11433 0, gather.new_sub());
11434 }
11435
11436 assert(gather.has_subs());
11437 gather.activate();
11438 }
11439
11440 void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11441 {
11442 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11443 << resultfrags.size() << dendl;
11444 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11445 assert(it != uncommitted_fragments.end());
11446 ufragment &uf = it->second;
11447
11448 // unmark & auth_unpin
11449 for (const auto &dir : resultfrags) {
11450 dir->state_clear(CDir::STATE_FRAGMENTING);
11451 dir->auth_unpin(this);
11452
11453 // In case the resulting fragments are beyond the split size,
11454 // we might need to split them again right away (they could
11455 // have been taking inserts between unfreezing and getting
11456 // here)
11457 mds->balancer->maybe_fragment(dir, false);
11458 }
11459
11460 if (mds->logger) {
11461 if (resultfrags.size() > 1) {
11462 mds->logger->inc(l_mds_dir_split);
11463 } else {
11464 mds->logger->inc(l_mds_dir_merge);
11465 }
11466 }
11467
11468 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11469 mds->mdlog->start_submit_entry(le);
11470
11471 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11472 }
11473
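// Replica-side handler: apply the same split/merge locally, requeue any
// waiters parked on the old frags, and decode the freshly replicated
// resulting dirfrags sent by the auth MDS.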
11474 /* This function DOES put the passed message before returning */
11475 void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11476 {
11477 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11478
11479 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11480 notify->put();
11481 return;
11482 }
11483
11484 CInode *diri = get_inode(notify->get_ino());
11485 if (diri) {
11486 frag_t base = notify->get_basefrag();
11487 int bits = notify->get_bits();
11488
11489 /*
11490 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11491 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11492 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11493 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11494 notify->put();
11495 return;
11496 }
11497 */
11498
11499 // refragment
11500 list<MDSInternalContextBase*> waiters;
11501 list<CDir*> resultfrags;
11502 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11503 if (g_conf->mds_debug_frag)
11504 diri->verify_dirfrags();
11505
11506 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11507 diri->take_dir_waiting((*p)->get_frag(), waiters);
11508
11509 // add the new replica dirs' values
11510 bufferlist::iterator p = notify->basebl.begin();
11511 while (!p.end())
11512 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11513
11514 mds->queue_waiters(waiters);
11515 } else {
11516 ceph_abort();
11517 }
11518
11519 notify->put();
11520 }
11521
11522 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11523 LogSegment *ls, bufferlist *rollback)
11524 {
11525 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11526 assert(!uncommitted_fragments.count(basedirfrag));
11527 ufragment& uf = uncommitted_fragments[basedirfrag];
11528 uf.old_frags = old_frags;
11529 uf.bits = bits;
11530 uf.ls = ls;
11531 ls->uncommitted_fragments.insert(basedirfrag);
11532 if (rollback)
11533 uf.rollback.swap(*rollback);
11534 }
11535
11536 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11537 {
11538 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11539 << " op " << EFragment::op_name(op) << dendl;
11540 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11541 if (it != uncommitted_fragments.end()) {
11542 ufragment& uf = it->second;
11543 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11544 uf.committed = true;
11545 } else {
11546 uf.ls->uncommitted_fragments.erase(basedirfrag);
11547 mds->queue_waiters(uf.waiters);
11548 uncommitted_fragments.erase(it);
11549 }
11550 }
11551 }
11552
11553 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11554 {
11555 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11556 << " old_frags (" << old_frags << ")" << dendl;
11557 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11558 if (it != uncommitted_fragments.end()) {
11559 ufragment& uf = it->second;
11560 if (!uf.old_frags.empty()) {
11561 uf.old_frags.swap(old_frags);
11562 uf.committed = true;
11563 } else {
11564 uf.ls->uncommitted_fragments.erase(basedirfrag);
11565 uncommitted_fragments.erase(it);
11566 }
11567 }
11568 }
11569
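// Replayed during recovery: fragments whose OP_COMMIT made it to the journal
// are simply re-driven through _fragment_committed (removing the orphaned
// old frag objects); anything still uncommitted is journalled as OP_ROLLBACK
// and its old fnodes are restored from the rollback blob.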
11570 void MDCache::rollback_uncommitted_fragments()
11571 {
11572 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11573 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11574 p != uncommitted_fragments.end();
11575 ++p) {
11576 ufragment &uf = p->second;
11577 CInode *diri = get_inode(p->first.ino);
11578 assert(diri);
11579
11580 if (uf.committed) {
11581 list<CDir*> frags;
11582 diri->get_dirfrags_under(p->first.frag, frags);
11583 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11584 CDir *dir = *q;
11585 dir->auth_pin(this);
11586 dir->state_set(CDir::STATE_FRAGMENTING);
11587 }
11588 _fragment_committed(p->first, frags);
11589 continue;
11590 }
11591
11592 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11593
11594 LogSegment *ls = mds->mdlog->get_current_segment();
11595 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11596 mds->mdlog->start_entry(le);
11597 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11598
11599 list<frag_t> old_frags;
11600 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11601
11602 list<CDir*> resultfrags;
11603 if (uf.old_frags.empty()) {
11604 // created by old format EFragment
11605 list<MDSInternalContextBase*> waiters;
11606 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11607 } else {
11608 bufferlist::iterator bp = uf.rollback.begin();
11609 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11610 CDir *dir = force_dir_fragment(diri, *q);
11611 resultfrags.push_back(dir);
11612
11613 dirfrag_rollback rollback;
11614 ::decode(rollback, bp);
11615
11616 dir->set_version(rollback.fnode.version);
11617 dir->fnode = rollback.fnode;
11618
11619 dir->_mark_dirty(ls);
11620
11621 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11622 dout(10) << " dirty nestinfo on " << *dir << dendl;
11623 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11624 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11625 }
11626 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11627 dout(10) << " dirty fragstat on " << *dir << dendl;
11628 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11629 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11630 }
11631
11632 le->add_orig_frag(dir->get_frag());
11633 le->metablob.add_dir_context(dir);
11634 if (diri_auth) {
11635 le->metablob.add_fragmented_dir(dir, true, false);
11636 } else {
11637 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11638 dir->state_set(CDir::STATE_DIRTYDFT);
11639 le->metablob.add_fragmented_dir(dir, true, true);
11640 }
11641 }
11642 }
11643
11644 if (diri_auth) {
11645 diri->project_inode()->version = diri->pre_dirty();
11646 diri->pop_and_dirty_projected_inode(ls); // hacky
11647 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11648 } else {
11649 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11650 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11651 }
11652
11653 if (g_conf->mds_debug_frag)
11654 diri->verify_dirfrags();
11655
11656 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11657 assert(!diri->dirfragtree.is_leaf(*q));
11658
11659 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11660 CDir *dir = *q;
11661 dir->auth_pin(this);
11662 dir->state_set(CDir::STATE_FRAGMENTING);
11663 }
11664
11665 mds->mdlog->submit_entry(le);
11666
11667 uf.old_frags.swap(old_frags);
11668 _fragment_committed(p->first, resultfrags);
11669 }
11670 }
11671
11672 void MDCache::force_readonly()
11673 {
11674 if (is_readonly())
11675 return;
11676
11677 dout(1) << "force file system read-only" << dendl;
11678 mds->clog->warn() << "force file system read-only";
11679
11680 set_readonly();
11681
11682 mds->server->force_clients_readonly();
11683
11684 // revoke write caps
11685 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
11686 p != inode_map.end();
11687 ++p) {
11688 CInode *in = p->second;
11689 if (in->is_head())
11690 mds->locker->eval(in, CEPH_CAP_LOCKS);
11691 }
11692
11693 mds->mdlog->flush();
11694 }
11695
11696
11697 // ==============================================================
11698 // debug crap
11699
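// show_subtrees() dumps the subtree map as an indented tree at debug level
// dbl, and doubles as a sanity check: it asserts root/mdsdir/stray linkage
// and that no stale entries are left in the subtree map.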
11700 void MDCache::show_subtrees(int dbl)
11701 {
11702 if (g_conf->mds_thrash_exports)
11703 dbl += 15;
11704
11705 //dout(10) << "show_subtrees" << dendl;
11706
11707 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11708 return; // i won't print anything.
11709
11710 if (subtrees.empty()) {
11711 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11712 return;
11713 }
11714
11715 // root frags
11716 list<CDir*> basefrags;
11717 for (set<CInode*>::iterator p = base_inodes.begin();
11718 p != base_inodes.end();
11719 ++p)
11720 (*p)->get_dirfrags(basefrags);
11721 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11722 dout(15) << "show_subtrees" << dendl;
11723
11724 // queue stuff
11725 list<pair<CDir*,int> > q;
11726 string indent;
11727 set<CDir*> seen;
11728
11729 // calc max depth
11730 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11731 q.push_back(pair<CDir*,int>(*p, 0));
11732
11733 set<CDir*> subtrees_seen;
11734
11735 int depth = 0;
11736 while (!q.empty()) {
11737 CDir *dir = q.front().first;
11738 int d = q.front().second;
11739 q.pop_front();
11740
11741 if (subtrees.count(dir) == 0) continue;
11742
11743 subtrees_seen.insert(dir);
11744
11745 if (d > depth) depth = d;
11746
11747 // sanity check
11748 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11749 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11750 assert(seen.count(dir) == 0);
11751 seen.insert(dir);
11752
11753 // nested items?
11754 if (!subtrees[dir].empty()) {
11755 for (set<CDir*>::iterator p = subtrees[dir].begin();
11756 p != subtrees[dir].end();
11757 ++p) {
11758 //dout(25) << " saw sub " << **p << dendl;
11759 q.push_front(pair<CDir*,int>(*p, d+1));
11760 }
11761 }
11762 }
11763
11764
11765 // print tree
11766 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11767 q.push_back(pair<CDir*,int>(*p, 0));
11768
11769 while (!q.empty()) {
11770 CDir *dir = q.front().first;
11771 int d = q.front().second;
11772 q.pop_front();
11773
11774 if (subtrees.count(dir) == 0) continue;
11775
11776 // adjust indenter
11777 while ((unsigned)d < indent.size())
11778 indent.resize(d);
11779
11780 // pad
11781 string pad = "______________________________________";
11782 pad.resize(depth*2+1-indent.size());
11783 if (!subtrees[dir].empty())
11784 pad[0] = '.'; // parent
11785
11786
11787 string auth;
11788 if (dir->is_auth())
11789 auth = "auth ";
11790 else
11791 auth = " rep ";
11792
11793 char s[10];
11794 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11795 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11796 else
11797 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11798
11799 // print
11800 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11801
11802 if (dir->ino() == MDS_INO_ROOT)
11803 assert(dir->inode == root);
11804 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11805 assert(dir->inode == myin);
11806 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11807 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11808
11809 // nested items?
11810 if (!subtrees[dir].empty()) {
11811 // more at my level?
11812 if (!q.empty() && q.front().second == d)
11813 indent += "| ";
11814 else
11815 indent += " ";
11816
11817 for (set<CDir*>::iterator p = subtrees[dir].begin();
11818 p != subtrees[dir].end();
11819 ++p)
11820 q.push_front(pair<CDir*,int>(*p, d+2));
11821 }
11822 }
11823
11824 // verify there isn't stray crap in subtree map
11825 int lost = 0;
11826 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11827 p != subtrees.end();
11828 ++p) {
11829 if (subtrees_seen.count(p->first)) continue;
11830 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11831 lost++;
11832 }
11833 assert(lost == 0);
11834 }
11835
11836
11837 void MDCache::show_cache()
11838 {
11839 dout(7) << "show_cache" << dendl;
11840
11841 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11842 it != inode_map.end();
11843 ++it) {
11844 // unlinked?
11845 if (!it->second->parent)
11846 dout(7) << " unlinked " << *it->second << dendl;
11847
11848 // dirfrags?
11849 list<CDir*> dfs;
11850 it->second->get_dirfrags(dfs);
11851 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11852 CDir *dir = *p;
11853 dout(7) << " dirfrag " << *dir << dendl;
11854
11855 for (CDir::map_t::iterator p = dir->items.begin();
11856 p != dir->items.end();
11857 ++p) {
11858 CDentry *dn = p->second;
11859 dout(7) << " dentry " << *dn << dendl;
11860 CDentry::linkage_t *dnl = dn->get_linkage();
11861 if (dnl->is_primary() && dnl->get_inode())
11862 dout(7) << " inode " << *dnl->get_inode() << dendl;
11863 }
11864 }
11865 }
11866 }
11867
11868 int MDCache::dump_cache(std::string const &file_name)
11869 {
11870 return dump_cache(file_name.c_str(), NULL);
11871 }
11872
11873 int MDCache::dump_cache(Formatter *f)
11874 {
11875 return dump_cache(NULL, f);
11876 }
11877
11878 int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
11879 {
11880 return dump_cache(NULL, f, dump_root, depth);
11881 }
11882
11883 /**
11884 * Dump the metadata cache, either to a Formatter, if
11885 * provided, else to a plain text file.
11886 */
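// Typically reached via the "dump cache [path]" admin socket command (the
// command plumbing itself lives in MDSRank, not here).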
11887 int MDCache::dump_cache(const char *fn, Formatter *f,
11888 const string& dump_root, int depth)
11889 {
11890 int r = 0;
11891 int fd = -1;
11892
11893 if (f) {
11894 f->open_array_section("inodes");
11895 } else {
11896 char deffn[200];
11897 if (!fn) {
11898 snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11899 fn = deffn;
11900 }
11901
11902 dout(1) << "dump_cache to " << fn << dendl;
11903
11904 fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
11905 if (fd < 0) {
11906 derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
11907 return errno;
11908 }
11909 }
11910
11911 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11912 it != inode_map.end();
11913 ++it) {
11914 CInode *in = it->second;
11915
11916 if (!dump_root.empty()) {
11917 string ipath;
11918 if (in->is_root())
11919 ipath = "/";
11920 else
11921 in->make_path_string(ipath);
11922
11923 if (dump_root.length() > ipath.length() ||
11924 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11925 continue;
11926
11927 if (depth >= 0 &&
11928 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11929 continue;
11930 }
11931
11932 if (f) {
11933 f->open_object_section("inode");
11934 in->dump(f);
11935 } else {
11936 ostringstream ss;
11937 ss << *in << std::endl;
11938 std::string s = ss.str();
11939 r = safe_write(fd, s.c_str(), s.length());
11940 if (r < 0) {
11941 goto out;
11942 }
11943 }
11944
11945 list<CDir*> dfs;
11946 in->get_dirfrags(dfs);
11947 if (f) {
11948 f->open_array_section("dirfrags");
11949 }
11950 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11951 CDir *dir = *p;
11952 if (f) {
11953 f->open_object_section("dir");
11954 dir->dump(f);
11955 } else {
11956 ostringstream tt;
11957 tt << " " << *dir << std::endl;
11958 string t = tt.str();
11959 r = safe_write(fd, t.c_str(), t.length());
11960 if (r < 0) {
11961 goto out;
11962 }
11963 }
11964
11965 if (f) {
11966 f->open_array_section("dentries");
11967 }
11968 for (CDir::map_t::iterator q = dir->items.begin();
11969 q != dir->items.end();
11970 ++q) {
11971 CDentry *dn = q->second;
11972 if (f) {
11973 f->open_object_section("dentry");
11974 dn->dump(f);
11975 f->close_section();
11976 } else {
11977 ostringstream uu;
11978 uu << " " << *dn << std::endl;
11979 string u = uu.str();
11980 r = safe_write(fd, u.c_str(), u.length());
11981 if (r < 0) {
11982 goto out;
11983 }
11984 }
11985 }
11986 if (f) {
11987 f->close_section(); //dentries
11988 }
11989 dir->check_rstats();
11990 if (f) {
11991 f->close_section(); //dir
11992 }
11993 }
11994 if (f) {
11995 f->close_section(); // dirfrags
11996 }
11997
11998 if (f) {
11999 f->close_section(); // inode
12000 }
12001 }
12002
12003 out:
12004 if (f) {
12005 f->close_section(); // inodes
12006 } else {
12007 ::close(fd);
12008 }
12009 return r;
12010 }
12011
12012
12013
12014 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12015 : MDSInternalContext(c->mds), cache(c), mdr(r)
12016 {}
12017
12018 void C_MDS_RetryRequest::finish(int r)
12019 {
12020 mdr->retry++;
12021 cache->dispatch_request(mdr);
12022 }
12023
12024
12025 class C_MDS_EnqueueScrub : public Context
12026 {
12027 Formatter *formatter;
12028 Context *on_finish;
12029 public:
12030 ScrubHeaderRef header;
12031 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12032 formatter(f), on_finish(fin), header(nullptr) {}
12033
12034 Context *take_finisher() {
12035 Context *fin = on_finish;
12036 on_finish = NULL;
12037 return fin;
12038 }
12039
12040 void finish(int r) override {
12041 if (r < 0) { // we failed the lookup or something; dump ourselves
12042 formatter->open_object_section("results");
12043 formatter->dump_int("return_code", r);
12044 formatter->close_section(); // results
12045 }
12046 if (on_finish)
12047 on_finish->complete(r);
12048 }
12049 };
12050
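// Entry point for scrubbing a path, typically driven by the "scrub_path"
// admin socket command; tag/force/recursive/repair mirror that command's
// arguments.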
12051 void MDCache::enqueue_scrub(
12052 const string& path,
12053 const std::string &tag,
12054 bool force, bool recursive, bool repair,
12055 Formatter *f, Context *fin)
12056 {
12057 dout(10) << __func__ << " " << path << dendl;
12058 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12059 filepath fp(path.c_str());
12060 mdr->set_filepath(fp);
12061
12062 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12063 cs->header = std::make_shared<ScrubHeader>(
12064 tag, force, recursive, repair, f);
12065
12066 mdr->internal_op_finish = cs;
12067 enqueue_scrub_work(mdr);
12068 }
12069
12070 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12071 {
12072 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12073 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12074 if (NULL == in)
12075 return;
12076
12077 // TODO: Remove this restriction
12078 assert(in->is_auth());
12079
12080 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12081 if (!locked)
12082 return;
12083
12084 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12085 ScrubHeaderRef &header = cs->header;
12086
12087 // Cannot scrub same dentry twice at same time
12088 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12089 mds->server->respond_to_request(mdr, -EBUSY);
12090 return;
12091 } else {
12092 in->scrub_info();
12093 }
12094
12095 header->set_origin(in);
12096
12097 // only set a completion context for non-recursive scrub, because we don't
12098 // want to block the asok caller on a long-running scrub
12099 if (!header->get_recursive()) {
12100 Context *fin = cs->take_finisher();
12101 mds->scrubstack->enqueue_inode_top(in, header,
12102 new MDSInternalContextWrapper(mds, fin));
12103 } else
12104 mds->scrubstack->enqueue_inode_bottom(in, header, NULL);
12105
12106 mds->server->respond_to_request(mdr, 0);
12107 return;
12108 }
12109
12110 struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12111 MDRequestRef mdr;
12112 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12113 MDCacheLogContext(c), mdr(m) {}
12114 void finish(int r) override {
12115 mdr->apply();
12116 get_mds()->server->respond_to_request(mdr, r);
12117 }
12118 };
12119
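// repair_dirfrag_stats() recounts fragstat/rstat for a single dirfrag from
// its in-memory dentries and, if the stored sums disagree, journals a
// corrected fnode (dragging mtime/change_attr/rctime forward rather than
// backwards).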
12120 void MDCache::repair_dirfrag_stats(CDir *dir)
12121 {
12122 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12123 mdr->pin(dir);
12124 mdr->internal_op_private = dir;
12125 mdr->internal_op_finish = new C_MDSInternalNoop;
12126 repair_dirfrag_stats_work(mdr);
12127 }
12128
12129 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12130 {
12131 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12132 dout(10) << __func__ << " " << *dir << dendl;
12133
12134 if (!dir->is_auth()) {
12135 mds->server->respond_to_request(mdr, -ESTALE);
12136 return;
12137 }
12138
12139 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12140 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12141
12142 mds->locker->drop_locks(mdr.get());
12143 mdr->drop_local_auth_pins();
12144 if (!mdr->remote_auth_pins.empty())
12145 mds->locker->notify_freeze_waiter(dir);
12146 return;
12147 }
12148
12149 mdr->auth_pin(dir);
12150
12151 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12152 CInode *diri = dir->inode;
12153 rdlocks.insert(&diri->dirfragtreelock);
12154 wrlocks.insert(&diri->nestlock);
12155 wrlocks.insert(&diri->filelock);
12156 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12157 return;
12158
12159 if (!dir->is_complete()) {
12160 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12161 return;
12162 }
12163
12164 frag_info_t frag_info;
12165 nest_info_t nest_info;
12166 for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
12167 CDentry *dn = it->second;
12168 if (dn->last != CEPH_NOSNAP)
12169 continue;
12170 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12171 if (dnl->is_primary()) {
12172 CInode *in = dnl->get_inode();
12173 nest_info.add(in->get_projected_inode()->accounted_rstat);
12174 if (in->is_dir())
12175 frag_info.nsubdirs++;
12176 else
12177 frag_info.nfiles++;
12178 } else if (dnl->is_remote())
12179 frag_info.nfiles++;
12180 }
12181
12182 fnode_t *pf = dir->get_projected_fnode();
12183 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12184 bool good_rstat = nest_info.same_sums(pf->rstat);
12185 if (good_fragstat && good_rstat) {
12186 dout(10) << __func__ << " no corruption found" << dendl;
12187 mds->server->respond_to_request(mdr, 0);
12188 return;
12189 }
12190
12191 pf = dir->project_fnode();
12192 pf->version = dir->pre_dirty();
12193 mdr->add_projected_fnode(dir);
12194
12195 mdr->ls = mds->mdlog->get_current_segment();
12196 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12197 mds->mdlog->start_entry(le);
12198
12199 if (!good_fragstat) {
12200 if (pf->fragstat.mtime > frag_info.mtime)
12201 frag_info.mtime = pf->fragstat.mtime;
12202 if (pf->fragstat.change_attr > frag_info.change_attr)
12203 frag_info.change_attr = pf->fragstat.change_attr;
12204 pf->fragstat = frag_info;
12205 mds->locker->mark_updated_scatterlock(&diri->filelock);
12206 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12207 mdr->add_updated_lock(&diri->filelock);
12208 }
12209
12210 if (!good_rstat) {
12211 if (pf->rstat.rctime > nest_info.rctime)
12212 nest_info.rctime = pf->rstat.rctime;
12213 pf->rstat = nest_info;
12214 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12215 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12216 mdr->add_updated_lock(&diri->nestlock);
12217 }
12218
12219 le->metablob.add_dir_context(dir);
12220 le->metablob.add_dir(dir, true);
12221
12222 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12223 }
12224
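// repair_inode_stats() fixes the inode-level dirstat/rstat by marking the
// file/nest scatterlocks dirty on every dirfrag and forcing a scatter-gather,
// then verifies the re-accumulated sums against the inode.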
12225 void MDCache::repair_inode_stats(CInode *diri)
12226 {
12227 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12228 mdr->pin(diri);
12229 mdr->internal_op_private = diri;
12230 mdr->internal_op_finish = new C_MDSInternalNoop;
12231 repair_inode_stats_work(mdr);
12232 }
12233
12234 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12235 {
12236 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12237 dout(10) << __func__ << " " << *diri << dendl;
12238
12239 if (!diri->is_auth()) {
12240 mds->server->respond_to_request(mdr, -ESTALE);
12241 return;
12242 }
12243 if (!diri->is_dir()) {
12244 mds->server->respond_to_request(mdr, -ENOTDIR);
12245 return;
12246 }
12247
12248 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12249 std::list<frag_t> frags;
12250
12251 if (mdr->ls) // already marked filelock/nestlock dirty ?
12252 goto do_rdlocks;
12253
12254 rdlocks.insert(&diri->dirfragtreelock);
12255 wrlocks.insert(&diri->nestlock);
12256 wrlocks.insert(&diri->filelock);
12257 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12258 return;
12259
12260 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12261 // the scatter-gather process, which will fix any fragstat/rstat errors.
12262 diri->dirfragtree.get_leaves(frags);
12263 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12264 CDir *dir = diri->get_dirfrag(*p);
12265 if (!dir) {
12266 assert(mdr->is_auth_pinned(diri));
12267 dir = diri->get_or_open_dirfrag(this, *p);
12268 }
12269 if (dir->get_version() == 0) {
12270 assert(dir->is_auth());
12271 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12272 return;
12273 }
12274 }
12275
12276 diri->state_set(CInode::STATE_REPAIRSTATS);
12277 mdr->ls = mds->mdlog->get_current_segment();
12278 mds->locker->mark_updated_scatterlock(&diri->filelock);
12279 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12280 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12281 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12282
12283 mds->locker->drop_locks(mdr.get());
12284
12285 do_rdlocks:
12286 // force the scatter-gather process
12287 rdlocks.insert(&diri->dirfragtreelock);
12288 rdlocks.insert(&diri->nestlock);
12289 rdlocks.insert(&diri->filelock);
12290 wrlocks.clear();
12291 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12292 return;
12293
12294 diri->state_clear(CInode::STATE_REPAIRSTATS);
12295
12296 frag_info_t dir_info;
12297 nest_info_t nest_info;
12298 nest_info.rsubdirs++; // the directory itself counts as one subdir
12299
12300 diri->dirfragtree.get_leaves(frags);
12301 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12302 CDir *dir = diri->get_dirfrag(*p);
12303 assert(dir);
12304 assert(dir->get_version() > 0);
12305 dir_info.add(dir->fnode.accounted_fragstat);
12306 nest_info.add(dir->fnode.accounted_rstat);
12307 }
12308
12309 if (!dir_info.same_sums(diri->inode.dirstat) ||
12310 !nest_info.same_sums(diri->inode.rstat)) {
12311 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12312 << *diri << dendl;
12313 }
12314
12315 mds->server->respond_to_request(mdr, 0);
12316 }
12317
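/**
 * Flush dirty metadata for the dentry/inode at `path`.  Starts an internal
 * CEPH_MDS_OP_FLUSH request and completes `fin` when done: 0 on success,
 * or -EROFS immediately if the filesystem is read-only.
 */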
12318 void MDCache::flush_dentry(const string& path, Context *fin)
12319 {
12320 if (is_readonly()) {
12321 dout(10) << __func__ << ": read-only FS" << dendl;
12322 fin->complete(-EROFS);
12323 return;
12324 }
12325 dout(10) << "flush_dentry " << path << dendl;
12326 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12327 filepath fp(path.c_str());
12328 mdr->set_filepath(fp);
12329 mdr->internal_op_finish = fin;
12330 flush_dentry_work(mdr);
12331 }
12332
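/**
 * Generic completion that replies to an internal MDRequest with the result
 * of an I/O operation (used below to finish the inode flush).
 */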
12333 class C_FinishIOMDR : public MDSInternalContextBase {
12334 protected:
12335 MDSRank *mds;
12336 MDRequestRef mdr;
12337 MDSRank *get_mds() override { return mds; }
12338 public:
12339 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12340 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12341 };
12342
12343 void MDCache::flush_dentry_work(MDRequestRef& mdr)
12344 {
12345 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12346 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12347 if (NULL == in)
12348 return;
12349
12350 // TODO: Is this necessary? Fix it if so
12351 assert(in->is_auth());
12352 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12353 if (!locked)
12354 return;
12355 in->flush(new C_FinishIOMDR(mds, mdr));
12356 }
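// Illustrative only (not part of the original source): a synchronous caller
// could drive flush_dentry() with a C_SaferCond-style waiter, e.g.
//
//   C_SaferCond cond;
//   mdcache->flush_dentry("/some/dir/file", &cond);
//   int r = cond.wait();        // 0 on success, -EROFS on a read-only FS
//
// The `mdcache` pointer and the path are placeholders for this sketch.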
12357
12358
12359 /**
12360 * Initialize performance counters with global perfcounter
12361 * collection.
12362 */
12363 void MDCache::register_perfcounters()
12364 {
12365 PerfCountersBuilder pcb(g_ceph_context,
12366 "mds_cache", l_mdc_first, l_mdc_last);
12367
12368 /* Stray/purge statistics */
12369 pcb.add_u64(l_mdc_num_strays, "num_strays",
12370 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
12371 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12372 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12373
12374 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12375 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12376 "Stray dentries enqueued for purge");
12377 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12378 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12379
12380
12381 /* Recovery queue statistics */
12382 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12383 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
12384 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
12385 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12386 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12387 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
12388 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
12389
12390 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12391 "Internal Request type enqueue scrub");
12392 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12393 "Internal Request type export dir");
12394 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12395 "Internal Request type flush");
12396 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12397 "Internal Request type fragmentdir");
12398 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12399 "Internal Request type frag stats");
12400 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12401 "Internal Request type inode stats");
12402
12403 logger.reset(pcb.create_perf_counters());
12404 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12405 recovery_queue.set_logger(logger.get());
12406 stray_manager.set_logger(logger.get());
12407 }
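// Note: the counters registered above live in the "mds_cache" section of the
// daemon's perf counters and can typically be inspected at runtime via the
// admin socket, e.g. `ceph daemon mds.<id> perf dump`.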
12408
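/**
 * Activate the StrayManager immediately if the cache is already open,
 * otherwise defer activation until open completes via wait_for_open().
 */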
12409 void MDCache::activate_stray_manager()
12410 {
12411 if (open) {
12412 stray_manager.activate();
12413 } else {
12414 wait_for_open(
12415 new MDSInternalContextWrapper(mds,
12416 new FunctionContext([this](int r){
12417 stray_manager.activate();
12418 })
12419 )
12420 );
12421 }
12422 }
12423
12424 /**
12425 * Call this when putting references to an inode/dentry or
12426 * when attempting to trim it.
12427 *
12428 * If this inode is no longer linked by anyone, and this MDS
12429 * rank holds the primary dentry, and that dentry is in a stray
12430 * directory, then give up the dentry to the StrayManager, never
12431 * to be seen again by MDCache.
12432 *
12433 * @param delay if true, then purgeable inodes are stashed until
12434 * the next trim(), rather than being purged right
12435 * away.
12436 */
12437 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
12438 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12439 mds->get_state() <= MDSMap::STATE_REJOIN)
12440 return;
12441
12442 CDentry *dn = in->get_projected_parent_dn();
12443
12444 if (dn->state_test(CDentry::STATE_PURGING)) {
12445 /* We have already entered the purging process; no need
12446 * to re-evaluate this dentry. */
12447 return;
12448 }
12449
12450 if (dn->get_projected_linkage()->is_primary() &&
12451 dn->get_dir()->get_inode()->is_stray()) {
12452 stray_manager.eval_stray(dn, delay);
12453 }
12454 }
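// Illustrative only: callers typically invoke maybe_eval_stray() right after
// dropping a reference, e.g.
//
//   if (in->get_num_ref() == 0)
//     maybe_eval_stray(in, /*delay=*/true);   // stash until the next trim()
//
// The surrounding condition is a placeholder; the real call sites decide when
// a reference drop makes the inode a purge candidate.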
12455
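/**
 * Drop dirty dentry/stat state on a stray directory inode so it can be
 * purged without writing that state back: remove dentries from every auth,
 * non-frozen dirfrag and, when there is no snaprealm, clear the dirty rstat
 * and scatterlock dirty flags.
 */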
12456 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12457 dout(10) << __func__ << " " << *diri << dendl;
12458 assert(diri->get_projected_parent_dir()->inode->is_stray());
12459 list<CDir*> ls;
12460 diri->get_dirfrags(ls);
12461 for (auto p : ls) {
12462 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12463 p->try_remove_dentries_for_stray();
12464 }
12465 if (!diri->snaprealm) {
12466 if (diri->is_auth())
12467 diri->clear_dirty_rstat();
12468 diri->clear_scatter_dirty();
12469 }
12470 }
12471