1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <map>
21
22 #include "MDCache.h"
23 #include "MDSRank.h"
24 #include "Server.h"
25 #include "Locker.h"
26 #include "MDLog.h"
27 #include "MDBalancer.h"
28 #include "Migrator.h"
29 #include "ScrubStack.h"
30
31 #include "SnapClient.h"
32
33 #include "MDSMap.h"
34
35 #include "CInode.h"
36 #include "CDir.h"
37
38 #include "Mutation.h"
39
40 #include "include/ceph_fs.h"
41 #include "include/filepath.h"
42
43 #include "msg/Message.h"
44 #include "msg/Messenger.h"
45
46 #include "common/errno.h"
47 #include "common/safe_io.h"
48 #include "common/perf_counters.h"
49 #include "common/MemoryModel.h"
50 #include "osdc/Journaler.h"
51 #include "osdc/Filer.h"
52
53 #include "events/ESubtreeMap.h"
54 #include "events/EUpdate.h"
55 #include "events/ESlaveUpdate.h"
56 #include "events/EImportFinish.h"
57 #include "events/EFragment.h"
58 #include "events/ECommitted.h"
59 #include "events/ESessions.h"
60
61 #include "messages/MGenericMessage.h"
62
63 #include "messages/MMDSResolve.h"
64 #include "messages/MMDSResolveAck.h"
65 #include "messages/MMDSCacheRejoin.h"
66
67 #include "messages/MDiscover.h"
68 #include "messages/MDiscoverReply.h"
69
70 //#include "messages/MInodeUpdate.h"
71 #include "messages/MDirUpdate.h"
72 #include "messages/MCacheExpire.h"
73
74 #include "messages/MInodeFileCaps.h"
75
76 #include "messages/MLock.h"
77 #include "messages/MDentryLink.h"
78 #include "messages/MDentryUnlink.h"
79
80 #include "messages/MMDSFindIno.h"
81 #include "messages/MMDSFindInoReply.h"
82
83 #include "messages/MMDSOpenIno.h"
84 #include "messages/MMDSOpenInoReply.h"
85
86 #include "messages/MClientRequest.h"
87 #include "messages/MClientCaps.h"
88 #include "messages/MClientSnap.h"
89 #include "messages/MClientQuota.h"
90
91 #include "messages/MMDSSlaveRequest.h"
92
93 #include "messages/MMDSFragmentNotify.h"
94
95 #include "messages/MGatherCaps.h"
96
97 #include "InoTable.h"
98
99 #include "common/Timer.h"
100
101 #include "perfglue/heap_profiler.h"
102
103 using namespace std;
104
105 #include "common/config.h"
106 #include "include/assert.h"
107
108 #define dout_context g_ceph_context
109 #define dout_subsys ceph_subsys_mds
110 #undef dout_prefix
111 #define dout_prefix _prefix(_dout, mds)
112 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
113 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
114 }
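// With the macros above, every dout() line from this file carries the prefix
// "mds.<rank>.cache "; e.g. a rank-0 message would look roughly like:
//   mds.0.cache populate_mydir done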
115
116 set<int> SimpleLock::empty_gather_set;
117
118
119 /**
120 * All non-I/O contexts that require a reference
121 * to an MDCache instance descend from this.
122 */
123 class MDCacheContext : public virtual MDSInternalContextBase {
124 protected:
125 MDCache *mdcache;
126 MDSRank *get_mds() override
127 {
128 assert(mdcache != NULL);
129 return mdcache->mds;
130 }
131 public:
132 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
133 };
134
135
136 /**
137 * Only for contexts called back from an I/O completion
138 *
139 * Note: duplication of members wrt MDCacheContext, because
140  * it's the lesser of two evils compared with introducing
141 * yet another piece of (multiple) inheritance.
142 */
143 class MDCacheIOContext : public virtual MDSIOContextBase {
144 protected:
145 MDCache *mdcache;
146 MDSRank *get_mds() override
147 {
148 assert(mdcache != NULL);
149 return mdcache->mds;
150 }
151 public:
152 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
153 };
154
155 class MDCacheLogContext : public virtual MDSLogContextBase {
156 protected:
157 MDCache *mdcache;
158 MDSRank *get_mds() override
159 {
160 assert(mdcache != NULL);
161 return mdcache->mds;
162 }
163 public:
164 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
165 };
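// A minimal sketch of how these contexts are typically used (illustrative
// only; C_Example is not a real class in this file):
//
//   struct C_Example : public MDCacheContext {
//     explicit C_Example(MDCache *c) : MDCacheContext(c) {}
//     void finish(int r) override {
//       // safe to touch mdcache here; get_mds() asserts mdcache != NULL
//     }
//   };
//
// Instances are queued as waiters or completion callbacks and later fired via
// complete(r), which invokes finish().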
166
167 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
168 mds(m),
169 filer(m->objecter, m->finisher),
170 exceeded_size_limit(false),
171 recovery_queue(m),
172 stray_manager(m, purge_queue_)
173 {
174 migrator.reset(new Migrator(mds, this));
175 root = NULL;
176 myin = NULL;
177 readonly = false;
178
179 stray_index = 0;
180 for (int i = 0; i < NUM_STRAY; ++i) {
181 strays[i] = NULL;
182 }
183
184 num_inodes_with_caps = 0;
185
186 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
187 (g_conf->mds_dir_max_commit_size << 20) :
188 (0.9 *(g_conf->osd_max_write_size << 20));
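  // Both branches convert MiB to bytes (<< 20): e.g. assuming
  // mds_dir_max_commit_size = 10 this gives a 10 MiB cap; when the option is
  // 0 we fall back to 90% of osd_max_write_size instead.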
189
190 discover_last_tid = 0;
191 open_ino_last_tid = 0;
192 find_ino_peer_last_tid = 0;
193
194 last_cap_id = 0;
195
196 client_lease_durations[0] = 5.0;
197 client_lease_durations[1] = 30.0;
198 client_lease_durations[2] = 300.0;
199
200 resolves_pending = false;
201 rejoins_pending = false;
202 cap_imports_num_opening = 0;
203
204 opening_root = open = false;
205 lru.lru_set_max(g_conf->mds_cache_size);
206 lru.lru_set_midpoint(g_conf->mds_cache_mid);
207
208 bottom_lru.lru_set_max(0);
209 bottom_lru.lru_set_midpoint(0);
210
211 decayrate.set_halflife(g_conf->mds_decay_halflife);
212
213 did_shutdown_log_cap = false;
214 }
215
216 MDCache::~MDCache()
217 {
218 if (logger) {
219 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
220 }
221 }
222
223
224
225 void MDCache::log_stat()
226 {
227 mds->logger->set(l_mds_inode_max, g_conf->mds_cache_size);
228 mds->logger->set(l_mds_inodes, lru.lru_get_size());
229 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
230 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
231 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
232 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
233 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
234 mds->logger->set(l_mds_caps, Capability::count());
235 }
236
237
238 //
239
240 bool MDCache::shutdown()
241 {
242 if (lru.lru_get_size() > 0) {
243 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
244 //show_cache();
245 show_subtrees();
246 //dump();
247 }
248 return true;
249 }
250
251
252 // ====================================================================
253 // some inode functions
254
255 void MDCache::add_inode(CInode *in)
256 {
257 // add to lru, inode map
258 assert(inode_map.count(in->vino()) == 0); // should be no dup inos!
259 inode_map[ in->vino() ] = in;
260
261 if (in->ino() < MDS_INO_SYSTEM_BASE) {
262 if (in->ino() == MDS_INO_ROOT)
263 root = in;
264 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
265 myin = in;
266 else if (in->is_stray()) {
267 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
268 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
269 }
270 }
271 if (in->is_base())
272 base_inodes.insert(in);
273 }
274
275 if (CInode::count() >
276 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
277 exceeded_size_limit = true;
278 }
279 }
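// For scale: assuming the old defaults of mds_cache_size = 100k inodes and
// mds_health_cache_threshold = 1.5 (both configurable), the flag above is set
// once more than ~150k CInodes exist in memory, which other code uses to
// raise a cache-pressure warning.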
280
281 void MDCache::remove_inode(CInode *o)
282 {
283 dout(14) << "remove_inode " << *o << dendl;
284
285 if (o->get_parent_dn()) {
286 // FIXME: multiple parents?
287 CDentry *dn = o->get_parent_dn();
288 assert(!dn->is_dirty());
289 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
290 }
291
292 if (o->is_dirty())
293 o->mark_clean();
294 if (o->is_dirty_parent())
295 o->clear_dirty_parent();
296
297 o->clear_scatter_dirty();
298
299 o->item_open_file.remove_myself();
300
301 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
302 export_pin_queue.erase(o);
303
304 // remove from inode map
305 inode_map.erase(o->vino());
306
307 if (o->ino() < MDS_INO_SYSTEM_BASE) {
308 if (o == root) root = 0;
309 if (o == myin) myin = 0;
310 if (o->is_stray()) {
311 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
312 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
313 }
314 }
315 if (o->is_base())
316 base_inodes.erase(o);
317 }
318
319 // delete it
320 assert(o->get_num_ref() == 0);
321 delete o;
322 }
323
324 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
325 {
326 file_layout_t result = file_layout_t::get_default();
327 result.pool_id = mdsmap.get_first_data_pool();
328 return result;
329 }
330
331 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
332 {
333 file_layout_t result = file_layout_t::get_default();
334 result.pool_id = mdsmap.get_metadata_pool();
335 if (g_conf->mds_log_segment_size > 0) {
336 result.object_size = g_conf->mds_log_segment_size;
337 result.stripe_unit = g_conf->mds_log_segment_size;
338 }
339 return result;
340 }
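// Example: if mds_log_segment_size is left at 0 (assumed default), the journal
// layout keeps file_layout_t's stock object_size/stripe_unit (typically 4 MiB)
// and only the pool is redirected to the metadata pool.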
341
342 void MDCache::init_layouts()
343 {
344 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
345 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
346 }
347
348 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
349 int mode) const
350 {
351 in->inode.ino = ino;
352 in->inode.version = 1;
353 in->inode.xattr_version = 1;
354 in->inode.mode = 0500 | mode;
355 in->inode.size = 0;
356 in->inode.ctime =
357 in->inode.mtime =
358 in->inode.btime = ceph_clock_now();
359 in->inode.nlink = 1;
360 in->inode.truncate_size = -1ull;
361 in->inode.change_attr = 0;
362 in->inode.export_pin = MDS_RANK_NONE;
363
364 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
365 if (in->inode.is_dir()) {
366 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
367 ++in->inode.rstat.rsubdirs;
368 } else {
369 in->inode.layout = default_file_layout;
370 ++in->inode.rstat.rfiles;
371 }
372 in->inode.accounted_rstat = in->inode.rstat;
373
374 if (in->is_base()) {
375 if (in->is_root())
376 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
377 else
378 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
379 in->open_snaprealm(); // empty snaprealm
380 assert(!in->snaprealm->parent); // created its own
381 in->snaprealm->srnode.seq = 1;
382 }
383 }
384
385 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
386 {
387 dout(0) << "creating system inode with ino:" << ino << dendl;
388 CInode *in = new CInode(this);
389 create_unlinked_system_inode(in, ino, mode);
390 add_inode(in);
391 return in;
392 }
393
394 CInode *MDCache::create_root_inode()
395 {
396 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
397 i->inode.uid = g_conf->mds_root_ino_uid;
398 i->inode.gid = g_conf->mds_root_ino_gid;
399 i->inode.layout = default_file_layout;
400 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
401 return i;
402 }
403
404 void MDCache::create_empty_hierarchy(MDSGather *gather)
405 {
406 // create root dir
407 CInode *root = create_root_inode();
408
409 // force empty root dir
410 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
411 adjust_subtree_auth(rootdir, mds->get_nodeid());
412 rootdir->dir_rep = CDir::REP_ALL; //NONE;
413
414 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
415 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
416
417 root->inode.dirstat = rootdir->fnode.fragstat;
418 root->inode.rstat = rootdir->fnode.rstat;
419 ++root->inode.rstat.rsubdirs;
420 root->inode.accounted_rstat = root->inode.rstat;
421
422 rootdir->mark_complete();
423 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
424 rootdir->commit(0, gather->new_sub());
425
426 root->store(gather->new_sub());
427 }
428
429 void MDCache::create_mydir_hierarchy(MDSGather *gather)
430 {
431 // create mds dir
432 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
433
434 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
435 adjust_subtree_auth(mydir, mds->get_nodeid());
436
437 LogSegment *ls = mds->mdlog->get_current_segment();
438
439 // stray dir
440 for (int i = 0; i < NUM_STRAY; ++i) {
441 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
442 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
443 stringstream name;
444 name << "stray" << i;
445 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
446 sdn->_mark_dirty(mds->mdlog->get_current_segment());
447
448 stray->inode.dirstat = straydir->fnode.fragstat;
449
450 mydir->fnode.rstat.add(stray->inode.rstat);
451 mydir->fnode.fragstat.nsubdirs++;
452 // save them
453 straydir->mark_complete();
454 straydir->mark_dirty(straydir->pre_dirty(), ls);
455 straydir->commit(0, gather->new_sub());
456 stray->_mark_dirty_parent(ls, true);
457 stray->store_backtrace(gather->new_sub());
458 }
459
460 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
461 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
462
463 myin->inode.dirstat = mydir->fnode.fragstat;
464 myin->inode.rstat = mydir->fnode.rstat;
465 ++myin->inode.rstat.rsubdirs;
466 myin->inode.accounted_rstat = myin->inode.rstat;
467
468 mydir->mark_complete();
469 mydir->mark_dirty(mydir->pre_dirty(), ls);
470 mydir->commit(0, gather->new_sub());
471
472 myin->store(gather->new_sub());
473 }
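// The resulting per-MDS hierarchy looks roughly like this (for rank 0):
//
//   ~mds0/                 <- MDS_INO_MDSDIR(0), "myin"
//     stray0/ ... strayN/  <- NUM_STRAY stray dirs for unlinked-but-open inodes
//
// Each straydir is committed and its inode's backtrace stored before the
// enclosing mydir fragment is itself committed.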
474
475 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
476 MutationRef mut;
477 CDentry *dn;
478 version_t dpv;
479 MDSInternalContextBase *fin;
480 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
481 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
482 void finish(int r) override {
483 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
484 }
485 };
486
487 void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
488 {
489 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
490 CDentry *dn = dir->add_null_dentry(name);
491
492 dn->push_projected_linkage(in);
493 version_t dpv = dn->pre_dirty();
494
495 CDir *mdir = 0;
496 if (in->inode.is_dir()) {
497 in->inode.rstat.rsubdirs = 1;
498
499 mdir = in->get_or_open_dirfrag(this, frag_t());
500 mdir->mark_complete();
501 mdir->pre_dirty();
502 } else
503 in->inode.rstat.rfiles = 1;
504 in->inode.version = dn->pre_dirty();
505
506 SnapRealm *realm = dir->get_inode()->find_snaprealm();
507 dn->first = in->first = realm->get_newest_seq() + 1;
508
509 MutationRef mut(new MutationImpl());
510
511 // force some locks. hacky.
512 mds->locker->wrlock_force(&dir->inode->filelock, mut);
513 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
514
515 mut->ls = mds->mdlog->get_current_segment();
516 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
517 mds->mdlog->start_entry(le);
518
519 if (!in->is_mdsdir()) {
520 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
521 le->metablob.add_primary_dentry(dn, in, true);
522 } else {
523 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
524 journal_dirty_inode(mut.get(), &le->metablob, in);
525 dn->push_projected_linkage(in->ino(), in->d_type());
526 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
527 le->metablob.add_root(true, in);
528 }
529 if (mdir)
530 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
531
532 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
533 mds->mdlog->flush();
534 }
535
536 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
537 {
538 dout(10) << "_create_system_file_finish " << *dn << dendl;
539
540 dn->pop_projected_linkage();
541 dn->mark_dirty(dpv, mut->ls);
542
543 CInode *in = dn->get_linkage()->get_inode();
544 in->inode.version--;
545 in->mark_dirty(in->inode.version + 1, mut->ls);
546
547 if (in->inode.is_dir()) {
548 CDir *dir = in->get_dirfrag(frag_t());
549 assert(dir);
550 dir->mark_dirty(1, mut->ls);
551 dir->mark_new(mut->ls);
552 }
553
554 mut->apply();
555 mds->locker->drop_locks(mut.get());
556 mut->cleanup();
557
558 fin->complete(0);
559
560 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
561 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
562 }
563
564
565
566 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
567 MDCache *cache;
568 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
569 void finish(int r) override {
570 if (r < 0) {
571 // If we can't open root, something disastrous has happened: mark
572 // this rank damaged for operator intervention. Note that
573 // it is not okay to call suicide() here because we are in
574 // a Finisher callback.
575 cache->mds->damaged();
576 ceph_abort(); // damaged should never return
577 } else {
578 cache->open_root();
579 }
580 }
581 };
582
583 void MDCache::open_root_inode(MDSInternalContextBase *c)
584 {
585 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
586 CInode *in;
587 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
588 in->fetch(c);
589 } else {
590 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
591 }
592 }
593
594 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
595 {
596 MDSGatherBuilder gather(g_ceph_context);
597
598 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
599 in->fetch(gather.new_sub());
600
601 gather.set_finisher(c);
602 gather.activate();
603 }
604
605 void MDCache::open_root()
606 {
607 dout(10) << "open_root" << dendl;
608
609 if (!root) {
610 open_root_inode(new C_MDS_RetryOpenRoot(this));
611 return;
612 }
613 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
614 assert(root->is_auth());
615 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
616 assert(rootdir);
617 if (!rootdir->is_subtree_root())
618 adjust_subtree_auth(rootdir, mds->get_nodeid());
619 if (!rootdir->is_complete()) {
620 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
621 return;
622 }
623 } else {
624 assert(!root->is_auth());
625 CDir *rootdir = root->get_dirfrag(frag_t());
626 if (!rootdir) {
627 discover_dir_frag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
628 return;
629 }
630 }
631
632 if (!myin) {
633 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
634 in->fetch(new C_MDS_RetryOpenRoot(this));
635 return;
636 }
637 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
638 assert(mydir);
639 adjust_subtree_auth(mydir, mds->get_nodeid());
640
641 populate_mydir();
642 }
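// Note the retry pattern above: each missing piece (root inode, root dirfrag,
// mydir inode) is fetched or discovered with a C_MDS_RetryOpenRoot callback,
// so open_root() simply re-runs from the top until everything is in cache and
// it can fall through to populate_mydir().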
643
644 void MDCache::populate_mydir()
645 {
646 assert(myin);
647 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
648 assert(mydir);
649
650 dout(10) << "populate_mydir " << *mydir << dendl;
651
652 if (!mydir->is_complete()) {
653 mydir->fetch(new C_MDS_RetryOpenRoot(this));
654 return;
655 }
656
657 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
658     // A missing dirfrag; we will recreate it. It must be marked dirty
659     // before we dirty any of the strays we create within it.
660 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
661 "recreating it now";
662 LogSegment *ls = mds->mdlog->get_current_segment();
663 mydir->state_clear(CDir::STATE_BADFRAG);
664 mydir->mark_complete();
665 mydir->mark_dirty(mydir->pre_dirty(), ls);
666 }
667
668 // open or create stray
669 uint64_t num_strays = 0;
670 for (int i = 0; i < NUM_STRAY; ++i) {
671 stringstream name;
672 name << "stray" << i;
673 CDentry *straydn = mydir->lookup(name.str());
674
675 // allow for older fs's with stray instead of stray0
676 if (straydn == NULL && i == 0)
677 straydn = mydir->lookup("stray");
678
679 if (!straydn || !straydn->get_linkage()->get_inode()) {
680 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
681 new C_MDS_RetryOpenRoot(this));
682 return;
683 }
684 assert(straydn);
685 assert(strays[i]);
686 // we make multiple passes through this method; make sure we only pin each stray once.
687 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
688 strays[i]->get(CInode::PIN_STRAY);
689 strays[i]->state_set(CInode::STATE_STRAYPINNED);
690 strays[i]->get_stickydirs();
691 }
692 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
693
694 // open all frags
695 list<frag_t> ls;
696 strays[i]->dirfragtree.get_leaves(ls);
697 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
698 frag_t fg = *p;
699 CDir *dir = strays[i]->get_dirfrag(fg);
700 if (!dir) {
701 dir = strays[i]->get_or_open_dirfrag(this, fg);
702 }
703
704 // DamageTable applies special handling to strays: it will
705 // have damaged() us out if one is damaged.
706 assert(!dir->state_test(CDir::STATE_BADFRAG));
707
708 if (dir->get_version() == 0) {
709 dir->fetch(new C_MDS_RetryOpenRoot(this));
710 return;
711 }
712
713 if (dir->get_frag_size() > 0)
714 num_strays += dir->get_frag_size();
715 }
716 }
717
718 stray_manager.set_num_strays(num_strays);
719
720 // okay!
721 dout(10) << "populate_mydir done" << dendl;
722 assert(!open);
723 open = true;
724 mds->queue_waiters(waiting_for_open);
725
726 scan_stray_dir();
727 }
728
729 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
730 {
731 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
732 }
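// MDSDIR inodes are numbered MDS_INO_MDSDIR_OFFSET + rank, so masking the ino
// with (MAX_MDS-1) recovers the owning rank (this relies on MAX_MDS being a
// power of two); e.g. a request for rank 3's ~mds3 is discovered from rank 3.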
733
734 CDir *MDCache::get_stray_dir(CInode *in)
735 {
736 string straydname;
737 in->name_stray_dentry(straydname);
738
739 CInode *strayi = get_stray();
740 assert(strayi);
741 frag_t fg = strayi->pick_dirfrag(straydname);
742 CDir *straydir = strayi->get_dirfrag(fg);
743 assert(straydir);
744 return straydir;
745 }
746
747 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
748 {
749 CDir *straydir = get_stray_dir(in);
750 string straydname;
751 in->name_stray_dentry(straydname);
752 CDentry *straydn = straydir->lookup(straydname);
753 if (!straydn) {
754 straydn = straydir->add_null_dentry(straydname);
755 straydn->mark_new();
756 } else {
757 assert(straydn->get_projected_linkage()->is_null());
758 }
759
760 straydn->state_set(CDentry::STATE_STRAY);
761 return straydn;
762 }
763
764
765
766 MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
767 {
768 // inode?
769 if (info.ino)
770 return get_inode(info.ino, info.snapid);
771
772 // dir or dentry.
773 CDir *dir = get_dirfrag(info.dirfrag);
774 if (!dir) return 0;
775
776 if (info.dname.length())
777 return dir->lookup(info.dname, info.snapid);
778 else
779 return dir;
780 }
781
782
783
784
785 // ====================================================================
786 // subtree management
787
788 void MDCache::list_subtrees(list<CDir*>& ls)
789 {
790 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
791 p != subtrees.end();
792 ++p)
793 ls.push_back(p->first);
794 }
795
796 /*
797 * adjust the dir_auth of a subtree.
798  * merge with parent and/or child subtrees, if it is appropriate.
799 * merge can ONLY happen if both parent and child have unambiguous auth.
800 */
801 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool do_eval)
802 {
803 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
804 << " on " << *dir << dendl;
805
806 if (mds->is_any_replay() || mds->is_resolve())
807 do_eval = false;
808
809 show_subtrees();
810
811 CDir *root;
812 if (dir->inode->is_base()) {
813 root = dir; // bootstrap hack.
814 if (subtrees.count(root) == 0) {
815 subtrees[root];
816 root->get(CDir::PIN_SUBTREE);
817 }
818 } else {
819 root = get_subtree_root(dir); // subtree root
820 }
821 assert(root);
822 assert(subtrees.count(root));
823 dout(7) << " current root is " << *root << dendl;
824
825 if (root == dir) {
826 // i am already a subtree.
827 dir->set_dir_auth(auth);
828 } else {
829 // i am a new subtree.
830 dout(10) << " new subtree at " << *dir << dendl;
831 assert(subtrees.count(dir) == 0);
832 subtrees[dir]; // create empty subtree bounds list for me.
833 dir->get(CDir::PIN_SUBTREE);
834
835 // set dir_auth
836 dir->set_dir_auth(auth);
837
838 // move items nested beneath me, under me.
839 set<CDir*>::iterator p = subtrees[root].begin();
840 while (p != subtrees[root].end()) {
841 set<CDir*>::iterator next = p;
842 ++next;
843 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
844 // move under me
845 dout(10) << " claiming child bound " << **p << dendl;
846 subtrees[dir].insert(*p);
847 subtrees[root].erase(p);
848 }
849 p = next;
850 }
851
852 // i am a bound of the parent subtree.
853 subtrees[root].insert(dir);
854
855 // i am now the subtree root.
856 root = dir;
857
858 // adjust recursive pop counters
859 if (dir->is_auth()) {
860 utime_t now = ceph_clock_now();
861 CDir *p = dir->get_parent_dir();
862 while (p) {
863 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
864 if (p->is_subtree_root()) break;
865 p = p->inode->get_parent_dir();
866 }
867 }
868
869 if (do_eval)
870 eval_subtree_root(dir->get_inode());
871 }
872
873 show_subtrees();
874 }
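// Reminder on mds_authority_t: it is a <first, second> pair of ranks. Auth is
// unambiguous when second == CDIR_AUTH_UNKNOWN; during migrations both slots
// are filled, and try_subtree_merge_at() will only fold a subtree back into
// its parent once its auth is unambiguous again.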
875
876
877 void MDCache::try_subtree_merge(CDir *dir)
878 {
879 dout(7) << "try_subtree_merge " << *dir << dendl;
880 assert(subtrees.count(dir));
881 set<CDir*> oldbounds = subtrees[dir];
882
883 // try merge at my root
884 try_subtree_merge_at(dir);
885
886 // try merge at my old bounds
887 for (set<CDir*>::iterator p = oldbounds.begin();
888 p != oldbounds.end();
889 ++p)
890 try_subtree_merge_at(*p);
891 }
892
893 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
894 CInode *in;
895 MutationRef mut;
896 public:
897 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
898 void finish(int r) override {
899 mdcache->subtree_merge_writebehind_finish(in, mut);
900 }
901 };
902
903 void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
904 {
905 dout(10) << "try_subtree_merge_at " << *dir << dendl;
906 assert(subtrees.count(dir));
907
908 if (mds->is_any_replay() || mds->is_resolve())
909 do_eval = false;
910
911 // merge with parent?
912 CDir *parent = dir;
913 if (!dir->inode->is_base())
914 parent = get_subtree_root(dir->get_parent_dir());
915
916 if (parent != dir && // we have a parent,
917 parent->dir_auth == dir->dir_auth && // auth matches,
918 dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous,
919 !dir->state_test(CDir::STATE_EXPORTBOUND) && // not an exportbound,
920 !dir->state_test(CDir::STATE_AUXSUBTREE)) { // not aux subtree
921 // merge with parent.
922 dout(10) << " subtree merge at " << *dir << dendl;
923 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
924
925 // move our bounds under the parent
926 for (set<CDir*>::iterator p = subtrees[dir].begin();
927 p != subtrees[dir].end();
928 ++p)
929 subtrees[parent].insert(*p);
930
931 // we are no longer a subtree or bound
932 dir->put(CDir::PIN_SUBTREE);
933 subtrees.erase(dir);
934 subtrees[parent].erase(dir);
935
936 // adjust popularity?
937 if (dir->is_auth()) {
938 utime_t now = ceph_clock_now();
939 CDir *p = dir->get_parent_dir();
940 while (p) {
941 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
942 if (p->is_subtree_root()) break;
943 p = p->inode->get_parent_dir();
944 }
945 }
946
947 if (do_eval)
948 eval_subtree_root(dir->get_inode());
949 }
950
951 show_subtrees(15);
952 }
953
954 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
955 {
956 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
957 in->pop_and_dirty_projected_inode(mut->ls);
958
959 mut->apply();
960 mds->locker->drop_locks(mut.get());
961 mut->cleanup();
962
963 in->auth_unpin(this);
964 }
965
966 void MDCache::eval_subtree_root(CInode *diri)
967 {
968 // evaluate subtree inode filelock?
969 // (we should scatter the filelock on subtree bounds)
970 if (diri->is_auth())
971 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
972 }
973
974
975 void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
976 {
977 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
978 << " on " << *dir
979 << " bounds " << bounds
980 << dendl;
981
982 show_subtrees();
983
984 CDir *root;
985 if (dir->ino() == MDS_INO_ROOT) {
986 root = dir; // bootstrap hack.
987 if (subtrees.count(root) == 0) {
988 subtrees[root];
989 root->get(CDir::PIN_SUBTREE);
990 }
991 } else {
992 root = get_subtree_root(dir); // subtree root
993 }
994 assert(root);
995 assert(subtrees.count(root));
996 dout(7) << " current root is " << *root << dendl;
997
998 mds_authority_t oldauth = dir->authority();
999
1000 if (root == dir) {
1001 // i am already a subtree.
1002 dir->set_dir_auth(auth);
1003 } else {
1004 // i am a new subtree.
1005 dout(10) << " new subtree at " << *dir << dendl;
1006 assert(subtrees.count(dir) == 0);
1007 subtrees[dir]; // create empty subtree bounds list for me.
1008 dir->get(CDir::PIN_SUBTREE);
1009
1010 // set dir_auth
1011 dir->set_dir_auth(auth);
1012
1013 // move items nested beneath me, under me.
1014 set<CDir*>::iterator p = subtrees[root].begin();
1015 while (p != subtrees[root].end()) {
1016 set<CDir*>::iterator next = p;
1017 ++next;
1018 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1019 // move under me
1020 dout(10) << " claiming child bound " << **p << dendl;
1021 subtrees[dir].insert(*p);
1022 subtrees[root].erase(p);
1023 }
1024 p = next;
1025 }
1026
1027 // i am a bound of the parent subtree.
1028 subtrees[root].insert(dir);
1029
1030 // i am now the subtree root.
1031 root = dir;
1032 }
1033
1034 // verify/adjust bounds.
1035 // - these may be new, or
1036 // - beneath existing ambiguous bounds (which will be collapsed),
1037 // - but NOT beneath unambiguous bounds.
1038 for (set<CDir*>::iterator p = bounds.begin();
1039 p != bounds.end();
1040 ++p) {
1041 CDir *bound = *p;
1042
1043 // new bound?
1044 if (subtrees[dir].count(bound) == 0) {
1045 if (get_subtree_root(bound) == dir) {
1046 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1047 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1048 }
1049 else {
1050 dout(10) << " want bound " << *bound << dendl;
1051 CDir *t = get_subtree_root(bound->get_parent_dir());
1052 if (subtrees[t].count(bound) == 0) {
1053 assert(t != dir);
1054 dout(10) << " new bound " << *bound << dendl;
1055 adjust_subtree_auth(bound, t->authority());
1056 }
1057 // make sure it's nested beneath ambiguous subtree(s)
1058 while (1) {
1059 while (subtrees[dir].count(t) == 0)
1060 t = get_subtree_root(t->get_parent_dir());
1061 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1062 adjust_subtree_auth(t, auth);
1063 try_subtree_merge_at(t);
1064 t = get_subtree_root(bound->get_parent_dir());
1065 if (t == dir) break;
1066 }
1067 }
1068 }
1069 else {
1070 dout(10) << " already have bound " << *bound << dendl;
1071 }
1072 }
1073 // merge stray bounds?
1074 while (!subtrees[dir].empty()) {
1075 set<CDir*> copy = subtrees[dir];
1076 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1077 if (bounds.count(*p) == 0) {
1078 CDir *stray = *p;
1079 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1080 adjust_subtree_auth(stray, auth);
1081 try_subtree_merge_at(stray);
1082 }
1083 }
1084 // swallowing subtree may add new subtree bounds
1085 if (copy == subtrees[dir])
1086 break;
1087 }
1088
1089 // bound should now match.
1090 verify_subtree_bounds(dir, bounds);
1091
1092 show_subtrees();
1093 }
1094
1095
1096 /*
1097 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1098 * fragmentation as necessary to get an equivalent bounding set. That is, only
1099 * split if one of our frags spans the provided bounding set. Never merge.
1100 */
1101 void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1102 {
1103 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1104
1105 // sort by ino
1106 map<inodeno_t, fragset_t> byino;
1107 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1108 byino[p->ino].insert(p->frag);
1109 dout(10) << " by ino: " << byino << dendl;
1110
1111 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1112 CInode *diri = get_inode(p->first);
1113 if (!diri)
1114 continue;
1115 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1116
1117 fragtree_t tmpdft;
1118 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1119 tmpdft.force_to_leaf(g_ceph_context, *q);
1120
1121 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1122 frag_t fg = *q;
1123 list<frag_t> fgls;
1124 diri->dirfragtree.get_leaves_under(fg, fgls);
1125 if (fgls.empty()) {
1126 bool all = true;
1127 frag_t approx_fg = diri->dirfragtree[fg.value()];
1128 list<frag_t> ls;
1129 tmpdft.get_leaves_under(approx_fg, ls);
1130 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1131 if (p->second.get().count(*r) == 0) {
1132 // not bound, so the resolve message is from auth MDS of the dirfrag
1133 force_dir_fragment(diri, *r);
1134 all = false;
1135 }
1136 }
1137 if (all)
1138 fgls.push_back(approx_fg);
1139 else
1140 diri->dirfragtree.get_leaves_under(fg, fgls);
1141 }
1142 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1143 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1144 CDir *dir = diri->get_dirfrag(*r);
1145 if (dir)
1146 bounds.insert(dir);
1147 }
1148 }
1149 }
1150 }
1151
1152 void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1153 {
1154 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1155 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1156
1157 set<CDir*> bounds;
1158 get_force_dirfrag_bound_set(bound_dfs, bounds);
1159 adjust_bounded_subtree_auth(dir, bounds, auth);
1160 }
1161
1162 void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1163 {
1164 dout(10) << "map_dirfrag_set " << dfs << dendl;
1165
1166 // group by inode
1167 map<inodeno_t, fragset_t> ino_fragset;
1168 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1169 ino_fragset[p->ino].insert(p->frag);
1170
1171 // get frags
1172 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1173 p != ino_fragset.end();
1174 ++p) {
1175 CInode *in = get_inode(p->first);
1176 if (!in)
1177 continue;
1178
1179 list<frag_t> fglist;
1180 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1181 in->dirfragtree.get_leaves_under(*q, fglist);
1182
1183 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1184 << " on " << *in << dendl;
1185
1186 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1187 CDir *dir = in->get_dirfrag(*q);
1188 if (dir)
1189 result.insert(dir);
1190 }
1191 }
1192 }
1193
1194
1195
1196 CDir *MDCache::get_subtree_root(CDir *dir)
1197 {
1198 // find the underlying dir that delegates (or is about to delegate) auth
1199 while (true) {
1200 if (dir->is_subtree_root())
1201 return dir;
1202 dir = dir->get_inode()->get_parent_dir();
1203 if (!dir)
1204 return 0; // none
1205 }
1206 }
1207
1208 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1209 {
1210 // find the underlying dir that delegates (or is about to delegate) auth
1211 while (true) {
1212 if (dir->is_subtree_root())
1213 return dir;
1214 dir = dir->get_inode()->get_projected_parent_dir();
1215 if (!dir)
1216 return 0; // none
1217 }
1218 }
1219
1220 void MDCache::remove_subtree(CDir *dir)
1221 {
1222 dout(10) << "remove_subtree " << *dir << dendl;
1223 assert(subtrees.count(dir));
1224 assert(subtrees[dir].empty());
1225 subtrees.erase(dir);
1226 dir->put(CDir::PIN_SUBTREE);
1227 if (dir->get_parent_dir()) {
1228 CDir *p = get_subtree_root(dir->get_parent_dir());
1229 assert(subtrees[p].count(dir));
1230 subtrees[p].erase(dir);
1231 }
1232 }
1233
1234 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1235 {
1236 assert(subtrees.count(dir));
1237 bounds = subtrees[dir];
1238 }
1239
1240 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1241 {
1242 if (subtrees.count(dir)) {
1243 // just copy them, dir is a subtree.
1244 get_subtree_bounds(dir, bounds);
1245 } else {
1246 // find them
1247 CDir *root = get_subtree_root(dir);
1248 for (set<CDir*>::iterator p = subtrees[root].begin();
1249 p != subtrees[root].end();
1250 ++p) {
1251 CDir *t = *p;
1252 while (t != root) {
1253 t = t->get_parent_dir();
1254 assert(t);
1255 if (t == dir) {
1256 bounds.insert(*p);
1257 continue;
1258 }
1259 }
1260 }
1261 }
1262 }
1263
1264 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1265 {
1266 // for debugging only.
1267 assert(subtrees.count(dir));
1268 if (bounds != subtrees[dir]) {
1269 dout(0) << "verify_subtree_bounds failed" << dendl;
1270 set<CDir*> b = bounds;
1271 for (auto &cd : subtrees[dir]) {
1272 if (bounds.count(cd)) {
1273 b.erase(cd);
1274 continue;
1275 }
1276 dout(0) << " missing bound " << *cd << dendl;
1277 }
1278 for (const auto &cd : b)
1279 dout(0) << " extra bound " << *cd << dendl;
1280 }
1281 assert(bounds == subtrees[dir]);
1282 }
1283
1284 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1285 {
1286 // for debugging only.
1287 assert(subtrees.count(dir));
1288
1289 // make sure that any bounds i do have are properly noted as such.
1290 int failed = 0;
1291 for (const auto &fg : bounds) {
1292 CDir *bd = get_dirfrag(fg);
1293 if (!bd) continue;
1294 if (subtrees[dir].count(bd) == 0) {
1295 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1296 failed++;
1297 }
1298 }
1299 assert(failed == 0);
1300 }
1301
1302 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1303 {
1304 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1305 << " to " << *newdir << dendl;
1306 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1307 }
1308
1309 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir,
1310 bool pop, bool imported)
1311 {
1312 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1313
1314 //show_subtrees();
1315
1316 CDir *newdir = diri->get_parent_dir();
1317
1318 if (pop) {
1319 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1320 assert(p != projected_subtree_renames.end());
1321 assert(!p->second.empty());
1322 assert(p->second.front().first == olddir);
1323 assert(p->second.front().second == newdir);
1324 p->second.pop_front();
1325 if (p->second.empty())
1326 projected_subtree_renames.erase(p);
1327 }
1328
1329 // adjust subtree
1330 list<CDir*> dfls;
1331 // make sure subtree dirfrags are at the front of the list
1332 diri->get_subtree_dirfrags(dfls);
1333 diri->get_nested_dirfrags(dfls);
1334 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1335 CDir *dir = *p;
1336
1337 dout(10) << "dirfrag " << *dir << dendl;
1338 CDir *oldparent = get_subtree_root(olddir);
1339 dout(10) << " old parent " << *oldparent << dendl;
1340 CDir *newparent = get_subtree_root(newdir);
1341 dout(10) << " new parent " << *newparent << dendl;
1342
1343 if (oldparent == newparent) {
1344 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1345 continue;
1346 }
1347
1348 if (dir->is_subtree_root()) {
1349 // children are fine. change parent.
1350 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1351 assert(subtrees[oldparent].count(dir));
1352 subtrees[oldparent].erase(dir);
1353 assert(subtrees.count(newparent));
1354 subtrees[newparent].insert(dir);
1355 try_subtree_merge_at(dir, !imported);
1356 } else {
1357 // mid-subtree.
1358
1359 // see if any old bounds move to the new parent.
1360 list<CDir*> tomove;
1361 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1362 p != subtrees[oldparent].end();
1363 ++p) {
1364 CDir *bound = *p;
1365 CDir *broot = get_subtree_root(bound->get_parent_dir());
1366 if (broot != oldparent) {
1367 assert(broot == newparent);
1368 tomove.push_back(bound);
1369 }
1370 }
1371 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1372 CDir *bound = *p;
1373 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1374 subtrees[oldparent].erase(bound);
1375 subtrees[newparent].insert(bound);
1376 }
1377
1378 // did auth change?
1379 if (oldparent->authority() != newparent->authority()) {
1380 adjust_subtree_auth(dir, oldparent->authority(), !imported); // caller is responsible for *diri.
1381 try_subtree_merge_at(dir, !imported);
1382 }
1383 }
1384 }
1385
1386 show_subtrees();
1387 }
1388
1389
1390 void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1391 {
1392 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1393 p != subtrees.end();
1394 ++p) {
1395 CDir *root = p->first;
1396 if (root->is_full_dir_auth())
1397 s.insert(root);
1398 }
1399 }
1400 void MDCache::get_auth_subtrees(set<CDir*>& s)
1401 {
1402 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1403 p != subtrees.end();
1404 ++p) {
1405 CDir *root = p->first;
1406 if (root->is_auth())
1407 s.insert(root);
1408 }
1409 }
1410
1411
1412 // count.
1413
1414 int MDCache::num_subtrees()
1415 {
1416 return subtrees.size();
1417 }
1418
1419 int MDCache::num_subtrees_fullauth()
1420 {
1421 int n = 0;
1422 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1423 p != subtrees.end();
1424 ++p) {
1425 CDir *root = p->first;
1426 if (root->is_full_dir_auth())
1427 n++;
1428 }
1429 return n;
1430 }
1431
1432 int MDCache::num_subtrees_fullnonauth()
1433 {
1434 int n = 0;
1435 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1436 p != subtrees.end();
1437 ++p) {
1438 CDir *root = p->first;
1439 if (root->is_full_dir_nonauth())
1440 n++;
1441 }
1442 return n;
1443 }
1444
1445
1446
1447 // ===================================
1448 // journal and snap/cow helpers
1449
1450
1451 /*
1452  * find the first inode in cache that follows the given snapid; otherwise, return the current one.
1453 */
1454 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1455 {
1456 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1457 assert(in->last == CEPH_NOSNAP);
1458
1459 SnapRealm *realm = in->find_snaprealm();
1460 const set<snapid_t>& snaps = realm->get_snaps();
1461 dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
1462 dout(10) << " snaps " << snaps << dendl;
1463
1464 if (snaps.empty())
1465 return in;
1466
1467 for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows); // first item > follows
1468 p != snaps.end();
1469 ++p) {
1470 CInode *t = get_inode(in->ino(), *p);
1471 if (t) {
1472 in = t;
1473 dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl;
1474 break;
1475 }
1476 }
1477 return in;
1478 }
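// Worked example: with realm snaps {5, 12} and follows = 4, we probe the cache
// for a snapped inode at snapid 5, then 12, and return the first one found;
// if neither is cached, the head inode passed in is returned unchanged.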
1479
1480
1481 /*
1482 * note: i'm currently cheating wrt dirty and inode.version on cow
1483 * items. instead of doing a full dir predirty, i just take the
1484 * original item's version, and set the dirty flag (via
1485  * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1486 * means a special case in the dir commit clean sweep assertions.
1487 * bah.
1488 */
1489 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1490 {
1491 assert(last >= in->first);
1492
1493 SnapRealm *realm = in->find_snaprealm();
1494 const set<snapid_t>& snaps = realm->get_snaps();
1495
1496 // make sure snap inode's last match existing snapshots.
1497 // MDCache::pick_inode_snap() requires this.
1498 snapid_t last_snap = last;
1499 if (snaps.count(last) == 0) {
1500 set<snapid_t>::const_iterator p = snaps.upper_bound(last);
1501 if (p != snaps.begin()) {
1502 --p;
1503 if (*p >= in->first)
1504 last_snap = *p;
1505 }
1506 }
1507
1508 CInode *oldin = new CInode(this, true, in->first, last_snap);
1509 oldin->inode = *in->get_previous_projected_inode();
1510 oldin->symlink = in->symlink;
1511 oldin->xattrs = *in->get_previous_projected_xattrs();
1512 oldin->inode.trim_client_ranges(last);
1513
1514 if (in->first < in->oldest_snap)
1515 in->oldest_snap = in->first;
1516
1517 in->first = last+1;
1518
1519 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1520 add_inode(oldin);
1521
1522 if (in->last != CEPH_NOSNAP) {
1523 CInode *head_in = get_inode(in->ino());
1524 assert(head_in);
1525 if (head_in->split_need_snapflush(oldin, in)) {
1526 oldin->client_snap_caps = in->client_snap_caps;
1527 for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
1528 p != in->client_snap_caps.end();
1529 ++p) {
1530 SimpleLock *lock = oldin->get_lock(p->first);
1531 assert(lock);
1532 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
1533 oldin->auth_pin(lock);
1534 lock->set_state(LOCK_SNAP_SYNC); // gathering
1535 lock->get_wrlock(true);
1536 }
1537 }
1538 }
1539 return oldin;
1540 }
1541
1542 // clone caps?
1543 for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
1544 p != in->client_caps.end();
1545 ++p) {
1546 client_t client = p->first;
1547 Capability *cap = p->second;
1548 int issued = cap->issued();
1549 if ((issued & CEPH_CAP_ANY_WR) &&
1550 cap->client_follows < last) {
1551 // note in oldin
1552 for (int i = 0; i < num_cinode_locks; i++) {
1553 if (issued & cinode_lock_info[i].wr_caps) {
1554 int lockid = cinode_lock_info[i].lock;
1555 SimpleLock *lock = oldin->get_lock(lockid);
1556 assert(lock);
1557 oldin->client_snap_caps[lockid].insert(client);
1558 oldin->auth_pin(lock);
1559 lock->set_state(LOCK_SNAP_SYNC); // gathering
1560 lock->get_wrlock(true);
1561 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1562 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1563 }
1564 }
1565 cap->client_follows = last;
1566
1567 // we need snapflushes for any intervening snaps
1568 dout(10) << " snaps " << snaps << dendl;
1569 for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
1570 q != snaps.end() && *q <= last;
1571 ++q) {
1572 in->add_need_snapflush(oldin, *q, client);
1573 }
1574 } else {
1575 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1576 }
1577 }
1578
1579 return oldin;
1580 }
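// In short: cow_inode(in, last) clones in's projected state into a new snapped
// CInode covering [in->first, last] (snapped down to the newest existing
// snapid <= last where possible), inserts it into the cache, bumps in->first
// to last+1, and sets up whatever client_snap_caps / snapflush bookkeeping the
// clone needs.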
1581
1582 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1583 CDentry *dn, snapid_t follows,
1584 CInode **pcow_inode, CDentry::linkage_t *dnl)
1585 {
1586 if (!dn) {
1587 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1588 return;
1589 }
1590 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1591 assert(dn->is_auth());
1592
1593 // nothing to cow on a null dentry, fix caller
1594 if (!dnl)
1595 dnl = dn->get_projected_linkage();
1596 assert(!dnl->is_null());
1597
1598 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1599 // multiversion inode.
1600 CInode *in = dnl->get_inode();
1601 SnapRealm *realm = NULL;
1602
1603 if (in->get_projected_parent_dn() != dn) {
1604 assert(follows == CEPH_NOSNAP);
1605 realm = dn->dir->inode->find_snaprealm();
1606 snapid_t dir_follows = realm->get_newest_snap();
1607
1608 if (dir_follows+1 > dn->first) {
1609 snapid_t oldfirst = dn->first;
1610 dn->first = dir_follows+1;
1611 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1612 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
1613 oldfirst, dir_follows);
1614 olddn->pre_dirty();
1615 dout(10) << " olddn " << *olddn << dendl;
1616 metablob->add_remote_dentry(olddn, true);
1617 mut->add_cow_dentry(olddn);
1618 // FIXME: adjust link count here? hmm.
1619
1620 if (dir_follows+1 > in->first)
1621 in->cow_old_inode(dir_follows, false);
1622 }
1623 }
1624
1625 if (in->snaprealm) {
1626 realm = in->snaprealm;
1627 follows = realm->get_newest_seq();
1628 } else
1629 follows = dir_follows;
1630 } else {
1631 realm = in->find_snaprealm();
1632 if (follows == CEPH_NOSNAP)
1633 follows = realm->get_newest_seq();
1634 }
1635
1636 // already cloned?
1637 if (follows < in->first) {
1638 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1639 return;
1640 }
1641
1642 if (!realm->has_snaps_in_range(in->first, follows)) {
1643 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1644 in->first = follows + 1;
1645 return;
1646 }
1647
1648 in->cow_old_inode(follows, false);
1649
1650 } else {
1651 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1652 if (follows == CEPH_NOSNAP)
1653 follows = realm->get_newest_seq();
1654
1655 // already cloned?
1656 if (follows < dn->first) {
1657 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1658 return;
1659 }
1660
1661 // update dn.first before adding old dentry to cdir's map
1662 snapid_t oldfirst = dn->first;
1663 dn->first = follows+1;
1664
1665 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1666
1667 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1668 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1669 if (in)
1670 in->first = follows+1;
1671 return;
1672 }
1673
1674 dout(10) << " dn " << *dn << dendl;
1675 if (in) {
1676 CInode *oldin = cow_inode(in, follows);
1677 mut->add_cow_inode(oldin);
1678 if (pcow_inode)
1679 *pcow_inode = oldin;
1680 CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
1681 oldin->inode.version = olddn->pre_dirty();
1682 dout(10) << " olddn " << *olddn << dendl;
1683 bool need_snapflush = !oldin->client_snap_caps.empty();
1684 if (need_snapflush)
1685 mut->ls->open_files.push_back(&oldin->item_open_file);
1686 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1687 mut->add_cow_dentry(olddn);
1688 } else {
1689 assert(dnl->is_remote());
1690 CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
1691 oldfirst, follows);
1692 olddn->pre_dirty();
1693 dout(10) << " olddn " << *olddn << dendl;
1694 metablob->add_remote_dentry(olddn, true);
1695 mut->add_cow_dentry(olddn);
1696 }
1697 }
1698 }
1699
1700
1701 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1702 CInode *in, snapid_t follows,
1703 CInode **pcow_inode)
1704 {
1705 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1706 CDentry *dn = in->get_projected_parent_dn();
1707 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1708 }
1709
1710 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1711 {
1712 if (in->is_base()) {
1713 metablob->add_root(true, in, in->get_projected_inode());
1714 } else {
1715 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1716 follows = in->first - 1;
1717 CDentry *dn = in->get_projected_parent_dn();
1718 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1719 journal_cow_dentry(mut, metablob, dn, follows);
1720 if (in->get_projected_inode()->is_backtrace_updated()) {
1721 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1722 in->get_previous_projected_inode()->layout.pool_id;
1723 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1724 } else {
1725 metablob->add_primary_dentry(dn, in, true);
1726 }
1727 }
1728 }
1729
1730
1731
1732 // nested ---------------------------------------------------------------
1733
1734 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1735 int linkunlink, SnapRealm *prealm)
1736 {
1737 CDentry *parentdn = cur->get_projected_parent_dn();
1738 inode_t *curi = cur->get_projected_inode();
1739
1740 if (cur->first > first)
1741 first = cur->first;
1742
1743 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1744 << " " << *cur << dendl;
1745 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1746 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1747
1748 /*
1749 * FIXME. this incompletely propagates rstats to _old_ parents
1750 * (i.e. shortly after a directory rename). but we need full
1751 * blown hard link backpointers to make this work properly...
1752 */
1753 snapid_t floor = parentdn->first;
1754 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1755
1756 if (!prealm)
1757 prealm = parent->inode->find_snaprealm();
1758 const set<snapid_t> snaps = prealm->get_snaps();
1759
1760 if (cur->last != CEPH_NOSNAP) {
1761 assert(cur->dirty_old_rstats.empty());
1762 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1763 if (q == snaps.end() || *q > cur->last)
1764 return;
1765 }
1766
1767 if (cur->last >= floor) {
1768 bool update = true;
1769 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1770 // rename src inode is not projected in the slave rename prep case. so we should
1771     // avoid updating the inode.
1772 assert(linkunlink < 0);
1773 assert(cur->is_frozen_inode());
1774 update = false;
1775 }
1776 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1777 linkunlink, update);
1778 }
1779
1780 if (g_conf->mds_snap_rstat) {
1781 for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
1782 p != cur->dirty_old_rstats.end();
1783 ++p) {
1784 old_inode_t& old = cur->old_inodes[*p];
1785 snapid_t ofirst = MAX(old.first, floor);
1786 set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
1787 if (q == snaps.end() || *q > *p)
1788 continue;
1789 if (*p >= floor)
1790 _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
1791 }
1792 }
1793 cur->dirty_old_rstats.clear();
1794 }
1795
1796
1797 void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
1798 CDir *parent, int linkunlink, bool update_inode)
1799 {
1800 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1801 dout(20) << " inode rstat " << inode.rstat << dendl;
1802 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1803 nest_info_t delta;
1804 if (linkunlink == 0) {
1805 delta.add(inode.rstat);
1806 delta.sub(inode.accounted_rstat);
1807 } else if (linkunlink < 0) {
1808 delta.sub(inode.accounted_rstat);
1809 } else {
1810 delta.add(inode.rstat);
1811 }
1812 dout(20) << " delta " << delta << dendl;
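  // i.e. a plain update contributes only the not-yet-accounted change
  // (rstat - accounted_rstat); a new link contributes its full rstat, and an
  // unlink retracts whatever had previously been accounted.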
1813
1814 if (update_inode)
1815 inode.accounted_rstat = inode.rstat;
1816
1817 while (last >= ofirst) {
1818 /*
1819 * pick fnode version to update. at each iteration, we want to
1820 * pick a segment ending in 'last' to update. split as necessary
1821 * to make that work. then, adjust first up so that we only
1822 * update one segment at a time. then loop to cover the whole
1823 * [ofirst,last] interval.
1824 */
1825 nest_info_t *prstat;
1826 snapid_t first;
1827 fnode_t *pf = parent->get_projected_fnode();
1828 if (last == CEPH_NOSNAP) {
1829 if (g_conf->mds_snap_rstat)
1830 first = MAX(ofirst, parent->first);
1831 else
1832 first = parent->first;
1833 prstat = &pf->rstat;
1834 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1835
1836 if (first > parent->first &&
1837 !(pf->rstat == pf->accounted_rstat)) {
1838 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1839 << parent->first << "," << (first-1) << "] "
1840 << " " << *prstat << "/" << pf->accounted_rstat
1841 << dendl;
1842 parent->dirty_old_rstat[first-1].first = parent->first;
1843 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1844 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1845 }
1846 parent->first = first;
1847 } else if (!g_conf->mds_snap_rstat) {
1848 // drop snapshots' rstats
1849 break;
1850 } else if (last >= parent->first) {
1851 first = parent->first;
1852 parent->dirty_old_rstat[last].first = first;
1853 parent->dirty_old_rstat[last].rstat = pf->rstat;
1854 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1855 prstat = &parent->dirty_old_rstat[last].rstat;
1856 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1857 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1858 } else {
1859 // be careful, dirty_old_rstat is a _sparse_ map.
1860 // sorry, this is ugly.
1861 first = ofirst;
1862
1863 // find any intersection with last
1864 compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
1865 if (p == parent->dirty_old_rstat.end()) {
1866 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1867 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1868 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1869 first = parent->dirty_old_rstat.rbegin()->first+1;
1870 }
1871 } else {
1872 // *p last is >= last
1873 if (p->second.first <= last) {
1874 // *p intersects [first,last]
1875 if (p->second.first < first) {
1876 dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
1877 parent->dirty_old_rstat[first-1] = p->second;
1878 p->second.first = first;
1879 }
1880 if (p->second.first > first)
1881 first = p->second.first;
1882 if (last < p->first) {
1883 dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
1884 parent->dirty_old_rstat[last] = p->second;
1885 p->second.first = last+1;
1886 }
1887 } else {
1888 // *p is to the _right_ of [first,last]
1889 p = parent->dirty_old_rstat.lower_bound(first);
1890 // new *p last is >= first
1891 if (p->second.first <= last && // new *p isn't also to the right, and
1892 p->first >= first) { // it intersects our first bit,
1893 dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
1894 first = p->first+1;
1895 }
1896 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1897 }
1898 }
1899 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1900 parent->dirty_old_rstat[last].first = first;
1901 prstat = &parent->dirty_old_rstat[last].rstat;
1902 }
1903
1904 // apply
1905 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1906 assert(last >= first);
1907 prstat->add(delta);
1908 if (update_inode)
1909 inode.accounted_rstat = inode.rstat;
1910 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1911
1912 last = first-1;
1913 }
1914 }
1915
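// project_rstat_frag_to_inode: push the dirfrag's rstat delta (rstat minus
// accounted_rstat) up into the parent inode, cowing old_inodes and splitting
// the snap range so each affected inode version receives its share of the delta.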
1916 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1917 snapid_t ofirst, snapid_t last,
1918 CInode *pin, bool cow_head)
1919 {
1920 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1921 dout(20) << " frag rstat " << rstat << dendl;
1922 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1923 nest_info_t delta = rstat;
1924 delta.sub(accounted_rstat);
1925 dout(20) << " delta " << delta << dendl;
1926
1927 while (last >= ofirst) {
1928 inode_t *pi;
1929 snapid_t first;
1930 if (last == pin->last) {
1931 pi = pin->get_projected_inode();
1932 first = MAX(ofirst, pin->first);
1933 if (first > pin->first) {
1934 old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
1935 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1936 }
1937 } else {
1938 if (last >= pin->first) {
1939 first = pin->first;
1940 pin->cow_old_inode(last, cow_head);
1941 } else {
1942 // our life is easier here because old_inodes is not sparse
1943 // (although it may not begin at snapid 1)
1944 compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
1945 if (p == pin->old_inodes.end()) {
1946 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1947 break;
1948 }
1949 first = p->second.first;
1950 if (first > last) {
1951 dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
1952 //assert(p == pin->old_inodes.begin());
1953 break;
1954 }
1955 if (p->first > last) {
1956 dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
1957 << (last+1) << "," << p->first << "]" << dendl;
1958 pin->old_inodes[last] = p->second;
1959 p->second.first = last+1;
1960 pin->dirty_old_rstats.insert(p->first);
1961 }
1962 }
1963 if (first < ofirst) {
1964 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1965 << first << "," << ofirst-1 << "]" << dendl;
1966 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1967 pin->dirty_old_rstats.insert(ofirst-1);
1968 pin->old_inodes[last].first = first = ofirst;
1969 }
1970 pi = &pin->old_inodes[last].inode;
1971 pin->dirty_old_rstats.insert(last);
1972 }
1973 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1974 pi->rstat.add(delta);
1975 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1976
1977 last = first-1;
1978 }
1979 }
1980
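// broadcast_quota_to_client: if quota is enabled on this (auth, unfrozen)
// inode, send an MClientQuota carrying the current rstat and quota to each
// client holding caps whose last-reported view is stale, and send an
// MGatherCaps for this inode to each replica MDS.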
1981 void MDCache::broadcast_quota_to_client(CInode *in)
1982 {
1983 if (!in->is_auth() || in->is_frozen())
1984 return;
1985
1986 inode_t *i = in->get_projected_inode();
1987
1988 if (!i->quota.is_enable())
1989 return;
1990
1991 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1992 it != in->client_caps.end();
1993 ++it) {
1994 Session *session = mds->get_session(it->first);
1995 if (!session || !session->connection ||
1996 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
1997 continue;
1998
1999 Capability *cap = it->second;
2000 if (cap->last_rbytes == i->rstat.rbytes &&
2001 cap->last_rsize == i->rstat.rsize())
2002 continue;
2003
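// Heuristic (from the checks below): push an update when the file count
// reaches max_files, rbytes comes within 1/8 of max_bytes, or either value
// has moved by more than 1/16 of the remaining headroom since it was last
// reported to this client.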
2004 if (i->quota.max_files > 0) {
2005 if (i->rstat.rsize() >= i->quota.max_files)
2006 goto update;
2007
2008 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2009 abs(cap->last_rsize - i->rstat.rsize()))
2010 goto update;
2011 }
2012
2013 if (i->quota.max_bytes > 0) {
2014 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2015 goto update;
2016
2017 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2018 abs(cap->last_rbytes - i->rstat.rbytes))
2019 goto update;
2020 }
2021
2022 continue;
2023
2024 update:
2025 cap->last_rsize = i->rstat.rsize();
2026 cap->last_rbytes = i->rstat.rbytes;
2027
2028 MClientQuota *msg = new MClientQuota();
2029 msg->ino = in->ino();
2030 msg->rstat = i->rstat;
2031 msg->quota = i->quota;
2032 mds->send_message_client_counted(msg, session->connection);
2033 }
2034 for (compact_map<mds_rank_t, unsigned>::iterator it = in->replicas_begin();
2035 it != in->replicas_end();
2036 ++it) {
2037 MGatherCaps *msg = new MGatherCaps;
2038 msg->ino = in->ino();
2039 mds->send_message_mds(msg, it->first);
2040 }
2041 }
2042
2043 /*
2044 * NOTE: we _have_ to delay the scatter if we are called during a
2045 * rejoin, because we can't twiddle locks between when the
2046 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2047 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2048 * (no requests), and a survivor acks immediately. _except_ that
2049 * during rejoin_(weak|strong) processing, we may complete a lock
2050 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2051 * scatterlock state in that case or the lock states will get out of
2052 * sync between the auth and replica.
2053 *
2054 * the simple solution is to never do the scatter here. instead, put
2055 * the scatterlock on a list if it isn't already wrlockable. this is
2056 * probably the best plan anyway, since we avoid too many
2057 * scatters/locks under normal usage.
2058 */
2059 /*
2060 * some notes on dirlock/nestlock scatterlock semantics:
2061 *
2062 * the fragstat (dirlock) will never be updated without
2063 * dirlock+nestlock wrlock held by the caller.
2064 *
2065 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2066 * data is pushed up the tree. this could be changed with some
2067 * restructuring here, but in its current form we ensure that the
2068 * fragstat+rstat _always_ reflect an accurate summation over the dir
2069 * frag, which is nice. and, we only need to track frags that need to
2070 * be nudged (and not inodes with pending rstat changes that need to
2071 * be pushed into the frag). a consequence of this is that the
2072 * accounted_rstat on scatterlock sync may not match our current
2073 * rstat. this is normal and expected.
2074 */
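// predirty_journal_parents: walk from 'in' up toward the root, projecting the
// parent dirfrag's fragstat/mtime update (for PREDIRTY_DIR or link/unlink) and
// the inode's rstat delta into each ancestor dirfrag and inode, pre-dirtying
// them and collecting them so they are journalled into 'blob'.  For
// illustration only (names borrowed from typical Server request handlers, not
// guaranteed verbatim), a caller looks roughly like:
//   predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
//                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);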
2075 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2076 CInode *in, CDir *parent,
2077 int flags, int linkunlink,
2078 snapid_t cfollows)
2079 {
2080 bool primary_dn = flags & PREDIRTY_PRIMARY;
2081 bool do_parent_mtime = flags & PREDIRTY_DIR;
2082 bool shallow = flags & PREDIRTY_SHALLOW;
2083
2084 assert(mds->mdlog->entry_is_open());
2085
2086 // make sure stamp is set
2087 if (mut->get_mds_stamp() == utime_t())
2088 mut->set_mds_stamp(ceph_clock_now());
2089
2090 if (in->is_base())
2091 return;
2092
2093 dout(10) << "predirty_journal_parents"
2094 << (do_parent_mtime ? " do_parent_mtime":"")
2095 << " linkunlink=" << linkunlink
2096 << (primary_dn ? " primary_dn":" remote_dn")
2097 << (shallow ? " SHALLOW":"")
2098 << " follows " << cfollows
2099 << " " << *in << dendl;
2100
2101 if (!parent) {
2102 assert(primary_dn);
2103 parent = in->get_projected_parent_dn()->get_dir();
2104 }
2105
2106 if (flags == 0 && linkunlink == 0) {
2107 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2108 blob->add_dir_context(parent);
2109 return;
2110 }
2111
2112 // build list of inodes to wrlock, dirty, and update
2113 list<CInode*> lsi;
2114 CInode *cur = in;
2115 CDentry *parentdn = NULL;
2116 bool first = true;
2117 while (parent) {
2118 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2119 assert(parent->is_auth());
2120
2121 // opportunistically adjust parent dirfrag
2122 CInode *pin = parent->get_inode();
2123
2124 // inode -> dirfrag
2125 mut->auth_pin(parent);
2126 mut->add_projected_fnode(parent);
2127
2128 fnode_t *pf = parent->project_fnode();
2129 pf->version = parent->pre_dirty();
2130
2131 if (do_parent_mtime || linkunlink) {
2132 assert(mut->wrlocks.count(&pin->filelock));
2133 assert(mut->wrlocks.count(&pin->nestlock));
2134 assert(cfollows == CEPH_NOSNAP);
2135
2136 // update stale fragstat/rstat?
2137 parent->resync_accounted_fragstat();
2138 parent->resync_accounted_rstat();
2139
2140 if (do_parent_mtime) {
2141 pf->fragstat.mtime = mut->get_op_stamp();
2142 pf->fragstat.change_attr++;
2143 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2144 if (pf->fragstat.mtime > pf->rstat.rctime) {
2145 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2146 pf->rstat.rctime = pf->fragstat.mtime;
2147 } else {
2148 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2149 }
2150 }
2151 if (linkunlink) {
2152 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2153 if (in->is_dir()) {
2154 pf->fragstat.nsubdirs += linkunlink;
2155 //pf->rstat.rsubdirs += linkunlink;
2156 } else {
2157 pf->fragstat.nfiles += linkunlink;
2158 //pf->rstat.rfiles += linkunlink;
2159 }
2160 }
2161 }
2162
2163 // rstat
2164 if (!primary_dn) {
2165 // don't update parent this pass
2166 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2167 pin->versionlock.can_wrlock())) {
2168 dout(20) << " unwritable parent nestlock " << pin->nestlock
2169 << ", marking dirty rstat on " << *cur << dendl;
2170 cur->mark_dirty_rstat();
2171 } else {
2172 // if we don't hold a wrlock reference on this nestlock, take one,
2173 // because we are about to write into the dirfrag fnode and that needs
2174 // to commit before the lock can cycle.
2175 if (linkunlink) {
2176 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2177 }
2178
2179 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2180 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2181 mds->locker->wrlock_force(&pin->nestlock, mut);
2182 }
2183
2184 // now we can project the inode rstat diff into the dirfrag
2185 SnapRealm *prealm = pin->find_snaprealm();
2186
2187 snapid_t follows = cfollows;
2188 if (follows == CEPH_NOSNAP)
2189 follows = prealm->get_newest_seq();
2190
2191 snapid_t first = follows+1;
2192
2193 // first, if the frag is stale, bring it back in sync.
2194 parent->resync_accounted_rstat();
2195
2196 // now push inode rstats into frag
2197 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2198 cur->clear_dirty_rstat();
2199 }
2200
2201 bool stop = false;
2202 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2203 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2204 stop = true;
2205 }
2206
2207 // delay propagating until later?
2208 if (!stop && !first &&
2209 g_conf->mds_dirstat_min_interval > 0) {
2210 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2211 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2212 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2213 << " < " << g_conf->mds_dirstat_min_interval
2214 << ", stopping" << dendl;
2215 stop = true;
2216 } else {
2217 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2218 }
2219 }
2220
2221 // can cast only because i'm passing nowait=true in the sole user
2222 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2223 if (!stop &&
2224 mut->wrlocks.count(&pin->nestlock) == 0 &&
2225 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2226 //true
2227 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2228 )) { // ** do not initiate.. see above comment **
2229 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2230 << " on " << *pin << dendl;
2231 stop = true;
2232 }
2233 if (stop) {
2234 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2235 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2236 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2237 mut->add_updated_lock(&pin->nestlock);
2238 if (do_parent_mtime || linkunlink) {
2239 mds->locker->mark_updated_scatterlock(&pin->filelock);
2240 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2241 mut->add_updated_lock(&pin->filelock);
2242 }
2243 break;
2244 }
2245 if (!mut->wrlocks.count(&pin->versionlock))
2246 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2247
2248 assert(mut->wrlocks.count(&pin->nestlock) ||
2249 mut->is_slave());
2250
2251 pin->last_dirstat_prop = mut->get_mds_stamp();
2252
2253 // dirfrag -> diri
2254 mut->auth_pin(pin);
2255 mut->add_projected_inode(pin);
2256 lsi.push_front(pin);
2257
2258 pin->pre_cow_old_inode(); // avoid cow mayhem!
2259
2260 inode_t *pi = pin->project_inode();
2261 pi->version = pin->pre_dirty();
2262
2263 // dirstat
2264 if (do_parent_mtime || linkunlink) {
2265 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2266 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2267 bool touched_mtime = false, touched_chattr = false;
2268 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2269 pf->accounted_fragstat = pf->fragstat;
2270 if (touched_mtime)
2271 pi->mtime = pi->ctime = pi->dirstat.mtime;
2272 if (touched_chattr)
2273 pi->change_attr = pi->dirstat.change_attr;
2274 dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
2275
2276 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2277 if (pi->dirstat.size() < 0)
2278 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2279 if (pi->dirstat.size() != pf->fragstat.size()) {
2280 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2281 << parent->dirfrag() << ", inode has " << pi->dirstat
2282 << ", dirfrag has " << pf->fragstat;
2283
2284 // trust the dirfrag for now
2285 pi->dirstat = pf->fragstat;
2286
2287 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2288 }
2289 }
2290 }
2291
2292 /*
2293 * the rule here is to follow the _oldest_ parent with dirty rstat
2294 * data. if we don't propagate all data, we add ourselves to the
2295 * nudge list. that way all rstat data will (eventually) get
2296 * pushed up the tree.
2297 *
2298 * actually, no. for now, silently drop rstats for old parents. we need
2299 * hard link backpointers to do the above properly.
2300 */
2301
2302 // stop?
2303 if (pin->is_base())
2304 break;
2305 parentdn = pin->get_projected_parent_dn();
2306 assert(parentdn);
2307
2308 // rstat
2309 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2310
2311 // first, if the frag is stale, bring it back in sync.
2312 parent->resync_accounted_rstat();
2313
2314 if (g_conf->mds_snap_rstat) {
2315 for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
2316 p != parent->dirty_old_rstat.end();
2317 ++p)
2318 project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
2319 p->first, pin, true);//false);
2320 }
2321 parent->dirty_old_rstat.clear();
2322 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2323
2324 pf->accounted_rstat = pf->rstat;
2325
2326 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2327 if (pi->rstat.rbytes != pf->rstat.rbytes) {
2328 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2329 << parent->dirfrag() << ", inode has " << pi->rstat
2330 << ", dirfrag has " << pf->rstat;
2331
2332 // trust the dirfrag for now
2333 pi->rstat = pf->rstat;
2334
2335 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2336 }
2337 }
2338
2339 parent->check_rstats();
2340 broadcast_quota_to_client(pin);
2341 // next parent!
2342 cur = pin;
2343 parent = parentdn->get_dir();
2344 linkunlink = 0;
2345 do_parent_mtime = false;
2346 primary_dn = true;
2347 first = false;
2348 }
2349
2350 // now, stick it in the blob
2351 assert(parent);
2352 assert(parent->is_auth());
2353 blob->add_dir_context(parent);
2354 blob->add_dir(parent, true);
2355 for (list<CInode*>::iterator p = lsi.begin();
2356 p != lsi.end();
2357 ++p) {
2358 CInode *cur = *p;
2359 journal_dirty_inode(mut.get(), blob, cur);
2360 }
2361
2362 }
2363
2364
2365
2366
2367
2368 // ===================================
2369 // slave requests
2370
2371
2372 /*
2373 * some handlers for master requests with slaves. we need to make
2374 * sure slaves journal commits before we forget we mastered them and
2375 * remove them from the uncommitted_masters map (used during recovery
2376 * to commit|abort slaves).
2377 */
2378 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2379 metareqid_t reqid;
2380 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2381 void finish(int r) override {
2382 mdcache->_logged_master_commit(reqid);
2383 }
2384 };
2385
2386 void MDCache::log_master_commit(metareqid_t reqid)
2387 {
2388 dout(10) << "log_master_commit " << reqid << dendl;
2389 uncommitted_masters[reqid].committing = true;
2390 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2391 new C_MDC_CommittedMaster(this, reqid));
2392 }
2393
2394 void MDCache::_logged_master_commit(metareqid_t reqid)
2395 {
2396 dout(10) << "_logged_master_commit " << reqid << dendl;
2397 assert(uncommitted_masters.count(reqid));
2398 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2399 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2400 uncommitted_masters.erase(reqid);
2401 }
2402
2403 // while active...
2404
2405 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2406 {
2407 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2408 assert(uncommitted_masters.count(r));
2409 uncommitted_masters[r].slaves.erase(from);
2410 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2411 log_master_commit(r);
2412 }
2413
2414 void MDCache::logged_master_update(metareqid_t reqid)
2415 {
2416 dout(10) << "logged_master_update " << reqid << dendl;
2417 assert(uncommitted_masters.count(reqid));
2418 uncommitted_masters[reqid].safe = true;
2419 if (pending_masters.count(reqid)) {
2420 pending_masters.erase(reqid);
2421 if (pending_masters.empty())
2422 process_delayed_resolve();
2423 }
2424 }
2425
2426 /*
2427 * Master may crash after receiving all slaves' commit acks, but before journalling
2428 * the final commit. Slaves may crash after journalling the slave commit, but before
2429 * sending commit ack to the master. Commit masters with no uncommitted slave when
2430 * resolve finishes.
2431 */
2432 void MDCache::finish_committed_masters()
2433 {
2434 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2435 p != uncommitted_masters.end();
2436 ++p) {
2437 p->second.recovering = false;
2438 if (!p->second.committing && p->second.slaves.empty()) {
2439 dout(10) << "finish_committed_masters " << p->first << dendl;
2440 log_master_commit(p->first);
2441 }
2442 }
2443 }
2444
2445 /*
2446 * at end of resolve... we must journal a commit|abort for all slave
2447 * updates, before moving on.
2448 *
2449 * this is so that the master can safely journal ECommitted on ops it
2450 * masters when it reaches up:active (all other recovering nodes must
2451 * complete resolve before that happens).
2452 */
2453 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2454 mds_rank_t from;
2455 metareqid_t reqid;
2456 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2457 void finish(int r) override {
2458 mdcache->_logged_slave_commit(from, reqid);
2459 }
2460 };
2461
2462 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2463 {
2464 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2465
2466 // send a message
2467 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2468 mds->send_message_mds(req, from);
2469 }
2470
2471
2472
2473
2474
2475
2476 // ====================================================================
2477 // import map, recovery
2478
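// _move_subtree_map_bound: within an in-progress subtree map, move the bound
// 'df' out of oldparent's bound list and into newparent's (when each parent is
// present in the map).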
2479 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2480 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2481 {
2482 if (subtrees.count(oldparent)) {
2483 vector<dirfrag_t>& v = subtrees[oldparent];
2484 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2485 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2486 if (*it == df) {
2487 v.erase(it);
2488 break;
2489 }
2490 }
2491 if (subtrees.count(newparent)) {
2492 vector<dirfrag_t>& v = subtrees[newparent];
2493 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2494 v.push_back(df);
2495 }
2496 }
2497
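// create_subtree_map: build an ESubtreeMap journal event covering every
// subtree we are auth for (plus its bounds and a spanning path to the root),
// adjusted for projected subtree renames and simplified so nested auth
// subtrees are swallowed by their parents.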
2498 ESubtreeMap *MDCache::create_subtree_map()
2499 {
2500 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2501 << num_subtrees_fullauth() << " fullauth"
2502 << dendl;
2503
2504 show_subtrees();
2505
2506 ESubtreeMap *le = new ESubtreeMap();
2507 mds->mdlog->_start_entry(le);
2508
2509 map<dirfrag_t, CDir*> dirs_to_add;
2510
2511 if (myin) {
2512 CDir* mydir = myin->get_dirfrag(frag_t());
2513 dirs_to_add[mydir->dirfrag()] = mydir;
2514 }
2515
2516 // include all auth subtrees, and their bounds.
2517 // and a spanning tree to tie it to the root.
2518 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2519 p != subtrees.end();
2520 ++p) {
2521 CDir *dir = p->first;
2522
2523 // journal subtree as "ours" if we are
2524 // me, -2
2525 // me, me
2526 // me, !me (may be importing and ambiguous!)
2527
2528 // so not
2529 // !me, *
2530 if (dir->get_dir_auth().first != mds->get_nodeid())
2531 continue;
2532
2533 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2534 my_ambiguous_imports.count(dir->dirfrag())) {
2535 dout(15) << " ambig subtree " << *dir << dendl;
2536 le->ambiguous_subtrees.insert(dir->dirfrag());
2537 } else {
2538 dout(15) << " subtree " << *dir << dendl;
2539 }
2540
2541 dirs_to_add[dir->dirfrag()] = dir;
2542 le->subtrees[dir->dirfrag()].clear();
2543
2544
2545 // bounds
2546 for (set<CDir*>::iterator q = p->second.begin();
2547 q != p->second.end();
2548 ++q) {
2549 CDir *bound = *q;
2550 dout(15) << " subtree bound " << *bound << dendl;
2551 dirs_to_add[bound->dirfrag()] = bound;
2552 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2553 }
2554 }
2555
2556 // apply projected renames
2557 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2558 p != projected_subtree_renames.end();
2559 ++p) {
2560 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2561 CInode *diri = p->first;
2562 CDir *olddir = q->first;
2563 CDir *newdir = q->second;
2564 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2565
2566 list<CDir*> dfls;
2567 diri->get_dirfrags(dfls);
2568 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2569 CDir *dir = *p;
2570 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2571 CDir *oldparent = get_projected_subtree_root(olddir);
2572 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2573 CDir *newparent = get_projected_subtree_root(newdir);
2574 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2575
2576 if (oldparent == newparent) {
2577 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2578 << oldparent->dirfrag() << dendl;
2579 continue;
2580 }
2581
2582 if (dir->is_subtree_root()) {
2583 if (le->subtrees.count(newparent->dirfrag()) &&
2584 oldparent->get_dir_auth() != newparent->get_dir_auth())
2585 dirs_to_add[dir->dirfrag()] = dir;
2586 // children are fine. change parent.
2587 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2588 le->subtrees);
2589 } else {
2590 // mid-subtree.
2591
2592 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2593 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2594 // if oldparent is auth, subtree is mine; include it.
2595 if (le->subtrees.count(oldparent->dirfrag())) {
2596 dirs_to_add[dir->dirfrag()] = dir;
2597 le->subtrees[dir->dirfrag()].clear();
2598 }
2599 // if newparent is auth, subtree is a new bound
2600 if (le->subtrees.count(newparent->dirfrag())) {
2601 dirs_to_add[dir->dirfrag()] = dir;
2602 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2603 }
2604 newparent = dir;
2605 }
2606
2607 // see if any old bounds move to the new parent.
2608 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2609 p != subtrees[oldparent].end();
2610 ++p) {
2611 CDir *bound = *p;
2612 if (dir->contains(bound->get_parent_dir()))
2613 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2614 le->subtrees);
2615 }
2616 }
2617 }
2618 }
2619 }
2620
2621 // simplify the journaled map. our in memory map may have more
2622 // subtrees than needed due to migrations that are just getting
2623 // started or just completing. but on replay, the "live" map will
2624 // be simple and we can do a straight comparison.
2625 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2626 if (le->ambiguous_subtrees.count(p->first))
2627 continue;
2628 unsigned i = 0;
2629 while (i < p->second.size()) {
2630 dirfrag_t b = p->second[i];
2631 if (le->subtrees.count(b) &&
2632 le->ambiguous_subtrees.count(b) == 0) {
2633 vector<dirfrag_t>& bb = le->subtrees[b];
2634 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2635 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2636 p->second.push_back(*r);
2637 dirs_to_add.erase(b);
2638 le->subtrees.erase(b);
2639 p->second.erase(p->second.begin() + i);
2640 } else {
2641 ++i;
2642 }
2643 }
2644 }
2645
2646 for (auto p : dirs_to_add) {
2647 CDir *dir = p.second;
2648 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2649 le->metablob.add_dir(dir, false);
2650 }
2651
2652 dout(15) << " subtrees " << le->subtrees << dendl;
2653 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2654
2655 //le->metablob.print(cout);
2656 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2657 return le;
2658 }
2659
2660 void MDCache::dump_resolve_status(Formatter *f) const
2661 {
2662 f->open_object_section("resolve_status");
2663 f->dump_stream("resolve_gather") << resolve_gather;
2664 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2665 f->close_section();
2666 }
2667
2668 void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2669 {
2670 dout(10) << "resolve_start" << dendl;
2671 assert(!resolve_done);
2672 resolve_done.reset(resolve_done_);
2673
2674 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2675 // if we don't have the root dir, adjust it to UNKNOWN. during
2676 // resolve we want mds0 to explicitly claim the portion of it that
2677 // it owns, so that anything beyond its bounds gets left as
2678 // unknown.
2679 CDir *rootdir = root->get_dirfrag(frag_t());
2680 if (rootdir)
2681 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2682 }
2683 resolve_gather = recovery_set;
2684 }
2685
2686 void MDCache::send_resolves()
2687 {
2688 send_slave_resolves();
2689 if (!resolve_ack_gather.empty()) {
2690 dout(10) << "send_resolves still waiting for resolve ack from ("
2691 << resolve_ack_gather << ")" << dendl;
2692 return;
2693 }
2694 if (!need_resolve_rollback.empty()) {
2695 dout(10) << "send_resolves still waiting for rollback to commit on ("
2696 << need_resolve_rollback << ")" << dendl;
2697 return;
2698 }
2699 send_subtree_resolves();
2700 }
2701
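// send_slave_resolves: tell each master we hold uncommitted slave updates for
// (from uncommitted_slave_updates if we are in resolve, otherwise from
// prepared/committing slave requests) so it can tell us to commit or abort.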
2702 void MDCache::send_slave_resolves()
2703 {
2704 dout(10) << "send_slave_resolves" << dendl;
2705
2706 map<mds_rank_t, MMDSResolve*> resolves;
2707
2708 if (mds->is_resolve()) {
2709 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2710 p != uncommitted_slave_updates.end();
2711 ++p) {
2712 resolves[p->first] = new MMDSResolve;
2713 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2714 q != p->second.end();
2715 ++q) {
2716 dout(10) << " including uncommitted " << q->first << dendl;
2717 resolves[p->first]->add_slave_request(q->first, false);
2718 }
2719 }
2720 } else {
2721 set<mds_rank_t> resolve_set;
2722 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2723 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2724 p != active_requests.end();
2725 ++p) {
2726 MDRequestRef& mdr = p->second;
2727 if (!mdr->is_slave())
2728 continue;
2729 if (!mdr->slave_did_prepare() && !mdr->committing) {
2730 continue;
2731 }
2732 mds_rank_t master = mdr->slave_to_mds;
2733 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2734 dout(10) << " including uncommitted " << *mdr << dendl;
2735 if (!resolves.count(master))
2736 resolves[master] = new MMDSResolve;
2737 if (!mdr->committing &&
2738 mdr->has_more() && mdr->more()->is_inode_exporter) {
2739 // re-send cap exports
2740 CInode *in = mdr->more()->rename_inode;
2741 map<client_t, Capability::Export> cap_map;
2742 in->export_client_caps(cap_map);
2743 bufferlist bl;
2744 ::encode(in->ino(), bl);
2745 ::encode(cap_map, bl);
2746 resolves[master]->add_slave_request(p->first, bl);
2747 } else {
2748 resolves[master]->add_slave_request(p->first, mdr->committing);
2749 }
2750 }
2751 }
2752 }
2753
2754 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2755 p != resolves.end();
2756 ++p) {
2757 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2758 mds->send_message_mds(p->second, p->first);
2759 resolve_ack_gather.insert(p->first);
2760 }
2761 }
2762
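// send_subtree_resolves: advertise the subtrees we claim (and our ambiguous
// imports) to each peer in the recovery set that is resolving, or to all of
// them if we are resolving ourselves; deferred while imports/exports are
// still in flight.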
2763 void MDCache::send_subtree_resolves()
2764 {
2765 dout(10) << "send_subtree_resolves" << dendl;
2766
2767 if (migrator->is_exporting() || migrator->is_importing()) {
2768 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2769 migrator->show_importing();
2770 migrator->show_exporting();
2771 resolves_pending = true;
2772 return; // not now
2773 }
2774
2775 map<mds_rank_t, MMDSResolve*> resolves;
2776 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2777 p != recovery_set.end();
2778 ++p) {
2779 if (*p == mds->get_nodeid())
2780 continue;
2781 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2782 resolves[*p] = new MMDSResolve;
2783 }
2784
2785 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2786 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2787
2788 // known
2789 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2790 p != subtrees.end();
2791 ++p) {
2792 CDir *dir = p->first;
2793
2794 // only our subtrees
2795 if (dir->authority().first != mds->get_nodeid())
2796 continue;
2797
2798 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2799 continue; // we'll add it below
2800
2801 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2802 // ambiguous (mid-import)
2803 set<CDir*> bounds;
2804 get_subtree_bounds(dir, bounds);
2805 vector<dirfrag_t> dfls;
2806 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2807 dfls.push_back((*q)->dirfrag());
2808
2809 my_ambig_imports[dir->dirfrag()] = dfls;
2810 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2811 } else {
2812 // not ambiguous.
2813 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2814 q != resolves.end();
2815 ++q)
2816 resolves[q->first]->add_subtree(dir->dirfrag());
2817 // bounds too
2818 vector<dirfrag_t> dfls;
2819 for (set<CDir*>::iterator q = subtrees[dir].begin();
2820 q != subtrees[dir].end();
2821 ++q) {
2822 CDir *bound = *q;
2823 dfls.push_back(bound->dirfrag());
2824 }
2825
2826 my_subtrees[dir->dirfrag()] = dfls;
2827 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2828 }
2829 }
2830
2831 // ambiguous
2832 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2833 p != my_ambiguous_imports.end();
2834 ++p) {
2835 my_ambig_imports[p->first] = p->second;
2836 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2837 }
2838
2839 // simplify the claimed subtrees.
2840 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2841 unsigned i = 0;
2842 while (i < p->second.size()) {
2843 dirfrag_t b = p->second[i];
2844 if (my_subtrees.count(b)) {
2845 vector<dirfrag_t>& bb = my_subtrees[b];
2846 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2847 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2848 p->second.push_back(*r);
2849 my_subtrees.erase(b);
2850 p->second.erase(p->second.begin() + i);
2851 } else {
2852 ++i;
2853 }
2854 }
2855 }
2856
2857 // send
2858 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2859 p != resolves.end();
2860 ++p) {
2861 MMDSResolve* m = p->second;
2862 m->subtrees = my_subtrees;
2863 m->ambiguous_imports = my_ambig_imports;
2864 dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2865 mds->send_message_mds(m, p->first);
2866 }
2867 resolves_pending = false;
2868 }
2869
2870 void MDCache::handle_mds_failure(mds_rank_t who)
2871 {
2872 dout(7) << "handle_mds_failure mds." << who << dendl;
2873
2874 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2875
2876 resolve_gather.insert(who);
2877 discard_delayed_resolve(who);
2878 ambiguous_slave_updates.erase(who);
2879
2880 rejoin_gather.insert(who);
2881 rejoin_sent.erase(who); // i need to send another
2882 rejoin_ack_sent.erase(who); // i need to send another
2883 rejoin_ack_gather.erase(who); // i'll need/get another.
2884
2885 dout(10) << " resolve_gather " << resolve_gather << dendl;
2886 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2887 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2888 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2889 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2890
2891
2892 // tell the migrator too.
2893 migrator->handle_mds_failure_or_stop(who);
2894
2895 // clean up any requests slave to/from this node
2896 list<MDRequestRef> finish;
2897 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2898 p != active_requests.end();
2899 ++p) {
2900 MDRequestRef& mdr = p->second;
2901 // slave to the failed node?
2902 if (mdr->slave_to_mds == who) {
2903 if (mdr->slave_did_prepare()) {
2904 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2905 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2906 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2907
2908 if (!mdr->more()->waiting_on_slave.empty()) {
2909 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2910 // will rollback, no need to wait
2911 if (mdr->slave_request) {
2912 mdr->slave_request->put();
2913 mdr->slave_request = 0;
2914 }
2915 mdr->more()->waiting_on_slave.clear();
2916 }
2917 } else if (!mdr->committing) {
2918 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2919 if (mdr->slave_request || mdr->slave_rolling_back())
2920 mdr->aborted = true;
2921 else
2922 finish.push_back(mdr);
2923 }
2924 }
2925
2926 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2927 if (mdr->more()->waiting_on_slave.count(who)) {
2928 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2929 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2930 << who << dendl;
2931 mdr->more()->waiting_on_slave.erase(who);
2932 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2933 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2934 }
2935
2936 if (mdr->more()->srcdn_auth_mds == who &&
2937 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2938 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2939 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2940 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2941 }
2942 } else if (mdr->slave_request) {
2943 MMDSSlaveRequest *slave_req = mdr->slave_request;
2944 // FIXME: Slave rename request can arrive after we notice mds failure.
2945 // This can cause mds to crash (does not affect integrity of FS).
2946 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2947 slave_req->srcdn_auth == who)
2948 slave_req->mark_interrupted();
2949 }
2950
2951 // failed node is slave?
2952 if (mdr->is_master() && !mdr->committing) {
2953 if (mdr->more()->srcdn_auth_mds == who) {
2954 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2955 << who << " to recover" << dendl;
2956 assert(mdr->more()->witnessed.count(who) == 0);
2957 if (mdr->more()->is_ambiguous_auth)
2958 mdr->clear_ambiguous_auth();
2959 // rename srcdn's auth mds failed, all witnesses will rollback
2960 mdr->more()->witnessed.clear();
2961 pending_masters.erase(p->first);
2962 }
2963
2964 if (mdr->more()->witnessed.count(who)) {
2965 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2966 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2967 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2968 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2969 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2970 // until either the request is committing or the slave also fails.
2971 assert(mdr->more()->waiting_on_slave.size() == 1);
2972 pending_masters.insert(p->first);
2973 } else {
2974 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2975 << who << dendl;
2976 if (srcdn_auth >= 0)
2977 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2978
2979 // discard this peer's prepare (if any)
2980 mdr->more()->witnessed.erase(who);
2981 }
2982 }
2983
2984 if (mdr->more()->waiting_on_slave.count(who)) {
2985 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2986 << " to recover" << dendl;
2987 // retry request when peer recovers
2988 mdr->more()->waiting_on_slave.erase(who);
2989 if (mdr->more()->waiting_on_slave.empty())
2990 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2991 }
2992
2993 if (mdr->locking && mdr->locking_target_mds == who)
2994 mdr->finish_locking(mdr->locking);
2995 }
2996 }
2997
2998 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2999 p != uncommitted_masters.end();
3000 ++p) {
3001 // The failed MDS may have already committed the slave update
3002 if (p->second.slaves.count(who)) {
3003 p->second.recovering = true;
3004 p->second.slaves.erase(who);
3005 }
3006 }
3007
3008 while (!finish.empty()) {
3009 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3010 request_finish(finish.front());
3011 finish.pop_front();
3012 }
3013
3014 kick_find_ino_peers(who);
3015 kick_open_ino_peers(who);
3016
3017 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3018 p != fragments.end(); ) {
3019 dirfrag_t df = p->first;
3020 fragment_info_t& info = p->second;
3021 ++p;
3022 if (info.is_fragmenting())
3023 continue;
3024 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3025 list<CDir*> dirs;
3026 info.dirs.swap(dirs);
3027 fragments.erase(df);
3028 fragment_unmark_unfreeze_dirs(dirs);
3029 }
3030
3031 // MDCache::shutdown_export_strays() always exports strays to mds.0
3032 if (who == mds_rank_t(0))
3033 shutdown_exported_strays.clear();
3034
3035 show_subtrees();
3036 }
3037
3038 /*
3039 * handle_mds_recovery - called on another node's transition
3040 * from resolve -> active.
3041 */
3042 void MDCache::handle_mds_recovery(mds_rank_t who)
3043 {
3044 dout(7) << "handle_mds_recovery mds." << who << dendl;
3045
3046 // exclude all discover waiters. kick_discovers() will do the job
3047 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3048 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3049
3050 list<MDSInternalContextBase*> waiters;
3051
3052 // wake up any waiters in their subtrees
3053 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3054 p != subtrees.end();
3055 ++p) {
3056 CDir *dir = p->first;
3057
3058 if (dir->authority().first != who ||
3059 dir->authority().second == mds->get_nodeid())
3060 continue;
3061 assert(!dir->is_auth());
3062
3063 // wake any waiters
3064 list<CDir*> q;
3065 q.push_back(dir);
3066
3067 while (!q.empty()) {
3068 CDir *d = q.front();
3069 q.pop_front();
3070 d->take_waiting(d_mask, waiters);
3071
3072 // inode waiters too
3073 for (CDir::map_t::iterator p = d->items.begin();
3074 p != d->items.end();
3075 ++p) {
3076 CDentry *dn = p->second;
3077 CDentry::linkage_t *dnl = dn->get_linkage();
3078 if (dnl->is_primary()) {
3079 dnl->get_inode()->take_waiting(i_mask, waiters);
3080
3081 // recurse?
3082 list<CDir*> ls;
3083 dnl->get_inode()->get_dirfrags(ls);
3084 for (list<CDir*>::iterator p = ls.begin();
3085 p != ls.end();
3086 ++p) {
3087 CDir *subdir = *p;
3088 if (!subdir->is_subtree_root())
3089 q.push_back(subdir);
3090 }
3091 }
3092 }
3093 }
3094 }
3095
3096 kick_open_ino_peers(who);
3097 kick_find_ino_peers(who);
3098
3099 // queue them up.
3100 mds->queue_waiters(waiters);
3101 }
3102
3103 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3104 {
3105 dout(7) << "set_recovery_set " << s << dendl;
3106 recovery_set = s;
3107 }
3108
3109
3110 /*
3111 * during resolve state, we share resolves to determine who
3112 * is authoritative for which trees. we expect to get a resolve
3113 * from _everyone_ in the recovery_set (the mds cluster at the time of
3114 * the first failure).
3115 *
3116 * This function puts the passed message before returning
3117 */
3118 void MDCache::handle_resolve(MMDSResolve *m)
3119 {
3120 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3121 mds_rank_t from = mds_rank_t(m->get_source().num());
3122
3123 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3124 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3125 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3126 return;
3127 }
3128 // wait until we reach the resolve stage!
3129 m->put();
3130 return;
3131 }
3132
3133 discard_delayed_resolve(from);
3134
3135 // ambiguous slave requests?
3136 if (!m->slave_requests.empty()) {
3137 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3138 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3139 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3140 assert(!p->second.committing);
3141 pending_masters.insert(p->first);
3142 }
3143 }
3144
3145 if (!pending_masters.empty()) {
3146 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3147 delayed_resolve[from] = m;
3148 return;
3149 }
3150 }
3151
3152 MMDSResolveAck *ack = new MMDSResolveAck;
3153 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3154 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3155 // COMMIT
3156 if (p->second.committing) {
3157 // already committing, waiting for the OP_COMMITTED slave reply
3158 dout(10) << " already committing slave request " << *p << " noop " << dendl;
3159 } else {
3160 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3161 ack->add_commit(p->first);
3162 }
3163 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3164
3165 if (p->second.inode_caps.length() > 0) {
3166 // slave wants to export caps (rename)
3167 assert(mds->is_resolve());
3168
3169 inodeno_t ino;
3170 map<client_t,Capability::Export> cap_exports;
3171 bufferlist::iterator q = p->second.inode_caps.begin();
3172 ::decode(ino, q);
3173 ::decode(cap_exports, q);
3174
3175 assert(get_inode(ino));
3176
3177 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3178 q != cap_exports.end();
3179 ++q) {
3180 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3181 im.cap_id = ++last_cap_id; // assign a new cap ID
3182 im.issue_seq = 1;
3183 im.mseq = q->second.mseq;
3184 }
3185
3186 // will process these caps in rejoin stage
3187 rejoin_slave_exports[ino].first = from;
3188 rejoin_slave_exports[ino].second.swap(cap_exports);
3189
3190 // send information of imported caps back to slave
3191 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3192 }
3193 } else {
3194 // ABORT
3195 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3196 assert(!p->second.committing);
3197 ack->add_abort(p->first);
3198 }
3199 }
3200 mds->send_message(ack, m->get_connection());
3201 m->put();
3202 return;
3203 }
3204
3205 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3206 dout(10) << "delay processing subtree resolve" << dendl;
3207 delayed_resolve[from] = m;
3208 return;
3209 }
3210
3211 bool survivor = false;
3212 // am i a surviving ambiguous importer?
3213 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3214 survivor = true;
3215 // check for any import success/failure (from this node)
3216 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3217 while (p != my_ambiguous_imports.end()) {
3218 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3219 ++next;
3220 CDir *dir = get_dirfrag(p->first);
3221 assert(dir);
3222 dout(10) << "checking ambiguous import " << *dir << dendl;
3223 if (migrator->is_importing(dir->dirfrag()) &&
3224 migrator->get_import_peer(dir->dirfrag()) == from) {
3225 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3226
3227 // check if sender claims the subtree
3228 bool claimed_by_sender = false;
3229 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3230 q != m->subtrees.end();
3231 ++q) {
3232 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3233 CDir *base = get_force_dirfrag(q->first, false);
3234 if (!base || !base->contains(dir))
3235 continue; // base is neither dir nor an ancestor of dir, so it clearly doesn't claim dir.
3236
3237 bool inside = true;
3238 set<CDir*> bounds;
3239 get_force_dirfrag_bound_set(q->second, bounds);
3240 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3241 CDir *bound = *p;
3242 if (bound->contains(dir)) {
3243 inside = false; // nope, bound is dir or parent of dir, not inside.
3244 break;
3245 }
3246 }
3247 if (inside)
3248 claimed_by_sender = true;
3249 }
3250
3251 my_ambiguous_imports.erase(p); // no longer ambiguous.
3252 if (claimed_by_sender) {
3253 dout(7) << "ambiguous import failed on " << *dir << dendl;
3254 migrator->import_reverse(dir);
3255 } else {
3256 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3257 migrator->import_finish(dir, true);
3258 }
3259 }
3260 p = next;
3261 }
3262 }
3263
3264 // update my dir_auth values
3265 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3266 // migrations between other nodes)
3267 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3268 pi != m->subtrees.end();
3269 ++pi) {
3270 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3271 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3272 if (!dir)
3273 continue;
3274 adjust_bounded_subtree_auth(dir, pi->second, from);
3275 try_subtree_merge(dir);
3276 }
3277
3278 show_subtrees();
3279
3280 // note ambiguous imports too
3281 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3282 pi != m->ambiguous_imports.end();
3283 ++pi) {
3284 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3285 other_ambiguous_imports[from][pi->first].swap( pi->second );
3286 }
3287
3288 // did i get them all?
3289 resolve_gather.erase(from);
3290
3291 maybe_resolve_finish();
3292
3293 m->put();
3294 }
3295
3296 void MDCache::process_delayed_resolve()
3297 {
3298 dout(10) << "process_delayed_resolve" << dendl;
3299 map<mds_rank_t, MMDSResolve*> tmp;
3300 tmp.swap(delayed_resolve);
3301 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3302 handle_resolve(p->second);
3303 }
3304
3305 void MDCache::discard_delayed_resolve(mds_rank_t who)
3306 {
3307 if (delayed_resolve.count(who)) {
3308 delayed_resolve[who]->put();
3309 delayed_resolve.erase(who);
3310 }
3311 }
3312
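// maybe_resolve_finish: once every peer's resolve has arrived (and no acks or
// rollbacks are outstanding), disambiguate our imports and either complete
// the resolve stage or send any pending rejoins.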
3313 void MDCache::maybe_resolve_finish()
3314 {
3315 assert(resolve_ack_gather.empty());
3316 assert(need_resolve_rollback.empty());
3317
3318 if (!resolve_gather.empty()) {
3319 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3320 << resolve_gather << ")" << dendl;
3321 return;
3322 }
3323
3324 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3325 disambiguate_my_imports();
3326 finish_committed_masters();
3327
3328 if (resolve_done) {
3329 assert(mds->is_resolve());
3330 trim_unlinked_inodes();
3331 recalc_auth_bits(false);
3332 resolve_done.release()->complete(0);
3333 } else {
3334 maybe_send_pending_rejoins();
3335 }
3336 }
3337
3338 /* This function puts the passed message before returning */
3339 void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3340 {
3341 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3342 mds_rank_t from = mds_rank_t(ack->get_source().num());
3343
3344 if (!resolve_ack_gather.count(from) ||
3345 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3346 ack->put();
3347 return;
3348 }
3349
3350 if (ambiguous_slave_updates.count(from)) {
3351 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3352 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3353 }
3354
3355 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3356 p != ack->commit.end();
3357 ++p) {
3358 dout(10) << " commit on slave " << p->first << dendl;
3359
3360 if (ambiguous_slave_updates.count(from)) {
3361 remove_ambiguous_slave_update(p->first, from);
3362 continue;
3363 }
3364
3365 if (mds->is_resolve()) {
3366 // replay
3367 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3368 assert(su);
3369
3370 // log commit
3371 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3372 ESlaveUpdate::OP_COMMIT, su->origop),
3373 new C_MDC_SlaveCommit(this, from, p->first));
3374 mds->mdlog->flush();
3375
3376 finish_uncommitted_slave_update(p->first, from);
3377 } else {
3378 MDRequestRef mdr = request_get(p->first);
3379 // information about master imported caps
3380 if (p->second.length() > 0)
3381 mdr->more()->inode_import.claim(p->second);
3382
3383 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3384 request_finish(mdr);
3385 }
3386 }
3387
3388 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3389 p != ack->abort.end();
3390 ++p) {
3391 dout(10) << " abort on slave " << *p << dendl;
3392
3393 if (mds->is_resolve()) {
3394 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3395 assert(su);
3396
3397 // perform rollback (and journal a rollback entry)
3398 // note: this will hold up the resolve a bit, until the rollback entries journal.
3399 MDRequestRef null_ref;
3400 switch (su->origop) {
3401 case ESlaveUpdate::LINK:
3402 mds->server->do_link_rollback(su->rollback, from, null_ref);
3403 break;
3404 case ESlaveUpdate::RENAME:
3405 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3406 break;
3407 case ESlaveUpdate::RMDIR:
3408 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3409 break;
3410 default:
3411 ceph_abort();
3412 }
3413 } else {
3414 MDRequestRef mdr = request_get(*p);
3415 mdr->aborted = true;
3416 if (mdr->slave_request) {
3417 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3418 add_rollback(*p, from);
3419 } else {
3420 request_finish(mdr);
3421 }
3422 }
3423 }
3424
3425 if (!ambiguous_slave_updates.count(from))
3426 resolve_ack_gather.erase(from);
3427 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3428 send_subtree_resolves();
3429 process_delayed_resolve();
3430 }
3431
3432 ack->put();
3433 }
3434
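// Track an uncommitted slave update, bumping per-inode counters for the
// directories we renamed out of and the inodes we unlinked so they are kept
// (not trimmed) until the update commits or rolls back.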
3435 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3436 {
3437 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3438 uncommitted_slave_updates[master][reqid] = su;
3439 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3440 uncommitted_slave_rename_olddir[*p]++;
3441 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3442 uncommitted_slave_unlink[*p]++;
3443 }
3444
3445 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3446 {
3447 assert(uncommitted_slave_updates[master].count(reqid));
3448 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3449
3450 uncommitted_slave_updates[master].erase(reqid);
3451 if (uncommitted_slave_updates[master].empty())
3452 uncommitted_slave_updates.erase(master);
3453 // discard the non-auth subtree we renamed out of
3454 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3455 CInode *diri = *p;
3456 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3457 assert(it != uncommitted_slave_rename_olddir.end());
3458 it->second--;
3459 if (it->second == 0) {
3460 uncommitted_slave_rename_olddir.erase(it);
3461 list<CDir*> ls;
3462 diri->get_dirfrags(ls);
3463 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3464 CDir *root = get_subtree_root(*q);
3465 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3466 try_trim_non_auth_subtree(root);
3467 if (*q != root)
3468 break;
3469 }
3470 }
3471 } else
3472 assert(it->second > 0);
3473 }
3474 // remove the inodes that were unlinked by slave updates
3475 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3476 CInode *in = *p;
3477 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3478 assert(it != uncommitted_slave_unlink.end());
3479 it->second--;
3480 if (it->second == 0) {
3481 uncommitted_slave_unlink.erase(it);
3482 if (!in->get_projected_parent_dn())
3483 mds->mdcache->remove_inode_recursive(in);
3484 } else
3485 assert(it->second > 0);
3486 }
3487 delete su;
3488 }
3489
3490 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3491 {
3492
3493 MDSlaveUpdate* su = NULL;
3494 if (uncommitted_slave_updates.count(master) &&
3495 uncommitted_slave_updates[master].count(reqid)) {
3496 su = uncommitted_slave_updates[master][reqid];
3497 assert(su);
3498 }
3499 return su;
3500 }
3501
3502 void MDCache::finish_rollback(metareqid_t reqid) {
3503 assert(need_resolve_rollback.count(reqid));
3504 if (mds->is_resolve())
3505 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3506 need_resolve_rollback.erase(reqid);
3507 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3508 send_subtree_resolves();
3509 process_delayed_resolve();
3510 }
3511 }
3512
3513 void MDCache::disambiguate_other_imports()
3514 {
3515 dout(10) << "disambiguate_other_imports" << dendl;
3516
3517 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3518 // other nodes' ambiguous imports
3519 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3520 p != other_ambiguous_imports.end();
3521 ++p) {
3522 mds_rank_t who = p->first;
3523 dout(10) << "ambiguous imports for mds." << who << dendl;
3524
3525 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3526 q != p->second.end();
3527 ++q) {
3528 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3529 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3530 CDir *dir = get_force_dirfrag(q->first, recovering);
3531 if (!dir) continue;
3532
3533 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3534 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3535 dout(10) << " mds." << who << " did import " << *dir << dendl;
3536 adjust_bounded_subtree_auth(dir, q->second, who);
3537 try_subtree_merge(dir);
3538 } else {
3539 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3540 }
3541 }
3542 }
3543 other_ambiguous_imports.clear();
3544 }
3545
3546 void MDCache::disambiguate_my_imports()
3547 {
3548 dout(10) << "disambiguate_my_imports" << dendl;
3549
3550 if (!mds->is_resolve()) {
3551 assert(my_ambiguous_imports.empty());
3552 return;
3553 }
3554
3555 disambiguate_other_imports();
3556
3557 // my ambiguous imports
3558 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3559 while (!my_ambiguous_imports.empty()) {
3560 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3561
3562 CDir *dir = get_dirfrag(q->first);
3563 assert(dir);
3564
3565 if (dir->authority() != me_ambig) {
3566 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3567 cancel_ambiguous_import(dir);
3568
3569 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3570
3571 // subtree may have been swallowed by another node claiming dir
3572 // as their own.
3573 CDir *root = get_subtree_root(dir);
3574 if (root != dir)
3575 dout(10) << " subtree root is " << *root << dendl;
3576 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3577 try_trim_non_auth_subtree(root);
3578 } else {
3579 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3580 finish_ambiguous_import(q->first);
3581 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3582 }
3583 }
3584 assert(my_ambiguous_imports.empty());
3585 mds->mdlog->flush();
3586
3587 // verify all my subtrees are unambiguous!
3588 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3589 p != subtrees.end();
3590 ++p) {
3591 CDir *dir = p->first;
3592 if (dir->is_ambiguous_dir_auth()) {
3593 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3594 }
3595 assert(!dir->is_ambiguous_dir_auth());
3596 }
3597
3598 show_subtrees();
3599 }
3600
3601
3602 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3603 {
3604 assert(my_ambiguous_imports.count(base) == 0);
3605 my_ambiguous_imports[base] = bounds;
3606 }
3607
3608
3609 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3610 {
3611 // make a list
3612 vector<dirfrag_t> binos;
3613 for (set<CDir*>::iterator p = bounds.begin();
3614 p != bounds.end();
3615 ++p)
3616 binos.push_back((*p)->dirfrag());
3617
3618 // note: this can get called twice if the exporter fails during recovery
3619 if (my_ambiguous_imports.count(base->dirfrag()))
3620 my_ambiguous_imports.erase(base->dirfrag());
3621
3622 add_ambiguous_import(base->dirfrag(), binos);
3623 }
3624
3625 void MDCache::cancel_ambiguous_import(CDir *dir)
3626 {
3627 dirfrag_t df = dir->dirfrag();
3628 assert(my_ambiguous_imports.count(df));
3629 dout(10) << "cancel_ambiguous_import " << df
3630 << " bounds " << my_ambiguous_imports[df]
3631 << " " << *dir
3632 << dendl;
3633 my_ambiguous_imports.erase(df);
3634 }
3635
3636 void MDCache::finish_ambiguous_import(dirfrag_t df)
3637 {
3638 assert(my_ambiguous_imports.count(df));
3639 vector<dirfrag_t> bounds;
3640 bounds.swap(my_ambiguous_imports[df]);
3641 my_ambiguous_imports.erase(df);
3642
3643 dout(10) << "finish_ambiguous_import " << df
3644 << " bounds " << bounds
3645 << dendl;
3646 CDir *dir = get_dirfrag(df);
3647 assert(dir);
3648
3649 // adjust dir_auth, import maps
3650 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3651 try_subtree_merge(dir);
3652 }
3653
3654 void MDCache::remove_inode_recursive(CInode *in)
3655 {
3656 dout(10) << "remove_inode_recursive " << *in << dendl;
3657 list<CDir*> ls;
3658 in->get_dirfrags(ls);
3659 list<CDir*>::iterator p = ls.begin();
3660 while (p != ls.end()) {
3661 CDir *subdir = *p++;
3662
3663 dout(10) << " removing dirfrag " << *subdir << dendl;
3664 CDir::map_t::iterator q = subdir->items.begin();
3665 while (q != subdir->items.end()) {
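// note: grab dn and advance q before the removal below, so removing the dentry cannot invalidate the iterator we keep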
3666 CDentry *dn = q->second;
3667 ++q;
3668 CDentry::linkage_t *dnl = dn->get_linkage();
3669 if (dnl->is_primary()) {
3670 CInode *tin = dnl->get_inode();
3671 subdir->unlink_inode(dn, false);
3672 remove_inode_recursive(tin);
3673 }
3674 subdir->remove_dentry(dn);
3675 }
3676
3677 if (subdir->is_subtree_root())
3678 remove_subtree(subdir);
3679 in->close_dirfrag(subdir->dirfrag().frag);
3680 }
3681 remove_inode(in);
3682 }
3683
3684 bool MDCache::expire_recursive(
3685 CInode *in,
3686 map<mds_rank_t, MCacheExpire*>& expiremap)
3687 {
3688 assert(!in->is_auth());
3689
3690 dout(10) << __func__ << ":" << *in << dendl;
3691
3692 // Recurse into any dirfrags beneath this inode
3693 list<CDir*> ls;
3694 in->get_dirfrags(ls);
3695 for (auto subdir : ls) {
3696 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3697 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3698 return true;
3699 }
3700
3701 for (auto &it : subdir->items) {
3702 CDentry *dn = it.second;
3703 CDentry::linkage_t *dnl = dn->get_linkage();
3704 if (dnl->is_primary()) {
3705 CInode *tin = dnl->get_inode();
3706
3707 /* Remote strays with linkage (i.e. hardlinks) should not be
3708 * expired, because they may be the target of
3709 * a rename() as the owning MDS shuts down */
3710 if (!tin->is_stray() && tin->inode.nlink) {
3711 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3712 return true;
3713 }
3714
3715 const bool abort = expire_recursive(tin, expiremap);
3716 if (abort) {
3717 return true;
3718 }
3719 }
3720 if (dn->lru_is_expireable()) {
3721 trim_dentry(dn, expiremap);
3722 } else {
3723 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3724 return true;
3725 }
3726 }
3727 }
3728
3729 return false;
3730 }
3731
3732 void MDCache::trim_unlinked_inodes()
3733 {
3734 dout(7) << "trim_unlinked_inodes" << dendl;
3735 list<CInode*> q;
3736 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
3737 p != inode_map.end();
3738 ++p) {
3739 CInode *in = p->second;
3740 if (in->get_parent_dn() == NULL && !in->is_base()) {
3741 dout(7) << " will trim from " << *in << dendl;
3742 q.push_back(in);
3743 }
3744 }
3745 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3746 remove_inode_recursive(*p);
3747 }
3748
3749 /** recalc_auth_bits()
3750 * once subtree auth is disambiguated, we need to adjust all the
3751 * auth and dirty bits in our cache before moving on.
3752 */
3753 void MDCache::recalc_auth_bits(bool replay)
3754 {
3755 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3756
3757 if (root) {
3758 root->inode_auth.first = mds->mdsmap->get_root();
3759 bool auth = mds->get_nodeid() == root->inode_auth.first;
3760 if (auth) {
3761 root->state_set(CInode::STATE_AUTH);
3762 } else {
3763 root->state_clear(CInode::STATE_AUTH);
3764 if (!replay)
3765 root->state_set(CInode::STATE_REJOINING);
3766 }
3767 }
3768
3769 set<CInode*> subtree_inodes;
3770 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3771 p != subtrees.end();
3772 ++p) {
3773 if (p->first->dir_auth.first == mds->get_nodeid())
3774 subtree_inodes.insert(p->first->inode);
3775 }
3776
3777 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3778 p != subtrees.end();
3779 ++p) {
3780 if (p->first->inode->is_mdsdir()) {
3781 CInode *in = p->first->inode;
3782 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3783 if (auth) {
3784 in->state_set(CInode::STATE_AUTH);
3785 } else {
3786 in->state_clear(CInode::STATE_AUTH);
3787 if (!replay)
3788 in->state_set(CInode::STATE_REJOINING);
3789 }
3790 }
3791
3792 list<CDir*> dfq; // dirfrag queue
3793 dfq.push_back(p->first);
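// breadth-first walk over this subtree's dirfrags; get_nested_dirfrags() only returns non-subtree-root dirfrags, so the walk does not cross into nested subtrees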
3794
3795 bool auth = p->first->authority().first == mds->get_nodeid();
3796 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3797
3798 while (!dfq.empty()) {
3799 CDir *dir = dfq.front();
3800 dfq.pop_front();
3801
3802 // dir
3803 if (auth) {
3804 dir->state_set(CDir::STATE_AUTH);
3805 } else {
3806 dir->state_clear(CDir::STATE_AUTH);
3807 if (!replay) {
3808 // close empty non-auth dirfrag
3809 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3810 dir->inode->close_dirfrag(dir->get_frag());
3811 continue;
3812 }
3813 dir->state_set(CDir::STATE_REJOINING);
3814 dir->state_clear(CDir::STATE_COMPLETE);
3815 if (dir->is_dirty())
3816 dir->mark_clean();
3817 }
3818 }
3819
3820 // dentries in this dir
3821 for (CDir::map_t::iterator q = dir->items.begin();
3822 q != dir->items.end();
3823 ++q) {
3824 // dn
3825 CDentry *dn = q->second;
3826 CDentry::linkage_t *dnl = dn->get_linkage();
3827 if (auth) {
3828 dn->state_set(CDentry::STATE_AUTH);
3829 } else {
3830 dn->state_clear(CDentry::STATE_AUTH);
3831 if (!replay) {
3832 dn->state_set(CDentry::STATE_REJOINING);
3833 if (dn->is_dirty())
3834 dn->mark_clean();
3835 }
3836 }
3837
3838 if (dnl->is_primary()) {
3839 // inode
3840 CInode *in = dnl->get_inode();
3841 if (auth) {
3842 in->state_set(CInode::STATE_AUTH);
3843 } else {
3844 in->state_clear(CInode::STATE_AUTH);
3845 if (!replay) {
3846 in->state_set(CInode::STATE_REJOINING);
3847 if (in->is_dirty())
3848 in->mark_clean();
3849 if (in->is_dirty_parent())
3850 in->clear_dirty_parent();
3851 // avoid touching scatterlocks for our subtree roots!
3852 if (subtree_inodes.count(in) == 0)
3853 in->clear_scatter_dirty();
3854 }
3855 }
3856 // recurse?
3857 if (in->is_dir())
3858 in->get_nested_dirfrags(dfq);
3859 }
3860 }
3861 }
3862 }
3863
3864 show_subtrees();
3865 show_cache();
3866 }
3867
3868
3869
3870 // ===========================================================================
3871 // REJOIN
3872
3873 /*
3874 * notes on scatterlock recovery:
3875 *
3876 * - recovering inode replica sends scatterlock data for any subtree
3877 * roots (the only ones that are possibly dirty).
3878 *
3879 * - surviving auth incorporates any provided scatterlock data. any
3880 * pending gathers are then finished, as with the other lock types.
3881 *
3882 * that takes care of surviving auth + (recovering replica)*.
3883 *
3884 * - surviving replica sends strong_inode, which includes current
3885 * scatterlock state, AND any dirty scatterlock data. this
3886 * provides the recovering auth with everything it might need.
3887 *
3888 * - recovering auth must pick initial scatterlock state based on
3889 * (weak|strong) rejoins.
3890 * - always assimilate scatterlock data (it can't hurt)
3891 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3892 * - include base inode in ack for all inodes that saw scatterlock content
3893 *
3894 * also, for scatter gather,
3895 *
3896 * - auth increments {frag,r}stat.version on completion of any gather.
3897 *
3898 * - auth incorporates changes in a gather _only_ if the version
3899 * matches.
3900 *
3901 * - replica discards changes any time the scatterlock syncs, and
3902 * after recovery.
3903 */
3904
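// Editorial sketch (kept inert under #if 0; the names are hypothetical and not
// part of the MDS implementation): the scatter-gather versioning rule from the
// notes above, reduced to its essentials. The auth bumps the stat version when
// a gather completes, and a replica's contribution is folded in only if it was
// based on the auth's current version, so stale data is discarded.
#if 0
struct scatter_gather_sketch {
  uint64_t version = 1;    // {frag,r}stat.version held by the auth
  int64_t accum = 0;       // accumulated stat on the auth

  // a replica offers a delta along with the version it was based on
  void incorporate(int64_t delta, uint64_t based_on_version) {
    if (based_on_version == version)   // only matching versions are applied
      accum += delta;
  }

  void gather_finished() {
    ++version;             // auth increments the version on completion of any gather
  }
};
#endif
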
3905 void MDCache::dump_rejoin_status(Formatter *f) const
3906 {
3907 f->open_object_section("rejoin_status");
3908 f->dump_stream("rejoin_gather") << rejoin_gather;
3909 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3910 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3911 f->close_section();
3912 }
3913
3914 void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3915 {
3916 dout(10) << "rejoin_start" << dendl;
3917 assert(!rejoin_done);
3918 rejoin_done.reset(rejoin_done_);
3919
3920 rejoin_gather = recovery_set;
3921 // need to finish opening cap inodes before sending cache rejoins
3922 rejoin_gather.insert(mds->get_nodeid());
3923 process_imported_caps();
3924 }
3925
3926 /*
3927 * rejoin phase!
3928 *
3929 * this initiates rejoin. it should be called before we get any
3930 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3931 *
3932 * we start out by sending rejoins to everyone in the recovery set.
3933 *
3934 * if we are rejoining, send for all regions in our cache.
3935 * if we are active|stopping, send only to nodes that are rejoining.
3936 */
3937 void MDCache::rejoin_send_rejoins()
3938 {
3939 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3940
3941 if (rejoin_gather.count(mds->get_nodeid())) {
3942 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3943 rejoins_pending = true;
3944 return;
3945 }
3946 if (!resolve_gather.empty()) {
3947 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3948 << resolve_gather << ")" << dendl;
3949 rejoins_pending = true;
3950 return;
3951 }
3952
3953 assert(!migrator->is_importing());
3954 assert(!migrator->is_exporting());
3955
3956 if (!mds->is_rejoin()) {
3957 disambiguate_other_imports();
3958 }
3959
3960 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3961
3962
3963 // if i am rejoining, send a rejoin to everyone.
3964 // otherwise, just send to others who are rejoining.
3965 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3966 p != recovery_set.end();
3967 ++p) {
3968 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3969 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3970 if (mds->is_rejoin())
3971 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3972 else if (mds->mdsmap->is_rejoin(*p))
3973 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3974 }
3975
3976 if (mds->is_rejoin()) {
3977 map<client_t, set<mds_rank_t> > client_exports;
3978 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3979 assert(cap_export_targets.count(p->first));
3980 mds_rank_t target = cap_export_targets[p->first];
3981 if (rejoins.count(target) == 0)
3982 continue;
3983 rejoins[target]->cap_exports[p->first] = p->second;
3984 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3985 client_exports[q->first].insert(target);
3986 }
3987 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3988 p != client_exports.end();
3989 ++p) {
3990 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3991 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3992 rejoins[*q]->client_map[p->first] = inst;
3993 }
3994 }
3995
3996
3997 // check all subtrees
3998 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
3999 p != subtrees.end();
4000 ++p) {
4001 CDir *dir = p->first;
4002 assert(dir->is_subtree_root());
4003 if (dir->is_ambiguous_dir_auth()) {
4004 // exporter is recovering, importer is survivor.
4005 assert(rejoins.count(dir->authority().first));
4006 assert(!rejoins.count(dir->authority().second));
4007 continue;
4008 }
4009
4010 // my subtree?
4011 if (dir->is_auth())
4012 continue; // skip my own regions!
4013
4014 mds_rank_t auth = dir->get_dir_auth().first;
4015 assert(auth >= 0);
4016 if (rejoins.count(auth) == 0)
4017 continue; // don't care about this node's subtrees
4018
4019 rejoin_walk(dir, rejoins[auth]);
4020 }
4021
4022 // rejoin root inodes, too
4023 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4024 p != rejoins.end();
4025 ++p) {
4026 if (mds->is_rejoin()) {
4027 // weak
4028 if (p->first == 0 && root) {
4029 p->second->add_weak_inode(root->vino());
4030 if (root->is_dirty_scattered()) {
4031 dout(10) << " sending scatterlock state on root " << *root << dendl;
4032 p->second->add_scatterlock_state(root);
4033 }
4034 }
4035 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4036 if (in)
4037 p->second->add_weak_inode(in->vino());
4038 }
4039 } else {
4040 // strong
4041 if (p->first == 0 && root) {
4042 p->second->add_strong_inode(root->vino(),
4043 root->get_replica_nonce(),
4044 root->get_caps_wanted(),
4045 root->filelock.get_state(),
4046 root->nestlock.get_state(),
4047 root->dirfragtreelock.get_state());
4048 root->state_set(CInode::STATE_REJOINING);
4049 if (root->is_dirty_scattered()) {
4050 dout(10) << " sending scatterlock state on root " << *root << dendl;
4051 p->second->add_scatterlock_state(root);
4052 }
4053 }
4054
4055 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4056 p->second->add_strong_inode(in->vino(),
4057 in->get_replica_nonce(),
4058 in->get_caps_wanted(),
4059 in->filelock.get_state(),
4060 in->nestlock.get_state(),
4061 in->dirfragtreelock.get_state());
4062 in->state_set(CInode::STATE_REJOINING);
4063 }
4064 }
4065 }
4066
4067 if (!mds->is_rejoin()) {
4068 // i am survivor. send strong rejoin.
4069 // note request remote_auth_pins, xlocks
4070 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4071 p != active_requests.end();
4072 ++p) {
4073 MDRequestRef& mdr = p->second;
4074 if (mdr->is_slave())
4075 continue;
4076 // auth pins
4077 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4078 q != mdr->remote_auth_pins.end();
4079 ++q) {
4080 if (!q->first->is_auth()) {
4081 assert(q->second == q->first->authority().first);
4082 if (rejoins.count(q->second) == 0) continue;
4083 MMDSCacheRejoin *rejoin = rejoins[q->second];
4084
4085 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4086 MDSCacheObjectInfo i;
4087 q->first->set_object_info(i);
4088 if (i.ino)
4089 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4090 else
4091 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4092
4093 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4094 mdr->more()->rename_inode == q->first)
4095 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4096 mdr->reqid, mdr->attempt);
4097 }
4098 }
4099 // xlocks
4100 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4101 q != mdr->xlocks.end();
4102 ++q) {
4103 if (!(*q)->get_parent()->is_auth()) {
4104 mds_rank_t who = (*q)->get_parent()->authority().first;
4105 if (rejoins.count(who) == 0) continue;
4106 MMDSCacheRejoin *rejoin = rejoins[who];
4107
4108 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4109 MDSCacheObjectInfo i;
4110 (*q)->get_parent()->set_object_info(i);
4111 if (i.ino)
4112 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4113 mdr->reqid, mdr->attempt);
4114 else
4115 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4116 mdr->reqid, mdr->attempt);
4117 }
4118 }
4119 // remote wrlocks
4120 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4121 q != mdr->remote_wrlocks.end();
4122 ++q) {
4123 mds_rank_t who = q->second;
4124 if (rejoins.count(who) == 0) continue;
4125 MMDSCacheRejoin *rejoin = rejoins[who];
4126
4127 dout(15) << " " << *mdr << " wrlock on " << q->second
4128 << " " << q->first->get_parent() << dendl;
4129 MDSCacheObjectInfo i;
4130 q->first->get_parent()->set_object_info(i);
4131 assert(i.ino);
4132 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4133 mdr->reqid, mdr->attempt);
4134 }
4135 }
4136 }
4137
4138 // send the messages
4139 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4140 p != rejoins.end();
4141 ++p) {
4142 assert(rejoin_sent.count(p->first) == 0);
4143 assert(rejoin_ack_gather.count(p->first) == 0);
4144 rejoin_sent.insert(p->first);
4145 rejoin_ack_gather.insert(p->first);
4146 mds->send_message_mds(p->second, p->first);
4147 }
4148 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4149 rejoins_pending = false;
4150
4151 // nothing?
4152 if (mds->is_rejoin() && rejoins.empty()) {
4153 dout(10) << "nothing to rejoin" << dendl;
4154 rejoin_gather_finish();
4155 }
4156 }
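
// Editorial sketch (kept inert under #if 0; names are hypothetical): the
// gather/ack bookkeeping pattern used by rejoin_send_rejoins() above, stripped
// of MDS specifics -- record every peer we message (plus ourselves, for the
// local rejoin_gather_finish step), erase entries as acks arrive, and move on
// only once the set drains.
#if 0
#include <set>

struct ack_gather_sketch {
  std::set<int> waiting;

  void sent_to(int peer) { waiting.insert(peer); }

  // returns true when the last outstanding ack has arrived
  bool got_ack(int peer) {
    waiting.erase(peer);
    return waiting.empty();
  }
};
#endif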
4157
4158
4159 /**
4160 * rejoin_walk - build rejoin declarations for a subtree
4161 *
4162 * @param dir subtree root
4163 * @param rejoin rejoin message
4164 *
4165 * from a rejoining node:
4166 * weak dirfrag
4167 * weak dentries (w/ connectivity)
4168 *
4169 * from a surviving node:
4170 * strong dirfrag
4171 * strong dentries (no connectivity!)
4172 * strong inodes
4173 */
4174 void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4175 {
4176 dout(10) << "rejoin_walk " << *dir << dendl;
4177
4178 list<CDir*> nested; // finish this dir, then do nested items
4179
4180 if (mds->is_rejoin()) {
4181 // WEAK
4182 rejoin->add_weak_dirfrag(dir->dirfrag());
4183 for (CDir::map_t::iterator p = dir->items.begin();
4184 p != dir->items.end();
4185 ++p) {
4186 CDentry *dn = p->second;
4187 CDentry::linkage_t *dnl = dn->get_linkage();
4188 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4189 assert(dnl->is_primary());
4190 CInode *in = dnl->get_inode();
4191 assert(dnl->get_inode()->is_dir());
4192 rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
4193 in->get_nested_dirfrags(nested);
4194 if (in->is_dirty_scattered()) {
4195 dout(10) << " sending scatterlock state on " << *in << dendl;
4196 rejoin->add_scatterlock_state(in);
4197 }
4198 }
4199 } else {
4200 // STRONG
4201 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4202 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4203 dir->state_set(CDir::STATE_REJOINING);
4204
4205 for (CDir::map_t::iterator p = dir->items.begin();
4206 p != dir->items.end();
4207 ++p) {
4208 CDentry *dn = p->second;
4209 CDentry::linkage_t *dnl = dn->get_linkage();
4210 dout(15) << " add_strong_dentry " << *dn << dendl;
4211 rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4212 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4213 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4214 dnl->is_remote() ? dnl->get_remote_d_type():0,
4215 dn->get_replica_nonce(),
4216 dn->lock.get_state());
4217 dn->state_set(CDentry::STATE_REJOINING);
4218 if (dnl->is_primary()) {
4219 CInode *in = dnl->get_inode();
4220 dout(15) << " add_strong_inode " << *in << dendl;
4221 rejoin->add_strong_inode(in->vino(),
4222 in->get_replica_nonce(),
4223 in->get_caps_wanted(),
4224 in->filelock.get_state(),
4225 in->nestlock.get_state(),
4226 in->dirfragtreelock.get_state());
4227 in->state_set(CInode::STATE_REJOINING);
4228 in->get_nested_dirfrags(nested);
4229 if (in->is_dirty_scattered()) {
4230 dout(10) << " sending scatterlock state on " << *in << dendl;
4231 rejoin->add_scatterlock_state(in);
4232 }
4233 }
4234 }
4235 }
4236
4237 // recurse into nested dirs
4238 for (list<CDir*>::iterator p = nested.begin();
4239 p != nested.end();
4240 ++p)
4241 rejoin_walk(*p, rejoin);
4242 }
4243
4244
4245 /*
4246 * i got a rejoin.
4247 * - reply with the lockstate
4248 *
4249 * if i am active|stopping,
4250 * - remove source from replica list for everything not referenced here.
4251 * This function puts the passed message before returning.
4252 */
4253 void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4254 {
4255 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4256 << " (" << m->get_payload().length() << " bytes)"
4257 << dendl;
4258
4259 switch (m->op) {
4260 case MMDSCacheRejoin::OP_WEAK:
4261 handle_cache_rejoin_weak(m);
4262 break;
4263 case MMDSCacheRejoin::OP_STRONG:
4264 handle_cache_rejoin_strong(m);
4265 break;
4266 case MMDSCacheRejoin::OP_ACK:
4267 handle_cache_rejoin_ack(m);
4268 break;
4269
4270 default:
4271 ceph_abort();
4272 }
4273 m->put();
4274 }
4275
4276
4277 /*
4278 * handle_cache_rejoin_weak
4279 *
4280 * the sender
4281 * - is recovering from their journal.
4282 * - may have incorrect (out of date) inode contents
4283 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4284 *
4285 * if the sender didn't trim_non_auth(), they
4286 * - may have incorrect (out of date) dentry/inode linkage
4287 * - may have deleted/purged inodes
4288 * and i may have to go to disk to get accurate inode contents. yuck.
4289 * This function DOES NOT put the passed message before returning
4290 */
4291 void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4292 {
4293 mds_rank_t from = mds_rank_t(weak->get_source().num());
4294
4295 // possible response(s)
4296 MMDSCacheRejoin *ack = 0; // if survivor
4297 set<vinodeno_t> acked_inodes; // if survivor
4298 set<SimpleLock *> gather_locks; // if survivor
4299 bool survivor = false; // am i a survivor?
4300
4301 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4302 survivor = true;
4303 dout(10) << "i am a survivor, and will ack immediately" << dendl;
4304 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4305
4306 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4307
4308 // check cap exports
4309 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4310 CInode *in = get_inode(p->first);
4311 assert(!in || in->is_auth());
4312 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4313 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4314 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4315 Capability::Import& im = imported_caps[p->first][q->first];
4316 if (cap) {
4317 im.cap_id = cap->get_cap_id();
4318 im.issue_seq = cap->get_last_seq();
4319 im.mseq = cap->get_mseq();
4320 } else {
4321 // all are zero
4322 }
4323 }
4324 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4325 }
4326
4327 ::encode(imported_caps, ack->imported_caps);
4328 } else {
4329 assert(mds->is_rejoin());
4330
4331 // we may have already received a strong rejoin from the sender.
4332 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4333 assert(gather_locks.empty());
4334
4335 // check cap exports.
4336 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4337
4338 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4339 CInode *in = get_inode(p->first);
4340 assert(in && in->is_auth());
4341 // note
4342 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4343 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4344 cap_imports[p->first][q->first][from] = q->second;
4345 }
4346 }
4347 }
4348
4349 // assimilate any potentially dirty scatterlock state
4350 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4351 p != weak->inode_scatterlocks.end();
4352 ++p) {
4353 CInode *in = get_inode(p->first);
4354 assert(in);
4355 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4356 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4357 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4358 if (!survivor)
4359 rejoin_potential_updated_scatterlocks.insert(in);
4360 }
4361
4362 // recovering peer may send incorrect dirfrags here. we need to
4363 // infer which dirfrag they meant. the ack will include a
4364 // strong_dirfrag that will set them straight on the fragmentation.
4365
4366 // walk weak map
4367 set<CDir*> dirs_to_share;
4368 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4369 p != weak->weak_dirfrags.end();
4370 ++p) {
4371 CInode *diri = get_inode(p->ino);
4372 if (!diri)
4373 dout(0) << " missing dir ino " << p->ino << dendl;
4374 assert(diri);
4375
4376 list<frag_t> ls;
4377 if (diri->dirfragtree.is_leaf(p->frag)) {
4378 ls.push_back(p->frag);
4379 } else {
4380 diri->dirfragtree.get_leaves_under(p->frag, ls);
4381 if (ls.empty())
4382 ls.push_back(diri->dirfragtree[p->frag.value()]);
4383 }
4384 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4385 frag_t fg = *q;
4386 CDir *dir = diri->get_dirfrag(fg);
4387 if (!dir) {
4388 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4389 continue;
4390 }
4391 assert(dir);
4392 if (dirs_to_share.count(dir)) {
4393 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4394 } else {
4395 dirs_to_share.insert(dir);
4396 unsigned nonce = dir->add_replica(from);
4397 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4398 if (ack) {
4399 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4400 ack->add_dirfrag_base(dir);
4401 }
4402 }
4403 }
4404 }
4405
4406 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4407 p != weak->weak.end();
4408 ++p) {
4409 CInode *diri = get_inode(p->first);
4410 if (!diri)
4411 dout(0) << " missing dir ino " << p->first << dendl;
4412 assert(diri);
4413
4414 // weak dentries
4415 CDir *dir = 0;
4416 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4417 q != p->second.end();
4418 ++q) {
4419 // locate proper dirfrag.
4420 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4421 frag_t fg = diri->pick_dirfrag(q->first.name);
4422 if (!dir || dir->get_frag() != fg) {
4423 dir = diri->get_dirfrag(fg);
4424 if (!dir)
4425 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4426 assert(dir);
4427 assert(dirs_to_share.count(dir));
4428 }
4429
4430 // and dentry
4431 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4432 assert(dn);
4433 CDentry::linkage_t *dnl = dn->get_linkage();
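// weak rejoins only declare primary dentries of directories (see rejoin_walk), so the linkage must be primary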
4434 assert(dnl->is_primary());
4435
4436 if (survivor && dn->is_replica(from))
4437 dentry_remove_replica(dn, from, gather_locks);
4438 unsigned dnonce = dn->add_replica(from);
4439 dout(10) << " have " << *dn << dendl;
4440 if (ack)
4441 ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4442 dnl->get_inode()->ino(), inodeno_t(0), 0,
4443 dnonce, dn->lock.get_replica_state());
4444
4445 // inode
4446 CInode *in = dnl->get_inode();
4447 assert(in);
4448
4449 if (survivor && in->is_replica(from))
4450 inode_remove_replica(in, from, true, gather_locks);
4451 unsigned inonce = in->add_replica(from);
4452 dout(10) << " have " << *in << dendl;
4453
4454 // scatter the dirlock, just in case?
4455 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4456 in->filelock.set_state(LOCK_MIX);
4457
4458 if (ack) {
4459 acked_inodes.insert(in->vino());
4460 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4461 bufferlist bl;
4462 in->_encode_locks_state_for_rejoin(bl, from);
4463 ack->add_inode_locks(in, inonce, bl);
4464 }
4465 }
4466 }
4467
4468 // weak base inodes? (root, stray, etc.)
4469 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4470 p != weak->weak_inodes.end();
4471 ++p) {
4472 CInode *in = get_inode(*p);
4473 assert(in); // hmm fixme wrt stray?
4474 if (survivor && in->is_replica(from))
4475 inode_remove_replica(in, from, true, gather_locks);
4476 unsigned inonce = in->add_replica(from);
4477 dout(10) << " have base " << *in << dendl;
4478
4479 if (ack) {
4480 acked_inodes.insert(in->vino());
4481 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4482 bufferlist bl;
4483 in->_encode_locks_state_for_rejoin(bl, from);
4484 ack->add_inode_locks(in, inonce, bl);
4485 }
4486 }
4487
4488 assert(rejoin_gather.count(from));
4489 rejoin_gather.erase(from);
4490 if (survivor) {
4491 // survivor. do everything now.
4492 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4493 p != weak->inode_scatterlocks.end();
4494 ++p) {
4495 CInode *in = get_inode(p->first);
4496 assert(in);
4497 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4498 acked_inodes.insert(in->vino());
4499 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4500 }
4501
4502 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4503 mds->send_message(ack, weak->get_connection());
4504
4505 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4506 if (!(*p)->is_stable())
4507 mds->locker->eval_gather(*p);
4508 }
4509 } else {
4510 // done?
4511 if (rejoin_gather.empty()) {
4512 rejoin_gather_finish();
4513 } else {
4514 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4515 }
4516 }
4517 }
4518
4519 class C_MDC_RejoinGatherFinish : public MDCacheContext {
4520 public:
4521 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4522 void finish(int r) override {
4523 mdcache->rejoin_gather_finish();
4524 }
4525 };
4526
4527 /*
4528 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4529 *
4530 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4531 * ack, the replica does not exist, and we can remove it from our replica maps.
4532 */
4533 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4534 set<vinodeno_t>& acked_inodes,
4535 set<SimpleLock *>& gather_locks)
4536 {
4537 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4538
4539 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
4540 p != inode_map.end();
4541 ++p) {
4542 CInode *in = p->second;
4543
4544 // inode?
4545 if (in->is_auth() &&
4546 in->is_replica(from) &&
4547 (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
4548 inode_remove_replica(in, from, false, gather_locks);
4549 dout(10) << " rem " << *in << dendl;
4550 }
4551
4552 if (!in->is_dir()) continue;
4553
4554 list<CDir*> dfs;
4555 in->get_dirfrags(dfs);
4556 for (list<CDir*>::iterator p = dfs.begin();
4557 p != dfs.end();
4558 ++p) {
4559 CDir *dir = *p;
4560
4561 if (dir->is_auth() &&
4562 dir->is_replica(from) &&
4563 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4564 dir->remove_replica(from);
4565 dout(10) << " rem " << *dir << dendl;
4566 }
4567
4568 // dentries
4569 for (CDir::map_t::iterator p = dir->items.begin();
4570 p != dir->items.end();
4571 ++p) {
4572 CDentry *dn = p->second;
4573
4574 if (dn->is_replica(from) &&
4575 (ack == NULL ||
4576 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4577 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
4578 dentry_remove_replica(dn, from, gather_locks);
4579 dout(10) << " rem " << *dn << dendl;
4580 }
4581 }
4582 }
4583 }
4584 }
4585
4586
4587 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4588 {
4589 CInode *in = new CInode(this, true, 1, last);
4590 in->inode.ino = ino;
4591 in->state_set(CInode::STATE_REJOINUNDEF);
4592 add_inode(in);
4593 rejoin_undef_inodes.insert(in);
4594 dout(10) << " invented " << *in << dendl;
4595 return in;
4596 }
4597
4598 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4599 {
4600 CInode *in = get_inode(df.ino);
4601 if (!in)
4602 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4603 if (!in->is_dir()) {
4604 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4605 in->inode.mode = S_IFDIR;
4606 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4607 }
4608 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4609 dir->state_set(CDir::STATE_REJOINUNDEF);
4610 rejoin_undef_dirfrags.insert(dir);
4611 dout(10) << " invented " << *dir << dendl;
4612 return dir;
4613 }
4614
4615 /* This function DOES NOT put the passed message before returning */
4616 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4617 {
4618 mds_rank_t from = mds_rank_t(strong->get_source().num());
4619
4620 // only a recovering node will get a strong rejoin.
4621 assert(mds->is_rejoin());
4622
4623 // assimilate any potentially dirty scatterlock state
4624 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4625 p != strong->inode_scatterlocks.end();
4626 ++p) {
4627 CInode *in = get_inode(p->first);
4628 assert(in);
4629 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4630 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4631 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4632 rejoin_potential_updated_scatterlocks.insert(in);
4633 }
4634
4635 rejoin_unlinked_inodes[from].clear();
4636
4637 // surviving peer may send incorrect dirfrag here (maybe they didn't
4638 // get the fragment notify, or maybe we rolled back?). we need to
4639 // infer the right frag and get them with the program. somehow.
4640 // we don't normally send ACK.. so we'll need to bundle this with
4641 // MISSING or something.
4642
4643 // strong dirfrags/dentries.
4644 // also process auth_pins, xlocks.
4645 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4646 p != strong->strong_dirfrags.end();
4647 ++p) {
4648 CInode *diri = get_inode(p->first.ino);
4649 if (!diri)
4650 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4651 CDir *dir = diri->get_dirfrag(p->first.frag);
4652 bool refragged = false;
4653 if (dir) {
4654 dout(10) << " have " << *dir << dendl;
4655 } else {
4656 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4657 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4658 else if (diri->dirfragtree.is_leaf(p->first.frag))
4659 dir = rejoin_invent_dirfrag(p->first);
4660 }
4661 if (dir) {
4662 dir->add_replica(from, p->second.nonce);
4663 dir->dir_rep = p->second.dir_rep;
4664 } else {
4665 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4666 list<frag_t> ls;
4667 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4668 if (ls.empty())
4669 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4670 dout(10) << " maps to frag(s) " << ls << dendl;
4671 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4672 CDir *dir = diri->get_dirfrag(*q);
4673 if (!dir)
4674 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4675 else
4676 dout(10) << " have(approx) " << *dir << dendl;
4677 dir->add_replica(from, p->second.nonce);
4678 dir->dir_rep = p->second.dir_rep;
4679 }
4680 refragged = true;
4681 }
4682
4683 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4684 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4685 q != dmap.end();
4686 ++q) {
4687 CDentry *dn;
4688 if (!refragged)
4689 dn = dir->lookup(q->first.name, q->first.snapid);
4690 else {
4691 frag_t fg = diri->pick_dirfrag(q->first.name);
4692 dir = diri->get_dirfrag(fg);
4693 assert(dir);
4694 dn = dir->lookup(q->first.name, q->first.snapid);
4695 }
4696 if (!dn) {
4697 if (q->second.is_remote()) {
4698 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4699 q->second.first, q->first.snapid);
4700 } else if (q->second.is_null()) {
4701 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4702 } else {
4703 CInode *in = get_inode(q->second.ino, q->first.snapid);
4704 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4705 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4706 }
4707 dout(10) << " invented " << *dn << dendl;
4708 }
4709 CDentry::linkage_t *dnl = dn->get_linkage();
4710
4711 // dn auth_pin?
4712 if (strong->authpinned_dentries.count(p->first) &&
4713 strong->authpinned_dentries[p->first].count(q->first)) {
4714 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4715 r != strong->authpinned_dentries[p->first][q->first].end();
4716 ++r) {
4717 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4718
4719 // get/create slave mdrequest
4720 MDRequestRef mdr;
4721 if (have_request(r->reqid))
4722 mdr = request_get(r->reqid);
4723 else
4724 mdr = request_start_slave(r->reqid, r->attempt, strong);
4725 mdr->auth_pin(dn);
4726 }
4727 }
4728
4729 // dn xlock?
4730 if (strong->xlocked_dentries.count(p->first) &&
4731 strong->xlocked_dentries[p->first].count(q->first)) {
4732 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4733 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4734 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4735 assert(mdr->is_auth_pinned(dn));
4736 if (!mdr->xlocks.count(&dn->versionlock)) {
4737 assert(dn->versionlock.can_xlock_local());
4738 dn->versionlock.get_xlock(mdr, mdr->get_client());
4739 mdr->xlocks.insert(&dn->versionlock);
4740 mdr->locks.insert(&dn->versionlock);
4741 }
4742 if (dn->lock.is_stable())
4743 dn->auth_pin(&dn->lock);
4744 dn->lock.set_state(LOCK_XLOCK);
4745 dn->lock.get_xlock(mdr, mdr->get_client());
4746 mdr->xlocks.insert(&dn->lock);
4747 mdr->locks.insert(&dn->lock);
4748 }
4749
4750 dn->add_replica(from, q->second.nonce);
4751 dout(10) << " have " << *dn << dendl;
4752
4753 if (dnl->is_primary()) {
4754 if (q->second.is_primary()) {
4755 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4756 // the survivor missed MDentryUnlink+MDentryLink messages ?
4757 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4758 CInode *in = get_inode(q->second.ino, q->first.snapid);
4759 assert(in);
4760 assert(in->get_parent_dn());
4761 rejoin_unlinked_inodes[from].insert(in);
4762 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4763 }
4764 } else {
4765 // the survivor missed MDentryLink message ?
4766 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4767 dout(7) << " sender doesn't have primary dentry" << dendl;
4768 }
4769 } else {
4770 if (q->second.is_primary()) {
4771 // the survivor missed MDentryUnlink message ?
4772 CInode *in = get_inode(q->second.ino, q->first.snapid);
4773 assert(in);
4774 assert(in->get_parent_dn());
4775 rejoin_unlinked_inodes[from].insert(in);
4776 dout(7) << " sender has primary dentry but we don't" << dendl;
4777 }
4778 }
4779 }
4780 }
4781
4782 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4783 p != strong->strong_inodes.end();
4784 ++p) {
4785 CInode *in = get_inode(p->first);
4786 assert(in);
4787 in->add_replica(from, p->second.nonce);
4788 dout(10) << " have " << *in << dendl;
4789
4790 MMDSCacheRejoin::inode_strong &is = p->second;
4791
4792 // caps_wanted
4793 if (is.caps_wanted) {
4794 in->mds_caps_wanted[from] = is.caps_wanted;
4795 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4796 << " on " << *in << dendl;
4797 }
4798
4799 // scatterlocks?
4800 // infer state from replica state:
4801 // * go to MIX if they might have wrlocks
4802 // * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4803 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4804 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4805 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4806
4807 // auth pin?
4808 if (strong->authpinned_inodes.count(in->vino())) {
4809 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4810 r != strong->authpinned_inodes[in->vino()].end();
4811 ++r) {
4812 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4813
4814 // get/create slave mdrequest
4815 MDRequestRef mdr;
4816 if (have_request(r->reqid))
4817 mdr = request_get(r->reqid);
4818 else
4819 mdr = request_start_slave(r->reqid, r->attempt, strong);
4820 if (strong->frozen_authpin_inodes.count(in->vino())) {
4821 assert(!in->get_num_auth_pins());
4822 mdr->freeze_auth_pin(in);
4823 } else {
4824 assert(!in->is_frozen_auth_pin());
4825 }
4826 mdr->auth_pin(in);
4827 }
4828 }
4829 // xlock(s)?
4830 if (strong->xlocked_inodes.count(in->vino())) {
4831 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4832 q != strong->xlocked_inodes[in->vino()].end();
4833 ++q) {
4834 SimpleLock *lock = in->get_lock(q->first);
4835 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4836 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4837 assert(mdr->is_auth_pinned(in));
4838 if (!mdr->xlocks.count(&in->versionlock)) {
4839 assert(in->versionlock.can_xlock_local());
4840 in->versionlock.get_xlock(mdr, mdr->get_client());
4841 mdr->xlocks.insert(&in->versionlock);
4842 mdr->locks.insert(&in->versionlock);
4843 }
4844 if (lock->is_stable())
4845 in->auth_pin(lock);
4846 lock->set_state(LOCK_XLOCK);
4847 if (lock == &in->filelock)
4848 in->loner_cap = -1;
4849 lock->get_xlock(mdr, mdr->get_client());
4850 mdr->xlocks.insert(lock);
4851 mdr->locks.insert(lock);
4852 }
4853 }
4854 }
4855 // wrlock(s)?
4856 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4857 p != strong->wrlocked_inodes.end();
4858 ++p) {
4859 CInode *in = get_inode(p->first);
4860 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4861 q != p->second.end();
4862 ++q) {
4863 SimpleLock *lock = in->get_lock(q->first);
4864 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4865 r != q->second.end();
4866 ++r) {
4867 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4868 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4869 if (in->is_auth())
4870 assert(mdr->is_auth_pinned(in));
4871 lock->set_state(LOCK_MIX);
4872 if (lock == &in->filelock)
4873 in->loner_cap = -1;
4874 lock->get_wrlock(true);
4875 mdr->wrlocks.insert(lock);
4876 mdr->locks.insert(lock);
4877 }
4878 }
4879 }
4880
4881 // done?
4882 assert(rejoin_gather.count(from));
4883 rejoin_gather.erase(from);
4884 if (rejoin_gather.empty()) {
4885 rejoin_gather_finish();
4886 } else {
4887 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4888 }
4889 }
4890
4891 /* This function DOES NOT put the passed message before returning */
4892 void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4893 {
4894 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4895 mds_rank_t from = mds_rank_t(ack->get_source().num());
4896
4897 // for sending cache expire message
4898 set<CInode*> isolated_inodes;
4899 set<CInode*> refragged_inodes;
4900
4901 // dirs
4902 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4903 p != ack->strong_dirfrags.end();
4904 ++p) {
4905 // we may have had incorrect dir fragmentation; refragment based
4906 // on what the auth tells us.
4907 CDir *dir = get_dirfrag(p->first);
4908 if (!dir) {
4909 dir = get_force_dirfrag(p->first, false);
4910 if (dir)
4911 refragged_inodes.insert(dir->get_inode());
4912 }
4913 if (!dir) {
4914 CInode *diri = get_inode(p->first.ino);
4915 if (!diri) {
4916 // barebones inode; the full inode loop below will clean up.
4917 diri = new CInode(this, false);
4918 diri->inode.ino = p->first.ino;
4919 diri->inode.mode = S_IFDIR;
4920 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4921 add_inode(diri);
4922 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4923 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4924 dout(10) << " add inode " << *diri << dendl;
4925 } else {
4926 diri->inode_auth = CDIR_AUTH_DEFAULT;
4927 isolated_inodes.insert(diri);
4928 dout(10) << " unconnected dirfrag " << p->first << dendl;
4929 }
4930 }
4931 // barebones dirfrag; the full dirfrag loop below will clean up.
4932 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4933 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4934 (dir->authority() != CDIR_AUTH_UNDEF &&
4935 dir->authority().first != from))
4936 adjust_subtree_auth(dir, from);
4937 dout(10) << " add dirfrag " << *dir << dendl;
4938 }
4939
4940 dir->set_replica_nonce(p->second.nonce);
4941 dir->state_clear(CDir::STATE_REJOINING);
4942 dout(10) << " got " << *dir << dendl;
4943
4944 // dentries
4945 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4946 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4947 q != dmap.end();
4948 ++q) {
4949 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4950 if(!dn)
4951 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4952
4953 CDentry::linkage_t *dnl = dn->get_linkage();
4954
4955 assert(dn->last == q->first.snapid);
4956 if (dn->first != q->second.first) {
4957 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4958 dn->first = q->second.first;
4959 }
4960
4961 // may have bad linkage if we missed dentry link/unlink messages
4962 if (dnl->is_primary()) {
4963 CInode *in = dnl->get_inode();
4964 if (!q->second.is_primary() ||
4965 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4966 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4967 dir->unlink_inode(dn);
4968 }
4969 } else if (dnl->is_remote()) {
4970 if (!q->second.is_remote() ||
4971 q->second.remote_ino != dnl->get_remote_ino() ||
4972 q->second.remote_d_type != dnl->get_remote_d_type()) {
4973 dout(10) << " had bad linkage for " << *dn << dendl;
4974 dir->unlink_inode(dn);
4975 }
4976 } else {
4977 if (!q->second.is_null())
4978 dout(10) << " had bad linkage for " << *dn << dendl;
4979 }
4980
4981 // hmm, did we have the proper linkage here?
4982 if (dnl->is_null() && !q->second.is_null()) {
4983 if (q->second.is_remote()) {
4984 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4985 } else {
4986 CInode *in = get_inode(q->second.ino, q->first.snapid);
4987 if (!in) {
4988 // barebones inode; assume it's dir, the full inode loop below will clean up.
4989 in = new CInode(this, false, q->second.first, q->first.snapid);
4990 in->inode.ino = q->second.ino;
4991 in->inode.mode = S_IFDIR;
4992 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4993 add_inode(in);
4994 dout(10) << " add inode " << *in << dendl;
4995 } else if (in->get_parent_dn()) {
4996 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
4997 << ", unlinking " << *in << dendl;
4998 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
4999 }
5000 dn->dir->link_primary_inode(dn, in);
5001 isolated_inodes.erase(in);
5002 }
5003 }
5004
5005 dn->set_replica_nonce(q->second.nonce);
5006 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
5007 dn->state_clear(CDentry::STATE_REJOINING);
5008 dout(10) << " got " << *dn << dendl;
5009 }
5010 }
5011
5012 for (set<CInode*>::iterator p = refragged_inodes.begin();
5013 p != refragged_inodes.end();
5014 ++p) {
5015 list<CDir*> ls;
5016 (*p)->get_nested_dirfrags(ls);
5017 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5018 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5019 continue;
5020 assert((*q)->get_num_any() == 0);
5021 (*p)->close_dirfrag((*q)->get_frag());
5022 }
5023 }
5024
5025 // full dirfrags
5026 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5027 p != ack->dirfrag_bases.end();
5028 ++p) {
5029 CDir *dir = get_dirfrag(p->first);
5030 assert(dir);
5031 bufferlist::iterator q = p->second.begin();
5032 dir->_decode_base(q);
5033 dout(10) << " got dir replica " << *dir << dendl;
5034 }
5035
5036 // full inodes
5037 bufferlist::iterator p = ack->inode_base.begin();
5038 while (!p.end()) {
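// inode_base is a flat stream of (ino, last, base bufferlist) records appended back to back; consume until the iterator is exhausted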
5039 inodeno_t ino;
5040 snapid_t last;
5041 bufferlist basebl;
5042 ::decode(ino, p);
5043 ::decode(last, p);
5044 ::decode(basebl, p);
5045 CInode *in = get_inode(ino, last);
5046 assert(in);
5047 bufferlist::iterator q = basebl.begin();
5048 in->_decode_base(q);
5049 dout(10) << " got inode base " << *in << dendl;
5050 }
5051
5052 // inodes
5053 p = ack->inode_locks.begin();
5054 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5055 while (!p.end()) {
5056 inodeno_t ino;
5057 snapid_t last;
5058 __u32 nonce;
5059 bufferlist lockbl;
5060 ::decode(ino, p);
5061 ::decode(last, p);
5062 ::decode(nonce, p);
5063 ::decode(lockbl, p);
5064
5065 CInode *in = get_inode(ino, last);
5066 assert(in);
5067 in->set_replica_nonce(nonce);
5068 bufferlist::iterator q = lockbl.begin();
5069 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
5070 in->state_clear(CInode::STATE_REJOINING);
5071 dout(10) << " got inode locks " << *in << dendl;
5072 }
5073
5074 // FIXME: This can happen if the entire subtree, together with the inode the subtree root
5075 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5076 assert(isolated_inodes.empty());
5077
5078 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5079 bufferlist::iterator bp = ack->imported_caps.begin();
5080 ::decode(peer_imported, bp);
5081
5082 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5083 p != peer_imported.end();
5084 ++p) {
5085 assert(cap_exports.count(p->first));
5086 assert(cap_export_targets.count(p->first));
5087 assert(cap_export_targets[p->first] == from);
5088 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5089 q != p->second.end();
5090 ++q) {
5091 assert(cap_exports[p->first].count(q->first));
5092
5093 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5094 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5095 assert(session);
5096
5097 // mark client caps stale.
5098 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5099 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5100 mds->get_osd_epoch_barrier());
5101 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5102 (q->second.cap_id > 0 ? from : -1), 0);
5103 mds->send_message_client_counted(m, session);
5104
5105 cap_exports[p->first].erase(q->first);
5106 }
5107 assert(cap_exports[p->first].empty());
5108 }
5109
5110 // done?
5111 assert(rejoin_ack_gather.count(from));
5112 rejoin_ack_gather.erase(from);
5113 if (mds->is_rejoin()) {
5114
5115 if (rejoin_gather.empty()) {
5116 // eval unstable scatter locks after all wrlocks are rejoined.
5117 while (!rejoin_eval_locks.empty()) {
5118 SimpleLock *lock = rejoin_eval_locks.front();
5119 rejoin_eval_locks.pop_front();
5120 if (!lock->is_stable())
5121 mds->locker->eval_gather(lock);
5122 }
5123 }
5124
5125 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5126 rejoin_ack_gather.empty()) {
5127 // finally, kickstart past snap parent opens
5128 open_snap_parents();
5129 } else {
5130 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5131 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5132 }
5133 } else {
5134 // survivor.
5135 mds->queue_waiters(rejoin_waiters);
5136 }
5137 }
5138
5139 /**
5140 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5141 *
5142 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5143 * messages that clean these guys up...
5144 */
5145 void MDCache::rejoin_trim_undef_inodes()
5146 {
5147 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5148
5149 while (!rejoin_undef_inodes.empty()) {
5150 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5151 CInode *in = *p;
5152 rejoin_undef_inodes.erase(p);
5153
5154 in->clear_replica_map();
5155
5156 // close out dirfrags
5157 if (in->is_dir()) {
5158 list<CDir*> dfls;
5159 in->get_dirfrags(dfls);
5160 for (list<CDir*>::iterator p = dfls.begin();
5161 p != dfls.end();
5162 ++p) {
5163 CDir *dir = *p;
5164 dir->clear_replica_map();
5165
5166 for (CDir::map_t::iterator p = dir->items.begin();
5167 p != dir->items.end();
5168 ++p) {
5169 CDentry *dn = p->second;
5170 dn->clear_replica_map();
5171
5172 dout(10) << " trimming " << *dn << dendl;
5173 dir->remove_dentry(dn);
5174 }
5175
5176 dout(10) << " trimming " << *dir << dendl;
5177 in->close_dirfrag(dir->dirfrag().frag);
5178 }
5179 }
5180
5181 CDentry *dn = in->get_parent_dn();
5182 if (dn) {
5183 dn->clear_replica_map();
5184 dout(10) << " trimming " << *dn << dendl;
5185 dn->dir->remove_dentry(dn);
5186 } else {
5187 dout(10) << " trimming " << *in << dendl;
5188 remove_inode(in);
5189 }
5190 }
5191
5192 assert(rejoin_undef_inodes.empty());
5193 }
5194
5195 void MDCache::rejoin_gather_finish()
5196 {
5197 dout(10) << "rejoin_gather_finish" << dendl;
5198 assert(mds->is_rejoin());
5199
5200 if (open_undef_inodes_dirfrags())
5201 return;
5202
5203 if (process_imported_caps())
5204 return;
5205
5206 choose_lock_states_and_reconnect_caps();
5207
5208 identify_files_to_recover();
5209 rejoin_send_acks();
5210
5211 // signal completion of fetches, rejoin_gather_finish, etc.
5212 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5213 rejoin_ack_gather.erase(mds->get_nodeid());
5214
5215 // did we already get our acks too?
5216 if (rejoin_ack_gather.empty()) {
5217 // finally, kickstart past snap parent opens
5218 open_snap_parents();
5219 }
5220 }
5221
5222 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5223 inodeno_t ino;
5224 public:
5225 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5226 void finish(int r) override {
5227 mdcache->rejoin_open_ino_finish(ino, r);
5228 }
5229 };
5230
5231 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5232 {
5233 dout(10) << "rejoin_open_ino_finish ino " << ino << " ret " << ret << dendl;
5234
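// ret < 0: the inode could not be located, so record it as missing;
// ret == our rank: open_ino() loaded it into our cache; any other rank is the
// authority, so re-export the reconnected caps to that rank.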
5235 if (ret < 0) {
5236 cap_imports_missing.insert(ino);
5237 } else if (ret == mds->get_nodeid()) {
5238 assert(get_inode(ino));
5239 } else {
5240 auto p = cap_imports.find(ino);
5241 assert(p != cap_imports.end());
5242 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5243 assert(q->second.count(MDS_RANK_NONE));
5244 assert(q->second.size() == 1);
5245 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5246 }
5247 cap_imports.erase(p);
5248 }
5249
5250 assert(cap_imports_num_opening > 0);
5251 cap_imports_num_opening--;
5252
5253 if (cap_imports_num_opening == 0) {
5254 if (rejoin_gather.empty())
5255 rejoin_gather_finish();
5256 else if (rejoin_gather.count(mds->get_nodeid()))
5257 process_imported_caps();
5258 }
5259 }
5260
5261 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5262 public:
5263 map<client_t,entity_inst_t> client_map;
5264 map<client_t,uint64_t> sseqmap;
5265
5266 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5267 MDCacheLogContext(c), client_map(cm) {}
5268 void finish(int r) override {
5269 assert(r == 0);
5270 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5271 }
5272 };
5273
5274 void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5275 map<client_t,uint64_t>& sseqmap)
5276 {
5277 dout(10) << "rejoin_open_sessions_finish" << dendl;
5278 mds->server->finish_force_open_sessions(client_map, sseqmap);
5279 if (rejoin_gather.empty())
5280 rejoin_gather_finish();
5281 }
5282
5283 bool MDCache::process_imported_caps()
5284 {
5285 dout(10) << "process_imported_caps" << dendl;
5286
5287 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5288 CInode *in = get_inode(p->first);
5289 if (in) {
5290 assert(in->is_auth());
5291 cap_imports_missing.erase(p->first);
5292 continue;
5293 }
5294 if (cap_imports_missing.count(p->first) > 0)
5295 continue;
5296
5297 cap_imports_num_opening++;
5298 dout(10) << " opening missing ino " << p->first << dendl;
5299 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5300 }
5301
5302 if (cap_imports_num_opening > 0)
5303 return true;
5304
5305 // called by rejoin_gather_finish() ?
5306 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5307 // if sessions for imported caps are all open ?
5308 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5309 p != rejoin_client_map.end();
5310 ++p) {
5311 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5312 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5313 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5314 ESessions *le = new ESessions(pv, rejoin_client_map);
5315 mds->mdlog->start_submit_entry(le, finish);
5316 mds->mdlog->flush();
5317 rejoin_client_map.clear();
5318 return true;
5319 }
5320 }
5321 rejoin_client_map.clear();
5322
5323 // process caps that were exported by slave rename
5324 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5325 p != rejoin_slave_exports.end();
5326 ++p) {
5327 CInode *in = get_inode(p->first);
5328 assert(in);
5329 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5330 q != p->second.second.end();
5331 ++q) {
5332 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5333 assert(session);
5334
5335 Capability *cap = in->get_client_cap(q->first);
5336 if (!cap)
5337 cap = in->add_client_cap(q->first, session);
5338 cap->merge(q->second, true);
5339
5340 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5341 assert(cap->get_last_seq() == im.issue_seq);
5342 assert(cap->get_mseq() == im.mseq);
5343 cap->set_cap_id(im.cap_id);
5344 // send cap import because we assigned a new cap ID
5345 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5346 p->second.first, CEPH_CAP_FLAG_AUTH);
5347 }
5348 }
5349 rejoin_slave_exports.clear();
5350 rejoin_imported_caps.clear();
5351
5352 // process cap imports
5353 // ino -> client -> frommds -> capex
5354 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5355 CInode *in = get_inode(p->first);
5356 if (!in) {
5357 dout(10) << " still missing ino " << p->first
5358 << ", will try again after replayed client requests" << dendl;
5359 ++p;
5360 continue;
5361 }
5362 assert(in->is_auth());
5363 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5364 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5365 assert(session);
5366 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5367 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5368 add_reconnected_cap(q->first, in->ino(), r->second);
5369 if (r->first >= 0) {
5370 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5371 cap->inc_mseq();
5372 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5373
5374 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5375 im.cap_id = cap->get_cap_id();
5376 im.issue_seq = cap->get_last_seq();
5377 im.mseq = cap->get_mseq();
5378 }
5379 }
5380 }
5381 cap_imports.erase(p++); // remove and move on
5382 }
5383 } else {
5384 trim_non_auth();
5385
5386 rejoin_gather.erase(mds->get_nodeid());
5387 maybe_send_pending_rejoins();
5388
5389 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5390 rejoin_gather_finish();
5391 }
5392 return false;
5393 }
5394
5395 void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5396 {
5397 // are this realm's parents fully open?
5398 if (realm->have_past_parents_open()) {
5399 dout(10) << " have past snap parents for realm " << *realm
5400 << " on " << *realm->inode << dendl;
5401 if (reconnect) {
5402 // finish off client snaprealm reconnects?
5403 auto p = reconnected_snaprealms.find(realm->inode->ino());
5404 if (p != reconnected_snaprealms.end()) {
5405 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5406 finish_snaprealm_reconnect(q->first, realm, q->second);
5407 reconnected_snaprealms.erase(p);
5408 }
5409 }
5410 } else {
5411 if (!missing_snap_parents.count(realm->inode)) {
5412 dout(10) << " MISSING past snap parents for realm " << *realm
5413 << " on " << *realm->inode << dendl;
5414 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5415 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5416 } else {
5417 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5418 << " on " << *realm->inode << dendl;
5419 }
5420 }
5421 }
5422
5423 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5424 client_t client, snapid_t snap_follows)
5425 {
5426 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5427
5428 const set<snapid_t>& snaps = realm->get_snaps();
5429 snapid_t follows = snap_follows;
5430
5431 while (true) {
5432 CInode *in = pick_inode_snap(head_in, follows);
5433 if (in == head_in)
5434 break;
5435 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5436
5437 /* TODO: we can check the reconnected/flushing caps to find
5438 * which locks need gathering */
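// take a wrlock on each cap-related lock and remember the client in
// client_snap_caps; the locks are held in LOCK_SNAP_SYNC pending the
// client's snap flush for this snapped inode.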
5439 for (int i = 0; i < num_cinode_locks; i++) {
5440 int lockid = cinode_lock_info[i].lock;
5441 SimpleLock *lock = in->get_lock(lockid);
5442 assert(lock);
5443 in->client_snap_caps[lockid].insert(client);
5444 in->auth_pin(lock);
5445 lock->set_state(LOCK_SNAP_SYNC);
5446 lock->get_wrlock(true);
5447 }
5448
5449 for (auto p = snaps.lower_bound(in->first);
5450 p != snaps.end() && *p <= in->last;
5451 ++p) {
5452 head_in->add_need_snapflush(in, *p, client);
5453 }
5454
5455 follows = in->last;
5456 }
5457 }
5458
5459 /*
5460 * choose lock states based on reconnected caps
5461 */
5462 void MDCache::choose_lock_states_and_reconnect_caps()
5463 {
5464 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5465
5466 map<client_t,MClientSnap*> splits;
5467
5468 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator i = inode_map.begin();
5469 i != inode_map.end();
5470 ++i) {
5471 CInode *in = i->second;
5472
5473 if (in->last != CEPH_NOSNAP)
5474 continue;
5475
5476 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5477 in->mark_dirty_rstat();
5478
5479 auto p = reconnected_caps.find(in->ino());
5480
5481 int dirty_caps = 0;
5482 if (p != reconnected_caps.end()) {
5483 for (const auto &it : p->second)
5484 dirty_caps |= it.second.dirty_caps;
5485 }
5486 in->choose_lock_states(dirty_caps);
5487 dout(15) << " chose lock states on " << *in << dendl;
5488
5489 SnapRealm *realm = in->find_snaprealm();
5490
5491 check_realm_past_parents(realm, realm == in->snaprealm);
5492
5493 if (p != reconnected_caps.end()) {
5494 bool missing_snap_parent = false;
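// if a reconnected cap's snap_follows predates this inode's first snapid,
// rebuild the need_snapflush bookkeeping -- immediately if the realm's past
// parents are open, otherwise after open_snap_parents() completes.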
5495 // also, make sure client's cap is in the correct snaprealm.
5496 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5497 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5498 if (realm->have_past_parents_open()) {
5499 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5500 } else {
5501 missing_snap_parent = true;
5502 }
5503 }
5504
5505 if (q->second.realm_ino == realm->inode->ino()) {
5506 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5507 } else {
5508 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5509 << " != " << realm->inode->ino() << dendl;
5510 if (realm->have_past_parents_open()) {
5511 // ok, include in a split message _now_.
5512 prepare_realm_split(realm, q->first, in->ino(), splits);
5513 } else {
5514 // send the split later.
5515 missing_snap_parent = true;
5516 }
5517 }
5518 }
5519 if (missing_snap_parent)
5520 missing_snap_parents[realm->inode].insert(in);
5521 }
5522 }
5523
5524 send_snaps(splits);
5525 }
5526
5527 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5528 map<client_t,MClientSnap*>& splits)
5529 {
5530 MClientSnap *snap;
5531 if (splits.count(client) == 0) {
5532 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5533 snap->head.split = realm->inode->ino();
5534 realm->build_snap_trace(snap->bl);
5535
5536 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5537 p != realm->open_children.end();
5538 ++p)
5539 snap->split_realms.push_back((*p)->inode->ino());
5540
5541 } else
5542 snap = splits[client];
5543 snap->split_inos.push_back(ino);
5544 }
5545
5546 void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5547 {
5548 dout(10) << "send_snaps" << dendl;
5549
5550 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5551 p != splits.end();
5552 ++p) {
5553 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5554 if (session) {
5555 dout(10) << " client." << p->first
5556 << " split " << p->second->head.split
5557 << " inos " << p->second->split_inos
5558 << dendl;
5559 mds->send_message_client_counted(p->second, session);
5560 } else {
5561 dout(10) << " no session for client." << p->first << dendl;
5562 p->second->put();
5563 }
5564 }
5565 splits.clear();
5566 }
5567
5568
5569 /*
5570 * remove any items from logsegment open_file lists that don't have
5571 * any caps
5572 */
5573 void MDCache::clean_open_file_lists()
5574 {
5575 dout(10) << "clean_open_file_lists" << dendl;
5576
5577 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5578 p != mds->mdlog->segments.end();
5579 ++p) {
5580 LogSegment *ls = p->second;
5581
5582 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5583 while (!q.end()) {
5584 CInode *in = *q;
5585 ++q;
5586 if (in->last == CEPH_NOSNAP) {
5587 if (!in->is_any_caps_wanted()) {
5588 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5589 in->item_open_file.remove_myself();
5590 }
5591 } else {
5592 if (in->client_snap_caps.empty()) {
5593 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5594 in->item_open_file.remove_myself();
5595 }
5596 }
5597 }
5598 }
5599 }
5600
5601
5602
5603 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5604 {
5605 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5606 << " on " << *in << dendl;
5607 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5608 if (!session) {
5609 dout(10) << " no session for client." << client << dendl;
5610 return NULL;
5611 }
5612
5613 Capability *cap = in->reconnect_cap(client, icr, session);
5614
5615 if (frommds >= 0) {
5616 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5617 cap->inc_mseq();
5618 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5619 }
5620
5621 return cap;
5622 }
5623
5624 void MDCache::export_remaining_imported_caps()
5625 {
5626 dout(10) << "export_remaining_imported_caps" << dendl;
5627
5628 stringstream warn_str;
5629
5630 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5631 warn_str << " ino " << p->first << "\n";
5632 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5633 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5634 if (session) {
5635 // mark client caps stale.
5636 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5637 stale->set_cap_peer(0, 0, 0, -1, 0);
5638 mds->send_message_client_counted(stale, q->first);
5639 }
5640 }
5641
5642 mds->heartbeat_reset();
5643 }
5644
5645 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5646 p != cap_reconnect_waiters.end();
5647 ++p)
5648 mds->queue_waiters(p->second);
5649
5650 cap_imports.clear();
5651 cap_reconnect_waiters.clear();
5652
5653 if (warn_str.peek() != EOF) {
5654 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5655 mds->clog->warn(warn_str);
5656 }
5657 }
5658
5659 void MDCache::try_reconnect_cap(CInode *in, Session *session)
5660 {
5661 client_t client = session->info.get_client();
5662 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5663 if (rc) {
5664 in->reconnect_cap(client, *rc, session);
5665 dout(10) << "try_reconnect_cap client." << client
5666 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5667 << " issue " << ccap_string(rc->capinfo.issued)
5668 << " on " << *in << dendl;
5669 remove_replay_cap_reconnect(in->ino(), client);
5670
5671 if (in->is_replicated()) {
5672 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5673 } else {
5674 int dirty_caps = 0;
5675 auto p = reconnected_caps.find(in->ino());
5676 if (p != reconnected_caps.end()) {
5677 auto q = p->second.find(client);
5678 if (q != p->second.end())
5679 dirty_caps = q->second.dirty_caps;
5680 }
5681 in->choose_lock_states(dirty_caps);
5682 dout(15) << " chose lock states on " << *in << dendl;
5683 }
5684
5685 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5686 cap_reconnect_waiters.find(in->ino());
5687 if (it != cap_reconnect_waiters.end()) {
5688 mds->queue_waiters(it->second);
5689 cap_reconnect_waiters.erase(it);
5690 }
5691 }
5692 }
5693
5694
5695
5696 // -------
5697 // cap imports and delayed snap parent opens
5698
5699 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5700 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5701 int peer, int p_flags)
5702 {
5703 client_t client = session->info.inst.name.num();
5704 SnapRealm *realm = in->find_snaprealm();
5705 if (realm->have_past_parents_open()) {
5706 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5707 if (cap->get_last_seq() == 0) // reconnected cap
5708 cap->inc_last_seq();
5709 cap->set_last_issue();
5710 cap->set_last_issue_stamp(ceph_clock_now());
5711 cap->clear_new();
5712 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5713 in->ino(),
5714 realm->inode->ino(),
5715 cap->get_cap_id(), cap->get_last_seq(),
5716 cap->pending(), cap->wanted(), 0,
5717 cap->get_mseq(), mds->get_osd_epoch_barrier());
5718 in->encode_cap_message(reap, cap);
5719 realm->build_snap_trace(reap->snapbl);
5720 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5721 mds->send_message_client_counted(reap, session);
5722 } else {
5723 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5724 << cap->get_mseq() << " on " << *in << dendl;
5725 in->auth_pin(this);
5726 cap->inc_suppress();
5727 delayed_imported_caps[client].insert(in);
5728 missing_snap_parents[in].size();
5729 }
5730 }
5731
5732 void MDCache::do_delayed_cap_imports()
5733 {
5734 dout(10) << "do_delayed_cap_imports" << dendl;
5735
5736 assert(delayed_imported_caps.empty());
5737 }
5738
5739 struct C_MDC_OpenSnapParents : public MDCacheContext {
5740 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5741 void finish(int r) override {
5742 mdcache->open_snap_parents();
5743 }
5744 };
5745
5746 void MDCache::open_snap_parents()
5747 {
5748 dout(10) << "open_snap_parents" << dendl;
5749
5750 map<client_t,MClientSnap*> splits;
5751 MDSGatherBuilder gather(g_ceph_context);
5752
5753 auto p = missing_snap_parents.begin();
5754 while (p != missing_snap_parents.end()) {
5755 CInode *in = p->first;
5756 assert(in->snaprealm);
5757 if (in->snaprealm->open_parents(gather.new_sub())) {
5758 dout(10) << " past parents now open on " << *in << dendl;
5759
5760 for (CInode *child : p->second) {
5761 auto q = reconnected_caps.find(child->ino());
5762 assert(q != reconnected_caps.end());
5763 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5764 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5765 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5766 }
5767 // make sure client's cap is in the correct snaprealm.
5768 if (r->second.realm_ino != in->ino()) {
5769 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5770 }
5771 }
5772 }
5773
5774 missing_snap_parents.erase(p++);
5775
5776 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5777
5778 // finish off client snaprealm reconnects?
5779 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5780 if (q != reconnected_snaprealms.end()) {
5781 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5782 r != q->second.end();
5783 ++r)
5784 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5785 reconnected_snaprealms.erase(q);
5786 }
5787 } else {
5788 dout(10) << " opening past parents on " << *in << dendl;
5789 ++p;
5790 }
5791 }
5792
5793 send_snaps(splits);
5794
5795 if (gather.has_subs()) {
5796 dout(10) << "open_snap_parents - waiting for "
5797 << gather.num_subs_remaining() << dendl;
5798 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5799 gather.activate();
5800 } else {
5801 if (!reconnected_snaprealms.empty()) {
5802 stringstream warn_str;
5803 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5804 p != reconnected_snaprealms.end();
5805 ++p) {
5806 warn_str << " unconnected snaprealm " << p->first << "\n";
5807 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5808 q != p->second.end();
5809 ++q)
5810 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5811 }
5812 mds->clog->warn() << "open_snap_parents has:";
5813 mds->clog->warn(warn_str);
5814 }
5815 assert(rejoin_waiters.empty());
5816 assert(missing_snap_parents.empty());
5817 dout(10) << "open_snap_parents - all open" << dendl;
5818 do_delayed_cap_imports();
5819
5820 assert(rejoin_done);
5821 rejoin_done.release()->complete(0);
5822 reconnected_caps.clear();
5823 }
5824 }
5825
5826 bool MDCache::open_undef_inodes_dirfrags()
5827 {
5828 dout(10) << "open_undef_inodes_dirfrags "
5829 << rejoin_undef_inodes.size() << " inodes "
5830 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5831
5832 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5833
5834 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5835 p != rejoin_undef_inodes.end();
5836 ++p) {
5837 CInode *in = *p;
5838 assert(!in->is_base());
5839 fetch_queue.insert(in->get_parent_dir());
5840 }
5841
5842 if (fetch_queue.empty())
5843 return false;
5844
5845 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5846 for (set<CDir*>::iterator p = fetch_queue.begin();
5847 p != fetch_queue.end();
5848 ++p) {
5849 CDir *dir = *p;
5850 CInode *diri = dir->get_inode();
5851 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5852 continue;
5853 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5854 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5855 dir->fetch(gather.new_sub());
5856 }
5857 assert(gather.has_subs());
5858 gather.activate();
5859 return true;
5860 }
5861
5862 void MDCache::opened_undef_inode(CInode *in) {
5863 dout(10) << "opened_undef_inode " << *in << dendl;
5864 rejoin_undef_inodes.erase(in);
5865 if (in->is_dir()) {
5866 // FIXME: re-hash dentries if necessary
5867 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5868 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
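// the undef dirfrag was opened as the whole-directory frag; now that the
// real fragtree is known, replace it with the correct leaf fragments and
// put those back on the rejoin_undef_dirfrags list.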
5869 CDir *dir = in->get_dirfrag(frag_t());
5870 assert(dir);
5871 rejoin_undef_dirfrags.erase(dir);
5872 in->force_dirfrags();
5873 list<CDir*> ls;
5874 in->get_dirfrags(ls);
5875 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5876 rejoin_undef_dirfrags.insert(*p);
5877 }
5878 }
5879 }
5880
5881 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5882 {
5883 if (seq < realm->get_newest_seq()) {
5884 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5885 << realm->get_newest_seq()
5886 << " on " << *realm << dendl;
5887 // send an update
5888 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5889 if (session) {
5890 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5891 realm->build_snap_trace(snap->bl);
5892 mds->send_message_client_counted(snap, session);
5893 } else {
5894 dout(10) << " ...or not, no session for this client!" << dendl;
5895 }
5896 } else {
5897 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5898 << " on " << *realm << dendl;
5899 }
5900 }
5901
5902
5903
5904 void MDCache::rejoin_send_acks()
5905 {
5906 dout(7) << "rejoin_send_acks" << dendl;
5907
5908 // replicate stray
5909 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5910 p != rejoin_unlinked_inodes.end();
5911 ++p) {
5912 for (set<CInode*>::iterator q = p->second.begin();
5913 q != p->second.end();
5914 ++q) {
5915 CInode *in = *q;
5916 dout(7) << " unlinked inode " << *in << dendl;
5917 // inode expired
5918 if (!in->is_replica(p->first))
5919 continue;
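// walk up the ancestry, replicating each dentry, dirfrag and inode to the
// peer until we hit something it already replicates (or a base inode), so
// its replica of the unlinked inode stays connected to the root.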
5920 while (1) {
5921 CDentry *dn = in->get_parent_dn();
5922 if (dn->is_replica(p->first))
5923 break;
5924 dn->add_replica(p->first);
5925 CDir *dir = dn->get_dir();
5926 if (dir->is_replica(p->first))
5927 break;
5928 dir->add_replica(p->first);
5929 in = dir->get_inode();
5930 if (in->is_replica(p->first))
5931 break;
5932 if (in->is_base())
5933 break;
5934 }
5935 }
5936 }
5937 rejoin_unlinked_inodes.clear();
5938
5939 // send acks to everyone in the recovery set
5940 map<mds_rank_t,MMDSCacheRejoin*> acks;
5941 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5942 p != recovery_set.end();
5943 ++p) {
5944 if (rejoin_ack_sent.count(*p))
5945 continue;
5946 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5947 }
5948
5949 rejoin_ack_sent = recovery_set;
5950
5951 // walk subtrees
5952 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5953 p != subtrees.end();
5954 ++p) {
5955 CDir *dir = p->first;
5956 if (!dir->is_auth())
5957 continue;
5958 dout(10) << "subtree " << *dir << dendl;
5959
5960 // auth items in this subtree
5961 list<CDir*> dq;
5962 dq.push_back(dir);
5963
5964 while (!dq.empty()) {
5965 CDir *dir = dq.front();
5966 dq.pop_front();
5967
5968 // dir
5969 for (compact_map<mds_rank_t,unsigned>::iterator r = dir->replicas_begin();
5970 r != dir->replicas_end();
5971 ++r) {
5972 auto it = acks.find(r->first);
5973 if (it == acks.end())
5974 continue;
5975 it->second->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
5976 it->second->add_dirfrag_base(dir);
5977 }
5978
5979 for (CDir::map_t::iterator q = dir->items.begin();
5980 q != dir->items.end();
5981 ++q) {
5982 CDentry *dn = q->second;
5983 CDentry::linkage_t *dnl = dn->get_linkage();
5984
5985 // inode
5986 CInode *in = NULL;
5987 if (dnl->is_primary())
5988 in = dnl->get_inode();
5989
5990 // dentry
5991 for (compact_map<mds_rank_t,unsigned>::iterator r = dn->replicas_begin();
5992 r != dn->replicas_end();
5993 ++r) {
5994 auto it = acks.find(r->first);
5995 if (it == acks.end())
5996 continue;
5997 it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
5998 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
5999 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6000 dnl->is_remote() ? dnl->get_remote_d_type():0,
6001 ++r->second,
6002 dn->lock.get_replica_state());
6003 // peer missed MDentryLink message?
6004 if (in && !in->is_replica(r->first))
6005 in->add_replica(r->first);
6006 }
6007
6008 if (!in)
6009 continue;
6010
6011 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
6012 r != in->replicas_end();
6013 ++r) {
6014 auto it = acks.find(r->first);
6015 if (it == acks.end())
6016 continue;
6017 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6018 bufferlist bl;
6019 in->_encode_locks_state_for_rejoin(bl, r->first);
6020 it->second->add_inode_locks(in, ++r->second, bl);
6021 }
6022
6023 // subdirs in this subtree?
6024 in->get_nested_dirfrags(dq);
6025 }
6026 }
6027 }
6028
6029 // base inodes too
6030 if (root && root->is_auth())
6031 for (compact_map<mds_rank_t,unsigned>::iterator r = root->replicas_begin();
6032 r != root->replicas_end();
6033 ++r) {
6034 auto it = acks.find(r->first);
6035 if (it == acks.end())
6036 continue;
6037 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
6038 bufferlist bl;
6039 root->_encode_locks_state_for_rejoin(bl, r->first);
6040 it->second->add_inode_locks(root, ++r->second, bl);
6041 }
6042 if (myin)
6043 for (compact_map<mds_rank_t,unsigned>::iterator r = myin->replicas_begin();
6044 r != myin->replicas_end();
6045 ++r) {
6046 auto it = acks.find(r->first);
6047 if (it == acks.end())
6048 continue;
6049 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
6050 bufferlist bl;
6051 myin->_encode_locks_state_for_rejoin(bl, r->first);
6052 it->second->add_inode_locks(myin, ++r->second, bl);
6053 }
6054
6055 // include inode base for any inodes whose scatterlocks may have updated
6056 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6057 p != rejoin_potential_updated_scatterlocks.end();
6058 ++p) {
6059 CInode *in = *p;
6060 for (compact_map<mds_rank_t,unsigned>::iterator r = in->replicas_begin();
6061 r != in->replicas_end();
6062 ++r) {
6063 auto it = acks.find(r->first);
6064 if (it == acks.end())
6065 continue;
6066 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6067 }
6068 }
6069
6070 // send acks
6071 for (auto p = acks.begin(); p != acks.end(); ++p) {
6072 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6073 mds->send_message_mds(p->second, p->first);
6074 }
6075
6076 rejoin_imported_caps.clear();
6077 }
6078
6079
6080 void MDCache::reissue_all_caps()
6081 {
6082 dout(10) << "reissue_all_caps" << dendl;
6083
6084 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6085 p != inode_map.end();
6086 ++p) {
6087 CInode *in = p->second;
6088 if (in->is_head() && in->is_any_caps()) {
6089 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6090 mds->locker->issue_caps(in);
6091 }
6092 }
6093 }
6094
6095
6096 // ===============================================================================
6097
6098 struct C_MDC_QueuedCow : public MDCacheContext {
6099 CInode *in;
6100 MutationRef mut;
6101 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6102 MDCacheContext(mdc), in(i), mut(m) {}
6103 void finish(int r) override {
6104 mdcache->_queued_file_recover_cow(in, mut);
6105 }
6106 };
6107
6108
6109 void MDCache::queue_file_recover(CInode *in)
6110 {
6111 dout(10) << "queue_file_recover " << *in << dendl;
6112 assert(in->is_auth());
6113
6114 // cow?
6115 /*
6116 SnapRealm *realm = in->find_snaprealm();
6117 set<snapid_t> s = realm->get_snaps();
6118 while (!s.empty() && *s.begin() < in->first)
6119 s.erase(s.begin());
6120 while (!s.empty() && *s.rbegin() > in->last)
6121 s.erase(*s.rbegin());
6122 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6123 if (s.size() > 1) {
6124 inode_t *pi = in->project_inode();
6125 pi->version = in->pre_dirty();
6126
6127 auto mut(std::make_shared<MutationImpl>());
6128 mut->ls = mds->mdlog->get_current_segment();
6129 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6130 mds->mdlog->start_entry(le);
6131 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6132
6133 s.erase(*s.begin());
6134 while (!s.empty()) {
6135 snapid_t snapid = *s.begin();
6136 CInode *cow_inode = 0;
6137 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6138 assert(cow_inode);
6139 recovery_queue.enqueue(cow_inode);
6140 s.erase(*s.begin());
6141 }
6142
6143 in->parent->first = in->first;
6144 le->metablob.add_primary_dentry(in->parent, in, true);
6145 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6146 mds->mdlog->flush();
6147 }
6148 */
6149
6150 recovery_queue.enqueue(in);
6151 }
6152
6153 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6154 {
6155 in->pop_and_dirty_projected_inode(mut->ls);
6156 mut->apply();
6157 mds->locker->drop_locks(mut.get());
6158 mut->cleanup();
6159 }
6160
6161
6162 /*
6163 * called after recovery to recover file sizes for previously opened (for write)
6164 * files. that is, those where max_size > size.
6165 */
6166 void MDCache::identify_files_to_recover()
6167 {
6168 dout(10) << "identify_files_to_recover" << dendl;
6169 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6170 p != inode_map.end();
6171 ++p) {
6172 CInode *in = p->second;
6173 if (!in->is_auth())
6174 continue;
6175
6176 if (in->last != CEPH_NOSNAP)
6177 continue;
6178
6179 // Only normal files need file size recovery
6180 if (!in->is_file()) {
6181 continue;
6182 }
6183
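// recover only if some client had a writeable range on this file but no
// longer holds a cap for it; the actual file size then has to be recovered
// from the data objects.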
6184 bool recover = false;
6185 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6186 p != in->inode.client_ranges.end();
6187 ++p) {
6188 Capability *cap = in->get_client_cap(p->first);
6189 if (!cap) {
6190 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6191 recover = true;
6192 break;
6193 }
6194 }
6195
6196 if (recover) {
6197 if (in->filelock.is_stable()) {
6198 in->auth_pin(&in->filelock);
6199 } else {
6200 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6201 }
6202 in->filelock.set_state(LOCK_PRE_SCAN);
6203 rejoin_recover_q.push_back(in);
6204 } else {
6205 rejoin_check_q.push_back(in);
6206 }
6207 }
6208 }
6209
6210 void MDCache::start_files_to_recover()
6211 {
6212 for (CInode *in : rejoin_check_q) {
6213 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6214 mds->locker->issue_caps(in);
6215 mds->locker->check_inode_max_size(in);
6216 }
6217 rejoin_check_q.clear();
6218 for (CInode *in : rejoin_recover_q) {
6219 mds->locker->file_recover(&in->filelock);
6220 }
6221 if (!rejoin_recover_q.empty()) {
6222 rejoin_recover_q.clear();
6223 do_file_recover();
6224 }
6225 }
6226
6227 void MDCache::do_file_recover()
6228 {
6229 recovery_queue.advance();
6230 }
6231
6232 // ===============================================================================
6233
6234
6235 // ----------------------------
6236 // truncate
6237
6238 class C_MDC_RetryTruncate : public MDCacheContext {
6239 CInode *in;
6240 LogSegment *ls;
6241 public:
6242 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6243 MDCacheContext(c), in(i), ls(l) {}
6244 void finish(int r) override {
6245 mdcache->_truncate_inode(in, ls);
6246 }
6247 };
6248
6249 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6250 {
6251 inode_t *pi = in->get_projected_inode();
6252 dout(10) << "truncate_inode "
6253 << pi->truncate_from << " -> " << pi->truncate_size
6254 << " on " << *in
6255 << dendl;
6256
6257 ls->truncating_inodes.insert(in);
6258 in->get(CInode::PIN_TRUNCATING);
6259 in->auth_pin(this);
6260
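// if clients still have dirty snapped data buffered for this inode, defer
// the truncate until the snap flush completes; the xlock_snap_sync callback
// retries _truncate_inode() afterwards.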
6261 if (!in->client_need_snapflush.empty() &&
6262 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6263 assert(in->filelock.is_xlocked());
6264 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6265 mds->locker->issue_caps(in);
6266 return;
6267 }
6268
6269 _truncate_inode(in, ls);
6270 }
6271
6272 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6273 CInode *in;
6274 LogSegment *ls;
6275 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6276 MDCacheIOContext(c), in(i), ls(l) {}
6277 void finish(int r) override {
6278 assert(r == 0 || r == -ENOENT);
6279 mdcache->truncate_inode_finish(in, ls);
6280 }
6281 };
6282
6283 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6284 {
6285 inode_t *pi = &in->inode;
6286 dout(10) << "_truncate_inode "
6287 << pi->truncate_from << " -> " << pi->truncate_size
6288 << " on " << *in << dendl;
6289
6290 assert(pi->is_truncating());
6291 assert(pi->truncate_size < (1ULL << 63));
6292 assert(pi->truncate_from < (1ULL << 63));
6293 assert(pi->truncate_size < pi->truncate_from);
6294
6295
6296 SnapRealm *realm = in->find_snaprealm();
6297 SnapContext nullsnap;
6298 const SnapContext *snapc;
6299 if (realm) {
6300 dout(10) << " realm " << *realm << dendl;
6301 snapc = &realm->get_snap_context();
6302 } else {
6303 dout(10) << " NO realm, using null context" << dendl;
6304 snapc = &nullsnap;
6305 assert(in->last == CEPH_NOSNAP);
6306 }
6307 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6308 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6309 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6310 pi->truncate_seq, ceph::real_time::min(), 0,
6311 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6312 mds->finisher));
6313 }
6314
6315 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6316 CInode *in;
6317 MutationRef mut;
6318 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6319 MDCacheLogContext(m), in(i), mut(mu) {}
6320 void finish(int r) override {
6321 mdcache->truncate_inode_logged(in, mut);
6322 }
6323 };
6324
6325 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6326 {
6327 dout(10) << "truncate_inode_finish " << *in << dendl;
6328
6329 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6330 assert(p != ls->truncating_inodes.end());
6331 ls->truncating_inodes.erase(p);
6332
6333 // update
6334 inode_t *pi = in->project_inode();
6335 pi->version = in->pre_dirty();
6336 pi->truncate_from = 0;
6337 pi->truncate_pending--;
6338
6339 MutationRef mut(new MutationImpl());
6340 mut->ls = mds->mdlog->get_current_segment();
6341 mut->add_projected_inode(in);
6342
6343 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6344 mds->mdlog->start_entry(le);
6345 CDentry *dn = in->get_projected_parent_dn();
6346 le->metablob.add_dir_context(dn->get_dir());
6347 le->metablob.add_primary_dentry(dn, in, true);
6348 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6349
6350 journal_dirty_inode(mut.get(), &le->metablob, in);
6351 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6352
6353 // flush immediately if there are readers/writers waiting
6354 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6355 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6356 mds->mdlog->flush();
6357 }
6358
6359 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6360 {
6361 dout(10) << "truncate_inode_logged " << *in << dendl;
6362 mut->apply();
6363 mds->locker->drop_locks(mut.get());
6364 mut->cleanup();
6365
6366 in->put(CInode::PIN_TRUNCATING);
6367 in->auth_unpin(this);
6368
6369 list<MDSInternalContextBase*> waiters;
6370 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6371 mds->queue_waiters(waiters);
6372 }
6373
6374
6375 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6376 {
6377 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6378 << ls->seq << "/" << ls->offset << dendl;
6379 ls->truncating_inodes.insert(in);
6380 in->get(CInode::PIN_TRUNCATING);
6381 }
6382
6383 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6384 {
6385 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6386 << ls->seq << "/" << ls->offset << dendl;
6387 // if we have the logseg the truncate started in, it must be in our list.
6388 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6389 assert(p != ls->truncating_inodes.end());
6390 ls->truncating_inodes.erase(p);
6391 in->put(CInode::PIN_TRUNCATING);
6392 }
6393
6394 void MDCache::start_recovered_truncates()
6395 {
6396 dout(10) << "start_recovered_truncates" << dendl;
6397 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6398 p != mds->mdlog->segments.end();
6399 ++p) {
6400 LogSegment *ls = p->second;
6401 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6402 q != ls->truncating_inodes.end();
6403 ++q) {
6404 CInode *in = *q;
6405 in->auth_pin(this);
6406
6407 if (!in->client_need_snapflush.empty() &&
6408 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6409 assert(in->filelock.is_stable());
6410 in->filelock.set_state(LOCK_XLOCKDONE);
6411 in->auth_pin(&in->filelock);
6412 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6413 // start_files_to_recover will revoke caps
6414 continue;
6415 }
6416 _truncate_inode(in, ls);
6417 }
6418 }
6419 }
6420
6421
6422
6423
6424
6425
6426 // ================================================================================
6427 // cache trimming
6428
6429
6430 /*
6431 * note: only called while MDS is active or stopping... NOT during recovery.
6432 * however, we may expire a replica whose authority is recovering.
6433 *
6434 */
6435 bool MDCache::trim(int max, int count)
6436 {
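// interpret the arguments: count > 0 asks to trim roughly `count` entries
// from the current LRU size, while a negative max falls back to the
// configured mds_cache_size.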
6437 // trim LRU
6438 if (count > 0) {
6439 max = lru.lru_get_size() - count;
6440 if (max <= 0)
6441 max = 1;
6442 } else if (max < 0) {
6443 max = g_conf->mds_cache_size;
6444 if (max <= 0)
6445 return false;
6446 }
6447 dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size()
6448 << "/" << bottom_lru.lru_get_size() << dendl;
6449
6450 // process delayed eval_stray()
6451 stray_manager.advance_delayed();
6452
6453 map<mds_rank_t, MCacheExpire*> expiremap;
6454 bool is_standby_replay = mds->is_standby_replay();
6455 int unexpirable = 0;
6456 list<CDentry*> unexpirables;
6457
6458 for (;;) {
6459 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6460 if (!dn)
6461 break;
6462 if (trim_dentry(dn, expiremap)) {
6463 unexpirables.push_back(dn);
6464 ++unexpirable;
6465 }
6466 }
6467
6468 for(auto dn : unexpirables)
6469 bottom_lru.lru_insert_mid(dn);
6470 unexpirables.clear();
6471
6472 // trim dentries from the LRU: only enough to satisfy `max`.
6473 while (lru.lru_get_size() + unexpirable > (unsigned)max) {
6474 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6475 if (!dn) {
6476 break;
6477 }
6478 if ((is_standby_replay && dn->get_linkage()->inode &&
6479 dn->get_linkage()->inode->item_open_file.is_on_list()) ||
6480 trim_dentry(dn, expiremap)) {
6481 unexpirables.push_back(dn);
6482 ++unexpirable;
6483 }
6484 }
6485 for(auto dn : unexpirables)
6486 lru.lru_insert_mid(dn);
6487 unexpirables.clear();
6488
6489 // trim non-auth, non-bound subtrees
6490 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6491 p != subtrees.end();) {
6492 CDir *dir = p->first;
6493 ++p;
6494 CInode *diri = dir->get_inode();
6495 if (dir->is_auth()) {
6496 if (!diri->is_auth() && !diri->is_base() &&
6497 dir->get_num_head_items() == 0) {
6498 if (dir->state_test(CDir::STATE_EXPORTING) ||
6499 dir->is_freezing() || dir->is_frozen())
6500 continue;
6501
6502 migrator->export_empty_import(dir);
6503 }
6504 } else {
6505 if (!diri->is_auth()) {
6506 if (dir->get_num_ref() > 1) // only subtree pin
6507 continue;
6508 list<CDir*> ls;
6509 diri->get_subtree_dirfrags(ls);
6510 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6511 continue;
6512
6513 // don't trim subtree root if its auth MDS is recovering.
6514 // This simplifies the cache rejoin code.
6515 if (dir->is_subtree_root() &&
6516 rejoin_ack_gather.count(dir->get_dir_auth().first))
6517 continue;
6518 trim_dirfrag(dir, 0, expiremap);
6519 }
6520 }
6521 }
6522
6523 // trim root?
6524 if (max == 0 && root) {
6525 list<CDir*> ls;
6526 root->get_dirfrags(ls);
6527 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6528 CDir *dir = *p;
6529 if (dir->get_num_ref() == 1) // subtree pin
6530 trim_dirfrag(dir, 0, expiremap);
6531 }
6532 if (root->get_num_ref() == 0)
6533 trim_inode(0, root, 0, expiremap);
6534 }
6535
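// for each other rank that is stopping, try to expire our replicas under its
// ~mdsN dir and trim the mdsdir itself, presumably so our cache does not
// keep the stopping rank's subtree pinned.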
6536 std::set<mds_rank_t> stopping;
6537 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6538 stopping.erase(mds->get_nodeid());
6539 for (auto rank : stopping) {
6540 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6541 if (!mdsdir_in)
6542 continue;
6543
6544 if (expiremap.count(rank) == 0) {
6545 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6546 }
6547
6548 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;
6549
6550 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6551 if (!aborted) {
6552 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6553 list<CDir*> ls;
6554 mdsdir_in->get_dirfrags(ls);
6555 for (auto dir : ls) {
6556 if (dir->get_num_ref() == 1) // subtree pin
6557 trim_dirfrag(dir, dir, expiremap);
6558 }
6559 if (mdsdir_in->get_num_ref() == 0)
6560 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6561 } else {
6562 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6563 }
6564 }
6565
6566 // Other rank's base inodes (when I'm stopping)
6567 if (max == 0) {
6568 for (set<CInode*>::iterator p = base_inodes.begin();
6569 p != base_inodes.end(); ++p) {
6570 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6571 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6572 if ((*p)->get_num_ref() == 0) {
6573 trim_inode(NULL, *p, NULL, expiremap);
6574 }
6575 }
6576 }
6577 }
6578
6579 // send any expire messages
6580 send_expire_messages(expiremap);
6581
6582 return true;
6583 }
6584
6585 void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6586 {
6587 // send expires
6588 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6589 it != expiremap.end();
6590 ++it) {
6591 if (mds->is_cluster_degraded() &&
6592 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6593 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6594 rejoin_sent.count(it->first) == 0))) {
6595 it->second->put();
6596 continue;
6597 }
6598 dout(7) << "sending cache_expire to " << it->first << dendl;
6599 mds->send_message_mds(it->second, it->first);
6600 }
6601 }
6602
6603
6604 bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6605 {
6606 dout(12) << "trim_dentry " << *dn << dendl;
6607
6608 CDentry::linkage_t *dnl = dn->get_linkage();
6609
6610 CDir *dir = dn->get_dir();
6611 assert(dir);
6612
6613 CDir *con = get_subtree_root(dir);
6614 if (con)
6615 dout(12) << " in container " << *con << dendl;
6616 else {
6617 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6618 assert(dn->is_auth());
6619 }
6620
6621 // If a replica dentry is not readable, we are likely to receive an
6622 // MDentryLink/MDentryUnlink message soon (it's possible we first
6623 // receive an MDentryUnlink message, then an MDentryLink message).
6624 // An MDentryLink message only replicates an inode, so we should
6625 // avoid trimming that inode's parent dentry, because unconnected
6626 // replicas are problematic for subtree migration.
6627 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6628 !dn->get_dir()->get_inode()->is_stray())
6629 return true;
6630
6631 // adjust the dir state
6632 // NOTE: we can safely remove a clean, null dentry without affecting
6633 // directory completeness.
6634 // (check this _before_ we unlink the inode, below!)
6635 bool clear_complete = false;
6636 if (!(dnl->is_null() && dn->is_clean()))
6637 clear_complete = true;
6638
6639 // unlink the dentry
6640 if (dnl->is_remote()) {
6641 // just unlink.
6642 dir->unlink_inode(dn, false);
6643 } else if (dnl->is_primary()) {
6644 // expire the inode, too.
6645 CInode *in = dnl->get_inode();
6646 assert(in);
6647 if (trim_inode(dn, in, con, expiremap))
6648 return true; // purging stray instead of trimming
6649 } else {
6650 assert(dnl->is_null());
6651 }
6652
6653 if (!dn->is_auth()) {
6654 // notify dentry authority.
6655 mds_authority_t auth = dn->authority();
6656
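// a migrating dentry can have two authorities; send an expire to each of
// them (never to ourselves), and send nothing while we are importing the
// containing subtree.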
6657 for (int p=0; p<2; p++) {
6658 mds_rank_t a = auth.first;
6659 if (p) a = auth.second;
6660 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6661 if (mds->get_nodeid() == auth.second &&
6662 con->is_importing()) break; // don't send any expire while importing.
6663 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6664
6665 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6666 assert(a != mds->get_nodeid());
6667 if (expiremap.count(a) == 0)
6668 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6669 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
6670 }
6671 }
6672
6673 // remove dentry
6674 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6675 dir->add_to_bloom(dn);
6676 dir->remove_dentry(dn);
6677
6678 if (clear_complete)
6679 dir->state_clear(CDir::STATE_COMPLETE);
6680
6681 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6682 return false;
6683 }
6684
6685
6686 void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6687 {
6688 dout(15) << "trim_dirfrag " << *dir << dendl;
6689
6690 if (dir->is_subtree_root()) {
6691 assert(!dir->is_auth() ||
6692 (!dir->is_replicated() && dir->inode->is_base()));
6693 remove_subtree(dir); // remove from subtree map
6694 }
6695 assert(dir->get_num_ref() == 0);
6696
6697 CInode *in = dir->get_inode();
6698
6699 if (!dir->is_auth()) {
6700 mds_authority_t auth = dir->authority();
6701
6702 // was this an auth delegation? (if so, slightly modified container)
6703 dirfrag_t condf;
6704 if (dir->is_subtree_root()) {
6705 dout(12) << " subtree root, container is " << *dir << dendl;
6706 con = dir;
6707 condf = dir->dirfrag();
6708 } else {
6709 condf = con->dirfrag();
6710 }
6711
6712 for (int p=0; p<2; p++) {
6713 mds_rank_t a = auth.first;
6714 if (p) a = auth.second;
6715 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6716 if (mds->get_nodeid() == auth.second &&
6717 con->is_importing()) break; // don't send any expire while importing.
6718 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6719
6720 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6721 assert(a != mds->get_nodeid());
6722 if (expiremap.count(a) == 0)
6723 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6724 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6725 }
6726 }
6727
6728 in->close_dirfrag(dir->dirfrag().frag);
6729 }
6730
6731 /**
6732 * Try trimming an inode from the cache
6733 *
6734 * @return true if the inode is still in cache, else false if it was trimmed
6735 */
6736 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6737 {
6738 dout(15) << "trim_inode " << *in << dendl;
6739 assert(in->get_num_ref() == 0);
6740
6741 if (in->is_dir()) {
6742 // If replica inode's dirfragtreelock is not readable, it's likely
6743 // some dirfrags of the inode are being fragmented and we will receive
6744 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6745 // dirfrags, so we should avoid trimming these dirfrags' parent inode,
6746 // because unconnected replicas are problematic for subtree
6747 // migration.
6748 //
6749 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6750 return true;
6751
6752 // DIR
6753 list<CDir*> dfls;
6754 in->get_dirfrags(dfls);
6755 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6756 CDir *dir = *p;
6757 assert(!dir->is_subtree_root());
6758 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6759 }
6760 }
6761
6762 // INODE
6763 if (in->is_auth()) {
6764 // eval stray after closing dirfrags
6765 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6766 maybe_eval_stray(in);
6767 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6768 return true;
6769 }
6770 } else {
6771 mds_authority_t auth = in->authority();
6772
6773 dirfrag_t df;
6774 if (con)
6775 df = con->dirfrag();
6776 else
6777 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6778
6779 for (int p=0; p<2; p++) {
6780 mds_rank_t a = auth.first;
6781 if (p) a = auth.second;
6782 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6783 if (con && mds->get_nodeid() == auth.second &&
6784 con->is_importing()) break; // don't send any expire while importing.
6785 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6786
6787 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6788 assert(a != mds->get_nodeid());
6789 if (expiremap.count(a) == 0)
6790 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6791 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6792 }
6793 }
6794
6795 /*
6796 if (in->is_auth()) {
6797 if (in->hack_accessed)
6798 mds->logger->inc("outt");
6799 else {
6800 mds->logger->inc("outut");
6801 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6802 }
6803 }
6804 */
6805
6806 // unlink
6807 if (dn)
6808 dn->get_dir()->unlink_inode(dn, false);
6809 remove_inode(in);
6810 return false;
6811 }
6812
6813
6814 /**
6815 * trim_non_auth - remove any non-auth items from our cache
6816 *
6817 * this reduces the amount of non-auth metadata in our cache, reducing the
6818 * load incurred by the rejoin phase.
6819 *
6820 * the only non-auth items that remain are those that are needed to
6821 * attach our own subtrees to the root.
6822 *
6823 * when we are done, all dentries will be in the top bit of the lru.
6824 *
6825 * why we have to do this:
6826 *  we may not have accurate linkage for non-auth items, which means we may not
6827 *  know which subtree an item falls into, and cannot be sure to declare it to the
6828 *  correct authority.
6829 */
6830 void MDCache::trim_non_auth()
6831 {
6832 dout(7) << "trim_non_auth" << dendl;
6833
6834 // temporarily pin all subtree roots
6835 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6836 p != subtrees.end();
6837 ++p)
6838 p->first->get(CDir::PIN_SUBTREETEMP);
6839
6840 list<CDentry*> auth_list;
6841
6842 // trim non-auth items from the lru
6843 for (;;) {
6844 CDentry *dn = NULL;
6845 if (bottom_lru.lru_get_size() > 0)
6846 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6847 if (!dn && lru.lru_get_size() > 0)
6848 dn = static_cast<CDentry*>(lru.lru_expire());
6849 if (!dn)
6850 break;
6851
6852 CDentry::linkage_t *dnl = dn->get_linkage();
6853
6854 if (dn->is_auth()) {
6855 // add back into lru (at the top)
6856 auth_list.push_back(dn);
6857
6858 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6859 dn->unlink_remote(dnl);
6860 } else {
6861 // non-auth. expire.
6862 CDir *dir = dn->get_dir();
6863 assert(dir);
6864
6865 // unlink the dentry
6866 dout(10) << " removing " << *dn << dendl;
6867 if (dnl->is_remote()) {
6868 dir->unlink_inode(dn, false);
6869 }
6870 else if (dnl->is_primary()) {
6871 CInode *in = dnl->get_inode();
6872 dout(10) << " removing " << *in << dendl;
6873 list<CDir*> ls;
6874 in->get_dirfrags(ls);
6875 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6876 CDir *subdir = *p;
6877 assert(!subdir->is_subtree_root());
6878 in->close_dirfrag(subdir->dirfrag().frag);
6879 }
6880 dir->unlink_inode(dn, false);
6881 remove_inode(in);
6882 }
6883 else {
6884 assert(dnl->is_null());
6885 }
6886
6887 assert(!dir->has_bloom());
6888 dir->remove_dentry(dn);
6889 // adjust the dir state
6890 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6891 // close empty non-auth dirfrag
6892 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6893 dir->inode->close_dirfrag(dir->get_frag());
6894 }
6895 }
6896
6897 for (auto dn : auth_list) {
6898 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6899 bottom_lru.lru_insert_mid(dn);
6900 else
6901 lru.lru_insert_top(dn);
6902 }
6903
6904 // move everything in the pintail to the top bit of the lru.
6905 lru.lru_touch_entire_pintail();
6906
6907 // unpin all subtrees
6908 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6909 p != subtrees.end();
6910 ++p)
6911 p->first->put(CDir::PIN_SUBTREETEMP);
6912
6913 if (lru.lru_get_size() == 0 &&
6914 bottom_lru.lru_get_size() == 0) {
6915 // root, stray, etc.?
6916 ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6917 while (p != inode_map.end()) {
6918 ceph::unordered_map<vinodeno_t,CInode*>::iterator next = p;
6919 ++next;
6920 CInode *in = p->second;
6921 if (!in->is_auth()) {
6922 list<CDir*> ls;
6923 in->get_dirfrags(ls);
6924 for (list<CDir*>::iterator p = ls.begin();
6925 p != ls.end();
6926 ++p) {
6927 dout(10) << " removing " << **p << dendl;
6928 assert((*p)->get_num_ref() == 1); // SUBTREE
6929 remove_subtree((*p));
6930 in->close_dirfrag((*p)->dirfrag().frag);
6931 }
6932 dout(10) << " removing " << *in << dendl;
6933 assert(!in->get_parent_dn());
6934 assert(in->get_num_ref() == 0);
6935 remove_inode(in);
6936 }
6937 p = next;
6938 }
6939 }
6940
6941 show_subtrees();
6942 }
6943
6944 /**
6945 * Recursively trim the subtree rooted at directory to remove all
6946 * CInodes/CDentries/CDirs that aren't links to remote MDSes, or ancestors
6947 * of those links. This is used to clear invalid data out of the cache.
6948 * Note that it doesn't clear the passed-in directory, since that's not
6949 * always safe.
6950 */
6951 bool MDCache::trim_non_auth_subtree(CDir *dir)
6952 {
6953 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6954
6955 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6956
6957 CDir::map_t::iterator j = dir->begin();
6958 CDir::map_t::iterator i = j;
6959 while (j != dir->end()) {
6960 i = j++;
6961 CDentry *dn = i->second;
6962 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6963 CDentry::linkage_t *dnl = dn->get_linkage();
6964 if (dnl->is_primary()) { // check for subdirectories, etc
6965 CInode *in = dnl->get_inode();
6966 bool keep_inode = false;
6967 if (in->is_dir()) {
6968 list<CDir*> subdirs;
6969 in->get_dirfrags(subdirs);
6970 for (list<CDir*>::iterator subdir = subdirs.begin();
6971 subdir != subdirs.end();
6972 ++subdir) {
6973 if ((*subdir)->is_subtree_root()) {
6974 keep_inode = true;
6975 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
6976 } else {
6977 if (trim_non_auth_subtree(*subdir))
6978 keep_inode = true;
6979 else {
6980 in->close_dirfrag((*subdir)->get_frag());
6981 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
6982 }
6983 }
6984 }
6985
6986 }
6987 if (!keep_inode) { // remove it!
6988 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
6989 dir->unlink_inode(dn, false);
6990 remove_inode(in);
6991 assert(!dir->has_bloom());
6992 dir->remove_dentry(dn);
6993 } else {
6994 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
6995 dn->state_clear(CDentry::STATE_AUTH);
6996 in->state_clear(CInode::STATE_AUTH);
6997 }
6998 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
6999 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
7000 } else { // just remove it
7001 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7002 if (dnl->is_remote())
7003 dir->unlink_inode(dn, false);
7004 dir->remove_dentry(dn);
7005 }
7006 }
7007 dir->state_clear(CDir::STATE_AUTH);
7008 /**
7009 * We've now checked all our children and deleted those that need it.
7010 * Now return to caller, and tell them if *we're* a keeper.
7011 */
7012 return keep_dir || dir->get_num_any();
7013 }
7014
7015 /*
7016 * during replay, when we determine a subtree is no longer ours, we
7017 * try to trim it from our cache. because subtrees must be connected
7018 * to the root, the fact that we can trim this tree may mean that our
7019 * children or parents can also be trimmed.
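 * e.g. (illustrative): if the subtree rooted at /a/b turns out to be non-auth
 * and can be emptied, its bound at /a/b/c may now be a lone empty non-auth
 * subtree that can be closed, and the parent subtree containing /a may in turn
 * become trimmable; the code below walks in both directions.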
7020 */
7021 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7022 {
7023 dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
7024
7025 // can we now trim child subtrees?
7026 set<CDir*> bounds;
7027 get_subtree_bounds(dir, bounds);
7028 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7029 CDir *bd = *p;
7030 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7031 bd->get_num_any() == 0 && // and empty
7032 can_trim_non_auth_dirfrag(bd)) {
7033 CInode *bi = bd->get_inode();
7034 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7035 remove_subtree(bd);
7036 bd->mark_clean();
7037 bi->close_dirfrag(bd->get_frag());
7038 }
7039 }
7040
7041 if (trim_non_auth_subtree(dir)) {
7042 // keep
7043 try_subtree_merge(dir);
7044 } else {
7045 // can we trim this subtree (and possibly our ancestors) too?
7046 while (true) {
7047 CInode *diri = dir->get_inode();
7048 if (diri->is_base()) {
7049 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7050 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7051 remove_subtree(dir);
7052 dir->mark_clean();
7053 diri->close_dirfrag(dir->get_frag());
7054
7055 dout(10) << " removing " << *diri << dendl;
7056 assert(!diri->get_parent_dn());
7057 assert(diri->get_num_ref() == 0);
7058 remove_inode(diri);
7059 }
7060 break;
7061 }
7062
7063 CDir *psub = get_subtree_root(diri->get_parent_dir());
7064 dout(10) << " parent subtree is " << *psub << dendl;
7065 if (psub->get_dir_auth().first == mds->get_nodeid())
7066 break; // we are auth, keep.
7067
7068 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7069 remove_subtree(dir);
7070 dir->mark_clean();
7071 diri->close_dirfrag(dir->get_frag());
7072
7073 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7074 if (trim_non_auth_subtree(psub))
7075 break;
7076 dir = psub;
7077 }
7078 }
7079
7080 show_subtrees();
7081 }
7082
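// Editorial note: standby_trim_segment() is used in standby-replay when a log
// segment is expired by the active rank; a standby never writes anything back,
// so the segment's dirty lists are simply drained and the objects marked clean
// instead of being committed.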
7083 void MDCache::standby_trim_segment(LogSegment *ls)
7084 {
7085 ls->new_dirfrags.clear_list();
7086 ls->open_files.clear_list();
7087
7088 while (!ls->dirty_dirfrags.empty()) {
7089 CDir *dir = ls->dirty_dirfrags.front();
7090 dir->mark_clean();
7091 }
7092 while (!ls->dirty_inodes.empty()) {
7093 CInode *in = ls->dirty_inodes.front();
7094 in->mark_clean();
7095 }
7096 while (!ls->dirty_dentries.empty()) {
7097 CDentry *dn = ls->dirty_dentries.front();
7098 dn->mark_clean();
7099 }
7100 while (!ls->dirty_parent_inodes.empty()) {
7101 CInode *in = ls->dirty_parent_inodes.front();
7102 in->clear_dirty_parent();
7103 }
7104 while (!ls->dirty_dirfrag_dir.empty()) {
7105 CInode *in = ls->dirty_dirfrag_dir.front();
7106 in->filelock.remove_dirty();
7107 }
7108 while (!ls->dirty_dirfrag_nest.empty()) {
7109 CInode *in = ls->dirty_dirfrag_nest.front();
7110 in->nestlock.remove_dirty();
7111 }
7112 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7113 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7114 in->dirfragtreelock.remove_dirty();
7115 }
7116 }
7117
7118 /* This function DOES put the passed message before returning */
7119 void MDCache::handle_cache_expire(MCacheExpire *m)
7120 {
7121 mds_rank_t from = mds_rank_t(m->get_from());
7122
7123 dout(7) << "cache_expire from mds." << from << dendl;
7124
7125 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7126 m->put();
7127 return;
7128 }
7129
7130 set<SimpleLock *> gather_locks;
7131 // loop over realms
7132 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7133 p != m->realms.end();
7134 ++p) {
7135 // check container?
7136 if (p->first.ino > 0) {
7137 CInode *expired_inode = get_inode(p->first.ino);
7138 assert(expired_inode); // we had better have this.
7139 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7140 assert(parent_dir);
7141
7142 int export_state = -1;
7143 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7144 export_state = migrator->get_export_state(parent_dir);
7145 assert(export_state >= 0);
7146 }
7147
7148 if (!parent_dir->is_auth() ||
7149 (export_state != -1 &&
7150 ((export_state == Migrator::EXPORT_WARNING &&
7151 migrator->export_has_warned(parent_dir,from)) ||
7152 export_state == Migrator::EXPORT_EXPORTING ||
7153 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7154 (export_state == Migrator::EXPORT_NOTIFYING &&
7155 !migrator->export_has_notified(parent_dir,from))))) {
7156
7157 // not auth.
7158 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7159 assert(parent_dir->is_frozen_tree_root());
7160
7161 // make a message container
7162 if (delayed_expire[parent_dir].count(from) == 0)
7163 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7164
7165 // merge these expires into it
7166 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7167 continue;
7168 }
7169 assert(export_state <= Migrator::EXPORT_PREPPING ||
7170 (export_state == Migrator::EXPORT_WARNING &&
7171 !migrator->export_has_warned(parent_dir, from)));
7172
7173 dout(7) << "expires for " << *parent_dir << dendl;
7174 } else {
7175 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7176 }
7177
7178 // INODES
7179 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7180 it != p->second.inodes.end();
7181 ++it) {
7182 CInode *in = get_inode(it->first);
7183 unsigned nonce = it->second;
7184
7185 if (!in) {
7186 dout(0) << " inode expire on " << it->first << " from " << from
7187 << ", don't have it" << dendl;
7188 assert(in);
7189 }
7190 assert(in->is_auth());
7191 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7192
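// Editorial note on the nonce check below: the replica nonce is bumped each
// time the object is re-replicated to 'from', so an expire that was already in
// flight for an older replica instance carries a stale nonce and must be
// ignored rather than dropping the newly created replica.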
7193 // check nonce
7194 if (nonce == in->get_replica_nonce(from)) {
7195 // remove from our cached_by
7196 dout(7) << " inode expire on " << *in << " from mds." << from
7197 << " cached_by was " << in->get_replicas() << dendl;
7198 inode_remove_replica(in, from, false, gather_locks);
7199 }
7200 else {
7201 // this is an old nonce, ignore expire.
7202 dout(7) << " inode expire on " << *in << " from mds." << from
7203 << " with old nonce " << nonce
7204 << " (current " << in->get_replica_nonce(from) << "), dropping"
7205 << dendl;
7206 }
7207 }
7208
7209 // DIRS
7210 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7211 it != p->second.dirs.end();
7212 ++it) {
7213 CDir *dir = get_dirfrag(it->first);
7214 unsigned nonce = it->second;
7215
7216 if (!dir) {
7217 CInode *diri = get_inode(it->first.ino);
7218 if (diri) {
7219 if (mds->is_rejoin() &&
7220 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7221 !diri->is_replica(from)) {
7222 list<CDir*> ls;
7223 diri->get_nested_dirfrags(ls);
7224 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7225 << " while rejoining, inode isn't replicated" << dendl;
7226 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7227 dir = *q;
7228 if (dir->is_replica(from)) {
7229 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7230 dir->remove_replica(from);
7231 }
7232 }
7233 continue;
7234 }
7235 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7236 if (other) {
7237 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7238 << " have " << *other << ", mismatched frags, dropping" << dendl;
7239 continue;
7240 }
7241 }
7242 dout(0) << " dir expire on " << it->first << " from " << from
7243 << ", don't have it" << dendl;
7244 assert(dir);
7245 }
7246 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7247
7248 assert(dir->is_auth());
7249
7250 // check nonce
7251 if (nonce == dir->get_replica_nonce(from)) {
7252 // remove from our cached_by
7253 dout(7) << " dir expire on " << *dir << " from mds." << from
7254 << " replicas was " << dir->replica_map << dendl;
7255 dir->remove_replica(from);
7256 }
7257 else {
7258 // this is an old nonce, ignore expire.
7259 dout(7) << " dir expire on " << *dir << " from mds." << from
7260 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7261 << "), dropping" << dendl;
7262 }
7263 }
7264
7265 // DENTRIES
7266 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7267 pd != p->second.dentries.end();
7268 ++pd) {
7269 dout(10) << " dn expires in dir " << pd->first << dendl;
7270 CInode *diri = get_inode(pd->first.ino);
7271 assert(diri);
7272 CDir *dir = diri->get_dirfrag(pd->first.frag);
7273
7274 if (!dir) {
7275 dout(0) << " dn expires on " << pd->first << " from " << from
7276 << ", must have refragmented" << dendl;
7277 } else {
7278 assert(dir->is_auth());
7279 }
7280
7281 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7282 p != pd->second.end();
7283 ++p) {
7284 unsigned nonce = p->second;
7285 CDentry *dn;
7286
7287 if (dir) {
7288 dn = dir->lookup(p->first.first, p->first.second);
7289 } else {
7290 // which dirfrag for this dentry?
7291 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7292 assert(dir);
7293 assert(dir->is_auth());
7294 dn = dir->lookup(p->first.first, p->first.second);
7295 }
7296
7297 if (!dn) {
7298 if (dir)
7299 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7300 else
7301 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7302 }
7303 assert(dn);
7304
7305 if (nonce == dn->get_replica_nonce(from)) {
7306 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7307 dentry_remove_replica(dn, from, gather_locks);
7308 }
7309 else {
7310 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7311 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7312 << "), dropping" << dendl;
7313 }
7314 }
7315 }
7316 }
7317
7318 // done
7319 m->put();
7320
7321 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7322 if (!(*p)->is_stable())
7323 mds->locker->eval_gather(*p);
7324 }
7325 }
7326
7327 void MDCache::process_delayed_expire(CDir *dir)
7328 {
7329 dout(7) << "process_delayed_expire on " << *dir << dendl;
7330 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7331 p != delayed_expire[dir].end();
7332 ++p)
7333 handle_cache_expire(p->second);
7334 delayed_expire.erase(dir);
7335 }
7336
7337 void MDCache::discard_delayed_expire(CDir *dir)
7338 {
7339 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7340 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7341 p != delayed_expire[dir].end();
7342 ++p)
7343 p->second->put();
7344 delayed_expire.erase(dir);
7345 }
7346
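// Editorial note: 'gather_locks' collects locks whose gather set may have
// shrunk because the departing replica was removed from them;
// handle_cache_expire() re-evaluates any that remain unstable once the whole
// expire message has been processed (see the eval_gather loop near its end).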
7347 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7348 set<SimpleLock *>& gather_locks)
7349 {
7350 in->remove_replica(from);
7351 in->mds_caps_wanted.erase(from);
7352
7353 // note: this code calls _eval more often than it needs to!
7354 // fix lock
7355 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7356 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7357 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7358 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7359 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7360 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7361
7362 // If 'rejoin' is true and a scatter lock is in a LOCK_MIX_* state, don't
7363 // remove the recovering mds from the lock's gathering list, because it may
7364 // hold rejoined wrlocks.
7365 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7366 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7367 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7368 }
7369
7370 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7371 {
7372 dn->remove_replica(from);
7373
7374 // fix lock
7375 if (dn->lock.remove_replica(from))
7376 gather_locks.insert(&dn->lock);
7377
7378 // Replicated strays might now be eligible for purge
7379 CDentry::linkage_t *dnl = dn->get_linkage();
7380 if (dnl->is_primary()) {
7381 maybe_eval_stray(dnl->get_inode());
7382 }
7383 }
7384
7385 void MDCache::trim_client_leases()
7386 {
7387 utime_t now = ceph_clock_now();
7388
7389 dout(10) << "trim_client_leases" << dendl;
7390
7391 for (int pool=0; pool<client_lease_pools; pool++) {
7392 int before = client_leases[pool].size();
7393 if (client_leases[pool].empty())
7394 continue;
7395
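// Editorial note: the loop below stops at the first lease whose ttl is still
// in the future, which relies on each pool's list being kept in expiry order.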
7396 while (!client_leases[pool].empty()) {
7397 ClientLease *r = client_leases[pool].front();
7398 if (r->ttl > now) break;
7399 CDentry *dn = static_cast<CDentry*>(r->parent);
7400 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7401 dn->remove_client_lease(r, mds->locker);
7402 }
7403 int after = client_leases[pool].size();
7404 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7405 << (before-after) << " leases, " << after << " left" << dendl;
7406 }
7407 }
7408
7409
7410 void MDCache::check_memory_usage()
7411 {
7412 static MemoryModel mm(g_ceph_context);
7413 static MemoryModel::snap last;
7414 mm.sample(&last);
7415 static MemoryModel::snap baseline = last;
7416
7417 // check client caps
7418 assert(CInode::count() == inode_map.size());
7419 float caps_per_inode = 0.0;
7420 if (CInode::count())
7421 caps_per_inode = (float)Capability::count() / (float)CInode::count();
7422
7423 dout(2) << "check_memory_usage"
7424 << " total " << last.get_total()
7425 << ", rss " << last.get_rss()
7426 << ", heap " << last.get_heap()
7427 << ", baseline " << baseline.get_heap()
7428 << ", buffers " << (buffer::get_total_alloc() >> 10)
7429 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7430 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7431 << dendl;
7432
7433 mds->mlogger->set(l_mdm_rss, last.get_rss());
7434 mds->mlogger->set(l_mdm_heap, last.get_heap());
7435
7436 if (num_inodes_with_caps > g_conf->mds_cache_size) {
7437 float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
7438 if (ratio < 1.0) {
7439 last_recall_state = ceph_clock_now();
7440 mds->server->recall_client_state(ratio);
7441 }
7442 }
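// Illustrative arithmetic (editorial, hypothetical numbers): with
// mds_cache_size = 100000 and num_inodes_with_caps = 150000, the ratio is
// 100000 * 0.9 / 150000 = 0.6, i.e. clients are asked to shrink to roughly 60%
// of their current state; if the ratio works out to >= 1.0 no recall is sent.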
7443
7444 // If the cache size had exceeded its limit, but we're back in bounds
7445 // now, free any unused pool memory so that our memory usage isn't
7446 // permanently bloated.
7447 if (exceeded_size_limit
7448 && CInode::count() <=
7449 g_conf->mds_cache_size * g_conf->mds_health_cache_threshold) {
7450 // Only do this once we are back in bounds: otherwise the releases would
7451 // slow down whatever process caused us to exceed bounds to begin with
7452 if (ceph_using_tcmalloc()) {
7453 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7454 << dendl;
7455 ceph_heap_release_free_memory();
7456 }
7457 exceeded_size_limit = false;
7458 }
7459 }
7460
7461
7462
7463 // =========================================================================================
7464 // shutdown
7465
7466 class C_MDC_ShutdownCheck : public MDCacheContext {
7467 public:
7468 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7469 void finish(int) override {
7470 mdcache->shutdown_check();
7471 }
7472 };
7473
7474 void MDCache::shutdown_check()
7475 {
7476 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7477
7478 // cache
7479 char old_val[32] = { 0 };
7480 char *o = old_val;
7481 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7482 g_conf->set_val("debug_mds", "10");
7483 g_conf->apply_changes(NULL);
7484 show_cache();
7485 g_conf->set_val("debug_mds", old_val);
7486 g_conf->apply_changes(NULL);
7487 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7488
7489 // this
7490 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7491 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7492
7493
7494 if (mds->objecter->is_active()) {
7495 dout(0) << "objecter still active" << dendl;
7496 mds->objecter->dump_active();
7497 }
7498 }
7499
7500
7501 void MDCache::shutdown_start()
7502 {
7503 dout(2) << "shutdown_start" << dendl;
7504
7505 if (g_conf->mds_shutdown_check)
7506 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7507
7508 // g_conf->debug_mds = 10;
7509 }
7510
7511
7512
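/*
 * Editorial summary of the shutdown sequence implemented below: export any
 * remaining strays to mds.0, drop the stray dir pins, trim the cache, hand
 * auth subtrees back to the rank that owns their parent (falling back to
 * mds.0), terminate client sessions, trim and then cap the journal, write the
 * now-empty journal head, wait for the objecter to drain, and finally tear
 * down the mydir subtree.  Each pass returns false until every step has
 * completed.
 */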
7513 bool MDCache::shutdown_pass()
7514 {
7515 dout(7) << "shutdown_pass" << dendl;
7516
7517 if (mds->is_stopped()) {
7518 dout(7) << " already shut down" << dendl;
7519 show_cache();
7520 show_subtrees();
7521 return true;
7522 }
7523
7524 // empty stray dir
7525 if (!shutdown_export_strays()) {
7526 dout(7) << "waiting for strays to migrate" << dendl;
7527 return false;
7528 }
7529
7530 // drop our reference to our stray dir inode
7531 for (int i = 0; i < NUM_STRAY; ++i) {
7532 if (strays[i] &&
7533 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7534 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7535 strays[i]->put(CInode::PIN_STRAY);
7536 strays[i]->put_stickydirs();
7537 }
7538 }
7539
7540 // trim cache
7541 trim(0);
7542 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7543
7544 // SUBTREES
7545 int num_auth_subtree = 0;
7546 if (!subtrees.empty() &&
7547 mds->get_nodeid() != 0 &&
7548 migrator->get_export_queue_size() == 0) {
7549 dout(7) << "looking for subtrees to export to mds0" << dendl;
7550 list<CDir*> ls;
7551 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7552 it != subtrees.end();
7553 ++it) {
7554 CDir *dir = it->first;
7555 if (dir->get_inode()->is_mdsdir())
7556 continue;
7557 if (dir->is_auth()) {
7558 num_auth_subtree++;
7559 if (dir->is_frozen() ||
7560 dir->is_freezing() ||
7561 dir->is_ambiguous_dir_auth() ||
7562 dir->state_test(CDir::STATE_EXPORTING))
7563 continue;
7564 ls.push_back(dir);
7565 }
7566 }
7567 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7568 CDir *dir = *p;
7569 mds_rank_t dest = dir->get_inode()->authority().first;
7570 if (dest > 0 && !mds->mdsmap->is_active(dest))
7571 dest = 0;
7572 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7573 migrator->export_dir_nicely(dir, dest);
7574 }
7575 }
7576
7577 if (num_auth_subtree > 0) {
7578 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7579 show_subtrees();
7580 return false;
7581 }
7582
7583 // close out any sessions (and open files!) before we try to trim the log, etc.
7584 if (mds->sessionmap.have_unclosed_sessions()) {
7585 if (!mds->server->terminating_sessions)
7586 mds->server->terminate_sessions();
7587 return false;
7588 }
7589
7590 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7591 if (mydir && !mydir->is_subtree_root())
7592 mydir = NULL;
7593
7594 // subtrees map not empty yet?
7595 if (subtrees.size() > (mydir ? 1 : 0)) {
7596 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7597 show_subtrees();
7598 migrator->show_importing();
7599 migrator->show_exporting();
7600 if (!migrator->is_importing() && !migrator->is_exporting())
7601 show_cache();
7602 return false;
7603 }
7604 assert(!migrator->is_exporting());
7605 assert(!migrator->is_importing());
7606
7607
7608 // flush what we can from the log
7609 mds->mdlog->trim(0);
7610 if (mds->mdlog->get_num_segments() > 1) {
7611 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7612 return false;
7613 }
7614
7615 // (only do this once!)
7616 if (!mds->mdlog->is_capped()) {
7617 dout(7) << "capping the log" << dendl;
7618 mds->mdlog->cap();
7619 mds->mdlog->trim();
7620 }
7621
7622 if (!mds->mdlog->empty()) {
7623 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7624 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7625 return false;
7626 }
7627
7628 if (!did_shutdown_log_cap) {
7629 // flush journal header
7630 dout(7) << "writing header for (now-empty) journal" << dendl;
7631 assert(mds->mdlog->empty());
7632 mds->mdlog->write_head(0);
7633 // NOTE: filer active checker below will block us until this completes.
7634 did_shutdown_log_cap = true;
7635 return false;
7636 }
7637
7638 // filer active?
7639 if (mds->objecter->is_active()) {
7640 dout(7) << "objecter still active" << dendl;
7641 mds->objecter->dump_active();
7642 return false;
7643 }
7644
7645 // trim what we can from the cache
7646 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7647 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7648 show_cache();
7649 //dump();
7650 return false;
7651 }
7652
7653 // make mydir subtree go away
7654 if (mydir) {
7655 if (mydir->get_num_ref() > 1) { // subtree pin
7656 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7657 show_cache();
7658 return false;
7659 }
7660
7661 remove_subtree(mydir);
7662 myin->close_dirfrag(mydir->get_frag());
7663 }
7664 assert(subtrees.empty());
7665
7666 if (myin)
7667 remove_inode(myin);
7668
7669 // done!
7670 dout(2) << "shutdown done." << dendl;
7671 return true;
7672 }
7673
7674 bool MDCache::shutdown_export_strays()
7675 {
7676 if (mds->get_nodeid() == 0)
7677 return true;
7678
7679 dout(10) << "shutdown_export_strays" << dendl;
7680
7681 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7682
7683 bool done = true;
7684
7685 list<CDir*> dfs;
7686 for (int i = 0; i < NUM_STRAY; ++i) {
7687 if (!strays[i]) {
7688 continue;
7689 }
7690 strays[i]->get_dirfrags(dfs);
7691 }
7692
7693 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7694 dfs_i != dfs.end(); ++dfs_i)
7695 {
7696 CDir *dir = *dfs_i;
7697
7698 if (!dir->is_complete()) {
7699 dir->fetch(0);
7700 done = false;
7701 if (!mds0_active)
7702 break;
7703 }
7704
7705 for (CDir::map_t::iterator p = dir->items.begin();
7706 p != dir->items.end();
7707 ++p) {
7708 CDentry *dn = p->second;
7709 CDentry::linkage_t *dnl = dn->get_linkage();
7710 if (dnl->is_null())
7711 continue;
7712 done = false;
7713 if (!mds0_active)
7714 break;
7715
7716 if (dn->state_test(CDentry::STATE_PURGING)) {
7717 // Don't try to migrate anything that is actually
7718 // being purged right now
7719 continue;
7720 }
7721
7722 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7723 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7724 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7725 } else {
7726 dout(10) << "already exporting " << *dn << dendl;
7727 }
7728 }
7729 }
7730
7731 return done;
7732 }
7733
7734 // ========= messaging ==============
7735
7736 /* This function DOES put the passed message before returning */
7737 void MDCache::dispatch(Message *m)
7738 {
7739 switch (m->get_type()) {
7740
7741 // RESOLVE
7742 case MSG_MDS_RESOLVE:
7743 handle_resolve(static_cast<MMDSResolve*>(m));
7744 break;
7745 case MSG_MDS_RESOLVEACK:
7746 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7747 break;
7748
7749 // REJOIN
7750 case MSG_MDS_CACHEREJOIN:
7751 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7752 break;
7753
7754 case MSG_MDS_DISCOVER:
7755 handle_discover(static_cast<MDiscover*>(m));
7756 break;
7757 case MSG_MDS_DISCOVERREPLY:
7758 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7759 break;
7760
7761 case MSG_MDS_DIRUPDATE:
7762 handle_dir_update(static_cast<MDirUpdate*>(m));
7763 break;
7764
7765 case MSG_MDS_CACHEEXPIRE:
7766 handle_cache_expire(static_cast<MCacheExpire*>(m));
7767 break;
7768
7769 case MSG_MDS_DENTRYLINK:
7770 handle_dentry_link(static_cast<MDentryLink*>(m));
7771 break;
7772 case MSG_MDS_DENTRYUNLINK:
7773 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7774 break;
7775
7776 case MSG_MDS_FRAGMENTNOTIFY:
7777 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7778 break;
7779
7780 case MSG_MDS_FINDINO:
7781 handle_find_ino(static_cast<MMDSFindIno *>(m));
7782 break;
7783 case MSG_MDS_FINDINOREPLY:
7784 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7785 break;
7786
7787 case MSG_MDS_OPENINO:
7788 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7789 break;
7790 case MSG_MDS_OPENINOREPLY:
7791 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7792 break;
7793
7794 default:
7795 derr << "cache unknown message " << m->get_type() << dendl;
7796 assert(0 == "cache unknown message");
7797 }
7798 }
7799
7800 MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7801 {
7802 if (mdr) {
7803 dout(20) << "_get_waiter retryrequest" << dendl;
7804 return new C_MDS_RetryRequest(this, mdr);
7805 } else if (req) {
7806 dout(20) << "_get_waiter retrymessage" << dendl;
7807 return new C_MDS_RetryMessage(mds, req);
7808 } else {
7809 return fin;
7810 }
7811 }
7812
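/*
 * Editorial note on path_traverse()'s return convention (summarized from the
 * code below):
 *    0  success: *pin / *pdnvec describe the resolved target
 *    1  blocked: a waiter built by _get_waiter() was queued and the caller
 *       will be retried later
 *    2  the request was forwarded to another rank (MDS_TRAVERSE_FORWARD only)
 *   <0  error, e.g. -ENOENT, -ENOTDIR, -ESTALE, -EIO
 *
 * Illustrative caller sketch (hypothetical; 'refpath' stands for a
 * caller-supplied filepath):
 *
 *   vector<CDentry*> trace;
 *   CInode *in = NULL;
 *   int r = path_traverse(mdr, NULL, NULL, refpath, &trace, &in,
 *                         MDS_TRAVERSE_FORWARD);
 *   if (r > 0)
 *     return;            // waiting (1) or forwarded (2); nothing to do now
 *   if (r < 0) {
 *     // reply to the request with error r
 *     return;
 *   }
 *   // r == 0: 'in' is the target inode, 'trace' the dentry chain leading to it
 */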
7813 int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7814 const filepath& path, // what
7815 vector<CDentry*> *pdnvec, // result
7816 CInode **pin,
7817 int onfail)
7818 {
7819 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7820 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7821 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7822
7823 assert(mdr || req || fin);
7824 assert(!forward || mdr || req); // forward requires a request
7825
7826 snapid_t snapid = CEPH_NOSNAP;
7827 if (mdr)
7828 mdr->snapid = snapid;
7829
7830 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7831
7832 if (mds->logger) mds->logger->inc(l_mds_traverse);
7833
7834 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7835 CInode *cur = get_inode(path.get_ino());
7836 if (cur == NULL) {
7837 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7838 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7839 else {
7840 //ceph_abort(); // hrm.. broken
7841 return -ESTALE;
7842 }
7843 return 1;
7844 }
7845 if (cur->state_test(CInode::STATE_PURGING))
7846 return -ESTALE;
7847
7848 // make sure the snaprealm is open...
7849 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7850 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7851 return 1;
7852 }
7853
7854 // start trace
7855 if (pdnvec)
7856 pdnvec->clear();
7857 if (pin)
7858 *pin = cur;
7859
7860 unsigned depth = 0;
7861 while (depth < path.depth()) {
7862 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7863 << "' snapid " << snapid << dendl;
7864
7865 if (!cur->is_dir()) {
7866 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7867 return -ENOTDIR;
7868 }
7869
7870 // walk into snapdir?
7871 if (path[depth].length() == 0) {
7872 dout(10) << "traverse: snapdir" << dendl;
7873 if (!mdr)
7874 return -EINVAL;
7875 snapid = CEPH_SNAPDIR;
7876 mdr->snapid = snapid;
7877 depth++;
7878 continue;
7879 }
7880 // walk thru snapdir?
7881 if (snapid == CEPH_SNAPDIR) {
7882 if (!mdr)
7883 return -EINVAL;
7884 SnapRealm *realm = cur->find_snaprealm();
7885 snapid = realm->resolve_snapname(path[depth], cur->ino());
7886 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7887 if (!snapid)
7888 return -ENOENT;
7889 mdr->snapid = snapid;
7890 depth++;
7891 continue;
7892 }
7893
7894 // open dir
7895 frag_t fg = cur->pick_dirfrag(path[depth]);
7896 CDir *curdir = cur->get_dirfrag(fg);
7897 if (!curdir) {
7898 if (cur->is_auth()) {
7899 // parent dir frozen_dir?
7900 if (cur->is_frozen()) {
7901 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7902 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7903 return 1;
7904 }
7905 curdir = cur->get_or_open_dirfrag(this, fg);
7906 } else {
7907 // discover?
7908 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7909 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7910 null_okay);
7911 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7912 return 1;
7913 }
7914 }
7915 assert(curdir);
7916
7917 #ifdef MDS_VERIFY_FRAGSTAT
7918 if (curdir->is_complete())
7919 curdir->verify_fragstat();
7920 #endif
7921
7922 // frozen?
7923 /*
7924 if (curdir->is_frozen()) {
7925 // doh!
7926 // FIXME: traverse is allowed?
7927 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7928 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7929 if (onfinish) delete onfinish;
7930 return 1;
7931 }
7932 */
7933
7934 // Before doing dirfrag->dn lookup, compare with DamageTable's
7935 // record of which dentries were unreadable
7936 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7937 dout(4) << "traverse: stopped lookup at damaged dentry "
7938 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7939 return -EIO;
7940 }
7941
7942 // dentry
7943 CDentry *dn = curdir->lookup(path[depth], snapid);
7944 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7945
7946 // null and last_bit and xlocked by me?
7947 if (dnl && dnl->is_null() && null_okay) {
7948 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7949 if (pdnvec)
7950 pdnvec->push_back(dn);
7951 if (pin)
7952 *pin = 0;
7953 break; // done!
7954 }
7955
7956 if (dnl &&
7957 dn->lock.is_xlocked() &&
7958 dn->lock.get_xlock_by() != mdr &&
7959 !dn->lock.can_read(client) &&
7960 (dnl->is_null() || forward)) {
7961 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7962 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7963 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7964 mds->mdlog->flush();
7965 return 1;
7966 }
7967
7968 // can we conclude ENOENT?
7969 if (dnl && dnl->is_null()) {
7970 if (dn->lock.can_read(client) ||
7971 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
7972 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
7973 if (pdnvec) {
7974 if (depth == path.depth() - 1)
7975 pdnvec->push_back(dn);
7976 else
7977 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
7978 }
7979 return -ENOENT;
7980 } else {
7981 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
7982 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7983 return 1;
7984 }
7985 }
7986
7987 if (dnl && !dnl->is_null()) {
7988 CInode *in = dnl->get_inode();
7989
7990 // do we have inode?
7991 if (!in) {
7992 assert(dnl->is_remote());
7993 // do i have it?
7994 in = get_inode(dnl->get_remote_ino());
7995 if (in) {
7996 dout(7) << "linking in remote in " << *in << dendl;
7997 dn->link_remote(dnl, in);
7998 } else {
7999 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8000 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8001 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8002 dout(4) << "traverse: remote dentry points to damaged ino "
8003 << *dn << dendl;
8004 return -EIO;
8005 }
8006 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8007 (null_okay && depth == path.depth() - 1));
8008 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8009 return 1;
8010 }
8011 }
8012
8013 cur = in;
8014 // make sure the snaprealm is open...
8015 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8016 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8017 return 1;
8018 }
8019
8020 // add to trace, continue.
8021 touch_inode(cur);
8022 if (pdnvec)
8023 pdnvec->push_back(dn);
8024 if (pin)
8025 *pin = cur;
8026 depth++;
8027 continue;
8028 }
8029
8030
8031 // MISS. dentry doesn't exist.
8032 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8033
8034 if (curdir->is_auth()) {
8035 // dentry is mine.
8036 if (curdir->is_complete() ||
8037 (snapid == CEPH_NOSNAP &&
8038 curdir->has_bloom() &&
8039 !curdir->is_in_bloom(path[depth]))){
8040 // file not found
8041 if (pdnvec) {
8042 // instantiate a null dn?
8043 if (depth < path.depth()-1){
8044 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8045 dn = NULL;
8046 } else if (dn) {
8047 ceph_abort(); // should have fallen out in ->is_null() check above
8048 } else if (curdir->is_frozen()) {
8049 dout(20) << " not adding null to frozen dir " << dendl;
8050 } else if (snapid < CEPH_MAXSNAP) {
8051 dout(20) << " not adding null for snapid " << snapid << dendl;
8052 } else {
8053 // create a null dentry
8054 dn = curdir->add_null_dentry(path[depth]);
8055 dout(20) << " added null " << *dn << dendl;
8056 }
8057 if (dn)
8058 pdnvec->push_back(dn);
8059 else
8060 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8061 }
8062 return -ENOENT;
8063 } else {
8064
8065 // Check DamageTable for missing fragments before trying to fetch
8066 // this
8067 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8068 dout(4) << "traverse: damaged dirfrag " << *curdir
8069 << ", blocking fetch" << dendl;
8070 return -EIO;
8071 }
8072
8073 // directory isn't complete; reload
8074 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8075 touch_inode(cur);
8076 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8077 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8078 return 1;
8079 }
8080 } else {
8081 // dirfrag/dentry is not mine.
8082 mds_authority_t dauth = curdir->authority();
8083
8084 if (forward &&
8085 snapid && mdr && mdr->client_request &&
8086 (int)depth < mdr->client_request->get_num_fwd()) {
8087 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8088 << " < fwd " << mdr->client_request->get_num_fwd()
8089 << ", discovering instead of forwarding" << dendl;
8090 discover = true;
8091 }
8092
8093 if ((discover || null_okay)) {
8094 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8095 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8096 null_okay);
8097 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8098 return 1;
8099 }
8100 if (forward) {
8101 // forward
8102 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8103
8104 if (curdir->is_ambiguous_auth()) {
8105 // wait
8106 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8107 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8108 return 1;
8109 }
8110
8111 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8112
8113 if (mdr)
8114 request_forward(mdr, dauth.first);
8115 else
8116 mds->forward_message_mds(req, dauth.first);
8117
8118 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8119 assert(fin == NULL);
8120 return 2;
8121 }
8122 }
8123
8124 ceph_abort(); // i shouldn't get here
8125 }
8126
8127 // success.
8128 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8129 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8130 if (mdr)
8131 assert(mdr->snapid == snapid);
8132 return 0;
8133 }
8134
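// Editorial note: unlike path_traverse() above, cache_traverse() is a purely
// in-memory lookup: it never blocks, fetches or forwards, and returns NULL as
// soon as any path component is not already present in the cache.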
8135 CInode *MDCache::cache_traverse(const filepath& fp)
8136 {
8137 dout(10) << "cache_traverse " << fp << dendl;
8138
8139 CInode *in;
8140 if (fp.get_ino())
8141 in = get_inode(fp.get_ino());
8142 else
8143 in = root;
8144 if (!in)
8145 return NULL;
8146
8147 for (unsigned i = 0; i < fp.depth(); i++) {
8148 const string& dname = fp[i];
8149 frag_t fg = in->pick_dirfrag(dname);
8150 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8151 CDir *curdir = in->get_dirfrag(fg);
8152 if (!curdir)
8153 return NULL;
8154 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8155 if (!dn)
8156 return NULL;
8157 in = dn->get_linkage()->get_inode();
8158 if (!in)
8159 return NULL;
8160 }
8161 dout(10) << " got " << *in << dendl;
8162 return in;
8163 }
8164
8165
8166 /**
8167 * open_remote_dirfrag -- open up a remote dirfrag
8168 *
8169 * @param diri base inode
8170 * @param approxfg approximate fragment.
8171 * @param fin completion callback
8172 */
8173 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8174 {
8175 dout(10) << "open_remote_dir on " << *diri << dendl;
8176
8177 assert(diri->is_dir());
8178 assert(!diri->is_auth());
8179 assert(diri->get_dirfrag(approxfg) == 0);
8180
8181 mds_rank_t auth = diri->authority().first;
8182
8183 if (!mds->is_cluster_degraded() ||
8184 mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
8185 discover_dir_frag(diri, approxfg, fin);
8186 } else {
8187 // mds is down or recovering. forge a replica!
8188 forge_replica_dir(diri, approxfg, auth);
8189 if (fin)
8190 mds->queue_waiter(fin);
8191 }
8192 }
8193
8194
8195 /**
8196 * get_dentry_inode - get or open inode
8197 *
8198 * @param dn the dentry
8199 * @param mdr current request
8200 *
8201 * will return inode for primary, or link up/open up remote link's inode as necessary.
8202 * If it's not available right now, puts mdr on wait list and returns null.
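 *
 * Illustrative use (editorial, hypothetical caller):
 *   CInode *in = get_dentry_inode(dn, mdr, true);
 *   if (!in)
 *     return;  // mdr was queued via open_remote_dentry(); retried when it opens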
8203 */
8204 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8205 {
8206 CDentry::linkage_t *dnl;
8207 if (projected)
8208 dnl = dn->get_projected_linkage();
8209 else
8210 dnl = dn->get_linkage();
8211
8212 assert(!dnl->is_null());
8213
8214 if (dnl->is_primary())
8215 return dnl->inode;
8216
8217 assert(dnl->is_remote());
8218 CInode *in = get_inode(dnl->get_remote_ino());
8219 if (in) {
8220 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8221 dn->link_remote(dnl, in);
8222 return in;
8223 } else {
8224 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8225 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8226 return 0;
8227 }
8228 }
8229
8230 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8231 CDentry *dn;
8232 inodeno_t ino;
8233 MDSInternalContextBase *onfinish;
8234 bool want_xlocked;
8235 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8236 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8237 dn->get(MDSCacheObject::PIN_PTRWAITER);
8238 }
8239 void finish(int r) override {
8240 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8241 dn->put(MDSCacheObject::PIN_PTRWAITER);
8242 }
8243 };
8244
8245 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8246 {
8247 dout(10) << "open_remote_dentry " << *dn << dendl;
8248 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8249 inodeno_t ino = dnl->get_remote_ino();
8250 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8251 open_ino(ino, pool,
8252 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8253 }
8254
8255 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8256 bool want_xlocked, int r)
8257 {
8258 if (r < 0) {
8259 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8260 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8261 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8262 dn->state_set(CDentry::STATE_BADREMOTEINO);
8263
8264 std::string path;
8265 CDir *dir = dn->get_dir();
8266 if (dir) {
8267 dir->get_inode()->make_path_string(path);
8268 path = path + "/" + dn->get_name();
8269 }
8270
8271 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8272 if (fatal) {
8273 mds->damaged();
8274 ceph_abort(); // unreachable, damaged() respawns us
8275 }
8276 } else {
8277 r = 0;
8278 }
8279 }
8280 fin->complete(r < 0 ? r : 0);
8281 }
8282
8283
8284 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8285 {
8286 // empty trace if we're a base inode
8287 if (in->is_base())
8288 return;
8289
8290 CInode *parent = in->get_parent_inode();
8291 assert(parent);
8292 make_trace(trace, parent);
8293
8294 CDentry *dn = in->get_parent_dn();
8295 dout(15) << "make_trace adding " << *dn << dendl;
8296 trace.push_back(dn);
8297 }
8298
8299
8300 // -------------------------------------------------------------------------------
8301 // Open inode by inode number
8302
8303 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8304 inodeno_t ino;
8305 public:
8306 bufferlist bl;
8307 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8308 MDCacheIOContext(c), ino(i) {}
8309 void finish(int r) override {
8310 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8311 }
8312 };
8313
8314 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8315 inodeno_t ino;
8316 MMDSOpenIno *msg;
8317 bool parent;
8318 public:
8319 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8320 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8321 void finish(int r) override {
8322 if (r < 0 && !parent)
8323 r = -EAGAIN;
8324 if (msg) {
8325 mdcache->handle_open_ino(msg, r);
8326 return;
8327 }
8328 assert(mdcache->opening_inodes.count(ino));
8329 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8330 }
8331 };
8332
8333 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8334 inodeno_t ino;
8335 public:
8336 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8337 void finish(int r) override {
8338 mdcache->_open_ino_parent_opened(ino, r);
8339 }
8340 };
8341
8342 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8343 {
8344 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8345
8346 assert(opening_inodes.count(ino));
8347 open_ino_info_t& info = opening_inodes[ino];
8348
8349 CInode *in = get_inode(ino);
8350 if (in) {
8351 dout(10) << " found cached " << *in << dendl;
8352 open_ino_finish(ino, info, in->authority().first);
8353 return;
8354 }
8355
8356 inode_backtrace_t backtrace;
8357 if (err == 0) {
8358 try {
8359 ::decode(backtrace, bl);
8360 } catch (const buffer::error &decode_exc) {
8361 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8362 << std::dec << ": " << decode_exc << dendl;
8363 open_ino_finish(ino, info, -EIO);
8364 return;
8365 }
8366 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8367 dout(10) << " old object in pool " << info.pool
8368 << ", retrying pool " << backtrace.pool << dendl;
8369 info.pool = backtrace.pool;
8370 C_IO_MDC_OpenInoBacktraceFetched *fin =
8371 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8372 fetch_backtrace(ino, info.pool, fin->bl,
8373 new C_OnFinisher(fin, mds->finisher));
8374 return;
8375 }
8376 } else if (err == -ENOENT) {
8377 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8378 if (info.pool != meta_pool) {
8379 dout(10) << " no object in pool " << info.pool
8380 << ", retrying pool " << meta_pool << dendl;
8381 info.pool = meta_pool;
8382 C_IO_MDC_OpenInoBacktraceFetched *fin =
8383 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8384 fetch_backtrace(ino, info.pool, fin->bl,
8385 new C_OnFinisher(fin, mds->finisher));
8386 return;
8387 }
8388 err = 0; // backtrace.ancestors.empty() is checked below
8389 }
8390
8391 if (err == 0) {
8392 if (backtrace.ancestors.empty()) {
8393 dout(10) << " got empty backtrace " << dendl;
8394 err = -EIO;
8395 } else if (!info.ancestors.empty()) {
8396 if (info.ancestors[0] == backtrace.ancestors[0]) {
8397 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8398 err = -EINVAL;
8399 } else {
8400 info.last_err = 0;
8401 }
8402 }
8403 }
8404 if (err) {
8405 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8406 if (info.last_err)
8407 err = info.last_err;
8408 open_ino_finish(ino, info, err);
8409 return;
8410 }
8411
8412 dout(10) << " got backtrace " << backtrace << dendl;
8413 info.ancestors = backtrace.ancestors;
8414
8415 _open_ino_traverse_dir(ino, info, 0);
8416 }
8417
8418 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8419 {
8420 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8421
8422 assert(opening_inodes.count(ino));
8423 open_ino_info_t& info = opening_inodes[ino];
8424
8425 CInode *in = get_inode(ino);
8426 if (in) {
8427 dout(10) << " found cached " << *in << dendl;
8428 open_ino_finish(ino, info, in->authority().first);
8429 return;
8430 }
8431
8432 if (ret == mds->get_nodeid()) {
8433 _open_ino_traverse_dir(ino, info, 0);
8434 } else {
8435 if (ret >= 0) {
8436 mds_rank_t checked_rank = mds_rank_t(ret);
8437 info.check_peers = true;
8438 info.auth_hint = checked_rank;
8439 info.checked.erase(checked_rank);
8440 }
8441 do_open_ino(ino, info, ret);
8442 }
8443 }
8444
8445 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8446 {
8447 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8448
8449 CInode *in = get_inode(ino);
8450 if (in) {
8451 dout(10) << " found cached " << *in << dendl;
8452 open_ino_finish(ino, info, in->authority().first);
8453 return;
8454 }
8455
8456 if (ret) {
8457 do_open_ino(ino, info, ret);
8458 return;
8459 }
8460
8461 mds_rank_t hint = info.auth_hint;
8462 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8463 info.discover, info.want_xlocked, &hint);
8464 if (ret > 0)
8465 return;
8466 if (hint != mds->get_nodeid())
8467 info.auth_hint = hint;
8468 do_open_ino(ino, info, ret);
8469 }
8470
8471 void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8472 {
8473 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8474 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8475 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8476 }
8477
8478 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8479 vector<inode_backpointer_t>& ancestors,
8480 bool discover, bool want_xlocked, mds_rank_t *hint)
8481 {
8482 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8483 int err = 0;
8484 for (unsigned i = 0; i < ancestors.size(); i++) {
8485 CInode *diri = get_inode(ancestors[i].dirino);
8486
8487 if (!diri) {
8488 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8489 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8490 return 1;
8491 }
8492 continue;
8493 }
8494
8495 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8496 CDir *dir = diri->get_parent_dir();
8497 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8498 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8499 dir = dir->get_inode()->get_parent_dir();
8500 _open_ino_fetch_dir(ino, m, dir, i == 0);
8501 return 1;
8502 }
8503
8504 if (!diri->is_dir()) {
8505 dout(10) << " " << *diri << " is not dir" << dendl;
8506 if (i == 0)
8507 err = -ENOTDIR;
8508 break;
8509 }
8510
8511 string &name = ancestors[i].dname;
8512 frag_t fg = diri->pick_dirfrag(name);
8513 CDir *dir = diri->get_dirfrag(fg);
8514 if (!dir) {
8515 if (diri->is_auth()) {
8516 if (diri->is_frozen()) {
8517 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8518 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8519 return 1;
8520 }
8521 dir = diri->get_or_open_dirfrag(this, fg);
8522 } else if (discover) {
8523 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8524 return 1;
8525 }
8526 }
8527 if (dir) {
8528 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8529 CDentry *dn = dir->lookup(name);
8530 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8531 if (dir->is_auth()) {
8532 if (dnl && dnl->is_primary() &&
8533 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8534 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8535 _open_ino_fetch_dir(ino, m, dir, i == 0);
8536 return 1;
8537 }
8538
8539 if (!dnl && !dir->is_complete() &&
8540 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8541 dout(10) << " fetching incomplete " << *dir << dendl;
8542 _open_ino_fetch_dir(ino, m, dir, i == 0);
8543 return 1;
8544 }
8545
8546 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8547 if (i == 0)
8548 err = -ENOENT;
8549 } else if (discover) {
8550 if (!dnl) {
8551 filepath path(name, 0);
8552 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8553 (i == 0 && want_xlocked));
8554 return 1;
8555 }
8556 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8557 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8558 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8559 return 1;
8560 }
8561 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8562 if (i == 0)
8563 err = -ENOENT;
8564 }
8565 }
8566 if (hint && i == 0)
8567 *hint = dir ? dir->authority().first : diri->authority().first;
8568 break;
8569 }
8570 return err;
8571 }
8572
8573 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8574 {
8575 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8576
8577 list<MDSInternalContextBase*> waiters;
8578 waiters.swap(info.waiters);
8579 opening_inodes.erase(ino);
8580 finish_contexts(g_ceph_context, waiters, ret);
8581 }
8582
8583 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8584 {
8585 if (err < 0 && err != -EAGAIN) {
8586 info.checked.clear();
8587 info.checked.insert(mds->get_nodeid());
8588 info.checking = MDS_RANK_NONE;
8589 info.check_peers = true;
8590 info.fetch_backtrace = true;
8591 if (info.discover) {
8592 info.discover = false;
8593 info.ancestors.clear();
8594 }
8595 if (err != -ENOENT && err != -ENOTDIR)
8596 info.last_err = err;
8597 }
8598
8599 if (info.check_peers) {
8600 info.check_peers = false;
8601 info.checking = MDS_RANK_NONE;
8602 do_open_ino_peer(ino, info);
8603 } else if (info.fetch_backtrace) {
8604 info.check_peers = true;
8605 info.fetch_backtrace = false;
8606 info.checking = mds->get_nodeid();
8607 info.checked.clear();
8608 info.checked.insert(mds->get_nodeid());
8609 C_IO_MDC_OpenInoBacktraceFetched *fin =
8610 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8611 fetch_backtrace(ino, info.pool, fin->bl,
8612 new C_OnFinisher(fin, mds->finisher));
8613 } else {
8614 assert(!info.ancestors.empty());
8615 info.checking = mds->get_nodeid();
8616 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8617 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8618 }
8619 }
8620
8621 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8622 {
8623 set<mds_rank_t> all, active;
8624 mds->mdsmap->get_mds_set(all);
8625 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8626 if (mds->get_state() == MDSMap::STATE_REJOIN)
8627 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8628
8629 dout(10) << "do_open_ino_peer " << ino << " active " << active
8630 << " all " << all << " checked " << info.checked << dendl;
8631
8632 mds_rank_t peer = MDS_RANK_NONE;
8633 if (info.auth_hint >= 0) {
8634 if (active.count(info.auth_hint)) {
8635 peer = info.auth_hint;
8636 info.auth_hint = MDS_RANK_NONE;
8637 }
8638 } else {
8639 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8640 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8641 peer = *p;
8642 break;
8643 }
8644 }
8645 if (peer < 0) {
8646 if (all.size() > active.size() && all != info.checked) {
8647 dout(10) << " waiting for more peers to be active" << dendl;
8648 } else {
8649 dout(10) << " all MDS peers have been checked " << dendl;
8650 do_open_ino(ino, info, 0);
8651 }
8652 } else {
8653 info.checking = peer;
8654 vector<inode_backpointer_t> *pa = NULL;
8655 // got backtrace from peer or backtrace just fetched
8656 if (info.discover || !info.fetch_backtrace)
8657 pa = &info.ancestors;
8658 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8659 }
8660 }
8661
8662 void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8663 {
8664 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8665 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8666 m->put();
8667 return;
8668 }
8669
8670 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8671
8672 inodeno_t ino = m->ino;
8673 MMDSOpenInoReply *reply;
8674 CInode *in = get_inode(ino);
8675 if (in) {
8676 dout(10) << " have " << *in << dendl;
8677 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8678 if (in->is_auth()) {
8679 touch_inode(in);
8680 while (1) {
8681 CDentry *pdn = in->get_parent_dn();
8682 if (!pdn)
8683 break;
8684 CInode *diri = pdn->get_dir()->get_inode();
8685 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
8686 in->inode.version));
8687 in = diri;
8688 }
8689 } else {
8690 reply->hint = in->authority().first;
8691 }
8692 } else if (err < 0) {
8693 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8694 } else {
8695 mds_rank_t hint = MDS_RANK_NONE;
8696 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8697 if (ret > 0)
8698 return;
8699 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8700 }
8701 m->get_connection()->send_message(reply);
8702 m->put();
8703 }
8704
8705 void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8706 {
8707 dout(10) << "handle_open_ino_reply " << *m << dendl;
8708
8709 inodeno_t ino = m->ino;
8710 mds_rank_t from = mds_rank_t(m->get_source().num());
8711 auto it = opening_inodes.find(ino);
8712 if (it != opening_inodes.end() && it->second.checking == from) {
8713 open_ino_info_t& info = it->second;
8714 info.checking = MDS_RANK_NONE;
8715 info.checked.insert(from);
8716
8717 CInode *in = get_inode(ino);
8718 if (in) {
8719 dout(10) << " found cached " << *in << dendl;
8720 open_ino_finish(ino, info, in->authority().first);
8721 } else if (!m->ancestors.empty()) {
8722 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8723 if (!info.want_replica) {
8724 open_ino_finish(ino, info, from);
8725 m->put();
8726 return;
8727 }
8728
8729 info.ancestors = m->ancestors;
8730 info.auth_hint = from;
8731 info.checking = mds->get_nodeid();
8732 info.discover = true;
8733 _open_ino_traverse_dir(ino, info, 0);
8734 } else if (m->error) {
8735 dout(10) << " error " << m->error << " from mds." << from << dendl;
8736 do_open_ino(ino, info, m->error);
8737 } else {
8738 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8739 info.auth_hint = m->hint;
8740 info.checked.erase(m->hint);
8741 }
8742 do_open_ino_peer(ino, info);
8743 }
8744 }
8745 m->put();
8746 }
8747
8748 void MDCache::kick_open_ino_peers(mds_rank_t who)
8749 {
8750 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8751
8752 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8753 p != opening_inodes.end();
8754 ++p) {
8755 open_ino_info_t& info = p->second;
8756 if (info.checking == who) {
8757 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8758 info.checking = MDS_RANK_NONE;
8759 do_open_ino_peer(p->first, info);
8760 } else if (info.checking == MDS_RANK_NONE) {
8761 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8762 do_open_ino_peer(p->first, info);
8763 }
8764 }
8765 }
8766
8767 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8768 bool want_replica, bool want_xlocked)
8769 {
8770 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8771 << want_replica << dendl;
8772
8773 if (opening_inodes.count(ino)) {
8774 open_ino_info_t& info = opening_inodes[ino];
8775 if (want_replica) {
8776 info.want_replica = true;
8777 if (want_xlocked && !info.want_xlocked) {
8778 if (!info.ancestors.empty()) {
8779 CInode *diri = get_inode(info.ancestors[0].dirino);
8780 if (diri) {
8781 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8782 CDir *dir = diri->get_dirfrag(fg);
8783 if (dir && !dir->is_auth()) {
8784 filepath path(info.ancestors[0].dname, 0);
8785 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8786 }
8787 }
8788 }
8789 info.want_xlocked = true;
8790 }
8791 }
8792 info.waiters.push_back(fin);
8793 } else {
8794 open_ino_info_t& info = opening_inodes[ino];
8795 info.checked.insert(mds->get_nodeid());
8796 info.want_replica = want_replica;
8797 info.want_xlocked = want_xlocked;
8798 info.tid = ++open_ino_last_tid;
8799 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8800 info.waiters.push_back(fin);
8801 do_open_ino(ino, info, 0);
8802 }
8803 }
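
// Illustrative sketch (not part of MDCache): the request-coalescing pattern used by
// open_ino() above -- if a lookup for the same ino is already in flight we just add
// another waiter (and possibly widen what we want); only the first caller actually
// starts the work.  'ExampleLookupTable' and its members are hypothetical names.
struct ExampleLookupTable {
  map<inodeno_t, list<MDSInternalContextBase*> > pending;

  // returns true if the caller should kick off the real lookup
  bool add_waiter(inodeno_t ino, MDSInternalContextBase *fin) {
    bool first = (pending.count(ino) == 0);
    pending[ino].push_back(fin);   // everyone waits on the same entry
    return first;
  }
};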
8804
8805 /* ---------------------------- */
8806
8807 /*
8808  * search for a given inode on MDS peers; optionally start with the given node.
8809  *
8810  * TODO:
8811  *  - recover from mds node failure / recovery
8812  *  - traverse path
8813  *
8814
8815 */
8816 void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8817 {
8818 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8819 assert(!have_inode(ino));
8820
8821 ceph_tid_t tid = ++find_ino_peer_last_tid;
8822 find_ino_peer_info_t& fip = find_ino_peer[tid];
8823 fip.ino = ino;
8824 fip.tid = tid;
8825 fip.fin = c;
8826 fip.hint = hint;
8827 fip.checked.insert(mds->get_nodeid());
8828 _do_find_ino_peer(fip);
8829 }
8830
8831 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8832 {
8833 set<mds_rank_t> all, active;
8834 mds->mdsmap->get_mds_set(all);
8835 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8836
8837 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8838 << " active " << active << " all " << all
8839 << " checked " << fip.checked
8840 << dendl;
8841
8842 mds_rank_t m = MDS_RANK_NONE;
8843 if (fip.hint >= 0) {
8844 m = fip.hint;
8845 fip.hint = MDS_RANK_NONE;
8846 } else {
8847 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8848 if (*p != mds->get_nodeid() &&
8849 fip.checked.count(*p) == 0) {
8850 m = *p;
8851 break;
8852 }
8853 }
8854 if (m == MDS_RANK_NONE) {
8855 if (all.size() > active.size()) {
8856 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8857 } else {
8858 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8859 fip.fin->complete(-ESTALE);
8860 find_ino_peer.erase(fip.tid);
8861 }
8862 } else {
8863 fip.checking = m;
8864 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8865 }
8866 }
8867
8868 void MDCache::handle_find_ino(MMDSFindIno *m)
8869 {
8870 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8871 m->put();
8872 return;
8873 }
8874
8875 dout(10) << "handle_find_ino " << *m << dendl;
8876 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8877 CInode *in = get_inode(m->ino);
8878 if (in) {
8879 in->make_path(r->path);
8880 dout(10) << " have " << r->path << " " << *in << dendl;
8881 }
8882 m->get_connection()->send_message(r);
8883 m->put();
8884 }
8885
8886
8887 void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8888 {
8889 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8890 if (p != find_ino_peer.end()) {
8891 dout(10) << "handle_find_ino_reply " << *m << dendl;
8892 find_ino_peer_info_t& fip = p->second;
8893
8894 // success?
8895 if (get_inode(fip.ino)) {
8896 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8897 mds->queue_waiter(fip.fin);
8898 find_ino_peer.erase(p);
8899 m->put();
8900 return;
8901 }
8902
8903 mds_rank_t from = mds_rank_t(m->get_source().num());
8904 if (fip.checking == from)
8905 fip.checking = MDS_RANK_NONE;
8906 fip.checked.insert(from);
8907
8908 if (!m->path.empty()) {
8909 // we got a path!
8910 vector<CDentry*> trace;
8911 MDRequestRef null_ref;
8912 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8913 if (r > 0)
8914 return;
8915 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8916 << ", retrying" << dendl;
8917 fip.checked.clear();
8918 _do_find_ino_peer(fip);
8919 } else {
8920 // nope, continue.
8921 _do_find_ino_peer(fip);
8922 }
8923 } else {
8924 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8925 }
8926 m->put();
8927 }
8928
8929 void MDCache::kick_find_ino_peers(mds_rank_t who)
8930 {
8931 // find_ino_peers requests we should move on from
8932 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8933 p != find_ino_peer.end();
8934 ++p) {
8935 find_ino_peer_info_t& fip = p->second;
8936 if (fip.checking == who) {
8937 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8938 fip.checking = MDS_RANK_NONE;
8939 _do_find_ino_peer(fip);
8940 } else if (fip.checking == MDS_RANK_NONE) {
8941 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8942 _do_find_ino_peer(fip);
8943 }
8944 }
8945 }
8946
8947 /* ---------------------------- */
8948
8949 int MDCache::get_num_client_requests()
8950 {
8951 int count = 0;
8952 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8953 p != active_requests.end();
8954 ++p) {
8955 MDRequestRef& mdr = p->second;
8956 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8957 count++;
8958 }
8959 return count;
8960 }
8961
8962 /* This function takes over the reference to the passed Message */
8963 MDRequestRef MDCache::request_start(MClientRequest *req)
8964 {
8965 // did we win a forward race against a slave?
8966 if (active_requests.count(req->get_reqid())) {
8967 MDRequestRef& mdr = active_requests[req->get_reqid()];
8968 assert(mdr);
8969 if (mdr->is_slave()) {
8970 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8971 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8972 } else {
8973 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8974 req->put();
8975 }
8976 return MDRequestRef();
8977 }
8978
8979 // register new client request
8980 MDRequestImpl::Params params;
8981 params.reqid = req->get_reqid();
8982 params.attempt = req->get_num_fwd();
8983 params.client_req = req;
8984 params.initiated = req->get_recv_stamp();
8985 params.throttled = req->get_throttle_stamp();
8986 params.all_read = req->get_recv_complete_stamp();
8987 params.dispatched = req->get_dispatch_stamp();
8988
8989 MDRequestRef mdr =
8990 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8991 active_requests[params.reqid] = mdr;
8992 mdr->set_op_stamp(req->get_stamp());
8993 dout(7) << "request_start " << *mdr << dendl;
8994 return mdr;
8995 }
8996
8997 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
8998 {
8999 int by = m->get_source().num();
9000 MDRequestImpl::Params params;
9001 params.reqid = ri;
9002 params.attempt = attempt;
9003 params.triggering_slave_req = m;
9004 params.slave_to = by;
9005 params.initiated = m->get_recv_stamp();
9006 params.throttled = m->get_throttle_stamp();
9007 params.all_read = m->get_recv_complete_stamp();
9008 params.dispatched = m->get_dispatch_stamp();
9009 MDRequestRef mdr =
9010 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9011 assert(active_requests.count(mdr->reqid) == 0);
9012 active_requests[mdr->reqid] = mdr;
9013 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9014 return mdr;
9015 }
9016
9017 MDRequestRef MDCache::request_start_internal(int op)
9018 {
9019 MDRequestImpl::Params params;
9020 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9021 params.reqid.tid = mds->issue_tid();
9022 params.initiated = ceph_clock_now();
9023 params.internal_op = op;
9024 MDRequestRef mdr =
9025 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9026
9027 assert(active_requests.count(mdr->reqid) == 0);
9028 active_requests[mdr->reqid] = mdr;
9029 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9030 return mdr;
9031 }
9032
9033 MDRequestRef MDCache::request_get(metareqid_t rid)
9034 {
9035 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9036 assert(p != active_requests.end());
9037 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9038 return p->second;
9039 }
9040
9041 void MDCache::request_finish(MDRequestRef& mdr)
9042 {
9043 dout(7) << "request_finish " << *mdr << dendl;
9044 mdr->mark_event("finishing request");
9045
9046 // slave finisher?
9047 if (mdr->has_more() && mdr->more()->slave_commit) {
9048 Context *fin = mdr->more()->slave_commit;
9049 mdr->more()->slave_commit = 0;
9050 int ret;
9051 if (mdr->aborted) {
9052 mdr->aborted = false;
9053 ret = -1;
9054 mdr->more()->slave_rolling_back = true;
9055 } else {
9056 ret = 0;
9057 mdr->committing = true;
9058 }
9059 fin->complete(ret); // this must re-call request_finish.
9060 return;
9061 }
9062
9063 request_cleanup(mdr);
9064 }
9065
9066
9067 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9068 {
9069 mdr->mark_event("forwarding request");
9070 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9071 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9072 << *mdr->client_request << dendl;
9073 mds->forward_message_mds(mdr->client_request, who);
9074 mdr->client_request = 0;
9075 if (mds->logger) mds->logger->inc(l_mds_forward);
9076 } else if (mdr->internal_op >= 0) {
9077 dout(10) << "request_forward on internal op; cancelling" << dendl;
9078 mdr->internal_op_finish->complete(-EXDEV);
9079 } else {
9080 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9081 << " was from mds" << dendl;
9082 }
9083 request_cleanup(mdr);
9084 }
9085
9086
9087 void MDCache::dispatch_request(MDRequestRef& mdr)
9088 {
9089 if (mdr->client_request) {
9090 mds->server->dispatch_client_request(mdr);
9091 } else if (mdr->slave_request) {
9092 mds->server->dispatch_slave_request(mdr);
9093 } else {
9094 switch (mdr->internal_op) {
9095 case CEPH_MDS_OP_FRAGMENTDIR:
9096 dispatch_fragment_dir(mdr);
9097 break;
9098 case CEPH_MDS_OP_EXPORTDIR:
9099 migrator->dispatch_export_dir(mdr, 0);
9100 break;
9101 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9102 enqueue_scrub_work(mdr);
9103 break;
9104 case CEPH_MDS_OP_FLUSH:
9105 flush_dentry_work(mdr);
9106 break;
9107 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9108 repair_dirfrag_stats_work(mdr);
9109 break;
9110 case CEPH_MDS_OP_REPAIR_INODESTATS:
9111 repair_inode_stats_work(mdr);
9112 break;
9113 default:
9114 ceph_abort();
9115 }
9116 }
9117 }
9118
9119
9120 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9121 {
9122 if (!mdr->has_more())
9123 return;
9124
9125 // clean up slaves
9126 // (will implicitly drop remote dn pins)
9127 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9128 p != mdr->more()->slaves.end();
9129 ++p) {
9130 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9131 MMDSSlaveRequest::OP_FINISH);
9132
9133 if (mdr->killed && !mdr->committing) {
9134 r->mark_abort();
9135 } else if (mdr->more()->srcdn_auth_mds == *p &&
9136 mdr->more()->inode_import.length() > 0) {
9137 // information about rename imported caps
9138 r->inode_export.claim(mdr->more()->inode_import);
9139 }
9140
9141 mds->send_message_mds(r, *p);
9142 }
9143
9144 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9145 * implicitly. Note that we don't call the finishers -- there shouldn't
9146 * be any on a remote lock and the request finish wakes up all
9147 * the waiters anyway! */
9148 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9149 while (p != mdr->xlocks.end()) {
9150 if ((*p)->get_parent()->is_auth())
9151 ++p;
9152 else {
9153 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9154 << " on " << *(*p)->get_parent() << dendl;
9155 (*p)->put_xlock();
9156 mdr->locks.erase(*p);
9157 mdr->xlocks.erase(p++);
9158 }
9159 }
9160
9161 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9162 while (q != mdr->remote_wrlocks.end()) {
9163 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9164 << " on mds." << q->second
9165 << " on " << *(q->first)->get_parent() << dendl;
9166 mdr->locks.erase(q->first);
9167 mdr->remote_wrlocks.erase(q++);
9168 }
9169
9170 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9171 * leaving them in can cause double-notifies as
9172 * this function can get called more than once */
9173 }
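
// Illustrative sketch (not part of MDCache): the erase-while-iterating idiom used by
// request_drop_foreign_locks() above.  Passing 'p++' to erase() advances the iterator
// before the element it pointed at is removed, so it stays valid.  'example_drop_if'
// and 'Pred' are hypothetical names.
template <typename T, typename Pred>
static void example_drop_if(set<T>& s, Pred pred)
{
  typename set<T>::iterator p = s.begin();
  while (p != s.end()) {
    if (pred(*p))
      s.erase(p++);   // advance, then erase the old position
    else
      ++p;
  }
}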
9174
9175 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9176 {
9177 request_drop_foreign_locks(mdr);
9178 mds->locker->drop_non_rdlocks(mdr.get());
9179 }
9180
9181 void MDCache::request_drop_locks(MDRequestRef& mdr)
9182 {
9183 request_drop_foreign_locks(mdr);
9184 mds->locker->drop_locks(mdr.get());
9185 }
9186
9187 void MDCache::request_cleanup(MDRequestRef& mdr)
9188 {
9189 dout(15) << "request_cleanup " << *mdr << dendl;
9190
9191 if (mdr->has_more()) {
9192 if (mdr->more()->is_ambiguous_auth)
9193 mdr->clear_ambiguous_auth();
9194 if (!mdr->more()->waiting_for_finish.empty())
9195 mds->queue_waiters(mdr->more()->waiting_for_finish);
9196 }
9197
9198 request_drop_locks(mdr);
9199
9200 // drop (local) auth pins
9201 mdr->drop_local_auth_pins();
9202
9203 // drop stickydirs
9204 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9205 p != mdr->stickydirs.end();
9206 ++p)
9207 (*p)->put_stickydirs();
9208
9209 mds->locker->kick_cap_releases(mdr);
9210
9211 // drop cache pins
9212 mdr->drop_pins();
9213
9214 // remove from session
9215 mdr->item_session_request.remove_myself();
9216
9217 // remove from map
9218 active_requests.erase(mdr->reqid);
9219
9220 if (mds->logger)
9221 log_stat();
9222
9223 mdr->mark_event("cleaned up request");
9224 }
9225
9226 void MDCache::request_kill(MDRequestRef& mdr)
9227 {
9228 // rolling back slave requests is tricky; just let the request proceed.
9229 if (mdr->done_locking && mdr->has_more() &&
9230 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9231 dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
9232
9233 assert(mdr->used_prealloc_ino == 0);
9234 assert(mdr->prealloc_inos.empty());
9235
9236 mdr->session = NULL;
9237 mdr->item_session_request.remove_myself();
9238 return;
9239 }
9240
9241 mdr->killed = true;
9242 mdr->mark_event("killing request");
9243
9244 if (mdr->committing) {
9245 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9246 } else {
9247 dout(10) << "request_kill " << *mdr << dendl;
9248 request_cleanup(mdr);
9249 }
9250 }
9251
9252 // -------------------------------------------------------------------------------
9253 // SNAPREALMS
9254
9255 struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9256 MDRequestRef mdr;
9257 MutationRef mut;
9258 CInode *in;
9259 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9260 MutationRef& mu, CInode *i) :
9261 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9262 void finish(int r) override {
9263 mdcache->_snaprealm_create_finish(mdr, mut, in);
9264 }
9265 };
9266
9267 void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9268 {
9269 dout(10) << "snaprealm_create " << *in << dendl;
9270 assert(!in->snaprealm);
9271
9272 // allocate an id..
9273 if (!mdr->more()->stid) {
9274 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9275 new C_MDS_RetryRequest(this, mdr));
9276 return;
9277 }
9278
9279 MutationRef mut(new MutationImpl());
9280 mut->ls = mds->mdlog->get_current_segment();
9281 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9282 mds->mdlog->start_entry(le);
9283
9284 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9285
9286 inode_t *pi = in->project_inode();
9287 pi->version = in->pre_dirty();
9288 pi->rstat.rsnaprealms++;
9289
9290 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9291 snapid_t seq;
9292 ::decode(seq, p);
9293
9294 sr_t *newsnap = in->project_snaprealm(seq);
9295 newsnap->seq = seq;
9296 newsnap->last_created = seq;
9297
9298 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9299 journal_cow_inode(mut, &le->metablob, in);
9300 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9301
9302 mds->server->submit_mdlog_entry(le,
9303 new C_MDC_snaprealm_create_finish(this, mdr,
9304 mut, in),
9305 mdr, __func__);
9306 mds->mdlog->flush();
9307 }
9308
9309
9310 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9311 {
9312 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9313
9314 vector<inodeno_t> split_inos;
9315 vector<inodeno_t> split_realms;
9316
9317 if (snapop == CEPH_SNAP_OP_SPLIT) {
9318 // notify clients of update|split
9319 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9320 !p.end(); ++p)
9321 split_inos.push_back((*p)->ino());
9322
9323 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9324 p != in->snaprealm->open_children.end();
9325 ++p)
9326 split_realms.push_back((*p)->inode->ino());
9327 }
9328
9329 bufferlist snapbl;
9330 in->snaprealm->build_snap_trace(snapbl);
9331
9332 set<SnapRealm*> past_children;
9333 map<client_t, MClientSnap*> updates;
9334 list<SnapRealm*> q;
9335 q.push_back(in->snaprealm);
9336 while (!q.empty()) {
9337 SnapRealm *realm = q.front();
9338 q.pop_front();
9339
9340 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9341 realm->invalidate_cached_snaps();
9342
9343 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9344 p != realm->client_caps.end();
9345 ++p) {
9346 assert(!p->second->empty());
9347 if (!nosend && updates.count(p->first) == 0) {
9348 MClientSnap *update = new MClientSnap(snapop);
9349 update->head.split = in->ino();
9350 update->split_inos = split_inos;
9351 update->split_realms = split_realms;
9352 update->bl = snapbl;
9353 updates[p->first] = update;
9354 }
9355 }
9356
9357 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9358 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9359 p != realm->open_past_children.end();
9360 ++p)
9361 past_children.insert(*p);
9362 }
9363
9364 // notify for active children, too.
9365 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9366 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9367 p != realm->open_children.end();
9368 ++p)
9369 q.push_back(*p);
9370 }
9371
9372 if (!nosend)
9373 send_snaps(updates);
9374
9375 // notify past children and their descendants if we update/delete old snapshots
9376 for (set<SnapRealm*>::iterator p = past_children.begin();
9377 p != past_children.end();
9378 ++p)
9379 q.push_back(*p);
9380
9381 while (!q.empty()) {
9382 SnapRealm *realm = q.front();
9383 q.pop_front();
9384
9385 realm->invalidate_cached_snaps();
9386
9387 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9388 p != realm->open_children.end();
9389 ++p) {
9390 if (past_children.count(*p) == 0)
9391 q.push_back(*p);
9392 }
9393
9394 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9395 p != realm->open_past_children.end();
9396 ++p) {
9397 if (past_children.count(*p) == 0) {
9398 q.push_back(*p);
9399 past_children.insert(*p);
9400 }
9401 }
9402 }
9403
9404 if (snapop == CEPH_SNAP_OP_DESTROY) {
9405 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9406 for (set<SnapRealm*>::iterator p = past_children.begin();
9407 p != past_children.end();
9408 ++p)
9409 maybe_eval_stray((*p)->inode, true);
9410 }
9411 }
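
// Illustrative sketch (not part of MDCache): the breadth-first walk used by
// do_realm_invalidate_and_update_notify() above -- seed a queue with the starting
// realm, then pop, visit, and enqueue open children until the queue drains.
// 'ExampleRealmNode' and 'example_visit_tree' are hypothetical stand-ins for
// SnapRealm and the real traversal.
struct ExampleRealmNode {
  list<ExampleRealmNode*> children;
};
static void example_visit_tree(ExampleRealmNode *root, void (*visit)(ExampleRealmNode*))
{
  list<ExampleRealmNode*> q;
  q.push_back(root);
  while (!q.empty()) {
    ExampleRealmNode *n = q.front();
    q.pop_front();
    visit(n);   // e.g. invalidate cached snaps, queue client updates
    for (list<ExampleRealmNode*>::iterator p = n->children.begin();
         p != n->children.end(); ++p)
      q.push_back(*p);
  }
}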
9412
9413 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9414 {
9415 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9416
9417 // apply
9418 in->pop_and_dirty_projected_inode(mut->ls);
9419 mut->apply();
9420 mds->locker->drop_locks(mut.get());
9421 mut->cleanup();
9422
9423 // tell table we've committed
9424 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9425
9426 // create
9427 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9428 snapid_t seq;
9429 ::decode(seq, p);
9430
9431 in->open_snaprealm();
9432 in->snaprealm->srnode.seq = seq;
9433 in->snaprealm->srnode.created = seq;
9434 bool ok = in->snaprealm->_open_parents(NULL);
9435 assert(ok);
9436
9437 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9438
9439 /*
9440 static int count = 5;
9441 if (--count == 0)
9442 ceph_abort(); // hack test test **********
9443 */
9444
9445 // done.
9446 mdr->more()->stid = 0; // caller will likely need to reuse this
9447 dispatch_request(mdr);
9448 }
9449
9450
9451 // -------------------------------------------------------------------------------
9452 // STRAYS
9453
9454 struct C_MDC_RetryScanStray : public MDCacheContext {
9455 dirfrag_t next;
9456 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9457 void finish(int r) override {
9458 mdcache->scan_stray_dir(next);
9459 }
9460 };
9461
9462 void MDCache::scan_stray_dir(dirfrag_t next)
9463 {
9464 dout(10) << "scan_stray_dir " << next << dendl;
9465
9466 list<CDir*> ls;
9467 for (int i = 0; i < NUM_STRAY; ++i) {
9468 if (strays[i]->ino() < next.ino)
9469 continue;
9470 strays[i]->get_dirfrags(ls);
9471 }
9472
9473 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9474 CDir *dir = *p;
9475 if (dir->dirfrag() < next)
9476 continue;
9477 if (!dir->is_complete()) {
9478 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9479 return;
9480 }
9481 for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
9482 CDentry *dn = q->second;
9483 dn->state_set(CDentry::STATE_STRAY);
9484 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9485 if (dnl->is_primary()) {
9486 CInode *in = dnl->get_inode();
9487 if (in->inode.nlink == 0)
9488 in->state_set(CInode::STATE_ORPHAN);
9489 maybe_eval_stray(in);
9490 }
9491 }
9492 }
9493 }
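
// Illustrative sketch (not part of MDCache): the resumable-scan pattern used by
// scan_stray_dir() above -- when an entry is not ready, schedule an asynchronous
// fetch whose completion re-enters the scan with the same cursor, and skip anything
// before the cursor on re-entry.  'ExampleScanItem' and 'example_scan_from' are
// hypothetical names.
struct ExampleScanItem {
  bool ready;
};
static void example_scan_from(vector<ExampleScanItem>& items, size_t cursor,
                              void (*fetch_then_retry)(size_t))
{
  for (size_t i = cursor; i < items.size(); ++i) {
    if (!items[i].ready) {
      fetch_then_retry(i);   // resume from this item once it has been fetched
      return;
    }
    // ... process items[i] ...
  }
}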
9494
9495 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9496 {
9497 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9498 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9499 }
9500
9501
9502
9503
9504
9505 // ========================================================================================
9506 // DISCOVER
9507 /*
9508
9509 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9510 to the parent metadata object in the cache (pinning it).
9511
9512 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9513
9514 */
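
// Illustrative sketch (not part of MDCache): the tid bookkeeping described above.
// Every outgoing discover records a fresh tid in a map keyed by tid; a reply is only
// acted on if its tid is still present, so duplicate or stale replies are ignored.
// 'ExampleDiscoverTracker' and its members are hypothetical names.
struct ExampleDiscoverTracker {
  ceph_tid_t last_tid = 0;
  map<ceph_tid_t, mds_rank_t> in_flight;   // tid -> peer we asked

  ceph_tid_t send(mds_rank_t peer) {
    ceph_tid_t tid = ++last_tid;
    in_flight[tid] = peer;                 // remember the outstanding request
    return tid;
  }
  bool ack(ceph_tid_t tid) {
    return in_flight.erase(tid) > 0;       // false -> dup or stale reply, drop it
  }
};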
9515
9516 void MDCache::_send_discover(discover_info_t& d)
9517 {
9518 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9519 d.want_base_dir, d.want_xlocked);
9520 dis->set_tid(d.tid);
9521 mds->send_message_mds(dis, d.mds);
9522 }
9523
9524 void MDCache::discover_base_ino(inodeno_t want_ino,
9525 MDSInternalContextBase *onfinish,
9526 mds_rank_t from)
9527 {
9528 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9529 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9530 discover_info_t& d = _create_discover(from);
9531 d.ino = want_ino;
9532 _send_discover(d);
9533 }
9534 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9535 }
9536
9537
9538 void MDCache::discover_dir_frag(CInode *base,
9539 frag_t approx_fg,
9540 MDSInternalContextBase *onfinish,
9541 mds_rank_t from)
9542 {
9543 if (from < 0)
9544 from = base->authority().first;
9545
9546 dirfrag_t df(base->ino(), approx_fg);
9547 dout(7) << "discover_dir_frag " << df
9548 << " from mds." << from << dendl;
9549
9550 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9551 discover_info_t& d = _create_discover(from);
9552 d.pin_base(base);
9553 d.ino = base->ino();
9554 d.frag = approx_fg;
9555 d.want_base_dir = true;
9556 _send_discover(d);
9557 }
9558
9559 if (onfinish)
9560 base->add_dir_waiter(approx_fg, onfinish);
9561 }
9562
9563 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9564 CInode *base;
9565 snapid_t snapid;
9566 filepath path;
9567 mds_rank_t from;
9568 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9569 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9570 void finish(int r) override {
9571 mdcache->discover_path(base, snapid, path, 0, from);
9572 }
9573 };
9574
9575 void MDCache::discover_path(CInode *base,
9576 snapid_t snap,
9577 filepath want_path,
9578 MDSInternalContextBase *onfinish,
9579 bool want_xlocked,
9580 mds_rank_t from)
9581 {
9582 if (from < 0)
9583 from = base->authority().first;
9584
9585 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9586 << (want_xlocked ? " want_xlocked":"")
9587 << dendl;
9588
9589 if (base->is_ambiguous_auth()) {
9590 dout(10) << " waiting for single auth on " << *base << dendl;
9591 if (!onfinish)
9592 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9593 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9594 return;
9595 } else if (from == mds->get_nodeid()) {
9596 list<MDSInternalContextBase*> finished;
9597 base->take_waiting(CInode::WAIT_DIR, finished);
9598 mds->queue_waiters(finished);
9599 return;
9600 }
9601
9602 frag_t fg = base->pick_dirfrag(want_path[0]);
9603 if ((want_xlocked && want_path.depth() == 1) ||
9604 !base->is_waiting_for_dir(fg) || !onfinish) {
9605 discover_info_t& d = _create_discover(from);
9606 d.ino = base->ino();
9607 d.pin_base(base);
9608 d.frag = fg;
9609 d.snap = snap;
9610 d.want_path = want_path;
9611 d.want_base_dir = true;
9612 d.want_xlocked = want_xlocked;
9613 _send_discover(d);
9614 }
9615
9616 // register + wait
9617 if (onfinish)
9618 base->add_dir_waiter(fg, onfinish);
9619 }
9620
9621 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9622 CDir *base;
9623 snapid_t snapid;
9624 filepath path;
9625 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9626 MDCacheContext(c), base(b), snapid(s), path(p) {}
9627 void finish(int r) override {
9628 mdcache->discover_path(base, snapid, path, 0);
9629 }
9630 };
9631
9632 void MDCache::discover_path(CDir *base,
9633 snapid_t snap,
9634 filepath want_path,
9635 MDSInternalContextBase *onfinish,
9636 bool want_xlocked)
9637 {
9638 mds_rank_t from = base->authority().first;
9639
9640 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9641 << (want_xlocked ? " want_xlocked":"")
9642 << dendl;
9643
9644 if (base->is_ambiguous_auth()) {
9645 dout(7) << " waiting for single auth on " << *base << dendl;
9646 if (!onfinish)
9647 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9648 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9649 return;
9650 } else if (from == mds->get_nodeid()) {
9651 list<MDSInternalContextBase*> finished;
9652 base->take_sub_waiting(finished);
9653 mds->queue_waiters(finished);
9654 return;
9655 }
9656
9657 if ((want_xlocked && want_path.depth() == 1) ||
9658 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9659 discover_info_t& d = _create_discover(from);
9660 d.ino = base->ino();
9661 d.pin_base(base->inode);
9662 d.frag = base->get_frag();
9663 d.snap = snap;
9664 d.want_path = want_path;
9665 d.want_base_dir = false;
9666 d.want_xlocked = want_xlocked;
9667 _send_discover(d);
9668 }
9669
9670 // register + wait
9671 if (onfinish)
9672 base->add_dentry_waiter(want_path[0], snap, onfinish);
9673 }
9674
9675 void MDCache::kick_discovers(mds_rank_t who)
9676 {
9677 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9678 p != discovers.end();
9679 ++p) {
9680 if (p->second.mds != who)
9681 continue;
9682 _send_discover(p->second);
9683 }
9684 }
9685
9686
9687 /* This function DOES put the passed message before returning */
9688 void MDCache::handle_discover(MDiscover *dis)
9689 {
9690 mds_rank_t whoami = mds->get_nodeid();
9691 mds_rank_t from = mds_rank_t(dis->get_source().num());
9692
9693 assert(from != whoami);
9694
9695 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9696 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9697 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
9698 dis->put();
9699 return;
9700 }
9701
9702 // proceed if the requester is in the REJOIN stage; the request is from parallel_fetch().
9703 // delay processing a request from a survivor because we may not yet have chosen lock states.
9704 if (!mds->mdsmap->is_rejoin(from)) {
9705 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9706 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9707 return;
9708 }
9709 }
9710
9711
9712 CInode *cur = 0;
9713 MDiscoverReply *reply = new MDiscoverReply(dis);
9714
9715 snapid_t snapid = dis->get_snapid();
9716
9717 // get started.
9718 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9719 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9720 // wants root
9721 dout(7) << "handle_discover from mds." << from
9722 << " wants base + " << dis->get_want().get_path()
9723 << " snap " << snapid
9724 << dendl;
9725
9726 cur = get_inode(dis->get_base_ino());
9727 assert(cur);
9728
9729 // add root
9730 reply->starts_with = MDiscoverReply::INODE;
9731 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9732 dout(10) << "added base " << *cur << dendl;
9733 }
9734 else {
9735 // there's a base inode
9736 cur = get_inode(dis->get_base_ino(), snapid);
9737 if (!cur && snapid != CEPH_NOSNAP) {
9738 cur = get_inode(dis->get_base_ino());
9739 if (cur && !cur->is_multiversion())
9740 cur = NULL; // nope!
9741 }
9742
9743 if (!cur) {
9744 dout(7) << "handle_discover mds." << from
9745 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9746 << dendl;
9747 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9748 reply->set_error_dentry(dis->get_dentry(0));
9749 reply->set_flag_error_dir();
9750 } else if (dis->wants_base_dir()) {
9751 dout(7) << "handle_discover mds." << from
9752 << " wants basedir+" << dis->get_want().get_path()
9753 << " has " << *cur
9754 << dendl;
9755 } else {
9756 dout(7) << "handle_discover mds." << from
9757 << " wants " << dis->get_want().get_path()
9758 << " has " << *cur
9759 << dendl;
9760 }
9761 }
9762
9763 assert(reply);
9764
9765 // add content
9766 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9767 for (unsigned i = 0;
9768 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9769 i++) {
9770
9771 // -- figure out the dir
9772
9773 // is *cur even a dir at all?
9774 if (!cur->is_dir()) {
9775 dout(7) << *cur << " not a dir" << dendl;
9776 reply->set_flag_error_dir();
9777 break;
9778 }
9779
9780 // pick frag
9781 frag_t fg;
9782 if (dis->get_want().depth()) {
9783 // dentry specifies
9784 fg = cur->pick_dirfrag(dis->get_dentry(i));
9785 } else {
9786 // requester explicitly specified the frag
9787 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9788 fg = dis->get_base_dir_frag();
9789 if (!cur->dirfragtree.is_leaf(fg))
9790 fg = cur->dirfragtree[fg.value()];
9791 }
9792 CDir *curdir = cur->get_dirfrag(fg);
9793
9794 if ((!curdir && !cur->is_auth()) ||
9795 (curdir && !curdir->is_auth())) {
9796
9797 /* before:
9798 * ONLY set flag if empty!!
9799 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9800 * resulting in duplicate discovers in flight,
9801 * which can wreak havoc when discovering rename srcdn (which may move)
9802 */
9803
9804 if (reply->is_empty()) {
9805 // only hint if empty.
9806 // someday this could be better, but right now the waiter logic isn't smart enough.
9807
9808 // hint
9809 if (curdir) {
9810 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9811 reply->set_dir_auth_hint(curdir->authority().first);
9812 } else {
9813 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9814 << *cur << dendl;
9815 reply->set_dir_auth_hint(cur->authority().first);
9816 }
9817
9818 // note error dentry, if any
9819 // NOTE: important, as it allows requester to issue an equivalent discover
9820 // to whomever we hint at.
9821 if (dis->get_want().depth() > i)
9822 reply->set_error_dentry(dis->get_dentry(i));
9823 }
9824
9825 break;
9826 }
9827
9828 if (!curdir) { // open dir?
9829 if (cur->is_frozen()) {
9830 if (!reply->is_empty()) {
9831 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9832 break;
9833 }
9834 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9835 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9836 reply->put();
9837 return;
9838 }
9839 curdir = cur->get_or_open_dirfrag(this, fg);
9840 } else if (curdir->is_frozen_tree() ||
9841 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9842 if (!reply->is_empty()) {
9843 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9844 break;
9845 }
9846 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9847 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9848 reply->set_flag_error_dir();
9849 break;
9850 }
9851 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9852 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9853 reply->put();
9854 return;
9855 }
9856
9857 // add dir
9858 if (curdir->get_version() == 0) {
9859 // fetch newly opened dir
9860 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9861 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9862 // make sure the base frag is correct, though, in case there was a refragment since the
9863 // original request was sent.
9864 reply->set_base_dir_frag(curdir->get_frag());
9865 } else {
9866 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9867 if (!reply->trace.length())
9868 reply->starts_with = MDiscoverReply::DIR;
9869 replicate_dir(curdir, from, reply->trace);
9870 dout(7) << "handle_discover added dir " << *curdir << dendl;
9871 }
9872
9873 // lookup
9874 CDentry *dn = 0;
9875 if (curdir->get_version() == 0) {
9876 // fetch newly opened dir
9877 assert(!curdir->has_bloom());
9878 } else if (dis->get_want().depth() > 0) {
9879 // lookup dentry
9880 dn = curdir->lookup(dis->get_dentry(i), snapid);
9881 } else
9882 break; // done!
9883
9884 // incomplete dir?
9885 if (!dn) {
9886 if (!curdir->is_complete() &&
9887 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9888 // readdir
9889 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9890 if (reply->is_empty()) {
9891 // fetch and wait
9892 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9893 dis->wants_base_dir() && curdir->get_version() == 0);
9894 reply->put();
9895 return;
9896 } else {
9897 // initiate fetch, but send what we have so far
9898 curdir->fetch(0);
9899 break;
9900 }
9901 }
9902
9903 // send null dentry
9904 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9905 << *curdir << dendl;
9906 dn = curdir->add_null_dentry(dis->get_dentry(i));
9907 }
9908 assert(dn);
9909
9910 // don't add replica to purging dentry/inode
9911 if (dn->state_test(CDentry::STATE_PURGING)) {
9912 if (reply->is_empty())
9913 reply->set_flag_error_dn(dis->get_dentry(i));
9914 break;
9915 }
9916
9917 CDentry::linkage_t *dnl = dn->get_linkage();
9918
9919 // xlocked dentry?
9920 // ...always block on non-tail items (they are unrelated)
9921 // ...allow xlocked tail discovery _only_ if explicitly requested
9922 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9923 if (dn->lock.is_xlocked()) {
9924 // is this the last (tail) item in the discover traversal?
9925 if (tailitem && dis->wants_xlocked()) {
9926 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9927 } else if (reply->is_empty()) {
9928 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9929 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9930 reply->put();
9931 return;
9932 } else {
9933 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9934 break;
9935 }
9936 }
9937
9938 // frozen inode?
9939 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9940 if (tailitem && dis->wants_xlocked()) {
9941 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9942 } else if (reply->is_empty()) {
9943 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9944 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9945 reply->put();
9946 return;
9947 } else {
9948 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9949 break;
9950 }
9951 }
9952
9953 // add dentry
9954 if (!reply->trace.length())
9955 reply->starts_with = MDiscoverReply::DENTRY;
9956 replicate_dentry(dn, from, reply->trace);
9957 dout(7) << "handle_discover added dentry " << *dn << dendl;
9958
9959 if (!dnl->is_primary()) break; // stop on null or remote link.
9960
9961 // add inode
9962 CInode *next = dnl->get_inode();
9963 assert(next->is_auth());
9964
9965 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
9966 dout(7) << "handle_discover added inode " << *next << dendl;
9967
9968 // descend, keep going.
9969 cur = next;
9970 continue;
9971 }
9972
9973 // how did we do?
9974 assert(!reply->is_empty());
9975 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
9976 mds->send_message(reply, dis->get_connection());
9977
9978 dis->put();
9979 }
9980
9981 /* This function DOES put the passed message before returning */
9982 void MDCache::handle_discover_reply(MDiscoverReply *m)
9983 {
9984 /*
9985 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
9986 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
9987 m->put();
9988 return;
9989 }
9990 */
9991 dout(7) << "discover_reply " << *m << dendl;
9992 if (m->is_flag_error_dir())
9993 dout(7) << " flag error, dir" << dendl;
9994 if (m->is_flag_error_dn())
9995 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
9996
9997 list<MDSInternalContextBase*> finished, error;
9998 mds_rank_t from = mds_rank_t(m->get_source().num());
9999
10000 // starting point
10001 CInode *cur = get_inode(m->get_base_ino());
10002 bufferlist::iterator p = m->trace.begin();
10003
10004 int next = m->starts_with;
10005
10006 // decrement discover counters
10007 if (m->get_tid()) {
10008 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10009 if (p != discovers.end()) {
10010 dout(10) << " found tid " << m->get_tid() << dendl;
10011 discovers.erase(p);
10012 } else {
10013 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10014 }
10015 }
10016
10017 // discover may start with an inode
10018 if (!p.end() && next == MDiscoverReply::INODE) {
10019 cur = add_replica_inode(p, NULL, finished);
10020 dout(7) << "discover_reply got base inode " << *cur << dendl;
10021 assert(cur->is_base());
10022
10023 next = MDiscoverReply::DIR;
10024
10025 // take waiters?
10026 if (cur->is_base() &&
10027 waiting_for_base_ino[from].count(cur->ino())) {
10028 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10029 waiting_for_base_ino[from].erase(cur->ino());
10030 }
10031 }
10032 assert(cur);
10033
10034 // loop over discover results.
10035 // indexes follow each ([[dir] dentry] inode)
10036 // can start, end with any type.
10037 while (!p.end()) {
10038 // dir
10039 frag_t fg;
10040 CDir *curdir = 0;
10041 if (next == MDiscoverReply::DIR) {
10042 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10043 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10044 assert(m->get_wanted_base_dir());
10045 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10046 }
10047 } else {
10048 // note: this can only happen the first time around this loop.
10049 if (p.end() && m->is_flag_error_dn()) {
10050 fg = cur->pick_dirfrag(m->get_error_dentry());
10051 curdir = cur->get_dirfrag(fg);
10052 } else
10053 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10054 }
10055
10056 if (p.end())
10057 break;
10058
10059 // dentry
10060 CDentry *dn = add_replica_dentry(p, curdir, finished);
10061
10062 if (p.end())
10063 break;
10064
10065 // inode
10066 cur = add_replica_inode(p, dn, finished);
10067
10068 next = MDiscoverReply::DIR;
10069 }
10070
10071 // dir error?
10072 // or dir_auth hint?
10073 if (m->is_flag_error_dir() && !cur->is_dir()) {
10074 // not a dir.
10075 cur->take_waiting(CInode::WAIT_DIR, error);
10076 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10077 mds_rank_t who = m->get_dir_auth_hint();
10078 if (who == mds->get_nodeid()) who = -1;
10079 if (who >= 0)
10080 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10081
10082
10083 if (m->get_wanted_base_dir()) {
10084 frag_t fg = m->get_base_dir_frag();
10085 CDir *dir = cur->get_dirfrag(fg);
10086
10087 if (cur->is_waiting_for_dir(fg)) {
10088 if (cur->is_auth())
10089 cur->take_waiting(CInode::WAIT_DIR, finished);
10090 else if (dir || !cur->dirfragtree.is_leaf(fg))
10091 cur->take_dir_waiting(fg, finished);
10092 else
10093 discover_dir_frag(cur, fg, 0, who);
10094 } else
10095 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10096 }
10097
10098 // try again?
10099 if (m->get_error_dentry().length()) {
10100 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10101 CDir *dir = cur->get_dirfrag(fg);
10102 // wanted a dentry
10103 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10104 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10105 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10106 m->get_wanted_snapid(), finished);
10107 } else {
10108 filepath relpath(m->get_error_dentry(), 0);
10109 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10110 }
10111 } else
10112 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10113 << m->get_error_dentry() << dendl;
10114 }
10115 } else if (m->is_flag_error_dn()) {
10116 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10117 CDir *dir = cur->get_dirfrag(fg);
10118 if (dir) {
10119 if (dir->is_auth()) {
10120 dir->take_sub_waiting(finished);
10121 } else {
10122 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10123 m->get_wanted_snapid(), error);
10124 }
10125 }
10126 }
10127
10128 // waiters
10129 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10130 mds->queue_waiters(finished);
10131
10132 // done
10133 m->put();
10134 }
10135
10136
10137
10138 // ----------------------------
10139 // REPLICAS
10140
10141 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10142 list<MDSInternalContextBase*>& finished)
10143 {
10144 dirfrag_t df;
10145 ::decode(df, p);
10146
10147 assert(diri->ino() == df.ino);
10148
10149 // add it (_replica_)
10150 CDir *dir = diri->get_dirfrag(df.frag);
10151
10152 if (dir) {
10153 // had replica. update w/ new nonce.
10154 dir->decode_replica(p);
10155 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10156 } else {
10157 // force frag to leaf in the diri tree
10158 if (!diri->dirfragtree.is_leaf(df.frag)) {
10159 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10160 << diri->dirfragtree << dendl;
10161 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10162 }
10163
10164 // add replica.
10165 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10166 dir->decode_replica(p);
10167
10168 // is this a dir_auth delegation boundary?
10169 if (from != diri->authority().first ||
10170 diri->is_ambiguous_auth() ||
10171 diri->is_base())
10172 adjust_subtree_auth(dir, from);
10173
10174 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10175
10176 // get waiters
10177 diri->take_dir_waiting(df.frag, finished);
10178 }
10179
10180 return dir;
10181 }
10182
10183 CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, mds_rank_t from)
10184 {
10185 assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN);
10186
10187 // forge a replica.
10188 CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) );
10189
10190 // i'm assuming this is a subtree root.
10191 adjust_subtree_auth(dir, from);
10192
10193 dout(7) << "forge_replica_dir added " << *dir << " while mds." << from << " is down" << dendl;
10194
10195 return dir;
10196 }
10197
10198 CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10199 {
10200 string name;
10201 snapid_t last;
10202 ::decode(name, p);
10203 ::decode(last, p);
10204
10205 CDentry *dn = dir->lookup(name, last);
10206
10207 // have it?
10208 if (dn) {
10209 dn->decode_replica(p, false);
10210 dout(7) << "add_replica_dentry had " << *dn << dendl;
10211 } else {
10212 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10213 dn->decode_replica(p, true);
10214 dout(7) << "add_replica_dentry added " << *dn << dendl;
10215 }
10216
10217 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10218
10219 return dn;
10220 }
10221
10222 CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10223 {
10224 inodeno_t ino;
10225 snapid_t last;
10226 ::decode(ino, p);
10227 ::decode(last, p);
10228 CInode *in = get_inode(ino, last);
10229 if (!in) {
10230 in = new CInode(this, false, 1, last);
10231 in->decode_replica(p, true);
10232 add_inode(in);
10233 if (in->ino() == MDS_INO_ROOT)
10234 in->inode_auth.first = 0;
10235 else if (in->is_mdsdir())
10236 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10237 dout(10) << "add_replica_inode added " << *in << dendl;
10238 if (dn) {
10239 assert(dn->get_linkage()->is_null());
10240 dn->dir->link_primary_inode(dn, in);
10241 }
10242 } else {
10243 in->decode_replica(p, false);
10244 dout(10) << "add_replica_inode had " << *in << dendl;
10245 }
10246
10247 if (dn) {
10248 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10249 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10250 }
10251
10252 return in;
10253 }
10254
10255
10256 void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10257 {
10258 uint64_t features = mds->mdsmap->get_up_features();
10259 replicate_inode(get_myin(), who, bl, features);
10260 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10261 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10262 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10263 replicate_dir(straydn->get_dir(), who, bl);
10264 replicate_dentry(straydn, who, bl);
10265 }
10266
10267 CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10268 {
10269 list<MDSInternalContextBase*> finished;
10270 bufferlist::iterator p = bl.begin();
10271
10272 CInode *mdsin = add_replica_inode(p, NULL, finished);
10273 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10274 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10275 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10276 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10277 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10278 if (!finished.empty())
10279 mds->queue_waiters(finished);
10280
10281 return straydn;
10282 }
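
// Illustrative sketch (not part of MDCache): replicate_stray() and add_replica_stray()
// above depend on the decode order exactly mirroring the encode order
// (inode, dir, dentry, inode, dir, dentry).  The same contract in miniature, with
// hypothetical helper names:
static void example_encode_pair(inodeno_t ino, snapid_t last, bufferlist& bl)
{
  ::encode(ino, bl);    // writer order ...
  ::encode(last, bl);
}
static void example_decode_pair(bufferlist::iterator& p, inodeno_t& ino, snapid_t& last)
{
  ::decode(ino, p);     // ... must be mirrored exactly by the reader
  ::decode(last, p);
}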
10283
10284
10285 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10286 {
10287 // this is an FYI, re: replication
10288
10289 set<mds_rank_t> who;
10290 if (bcast) {
10291 mds->get_mds_map()->get_active_mds_set(who);
10292 } else {
10293 for (compact_map<mds_rank_t,unsigned>::iterator p = dir->replicas_begin();
10294 p != dir->replicas_end();
10295 ++p)
10296 who.insert(p->first);
10297 }
10298
10299 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10300
10301 filepath path;
10302 dir->inode->make_path(path);
10303
10304 mds_rank_t whoami = mds->get_nodeid();
10305 for (set<mds_rank_t>::iterator it = who.begin();
10306 it != who.end();
10307 ++it) {
10308 if (*it == whoami) continue;
10309 //if (*it == except) continue;
10310 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10311
10312 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10313 dir->dirfrag(),
10314 dir->dir_rep,
10315 dir->dir_rep_by,
10316 path,
10317 bcast),
10318 *it);
10319 }
10320
10321 return 0;
10322 }
10323
10324 /* This function DOES put the passed message before returning */
10325 void MDCache::handle_dir_update(MDirUpdate *m)
10326 {
10327 CDir *dir = get_dirfrag(m->get_dirfrag());
10328 if (!dir) {
10329 dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << dendl;
10330
10331 // discover it?
10332 if (m->should_discover()) {
10333 // only try once!
10334 // this is key to avoid a fragtree update race, among other things.
10335 m->tried_discover();
10336 vector<CDentry*> trace;
10337 CInode *in;
10338 filepath path = m->get_path();
10339 dout(5) << "trying discover on dir_update for " << path << dendl;
10340 MDRequestRef null_ref;
10341 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10342 if (r > 0)
10343 return;
10344 assert(r == 0);
10345 open_remote_dirfrag(in, m->get_dirfrag().frag,
10346 new C_MDS_RetryMessage(mds, m));
10347 return;
10348 }
10349
10350 m->put();
10351 return;
10352 }
10353
10354 // update
10355 dout(5) << "dir_update on " << *dir << dendl;
10356 dir->dir_rep = m->get_dir_rep();
10357 dir->dir_rep_by = m->get_dir_rep_by();
10358
10359 // done
10360 m->put();
10361 }
10362
10363
10364
10365
10366
10367 // LINK
10368
10369 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10370 {
10371 dout(7) << "send_dentry_link " << *dn << dendl;
10372
10373 CDir *subtree = get_subtree_root(dn->get_dir());
10374 for (compact_map<mds_rank_t,unsigned>::iterator p = dn->replicas_begin();
10375 p != dn->replicas_end();
10376 ++p) {
10377 // don't tell (rename) witnesses; they already know
10378 if (mdr.get() && mdr->more()->witnessed.count(p->first))
10379 continue;
10380 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
10381 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
10382 rejoin_gather.count(p->first)))
10383 continue;
10384 CDentry::linkage_t *dnl = dn->get_linkage();
10385 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10386 dn->name, dnl->is_primary());
10387 if (dnl->is_primary()) {
10388 dout(10) << " primary " << *dnl->get_inode() << dendl;
10389 replicate_inode(dnl->get_inode(), p->first, m->bl,
10390 mds->mdsmap->get_up_features());
10391 } else if (dnl->is_remote()) {
10392 inodeno_t ino = dnl->get_remote_ino();
10393 __u8 d_type = dnl->get_remote_d_type();
10394 dout(10) << " remote " << ino << " " << d_type << dendl;
10395 ::encode(ino, m->bl);
10396 ::encode(d_type, m->bl);
10397 } else
10398 ceph_abort(); // aie, bad caller!
10399 mds->send_message_mds(m, p->first);
10400 }
10401 }
10402
10403 /* This function DOES put the passed message before returning */
10404 void MDCache::handle_dentry_link(MDentryLink *m)
10405 {
10406
10407 CDentry *dn = NULL;
10408 CDir *dir = get_dirfrag(m->get_dirfrag());
10409 if (!dir) {
10410 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10411 } else {
10412 dn = dir->lookup(m->get_dn());
10413 if (!dn) {
10414 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10415 } else {
10416 dout(7) << "handle_dentry_link on " << *dn << dendl;
10417 CDentry::linkage_t *dnl = dn->get_linkage();
10418
10419 assert(!dn->is_auth());
10420 assert(dnl->is_null());
10421 }
10422 }
10423
10424 bufferlist::iterator p = m->bl.begin();
10425 list<MDSInternalContextBase*> finished;
10426 if (dn) {
10427 if (m->get_is_primary()) {
10428 // primary link.
10429 add_replica_inode(p, dn, finished);
10430 } else {
10431 // remote link, easy enough.
10432 inodeno_t ino;
10433 __u8 d_type;
10434 ::decode(ino, p);
10435 ::decode(d_type, p);
10436 dir->link_remote_inode(dn, ino, d_type);
10437 }
10438 } else {
10439 ceph_abort();
10440 }
10441
10442 if (!finished.empty())
10443 mds->queue_waiters(finished);
10444
10445 m->put();
10446 return;
10447 }
10448
10449
10450 // UNLINK
10451
10452 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10453 {
10454 dout(10) << "send_dentry_unlink " << *dn << dendl;
10455 // share unlink news with replicas
10456 set<mds_rank_t> replicas;
10457 dn->list_replicas(replicas);
10458 if (straydn)
10459 straydn->list_replicas(replicas);
10460 for (set<mds_rank_t>::iterator it = replicas.begin();
10461 it != replicas.end();
10462 ++it) {
10463 // don't tell (rmdir) witnesses; they already know
10464 if (mdr.get() && mdr->more()->witnessed.count(*it))
10465 continue;
10466
10467 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10468 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10469 rejoin_gather.count(*it)))
10470 continue;
10471
10472 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
10473 if (straydn)
10474 replicate_stray(straydn, *it, unlink->straybl);
10475 mds->send_message_mds(unlink, *it);
10476 }
10477 }
10478
10479 /* This function DOES put the passed message before returning */
10480 void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10481 {
10482 // straydn
10483 CDentry *straydn = NULL;
10484 if (m->straybl.length())
10485 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10486
10487 CDir *dir = get_dirfrag(m->get_dirfrag());
10488 if (!dir) {
10489 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10490 } else {
10491 CDentry *dn = dir->lookup(m->get_dn());
10492 if (!dn) {
10493 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10494 } else {
10495 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10496 CDentry::linkage_t *dnl = dn->get_linkage();
10497
10498 // open inode?
10499 if (dnl->is_primary()) {
10500 CInode *in = dnl->get_inode();
10501 dn->dir->unlink_inode(dn);
10502 assert(straydn);
10503 straydn->dir->link_primary_inode(straydn, in);
10504
10505 // in->first is lazily updated on replica; drag it forward so
10506 // that we always keep it in sync with the dnq
10507 assert(straydn->first >= in->first);
10508 in->first = straydn->first;
10509
10510 // update subtree map?
10511 if (in->is_dir())
10512 adjust_subtree_after_rename(in, dir, false);
10513
10514 // send caps to auth (if we're not already)
10515 if (in->is_any_caps() &&
10516 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10517 migrator->export_caps(in);
10518
10519 straydn = NULL;
10520 } else {
10521 assert(!straydn);
10522 assert(dnl->is_remote());
10523 dn->dir->unlink_inode(dn);
10524 }
10525 assert(dnl->is_null());
10526 }
10527 }
10528
10529 // race with trim_dentry()
10530 if (straydn) {
10531 assert(straydn->get_num_ref() == 0);
10532 assert(straydn->get_linkage()->is_null());
10533 map<mds_rank_t, MCacheExpire*> expiremap;
10534 trim_dentry(straydn, expiremap);
10535 send_expire_messages(expiremap);
10536 }
10537
10538 m->put();
10539 return;
10540 }
10541
10542
10543
10544
10545
10546
10547 // ===================================================================
10548
10549
10550
10551 // ===================================================================
10552 // FRAGMENT
10553
10554
10555 /**
10556 * adjust_dir_fragments -- adjust fragmentation for a directory
10557 *
10558 * @param diri directory inode
10559 * @param basefrag base fragment
10560 * @param bits bit adjustment. positive for split, negative for merge.
10561 */
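// A positive bits value splits basefrag into 2^bits children, each covering
// an equal share of the dentry hash space; a negative value merges the
// existing fragments under basefrag back into it (see the split/merge
// branches in the overload below).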
10562 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10563 list<CDir*>& resultfrags,
10564 list<MDSInternalContextBase*>& waiters,
10565 bool replay)
10566 {
10567 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10568 << " on " << *diri << dendl;
10569
10570 list<CDir*> srcfrags;
10571 diri->get_dirfrags_under(basefrag, srcfrags);
10572
10573 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10574 }
10575
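// force_dir_fragment: ensure a CDir exists for fragment fg, either by
// splitting an ancestor dirfrag down to fg or by merging the dirfrags
// currently under it; returns NULL if neither is possible (a summary of
// the logic below).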
10576 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10577 {
10578 CDir *dir = diri->get_dirfrag(fg);
10579 if (dir)
10580 return dir;
10581
10582 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10583
10584 list<CDir*> src, result;
10585 list<MDSInternalContextBase*> waiters;
10586
10587 // split a parent?
10588 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10589 while (1) {
10590 CDir *pdir = diri->get_dirfrag(parent);
10591 if (pdir) {
10592 int split = fg.bits() - parent.bits();
10593 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10594 src.push_back(pdir);
10595 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10596 dir = diri->get_dirfrag(fg);
10597 if (dir) {
10598 dout(10) << "force_dir_fragment result " << *dir << dendl;
10599 break;
10600 }
10601 }
10602 if (parent == frag_t())
10603 break;
10604 frag_t last = parent;
10605 parent = parent.parent();
10606 dout(10) << " " << last << " parent is " << parent << dendl;
10607 }
10608
10609 if (!dir) {
10610 // hoover up things under fg?
10611 diri->get_dirfrags_under(fg, src);
10612 if (src.empty()) {
10613 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10614 } else {
10615 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10616 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10617 dir = result.front();
10618 dout(10) << "force_dir_fragment result " << *dir << dendl;
10619 }
10620 }
10621 if (!replay)
10622 mds->queue_waiters(waiters);
10623 return dir;
10624 }
10625
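// This overload does the real work: it forces diri->dirfragtree to agree
// with basefrag, then either splits the single source dirfrag (bits > 0)
// or merges srcfrags into one new CDir, keeping the subtree map consistent
// in both cases.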
10626 void MDCache::adjust_dir_fragments(CInode *diri,
10627 list<CDir*>& srcfrags,
10628 frag_t basefrag, int bits,
10629 list<CDir*>& resultfrags,
10630 list<MDSInternalContextBase*>& waiters,
10631 bool replay)
10632 {
10633 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10634 << " srcfrags " << srcfrags
10635 << " on " << *diri << dendl;
10636
10637 // adjust fragtree
10638 // yuck. we may have discovered the inode while it was being fragmented.
10639 if (!diri->dirfragtree.is_leaf(basefrag))
10640 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10641
10642 if (bits > 0)
10643 diri->dirfragtree.split(basefrag, bits);
10644 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10645
10646 if (srcfrags.empty())
10647 return;
10648
10649 // split
10650 CDir *parent_dir = diri->get_parent_dir();
10651 CDir *parent_subtree = 0;
10652 if (parent_dir)
10653 parent_subtree = get_subtree_root(parent_dir);
10654
10655 if (bits > 0) {
10656 // SPLIT
10657 assert(srcfrags.size() == 1);
10658 CDir *dir = srcfrags.front();
10659
10660 dir->split(bits, resultfrags, waiters, replay);
10661
10662 // did i change the subtree map?
10663 if (dir->is_subtree_root()) {
10664 // new frags are now separate subtrees
10665 for (list<CDir*>::iterator p = resultfrags.begin();
10666 p != resultfrags.end();
10667 ++p)
10668 subtrees[*p].clear(); // new frag is now its own subtree
10669
10670 // was i a bound?
10671 if (parent_subtree) {
10672 assert(subtrees[parent_subtree].count(dir));
10673 subtrees[parent_subtree].erase(dir);
10674 for (list<CDir*>::iterator p = resultfrags.begin();
10675 p != resultfrags.end();
10676 ++p) {
10677 assert((*p)->is_subtree_root());
10678 subtrees[parent_subtree].insert(*p);
10679 }
10680 }
10681
10682 // adjust my bounds.
10683 set<CDir*> bounds;
10684 bounds.swap(subtrees[dir]);
10685 subtrees.erase(dir);
10686 for (set<CDir*>::iterator p = bounds.begin();
10687 p != bounds.end();
10688 ++p) {
10689 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10690 subtrees[frag].insert(*p);
10691 }
10692
10693 show_subtrees(10);
10694
10695 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10696 dir->dir_auth = CDIR_AUTH_DEFAULT;
10697 }
10698
10699 diri->close_dirfrag(dir->get_frag());
10700
10701 } else {
10702 // MERGE
10703
10704 // are my constituent bits subtrees? if so, i will be too.
10705 // (it's all or none, actually.)
10706 bool any_subtree = false;
10707 for (CDir *dir : srcfrags) {
10708 if (dir->is_subtree_root()) {
10709 any_subtree = true;
10710 break;
10711 }
10712 }
10713 set<CDir*> new_bounds;
10714 if (any_subtree) {
10715 for (CDir *dir : srcfrags) {
10716 // this simplifies the code that finds subtrees underneath the dirfrag
10717 if (!dir->is_subtree_root()) {
10718 dir->state_set(CDir::STATE_AUXSUBTREE);
10719 adjust_subtree_auth(dir, mds->get_nodeid());
10720 }
10721 }
10722
10723 for (CDir *dir : srcfrags) {
10724 assert(dir->is_subtree_root());
10725 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
10726 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10727 set<CDir*>::iterator r = q->second.begin();
10728 while (r != subtrees[dir].end()) {
10729 new_bounds.insert(*r);
10730 subtrees[dir].erase(r++);
10731 }
10732 subtrees.erase(q);
10733
10734 // remove myself as my parent's bound
10735 if (parent_subtree)
10736 subtrees[parent_subtree].erase(dir);
10737 }
10738 }
10739
10740 // merge
10741 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10742 f->merge(srcfrags, waiters, replay);
10743
10744 if (any_subtree) {
10745 assert(f->is_subtree_root());
10746 subtrees[f].swap(new_bounds);
10747 if (parent_subtree)
10748 subtrees[parent_subtree].insert(f);
10749
10750 show_subtrees(10);
10751 }
10752
10753 resultfrags.push_back(f);
10754 }
10755 }
10756
10757
10758 class C_MDC_FragmentFrozen : public MDSInternalContext {
10759 MDCache *mdcache;
10760 MDRequestRef mdr;
10761 public:
10762 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10763 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10764 void finish(int r) override {
10765 mdcache->fragment_frozen(mdr, r);
10766 }
10767 };
10768
10769 bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10770 {
10771 if (is_readonly()) {
10772 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10773 return false;
10774 }
10775 if (mds->is_cluster_degraded()) {
10776 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10777 return false;
10778 }
10779 if (diri->get_parent_dir() &&
10780 diri->get_parent_dir()->get_inode()->is_stray()) {
10781 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10782 return false;
10783 }
10784 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10785 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10786 return false;
10787 }
10788
10789 if (diri->scrub_is_in_progress()) {
10790 dout(7) << "can_fragment: scrub in progress" << dendl;
10791 return false;
10792 }
10793
10794 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10795 CDir *dir = *p;
10796 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10797 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10798 return false;
10799 }
10800 if (!dir->is_auth()) {
10801 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10802 return false;
10803 }
10804 if (dir->is_bad()) {
10805 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10806 return false;
10807 }
10808 if (dir->is_frozen() ||
10809 dir->is_freezing()) {
10810 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10811 return false;
10812 }
10813 }
10814
10815 return true;
10816 }
10817
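// Fragmentation pipeline (as implemented below): split_dir()/merge_dir()
// record a fragment_info_t and freeze the source dirfrags;
// fragment_mark_and_complete() fetches and pins their dentries;
// fragment_frozen()/dispatch_fragment_dir() take the scatterlocks and
// journal EFragment::OP_PREPARE; _fragment_logged() stores the new frags;
// _fragment_stored() notifies replicas and journals OP_COMMIT; and
// _fragment_committed()/_fragment_finish() remove the old dirfrag objects
// and journal OP_FINISH.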
10818 void MDCache::split_dir(CDir *dir, int bits)
10819 {
10820 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10821 assert(dir->is_auth());
10822 CInode *diri = dir->inode;
10823
10824 list<CDir*> dirs;
10825 dirs.push_back(dir);
10826
10827 if (!can_fragment(diri, dirs)) {
10828 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10829 return;
10830 }
10831
10832 if (dir->frag.bits() + bits > 24) {
10833 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10834 return;
10835 }
10836
10837 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10838 mdr->more()->fragment_base = dir->dirfrag();
10839
10840 assert(fragments.count(dir->dirfrag()) == 0);
10841 fragment_info_t& info = fragments[dir->dirfrag()];
10842 info.mdr = mdr;
10843 info.dirs.push_back(dir);
10844 info.bits = bits;
10845 info.last_cum_auth_pins_change = ceph_clock_now();
10846
10847 fragment_freeze_dirs(dirs);
10848 // initial mark+complete pass
10849 fragment_mark_and_complete(mdr);
10850 }
10851
10852 void MDCache::merge_dir(CInode *diri, frag_t frag)
10853 {
10854 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10855
10856 list<CDir*> dirs;
10857 if (!diri->get_dirfrags_under(frag, dirs)) {
10858 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10859 return;
10860 }
10861
10862 if (diri->dirfragtree.is_leaf(frag)) {
10863 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10864 return;
10865 }
10866
10867 if (!can_fragment(diri, dirs))
10868 return;
10869
10870 CDir *first = dirs.front();
10871 int bits = first->get_frag().bits() - frag.bits();
10872 dout(10) << " we are merging by " << bits << " bits" << dendl;
10873
10874 dirfrag_t basedirfrag(diri->ino(), frag);
10875 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10876 mdr->more()->fragment_base = basedirfrag;
10877
10878 assert(fragments.count(basedirfrag) == 0);
10879 fragment_info_t& info = fragments[basedirfrag];
10880 info.mdr = mdr;
10881 info.dirs = dirs;
10882 info.bits = -bits;
10883 info.last_cum_auth_pins_change = ceph_clock_now();
10884
10885 fragment_freeze_dirs(dirs);
10886 // initial mark+complete pass
10887 fragment_mark_and_complete(mdr);
10888 }
10889
10890 void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10891 {
10892 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10893 CDir *dir = *p;
10894 dir->auth_pin(dir); // until we mark and complete them
10895 dir->state_set(CDir::STATE_FRAGMENTING);
10896 dir->freeze_dir();
10897 assert(dir->is_freezing_dir());
10898 }
10899 }
10900
10901 class C_MDC_FragmentMarking : public MDCacheContext {
10902 MDRequestRef mdr;
10903 public:
10904 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10905 void finish(int r) override {
10906 mdcache->fragment_mark_and_complete(mdr);
10907 }
10908 };
10909
10910 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10911 {
10912 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10913 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10914 if (it == fragments.end() || it->second.mdr != mdr) {
10915 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10916 request_finish(mdr);
10917 return;
10918 }
10919
10920 fragment_info_t& info = it->second;
10921 CInode *diri = info.dirs.front()->get_inode();
10922 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10923
10924 MDSGatherBuilder gather(g_ceph_context);
10925
10926 for (list<CDir*>::iterator p = info.dirs.begin();
10927 p != info.dirs.end();
10928 ++p) {
10929 CDir *dir = *p;
10930
10931 bool ready = true;
10932 if (!dir->is_complete()) {
10933 dout(15) << " fetching incomplete " << *dir << dendl;
10934 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10935 ready = false;
10936 } else if (dir->get_frag() == frag_t()) {
10937 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
10938 // the operation. To avoid CDir::fetch() complaining about a missing object,
10939 // we commit new dirfrag first.
10940 if (dir->state_test(CDir::STATE_CREATING)) {
10941 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10942 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10943 ready = false;
10944 } else if (dir->is_new()) {
10945 dout(15) << " committing new " << *dir << dendl;
10946 assert(dir->is_dirty());
10947 dir->commit(0, gather.new_sub(), true);
10948 ready = false;
10949 }
10950 }
10951 if (!ready)
10952 continue;
10953
10954 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10955 dout(15) << " marking " << *dir << dendl;
10956 for (CDir::map_t::iterator p = dir->items.begin();
10957 p != dir->items.end();
10958 ++p) {
10959 CDentry *dn = p->second;
10960 dn->get(CDentry::PIN_FRAGMENTING);
10961 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
10962 dn->state_set(CDentry::STATE_FRAGMENTING);
10963 }
10964 dir->state_set(CDir::STATE_DNPINNEDFRAG);
10965 dir->auth_unpin(dir);
10966 } else {
10967 dout(15) << " already marked " << *dir << dendl;
10968 }
10969 }
10970 if (gather.has_subs()) {
10971 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
10972 gather.activate();
10973 return;
10974 }
10975
10976 for (list<CDir*>::iterator p = info.dirs.begin();
10977 p != info.dirs.end();
10978 ++p) {
10979 CDir *dir = *p;
10980 if (!dir->is_frozen_dir()) {
10981 assert(dir->is_freezing_dir());
10982 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
10983 }
10984 }
10985 if (gather.has_subs()) {
10986 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
10987 gather.activate();
10988 // flush log so that request auth_pins are retired
10989 mds->mdlog->flush();
10990 return;
10991 }
10992
10993 fragment_frozen(mdr, 0);
10994 }
10995
10996 void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
10997 {
10998 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
10999 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11000 CDir *dir = *p;
11001 dout(10) << " frag " << *dir << dendl;
11002
11003 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11004 dir->state_clear(CDir::STATE_FRAGMENTING);
11005
11006 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11007 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11008
11009 for (CDir::map_t::iterator p = dir->items.begin();
11010 p != dir->items.end();
11011 ++p) {
11012 CDentry *dn = p->second;
11013 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11014 dn->state_clear(CDentry::STATE_FRAGMENTING);
11015 dn->put(CDentry::PIN_FRAGMENTING);
11016 }
11017 } else {
11018 dir->auth_unpin(dir);
11019 }
11020
11021 dir->unfreeze_dir();
11022 }
11023 }
11024
11025 bool MDCache::fragment_are_all_frozen(CDir *dir)
11026 {
11027 assert(dir->is_frozen_dir());
11028 map<dirfrag_t,fragment_info_t>::iterator p;
11029 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11030 p != fragments.end() && p->first.ino == dir->ino();
11031 ++p) {
11032 if (p->first.frag.contains(dir->get_frag()))
11033 return p->second.all_frozen;
11034 }
11035 ceph_abort();
11036 return false;
11037 }
11038
11039 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11040 {
11041 map<dirfrag_t,fragment_info_t>::iterator p;
11042 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11043 p != fragments.end() && p->first.ino == dir->ino();
11044 ++p) {
11045 if (p->first.frag.contains(dir->get_frag())) {
11046 p->second.num_remote_waiters++;
11047 return;
11048 }
11049 }
11050 ceph_abort();
11051 }
11052
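// Periodically cancel fragment operations that cannot make progress: if the
// cumulative auth pin count on a pending fragment has not changed for
// mds_freeze_tree_timeout and there are remote waiters (or a freezing
// parent), give up and unfreeze the dirs (cf.
// Migrator::find_stale_export_freeze()).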
11053 void MDCache::find_stale_fragment_freeze()
11054 {
11055 dout(10) << "find_stale_fragment_freeze" << dendl;
11056 // see comment in Migrator::find_stale_export_freeze()
11057 utime_t now = ceph_clock_now();
11058 utime_t cutoff = now;
11059 cutoff -= g_conf->mds_freeze_tree_timeout;
11060
11061 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11062 p != fragments.end(); ) {
11063 dirfrag_t df = p->first;
11064 fragment_info_t& info = p->second;
11065 ++p;
11066 if (info.all_frozen)
11067 continue;
11068 CDir *dir;
11069 int total_auth_pins = 0;
11070 for (list<CDir*>::iterator q = info.dirs.begin();
11071 q != info.dirs.end();
11072 ++q) {
11073 dir = *q;
11074 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11075 total_auth_pins = -1;
11076 break;
11077 }
11078 if (dir->is_frozen_dir())
11079 continue;
11080 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11081 }
11082 if (total_auth_pins < 0)
11083 continue;
11084 if (info.last_cum_auth_pins != total_auth_pins) {
11085 info.last_cum_auth_pins = total_auth_pins;
11086 info.last_cum_auth_pins_change = now;
11087 continue;
11088 }
11089 if (info.last_cum_auth_pins_change >= cutoff)
11090 continue;
11091 dir = info.dirs.front();
11092 if (info.num_remote_waiters > 0 ||
11093 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11094 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11095 list<CDir*> dirs;
11096 info.dirs.swap(dirs);
11097 fragments.erase(df);
11098 fragment_unmark_unfreeze_dirs(dirs);
11099 }
11100 }
11101 }
11102
11103 class C_MDC_FragmentPrep : public MDCacheLogContext {
11104 MDRequestRef mdr;
11105 public:
11106 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11107 void finish(int r) override {
11108 mdcache->_fragment_logged(mdr);
11109 }
11110 };
11111
11112 class C_MDC_FragmentStore : public MDCacheContext {
11113 MDRequestRef mdr;
11114 public:
11115 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11116 void finish(int r) override {
11117 mdcache->_fragment_stored(mdr);
11118 }
11119 };
11120
11121 class C_MDC_FragmentCommit : public MDCacheLogContext {
11122 dirfrag_t basedirfrag;
11123 list<CDir*> resultfrags;
11124 public:
11125 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11126 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11127 void finish(int r) override {
11128 mdcache->_fragment_committed(basedirfrag, resultfrags);
11129 }
11130 };
11131
11132 class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11133 dirfrag_t basedirfrag;
11134 list<CDir*> resultfrags;
11135 public:
11136 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11137 MDCacheIOContext(m), basedirfrag(f) {
11138 resultfrags.swap(l);
11139 }
11140 void finish(int r) override {
11141 assert(r == 0 || r == -ENOENT);
11142 mdcache->_fragment_finish(basedirfrag, resultfrags);
11143 }
11144 };
11145
11146 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11147 {
11148 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11149 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11150 if (it == fragments.end() || it->second.mdr != mdr) {
11151 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11152 request_finish(mdr);
11153 return;
11154 }
11155
11156 assert(r == 0);
11157 fragment_info_t& info = it->second;
11158 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11159 << " on " << info.dirs.front()->get_inode() << dendl;
11160
11161 info.all_frozen = true;
11162 dispatch_fragment_dir(mdr);
11163 }
11164
11165 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11166 {
11167 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11168 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11169 if (it == fragments.end() || it->second.mdr != mdr) {
11170 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11171 request_finish(mdr);
11172 return;
11173 }
11174
11175 fragment_info_t& info = it->second;
11176 CInode *diri = info.dirs.front()->get_inode();
11177
11178 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11179 << " on " << *diri << dendl;
11180 if (!mdr->aborted) {
11181 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11182 wrlocks.insert(&diri->dirfragtreelock);
11183 // prevent a racing gather on any other scatterlocks too
11184 wrlocks.insert(&diri->nestlock);
11185 wrlocks.insert(&diri->filelock);
11186 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11187 if (!mdr->aborted)
11188 return;
11189 }
11190
11191 if (mdr->aborted) {
11192 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11193 << info.dirs.front()->dirfrag() << dendl;
11194 if (info.bits > 0)
11195 mds->balancer->queue_split(info.dirs.front(), false);
11196 else
11197 mds->balancer->queue_merge(info.dirs.front());
11198 fragment_unmark_unfreeze_dirs(info.dirs);
11199 fragments.erase(it);
11200 request_finish(mdr);
11201 return;
11202 }
11203
11204 mdr->ls = mds->mdlog->get_current_segment();
11205 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11206 mds->mdlog->start_entry(le);
11207
11208 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11209 CDir *dir = *p;
11210 dirfrag_rollback rollback;
11211 rollback.fnode = dir->fnode;
11212 le->add_orig_frag(dir->get_frag(), &rollback);
11213 }
11214
11215 // refragment
11216 list<MDSInternalContextBase*> waiters;
11217 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11218 info.resultfrags, waiters, false);
11219 if (g_conf->mds_debug_frag)
11220 diri->verify_dirfrags();
11221 mds->queue_waiters(waiters);
11222
11223 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11224 assert(!diri->dirfragtree.is_leaf(*p));
11225
11226 le->metablob.add_dir_context(*info.resultfrags.begin());
11227 for (list<CDir*>::iterator p = info.resultfrags.begin();
11228 p != info.resultfrags.end();
11229 ++p) {
11230 if (diri->is_auth()) {
11231 le->metablob.add_fragmented_dir(*p, false, false);
11232 } else {
11233 (*p)->state_set(CDir::STATE_DIRTYDFT);
11234 le->metablob.add_fragmented_dir(*p, false, true);
11235 }
11236 }
11237
11238 // dft lock
11239 if (diri->is_auth()) {
11240 // journal dirfragtree
11241 inode_t *pi = diri->project_inode();
11242 pi->version = diri->pre_dirty();
11243 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11244 } else {
11245 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11246 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11247 mdr->add_updated_lock(&diri->dirfragtreelock);
11248 }
11249
11250 /*
11251 // filelock
11252 mds->locker->mark_updated_scatterlock(&diri->filelock);
11253 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11254 mut->add_updated_lock(&diri->filelock);
11255
11256 // dirlock
11257 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11258 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11259 mut->add_updated_lock(&diri->nestlock);
11260 */
11261
11262 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11263 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11264 mdr, __func__);
11265 mds->mdlog->flush();
11266 }
11267
11268 void MDCache::_fragment_logged(MDRequestRef& mdr)
11269 {
11270 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11271 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11272 assert(it != fragments.end());
11273 fragment_info_t &info = it->second;
11274 CInode *diri = info.resultfrags.front()->get_inode();
11275
11276 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11277 << " on " << *diri << dendl;
11278
11279 if (diri->is_auth())
11280 diri->pop_and_dirty_projected_inode(mdr->ls);
11281
11282 mdr->apply(); // mark scatterlock
11283
11284 // store resulting frags
11285 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11286
11287 for (list<CDir*>::iterator p = info.resultfrags.begin();
11288 p != info.resultfrags.end();
11289 ++p) {
11290 CDir *dir = *p;
11291 dout(10) << " storing result frag " << *dir << dendl;
11292
11293 // freeze and store them too
11294 dir->auth_pin(this);
11295 dir->state_set(CDir::STATE_FRAGMENTING);
11296 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11297 }
11298
11299 gather.activate();
11300 }
11301
11302 void MDCache::_fragment_stored(MDRequestRef& mdr)
11303 {
11304 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11305 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11306 assert(it != fragments.end());
11307 fragment_info_t &info = it->second;
11308 CInode *diri = info.resultfrags.front()->get_inode();
11309
11310 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11311 << " on " << *diri << dendl;
11312
11313 // tell peers
11314 CDir *first = *info.resultfrags.begin();
11315 for (compact_map<mds_rank_t,unsigned>::iterator p = first->replicas_begin();
11316 p != first->replicas_end();
11317 ++p) {
11318 if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
11319 (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
11320 rejoin_gather.count(p->first)))
11321 continue;
11322
11323 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11324
11325 // freshly replicate new dirs to peers
11326 for (list<CDir*>::iterator q = info.resultfrags.begin();
11327 q != info.resultfrags.end();
11328 ++q)
11329 replicate_dir(*q, p->first, notify->basebl);
11330
11331 mds->send_message_mds(notify, p->first);
11332 }
11333
11334 // journal commit
11335 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11336 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11337 info.resultfrags));
11338
11339 mds->locker->drop_locks(mdr.get());
11340
11341 // unfreeze resulting frags
11342 for (list<CDir*>::iterator p = info.resultfrags.begin();
11343 p != info.resultfrags.end();
11344 ++p) {
11345 CDir *dir = *p;
11346 dout(10) << " result frag " << *dir << dendl;
11347
11348 for (CDir::map_t::iterator p = dir->items.begin();
11349 p != dir->items.end();
11350 ++p) {
11351 CDentry *dn = p->second;
11352 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11353 dn->state_clear(CDentry::STATE_FRAGMENTING);
11354 dn->put(CDentry::PIN_FRAGMENTING);
11355 }
11356
11357 // unfreeze
11358 dir->unfreeze_dir();
11359 }
11360
11361 fragments.erase(it);
11362 request_finish(mdr);
11363 }
11364
11365 void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11366 {
11367 dout(10) << "fragment_committed " << basedirfrag << dendl;
11368 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11369 assert(it != uncommitted_fragments.end());
11370 ufragment &uf = it->second;
11371
11372 // remove old frags
11373 C_GatherBuilder gather(
11374 g_ceph_context,
11375 new C_OnFinisher(
11376 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11377 mds->finisher));
11378
11379 SnapContext nullsnapc;
11380 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11381 for (list<frag_t>::iterator p = uf.old_frags.begin();
11382 p != uf.old_frags.end();
11383 ++p) {
11384 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11385 ObjectOperation op;
11386 if (*p == frag_t()) {
11387 // backtrace object
11388 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11389 op.truncate(0);
11390 op.omap_clear();
11391 } else {
11392 dout(10) << " removing orphan dirfrag " << oid << dendl;
11393 op.remove();
11394 }
11395 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11396 ceph::real_clock::now(),
11397 0, gather.new_sub());
11398 }
11399
11400 assert(gather.has_subs());
11401 gather.activate();
11402 }
11403
11404 void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11405 {
11406 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11407 << resultfrags.size() << dendl;
11408 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11409 assert(it != uncommitted_fragments.end());
11410 ufragment &uf = it->second;
11411
11412 // unmark & auth_unpin
11413 for (const auto &dir : resultfrags) {
11414 dir->state_clear(CDir::STATE_FRAGMENTING);
11415 dir->auth_unpin(this);
11416
11417 // In case the resulting fragments are beyond the split size,
11418 // we might need to split them again right away (they could
11419 // have been taking inserts between unfreezing and getting
11420 // here)
11421 mds->balancer->maybe_fragment(dir, false);
11422 }
11423
11424 if (mds->logger) {
11425 if (resultfrags.size() > 1) {
11426 mds->logger->inc(l_mds_dir_split);
11427 } else {
11428 mds->logger->inc(l_mds_dir_merge);
11429 }
11430 }
11431
11432 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11433 mds->mdlog->start_submit_entry(le);
11434
11435 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11436 }
11437
11438 /* This function DOES put the passed message before returning */
11439 void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11440 {
11441 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11442
11443 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11444 notify->put();
11445 return;
11446 }
11447
11448 CInode *diri = get_inode(notify->get_ino());
11449 if (diri) {
11450 frag_t base = notify->get_basefrag();
11451 int bits = notify->get_bits();
11452
11453 /*
11454 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11455 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11456 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11457 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11458 notify->put();
11459 return;
11460 }
11461 */
11462
11463 // refragment
11464 list<MDSInternalContextBase*> waiters;
11465 list<CDir*> resultfrags;
11466 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11467 if (g_conf->mds_debug_frag)
11468 diri->verify_dirfrags();
11469
11470 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11471 diri->take_dir_waiting((*p)->get_frag(), waiters);
11472
11473 // add the new replica dirs encoded in the notify payload
11474 bufferlist::iterator p = notify->basebl.begin();
11475 while (!p.end())
11476 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11477
11478 mds->queue_waiters(waiters);
11479 } else {
11480 ceph_abort();
11481 }
11482
11483 notify->put();
11484 }
11485
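// uncommitted_fragments tracks every OP_PREPAREd fragment operation (plus
// its rollback buffer) until the matching OP_COMMIT/OP_FINISH, so that
// rollback_uncommitted_fragments() below can undo or re-finish it.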
11486 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11487 LogSegment *ls, bufferlist *rollback)
11488 {
11489 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11490 assert(!uncommitted_fragments.count(basedirfrag));
11491 ufragment& uf = uncommitted_fragments[basedirfrag];
11492 uf.old_frags = old_frags;
11493 uf.bits = bits;
11494 uf.ls = ls;
11495 ls->uncommitted_fragments.insert(basedirfrag);
11496 if (rollback)
11497 uf.rollback.swap(*rollback);
11498 }
11499
11500 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11501 {
11502 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11503 << " op " << EFragment::op_name(op) << dendl;
11504 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11505 if (it != uncommitted_fragments.end()) {
11506 ufragment& uf = it->second;
11507 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11508 uf.committed = true;
11509 } else {
11510 uf.ls->uncommitted_fragments.erase(basedirfrag);
11511 mds->queue_waiters(uf.waiters);
11512 uncommitted_fragments.erase(it);
11513 }
11514 }
11515 }
11516
11517 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11518 {
11519 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11520 << " old_frags (" << old_frags << ")" << dendl;
11521 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11522 if (it != uncommitted_fragments.end()) {
11523 ufragment& uf = it->second;
11524 if (!uf.old_frags.empty()) {
11525 uf.old_frags.swap(old_frags);
11526 uf.committed = true;
11527 } else {
11528 uf.ls->uncommitted_fragments.erase(basedirfrag);
11529 uncommitted_fragments.erase(it);
11530 }
11531 }
11532 }
11533
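// Two cases below: if the prepare was already committed we simply re-run
// _fragment_committed() to delete the old dirfrag objects; otherwise the
// original fragments are rebuilt from the rollback fnodes (or by a plain
// merge for old-format events) and an OP_ROLLBACK event is journaled.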
11534 void MDCache::rollback_uncommitted_fragments()
11535 {
11536 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11537 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11538 p != uncommitted_fragments.end();
11539 ++p) {
11540 ufragment &uf = p->second;
11541 CInode *diri = get_inode(p->first.ino);
11542 assert(diri);
11543
11544 if (uf.committed) {
11545 list<CDir*> frags;
11546 diri->get_dirfrags_under(p->first.frag, frags);
11547 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11548 CDir *dir = *q;
11549 dir->auth_pin(this);
11550 dir->state_set(CDir::STATE_FRAGMENTING);
11551 }
11552 _fragment_committed(p->first, frags);
11553 continue;
11554 }
11555
11556 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11557
11558 LogSegment *ls = mds->mdlog->get_current_segment();
11559 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11560 mds->mdlog->start_entry(le);
11561 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11562
11563 list<frag_t> old_frags;
11564 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11565
11566 list<CDir*> resultfrags;
11567 if (uf.old_frags.empty()) {
11568 // created by old format EFragment
11569 list<MDSInternalContextBase*> waiters;
11570 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11571 } else {
11572 bufferlist::iterator bp = uf.rollback.begin();
11573 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11574 CDir *dir = force_dir_fragment(diri, *q);
11575 resultfrags.push_back(dir);
11576
11577 dirfrag_rollback rollback;
11578 ::decode(rollback, bp);
11579
11580 dir->set_version(rollback.fnode.version);
11581 dir->fnode = rollback.fnode;
11582
11583 dir->_mark_dirty(ls);
11584
11585 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11586 dout(10) << " dirty nestinfo on " << *dir << dendl;
11587 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11588 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11589 }
11590 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11591 dout(10) << " dirty fragstat on " << *dir << dendl;
11592 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11593 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11594 }
11595
11596 le->add_orig_frag(dir->get_frag());
11597 le->metablob.add_dir_context(dir);
11598 if (diri_auth) {
11599 le->metablob.add_fragmented_dir(dir, true, false);
11600 } else {
11601 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11602 dir->state_set(CDir::STATE_DIRTYDFT);
11603 le->metablob.add_fragmented_dir(dir, true, true);
11604 }
11605 }
11606 }
11607
11608 if (diri_auth) {
11609 diri->project_inode()->version = diri->pre_dirty();
11610 diri->pop_and_dirty_projected_inode(ls); // hacky
11611 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11612 } else {
11613 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11614 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11615 }
11616
11617 if (g_conf->mds_debug_frag)
11618 diri->verify_dirfrags();
11619
11620 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11621 assert(!diri->dirfragtree.is_leaf(*q));
11622
11623 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11624 CDir *dir = *q;
11625 dir->auth_pin(this);
11626 dir->state_set(CDir::STATE_FRAGMENTING);
11627 }
11628
11629 mds->mdlog->submit_entry(le);
11630
11631 uf.old_frags.swap(old_frags);
11632 _fragment_committed(p->first, resultfrags);
11633 }
11634 }
11635
11636 void MDCache::force_readonly()
11637 {
11638 if (is_readonly())
11639 return;
11640
11641 dout(1) << "force file system read-only" << dendl;
11642 mds->clog->warn() << "force file system read-only";
11643
11644 set_readonly();
11645
11646 mds->server->force_clients_readonly();
11647
11648 // revoke write caps
11649 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
11650 p != inode_map.end();
11651 ++p) {
11652 CInode *in = p->second;
11653 if (in->is_head())
11654 mds->locker->eval(in, CEPH_CAP_LOCKS);
11655 }
11656
11657 mds->mdlog->flush();
11658 }
11659
11660
11661 // ==============================================================
11662 // debug crap
11663
11664 void MDCache::show_subtrees(int dbl)
11665 {
11666 if (g_conf->mds_thrash_exports)
11667 dbl += 15;
11668
11669 //dout(10) << "show_subtrees" << dendl;
11670
11671 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11672 return; // i won't print anything.
11673
11674 if (subtrees.empty()) {
11675 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11676 return;
11677 }
11678
11679 // root frags
11680 list<CDir*> basefrags;
11681 for (set<CInode*>::iterator p = base_inodes.begin();
11682 p != base_inodes.end();
11683 ++p)
11684 (*p)->get_dirfrags(basefrags);
11685 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11686 dout(15) << "show_subtrees" << dendl;
11687
11688 // queue stuff
11689 list<pair<CDir*,int> > q;
11690 string indent;
11691 set<CDir*> seen;
11692
11693 // calc max depth
11694 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11695 q.push_back(pair<CDir*,int>(*p, 0));
11696
11697 set<CDir*> subtrees_seen;
11698
11699 int depth = 0;
11700 while (!q.empty()) {
11701 CDir *dir = q.front().first;
11702 int d = q.front().second;
11703 q.pop_front();
11704
11705 if (subtrees.count(dir) == 0) continue;
11706
11707 subtrees_seen.insert(dir);
11708
11709 if (d > depth) depth = d;
11710
11711 // sanity check
11712 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11713 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11714 assert(seen.count(dir) == 0);
11715 seen.insert(dir);
11716
11717 // nested items?
11718 if (!subtrees[dir].empty()) {
11719 for (set<CDir*>::iterator p = subtrees[dir].begin();
11720 p != subtrees[dir].end();
11721 ++p) {
11722 //dout(25) << " saw sub " << **p << dendl;
11723 q.push_front(pair<CDir*,int>(*p, d+1));
11724 }
11725 }
11726 }
11727
11728
11729 // print tree
11730 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11731 q.push_back(pair<CDir*,int>(*p, 0));
11732
11733 while (!q.empty()) {
11734 CDir *dir = q.front().first;
11735 int d = q.front().second;
11736 q.pop_front();
11737
11738 if (subtrees.count(dir) == 0) continue;
11739
11740 // adjust indenter
11741 while ((unsigned)d < indent.size())
11742 indent.resize(d);
11743
11744 // pad
11745 string pad = "______________________________________";
11746 pad.resize(depth*2+1-indent.size());
11747 if (!subtrees[dir].empty())
11748 pad[0] = '.'; // parent
11749
11750
11751 string auth;
11752 if (dir->is_auth())
11753 auth = "auth ";
11754 else
11755 auth = " rep ";
11756
11757 char s[10];
11758 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11759 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11760 else
11761 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11762
11763 // print
11764 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11765
11766 if (dir->ino() == MDS_INO_ROOT)
11767 assert(dir->inode == root);
11768 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11769 assert(dir->inode == myin);
11770 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11771 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11772
11773 // nested items?
11774 if (!subtrees[dir].empty()) {
11775 // more at my level?
11776 if (!q.empty() && q.front().second == d)
11777 indent += "| ";
11778 else
11779 indent += " ";
11780
11781 for (set<CDir*>::iterator p = subtrees[dir].begin();
11782 p != subtrees[dir].end();
11783 ++p)
11784 q.push_front(pair<CDir*,int>(*p, d+2));
11785 }
11786 }
11787
11788 // verify there isn't stray crap in subtree map
11789 int lost = 0;
11790 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11791 p != subtrees.end();
11792 ++p) {
11793 if (subtrees_seen.count(p->first)) continue;
11794 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11795 lost++;
11796 }
11797 assert(lost == 0);
11798 }
11799
11800
11801 void MDCache::show_cache()
11802 {
11803 dout(7) << "show_cache" << dendl;
11804
11805 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11806 it != inode_map.end();
11807 ++it) {
11808 // unlinked?
11809 if (!it->second->parent)
11810 dout(7) << " unlinked " << *it->second << dendl;
11811
11812 // dirfrags?
11813 list<CDir*> dfs;
11814 it->second->get_dirfrags(dfs);
11815 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11816 CDir *dir = *p;
11817 dout(7) << " dirfrag " << *dir << dendl;
11818
11819 for (CDir::map_t::iterator p = dir->items.begin();
11820 p != dir->items.end();
11821 ++p) {
11822 CDentry *dn = p->second;
11823 dout(7) << " dentry " << *dn << dendl;
11824 CDentry::linkage_t *dnl = dn->get_linkage();
11825 if (dnl->is_primary() && dnl->get_inode())
11826 dout(7) << " inode " << *dnl->get_inode() << dendl;
11827 }
11828 }
11829 }
11830 }
11831
11832 int MDCache::dump_cache(std::string const &file_name)
11833 {
11834 return dump_cache(file_name.c_str(), NULL);
11835 }
11836
11837 int MDCache::dump_cache(Formatter *f)
11838 {
11839 return dump_cache(NULL, f);
11840 }
11841
11842 int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
11843 {
11844 return dump_cache(NULL, f, dump_root, depth);
11845 }
11846
11847 /**
11848 * Dump the metadata cache, either to a Formatter, if
11849 * provided, else to a plain text file.
11850 */
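// When dump_root is non-empty, only inodes whose path begins with dump_root
// (and, if depth >= 0, that lie at most depth path components below it) are
// dumped.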
11851 int MDCache::dump_cache(const char *fn, Formatter *f,
11852 const string& dump_root, int depth)
11853 {
11854 int r = 0;
11855 int fd = -1;
11856
11857 if (f) {
11858 f->open_array_section("inodes");
11859 } else {
11860 char deffn[200];
11861 if (!fn) {
11862 snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11863 fn = deffn;
11864 }
11865
11866 dout(1) << "dump_cache to " << fn << dendl;
11867
11868 fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
11869 if (fd < 0) {
11870 derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
11871 return errno;
11872 }
11873 }
11874
11875 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11876 it != inode_map.end();
11877 ++it) {
11878 CInode *in = it->second;
11879
11880 if (!dump_root.empty()) {
11881 string ipath;
11882 if (in->is_root())
11883 ipath = "/";
11884 else
11885 in->make_path_string(ipath);
11886
11887 if (dump_root.length() > ipath.length() ||
11888 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11889 continue;
11890
11891 if (depth >= 0 &&
11892 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11893 continue;
11894 }
11895
11896 if (f) {
11897 f->open_object_section("inode");
11898 in->dump(f);
11899 } else {
11900 ostringstream ss;
11901 ss << *in << std::endl;
11902 std::string s = ss.str();
11903 r = safe_write(fd, s.c_str(), s.length());
11904 if (r < 0) {
11905 goto out;
11906 }
11907 }
11908
11909 list<CDir*> dfs;
11910 in->get_dirfrags(dfs);
11911 if (f) {
11912 f->open_array_section("dirfrags");
11913 }
11914 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11915 CDir *dir = *p;
11916 if (f) {
11917 f->open_object_section("dir");
11918 dir->dump(f);
11919 } else {
11920 ostringstream tt;
11921 tt << " " << *dir << std::endl;
11922 string t = tt.str();
11923 r = safe_write(fd, t.c_str(), t.length());
11924 if (r < 0) {
11925 goto out;
11926 }
11927 }
11928
11929 if (f) {
11930 f->open_array_section("dentries");
11931 }
11932 for (CDir::map_t::iterator q = dir->items.begin();
11933 q != dir->items.end();
11934 ++q) {
11935 CDentry *dn = q->second;
11936 if (f) {
11937 f->open_object_section("dentry");
11938 dn->dump(f);
11939 f->close_section();
11940 } else {
11941 ostringstream uu;
11942 uu << " " << *dn << std::endl;
11943 string u = uu.str();
11944 r = safe_write(fd, u.c_str(), u.length());
11945 if (r < 0) {
11946 goto out;
11947 }
11948 }
11949 }
11950 if (f) {
11951 f->close_section(); //dentries
11952 }
11953 dir->check_rstats();
11954 if (f) {
11955 f->close_section(); //dir
11956 }
11957 }
11958 if (f) {
11959 f->close_section(); // dirfrags
11960 }
11961
11962 if (f) {
11963 f->close_section(); // inode
11964 }
11965 }
11966
11967 out:
11968 if (f) {
11969 f->close_section(); // inodes
11970 } else {
11971 ::close(fd);
11972 }
11973 return r;
11974 }
11975
11976
11977
11978 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
11979 : MDSInternalContext(c->mds), cache(c), mdr(r)
11980 {}
11981
11982 void C_MDS_RetryRequest::finish(int r)
11983 {
11984 mdr->retry++;
11985 cache->dispatch_request(mdr);
11986 }
11987
11988
11989 class C_MDS_EnqueueScrub : public Context
11990 {
11991 Formatter *formatter;
11992 Context *on_finish;
11993 public:
11994 ScrubHeaderRef header;
11995 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
11996 formatter(f), on_finish(fin), header(nullptr) {}
11997
11998 Context *take_finisher() {
11999 Context *fin = on_finish;
12000 on_finish = NULL;
12001 return fin;
12002 }
12003
12004 void finish(int r) override {
12005 if (r < 0) { // we failed the lookup or something; dump ourselves
12006 formatter->open_object_section("results");
12007 formatter->dump_int("return_code", r);
12008 formatter->close_section(); // results
12009 }
12010 if (on_finish)
12011 on_finish->complete(r);
12012 }
12013 };
12014
12015 void MDCache::enqueue_scrub(
12016 const string& path,
12017 const std::string &tag,
12018 bool force, bool recursive, bool repair,
12019 Formatter *f, Context *fin)
12020 {
12021 dout(10) << __func__ << " " << path << dendl;
12022 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12023 filepath fp(path.c_str());
12024 mdr->set_filepath(fp);
12025
12026 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12027 cs->header = std::make_shared<ScrubHeader>(
12028 tag, force, recursive, repair, f);
12029
12030 mdr->internal_op_finish = cs;
12031 enqueue_scrub_work(mdr);
12032 }
12033
12034 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12035 {
12036 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12037 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12038 if (NULL == in)
12039 return;
12040
12041 // TODO: Remove this restriction
12042 assert(in->is_auth());
12043
12044 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12045 if (!locked)
12046 return;
12047
12048 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12049 ScrubHeaderRef &header = cs->header;
12050
12051 // Cannot scrub the same dentry twice at the same time
12052 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12053 mds->server->respond_to_request(mdr, -EBUSY);
12054 return;
12055 } else {
12056 in->scrub_info();
12057 }
12058
12059 header->set_origin(in);
12060
12061 // only set a completion context for non-recursive scrub, because we don't
12062 // want to block the asok caller on a long-running scrub
12063 if (!header->get_recursive()) {
12064 Context *fin = cs->take_finisher();
12065 mds->scrubstack->enqueue_inode_top(in, header,
12066 new MDSInternalContextWrapper(mds, fin));
12067 } else
12068 mds->scrubstack->enqueue_inode_bottom(in, header, NULL);
12069
12070 mds->server->respond_to_request(mdr, 0);
12071 return;
12072 }
12073
12074 struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12075 MDRequestRef mdr;
12076 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12077 MDCacheLogContext(c), mdr(m) {}
12078 void finish(int r) override {
12079 mdr->apply();
12080 get_mds()->server->respond_to_request(mdr, r);
12081 }
12082 };
12083
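// repair_dirfrag_stats: recompute fragstat/rstat for a single dirfrag from
// its dentries and, if the stored sums disagree, journal a corrected fnode
// and mark the corresponding scatterlocks dirty.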
12084 void MDCache::repair_dirfrag_stats(CDir *dir)
12085 {
12086 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12087 mdr->pin(dir);
12088 mdr->internal_op_private = dir;
12089 mdr->internal_op_finish = new C_MDSInternalNoop;
12090 repair_dirfrag_stats_work(mdr);
12091 }
12092
12093 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12094 {
12095 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12096 dout(10) << __func__ << " " << *dir << dendl;
12097
12098 if (!dir->is_auth()) {
12099 mds->server->respond_to_request(mdr, -ESTALE);
12100 return;
12101 }
12102
12103 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12104 mds->locker->drop_locks(mdr.get());
12105 mdr->drop_local_auth_pins();
12106 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12107 return;
12108 }
12109
12110 mdr->auth_pin(dir);
12111
12112 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12113 CInode *diri = dir->inode;
12114 rdlocks.insert(&diri->dirfragtreelock);
12115 wrlocks.insert(&diri->nestlock);
12116 wrlocks.insert(&diri->filelock);
12117 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12118 return;
12119
12120 if (!dir->is_complete()) {
12121 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12122 return;
12123 }
12124
12125 frag_info_t frag_info;
12126 nest_info_t nest_info;
12127 for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
12128 CDentry *dn = it->second;
12129 if (dn->last != CEPH_NOSNAP)
12130 continue;
12131 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12132 if (dnl->is_primary()) {
12133 CInode *in = dnl->get_inode();
12134 nest_info.add(in->get_projected_inode()->accounted_rstat);
12135 if (in->is_dir())
12136 frag_info.nsubdirs++;
12137 else
12138 frag_info.nfiles++;
12139 } else if (dnl->is_remote())
12140 frag_info.nfiles++;
12141 }
12142
12143 fnode_t *pf = dir->get_projected_fnode();
12144 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12145 bool good_rstat = nest_info.same_sums(pf->rstat);
12146 if (good_fragstat && good_rstat) {
12147 dout(10) << __func__ << " no corruption found" << dendl;
12148 mds->server->respond_to_request(mdr, 0);
12149 return;
12150 }
12151
12152 pf = dir->project_fnode();
12153 pf->version = dir->pre_dirty();
12154 mdr->add_projected_fnode(dir);
12155
12156 mdr->ls = mds->mdlog->get_current_segment();
12157 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12158 mds->mdlog->start_entry(le);
12159
12160 if (!good_fragstat) {
12161 if (pf->fragstat.mtime > frag_info.mtime)
12162 frag_info.mtime = pf->fragstat.mtime;
12163 if (pf->fragstat.change_attr > frag_info.change_attr)
12164 frag_info.change_attr = pf->fragstat.change_attr;
12165 pf->fragstat = frag_info;
12166 mds->locker->mark_updated_scatterlock(&diri->filelock);
12167 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12168 mdr->add_updated_lock(&diri->filelock);
12169 }
12170
12171 if (!good_rstat) {
12172 if (pf->rstat.rctime > nest_info.rctime)
12173 nest_info.rctime = pf->rstat.rctime;
12174 pf->rstat = nest_info;
12175 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12176 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12177 mdr->add_updated_lock(&diri->nestlock);
12178 }
12179
12180 le->metablob.add_dir_context(dir);
12181 le->metablob.add_dir(dir, true);
12182
12183 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12184 }
12185
12186 void MDCache::repair_inode_stats(CInode *diri)
12187 {
12188 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12189 mdr->pin(diri);
12190 mdr->internal_op_private = diri;
12191 mdr->internal_op_finish = new C_MDSInternalNoop;
12192 repair_inode_stats_work(mdr);
12193 }
12194
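// Two passes: first take wrlocks, fetch every dirfrag and mark the
// filelock/nestlock scatterlocks dirty; then (do_rdlocks) take rdlocks to
// force the scatter-gather that propagates the fixed sums, and verify the
// result against the inode's dirstat/rstat.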
12195 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12196 {
12197 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12198 dout(10) << __func__ << " " << *diri << dendl;
12199
12200 if (!diri->is_auth()) {
12201 mds->server->respond_to_request(mdr, -ESTALE);
12202 return;
12203 }
12204 if (!diri->is_dir()) {
12205 mds->server->respond_to_request(mdr, -ENOTDIR);
12206 return;
12207 }
12208
12209 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12210 std::list<frag_t> frags;
12211
12212 if (mdr->ls) // already marked filelock/nestlock dirty ?
12213 goto do_rdlocks;
12214
12215 rdlocks.insert(&diri->dirfragtreelock);
12216 wrlocks.insert(&diri->nestlock);
12217 wrlocks.insert(&diri->filelock);
12218 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12219 return;
12220
12221 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12222 // the scatter-gather process, which will fix any fragstat/rstat errors.
12223 diri->dirfragtree.get_leaves(frags);
12224 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12225 CDir *dir = diri->get_dirfrag(*p);
12226 if (!dir) {
12227 assert(mdr->is_auth_pinned(diri));
12228 dir = diri->get_or_open_dirfrag(this, *p);
12229 }
12230 if (dir->get_version() == 0) {
12231 assert(dir->is_auth());
12232 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12233 return;
12234 }
12235 }
12236
12237 diri->state_set(CInode::STATE_REPAIRSTATS);
12238 mdr->ls = mds->mdlog->get_current_segment();
12239 mds->locker->mark_updated_scatterlock(&diri->filelock);
12240 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12241 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12242 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12243
12244 mds->locker->drop_locks(mdr.get());
12245
12246 do_rdlocks:
12247 // force the scatter-gather process
12248 rdlocks.insert(&diri->dirfragtreelock);
12249 rdlocks.insert(&diri->nestlock);
12250 rdlocks.insert(&diri->filelock);
12251 wrlocks.clear();
12252 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12253 return;
12254
12255 diri->state_clear(CInode::STATE_REPAIRSTATS);
12256
12257 frag_info_t dir_info;
12258 nest_info_t nest_info;
12259 nest_info.rsubdirs++; // it gets one to account for self
12260
12261 diri->dirfragtree.get_leaves(frags);
12262 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12263 CDir *dir = diri->get_dirfrag(*p);
12264 assert(dir);
12265 assert(dir->get_version() > 0);
12266 dir_info.add(dir->fnode.accounted_fragstat);
12267 nest_info.add(dir->fnode.accounted_rstat);
12268 }
12269
12270 if (!dir_info.same_sums(diri->inode.dirstat) ||
12271 !nest_info.same_sums(diri->inode.rstat)) {
12272 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12273 << *diri << dendl;
12274 }
12275
12276 mds->server->respond_to_request(mdr, 0);
12277 }
12278
12279 void MDCache::flush_dentry(const string& path, Context *fin)
12280 {
12281 if (is_readonly()) {
12282 dout(10) << __func__ << ": read-only FS" << dendl;
12283 fin->complete(-EROFS);
12284 return;
12285 }
12286 dout(10) << "flush_dentry " << path << dendl;
12287 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12288 filepath fp(path.c_str());
12289 mdr->set_filepath(fp);
12290 mdr->internal_op_finish = fin;
12291 flush_dentry_work(mdr);
12292 }
12293
12294 class C_FinishIOMDR : public MDSInternalContextBase {
12295 protected:
12296 MDSRank *mds;
12297 MDRequestRef mdr;
12298 MDSRank *get_mds() override { return mds; }
12299 public:
12300 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12301 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12302 };
12303
12304 void MDCache::flush_dentry_work(MDRequestRef& mdr)
12305 {
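  // Resolve and rdlock the path to the target inode, then flush it;
  // C_FinishIOMDR replies to the internal request when the flush completes.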
12306 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12307 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12308 if (NULL == in)
12309 return;
12310
12311 // TODO: Is this necessary? Fix it if so
12312 assert(in->is_auth());
12313 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12314 if (!locked)
12315 return;
12316 in->flush(new C_FinishIOMDR(mds, mdr));
12317 }
12318
12319
12320 /**
12321 * Initialize performance counters with global perfcounter
12322 * collection.
12323 */
12324 void MDCache::register_perfcounters()
12325 {
12326 PerfCountersBuilder pcb(g_ceph_context,
12327 "mds_cache", l_mdc_first, l_mdc_last);
12328
12329 /* Stray/purge statistics */
12330 pcb.add_u64(l_mdc_num_strays, "num_strays",
12331 "Stray dentries", "stry");
12332 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12333 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12334
12335 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12336 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12337 "Stray dentries enqueued for purge");
12338 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12339 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12340
12341
12342 /* Recovery queue statistics */
12343 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12344 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
12345 "Files waiting for recovery", "recy");
12346 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12347 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12348 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
12349 "File recoveries completed", "recd");
12350
12351 logger.reset(pcb.create_perf_counters());
12352 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12353 recovery_queue.set_logger(logger.get());
12354 stray_manager.set_logger(logger.get());
12355 }
12356
12357 void MDCache::activate_stray_manager()
12358 {
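  // The stray manager needs an open cache; otherwise defer activation until
  // the open waiters are kicked.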
12359 if (open) {
12360 stray_manager.activate();
12361 } else {
12362 wait_for_open(
12363 new MDSInternalContextWrapper(mds,
12364 new FunctionContext([this](int r){
12365 stray_manager.activate();
12366 })
12367 )
12368 );
12369 }
12370 }
12371
12372 /**
12373 * Call this when putting references to an inode/dentry or
12374 * when attempting to trim it.
12375 *
12376 * If this inode is no longer linked by anyone, and this MDS
12377 * rank holds the primary dentry, and that dentry is in a stray
12378 * directory, then give up the dentry to the StrayManager, never
12379 * to be seen again by MDCache.
12380 *
12381 * @param delay if true, then purgeable inodes are stashed til
12382 * the next trim(), rather than being purged right
12383 * away.
12384 */
12385 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
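  // Only unlinked, non-base inodes on a writable, non-standby-replay MDS are
  // candidates for stray evaluation.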
12386 if (in->inode.nlink > 0 || in->is_base() || is_readonly() || mds->is_standby_replay())
12387 return;
12388 CDentry *dn = in->get_projected_parent_dn();
12389
12390 if (dn->state_test(CDentry::STATE_PURGING)) {
12391 /* We have already entered the purging process, no need
12392 * to re-evaluate it. */
12393 return;
12394 }
12395
12396 if (dn->get_projected_linkage()->is_primary() &&
12397 dn->get_dir()->get_inode()->is_stray()) {
12398 stray_manager.eval_stray(dn, delay);
12399 }
12400 }
12401
12402 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12403 dout(10) << __func__ << " " << *diri << dendl;
12404 assert(diri->get_projected_parent_dir()->inode->is_stray());
12405 list<CDir*> ls;
12406 diri->get_dirfrags(ls);
12407 for (auto p : ls) {
12408 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12409 p->try_remove_dentries_for_stray();
12410 }
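  // With no snaprealm attached to this stray, any remaining dirty rstat and
  // scatterlock state can simply be dropped.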
12411 if (!diri->snaprealm) {
12412 if (diri->is_auth())
12413 diri->clear_dirty_rstat();
12414 diri->clear_scatter_dirty();
12415 }
12416 }
12417