ceph/src/mds/MDCache.cc (ceph.git blob e954c8912a57fbc6c470ff20d3626dac6556e3c7)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <boost/utility/string_view.hpp>
21 #include <map>
22
23 #include "MDCache.h"
24 #include "MDSRank.h"
25 #include "Server.h"
26 #include "Locker.h"
27 #include "MDLog.h"
28 #include "MDBalancer.h"
29 #include "Migrator.h"
30 #include "ScrubStack.h"
31
32 #include "SnapClient.h"
33
34 #include "MDSMap.h"
35
36 #include "CInode.h"
37 #include "CDir.h"
38
39 #include "Mutation.h"
40
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
44
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
47
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
52
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
55
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/ESlaveUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/ESessions.h"
63
64 #include "messages/MGenericMessage.h"
65
66 #include "messages/MMDSResolve.h"
67 #include "messages/MMDSResolveAck.h"
68 #include "messages/MMDSCacheRejoin.h"
69
70 #include "messages/MDiscover.h"
71 #include "messages/MDiscoverReply.h"
72
73 //#include "messages/MInodeUpdate.h"
74 #include "messages/MDirUpdate.h"
75 #include "messages/MCacheExpire.h"
76
77 #include "messages/MInodeFileCaps.h"
78
79 #include "messages/MLock.h"
80 #include "messages/MDentryLink.h"
81 #include "messages/MDentryUnlink.h"
82
83 #include "messages/MMDSFindIno.h"
84 #include "messages/MMDSFindInoReply.h"
85
86 #include "messages/MMDSOpenIno.h"
87 #include "messages/MMDSOpenInoReply.h"
88
89 #include "messages/MClientRequest.h"
90 #include "messages/MClientCaps.h"
91 #include "messages/MClientSnap.h"
92 #include "messages/MClientQuota.h"
93
94 #include "messages/MMDSSlaveRequest.h"
95
96 #include "messages/MMDSFragmentNotify.h"
97
98 #include "messages/MGatherCaps.h"
99
100 #include "InoTable.h"
101
102 #include "common/Timer.h"
103
104 #include "perfglue/heap_profiler.h"
105
106 using namespace std;
107
108 #include "common/config.h"
109 #include "include/assert.h"
110
111 #define dout_context g_ceph_context
112 #define dout_subsys ceph_subsys_mds
113 #undef dout_prefix
114 #define dout_prefix _prefix(_dout, mds)
115 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
116 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
117 }
118
119 set<int> SimpleLock::empty_gather_set;
120
121
122 /**
123 * All non-I/O contexts that require a reference
124 * to an MDCache instance descend from this.
125 */
126 class MDCacheContext : public virtual MDSInternalContextBase {
127 protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
131 assert(mdcache != NULL);
132 return mdcache->mds;
133 }
134 public:
135 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
136 };
137
138
139 /**
140 * Only for contexts called back from an I/O completion
141 *
142 * Note: duplication of members wrt MDCacheContext, because
143  * it's the lesser of two evils compared with introducing
144 * yet another piece of (multiple) inheritance.
145 */
146 class MDCacheIOContext : public virtual MDSIOContextBase {
147 protected:
148 MDCache *mdcache;
149 MDSRank *get_mds() override
150 {
151 assert(mdcache != NULL);
152 return mdcache->mds;
153 }
154 public:
155 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
156 };
157
158 class MDCacheLogContext : public virtual MDSLogContextBase {
159 protected:
160 MDCache *mdcache;
161 MDSRank *get_mds() override
162 {
163 assert(mdcache != NULL);
164 return mdcache->mds;
165 }
166 public:
167 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
168 };
169
170 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
171 mds(m),
172 filer(m->objecter, m->finisher),
173 exceeded_size_limit(false),
174 recovery_queue(m),
175 stray_manager(m, purge_queue_)
176 {
177 migrator.reset(new Migrator(mds, this));
178 root = NULL;
179 myin = NULL;
180 readonly = false;
181
182 stray_index = 0;
183 for (int i = 0; i < NUM_STRAY; ++i) {
184 strays[i] = NULL;
185 }
186
187 num_shadow_inodes = 0;
188 num_inodes_with_caps = 0;
189
190 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
191 (g_conf->mds_dir_max_commit_size << 20) :
192 (0.9 *(g_conf->osd_max_write_size << 20));
193
194 discover_last_tid = 0;
195 open_ino_last_tid = 0;
196 find_ino_peer_last_tid = 0;
197
198 last_cap_id = 0;
199
200 client_lease_durations[0] = 5.0;
201 client_lease_durations[1] = 30.0;
202 client_lease_durations[2] = 300.0;
203
204 resolves_pending = false;
205 rejoins_pending = false;
206 cap_imports_num_opening = 0;
207
208 opening_root = open = false;
209 lru.lru_set_midpoint(cache_mid());
210
211 bottom_lru.lru_set_midpoint(0);
212
213 decayrate.set_halflife(g_conf->mds_decay_halflife);
214
215 did_shutdown_log_cap = false;
216 }
217
218 MDCache::~MDCache()
219 {
220 if (logger) {
221 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
222 }
223 }
224
225
226
227 void MDCache::log_stat()
228 {
229 mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
230 mds->logger->set(l_mds_inodes, lru.lru_get_size());
231 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
232 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
233 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
234 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
235 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
236 mds->logger->set(l_mds_caps, Capability::count());
237 }
238
239
240 //
241
242 bool MDCache::shutdown()
243 {
244 if (lru.lru_get_size() > 0) {
245 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
246 //show_cache();
247 show_subtrees();
248 //dump();
249 }
250 return true;
251 }
252
253
254 // ====================================================================
255 // some inode functions
256
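// Register a CInode in the cache: head inodes (last == CEPH_NOSNAP) go into
// inode_map, snapped inodes into snap_inode_map.  System inodes (ino below
// MDS_INO_SYSTEM_BASE) also update the root/myin/strays/base_inodes bookkeeping,
// and exceeded_size_limit is raised if the cache is already over its limit.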
257 void MDCache::add_inode(CInode *in)
258 {
259 // add to lru, inode map
260 if (in->last == CEPH_NOSNAP) {
261 auto &p = inode_map[in->ino()];
262 assert(!p); // should be no dup inos!
263 p = in;
264 } else {
265 auto &p = snap_inode_map[in->vino()];
266 assert(!p); // should be no dup inos!
267 p = in;
268 }
269
270 if (in->ino() < MDS_INO_SYSTEM_BASE) {
271 if (in->ino() == MDS_INO_ROOT)
272 root = in;
273 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
274 myin = in;
275 else if (in->is_stray()) {
276 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
277 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
278 }
279 }
280 if (in->is_base())
281 base_inodes.insert(in);
282 }
283
284 if (cache_toofull()) {
285 exceeded_size_limit = true;
286 }
287 }
288
289 void MDCache::remove_inode(CInode *o)
290 {
291 dout(14) << "remove_inode " << *o << dendl;
292
293 if (o->get_parent_dn()) {
294 // FIXME: multiple parents?
295 CDentry *dn = o->get_parent_dn();
296 assert(!dn->is_dirty());
297 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
298 }
299
300 if (o->is_dirty())
301 o->mark_clean();
302 if (o->is_dirty_parent())
303 o->clear_dirty_parent();
304
305 o->clear_scatter_dirty();
306
307 o->item_open_file.remove_myself();
308
309 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
310 export_pin_queue.erase(o);
311
312 // remove from inode map
313 if (o->last == CEPH_NOSNAP)
314 inode_map.erase(o->ino());
315 else
316 snap_inode_map.erase(o->vino());
317
318 if (o->ino() < MDS_INO_SYSTEM_BASE) {
319 if (o == root) root = 0;
320 if (o == myin) myin = 0;
321 if (o->is_stray()) {
322 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
323 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
324 }
325 }
326 if (o->is_base())
327 base_inodes.erase(o);
328 }
329
330 // delete it
331 assert(o->get_num_ref() == 0);
332 delete o;
333 }
334
335 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
336 {
337 file_layout_t result = file_layout_t::get_default();
338 result.pool_id = mdsmap.get_first_data_pool();
339 return result;
340 }
341
342 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
343 {
344 file_layout_t result = file_layout_t::get_default();
345 result.pool_id = mdsmap.get_metadata_pool();
346 if (g_conf->mds_log_segment_size > 0) {
347 result.object_size = g_conf->mds_log_segment_size;
348 result.stripe_unit = g_conf->mds_log_segment_size;
349 }
350 return result;
351 }
352
353 void MDCache::init_layouts()
354 {
355 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
356 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
357 }
358
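// Fill in an in-memory system inode (root, mdsdir or stray): version 1,
// owner-only mode bits, fresh timestamps, and rstat seeded with a single
// subdir (for dirs) or file.  Base inodes additionally get an inode_auth
// assignment and their own empty snaprealm.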
359 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
360 int mode) const
361 {
362 in->inode.ino = ino;
363 in->inode.version = 1;
364 in->inode.xattr_version = 1;
365 in->inode.mode = 0500 | mode;
366 in->inode.size = 0;
367 in->inode.ctime =
368 in->inode.mtime =
369 in->inode.btime = ceph_clock_now();
370 in->inode.nlink = 1;
371 in->inode.truncate_size = -1ull;
372 in->inode.change_attr = 0;
373 in->inode.export_pin = MDS_RANK_NONE;
374
375 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
376 if (in->inode.is_dir()) {
377 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
378 ++in->inode.rstat.rsubdirs;
379 } else {
380 in->inode.layout = default_file_layout;
381 ++in->inode.rstat.rfiles;
382 }
383 in->inode.accounted_rstat = in->inode.rstat;
384
385 if (in->is_base()) {
386 if (in->is_root())
387 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
388 else
389 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
390 in->open_snaprealm(); // empty snaprealm
391 assert(!in->snaprealm->parent); // created its own
392 in->snaprealm->srnode.seq = 1;
393 }
394 }
395
396 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
397 {
398 dout(0) << "creating system inode with ino:" << ino << dendl;
399 CInode *in = new CInode(this);
400 create_unlinked_system_inode(in, ino, mode);
401 add_inode(in);
402 return in;
403 }
404
405 CInode *MDCache::create_root_inode()
406 {
407 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
408 i->inode.uid = g_conf->mds_root_ino_uid;
409 i->inode.gid = g_conf->mds_root_ino_gid;
410 i->inode.layout = default_file_layout;
411 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
412 return i;
413 }
414
415 void MDCache::create_empty_hierarchy(MDSGather *gather)
416 {
417 // create root dir
418 CInode *root = create_root_inode();
419
420 // force empty root dir
421 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
422 adjust_subtree_auth(rootdir, mds->get_nodeid());
423 rootdir->dir_rep = CDir::REP_ALL; //NONE;
424
425 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
426 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
427
428 root->inode.dirstat = rootdir->fnode.fragstat;
429 root->inode.rstat = rootdir->fnode.rstat;
430 ++root->inode.rstat.rsubdirs;
431 root->inode.accounted_rstat = root->inode.rstat;
432
433 rootdir->mark_complete();
434 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
435 rootdir->commit(0, gather->new_sub());
436
437 root->store(gather->new_sub());
438 }
439
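// Build this rank's mdsdir together with its NUM_STRAY stray subdirectories,
// mark everything complete/dirty, and commit it through the gather.  (Used
// when the rank's private metadata does not exist yet, e.g. on a newly
// created filesystem.)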
440 void MDCache::create_mydir_hierarchy(MDSGather *gather)
441 {
442 // create mds dir
443 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
444
445 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
446 adjust_subtree_auth(mydir, mds->get_nodeid());
447
448 LogSegment *ls = mds->mdlog->get_current_segment();
449
450 // stray dir
451 for (int i = 0; i < NUM_STRAY; ++i) {
452 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
453 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
454 stringstream name;
455 name << "stray" << i;
456 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
457 sdn->_mark_dirty(mds->mdlog->get_current_segment());
458
459 stray->inode.dirstat = straydir->fnode.fragstat;
460
461 mydir->fnode.rstat.add(stray->inode.rstat);
462 mydir->fnode.fragstat.nsubdirs++;
463 // save them
464 straydir->mark_complete();
465 straydir->mark_dirty(straydir->pre_dirty(), ls);
466 straydir->commit(0, gather->new_sub());
467 stray->_mark_dirty_parent(ls, true);
468 stray->store_backtrace(gather->new_sub());
469 }
470
471 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
472 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
473
474 myin->inode.dirstat = mydir->fnode.fragstat;
475 myin->inode.rstat = mydir->fnode.rstat;
476 ++myin->inode.rstat.rsubdirs;
477 myin->inode.accounted_rstat = myin->inode.rstat;
478
479 mydir->mark_complete();
480 mydir->mark_dirty(mydir->pre_dirty(), ls);
481 mydir->commit(0, gather->new_sub());
482
483 myin->store(gather->new_sub());
484 }
485
486 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
487 MutationRef mut;
488 CDentry *dn;
489 version_t dpv;
490 MDSInternalContextBase *fin;
491 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
492 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
493 void finish(int r) override {
494 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
495 }
496 };
497
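// Link a newly created system inode into 'dir' under 'name', journal the
// creation as an EUpdate, and finish via _create_system_file_finish once the
// log entry is safe.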
498 void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
499 {
500 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
501 CDentry *dn = dir->add_null_dentry(name);
502
503 dn->push_projected_linkage(in);
504 version_t dpv = dn->pre_dirty();
505
506 CDir *mdir = 0;
507 if (in->inode.is_dir()) {
508 in->inode.rstat.rsubdirs = 1;
509
510 mdir = in->get_or_open_dirfrag(this, frag_t());
511 mdir->mark_complete();
512 mdir->pre_dirty();
513 } else
514 in->inode.rstat.rfiles = 1;
515 in->inode.version = dn->pre_dirty();
516
517 SnapRealm *realm = dir->get_inode()->find_snaprealm();
518 dn->first = in->first = realm->get_newest_seq() + 1;
519
520 MutationRef mut(new MutationImpl());
521
522 // force some locks. hacky.
523 mds->locker->wrlock_force(&dir->inode->filelock, mut);
524 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
525
526 mut->ls = mds->mdlog->get_current_segment();
527 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
528 mds->mdlog->start_entry(le);
529
530 if (!in->is_mdsdir()) {
531 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
532 le->metablob.add_primary_dentry(dn, in, true);
533 } else {
534 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
535 journal_dirty_inode(mut.get(), &le->metablob, in);
536 dn->push_projected_linkage(in->ino(), in->d_type());
537 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
538 le->metablob.add_root(true, in);
539 }
540 if (mdir)
541 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
542
543 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
544 mds->mdlog->flush();
545 }
546
547 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
548 {
549 dout(10) << "_create_system_file_finish " << *dn << dendl;
550
551 dn->pop_projected_linkage();
552 dn->mark_dirty(dpv, mut->ls);
553
554 CInode *in = dn->get_linkage()->get_inode();
555 in->inode.version--;
556 in->mark_dirty(in->inode.version + 1, mut->ls);
557
558 if (in->inode.is_dir()) {
559 CDir *dir = in->get_dirfrag(frag_t());
560 assert(dir);
561 dir->mark_dirty(1, mut->ls);
562 dir->mark_new(mut->ls);
563 }
564
565 mut->apply();
566 mds->locker->drop_locks(mut.get());
567 mut->cleanup();
568
569 fin->complete(0);
570
571 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
572 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
573 }
574
575
576
577 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
578 MDCache *cache;
579 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
580 void finish(int r) override {
581 if (r < 0) {
582 // If we can't open root, something disastrous has happened: mark
583 // this rank damaged for operator intervention. Note that
584 // it is not okay to call suicide() here because we are in
585 // a Finisher callback.
586 cache->mds->damaged();
587 ceph_abort(); // damaged should never return
588 } else {
589 cache->open_root();
590 }
591 }
592 };
593
594 void MDCache::open_root_inode(MDSInternalContextBase *c)
595 {
596 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
597 CInode *in;
598 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
599 in->fetch(c);
600 } else {
601 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
602 }
603 }
604
605 void MDCache::open_mydir_inode(MDSInternalContextBase *c)
606 {
607 MDSGatherBuilder gather(g_ceph_context);
608
609 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
610 in->fetch(gather.new_sub());
611
612 gather.set_finisher(c);
613 gather.activate();
614 }
615
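// Multi-pass open of the root and local mdsdir: each missing inode or dirfrag
// is fetched (or discovered from its auth MDS) and the whole method retried
// through C_MDS_RetryOpenRoot until everything is cached, after which
// populate_mydir() takes over.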
616 void MDCache::open_root()
617 {
618 dout(10) << "open_root" << dendl;
619
620 if (!root) {
621 open_root_inode(new C_MDS_RetryOpenRoot(this));
622 return;
623 }
624 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
625 assert(root->is_auth());
626 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
627 assert(rootdir);
628 if (!rootdir->is_subtree_root())
629 adjust_subtree_auth(rootdir, mds->get_nodeid());
630 if (!rootdir->is_complete()) {
631 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
632 return;
633 }
634 } else {
635 assert(!root->is_auth());
636 CDir *rootdir = root->get_dirfrag(frag_t());
637 if (!rootdir) {
638 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
639 return;
640 }
641 }
642
643 if (!myin) {
644 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
645 in->fetch(new C_MDS_RetryOpenRoot(this));
646 return;
647 }
648 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
649 assert(mydir);
650 adjust_subtree_auth(mydir, mds->get_nodeid());
651
652 populate_mydir();
653 }
654
655 void MDCache::populate_mydir()
656 {
657 assert(myin);
658 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
659 assert(mydir);
660
661 dout(10) << "populate_mydir " << *mydir << dendl;
662
663 if (!mydir->is_complete()) {
664 mydir->fetch(new C_MDS_RetryOpenRoot(this));
665 return;
666 }
667
668 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
669     // A missing dirfrag: we will recreate it.  It must be marked dirty before
670     // any of the strays we create within it are dirtied.
671 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
672 "recreating it now";
673 LogSegment *ls = mds->mdlog->get_current_segment();
674 mydir->state_clear(CDir::STATE_BADFRAG);
675 mydir->mark_complete();
676 mydir->mark_dirty(mydir->pre_dirty(), ls);
677 }
678
679 // open or create stray
680 uint64_t num_strays = 0;
681 for (int i = 0; i < NUM_STRAY; ++i) {
682 stringstream name;
683 name << "stray" << i;
684 CDentry *straydn = mydir->lookup(name.str());
685
686 // allow for older fs's with stray instead of stray0
687 if (straydn == NULL && i == 0)
688 straydn = mydir->lookup("stray");
689
690 if (!straydn || !straydn->get_linkage()->get_inode()) {
691 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
692 new C_MDS_RetryOpenRoot(this));
693 return;
694 }
695 assert(straydn);
696 assert(strays[i]);
697 // we make multiple passes through this method; make sure we only pin each stray once.
698 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
699 strays[i]->get(CInode::PIN_STRAY);
700 strays[i]->state_set(CInode::STATE_STRAYPINNED);
701 strays[i]->get_stickydirs();
702 }
703 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
704
705 // open all frags
706 list<frag_t> ls;
707 strays[i]->dirfragtree.get_leaves(ls);
708 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
709 frag_t fg = *p;
710 CDir *dir = strays[i]->get_dirfrag(fg);
711 if (!dir) {
712 dir = strays[i]->get_or_open_dirfrag(this, fg);
713 }
714
715 // DamageTable applies special handling to strays: it will
716 // have damaged() us out if one is damaged.
717 assert(!dir->state_test(CDir::STATE_BADFRAG));
718
719 if (dir->get_version() == 0) {
720 dir->fetch(new C_MDS_RetryOpenRoot(this));
721 return;
722 }
723
724 if (dir->get_frag_size() > 0)
725 num_strays += dir->get_frag_size();
726 }
727 }
728
729 stray_manager.set_num_strays(num_strays);
730
731 // okay!
732 dout(10) << "populate_mydir done" << dendl;
733 assert(!open);
734 open = true;
735 mds->queue_waiters(waiting_for_open);
736
737 scan_stray_dir();
738 }
739
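// Discover another rank's mdsdir inode; the owning rank is recovered by
// masking the ino with (MAX_MDS-1), since MDS_INO_MDSDIR() places mdsdir
// inos at a fixed offset plus the rank.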
740 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
741 {
742 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
743 }
744
745 CDir *MDCache::get_stray_dir(CInode *in)
746 {
747 string straydname;
748 in->name_stray_dentry(straydname);
749
750 CInode *strayi = get_stray();
751 assert(strayi);
752 frag_t fg = strayi->pick_dirfrag(straydname);
753 CDir *straydir = strayi->get_dirfrag(fg);
754 assert(straydir);
755 return straydir;
756 }
757
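// Look up (or create as a new null dentry) the stray dentry this inode would
// be relinked to on unlink, and tag it with CDentry::STATE_STRAY.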
758 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
759 {
760 CDir *straydir = get_stray_dir(in);
761 string straydname;
762 in->name_stray_dentry(straydname);
763 CDentry *straydn = straydir->lookup(straydname);
764 if (!straydn) {
765 straydn = straydir->add_null_dentry(straydname);
766 straydn->mark_new();
767 } else {
768 assert(straydn->get_projected_linkage()->is_null());
769 }
770
771 straydn->state_set(CDentry::STATE_STRAY);
772 return straydn;
773 }
774
775
776
777 MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
778 {
779 // inode?
780 if (info.ino)
781 return get_inode(info.ino, info.snapid);
782
783 // dir or dentry.
784 CDir *dir = get_dirfrag(info.dirfrag);
785 if (!dir) return 0;
786
787 if (info.dname.length())
788 return dir->lookup(info.dname, info.snapid);
789 else
790 return dir;
791 }
792
793
794
795
796 // ====================================================================
797 // subtree management
798
799 void MDCache::list_subtrees(list<CDir*>& ls)
800 {
801 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
802 p != subtrees.end();
803 ++p)
804 ls.push_back(p->first);
805 }
806
807 /*
808 * adjust the dir_auth of a subtree.
809  * merge with parent and/or child subtrees, if it is appropriate.
810 * merge can ONLY happen if both parent and child have unambiguous auth.
811 */
812 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
813 {
814 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
815 << " on " << *dir << dendl;
816
817 show_subtrees();
818
819 CDir *root;
820 if (dir->inode->is_base()) {
821 root = dir; // bootstrap hack.
822 if (subtrees.count(root) == 0) {
823 subtrees[root];
824 root->get(CDir::PIN_SUBTREE);
825 }
826 } else {
827 root = get_subtree_root(dir); // subtree root
828 }
829 assert(root);
830 assert(subtrees.count(root));
831 dout(7) << " current root is " << *root << dendl;
832
833 if (root == dir) {
834 // i am already a subtree.
835 dir->set_dir_auth(auth);
836 } else {
837 // i am a new subtree.
838 dout(10) << " new subtree at " << *dir << dendl;
839 assert(subtrees.count(dir) == 0);
840 subtrees[dir]; // create empty subtree bounds list for me.
841 dir->get(CDir::PIN_SUBTREE);
842
843 // set dir_auth
844 dir->set_dir_auth(auth);
845
846 // move items nested beneath me, under me.
847 set<CDir*>::iterator p = subtrees[root].begin();
848 while (p != subtrees[root].end()) {
849 set<CDir*>::iterator next = p;
850 ++next;
851 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
852 // move under me
853 dout(10) << " claiming child bound " << **p << dendl;
854 subtrees[dir].insert(*p);
855 subtrees[root].erase(p);
856 }
857 p = next;
858 }
859
860 // i am a bound of the parent subtree.
861 subtrees[root].insert(dir);
862
863 // i am now the subtree root.
864 root = dir;
865
866 // adjust recursive pop counters
867 if (dir->is_auth()) {
868 utime_t now = ceph_clock_now();
869 CDir *p = dir->get_parent_dir();
870 while (p) {
871 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
872 if (p->is_subtree_root()) break;
873 p = p->inode->get_parent_dir();
874 }
875 }
876 }
877
878 show_subtrees();
879 }
880
881
882 void MDCache::try_subtree_merge(CDir *dir)
883 {
884 dout(7) << "try_subtree_merge " << *dir << dendl;
885 // record my old bounds
886 auto oldbounds = subtrees.at(dir);
887
888 set<CInode*> to_eval;
889 // try merge at my root
890 try_subtree_merge_at(dir, &to_eval);
891
892 // try merge at my old bounds
893 for (auto bound : oldbounds)
894 try_subtree_merge_at(bound, &to_eval);
895
896 if (!(mds->is_any_replay() || mds->is_resolve())) {
897 for(auto in : to_eval)
898 eval_subtree_root(in);
899 }
900 }
901
902 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
903 CInode *in;
904 MutationRef mut;
905 public:
906 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
907 void finish(int r) override {
908 mdcache->subtree_merge_writebehind_finish(in, mut);
909 }
910 };
911
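// Merge the subtree rooted at 'dir' into its parent subtree if both have the
// same unambiguous auth and 'dir' is neither an export bound nor an aux
// subtree: the bounds move up to the parent, auth popularity is folded back
// into the ancestors, and the subtree root inode is handed to 'to_eval' so
// the caller can re-evaluate its locks.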
912 void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
913 {
914 dout(10) << "try_subtree_merge_at " << *dir << dendl;
915
916 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
917 dir->state_test(CDir::STATE_EXPORTBOUND) ||
918 dir->state_test(CDir::STATE_AUXSUBTREE))
919 return;
920
921 auto it = subtrees.find(dir);
922 assert(it != subtrees.end());
923
924 // merge with parent?
925 CDir *parent = dir;
926 if (!dir->inode->is_base())
927 parent = get_subtree_root(dir->get_parent_dir());
928
929 if (parent != dir && // we have a parent,
930 parent->dir_auth == dir->dir_auth) { // auth matches,
931 // merge with parent.
932 dout(10) << " subtree merge at " << *dir << dendl;
933 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
934
935 // move our bounds under the parent
936 subtrees[parent].insert(it->second.begin(), it->second.end());
937
938 // we are no longer a subtree or bound
939 dir->put(CDir::PIN_SUBTREE);
940 subtrees.erase(it);
941 subtrees[parent].erase(dir);
942
943 // adjust popularity?
944 if (dir->is_auth()) {
945 utime_t now = ceph_clock_now();
946 CDir *p = dir->get_parent_dir();
947 while (p) {
948 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
949 if (p->is_subtree_root()) break;
950 p = p->inode->get_parent_dir();
951 }
952 }
953
954 if (to_eval && dir->get_inode()->is_auth())
955 to_eval->insert(dir->get_inode());
956
957 show_subtrees(15);
958 }
959 }
960
961 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
962 {
963 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
964 in->pop_and_dirty_projected_inode(mut->ls);
965
966 mut->apply();
967 mds->locker->drop_locks(mut.get());
968 mut->cleanup();
969
970 in->auth_unpin(this);
971 }
972
973 void MDCache::eval_subtree_root(CInode *diri)
974 {
975 // evaluate subtree inode filelock?
976 // (we should scatter the filelock on subtree bounds)
977 assert(diri->is_auth());
978 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
979 }
980
981
982 void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
983 {
984 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
985 << " on " << *dir
986 << " bounds " << bounds
987 << dendl;
988
989 show_subtrees();
990
991 CDir *root;
992 if (dir->ino() == MDS_INO_ROOT) {
993 root = dir; // bootstrap hack.
994 if (subtrees.count(root) == 0) {
995 subtrees[root];
996 root->get(CDir::PIN_SUBTREE);
997 }
998 } else {
999 root = get_subtree_root(dir); // subtree root
1000 }
1001 assert(root);
1002 assert(subtrees.count(root));
1003 dout(7) << " current root is " << *root << dendl;
1004
1005 mds_authority_t oldauth = dir->authority();
1006
1007 if (root == dir) {
1008 // i am already a subtree.
1009 dir->set_dir_auth(auth);
1010 } else {
1011 // i am a new subtree.
1012 dout(10) << " new subtree at " << *dir << dendl;
1013 assert(subtrees.count(dir) == 0);
1014 subtrees[dir]; // create empty subtree bounds list for me.
1015 dir->get(CDir::PIN_SUBTREE);
1016
1017 // set dir_auth
1018 dir->set_dir_auth(auth);
1019
1020 // move items nested beneath me, under me.
1021 set<CDir*>::iterator p = subtrees[root].begin();
1022 while (p != subtrees[root].end()) {
1023 set<CDir*>::iterator next = p;
1024 ++next;
1025 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1026 // move under me
1027 dout(10) << " claiming child bound " << **p << dendl;
1028 subtrees[dir].insert(*p);
1029 subtrees[root].erase(p);
1030 }
1031 p = next;
1032 }
1033
1034 // i am a bound of the parent subtree.
1035 subtrees[root].insert(dir);
1036
1037 // i am now the subtree root.
1038 root = dir;
1039 }
1040
1041 set<CInode*> to_eval;
1042
1043 // verify/adjust bounds.
1044 // - these may be new, or
1045 // - beneath existing ambiguous bounds (which will be collapsed),
1046 // - but NOT beneath unambiguous bounds.
1047 for (set<CDir*>::iterator p = bounds.begin();
1048 p != bounds.end();
1049 ++p) {
1050 CDir *bound = *p;
1051
1052 // new bound?
1053 if (subtrees[dir].count(bound) == 0) {
1054 if (get_subtree_root(bound) == dir) {
1055 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1056 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1057 }
1058 else {
1059 dout(10) << " want bound " << *bound << dendl;
1060 CDir *t = get_subtree_root(bound->get_parent_dir());
1061 if (subtrees[t].count(bound) == 0) {
1062 assert(t != dir);
1063 dout(10) << " new bound " << *bound << dendl;
1064 adjust_subtree_auth(bound, t->authority());
1065 }
1066 // make sure it's nested beneath ambiguous subtree(s)
1067 while (1) {
1068 while (subtrees[dir].count(t) == 0)
1069 t = get_subtree_root(t->get_parent_dir());
1070 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1071 adjust_subtree_auth(t, auth);
1072 try_subtree_merge_at(t, &to_eval);
1073 t = get_subtree_root(bound->get_parent_dir());
1074 if (t == dir) break;
1075 }
1076 }
1077 }
1078 else {
1079 dout(10) << " already have bound " << *bound << dendl;
1080 }
1081 }
1082 // merge stray bounds?
1083 while (!subtrees[dir].empty()) {
1084 set<CDir*> copy = subtrees[dir];
1085 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1086 if (bounds.count(*p) == 0) {
1087 CDir *stray = *p;
1088 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1089 adjust_subtree_auth(stray, auth);
1090 try_subtree_merge_at(stray, &to_eval);
1091 }
1092 }
1093 // swallowing subtree may add new subtree bounds
1094 if (copy == subtrees[dir])
1095 break;
1096 }
1097
1098 // bound should now match.
1099 verify_subtree_bounds(dir, bounds);
1100
1101 show_subtrees();
1102
1103 if (!(mds->is_any_replay() || mds->is_resolve())) {
1104 for(auto in : to_eval)
1105 eval_subtree_root(in);
1106 }
1107 }
1108
1109
1110 /*
1111 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1112 * fragmentation as necessary to get an equivalent bounding set. That is, only
1113 * split if one of our frags spans the provided bounding set. Never merge.
1114 */
1115 void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1116 {
1117 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1118
1119 // sort by ino
1120 map<inodeno_t, fragset_t> byino;
1121 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1122 byino[p->ino].insert(p->frag);
1123 dout(10) << " by ino: " << byino << dendl;
1124
1125 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1126 CInode *diri = get_inode(p->first);
1127 if (!diri)
1128 continue;
1129 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1130
1131 fragtree_t tmpdft;
1132 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1133 tmpdft.force_to_leaf(g_ceph_context, *q);
1134
1135 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1136 frag_t fg = *q;
1137 list<frag_t> fgls;
1138 diri->dirfragtree.get_leaves_under(fg, fgls);
1139 if (fgls.empty()) {
1140 bool all = true;
1141 frag_t approx_fg = diri->dirfragtree[fg.value()];
1142 list<frag_t> ls;
1143 tmpdft.get_leaves_under(approx_fg, ls);
1144 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1145 if (p->second.get().count(*r) == 0) {
1146 // not bound, so the resolve message is from auth MDS of the dirfrag
1147 force_dir_fragment(diri, *r);
1148 all = false;
1149 }
1150 }
1151 if (all)
1152 fgls.push_back(approx_fg);
1153 else
1154 diri->dirfragtree.get_leaves_under(fg, fgls);
1155 }
1156 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1157 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1158 CDir *dir = diri->get_dirfrag(*r);
1159 if (dir)
1160 bounds.insert(dir);
1161 }
1162 }
1163 }
1164 }
1165
1166 void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1167 {
1168 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1169 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1170
1171 set<CDir*> bounds;
1172 get_force_dirfrag_bound_set(bound_dfs, bounds);
1173 adjust_bounded_subtree_auth(dir, bounds, auth);
1174 }
1175
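// Map a list of dirfrag_t to the CDirs currently in cache: group by inode,
// expand each requested frag to the leaves of that inode's dirfragtree, and
// skip anything not in cache.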
1176 void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1177 {
1178 dout(10) << "map_dirfrag_set " << dfs << dendl;
1179
1180 // group by inode
1181 map<inodeno_t, fragset_t> ino_fragset;
1182 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1183 ino_fragset[p->ino].insert(p->frag);
1184
1185 // get frags
1186 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1187 p != ino_fragset.end();
1188 ++p) {
1189 CInode *in = get_inode(p->first);
1190 if (!in)
1191 continue;
1192
1193 list<frag_t> fglist;
1194 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1195 in->dirfragtree.get_leaves_under(*q, fglist);
1196
1197 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1198 << " on " << *in << dendl;
1199
1200 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1201 CDir *dir = in->get_dirfrag(*q);
1202 if (dir)
1203 result.insert(dir);
1204 }
1205 }
1206 }
1207
1208
1209
1210 CDir *MDCache::get_subtree_root(CDir *dir)
1211 {
1212 // find the underlying dir that delegates (or is about to delegate) auth
1213 while (true) {
1214 if (dir->is_subtree_root())
1215 return dir;
1216 dir = dir->get_inode()->get_parent_dir();
1217 if (!dir)
1218 return 0; // none
1219 }
1220 }
1221
1222 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1223 {
1224 // find the underlying dir that delegates (or is about to delegate) auth
1225 while (true) {
1226 if (dir->is_subtree_root())
1227 return dir;
1228 dir = dir->get_inode()->get_projected_parent_dir();
1229 if (!dir)
1230 return 0; // none
1231 }
1232 }
1233
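// Unregister 'dir' as a subtree root (it must have no remaining bounds) and
// remove it from its parent subtree's bound set.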
1234 void MDCache::remove_subtree(CDir *dir)
1235 {
1236 dout(10) << "remove_subtree " << *dir << dendl;
1237 assert(subtrees.count(dir));
1238 assert(subtrees[dir].empty());
1239 subtrees.erase(dir);
1240 dir->put(CDir::PIN_SUBTREE);
1241 if (dir->get_parent_dir()) {
1242 CDir *p = get_subtree_root(dir->get_parent_dir());
1243 assert(subtrees[p].count(dir));
1244 subtrees[p].erase(dir);
1245 }
1246 }
1247
1248 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1249 {
1250 assert(subtrees.count(dir));
1251 bounds = subtrees[dir];
1252 }
1253
1254 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1255 {
1256 if (subtrees.count(dir)) {
1257 // just copy them, dir is a subtree.
1258 get_subtree_bounds(dir, bounds);
1259 } else {
1260 // find them
1261 CDir *root = get_subtree_root(dir);
1262 for (set<CDir*>::iterator p = subtrees[root].begin();
1263 p != subtrees[root].end();
1264 ++p) {
1265 CDir *t = *p;
1266 while (t != root) {
1267 t = t->get_parent_dir();
1268 assert(t);
1269 if (t == dir) {
1270 bounds.insert(*p);
1271 continue;
1272 }
1273 }
1274 }
1275 }
1276 }
1277
1278 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1279 {
1280 // for debugging only.
1281 assert(subtrees.count(dir));
1282 if (bounds != subtrees[dir]) {
1283 dout(0) << "verify_subtree_bounds failed" << dendl;
1284 set<CDir*> b = bounds;
1285 for (auto &cd : subtrees[dir]) {
1286 if (bounds.count(cd)) {
1287 b.erase(cd);
1288 continue;
1289 }
1290 dout(0) << " missing bound " << *cd << dendl;
1291 }
1292 for (const auto &cd : b)
1293 dout(0) << " extra bound " << *cd << dendl;
1294 }
1295 assert(bounds == subtrees[dir]);
1296 }
1297
1298 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1299 {
1300 // for debugging only.
1301 assert(subtrees.count(dir));
1302
1303 // make sure that any bounds i do have are properly noted as such.
1304 int failed = 0;
1305 for (const auto &fg : bounds) {
1306 CDir *bd = get_dirfrag(fg);
1307 if (!bd) continue;
1308 if (subtrees[dir].count(bd) == 0) {
1309 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1310 failed++;
1311 }
1312 }
1313 assert(failed == 0);
1314 }
1315
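// Note a projected rename of directory inode 'diri' from 'olddir' to
// 'newdir'; adjust_subtree_after_rename() pops the matching entry when the
// rename is applied (pop=true).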
1316 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1317 {
1318 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1319 << " to " << *newdir << dendl;
1320 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1321 }
1322
1323 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1324 {
1325 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1326
1327 //show_subtrees();
1328
1329 CDir *newdir = diri->get_parent_dir();
1330
1331 if (pop) {
1332 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1333 assert(p != projected_subtree_renames.end());
1334 assert(!p->second.empty());
1335 assert(p->second.front().first == olddir);
1336 assert(p->second.front().second == newdir);
1337 p->second.pop_front();
1338 if (p->second.empty())
1339 projected_subtree_renames.erase(p);
1340 }
1341
1342 // adjust subtree
1343 list<CDir*> dfls;
1344 // make sure subtree dirfrags are at the front of the list
1345 diri->get_subtree_dirfrags(dfls);
1346 diri->get_nested_dirfrags(dfls);
1347 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1348 CDir *dir = *p;
1349
1350 dout(10) << "dirfrag " << *dir << dendl;
1351 CDir *oldparent = get_subtree_root(olddir);
1352 dout(10) << " old parent " << *oldparent << dendl;
1353 CDir *newparent = get_subtree_root(newdir);
1354 dout(10) << " new parent " << *newparent << dendl;
1355
1356 if (oldparent == newparent) {
1357 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1358 continue;
1359 }
1360
1361 if (dir->is_subtree_root()) {
1362 // children are fine. change parent.
1363 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1364 assert(subtrees[oldparent].count(dir));
1365 subtrees[oldparent].erase(dir);
1366 assert(subtrees.count(newparent));
1367 subtrees[newparent].insert(dir);
1368 // caller is responsible for 'eval diri'
1369 try_subtree_merge_at(dir, NULL);
1370 } else {
1371 // mid-subtree.
1372
1373 // see if any old bounds move to the new parent.
1374 list<CDir*> tomove;
1375 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1376 p != subtrees[oldparent].end();
1377 ++p) {
1378 CDir *bound = *p;
1379 CDir *broot = get_subtree_root(bound->get_parent_dir());
1380 if (broot != oldparent) {
1381 assert(broot == newparent);
1382 tomove.push_back(bound);
1383 }
1384 }
1385 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1386 CDir *bound = *p;
1387 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1388 subtrees[oldparent].erase(bound);
1389 subtrees[newparent].insert(bound);
1390 }
1391
1392 // did auth change?
1393 if (oldparent->authority() != newparent->authority()) {
1394 adjust_subtree_auth(dir, oldparent->authority());
1395 // caller is responsible for 'eval diri'
1396 try_subtree_merge_at(dir, NULL);
1397 }
1398 }
1399 }
1400
1401 show_subtrees();
1402 }
1403
1404
1405 void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1406 {
1407 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1408 p != subtrees.end();
1409 ++p) {
1410 CDir *root = p->first;
1411 if (root->is_full_dir_auth())
1412 s.insert(root);
1413 }
1414 }
1415 void MDCache::get_auth_subtrees(set<CDir*>& s)
1416 {
1417 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1418 p != subtrees.end();
1419 ++p) {
1420 CDir *root = p->first;
1421 if (root->is_auth())
1422 s.insert(root);
1423 }
1424 }
1425
1426
1427 // count.
1428
1429 int MDCache::num_subtrees()
1430 {
1431 return subtrees.size();
1432 }
1433
1434 int MDCache::num_subtrees_fullauth()
1435 {
1436 int n = 0;
1437 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1438 p != subtrees.end();
1439 ++p) {
1440 CDir *root = p->first;
1441 if (root->is_full_dir_auth())
1442 n++;
1443 }
1444 return n;
1445 }
1446
1447 int MDCache::num_subtrees_fullnonauth()
1448 {
1449 int n = 0;
1450 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1451 p != subtrees.end();
1452 ++p) {
1453 CDir *root = p->first;
1454 if (root->is_full_dir_nonauth())
1455 n++;
1456 }
1457 return n;
1458 }
1459
1460
1461
1462 // ===================================
1463 // journal and snap/cow helpers
1464
1465
1466 /*
1467 * find first inode in cache that follows given snapid. otherwise, return current.
1468 */
1469 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1470 {
1471 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1472 assert(in->last == CEPH_NOSNAP);
1473
1474 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1475 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1476 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1477 in = p->second;
1478 }
1479
1480 return in;
1481 }
1482
1483
1484 /*
1485 * note: i'm currently cheating wrt dirty and inode.version on cow
1486 * items. instead of doing a full dir predirty, i just take the
1487 * original item's version, and set the dirty flag (via
1488  * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1489 * means a special case in the dir commit clean sweep assertions.
1490 * bah.
1491 */
1492 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1493 {
1494 assert(last >= in->first);
1495
1496 CInode *oldin = new CInode(this, true, in->first, last);
1497 oldin->inode = *in->get_previous_projected_inode();
1498 oldin->symlink = in->symlink;
1499 oldin->xattrs = *in->get_previous_projected_xattrs();
1500 oldin->inode.trim_client_ranges(last);
1501
1502 if (in->first < in->oldest_snap)
1503 in->oldest_snap = in->first;
1504
1505 in->first = last+1;
1506
1507 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1508 add_inode(oldin);
1509
1510 if (in->last != CEPH_NOSNAP) {
1511 CInode *head_in = get_inode(in->ino());
1512 assert(head_in);
1513 if (head_in->split_need_snapflush(oldin, in)) {
1514 oldin->client_snap_caps = in->client_snap_caps;
1515 for (const auto &p : in->client_snap_caps) {
1516 SimpleLock *lock = oldin->get_lock(p.first);
1517 assert(lock);
1518 for (const auto &q : p.second) {
1519 oldin->auth_pin(lock);
1520 lock->set_state(LOCK_SNAP_SYNC); // gathering
1521 lock->get_wrlock(true);
1522 (void)q; /* unused */
1523 }
1524 }
1525 }
1526 return oldin;
1527 }
1528
1529 if (!in->client_caps.empty()) {
1530 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1531 // clone caps?
1532 for (auto &p : in->client_caps) {
1533 client_t client = p.first;
1534 Capability *cap = p.second;
1535 int issued = cap->issued();
1536 if ((issued & CEPH_CAP_ANY_WR) &&
1537 cap->client_follows < last) {
1538 // note in oldin
1539 for (int i = 0; i < num_cinode_locks; i++) {
1540 if (issued & cinode_lock_info[i].wr_caps) {
1541 int lockid = cinode_lock_info[i].lock;
1542 SimpleLock *lock = oldin->get_lock(lockid);
1543 assert(lock);
1544 oldin->client_snap_caps[lockid].insert(client);
1545 oldin->auth_pin(lock);
1546 lock->set_state(LOCK_SNAP_SYNC); // gathering
1547 lock->get_wrlock(true);
1548 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1549 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1550 }
1551 }
1552 cap->client_follows = last;
1553
1554 // we need snapflushes for any intervening snaps
1555 dout(10) << " snaps " << snaps << dendl;
1556 for (auto q = snaps.lower_bound(oldin->first);
1557 q != snaps.end() && *q <= last;
1558 ++q) {
1559 in->add_need_snapflush(oldin, *q, client);
1560 }
1561 } else {
1562 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1563 }
1564 }
1565 }
1566 return oldin;
1567 }
1568
1569 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1570 CDentry *dn, snapid_t follows,
1571 CInode **pcow_inode, CDentry::linkage_t *dnl)
1572 {
1573 if (!dn) {
1574 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1575 return;
1576 }
1577 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1578 assert(dn->is_auth());
1579
1580 // nothing to cow on a null dentry, fix caller
1581 if (!dnl)
1582 dnl = dn->get_projected_linkage();
1583 assert(!dnl->is_null());
1584
1585 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1586 // multiversion inode.
1587 CInode *in = dnl->get_inode();
1588 SnapRealm *realm = NULL;
1589
1590 if (in->get_projected_parent_dn() != dn) {
1591 assert(follows == CEPH_NOSNAP);
1592 realm = dn->dir->inode->find_snaprealm();
1593 snapid_t dir_follows = realm->get_newest_snap();
1594
1595 if (dir_follows+1 > dn->first) {
1596 snapid_t oldfirst = dn->first;
1597 dn->first = dir_follows+1;
1598 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1599 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
1600 oldfirst, dir_follows);
1601 olddn->pre_dirty();
1602 dout(10) << " olddn " << *olddn << dendl;
1603 metablob->add_remote_dentry(olddn, true);
1604 mut->add_cow_dentry(olddn);
1605 // FIXME: adjust link count here? hmm.
1606
1607 if (dir_follows+1 > in->first)
1608 in->cow_old_inode(dir_follows, false);
1609 }
1610 }
1611
1612 if (in->snaprealm) {
1613 realm = in->snaprealm;
1614 follows = realm->get_newest_seq();
1615 } else
1616 follows = dir_follows;
1617 } else {
1618 realm = in->find_snaprealm();
1619 if (follows == CEPH_NOSNAP)
1620 follows = realm->get_newest_seq();
1621 }
1622
1623 // already cloned?
1624 if (follows < in->first) {
1625 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1626 return;
1627 }
1628
1629 if (!realm->has_snaps_in_range(in->first, follows)) {
1630 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1631 in->first = follows + 1;
1632 return;
1633 }
1634
1635 in->cow_old_inode(follows, false);
1636
1637 } else {
1638 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1639 if (follows == CEPH_NOSNAP)
1640 follows = realm->get_newest_seq();
1641
1642 // already cloned?
1643 if (follows < dn->first) {
1644 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1645 return;
1646 }
1647
1648 // update dn.first before adding old dentry to cdir's map
1649 snapid_t oldfirst = dn->first;
1650 dn->first = follows+1;
1651
1652 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1653
1654 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1655 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1656 if (in)
1657 in->first = follows+1;
1658 return;
1659 }
1660
1661 dout(10) << " dn " << *dn << dendl;
1662 if (in) {
1663 CInode *oldin = cow_inode(in, follows);
1664 mut->add_cow_inode(oldin);
1665 if (pcow_inode)
1666 *pcow_inode = oldin;
1667 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
1668 oldin->inode.version = olddn->pre_dirty();
1669 dout(10) << " olddn " << *olddn << dendl;
1670 bool need_snapflush = !oldin->client_snap_caps.empty();
1671 if (need_snapflush)
1672 mut->ls->open_files.push_back(&oldin->item_open_file);
1673 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1674 mut->add_cow_dentry(olddn);
1675 } else {
1676 assert(dnl->is_remote());
1677 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
1678 oldfirst, follows);
1679 olddn->pre_dirty();
1680 dout(10) << " olddn " << *olddn << dendl;
1681 metablob->add_remote_dentry(olddn, true);
1682 mut->add_cow_dentry(olddn);
1683 }
1684 }
1685 }
1686
1687
1688 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1689 CInode *in, snapid_t follows,
1690 CInode **pcow_inode)
1691 {
1692 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1693 CDentry *dn = in->get_projected_parent_dn();
1694 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1695 }
1696
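// Journal a dirty inode: base inodes are recorded as roots; otherwise the
// parent dentry is COWed if a snapshot intervenes and the inode is added to
// the metablob as a primary dentry, flagging the backtrace (and pool, if the
// layout's pool changed) dirty when needed.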
1697 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1698 {
1699 if (in->is_base()) {
1700 metablob->add_root(true, in, in->get_projected_inode());
1701 } else {
1702 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1703 follows = in->first - 1;
1704 CDentry *dn = in->get_projected_parent_dn();
1705 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1706 journal_cow_dentry(mut, metablob, dn, follows);
1707 if (in->get_projected_inode()->is_backtrace_updated()) {
1708 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1709 in->get_previous_projected_inode()->layout.pool_id;
1710 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1711 } else {
1712 metablob->add_primary_dentry(dn, in, true);
1713 }
1714 }
1715 }
1716
1717
1718
1719 // nested ---------------------------------------------------------------
1720
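// Push an inode's rstat delta into its parent dirfrag's fnode, splitting the
// update across the [first,last] snapshot range; dirty_old_rstats are
// projected too when mds_snap_rstat is enabled.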
1721 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1722 int linkunlink, SnapRealm *prealm)
1723 {
1724 CDentry *parentdn = cur->get_projected_parent_dn();
1725 CInode::mempool_inode *curi = cur->get_projected_inode();
1726
1727 if (cur->first > first)
1728 first = cur->first;
1729
1730 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1731 << " " << *cur << dendl;
1732 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1733 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1734
1735 /*
1736 * FIXME. this incompletely propagates rstats to _old_ parents
1737 * (i.e. shortly after a directory rename). but we need full
1738 * blown hard link backpointers to make this work properly...
1739 */
1740 snapid_t floor = parentdn->first;
1741 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1742
1743 if (!prealm)
1744 prealm = parent->inode->find_snaprealm();
1745 const set<snapid_t> snaps = prealm->get_snaps();
1746
1747 if (cur->last != CEPH_NOSNAP) {
1748 assert(cur->dirty_old_rstats.empty());
1749 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1750 if (q == snaps.end() || *q > cur->last)
1751 return;
1752 }
1753
1754 if (cur->last >= floor) {
1755 bool update = true;
1756 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1757       // rename src inode is not projected in the slave rename prep case, so we should
1758       // avoid updating the inode.
1759 assert(linkunlink < 0);
1760 assert(cur->is_frozen_inode());
1761 update = false;
1762 }
1763 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1764 linkunlink, update);
1765 }
1766
1767 if (g_conf->mds_snap_rstat) {
1768 for (const auto &p : cur->dirty_old_rstats) {
1769 auto &old = cur->old_inodes[p];
1770 snapid_t ofirst = std::max(old.first, floor);
1771 auto it = snaps.lower_bound(ofirst);
1772 if (it == snaps.end() || *it > p)
1773 continue;
1774 if (p >= floor)
1775 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
1776 }
1777 }
1778 cur->dirty_old_rstats.clear();
1779 }
1780
1781
1782 void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
1783 CDir *parent, int linkunlink, bool update_inode)
1784 {
1785 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1786 dout(20) << " inode rstat " << inode.rstat << dendl;
1787 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1788 nest_info_t delta;
1789 if (linkunlink == 0) {
1790 delta.add(inode.rstat);
1791 delta.sub(inode.accounted_rstat);
1792 } else if (linkunlink < 0) {
1793 delta.sub(inode.accounted_rstat);
1794 } else {
1795 delta.add(inode.rstat);
1796 }
1797 dout(20) << " delta " << delta << dendl;
1798
1799 if (update_inode)
1800 inode.accounted_rstat = inode.rstat;
1801
1802 while (last >= ofirst) {
1803 /*
1804 * pick fnode version to update. at each iteration, we want to
1805 * pick a segment ending in 'last' to update. split as necessary
1806 * to make that work. then, adjust first up so that we only
1807 * update one segment at a time. then loop to cover the whole
1808 * [ofirst,last] interval.
1809 */
1810 nest_info_t *prstat;
1811 snapid_t first;
1812 fnode_t *pf = parent->get_projected_fnode();
1813 if (last == CEPH_NOSNAP) {
1814 if (g_conf->mds_snap_rstat)
1815 first = MAX(ofirst, parent->first);
1816 else
1817 first = parent->first;
1818 prstat = &pf->rstat;
1819 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1820
1821 if (first > parent->first &&
1822 !(pf->rstat == pf->accounted_rstat)) {
1823 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1824 << parent->first << "," << (first-1) << "] "
1825 << " " << *prstat << "/" << pf->accounted_rstat
1826 << dendl;
1827 parent->dirty_old_rstat[first-1].first = parent->first;
1828 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1829 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1830 }
1831 parent->first = first;
1832 } else if (!g_conf->mds_snap_rstat) {
1833 // drop snapshots' rstats
1834 break;
1835 } else if (last >= parent->first) {
1836 first = parent->first;
1837 parent->dirty_old_rstat[last].first = first;
1838 parent->dirty_old_rstat[last].rstat = pf->rstat;
1839 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1840 prstat = &parent->dirty_old_rstat[last].rstat;
1841 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1842 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1843 } else {
1844 // be careful, dirty_old_rstat is a _sparse_ map.
1845 // sorry, this is ugly.
1846 first = ofirst;
1847
1848 // find any intersection with last
1849 auto it = parent->dirty_old_rstat.lower_bound(last);
1850 if (it == parent->dirty_old_rstat.end()) {
1851 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1852 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1853 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1854 first = parent->dirty_old_rstat.rbegin()->first+1;
1855 }
1856 } else {
1857 // *it last is >= last
1858 if (it->second.first <= last) {
1859 // *it intersects [first,last]
1860 if (it->second.first < first) {
1861 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1862 parent->dirty_old_rstat[first-1] = it->second;
1863 it->second.first = first;
1864 }
1865 if (it->second.first > first)
1866 first = it->second.first;
1867 if (last < it->first) {
1868 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1869 parent->dirty_old_rstat[last] = it->second;
1870 it->second.first = last+1;
1871 }
1872 } else {
1873 // *it is to the _right_ of [first,last]
1874 it = parent->dirty_old_rstat.lower_bound(first);
1875 // new *it last is >= first
1876 if (it->second.first <= last && // new *it isn't also to the right, and
1877 it->first >= first) { // it intersects our first bit,
1878 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1879 first = it->first+1;
1880 }
1881 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1882 }
1883 }
1884 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1885 parent->dirty_old_rstat[last].first = first;
1886 prstat = &parent->dirty_old_rstat[last].rstat;
1887 }
1888
1889 // apply
1890 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1891 assert(last >= first);
1892 prstat->add(delta);
1893 if (update_inode)
1894 inode.accounted_rstat = inode.rstat;
1895 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1896
1897 last = first-1;
1898 }
1899 }
1900
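// Push a dirfrag's rstat delta (rstat minus accounted_rstat) up into the
// inode, cow'ing old_inodes so each snapshot segment in [ofirst,last]
// accumulates the delta separately.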
1901 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1902 snapid_t ofirst, snapid_t last,
1903 CInode *pin, bool cow_head)
1904 {
1905 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1906 dout(20) << " frag rstat " << rstat << dendl;
1907 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1908 nest_info_t delta = rstat;
1909 delta.sub(accounted_rstat);
1910 dout(20) << " delta " << delta << dendl;
1911
1912 while (last >= ofirst) {
1913 CInode::mempool_inode *pi;
1914 snapid_t first;
1915 if (last == pin->last) {
1916 pi = pin->get_projected_inode();
1917 first = MAX(ofirst, pin->first);
1918 if (first > pin->first) {
1919 auto &old = pin->cow_old_inode(first-1, cow_head);
1920 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1921 }
1922 } else {
1923 if (last >= pin->first) {
1924 first = pin->first;
1925 pin->cow_old_inode(last, cow_head);
1926 } else {
1927 // our life is easier here because old_inodes is not sparse
1928 // (although it may not begin at snapid 1)
1929 auto it = pin->old_inodes.lower_bound(last);
1930 if (it == pin->old_inodes.end()) {
1931 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1932 break;
1933 }
1934 first = it->second.first;
1935 if (first > last) {
1936 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
1937 //assert(it == pin->old_inodes.begin());
1938 break;
1939 }
1940 if (it->first > last) {
1941 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1942 << (last+1) << "," << it->first << "]" << dendl;
1943 pin->old_inodes[last] = it->second;
1944 it->second.first = last+1;
1945 pin->dirty_old_rstats.insert(it->first);
1946 }
1947 }
1948 if (first < ofirst) {
1949 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1950 << first << "," << ofirst-1 << "]" << dendl;
1951 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1952 pin->dirty_old_rstats.insert(ofirst-1);
1953 pin->old_inodes[last].first = first = ofirst;
1954 }
1955 pi = &pin->old_inodes[last].inode;
1956 pin->dirty_old_rstats.insert(last);
1957 }
1958 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1959 pi->rstat.add(delta);
1960 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1961
1962 last = first-1;
1963 }
1964 }
1965
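// Send updated rstat/quota info to clients holding caps on this inode, and
// ask replica MDSs to gather caps (presumably so the auth MDS sees all cap
// holders when enforcing the quota).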
1966 void MDCache::broadcast_quota_to_client(CInode *in)
1967 {
1968 if (!in->is_auth() || in->is_frozen())
1969 return;
1970
1971 auto i = in->get_projected_inode();
1972
1973 if (!i->quota.is_enable())
1974 return;
1975
1976 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1977 it != in->client_caps.end();
1978 ++it) {
1979 Session *session = mds->get_session(it->first);
1980 if (!session || !session->connection ||
1981 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
1982 continue;
1983
1984 Capability *cap = it->second;
1985 if (cap->last_rbytes == i->rstat.rbytes &&
1986 cap->last_rsize == i->rstat.rsize())
1987 continue;
1988
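    // heuristic: notify the client when usage reaches the limit, or when
    // usage has moved by more than 1/16 of the previous headroom since the
    // last update (for bytes, also when within 1/8 of the byte limit).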
1989 if (i->quota.max_files > 0) {
1990 if (i->rstat.rsize() >= i->quota.max_files)
1991 goto update;
1992
1993 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
1994 abs(cap->last_rsize - i->rstat.rsize()))
1995 goto update;
1996 }
1997
1998 if (i->quota.max_bytes > 0) {
1999 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2000 goto update;
2001
2002 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2003 abs(cap->last_rbytes - i->rstat.rbytes))
2004 goto update;
2005 }
2006
2007 continue;
2008
2009 update:
2010 cap->last_rsize = i->rstat.rsize();
2011 cap->last_rbytes = i->rstat.rbytes;
2012
2013 MClientQuota *msg = new MClientQuota();
2014 msg->ino = in->ino();
2015 msg->rstat = i->rstat;
2016 msg->quota = i->quota;
2017 mds->send_message_client_counted(msg, session->connection);
2018 }
2019 for (const auto &it : in->get_replicas()) {
2020 MGatherCaps *msg = new MGatherCaps;
2021 msg->ino = in->ino();
2022 mds->send_message_mds(msg, it.first);
2023 }
2024 }
2025
2026 /*
2027 * NOTE: we _have_ to delay the scatter if we are called during a
2028 * rejoin, because we can't twiddle locks between when the
2029 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2030 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2031 * (no requests), and a survivor acks immediately. _except_ that
2032 * during rejoin_(weak|strong) processing, we may complete a lock
2033 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2034 * scatterlock state in that case or the lock states will get out of
2035 * sync between the auth and replica.
2036 *
2037 * the simple solution is to never do the scatter here. instead, put
2038 * the scatterlock on a list if it isn't already wrlockable. this is
2039 * probably the best plan anyway, since we avoid too many
2040 * scatters/locks under normal usage.
2041 */
2042 /*
2043 * some notes on dirlock/nestlock scatterlock semantics:
2044 *
2045 * the fragstat (dirlock) will never be updated without
2046 * dirlock+nestlock wrlock held by the caller.
2047 *
2048 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2049 * data is pushed up the tree. this could be changed with some
2050 * restructuring here, but in its current form we ensure that the
2051 * fragstat+rstat _always_ reflect an accurate summation over the dir
2052 * frag, which is nice. and, we only need to track frags that need to
2053 * be nudged (and not inodes with pending rstat changes that need to
2054 * be pushed into the frag). a consequence of this is that the
2055 * accounted_rstat on scatterlock sync may not match our current
2056 * rstat. this is normal and expected.
2057 */
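// Walk from the given inode/dentry up toward the root, projecting dirty
// fragstat/rstat into each ancestor dirfrag and inode and journaling the
// dirtied parents. The walk stops early at non-auth ancestors, when the
// nestlock can't be wrlocked (the scatterlock is flagged for a later nudge
// instead), or when the last propagation was within mds_dirstat_min_interval.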
2058 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2059 CInode *in, CDir *parent,
2060 int flags, int linkunlink,
2061 snapid_t cfollows)
2062 {
2063 bool primary_dn = flags & PREDIRTY_PRIMARY;
2064 bool do_parent_mtime = flags & PREDIRTY_DIR;
2065 bool shallow = flags & PREDIRTY_SHALLOW;
2066
2067 assert(mds->mdlog->entry_is_open());
2068
2069 // make sure stamp is set
2070 if (mut->get_mds_stamp() == utime_t())
2071 mut->set_mds_stamp(ceph_clock_now());
2072
2073 if (in->is_base())
2074 return;
2075
2076 dout(10) << "predirty_journal_parents"
2077 << (do_parent_mtime ? " do_parent_mtime":"")
2078 << " linkunlink=" << linkunlink
2079 << (primary_dn ? " primary_dn":" remote_dn")
2080 << (shallow ? " SHALLOW":"")
2081 << " follows " << cfollows
2082 << " " << *in << dendl;
2083
2084 if (!parent) {
2085 assert(primary_dn);
2086 parent = in->get_projected_parent_dn()->get_dir();
2087 }
2088
2089 if (flags == 0 && linkunlink == 0) {
2090 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2091 blob->add_dir_context(parent);
2092 return;
2093 }
2094
2095 // build list of inodes to wrlock, dirty, and update
2096 list<CInode*> lsi;
2097 CInode *cur = in;
2098 CDentry *parentdn = NULL;
2099 bool first = true;
2100 while (parent) {
2101 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2102 assert(parent->is_auth());
2103
2104 // opportunistically adjust parent dirfrag
2105 CInode *pin = parent->get_inode();
2106
2107 // inode -> dirfrag
2108 mut->auth_pin(parent);
2109 mut->add_projected_fnode(parent);
2110
2111 fnode_t *pf = parent->project_fnode();
2112 pf->version = parent->pre_dirty();
2113
2114 if (do_parent_mtime || linkunlink) {
2115 assert(mut->wrlocks.count(&pin->filelock));
2116 assert(mut->wrlocks.count(&pin->nestlock));
2117 assert(cfollows == CEPH_NOSNAP);
2118
2119 // update stale fragstat/rstat?
2120 parent->resync_accounted_fragstat();
2121 parent->resync_accounted_rstat();
2122
2123 if (do_parent_mtime) {
2124 pf->fragstat.mtime = mut->get_op_stamp();
2125 pf->fragstat.change_attr++;
2126 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << *parent << dendl;
2127 if (pf->fragstat.mtime > pf->rstat.rctime) {
2128 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2129 pf->rstat.rctime = pf->fragstat.mtime;
2130 } else {
2131 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2132 }
2133 }
2134 if (linkunlink) {
2135 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2136 if (in->is_dir()) {
2137 pf->fragstat.nsubdirs += linkunlink;
2138 //pf->rstat.rsubdirs += linkunlink;
2139 } else {
2140 pf->fragstat.nfiles += linkunlink;
2141 //pf->rstat.rfiles += linkunlink;
2142 }
2143 }
2144 }
2145
2146 // rstat
2147 if (!primary_dn) {
2148 // don't update parent this pass
2149 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2150 pin->versionlock.can_wrlock())) {
2151 dout(20) << " unwritable parent nestlock " << pin->nestlock
2152 << ", marking dirty rstat on " << *cur << dendl;
2153 cur->mark_dirty_rstat();
2154 } else {
2155 // if we don't hold a wrlock reference on this nestlock, take one,
2156 // because we are about to write into the dirfrag fnode and that needs
2157 // to commit before the lock can cycle.
2158 if (linkunlink) {
2159 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2160 }
2161
2162 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2163 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2164 mds->locker->wrlock_force(&pin->nestlock, mut);
2165 }
2166
2167 // now we can project the inode rstat diff into the dirfrag
2168 SnapRealm *prealm = pin->find_snaprealm();
2169
2170 snapid_t follows = cfollows;
2171 if (follows == CEPH_NOSNAP)
2172 follows = prealm->get_newest_seq();
2173
2174 snapid_t first = follows+1;
2175
2176 // first, if the frag is stale, bring it back in sync.
2177 parent->resync_accounted_rstat();
2178
2179 // now push inode rstats into frag
2180 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2181 cur->clear_dirty_rstat();
2182 }
2183
2184 bool stop = false;
2185 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2186 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2187 stop = true;
2188 }
2189
2190 // delay propagating until later?
2191 if (!stop && !first &&
2192 g_conf->mds_dirstat_min_interval > 0) {
2193 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2194 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2195 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2196 << " < " << g_conf->mds_dirstat_min_interval
2197 << ", stopping" << dendl;
2198 stop = true;
2199 } else {
2200 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2201 }
2202 }
2203
2204 // can cast only because i'm passing nowait=true in the sole user
2205 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2206 if (!stop &&
2207 mut->wrlocks.count(&pin->nestlock) == 0 &&
2208 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2209 //true
2210 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2211 )) { // ** do not initiate.. see above comment **
2212 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2213 << " on " << *pin << dendl;
2214 stop = true;
2215 }
2216 if (stop) {
2217 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2218 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2219 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2220 mut->add_updated_lock(&pin->nestlock);
2221 if (do_parent_mtime || linkunlink) {
2222 mds->locker->mark_updated_scatterlock(&pin->filelock);
2223 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2224 mut->add_updated_lock(&pin->filelock);
2225 }
2226 break;
2227 }
2228 if (!mut->wrlocks.count(&pin->versionlock))
2229 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2230
2231 assert(mut->wrlocks.count(&pin->nestlock) ||
2232 mut->is_slave());
2233
2234 pin->last_dirstat_prop = mut->get_mds_stamp();
2235
2236 // dirfrag -> diri
2237 mut->auth_pin(pin);
2238 mut->add_projected_inode(pin);
2239 lsi.push_front(pin);
2240
2241 pin->pre_cow_old_inode(); // avoid cow mayhem!
2242
2243 auto &pi = pin->project_inode();
2244 pi.inode.version = pin->pre_dirty();
2245
2246 // dirstat
2247 if (do_parent_mtime || linkunlink) {
2248 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2249 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2250 bool touched_mtime = false, touched_chattr = false;
2251 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2252 pf->accounted_fragstat = pf->fragstat;
2253 if (touched_mtime)
2254 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
2255 if (touched_chattr)
2256 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2257 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
2258
2259 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2260 if (pi.inode.dirstat.size() < 0)
2261 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2262 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
2263 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2264 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
2265 << ", dirfrag has " << pf->fragstat;
2266
2267 // trust the dirfrag for now
2268 pi.inode.dirstat = pf->fragstat;
2269
2270 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2271 }
2272 }
2273 }
2274
2275 /*
2276 * the rule here is to follow the _oldest_ parent with dirty rstat
2277 * data. if we don't propagate all data, we add ourselves to the
2278 * nudge list. that way all rstat data will (eventually) get
2279 * pushed up the tree.
2280 *
2281 * actually, no. for now, silently drop rstats for old parents. we need
2282 * hard link backpointers to do the above properly.
2283 */
2284
2285 // stop?
2286 if (pin->is_base())
2287 break;
2288 parentdn = pin->get_projected_parent_dn();
2289 assert(parentdn);
2290
2291 // rstat
2292 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2293
2294 // first, if the frag is stale, bring it back in sync.
2295 parent->resync_accounted_rstat();
2296
2297 if (g_conf->mds_snap_rstat) {
2298 for (auto &p : parent->dirty_old_rstat) {
2299 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2300 p.first, pin, true);
2301 }
2302 }
2303 parent->dirty_old_rstat.clear();
2304 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2305
2306 pf->accounted_rstat = pf->rstat;
2307
2308 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2309 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
2310 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2311 << parent->dirfrag() << ", inode has " << pi.inode.rstat
2312 << ", dirfrag has " << pf->rstat;
2313
2314 // trust the dirfrag for now
2315 pi.inode.rstat = pf->rstat;
2316
2317 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2318 }
2319 }
2320
2321 parent->check_rstats();
2322 broadcast_quota_to_client(pin);
2323 // next parent!
2324 cur = pin;
2325 parent = parentdn->get_dir();
2326 linkunlink = 0;
2327 do_parent_mtime = false;
2328 primary_dn = true;
2329 first = false;
2330 }
2331
2332 // now, stick it in the blob
2333 assert(parent);
2334 assert(parent->is_auth());
2335 blob->add_dir_context(parent);
2336 blob->add_dir(parent, true);
2337 for (list<CInode*>::iterator p = lsi.begin();
2338 p != lsi.end();
2339 ++p) {
2340 CInode *cur = *p;
2341 journal_dirty_inode(mut.get(), blob, cur);
2342 }
2343
2344 }
2345
2346
2347
2348
2349
2350 // ===================================
2351 // slave requests
2352
2353
2354 /*
2355 * some handlers for master requests with slaves. we need to make
2356 * sure slaves journal commits before we forget we mastered them and
2357 * remove them from the uncommitted_masters map (used during recovery
2358 * to commit|abort slaves).
2359 */
2360 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2361 metareqid_t reqid;
2362 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2363 void finish(int r) override {
2364 mdcache->_logged_master_commit(reqid);
2365 }
2366 };
2367
2368 void MDCache::log_master_commit(metareqid_t reqid)
2369 {
2370 dout(10) << "log_master_commit " << reqid << dendl;
2371 uncommitted_masters[reqid].committing = true;
2372 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2373 new C_MDC_CommittedMaster(this, reqid));
2374 }
2375
2376 void MDCache::_logged_master_commit(metareqid_t reqid)
2377 {
2378 dout(10) << "_logged_master_commit " << reqid << dendl;
2379 assert(uncommitted_masters.count(reqid));
2380 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2381 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2382 uncommitted_masters.erase(reqid);
2383 }
2384
2385 // while active...
2386
2387 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2388 {
2389 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2390 assert(uncommitted_masters.count(r));
2391 uncommitted_masters[r].slaves.erase(from);
2392 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2393 log_master_commit(r);
2394 }
2395
2396 void MDCache::logged_master_update(metareqid_t reqid)
2397 {
2398 dout(10) << "logged_master_update " << reqid << dendl;
2399 assert(uncommitted_masters.count(reqid));
2400 uncommitted_masters[reqid].safe = true;
2401 if (pending_masters.count(reqid)) {
2402 pending_masters.erase(reqid);
2403 if (pending_masters.empty())
2404 process_delayed_resolve();
2405 }
2406 }
2407
2408 /*
2409 * Master may crash after receiving all slaves' commit acks, but before journalling
2410 * the final commit. Slaves may crash after journalling the slave commit, but before
2411 * sending the commit ack to the master. Commit masters with no uncommitted slaves when
2412 * resolve finishes.
2413 */
2414 void MDCache::finish_committed_masters()
2415 {
2416 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2417 p != uncommitted_masters.end();
2418 ++p) {
2419 p->second.recovering = false;
2420 if (!p->second.committing && p->second.slaves.empty()) {
2421 dout(10) << "finish_committed_masters " << p->first << dendl;
2422 log_master_commit(p->first);
2423 }
2424 }
2425 }
2426
2427 /*
2428 * at end of resolve... we must journal a commit|abort for all slave
2429 * updates, before moving on.
2430 *
2431 * this is so that the master can safely journal ECommitted on ops it
2432 * masters when it reaches up:active (all other recovering nodes must
2433 * complete resolve before that happens).
2434 */
2435 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2436 mds_rank_t from;
2437 metareqid_t reqid;
2438 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2439 void finish(int r) override {
2440 mdcache->_logged_slave_commit(from, reqid);
2441 }
2442 };
2443
2444 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2445 {
2446 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2447
2448 // send a message
2449 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2450 mds->send_message_mds(req, from);
2451 }
2452
2453
2454
2455
2456
2457
2458 // ====================================================================
2459 // import map, recovery
2460
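// helper for create_subtree_map(): move a bound dirfrag from oldparent's
// bound list to newparent's in the journaled subtree map.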
2461 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2462 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2463 {
2464 if (subtrees.count(oldparent)) {
2465 vector<dirfrag_t>& v = subtrees[oldparent];
2466 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2467 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2468 if (*it == df) {
2469 v.erase(it);
2470 break;
2471 }
2472 }
2473 if (subtrees.count(newparent)) {
2474 vector<dirfrag_t>& v = subtrees[newparent];
2475 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2476 v.push_back(df);
2477 }
2478 }
2479
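// Build an ESubtreeMap journal event describing all subtrees we are auth
// for (plus their bounds and any ambiguous imports), along with enough dir
// context to tie them to the root on replay.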
2480 ESubtreeMap *MDCache::create_subtree_map()
2481 {
2482 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2483 << num_subtrees_fullauth() << " fullauth"
2484 << dendl;
2485
2486 show_subtrees();
2487
2488 ESubtreeMap *le = new ESubtreeMap();
2489 mds->mdlog->_start_entry(le);
2490
2491 map<dirfrag_t, CDir*> dirs_to_add;
2492
2493 if (myin) {
2494 CDir* mydir = myin->get_dirfrag(frag_t());
2495 dirs_to_add[mydir->dirfrag()] = mydir;
2496 }
2497
2498 // include all auth subtrees, and their bounds.
2499 // and a spanning tree to tie it to the root.
2500 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2501 p != subtrees.end();
2502 ++p) {
2503 CDir *dir = p->first;
2504
2505 // journal subtree as "ours" if we are
2506 // me, -2
2507 // me, me
2508 // me, !me (may be importing and ambiguous!)
2509
2510 // so not
2511 // !me, *
2512 if (dir->get_dir_auth().first != mds->get_nodeid())
2513 continue;
2514
2515 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2516 my_ambiguous_imports.count(dir->dirfrag())) {
2517 dout(15) << " ambig subtree " << *dir << dendl;
2518 le->ambiguous_subtrees.insert(dir->dirfrag());
2519 } else {
2520 dout(15) << " subtree " << *dir << dendl;
2521 }
2522
2523 dirs_to_add[dir->dirfrag()] = dir;
2524 le->subtrees[dir->dirfrag()].clear();
2525
2526
2527 // bounds
2528 for (set<CDir*>::iterator q = p->second.begin();
2529 q != p->second.end();
2530 ++q) {
2531 CDir *bound = *q;
2532 dout(15) << " subtree bound " << *bound << dendl;
2533 dirs_to_add[bound->dirfrag()] = bound;
2534 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2535 }
2536 }
2537
2538 // apply projected renames
2539 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2540 p != projected_subtree_renames.end();
2541 ++p) {
2542 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2543 CInode *diri = p->first;
2544 CDir *olddir = q->first;
2545 CDir *newdir = q->second;
2546 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2547
2548 list<CDir*> dfls;
2549 diri->get_dirfrags(dfls);
2550 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2551 CDir *dir = *p;
2552 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2553 CDir *oldparent = get_projected_subtree_root(olddir);
2554 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2555 CDir *newparent = get_projected_subtree_root(newdir);
2556 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2557
2558 if (oldparent == newparent) {
2559 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2560 << oldparent->dirfrag() << dendl;
2561 continue;
2562 }
2563
2564 if (dir->is_subtree_root()) {
2565 if (le->subtrees.count(newparent->dirfrag()) &&
2566 oldparent->get_dir_auth() != newparent->get_dir_auth())
2567 dirs_to_add[dir->dirfrag()] = dir;
2568 // children are fine. change parent.
2569 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2570 le->subtrees);
2571 } else {
2572 // mid-subtree.
2573
2574 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2575 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2576 // if oldparent is auth, subtree is mine; include it.
2577 if (le->subtrees.count(oldparent->dirfrag())) {
2578 dirs_to_add[dir->dirfrag()] = dir;
2579 le->subtrees[dir->dirfrag()].clear();
2580 }
2581 // if newparent is auth, subtree is a new bound
2582 if (le->subtrees.count(newparent->dirfrag())) {
2583 dirs_to_add[dir->dirfrag()] = dir;
2584 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2585 }
2586 newparent = dir;
2587 }
2588
2589 // see if any old bounds move to the new parent.
2590 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2591 p != subtrees[oldparent].end();
2592 ++p) {
2593 CDir *bound = *p;
2594 if (dir->contains(bound->get_parent_dir()))
2595 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2596 le->subtrees);
2597 }
2598 }
2599 }
2600 }
2601 }
2602
2603 // simplify the journaled map. our in memory map may have more
2604 // subtrees than needed due to migrations that are just getting
2605 // started or just completing. but on replay, the "live" map will
2606 // be simple and we can do a straight comparison.
2607 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2608 if (le->ambiguous_subtrees.count(p->first))
2609 continue;
2610 unsigned i = 0;
2611 while (i < p->second.size()) {
2612 dirfrag_t b = p->second[i];
2613 if (le->subtrees.count(b) &&
2614 le->ambiguous_subtrees.count(b) == 0) {
2615 vector<dirfrag_t>& bb = le->subtrees[b];
2616 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2617 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2618 p->second.push_back(*r);
2619 dirs_to_add.erase(b);
2620 le->subtrees.erase(b);
2621 p->second.erase(p->second.begin() + i);
2622 } else {
2623 ++i;
2624 }
2625 }
2626 }
2627
2628 for (auto &p : dirs_to_add) {
2629 CDir *dir = p.second;
2630 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2631 le->metablob.add_dir(dir, false);
2632 }
2633
2634 dout(15) << " subtrees " << le->subtrees << dendl;
2635 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2636
2637 //le->metablob.print(cout);
2638 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2639 return le;
2640 }
2641
2642 void MDCache::dump_resolve_status(Formatter *f) const
2643 {
2644 f->open_object_section("resolve_status");
2645 f->dump_stream("resolve_gather") << resolve_gather;
2646 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2647 f->close_section();
2648 }
2649
2650 void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2651 {
2652 dout(10) << "resolve_start" << dendl;
2653 assert(!resolve_done);
2654 resolve_done.reset(resolve_done_);
2655
2656 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2657 // if we don't have the root dir, adjust it to UNKNOWN. during
2658 // resolve we want mds0 to explicitly claim the portion of it that
2659 // it owns, so that anything beyond its bounds gets left as
2660 // unknown.
2661 CDir *rootdir = root->get_dirfrag(frag_t());
2662 if (rootdir)
2663 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2664 }
2665 resolve_gather = recovery_set;
2666 }
2667
2668 void MDCache::send_resolves()
2669 {
2670 send_slave_resolves();
2671 if (!resolve_ack_gather.empty()) {
2672 dout(10) << "send_resolves still waiting for resolve ack from ("
2673 << resolve_ack_gather << ")" << dendl;
2674 return;
2675 }
2676 if (!need_resolve_rollback.empty()) {
2677 dout(10) << "send_resolves still waiting for rollback to commit on ("
2678 << need_resolve_rollback << ")" << dendl;
2679 return;
2680 }
2681 send_subtree_resolves();
2682 }
2683
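// Tell each master about slave updates we have prepared or committed but not
// yet heard a commit/abort decision for, so the master can resolve them.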
2684 void MDCache::send_slave_resolves()
2685 {
2686 dout(10) << "send_slave_resolves" << dendl;
2687
2688 map<mds_rank_t, MMDSResolve*> resolves;
2689
2690 if (mds->is_resolve()) {
2691 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2692 p != uncommitted_slave_updates.end();
2693 ++p) {
2694 resolves[p->first] = new MMDSResolve;
2695 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2696 q != p->second.end();
2697 ++q) {
2698 dout(10) << " including uncommitted " << q->first << dendl;
2699 resolves[p->first]->add_slave_request(q->first, false);
2700 }
2701 }
2702 } else {
2703 set<mds_rank_t> resolve_set;
2704 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2705 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2706 p != active_requests.end();
2707 ++p) {
2708 MDRequestRef& mdr = p->second;
2709 if (!mdr->is_slave())
2710 continue;
2711 if (!mdr->slave_did_prepare() && !mdr->committing) {
2712 continue;
2713 }
2714 mds_rank_t master = mdr->slave_to_mds;
2715 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2716 dout(10) << " including uncommitted " << *mdr << dendl;
2717 if (!resolves.count(master))
2718 resolves[master] = new MMDSResolve;
2719 if (!mdr->committing &&
2720 mdr->has_more() && mdr->more()->is_inode_exporter) {
2721 // re-send cap exports
2722 CInode *in = mdr->more()->rename_inode;
2723 map<client_t, Capability::Export> cap_map;
2724 in->export_client_caps(cap_map);
2725 bufferlist bl;
2726 ::encode(in->ino(), bl);
2727 ::encode(cap_map, bl);
2728 resolves[master]->add_slave_request(p->first, bl);
2729 } else {
2730 resolves[master]->add_slave_request(p->first, mdr->committing);
2731 }
2732 }
2733 }
2734 }
2735
2736 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2737 p != resolves.end();
2738 ++p) {
2739 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2740 mds->send_message_mds(p->second, p->first);
2741 resolve_ack_gather.insert(p->first);
2742 }
2743 }
2744
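// Advertise the subtrees we claim (and our ambiguous imports) to every
// recovering peer. Deferred while imports/exports are still in flight.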
2745 void MDCache::send_subtree_resolves()
2746 {
2747 dout(10) << "send_subtree_resolves" << dendl;
2748
2749 if (migrator->is_exporting() || migrator->is_importing()) {
2750 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2751 migrator->show_importing();
2752 migrator->show_exporting();
2753 resolves_pending = true;
2754 return; // not now
2755 }
2756
2757 map<mds_rank_t, MMDSResolve*> resolves;
2758 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2759 p != recovery_set.end();
2760 ++p) {
2761 if (*p == mds->get_nodeid())
2762 continue;
2763 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2764 resolves[*p] = new MMDSResolve;
2765 }
2766
2767 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2768 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2769
2770 // known
2771 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2772 p != subtrees.end();
2773 ++p) {
2774 CDir *dir = p->first;
2775
2776 // only our subtrees
2777 if (dir->authority().first != mds->get_nodeid())
2778 continue;
2779
2780 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2781 continue; // we'll add it below
2782
2783 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2784 // ambiguous (mid-import)
2785 set<CDir*> bounds;
2786 get_subtree_bounds(dir, bounds);
2787 vector<dirfrag_t> dfls;
2788 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2789 dfls.push_back((*q)->dirfrag());
2790
2791 my_ambig_imports[dir->dirfrag()] = dfls;
2792 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2793 } else {
2794 // not ambiguous.
2795 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2796 q != resolves.end();
2797 ++q)
2798 resolves[q->first]->add_subtree(dir->dirfrag());
2799 // bounds too
2800 vector<dirfrag_t> dfls;
2801 for (set<CDir*>::iterator q = subtrees[dir].begin();
2802 q != subtrees[dir].end();
2803 ++q) {
2804 CDir *bound = *q;
2805 dfls.push_back(bound->dirfrag());
2806 }
2807
2808 my_subtrees[dir->dirfrag()] = dfls;
2809 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2810 }
2811 }
2812
2813 // ambiguous
2814 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2815 p != my_ambiguous_imports.end();
2816 ++p) {
2817 my_ambig_imports[p->first] = p->second;
2818 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2819 }
2820
2821 // simplify the claimed subtree.
2822 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2823 unsigned i = 0;
2824 while (i < p->second.size()) {
2825 dirfrag_t b = p->second[i];
2826 if (my_subtrees.count(b)) {
2827 vector<dirfrag_t>& bb = my_subtrees[b];
2828 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2829 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2830 p->second.push_back(*r);
2831 my_subtrees.erase(b);
2832 p->second.erase(p->second.begin() + i);
2833 } else {
2834 ++i;
2835 }
2836 }
2837 }
2838
2839 // send
2840 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2841 p != resolves.end();
2842 ++p) {
2843 MMDSResolve* m = p->second;
2844 m->subtrees = my_subtrees;
2845 m->ambiguous_imports = my_ambig_imports;
2846 dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2847 mds->send_message_mds(m, p->first);
2848 }
2849 resolves_pending = false;
2850 }
2851
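// A peer went down: flag it for resolve/rejoin, clean up any master/slave
// requests involving it, cancel fragment operations that haven't started
// fragmenting yet, and wake up anything waiting on it.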
2852 void MDCache::handle_mds_failure(mds_rank_t who)
2853 {
2854 dout(7) << "handle_mds_failure mds." << who << dendl;
2855
2856 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2857
2858 resolve_gather.insert(who);
2859 discard_delayed_resolve(who);
2860 ambiguous_slave_updates.erase(who);
2861
2862 rejoin_gather.insert(who);
2863 rejoin_sent.erase(who); // i need to send another
2864 rejoin_ack_sent.erase(who); // i need to send another
2865 rejoin_ack_gather.erase(who); // i'll need/get another.
2866
2867 dout(10) << " resolve_gather " << resolve_gather << dendl;
2868 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2869 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2870 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2871 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2872
2873
2874 // tell the migrator too.
2875 migrator->handle_mds_failure_or_stop(who);
2876
2877 // tell the balancer too.
2878 mds->balancer->handle_mds_failure(who);
2879
2880 // clean up any requests slave to/from this node
2881 list<MDRequestRef> finish;
2882 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2883 p != active_requests.end();
2884 ++p) {
2885 MDRequestRef& mdr = p->second;
2886 // slave to the failed node?
2887 if (mdr->slave_to_mds == who) {
2888 if (mdr->slave_did_prepare()) {
2889 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2890 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2891 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2892
2893 if (!mdr->more()->waiting_on_slave.empty()) {
2894 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2895 // will rollback, no need to wait
2896 if (mdr->slave_request) {
2897 mdr->slave_request->put();
2898 mdr->slave_request = 0;
2899 }
2900 mdr->more()->waiting_on_slave.clear();
2901 }
2902 } else if (!mdr->committing) {
2903 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2904 if (mdr->slave_request || mdr->slave_rolling_back())
2905 mdr->aborted = true;
2906 else
2907 finish.push_back(mdr);
2908 }
2909 }
2910
2911 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2912 if (mdr->more()->waiting_on_slave.count(who)) {
2913 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2914 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2915 << who << dendl;
2916 mdr->more()->waiting_on_slave.erase(who);
2917 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2918 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2919 }
2920
2921 if (mdr->more()->srcdn_auth_mds == who &&
2922 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2923 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2924 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2925 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2926 }
2927 } else if (mdr->slave_request) {
2928 MMDSSlaveRequest *slave_req = mdr->slave_request;
2929 // FIXME: Slave rename request can arrive after we notice mds failure.
2930 // This can cause mds to crash (does not affect integrity of FS).
2931 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2932 slave_req->srcdn_auth == who)
2933 slave_req->mark_interrupted();
2934 }
2935
2936 // failed node is slave?
2937 if (mdr->is_master() && !mdr->committing) {
2938 if (mdr->more()->srcdn_auth_mds == who) {
2939 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2940 << who << " to recover" << dendl;
2941 assert(mdr->more()->witnessed.count(who) == 0);
2942 if (mdr->more()->is_ambiguous_auth)
2943 mdr->clear_ambiguous_auth();
2944 // rename srcdn's auth mds failed, all witnesses will rollback
2945 mdr->more()->witnessed.clear();
2946 pending_masters.erase(p->first);
2947 }
2948
2949 if (mdr->more()->witnessed.count(who)) {
2950 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2951 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2952 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2953 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2954 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2955 // until either the request is committing or the slave also fails.
2956 assert(mdr->more()->waiting_on_slave.size() == 1);
2957 pending_masters.insert(p->first);
2958 } else {
2959 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2960 << who << dendl;
2961 if (srcdn_auth >= 0)
2962 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2963
2964 // discard this peer's prepare (if any)
2965 mdr->more()->witnessed.erase(who);
2966 }
2967 }
2968
2969 if (mdr->more()->waiting_on_slave.count(who)) {
2970 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2971 << " to recover" << dendl;
2972 // retry request when peer recovers
2973 mdr->more()->waiting_on_slave.erase(who);
2974 if (mdr->more()->waiting_on_slave.empty())
2975 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2976 }
2977
2978 if (mdr->locking && mdr->locking_target_mds == who)
2979 mdr->finish_locking(mdr->locking);
2980 }
2981 }
2982
2983 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2984 p != uncommitted_masters.end();
2985 ++p) {
2986 // The failed MDS may have already committed the slave update
2987 if (p->second.slaves.count(who)) {
2988 p->second.recovering = true;
2989 p->second.slaves.erase(who);
2990 }
2991 }
2992
2993 while (!finish.empty()) {
2994 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
2995 request_finish(finish.front());
2996 finish.pop_front();
2997 }
2998
2999 kick_find_ino_peers(who);
3000 kick_open_ino_peers(who);
3001
3002 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3003 p != fragments.end(); ) {
3004 dirfrag_t df = p->first;
3005 fragment_info_t& info = p->second;
3006 ++p;
3007 if (info.is_fragmenting())
3008 continue;
3009 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3010 list<CDir*> dirs;
3011 info.dirs.swap(dirs);
3012 fragments.erase(df);
3013 fragment_unmark_unfreeze_dirs(dirs);
3014 }
3015
3016 // MDCache::shutdown_export_strays() always exports strays to mds.0
3017 if (who == mds_rank_t(0))
3018 shutdown_exported_strays.clear();
3019
3020 show_subtrees();
3021 }
3022
3023 /*
3024 * handle_mds_recovery - called on another node's transition
3025 * from resolve -> active.
3026 */
3027 void MDCache::handle_mds_recovery(mds_rank_t who)
3028 {
3029 dout(7) << "handle_mds_recovery mds." << who << dendl;
3030
3031 // exclude all discover waiters. kick_discovers() will do the job
3032 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3033 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3034
3035 list<MDSInternalContextBase*> waiters;
3036
3037 // wake up any waiters in their subtrees
3038 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3039 p != subtrees.end();
3040 ++p) {
3041 CDir *dir = p->first;
3042
3043 if (dir->authority().first != who ||
3044 dir->authority().second == mds->get_nodeid())
3045 continue;
3046 assert(!dir->is_auth());
3047
3048 // wake any waiters
3049 list<CDir*> q;
3050 q.push_back(dir);
3051
3052 while (!q.empty()) {
3053 CDir *d = q.front();
3054 q.pop_front();
3055 d->take_waiting(d_mask, waiters);
3056
3057 // inode waiters too
3058 for (auto &p : d->items) {
3059 CDentry *dn = p.second;
3060 CDentry::linkage_t *dnl = dn->get_linkage();
3061 if (dnl->is_primary()) {
3062 dnl->get_inode()->take_waiting(i_mask, waiters);
3063
3064 // recurse?
3065 list<CDir*> ls;
3066 dnl->get_inode()->get_dirfrags(ls);
3067 for (list<CDir*>::iterator p = ls.begin();
3068 p != ls.end();
3069 ++p) {
3070 CDir *subdir = *p;
3071 if (!subdir->is_subtree_root())
3072 q.push_back(subdir);
3073 }
3074 }
3075 }
3076 }
3077 }
3078
3079 kick_open_ino_peers(who);
3080 kick_find_ino_peers(who);
3081
3082 // queue them up.
3083 mds->queue_waiters(waiters);
3084 }
3085
3086 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3087 {
3088 dout(7) << "set_recovery_set " << s << dendl;
3089 recovery_set = s;
3090 }
3091
3092
3093 /*
3094 * during resolve state, we share resolves to determine who
3095 * is authoritative for which trees. we expect to get a resolve
3096 * from _everyone_ in the recovery_set (the mds cluster at the time of
3097 * the first failure).
3098 *
3099 * This function puts the passed message before returning
3100 */
3101 void MDCache::handle_resolve(MMDSResolve *m)
3102 {
3103 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3104 mds_rank_t from = mds_rank_t(m->get_source().num());
3105
3106 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3107 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3108 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3109 return;
3110 }
3111 // wait until we reach the resolve stage!
3112 m->put();
3113 return;
3114 }
3115
3116 discard_delayed_resolve(from);
3117
3118 // ambiguous slave requests?
3119 if (!m->slave_requests.empty()) {
3120 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3121 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3122 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3123 assert(!p->second.committing);
3124 pending_masters.insert(p->first);
3125 }
3126 }
3127
3128 if (!pending_masters.empty()) {
3129 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3130 delayed_resolve[from] = m;
3131 return;
3132 }
3133 }
3134
3135 MMDSResolveAck *ack = new MMDSResolveAck;
3136 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3137 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3138 // COMMIT
3139 if (p->second.committing) {
3140 // already committing, waiting for the OP_COMMITTED slave reply
3141 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3142 } else {
3143 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3144 ack->add_commit(p->first);
3145 }
3146 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3147
3148 if (p->second.inode_caps.length() > 0) {
3149 // slave wants to export caps (rename)
3150 assert(mds->is_resolve());
3151
3152 inodeno_t ino;
3153 map<client_t,Capability::Export> cap_exports;
3154 bufferlist::iterator q = p->second.inode_caps.begin();
3155 ::decode(ino, q);
3156 ::decode(cap_exports, q);
3157
3158 assert(get_inode(ino));
3159
3160 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3161 q != cap_exports.end();
3162 ++q) {
3163 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3164 im.cap_id = ++last_cap_id; // assign a new cap ID
3165 im.issue_seq = 1;
3166 im.mseq = q->second.mseq;
3167 }
3168
3169 // will process these caps in rejoin stage
3170 rejoin_slave_exports[ino].first = from;
3171 rejoin_slave_exports[ino].second.swap(cap_exports);
3172
3173 // send information of imported caps back to slave
3174 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3175 }
3176 } else {
3177 // ABORT
3178 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3179 assert(!p->second.committing);
3180 ack->add_abort(p->first);
3181 }
3182 }
3183 mds->send_message(ack, m->get_connection());
3184 m->put();
3185 return;
3186 }
3187
3188 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3189 dout(10) << "delay processing subtree resolve" << dendl;
3190 delayed_resolve[from] = m;
3191 return;
3192 }
3193
3194 bool survivor = false;
3195 // am i a surviving ambiguous importer?
3196 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3197 survivor = true;
3198 // check for any import success/failure (from this node)
3199 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3200 while (p != my_ambiguous_imports.end()) {
3201 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3202 ++next;
3203 CDir *dir = get_dirfrag(p->first);
3204 assert(dir);
3205 dout(10) << "checking ambiguous import " << *dir << dendl;
3206 if (migrator->is_importing(dir->dirfrag()) &&
3207 migrator->get_import_peer(dir->dirfrag()) == from) {
3208 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3209
3210 // check if sender claims the subtree
3211 bool claimed_by_sender = false;
3212 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3213 q != m->subtrees.end();
3214 ++q) {
3215 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3216 CDir *base = get_force_dirfrag(q->first, false);
3217 if (!base || !base->contains(dir))
3218 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3219
3220 bool inside = true;
3221 set<CDir*> bounds;
3222 get_force_dirfrag_bound_set(q->second, bounds);
3223 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3224 CDir *bound = *p;
3225 if (bound->contains(dir)) {
3226 inside = false; // nope, bound is dir or parent of dir, not inside.
3227 break;
3228 }
3229 }
3230 if (inside)
3231 claimed_by_sender = true;
3232 }
3233
3234 my_ambiguous_imports.erase(p); // no longer ambiguous.
3235 if (claimed_by_sender) {
3236 dout(7) << "ambiguous import failed on " << *dir << dendl;
3237 migrator->import_reverse(dir);
3238 } else {
3239 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3240 migrator->import_finish(dir, true);
3241 }
3242 }
3243 p = next;
3244 }
3245 }
3246
3247 // update my dir_auth values
3248 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3249 // migrations between other nodes)
3250 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3251 pi != m->subtrees.end();
3252 ++pi) {
3253 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3254 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3255 if (!dir)
3256 continue;
3257 adjust_bounded_subtree_auth(dir, pi->second, from);
3258 try_subtree_merge(dir);
3259 }
3260
3261 show_subtrees();
3262
3263 // note ambiguous imports too
3264 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3265 pi != m->ambiguous_imports.end();
3266 ++pi) {
3267 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3268 other_ambiguous_imports[from][pi->first].swap( pi->second );
3269 }
3270
3271 // did i get them all?
3272 resolve_gather.erase(from);
3273
3274 maybe_resolve_finish();
3275
3276 m->put();
3277 }
3278
3279 void MDCache::process_delayed_resolve()
3280 {
3281 dout(10) << "process_delayed_resolve" << dendl;
3282 map<mds_rank_t, MMDSResolve*> tmp;
3283 tmp.swap(delayed_resolve);
3284 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3285 handle_resolve(p->second);
3286 }
3287
3288 void MDCache::discard_delayed_resolve(mds_rank_t who)
3289 {
3290 if (delayed_resolve.count(who)) {
3291 delayed_resolve[who]->put();
3292 delayed_resolve.erase(who);
3293 }
3294 }
3295
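// Called whenever the resolve gather set may have drained: once every peer's
// resolve has arrived, disambiguate imports, commit masters with no
// uncommitted slaves, and either complete resolve or move on to rejoin.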
3296 void MDCache::maybe_resolve_finish()
3297 {
3298 assert(resolve_ack_gather.empty());
3299 assert(need_resolve_rollback.empty());
3300
3301 if (!resolve_gather.empty()) {
3302 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3303 << resolve_gather << ")" << dendl;
3304 return;
3305 }
3306
3307 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3308 disambiguate_my_imports();
3309 finish_committed_masters();
3310
3311 if (resolve_done) {
3312 assert(mds->is_resolve());
3313 trim_unlinked_inodes();
3314 recalc_auth_bits(false);
3315 resolve_done.release()->complete(0);
3316 } else {
3317 maybe_send_pending_rejoins();
3318 }
3319 }
3320
3321 /* This function puts the passed message before returning */
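// The ack tells us, per slave update, whether the master decided COMMIT or
// ABORT: commits are journaled as ESlaveUpdate::OP_COMMIT (or the active
// request is finished), aborts are rolled back via the per-op rollback
// handlers.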
3322 void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3323 {
3324 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3325 mds_rank_t from = mds_rank_t(ack->get_source().num());
3326
3327 if (!resolve_ack_gather.count(from) ||
3328 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3329 ack->put();
3330 return;
3331 }
3332
3333 if (ambiguous_slave_updates.count(from)) {
3334 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3335 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3336 }
3337
3338 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3339 p != ack->commit.end();
3340 ++p) {
3341 dout(10) << " commit on slave " << p->first << dendl;
3342
3343 if (ambiguous_slave_updates.count(from)) {
3344 remove_ambiguous_slave_update(p->first, from);
3345 continue;
3346 }
3347
3348 if (mds->is_resolve()) {
3349 // replay
3350 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3351 assert(su);
3352
3353 // log commit
3354 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3355 ESlaveUpdate::OP_COMMIT, su->origop),
3356 new C_MDC_SlaveCommit(this, from, p->first));
3357 mds->mdlog->flush();
3358
3359 finish_uncommitted_slave_update(p->first, from);
3360 } else {
3361 MDRequestRef mdr = request_get(p->first);
3362 // information about master imported caps
3363 if (p->second.length() > 0)
3364 mdr->more()->inode_import.claim(p->second);
3365
3366 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3367 request_finish(mdr);
3368 }
3369 }
3370
3371 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3372 p != ack->abort.end();
3373 ++p) {
3374 dout(10) << " abort on slave " << *p << dendl;
3375
3376 if (mds->is_resolve()) {
3377 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3378 assert(su);
3379
3380 // perform rollback (and journal a rollback entry)
3381 // note: this will hold up the resolve a bit, until the rollback entries journal.
3382 MDRequestRef null_ref;
3383 switch (su->origop) {
3384 case ESlaveUpdate::LINK:
3385 mds->server->do_link_rollback(su->rollback, from, null_ref);
3386 break;
3387 case ESlaveUpdate::RENAME:
3388 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3389 break;
3390 case ESlaveUpdate::RMDIR:
3391 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3392 break;
3393 default:
3394 ceph_abort();
3395 }
3396 } else {
3397 MDRequestRef mdr = request_get(*p);
3398 mdr->aborted = true;
3399 if (mdr->slave_request) {
3400 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3401 add_rollback(*p, from);
3402 } else {
3403 request_finish(mdr);
3404 }
3405 }
3406 }
3407
3408 if (!ambiguous_slave_updates.count(from))
3409 resolve_ack_gather.erase(from);
3410 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3411 send_subtree_resolves();
3412 process_delayed_resolve();
3413 }
3414
3415 ack->put();
3416 }
3417
3418 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3419 {
3420 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3421 uncommitted_slave_updates[master][reqid] = su;
3422 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3423 uncommitted_slave_rename_olddir[*p]++;
3424 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3425 uncommitted_slave_unlink[*p]++;
3426 }
3427
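// Forget a slave update once its fate is decided; when the last update
// referencing a renamed-out-of dir or an unlinked inode goes away, trim the
// now non-auth subtree or drop the inode.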
3428 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3429 {
3430 assert(uncommitted_slave_updates[master].count(reqid));
3431 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3432
3433 uncommitted_slave_updates[master].erase(reqid);
3434 if (uncommitted_slave_updates[master].empty())
3435 uncommitted_slave_updates.erase(master);
3436 // discard the non-auth subtree we renamed out of
3437 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3438 CInode *diri = *p;
3439 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3440 assert(it != uncommitted_slave_rename_olddir.end());
3441 it->second--;
3442 if (it->second == 0) {
3443 uncommitted_slave_rename_olddir.erase(it);
3444 list<CDir*> ls;
3445 diri->get_dirfrags(ls);
3446 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3447 CDir *root = get_subtree_root(*q);
3448 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3449 try_trim_non_auth_subtree(root);
3450 if (*q != root)
3451 break;
3452 }
3453 }
3454 } else
3455 assert(it->second > 0);
3456 }
3457 // remove the inodes that were unlinked by the slave update
3458 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3459 CInode *in = *p;
3460 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3461 assert(it != uncommitted_slave_unlink.end());
3462 it->second--;
3463 if (it->second == 0) {
3464 uncommitted_slave_unlink.erase(it);
3465 if (!in->get_projected_parent_dn())
3466 mds->mdcache->remove_inode_recursive(in);
3467 } else
3468 assert(it->second > 0);
3469 }
3470 delete su;
3471 }
3472
3473 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3474 {
3475
3476 MDSlaveUpdate* su = NULL;
3477 if (uncommitted_slave_updates.count(master) &&
3478 uncommitted_slave_updates[master].count(reqid)) {
3479 su = uncommitted_slave_updates[master][reqid];
3480 assert(su);
3481 }
3482 return su;
3483 }
3484
3485 void MDCache::finish_rollback(metareqid_t reqid) {
3486 assert(need_resolve_rollback.count(reqid));
3487 if (mds->is_resolve())
3488 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3489 need_resolve_rollback.erase(reqid);
3490 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3491 send_subtree_resolves();
3492 process_delayed_resolve();
3493 }
3494 }
3495
3496 void MDCache::disambiguate_other_imports()
3497 {
3498 dout(10) << "disambiguate_other_imports" << dendl;
3499
3500 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3501 // other nodes' ambiguous imports
3502 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3503 p != other_ambiguous_imports.end();
3504 ++p) {
3505 mds_rank_t who = p->first;
3506 dout(10) << "ambiguous imports for mds." << who << dendl;
3507
3508 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3509 q != p->second.end();
3510 ++q) {
3511 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3512 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3513 CDir *dir = get_force_dirfrag(q->first, recovering);
3514 if (!dir) continue;
3515
3516 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3517 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3518 dout(10) << " mds." << who << " did import " << *dir << dendl;
3519 adjust_bounded_subtree_auth(dir, q->second, who);
3520 try_subtree_merge(dir);
3521 } else {
3522 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3523 }
3524 }
3525 }
3526 other_ambiguous_imports.clear();
3527 }
3528
3529 void MDCache::disambiguate_my_imports()
3530 {
3531 dout(10) << "disambiguate_my_imports" << dendl;
3532
3533 if (!mds->is_resolve()) {
3534 assert(my_ambiguous_imports.empty());
3535 return;
3536 }
3537
3538 disambiguate_other_imports();
3539
3540 // my ambiguous imports
3541 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3542 while (!my_ambiguous_imports.empty()) {
3543 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3544
3545 CDir *dir = get_dirfrag(q->first);
3546 assert(dir);
3547
3548 if (dir->authority() != me_ambig) {
3549 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3550 cancel_ambiguous_import(dir);
3551
3552 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3553
3554 // subtree may have been swallowed by another node claiming dir
3555 // as their own.
3556 CDir *root = get_subtree_root(dir);
3557 if (root != dir)
3558 dout(10) << " subtree root is " << *root << dendl;
3559 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3560 try_trim_non_auth_subtree(root);
3561 } else {
3562 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3563 finish_ambiguous_import(q->first);
3564 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3565 }
3566 }
3567 assert(my_ambiguous_imports.empty());
3568 mds->mdlog->flush();
3569
3570 // verify all my subtrees are unambiguous!
3571 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3572 p != subtrees.end();
3573 ++p) {
3574 CDir *dir = p->first;
3575 if (dir->is_ambiguous_dir_auth()) {
3576 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3577 }
3578 assert(!dir->is_ambiguous_dir_auth());
3579 }
3580
3581 show_subtrees();
3582 }
3583
3584
3585 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3586 {
3587 assert(my_ambiguous_imports.count(base) == 0);
3588 my_ambiguous_imports[base] = bounds;
3589 }
3590
3591
3592 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3593 {
3594 // make a list
3595 vector<dirfrag_t> binos;
3596 for (set<CDir*>::iterator p = bounds.begin();
3597 p != bounds.end();
3598 ++p)
3599 binos.push_back((*p)->dirfrag());
3600
3601 // note: this can get called twice if the exporter fails during recovery
3602 if (my_ambiguous_imports.count(base->dirfrag()))
3603 my_ambiguous_imports.erase(base->dirfrag());
3604
3605 add_ambiguous_import(base->dirfrag(), binos);
3606 }
3607
3608 void MDCache::cancel_ambiguous_import(CDir *dir)
3609 {
3610 dirfrag_t df = dir->dirfrag();
3611 assert(my_ambiguous_imports.count(df));
3612 dout(10) << "cancel_ambiguous_import " << df
3613 << " bounds " << my_ambiguous_imports[df]
3614 << " " << *dir
3615 << dendl;
3616 my_ambiguous_imports.erase(df);
3617 }
3618
3619 void MDCache::finish_ambiguous_import(dirfrag_t df)
3620 {
3621 assert(my_ambiguous_imports.count(df));
3622 vector<dirfrag_t> bounds;
3623 bounds.swap(my_ambiguous_imports[df]);
3624 my_ambiguous_imports.erase(df);
3625
3626 dout(10) << "finish_ambiguous_import " << df
3627 << " bounds " << bounds
3628 << dendl;
3629 CDir *dir = get_dirfrag(df);
3630 assert(dir);
3631
3632 // adjust dir_auth, import maps
3633 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3634 try_subtree_merge(dir);
3635 }
3636
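// Remove an inode and everything cached beneath it: walk each dirfrag, unlink
// and recursively remove primary children, drop the dentries, detach any
// subtree roots, close the dirfrags, and finally remove the inode itself.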
3637 void MDCache::remove_inode_recursive(CInode *in)
3638 {
3639 dout(10) << "remove_inode_recursive " << *in << dendl;
3640 list<CDir*> ls;
3641 in->get_dirfrags(ls);
3642 list<CDir*>::iterator p = ls.begin();
3643 while (p != ls.end()) {
3644 CDir *subdir = *p++;
3645
3646 dout(10) << " removing dirfrag " << subdir << dendl;
3647 auto it = subdir->items.begin();
3648 while (it != subdir->items.end()) {
3649 CDentry *dn = it->second;
3650 ++it;
3651 CDentry::linkage_t *dnl = dn->get_linkage();
3652 if (dnl->is_primary()) {
3653 CInode *tin = dnl->get_inode();
3654 subdir->unlink_inode(dn, false);
3655 remove_inode_recursive(tin);
3656 }
3657 subdir->remove_dentry(dn);
3658 }
3659
3660 if (subdir->is_subtree_root())
3661 remove_subtree(subdir);
3662 in->close_dirfrag(subdir->dirfrag().frag);
3663 }
3664 remove_inode(in);
3665 }
3666
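// Try to expire a non-auth inode and everything under it, queueing expire
// messages in 'expiremap'. Returns true if something could not be expired
// (a subtree root, a linked remote stray, or a non-expireable dentry), in
// which case the caller should abort.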
3667 bool MDCache::expire_recursive(
3668 CInode *in,
3669 map<mds_rank_t, MCacheExpire*>& expiremap)
3670 {
3671 assert(!in->is_auth());
3672
3673 dout(10) << __func__ << ":" << *in << dendl;
3674
3675 // Recurse into any dirfrags beneath this inode
3676 list<CDir*> ls;
3677 in->get_dirfrags(ls);
3678 for (auto subdir : ls) {
3679 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3680 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3681 return true;
3682 }
3683
3684 for (auto &it : subdir->items) {
3685 CDentry *dn = it.second;
3686 CDentry::linkage_t *dnl = dn->get_linkage();
3687 if (dnl->is_primary()) {
3688 CInode *tin = dnl->get_inode();
3689
3690 /* Remote strays with linkage (i.e. hardlinks) should not be
3691 * expired, because they may be the target of
3692 * a rename() as the owning MDS shuts down */
3693 if (!tin->is_stray() && tin->inode.nlink) {
3694 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3695 return true;
3696 }
3697
3698 const bool abort = expire_recursive(tin, expiremap);
3699 if (abort) {
3700 return true;
3701 }
3702 }
3703 if (dn->lru_is_expireable()) {
3704 trim_dentry(dn, expiremap);
3705 } else {
3706 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3707 return true;
3708 }
3709 }
3710 }
3711
3712 return false;
3713 }
3714
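// Remove any cached inodes that have no parent dentry and are not base inodes
// (e.g. root or an mdsdir), recursively trimming everything beneath them.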
3715 void MDCache::trim_unlinked_inodes()
3716 {
3717 dout(7) << "trim_unlinked_inodes" << dendl;
3718 list<CInode*> q;
3719 for (auto &p : inode_map) {
3720 CInode *in = p.second;
3721 if (in->get_parent_dn() == NULL && !in->is_base()) {
3722 dout(7) << " will trim from " << *in << dendl;
3723 q.push_back(in);
3724 }
3725 }
3726 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3727 remove_inode_recursive(*p);
3728 }
3729
3730 /** recalc_auth_bits()
3731 * once subtree auth is disambiguated, we need to adjust all the
3732 * auth and dirty bits in our cache before moving on.
3733 */
3734 void MDCache::recalc_auth_bits(bool replay)
3735 {
3736 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3737
3738 if (root) {
3739 root->inode_auth.first = mds->mdsmap->get_root();
3740 bool auth = mds->get_nodeid() == root->inode_auth.first;
3741 if (auth) {
3742 root->state_set(CInode::STATE_AUTH);
3743 } else {
3744 root->state_clear(CInode::STATE_AUTH);
3745 if (!replay)
3746 root->state_set(CInode::STATE_REJOINING);
3747 }
3748 }
3749
3750 set<CInode*> subtree_inodes;
3751 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3752 p != subtrees.end();
3753 ++p) {
3754 if (p->first->dir_auth.first == mds->get_nodeid())
3755 subtree_inodes.insert(p->first->inode);
3756 }
3757
3758 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3759 p != subtrees.end();
3760 ++p) {
3761 if (p->first->inode->is_mdsdir()) {
3762 CInode *in = p->first->inode;
3763 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3764 if (auth) {
3765 in->state_set(CInode::STATE_AUTH);
3766 } else {
3767 in->state_clear(CInode::STATE_AUTH);
3768 if (!replay)
3769 in->state_set(CInode::STATE_REJOINING);
3770 }
3771 }
3772
3773 list<CDir*> dfq; // dirfrag queue
3774 dfq.push_back(p->first);
3775
3776 bool auth = p->first->authority().first == mds->get_nodeid();
3777 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3778
3779 while (!dfq.empty()) {
3780 CDir *dir = dfq.front();
3781 dfq.pop_front();
3782
3783 // dir
3784 if (auth) {
3785 dir->state_set(CDir::STATE_AUTH);
3786 } else {
3787 dir->state_clear(CDir::STATE_AUTH);
3788 if (!replay) {
3789 // close empty non-auth dirfrag
3790 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3791 dir->inode->close_dirfrag(dir->get_frag());
3792 continue;
3793 }
3794 dir->state_set(CDir::STATE_REJOINING);
3795 dir->state_clear(CDir::STATE_COMPLETE);
3796 if (dir->is_dirty())
3797 dir->mark_clean();
3798 }
3799 }
3800
3801 // dentries in this dir
3802 for (auto &p : dir->items) {
3803 // dn
3804 CDentry *dn = p.second;
3805 CDentry::linkage_t *dnl = dn->get_linkage();
3806 if (auth) {
3807 dn->state_set(CDentry::STATE_AUTH);
3808 } else {
3809 dn->state_clear(CDentry::STATE_AUTH);
3810 if (!replay) {
3811 dn->state_set(CDentry::STATE_REJOINING);
3812 if (dn->is_dirty())
3813 dn->mark_clean();
3814 }
3815 }
3816
3817 if (dnl->is_primary()) {
3818 // inode
3819 CInode *in = dnl->get_inode();
3820 if (auth) {
3821 in->state_set(CInode::STATE_AUTH);
3822 } else {
3823 in->state_clear(CInode::STATE_AUTH);
3824 if (!replay) {
3825 in->state_set(CInode::STATE_REJOINING);
3826 if (in->is_dirty())
3827 in->mark_clean();
3828 if (in->is_dirty_parent())
3829 in->clear_dirty_parent();
3830 // avoid touching scatterlocks for our subtree roots!
3831 if (subtree_inodes.count(in) == 0)
3832 in->clear_scatter_dirty();
3833 }
3834 }
3835 // recurse?
3836 if (in->is_dir())
3837 in->get_nested_dirfrags(dfq);
3838 }
3839 }
3840 }
3841 }
3842
3843 show_subtrees();
3844 show_cache();
3845 }
3846
3847
3848
3849 // ===========================================================================
3850 // REJOIN
3851
3852 /*
3853 * notes on scatterlock recovery:
3854 *
3855 * - recovering inode replica sends scatterlock data for any subtree
3856 * roots (the only ones that are possibly dirty).
3857 *
3858 * - surviving auth incorporates any provided scatterlock data. any
3859 * pending gathers are then finished, as with the other lock types.
3860 *
3861 * that takes care of surviving auth + (recovering replica)*.
3862 *
3863 * - surviving replica sends strong_inode, which includes current
3864 * scatterlock state, AND any dirty scatterlock data. this
3865 * provides the recovering auth with everything it might need.
3866 *
3867 * - recovering auth must pick initial scatterlock state based on
3868 * (weak|strong) rejoins.
3869 * - always assimilate scatterlock data (it can't hurt)
3870 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3871 * - include base inode in ack for all inodes that saw scatterlock content
3872 *
3873 * also, for scatter gather,
3874 *
3875 * - auth increments {frag,r}stat.version on completion of any gather.
3876 *
3877 * - auth incorporates changes in a gather _only_ if the version
3878 * matches.
3879 *
3880 * - replica discards changes any time the scatterlock syncs, and
3881 * after recovery.
3882 */
3883
3884 void MDCache::dump_rejoin_status(Formatter *f) const
3885 {
3886 f->open_object_section("rejoin_status");
3887 f->dump_stream("rejoin_gather") << rejoin_gather;
3888 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3889 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3890 f->close_section();
3891 }
3892
3893 void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3894 {
3895 dout(10) << "rejoin_start" << dendl;
3896 assert(!rejoin_done);
3897 rejoin_done.reset(rejoin_done_);
3898
3899 rejoin_gather = recovery_set;
3900 // need to finish opening cap inodes before sending cache rejoins
3901 rejoin_gather.insert(mds->get_nodeid());
3902 process_imported_caps();
3903 }
3904
3905 /*
3906 * rejoin phase!
3907 *
3908 * this initiates rejoin. it should be called before we get any
3909 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3910 *
3911 * we start out by sending rejoins to everyone in the recovery set.
3912 *
3913 * if we are rejoining, send for all regions in our cache.
3914 * if we are active|stopping, send only to nodes that are rejoining.
3915 */
3916 void MDCache::rejoin_send_rejoins()
3917 {
3918 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3919
3920 if (rejoin_gather.count(mds->get_nodeid())) {
3921 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3922 rejoins_pending = true;
3923 return;
3924 }
3925 if (!resolve_gather.empty()) {
3926 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3927 << resolve_gather << ")" << dendl;
3928 rejoins_pending = true;
3929 return;
3930 }
3931
3932 assert(!migrator->is_importing());
3933 assert(!migrator->is_exporting());
3934
3935 if (!mds->is_rejoin()) {
3936 disambiguate_other_imports();
3937 }
3938
3939 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3940
3941
3942 // if i am rejoining, send a rejoin to everyone.
3943 // otherwise, just send to others who are rejoining.
3944 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3945 p != recovery_set.end();
3946 ++p) {
3947 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3948 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3949 if (mds->is_rejoin())
3950 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3951 else if (mds->mdsmap->is_rejoin(*p))
3952 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3953 }
3954
3955 if (mds->is_rejoin()) {
3956 map<client_t, set<mds_rank_t> > client_exports;
3957 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3958 assert(cap_export_targets.count(p->first));
3959 mds_rank_t target = cap_export_targets[p->first];
3960 if (rejoins.count(target) == 0)
3961 continue;
3962 rejoins[target]->cap_exports[p->first] = p->second;
3963 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3964 client_exports[q->first].insert(target);
3965 }
3966 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3967 p != client_exports.end();
3968 ++p) {
3969 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3970 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3971 rejoins[*q]->client_map[p->first] = inst;
3972 }
3973 }
3974
3975
3976 // check all subtrees
3977 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
3978 p != subtrees.end();
3979 ++p) {
3980 CDir *dir = p->first;
3981 assert(dir->is_subtree_root());
3982 if (dir->is_ambiguous_dir_auth()) {
3983 // exporter is recovering, importer is survivor.
3984 assert(rejoins.count(dir->authority().first));
3985 assert(!rejoins.count(dir->authority().second));
3986 continue;
3987 }
3988
3989 // my subtree?
3990 if (dir->is_auth())
3991 continue; // skip my own regions!
3992
3993 mds_rank_t auth = dir->get_dir_auth().first;
3994 assert(auth >= 0);
3995 if (rejoins.count(auth) == 0)
3996 continue; // don't care about this node's subtrees
3997
3998 rejoin_walk(dir, rejoins[auth]);
3999 }
4000
4001 // rejoin root inodes, too
4002 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4003 p != rejoins.end();
4004 ++p) {
4005 if (mds->is_rejoin()) {
4006 // weak
4007 if (p->first == 0 && root) {
4008 p->second->add_weak_inode(root->vino());
4009 if (root->is_dirty_scattered()) {
4010 dout(10) << " sending scatterlock state on root " << *root << dendl;
4011 p->second->add_scatterlock_state(root);
4012 }
4013 }
4014 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4015 if (in)
4016 p->second->add_weak_inode(in->vino());
4017 }
4018 } else {
4019 // strong
4020 if (p->first == 0 && root) {
4021 p->second->add_strong_inode(root->vino(),
4022 root->get_replica_nonce(),
4023 root->get_caps_wanted(),
4024 root->filelock.get_state(),
4025 root->nestlock.get_state(),
4026 root->dirfragtreelock.get_state());
4027 root->state_set(CInode::STATE_REJOINING);
4028 if (root->is_dirty_scattered()) {
4029 dout(10) << " sending scatterlock state on root " << *root << dendl;
4030 p->second->add_scatterlock_state(root);
4031 }
4032 }
4033
4034 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4035 p->second->add_strong_inode(in->vino(),
4036 in->get_replica_nonce(),
4037 in->get_caps_wanted(),
4038 in->filelock.get_state(),
4039 in->nestlock.get_state(),
4040 in->dirfragtreelock.get_state());
4041 in->state_set(CInode::STATE_REJOINING);
4042 }
4043 }
4044 }
4045
4046 if (!mds->is_rejoin()) {
4047 // i am survivor. send strong rejoin.
4048 // note request remote_auth_pins, xlocks
4049 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4050 p != active_requests.end();
4051 ++p) {
4052 MDRequestRef& mdr = p->second;
4053 if (mdr->is_slave())
4054 continue;
4055 // auth pins
4056 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4057 q != mdr->remote_auth_pins.end();
4058 ++q) {
4059 if (!q->first->is_auth()) {
4060 assert(q->second == q->first->authority().first);
4061 if (rejoins.count(q->second) == 0) continue;
4062 MMDSCacheRejoin *rejoin = rejoins[q->second];
4063
4064 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4065 MDSCacheObjectInfo i;
4066 q->first->set_object_info(i);
4067 if (i.ino)
4068 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4069 else
4070 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4071
4072 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4073 mdr->more()->rename_inode == q->first)
4074 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4075 mdr->reqid, mdr->attempt);
4076 }
4077 }
4078 // xlocks
4079 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4080 q != mdr->xlocks.end();
4081 ++q) {
4082 if (!(*q)->get_parent()->is_auth()) {
4083 mds_rank_t who = (*q)->get_parent()->authority().first;
4084 if (rejoins.count(who) == 0) continue;
4085 MMDSCacheRejoin *rejoin = rejoins[who];
4086
4087 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4088 MDSCacheObjectInfo i;
4089 (*q)->get_parent()->set_object_info(i);
4090 if (i.ino)
4091 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4092 mdr->reqid, mdr->attempt);
4093 else
4094 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4095 mdr->reqid, mdr->attempt);
4096 }
4097 }
4098 // remote wrlocks
4099 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4100 q != mdr->remote_wrlocks.end();
4101 ++q) {
4102 mds_rank_t who = q->second;
4103 if (rejoins.count(who) == 0) continue;
4104 MMDSCacheRejoin *rejoin = rejoins[who];
4105
4106 dout(15) << " " << *mdr << " wrlock on " << q->second
4107 << " " << q->first->get_parent() << dendl;
4108 MDSCacheObjectInfo i;
4109 q->first->get_parent()->set_object_info(i);
4110 assert(i.ino);
4111 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4112 mdr->reqid, mdr->attempt);
4113 }
4114 }
4115 }
4116
4117 // send the messages
4118 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4119 p != rejoins.end();
4120 ++p) {
4121 assert(rejoin_sent.count(p->first) == 0);
4122 assert(rejoin_ack_gather.count(p->first) == 0);
4123 rejoin_sent.insert(p->first);
4124 rejoin_ack_gather.insert(p->first);
4125 mds->send_message_mds(p->second, p->first);
4126 }
4127 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4128 rejoins_pending = false;
4129
4130 // nothing?
4131 if (mds->is_rejoin() && rejoins.empty()) {
4132 dout(10) << "nothing to rejoin" << dendl;
4133 rejoin_gather_finish();
4134 }
4135 }
4136
4137
4138 /**
4139 * rejoin_walk - build rejoin declarations for a subtree
4140 *
4141 * @param dir subtree root
4142 * @param rejoin rejoin message
4143 *
4144 * from a rejoining node:
4145 * weak dirfrag
4146 * weak dentries (w/ connectivity)
4147 *
4148 * from a surviving node:
4149 * strong dirfrag
4150 * strong dentries (no connectivity!)
4151 * strong inodes
4152 */
4153 void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4154 {
4155 dout(10) << "rejoin_walk " << *dir << dendl;
4156
4157 list<CDir*> nested; // finish this dir, then do nested items
4158
4159 if (mds->is_rejoin()) {
4160 // WEAK
4161 rejoin->add_weak_dirfrag(dir->dirfrag());
4162 for (auto &p : dir->items) {
4163 CDentry *dn = p.second;
4164 assert(dn->last == CEPH_NOSNAP);
4165 CDentry::linkage_t *dnl = dn->get_linkage();
4166 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4167 assert(dnl->is_primary());
4168 CInode *in = dnl->get_inode();
4169 assert(dnl->get_inode()->is_dir());
4170 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
4171 in->get_nested_dirfrags(nested);
4172 if (in->is_dirty_scattered()) {
4173 dout(10) << " sending scatterlock state on " << *in << dendl;
4174 rejoin->add_scatterlock_state(in);
4175 }
4176 }
4177 } else {
4178 // STRONG
4179 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4180 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4181 dir->state_set(CDir::STATE_REJOINING);
4182
4183 for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
4184 CDentry *dn = it->second;
4185 CDentry::linkage_t *dnl = dn->get_linkage();
4186 dout(15) << " add_strong_dentry " << *dn << dendl;
4187 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4188 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4189 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4190 dnl->is_remote() ? dnl->get_remote_d_type():0,
4191 dn->get_replica_nonce(),
4192 dn->lock.get_state());
4193 dn->state_set(CDentry::STATE_REJOINING);
4194 if (dnl->is_primary()) {
4195 CInode *in = dnl->get_inode();
4196 dout(15) << " add_strong_inode " << *in << dendl;
4197 rejoin->add_strong_inode(in->vino(),
4198 in->get_replica_nonce(),
4199 in->get_caps_wanted(),
4200 in->filelock.get_state(),
4201 in->nestlock.get_state(),
4202 in->dirfragtreelock.get_state());
4203 in->state_set(CInode::STATE_REJOINING);
4204 in->get_nested_dirfrags(nested);
4205 if (in->is_dirty_scattered()) {
4206 dout(10) << " sending scatterlock state on " << *in << dendl;
4207 rejoin->add_scatterlock_state(in);
4208 }
4209 }
4210 }
4211 }
4212
4213 // recurse into nested dirs
4214 for (list<CDir*>::iterator p = nested.begin();
4215 p != nested.end();
4216 ++p)
4217 rejoin_walk(*p, rejoin);
4218 }
4219
4220
4221 /*
4222 * i got a rejoin.
4223 * - reply with the lockstate
4224 *
4225 * if i am active|stopping,
4226 * - remove source from replica list for everything not referenced here.
4227 * This function puts the passed message before returning.
4228 */
4229 void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4230 {
4231 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4232 << " (" << m->get_payload().length() << " bytes)"
4233 << dendl;
4234
4235 switch (m->op) {
4236 case MMDSCacheRejoin::OP_WEAK:
4237 handle_cache_rejoin_weak(m);
4238 break;
4239 case MMDSCacheRejoin::OP_STRONG:
4240 handle_cache_rejoin_strong(m);
4241 break;
4242 case MMDSCacheRejoin::OP_ACK:
4243 handle_cache_rejoin_ack(m);
4244 break;
4245
4246 default:
4247 ceph_abort();
4248 }
4249 m->put();
4250 }
4251
4252
4253 /*
4254 * handle_cache_rejoin_weak
4255 *
4256 * the sender
4257 * - is recovering from their journal.
4258 * - may have incorrect (out of date) inode contents
4259 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4260 *
4261 * if the sender didn't trim_non_auth(), they
4262 * - may have incorrect (out of date) dentry/inode linkage
4263 * - may have deleted/purged inodes
4264 * and i may have to go to disk to get accurate inode contents. yuck.
4265 * This function DOES NOT put the passed message before returning
4266 */
4267 void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4268 {
4269 mds_rank_t from = mds_rank_t(weak->get_source().num());
4270
4271 // possible response(s)
4272 MMDSCacheRejoin *ack = 0; // if survivor
4273 set<vinodeno_t> acked_inodes; // if survivor
4274 set<SimpleLock *> gather_locks; // if survivor
4275 bool survivor = false; // am i a survivor?
4276
4277 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4278 survivor = true;
4279 dout(10) << "i am a survivor, and will ack immediately" << dendl;
4280 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4281
4282 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4283
4284 // check cap exports
4285 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4286 CInode *in = get_inode(p->first);
4287 assert(!in || in->is_auth());
4288 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4289 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4290 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4291 Capability::Import& im = imported_caps[p->first][q->first];
4292 if (cap) {
4293 im.cap_id = cap->get_cap_id();
4294 im.issue_seq = cap->get_last_seq();
4295 im.mseq = cap->get_mseq();
4296 } else {
4297 // all are zero
4298 }
4299 }
4300 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4301 }
4302
4303 ::encode(imported_caps, ack->imported_caps);
4304 } else {
4305 assert(mds->is_rejoin());
4306
4307 // we may have already received a strong rejoin from the sender.
4308 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4309 assert(gather_locks.empty());
4310
4311 // check cap exports.
4312 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4313
4314 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4315 CInode *in = get_inode(p->first);
4316 assert(!in || in->is_auth());
4317 // note
4318 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4319 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4320 cap_imports[p->first][q->first][from] = q->second;
4321 }
4322 }
4323 }
4324
4325 // assimilate any potentially dirty scatterlock state
4326 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4327 p != weak->inode_scatterlocks.end();
4328 ++p) {
4329 CInode *in = get_inode(p->first);
4330 assert(in);
4331 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4332 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4333 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4334 if (!survivor)
4335 rejoin_potential_updated_scatterlocks.insert(in);
4336 }
4337
4338 // recovering peer may send incorrect dirfrags here. we need to
4339 // infer which dirfrag they meant. the ack will include a
4340 // strong_dirfrag that will set them straight on the fragmentation.
4341
4342 // walk weak map
4343 set<CDir*> dirs_to_share;
4344 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4345 p != weak->weak_dirfrags.end();
4346 ++p) {
4347 CInode *diri = get_inode(p->ino);
4348 if (!diri)
4349 dout(0) << " missing dir ino " << p->ino << dendl;
4350 assert(diri);
4351
4352 list<frag_t> ls;
4353 if (diri->dirfragtree.is_leaf(p->frag)) {
4354 ls.push_back(p->frag);
4355 } else {
4356 diri->dirfragtree.get_leaves_under(p->frag, ls);
4357 if (ls.empty())
4358 ls.push_back(diri->dirfragtree[p->frag.value()]);
4359 }
4360 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4361 frag_t fg = *q;
4362 CDir *dir = diri->get_dirfrag(fg);
4363 if (!dir) {
4364 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4365 continue;
4366 }
4367 assert(dir);
4368 if (dirs_to_share.count(dir)) {
4369 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4370 } else {
4371 dirs_to_share.insert(dir);
4372 unsigned nonce = dir->add_replica(from);
4373 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4374 if (ack) {
4375 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4376 ack->add_dirfrag_base(dir);
4377 }
4378 }
4379 }
4380 }
4381
4382 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4383 p != weak->weak.end();
4384 ++p) {
4385 CInode *diri = get_inode(p->first);
4386 if (!diri)
4387 dout(0) << " missing dir ino " << p->first << dendl;
4388 assert(diri);
4389
4390 // weak dentries
4391 CDir *dir = 0;
4392 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4393 q != p->second.end();
4394 ++q) {
4395 // locate proper dirfrag.
4396 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4397 frag_t fg = diri->pick_dirfrag(q->first.name);
4398 if (!dir || dir->get_frag() != fg) {
4399 dir = diri->get_dirfrag(fg);
4400 if (!dir)
4401 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4402 assert(dir);
4403 assert(dirs_to_share.count(dir));
4404 }
4405
4406 // and dentry
4407 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4408 assert(dn);
4409 CDentry::linkage_t *dnl = dn->get_linkage();
4410 assert(dnl->is_primary());
4411
4412 if (survivor && dn->is_replica(from))
4413 dentry_remove_replica(dn, from, gather_locks);
4414 unsigned dnonce = dn->add_replica(from);
4415 dout(10) << " have " << *dn << dendl;
4416 if (ack)
4417 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4418 dnl->get_inode()->ino(), inodeno_t(0), 0,
4419 dnonce, dn->lock.get_replica_state());
4420
4421 // inode
4422 CInode *in = dnl->get_inode();
4423 assert(in);
4424
4425 if (survivor && in->is_replica(from))
4426 inode_remove_replica(in, from, true, gather_locks);
4427 unsigned inonce = in->add_replica(from);
4428 dout(10) << " have " << *in << dendl;
4429
4430 // scatter the dirlock, just in case?
4431 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4432 in->filelock.set_state(LOCK_MIX);
4433
4434 if (ack) {
4435 acked_inodes.insert(in->vino());
4436 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4437 bufferlist bl;
4438 in->_encode_locks_state_for_rejoin(bl, from);
4439 ack->add_inode_locks(in, inonce, bl);
4440 }
4441 }
4442 }
4443
4444 // weak base inodes? (root, stray, etc.)
4445 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4446 p != weak->weak_inodes.end();
4447 ++p) {
4448 CInode *in = get_inode(*p);
4449 assert(in); // hmm fixme wrt stray?
4450 if (survivor && in->is_replica(from))
4451 inode_remove_replica(in, from, true, gather_locks);
4452 unsigned inonce = in->add_replica(from);
4453 dout(10) << " have base " << *in << dendl;
4454
4455 if (ack) {
4456 acked_inodes.insert(in->vino());
4457 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4458 bufferlist bl;
4459 in->_encode_locks_state_for_rejoin(bl, from);
4460 ack->add_inode_locks(in, inonce, bl);
4461 }
4462 }
4463
4464 assert(rejoin_gather.count(from));
4465 rejoin_gather.erase(from);
4466 if (survivor) {
4467 // survivor. do everything now.
4468 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4469 p != weak->inode_scatterlocks.end();
4470 ++p) {
4471 CInode *in = get_inode(p->first);
4472 assert(in);
4473 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4474 acked_inodes.insert(in->vino());
4475 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4476 }
4477
4478 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4479 mds->send_message(ack, weak->get_connection());
4480
4481 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4482 if (!(*p)->is_stable())
4483 mds->locker->eval_gather(*p);
4484 }
4485 } else {
4486 // done?
4487 if (rejoin_gather.empty()) {
4488 rejoin_gather_finish();
4489 } else {
4490 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4491 }
4492 }
4493 }
4494
4495 class C_MDC_RejoinGatherFinish : public MDCacheContext {
4496 public:
4497 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4498 void finish(int r) override {
4499 mdcache->rejoin_gather_finish();
4500 }
4501 };
4502
4503 /*
4504 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4505 *
4506 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4507 * ack, the replica does not exist, and we can remove it from our replica maps.
4508 */
4509 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4510 set<vinodeno_t>& acked_inodes,
4511 set<SimpleLock *>& gather_locks)
4512 {
4513 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4514
4515 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4516 // inode?
4517 if (in->is_auth() &&
4518 in->is_replica(from) &&
4519 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4520 inode_remove_replica(in, from, false, gather_locks);
4521 dout(10) << " rem " << *in << dendl;
4522 }
4523
4524 if (!in->is_dir())
4525 return;
4526
4527 list<CDir*> dfs;
4528 in->get_dirfrags(dfs);
4529 for (list<CDir*>::iterator p = dfs.begin();
4530 p != dfs.end();
4531 ++p) {
4532 CDir *dir = *p;
4533 if (!dir->is_auth())
4534 continue;
4535
4536 if (dir->is_replica(from) &&
4537 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4538 dir->remove_replica(from);
4539 dout(10) << " rem " << *dir << dendl;
4540 }
4541
4542 // dentries
4543 for (auto &p : dir->items) {
4544 CDentry *dn = p.second;
4545
4546 if (dn->is_replica(from) &&
4547 (ack == NULL ||
4548 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4549 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
4550 dentry_remove_replica(dn, from, gather_locks);
4551 dout(10) << " rem " << *dn << dendl;
4552 }
4553 }
4554 }
4555 };
4556
4557 for (auto &p : inode_map)
4558 scour_func(p.second);
4559 for (auto &p : snap_inode_map)
4560 scour_func(p.second);
4561 }
4562
4563
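// Fabricate a placeholder (REJOINUNDEF) inode for an object referenced by a
// rejoin message that is not in our cache; it is tracked in rejoin_undef_inodes
// and filled in (or trimmed) later.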
4564 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4565 {
4566 CInode *in = new CInode(this, true, 1, last);
4567 in->inode.ino = ino;
4568 in->state_set(CInode::STATE_REJOINUNDEF);
4569 add_inode(in);
4570 rejoin_undef_inodes.insert(in);
4571 dout(10) << " invented " << *in << dendl;
4572 return in;
4573 }
4574
4575 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4576 {
4577 CInode *in = get_inode(df.ino);
4578 if (!in)
4579 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4580 if (!in->is_dir()) {
4581 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4582 in->inode.mode = S_IFDIR;
4583 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4584 }
4585 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4586 dir->state_set(CDir::STATE_REJOINUNDEF);
4587 rejoin_undef_dirfrags.insert(dir);
4588 dout(10) << " invented " << *dir << dendl;
4589 return dir;
4590 }
4591
4592 /* This function DOES NOT put the passed message before returning */
4593 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4594 {
4595 mds_rank_t from = mds_rank_t(strong->get_source().num());
4596
4597 // only a recovering node will get a strong rejoin.
4598 assert(mds->is_rejoin());
4599
4600 // assimilate any potentially dirty scatterlock state
4601 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4602 p != strong->inode_scatterlocks.end();
4603 ++p) {
4604 CInode *in = get_inode(p->first);
4605 assert(in);
4606 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4607 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4608 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4609 rejoin_potential_updated_scatterlocks.insert(in);
4610 }
4611
4612 rejoin_unlinked_inodes[from].clear();
4613
4614 // surviving peer may send incorrect dirfrag here (maybe they didn't
4615 // get the fragment notify, or maybe we rolled back?). we need to
4616 // infer the right frag and get them with the program. somehow.
4617 // we don't normally send ACK.. so we'll need to bundle this with
4618 // MISSING or something.
4619
4620 // strong dirfrags/dentries.
4621 // also process auth_pins, xlocks.
4622 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4623 p != strong->strong_dirfrags.end();
4624 ++p) {
4625 CInode *diri = get_inode(p->first.ino);
4626 if (!diri)
4627 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4628 CDir *dir = diri->get_dirfrag(p->first.frag);
4629 bool refragged = false;
4630 if (dir) {
4631 dout(10) << " have " << *dir << dendl;
4632 } else {
4633 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4634 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4635 else if (diri->dirfragtree.is_leaf(p->first.frag))
4636 dir = rejoin_invent_dirfrag(p->first);
4637 }
4638 if (dir) {
4639 dir->add_replica(from, p->second.nonce);
4640 dir->dir_rep = p->second.dir_rep;
4641 } else {
4642 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4643 list<frag_t> ls;
4644 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4645 if (ls.empty())
4646 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4647 dout(10) << " maps to frag(s) " << ls << dendl;
4648 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4649 CDir *dir = diri->get_dirfrag(*q);
4650 if (!dir)
4651 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4652 else
4653 dout(10) << " have(approx) " << *dir << dendl;
4654 dir->add_replica(from, p->second.nonce);
4655 dir->dir_rep = p->second.dir_rep;
4656 }
4657 refragged = true;
4658 }
4659
4660 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4661 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4662 q != dmap.end();
4663 ++q) {
4664 CDentry *dn;
4665 if (!refragged)
4666 dn = dir->lookup(q->first.name, q->first.snapid);
4667 else {
4668 frag_t fg = diri->pick_dirfrag(q->first.name);
4669 dir = diri->get_dirfrag(fg);
4670 assert(dir);
4671 dn = dir->lookup(q->first.name, q->first.snapid);
4672 }
4673 if (!dn) {
4674 if (q->second.is_remote()) {
4675 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4676 q->second.first, q->first.snapid);
4677 } else if (q->second.is_null()) {
4678 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4679 } else {
4680 CInode *in = get_inode(q->second.ino, q->first.snapid);
4681 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4682 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4683 }
4684 dout(10) << " invented " << *dn << dendl;
4685 }
4686 CDentry::linkage_t *dnl = dn->get_linkage();
4687
4688 // dn auth_pin?
4689 if (strong->authpinned_dentries.count(p->first) &&
4690 strong->authpinned_dentries[p->first].count(q->first)) {
4691 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4692 r != strong->authpinned_dentries[p->first][q->first].end();
4693 ++r) {
4694 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4695
4696 // get/create slave mdrequest
4697 MDRequestRef mdr;
4698 if (have_request(r->reqid))
4699 mdr = request_get(r->reqid);
4700 else
4701 mdr = request_start_slave(r->reqid, r->attempt, strong);
4702 mdr->auth_pin(dn);
4703 }
4704 }
4705
4706 // dn xlock?
4707 if (strong->xlocked_dentries.count(p->first) &&
4708 strong->xlocked_dentries[p->first].count(q->first)) {
4709 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4710 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4711 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4712 assert(mdr->is_auth_pinned(dn));
4713 if (!mdr->xlocks.count(&dn->versionlock)) {
4714 assert(dn->versionlock.can_xlock_local());
4715 dn->versionlock.get_xlock(mdr, mdr->get_client());
4716 mdr->xlocks.insert(&dn->versionlock);
4717 mdr->locks.insert(&dn->versionlock);
4718 }
4719 if (dn->lock.is_stable())
4720 dn->auth_pin(&dn->lock);
4721 dn->lock.set_state(LOCK_XLOCK);
4722 dn->lock.get_xlock(mdr, mdr->get_client());
4723 mdr->xlocks.insert(&dn->lock);
4724 mdr->locks.insert(&dn->lock);
4725 }
4726
4727 dn->add_replica(from, q->second.nonce);
4728 dout(10) << " have " << *dn << dendl;
4729
4730 if (dnl->is_primary()) {
4731 if (q->second.is_primary()) {
4732 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4733 // the survivor missed MDentryUnlink+MDentryLink messages ?
4734 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4735 CInode *in = get_inode(q->second.ino, q->first.snapid);
4736 assert(in);
4737 assert(in->get_parent_dn());
4738 rejoin_unlinked_inodes[from].insert(in);
4739 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4740 }
4741 } else {
4742 // the survivor missed MDentryLink message ?
4743 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4744 dout(7) << " sender doesn't have primary dentry" << dendl;
4745 }
4746 } else {
4747 if (q->second.is_primary()) {
4748 // the survivor missed MDentryUnlink message ?
4749 CInode *in = get_inode(q->second.ino, q->first.snapid);
4750 assert(in);
4751 assert(in->get_parent_dn());
4752 rejoin_unlinked_inodes[from].insert(in);
4753 dout(7) << " sender has primary dentry but we don't" << dendl;
4754 }
4755 }
4756 }
4757 }
4758
4759 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4760 p != strong->strong_inodes.end();
4761 ++p) {
4762 CInode *in = get_inode(p->first);
4763 assert(in);
4764 in->add_replica(from, p->second.nonce);
4765 dout(10) << " have " << *in << dendl;
4766
4767 MMDSCacheRejoin::inode_strong &is = p->second;
4768
4769 // caps_wanted
4770 if (is.caps_wanted) {
4771 in->mds_caps_wanted[from] = is.caps_wanted;
4772 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4773 << " on " << *in << dendl;
4774 }
4775
4776 // scatterlocks?
4777 // infer state from replica state:
4778 // * go to MIX if they might have wrlocks
4779 // * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4780 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4781 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4782 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4783
4784 // auth pin?
4785 if (strong->authpinned_inodes.count(in->vino())) {
4786 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4787 r != strong->authpinned_inodes[in->vino()].end();
4788 ++r) {
4789 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4790
4791 // get/create slave mdrequest
4792 MDRequestRef mdr;
4793 if (have_request(r->reqid))
4794 mdr = request_get(r->reqid);
4795 else
4796 mdr = request_start_slave(r->reqid, r->attempt, strong);
4797 if (strong->frozen_authpin_inodes.count(in->vino())) {
4798 assert(!in->get_num_auth_pins());
4799 mdr->freeze_auth_pin(in);
4800 } else {
4801 assert(!in->is_frozen_auth_pin());
4802 }
4803 mdr->auth_pin(in);
4804 }
4805 }
4806 // xlock(s)?
4807 if (strong->xlocked_inodes.count(in->vino())) {
4808 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4809 q != strong->xlocked_inodes[in->vino()].end();
4810 ++q) {
4811 SimpleLock *lock = in->get_lock(q->first);
4812 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4813 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4814 assert(mdr->is_auth_pinned(in));
4815 if (!mdr->xlocks.count(&in->versionlock)) {
4816 assert(in->versionlock.can_xlock_local());
4817 in->versionlock.get_xlock(mdr, mdr->get_client());
4818 mdr->xlocks.insert(&in->versionlock);
4819 mdr->locks.insert(&in->versionlock);
4820 }
4821 if (lock->is_stable())
4822 in->auth_pin(lock);
4823 lock->set_state(LOCK_XLOCK);
4824 if (lock == &in->filelock)
4825 in->loner_cap = -1;
4826 lock->get_xlock(mdr, mdr->get_client());
4827 mdr->xlocks.insert(lock);
4828 mdr->locks.insert(lock);
4829 }
4830 }
4831 }
4832 // wrlock(s)?
4833 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4834 p != strong->wrlocked_inodes.end();
4835 ++p) {
4836 CInode *in = get_inode(p->first);
4837 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4838 q != p->second.end();
4839 ++q) {
4840 SimpleLock *lock = in->get_lock(q->first);
4841 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4842 r != q->second.end();
4843 ++r) {
4844 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4845 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4846 if (in->is_auth())
4847 assert(mdr->is_auth_pinned(in));
4848 lock->set_state(LOCK_MIX);
4849 if (lock == &in->filelock)
4850 in->loner_cap = -1;
4851 lock->get_wrlock(true);
4852 mdr->wrlocks.insert(lock);
4853 mdr->locks.insert(lock);
4854 }
4855 }
4856 }
4857
4858 // done?
4859 assert(rejoin_gather.count(from));
4860 rejoin_gather.erase(from);
4861 if (rejoin_gather.empty()) {
4862 rejoin_gather_finish();
4863 } else {
4864 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4865 }
4866 }
4867
4868 /* This function DOES NOT put the passed message before returning */
4869 void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4870 {
4871 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4872 mds_rank_t from = mds_rank_t(ack->get_source().num());
4873
4874 assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4875 bool survivor = !mds->is_rejoin();
4876
4877 // for sending cache expire message
4878 set<CInode*> isolated_inodes;
4879 set<CInode*> refragged_inodes;
4880
4881 // dirs
4882 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4883 p != ack->strong_dirfrags.end();
4884 ++p) {
4885 // we may have had incorrect dir fragmentation; refragment based
4886 // on what the auth tells us.
4887 CDir *dir = get_dirfrag(p->first);
4888 if (!dir) {
4889 dir = get_force_dirfrag(p->first, false);
4890 if (dir)
4891 refragged_inodes.insert(dir->get_inode());
4892 }
4893 if (!dir) {
4894 CInode *diri = get_inode(p->first.ino);
4895 if (!diri) {
4896 // barebones inode; the full inode loop below will clean up.
4897 diri = new CInode(this, false);
4898 diri->inode.ino = p->first.ino;
4899 diri->inode.mode = S_IFDIR;
4900 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4901 add_inode(diri);
4902 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4903 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4904 dout(10) << " add inode " << *diri << dendl;
4905 } else {
4906 diri->inode_auth = CDIR_AUTH_DEFAULT;
4907 isolated_inodes.insert(diri);
4908 dout(10) << " unconnected dirfrag " << p->first << dendl;
4909 }
4910 }
4911 // barebones dirfrag; the full dirfrag loop below will clean up.
4912 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4913 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4914 (dir->authority() != CDIR_AUTH_UNDEF &&
4915 dir->authority().first != from))
4916 adjust_subtree_auth(dir, from);
4917 dout(10) << " add dirfrag " << *dir << dendl;
4918 }
4919
4920 dir->set_replica_nonce(p->second.nonce);
4921 dir->state_clear(CDir::STATE_REJOINING);
4922 dout(10) << " got " << *dir << dendl;
4923
4924 // dentries
4925 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4926 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4927 q != dmap.end();
4928 ++q) {
4929 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4930 if(!dn)
4931 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4932
4933 CDentry::linkage_t *dnl = dn->get_linkage();
4934
4935 assert(dn->last == q->first.snapid);
4936 if (dn->first != q->second.first) {
4937 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4938 dn->first = q->second.first;
4939 }
4940
4941 // may have bad linkage if we missed dentry link/unlink messages
4942 if (dnl->is_primary()) {
4943 CInode *in = dnl->get_inode();
4944 if (!q->second.is_primary() ||
4945 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4946 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4947 dir->unlink_inode(dn);
4948 }
4949 } else if (dnl->is_remote()) {
4950 if (!q->second.is_remote() ||
4951 q->second.remote_ino != dnl->get_remote_ino() ||
4952 q->second.remote_d_type != dnl->get_remote_d_type()) {
4953 dout(10) << " had bad linkage for " << *dn << dendl;
4954 dir->unlink_inode(dn);
4955 }
4956 } else {
4957 if (!q->second.is_null())
4958 dout(10) << " had bad linkage for " << *dn << dendl;
4959 }
4960
4961 // hmm, did we have the proper linkage here?
4962 if (dnl->is_null() && !q->second.is_null()) {
4963 if (q->second.is_remote()) {
4964 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4965 } else {
4966 CInode *in = get_inode(q->second.ino, q->first.snapid);
4967 if (!in) {
4968 // barebones inode; assume it's dir, the full inode loop below will clean up.
4969 in = new CInode(this, false, q->second.first, q->first.snapid);
4970 in->inode.ino = q->second.ino;
4971 in->inode.mode = S_IFDIR;
4972 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4973 add_inode(in);
4974 dout(10) << " add inode " << *in << dendl;
4975 } else if (in->get_parent_dn()) {
4976 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
4977 << ", unlinking " << *in << dendl;
4978 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
4979 }
4980 dn->dir->link_primary_inode(dn, in);
4981 isolated_inodes.erase(in);
4982 }
4983 }
4984
4985 dn->set_replica_nonce(q->second.nonce);
4986 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
4987 dn->state_clear(CDentry::STATE_REJOINING);
4988 dout(10) << " got " << *dn << dendl;
4989 }
4990 }
4991
4992 for (set<CInode*>::iterator p = refragged_inodes.begin();
4993 p != refragged_inodes.end();
4994 ++p) {
4995 list<CDir*> ls;
4996 (*p)->get_nested_dirfrags(ls);
4997 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
4998 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
4999 continue;
5000 assert((*q)->get_num_any() == 0);
5001 (*p)->close_dirfrag((*q)->get_frag());
5002 }
5003 }
5004
5005 // full dirfrags
5006 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5007 p != ack->dirfrag_bases.end();
5008 ++p) {
5009 CDir *dir = get_dirfrag(p->first);
5010 assert(dir);
5011 bufferlist::iterator q = p->second.begin();
5012 dir->_decode_base(q);
5013 dout(10) << " got dir replica " << *dir << dendl;
5014 }
5015
5016 // full inodes
5017 bufferlist::iterator p = ack->inode_base.begin();
5018 while (!p.end()) {
5019 inodeno_t ino;
5020 snapid_t last;
5021 bufferlist basebl;
5022 ::decode(ino, p);
5023 ::decode(last, p);
5024 ::decode(basebl, p);
5025 CInode *in = get_inode(ino, last);
5026 assert(in);
5027 bufferlist::iterator q = basebl.begin();
5028 in->_decode_base(q);
5029 dout(10) << " got inode base " << *in << dendl;
5030 }
5031
5032 // inodes
5033 p = ack->inode_locks.begin();
5034 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5035 while (!p.end()) {
5036 inodeno_t ino;
5037 snapid_t last;
5038 __u32 nonce;
5039 bufferlist lockbl;
5040 ::decode(ino, p);
5041 ::decode(last, p);
5042 ::decode(nonce, p);
5043 ::decode(lockbl, p);
5044
5045 CInode *in = get_inode(ino, last);
5046 assert(in);
5047 in->set_replica_nonce(nonce);
5048 bufferlist::iterator q = lockbl.begin();
5049 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5050 in->state_clear(CInode::STATE_REJOINING);
5051 dout(10) << " got inode locks " << *in << dendl;
5052 }
5053
5054 // FIXME: This can happen if the entire subtree, together with the inode the subtree root
5055 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5056 assert(isolated_inodes.empty());
5057
5058 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5059 bufferlist::iterator bp = ack->imported_caps.begin();
5060 ::decode(peer_imported, bp);
5061
5062 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5063 p != peer_imported.end();
5064 ++p) {
5065 assert(cap_exports.count(p->first));
5066 assert(cap_export_targets.count(p->first));
5067 assert(cap_export_targets[p->first] == from);
5068 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5069 q != p->second.end();
5070 ++q) {
5071 assert(cap_exports[p->first].count(q->first));
5072
5073 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5074 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5075 assert(session);
5076
5077 // mark client caps stale.
5078 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5079 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5080 mds->get_osd_epoch_barrier());
5081 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5082 (q->second.cap_id > 0 ? from : -1), 0);
5083 mds->send_message_client_counted(m, session);
5084
5085 cap_exports[p->first].erase(q->first);
5086 }
5087 assert(cap_exports[p->first].empty());
5088 }
5089
5090 // done?
5091 assert(rejoin_ack_gather.count(from));
5092 rejoin_ack_gather.erase(from);
5093 if (!survivor) {
5094
5095 if (rejoin_gather.empty()) {
5096 // eval unstable scatter locks after all wrlocks are rejoined.
5097 while (!rejoin_eval_locks.empty()) {
5098 SimpleLock *lock = rejoin_eval_locks.front();
5099 rejoin_eval_locks.pop_front();
5100 if (!lock->is_stable())
5101 mds->locker->eval_gather(lock);
5102 }
5103 }
5104
5105 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5106 rejoin_ack_gather.empty()) {
5107 // finally, kickstart past snap parent opens
5108 open_snap_parents();
5109 } else {
5110 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5111 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5112 }
5113 } else {
5114 // survivor.
5115 mds->queue_waiters(rejoin_waiters);
5116 }
5117 }
5118
5119 /**
5120 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5121 *
5122 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5123 * messages that clean these guys up...
5124 */
5125 void MDCache::rejoin_trim_undef_inodes()
5126 {
5127 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5128
5129 while (!rejoin_undef_inodes.empty()) {
5130 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5131 CInode *in = *p;
5132 rejoin_undef_inodes.erase(p);
5133
5134 in->clear_replica_map();
5135
5136 // close out dirfrags
5137 if (in->is_dir()) {
5138 list<CDir*> dfls;
5139 in->get_dirfrags(dfls);
5140 for (list<CDir*>::iterator p = dfls.begin();
5141 p != dfls.end();
5142 ++p) {
5143 CDir *dir = *p;
5144 dir->clear_replica_map();
5145
5146 for (auto &p : dir->items) {
5147 CDentry *dn = p.second;
5148 dn->clear_replica_map();
5149
5150 dout(10) << " trimming " << *dn << dendl;
5151 dir->remove_dentry(dn);
5152 }
5153
5154 dout(10) << " trimming " << *dir << dendl;
5155 in->close_dirfrag(dir->dirfrag().frag);
5156 }
5157 }
5158
5159 CDentry *dn = in->get_parent_dn();
5160 if (dn) {
5161 dn->clear_replica_map();
5162 dout(10) << " trimming " << *dn << dendl;
5163 dn->dir->remove_dentry(dn);
5164 } else {
5165 dout(10) << " trimming " << *in << dendl;
5166 remove_inode(in);
5167 }
5168 }
5169
5170 assert(rejoin_undef_inodes.empty());
5171 }
5172
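// Run once rejoin messages from all recovering peers (and our own cap-import
// processing) have been gathered: open any undefined inodes/dirfrags, process
// imported caps, pick lock states, identify files to recover and send acks.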
5173 void MDCache::rejoin_gather_finish()
5174 {
5175 dout(10) << "rejoin_gather_finish" << dendl;
5176 assert(mds->is_rejoin());
5177
5178 if (open_undef_inodes_dirfrags())
5179 return;
5180
5181 if (process_imported_caps())
5182 return;
5183
5184 choose_lock_states_and_reconnect_caps();
5185
5186 identify_files_to_recover();
5187 rejoin_send_acks();
5188
5189 // signal completion of fetches, rejoin_gather_finish, etc.
5190 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5191 rejoin_ack_gather.erase(mds->get_nodeid());
5192
5193 // did we already get our acks too?
5194 if (rejoin_ack_gather.empty()) {
5195 // finally, kickstart past snap parent opens
5196 open_snap_parents();
5197 }
5198 }
5199
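// Completion callback for the open_ino() calls that process_imported_caps()
// issues for cap-import inodes that are not yet in our cache.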
5200 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5201 inodeno_t ino;
5202 public:
5203 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5204 void finish(int r) override {
5205 mdcache->rejoin_open_ino_finish(ino, r);
5206 }
5207 };
5208
5209 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5210 {
5211 dout(10) << "rejoin_open_ino_finish ino " << ino << " ret " << ret << dendl;
5212
5213 if (ret < 0) {
5214 cap_imports_missing.insert(ino);
5215 } else if (ret == mds->get_nodeid()) {
5216 assert(get_inode(ino));
5217 } else {
5218 auto p = cap_imports.find(ino);
5219 assert(p != cap_imports.end());
5220 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5221 assert(q->second.count(MDS_RANK_NONE));
5222 assert(q->second.size() == 1);
5223 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5224 }
5225 cap_imports.erase(p);
5226 }
5227
5228 assert(cap_imports_num_opening > 0);
5229 cap_imports_num_opening--;
5230
5231 if (cap_imports_num_opening == 0) {
5232 if (rejoin_gather.empty())
5233 rejoin_gather_finish();
5234 else if (rejoin_gather.count(mds->get_nodeid()))
5235 process_imported_caps();
5236 }
5237 }
5238
5239 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5240 public:
5241 map<client_t,entity_inst_t> client_map;
5242 map<client_t,uint64_t> sseqmap;
5243
5244 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5245 MDCacheLogContext(c), client_map(cm) {}
5246 void finish(int r) override {
5247 assert(r == 0);
5248 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5249 }
5250 };
5251
5252 void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5253 map<client_t,uint64_t>& sseqmap)
5254 {
5255 dout(10) << "rejoin_open_sessions_finish" << dendl;
5256 mds->server->finish_force_open_sessions(client_map, sseqmap);
5257 if (rejoin_gather.empty())
5258 rejoin_gather_finish();
5259 }
5260
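/*
 * process_imported_caps() -- resolve the caps gathered from client
 * reconnects.  Opens any missing inodes, force-opens sessions for
 * reconnecting clients, merges caps that slave renames exported to us, and
 * finally reconnects the remaining cap imports.  Returns true if we still
 * need to wait for asynchronous work (open_ino or session journaling).
 */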
5261 bool MDCache::process_imported_caps()
5262 {
5263 dout(10) << "process_imported_caps" << dendl;
5264
5265 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5266 CInode *in = get_inode(p->first);
5267 if (in) {
5268 assert(in->is_auth());
5269 cap_imports_missing.erase(p->first);
5270 continue;
5271 }
5272 if (cap_imports_missing.count(p->first) > 0)
5273 continue;
5274
5275 cap_imports_num_opening++;
5276 dout(10) << " opening missing ino " << p->first << dendl;
5277 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5278 }
5279
5280 if (cap_imports_num_opening > 0)
5281 return true;
5282
5283 // were we called from rejoin_gather_finish()? (our own rejoin gather is done)
5284 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5285 // make sure sessions are open for all clients with imported caps
5286 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5287 p != rejoin_client_map.end();
5288 ++p) {
5289 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5290 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5291 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5292 ESessions *le = new ESessions(pv, rejoin_client_map);
5293 mds->mdlog->start_submit_entry(le, finish);
5294 mds->mdlog->flush();
5295 rejoin_client_map.clear();
5296 return true;
5297 }
5298 }
5299 rejoin_client_map.clear();
5300
5301 // process caps that were exported by slave rename
5302 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5303 p != rejoin_slave_exports.end();
5304 ++p) {
5305 CInode *in = get_inode(p->first);
5306 assert(in);
5307 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5308 q != p->second.second.end();
5309 ++q) {
5310 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5311 assert(session);
5312
5313 Capability *cap = in->get_client_cap(q->first);
5314 if (!cap)
5315 cap = in->add_client_cap(q->first, session);
5316 cap->merge(q->second, true);
5317
5318 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5319 assert(cap->get_last_seq() == im.issue_seq);
5320 assert(cap->get_mseq() == im.mseq);
5321 cap->set_cap_id(im.cap_id);
5322 // send cap import because we assigned a new cap ID
5323 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5324 p->second.first, CEPH_CAP_FLAG_AUTH);
5325 }
5326 }
5327 rejoin_slave_exports.clear();
5328 rejoin_imported_caps.clear();
5329
5330 // process cap imports
5331 // ino -> client -> frommds -> capex
5332 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5333 CInode *in = get_inode(p->first);
5334 if (!in) {
5335 dout(10) << " still missing ino " << p->first
5336 << ", will try again after replayed client requests" << dendl;
5337 ++p;
5338 continue;
5339 }
5340 assert(in->is_auth());
5341 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5342 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5343 assert(session);
5344 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5345 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5346 add_reconnected_cap(q->first, in->ino(), r->second);
5347 if (r->first >= 0) {
5348 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5349 cap->inc_mseq();
5350 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5351
5352 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5353 im.cap_id = cap->get_cap_id();
5354 im.issue_seq = cap->get_last_seq();
5355 im.mseq = cap->get_mseq();
5356 }
5357 }
5358 }
5359 cap_imports.erase(p++); // remove and move on
5360 }
5361 } else {
5362 trim_non_auth();
5363
5364 rejoin_gather.erase(mds->get_nodeid());
5365 maybe_send_pending_rejoins();
5366
5367 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5368 rejoin_gather_finish();
5369 }
5370 return false;
5371 }
5372
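/*
 * If this realm's past parents are already open, finish any pending client
 * snaprealm reconnects for it; otherwise pin the realm's inode and queue it
 * in missing_snap_parents so open_snap_parents() can deal with it later.
 */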
5373 void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5374 {
5375 // are this realm's parents fully open?
5376 if (realm->have_past_parents_open()) {
5377 dout(10) << " have past snap parents for realm " << *realm
5378 << " on " << *realm->inode << dendl;
5379 if (reconnect) {
5380 // finish off client snaprealm reconnects?
5381 auto p = reconnected_snaprealms.find(realm->inode->ino());
5382 if (p != reconnected_snaprealms.end()) {
5383 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5384 finish_snaprealm_reconnect(q->first, realm, q->second);
5385 reconnected_snaprealms.erase(p);
5386 }
5387 }
5388 } else {
5389 if (!missing_snap_parents.count(realm->inode)) {
5390 dout(10) << " MISSING past snap parents for realm " << *realm
5391 << " on " << *realm->inode << dendl;
5392 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5393 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5394 } else {
5395 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5396 << " on " << *realm->inode << dendl;
5397 }
5398 }
5399 }
5400
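/*
 * A reconnecting client reported dirty caps following an older snap
 * (snap_follows is before the head inode's first).  Walk the intervening
 * snapped inodes, put their locks into LOCK_SNAP_SYNC with a wrlock held,
 * and record the pending snapflushes on the head inode so the client's
 * snap flushes can later be applied.
 */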
5401 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5402 client_t client, snapid_t snap_follows)
5403 {
5404 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5405
5406 const set<snapid_t>& snaps = realm->get_snaps();
5407 snapid_t follows = snap_follows;
5408
5409 while (true) {
5410 CInode *in = pick_inode_snap(head_in, follows);
5411 if (in == head_in)
5412 break;
5413 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5414
5415 /* TODO: we can check the reconnected/flushing caps to find
5416 * which locks need gathering */
5417 for (int i = 0; i < num_cinode_locks; i++) {
5418 int lockid = cinode_lock_info[i].lock;
5419 SimpleLock *lock = in->get_lock(lockid);
5420 assert(lock);
5421 in->client_snap_caps[lockid].insert(client);
5422 in->auth_pin(lock);
5423 lock->set_state(LOCK_SNAP_SYNC);
5424 lock->get_wrlock(true);
5425 }
5426
5427 for (auto p = snaps.lower_bound(in->first);
5428 p != snaps.end() && *p <= in->last;
5429 ++p) {
5430 head_in->add_need_snapflush(in, *p, client);
5431 }
5432
5433 follows = in->last;
5434 }
5435 }
5436
5437 /*
5438 * choose lock states based on reconnected caps
5439 */
5440 void MDCache::choose_lock_states_and_reconnect_caps()
5441 {
5442 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5443
5444 map<client_t,MClientSnap*> splits;
5445
5446 for (auto i : inode_map) {
5447 CInode *in = i.second;
5448
5449 if (in->last != CEPH_NOSNAP)
5450 continue;
5451
5452 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5453 in->mark_dirty_rstat();
5454
5455 int dirty_caps = 0;
5456 auto p = reconnected_caps.find(in->ino());
5457 if (p != reconnected_caps.end()) {
5458 for (const auto &it : p->second)
5459 dirty_caps |= it.second.dirty_caps;
5460 }
5461 in->choose_lock_states(dirty_caps);
5462 dout(15) << " chose lock states on " << *in << dendl;
5463
5464 SnapRealm *realm = in->find_snaprealm();
5465
5466 check_realm_past_parents(realm, realm == in->snaprealm);
5467
5468 if (p != reconnected_caps.end()) {
5469 bool missing_snap_parent = false;
5470 // also, make sure client's cap is in the correct snaprealm.
5471 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5472 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5473 if (realm->have_past_parents_open()) {
5474 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5475 } else {
5476 missing_snap_parent = true;
5477 }
5478 }
5479
5480 if (q->second.realm_ino == realm->inode->ino()) {
5481 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5482 } else {
5483 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5484 << " != " << realm->inode->ino() << dendl;
5485 if (realm->have_past_parents_open()) {
5486 // ok, include in a split message _now_.
5487 prepare_realm_split(realm, q->first, in->ino(), splits);
5488 } else {
5489 // send the split later.
5490 missing_snap_parent = true;
5491 }
5492 }
5493 }
5494 if (missing_snap_parent)
5495 missing_snap_parents[realm->inode].insert(in);
5496 }
5497 }
5498
5499 send_snaps(splits);
5500 }
5501
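/*
 * Build (or extend) the CEPH_SNAP_OP_SPLIT message for this client.  The
 * first call for a client creates the message with the realm's snap trace
 * and its open children listed as split_realms; every call appends the
 * inode to split_inos.
 */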
5502 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5503 map<client_t,MClientSnap*>& splits)
5504 {
5505 MClientSnap *snap;
5506 if (splits.count(client) == 0) {
5507 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5508 snap->head.split = realm->inode->ino();
5509 realm->build_snap_trace(snap->bl);
5510
5511 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5512 p != realm->open_children.end();
5513 ++p)
5514 snap->split_realms.push_back((*p)->inode->ino());
5515
5516 } else
5517 snap = splits[client];
5518 snap->split_inos.push_back(ino);
5519 }
5520
5521 void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5522 {
5523 dout(10) << "send_snaps" << dendl;
5524
5525 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5526 p != splits.end();
5527 ++p) {
5528 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5529 if (session) {
5530 dout(10) << " client." << p->first
5531 << " split " << p->second->head.split
5532 << " inos " << p->second->split_inos
5533 << dendl;
5534 mds->send_message_client_counted(p->second, session);
5535 } else {
5536 dout(10) << " no session for client." << p->first << dendl;
5537 p->second->put();
5538 }
5539 }
5540 splits.clear();
5541 }
5542
5543
5544 /*
5545 * remove any items from logsegment open_file lists that don't have
5546 * any caps
5547 */
5548 void MDCache::clean_open_file_lists()
5549 {
5550 dout(10) << "clean_open_file_lists" << dendl;
5551
5552 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5553 p != mds->mdlog->segments.end();
5554 ++p) {
5555 LogSegment *ls = p->second;
5556
5557 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5558 while (!q.end()) {
5559 CInode *in = *q;
5560 ++q;
5561 if (in->last == CEPH_NOSNAP) {
5562 if (!in->is_any_caps_wanted()) {
5563 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5564 in->item_open_file.remove_myself();
5565 }
5566 } else {
5567 if (in->client_snap_caps.empty()) {
5568 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5569 in->item_open_file.remove_myself();
5570 }
5571 }
5572 }
5573 }
5574 }
5575
5576
5577
5578 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5579 {
5580 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5581 << " on " << *in << dendl;
5582 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5583 if (!session) {
5584 dout(10) << " no session for client." << client << dendl;
5585 return NULL;
5586 }
5587
5588 Capability *cap = in->reconnect_cap(client, icr, session);
5589
5590 if (frommds >= 0) {
5591 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5592 cap->inc_mseq();
5593 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5594 }
5595
5596 return cap;
5597 }
5598
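/*
 * Rejoin is wrapping up but some reconnected caps still refer to inodes we
 * never managed to open.  Tell the affected clients their caps are gone
 * (an EXPORT with no peer), wake anyone waiting on those cap reconnects,
 * and leave a warning in the cluster log.
 */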
5599 void MDCache::export_remaining_imported_caps()
5600 {
5601 dout(10) << "export_remaining_imported_caps" << dendl;
5602
5603 stringstream warn_str;
5604
5605 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5606 warn_str << " ino " << p->first << "\n";
5607 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5608 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5609 if (session) {
5610 // mark client caps stale.
5611 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5612 stale->set_cap_peer(0, 0, 0, -1, 0);
5613 mds->send_message_client_counted(stale, q->first);
5614 }
5615 }
5616
5617 mds->heartbeat_reset();
5618 }
5619
5620 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5621 p != cap_reconnect_waiters.end();
5622 ++p)
5623 mds->queue_waiters(p->second);
5624
5625 cap_imports.clear();
5626 cap_reconnect_waiters.clear();
5627
5628 if (warn_str.peek() != EOF) {
5629 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5630 mds->clog->warn(warn_str);
5631 }
5632 }
5633
5634 void MDCache::try_reconnect_cap(CInode *in, Session *session)
5635 {
5636 client_t client = session->info.get_client();
5637 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5638 if (rc) {
5639 in->reconnect_cap(client, *rc, session);
5640 dout(10) << "try_reconnect_cap client." << client
5641 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5642 << " issue " << ccap_string(rc->capinfo.issued)
5643 << " on " << *in << dendl;
5644 remove_replay_cap_reconnect(in->ino(), client);
5645
5646 if (in->is_replicated()) {
5647 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5648 } else {
5649 int dirty_caps = 0;
5650 auto p = reconnected_caps.find(in->ino());
5651 if (p != reconnected_caps.end()) {
5652 auto q = p->second.find(client);
5653 if (q != p->second.end())
5654 dirty_caps = q->second.dirty_caps;
5655 }
5656 in->choose_lock_states(dirty_caps);
5657 dout(15) << " chose lock states on " << *in << dendl;
5658 }
5659
5660 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5661 cap_reconnect_waiters.find(in->ino());
5662 if (it != cap_reconnect_waiters.end()) {
5663 mds->queue_waiters(it->second);
5664 cap_reconnect_waiters.erase(it);
5665 }
5666 }
5667 }
5668
5669
5670
5671 // -------
5672 // cap imports and delayed snap parent opens
5673
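/*
 * Send the client a CEPH_CAP_OP_IMPORT for a cap that migrated to us.  If
 * the inode's snaprealm does not yet have all of its past parents open,
 * the import is deferred instead: the cap is suppressed and the inode is
 * recorded in delayed_imported_caps and missing_snap_parents.
 */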
5674 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5675 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5676 int peer, int p_flags)
5677 {
5678 client_t client = session->info.inst.name.num();
5679 SnapRealm *realm = in->find_snaprealm();
5680 if (realm->have_past_parents_open()) {
5681 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5682 if (cap->get_last_seq() == 0) // reconnected cap
5683 cap->inc_last_seq();
5684 cap->set_last_issue();
5685 cap->set_last_issue_stamp(ceph_clock_now());
5686 cap->clear_new();
5687 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5688 in->ino(),
5689 realm->inode->ino(),
5690 cap->get_cap_id(), cap->get_last_seq(),
5691 cap->pending(), cap->wanted(), 0,
5692 cap->get_mseq(), mds->get_osd_epoch_barrier());
5693 in->encode_cap_message(reap, cap);
5694 realm->build_snap_trace(reap->snapbl);
5695 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5696 mds->send_message_client_counted(reap, session);
5697 } else {
5698 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5699 << cap->get_mseq() << " on " << *in << dendl;
5700 in->auth_pin(this);
5701 cap->inc_suppress();
5702 delayed_imported_caps[client].insert(in);
5703 missing_snap_parents[in].size();
5704 }
5705 }
5706
5707 void MDCache::do_delayed_cap_imports()
5708 {
5709 dout(10) << "do_delayed_cap_imports" << dendl;
5710
5711 assert(delayed_imported_caps.empty());
5712 }
5713
5714 struct C_MDC_OpenSnapParents : public MDCacheContext {
5715 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5716 void finish(int r) override {
5717 mdcache->open_snap_parents();
5718 }
5719 };
5720
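/*
 * open_snap_parents() -- open the past parents of every snaprealm recorded
 * in missing_snap_parents.  Realms whose parents are now open get their
 * snapflush rebuilds, realm splits and client snaprealm reconnects finished
 * here; the rest are gathered and we retry once the opens complete.  When
 * everything is open, rejoin_done is completed.
 */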
5721 void MDCache::open_snap_parents()
5722 {
5723 dout(10) << "open_snap_parents" << dendl;
5724
5725 map<client_t,MClientSnap*> splits;
5726 MDSGatherBuilder gather(g_ceph_context);
5727
5728 auto p = missing_snap_parents.begin();
5729 while (p != missing_snap_parents.end()) {
5730 CInode *in = p->first;
5731 assert(in->snaprealm);
5732 if (in->snaprealm->open_parents(gather.new_sub())) {
5733 dout(10) << " past parents now open on " << *in << dendl;
5734
5735 for (CInode *child : p->second) {
5736 auto q = reconnected_caps.find(child->ino());
5737 assert(q != reconnected_caps.end());
5738 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5739 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5740 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5741 }
5742 // make sure client's cap is in the correct snaprealm.
5743 if (r->second.realm_ino != in->ino()) {
5744 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5745 }
5746 }
5747 }
5748
5749 missing_snap_parents.erase(p++);
5750
5751 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5752
5753 // finish off client snaprealm reconnects?
5754 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5755 if (q != reconnected_snaprealms.end()) {
5756 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5757 r != q->second.end();
5758 ++r)
5759 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5760 reconnected_snaprealms.erase(q);
5761 }
5762 } else {
5763 dout(10) << " opening past parents on " << *in << dendl;
5764 ++p;
5765 }
5766 }
5767
5768 send_snaps(splits);
5769
5770 if (gather.has_subs()) {
5771 dout(10) << "open_snap_parents - waiting for "
5772 << gather.num_subs_remaining() << dendl;
5773 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5774 gather.activate();
5775 } else {
5776 if (!reconnected_snaprealms.empty()) {
5777 stringstream warn_str;
5778 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5779 p != reconnected_snaprealms.end();
5780 ++p) {
5781 warn_str << " unconnected snaprealm " << p->first << "\n";
5782 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5783 q != p->second.end();
5784 ++q)
5785 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5786 }
5787 mds->clog->warn() << "open_snap_parents has:";
5788 mds->clog->warn(warn_str);
5789 }
5790 assert(rejoin_waiters.empty());
5791 assert(missing_snap_parents.empty());
5792 dout(10) << "open_snap_parents - all open" << dendl;
5793 do_delayed_cap_imports();
5794
5795 assert(rejoin_done);
5796 rejoin_done.release()->complete(0);
5797 reconnected_caps.clear();
5798 }
5799 }
5800
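/*
 * Fetch the dirfrags needed to define any inodes/dirfrags still flagged
 * REJOINUNDEF.  Returns true if fetches were started (the gather callback
 * re-drives the rejoin gather when they complete), false if there is
 * nothing left to fetch.
 */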
5801 bool MDCache::open_undef_inodes_dirfrags()
5802 {
5803 dout(10) << "open_undef_inodes_dirfrags "
5804 << rejoin_undef_inodes.size() << " inodes "
5805 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5806
5807 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5808
5809 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5810 p != rejoin_undef_inodes.end();
5811 ++p) {
5812 CInode *in = *p;
5813 assert(!in->is_base());
5814 fetch_queue.insert(in->get_parent_dir());
5815 }
5816
5817 if (fetch_queue.empty())
5818 return false;
5819
5820 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5821 for (set<CDir*>::iterator p = fetch_queue.begin();
5822 p != fetch_queue.end();
5823 ++p) {
5824 CDir *dir = *p;
5825 CInode *diri = dir->get_inode();
5826 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5827 continue;
5828 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5829 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5830 dir->fetch(gather.new_sub());
5831 }
5832 assert(gather.has_subs());
5833 gather.activate();
5834 return true;
5835 }
5836
5837 void MDCache::opened_undef_inode(CInode *in) {
5838 dout(10) << "opened_undef_inode " << *in << dendl;
5839 rejoin_undef_inodes.erase(in);
5840 if (in->is_dir()) {
5841 // FIXME: re-hash dentries if necessary
5842 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5843 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5844 CDir *dir = in->get_dirfrag(frag_t());
5845 assert(dir);
5846 rejoin_undef_dirfrags.erase(dir);
5847 in->force_dirfrags();
5848 list<CDir*> ls;
5849 in->get_dirfrags(ls);
5850 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5851 rejoin_undef_dirfrags.insert(*p);
5852 }
5853 }
5854 }
5855
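// If the client's reported snaprealm seq is older than the realm's newest
// seq, send it a CEPH_SNAP_OP_UPDATE carrying the current snap trace;
// otherwise the client is already up to date and nothing is sent.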
5856 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5857 {
5858 if (seq < realm->get_newest_seq()) {
5859 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5860 << realm->get_newest_seq()
5861 << " on " << *realm << dendl;
5862 // send an update
5863 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5864 if (session) {
5865 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5866 realm->build_snap_trace(snap->bl);
5867 mds->send_message_client_counted(snap, session);
5868 } else {
5869 dout(10) << " ...or not, no session for this client!" << dendl;
5870 }
5871 } else {
5872 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5873 << " on " << *realm << dendl;
5874 }
5875 }
5876
5877
5878
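/*
 * rejoin_send_acks() -- send an MMDSCacheRejoin OP_ACK to every recovering
 * peer we have not yet acked.  The ack carries strong dirfrags, dentries
 * and inode bases/locks for everything the peer replicates within our auth
 * subtrees (plus base inodes and inodes with potentially updated
 * scatterlocks), along with the import results for caps the peer handed us.
 */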
5879 void MDCache::rejoin_send_acks()
5880 {
5881 dout(7) << "rejoin_send_acks" << dendl;
5882
5883 // replicate stray
5884 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5885 p != rejoin_unlinked_inodes.end();
5886 ++p) {
5887 for (set<CInode*>::iterator q = p->second.begin();
5888 q != p->second.end();
5889 ++q) {
5890 CInode *in = *q;
5891 dout(7) << " unlinked inode " << *in << dendl;
5892 // inode expired
5893 if (!in->is_replica(p->first))
5894 continue;
5895 while (1) {
5896 CDentry *dn = in->get_parent_dn();
5897 if (dn->is_replica(p->first))
5898 break;
5899 dn->add_replica(p->first);
5900 CDir *dir = dn->get_dir();
5901 if (dir->is_replica(p->first))
5902 break;
5903 dir->add_replica(p->first);
5904 in = dir->get_inode();
5905 if (in->is_replica(p->first))
5906 break;
5907 in->add_replica(p->first);
5908 if (in->is_base())
5909 break;
5910 }
5911 }
5912 }
5913 rejoin_unlinked_inodes.clear();
5914
5915 // send acks to everyone in the recovery set
5916 map<mds_rank_t,MMDSCacheRejoin*> acks;
5917 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5918 p != recovery_set.end();
5919 ++p) {
5920 if (rejoin_ack_sent.count(*p))
5921 continue;
5922 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5923 }
5924
5925 rejoin_ack_sent = recovery_set;
5926
5927 // walk subtrees
5928 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5929 p != subtrees.end();
5930 ++p) {
5931 CDir *dir = p->first;
5932 if (!dir->is_auth())
5933 continue;
5934 dout(10) << "subtree " << *dir << dendl;
5935
5936 // auth items in this subtree
5937 list<CDir*> dq;
5938 dq.push_back(dir);
5939
5940 while (!dq.empty()) {
5941 CDir *dir = dq.front();
5942 dq.pop_front();
5943
5944 // dir
5945 for (auto &r : dir->get_replicas()) {
5946 auto it = acks.find(r.first);
5947 if (it == acks.end())
5948 continue;
5949 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
5950 it->second->add_dirfrag_base(dir);
5951 }
5952
5953 for (auto &p : dir->items) {
5954 CDentry *dn = p.second;
5955 CDentry::linkage_t *dnl = dn->get_linkage();
5956
5957 // inode
5958 CInode *in = NULL;
5959 if (dnl->is_primary())
5960 in = dnl->get_inode();
5961
5962 // dentry
5963 for (auto &r : dn->get_replicas()) {
5964 auto it = acks.find(r.first);
5965 if (it == acks.end())
5966 continue;
5967 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
5968 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
5969 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
5970 dnl->is_remote() ? dnl->get_remote_d_type():0,
5971 ++r.second,
5972 dn->lock.get_replica_state());
5973 // did the peer miss an MDentryLink message?
5974 if (in && !in->is_replica(r.first))
5975 in->add_replica(r.first);
5976 }
5977
5978 if (!in)
5979 continue;
5980
5981 for (auto &r : in->get_replicas()) {
5982 auto it = acks.find(r.first);
5983 if (it == acks.end())
5984 continue;
5985 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
5986 bufferlist bl;
5987 in->_encode_locks_state_for_rejoin(bl, r.first);
5988 it->second->add_inode_locks(in, ++r.second, bl);
5989 }
5990
5991 // subdirs in this subtree?
5992 in->get_nested_dirfrags(dq);
5993 }
5994 }
5995 }
5996
5997 // base inodes too
5998 if (root && root->is_auth())
5999 for (auto &r : root->get_replicas()) {
6000 auto it = acks.find(r.first);
6001 if (it == acks.end())
6002 continue;
6003 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
6004 bufferlist bl;
6005 root->_encode_locks_state_for_rejoin(bl, r.first);
6006 it->second->add_inode_locks(root, ++r.second, bl);
6007 }
6008 if (myin)
6009 for (auto &r : myin->get_replicas()) {
6010 auto it = acks.find(r.first);
6011 if (it == acks.end())
6012 continue;
6013 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
6014 bufferlist bl;
6015 myin->_encode_locks_state_for_rejoin(bl, r.first);
6016 it->second->add_inode_locks(myin, ++r.second, bl);
6017 }
6018
6019 // include inode base for any inodes whose scatterlocks may have updated
6020 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6021 p != rejoin_potential_updated_scatterlocks.end();
6022 ++p) {
6023 CInode *in = *p;
6024 for (const auto &r : in->get_replicas()) {
6025 auto it = acks.find(r.first);
6026 if (it == acks.end())
6027 continue;
6028 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6029 }
6030 }
6031
6032 // send acks
6033 for (auto p = acks.begin(); p != acks.end(); ++p) {
6034 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6035 mds->send_message_mds(p->second, p->first);
6036 }
6037
6038 rejoin_imported_caps.clear();
6039 }
6040
6041 class C_MDC_ReIssueCaps : public MDCacheContext {
6042 CInode *in;
6043 public:
6044 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6045 MDCacheContext(mdc), in(i)
6046 {
6047 in->get(CInode::PIN_PTRWAITER);
6048 }
6049 void finish(int r) override {
6050 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6051 mdcache->mds->locker->issue_caps(in);
6052 in->put(CInode::PIN_PTRWAITER);
6053 }
6054 };
6055
6056 void MDCache::reissue_all_caps()
6057 {
6058 dout(10) << "reissue_all_caps" << dendl;
6059
6060 for (auto &p : inode_map) {
6061 CInode *in = p.second;
6062 if (in->is_head() && in->is_any_caps()) {
6063 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6064 if (in->is_frozen_inode()) {
6065 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6066 continue;
6067 }
6068 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6069 mds->locker->issue_caps(in);
6070 }
6071 }
6072 }
6073
6074
6075 // ===============================================================================
6076
6077 struct C_MDC_QueuedCow : public MDCacheContext {
6078 CInode *in;
6079 MutationRef mut;
6080 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6081 MDCacheContext(mdc), in(i), mut(m) {}
6082 void finish(int r) override {
6083 mdcache->_queued_file_recover_cow(in, mut);
6084 }
6085 };
6086
6087
6088 void MDCache::queue_file_recover(CInode *in)
6089 {
6090 dout(10) << "queue_file_recover " << *in << dendl;
6091 assert(in->is_auth());
6092
6093 // cow?
6094 /*
6095 SnapRealm *realm = in->find_snaprealm();
6096 set<snapid_t> s = realm->get_snaps();
6097 while (!s.empty() && *s.begin() < in->first)
6098 s.erase(s.begin());
6099 while (!s.empty() && *s.rbegin() > in->last)
6100 s.erase(*s.rbegin());
6101 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6102 if (s.size() > 1) {
6103 CInode::mempool_inode pi = in->project_inode();
6104 pi->version = in->pre_dirty();
6105
6106 auto mut(std::make_shared<MutationImpl>());
6107 mut->ls = mds->mdlog->get_current_segment();
6108 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6109 mds->mdlog->start_entry(le);
6110 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6111
6112 s.erase(*s.begin());
6113 while (!s.empty()) {
6114 snapid_t snapid = *s.begin();
6115 CInode *cow_inode = 0;
6116 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6117 assert(cow_inode);
6118 recovery_queue.enqueue(cow_inode);
6119 s.erase(*s.begin());
6120 }
6121
6122 in->parent->first = in->first;
6123 le->metablob.add_primary_dentry(in->parent, in, true);
6124 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6125 mds->mdlog->flush();
6126 }
6127 */
6128
6129 recovery_queue.enqueue(in);
6130 }
6131
6132 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6133 {
6134 in->pop_and_dirty_projected_inode(mut->ls);
6135 mut->apply();
6136 mds->locker->drop_locks(mut.get());
6137 mut->cleanup();
6138 }
6139
6140
6141 /*
6142 * called after recovery to recover file sizes for previously opened (for write)
6143 * files. that is, those where max_size > size.
6144 */
6145 void MDCache::identify_files_to_recover()
6146 {
6147 dout(10) << "identify_files_to_recover" << dendl;
6148 for (auto &p : inode_map) {
6149 CInode *in = p.second;
6150 if (!in->is_auth())
6151 continue;
6152
6153 if (in->last != CEPH_NOSNAP)
6154 continue;
6155
6156 // Only normal files need file size recovery
6157 if (!in->is_file()) {
6158 continue;
6159 }
6160
6161 bool recover = false;
6162 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6163 p != in->inode.client_ranges.end();
6164 ++p) {
6165 Capability *cap = in->get_client_cap(p->first);
6166 if (!cap) {
6167 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6168 recover = true;
6169 break;
6170 }
6171 }
6172
6173 if (recover) {
6174 if (in->filelock.is_stable()) {
6175 in->auth_pin(&in->filelock);
6176 } else {
6177 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6178 }
6179 in->filelock.set_state(LOCK_PRE_SCAN);
6180 rejoin_recover_q.push_back(in);
6181 } else {
6182 rejoin_check_q.push_back(in);
6183 }
6184 }
6185 }
6186
6187 void MDCache::start_files_to_recover()
6188 {
6189 for (CInode *in : rejoin_check_q) {
6190 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6191 mds->locker->issue_caps(in);
6192 mds->locker->check_inode_max_size(in);
6193 }
6194 rejoin_check_q.clear();
6195 for (CInode *in : rejoin_recover_q) {
6196 mds->locker->file_recover(&in->filelock);
6197 }
6198 if (!rejoin_recover_q.empty()) {
6199 rejoin_recover_q.clear();
6200 do_file_recover();
6201 }
6202 }
6203
6204 void MDCache::do_file_recover()
6205 {
6206 recovery_queue.advance();
6207 }
6208
6209 // ===============================================================================
6210
6211
6212 // ----------------------------
6213 // truncate
6214
6215 class C_MDC_RetryTruncate : public MDCacheContext {
6216 CInode *in;
6217 LogSegment *ls;
6218 public:
6219 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6220 MDCacheContext(c), in(i), ls(l) {}
6221 void finish(int r) override {
6222 mdcache->_truncate_inode(in, ls);
6223 }
6224 };
6225
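/*
 * truncate_inode() -- kick off the object truncation for a journaled
 * truncate.  If the inode still has pending snapflushes and clients hold
 * buffered file data, wait for the xlock snap-sync to finish before
 * touching objects; otherwise go straight to _truncate_inode().
 */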
6226 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6227 {
6228 auto pi = in->get_projected_inode();
6229 dout(10) << "truncate_inode "
6230 << pi->truncate_from << " -> " << pi->truncate_size
6231 << " on " << *in
6232 << dendl;
6233
6234 ls->truncating_inodes.insert(in);
6235 in->get(CInode::PIN_TRUNCATING);
6236 in->auth_pin(this);
6237
6238 if (!in->client_need_snapflush.empty() &&
6239 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6240 assert(in->filelock.is_xlocked());
6241 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6242 mds->locker->issue_caps(in);
6243 return;
6244 }
6245
6246 _truncate_inode(in, ls);
6247 }
6248
6249 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6250 CInode *in;
6251 LogSegment *ls;
6252 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6253 MDCacheIOContext(c), in(i), ls(l) {}
6254 void finish(int r) override {
6255 assert(r == 0 || r == -ENOENT);
6256 mdcache->truncate_inode_finish(in, ls);
6257 }
6258 };
6259
6260 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6261 {
6262 auto pi = &in->inode;
6263 dout(10) << "_truncate_inode "
6264 << pi->truncate_from << " -> " << pi->truncate_size
6265 << " on " << *in << dendl;
6266
6267 assert(pi->is_truncating());
6268 assert(pi->truncate_size < (1ULL << 63));
6269 assert(pi->truncate_from < (1ULL << 63));
6270 assert(pi->truncate_size < pi->truncate_from);
6271
6272
6273 SnapRealm *realm = in->find_snaprealm();
6274 SnapContext nullsnap;
6275 const SnapContext *snapc;
6276 if (realm) {
6277 dout(10) << " realm " << *realm << dendl;
6278 snapc = &realm->get_snap_context();
6279 } else {
6280 dout(10) << " NO realm, using null context" << dendl;
6281 snapc = &nullsnap;
6282 assert(in->last == CEPH_NOSNAP);
6283 }
6284 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6285 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6286 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6287 pi->truncate_seq, ceph::real_time::min(), 0,
6288 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6289 mds->finisher));
6290 }
6291
6292 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6293 CInode *in;
6294 MutationRef mut;
6295 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6296 MDCacheLogContext(m), in(i), mut(mu) {}
6297 void finish(int r) override {
6298 mdcache->truncate_inode_logged(in, mut);
6299 }
6300 };
6301
6302 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6303 {
6304 dout(10) << "truncate_inode_finish " << *in << dendl;
6305
6306 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6307 assert(p != ls->truncating_inodes.end());
6308 ls->truncating_inodes.erase(p);
6309
6310 // update
6311 auto &pi = in->project_inode();
6312 pi.inode.version = in->pre_dirty();
6313 pi.inode.truncate_from = 0;
6314 pi.inode.truncate_pending--;
6315
6316 MutationRef mut(new MutationImpl());
6317 mut->ls = mds->mdlog->get_current_segment();
6318 mut->add_projected_inode(in);
6319
6320 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6321 mds->mdlog->start_entry(le);
6322 CDentry *dn = in->get_projected_parent_dn();
6323 le->metablob.add_dir_context(dn->get_dir());
6324 le->metablob.add_primary_dentry(dn, in, true);
6325 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6326
6327 journal_dirty_inode(mut.get(), &le->metablob, in);
6328 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6329
6330 // flush immediately if there are readers/writers waiting
6331 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6332 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6333 mds->mdlog->flush();
6334 }
6335
6336 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6337 {
6338 dout(10) << "truncate_inode_logged " << *in << dendl;
6339 mut->apply();
6340 mds->locker->drop_locks(mut.get());
6341 mut->cleanup();
6342
6343 in->put(CInode::PIN_TRUNCATING);
6344 in->auth_unpin(this);
6345
6346 list<MDSInternalContextBase*> waiters;
6347 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6348 mds->queue_waiters(waiters);
6349 }
6350
6351
6352 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6353 {
6354 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6355 << ls->seq << "/" << ls->offset << dendl;
6356 ls->truncating_inodes.insert(in);
6357 in->get(CInode::PIN_TRUNCATING);
6358 }
6359
6360 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6361 {
6362 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6363 << ls->seq << "/" << ls->offset << dendl;
6364 // if we have the logseg the truncate started in, it must be in our list.
6365 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6366 assert(p != ls->truncating_inodes.end());
6367 ls->truncating_inodes.erase(p);
6368 in->put(CInode::PIN_TRUNCATING);
6369 }
6370
6371 void MDCache::start_recovered_truncates()
6372 {
6373 dout(10) << "start_recovered_truncates" << dendl;
6374 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6375 p != mds->mdlog->segments.end();
6376 ++p) {
6377 LogSegment *ls = p->second;
6378 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6379 q != ls->truncating_inodes.end();
6380 ++q) {
6381 CInode *in = *q;
6382 in->auth_pin(this);
6383
6384 if (!in->client_need_snapflush.empty() &&
6385 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6386 assert(in->filelock.is_stable());
6387 in->filelock.set_state(LOCK_XLOCKDONE);
6388 in->auth_pin(&in->filelock);
6389 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6390 // start_files_to_recover will revoke caps
6391 continue;
6392 }
6393 _truncate_inode(in, ls);
6394 }
6395 }
6396 }
6397
6398
6399
6400
6401
6402
6403 // ================================================================================
6404 // cache trimming
6405
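/*
 * trim_lru() -- drain everything expirable from the bottom LRU, then expire
 * dentries from the main LRU while the cache is too full or 'count' has not
 * been reached.  Dentries that cannot be expired right now (e.g. open files
 * on standby replay) are re-inserted.
 */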
6406 void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6407 {
6408 bool is_standby_replay = mds->is_standby_replay();
6409 std::vector<CDentry *> unexpirables;
6410 uint64_t trimmed = 0;
6411
6412 dout(7) << "trim_lru trimming " << count
6413 << " items from LRU"
6414 << " size=" << lru.lru_get_size()
6415 << " mid=" << lru.lru_get_top()
6416 << " pintail=" << lru.lru_get_pintail()
6417 << " pinned=" << lru.lru_get_num_pinned()
6418 << dendl;
6419
6420 for (;;) {
6421 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6422 if (!dn)
6423 break;
6424 if (trim_dentry(dn, expiremap)) {
6425 unexpirables.push_back(dn);
6426 } else {
6427 trimmed++;
6428 }
6429 }
6430
6431 for (auto &dn : unexpirables) {
6432 bottom_lru.lru_insert_mid(dn);
6433 }
6434 unexpirables.clear();
6435
6436 // trim dentries from the main LRU until 'count' is reached and the cache is no longer too full
6437 while (cache_toofull() || count > 0) {
6438 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6439 if (!dn) {
6440 break;
6441 }
6442 if ((is_standby_replay && dn->get_linkage()->inode &&
6443 dn->get_linkage()->inode->item_open_file.is_on_list())) {
6444 unexpirables.push_back(dn);
6445 } else if (trim_dentry(dn, expiremap)) {
6446 unexpirables.push_back(dn);
6447 } else {
6448 trimmed++;
6449 if (count > 0) count--;
6450 }
6451 }
6452
6453 for (auto &dn : unexpirables) {
6454 lru.lru_insert_mid(dn);
6455 }
6456 unexpirables.clear();
6457
6458 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6459 }
6460
6461 /*
6462 * note: only called while MDS is active or stopping... NOT during recovery.
6463 * however, we may expire a replica whose authority is recovering.
6464 *
6465 * @param count number of dentries to try to expire
6466 */
6467 bool MDCache::trim(uint64_t count)
6468 {
6469 uint64_t used = cache_size();
6470 uint64_t limit = cache_limit_memory();
6471 map<mds_rank_t, MCacheExpire*> expiremap;
6472
6473 dout(7) << "trim bytes_used=" << bytes2str(used)
6474 << " limit=" << bytes2str(limit)
6475 << " reservation=" << cache_reservation()
6476 << "% count=" << count << dendl;
6477
6478 // process delayed eval_stray()
6479 stray_manager.advance_delayed();
6480
6481 trim_lru(count, expiremap);
6482
6483 // trim non-auth, non-bound subtrees
6484 for (auto p = subtrees.begin(); p != subtrees.end();) {
6485 CDir *dir = p->first;
6486 ++p;
6487 CInode *diri = dir->get_inode();
6488 if (dir->is_auth()) {
6489 if (!diri->is_auth() && !diri->is_base() &&
6490 dir->get_num_head_items() == 0) {
6491 if (dir->state_test(CDir::STATE_EXPORTING) ||
6492 !(mds->is_active() || mds->is_stopping()) ||
6493 dir->is_freezing() || dir->is_frozen())
6494 continue;
6495
6496 migrator->export_empty_import(dir);
6497 }
6498 } else {
6499 if (!diri->is_auth()) {
6500 if (dir->get_num_ref() > 1) // only subtree pin
6501 continue;
6502 list<CDir*> ls;
6503 diri->get_subtree_dirfrags(ls);
6504 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6505 continue;
6506
6507 // don't trim subtree root if its auth MDS is recovering.
6508 // This simplifies the cache rejoin code.
6509 if (dir->is_subtree_root() &&
6510 rejoin_ack_gather.count(dir->get_dir_auth().first))
6511 continue;
6512 trim_dirfrag(dir, 0, expiremap);
6513 }
6514 }
6515 }
6516
6517 // trim root?
6518 if (mds->is_stopping() && root) {
6519 list<CDir*> ls;
6520 root->get_dirfrags(ls);
6521 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6522 CDir *dir = *p;
6523 if (dir->get_num_ref() == 1) // subtree pin
6524 trim_dirfrag(dir, 0, expiremap);
6525 }
6526 if (root->get_num_ref() == 0)
6527 trim_inode(0, root, 0, expiremap);
6528 }
6529
6530 std::set<mds_rank_t> stopping;
6531 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6532 stopping.erase(mds->get_nodeid());
6533 for (auto rank : stopping) {
6534 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6535 if (!mdsdir_in)
6536 continue;
6537
6538 if (expiremap.count(rank) == 0) {
6539 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6540 }
6541
6542 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;
6543
6544 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6545 if (!aborted) {
6546 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6547 list<CDir*> ls;
6548 mdsdir_in->get_dirfrags(ls);
6549 for (auto dir : ls) {
6550 if (dir->get_num_ref() == 1) // subtree pin
6551 trim_dirfrag(dir, dir, expiremap);
6552 }
6553 if (mdsdir_in->get_num_ref() == 0)
6554 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6555 } else {
6556 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6557 }
6558 }
6559
6560 // Other rank's base inodes (when I'm stopping)
6561 if (mds->is_stopping()) {
6562 for (set<CInode*>::iterator p = base_inodes.begin();
6563 p != base_inodes.end(); ++p) {
6564 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6565 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6566 if ((*p)->get_num_ref() == 0) {
6567 trim_inode(NULL, *p, NULL, expiremap);
6568 }
6569 }
6570 }
6571 }
6572
6573 // send any expire messages
6574 send_expire_messages(expiremap);
6575
6576 return true;
6577 }
6578
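// Send the queued MCacheExpire messages, dropping any destined for an MDS
// that is still too early in recovery to process them (pre-rejoin, or
// rejoining but we have not sent it our rejoin yet).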
6579 void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6580 {
6581 // send expires
6582 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6583 it != expiremap.end();
6584 ++it) {
6585 if (mds->is_cluster_degraded() &&
6586 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6587 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6588 rejoin_sent.count(it->first) == 0))) {
6589 it->second->put();
6590 continue;
6591 }
6592 dout(7) << "sending cache_expire to " << it->first << dendl;
6593 mds->send_message_mds(it->second, it->first);
6594 }
6595 }
6596
6597
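/*
 * trim_dentry() -- try to expire a single dentry (and its primary inode,
 * if any).  For non-auth dentries an expire notice is queued for the
 * authority.  Returns true if the dentry must stay in cache, false if it
 * was removed.
 */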
6598 bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6599 {
6600 dout(12) << "trim_dentry " << *dn << dendl;
6601
6602 CDentry::linkage_t *dnl = dn->get_linkage();
6603
6604 CDir *dir = dn->get_dir();
6605 assert(dir);
6606
6607 CDir *con = get_subtree_root(dir);
6608 if (con)
6609 dout(12) << " in container " << *con << dendl;
6610 else {
6611 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6612 assert(dn->is_auth());
6613 }
6614
6615 // If a replica dentry is not readable, it's likely we will receive an
6616 // MDentryLink/MDentryUnlink message soon (it's possible we first
6617 // receive an MDentryUnlink message, then an MDentryLink message).
6618 // An MDentryLink message only replicates the inode, so we should
6619 // avoid trimming the inode's parent dentry here, because unconnected
6620 // replicas are problematic for subtree migration.
6621 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6622 !dn->get_dir()->get_inode()->is_stray())
6623 return true;
6624
6625 // adjust the dir state
6626 // NOTE: we can safely remove a clean, null dentry without affecting
6627 // directory completeness.
6628 // (check this _before_ we unlink the inode, below!)
6629 bool clear_complete = false;
6630 if (!(dnl->is_null() && dn->is_clean()))
6631 clear_complete = true;
6632
6633 // unlink the dentry
6634 if (dnl->is_remote()) {
6635 // just unlink.
6636 dir->unlink_inode(dn, false);
6637 } else if (dnl->is_primary()) {
6638 // expire the inode, too.
6639 CInode *in = dnl->get_inode();
6640 assert(in);
6641 if (trim_inode(dn, in, con, expiremap))
6642 return true; // purging stray instead of trimming
6643 } else {
6644 assert(dnl->is_null());
6645 }
6646
6647 if (!dn->is_auth()) {
6648 // notify dentry authority.
6649 mds_authority_t auth = dn->authority();
6650
6651 for (int p=0; p<2; p++) {
6652 mds_rank_t a = auth.first;
6653 if (p) a = auth.second;
6654 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6655 if (mds->get_nodeid() == auth.second &&
6656 con->is_importing()) break; // don't send any expire while importing.
6657 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6658
6659 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6660 assert(a != mds->get_nodeid());
6661 if (expiremap.count(a) == 0)
6662 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6663 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
6664 }
6665 }
6666
6667 // remove dentry
6668 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6669 dir->add_to_bloom(dn);
6670 dir->remove_dentry(dn);
6671
6672 if (clear_complete)
6673 dir->state_clear(CDir::STATE_COMPLETE);
6674
6675 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6676 return false;
6677 }
6678
6679
6680 void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6681 {
6682 dout(15) << "trim_dirfrag " << *dir << dendl;
6683
6684 if (dir->is_subtree_root()) {
6685 assert(!dir->is_auth() ||
6686 (!dir->is_replicated() && dir->inode->is_base()));
6687 remove_subtree(dir); // remove from subtree map
6688 }
6689 assert(dir->get_num_ref() == 0);
6690
6691 CInode *in = dir->get_inode();
6692
6693 if (!dir->is_auth()) {
6694 mds_authority_t auth = dir->authority();
6695
6696 // was this an auth delegation? (if so, slightly modified container)
6697 dirfrag_t condf;
6698 if (dir->is_subtree_root()) {
6699 dout(12) << " subtree root, container is " << *dir << dendl;
6700 con = dir;
6701 condf = dir->dirfrag();
6702 } else {
6703 condf = con->dirfrag();
6704 }
6705
6706 for (int p=0; p<2; p++) {
6707 mds_rank_t a = auth.first;
6708 if (p) a = auth.second;
6709 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6710 if (mds->get_nodeid() == auth.second &&
6711 con->is_importing()) break; // don't send any expire while importing.
6712 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6713
6714 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6715 assert(a != mds->get_nodeid());
6716 if (expiremap.count(a) == 0)
6717 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6718 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6719 }
6720 }
6721
6722 in->close_dirfrag(dir->dirfrag().frag);
6723 }
6724
6725 /**
6726 * Try trimming an inode from the cache
6727 *
6728 * @return true if the inode is still in cache, else false if it was trimmed
6729 */
6730 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6731 {
6732 dout(15) << "trim_inode " << *in << dendl;
6733 assert(in->get_num_ref() == 0);
6734
6735 if (in->is_dir()) {
6736 // If replica inode's dirfragtreelock is not readable, it's likely
6737 // some dirfrags of the inode are being fragmented and we will receive
6738 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6739 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6740 // This is because unconnected replicas are problematic for
6741 // subtree migration.
6742 //
6743 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6744 return true;
6745
6746 // DIR
6747 list<CDir*> dfls;
6748 in->get_dirfrags(dfls);
6749 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6750 CDir *dir = *p;
6751 assert(!dir->is_subtree_root());
6752 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6753 }
6754 }
6755
6756 // INODE
6757 if (in->is_auth()) {
6758 // eval stray after closing dirfrags
6759 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6760 maybe_eval_stray(in);
6761 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6762 return true;
6763 }
6764 } else {
6765 mds_authority_t auth = in->authority();
6766
6767 dirfrag_t df;
6768 if (con)
6769 df = con->dirfrag();
6770 else
6771 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6772
6773 for (int p=0; p<2; p++) {
6774 mds_rank_t a = auth.first;
6775 if (p) a = auth.second;
6776 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6777 if (con && mds->get_nodeid() == auth.second &&
6778 con->is_importing()) break; // don't send any expire while importing.
6779 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6780
6781 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6782 assert(a != mds->get_nodeid());
6783 if (expiremap.count(a) == 0)
6784 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6785 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6786 }
6787 }
6788
6789 /*
6790 if (in->is_auth()) {
6791 if (in->hack_accessed)
6792 mds->logger->inc("outt");
6793 else {
6794 mds->logger->inc("outut");
6795 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6796 }
6797 }
6798 */
6799
6800 // unlink
6801 if (dn)
6802 dn->get_dir()->unlink_inode(dn, false);
6803 remove_inode(in);
6804 return false;
6805 }
6806
6807
6808 /**
6809 * trim_non_auth - remove any non-auth items from our cache
6810 *
6811 * this reduces the amount of non-auth metadata in our cache, reducing the
6812 * load incurred by the rejoin phase.
6813 *
6814 * the only non-auth items that remain are those that are needed to
6815 * attach our own subtrees to the root.
6816 *
6817 * when we are done, all dentries will be in the top bit of the lru.
6818 *
6819 * why we have to do this:
6820 * we may not have accurate linkage for non-auth items, which means we may
6821 * not know which subtree an item falls into and cannot be sure to declare
6822 * it to the correct authority.
6823 */
6824 void MDCache::trim_non_auth()
6825 {
6826 dout(7) << "trim_non_auth" << dendl;
6827
6828 // temporarily pin all subtree roots
6829 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6830 p != subtrees.end();
6831 ++p)
6832 p->first->get(CDir::PIN_SUBTREETEMP);
6833
6834 list<CDentry*> auth_list;
6835
6836 // trim non-auth items from the lru
6837 for (;;) {
6838 CDentry *dn = NULL;
6839 if (bottom_lru.lru_get_size() > 0)
6840 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6841 if (!dn && lru.lru_get_size() > 0)
6842 dn = static_cast<CDentry*>(lru.lru_expire());
6843 if (!dn)
6844 break;
6845
6846 CDentry::linkage_t *dnl = dn->get_linkage();
6847
6848 if (dn->is_auth()) {
6849 // add back into lru (at the top)
6850 auth_list.push_back(dn);
6851
6852 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6853 dn->unlink_remote(dnl);
6854 } else {
6855 // non-auth. expire.
6856 CDir *dir = dn->get_dir();
6857 assert(dir);
6858
6859 // unlink the dentry
6860 dout(10) << " removing " << *dn << dendl;
6861 if (dnl->is_remote()) {
6862 dir->unlink_inode(dn, false);
6863 }
6864 else if (dnl->is_primary()) {
6865 CInode *in = dnl->get_inode();
6866 dout(10) << " removing " << *in << dendl;
6867 list<CDir*> ls;
6868 in->get_dirfrags(ls);
6869 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6870 CDir *subdir = *p;
6871 assert(!subdir->is_subtree_root());
6872 in->close_dirfrag(subdir->dirfrag().frag);
6873 }
6874 dir->unlink_inode(dn, false);
6875 remove_inode(in);
6876 }
6877 else {
6878 assert(dnl->is_null());
6879 }
6880
6881 assert(!dir->has_bloom());
6882 dir->remove_dentry(dn);
6883 // adjust the dir state
6884 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6885 // close empty non-auth dirfrag
6886 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6887 dir->inode->close_dirfrag(dir->get_frag());
6888 }
6889 }
6890
6891 for (auto dn : auth_list) {
6892 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6893 bottom_lru.lru_insert_mid(dn);
6894 else
6895 lru.lru_insert_top(dn);
6896 }
6897
6898 // move everything in the pintail to the top bit of the lru.
6899 lru.lru_touch_entire_pintail();
6900
6901 // unpin all subtrees
6902 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6903 p != subtrees.end();
6904 ++p)
6905 p->first->put(CDir::PIN_SUBTREETEMP);
6906
6907 if (lru.lru_get_size() == 0 &&
6908 bottom_lru.lru_get_size() == 0) {
6909 // root, stray, etc.?
6910 auto p = inode_map.begin();
6911 while (p != inode_map.end()) {
6912 CInode *in = p->second;
6913 ++p;
6914 if (!in->is_auth()) {
6915 list<CDir*> ls;
6916 in->get_dirfrags(ls);
6917 for (list<CDir*>::iterator p = ls.begin();
6918 p != ls.end();
6919 ++p) {
6920 dout(10) << " removing " << **p << dendl;
6921 assert((*p)->get_num_ref() == 1); // SUBTREE
6922 remove_subtree((*p));
6923 in->close_dirfrag((*p)->dirfrag().frag);
6924 }
6925 dout(10) << " removing " << *in << dendl;
6926 assert(!in->get_parent_dn());
6927 assert(in->get_num_ref() == 0);
6928 remove_inode(in);
6929 }
6930 }
6931 }
6932
6933 show_subtrees();
6934 }
6935
6936 /**
6937 * Recursively trim the subtree rooted at directory to remove all
6938 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6939 * of those links. This is used to clear invalid data out of the cache.
6940 * Note that it doesn't clear the passed-in directory, since that's not
6941 * always safe.
6942 */
6943 bool MDCache::trim_non_auth_subtree(CDir *dir)
6944 {
6945 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6946
6947 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6948
6949 auto j = dir->begin();
6950 auto i = j;
6951 while (j != dir->end()) {
6952 i = j++;
6953 CDentry *dn = i->second;
6954 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6955 CDentry::linkage_t *dnl = dn->get_linkage();
6956 if (dnl->is_primary()) { // check for subdirectories, etc
6957 CInode *in = dnl->get_inode();
6958 bool keep_inode = false;
6959 if (in->is_dir()) {
6960 list<CDir*> subdirs;
6961 in->get_dirfrags(subdirs);
6962 for (list<CDir*>::iterator subdir = subdirs.begin();
6963 subdir != subdirs.end();
6964 ++subdir) {
6965 if ((*subdir)->is_subtree_root()) {
6966 keep_inode = true;
6967 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
6968 } else {
6969 if (trim_non_auth_subtree(*subdir))
6970 keep_inode = true;
6971 else {
6972 in->close_dirfrag((*subdir)->get_frag());
6973 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
6974 }
6975 }
6976 }
6977
6978 }
6979 if (!keep_inode) { // remove it!
6980 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
6981 dir->unlink_inode(dn, false);
6982 remove_inode(in);
6983 assert(!dir->has_bloom());
6984 dir->remove_dentry(dn);
6985 } else {
6986 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
6987 dn->state_clear(CDentry::STATE_AUTH);
6988 in->state_clear(CInode::STATE_AUTH);
6989 }
6990 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
6991 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
6992 } else { // just remove it
6993 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
6994 if (dnl->is_remote())
6995 dir->unlink_inode(dn, false);
6996 dir->remove_dentry(dn);
6997 }
6998 }
6999 dir->state_clear(CDir::STATE_AUTH);
7000 /**
7001 * We've now checked all our children and deleted those that need it.
7002 * Now return to caller, and tell them if *we're* a keeper.
7003 */
7004 return keep_dir || dir->get_num_any();
7005 }
7006
7007 /*
7008 * during replay, when we determine a subtree is no longer ours, we
7009 * try to trim it from our cache. because subtrees must be connected
7010 * to the root, the fact that we can trim this tree may mean that our
7011 * children or parents can also be trimmed.
7012 */
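// In outline, the routine below (1) closes any empty, non-auth child subtree
// bounds, (2) calls trim_non_auth_subtree() on this dirfrag, and (3) if nothing
// needed to be kept, walks up toward the root, closing each ancestor dirfrag
// that is also non-auth, until it hits an auth subtree or a base inode.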
7013 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7014 {
7015 dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
7016
7017 // can we now trim child subtrees?
7018 set<CDir*> bounds;
7019 get_subtree_bounds(dir, bounds);
7020 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7021 CDir *bd = *p;
7022 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7023 bd->get_num_any() == 0 && // and empty
7024 can_trim_non_auth_dirfrag(bd)) {
7025 CInode *bi = bd->get_inode();
7026 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7027 remove_subtree(bd);
7028 bd->mark_clean();
7029 bi->close_dirfrag(bd->get_frag());
7030 }
7031 }
7032
7033 if (trim_non_auth_subtree(dir)) {
7034 // keep
7035 try_subtree_merge(dir);
7036 } else {
7037 // can we trim this subtree (and possibly our ancestors) too?
7038 while (true) {
7039 CInode *diri = dir->get_inode();
7040 if (diri->is_base()) {
7041 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7042 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7043 remove_subtree(dir);
7044 dir->mark_clean();
7045 diri->close_dirfrag(dir->get_frag());
7046
7047 dout(10) << " removing " << *diri << dendl;
7048 assert(!diri->get_parent_dn());
7049 assert(diri->get_num_ref() == 0);
7050 remove_inode(diri);
7051 }
7052 break;
7053 }
7054
7055 CDir *psub = get_subtree_root(diri->get_parent_dir());
7056 dout(10) << " parent subtree is " << *psub << dendl;
7057 if (psub->get_dir_auth().first == mds->get_nodeid())
7058 break; // we are auth, keep.
7059
7060 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7061 remove_subtree(dir);
7062 dir->mark_clean();
7063 diri->close_dirfrag(dir->get_frag());
7064
7065 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7066 if (trim_non_auth_subtree(psub))
7067 break;
7068 dir = psub;
7069 }
7070 }
7071
7072 show_subtrees();
7073 }
7074
7075 void MDCache::standby_trim_segment(LogSegment *ls)
7076 {
7077 ls->new_dirfrags.clear_list();
7078 ls->open_files.clear_list();
7079
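// note: these loops look unbounded, but each mark_clean() / clear_dirty_parent()
// / remove_dirty() call detaches the front object from the corresponding
// per-segment dirty list (assuming the usual xlist self-removal behaviour of
// those calls), so every iteration shrinks the list.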
7080 while (!ls->dirty_dirfrags.empty()) {
7081 CDir *dir = ls->dirty_dirfrags.front();
7082 dir->mark_clean();
7083 }
7084 while (!ls->dirty_inodes.empty()) {
7085 CInode *in = ls->dirty_inodes.front();
7086 in->mark_clean();
7087 }
7088 while (!ls->dirty_dentries.empty()) {
7089 CDentry *dn = ls->dirty_dentries.front();
7090 dn->mark_clean();
7091 }
7092 while (!ls->dirty_parent_inodes.empty()) {
7093 CInode *in = ls->dirty_parent_inodes.front();
7094 in->clear_dirty_parent();
7095 }
7096 while (!ls->dirty_dirfrag_dir.empty()) {
7097 CInode *in = ls->dirty_dirfrag_dir.front();
7098 in->filelock.remove_dirty();
7099 }
7100 while (!ls->dirty_dirfrag_nest.empty()) {
7101 CInode *in = ls->dirty_dirfrag_nest.front();
7102 in->nestlock.remove_dirty();
7103 }
7104 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7105 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7106 in->dirfragtreelock.remove_dirty();
7107 }
7108 }
7109
7110 /* This function DOES put the passed message before returning */
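// note: reached from MDCache::dispatch() on MSG_MDS_CACHEEXPIRE (see below).
// Expires that race with an in-flight export of the containing dirfrag are
// parked in delayed_expire and re-dispatched later by process_delayed_expire().
// Stale expires are filtered by comparing the sender's replica nonce, roughly
// (sketch, not compiled):
#if 0
if (nonce == in->get_replica_nonce(from))
  inode_remove_replica(in, from, false, gather_locks);  // expire accepted
// else: old nonce; the replica was re-created since the peer sent this, drop it
#endif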
7111 void MDCache::handle_cache_expire(MCacheExpire *m)
7112 {
7113 mds_rank_t from = mds_rank_t(m->get_from());
7114
7115 dout(7) << "cache_expire from mds." << from << dendl;
7116
7117 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7118 m->put();
7119 return;
7120 }
7121
7122 set<SimpleLock *> gather_locks;
7123 // loop over realms
7124 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7125 p != m->realms.end();
7126 ++p) {
7127 // check container?
7128 if (p->first.ino > 0) {
7129 CInode *expired_inode = get_inode(p->first.ino);
7130 assert(expired_inode); // we had better have this.
7131 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7132 assert(parent_dir);
7133
7134 int export_state = -1;
7135 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7136 export_state = migrator->get_export_state(parent_dir);
7137 assert(export_state >= 0);
7138 }
7139
7140 if (!parent_dir->is_auth() ||
7141 (export_state != -1 &&
7142 ((export_state == Migrator::EXPORT_WARNING &&
7143 migrator->export_has_warned(parent_dir,from)) ||
7144 export_state == Migrator::EXPORT_EXPORTING ||
7145 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7146 (export_state == Migrator::EXPORT_NOTIFYING &&
7147 !migrator->export_has_notified(parent_dir,from))))) {
7148
7149 // not auth.
7150 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7151 assert(parent_dir->is_frozen_tree_root());
7152
7153 // make a message container
7154 if (delayed_expire[parent_dir].count(from) == 0)
7155 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7156
7157 // merge these expires into it
7158 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7159 continue;
7160 }
7161 assert(export_state <= Migrator::EXPORT_PREPPING ||
7162 (export_state == Migrator::EXPORT_WARNING &&
7163 !migrator->export_has_warned(parent_dir, from)));
7164
7165 dout(7) << "expires for " << *parent_dir << dendl;
7166 } else {
7167 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7168 }
7169
7170 // INODES
7171 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7172 it != p->second.inodes.end();
7173 ++it) {
7174 CInode *in = get_inode(it->first);
7175 unsigned nonce = it->second;
7176
7177 if (!in) {
7178 dout(0) << " inode expire on " << it->first << " from " << from
7179 << ", don't have it" << dendl;
7180 assert(in);
7181 }
7182 assert(in->is_auth());
7183 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7184
7185 // check nonce
7186 if (nonce == in->get_replica_nonce(from)) {
7187 // remove from our cached_by
7188 dout(7) << " inode expire on " << *in << " from mds." << from
7189 << " cached_by was " << in->get_replicas() << dendl;
7190 inode_remove_replica(in, from, false, gather_locks);
7191 }
7192 else {
7193 // this is an old nonce, ignore expire.
7194 dout(7) << " inode expire on " << *in << " from mds." << from
7195 << " with old nonce " << nonce
7196 << " (current " << in->get_replica_nonce(from) << "), dropping"
7197 << dendl;
7198 }
7199 }
7200
7201 // DIRS
7202 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7203 it != p->second.dirs.end();
7204 ++it) {
7205 CDir *dir = get_dirfrag(it->first);
7206 unsigned nonce = it->second;
7207
7208 if (!dir) {
7209 CInode *diri = get_inode(it->first.ino);
7210 if (diri) {
7211 if (mds->is_rejoin() &&
7212 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7213 !diri->is_replica(from)) {
7214 list<CDir*> ls;
7215 diri->get_nested_dirfrags(ls);
7216 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7217 << " while rejoining, inode isn't replicated" << dendl;
7218 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7219 dir = *q;
7220 if (dir->is_replica(from)) {
7221 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7222 dir->remove_replica(from);
7223 }
7224 }
7225 continue;
7226 }
7227 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7228 if (other) {
7229 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7230 << " have " << *other << ", mismatched frags, dropping" << dendl;
7231 continue;
7232 }
7233 }
7234 dout(0) << " dir expire on " << it->first << " from " << from
7235 << ", don't have it" << dendl;
7236 assert(dir);
7237 }
7238 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7239
7240 assert(dir->is_auth());
7241
7242 // check nonce
7243 if (nonce == dir->get_replica_nonce(from)) {
7244 // remove from our cached_by
7245 dout(7) << " dir expire on " << *dir << " from mds." << from
7246 << " replicas was " << dir->get_replicas() << dendl;
7247 dir->remove_replica(from);
7248 }
7249 else {
7250 // this is an old nonce, ignore expire.
7251 dout(7) << " dir expire on " << *dir << " from mds." << from
7252 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7253 << "), dropping" << dendl;
7254 }
7255 }
7256
7257 // DENTRIES
7258 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7259 pd != p->second.dentries.end();
7260 ++pd) {
7261 dout(10) << " dn expires in dir " << pd->first << dendl;
7262 CInode *diri = get_inode(pd->first.ino);
7263 assert(diri);
7264 CDir *dir = diri->get_dirfrag(pd->first.frag);
7265
7266 if (!dir) {
7267 dout(0) << " dn expires on " << pd->first << " from " << from
7268 << ", must have refragmented" << dendl;
7269 } else {
7270 assert(dir->is_auth());
7271 }
7272
7273 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7274 p != pd->second.end();
7275 ++p) {
7276 unsigned nonce = p->second;
7277 CDentry *dn;
7278
7279 if (dir) {
7280 dn = dir->lookup(p->first.first, p->first.second);
7281 } else {
7282 // which dirfrag for this dentry?
7283 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7284 assert(dir);
7285 assert(dir->is_auth());
7286 dn = dir->lookup(p->first.first, p->first.second);
7287 }
7288
7289 if (!dn) {
7290 if (dir)
7291 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7292 else
7293 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7294 }
7295 assert(dn);
7296
7297 if (nonce == dn->get_replica_nonce(from)) {
7298 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7299 dentry_remove_replica(dn, from, gather_locks);
7300 }
7301 else {
7302 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7303 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7304 << "), dropping" << dendl;
7305 }
7306 }
7307 }
7308 }
7309
7310 // done
7311 m->put();
7312
7313 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7314 if (!(*p)->is_stable())
7315 mds->locker->eval_gather(*p);
7316 }
7317 }
7318
7319 void MDCache::process_delayed_expire(CDir *dir)
7320 {
7321 dout(7) << "process_delayed_expire on " << *dir << dendl;
7322 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7323 p != delayed_expire[dir].end();
7324 ++p)
7325 handle_cache_expire(p->second);
7326 delayed_expire.erase(dir);
7327 }
7328
7329 void MDCache::discard_delayed_expire(CDir *dir)
7330 {
7331 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7332 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7333 p != delayed_expire[dir].end();
7334 ++p)
7335 p->second->put();
7336 delayed_expire.erase(dir);
7337 }
7338
7339 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7340 set<SimpleLock *>& gather_locks)
7341 {
7342 in->remove_replica(from);
7343 in->mds_caps_wanted.erase(from);
7344
7345 // note: this code calls _eval more often than it needs to!
7346 // fix lock
7347 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7348 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7349 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7350 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7351 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7352 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7353
7354 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
7355 // don't remove the recovering mds from the lock's gathering list,
7356 // because it may hold rejoined wrlocks.
7357 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7358 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7359 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7360 }
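// note: the locks collected in gather_locks are not re-evaluated here; the
// caller does that once all expires in the message are processed, e.g. (as in
// handle_cache_expire() above; sketch, not compiled):
#if 0
for (auto *lock : gather_locks)
  if (!lock->is_stable())
    mds->locker->eval_gather(lock);
#endif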
7361
7362 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7363 {
7364 dn->remove_replica(from);
7365
7366 // fix lock
7367 if (dn->lock.remove_replica(from))
7368 gather_locks.insert(&dn->lock);
7369
7370 // Replicated strays might now be eligible for purge
7371 CDentry::linkage_t *dnl = dn->get_linkage();
7372 if (dnl->is_primary()) {
7373 maybe_eval_stray(dnl->get_inode());
7374 }
7375 }
7376
7377 void MDCache::trim_client_leases()
7378 {
7379 utime_t now = ceph_clock_now();
7380
7381 dout(10) << "trim_client_leases" << dendl;
7382
7383 for (int pool=0; pool<client_lease_pools; pool++) {
7384 int before = client_leases[pool].size();
7385 if (client_leases[pool].empty())
7386 continue;
7387
7388 while (!client_leases[pool].empty()) {
7389 ClientLease *r = client_leases[pool].front();
7390 if (r->ttl > now) break;
7391 CDentry *dn = static_cast<CDentry*>(r->parent);
7392 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7393 dn->remove_client_lease(r, mds->locker);
7394 }
7395 int after = client_leases[pool].size();
7396 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7397 << (before-after) << " leases, " << after << " left" << dendl;
7398 }
7399 }
7400
7401
7402 void MDCache::check_memory_usage()
7403 {
7404 static MemoryModel mm(g_ceph_context);
7405 static MemoryModel::snap last;
7406 mm.sample(&last);
7407 static MemoryModel::snap baseline = last;
7408
7409 // check client caps
7410 assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
7411 double caps_per_inode = 0.0;
7412 if (CInode::count())
7413 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7414
7415 dout(2) << "check_memory_usage"
7416 << " total " << last.get_total()
7417 << ", rss " << last.get_rss()
7418 << ", heap " << last.get_heap()
7419 << ", baseline " << baseline.get_heap()
7420 << ", buffers " << (buffer::get_total_alloc() >> 10)
7421 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7422 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7423 << dendl;
7424
7425 mds->update_mlogger();
7426 mds->mlogger->set(l_mdm_rss, last.get_rss());
7427 mds->mlogger->set(l_mdm_heap, last.get_heap());
7428
7429 if (cache_toofull()) {
7430 last_recall_state = ceph_clock_now();
7431 mds->server->recall_client_state();
7432 }
7433
7434 // If the cache size had exceeded its limit, but we're back in bounds
7435 // now, free any unused pool memory so that our memory usage isn't
7436 // permanently bloated.
7437 if (exceeded_size_limit && !cache_toofull()) {
7438 // Only do this once we are back in bounds: otherwise the releases would
7439 // slow down whatever process caused us to exceed bounds to begin with
7440 if (ceph_using_tcmalloc()) {
7441 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7442 << dendl;
7443 ceph_heap_release_free_memory();
7444 }
7445 exceeded_size_limit = false;
7446 }
7447 }
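// note: the release above is deliberately deferred until the cache is back
// under its limit; a condensed sketch of the condition (not compiled):
#if 0
if (exceeded_size_limit && !cache_toofull() && ceph_using_tcmalloc())
  ceph_heap_release_free_memory();  // hand unused tcmalloc pages back to the OS
#endif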
7448
7449
7450
7451 // =========================================================================================
7452 // shutdown
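// Rough order of operations in shutdown_pass() below: export strays to mds.0,
// drop the stray dir pins, trim the whole cache, hand auth subtrees back to
// mds.0, close client sessions, trim and cap the MDLog, write the (now empty)
// journal head, wait for the objecter to drain, and finally tear down the
// mydir subtree and myin.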
7453
7454 class C_MDC_ShutdownCheck : public MDCacheContext {
7455 public:
7456 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7457 void finish(int) override {
7458 mdcache->shutdown_check();
7459 }
7460 };
7461
7462 void MDCache::shutdown_check()
7463 {
7464 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7465
7466 // cache
7467 char old_val[32] = { 0 };
7468 char *o = old_val;
7469 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7470 g_conf->set_val("debug_mds", "10");
7471 g_conf->apply_changes(NULL);
7472 show_cache();
7473 g_conf->set_val("debug_mds", old_val);
7474 g_conf->apply_changes(NULL);
7475 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7476
7477 // this
7478 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7479 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7480
7481
7482 if (mds->objecter->is_active()) {
7483 dout(0) << "objecter still active" << dendl;
7484 mds->objecter->dump_active();
7485 }
7486 }
7487
7488
7489 void MDCache::shutdown_start()
7490 {
7491 dout(2) << "shutdown_start" << dendl;
7492
7493 if (g_conf->mds_shutdown_check)
7494 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7495
7496 // g_conf->debug_mds = 10;
7497 }
7498
7499
7500
7501 bool MDCache::shutdown_pass()
7502 {
7503 dout(7) << "shutdown_pass" << dendl;
7504
7505 if (mds->is_stopped()) {
7506 dout(7) << " already shut down" << dendl;
7507 show_cache();
7508 show_subtrees();
7509 return true;
7510 }
7511
7512 // empty stray dir
7513 if (!shutdown_export_strays()) {
7514 dout(7) << "waiting for strays to migrate" << dendl;
7515 return false;
7516 }
7517
7518 // drop our reference to our stray dir inode
7519 for (int i = 0; i < NUM_STRAY; ++i) {
7520 if (strays[i] &&
7521 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7522 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7523 strays[i]->put(CInode::PIN_STRAY);
7524 strays[i]->put_stickydirs();
7525 }
7526 }
7527
7528 // trim cache
7529 trim(UINT64_MAX);
7530 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7531
7532 // SUBTREES
7533 int num_auth_subtree = 0;
7534 if (!subtrees.empty() &&
7535 mds->get_nodeid() != 0 &&
7536 migrator->get_export_queue_size() == 0) {
7537 dout(7) << "looking for subtrees to export to mds0" << dendl;
7538 list<CDir*> ls;
7539 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7540 it != subtrees.end();
7541 ++it) {
7542 CDir *dir = it->first;
7543 if (dir->get_inode()->is_mdsdir())
7544 continue;
7545 if (dir->is_auth()) {
7546 num_auth_subtree++;
7547 if (dir->is_frozen() ||
7548 dir->is_freezing() ||
7549 dir->is_ambiguous_dir_auth() ||
7550 dir->state_test(CDir::STATE_EXPORTING))
7551 continue;
7552 ls.push_back(dir);
7553 }
7554 }
7555 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7556 CDir *dir = *p;
7557 mds_rank_t dest = dir->get_inode()->authority().first;
7558 if (dest > 0 && !mds->mdsmap->is_active(dest))
7559 dest = 0;
7560 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7561 migrator->export_dir_nicely(dir, dest);
7562 }
7563 }
7564
7565 if (num_auth_subtree > 0) {
7566 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7567 show_subtrees();
7568 return false;
7569 }
7570
7571 // close out any sessions (and open files!) before we try to trim the log, etc.
7572 if (mds->sessionmap.have_unclosed_sessions()) {
7573 if (!mds->server->terminating_sessions)
7574 mds->server->terminate_sessions();
7575 return false;
7576 }
7577
7578 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7579 if (mydir && !mydir->is_subtree_root())
7580 mydir = NULL;
7581
7582 // subtrees map not empty yet?
7583 if (subtrees.size() > (mydir ? 1 : 0)) {
7584 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7585 show_subtrees();
7586 migrator->show_importing();
7587 migrator->show_exporting();
7588 if (!migrator->is_importing() && !migrator->is_exporting())
7589 show_cache();
7590 return false;
7591 }
7592 assert(!migrator->is_exporting());
7593 assert(!migrator->is_importing());
7594
7595 // flush what we can from the log
7596 mds->mdlog->trim(0);
7597 if (mds->mdlog->get_num_segments() > 1) {
7598 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7599 return false;
7600 }
7601
7602 if ((myin && myin->is_auth_pinned()) ||
7603 (mydir && mydir->is_auth_pinned())) {
7604 dout(7) << "still have auth pinned objects" << dendl;
7605 return false;
7606 }
7607
7608 // (only do this once!)
7609 if (!mds->mdlog->is_capped()) {
7610 dout(7) << "capping the log" << dendl;
7611 mds->mdlog->cap();
7612 mds->mdlog->trim();
7613 }
7614
7615 if (!mds->mdlog->empty()) {
7616 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7617 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7618 return false;
7619 }
7620
7621 if (!did_shutdown_log_cap) {
7622 // flush journal header
7623 dout(7) << "writing header for (now-empty) journal" << dendl;
7624 assert(mds->mdlog->empty());
7625 mds->mdlog->write_head(0);
7626 // NOTE: filer active checker below will block us until this completes.
7627 did_shutdown_log_cap = true;
7628 return false;
7629 }
7630
7631 // filer active?
7632 if (mds->objecter->is_active()) {
7633 dout(7) << "objecter still active" << dendl;
7634 mds->objecter->dump_active();
7635 return false;
7636 }
7637
7638 // trim what we can from the cache
7639 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7640 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7641 show_cache();
7642 //dump();
7643 return false;
7644 }
7645
7646 // make mydir subtree go away
7647 if (mydir) {
7648 if (mydir->get_num_ref() > 1) { // subtree pin
7649 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7650 show_cache();
7651 return false;
7652 }
7653
7654 remove_subtree(mydir);
7655 myin->close_dirfrag(mydir->get_frag());
7656 }
7657 assert(subtrees.empty());
7658
7659 if (myin)
7660 remove_inode(myin);
7661
7662 // done!
7663 dout(2) << "shutdown done." << dendl;
7664 return true;
7665 }
7666
7667 bool MDCache::shutdown_export_strays()
7668 {
7669 if (mds->get_nodeid() == 0)
7670 return true;
7671
7672 dout(10) << "shutdown_export_strays" << dendl;
7673
7674 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7675
7676 bool done = true;
7677
7678 list<CDir*> dfs;
7679 for (int i = 0; i < NUM_STRAY; ++i) {
7680 if (!strays[i]) {
7681 continue;
7682 }
7683 strays[i]->get_dirfrags(dfs);
7684 }
7685
7686 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7687 dfs_i != dfs.end(); ++dfs_i)
7688 {
7689 CDir *dir = *dfs_i;
7690
7691 if (!dir->is_complete()) {
7692 dir->fetch(0);
7693 done = false;
7694 if (!mds0_active)
7695 break;
7696 }
7697
7698 for (auto &p : dir->items) {
7699 CDentry *dn = p.second;
7700 CDentry::linkage_t *dnl = dn->get_linkage();
7701 if (dnl->is_null())
7702 continue;
7703 done = false;
7704 if (!mds0_active)
7705 break;
7706
7707 if (dn->state_test(CDentry::STATE_PURGING)) {
7708 // Don't try to migrate anything that is actually
7709 // being purged right now
7710 continue;
7711 }
7712
7713 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7714 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7715 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7716 } else {
7717 dout(10) << "already exporting " << *dn << dendl;
7718 }
7719 }
7720 }
7721
7722 return done;
7723 }
7724
7725 // ========= messaging ==============
7726
7727 /* This function DOES put the passed message before returning */
7728 void MDCache::dispatch(Message *m)
7729 {
7730 switch (m->get_type()) {
7731
7732 // RESOLVE
7733 case MSG_MDS_RESOLVE:
7734 handle_resolve(static_cast<MMDSResolve*>(m));
7735 break;
7736 case MSG_MDS_RESOLVEACK:
7737 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7738 break;
7739
7740 // REJOIN
7741 case MSG_MDS_CACHEREJOIN:
7742 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7743 break;
7744
7745 case MSG_MDS_DISCOVER:
7746 handle_discover(static_cast<MDiscover*>(m));
7747 break;
7748 case MSG_MDS_DISCOVERREPLY:
7749 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7750 break;
7751
7752 case MSG_MDS_DIRUPDATE:
7753 handle_dir_update(static_cast<MDirUpdate*>(m));
7754 break;
7755
7756 case MSG_MDS_CACHEEXPIRE:
7757 handle_cache_expire(static_cast<MCacheExpire*>(m));
7758 break;
7759
7760 case MSG_MDS_DENTRYLINK:
7761 handle_dentry_link(static_cast<MDentryLink*>(m));
7762 break;
7763 case MSG_MDS_DENTRYUNLINK:
7764 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7765 break;
7766
7767 case MSG_MDS_FRAGMENTNOTIFY:
7768 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7769 break;
7770
7771 case MSG_MDS_FINDINO:
7772 handle_find_ino(static_cast<MMDSFindIno *>(m));
7773 break;
7774 case MSG_MDS_FINDINOREPLY:
7775 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7776 break;
7777
7778 case MSG_MDS_OPENINO:
7779 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7780 break;
7781 case MSG_MDS_OPENINOREPLY:
7782 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7783 break;
7784
7785 default:
7786 derr << "cache unknown message " << m->get_type() << dendl;
7787 assert(0 == "cache unknown message");
7788 }
7789 }
7790
7791 MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7792 {
7793 if (mdr) {
7794 dout(20) << "_get_waiter retryrequest" << dendl;
7795 return new C_MDS_RetryRequest(this, mdr);
7796 } else if (req) {
7797 dout(20) << "_get_waiter retrymessage" << dendl;
7798 return new C_MDS_RetryMessage(mds, req);
7799 } else {
7800 return fin;
7801 }
7802 }
7803
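// note: a summary of the path_traverse() contract, inferred from the body
// below. `onfail` selects the behaviour on a cache miss (MDS_TRAVERSE_DISCOVER /
// MDS_TRAVERSE_DISCOVERXLOCK / MDS_TRAVERSE_FORWARD). Return values: 0 on
// success (trace in *pdnvec, final inode in *pin), 1 if we blocked and queued a
// waiter, 2 if the request was forwarded to the auth mds, or a negative errno
// (-ENOENT, -ENOTDIR, -ESTALE, -EINVAL, -EIO). A hypothetical caller (not
// compiled) might look like:
#if 0
vector<CDentry*> trace;
CInode *in = NULL;
int r = path_traverse(mdr, NULL, NULL, refpath, &trace, &in,
                      MDS_TRAVERSE_FORWARD);
if (r > 0)
  return;              // waiting (1) or forwarded (2); we will be retried
if (r < 0) {
  /* reply to the client with error r */
}
#endif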
7804 int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7805 const filepath& path, // what
7806 vector<CDentry*> *pdnvec, // result
7807 CInode **pin,
7808 int onfail)
7809 {
7810 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7811 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7812 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7813
7814 assert(mdr || req || fin);
7815 assert(!forward || mdr || req); // forward requires a request
7816
7817 snapid_t snapid = CEPH_NOSNAP;
7818 if (mdr)
7819 mdr->snapid = snapid;
7820
7821 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7822
7823 if (mds->logger) mds->logger->inc(l_mds_traverse);
7824
7825 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7826 CInode *cur = get_inode(path.get_ino());
7827 if (cur == NULL) {
7828 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7829 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7830 else {
7831 //ceph_abort(); // hrm.. broken
7832 return -ESTALE;
7833 }
7834 return 1;
7835 }
7836 if (cur->state_test(CInode::STATE_PURGING))
7837 return -ESTALE;
7838
7839 // make sure snaprealms are open...
7840 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7841 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7842 return 1;
7843 }
7844
7845 // start trace
7846 if (pdnvec)
7847 pdnvec->clear();
7848 if (pin)
7849 *pin = cur;
7850
7851 unsigned depth = 0;
7852 while (depth < path.depth()) {
7853 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7854 << "' snapid " << snapid << dendl;
7855
7856 if (!cur->is_dir()) {
7857 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7858 return -ENOTDIR;
7859 }
7860
7861 // walk into snapdir?
7862 if (path[depth].length() == 0) {
7863 dout(10) << "traverse: snapdir" << dendl;
7864 if (!mdr)
7865 return -EINVAL;
7866 snapid = CEPH_SNAPDIR;
7867 mdr->snapid = snapid;
7868 depth++;
7869 continue;
7870 }
7871 // walk thru snapdir?
7872 if (snapid == CEPH_SNAPDIR) {
7873 if (!mdr)
7874 return -EINVAL;
7875 SnapRealm *realm = cur->find_snaprealm();
7876 snapid = realm->resolve_snapname(path[depth], cur->ino());
7877 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7878 if (!snapid)
7879 return -ENOENT;
7880 mdr->snapid = snapid;
7881 depth++;
7882 continue;
7883 }
7884
7885 // open dir
7886 frag_t fg = cur->pick_dirfrag(path[depth]);
7887 CDir *curdir = cur->get_dirfrag(fg);
7888 if (!curdir) {
7889 if (cur->is_auth()) {
7890 // parent dir frozen_dir?
7891 if (cur->is_frozen()) {
7892 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7893 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7894 return 1;
7895 }
7896 curdir = cur->get_or_open_dirfrag(this, fg);
7897 } else {
7898 // discover?
7899 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7900 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7901 null_okay);
7902 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7903 return 1;
7904 }
7905 }
7906 assert(curdir);
7907
7908 #ifdef MDS_VERIFY_FRAGSTAT
7909 if (curdir->is_complete())
7910 curdir->verify_fragstat();
7911 #endif
7912
7913 // frozen?
7914 /*
7915 if (curdir->is_frozen()) {
7916 // doh!
7917 // FIXME: traverse is allowed?
7918 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7919 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7920 if (onfinish) delete onfinish;
7921 return 1;
7922 }
7923 */
7924
7925 // Before doing dirfrag->dn lookup, compare with DamageTable's
7926 // record of which dentries were unreadable
7927 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7928 dout(4) << "traverse: stopped lookup at damaged dentry "
7929 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7930 return -EIO;
7931 }
7932
7933 // dentry
7934 CDentry *dn = curdir->lookup(path[depth], snapid);
7935 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7936
7937 // null and last_bit and xlocked by me?
7938 if (dnl && dnl->is_null() && null_okay) {
7939 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7940 if (pdnvec)
7941 pdnvec->push_back(dn);
7942 if (pin)
7943 *pin = 0;
7944 break; // done!
7945 }
7946
7947 if (dnl &&
7948 dn->lock.is_xlocked() &&
7949 dn->lock.get_xlock_by() != mdr &&
7950 !dn->lock.can_read(client) &&
7951 (dnl->is_null() || forward)) {
7952 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7953 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7954 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7955 mds->mdlog->flush();
7956 return 1;
7957 }
7958
7959 // can we conclude ENOENT?
7960 if (dnl && dnl->is_null()) {
7961 if (dn->lock.can_read(client) ||
7962 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
7963 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
7964 if (pdnvec) {
7965 if (depth == path.depth() - 1)
7966 pdnvec->push_back(dn);
7967 else
7968 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
7969 }
7970 return -ENOENT;
7971 } else {
7972 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
7973 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7974 return 1;
7975 }
7976 }
7977
7978 if (dnl && !dnl->is_null()) {
7979 CInode *in = dnl->get_inode();
7980
7981 // do we have inode?
7982 if (!in) {
7983 assert(dnl->is_remote());
7984 // do i have it?
7985 in = get_inode(dnl->get_remote_ino());
7986 if (in) {
7987 dout(7) << "linking in remote in " << *in << dendl;
7988 dn->link_remote(dnl, in);
7989 } else {
7990 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
7991 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7992 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
7993 dout(4) << "traverse: remote dentry points to damaged ino "
7994 << *dn << dendl;
7995 return -EIO;
7996 }
7997 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
7998 (null_okay && depth == path.depth() - 1));
7999 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8000 return 1;
8001 }
8002 }
8003
8004 cur = in;
8005 // make sure snaprealms are open...
8006 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8007 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8008 return 1;
8009 }
8010
8011 // add to trace, continue.
8012 touch_inode(cur);
8013 if (pdnvec)
8014 pdnvec->push_back(dn);
8015 if (pin)
8016 *pin = cur;
8017 depth++;
8018 continue;
8019 }
8020
8021
8022 // MISS. dentry doesn't exist.
8023 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8024
8025 if (curdir->is_auth()) {
8026 // dentry is mine.
8027 if (curdir->is_complete() ||
8028 (snapid == CEPH_NOSNAP &&
8029 curdir->has_bloom() &&
8030 !curdir->is_in_bloom(path[depth]))){
8031 // file not found
8032 if (pdnvec) {
8033 // instantiate a null dn?
8034 if (depth < path.depth()-1){
8035 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8036 dn = NULL;
8037 } else if (dn) {
8038 ceph_abort(); // should have fallen out in ->is_null() check above
8039 } else if (curdir->is_frozen()) {
8040 dout(20) << " not adding null to frozen dir " << dendl;
8041 } else if (snapid < CEPH_MAXSNAP) {
8042 dout(20) << " not adding null for snapid " << snapid << dendl;
8043 } else {
8044 // create a null dentry
8045 dn = curdir->add_null_dentry(path[depth]);
8046 dout(20) << " added null " << *dn << dendl;
8047 }
8048 if (dn)
8049 pdnvec->push_back(dn);
8050 else
8051 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8052 }
8053 return -ENOENT;
8054 } else {
8055
8056 // Check DamageTable for missing fragments before trying to fetch
8057 // this
8058 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8059 dout(4) << "traverse: damaged dirfrag " << *curdir
8060 << ", blocking fetch" << dendl;
8061 return -EIO;
8062 }
8063
8064 // directory isn't complete; reload
8065 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8066 touch_inode(cur);
8067 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8068 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8069 return 1;
8070 }
8071 } else {
8072 // dirfrag/dentry is not mine.
8073 mds_authority_t dauth = curdir->authority();
8074
8075 if (forward &&
8076 snapid && mdr && mdr->client_request &&
8077 (int)depth < mdr->client_request->get_num_fwd()) {
8078 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8079 << " < fwd " << mdr->client_request->get_num_fwd()
8080 << ", discovering instead of forwarding" << dendl;
8081 discover = true;
8082 }
8083
8084 if ((discover || null_okay)) {
8085 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8086 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8087 null_okay);
8088 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8089 return 1;
8090 }
8091 if (forward) {
8092 // forward
8093 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8094
8095 if (curdir->is_ambiguous_auth()) {
8096 // wait
8097 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8098 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8099 return 1;
8100 }
8101
8102 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8103
8104 if (mdr)
8105 request_forward(mdr, dauth.first);
8106 else
8107 mds->forward_message_mds(req, dauth.first);
8108
8109 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8110 assert(fin == NULL);
8111 return 2;
8112 }
8113 }
8114
8115 ceph_abort(); // i shouldn't get here
8116 }
8117
8118 // success.
8119 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8120 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8121 if (mdr)
8122 assert(mdr->snapid == snapid);
8123 return 0;
8124 }
8125
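// note: cache_traverse() below is the purely in-memory counterpart of
// path_traverse(): it walks the already-cached dentries for `fp` (head snapshot
// only) and returns NULL on any miss, without doing I/O, taking locks, or
// queueing waiters.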
8126 CInode *MDCache::cache_traverse(const filepath& fp)
8127 {
8128 dout(10) << "cache_traverse " << fp << dendl;
8129
8130 CInode *in;
8131 if (fp.get_ino())
8132 in = get_inode(fp.get_ino());
8133 else
8134 in = root;
8135 if (!in)
8136 return NULL;
8137
8138 for (unsigned i = 0; i < fp.depth(); i++) {
8139 boost::string_view dname = fp[i];
8140 frag_t fg = in->pick_dirfrag(dname);
8141 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8142 CDir *curdir = in->get_dirfrag(fg);
8143 if (!curdir)
8144 return NULL;
8145 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8146 if (!dn)
8147 return NULL;
8148 in = dn->get_linkage()->get_inode();
8149 if (!in)
8150 return NULL;
8151 }
8152 dout(10) << " got " << *in << dendl;
8153 return in;
8154 }
8155
8156
8157 /**
8158 * open_remote_dirfrag -- open up a remote dirfrag
8159 *
8160 * @param diri base inode
8161 * @param approxfg approximate fragment.
8162 * @param fin completion callback
8163 */
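// illustrative call site (copied from open_ino_traverse_dir() later in this
// file; not compiled here):
#if 0
open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
#endif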
8164 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8165 {
8166 dout(10) << "open_remote_dirfrag on " << *diri << dendl;
8167 assert(diri->is_dir());
8168 assert(!diri->is_auth());
8169 assert(diri->get_dirfrag(approxfg) == 0);
8170
8171 discover_dir_frag(diri, approxfg, fin);
8172 }
8173
8174
8175 /**
8176 * get_dentry_inode - get or open inode
8177 *
8178 * @param dn the dentry
8179 * @param mdr current request
8180 *
8181 * will return inode for primary, or link up/open up remote link's inode as necessary.
8182 * If it's not available right now, puts mdr on wait list and returns null.
8183 */
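// hypothetical caller sketch (not compiled): a NULL return means the request
// has already been queued for retry via open_remote_dentry(), so the caller
// simply bails out.
#if 0
CInode *in = get_dentry_inode(dn, mdr, true /* projected */);
if (!in)
  return;   // mdr will be retried once the remote inode is opened
#endif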
8184 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8185 {
8186 CDentry::linkage_t *dnl;
8187 if (projected)
8188 dnl = dn->get_projected_linkage();
8189 else
8190 dnl = dn->get_linkage();
8191
8192 assert(!dnl->is_null());
8193
8194 if (dnl->is_primary())
8195 return dnl->inode;
8196
8197 assert(dnl->is_remote());
8198 CInode *in = get_inode(dnl->get_remote_ino());
8199 if (in) {
8200 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8201 dn->link_remote(dnl, in);
8202 return in;
8203 } else {
8204 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8205 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8206 return 0;
8207 }
8208 }
8209
8210 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8211 CDentry *dn;
8212 inodeno_t ino;
8213 MDSInternalContextBase *onfinish;
8214 bool want_xlocked;
8215 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8216 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8217 dn->get(MDSCacheObject::PIN_PTRWAITER);
8218 }
8219 void finish(int r) override {
8220 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8221 dn->put(MDSCacheObject::PIN_PTRWAITER);
8222 }
8223 };
8224
8225 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8226 {
8227 dout(10) << "open_remote_dentry " << *dn << dendl;
8228 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8229 inodeno_t ino = dnl->get_remote_ino();
8230 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8231 open_ino(ino, pool,
8232 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8233 }
8234
8235 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8236 bool want_xlocked, int r)
8237 {
8238 if (r < 0) {
8239 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8240 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8241 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8242 dn->state_set(CDentry::STATE_BADREMOTEINO);
8243
8244 std::string path;
8245 CDir *dir = dn->get_dir();
8246 if (dir) {
8247 dir->get_inode()->make_path_string(path);
8248 path += "/";
8249 path += std::string(dn->get_name());
8250 }
8251
8252 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8253 if (fatal) {
8254 mds->damaged();
8255 ceph_abort(); // unreachable, damaged() respawns us
8256 }
8257 } else {
8258 r = 0;
8259 }
8260 }
8261 fin->complete(r < 0 ? r : 0);
8262 }
8263
8264
8265 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8266 {
8267 // empty trace if we're a base inode
8268 if (in->is_base())
8269 return;
8270
8271 CInode *parent = in->get_parent_inode();
8272 assert(parent);
8273 make_trace(trace, parent);
8274
8275 CDentry *dn = in->get_parent_dn();
8276 dout(15) << "make_trace adding " << *dn << dendl;
8277 trace.push_back(dn);
8278 }
8279
8280
8281 // -------------------------------------------------------------------------------
8282 // Open inode by inode number
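// Overview of the open-by-ino machinery below: open_ino() is the entry point;
// do_open_ino() then either (a) fetches the inode's backtrace object from RADOS
// (C_IO_MDC_OpenInoBacktraceFetched), (b) walks the known ancestors toward the
// inode via open_ino_traverse_dir(), fetching or discovering dirfrags as
// needed, or (c) asks other MDS ranks via MMDSOpenIno / do_open_ino_peer().
// Completion (or failure) is delivered to the queued waiters by
// open_ino_finish().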
8283
8284 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8285 inodeno_t ino;
8286 public:
8287 bufferlist bl;
8288 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8289 MDCacheIOContext(c), ino(i) {}
8290 void finish(int r) override {
8291 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8292 }
8293 };
8294
8295 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8296 inodeno_t ino;
8297 MMDSOpenIno *msg;
8298 bool parent;
8299 public:
8300 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8301 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8302 void finish(int r) override {
8303 if (r < 0 && !parent)
8304 r = -EAGAIN;
8305 if (msg) {
8306 mdcache->handle_open_ino(msg, r);
8307 return;
8308 }
8309 assert(mdcache->opening_inodes.count(ino));
8310 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8311 }
8312 };
8313
8314 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8315 inodeno_t ino;
8316 public:
8317 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8318 void finish(int r) override {
8319 mdcache->_open_ino_parent_opened(ino, r);
8320 }
8321 };
8322
8323 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8324 {
8325 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8326
8327 assert(opening_inodes.count(ino));
8328 open_ino_info_t& info = opening_inodes[ino];
8329
8330 CInode *in = get_inode(ino);
8331 if (in) {
8332 dout(10) << " found cached " << *in << dendl;
8333 open_ino_finish(ino, info, in->authority().first);
8334 return;
8335 }
8336
8337 inode_backtrace_t backtrace;
8338 if (err == 0) {
8339 try {
8340 ::decode(backtrace, bl);
8341 } catch (const buffer::error &decode_exc) {
8342 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8343 << std::dec << ": " << decode_exc << dendl;
8344 open_ino_finish(ino, info, -EIO);
8345 return;
8346 }
8347 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8348 dout(10) << " old object in pool " << info.pool
8349 << ", retrying pool " << backtrace.pool << dendl;
8350 info.pool = backtrace.pool;
8351 C_IO_MDC_OpenInoBacktraceFetched *fin =
8352 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8353 fetch_backtrace(ino, info.pool, fin->bl,
8354 new C_OnFinisher(fin, mds->finisher));
8355 return;
8356 }
8357 } else if (err == -ENOENT) {
8358 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8359 if (info.pool != meta_pool) {
8360 dout(10) << " no object in pool " << info.pool
8361 << ", retrying pool " << meta_pool << dendl;
8362 info.pool = meta_pool;
8363 C_IO_MDC_OpenInoBacktraceFetched *fin =
8364 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8365 fetch_backtrace(ino, info.pool, fin->bl,
8366 new C_OnFinisher(fin, mds->finisher));
8367 return;
8368 }
8369 err = 0; // backtrace.ancestors.empty() is checked below
8370 }
8371
8372 if (err == 0) {
8373 if (backtrace.ancestors.empty()) {
8374 dout(10) << " got empty backtrace " << dendl;
8375 err = -EIO;
8376 } else if (!info.ancestors.empty()) {
8377 if (info.ancestors[0] == backtrace.ancestors[0]) {
8378 dout(10) << " got same parent " << info.ancestors[0] << " twice" << dendl;
8379 err = -EINVAL;
8380 } else {
8381 info.last_err = 0;
8382 }
8383 }
8384 }
8385 if (err) {
8386 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8387 if (info.last_err)
8388 err = info.last_err;
8389 open_ino_finish(ino, info, err);
8390 return;
8391 }
8392
8393 dout(10) << " got backtrace " << backtrace << dendl;
8394 info.ancestors = backtrace.ancestors;
8395
8396 _open_ino_traverse_dir(ino, info, 0);
8397 }
8398
8399 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8400 {
8401 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8402
8403 assert(opening_inodes.count(ino));
8404 open_ino_info_t& info = opening_inodes[ino];
8405
8406 CInode *in = get_inode(ino);
8407 if (in) {
8408 dout(10) << " found cached " << *in << dendl;
8409 open_ino_finish(ino, info, in->authority().first);
8410 return;
8411 }
8412
8413 if (ret == mds->get_nodeid()) {
8414 _open_ino_traverse_dir(ino, info, 0);
8415 } else {
8416 if (ret >= 0) {
8417 mds_rank_t checked_rank = mds_rank_t(ret);
8418 info.check_peers = true;
8419 info.auth_hint = checked_rank;
8420 info.checked.erase(checked_rank);
8421 }
8422 do_open_ino(ino, info, ret);
8423 }
8424 }
8425
8426 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8427 {
8428 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8429
8430 CInode *in = get_inode(ino);
8431 if (in) {
8432 dout(10) << " found cached " << *in << dendl;
8433 open_ino_finish(ino, info, in->authority().first);
8434 return;
8435 }
8436
8437 if (ret) {
8438 do_open_ino(ino, info, ret);
8439 return;
8440 }
8441
8442 mds_rank_t hint = info.auth_hint;
8443 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8444 info.discover, info.want_xlocked, &hint);
8445 if (ret > 0)
8446 return;
8447 if (hint != mds->get_nodeid())
8448 info.auth_hint = hint;
8449 do_open_ino(ino, info, ret);
8450 }
8451
8452 void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8453 {
8454 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8455 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8456 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8457 }
8458
8459 int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8460 vector<inode_backpointer_t>& ancestors,
8461 bool discover, bool want_xlocked, mds_rank_t *hint)
8462 {
8463 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8464 int err = 0;
8465 for (unsigned i = 0; i < ancestors.size(); i++) {
8466 CInode *diri = get_inode(ancestors[i].dirino);
8467
8468 if (!diri) {
8469 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8470 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8471 return 1;
8472 }
8473 continue;
8474 }
8475
8476 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8477 CDir *dir = diri->get_parent_dir();
8478 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8479 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8480 dir = dir->get_inode()->get_parent_dir();
8481 _open_ino_fetch_dir(ino, m, dir, i == 0);
8482 return 1;
8483 }
8484
8485 if (!diri->is_dir()) {
8486 dout(10) << " " << *diri << " is not a dir" << dendl;
8487 if (i == 0)
8488 err = -ENOTDIR;
8489 break;
8490 }
8491
8492 string &name = ancestors[i].dname;
8493 frag_t fg = diri->pick_dirfrag(name);
8494 CDir *dir = diri->get_dirfrag(fg);
8495 if (!dir) {
8496 if (diri->is_auth()) {
8497 if (diri->is_frozen()) {
8498 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8499 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8500 return 1;
8501 }
8502 dir = diri->get_or_open_dirfrag(this, fg);
8503 } else if (discover) {
8504 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8505 return 1;
8506 }
8507 }
8508 if (dir) {
8509 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8510 CDentry *dn = dir->lookup(name);
8511 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8512 if (dir->is_auth()) {
8513 if (dnl && dnl->is_primary() &&
8514 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8515 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8516 _open_ino_fetch_dir(ino, m, dir, i == 0);
8517 return 1;
8518 }
8519
8520 if (!dnl && !dir->is_complete() &&
8521 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8522 dout(10) << " fetching incomplete " << *dir << dendl;
8523 _open_ino_fetch_dir(ino, m, dir, i == 0);
8524 return 1;
8525 }
8526
8527 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8528 if (i == 0)
8529 err = -ENOENT;
8530 } else if (discover) {
8531 if (!dnl) {
8532 filepath path(name, 0);
8533 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8534 (i == 0 && want_xlocked));
8535 return 1;
8536 }
8537 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8538 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8539 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8540 return 1;
8541 }
8542 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8543 if (i == 0)
8544 err = -ENOENT;
8545 }
8546 }
8547 if (hint && i == 0)
8548 *hint = dir ? dir->authority().first : diri->authority().first;
8549 break;
8550 }
8551 return err;
8552 }
8553
8554 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8555 {
8556 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8557
8558 list<MDSInternalContextBase*> waiters;
8559 waiters.swap(info.waiters);
8560 opening_inodes.erase(ino);
8561 finish_contexts(g_ceph_context, waiters, ret);
8562 }
8563
8564 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8565 {
8566 if (err < 0 && err != -EAGAIN) {
8567 info.checked.clear();
8568 info.checking = MDS_RANK_NONE;
8569 info.check_peers = true;
8570 info.fetch_backtrace = true;
8571 if (info.discover) {
8572 info.discover = false;
8573 info.ancestors.clear();
8574 }
8575 if (err != -ENOENT && err != -ENOTDIR)
8576 info.last_err = err;
8577 }
8578
8579 if (info.check_peers || info.discover) {
8580 if (info.discover) {
8581 // got backtrace from peer, but failed to find inode. re-check peers
8582 info.discover = false;
8583 info.ancestors.clear();
8584 info.checked.clear();
8585 }
8586 info.check_peers = false;
8587 info.checking = MDS_RANK_NONE;
8588 do_open_ino_peer(ino, info);
8589 } else if (info.fetch_backtrace) {
8590 info.check_peers = true;
8591 info.fetch_backtrace = false;
8592 info.checking = mds->get_nodeid();
8593 info.checked.clear();
8594 C_IO_MDC_OpenInoBacktraceFetched *fin =
8595 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8596 fetch_backtrace(ino, info.pool, fin->bl,
8597 new C_OnFinisher(fin, mds->finisher));
8598 } else {
8599 assert(!info.ancestors.empty());
8600 info.checking = mds->get_nodeid();
8601 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8602 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8603 }
8604 }
8605
8606 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8607 {
8608 set<mds_rank_t> all, active;
8609 mds->mdsmap->get_mds_set(all);
8610 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8611 if (mds->get_state() == MDSMap::STATE_REJOIN)
8612 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8613
8614 dout(10) << "do_open_ino_peer " << ino << " active " << active
8615 << " all " << all << " checked " << info.checked << dendl;
8616
8617 mds_rank_t peer = MDS_RANK_NONE;
8618 if (info.auth_hint >= 0) {
8619 if (active.count(info.auth_hint)) {
8620 peer = info.auth_hint;
8621 info.auth_hint = MDS_RANK_NONE;
8622 }
8623 } else {
8624 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8625 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8626 peer = *p;
8627 break;
8628 }
8629 }
8630 if (peer < 0) {
8631 all.erase(mds->get_nodeid());
8632 if (all != info.checked) {
8633 dout(10) << " waiting for more peers to be active" << dendl;
8634 } else {
8635 dout(10) << " all MDS peers have been checked " << dendl;
8636 do_open_ino(ino, info, 0);
8637 }
8638 } else {
8639 info.checking = peer;
8640 vector<inode_backpointer_t> *pa = NULL;
8641 // got backtrace from peer or backtrace just fetched
8642 if (info.discover || !info.fetch_backtrace)
8643 pa = &info.ancestors;
8644 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8645 }
8646 }
8647
8648 void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8649 {
8650 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8651 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8652 m->put();
8653 return;
8654 }
8655
8656 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8657
8658 inodeno_t ino = m->ino;
8659 MMDSOpenInoReply *reply;
8660 CInode *in = get_inode(ino);
8661 if (in) {
8662 dout(10) << " have " << *in << dendl;
8663 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8664 if (in->is_auth()) {
8665 touch_inode(in);
8666 while (1) {
8667 CDentry *pdn = in->get_parent_dn();
8668 if (!pdn)
8669 break;
8670 CInode *diri = pdn->get_dir()->get_inode();
8671 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
8672 in->inode.version));
8673 in = diri;
8674 }
8675 } else {
8676 reply->hint = in->authority().first;
8677 }
8678 } else if (err < 0) {
8679 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8680 } else {
8681 mds_rank_t hint = MDS_RANK_NONE;
8682 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8683 if (ret > 0)
8684 return;
8685 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8686 }
8687 m->get_connection()->send_message(reply);
8688 m->put();
8689 }
8690
8691 void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8692 {
8693 dout(10) << "handle_open_ino_reply " << *m << dendl;
8694
8695 inodeno_t ino = m->ino;
8696 mds_rank_t from = mds_rank_t(m->get_source().num());
8697 auto it = opening_inodes.find(ino);
8698 if (it != opening_inodes.end() && it->second.checking == from) {
8699 open_ino_info_t& info = it->second;
8700 info.checking = MDS_RANK_NONE;
8701 info.checked.insert(from);
8702
8703 CInode *in = get_inode(ino);
8704 if (in) {
8705 dout(10) << " found cached " << *in << dendl;
8706 open_ino_finish(ino, info, in->authority().first);
8707 } else if (!m->ancestors.empty()) {
8708 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8709 if (!info.want_replica) {
8710 open_ino_finish(ino, info, from);
8711 m->put();
8712 return;
8713 }
8714
8715 info.ancestors = m->ancestors;
8716 info.auth_hint = from;
8717 info.checking = mds->get_nodeid();
8718 info.discover = true;
8719 _open_ino_traverse_dir(ino, info, 0);
8720 } else if (m->error) {
8721 dout(10) << " error " << m->error << " from mds." << from << dendl;
8722 do_open_ino(ino, info, m->error);
8723 } else {
8724 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8725 info.auth_hint = m->hint;
8726 info.checked.erase(m->hint);
8727 }
8728 do_open_ino_peer(ino, info);
8729 }
8730 }
8731 m->put();
8732 }
8733
8734 void MDCache::kick_open_ino_peers(mds_rank_t who)
8735 {
8736 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8737
8738 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8739 p != opening_inodes.end();
8740 ++p) {
8741 open_ino_info_t& info = p->second;
8742 if (info.checking == who) {
8743 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8744 info.checking = MDS_RANK_NONE;
8745 do_open_ino_peer(p->first, info);
8746 } else if (info.checking == MDS_RANK_NONE) {
8747 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8748 do_open_ino_peer(p->first, info);
8749 }
8750 }
8751 }
8752
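/* Entry point for opening an inode by number.  If a lookup for this ino is
 * already in flight we just attach the waiter (upgrading want_replica /
 * want_xlocked as needed); otherwise we register a new open_ino_info_t and
 * kick things off with do_open_ino(). */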
8753 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8754 bool want_replica, bool want_xlocked)
8755 {
8756 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8757 << want_replica << dendl;
8758
8759 if (opening_inodes.count(ino)) {
8760 open_ino_info_t& info = opening_inodes[ino];
8761 if (want_replica) {
8762 info.want_replica = true;
8763 if (want_xlocked && !info.want_xlocked) {
8764 if (!info.ancestors.empty()) {
8765 CInode *diri = get_inode(info.ancestors[0].dirino);
8766 if (diri) {
8767 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8768 CDir *dir = diri->get_dirfrag(fg);
8769 if (dir && !dir->is_auth()) {
8770 filepath path(info.ancestors[0].dname, 0);
8771 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8772 }
8773 }
8774 }
8775 info.want_xlocked = true;
8776 }
8777 }
8778 info.waiters.push_back(fin);
8779 } else {
8780 open_ino_info_t& info = opening_inodes[ino];
8781 info.want_replica = want_replica;
8782 info.want_xlocked = want_xlocked;
8783 info.tid = ++open_ino_last_tid;
8784 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8785 info.waiters.push_back(fin);
8786 do_open_ino(ino, info, 0);
8787 }
8788 }
8789
8790 /* ---------------------------- */
8791
8792 /*
8793 * search for a given inode on MDS peers. optionally start with the given node.
8794
8795
8796 TODO
8797 - recover from MDS node failure / recovery
8798 - traverse path
8799
8800 */
8801 void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8802 {
8803 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8804 CInode *in = get_inode(ino);
8805 if (in && in->state_test(CInode::STATE_PURGING)) {
8806 c->complete(-ESTALE);
8807 return;
8808 }
8809 assert(!in);
8810
8811 ceph_tid_t tid = ++find_ino_peer_last_tid;
8812 find_ino_peer_info_t& fip = find_ino_peer[tid];
8813 fip.ino = ino;
8814 fip.tid = tid;
8815 fip.fin = c;
8816 fip.hint = hint;
8817 _do_find_ino_peer(fip);
8818 }
8819
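/* Pick the next peer to query for this ino: use the hint first if one was
 * given, otherwise the next active rank we have not yet checked.  If every
 * peer has been checked, fail the waiter with -ESTALE; if some peers are not
 * yet active, keep waiting. */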
8820 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8821 {
8822 set<mds_rank_t> all, active;
8823 mds->mdsmap->get_mds_set(all);
8824 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8825
8826 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8827 << " active " << active << " all " << all
8828 << " checked " << fip.checked
8829 << dendl;
8830
8831 mds_rank_t m = MDS_RANK_NONE;
8832 if (fip.hint >= 0) {
8833 m = fip.hint;
8834 fip.hint = MDS_RANK_NONE;
8835 } else {
8836 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8837 if (*p != mds->get_nodeid() &&
8838 fip.checked.count(*p) == 0) {
8839 m = *p;
8840 break;
8841 }
8842 }
8843 if (m == MDS_RANK_NONE) {
8844 all.erase(mds->get_nodeid());
8845 if (all != fip.checked) {
8846 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8847 } else {
8848 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8849 fip.fin->complete(-ESTALE);
8850 find_ino_peer.erase(fip.tid);
8851 }
8852 } else {
8853 fip.checking = m;
8854 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8855 }
8856 }
8857
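/* Answer a peer's MMDSFindIno query: reply with the path to the inode if we
 * have it cached, or with an empty path if we do not.
 * This function DOES put the passed message before returning. */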
8858 void MDCache::handle_find_ino(MMDSFindIno *m)
8859 {
8860 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8861 m->put();
8862 return;
8863 }
8864
8865 dout(10) << "handle_find_ino " << *m << dendl;
8866 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8867 CInode *in = get_inode(m->ino);
8868 if (in) {
8869 in->make_path(r->path);
8870 dout(10) << " have " << r->path << " " << *in << dendl;
8871 }
8872 m->get_connection()->send_message(r);
8873 m->put();
8874 }
8875
8876
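/* Handle the reply to one of our MMDSFindIno queries: wake the waiter if the
 * inode has shown up in our cache, traverse the returned path if the peer had
 * one, or move on and query the next peer. */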
8877 void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8878 {
8879 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8880 if (p != find_ino_peer.end()) {
8881 dout(10) << "handle_find_ino_reply " << *m << dendl;
8882 find_ino_peer_info_t& fip = p->second;
8883
8884 // success?
8885 if (get_inode(fip.ino)) {
8886 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8887 mds->queue_waiter(fip.fin);
8888 find_ino_peer.erase(p);
8889 m->put();
8890 return;
8891 }
8892
8893 mds_rank_t from = mds_rank_t(m->get_source().num());
8894 if (fip.checking == from)
8895 fip.checking = MDS_RANK_NONE;
8896 fip.checked.insert(from);
8897
8898 if (!m->path.empty()) {
8899 // we got a path!
8900 vector<CDentry*> trace;
8901 MDRequestRef null_ref;
8902 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8903 if (r > 0)
8904 return;
8905 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8906 << ", retrying" << dendl;
8907 fip.checked.clear();
8908 _do_find_ino_peer(fip);
8909 } else {
8910 // nope, continue.
8911 _do_find_ino_peer(fip);
8912 }
8913 } else {
8914 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8915 }
8916 m->put();
8917 }
8918
8919 void MDCache::kick_find_ino_peers(mds_rank_t who)
8920 {
8921 // find_ino_peers requests we should move on from
8922 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8923 p != find_ino_peer.end();
8924 ++p) {
8925 find_ino_peer_info_t& fip = p->second;
8926 if (fip.checking == who) {
8927 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8928 fip.checking = MDS_RANK_NONE;
8929 _do_find_ino_peer(fip);
8930 } else if (fip.checking == MDS_RANK_NONE) {
8931 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8932 _do_find_ino_peer(fip);
8933 }
8934 }
8935 }
8936
8937 /* ---------------------------- */
8938
8939 int MDCache::get_num_client_requests()
8940 {
8941 int count = 0;
8942 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8943 p != active_requests.end();
8944 ++p) {
8945 MDRequestRef& mdr = p->second;
8946 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8947 count++;
8948 }
8949 return count;
8950 }
8951
8952 /* This function takes over the reference to the passed Message */
8953 MDRequestRef MDCache::request_start(MClientRequest *req)
8954 {
8955 // did we win a forward race against a slave?
8956 if (active_requests.count(req->get_reqid())) {
8957 MDRequestRef& mdr = active_requests[req->get_reqid()];
8958 assert(mdr);
8959 if (mdr->is_slave()) {
8960 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8961 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8962 } else {
8963 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8964 req->put();
8965 }
8966 return MDRequestRef();
8967 }
8968
8969 // register new client request
8970 MDRequestImpl::Params params;
8971 params.reqid = req->get_reqid();
8972 params.attempt = req->get_num_fwd();
8973 params.client_req = req;
8974 params.initiated = req->get_recv_stamp();
8975 params.throttled = req->get_throttle_stamp();
8976 params.all_read = req->get_recv_complete_stamp();
8977 params.dispatched = req->get_dispatch_stamp();
8978
8979 MDRequestRef mdr =
8980 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
8981 active_requests[params.reqid] = mdr;
8982 mdr->set_op_stamp(req->get_stamp());
8983 dout(7) << "request_start " << *mdr << dendl;
8984 return mdr;
8985 }
8986
8987 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
8988 {
8989 int by = m->get_source().num();
8990 MDRequestImpl::Params params;
8991 params.reqid = ri;
8992 params.attempt = attempt;
8993 params.triggering_slave_req = m;
8994 params.slave_to = by;
8995 params.initiated = m->get_recv_stamp();
8996 params.throttled = m->get_throttle_stamp();
8997 params.all_read = m->get_recv_complete_stamp();
8998 params.dispatched = m->get_dispatch_stamp();
8999 MDRequestRef mdr =
9000 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9001 assert(active_requests.count(mdr->reqid) == 0);
9002 active_requests[mdr->reqid] = mdr;
9003 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9004 return mdr;
9005 }
9006
9007 MDRequestRef MDCache::request_start_internal(int op)
9008 {
9009 MDRequestImpl::Params params;
9010 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9011 params.reqid.tid = mds->issue_tid();
9012 params.initiated = ceph_clock_now();
9013 params.internal_op = op;
9014 MDRequestRef mdr =
9015 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9016
9017 assert(active_requests.count(mdr->reqid) == 0);
9018 active_requests[mdr->reqid] = mdr;
9019 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9020 return mdr;
9021 }
9022
9023 MDRequestRef MDCache::request_get(metareqid_t rid)
9024 {
9025 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9026 assert(p != active_requests.end());
9027 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9028 return p->second;
9029 }
9030
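// Finish a request: if a slave commit/rollback is pending, run it first (it
// re-enters request_finish); otherwise bump the internal-op counters and
// clean the request up.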
9031 void MDCache::request_finish(MDRequestRef& mdr)
9032 {
9033 dout(7) << "request_finish " << *mdr << dendl;
9034 mdr->mark_event("finishing request");
9035
9036 // slave finisher?
9037 if (mdr->has_more() && mdr->more()->slave_commit) {
9038 Context *fin = mdr->more()->slave_commit;
9039 mdr->more()->slave_commit = 0;
9040 int ret;
9041 if (mdr->aborted) {
9042 mdr->aborted = false;
9043 ret = -1;
9044 mdr->more()->slave_rolling_back = true;
9045 } else {
9046 ret = 0;
9047 mdr->committing = true;
9048 }
9049 fin->complete(ret); // this must re-call request_finish.
9050 return;
9051 }
9052
9053 switch(mdr->internal_op) {
9054 case CEPH_MDS_OP_FRAGMENTDIR:
9055 logger->inc(l_mdss_ireq_fragmentdir);
9056 break;
9057 case CEPH_MDS_OP_EXPORTDIR:
9058 logger->inc(l_mdss_ireq_exportdir);
9059 break;
9060 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9061 logger->inc(l_mdss_ireq_enqueue_scrub);
9062 break;
9063 case CEPH_MDS_OP_FLUSH:
9064 logger->inc(l_mdss_ireq_flush);
9065 break;
9066 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9067 logger->inc(l_mdss_ireq_fragstats);
9068 break;
9069 case CEPH_MDS_OP_REPAIR_INODESTATS:
9070 logger->inc(l_mdss_ireq_inodestats);
9071 break;
9072 }
9073
9074 request_cleanup(mdr);
9075 }
9076
9077
9078 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9079 {
9080 mdr->mark_event("forwarding request");
9081 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9082 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9083 << *mdr->client_request << dendl;
9084 mds->forward_message_mds(mdr->client_request, who);
9085 mdr->client_request = 0;
9086 if (mds->logger) mds->logger->inc(l_mds_forward);
9087 } else if (mdr->internal_op >= 0) {
9088 dout(10) << "request_forward on internal op; cancelling" << dendl;
9089 mdr->internal_op_finish->complete(-EXDEV);
9090 } else {
9091 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9092 << " was from mds" << dendl;
9093 }
9094 request_cleanup(mdr);
9095 }
9096
9097
9098 void MDCache::dispatch_request(MDRequestRef& mdr)
9099 {
9100 if (mdr->client_request) {
9101 mds->server->dispatch_client_request(mdr);
9102 } else if (mdr->slave_request) {
9103 mds->server->dispatch_slave_request(mdr);
9104 } else {
9105 switch (mdr->internal_op) {
9106 case CEPH_MDS_OP_FRAGMENTDIR:
9107 dispatch_fragment_dir(mdr);
9108 break;
9109 case CEPH_MDS_OP_EXPORTDIR:
9110 migrator->dispatch_export_dir(mdr, 0);
9111 break;
9112 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9113 enqueue_scrub_work(mdr);
9114 break;
9115 case CEPH_MDS_OP_FLUSH:
9116 flush_dentry_work(mdr);
9117 break;
9118 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9119 repair_dirfrag_stats_work(mdr);
9120 break;
9121 case CEPH_MDS_OP_REPAIR_INODESTATS:
9122 repair_inode_stats_work(mdr);
9123 break;
9124 default:
9125 ceph_abort();
9126 }
9127 }
9128 }
9129
9130
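// Tell all slaves this request is finished (or aborted) and forget any
// foreign xlocks / remote wrlocks we were tracking for it; the OP_FINISH on
// the slave side drops the actual lock state.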
9131 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9132 {
9133 if (!mdr->has_more())
9134 return;
9135
9136 // clean up slaves
9137 // (will implicitly drop remote dn pins)
9138 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9139 p != mdr->more()->slaves.end();
9140 ++p) {
9141 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9142 MMDSSlaveRequest::OP_FINISH);
9143
9144 if (mdr->killed && !mdr->committing) {
9145 r->mark_abort();
9146 } else if (mdr->more()->srcdn_auth_mds == *p &&
9147 mdr->more()->inode_import.length() > 0) {
9148 // information about rename imported caps
9149 r->inode_export.claim(mdr->more()->inode_import);
9150 }
9151
9152 mds->send_message_mds(r, *p);
9153 }
9154
9155 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9156 * implicitly. Note that we don't call the finishers -- there shouldn't
9157 * be any on a remote lock and the request finish wakes up all
9158 * the waiters anyway! */
9159 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9160 while (p != mdr->xlocks.end()) {
9161 if ((*p)->get_parent()->is_auth())
9162 ++p;
9163 else {
9164 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9165 << " on " << *(*p)->get_parent() << dendl;
9166 (*p)->put_xlock();
9167 mdr->locks.erase(*p);
9168 mdr->xlocks.erase(p++);
9169 }
9170 }
9171
9172 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9173 while (q != mdr->remote_wrlocks.end()) {
9174 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9175 << " on mds." << q->second
9176 << " on " << *(q->first)->get_parent() << dendl;
9177 mdr->locks.erase(q->first);
9178 mdr->remote_wrlocks.erase(q++);
9179 }
9180
9181 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9182 * leaving them in can cause double-notifies as
9183 * this function can get called more than once */
9184 }
9185
9186 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9187 {
9188 request_drop_foreign_locks(mdr);
9189 mds->locker->drop_non_rdlocks(mdr.get());
9190 }
9191
9192 void MDCache::request_drop_locks(MDRequestRef& mdr)
9193 {
9194 request_drop_foreign_locks(mdr);
9195 mds->locker->drop_locks(mdr.get());
9196 }
9197
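// Release everything a request holds: locks, auth pins, sticky dirs and cache
// pins, then remove it from the session list and the active_requests map.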
9198 void MDCache::request_cleanup(MDRequestRef& mdr)
9199 {
9200 dout(15) << "request_cleanup " << *mdr << dendl;
9201
9202 if (mdr->has_more()) {
9203 if (mdr->more()->is_ambiguous_auth)
9204 mdr->clear_ambiguous_auth();
9205 if (!mdr->more()->waiting_for_finish.empty())
9206 mds->queue_waiters(mdr->more()->waiting_for_finish);
9207 }
9208
9209 request_drop_locks(mdr);
9210
9211 // drop (local) auth pins
9212 mdr->drop_local_auth_pins();
9213
9214 // drop stickydirs
9215 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9216 p != mdr->stickydirs.end();
9217 ++p)
9218 (*p)->put_stickydirs();
9219
9220 mds->locker->kick_cap_releases(mdr);
9221
9222 // drop cache pins
9223 mdr->drop_pins();
9224
9225 // remove from session
9226 mdr->item_session_request.remove_myself();
9227
9228 // remove from map
9229 active_requests.erase(mdr->reqid);
9230
9231 if (mds->logger)
9232 log_stat();
9233
9234 mdr->mark_event("cleaned up request");
9235 }
9236
9237 void MDCache::request_kill(MDRequestRef& mdr)
9238 {
9239 // rolling back slave requests is tricky; just let the request proceed.
9240 if (mdr->has_more() &&
9241 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9242 if (!mdr->done_locking) {
9243 assert(mdr->more()->witnessed.empty());
9244 mdr->aborted = true;
9245 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9246 } else {
9247 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9248 }
9249
9250 assert(mdr->used_prealloc_ino == 0);
9251 assert(mdr->prealloc_inos.empty());
9252
9253 mdr->session = NULL;
9254 mdr->item_session_request.remove_myself();
9255 return;
9256 }
9257
9258 mdr->killed = true;
9259 mdr->mark_event("killing request");
9260
9261 if (mdr->committing) {
9262 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9263 } else {
9264 dout(10) << "request_kill " << *mdr << dendl;
9265 request_cleanup(mdr);
9266 }
9267 }
9268
9269 // -------------------------------------------------------------------------------
9270 // SNAPREALMS
9271
9272 struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9273 MDRequestRef mdr;
9274 MutationRef mut;
9275 CInode *in;
9276 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9277 MutationRef& mu, CInode *i) :
9278 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9279 void finish(int r) override {
9280 mdcache->_snaprealm_create_finish(mdr, mut, in);
9281 }
9282 };
9283
9284 void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9285 {
9286 dout(10) << "snaprealm_create " << *in << dendl;
9287 assert(!in->snaprealm);
9288
9289 // allocate an id..
9290 if (!mdr->more()->stid) {
9291 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9292 new C_MDS_RetryRequest(this, mdr));
9293 return;
9294 }
9295
9296 MutationRef mut(new MutationImpl());
9297 mut->ls = mds->mdlog->get_current_segment();
9298 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9299 mds->mdlog->start_entry(le);
9300
9301 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9302
9303 auto &pi = in->project_inode(false, true);
9304 pi.inode.version = in->pre_dirty();
9305 pi.inode.rstat.rsnaprealms++;
9306
9307 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9308 snapid_t seq;
9309 ::decode(seq, p);
9310
9311 auto &newsnap = *pi.snapnode;
9312 newsnap.created = seq;
9313 newsnap.seq = seq;
9314 newsnap.last_created = seq;
9315
9316 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9317 journal_cow_inode(mut, &le->metablob, in);
9318 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9319
9320 mds->server->submit_mdlog_entry(le,
9321 new C_MDC_snaprealm_create_finish(this, mdr,
9322 mut, in),
9323 mdr, __func__);
9324 mds->mdlog->flush();
9325 }
9326
9327
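/* Walk this snaprealm and its open children, invalidating cached snap sets
 * and (unless nosend) queueing an MClientSnap update for every client with
 * caps in the affected realms.  On update/destroy we also descend into past
 * children, and on destroy we re-evaluate stray inodes under them. */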
9328 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9329 {
9330 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9331
9332 vector<inodeno_t> split_inos;
9333 vector<inodeno_t> split_realms;
9334
9335 if (snapop == CEPH_SNAP_OP_SPLIT) {
9336 // notify clients of update|split
9337 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9338 !p.end(); ++p)
9339 split_inos.push_back((*p)->ino());
9340
9341 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9342 p != in->snaprealm->open_children.end();
9343 ++p)
9344 split_realms.push_back((*p)->inode->ino());
9345 }
9346
9347 bufferlist snapbl;
9348 in->snaprealm->build_snap_trace(snapbl);
9349
9350 set<SnapRealm*> past_children;
9351 map<client_t, MClientSnap*> updates;
9352 list<SnapRealm*> q;
9353 q.push_back(in->snaprealm);
9354 while (!q.empty()) {
9355 SnapRealm *realm = q.front();
9356 q.pop_front();
9357
9358 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9359 realm->invalidate_cached_snaps();
9360
9361 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9362 p != realm->client_caps.end();
9363 ++p) {
9364 assert(!p->second->empty());
9365 if (!nosend && updates.count(p->first) == 0) {
9366 MClientSnap *update = new MClientSnap(snapop);
9367 update->head.split = in->ino();
9368 update->split_inos = split_inos;
9369 update->split_realms = split_realms;
9370 update->bl = snapbl;
9371 updates[p->first] = update;
9372 }
9373 }
9374
9375 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9376 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9377 p != realm->open_past_children.end();
9378 ++p)
9379 past_children.insert(*p);
9380 }
9381
9382 // notify for active children, too.
9383 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9384 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9385 p != realm->open_children.end();
9386 ++p)
9387 q.push_back(*p);
9388 }
9389
9390 if (!nosend)
9391 send_snaps(updates);
9392
9393 // notify past children and their descendants if we update/delete old snapshots
9394 for (set<SnapRealm*>::iterator p = past_children.begin();
9395 p != past_children.end();
9396 ++p)
9397 q.push_back(*p);
9398
9399 while (!q.empty()) {
9400 SnapRealm *realm = q.front();
9401 q.pop_front();
9402
9403 realm->invalidate_cached_snaps();
9404
9405 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9406 p != realm->open_children.end();
9407 ++p) {
9408 if (past_children.count(*p) == 0)
9409 q.push_back(*p);
9410 }
9411
9412 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9413 p != realm->open_past_children.end();
9414 ++p) {
9415 if (past_children.count(*p) == 0) {
9416 q.push_back(*p);
9417 past_children.insert(*p);
9418 }
9419 }
9420 }
9421
9422 if (snapop == CEPH_SNAP_OP_DESTROY) {
9423 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9424 for (set<SnapRealm*>::iterator p = past_children.begin();
9425 p != past_children.end();
9426 ++p)
9427 maybe_eval_stray((*p)->inode, true);
9428 }
9429 }
9430
9431 void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9432 {
9433 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9434
9435 // apply
9436 in->pop_and_dirty_projected_inode(mut->ls);
9437 mut->apply();
9438 mds->locker->drop_locks(mut.get());
9439 mut->cleanup();
9440
9441 // tell table we've committed
9442 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9443
9444 // create
9445 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9446 snapid_t seq;
9447 ::decode(seq, p);
9448
9449 in->open_snaprealm();
9450 in->snaprealm->srnode.seq = seq;
9451 in->snaprealm->srnode.created = seq;
9452 bool ok = in->snaprealm->_open_parents(NULL);
9453 assert(ok);
9454
9455 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9456
9457 /*
9458 static int count = 5;
9459 if (--count == 0)
9460 ceph_abort(); // hack test test **********
9461 */
9462
9463 // done.
9464 mdr->more()->stid = 0; // caller will likely need to reuse this
9465 dispatch_request(mdr);
9466 }
9467
9468
9469 // -------------------------------------------------------------------------------
9470 // STRAYS
9471
9472 struct C_MDC_RetryScanStray : public MDCacheContext {
9473 dirfrag_t next;
9474 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9475 void finish(int r) override {
9476 mdcache->scan_stray_dir(next);
9477 }
9478 };
9479
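// Scan the stray directories (resuming at 'next'), fetching any incomplete
// dirfrag, and mark every dentry as stray; primary inodes with nlink == 0 are
// flagged ORPHAN and considered for purging via maybe_eval_stray().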
9480 void MDCache::scan_stray_dir(dirfrag_t next)
9481 {
9482 dout(10) << "scan_stray_dir " << next << dendl;
9483
9484 list<CDir*> ls;
9485 for (int i = 0; i < NUM_STRAY; ++i) {
9486 if (strays[i]->ino() < next.ino)
9487 continue;
9488 strays[i]->get_dirfrags(ls);
9489 }
9490
9491 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9492 CDir *dir = *p;
9493 if (dir->dirfrag() < next)
9494 continue;
9495 if (!dir->is_complete()) {
9496 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9497 return;
9498 }
9499 for (auto &p : dir->items) {
9500 CDentry *dn = p.second;
9501 dn->state_set(CDentry::STATE_STRAY);
9502 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9503 if (dnl->is_primary()) {
9504 CInode *in = dnl->get_inode();
9505 if (in->inode.nlink == 0)
9506 in->state_set(CInode::STATE_ORPHAN);
9507 maybe_eval_stray(in);
9508 }
9509 }
9510 }
9511 }
9512
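// Read an inode's backtrace: fetch the "parent" xattr of its first object in
// the given pool via the Objecter.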
9513 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9514 {
9515 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9516 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9517 }
9518
9519
9520
9521
9522
9523 // ========================================================================================
9524 // DISCOVER
9525 /*
9526
9527 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9528 to the parent metadata object in the cache (pinning it).
9529
9530 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9531
9532 */
9533
9534 void MDCache::_send_discover(discover_info_t& d)
9535 {
9536 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9537 d.want_base_dir, d.want_xlocked);
9538 dis->set_tid(d.tid);
9539 mds->send_message_mds(dis, d.mds);
9540 }
9541
9542 void MDCache::discover_base_ino(inodeno_t want_ino,
9543 MDSInternalContextBase *onfinish,
9544 mds_rank_t from)
9545 {
9546 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9547 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9548 discover_info_t& d = _create_discover(from);
9549 d.ino = want_ino;
9550 _send_discover(d);
9551 }
9552 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9553 }
9554
9555
9556 void MDCache::discover_dir_frag(CInode *base,
9557 frag_t approx_fg,
9558 MDSInternalContextBase *onfinish,
9559 mds_rank_t from)
9560 {
9561 if (from < 0)
9562 from = base->authority().first;
9563
9564 dirfrag_t df(base->ino(), approx_fg);
9565 dout(7) << "discover_dir_frag " << df
9566 << " from mds." << from << dendl;
9567
9568 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9569 discover_info_t& d = _create_discover(from);
9570 d.pin_base(base);
9571 d.ino = base->ino();
9572 d.frag = approx_fg;
9573 d.want_base_dir = true;
9574 _send_discover(d);
9575 }
9576
9577 if (onfinish)
9578 base->add_dir_waiter(approx_fg, onfinish);
9579 }
9580
9581 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9582 CInode *base;
9583 snapid_t snapid;
9584 filepath path;
9585 mds_rank_t from;
9586 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9587 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9588 void finish(int r) override {
9589 mdcache->discover_path(base, snapid, path, 0, from);
9590 }
9591 };
9592
9593 void MDCache::discover_path(CInode *base,
9594 snapid_t snap,
9595 filepath want_path,
9596 MDSInternalContextBase *onfinish,
9597 bool want_xlocked,
9598 mds_rank_t from)
9599 {
9600 if (from < 0)
9601 from = base->authority().first;
9602
9603 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9604 << (want_xlocked ? " want_xlocked":"")
9605 << dendl;
9606
9607 if (base->is_ambiguous_auth()) {
9608 dout(10) << " waiting for single auth on " << *base << dendl;
9609 if (!onfinish)
9610 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9611 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9612 return;
9613 } else if (from == mds->get_nodeid()) {
9614 list<MDSInternalContextBase*> finished;
9615 base->take_waiting(CInode::WAIT_DIR, finished);
9616 mds->queue_waiters(finished);
9617 return;
9618 }
9619
9620 frag_t fg = base->pick_dirfrag(want_path[0]);
9621 if ((want_xlocked && want_path.depth() == 1) ||
9622 !base->is_waiting_for_dir(fg) || !onfinish) {
9623 discover_info_t& d = _create_discover(from);
9624 d.ino = base->ino();
9625 d.pin_base(base);
9626 d.frag = fg;
9627 d.snap = snap;
9628 d.want_path = want_path;
9629 d.want_base_dir = true;
9630 d.want_xlocked = want_xlocked;
9631 _send_discover(d);
9632 }
9633
9634 // register + wait
9635 if (onfinish)
9636 base->add_dir_waiter(fg, onfinish);
9637 }
9638
9639 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9640 CDir *base;
9641 snapid_t snapid;
9642 filepath path;
9643 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9644 MDCacheContext(c), base(b), snapid(s), path(p) {}
9645 void finish(int r) override {
9646 mdcache->discover_path(base, snapid, path, 0);
9647 }
9648 };
9649
9650 void MDCache::discover_path(CDir *base,
9651 snapid_t snap,
9652 filepath want_path,
9653 MDSInternalContextBase *onfinish,
9654 bool want_xlocked)
9655 {
9656 mds_rank_t from = base->authority().first;
9657
9658 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9659 << (want_xlocked ? " want_xlocked":"")
9660 << dendl;
9661
9662 if (base->is_ambiguous_auth()) {
9663 dout(7) << " waiting for single auth on " << *base << dendl;
9664 if (!onfinish)
9665 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9666 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9667 return;
9668 } else if (from == mds->get_nodeid()) {
9669 list<MDSInternalContextBase*> finished;
9670 base->take_sub_waiting(finished);
9671 mds->queue_waiters(finished);
9672 return;
9673 }
9674
9675 if ((want_xlocked && want_path.depth() == 1) ||
9676 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9677 discover_info_t& d = _create_discover(from);
9678 d.ino = base->ino();
9679 d.pin_base(base->inode);
9680 d.frag = base->get_frag();
9681 d.snap = snap;
9682 d.want_path = want_path;
9683 d.want_base_dir = false;
9684 d.want_xlocked = want_xlocked;
9685 _send_discover(d);
9686 }
9687
9688 // register + wait
9689 if (onfinish)
9690 base->add_dentry_waiter(want_path[0], snap, onfinish);
9691 }
9692
9693 void MDCache::kick_discovers(mds_rank_t who)
9694 {
9695 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9696 p != discovers.end();
9697 ++p) {
9698 if (p->second.mds != who)
9699 continue;
9700 _send_discover(p->second);
9701 }
9702 }
9703
9704
9705 /* This function DOES put the passed message before returning */
9706 void MDCache::handle_discover(MDiscover *dis)
9707 {
9708 mds_rank_t whoami = mds->get_nodeid();
9709 mds_rank_t from = mds_rank_t(dis->get_source().num());
9710
9711 assert(from != whoami);
9712
9713 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9714 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9715 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9716 dis->put();
9717 return;
9718 }
9719
9720 // proceed if the requester is in the REJOIN stage, i.e. the request is from parallel_fetch().
9721 // delay processing requests from survivors because we may not yet have chosen lock states.
9722 if (!mds->mdsmap->is_rejoin(from)) {
9723 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9724 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9725 return;
9726 }
9727 }
9728
9729
9730 CInode *cur = 0;
9731 MDiscoverReply *reply = new MDiscoverReply(dis);
9732
9733 snapid_t snapid = dis->get_snapid();
9734
9735 // get started.
9736 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9737 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9738 // wants root
9739 dout(7) << "handle_discover from mds." << from
9740 << " wants base + " << dis->get_want().get_path()
9741 << " snap " << snapid
9742 << dendl;
9743
9744 cur = get_inode(dis->get_base_ino());
9745 assert(cur);
9746
9747 // add root
9748 reply->starts_with = MDiscoverReply::INODE;
9749 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9750 dout(10) << "added base " << *cur << dendl;
9751 }
9752 else {
9753 // there's a base inode
9754 cur = get_inode(dis->get_base_ino(), snapid);
9755 if (!cur && snapid != CEPH_NOSNAP) {
9756 cur = get_inode(dis->get_base_ino());
9757 if (cur && !cur->is_multiversion())
9758 cur = NULL; // nope!
9759 }
9760
9761 if (!cur) {
9762 dout(7) << "handle_discover mds." << from
9763 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9764 << dendl;
9765 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9766 reply->set_error_dentry(dis->get_dentry(0));
9767 reply->set_flag_error_dir();
9768 } else if (dis->wants_base_dir()) {
9769 dout(7) << "handle_discover mds." << from
9770 << " wants basedir+" << dis->get_want().get_path()
9771 << " has " << *cur
9772 << dendl;
9773 } else {
9774 dout(7) << "handle_discover mds." << from
9775 << " wants " << dis->get_want().get_path()
9776 << " has " << *cur
9777 << dendl;
9778 }
9779 }
9780
9781 assert(reply);
9782
9783 // add content
9784 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9785 for (unsigned i = 0;
9786 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9787 i++) {
9788
9789 // -- figure out the dir
9790
9791 // is *cur even a dir at all?
9792 if (!cur->is_dir()) {
9793 dout(7) << *cur << " not a dir" << dendl;
9794 reply->set_flag_error_dir();
9795 break;
9796 }
9797
9798 // pick frag
9799 frag_t fg;
9800 if (dis->get_want().depth()) {
9801 // dentry specifies
9802 fg = cur->pick_dirfrag(dis->get_dentry(i));
9803 } else {
9804 // requester explicitly specified the frag
9805 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9806 fg = dis->get_base_dir_frag();
9807 if (!cur->dirfragtree.is_leaf(fg))
9808 fg = cur->dirfragtree[fg.value()];
9809 }
9810 CDir *curdir = cur->get_dirfrag(fg);
9811
9812 if ((!curdir && !cur->is_auth()) ||
9813 (curdir && !curdir->is_auth())) {
9814
9815 /* before:
9816 * ONLY set flag if empty!!
9817 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9818 * resulting in duplicate discovers in flight,
9819 * which can wreak havoc when discovering rename srcdn (which may move)
9820 */
9821
9822 if (reply->is_empty()) {
9823 // only hint if empty.
9824 // someday this could be better, but right now the waiter logic isn't smart enough.
9825
9826 // hint
9827 if (curdir) {
9828 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9829 reply->set_dir_auth_hint(curdir->authority().first);
9830 } else {
9831 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9832 << *cur << dendl;
9833 reply->set_dir_auth_hint(cur->authority().first);
9834 }
9835
9836 // note error dentry, if any
9837 // NOTE: important, as it allows requester to issue an equivalent discover
9838 // to whomever we hint at.
9839 if (dis->get_want().depth() > i)
9840 reply->set_error_dentry(dis->get_dentry(i));
9841 }
9842
9843 break;
9844 }
9845
9846 if (!curdir) { // open dir?
9847 if (cur->is_frozen()) {
9848 if (!reply->is_empty()) {
9849 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9850 break;
9851 }
9852 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9853 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9854 reply->put();
9855 return;
9856 }
9857 curdir = cur->get_or_open_dirfrag(this, fg);
9858 } else if (curdir->is_frozen_tree() ||
9859 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9860 if (!reply->is_empty()) {
9861 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9862 break;
9863 }
9864 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9865 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9866 reply->set_flag_error_dir();
9867 break;
9868 }
9869 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9870 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9871 reply->put();
9872 return;
9873 }
9874
9875 // add dir
9876 if (curdir->get_version() == 0) {
9877 // fetch newly opened dir
9878 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9879 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9880 // make sure the base frag is correct, though, in case there was a refragment since the
9881 // original request was sent.
9882 reply->set_base_dir_frag(curdir->get_frag());
9883 } else {
9884 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9885 if (!reply->trace.length())
9886 reply->starts_with = MDiscoverReply::DIR;
9887 replicate_dir(curdir, from, reply->trace);
9888 dout(7) << "handle_discover added dir " << *curdir << dendl;
9889 }
9890
9891 // lookup
9892 CDentry *dn = 0;
9893 if (curdir->get_version() == 0) {
9894 // fetch newly opened dir
9895 assert(!curdir->has_bloom());
9896 } else if (dis->get_want().depth() > 0) {
9897 // lookup dentry
9898 dn = curdir->lookup(dis->get_dentry(i), snapid);
9899 } else
9900 break; // done!
9901
9902 // incomplete dir?
9903 if (!dn) {
9904 if (!curdir->is_complete() &&
9905 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9906 // readdir
9907 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9908 if (reply->is_empty()) {
9909 // fetch and wait
9910 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9911 dis->wants_base_dir() && curdir->get_version() == 0);
9912 reply->put();
9913 return;
9914 } else {
9915 // initiate fetch, but send what we have so far
9916 curdir->fetch(0);
9917 break;
9918 }
9919 }
9920
9921 // send null dentry
9922 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9923 << *curdir << dendl;
9924 dn = curdir->add_null_dentry(dis->get_dentry(i));
9925 }
9926 assert(dn);
9927
9928 // don't add replica to purging dentry/inode
9929 if (dn->state_test(CDentry::STATE_PURGING)) {
9930 if (reply->is_empty())
9931 reply->set_flag_error_dn(dis->get_dentry(i));
9932 break;
9933 }
9934
9935 CDentry::linkage_t *dnl = dn->get_linkage();
9936
9937 // xlocked dentry?
9938 // ...always block on non-tail items (they are unrelated)
9939 // ...allow xlocked tail discovery _only_ if explicitly requested
9940 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9941 if (dn->lock.is_xlocked()) {
9942 // is this the last (tail) item in the discover traversal?
9943 if (tailitem && dis->wants_xlocked()) {
9944 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9945 } else if (reply->is_empty()) {
9946 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9947 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9948 reply->put();
9949 return;
9950 } else {
9951 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9952 break;
9953 }
9954 }
9955
9956 // frozen inode?
9957 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9958 if (tailitem && dis->wants_xlocked()) {
9959 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9960 } else if (reply->is_empty()) {
9961 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9962 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9963 reply->put();
9964 return;
9965 } else {
9966 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9967 break;
9968 }
9969 }
9970
9971 // add dentry
9972 if (!reply->trace.length())
9973 reply->starts_with = MDiscoverReply::DENTRY;
9974 replicate_dentry(dn, from, reply->trace);
9975 dout(7) << "handle_discover added dentry " << *dn << dendl;
9976
9977 if (!dnl->is_primary()) break; // stop on null or remote link.
9978
9979 // add inode
9980 CInode *next = dnl->get_inode();
9981 assert(next->is_auth());
9982
9983 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
9984 dout(7) << "handle_discover added inode " << *next << dendl;
9985
9986 // descend, keep going.
9987 cur = next;
9988 continue;
9989 }
9990
9991 // how did we do?
9992 assert(!reply->is_empty());
9993 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
9994 mds->send_message(reply, dis->get_connection());
9995
9996 dis->put();
9997 }
9998
9999 /* This function DOES put the passed message before returning */
10000 void MDCache::handle_discover_reply(MDiscoverReply *m)
10001 {
10002 /*
10003 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10004 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10005 m->put();
10006 return;
10007 }
10008 */
10009 dout(7) << "discover_reply " << *m << dendl;
10010 if (m->is_flag_error_dir())
10011 dout(7) << " flag error, dir" << dendl;
10012 if (m->is_flag_error_dn())
10013 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10014
10015 list<MDSInternalContextBase*> finished, error;
10016 mds_rank_t from = mds_rank_t(m->get_source().num());
10017
10018 // starting point
10019 CInode *cur = get_inode(m->get_base_ino());
10020 bufferlist::iterator p = m->trace.begin();
10021
10022 int next = m->starts_with;
10023
10024 // decrement discover counters
10025 if (m->get_tid()) {
10026 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10027 if (p != discovers.end()) {
10028 dout(10) << " found tid " << m->get_tid() << dendl;
10029 discovers.erase(p);
10030 } else {
10031 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10032 }
10033 }
10034
10035 // discover may start with an inode
10036 if (!p.end() && next == MDiscoverReply::INODE) {
10037 cur = add_replica_inode(p, NULL, finished);
10038 dout(7) << "discover_reply got base inode " << *cur << dendl;
10039 assert(cur->is_base());
10040
10041 next = MDiscoverReply::DIR;
10042
10043 // take waiters?
10044 if (cur->is_base() &&
10045 waiting_for_base_ino[from].count(cur->ino())) {
10046 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10047 waiting_for_base_ino[from].erase(cur->ino());
10048 }
10049 }
10050 assert(cur);
10051
10052 // loop over discover results.
10053 // indexes follow each ([[dir] dentry] inode)
10054 // can start, end with any type.
10055 while (!p.end()) {
10056 // dir
10057 frag_t fg;
10058 CDir *curdir = 0;
10059 if (next == MDiscoverReply::DIR) {
10060 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10061 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10062 assert(m->get_wanted_base_dir());
10063 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10064 }
10065 } else {
10066 // note: this can only happen on our first time around this loop.
10067 if (p.end() && m->is_flag_error_dn()) {
10068 fg = cur->pick_dirfrag(m->get_error_dentry());
10069 curdir = cur->get_dirfrag(fg);
10070 } else
10071 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10072 }
10073
10074 if (p.end())
10075 break;
10076
10077 // dentry
10078 CDentry *dn = add_replica_dentry(p, curdir, finished);
10079
10080 if (p.end())
10081 break;
10082
10083 // inode
10084 cur = add_replica_inode(p, dn, finished);
10085
10086 next = MDiscoverReply::DIR;
10087 }
10088
10089 // dir error?
10090 // or dir_auth hint?
10091 if (m->is_flag_error_dir() && !cur->is_dir()) {
10092 // not a dir.
10093 cur->take_waiting(CInode::WAIT_DIR, error);
10094 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10095 mds_rank_t who = m->get_dir_auth_hint();
10096 if (who == mds->get_nodeid()) who = -1;
10097 if (who >= 0)
10098 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10099
10100
10101 if (m->get_wanted_base_dir()) {
10102 frag_t fg = m->get_base_dir_frag();
10103 CDir *dir = cur->get_dirfrag(fg);
10104
10105 if (cur->is_waiting_for_dir(fg)) {
10106 if (cur->is_auth())
10107 cur->take_waiting(CInode::WAIT_DIR, finished);
10108 else if (dir || !cur->dirfragtree.is_leaf(fg))
10109 cur->take_dir_waiting(fg, finished);
10110 else
10111 discover_dir_frag(cur, fg, 0, who);
10112 } else
10113 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10114 }
10115
10116 // try again?
10117 if (m->get_error_dentry().length()) {
10118 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10119 CDir *dir = cur->get_dirfrag(fg);
10120 // wanted a dentry
10121 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10122 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10123 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10124 m->get_wanted_snapid(), finished);
10125 } else {
10126 filepath relpath(m->get_error_dentry(), 0);
10127 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10128 }
10129 } else
10130 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10131 << m->get_error_dentry() << dendl;
10132 }
10133 } else if (m->is_flag_error_dn()) {
10134 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10135 CDir *dir = cur->get_dirfrag(fg);
10136 if (dir) {
10137 if (dir->is_auth()) {
10138 dir->take_sub_waiting(finished);
10139 } else {
10140 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10141 m->get_wanted_snapid(), error);
10142 }
10143 }
10144 }
10145
10146 // waiters
10147 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10148 mds->queue_waiters(finished);
10149
10150 // done
10151 m->put();
10152 }
10153
10154
10155
10156 // ----------------------------
10157 // REPLICAS
10158
10159
10160 void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10161 {
10162 dirfrag_t df = dir->dirfrag();
10163 ::encode(df, bl);
10164 dir->encode_replica(to, bl);
10165 }
10166
10167 void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10168 {
10169 ::encode(dn->get_name(), bl);
10170 ::encode(dn->last, bl);
10171 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10172 }
10173
10174 void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10175 uint64_t features)
10176 {
10177 ::encode(in->inode.ino, bl); // bleh, minor asymmetry here
10178 ::encode(in->last, bl);
10179 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10180 }
10181
10182 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10183 list<MDSInternalContextBase*>& finished)
10184 {
10185 dirfrag_t df;
10186 ::decode(df, p);
10187
10188 assert(diri->ino() == df.ino);
10189
10190 // add it (_replica_)
10191 CDir *dir = diri->get_dirfrag(df.frag);
10192
10193 if (dir) {
10194 // had replica. update w/ new nonce.
10195 dir->decode_replica(p);
10196 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10197 } else {
10198 // force frag to leaf in the diri tree
10199 if (!diri->dirfragtree.is_leaf(df.frag)) {
10200 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10201 << diri->dirfragtree << dendl;
10202 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10203 }
10204
10205 // add replica.
10206 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10207 dir->decode_replica(p);
10208
10209 // is this a dir_auth delegation boundary?
10210 if (from != diri->authority().first ||
10211 diri->is_ambiguous_auth() ||
10212 diri->is_base())
10213 adjust_subtree_auth(dir, from);
10214
10215 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10216
10217 // get waiters
10218 diri->take_dir_waiting(df.frag, finished);
10219 }
10220
10221 return dir;
10222 }
10223
10224 CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10225 {
10226 string name;
10227 snapid_t last;
10228 ::decode(name, p);
10229 ::decode(last, p);
10230
10231 CDentry *dn = dir->lookup(name, last);
10232
10233 // have it?
10234 if (dn) {
10235 dn->decode_replica(p, false);
10236 dout(7) << "add_replica_dentry had " << *dn << dendl;
10237 } else {
10238 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10239 dn->decode_replica(p, true);
10240 dout(7) << "add_replica_dentry added " << *dn << dendl;
10241 }
10242
10243 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10244
10245 return dn;
10246 }
10247
10248 CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10249 {
10250 inodeno_t ino;
10251 snapid_t last;
10252 ::decode(ino, p);
10253 ::decode(last, p);
10254 CInode *in = get_inode(ino, last);
10255 if (!in) {
10256 in = new CInode(this, false, 1, last);
10257 in->decode_replica(p, true);
10258 add_inode(in);
10259 if (in->ino() == MDS_INO_ROOT)
10260 in->inode_auth.first = 0;
10261 else if (in->is_mdsdir())
10262 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10263 dout(10) << "add_replica_inode added " << *in << dendl;
10264 if (dn) {
10265 assert(dn->get_linkage()->is_null());
10266 dn->dir->link_primary_inode(dn, in);
10267 }
10268 } else {
10269 in->decode_replica(p, false);
10270 dout(10) << "add_replica_inode had " << *in << dendl;
10271 }
10272
10273 if (dn) {
10274 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10275 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10276 }
10277
10278 return in;
10279 }
10280
10281
10282 void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10283 {
10284 uint64_t features = mds->mdsmap->get_up_features();
10285 replicate_inode(get_myin(), who, bl, features);
10286 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10287 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10288 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10289 replicate_dir(straydn->get_dir(), who, bl);
10290 replicate_dentry(straydn, who, bl);
10291 }
10292
10293 CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10294 {
10295 list<MDSInternalContextBase*> finished;
10296 bufferlist::iterator p = bl.begin();
10297
10298 CInode *mdsin = add_replica_inode(p, NULL, finished);
10299 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10300 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10301 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10302 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10303 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10304 if (!finished.empty())
10305 mds->queue_waiters(finished);
10306
10307 return straydn;
10308 }
10309
10310
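// Send MDirUpdate FYIs about this dirfrag's replication state, either to all
// active MDSs (bcast) or just to the ranks that already hold a replica.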
10311 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10312 {
10313 // this is an FYI, re: replication
10314
10315 set<mds_rank_t> who;
10316 if (bcast) {
10317 mds->get_mds_map()->get_active_mds_set(who);
10318 } else {
10319 for (const auto &p : dir->get_replicas()) {
10320 who.insert(p.first);
10321 }
10322 }
10323
10324 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10325
10326 filepath path;
10327 dir->inode->make_path(path);
10328
10329 mds_rank_t whoami = mds->get_nodeid();
10330 for (set<mds_rank_t>::iterator it = who.begin();
10331 it != who.end();
10332 ++it) {
10333 if (*it == whoami) continue;
10334 //if (*it == except) continue;
10335 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10336
10337 std::set<int32_t> s;
10338 for (const auto &r : dir->dir_rep_by) {
10339 s.insert(r);
10340 }
10341 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10342 dir->dirfrag(),
10343 dir->dir_rep,
10344 s,
10345 path,
10346 bcast),
10347 *it);
10348 }
10349
10350 return 0;
10351 }
10352
10353 /* This function DOES put the passed message before returning */
10354 void MDCache::handle_dir_update(MDirUpdate *m)
10355 {
10356 dirfrag_t df = m->get_dirfrag();
10357 CDir *dir = get_dirfrag(df);
10358 if (!dir) {
10359 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10360
10361 // discover it?
10362 if (m->should_discover()) {
10363 // only try once!
10364 // this is key to avoid a fragtree update race, among other things.
10365 m->inc_tried_discover();
10366 vector<CDentry*> trace;
10367 CInode *in;
10368 filepath path = m->get_path();
10369 dout(5) << "trying discover on dir_update for " << path << dendl;
10370 MDRequestRef null_ref;
10371 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10372 if (r > 0)
10373 return;
10374 if (r == 0 &&
10375 in->ino() == df.ino &&
10376 in->get_approx_dirfrag(df.frag) == NULL) {
10377 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10378 return;
10379 }
10380 }
10381
10382 m->put();
10383 return;
10384 }
10385
10386 if (!m->has_tried_discover()) {
10387 // Update if it already exists. Otherwise it got updated by the discover reply.
10388 dout(5) << "dir_update on " << *dir << dendl;
10389 dir->dir_rep = m->get_dir_rep();
10390 dir->dir_rep_by.clear();
10391 for (const auto &e : m->get_dir_rep_by()) {
10392 dir->dir_rep_by.insert(e);
10393 }
10394 }
10395
10396 // done
10397 m->put();
10398 }
10399
10400
10401
10402
10403
10404 // LINK
10405
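// Notify replicas of a newly linked dentry, skipping rename witnesses (they
// already know) and ranks that are still rejoining; the message carries the
// replicated inode for primary links, or the ino/d_type for remote links.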
10406 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10407 {
10408 dout(7) << "send_dentry_link " << *dn << dendl;
10409
10410 CDir *subtree = get_subtree_root(dn->get_dir());
10411 for (const auto &p : dn->get_replicas()) {
10412 // don't tell (rename) witnesses; they already know
10413 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10414 continue;
10415 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10416 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10417 rejoin_gather.count(p.first)))
10418 continue;
10419 CDentry::linkage_t *dnl = dn->get_linkage();
10420 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10421 dn->get_name(), dnl->is_primary());
10422 if (dnl->is_primary()) {
10423 dout(10) << " primary " << *dnl->get_inode() << dendl;
10424 replicate_inode(dnl->get_inode(), p.first, m->bl,
10425 mds->mdsmap->get_up_features());
10426 } else if (dnl->is_remote()) {
10427 inodeno_t ino = dnl->get_remote_ino();
10428 __u8 d_type = dnl->get_remote_d_type();
10429 dout(10) << " remote " << ino << " " << d_type << dendl;
10430 ::encode(ino, m->bl);
10431 ::encode(d_type, m->bl);
10432 } else
10433 ceph_abort(); // aie, bad caller!
10434 mds->send_message_mds(m, p.first);
10435 }
10436 }
10437
10438 /* This function DOES put the passed message before returning */
10439 void MDCache::handle_dentry_link(MDentryLink *m)
10440 {
10441
10442 CDentry *dn = NULL;
10443 CDir *dir = get_dirfrag(m->get_dirfrag());
10444 if (!dir) {
10445 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10446 } else {
10447 dn = dir->lookup(m->get_dn());
10448 if (!dn) {
10449 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10450 } else {
10451 dout(7) << "handle_dentry_link on " << *dn << dendl;
10452 CDentry::linkage_t *dnl = dn->get_linkage();
10453
10454 assert(!dn->is_auth());
10455 assert(dnl->is_null());
10456 }
10457 }
10458
10459 bufferlist::iterator p = m->bl.begin();
10460 list<MDSInternalContextBase*> finished;
10461 if (dn) {
10462 if (m->get_is_primary()) {
10463 // primary link.
10464 add_replica_inode(p, dn, finished);
10465 } else {
10466 // remote link, easy enough.
10467 inodeno_t ino;
10468 __u8 d_type;
10469 ::decode(ino, p);
10470 ::decode(d_type, p);
10471 dir->link_remote_inode(dn, ino, d_type);
10472 }
10473 } else {
10474 ceph_abort();
10475 }
10476
10477 if (!finished.empty())
10478 mds->queue_waiters(finished);
10479
10480 m->put();
10481 return;
10482 }
10483
10484
10485 // UNLINK
10486
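// Notify replicas of an unlinked dentry, skipping rmdir witnesses and ranks
// that are still rejoining; if a stray dentry is involved we replicate it
// along with the unlink so the replica can relink the inode under the stray dir.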
10487 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10488 {
10489 dout(10) << "send_dentry_unlink " << *dn << dendl;
10490 // share unlink news with replicas
10491 set<mds_rank_t> replicas;
10492 dn->list_replicas(replicas);
10493 if (straydn)
10494 straydn->list_replicas(replicas);
10495 for (set<mds_rank_t>::iterator it = replicas.begin();
10496 it != replicas.end();
10497 ++it) {
10498 // don't tell (rmdir) witnesses; they already know
10499 if (mdr.get() && mdr->more()->witnessed.count(*it))
10500 continue;
10501
10502 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10503 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10504 rejoin_gather.count(*it)))
10505 continue;
10506
10507 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
10508 if (straydn)
10509 replicate_stray(straydn, *it, unlink->straybl);
10510 mds->send_message_mds(unlink, *it);
10511 }
10512 }
10513
10514 /* This function DOES put the passed message before returning */
10515 void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10516 {
10517 // straydn
10518 CDentry *straydn = NULL;
10519 if (m->straybl.length())
10520 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10521
10522 CDir *dir = get_dirfrag(m->get_dirfrag());
10523 if (!dir) {
10524 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10525 } else {
10526 CDentry *dn = dir->lookup(m->get_dn());
10527 if (!dn) {
10528 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10529 } else {
10530 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10531 CDentry::linkage_t *dnl = dn->get_linkage();
10532
10533 // open inode?
10534 if (dnl->is_primary()) {
10535 CInode *in = dnl->get_inode();
10536 dn->dir->unlink_inode(dn);
10537 assert(straydn);
10538 straydn->dir->link_primary_inode(straydn, in);
10539
10540 // in->first is lazily updated on replica; drag it forward so
10541 // that we always keep it in sync with the dentry
10542 assert(straydn->first >= in->first);
10543 in->first = straydn->first;
10544
10545 // update subtree map?
10546 if (in->is_dir())
10547 adjust_subtree_after_rename(in, dir, false);
10548
10549 // send caps to auth (if we're not already)
10550 if (in->is_any_caps() &&
10551 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10552 migrator->export_caps(in);
10553
10554 straydn = NULL;
10555 } else {
10556 assert(!straydn);
10557 assert(dnl->is_remote());
10558 dn->dir->unlink_inode(dn);
10559 }
10560 assert(dnl->is_null());
10561 }
10562 }
10563
10564 // race with trim_dentry()
10565 if (straydn) {
10566 assert(straydn->get_num_ref() == 0);
10567 assert(straydn->get_linkage()->is_null());
10568 map<mds_rank_t, MCacheExpire*> expiremap;
10569 trim_dentry(straydn, expiremap);
10570 send_expire_messages(expiremap);
10571 }
10572
10573 m->put();
10574 return;
10575 }
10576
10577
10578
10579
10580
10581
10582 // ===================================================================
10583
10584
10585
10586 // ===================================================================
10587 // FRAGMENT
10588
10589
10590 /**
10591 * adjust_dir_fragments -- adjust fragmentation for a directory
10592 *
10593 * @param diri directory inode
10594 * @param basefrag base fragment
10595 * @param bits bit adjustment. positive for split, negative for merge.
10596 */
10597 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10598 list<CDir*>& resultfrags,
10599 list<MDSInternalContextBase*>& waiters,
10600 bool replay)
10601 {
10602 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10603 << " on " << *diri << dendl;
10604
10605 list<CDir*> srcfrags;
10606 diri->get_dirfrags_under(basefrag, srcfrags);
10607
10608 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10609 }
10610
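// Return the dirfrag for fg, fabricating it if necessary: either by
// splitting an existing ancestor dirfrag down to fg, or by merging the
// dirfrags currently sitting underneath fg.  Returns NULL if neither is
// possible.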
10611 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10612 {
10613 CDir *dir = diri->get_dirfrag(fg);
10614 if (dir)
10615 return dir;
10616
10617 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10618
10619 list<CDir*> src, result;
10620 list<MDSInternalContextBase*> waiters;
10621
10622 // split a parent?
10623 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10624 while (1) {
10625 CDir *pdir = diri->get_dirfrag(parent);
10626 if (pdir) {
10627 int split = fg.bits() - parent.bits();
10628 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10629 src.push_back(pdir);
10630 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10631 dir = diri->get_dirfrag(fg);
10632 if (dir) {
10633 dout(10) << "force_dir_fragment result " << *dir << dendl;
10634 break;
10635 }
10636 }
10637 if (parent == frag_t())
10638 break;
10639 frag_t last = parent;
10640 parent = parent.parent();
10641 dout(10) << " " << last << " parent is " << parent << dendl;
10642 }
10643
10644 if (!dir) {
10645 // hoover up things under fg?
10646 diri->get_dirfrags_under(fg, src);
10647 if (src.empty()) {
10648 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10649 } else {
10650 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10651 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10652 dir = result.front();
10653 dout(10) << "force_dir_fragment result " << *dir << dendl;
10654 }
10655 }
10656 if (!replay)
10657 mds->queue_waiters(waiters);
10658 return dir;
10659 }
10660
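// Core refragmentation helper: force basefrag to be a leaf of the fragtree,
// then split srcfrags by 'bits' (positive; e.g. splitting by 2 bits yields
// four child frags) or merge them into a single dirfrag at basefrag
// (non-positive bits), fixing up the subtree map along the way.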
10661 void MDCache::adjust_dir_fragments(CInode *diri,
10662 list<CDir*>& srcfrags,
10663 frag_t basefrag, int bits,
10664 list<CDir*>& resultfrags,
10665 list<MDSInternalContextBase*>& waiters,
10666 bool replay)
10667 {
10668 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10669 << " srcfrags " << srcfrags
10670 << " on " << *diri << dendl;
10671
10672 // adjust fragtree
10673 // yuck. we may have discovered the inode while it was being fragmented.
10674 if (!diri->dirfragtree.is_leaf(basefrag))
10675 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10676
10677 if (bits > 0)
10678 diri->dirfragtree.split(basefrag, bits);
10679 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10680
10681 if (srcfrags.empty())
10682 return;
10683
10684 // split
10685 CDir *parent_dir = diri->get_parent_dir();
10686 CDir *parent_subtree = 0;
10687 if (parent_dir)
10688 parent_subtree = get_subtree_root(parent_dir);
10689
10690 if (bits > 0) {
10691 // SPLIT
10692 assert(srcfrags.size() == 1);
10693 CDir *dir = srcfrags.front();
10694
10695 dir->split(bits, resultfrags, waiters, replay);
10696
10697 // did i change the subtree map?
10698 if (dir->is_subtree_root()) {
10699 // new frags are now separate subtrees
10700 for (list<CDir*>::iterator p = resultfrags.begin();
10701 p != resultfrags.end();
10702 ++p)
10703 subtrees[*p].clear(); // new frag is now its own subtree
10704
10705 // was i a bound?
10706 if (parent_subtree) {
10707 assert(subtrees[parent_subtree].count(dir));
10708 subtrees[parent_subtree].erase(dir);
10709 for (list<CDir*>::iterator p = resultfrags.begin();
10710 p != resultfrags.end();
10711 ++p) {
10712 assert((*p)->is_subtree_root());
10713 subtrees[parent_subtree].insert(*p);
10714 }
10715 }
10716
10717 // adjust my bounds.
10718 set<CDir*> bounds;
10719 bounds.swap(subtrees[dir]);
10720 subtrees.erase(dir);
10721 for (set<CDir*>::iterator p = bounds.begin();
10722 p != bounds.end();
10723 ++p) {
10724 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10725 subtrees[frag].insert(*p);
10726 }
10727
10728 show_subtrees(10);
10729
10730 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10731 dir->dir_auth = CDIR_AUTH_DEFAULT;
10732 }
10733
10734 diri->close_dirfrag(dir->get_frag());
10735
10736 } else {
10737 // MERGE
10738
10739 // are my constituent bits subtrees? if so, i will be too.
10740 // (it's all or none, actually.)
10741 bool any_subtree = false;
10742 for (CDir *dir : srcfrags) {
10743 if (dir->is_subtree_root()) {
10744 any_subtree = true;
10745 break;
10746 }
10747 }
10748 set<CDir*> new_bounds;
10749 if (any_subtree) {
10750 for (CDir *dir : srcfrags) {
10751 // this simplifies the code that finds subtrees underneath the dirfrag
10752 if (!dir->is_subtree_root()) {
10753 dir->state_set(CDir::STATE_AUXSUBTREE);
10754 adjust_subtree_auth(dir, mds->get_nodeid());
10755 }
10756 }
10757
10758 for (CDir *dir : srcfrags) {
10759 assert(dir->is_subtree_root());
10760 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
10761 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10762 set<CDir*>::iterator r = q->second.begin();
10763 while (r != subtrees[dir].end()) {
10764 new_bounds.insert(*r);
10765 subtrees[dir].erase(r++);
10766 }
10767 subtrees.erase(q);
10768
10769 // remove myself as my parent's bound
10770 if (parent_subtree)
10771 subtrees[parent_subtree].erase(dir);
10772 }
10773 }
10774
10775 // merge
10776 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10777 f->merge(srcfrags, waiters, replay);
10778
10779 if (any_subtree) {
10780 assert(f->is_subtree_root());
10781 subtrees[f].swap(new_bounds);
10782 if (parent_subtree)
10783 subtrees[parent_subtree].insert(f);
10784
10785 show_subtrees(10);
10786 }
10787
10788 resultfrags.push_back(f);
10789 }
10790 }
10791
10792
10793 class C_MDC_FragmentFrozen : public MDSInternalContext {
10794 MDCache *mdcache;
10795 MDRequestRef mdr;
10796 public:
10797 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10798 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10799 void finish(int r) override {
10800 mdcache->fragment_frozen(mdr, r);
10801 }
10802 };
10803
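// Common sanity checks before a split or merge: refuse while the fs is
// read-only or the cluster is degraded, never touch stray/mdsdir/.ceph
// directories or one being scrubbed, and require every target dirfrag to be
// auth, healthy, not already fragmenting, and not frozen or freezing.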
10804 bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10805 {
10806 if (is_readonly()) {
10807 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10808 return false;
10809 }
10810 if (mds->is_cluster_degraded()) {
10811 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10812 return false;
10813 }
10814 if (diri->get_parent_dir() &&
10815 diri->get_parent_dir()->get_inode()->is_stray()) {
10816 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10817 return false;
10818 }
10819 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10820 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10821 return false;
10822 }
10823
10824 if (diri->scrub_is_in_progress()) {
10825 dout(7) << "can_fragment: scrub in progress" << dendl;
10826 return false;
10827 }
10828
10829 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10830 CDir *dir = *p;
10831 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10832 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10833 return false;
10834 }
10835 if (!dir->is_auth()) {
10836 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10837 return false;
10838 }
10839 if (dir->is_bad()) {
10840 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10841 return false;
10842 }
10843 if (dir->is_frozen() ||
10844 dir->is_freezing()) {
10845 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10846 return false;
10847 }
10848 }
10849
10850 return true;
10851 }
10852
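// Split a single auth dirfrag by 'bits': register a fragment_info_t for the
// operation, freeze the dirfrag and start the mark-and-complete pass.  The
// split is dropped if can_fragment() objects or the resulting fragment
// would exceed 24 bits.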
10853 void MDCache::split_dir(CDir *dir, int bits)
10854 {
10855 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10856 assert(dir->is_auth());
10857 CInode *diri = dir->inode;
10858
10859 list<CDir*> dirs;
10860 dirs.push_back(dir);
10861
10862 if (!can_fragment(diri, dirs)) {
10863 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10864 return;
10865 }
10866
10867 if (dir->frag.bits() + bits > 24) {
10868 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10869 return;
10870 }
10871
10872 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10873 mdr->more()->fragment_base = dir->dirfrag();
10874
10875 assert(fragments.count(dir->dirfrag()) == 0);
10876 fragment_info_t& info = fragments[dir->dirfrag()];
10877 info.mdr = mdr;
10878 info.dirs.push_back(dir);
10879 info.bits = bits;
10880 info.last_cum_auth_pins_change = ceph_clock_now();
10881
10882 fragment_freeze_dirs(dirs);
10883 // initial mark+complete pass
10884 fragment_mark_and_complete(mdr);
10885 }
10886
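// Merge all dirfrags under 'frag' back into a single dirfrag.  Requires
// every constituent dirfrag to be in cache and 'frag' not to already be a
// leaf of the fragtree; otherwise the request is silently dropped.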
10887 void MDCache::merge_dir(CInode *diri, frag_t frag)
10888 {
10889 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10890
10891 list<CDir*> dirs;
10892 if (!diri->get_dirfrags_under(frag, dirs)) {
10893 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10894 return;
10895 }
10896
10897 if (diri->dirfragtree.is_leaf(frag)) {
10898 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10899 return;
10900 }
10901
10902 if (!can_fragment(diri, dirs))
10903 return;
10904
10905 CDir *first = dirs.front();
10906 int bits = first->get_frag().bits() - frag.bits();
10907 dout(10) << " we are merginb by " << bits << " bits" << dendl;
10908
10909 dirfrag_t basedirfrag(diri->ino(), frag);
10910 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10911 mdr->more()->fragment_base = basedirfrag;
10912
10913 assert(fragments.count(basedirfrag) == 0);
10914 fragment_info_t& info = fragments[basedirfrag];
10915 info.mdr = mdr;
10916 info.dirs = dirs;
10917 info.bits = -bits;
10918 info.last_cum_auth_pins_change = ceph_clock_now();
10919
10920 fragment_freeze_dirs(dirs);
10921 // initial mark+complete pass
10922 fragment_mark_and_complete(mdr);
10923 }
10924
10925 void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10926 {
10927 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10928 CDir *dir = *p;
10929 dir->auth_pin(dir); // until we mark and complete them
10930 dir->state_set(CDir::STATE_FRAGMENTING);
10931 dir->freeze_dir();
10932 assert(dir->is_freezing_dir());
10933 }
10934 }
10935
10936 class C_MDC_FragmentMarking : public MDCacheContext {
10937 MDRequestRef mdr;
10938 public:
10939 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10940 void finish(int r) override {
10941 mdcache->fragment_mark_and_complete(mdr);
10942 }
10943 };
10944
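// First phase of a split/merge: fetch any incomplete dirfrags and make sure
// newly created dirfrags are journaled/committed, then pin every dentry
// with PIN_FRAGMENTING and mark it STATE_FRAGMENTING.  Once all dirs are
// marked, wait for the freeze to finish and continue in fragment_frozen().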
10945 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10946 {
10947 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10948 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10949 if (it == fragments.end() || it->second.mdr != mdr) {
10950 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10951 request_finish(mdr);
10952 return;
10953 }
10954
10955 fragment_info_t& info = it->second;
10956 CInode *diri = info.dirs.front()->get_inode();
10957 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10958
10959 MDSGatherBuilder gather(g_ceph_context);
10960
10961 for (list<CDir*>::iterator p = info.dirs.begin();
10962 p != info.dirs.end();
10963 ++p) {
10964 CDir *dir = *p;
10965
10966 bool ready = true;
10967 if (!dir->is_complete()) {
10968 dout(15) << " fetching incomplete " << *dir << dendl;
10969 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10970 ready = false;
10971 } else if (dir->get_frag() == frag_t()) {
10972 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
10973 // the operation. To avoid CDir::fetch() complaining about missing object,
10974 // we commit new dirfrag first.
10975 if (dir->state_test(CDir::STATE_CREATING)) {
10976 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10977 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10978 ready = false;
10979 } else if (dir->is_new()) {
10980 dout(15) << " committing new " << *dir << dendl;
10981 assert(dir->is_dirty());
10982 dir->commit(0, gather.new_sub(), true);
10983 ready = false;
10984 }
10985 }
10986 if (!ready)
10987 continue;
10988
10989 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10990 dout(15) << " marking " << *dir << dendl;
10991 for (auto &p : dir->items) {
10992 CDentry *dn = p.second;
10993 dn->get(CDentry::PIN_FRAGMENTING);
10994 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
10995 dn->state_set(CDentry::STATE_FRAGMENTING);
10996 }
10997 dir->state_set(CDir::STATE_DNPINNEDFRAG);
10998 dir->auth_unpin(dir);
10999 } else {
11000 dout(15) << " already marked " << *dir << dendl;
11001 }
11002 }
11003 if (gather.has_subs()) {
11004 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11005 gather.activate();
11006 return;
11007 }
11008
11009 for (list<CDir*>::iterator p = info.dirs.begin();
11010 p != info.dirs.end();
11011 ++p) {
11012 CDir *dir = *p;
11013 if (!dir->is_frozen_dir()) {
11014 assert(dir->is_freezing_dir());
11015 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11016 }
11017 }
11018 if (gather.has_subs()) {
11019 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11020 gather.activate();
11021 // flush log so that request auth_pins are retired
11022 mds->mdlog->flush();
11023 return;
11024 }
11025
11026 fragment_frozen(mdr, 0);
11027 }
11028
11029 void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11030 {
11031 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11032 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11033 CDir *dir = *p;
11034 dout(10) << " frag " << *dir << dendl;
11035
11036 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11037 dir->state_clear(CDir::STATE_FRAGMENTING);
11038
11039 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11040 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11041
11042 for (auto &p : dir->items) {
11043 CDentry *dn = p.second;
11044 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11045 dn->state_clear(CDentry::STATE_FRAGMENTING);
11046 dn->put(CDentry::PIN_FRAGMENTING);
11047 }
11048 } else {
11049 dir->auth_unpin(dir);
11050 }
11051
11052 dir->unfreeze_dir();
11053 }
11054 }
11055
11056 bool MDCache::fragment_are_all_frozen(CDir *dir)
11057 {
11058 assert(dir->is_frozen_dir());
11059 map<dirfrag_t,fragment_info_t>::iterator p;
11060 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11061 p != fragments.end() && p->first.ino == dir->ino();
11062 ++p) {
11063 if (p->first.frag.contains(dir->get_frag()))
11064 return p->second.all_frozen;
11065 }
11066 ceph_abort();
11067 return false;
11068 }
11069
11070 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11071 {
11072 map<dirfrag_t,fragment_info_t>::iterator p;
11073 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11074 p != fragments.end() && p->first.ino == dir->ino();
11075 ++p) {
11076 if (p->first.frag.contains(dir->get_frag())) {
11077 p->second.num_remote_waiters++;
11078 return;
11079 }
11080 }
11081 ceph_abort();
11082 }
11083
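// Cancel fragment operations whose freeze has stalled: the cumulative auth
// pin count stopped changing before mds_freeze_tree_timeout expired, and
// either remote peers are waiting on us or the parent dir is itself trying
// to freeze (e.g. for an export).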
11084 void MDCache::find_stale_fragment_freeze()
11085 {
11086 dout(10) << "find_stale_fragment_freeze" << dendl;
11087 // see comment in Migrator::find_stale_export_freeze()
11088 utime_t now = ceph_clock_now();
11089 utime_t cutoff = now;
11090 cutoff -= g_conf->mds_freeze_tree_timeout;
11091
11092 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11093 p != fragments.end(); ) {
11094 dirfrag_t df = p->first;
11095 fragment_info_t& info = p->second;
11096 ++p;
11097 if (info.all_frozen)
11098 continue;
11099 CDir *dir;
11100 int total_auth_pins = 0;
11101 for (list<CDir*>::iterator q = info.dirs.begin();
11102 q != info.dirs.end();
11103 ++q) {
11104 dir = *q;
11105 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11106 total_auth_pins = -1;
11107 break;
11108 }
11109 if (dir->is_frozen_dir())
11110 continue;
11111 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11112 }
11113 if (total_auth_pins < 0)
11114 continue;
11115 if (info.last_cum_auth_pins != total_auth_pins) {
11116 info.last_cum_auth_pins = total_auth_pins;
11117 info.last_cum_auth_pins_change = now;
11118 continue;
11119 }
11120 if (info.last_cum_auth_pins_change >= cutoff)
11121 continue;
11122 dir = info.dirs.front();
11123 if (info.num_remote_waiters > 0 ||
11124 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11125 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11126 list<CDir*> dirs;
11127 info.dirs.swap(dirs);
11128 fragments.erase(df);
11129 fragment_unmark_unfreeze_dirs(dirs);
11130 }
11131 }
11132 }
11133
11134 class C_MDC_FragmentPrep : public MDCacheLogContext {
11135 MDRequestRef mdr;
11136 public:
11137 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11138 void finish(int r) override {
11139 mdcache->_fragment_logged(mdr);
11140 }
11141 };
11142
11143 class C_MDC_FragmentStore : public MDCacheContext {
11144 MDRequestRef mdr;
11145 public:
11146 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11147 void finish(int r) override {
11148 mdcache->_fragment_stored(mdr);
11149 }
11150 };
11151
11152 class C_MDC_FragmentCommit : public MDCacheLogContext {
11153 dirfrag_t basedirfrag;
11154 list<CDir*> resultfrags;
11155 public:
11156 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11157 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11158 void finish(int r) override {
11159 mdcache->_fragment_committed(basedirfrag, resultfrags);
11160 }
11161 };
11162
11163 class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11164 dirfrag_t basedirfrag;
11165 list<CDir*> resultfrags;
11166 public:
11167 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11168 MDCacheIOContext(m), basedirfrag(f) {
11169 resultfrags.swap(l);
11170 }
11171 void finish(int r) override {
11172 assert(r == 0 || r == -ENOENT);
11173 mdcache->_fragment_finish(basedirfrag, resultfrags);
11174 }
11175 };
11176
11177 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11178 {
11179 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11180 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11181 if (it == fragments.end() || it->second.mdr != mdr) {
11182 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11183 request_finish(mdr);
11184 return;
11185 }
11186
11187 assert(r == 0);
11188 fragment_info_t& info = it->second;
11189 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11190 << " on " << info.dirs.front()->get_inode() << dendl;
11191
11192 info.all_frozen = true;
11193 dispatch_fragment_dir(mdr);
11194 }
11195
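// All dirfrags are frozen: take wrlocks on the dirfragtreelock, nestlock
// and filelock, journal an EFragment OP_PREPARE, apply the refragmentation
// to the cache, and record the uncommitted fragment before flushing the
// log.  If the locks cannot be acquired (mdr->aborted), the operation is
// undone and the split/merge is re-queued with the balancer.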
11196 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11197 {
11198 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11199 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11200 if (it == fragments.end() || it->second.mdr != mdr) {
11201 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11202 request_finish(mdr);
11203 return;
11204 }
11205
11206 fragment_info_t& info = it->second;
11207 CInode *diri = info.dirs.front()->get_inode();
11208
11209 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11210 << " on " << *diri << dendl;
11211 if (!mdr->aborted) {
11212 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11213 wrlocks.insert(&diri->dirfragtreelock);
11214 // prevent a racing gather on any other scatterlocks too
11215 wrlocks.insert(&diri->nestlock);
11216 wrlocks.insert(&diri->filelock);
11217 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11218 if (!mdr->aborted)
11219 return;
11220 }
11221
11222 if (mdr->aborted) {
11223 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11224 << info.dirs.front()->dirfrag() << dendl;
11225 if (info.bits > 0)
11226 mds->balancer->queue_split(info.dirs.front(), false);
11227 else
11228 mds->balancer->queue_merge(info.dirs.front());
11229 fragment_unmark_unfreeze_dirs(info.dirs);
11230 fragments.erase(it);
11231 request_finish(mdr);
11232 return;
11233 }
11234
11235 mdr->ls = mds->mdlog->get_current_segment();
11236 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11237 mds->mdlog->start_entry(le);
11238
11239 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11240 CDir *dir = *p;
11241 dirfrag_rollback rollback;
11242 rollback.fnode = dir->fnode;
11243 le->add_orig_frag(dir->get_frag(), &rollback);
11244 }
11245
11246 // refragment
11247 list<MDSInternalContextBase*> waiters;
11248 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11249 info.resultfrags, waiters, false);
11250 if (g_conf->mds_debug_frag)
11251 diri->verify_dirfrags();
11252 mds->queue_waiters(waiters);
11253
11254 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11255 assert(!diri->dirfragtree.is_leaf(*p));
11256
11257 le->metablob.add_dir_context(*info.resultfrags.begin());
11258 for (list<CDir*>::iterator p = info.resultfrags.begin();
11259 p != info.resultfrags.end();
11260 ++p) {
11261 if (diri->is_auth()) {
11262 le->metablob.add_fragmented_dir(*p, false, false);
11263 } else {
11264 (*p)->state_set(CDir::STATE_DIRTYDFT);
11265 le->metablob.add_fragmented_dir(*p, false, true);
11266 }
11267 }
11268
11269 // dft lock
11270 if (diri->is_auth()) {
11271 // journal dirfragtree
11272 auto &pi = diri->project_inode();
11273 pi.inode.version = diri->pre_dirty();
11274 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11275 } else {
11276 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11277 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11278 mdr->add_updated_lock(&diri->dirfragtreelock);
11279 }
11280
11281 /*
11282 // filelock
11283 mds->locker->mark_updated_scatterlock(&diri->filelock);
11284 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11285 mut->add_updated_lock(&diri->filelock);
11286
11287 // dirlock
11288 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11289 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11290 mut->add_updated_lock(&diri->nestlock);
11291 */
11292
11293 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11294 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11295 mdr, __func__);
11296 mds->mdlog->flush();
11297 }
11298
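// OP_PREPARE has been journaled: apply the projected inode and scatterlock
// state, then store each resulting dirfrag (auth-pinned and marked
// STATE_FRAGMENTING) before continuing in _fragment_stored().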
11299 void MDCache::_fragment_logged(MDRequestRef& mdr)
11300 {
11301 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11302 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11303 assert(it != fragments.end());
11304 fragment_info_t &info = it->second;
11305 CInode *diri = info.resultfrags.front()->get_inode();
11306
11307 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11308 << " on " << *diri << dendl;
11309
11310 if (diri->is_auth())
11311 diri->pop_and_dirty_projected_inode(mdr->ls);
11312
11313 mdr->apply(); // mark scatterlock
11314
11315 // store resulting frags
11316 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11317
11318 for (list<CDir*>::iterator p = info.resultfrags.begin();
11319 p != info.resultfrags.end();
11320 ++p) {
11321 CDir *dir = *p;
11322 dout(10) << " storing result frag " << *dir << dendl;
11323
11324 // freeze and store them too
11325 dir->auth_pin(this);
11326 dir->state_set(CDir::STATE_FRAGMENTING);
11327 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11328 }
11329
11330 gather.activate();
11331 }
11332
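// The resulting dirfrags are safely stored: replicate them to peers via
// MMDSFragmentNotify, journal an EFragment OP_COMMIT, drop the locks, and
// finally unpin the dentries and unfreeze the new dirfrags.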
11333 void MDCache::_fragment_stored(MDRequestRef& mdr)
11334 {
11335 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11336 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11337 assert(it != fragments.end());
11338 fragment_info_t &info = it->second;
11339 CInode *diri = info.resultfrags.front()->get_inode();
11340
11341 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11342 << " on " << *diri << dendl;
11343
11344 // tell peers
11345 CDir *first = *info.resultfrags.begin();
11346 for (const auto &p : first->get_replicas()) {
11347 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11348 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11349 rejoin_gather.count(p.first)))
11350 continue;
11351
11352 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11353
11354 // freshly replicate new dirs to peers
11355 for (list<CDir*>::iterator q = info.resultfrags.begin();
11356 q != info.resultfrags.end();
11357 ++q)
11358 replicate_dir(*q, p.first, notify->basebl);
11359
11360 mds->send_message_mds(notify, p.first);
11361 }
11362
11363 // journal commit
11364 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11365 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11366 info.resultfrags));
11367
11368 mds->locker->drop_locks(mdr.get());
11369
11370 // unfreeze resulting frags
11371 for (list<CDir*>::iterator p = info.resultfrags.begin();
11372 p != info.resultfrags.end();
11373 ++p) {
11374 CDir *dir = *p;
11375 dout(10) << " result frag " << *dir << dendl;
11376
11377 for (auto &p : dir->items) {
11378 CDentry *dn = p.second;
11379 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11380 dn->state_clear(CDentry::STATE_FRAGMENTING);
11381 dn->put(CDentry::PIN_FRAGMENTING);
11382 }
11383
11384 // unfreeze
11385 dir->unfreeze_dir();
11386 }
11387
11388 fragments.erase(it);
11389 request_finish(mdr);
11390 }
11391
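// OP_COMMIT is journaled: remove the now-orphaned old dirfrag objects from
// the metadata pool (the base frag_t() object is truncated rather than
// removed, since it also holds the backtrace), then finish via
// C_IO_MDC_FragmentFinish.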
11392 void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11393 {
11394 dout(10) << "fragment_committed " << basedirfrag << dendl;
11395 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11396 assert(it != uncommitted_fragments.end());
11397 ufragment &uf = it->second;
11398
11399 // remove old frags
11400 C_GatherBuilder gather(
11401 g_ceph_context,
11402 new C_OnFinisher(
11403 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11404 mds->finisher));
11405
11406 SnapContext nullsnapc;
11407 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11408 for (list<frag_t>::iterator p = uf.old_frags.begin();
11409 p != uf.old_frags.end();
11410 ++p) {
11411 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11412 ObjectOperation op;
11413 if (*p == frag_t()) {
11414 // backtrace object
11415 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11416 op.truncate(0);
11417 op.omap_clear();
11418 } else {
11419 dout(10) << " removing orphan dirfrag " << oid << dendl;
11420 op.remove();
11421 }
11422 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11423 ceph::real_clock::now(),
11424 0, gather.new_sub());
11425 }
11426
11427 assert(gather.has_subs());
11428 gather.activate();
11429 }
11430
11431 void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11432 {
11433 dout(10) << "fragment_finish " << basedirfrag << "resultfrags.size="
11434 << resultfrags.size() << dendl;
11435 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11436 assert(it != uncommitted_fragments.end());
11437 ufragment &uf = it->second;
11438
11439 // unmark & auth_unpin
11440 for (const auto &dir : resultfrags) {
11441 dir->state_clear(CDir::STATE_FRAGMENTING);
11442 dir->auth_unpin(this);
11443
11444 // In case the resulting fragments are beyond the split size,
11445 // we might need to split them again right away (they could
11446 // have been taking inserts between unfreezing and getting
11447 // here)
11448 mds->balancer->maybe_fragment(dir, false);
11449 }
11450
11451 if (mds->logger) {
11452 if (resultfrags.size() > 1) {
11453 mds->logger->inc(l_mds_dir_split);
11454 } else {
11455 mds->logger->inc(l_mds_dir_merge);
11456 }
11457 }
11458
11459 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11460 mds->mdlog->start_submit_entry(le);
11461
11462 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11463 }
11464
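// Replica-side handler for MMDSFragmentNotify: apply the auth MDS's split
// or merge to our local fragtree and dirfrags, then decode and add the
// freshly replicated result dirfrags carried in the message.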
11465 /* This function DOES put the passed message before returning */
11466 void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11467 {
11468 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11469
11470 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11471 notify->put();
11472 return;
11473 }
11474
11475 CInode *diri = get_inode(notify->get_ino());
11476 if (diri) {
11477 frag_t base = notify->get_basefrag();
11478 int bits = notify->get_bits();
11479
11480 /*
11481 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11482 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11483 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11484 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11485 notify->put();
11486 return;
11487 }
11488 */
11489
11490 // refragment
11491 list<MDSInternalContextBase*> waiters;
11492 list<CDir*> resultfrags;
11493 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11494 if (g_conf->mds_debug_frag)
11495 diri->verify_dirfrags();
11496
11497 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11498 diri->take_dir_waiting((*p)->get_frag(), waiters);
11499
11500 // add new replica dirs values
11501 bufferlist::iterator p = notify->basebl.begin();
11502 while (!p.end())
11503 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11504
11505 mds->queue_waiters(waiters);
11506 } else {
11507 ceph_abort();
11508 }
11509
11510 notify->put();
11511 }
11512
11513 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11514 LogSegment *ls, bufferlist *rollback)
11515 {
11516 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11517 assert(!uncommitted_fragments.count(basedirfrag));
11518 ufragment& uf = uncommitted_fragments[basedirfrag];
11519 uf.old_frags = old_frags;
11520 uf.bits = bits;
11521 uf.ls = ls;
11522 ls->uncommitted_fragments.insert(basedirfrag);
11523 if (rollback)
11524 uf.rollback.swap(*rollback);
11525 }
11526
11527 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11528 {
11529 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11530 << " op " << EFragment::op_name(op) << dendl;
11531 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11532 if (it != uncommitted_fragments.end()) {
11533 ufragment& uf = it->second;
11534 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11535 uf.committed = true;
11536 } else {
11537 uf.ls->uncommitted_fragments.erase(basedirfrag);
11538 mds->queue_waiters(uf.waiters);
11539 uncommitted_fragments.erase(it);
11540 }
11541 }
11542 }
11543
11544 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11545 {
11546 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11547 << " old_frags (" << old_frags << ")" << dendl;
11548 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11549 if (it != uncommitted_fragments.end()) {
11550 ufragment& uf = it->second;
11551 if (!uf.old_frags.empty()) {
11552 uf.old_frags.swap(old_frags);
11553 uf.committed = true;
11554 } else {
11555 uf.ls->uncommitted_fragments.erase(basedirfrag);
11556 uncommitted_fragments.erase(it);
11557 }
11558 }
11559 }
11560
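// Undo every pending fragment operation.  Ones that already committed only
// need their on-disk cleanup redone; the rest have their old dirfrags
// rebuilt from the journaled rollback fnodes (or by reversing the split for
// old-format entries) and an EFragment OP_ROLLBACK is journaled.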
11561 void MDCache::rollback_uncommitted_fragments()
11562 {
11563 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11564 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11565 p != uncommitted_fragments.end();
11566 ++p) {
11567 ufragment &uf = p->second;
11568 CInode *diri = get_inode(p->first.ino);
11569 assert(diri);
11570
11571 if (uf.committed) {
11572 list<CDir*> frags;
11573 diri->get_dirfrags_under(p->first.frag, frags);
11574 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11575 CDir *dir = *q;
11576 dir->auth_pin(this);
11577 dir->state_set(CDir::STATE_FRAGMENTING);
11578 }
11579 _fragment_committed(p->first, frags);
11580 continue;
11581 }
11582
11583 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11584
11585 LogSegment *ls = mds->mdlog->get_current_segment();
11586 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11587 mds->mdlog->start_entry(le);
11588 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11589
11590 list<frag_t> old_frags;
11591 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11592
11593 list<CDir*> resultfrags;
11594 if (uf.old_frags.empty()) {
11595 // created by old format EFragment
11596 list<MDSInternalContextBase*> waiters;
11597 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11598 } else {
11599 bufferlist::iterator bp = uf.rollback.begin();
11600 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11601 CDir *dir = force_dir_fragment(diri, *q);
11602 resultfrags.push_back(dir);
11603
11604 dirfrag_rollback rollback;
11605 ::decode(rollback, bp);
11606
11607 dir->set_version(rollback.fnode.version);
11608 dir->fnode = rollback.fnode;
11609
11610 dir->_mark_dirty(ls);
11611
11612 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11613 dout(10) << " dirty nestinfo on " << *dir << dendl;
11614 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11615 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11616 }
11617 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11618 dout(10) << " dirty fragstat on " << *dir << dendl;
11619 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11620 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11621 }
11622
11623 le->add_orig_frag(dir->get_frag());
11624 le->metablob.add_dir_context(dir);
11625 if (diri_auth) {
11626 le->metablob.add_fragmented_dir(dir, true, false);
11627 } else {
11628 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11629 dir->state_set(CDir::STATE_DIRTYDFT);
11630 le->metablob.add_fragmented_dir(dir, true, true);
11631 }
11632 }
11633 }
11634
11635 if (diri_auth) {
11636 auto &pi = diri->project_inode();
11637 pi.inode.version = diri->pre_dirty();
11638 diri->pop_and_dirty_projected_inode(ls); // hacky
11639 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11640 } else {
11641 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11642 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11643 }
11644
11645 if (g_conf->mds_debug_frag)
11646 diri->verify_dirfrags();
11647
11648 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11649 assert(!diri->dirfragtree.is_leaf(*q));
11650
11651 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11652 CDir *dir = *q;
11653 dir->auth_pin(this);
11654 dir->state_set(CDir::STATE_FRAGMENTING);
11655 }
11656
11657 mds->mdlog->submit_entry(le);
11658
11659 uf.old_frags.swap(old_frags);
11660 _fragment_committed(p->first, resultfrags);
11661 }
11662 }
11663
11664 void MDCache::force_readonly()
11665 {
11666 if (is_readonly())
11667 return;
11668
11669 dout(1) << "force file system read-only" << dendl;
11670 mds->clog->warn() << "force file system read-only";
11671
11672 set_readonly();
11673
11674 mds->server->force_clients_readonly();
11675
11676 // revoke write caps
11677 for (auto &p : inode_map) {
11678 CInode *in = p.second;
11679 if (in->is_head())
11680 mds->locker->eval(in, CEPH_CAP_LOCKS);
11681 }
11682
11683 mds->mdlog->flush();
11684 }
11685
11686
11687 // ==============================================================
11688 // debug crap
11689
11690 void MDCache::show_subtrees(int dbl)
11691 {
11692 if (g_conf->mds_thrash_exports)
11693 dbl += 15;
11694
11695 //dout(10) << "show_subtrees" << dendl;
11696
11697 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11698 return; // i won't print anything.
11699
11700 if (subtrees.empty()) {
11701 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11702 return;
11703 }
11704
11705 // root frags
11706 list<CDir*> basefrags;
11707 for (set<CInode*>::iterator p = base_inodes.begin();
11708 p != base_inodes.end();
11709 ++p)
11710 (*p)->get_dirfrags(basefrags);
11711 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11712 dout(15) << "show_subtrees" << dendl;
11713
11714 // queue stuff
11715 list<pair<CDir*,int> > q;
11716 string indent;
11717 set<CDir*> seen;
11718
11719 // calc max depth
11720 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11721 q.push_back(pair<CDir*,int>(*p, 0));
11722
11723 set<CDir*> subtrees_seen;
11724
11725 int depth = 0;
11726 while (!q.empty()) {
11727 CDir *dir = q.front().first;
11728 int d = q.front().second;
11729 q.pop_front();
11730
11731 if (subtrees.count(dir) == 0) continue;
11732
11733 subtrees_seen.insert(dir);
11734
11735 if (d > depth) depth = d;
11736
11737 // sanity check
11738 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11739 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11740 assert(seen.count(dir) == 0);
11741 seen.insert(dir);
11742
11743 // nested items?
11744 if (!subtrees[dir].empty()) {
11745 for (set<CDir*>::iterator p = subtrees[dir].begin();
11746 p != subtrees[dir].end();
11747 ++p) {
11748 //dout(25) << " saw sub " << **p << dendl;
11749 q.push_front(pair<CDir*,int>(*p, d+1));
11750 }
11751 }
11752 }
11753
11754
11755 // print tree
11756 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11757 q.push_back(pair<CDir*,int>(*p, 0));
11758
11759 while (!q.empty()) {
11760 CDir *dir = q.front().first;
11761 int d = q.front().second;
11762 q.pop_front();
11763
11764 if (subtrees.count(dir) == 0) continue;
11765
11766 // adjust indenter
11767 while ((unsigned)d < indent.size())
11768 indent.resize(d);
11769
11770 // pad
11771 string pad = "______________________________________";
11772 pad.resize(depth*2+1-indent.size());
11773 if (!subtrees[dir].empty())
11774 pad[0] = '.'; // parent
11775
11776
11777 string auth;
11778 if (dir->is_auth())
11779 auth = "auth ";
11780 else
11781 auth = " rep ";
11782
11783 char s[10];
11784 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11785 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11786 else
11787 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11788
11789 // print
11790 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11791
11792 if (dir->ino() == MDS_INO_ROOT)
11793 assert(dir->inode == root);
11794 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11795 assert(dir->inode == myin);
11796 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11797 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11798
11799 // nested items?
11800 if (!subtrees[dir].empty()) {
11801 // more at my level?
11802 if (!q.empty() && q.front().second == d)
11803 indent += "| ";
11804 else
11805 indent += " ";
11806
11807 for (set<CDir*>::iterator p = subtrees[dir].begin();
11808 p != subtrees[dir].end();
11809 ++p)
11810 q.push_front(pair<CDir*,int>(*p, d+2));
11811 }
11812 }
11813
11814 // verify there isn't stray crap in subtree map
11815 int lost = 0;
11816 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11817 p != subtrees.end();
11818 ++p) {
11819 if (subtrees_seen.count(p->first)) continue;
11820 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11821 lost++;
11822 }
11823 assert(lost == 0);
11824 }
11825
11826 void MDCache::show_cache()
11827 {
11828 dout(7) << "show_cache" << dendl;
11829
11830 auto show_func = [this](CInode *in) {
11831 // unlinked?
11832 if (!in->parent)
11833 dout(7) << " unlinked " << *in << dendl;
11834
11835 // dirfrags?
11836 list<CDir*> dfs;
11837 in->get_dirfrags(dfs);
11838 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11839 CDir *dir = *p;
11840 dout(7) << " dirfrag " << *dir << dendl;
11841
11842 for (auto &p : dir->items) {
11843 CDentry *dn = p.second;
11844 dout(7) << " dentry " << *dn << dendl;
11845 CDentry::linkage_t *dnl = dn->get_linkage();
11846 if (dnl->is_primary() && dnl->get_inode())
11847 dout(7) << " inode " << *dnl->get_inode() << dendl;
11848 }
11849 }
11850 };
11851
11852 for (auto &p : inode_map)
11853 show_func(p.second);
11854 for (auto &p : snap_inode_map)
11855 show_func(p.second);
11856 }
11857
11858 int MDCache::cache_status(Formatter *f)
11859 {
11860 f->open_object_section("cache");
11861
11862 f->open_object_section("pool");
11863 mempool::get_pool(mempool::mds_co::id).dump(f);
11864 f->close_section();
11865
11866 f->close_section();
11867 return 0;
11868 }
11869
11870 int MDCache::dump_cache(boost::string_view file_name)
11871 {
11872 return dump_cache(file_name, NULL);
11873 }
11874
11875 int MDCache::dump_cache(Formatter *f)
11876 {
11877 return dump_cache(boost::string_view(""), f);
11878 }
11879
11880 int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
11881 {
11882 return dump_cache(boost::string_view(""), f, dump_root, depth);
11883 }
11884
11885 /**
11886 * Dump the metadata cache, either to a Formatter, if
11887 * provided, else to a plain text file.
11888 */
11889 int MDCache::dump_cache(boost::string_view fn, Formatter *f,
11890 boost::string_view dump_root, int depth)
11891 {
11892 int r = 0;
11893 int fd = -1;
11894
11895 if (f) {
11896 f->open_array_section("inodes");
11897 } else {
11898 char path[PATH_MAX] = "";
11899 if (fn.length()) {
11900 snprintf(path, sizeof path, "%s", fn.data());
11901 } else {
11902 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11903 }
11904
11905 dout(1) << "dump_cache to " << path << dendl;
11906
11907 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL, 0600);
11908 if (fd < 0) {
11909 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
11910 return errno;
11911 }
11912 }
11913
11914 auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
11915 int r;
11916 if (!dump_root.empty()) {
11917 string ipath;
11918 if (in->is_root())
11919 ipath = "/";
11920 else
11921 in->make_path_string(ipath);
11922
11923 if (dump_root.length() > ipath.length() ||
11924 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11925 return 0;
11926
11927 if (depth >= 0 &&
11928 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11929 return 0;
11930 }
11931
11932 if (f) {
11933 f->open_object_section("inode");
11934 in->dump(f);
11935 } else {
11936 ostringstream ss;
11937 ss << *in << std::endl;
11938 std::string s = ss.str();
11939 r = safe_write(fd, s.c_str(), s.length());
11940 if (r < 0)
11941 return r;
11942 }
11943
11944 list<CDir*> dfs;
11945 in->get_dirfrags(dfs);
11946 if (f) {
11947 f->open_array_section("dirfrags");
11948 }
11949 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11950 CDir *dir = *p;
11951 if (f) {
11952 f->open_object_section("dir");
11953 dir->dump(f);
11954 } else {
11955 ostringstream tt;
11956 tt << " " << *dir << std::endl;
11957 string t = tt.str();
11958 r = safe_write(fd, t.c_str(), t.length());
11959 if (r < 0)
11960 return r;
11961 }
11962
11963 if (f) {
11964 f->open_array_section("dentries");
11965 }
11966 for (auto &p : dir->items) {
11967 CDentry *dn = p.second;
11968 if (f) {
11969 f->open_object_section("dentry");
11970 dn->dump(f);
11971 f->close_section();
11972 } else {
11973 ostringstream uu;
11974 uu << " " << *dn << std::endl;
11975 string u = uu.str();
11976 r = safe_write(fd, u.c_str(), u.length());
11977 if (r < 0)
11978 return r;
11979 }
11980 }
11981 if (f) {
11982 f->close_section(); //dentries
11983 }
11984 dir->check_rstats();
11985 if (f) {
11986 f->close_section(); //dir
11987 }
11988 }
11989 if (f) {
11990 f->close_section(); // dirfrags
11991 }
11992
11993 if (f) {
11994 f->close_section(); // inode
11995 }
11996 return 1;
11997 };
11998
11999 for (auto &p : inode_map) {
12000 r = dump_func(p.second);
12001 if (r < 0)
12002 goto out;
12003 }
12004 for (auto &p : snap_inode_map) {
12005 r = dump_func(p.second);
12006 if (r < 0)
12007 goto out;
12008 }
12009 r = 0;
12010
12011 out:
12012 if (f) {
12013 f->close_section(); // inodes
12014 } else {
12015 ::close(fd);
12016 }
12017 return r;
12018 }
12019
12020
12021
12022 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12023 : MDSInternalContext(c->mds), cache(c), mdr(r)
12024 {}
12025
12026 void C_MDS_RetryRequest::finish(int r)
12027 {
12028 mdr->retry++;
12029 cache->dispatch_request(mdr);
12030 }
12031
12032
12033 class C_MDS_EnqueueScrub : public Context
12034 {
12035 Formatter *formatter;
12036 Context *on_finish;
12037 public:
12038 ScrubHeaderRef header;
12039 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12040 formatter(f), on_finish(fin), header(nullptr) {}
12041
12042 Context *take_finisher() {
12043 Context *fin = on_finish;
12044 on_finish = NULL;
12045 return fin;
12046 }
12047
12048 void finish(int r) override {
12049 if (r < 0) { // we failed the lookup or something; dump ourselves
12050 formatter->open_object_section("results");
12051 formatter->dump_int("return_code", r);
12052 formatter->close_section(); // results
12053 }
12054 if (on_finish)
12055 on_finish->complete(r);
12056 }
12057 };
12058
12059 void MDCache::enqueue_scrub(
12060 boost::string_view path,
12061 boost::string_view tag,
12062 bool force, bool recursive, bool repair,
12063 Formatter *f, Context *fin)
12064 {
12065 dout(10) << __func__ << " " << path << dendl;
12066 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12067 filepath fp(path);
12068 mdr->set_filepath(fp);
12069
12070 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12071 cs->header = std::make_shared<ScrubHeader>(
12072 tag, force, recursive, repair, f);
12073
12074 mdr->internal_op_finish = cs;
12075 enqueue_scrub_work(mdr);
12076 }
12077
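// Resolve the path, take rdlocks and push the inode onto the ScrubStack.
// If the scrub made repairs, the journal is flushed when it completes (and,
// for non-recursive scrubs, the caller's completion waits for the affected
// log segments to expire) so that on-disk state reflects the fix.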
12078 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12079 {
12080 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12081 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12082 if (NULL == in)
12083 return;
12084
12085 // TODO: Remove this restriction
12086 assert(in->is_auth());
12087
12088 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12089 if (!locked)
12090 return;
12091
12092 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12093 ScrubHeaderRef &header = cs->header;
12094
12095 // Cannot scrub same dentry twice at same time
12096 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12097 mds->server->respond_to_request(mdr, -EBUSY);
12098 return;
12099 } else {
12100 in->scrub_info();
12101 }
12102
12103 header->set_origin(in);
12104
12105 Context *fin = nullptr;
12106 if (!header->get_recursive()) {
12107 fin = cs->take_finisher();
12108 }
12109
12110 // If the scrub did some repair, then flush the journal at the end of
12111 // the scrub. Otherwise in the case of e.g. rewriting a backtrace
12112 // the on disk state will still look damaged.
12113 auto expiry_fin = new FunctionContext([this, header, fin](int r){
12114 if (header->get_repaired()) {
12115 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12116 mds->mdlog->start_new_segment();
12117 mds->mdlog->trim_all();
12118 if (fin) {
12119 MDSGatherBuilder expiry_gather(g_ceph_context);
12120 const std::set<LogSegment*> &expiring_segments = mds->mdlog->get_expiring_segments();
12121 for (std::set<LogSegment*>::const_iterator i = expiring_segments.begin();
12122 i != expiring_segments.end(); ++i) {
12123 (*i)->wait_for_expiry(expiry_gather.new_sub());
12124 }
12125 expiry_gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12126 expiry_gather.activate();
12127 }
12128 } else {
12129 if (fin) {
12130 fin->complete(r);
12131 }
12132 }
12133 });
12134
12135 if (!header->get_recursive()) {
12136 mds->scrubstack->enqueue_inode_top(in, header,
12137 new MDSInternalContextWrapper(mds,
12138 expiry_fin));
12139 } else {
12140 mds->scrubstack->enqueue_inode_bottom(in, header,
12141 new MDSInternalContextWrapper(mds,
12142 expiry_fin));
12143 }
12144
12145 mds->server->respond_to_request(mdr, 0);
12146 return;
12147 }
12148
12149 struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12150 MDRequestRef mdr;
12151 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12152 MDCacheLogContext(c), mdr(m) {}
12153 void finish(int r) override {
12154 mdr->apply();
12155 get_mds()->server->respond_to_request(mdr, r);
12156 }
12157 };
12158
12159 void MDCache::repair_dirfrag_stats(CDir *dir)
12160 {
12161 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12162 mdr->pin(dir);
12163 mdr->internal_op_private = dir;
12164 mdr->internal_op_finish = new C_MDSInternalNoop;
12165 repair_dirfrag_stats_work(mdr);
12166 }
12167
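// Recompute fragstat and rstat for a single dirfrag by summing over its
// head dentries; if the stored fnode disagrees, journal a corrected fnode
// and mark the parent inode's filelock/nestlock scatterlocks dirty.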
12168 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12169 {
12170 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12171 dout(10) << __func__ << " " << *dir << dendl;
12172
12173 if (!dir->is_auth()) {
12174 mds->server->respond_to_request(mdr, -ESTALE);
12175 return;
12176 }
12177
12178 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12179 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12180
12181 mds->locker->drop_locks(mdr.get());
12182 mdr->drop_local_auth_pins();
12183 if (!mdr->remote_auth_pins.empty())
12184 mds->locker->notify_freeze_waiter(dir);
12185 return;
12186 }
12187
12188 mdr->auth_pin(dir);
12189
12190 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12191 CInode *diri = dir->inode;
12192 rdlocks.insert(&diri->dirfragtreelock);
12193 wrlocks.insert(&diri->nestlock);
12194 wrlocks.insert(&diri->filelock);
12195 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12196 return;
12197
12198 if (!dir->is_complete()) {
12199 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12200 return;
12201 }
12202
12203 frag_info_t frag_info;
12204 nest_info_t nest_info;
12205 for (auto it = dir->begin(); it != dir->end(); ++it) {
12206 CDentry *dn = it->second;
12207 if (dn->last != CEPH_NOSNAP)
12208 continue;
12209 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12210 if (dnl->is_primary()) {
12211 CInode *in = dnl->get_inode();
12212 nest_info.add(in->get_projected_inode()->accounted_rstat);
12213 if (in->is_dir())
12214 frag_info.nsubdirs++;
12215 else
12216 frag_info.nfiles++;
12217 } else if (dnl->is_remote())
12218 frag_info.nfiles++;
12219 }
12220
12221 fnode_t *pf = dir->get_projected_fnode();
12222 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12223 bool good_rstat = nest_info.same_sums(pf->rstat);
12224 if (good_fragstat && good_rstat) {
12225 dout(10) << __func__ << " no corruption found" << dendl;
12226 mds->server->respond_to_request(mdr, 0);
12227 return;
12228 }
12229
12230 pf = dir->project_fnode();
12231 pf->version = dir->pre_dirty();
12232 mdr->add_projected_fnode(dir);
12233
12234 mdr->ls = mds->mdlog->get_current_segment();
12235 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12236 mds->mdlog->start_entry(le);
12237
12238 if (!good_fragstat) {
12239 if (pf->fragstat.mtime > frag_info.mtime)
12240 frag_info.mtime = pf->fragstat.mtime;
12241 if (pf->fragstat.change_attr > frag_info.change_attr)
12242 frag_info.change_attr = pf->fragstat.change_attr;
12243 pf->fragstat = frag_info;
12244 mds->locker->mark_updated_scatterlock(&diri->filelock);
12245 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12246 mdr->add_updated_lock(&diri->filelock);
12247 }
12248
12249 if (!good_rstat) {
12250 if (pf->rstat.rctime > nest_info.rctime)
12251 nest_info.rctime = pf->rstat.rctime;
12252 pf->rstat = nest_info;
12253 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12254 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12255 mdr->add_updated_lock(&diri->nestlock);
12256 }
12257
12258 le->metablob.add_dir_context(dir);
12259 le->metablob.add_dir(dir, true);
12260
12261 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12262 }
12263
12264 void MDCache::repair_inode_stats(CInode *diri)
12265 {
12266 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12267 mdr->pin(diri);
12268 mdr->internal_op_private = diri;
12269 mdr->internal_op_finish = new C_MDSInternalNoop;
12270 repair_inode_stats_work(mdr);
12271 }
12272
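// Repair a directory inode's dirstat/rstat: fetch every dirfrag and dirty
// the inode's filelock and nestlock so the normal scatter-gather process
// propagates corrected sums, then rdlock them to force the gather and
// verify the result.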
12273 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12274 {
12275 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12276 dout(10) << __func__ << " " << *diri << dendl;
12277
12278 if (!diri->is_auth()) {
12279 mds->server->respond_to_request(mdr, -ESTALE);
12280 return;
12281 }
12282 if (!diri->is_dir()) {
12283 mds->server->respond_to_request(mdr, -ENOTDIR);
12284 return;
12285 }
12286
12287 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12288 std::list<frag_t> frags;
12289
12290 if (mdr->ls) // already marked filelock/nestlock dirty ?
12291 goto do_rdlocks;
12292
12293 rdlocks.insert(&diri->dirfragtreelock);
12294 wrlocks.insert(&diri->nestlock);
12295 wrlocks.insert(&diri->filelock);
12296 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12297 return;
12298
12299 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12300 // the scatter-gather process, which will fix any fragstat/rstat errors.
12301 diri->dirfragtree.get_leaves(frags);
12302 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12303 CDir *dir = diri->get_dirfrag(*p);
12304 if (!dir) {
12305 assert(mdr->is_auth_pinned(diri));
12306 dir = diri->get_or_open_dirfrag(this, *p);
12307 }
12308 if (dir->get_version() == 0) {
12309 assert(dir->is_auth());
12310 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12311 return;
12312 }
12313 }
12314
12315 diri->state_set(CInode::STATE_REPAIRSTATS);
12316 mdr->ls = mds->mdlog->get_current_segment();
12317 mds->locker->mark_updated_scatterlock(&diri->filelock);
12318 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12319 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12320 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12321
12322 mds->locker->drop_locks(mdr.get());
12323
12324 do_rdlocks:
12325 // force the scatter-gather process
12326 rdlocks.insert(&diri->dirfragtreelock);
12327 rdlocks.insert(&diri->nestlock);
12328 rdlocks.insert(&diri->filelock);
12329 wrlocks.clear();
12330 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12331 return;
12332
12333 diri->state_clear(CInode::STATE_REPAIRSTATS);
12334
12335 frag_info_t dir_info;
12336 nest_info_t nest_info;
12337 nest_info.rsubdirs++; // the directory itself counts as one rsubdir
12338
12339 diri->dirfragtree.get_leaves(frags);
12340 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12341 CDir *dir = diri->get_dirfrag(*p);
12342 assert(dir);
12343 assert(dir->get_version() > 0);
12344 dir_info.add(dir->fnode.accounted_fragstat);
12345 nest_info.add(dir->fnode.accounted_rstat);
12346 }
12347
12348 if (!dir_info.same_sums(diri->inode.dirstat) ||
12349 !nest_info.same_sums(diri->inode.rstat)) {
12350 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12351 << *diri << dendl;
12352 }
12353
12354 mds->server->respond_to_request(mdr, 0);
12355 }
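/*
 * Illustrative outline (not compiled) of the two-phase repair above:
 *
 *   // phase 1: wrlock filelock/nestlock, make sure every leaf dirfrag is
 *   // open and fetched, mark both scatterlocks dirty, then drop the locks
 *   mds->locker->mark_updated_scatterlock(&diri->filelock);
 *   mds->locker->mark_updated_scatterlock(&diri->nestlock);
 *   mds->locker->drop_locks(mdr.get());
 *
 *   // phase 2 (do_rdlocks): re-acquire rdlocks, which forces scatter-gather
 *   // to fold the per-frag accounted stats back into the inode, then verify
 *   // by summing accounted_fragstat/accounted_rstat over the leaf dirfrags
 */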
12356
12357 void MDCache::flush_dentry(boost::string_view path, Context *fin)
12358 {
12359 if (is_readonly()) {
12360 dout(10) << __func__ << ": read-only FS" << dendl;
12361 fin->complete(-EROFS);
12362 return;
12363 }
12364 dout(10) << "flush_dentry " << path << dendl;
12365 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12366 filepath fp(path);
12367 mdr->set_filepath(fp);
12368 mdr->internal_op_finish = fin;
12369 flush_dentry_work(mdr);
12370 }
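/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * flush_dentry() is asynchronous and the caller owns the completion Context.
 * "mdcache" is assumed to be an MDCache pointer.
 *
 *   mdcache->flush_dentry("/some/path", new FunctionContext([](int r) {
 *     // r is 0 on success, -EROFS if the filesystem is read-only, etc.
 *   }));
 */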
12371
12372 class C_FinishIOMDR : public MDSInternalContextBase {
12373 protected:
12374 MDSRank *mds;
12375 MDRequestRef mdr;
12376 MDSRank *get_mds() override { return mds; }
12377 public:
12378 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12379 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12380 };
12381
12382 void MDCache::flush_dentry_work(MDRequestRef& mdr)
12383 {
12384 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12385 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12386 if (NULL == in)
12387 return;
12388
12389 // TODO: Is this necessary? Fix it if so
12390 assert(in->is_auth());
12391 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12392 if (!locked)
12393 return;
12394 in->flush(new C_FinishIOMDR(mds, mdr));
12395 }
12396
12397
12398 /**
12399 * Initialize performance counters and register them with the
12400 * global perfcounter collection.
12401 */
12402 void MDCache::register_perfcounters()
12403 {
12404 PerfCountersBuilder pcb(g_ceph_context,
12405 "mds_cache", l_mdc_first, l_mdc_last);
12406
12407 /* Stray/purge statistics */
12408 pcb.add_u64(l_mdc_num_strays, "num_strays",
12409 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
12410 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12411 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12412
12413 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12414 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12415 "Stray dentries enqueued for purge");
12416 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12417 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12418
12419
12420 /* Recovery queue statistics */
12421 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12422 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
12423 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
12424 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12425 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12426 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
12427 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
12428
12429 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12430 "Internal Request type enqueue scrub");
12431 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12432 "Internal Request type export dir");
12433 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12434 "Internal Request type flush");
12435 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12436 "Internal Request type fragmentdir");
12437 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12438 "Internal Request type frag stats");
12439 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12440 "Internal Request type inode stats");
12441
12442 logger.reset(pcb.create_perf_counters());
12443 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12444 recovery_queue.set_logger(logger.get());
12445 stray_manager.set_logger(logger.get());
12446 }
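/*
 * Illustrative sketch: after registration, the counters are driven from the
 * rest of MDCache / StrayManager / RecoveryQueue with the usual PerfCounters
 * calls and appear under the "mds_cache" section of a perf dump.
 * "n_strays" below is a hypothetical local.
 *
 *   if (logger) {
 *     logger->inc(l_mdc_strays_created);         // monotonic counter
 *     logger->set(l_mdc_num_strays, n_strays);   // gauge
 *   }
 */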
12447
12448 void MDCache::activate_stray_manager()
12449 {
12450 if (open) {
12451 stray_manager.activate();
12452 } else {
12453 wait_for_open(
12454 new MDSInternalContextWrapper(mds,
12455 new FunctionContext([this](int r){
12456 stray_manager.activate();
12457 })
12458 )
12459 );
12460 }
12461 }
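/*
 * Illustrative sketch: the same wait_for_open() deferral fits any callback
 * that must not run before the cache is open; the wrapper re-enters
 * MDS-internal context once opening completes.  "do_deferred_work" is a
 * hypothetical helper.
 *
 *   wait_for_open(
 *     new MDSInternalContextWrapper(mds,
 *       new FunctionContext([this](int r) {
 *         do_deferred_work();
 *       })));
 */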
12462
12463 /**
12464 * Call this when putting references to an inode/dentry or
12465 * when attempting to trim it.
12466 *
12467 * If this inode is no longer linked by anyone, and this MDS
12468 * rank holds the primary dentry, and that dentry is in a stray
12469 * directory, then give up the dentry to the StrayManager, never
12470 * to be seen again by MDCache.
12471 *
12472 * @param delay if true, then purgeable inodes are stashed until
12473 * the next trim(), rather than being purged right
12474 * away.
12475 */
12476 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
12477 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12478 mds->get_state() <= MDSMap::STATE_REJOIN)
12479 return;
12480
12481 CDentry *dn = in->get_projected_parent_dn();
12482
12483 if (dn->state_test(CDentry::STATE_PURGING)) {
12484 /* We have already entered the purging process, no need
12485 * to re-evaluate it. */
12486 return;
12487 }
12488
12489 if (dn->get_projected_linkage()->is_primary() &&
12490 dn->get_dir()->get_inode()->is_stray()) {
12491 stray_manager.eval_stray(dn, delay);
12492 }
12493 }
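/*
 * Illustrative call-site sketch (hypothetical): callers typically invoke this
 * when dropping a reference to an inode or after trimming/unlinking; the
 * function itself filters out inodes that are still linked, base inodes, and
 * non-stray primaries.
 *
 *   maybe_eval_stray(in, true);   // delay=true: stash until the next trim()
 */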
12494
12495 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12496 dout(10) << __func__ << " " << *diri << dendl;
12497 assert(diri->get_projected_parent_dir()->inode->is_stray());
12498 list<CDir*> ls;
12499 diri->get_dirfrags(ls);
12500 for (auto &p : ls) {
12501 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12502 p->try_remove_dentries_for_stray();
12503 }
12504 if (!diri->snaprealm) {
12505 if (diri->is_auth())
12506 diri->clear_dirty_rstat();
12507 diri->clear_scatter_dirty();
12508 }
12509 }
12510