1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
16#include <fstream>
17#include <iostream>
18#include <sstream>
19#include <string>
 20#include <boost/utility/string_view.hpp>
21#include <map>
22
23#include "MDCache.h"
24#include "MDSRank.h"
25#include "Server.h"
26#include "Locker.h"
27#include "MDLog.h"
28#include "MDBalancer.h"
29#include "Migrator.h"
30#include "ScrubStack.h"
31
32#include "SnapClient.h"
33
34#include "MDSMap.h"
35
36#include "CInode.h"
37#include "CDir.h"
38
39#include "Mutation.h"
40
41#include "include/ceph_fs.h"
42#include "include/filepath.h"
 43#include "include/util.h"
44
45#include "msg/Message.h"
46#include "msg/Messenger.h"
47
 48#include "common/MemoryModel.h"
 49#include "common/errno.h"
 50#include "common/perf_counters.h"
51#include "common/safe_io.h"
52
53#include "osdc/Journaler.h"
54#include "osdc/Filer.h"
55
56#include "events/ESubtreeMap.h"
57#include "events/EUpdate.h"
58#include "events/ESlaveUpdate.h"
59#include "events/EImportFinish.h"
60#include "events/EFragment.h"
61#include "events/ECommitted.h"
62#include "events/ESessions.h"
63
64#include "messages/MGenericMessage.h"
65
66#include "messages/MMDSResolve.h"
67#include "messages/MMDSResolveAck.h"
68#include "messages/MMDSCacheRejoin.h"
69
70#include "messages/MDiscover.h"
71#include "messages/MDiscoverReply.h"
72
73//#include "messages/MInodeUpdate.h"
74#include "messages/MDirUpdate.h"
75#include "messages/MCacheExpire.h"
76
77#include "messages/MInodeFileCaps.h"
78
79#include "messages/MLock.h"
80#include "messages/MDentryLink.h"
81#include "messages/MDentryUnlink.h"
82
83#include "messages/MMDSFindIno.h"
84#include "messages/MMDSFindInoReply.h"
85
86#include "messages/MMDSOpenIno.h"
87#include "messages/MMDSOpenInoReply.h"
88
89#include "messages/MClientRequest.h"
90#include "messages/MClientCaps.h"
91#include "messages/MClientSnap.h"
92#include "messages/MClientQuota.h"
93
94#include "messages/MMDSSlaveRequest.h"
95
96#include "messages/MMDSFragmentNotify.h"
97
98#include "messages/MGatherCaps.h"
99
100#include "InoTable.h"
101
102#include "common/Timer.h"
103
104#include "perfglue/heap_profiler.h"
105
106using namespace std;
107
108#include "common/config.h"
109#include "include/assert.h"
110
111#define dout_context g_ceph_context
112#define dout_subsys ceph_subsys_mds
113#undef dout_prefix
114#define dout_prefix _prefix(_dout, mds)
115static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
116 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
117}
118
119set<int> SimpleLock::empty_gather_set;
120
121
122/**
123 * All non-I/O contexts that require a reference
124 * to an MDCache instance descend from this.
125 */
126class MDCacheContext : public virtual MDSInternalContextBase {
127protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
131 assert(mdcache != NULL);
132 return mdcache->mds;
133 }
134public:
135 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
136};
137
138
139/**
140 * Only for contexts called back from an I/O completion
141 *
142 * Note: duplication of members wrt MDCacheContext, because
 143 * it's the lesser of two evils compared with introducing
144 * yet another piece of (multiple) inheritance.
145 */
146class MDCacheIOContext : public virtual MDSIOContextBase {
147protected:
148 MDCache *mdcache;
149 MDSRank *get_mds() override
150 {
151 assert(mdcache != NULL);
152 return mdcache->mds;
153 }
154public:
155 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
156};
157
158class MDCacheLogContext : public virtual MDSLogContextBase {
159protected:
160 MDCache *mdcache;
161 MDSRank *get_mds() override
162 {
163 assert(mdcache != NULL);
164 return mdcache->mds;
165 }
166public:
167 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
168};
169
170MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
171 mds(m),
172 filer(m->objecter, m->finisher),
173 exceeded_size_limit(false),
174 recovery_queue(m),
175 stray_manager(m, purge_queue_)
176{
177 migrator.reset(new Migrator(mds, this));
178 root = NULL;
179 myin = NULL;
180 readonly = false;
181
182 stray_index = 0;
183 for (int i = 0; i < NUM_STRAY; ++i) {
184 strays[i] = NULL;
185 }
186
 187 num_shadow_inodes = 0;
188 num_inodes_with_caps = 0;
189
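 // cap on a single dirfrag commit: mds_dir_max_commit_size (MiB) if set, else 90% of osd_max_write_size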
190 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
191 (g_conf->mds_dir_max_commit_size << 20) :
192 (0.9 *(g_conf->osd_max_write_size << 20));
193
194 discover_last_tid = 0;
195 open_ino_last_tid = 0;
196 find_ino_peer_last_tid = 0;
197
198 last_cap_id = 0;
199
200 client_lease_durations[0] = 5.0;
201 client_lease_durations[1] = 30.0;
202 client_lease_durations[2] = 300.0;
203
204 resolves_pending = false;
205 rejoins_pending = false;
206 cap_imports_num_opening = 0;
207
208 opening_root = open = false;
 209 lru.lru_set_midpoint(cache_mid());
 210
211 bottom_lru.lru_set_midpoint(0);
212
213 decayrate.set_halflife(g_conf->mds_decay_halflife);
214
215 did_shutdown_log_cap = false;
216}
217
218MDCache::~MDCache()
219{
220 if (logger) {
221 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
222 }
223}
224
225
226
227void MDCache::log_stat()
228{
 229 mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
230 mds->logger->set(l_mds_inodes, lru.lru_get_size());
231 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
232 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
233 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
234 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
235 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
236 mds->logger->set(l_mds_caps, Capability::count());
237}
238
239
240//
241
242bool MDCache::shutdown()
243{
244 if (lru.lru_get_size() > 0) {
245 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
246 //show_cache();
247 show_subtrees();
248 //dump();
249 }
250 return true;
251}
252
253
254// ====================================================================
255// some inode functions
256
257void MDCache::add_inode(CInode *in)
258{
259 // add to lru, inode map
260 if (in->last == CEPH_NOSNAP) {
261 auto &p = inode_map[in->ino()];
262 assert(!p); // should be no dup inos!
263 p = in;
264 } else {
265 auto &p = snap_inode_map[in->vino()];
266 assert(!p); // should be no dup inos!
267 p = in;
268 }
269
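 // remember well-known system inodes (root, this rank's mdsdir, stray dirs) in their dedicated pointers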
270 if (in->ino() < MDS_INO_SYSTEM_BASE) {
271 if (in->ino() == MDS_INO_ROOT)
272 root = in;
273 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
274 myin = in;
275 else if (in->is_stray()) {
276 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
277 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
278 }
279 }
280 if (in->is_base())
281 base_inodes.insert(in);
282 }
283
 284 if (cache_toofull()) {
285 exceeded_size_limit = true;
286 }
287}
288
289void MDCache::remove_inode(CInode *o)
290{
291 dout(14) << "remove_inode " << *o << dendl;
292
293 if (o->get_parent_dn()) {
294 // FIXME: multiple parents?
295 CDentry *dn = o->get_parent_dn();
296 assert(!dn->is_dirty());
297 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
298 }
299
300 if (o->is_dirty())
301 o->mark_clean();
302 if (o->is_dirty_parent())
303 o->clear_dirty_parent();
304
305 o->clear_scatter_dirty();
306
307 o->item_open_file.remove_myself();
308
309 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
310 export_pin_queue.erase(o);
311
312 // remove from inode map
313 if (o->last == CEPH_NOSNAP)
314 inode_map.erase(o->ino());
315 else
316 snap_inode_map.erase(o->vino());
317
318 if (o->ino() < MDS_INO_SYSTEM_BASE) {
319 if (o == root) root = 0;
320 if (o == myin) myin = 0;
321 if (o->is_stray()) {
322 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
323 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
324 }
325 }
326 if (o->is_base())
327 base_inodes.erase(o);
328 }
329
330 // delete it
331 assert(o->get_num_ref() == 0);
332 delete o;
333}
334
335file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
336{
337 file_layout_t result = file_layout_t::get_default();
338 result.pool_id = mdsmap.get_first_data_pool();
339 return result;
340}
341
342file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
343{
344 file_layout_t result = file_layout_t::get_default();
345 result.pool_id = mdsmap.get_metadata_pool();
346 if (g_conf->mds_log_segment_size > 0) {
347 result.object_size = g_conf->mds_log_segment_size;
348 result.stripe_unit = g_conf->mds_log_segment_size;
349 }
350 return result;
351}
352
353void MDCache::init_layouts()
354{
355 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
356 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
357}
358
359void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
360 int mode) const
361{
362 in->inode.ino = ino;
363 in->inode.version = 1;
364 in->inode.xattr_version = 1;
365 in->inode.mode = 0500 | mode;
366 in->inode.size = 0;
367 in->inode.ctime =
368 in->inode.mtime =
369 in->inode.btime = ceph_clock_now();
370 in->inode.nlink = 1;
371 in->inode.truncate_size = -1ull;
372 in->inode.change_attr = 0;
373 in->inode.export_pin = MDS_RANK_NONE;
374
375 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
376 if (in->inode.is_dir()) {
377 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
378 ++in->inode.rstat.rsubdirs;
379 } else {
380 in->inode.layout = default_file_layout;
381 ++in->inode.rstat.rfiles;
382 }
383 in->inode.accounted_rstat = in->inode.rstat;
384
385 if (in->is_base()) {
386 if (in->is_root())
387 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
388 else
389 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
390 in->open_snaprealm(); // empty snaprealm
391 assert(!in->snaprealm->parent); // created its own
392 in->snaprealm->srnode.seq = 1;
393 }
394}
395
396CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
397{
398 dout(0) << "creating system inode with ino:" << ino << dendl;
399 CInode *in = new CInode(this);
400 create_unlinked_system_inode(in, ino, mode);
401 add_inode(in);
402 return in;
403}
404
405CInode *MDCache::create_root_inode()
406{
407 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
408 i->inode.uid = g_conf->mds_root_ino_uid;
409 i->inode.gid = g_conf->mds_root_ino_gid;
410 i->inode.layout = default_file_layout;
411 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
412 return i;
413}
414
415void MDCache::create_empty_hierarchy(MDSGather *gather)
416{
417 // create root dir
418 CInode *root = create_root_inode();
419
420 // force empty root dir
421 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
422 adjust_subtree_auth(rootdir, mds->get_nodeid());
423 rootdir->dir_rep = CDir::REP_ALL; //NONE;
424
425 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
426 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
427
428 root->inode.dirstat = rootdir->fnode.fragstat;
429 root->inode.rstat = rootdir->fnode.rstat;
430 ++root->inode.rstat.rsubdirs;
431 root->inode.accounted_rstat = root->inode.rstat;
432
433 rootdir->mark_complete();
434 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
435 rootdir->commit(0, gather->new_sub());
436
437 root->mark_clean();
438 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
439 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
440 root->flush(gather->new_sub());
441}
442
443void MDCache::create_mydir_hierarchy(MDSGather *gather)
444{
445 // create mds dir
446 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
447
448 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
449 adjust_subtree_auth(mydir, mds->get_nodeid());
450
451 LogSegment *ls = mds->mdlog->get_current_segment();
452
453 // stray dir
454 for (int i = 0; i < NUM_STRAY; ++i) {
455 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
456 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
457 stringstream name;
458 name << "stray" << i;
459 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
460 sdn->_mark_dirty(mds->mdlog->get_current_segment());
461
462 stray->inode.dirstat = straydir->fnode.fragstat;
463
464 mydir->fnode.rstat.add(stray->inode.rstat);
465 mydir->fnode.fragstat.nsubdirs++;
466 // save them
467 straydir->mark_complete();
468 straydir->mark_dirty(straydir->pre_dirty(), ls);
469 straydir->commit(0, gather->new_sub());
 470 stray->mark_dirty_parent(ls, true);
471 stray->store_backtrace(gather->new_sub());
472 }
473
474 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
475 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
476
477 myin->inode.dirstat = mydir->fnode.fragstat;
478 myin->inode.rstat = mydir->fnode.rstat;
479 ++myin->inode.rstat.rsubdirs;
480 myin->inode.accounted_rstat = myin->inode.rstat;
481
482 mydir->mark_complete();
483 mydir->mark_dirty(mydir->pre_dirty(), ls);
484 mydir->commit(0, gather->new_sub());
485
486 myin->store(gather->new_sub());
487}
488
489struct C_MDC_CreateSystemFile : public MDCacheLogContext {
490 MutationRef mut;
491 CDentry *dn;
492 version_t dpv;
493 MDSInternalContextBase *fin;
494 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
495 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
496 void finish(int r) override {
497 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
498 }
499};
500
501void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
502{
503 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
504 CDentry *dn = dir->add_null_dentry(name);
505
506 dn->push_projected_linkage(in);
507 version_t dpv = dn->pre_dirty();
508
509 CDir *mdir = 0;
510 if (in->inode.is_dir()) {
511 in->inode.rstat.rsubdirs = 1;
512
513 mdir = in->get_or_open_dirfrag(this, frag_t());
514 mdir->mark_complete();
515 mdir->pre_dirty();
516 } else
517 in->inode.rstat.rfiles = 1;
518 in->inode.version = dn->pre_dirty();
519
520 SnapRealm *realm = dir->get_inode()->find_snaprealm();
521 dn->first = in->first = realm->get_newest_seq() + 1;
522
523 MutationRef mut(new MutationImpl());
524
525 // force some locks. hacky.
526 mds->locker->wrlock_force(&dir->inode->filelock, mut);
527 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
528
529 mut->ls = mds->mdlog->get_current_segment();
530 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
531 mds->mdlog->start_entry(le);
532
533 if (!in->is_mdsdir()) {
534 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
535 le->metablob.add_primary_dentry(dn, in, true);
536 } else {
537 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
538 journal_dirty_inode(mut.get(), &le->metablob, in);
539 dn->push_projected_linkage(in->ino(), in->d_type());
540 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
541 le->metablob.add_root(true, in);
542 }
543 if (mdir)
544 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
545
546 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
547 mds->mdlog->flush();
548}
549
550void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
551{
552 dout(10) << "_create_system_file_finish " << *dn << dendl;
553
554 dn->pop_projected_linkage();
555 dn->mark_dirty(dpv, mut->ls);
556
557 CInode *in = dn->get_linkage()->get_inode();
558 in->inode.version--;
559 in->mark_dirty(in->inode.version + 1, mut->ls);
560
561 if (in->inode.is_dir()) {
562 CDir *dir = in->get_dirfrag(frag_t());
563 assert(dir);
564 dir->mark_dirty(1, mut->ls);
565 dir->mark_new(mut->ls);
566 }
567
568 mut->apply();
569 mds->locker->drop_locks(mut.get());
570 mut->cleanup();
571
572 fin->complete(0);
573
574 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
575 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
576}
577
578
579
580struct C_MDS_RetryOpenRoot : public MDSInternalContext {
581 MDCache *cache;
582 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
583 void finish(int r) override {
584 if (r < 0) {
585 // If we can't open root, something disastrous has happened: mark
586 // this rank damaged for operator intervention. Note that
587 // it is not okay to call suicide() here because we are in
588 // a Finisher callback.
589 cache->mds->damaged();
590 ceph_abort(); // damaged should never return
591 } else {
592 cache->open_root();
593 }
594 }
595};
596
597void MDCache::open_root_inode(MDSInternalContextBase *c)
598{
599 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
600 CInode *in;
601 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
602 in->fetch(c);
603 } else {
604 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
605 }
606}
607
608void MDCache::open_mydir_inode(MDSInternalContextBase *c)
609{
610 MDSGatherBuilder gather(g_ceph_context);
611
612 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
613 in->fetch(gather.new_sub());
614
615 gather.set_finisher(c);
616 gather.activate();
617}
618
619void MDCache::open_mydir_frag(MDSInternalContextBase *c)
620{
621 open_mydir_inode(
622 new MDSInternalContextWrapper(mds,
623 new FunctionContext([this, c](int r) {
624 if (r < 0) {
625 c->complete(r);
626 return;
627 }
628 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
629 assert(mydir);
630 adjust_subtree_auth(mydir, mds->get_nodeid());
631 mydir->fetch(c);
632 })
633 )
634 );
635}
636
637void MDCache::open_root()
638{
639 dout(10) << "open_root" << dendl;
640
641 if (!root) {
642 open_root_inode(new C_MDS_RetryOpenRoot(this));
643 return;
644 }
645 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
646 assert(root->is_auth());
647 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
648 assert(rootdir);
649 if (!rootdir->is_subtree_root())
650 adjust_subtree_auth(rootdir, mds->get_nodeid());
651 if (!rootdir->is_complete()) {
652 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
653 return;
654 }
655 } else {
656 assert(!root->is_auth());
657 CDir *rootdir = root->get_dirfrag(frag_t());
658 if (!rootdir) {
 659 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
660 return;
661 }
662 }
663
664 if (!myin) {
665 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
666 in->fetch(new C_MDS_RetryOpenRoot(this));
667 return;
668 }
669 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
670 assert(mydir);
671 adjust_subtree_auth(mydir, mds->get_nodeid());
672
673 populate_mydir();
674}
675
676void MDCache::populate_mydir()
677{
678 assert(myin);
679 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
680 assert(mydir);
681
682 dout(10) << "populate_mydir " << *mydir << dendl;
683
684 if (!mydir->is_complete()) {
685 mydir->fetch(new C_MDS_RetryOpenRoot(this));
686 return;
687 }
688
689 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
 690 // A missing dirfrag; we will recreate it. It must be marked dirty before
 691 // we dirty any of the strays we create within it.
692 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
693 "recreating it now";
694 LogSegment *ls = mds->mdlog->get_current_segment();
695 mydir->state_clear(CDir::STATE_BADFRAG);
696 mydir->mark_complete();
697 mydir->mark_dirty(mydir->pre_dirty(), ls);
698 }
699
700 // open or create stray
701 uint64_t num_strays = 0;
702 for (int i = 0; i < NUM_STRAY; ++i) {
703 stringstream name;
704 name << "stray" << i;
705 CDentry *straydn = mydir->lookup(name.str());
706
707 // allow for older fs's with stray instead of stray0
708 if (straydn == NULL && i == 0)
709 straydn = mydir->lookup("stray");
710
711 if (!straydn || !straydn->get_linkage()->get_inode()) {
712 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
713 new C_MDS_RetryOpenRoot(this));
714 return;
715 }
716 assert(straydn);
717 assert(strays[i]);
718 // we make multiple passes through this method; make sure we only pin each stray once.
719 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
720 strays[i]->get(CInode::PIN_STRAY);
721 strays[i]->state_set(CInode::STATE_STRAYPINNED);
722 strays[i]->get_stickydirs();
723 }
724 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
725
726 // open all frags
727 list<frag_t> ls;
728 strays[i]->dirfragtree.get_leaves(ls);
729 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
730 frag_t fg = *p;
731 CDir *dir = strays[i]->get_dirfrag(fg);
732 if (!dir) {
733 dir = strays[i]->get_or_open_dirfrag(this, fg);
734 }
735
736 // DamageTable applies special handling to strays: it will
737 // have damaged() us out if one is damaged.
738 assert(!dir->state_test(CDir::STATE_BADFRAG));
739
740 if (dir->get_version() == 0) {
741 dir->fetch(new C_MDS_RetryOpenRoot(this));
742 return;
743 }
744
745 if (dir->get_frag_size() > 0)
746 num_strays += dir->get_frag_size();
747 }
748 }
749
750 stray_manager.set_num_strays(num_strays);
751
752 // okay!
753 dout(10) << "populate_mydir done" << dendl;
754 assert(!open);
755 open = true;
756 mds->queue_waiters(waiting_for_open);
757
758 scan_stray_dir();
759}
760
761void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
762{
763 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
764}
765
766CDir *MDCache::get_stray_dir(CInode *in)
767{
768 string straydname;
769 in->name_stray_dentry(straydname);
770
771 CInode *strayi = get_stray();
772 assert(strayi);
773 frag_t fg = strayi->pick_dirfrag(straydname);
774 CDir *straydir = strayi->get_dirfrag(fg);
775 assert(straydir);
776 return straydir;
777}
778
779CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
780{
781 CDir *straydir = get_stray_dir(in);
782 string straydname;
783 in->name_stray_dentry(straydname);
784 CDentry *straydn = straydir->lookup(straydname);
785 if (!straydn) {
786 straydn = straydir->add_null_dentry(straydname);
787 straydn->mark_new();
788 } else {
789 assert(straydn->get_projected_linkage()->is_null());
790 }
791
792 straydn->state_set(CDentry::STATE_STRAY);
793 return straydn;
794}
795
796
797
798MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
799{
800 // inode?
801 if (info.ino)
802 return get_inode(info.ino, info.snapid);
803
804 // dir or dentry.
805 CDir *dir = get_dirfrag(info.dirfrag);
806 if (!dir) return 0;
807
808 if (info.dname.length())
809 return dir->lookup(info.dname, info.snapid);
810 else
811 return dir;
812}
813
814
815
816
817// ====================================================================
818// subtree management
819
820void MDCache::list_subtrees(list<CDir*>& ls)
821{
822 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
823 p != subtrees.end();
824 ++p)
825 ls.push_back(p->first);
826}
827
828/*
829 * adjust the dir_auth of a subtree.
 830 * merge with parent and/or child subtrees, if it is appropriate.
831 * merge can ONLY happen if both parent and child have unambiguous auth.
832 */
 833void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
834{
835 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
836 << " on " << *dir << dendl;
837
838 show_subtrees();
839
840 CDir *root;
841 if (dir->inode->is_base()) {
842 root = dir; // bootstrap hack.
843 if (subtrees.count(root) == 0) {
844 subtrees[root];
845 root->get(CDir::PIN_SUBTREE);
846 }
847 } else {
848 root = get_subtree_root(dir); // subtree root
849 }
850 assert(root);
851 assert(subtrees.count(root));
852 dout(7) << " current root is " << *root << dendl;
853
854 if (root == dir) {
855 // i am already a subtree.
856 dir->set_dir_auth(auth);
857 } else {
858 // i am a new subtree.
859 dout(10) << " new subtree at " << *dir << dendl;
860 assert(subtrees.count(dir) == 0);
861 subtrees[dir]; // create empty subtree bounds list for me.
862 dir->get(CDir::PIN_SUBTREE);
863
864 // set dir_auth
865 dir->set_dir_auth(auth);
866
867 // move items nested beneath me, under me.
868 set<CDir*>::iterator p = subtrees[root].begin();
869 while (p != subtrees[root].end()) {
870 set<CDir*>::iterator next = p;
871 ++next;
872 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
873 // move under me
874 dout(10) << " claiming child bound " << **p << dendl;
875 subtrees[dir].insert(*p);
876 subtrees[root].erase(p);
877 }
878 p = next;
879 }
880
881 // i am a bound of the parent subtree.
882 subtrees[root].insert(dir);
883
884 // i am now the subtree root.
885 root = dir;
886
887 // adjust recursive pop counters
 888 if (adjust_pop && dir->is_auth()) {
889 utime_t now = ceph_clock_now();
890 CDir *p = dir->get_parent_dir();
891 while (p) {
892 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
893 if (p->is_subtree_root()) break;
894 p = p->inode->get_parent_dir();
895 }
896 }
897 }
898
899 show_subtrees();
900}
901
902
903void MDCache::try_subtree_merge(CDir *dir)
904{
905 dout(7) << "try_subtree_merge " << *dir << dendl;
906 // record my old bounds
907 auto oldbounds = subtrees.at(dir);
 908
 909 set<CInode*> to_eval;
 910 // try merge at my root
 911 try_subtree_merge_at(dir, &to_eval);
912
913 // try merge at my old bounds
914 for (auto bound : oldbounds)
915 try_subtree_merge_at(bound, &to_eval);
916
917 if (!(mds->is_any_replay() || mds->is_resolve())) {
918 for(auto in : to_eval)
919 eval_subtree_root(in);
920 }
921}
922
923class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
924 CInode *in;
925 MutationRef mut;
926public:
927 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
928 void finish(int r) override {
929 mdcache->subtree_merge_writebehind_finish(in, mut);
930 }
931};
932
 933void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
934{
935 dout(10) << "try_subtree_merge_at " << *dir << dendl;
936
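 // cannot merge if this dir's auth is still ambiguous, or if it is an export bound or auxiliary subtree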
937 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
938 dir->state_test(CDir::STATE_EXPORTBOUND) ||
939 dir->state_test(CDir::STATE_AUXSUBTREE))
940 return;
941
942 auto it = subtrees.find(dir);
943 assert(it != subtrees.end());
 944
945 // merge with parent?
946 CDir *parent = dir;
947 if (!dir->inode->is_base())
948 parent = get_subtree_root(dir->get_parent_dir());
949
950 if (parent != dir && // we have a parent,
951 parent->dir_auth == dir->dir_auth) { // auth matches,
952 // merge with parent.
953 dout(10) << " subtree merge at " << *dir << dendl;
954 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
955
956 // move our bounds under the parent
 957 subtrees[parent].insert(it->second.begin(), it->second.end());
958
959 // we are no longer a subtree or bound
960 dir->put(CDir::PIN_SUBTREE);
 961 subtrees.erase(it);
962 subtrees[parent].erase(dir);
963
964 // adjust popularity?
 965 if (adjust_pop && dir->is_auth()) {
 966 utime_t now = ceph_clock_now();
 967 CDir *cur = dir;
968 CDir *p = dir->get_parent_dir();
969 while (p) {
970 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
 971 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
 972 if (p->is_subtree_root()) break;
 973 cur = p;
974 p = p->inode->get_parent_dir();
975 }
976 }
977
978 if (to_eval && dir->get_inode()->is_auth())
979 to_eval->insert(dir->get_inode());
 980
981 show_subtrees(15);
982 }
983}
984
985void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
986{
987 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
988 in->pop_and_dirty_projected_inode(mut->ls);
989
990 mut->apply();
991 mds->locker->drop_locks(mut.get());
992 mut->cleanup();
993
994 in->auth_unpin(this);
995}
996
997void MDCache::eval_subtree_root(CInode *diri)
998{
999 // evaluate subtree inode filelock?
1000 // (we should scatter the filelock on subtree bounds)
1001 assert(diri->is_auth());
1002 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1003}
1004
1005
1006void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
1007{
1008 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1009 << " on " << *dir
1010 << " bounds " << bounds
1011 << dendl;
1012
1013 show_subtrees();
1014
1015 CDir *root;
1016 if (dir->ino() == MDS_INO_ROOT) {
1017 root = dir; // bootstrap hack.
1018 if (subtrees.count(root) == 0) {
1019 subtrees[root];
1020 root->get(CDir::PIN_SUBTREE);
1021 }
1022 } else {
1023 root = get_subtree_root(dir); // subtree root
1024 }
1025 assert(root);
1026 assert(subtrees.count(root));
1027 dout(7) << " current root is " << *root << dendl;
1028
1029 mds_authority_t oldauth = dir->authority();
1030
1031 if (root == dir) {
1032 // i am already a subtree.
1033 dir->set_dir_auth(auth);
1034 } else {
1035 // i am a new subtree.
1036 dout(10) << " new subtree at " << *dir << dendl;
1037 assert(subtrees.count(dir) == 0);
1038 subtrees[dir]; // create empty subtree bounds list for me.
1039 dir->get(CDir::PIN_SUBTREE);
1040
1041 // set dir_auth
1042 dir->set_dir_auth(auth);
1043
1044 // move items nested beneath me, under me.
1045 set<CDir*>::iterator p = subtrees[root].begin();
1046 while (p != subtrees[root].end()) {
1047 set<CDir*>::iterator next = p;
1048 ++next;
1049 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1050 // move under me
1051 dout(10) << " claiming child bound " << **p << dendl;
1052 subtrees[dir].insert(*p);
1053 subtrees[root].erase(p);
1054 }
1055 p = next;
1056 }
1057
1058 // i am a bound of the parent subtree.
1059 subtrees[root].insert(dir);
1060
1061 // i am now the subtree root.
1062 root = dir;
1063 }
1064
1065 set<CInode*> to_eval;
1066
1067 // verify/adjust bounds.
1068 // - these may be new, or
1069 // - beneath existing ambiguous bounds (which will be collapsed),
1070 // - but NOT beneath unambiguous bounds.
1071 for (set<CDir*>::iterator p = bounds.begin();
1072 p != bounds.end();
1073 ++p) {
1074 CDir *bound = *p;
1075
1076 // new bound?
1077 if (subtrees[dir].count(bound) == 0) {
1078 if (get_subtree_root(bound) == dir) {
1079 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1080 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1081 }
1082 else {
1083 dout(10) << " want bound " << *bound << dendl;
1084 CDir *t = get_subtree_root(bound->get_parent_dir());
1085 if (subtrees[t].count(bound) == 0) {
1086 assert(t != dir);
1087 dout(10) << " new bound " << *bound << dendl;
1088 adjust_subtree_auth(bound, t->authority());
1089 }
1090 // make sure it's nested beneath ambiguous subtree(s)
1091 while (1) {
1092 while (subtrees[dir].count(t) == 0)
1093 t = get_subtree_root(t->get_parent_dir());
1094 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1095 adjust_subtree_auth(t, auth);
 1096 try_subtree_merge_at(t, &to_eval);
1097 t = get_subtree_root(bound->get_parent_dir());
1098 if (t == dir) break;
1099 }
1100 }
1101 }
1102 else {
1103 dout(10) << " already have bound " << *bound << dendl;
1104 }
1105 }
1106 // merge stray bounds?
1107 while (!subtrees[dir].empty()) {
1108 set<CDir*> copy = subtrees[dir];
1109 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1110 if (bounds.count(*p) == 0) {
1111 CDir *stray = *p;
1112 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1113 adjust_subtree_auth(stray, auth);
 1114 try_subtree_merge_at(stray, &to_eval);
1115 }
1116 }
1117 // swallowing subtree may add new subtree bounds
1118 if (copy == subtrees[dir])
1119 break;
1120 }
1121
1122 // bound should now match.
1123 verify_subtree_bounds(dir, bounds);
1124
1125 show_subtrees();
1126
1127 if (!(mds->is_any_replay() || mds->is_resolve())) {
1128 for(auto in : to_eval)
1129 eval_subtree_root(in);
1130 }
1131}
1132
1133
1134/*
1135 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1136 * fragmentation as necessary to get an equivalent bounding set. That is, only
1137 * split if one of our frags spans the provided bounding set. Never merge.
1138 */
1139void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1140{
1141 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1142
1143 // sort by ino
1144 map<inodeno_t, fragset_t> byino;
1145 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1146 byino[p->ino].insert(p->frag);
1147 dout(10) << " by ino: " << byino << dendl;
1148
1149 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1150 CInode *diri = get_inode(p->first);
1151 if (!diri)
1152 continue;
1153 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1154
1155 fragtree_t tmpdft;
1156 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1157 tmpdft.force_to_leaf(g_ceph_context, *q);
1158
1159 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1160 frag_t fg = *q;
1161 list<frag_t> fgls;
1162 diri->dirfragtree.get_leaves_under(fg, fgls);
1163 if (fgls.empty()) {
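 // no local leaf under fg: one of our coarser frags spans the requested bound, so split it as needed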
1164 bool all = true;
1165 frag_t approx_fg = diri->dirfragtree[fg.value()];
1166 list<frag_t> ls;
1167 tmpdft.get_leaves_under(approx_fg, ls);
1168 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1169 if (p->second.get().count(*r) == 0) {
1170 // not bound, so the resolve message is from auth MDS of the dirfrag
1171 force_dir_fragment(diri, *r);
1172 all = false;
1173 }
1174 }
1175 if (all)
1176 fgls.push_back(approx_fg);
1177 else
1178 diri->dirfragtree.get_leaves_under(fg, fgls);
1179 }
1180 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1181 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1182 CDir *dir = diri->get_dirfrag(*r);
1183 if (dir)
1184 bounds.insert(dir);
1185 }
1186 }
1187 }
1188}
1189
1190void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1191{
1192 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1193 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1194
1195 set<CDir*> bounds;
1196 get_force_dirfrag_bound_set(bound_dfs, bounds);
1197 adjust_bounded_subtree_auth(dir, bounds, auth);
1198}
1199
1200void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1201{
1202 dout(10) << "map_dirfrag_set " << dfs << dendl;
1203
1204 // group by inode
1205 map<inodeno_t, fragset_t> ino_fragset;
1206 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1207 ino_fragset[p->ino].insert(p->frag);
1208
1209 // get frags
1210 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1211 p != ino_fragset.end();
1212 ++p) {
1213 CInode *in = get_inode(p->first);
1214 if (!in)
1215 continue;
1216
1217 list<frag_t> fglist;
1218 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1219 in->dirfragtree.get_leaves_under(*q, fglist);
1220
1221 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1222 << " on " << *in << dendl;
1223
1224 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1225 CDir *dir = in->get_dirfrag(*q);
1226 if (dir)
1227 result.insert(dir);
1228 }
1229 }
1230}
1231
1232
1233
1234CDir *MDCache::get_subtree_root(CDir *dir)
1235{
1236 // find the underlying dir that delegates (or is about to delegate) auth
1237 while (true) {
1238 if (dir->is_subtree_root())
1239 return dir;
1240 dir = dir->get_inode()->get_parent_dir();
1241 if (!dir)
1242 return 0; // none
1243 }
1244}
1245
1246CDir *MDCache::get_projected_subtree_root(CDir *dir)
1247{
1248 // find the underlying dir that delegates (or is about to delegate) auth
1249 while (true) {
1250 if (dir->is_subtree_root())
1251 return dir;
1252 dir = dir->get_inode()->get_projected_parent_dir();
1253 if (!dir)
1254 return 0; // none
1255 }
1256}
1257
1258void MDCache::remove_subtree(CDir *dir)
1259{
1260 dout(10) << "remove_subtree " << *dir << dendl;
1261 assert(subtrees.count(dir));
1262 assert(subtrees[dir].empty());
1263 subtrees.erase(dir);
1264 dir->put(CDir::PIN_SUBTREE);
1265 if (dir->get_parent_dir()) {
1266 CDir *p = get_subtree_root(dir->get_parent_dir());
1267 assert(subtrees[p].count(dir));
1268 subtrees[p].erase(dir);
1269 }
1270}
1271
1272void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1273{
1274 assert(subtrees.count(dir));
1275 bounds = subtrees[dir];
1276}
1277
1278void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1279{
1280 if (subtrees.count(dir)) {
1281 // just copy them, dir is a subtree.
1282 get_subtree_bounds(dir, bounds);
1283 } else {
1284 // find them
1285 CDir *root = get_subtree_root(dir);
1286 for (set<CDir*>::iterator p = subtrees[root].begin();
1287 p != subtrees[root].end();
1288 ++p) {
1289 CDir *t = *p;
1290 while (t != root) {
1291 t = t->get_parent_dir();
1292 assert(t);
1293 if (t == dir) {
1294 bounds.insert(*p);
1295 continue;
1296 }
1297 }
1298 }
1299 }
1300}
1301
1302void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1303{
1304 // for debugging only.
1305 assert(subtrees.count(dir));
1306 if (bounds != subtrees[dir]) {
1307 dout(0) << "verify_subtree_bounds failed" << dendl;
1308 set<CDir*> b = bounds;
1309 for (auto &cd : subtrees[dir]) {
1310 if (bounds.count(cd)) {
1311 b.erase(cd);
1312 continue;
1313 }
1314 dout(0) << " missing bound " << *cd << dendl;
1315 }
1316 for (const auto &cd : b)
1317 dout(0) << " extra bound " << *cd << dendl;
1318 }
1319 assert(bounds == subtrees[dir]);
1320}
1321
1322void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1323{
1324 // for debugging only.
1325 assert(subtrees.count(dir));
1326
1327 // make sure that any bounds i do have are properly noted as such.
1328 int failed = 0;
1329 for (const auto &fg : bounds) {
1330 CDir *bd = get_dirfrag(fg);
1331 if (!bd) continue;
1332 if (subtrees[dir].count(bd) == 0) {
1333 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1334 failed++;
1335 }
1336 }
1337 assert(failed == 0);
1338}
1339
1340void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1341{
1342 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1343 << " to " << *newdir << dendl;
1344 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1345}
1346
 1347void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1348{
1349 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1350
1351 //show_subtrees();
 1352 utime_t now = ceph_clock_now();
1353
1354 CDir *newdir = diri->get_parent_dir();
1355
1356 if (pop) {
1357 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1358 assert(p != projected_subtree_renames.end());
1359 assert(!p->second.empty());
1360 assert(p->second.front().first == olddir);
1361 assert(p->second.front().second == newdir);
1362 p->second.pop_front();
1363 if (p->second.empty())
1364 projected_subtree_renames.erase(p);
1365 }
1366
1367 // adjust subtree
1368 list<CDir*> dfls;
1369 // make sure subtree dirfrags are at the front of the list
1370 diri->get_subtree_dirfrags(dfls);
1371 diri->get_nested_dirfrags(dfls);
1372 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1373 CDir *dir = *p;
1374
1375 dout(10) << "dirfrag " << *dir << dendl;
1376 CDir *oldparent = get_subtree_root(olddir);
1377 dout(10) << " old parent " << *oldparent << dendl;
1378 CDir *newparent = get_subtree_root(newdir);
1379 dout(10) << " new parent " << *newparent << dendl;
1380
1381 if (olddir != newdir)
1382 mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
1383
1384 if (oldparent == newparent) {
1385 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
 1386 } else if (dir->is_subtree_root()) {
1387 // children are fine. change parent.
1388 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1389 assert(subtrees[oldparent].count(dir));
1390 subtrees[oldparent].erase(dir);
1391 assert(subtrees.count(newparent));
1392 subtrees[newparent].insert(dir);
 1393 // caller is responsible for 'eval diri'
 1394 try_subtree_merge_at(dir, NULL, false);
1395 } else {
1396 // mid-subtree.
1397
1398 // see if any old bounds move to the new parent.
1399 list<CDir*> tomove;
1400 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1401 p != subtrees[oldparent].end();
1402 ++p) {
1403 CDir *bound = *p;
1404 CDir *broot = get_subtree_root(bound->get_parent_dir());
1405 if (broot != oldparent) {
1406 assert(broot == newparent);
1407 tomove.push_back(bound);
1408 }
1409 }
1410 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1411 CDir *bound = *p;
1412 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1413 subtrees[oldparent].erase(bound);
1414 subtrees[newparent].insert(bound);
1415 }
1416
1417 // did auth change?
1418 if (oldparent->authority() != newparent->authority()) {
 1419 adjust_subtree_auth(dir, oldparent->authority(), false);
 1420 // caller is responsible for 'eval diri'
 1421 try_subtree_merge_at(dir, NULL, false);
1422 }
1423 }
1424
1425 if (olddir != newdir)
1426 mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
1427 }
1428
1429 show_subtrees();
1430}
1431
1432
1433void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1434{
1435 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1436 p != subtrees.end();
1437 ++p) {
1438 CDir *root = p->first;
1439 if (root->is_full_dir_auth())
1440 s.insert(root);
1441 }
1442}
1443void MDCache::get_auth_subtrees(set<CDir*>& s)
1444{
1445 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1446 p != subtrees.end();
1447 ++p) {
1448 CDir *root = p->first;
1449 if (root->is_auth())
1450 s.insert(root);
1451 }
1452}
1453
1454
1455// count.
1456
1457int MDCache::num_subtrees()
1458{
1459 return subtrees.size();
1460}
1461
1462int MDCache::num_subtrees_fullauth()
1463{
1464 int n = 0;
1465 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1466 p != subtrees.end();
1467 ++p) {
1468 CDir *root = p->first;
1469 if (root->is_full_dir_auth())
1470 n++;
1471 }
1472 return n;
1473}
1474
1475int MDCache::num_subtrees_fullnonauth()
1476{
1477 int n = 0;
1478 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1479 p != subtrees.end();
1480 ++p) {
1481 CDir *root = p->first;
1482 if (root->is_full_dir_nonauth())
1483 n++;
1484 }
1485 return n;
1486}
1487
1488
1489
1490// ===================================
1491// journal and snap/cow helpers
1492
1493
1494/*
1495 * find first inode in cache that follows given snapid. otherwise, return current.
1496 */
1497CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1498{
1499 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1500 assert(in->last == CEPH_NOSNAP);
1501
1502 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1503 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1504 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1505 in = p->second;
 1506 }
 1507
1508 return in;
1509}
1510
1511
1512/*
1513 * note: i'm currently cheating wrt dirty and inode.version on cow
1514 * items. instead of doing a full dir predirty, i just take the
1515 * original item's version, and set the dirty flag (via
 1516 * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1517 * means a special case in the dir commit clean sweep assertions.
1518 * bah.
1519 */
1520CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1521{
1522 assert(last >= in->first);
1523
 1524 CInode *oldin = new CInode(this, true, in->first, last);
1525 oldin->inode = *in->get_previous_projected_inode();
1526 oldin->symlink = in->symlink;
1527 oldin->xattrs = *in->get_previous_projected_xattrs();
1528 oldin->inode.trim_client_ranges(last);
1529
1530 if (in->first < in->oldest_snap)
1531 in->oldest_snap = in->first;
1532
1533 in->first = last+1;
1534
1535 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1536 add_inode(oldin);
1537
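 // cowing a snapped (non-head) inode: split pending snapflush state onto the older inode and hold gathering wrlocks on it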
1538 if (in->last != CEPH_NOSNAP) {
1539 CInode *head_in = get_inode(in->ino());
1540 assert(head_in);
1541 if (head_in->split_need_snapflush(oldin, in)) {
1542 oldin->client_snap_caps = in->client_snap_caps;
1543 for (const auto &p : in->client_snap_caps) {
1544 SimpleLock *lock = oldin->get_lock(p.first);
 1545 assert(lock);
 1546 for (const auto &q : p.second) {
1547 oldin->auth_pin(lock);
1548 lock->set_state(LOCK_SNAP_SYNC); // gathering
1549 lock->get_wrlock(true);
 1550 (void)q; /* unused */
1551 }
1552 }
1553 }
1554 return oldin;
1555 }
1556
1557 if (!in->client_caps.empty()) {
1558 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1559 // clone caps?
 1560 for (auto &p : in->client_caps) {
1561 client_t client = p.first;
1562 Capability *cap = p.second;
1563 int issued = cap->issued();
1564 if ((issued & CEPH_CAP_ANY_WR) &&
1565 cap->client_follows < last) {
1566 // note in oldin
1567 for (int i = 0; i < num_cinode_locks; i++) {
1568 if (issued & cinode_lock_info[i].wr_caps) {
1569 int lockid = cinode_lock_info[i].lock;
1570 SimpleLock *lock = oldin->get_lock(lockid);
1571 assert(lock);
1572 oldin->client_snap_caps[lockid].insert(client);
1573 oldin->auth_pin(lock);
1574 lock->set_state(LOCK_SNAP_SYNC); // gathering
1575 lock->get_wrlock(true);
1576 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1577 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1578 }
 1579 }
1580 cap->client_follows = last;
1581
1582 // we need snapflushes for any intervening snaps
1583 dout(10) << " snaps " << snaps << dendl;
1584 for (auto q = snaps.lower_bound(oldin->first);
1585 q != snaps.end() && *q <= last;
1586 ++q) {
1587 in->add_need_snapflush(oldin, *q, client);
1588 }
1589 } else {
1590 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
 1591 }
1592 }
1593 }
1594 return oldin;
1595}
1596
1597void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1598 CDentry *dn, snapid_t follows,
1599 CInode **pcow_inode, CDentry::linkage_t *dnl)
1600{
1601 if (!dn) {
1602 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1603 return;
1604 }
1605 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1606 assert(dn->is_auth());
1607
1608 // nothing to cow on a null dentry, fix caller
1609 if (!dnl)
1610 dnl = dn->get_projected_linkage();
1611 assert(!dnl->is_null());
1612
1613 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1614 // multiversion inode.
1615 CInode *in = dnl->get_inode();
1616 SnapRealm *realm = NULL;
1617
1618 if (in->get_projected_parent_dn() != dn) {
1619 assert(follows == CEPH_NOSNAP);
1620 realm = dn->dir->inode->find_snaprealm();
1621 snapid_t dir_follows = realm->get_newest_snap();
1622
1623 if (dir_follows+1 > dn->first) {
1624 snapid_t oldfirst = dn->first;
1625 dn->first = dir_follows+1;
1626 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
 1627 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
1628 oldfirst, dir_follows);
1629 olddn->pre_dirty();
1630 dout(10) << " olddn " << *olddn << dendl;
1631 metablob->add_remote_dentry(olddn, true);
1632 mut->add_cow_dentry(olddn);
1633 // FIXME: adjust link count here? hmm.
1634
1635 if (dir_follows+1 > in->first)
1636 in->cow_old_inode(dir_follows, false);
1637 }
1638 }
1639
1640 if (in->snaprealm) {
1641 realm = in->snaprealm;
1642 follows = realm->get_newest_seq();
1643 } else
1644 follows = dir_follows;
1645 } else {
1646 realm = in->find_snaprealm();
1647 if (follows == CEPH_NOSNAP)
1648 follows = realm->get_newest_seq();
1649 }
1650
1651 // already cloned?
1652 if (follows < in->first) {
1653 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1654 return;
1655 }
1656
1657 if (!realm->has_snaps_in_range(in->first, follows)) {
1658 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1659 in->first = follows + 1;
1660 return;
1661 }
1662
1663 in->cow_old_inode(follows, false);
1664
1665 } else {
1666 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1667 if (follows == CEPH_NOSNAP)
1668 follows = realm->get_newest_seq();
1669
1670 // already cloned?
1671 if (follows < dn->first) {
1672 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1673 return;
1674 }
1675
1676 // update dn.first before adding old dentry to cdir's map
1677 snapid_t oldfirst = dn->first;
1678 dn->first = follows+1;
1679
1680 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1681
1682 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1683 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1684 if (in)
1685 in->first = follows+1;
1686 return;
1687 }
1688
1689 dout(10) << " dn " << *dn << dendl;
1690 if (in) {
1691 CInode *oldin = cow_inode(in, follows);
1692 mut->add_cow_inode(oldin);
1693 if (pcow_inode)
1694 *pcow_inode = oldin;
 1695 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
1696 oldin->inode.version = olddn->pre_dirty();
1697 dout(10) << " olddn " << *olddn << dendl;
1698 bool need_snapflush = !oldin->client_snap_caps.empty();
1699 if (need_snapflush)
1700 mut->ls->open_files.push_back(&oldin->item_open_file);
1701 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1702 mut->add_cow_dentry(olddn);
1703 } else {
1704 assert(dnl->is_remote());
 1705 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
1706 oldfirst, follows);
1707 olddn->pre_dirty();
1708 dout(10) << " olddn " << *olddn << dendl;
1709 metablob->add_remote_dentry(olddn, true);
1710 mut->add_cow_dentry(olddn);
1711 }
1712 }
1713}
1714
1715
1716void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1717 CInode *in, snapid_t follows,
1718 CInode **pcow_inode)
1719{
1720 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1721 CDentry *dn = in->get_projected_parent_dn();
1722 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1723}
1724
1725void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1726{
1727 if (in->is_base()) {
1728 metablob->add_root(true, in, in->get_projected_inode());
1729 } else {
1730 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1731 follows = in->first - 1;
1732 CDentry *dn = in->get_projected_parent_dn();
1733 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1734 journal_cow_dentry(mut, metablob, dn, follows);
1735 if (in->get_projected_inode()->is_backtrace_updated()) {
1736 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1737 in->get_previous_projected_inode()->layout.pool_id;
1738 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1739 } else {
1740 metablob->add_primary_dentry(dn, in, true);
1741 }
1742 }
1743}
1744
1745
1746
1747// nested ---------------------------------------------------------------
1748
1749void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1750 int linkunlink, SnapRealm *prealm)
1751{
1752 CDentry *parentdn = cur->get_projected_parent_dn();
 1753 CInode::mempool_inode *curi = cur->get_projected_inode();
1754
1755 if (cur->first > first)
1756 first = cur->first;
1757
 1758 dout(10) << "project_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1759 << " " << *cur << dendl;
1760 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1761 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1762
1763 /*
1764 * FIXME. this incompletely propagates rstats to _old_ parents
1765 * (i.e. shortly after a directory rename). but we need full
1766 * blown hard link backpointers to make this work properly...
1767 */
1768 snapid_t floor = parentdn->first;
1769 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1770
1771 if (!prealm)
1772 prealm = parent->inode->find_snaprealm();
1773 const set<snapid_t> snaps = prealm->get_snaps();
1774
1775 if (cur->last != CEPH_NOSNAP) {
1776 assert(cur->dirty_old_rstats.empty());
1777 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1778 if (q == snaps.end() || *q > cur->last)
1779 return;
1780 }
1781
1782 if (cur->last >= floor) {
1783 bool update = true;
1784 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
 1785 // the rename src inode is not projected in the slave rename prep case, so we should
 1786 // avoid updating the inode.
1787 assert(linkunlink < 0);
1788 assert(cur->is_frozen_inode());
1789 update = false;
1790 }
1791 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1792 linkunlink, update);
1793 }
1794
1795 if (g_conf->mds_snap_rstat) {
1796 for (const auto &p : cur->dirty_old_rstats) {
1797 auto &old = cur->old_inodes[p];
1798 snapid_t ofirst = std::max(old.first, floor);
1799 auto it = snaps.lower_bound(ofirst);
1800 if (it == snaps.end() || *it > p)
 1801 continue;
1802 if (p >= floor)
1803 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
1804 }
1805 }
1806 cur->dirty_old_rstats.clear();
1807}
1808
1809
 1810void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
1811 CDir *parent, int linkunlink, bool update_inode)
1812{
1813 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1814 dout(20) << " inode rstat " << inode.rstat << dendl;
1815 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1816 nest_info_t delta;
1817 if (linkunlink == 0) {
1818 delta.add(inode.rstat);
1819 delta.sub(inode.accounted_rstat);
1820 } else if (linkunlink < 0) {
1821 delta.sub(inode.accounted_rstat);
1822 } else {
1823 delta.add(inode.rstat);
1824 }
1825 dout(20) << " delta " << delta << dendl;
1826
1827 if (update_inode)
1828 inode.accounted_rstat = inode.rstat;
1829
1830 while (last >= ofirst) {
1831 /*
1832 * pick fnode version to update. at each iteration, we want to
1833 * pick a segment ending in 'last' to update. split as necessary
1834 * to make that work. then, adjust first up so that we only
1835 * update one segment at a time. then loop to cover the whole
1836 * [ofirst,last] interval.
1837 */
1838 nest_info_t *prstat;
1839 snapid_t first;
1840 fnode_t *pf = parent->get_projected_fnode();
1841 if (last == CEPH_NOSNAP) {
1842 if (g_conf->mds_snap_rstat)
1843 first = MAX(ofirst, parent->first);
1844 else
1845 first = parent->first;
1846 prstat = &pf->rstat;
1847 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1848
1849 if (first > parent->first &&
1850 !(pf->rstat == pf->accounted_rstat)) {
1851 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1852 << parent->first << "," << (first-1) << "] "
1853 << " " << *prstat << "/" << pf->accounted_rstat
1854 << dendl;
1855 parent->dirty_old_rstat[first-1].first = parent->first;
1856 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1857 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1858 }
1859 parent->first = first;
1860 } else if (!g_conf->mds_snap_rstat) {
1861 // drop snapshots' rstats
1862 break;
1863 } else if (last >= parent->first) {
1864 first = parent->first;
1865 parent->dirty_old_rstat[last].first = first;
1866 parent->dirty_old_rstat[last].rstat = pf->rstat;
1867 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1868 prstat = &parent->dirty_old_rstat[last].rstat;
1869 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1870 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1871 } else {
1872 // be careful, dirty_old_rstat is a _sparse_ map.
1873 // sorry, this is ugly.
1874 first = ofirst;
1875
1876 // find any intersection with last
94b18763
FG
1877 auto it = parent->dirty_old_rstat.lower_bound(last);
1878 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1879 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1880 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1881 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1882 first = parent->dirty_old_rstat.rbegin()->first+1;
1883 }
1884 } else {
94b18763
FG
1885 // *it last is >= last
1886 if (it->second.first <= last) {
1887 // *it intersects [first,last]
1888 if (it->second.first < first) {
1889 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1890 parent->dirty_old_rstat[first-1] = it->second;
1891 it->second.first = first;
7c673cae 1892 }
94b18763
FG
1893 if (it->second.first > first)
1894 first = it->second.first;
1895 if (last < it->first) {
1896 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1897 parent->dirty_old_rstat[last] = it->second;
1898 it->second.first = last+1;
7c673cae
FG
1899 }
1900 } else {
94b18763
FG
1901 // *it is to the _right_ of [first,last]
1902 it = parent->dirty_old_rstat.lower_bound(first);
1903 // new *it last is >= first
1904 if (it->second.first <= last && // new *it isn't also to the right, and
1905 it->first >= first) { // it intersects our first bit,
1906 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1907 first = it->first+1;
7c673cae
FG
1908 }
1909 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1910 }
1911 }
1912 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1913 parent->dirty_old_rstat[last].first = first;
1914 prstat = &parent->dirty_old_rstat[last].rstat;
1915 }
1916
1917 // apply
1918 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1919 assert(last >= first);
1920 prstat->add(delta);
1921 if (update_inode)
1922 inode.accounted_rstat = inode.rstat;
1923 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1924
1925 last = first-1;
1926 }
1927}
1928
1929void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1930 snapid_t ofirst, snapid_t last,
1931 CInode *pin, bool cow_head)
1932{
1933 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1934 dout(20) << " frag rstat " << rstat << dendl;
1935 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1936 nest_info_t delta = rstat;
1937 delta.sub(accounted_rstat);
1938 dout(20) << " delta " << delta << dendl;
1939
1940 while (last >= ofirst) {
94b18763 1941 CInode::mempool_inode *pi;
7c673cae
FG
1942 snapid_t first;
1943 if (last == pin->last) {
1944 pi = pin->get_projected_inode();
1945 first = MAX(ofirst, pin->first);
1946 if (first > pin->first) {
94b18763 1947 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1948 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1949 }
1950 } else {
1951 if (last >= pin->first) {
1952 first = pin->first;
1953 pin->cow_old_inode(last, cow_head);
1954 } else {
1955 // our life is easier here because old_inodes is not sparse
1956 // (although it may not begin at snapid 1)
94b18763
FG
1957 auto it = pin->old_inodes.lower_bound(last);
1958 if (it == pin->old_inodes.end()) {
7c673cae
FG
1959 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1960 break;
1961 }
94b18763 1962 first = it->second.first;
7c673cae 1963 if (first > last) {
94b18763 1964 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1965 //assert(p == pin->old_inodes.begin());
1966 break;
1967 }
94b18763
FG
1968 if (it->first > last) {
1969 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1970 << (last+1) << "," << it->first << "]" << dendl;
1971 pin->old_inodes[last] = it->second;
1972 it->second.first = last+1;
1973 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1974 }
1975 }
1976 if (first < ofirst) {
1977 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1978 << first << "," << ofirst-1 << "]" << dendl;
1979 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1980 pin->dirty_old_rstats.insert(ofirst-1);
1981 pin->old_inodes[last].first = first = ofirst;
1982 }
1983 pi = &pin->old_inodes[last].inode;
1984 pin->dirty_old_rstats.insert(last);
1985 }
1986 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1987 pi->rstat.add(delta);
1988 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1989
1990 last = first-1;
1991 }
1992}
1993
28e407b8 1994void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
7c673cae
FG
1995{
1996 if (!in->is_auth() || in->is_frozen())
1997 return;
1998
94b18763 1999 auto i = in->get_projected_inode();
7c673cae
FG
2000
2001 if (!i->quota.is_enable())
2002 return;
2003
2004 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
2005 it != in->client_caps.end();
2006 ++it) {
2007 Session *session = mds->get_session(it->first);
2008 if (!session || !session->connection ||
2009 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
2010 continue;
2011
2012 Capability *cap = it->second;
28e407b8
AA
2013
2014 if (exclude_ct >= 0 && exclude_ct != it->first)
2015 goto update;
2016
7c673cae
FG
2017 if (cap->last_rbytes == i->rstat.rbytes &&
2018 cap->last_rsize == i->rstat.rsize())
2019 continue;
2020
2021 if (i->quota.max_files > 0) {
2022 if (i->rstat.rsize() >= i->quota.max_files)
2023 goto update;
2024
2025 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2026 abs(cap->last_rsize - i->rstat.rsize()))
2027 goto update;
2028 }
2029
2030 if (i->quota.max_bytes > 0) {
2031 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2032 goto update;
2033
2034 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2035 abs(cap->last_rbytes - i->rstat.rbytes))
2036 goto update;
2037 }
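    /*
     * Example of the thresholds above (illustrative numbers only): with
     * quota.max_bytes = 16 GiB, a client is re-notified once its usage
     * crosses 14 GiB (max_bytes minus max_bytes>>3, i.e. within 1/8 of the
     * limit), or once rbytes has drifted from last_rbytes by more than 1/16
     * of the gap between last_rbytes and the limit (the >>4 comparison).
     */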
2038
2039 continue;
2040
2041update:
2042 cap->last_rsize = i->rstat.rsize();
2043 cap->last_rbytes = i->rstat.rbytes;
2044
2045 MClientQuota *msg = new MClientQuota();
2046 msg->ino = in->ino();
2047 msg->rstat = i->rstat;
2048 msg->quota = i->quota;
2049 mds->send_message_client_counted(msg, session->connection);
2050 }
181888fb 2051 for (const auto &it : in->get_replicas()) {
7c673cae
FG
2052 MGatherCaps *msg = new MGatherCaps;
2053 msg->ino = in->ino();
181888fb 2054 mds->send_message_mds(msg, it.first);
7c673cae
FG
2055 }
2056}
2057
2058/*
2059 * NOTE: we _have_ to delay the scatter if we are called during a
2060 * rejoin, because we can't twiddle locks between when the
2061 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2062 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2063 * (no requests), and a survivor acks immediately. _except_ that
2064 * during rejoin_(weak|strong) processing, we may complete a lock
2065 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2066 * scatterlock state in that case or the lock states will get out of
2067 * sync between the auth and replica.
2068 *
2069 * the simple solution is to never do the scatter here. instead, put
2070 * the scatterlock on a list if it isn't already wrlockable. this is
2071 * probably the best plan anyway, since we avoid too many
2072 * scatters/locks under normal usage.
2073 */
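/*
 * In code terms, the "never scatter here" rule reduces to the pattern used
 * further down in predirty_journal_parents() when we give up on a parent
 * (simplified excerpt; see the stop handling below for the real thing):
 *
 *   mds->locker->mark_updated_scatterlock(&pin->nestlock);
 *   mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
 *   mut->add_updated_lock(&pin->nestlock);
 *
 * i.e. the scatterlock is only queued for a later nudge instead of having
 * its state twiddled right away.
 */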
2074/*
2075 * some notes on dirlock/nestlock scatterlock semantics:
2076 *
2077 * the fragstat (dirlock) will never be updated without
2078 * dirlock+nestlock wrlock held by the caller.
2079 *
2080 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2081 * data is pushed up the tree. this could be changed with some
2082 * restructuring here, but in its current form we ensure that the
2083 * fragstat+rstat _always_ reflect an accurate summation over the dir
2084 * frag, which is nice. and, we only need to track frags that need to
2085 * be nudged (and not inodes with pending rstat changes that need to
2086 * be pushed into the frag). a consequence of this is that the
2087 * accounted_rstat on scatterlock sync may not match our current
2088 * rstat. this is normal and expected.
2089 */
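/*
 * A small worked example of the rstat/accounted_rstat bookkeeping used by
 * the projection helpers above (numbers are made up): if a dirfrag's fnode
 * has rstat.rbytes=120 but accounted_rstat.rbytes=100, the delta pushed up
 * into the parent inode is 20; afterwards accounted_rstat is set equal to
 * rstat, so re-propagating the same frag is a no-op until new changes
 * accumulate.  project_rstat_inode_to_frag() handles the inode->frag half
 * of this, project_rstat_frag_to_inode() the frag->inode half.
 */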
2090void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2091 CInode *in, CDir *parent,
2092 int flags, int linkunlink,
2093 snapid_t cfollows)
2094{
2095 bool primary_dn = flags & PREDIRTY_PRIMARY;
2096 bool do_parent_mtime = flags & PREDIRTY_DIR;
2097 bool shallow = flags & PREDIRTY_SHALLOW;
2098
2099 assert(mds->mdlog->entry_is_open());
2100
2101 // make sure stamp is set
2102 if (mut->get_mds_stamp() == utime_t())
2103 mut->set_mds_stamp(ceph_clock_now());
2104
2105 if (in->is_base())
2106 return;
2107
2108 dout(10) << "predirty_journal_parents"
2109 << (do_parent_mtime ? " do_parent_mtime":"")
2110 << " linkunlink=" << linkunlink
2111 << (primary_dn ? " primary_dn":" remote_dn")
2112 << (shallow ? " SHALLOW":"")
2113 << " follows " << cfollows
2114 << " " << *in << dendl;
2115
2116 if (!parent) {
2117 assert(primary_dn);
2118 parent = in->get_projected_parent_dn()->get_dir();
2119 }
2120
2121 if (flags == 0 && linkunlink == 0) {
2122 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2123 blob->add_dir_context(parent);
2124 return;
2125 }
2126
2127 // build list of inodes to wrlock, dirty, and update
2128 list<CInode*> lsi;
2129 CInode *cur = in;
2130 CDentry *parentdn = NULL;
2131 bool first = true;
2132 while (parent) {
2133 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2134 assert(parent->is_auth());
2135
2136 // opportunistically adjust parent dirfrag
2137 CInode *pin = parent->get_inode();
2138
2139 // inode -> dirfrag
2140 mut->auth_pin(parent);
2141 mut->add_projected_fnode(parent);
2142
2143 fnode_t *pf = parent->project_fnode();
2144 pf->version = parent->pre_dirty();
2145
2146 if (do_parent_mtime || linkunlink) {
2147 assert(mut->wrlocks.count(&pin->filelock));
2148 assert(mut->wrlocks.count(&pin->nestlock));
2149 assert(cfollows == CEPH_NOSNAP);
2150
2151 // update stale fragstat/rstat?
2152 parent->resync_accounted_fragstat();
2153 parent->resync_accounted_rstat();
2154
2155 if (do_parent_mtime) {
2156 pf->fragstat.mtime = mut->get_op_stamp();
2157 pf->fragstat.change_attr++;
2158 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2159 if (pf->fragstat.mtime > pf->rstat.rctime) {
2160 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2161 pf->rstat.rctime = pf->fragstat.mtime;
2162 } else {
2163 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2164 }
2165 }
2166 if (linkunlink) {
2167 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2168 if (in->is_dir()) {
2169 pf->fragstat.nsubdirs += linkunlink;
2170 //pf->rstat.rsubdirs += linkunlink;
2171 } else {
2172 pf->fragstat.nfiles += linkunlink;
2173 //pf->rstat.rfiles += linkunlink;
2174 }
2175 }
2176 }
2177
2178 // rstat
2179 if (!primary_dn) {
2180 // don't update parent this pass
2181 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2182 pin->versionlock.can_wrlock())) {
2183 dout(20) << " unwritable parent nestlock " << pin->nestlock
2184 << ", marking dirty rstat on " << *cur << dendl;
2185 cur->mark_dirty_rstat();
2186 } else {
2187 // if we don't hold a wrlock reference on this nestlock, take one,
2188 // because we are about to write into the dirfrag fnode and that needs
2189 // to commit before the lock can cycle.
2190 if (linkunlink) {
2191 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2192 }
2193
2194 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2195 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2196 mds->locker->wrlock_force(&pin->nestlock, mut);
2197 }
2198
2199 // now we can project the inode rstat diff the dirfrag
2200 SnapRealm *prealm = pin->find_snaprealm();
2201
2202 snapid_t follows = cfollows;
2203 if (follows == CEPH_NOSNAP)
2204 follows = prealm->get_newest_seq();
2205
2206 snapid_t first = follows+1;
2207
2208 // first, if the frag is stale, bring it back in sync.
2209 parent->resync_accounted_rstat();
2210
2211 // now push inode rstats into frag
2212 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2213 cur->clear_dirty_rstat();
2214 }
2215
2216 bool stop = false;
2217 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2218 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2219 stop = true;
2220 }
2221
2222 // delay propagating until later?
2223 if (!stop && !first &&
2224 g_conf->mds_dirstat_min_interval > 0) {
2225 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2226 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2227 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2228 << " < " << g_conf->mds_dirstat_min_interval
2229 << ", stopping" << dendl;
2230 stop = true;
2231 } else {
2232 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2233 }
2234 }
2235
2236 // can cast only because i'm passing nowait=true in the sole user
2237 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2238 if (!stop &&
2239 mut->wrlocks.count(&pin->nestlock) == 0 &&
2240 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2241 //true
2242 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2243 )) { // ** do not initiate.. see above comment **
2244 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2245 << " on " << *pin << dendl;
2246 stop = true;
2247 }
2248 if (stop) {
2249 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2250 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2251 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2252 mut->add_updated_lock(&pin->nestlock);
2253 if (do_parent_mtime || linkunlink) {
2254 mds->locker->mark_updated_scatterlock(&pin->filelock);
2255 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2256 mut->add_updated_lock(&pin->filelock);
2257 }
2258 break;
2259 }
2260 if (!mut->wrlocks.count(&pin->versionlock))
2261 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2262
2263 assert(mut->wrlocks.count(&pin->nestlock) ||
2264 mut->is_slave());
2265
2266 pin->last_dirstat_prop = mut->get_mds_stamp();
2267
2268 // dirfrag -> diri
2269 mut->auth_pin(pin);
2270 mut->add_projected_inode(pin);
2271 lsi.push_front(pin);
2272
2273 pin->pre_cow_old_inode(); // avoid cow mayhem!
2274
94b18763
FG
2275 auto &pi = pin->project_inode();
2276 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2277
2278 // dirstat
2279 if (do_parent_mtime || linkunlink) {
2280 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2281 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2282 bool touched_mtime = false, touched_chattr = false;
94b18763 2283 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2284 pf->accounted_fragstat = pf->fragstat;
2285 if (touched_mtime)
94b18763 2286 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2287 if (touched_chattr)
94b18763
FG
2288 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2289 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2290
2291 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2292 if (pi.inode.dirstat.size() < 0)
7c673cae 2293 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
94b18763 2294 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2295 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2296 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2297 << ", dirfrag has " << pf->fragstat;
2298
2299 // trust the dirfrag for now
94b18763 2300 pi.inode.dirstat = pf->fragstat;
7c673cae
FG
2301
2302 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2303 }
2304 }
2305 }
2306
2307 /*
2308 * the rule here is to follow the _oldest_ parent with dirty rstat
2309 * data. if we don't propagate all data, we add ourselves to the
2310 * nudge list. that way all rstat data will (eventually) get
2311 * pushed up the tree.
2312 *
2313 * actually, no. for now, silently drop rstats for old parents. we need
2314 * hard link backpointers to do the above properly.
2315 */
2316
2317 // stop?
2318 if (pin->is_base())
2319 break;
2320 parentdn = pin->get_projected_parent_dn();
2321 assert(parentdn);
2322
2323 // rstat
2324 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2325
2326 // first, if the frag is stale, bring it back in sync.
2327 parent->resync_accounted_rstat();
2328
2329 if (g_conf->mds_snap_rstat) {
94b18763
FG
2330 for (auto &p : parent->dirty_old_rstat) {
2331 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2332 p.first, pin, true);
2333 }
7c673cae
FG
2334 }
2335 parent->dirty_old_rstat.clear();
2336 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2337
2338 pf->accounted_rstat = pf->rstat;
2339
2340 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2341 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2342 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2343 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2344 << ", dirfrag has " << pf->rstat;
2345
2346 // trust the dirfrag for now
94b18763 2347 pi.inode.rstat = pf->rstat;
7c673cae
FG
2348
2349 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2350 }
2351 }
2352
2353 parent->check_rstats();
2354 broadcast_quota_to_client(pin);
2355 // next parent!
2356 cur = pin;
2357 parent = parentdn->get_dir();
2358 linkunlink = 0;
2359 do_parent_mtime = false;
2360 primary_dn = true;
2361 first = false;
2362 }
2363
2364 // now, stick it in the blob
2365 assert(parent);
2366 assert(parent->is_auth());
2367 blob->add_dir_context(parent);
2368 blob->add_dir(parent, true);
2369 for (list<CInode*>::iterator p = lsi.begin();
2370 p != lsi.end();
2371 ++p) {
2372 CInode *cur = *p;
2373 journal_dirty_inode(mut.get(), blob, cur);
2374 }
2375
2376}
2377
2378
2379
2380
2381
2382// ===================================
2383// slave requests
2384
2385
2386/*
2387 * some handlers for master requests with slaves. we need to make
2388 * sure slaves journal commits before we forget we mastered them and
2389 * remove them from the uncommitted_masters map (used during recovery
2390 * to commit|abort slaves).
2391 */
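/*
 * Rough call flow, as implemented below (sketch only):
 *
 *   log_master_commit(reqid)
 *     -> journal ECommitted(reqid), completion C_MDC_CommittedMaster
 *       -> _logged_master_commit(reqid)
 *            erase reqid from uncommitted_masters (and its log segment),
 *            wake any waiters
 *
 * While active, committed_master_slave() erases each slave as its ack
 * arrives and calls log_master_commit() once the slave set is empty.
 */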
2392struct C_MDC_CommittedMaster : public MDCacheLogContext {
2393 metareqid_t reqid;
2394 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2395 void finish(int r) override {
2396 mdcache->_logged_master_commit(reqid);
2397 }
2398};
2399
2400void MDCache::log_master_commit(metareqid_t reqid)
2401{
2402 dout(10) << "log_master_commit " << reqid << dendl;
2403 uncommitted_masters[reqid].committing = true;
2404 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2405 new C_MDC_CommittedMaster(this, reqid));
2406}
2407
2408void MDCache::_logged_master_commit(metareqid_t reqid)
2409{
2410 dout(10) << "_logged_master_commit " << reqid << dendl;
2411 assert(uncommitted_masters.count(reqid));
2412 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2413 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2414 uncommitted_masters.erase(reqid);
2415}
2416
2417// while active...
2418
2419void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2420{
2421 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2422 assert(uncommitted_masters.count(r));
2423 uncommitted_masters[r].slaves.erase(from);
2424 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2425 log_master_commit(r);
2426}
2427
2428void MDCache::logged_master_update(metareqid_t reqid)
2429{
2430 dout(10) << "logged_master_update " << reqid << dendl;
2431 assert(uncommitted_masters.count(reqid));
2432 uncommitted_masters[reqid].safe = true;
2433 if (pending_masters.count(reqid)) {
2434 pending_masters.erase(reqid);
2435 if (pending_masters.empty())
2436 process_delayed_resolve();
2437 }
2438}
2439
2440/*
2441 * Master may crash after receiving all slaves' commit acks, but before journalling
2442 * the final commit. Slaves may crash after journalling the slave commit, but before
2443 * sending commit ack to the master. Commit masters with no uncommitted slave when
2444 * resolve finishes.
2445 */
2446void MDCache::finish_committed_masters()
2447{
2448 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2449 p != uncommitted_masters.end();
2450 ++p) {
2451 p->second.recovering = false;
2452 if (!p->second.committing && p->second.slaves.empty()) {
2453 dout(10) << "finish_committed_masters " << p->first << dendl;
2454 log_master_commit(p->first);
2455 }
2456 }
2457}
2458
2459/*
2460 * at end of resolve... we must journal a commit|abort for all slave
2461 * updates, before moving on.
2462 *
2463 * this is so that the master can safely journal ECommitted on ops it
2464 * masters when it reaches up:active (all other recovering nodes must
2465 * complete resolve before that happens).
2466 */
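/*
 * Sketch of the slave-side ordering this gives us (see handle_resolve_ack()
 * below): for every uncommitted slave update the slave journals an
 * ESlaveUpdate OP_COMMIT (or performs the rollback), and only once that
 * entry is safely journaled does _logged_slave_commit() send
 * MMDSSlaveRequest::OP_COMMITTED back to the master.
 */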
2467struct C_MDC_SlaveCommit : public MDCacheLogContext {
2468 mds_rank_t from;
2469 metareqid_t reqid;
2470 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2471 void finish(int r) override {
2472 mdcache->_logged_slave_commit(from, reqid);
2473 }
2474};
2475
2476void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2477{
2478 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2479
2480 // send a message
2481 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2482 mds->send_message_mds(req, from);
2483}
2484
2485
2486
2487
2488
2489
2490// ====================================================================
2491// import map, recovery
2492
2493void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2494 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2495{
2496 if (subtrees.count(oldparent)) {
2497 vector<dirfrag_t>& v = subtrees[oldparent];
2498 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2499 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2500 if (*it == df) {
2501 v.erase(it);
2502 break;
2503 }
2504 }
2505 if (subtrees.count(newparent)) {
2506 vector<dirfrag_t>& v = subtrees[newparent];
2507 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2508 v.push_back(df);
2509 }
2510}
2511
2512ESubtreeMap *MDCache::create_subtree_map()
2513{
2514 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2515 << num_subtrees_fullauth() << " fullauth"
2516 << dendl;
2517
2518 show_subtrees();
2519
2520 ESubtreeMap *le = new ESubtreeMap();
2521 mds->mdlog->_start_entry(le);
2522
2523 map<dirfrag_t, CDir*> dirs_to_add;
2524
2525 if (myin) {
2526 CDir* mydir = myin->get_dirfrag(frag_t());
2527 dirs_to_add[mydir->dirfrag()] = mydir;
2528 }
2529
2530 // include all auth subtrees, and their bounds.
2531 // and a spanning tree to tie it to the root.
2532 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2533 p != subtrees.end();
2534 ++p) {
2535 CDir *dir = p->first;
2536
2537 // journal subtree as "ours" if we are
2538 // me, -2
2539 // me, me
2540 // me, !me (may be importing and ambiguous!)
2541
2542 // so not
2543 // !me, *
2544 if (dir->get_dir_auth().first != mds->get_nodeid())
2545 continue;
2546
2547 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2548 my_ambiguous_imports.count(dir->dirfrag())) {
2549 dout(15) << " ambig subtree " << *dir << dendl;
2550 le->ambiguous_subtrees.insert(dir->dirfrag());
2551 } else {
2552 dout(15) << " subtree " << *dir << dendl;
2553 }
2554
2555 dirs_to_add[dir->dirfrag()] = dir;
2556 le->subtrees[dir->dirfrag()].clear();
2557
2558
2559 // bounds
2560 for (set<CDir*>::iterator q = p->second.begin();
2561 q != p->second.end();
2562 ++q) {
2563 CDir *bound = *q;
2564 dout(15) << " subtree bound " << *bound << dendl;
2565 dirs_to_add[bound->dirfrag()] = bound;
2566 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2567 }
2568 }
2569
2570 // apply projected renames
2571 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2572 p != projected_subtree_renames.end();
2573 ++p) {
2574 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2575 CInode *diri = p->first;
2576 CDir *olddir = q->first;
2577 CDir *newdir = q->second;
2578 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2579
2580 list<CDir*> dfls;
2581 diri->get_dirfrags(dfls);
2582 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2583 CDir *dir = *p;
2584 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2585 CDir *oldparent = get_projected_subtree_root(olddir);
2586 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2587 CDir *newparent = get_projected_subtree_root(newdir);
2588 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2589
2590 if (oldparent == newparent) {
2591 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2592 << oldparent->dirfrag() << dendl;
2593 continue;
2594 }
2595
2596 if (dir->is_subtree_root()) {
2597 if (le->subtrees.count(newparent->dirfrag()) &&
2598 oldparent->get_dir_auth() != newparent->get_dir_auth())
2599 dirs_to_add[dir->dirfrag()] = dir;
2600 // children are fine. change parent.
2601 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2602 le->subtrees);
2603 } else {
2604 // mid-subtree.
2605
2606 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2607 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2608 // if oldparent is auth, subtree is mine; include it.
2609 if (le->subtrees.count(oldparent->dirfrag())) {
2610 dirs_to_add[dir->dirfrag()] = dir;
2611 le->subtrees[dir->dirfrag()].clear();
2612 }
2613 // if newparent is auth, subtree is a new bound
2614 if (le->subtrees.count(newparent->dirfrag())) {
2615 dirs_to_add[dir->dirfrag()] = dir;
2616 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2617 }
2618 newparent = dir;
2619 }
2620
2621 // see if any old bounds move to the new parent.
2622 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2623 p != subtrees[oldparent].end();
2624 ++p) {
2625 CDir *bound = *p;
2626 if (dir->contains(bound->get_parent_dir()))
2627 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2628 le->subtrees);
2629 }
2630 }
2631 }
2632 }
2633 }
2634
2635 // simplify the journaled map. our in memory map may have more
2636 // subtrees than needed due to migrations that are just getting
2637 // started or just completing. but on replay, the "live" map will
2638 // be simple and we can do a straight comparison.
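  // Example of the simplification (hypothetical dirfrags): if we are auth
  // for subtree A with bound B, and also auth for subtree B with bound C,
  // the journaled map collapses to A with bound C -- B's entry is erased
  // and its bounds appended to A's bound list, exactly as the loop below
  // does on le->subtrees.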
2639 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2640 if (le->ambiguous_subtrees.count(p->first))
2641 continue;
2642 unsigned i = 0;
2643 while (i < p->second.size()) {
2644 dirfrag_t b = p->second[i];
2645 if (le->subtrees.count(b) &&
2646 le->ambiguous_subtrees.count(b) == 0) {
2647 vector<dirfrag_t>& bb = le->subtrees[b];
2648 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2649 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2650 p->second.push_back(*r);
2651 dirs_to_add.erase(b);
2652 le->subtrees.erase(b);
2653 p->second.erase(p->second.begin() + i);
2654 } else {
2655 ++i;
2656 }
2657 }
2658 }
2659
94b18763 2660 for (auto &p : dirs_to_add) {
7c673cae
FG
2661 CDir *dir = p.second;
2662 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2663 le->metablob.add_dir(dir, false);
2664 }
2665
2666 dout(15) << " subtrees " << le->subtrees << dendl;
2667 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2668
2669 //le->metablob.print(cout);
2670 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2671 return le;
2672}
2673
2674void MDCache::dump_resolve_status(Formatter *f) const
2675{
2676 f->open_object_section("resolve_status");
2677 f->dump_stream("resolve_gather") << resolve_gather;
2678  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2679 f->close_section();
2680}
2681
2682void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2683{
2684 dout(10) << "resolve_start" << dendl;
2685 assert(!resolve_done);
2686 resolve_done.reset(resolve_done_);
2687
2688 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2689 // if we don't have the root dir, adjust it to UNKNOWN. during
2690    // resolve we want mds0 to explicitly claim the portion of it that
2691    // it owns, so that anything beyond its bounds gets left as
2692 // unknown.
2693 CDir *rootdir = root->get_dirfrag(frag_t());
2694 if (rootdir)
2695 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2696 }
2697 resolve_gather = recovery_set;
2698}
2699
2700void MDCache::send_resolves()
2701{
2702 send_slave_resolves();
2703 if (!resolve_ack_gather.empty()) {
2704 dout(10) << "send_resolves still waiting for resolve ack from ("
2705 << resolve_ack_gather << ")" << dendl;
2706 return;
2707 }
2708 if (!need_resolve_rollback.empty()) {
2709 dout(10) << "send_resolves still waiting for rollback to commit on ("
2710 << need_resolve_rollback << ")" << dendl;
2711 return;
2712 }
2713 send_subtree_resolves();
2714}
2715
2716void MDCache::send_slave_resolves()
2717{
2718 dout(10) << "send_slave_resolves" << dendl;
2719
2720 map<mds_rank_t, MMDSResolve*> resolves;
2721
2722 if (mds->is_resolve()) {
2723 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2724 p != uncommitted_slave_updates.end();
2725 ++p) {
2726 resolves[p->first] = new MMDSResolve;
2727 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2728 q != p->second.end();
2729 ++q) {
2730 dout(10) << " including uncommitted " << q->first << dendl;
2731 resolves[p->first]->add_slave_request(q->first, false);
2732 }
2733 }
2734 } else {
2735 set<mds_rank_t> resolve_set;
2736 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2737 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2738 p != active_requests.end();
2739 ++p) {
2740 MDRequestRef& mdr = p->second;
2741 if (!mdr->is_slave())
2742 continue;
2743 if (!mdr->slave_did_prepare() && !mdr->committing) {
2744 continue;
2745 }
2746 mds_rank_t master = mdr->slave_to_mds;
2747 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2748 dout(10) << " including uncommitted " << *mdr << dendl;
2749 if (!resolves.count(master))
2750 resolves[master] = new MMDSResolve;
2751 if (!mdr->committing &&
2752 mdr->has_more() && mdr->more()->is_inode_exporter) {
2753 // re-send cap exports
2754 CInode *in = mdr->more()->rename_inode;
2755 map<client_t, Capability::Export> cap_map;
2756 in->export_client_caps(cap_map);
2757 bufferlist bl;
2758 ::encode(in->ino(), bl);
2759 ::encode(cap_map, bl);
2760 resolves[master]->add_slave_request(p->first, bl);
2761 } else {
2762 resolves[master]->add_slave_request(p->first, mdr->committing);
2763 }
2764 }
2765 }
2766 }
2767
2768 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2769 p != resolves.end();
2770 ++p) {
2771 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2772 mds->send_message_mds(p->second, p->first);
2773 resolve_ack_gather.insert(p->first);
2774 }
2775}
2776
2777void MDCache::send_subtree_resolves()
2778{
2779 dout(10) << "send_subtree_resolves" << dendl;
2780
2781 if (migrator->is_exporting() || migrator->is_importing()) {
2782 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2783 migrator->show_importing();
2784 migrator->show_exporting();
2785 resolves_pending = true;
2786 return; // not now
2787 }
2788
2789 map<mds_rank_t, MMDSResolve*> resolves;
2790 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2791 p != recovery_set.end();
2792 ++p) {
2793 if (*p == mds->get_nodeid())
2794 continue;
2795 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2796 resolves[*p] = new MMDSResolve;
2797 }
2798
2799 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2800 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2801
2802 // known
2803 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2804 p != subtrees.end();
2805 ++p) {
2806 CDir *dir = p->first;
2807
2808 // only our subtrees
2809 if (dir->authority().first != mds->get_nodeid())
2810 continue;
2811
2812 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2813 continue; // we'll add it below
2814
2815 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2816 // ambiguous (mid-import)
2817 set<CDir*> bounds;
2818 get_subtree_bounds(dir, bounds);
2819 vector<dirfrag_t> dfls;
2820 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2821 dfls.push_back((*q)->dirfrag());
2822
2823 my_ambig_imports[dir->dirfrag()] = dfls;
2824 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2825 } else {
2826 // not ambiguous.
2827 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2828 q != resolves.end();
2829 ++q)
2830 resolves[q->first]->add_subtree(dir->dirfrag());
2831 // bounds too
2832 vector<dirfrag_t> dfls;
2833 for (set<CDir*>::iterator q = subtrees[dir].begin();
2834 q != subtrees[dir].end();
2835 ++q) {
2836 CDir *bound = *q;
2837 dfls.push_back(bound->dirfrag());
2838 }
2839
2840 my_subtrees[dir->dirfrag()] = dfls;
2841 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2842 }
2843 }
2844
2845 // ambiguous
2846 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2847 p != my_ambiguous_imports.end();
2848 ++p) {
2849 my_ambig_imports[p->first] = p->second;
2850 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2851 }
2852
2853 // simplify the claimed subtree.
2854 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2855 unsigned i = 0;
2856 while (i < p->second.size()) {
2857 dirfrag_t b = p->second[i];
2858 if (my_subtrees.count(b)) {
2859 vector<dirfrag_t>& bb = my_subtrees[b];
2860 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2861 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2862 p->second.push_back(*r);
2863 my_subtrees.erase(b);
2864 p->second.erase(p->second.begin() + i);
2865 } else {
2866 ++i;
2867 }
2868 }
2869 }
2870
2871 // send
2872 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2873 p != resolves.end();
2874 ++p) {
2875 MMDSResolve* m = p->second;
2876 m->subtrees = my_subtrees;
2877 m->ambiguous_imports = my_ambig_imports;
2878    dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2879 mds->send_message_mds(m, p->first);
2880 }
2881 resolves_pending = false;
2882}
2883
2884void MDCache::handle_mds_failure(mds_rank_t who)
2885{
2886 dout(7) << "handle_mds_failure mds." << who << dendl;
2887
2888 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2889
2890 resolve_gather.insert(who);
2891 discard_delayed_resolve(who);
2892 ambiguous_slave_updates.erase(who);
2893
2894 rejoin_gather.insert(who);
2895 rejoin_sent.erase(who); // i need to send another
31f18b77 2896 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2897 rejoin_ack_gather.erase(who); // i'll need/get another.
2898
2899 dout(10) << " resolve_gather " << resolve_gather << dendl;
2900 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2901 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2902 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2903 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2904
2905
2906 // tell the migrator too.
2907 migrator->handle_mds_failure_or_stop(who);
2908
224ce89b
WB
2909 // tell the balancer too.
2910 mds->balancer->handle_mds_failure(who);
2911
7c673cae
FG
2912 // clean up any requests slave to/from this node
2913 list<MDRequestRef> finish;
2914 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2915 p != active_requests.end();
2916 ++p) {
2917 MDRequestRef& mdr = p->second;
2918 // slave to the failed node?
2919 if (mdr->slave_to_mds == who) {
2920 if (mdr->slave_did_prepare()) {
2921 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2922 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2923 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2924
2925 if (!mdr->more()->waiting_on_slave.empty()) {
2926 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2927 // will rollback, no need to wait
2928 if (mdr->slave_request) {
2929 mdr->slave_request->put();
2930 mdr->slave_request = 0;
2931 }
2932 mdr->more()->waiting_on_slave.clear();
2933 }
2934 } else if (!mdr->committing) {
2935 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2936 if (mdr->slave_request || mdr->slave_rolling_back())
2937 mdr->aborted = true;
2938 else
2939 finish.push_back(mdr);
2940 }
2941 }
2942
2943 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2944 if (mdr->more()->waiting_on_slave.count(who)) {
2945 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2946        dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2947 << who << dendl;
2948 mdr->more()->waiting_on_slave.erase(who);
2949 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2950 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2951 }
2952
2953 if (mdr->more()->srcdn_auth_mds == who &&
2954 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2955        // rename srcdn's auth mds failed, resolve even if I'm a survivor.
2956 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2957 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2958 }
31f18b77
FG
2959 } else if (mdr->slave_request) {
2960 MMDSSlaveRequest *slave_req = mdr->slave_request;
2961 // FIXME: Slave rename request can arrive after we notice mds failure.
2962 // This can cause mds to crash (does not affect integrity of FS).
2963 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2964 slave_req->srcdn_auth == who)
2965 slave_req->mark_interrupted();
7c673cae
FG
2966 }
2967
2968 // failed node is slave?
2969 if (mdr->is_master() && !mdr->committing) {
2970 if (mdr->more()->srcdn_auth_mds == who) {
2971 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2972 << who << " to recover" << dendl;
2973 assert(mdr->more()->witnessed.count(who) == 0);
2974 if (mdr->more()->is_ambiguous_auth)
2975 mdr->clear_ambiguous_auth();
2976 // rename srcdn's auth mds failed, all witnesses will rollback
2977 mdr->more()->witnessed.clear();
2978 pending_masters.erase(p->first);
2979 }
2980
2981 if (mdr->more()->witnessed.count(who)) {
2982 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2983 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2984 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2985 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2986 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2987 // until either the request is committing or the slave also fails.
2988 assert(mdr->more()->waiting_on_slave.size() == 1);
2989 pending_masters.insert(p->first);
2990 } else {
2991 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2992 << who << " to recover" << dendl;
2993 if (srcdn_auth >= 0)
2994 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2995
2996 // discard this peer's prepare (if any)
2997 mdr->more()->witnessed.erase(who);
2998 }
2999 }
3000
3001 if (mdr->more()->waiting_on_slave.count(who)) {
3002 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
3003 << " to recover" << dendl;
3004 // retry request when peer recovers
3005 mdr->more()->waiting_on_slave.erase(who);
3006 if (mdr->more()->waiting_on_slave.empty())
3007 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3008 }
3009
3010 if (mdr->locking && mdr->locking_target_mds == who)
3011 mdr->finish_locking(mdr->locking);
3012 }
3013 }
3014
3015 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3016 p != uncommitted_masters.end();
3017 ++p) {
3018 // The failed MDS may have already committed the slave update
3019 if (p->second.slaves.count(who)) {
3020 p->second.recovering = true;
3021 p->second.slaves.erase(who);
3022 }
3023 }
3024
3025 while (!finish.empty()) {
3026 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3027 request_finish(finish.front());
3028 finish.pop_front();
3029 }
3030
3031 kick_find_ino_peers(who);
3032 kick_open_ino_peers(who);
3033
3034 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3035 p != fragments.end(); ) {
3036 dirfrag_t df = p->first;
3037 fragment_info_t& info = p->second;
3038 ++p;
3039 if (info.is_fragmenting())
3040 continue;
3041 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3042 list<CDir*> dirs;
3043 info.dirs.swap(dirs);
3044 fragments.erase(df);
3045 fragment_unmark_unfreeze_dirs(dirs);
3046 }
3047
3048 // MDCache::shutdown_export_strays() always exports strays to mds.0
3049 if (who == mds_rank_t(0))
3050 shutdown_exported_strays.clear();
3051
3052 show_subtrees();
3053}
3054
3055/*
3056 * handle_mds_recovery - called on another node's transition
3057 * from resolve -> active.
3058 */
3059void MDCache::handle_mds_recovery(mds_rank_t who)
3060{
3061 dout(7) << "handle_mds_recovery mds." << who << dendl;
3062
3063 // exclude all discover waiters. kick_discovers() will do the job
3064 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3065 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3066
3067 list<MDSInternalContextBase*> waiters;
3068
3069 // wake up any waiters in their subtrees
3070 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3071 p != subtrees.end();
3072 ++p) {
3073 CDir *dir = p->first;
3074
3075 if (dir->authority().first != who ||
3076 dir->authority().second == mds->get_nodeid())
3077 continue;
3078 assert(!dir->is_auth());
3079
3080 // wake any waiters
3081 list<CDir*> q;
3082 q.push_back(dir);
3083
3084 while (!q.empty()) {
3085 CDir *d = q.front();
3086 q.pop_front();
3087 d->take_waiting(d_mask, waiters);
3088
3089 // inode waiters too
94b18763
FG
3090 for (auto &p : d->items) {
3091 CDentry *dn = p.second;
7c673cae
FG
3092 CDentry::linkage_t *dnl = dn->get_linkage();
3093 if (dnl->is_primary()) {
3094 dnl->get_inode()->take_waiting(i_mask, waiters);
3095
3096 // recurse?
3097 list<CDir*> ls;
3098 dnl->get_inode()->get_dirfrags(ls);
3099 for (list<CDir*>::iterator p = ls.begin();
3100 p != ls.end();
3101 ++p) {
3102 CDir *subdir = *p;
3103 if (!subdir->is_subtree_root())
3104 q.push_back(subdir);
3105 }
3106 }
3107 }
3108 }
3109 }
3110
3111 kick_open_ino_peers(who);
3112 kick_find_ino_peers(who);
3113
3114 // queue them up.
3115 mds->queue_waiters(waiters);
3116}
3117
3118void MDCache::set_recovery_set(set<mds_rank_t>& s)
3119{
3120 dout(7) << "set_recovery_set " << s << dendl;
3121 recovery_set = s;
3122}
3123
3124
3125/*
3126 * during resolve state, we share resolves to determine who
3127 * is authoritative for which trees. we expect to get a resolve
3128 * from _everyone_ in the recovery_set (the mds cluster at the time of
3129 * the first failure).
3130 *
3131 * This function puts the passed message before returning
3132 */
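/*
 * Bookkeeping sketch: resolve_gather starts out as the whole recovery_set
 * (see resolve_start()), handle_resolve() erases the sender once its
 * subtree claims have been applied, and maybe_resolve_finish() only moves
 * on to disambiguate_my_imports() once the set is empty and no resolve
 * acks or rollbacks are still outstanding.
 */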
3133void MDCache::handle_resolve(MMDSResolve *m)
3134{
3135 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3136 mds_rank_t from = mds_rank_t(m->get_source().num());
3137
3138 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3139 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3140 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3141 return;
3142 }
3143 // wait until we reach the resolve stage!
3144 m->put();
3145 return;
3146 }
3147
3148 discard_delayed_resolve(from);
3149
3150 // ambiguous slave requests?
3151 if (!m->slave_requests.empty()) {
3152 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3153 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3154 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3155 assert(!p->second.committing);
3156 pending_masters.insert(p->first);
3157 }
3158 }
3159
3160 if (!pending_masters.empty()) {
3161 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3162 delayed_resolve[from] = m;
3163 return;
3164 }
3165 }
3166
3167 MMDSResolveAck *ack = new MMDSResolveAck;
3168 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3169 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3170 // COMMIT
3171 if (p->second.committing) {
3172 // already committing, waiting for the OP_COMMITTED slave reply
3173 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3174 } else {
3175 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3176 ack->add_commit(p->first);
3177 }
3178 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3179
3180 if (p->second.inode_caps.length() > 0) {
3181 // slave wants to export caps (rename)
3182 assert(mds->is_resolve());
3183
3184 inodeno_t ino;
3185 map<client_t,Capability::Export> cap_exports;
3186 bufferlist::iterator q = p->second.inode_caps.begin();
3187 ::decode(ino, q);
3188 ::decode(cap_exports, q);
3189
3190 assert(get_inode(ino));
3191
3192 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3193 q != cap_exports.end();
3194 ++q) {
3195 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3196 im.cap_id = ++last_cap_id; // assign a new cap ID
3197 im.issue_seq = 1;
3198 im.mseq = q->second.mseq;
28e407b8
AA
3199
3200 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3201 if (session)
3202 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3203 }
3204
3205 // will process these caps in rejoin stage
3206 rejoin_slave_exports[ino].first = from;
3207 rejoin_slave_exports[ino].second.swap(cap_exports);
3208
3209 // send information of imported caps back to slave
3210 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3211 }
3212 } else {
3213 // ABORT
3214 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3215 assert(!p->second.committing);
3216 ack->add_abort(p->first);
3217 }
3218 }
3219 mds->send_message(ack, m->get_connection());
3220 m->put();
3221 return;
3222 }
3223
3224 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3225 dout(10) << "delay processing subtree resolve" << dendl;
3226 delayed_resolve[from] = m;
3227 return;
3228 }
3229
3230 bool survivor = false;
3231 // am i a surviving ambiguous importer?
3232 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3233 survivor = true;
3234 // check for any import success/failure (from this node)
3235 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3236 while (p != my_ambiguous_imports.end()) {
3237 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3238 ++next;
3239 CDir *dir = get_dirfrag(p->first);
3240 assert(dir);
3241 dout(10) << "checking ambiguous import " << *dir << dendl;
3242 if (migrator->is_importing(dir->dirfrag()) &&
3243 migrator->get_import_peer(dir->dirfrag()) == from) {
3244 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3245
3246 // check if sender claims the subtree
3247 bool claimed_by_sender = false;
3248 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3249 q != m->subtrees.end();
3250 ++q) {
3251 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3252 CDir *base = get_force_dirfrag(q->first, false);
3253 if (!base || !base->contains(dir))
3254 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3255
3256 bool inside = true;
3257 set<CDir*> bounds;
3258 get_force_dirfrag_bound_set(q->second, bounds);
3259 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3260 CDir *bound = *p;
3261 if (bound->contains(dir)) {
3262 inside = false; // nope, bound is dir or parent of dir, not inside.
3263 break;
3264 }
3265 }
3266 if (inside)
3267 claimed_by_sender = true;
3268 }
3269
3270 my_ambiguous_imports.erase(p); // no longer ambiguous.
3271 if (claimed_by_sender) {
3272 dout(7) << "ambiguous import failed on " << *dir << dendl;
3273 migrator->import_reverse(dir);
3274 } else {
3275 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3276 migrator->import_finish(dir, true);
3277 }
3278 }
3279 p = next;
3280 }
3281 }
3282
3283 // update my dir_auth values
3284  // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3285 // migrations between other nodes)
3286 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3287 pi != m->subtrees.end();
3288 ++pi) {
3289 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3290 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3291 if (!dir)
3292 continue;
3293 adjust_bounded_subtree_auth(dir, pi->second, from);
3294 try_subtree_merge(dir);
3295 }
3296
3297 show_subtrees();
3298
3299 // note ambiguous imports too
3300 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3301 pi != m->ambiguous_imports.end();
3302 ++pi) {
3303 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3304 other_ambiguous_imports[from][pi->first].swap( pi->second );
3305 }
3306
3307 // did i get them all?
3308 resolve_gather.erase(from);
3309
3310 maybe_resolve_finish();
3311
3312 m->put();
3313}
3314
3315void MDCache::process_delayed_resolve()
3316{
3317 dout(10) << "process_delayed_resolve" << dendl;
3318 map<mds_rank_t, MMDSResolve*> tmp;
3319 tmp.swap(delayed_resolve);
3320 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3321 handle_resolve(p->second);
3322}
3323
3324void MDCache::discard_delayed_resolve(mds_rank_t who)
3325{
3326 if (delayed_resolve.count(who)) {
3327 delayed_resolve[who]->put();
3328 delayed_resolve.erase(who);
3329 }
3330}
3331
3332void MDCache::maybe_resolve_finish()
3333{
3334 assert(resolve_ack_gather.empty());
3335 assert(need_resolve_rollback.empty());
3336
3337 if (!resolve_gather.empty()) {
3338 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3339 << resolve_gather << ")" << dendl;
3340 return;
3341 }
3342
3343 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3344 disambiguate_my_imports();
3345 finish_committed_masters();
3346
3347 if (resolve_done) {
3348 assert(mds->is_resolve());
3349 trim_unlinked_inodes();
3350 recalc_auth_bits(false);
3351 resolve_done.release()->complete(0);
3352 } else {
3353 maybe_send_pending_rejoins();
3354 }
3355}
3356
3357/* This function puts the passed message before returning */
3358void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3359{
3360 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3361 mds_rank_t from = mds_rank_t(ack->get_source().num());
3362
3363 if (!resolve_ack_gather.count(from) ||
3364 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3365 ack->put();
3366 return;
3367 }
3368
3369 if (ambiguous_slave_updates.count(from)) {
3370 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3371 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3372 }
3373
3374 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3375 p != ack->commit.end();
3376 ++p) {
3377 dout(10) << " commit on slave " << p->first << dendl;
3378
3379 if (ambiguous_slave_updates.count(from)) {
3380 remove_ambiguous_slave_update(p->first, from);
3381 continue;
3382 }
3383
3384 if (mds->is_resolve()) {
3385 // replay
3386 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3387 assert(su);
3388
3389 // log commit
3390 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3391 ESlaveUpdate::OP_COMMIT, su->origop),
3392 new C_MDC_SlaveCommit(this, from, p->first));
3393 mds->mdlog->flush();
3394
3395 finish_uncommitted_slave_update(p->first, from);
3396 } else {
3397 MDRequestRef mdr = request_get(p->first);
3398 // information about master imported caps
3399 if (p->second.length() > 0)
3400 mdr->more()->inode_import.claim(p->second);
3401
3402 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3403 request_finish(mdr);
3404 }
3405 }
3406
3407 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3408 p != ack->abort.end();
3409 ++p) {
3410 dout(10) << " abort on slave " << *p << dendl;
3411
3412 if (mds->is_resolve()) {
3413 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3414 assert(su);
3415
3416 // perform rollback (and journal a rollback entry)
3417 // note: this will hold up the resolve a bit, until the rollback entries journal.
3418 MDRequestRef null_ref;
3419 switch (su->origop) {
3420 case ESlaveUpdate::LINK:
3421 mds->server->do_link_rollback(su->rollback, from, null_ref);
3422 break;
3423 case ESlaveUpdate::RENAME:
3424 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3425 break;
3426 case ESlaveUpdate::RMDIR:
3427 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3428 break;
3429 default:
3430 ceph_abort();
3431 }
3432 } else {
3433 MDRequestRef mdr = request_get(*p);
3434 mdr->aborted = true;
3435 if (mdr->slave_request) {
3436 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3437 add_rollback(*p, from);
3438 } else {
3439 request_finish(mdr);
3440 }
3441 }
3442 }
3443
3444 if (!ambiguous_slave_updates.count(from))
3445 resolve_ack_gather.erase(from);
3446 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3447 send_subtree_resolves();
3448 process_delayed_resolve();
3449 }
3450
3451 ack->put();
3452}
3453
3454void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3455{
3456 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3457 uncommitted_slave_updates[master][reqid] = su;
3458 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3459 uncommitted_slave_rename_olddir[*p]++;
3460 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3461 uncommitted_slave_unlink[*p]++;
3462}
3463
3464void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3465{
3466 assert(uncommitted_slave_updates[master].count(reqid));
3467 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3468
3469 uncommitted_slave_updates[master].erase(reqid);
3470 if (uncommitted_slave_updates[master].empty())
3471 uncommitted_slave_updates.erase(master);
3472 // discard the non-auth subtree we renamed out of
3473 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3474 CInode *diri = *p;
3475 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3476 assert(it != uncommitted_slave_rename_olddir.end());
3477 it->second--;
3478 if (it->second == 0) {
3479 uncommitted_slave_rename_olddir.erase(it);
3480 list<CDir*> ls;
3481 diri->get_dirfrags(ls);
3482 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3483 CDir *root = get_subtree_root(*q);
3484 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3485 try_trim_non_auth_subtree(root);
3486 if (*q != root)
3487 break;
3488 }
3489 }
3490 } else
3491 assert(it->second > 0);
3492 }
3493 // remove the inodes that were unlinked by the slave update
3494 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3495 CInode *in = *p;
3496 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3497 assert(it != uncommitted_slave_unlink.end());
3498 it->second--;
3499 if (it->second == 0) {
3500 uncommitted_slave_unlink.erase(it);
3501 if (!in->get_projected_parent_dn())
3502 mds->mdcache->remove_inode_recursive(in);
3503 } else
3504 assert(it->second > 0);
3505 }
3506 delete su;
3507}
3508
3509MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3510{
3511
3512 MDSlaveUpdate* su = NULL;
3513 if (uncommitted_slave_updates.count(master) &&
3514 uncommitted_slave_updates[master].count(reqid)) {
3515 su = uncommitted_slave_updates[master][reqid];
3516 assert(su);
3517 }
3518 return su;
3519}
3520
3521void MDCache::finish_rollback(metareqid_t reqid) {
3522 assert(need_resolve_rollback.count(reqid));
3523 if (mds->is_resolve())
3524 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3525 need_resolve_rollback.erase(reqid);
3526 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3527 send_subtree_resolves();
3528 process_delayed_resolve();
3529 }
3530}
3531
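 // other nodes' ambiguous imports: if a subtree's authority still looks ambiguous
 // (or undefined) from our side, the claimant really did complete the import, so
 // adopt its claim; otherwise we already know the owner and the import never happened.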
3532void MDCache::disambiguate_other_imports()
3533{
3534 dout(10) << "disambiguate_other_imports" << dendl;
3535
3536 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3537 // other nodes' ambiguous imports
3538 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3539 p != other_ambiguous_imports.end();
3540 ++p) {
3541 mds_rank_t who = p->first;
3542 dout(10) << "ambiguous imports for mds." << who << dendl;
3543
3544 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3545 q != p->second.end();
3546 ++q) {
3547 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3548 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3549 CDir *dir = get_force_dirfrag(q->first, recovering);
3550 if (!dir) continue;
3551
3552 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3553 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3554 dout(10) << " mds." << who << " did import " << *dir << dendl;
3555 adjust_bounded_subtree_auth(dir, q->second, who);
3556 try_subtree_merge(dir);
3557 } else {
3558 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3559 }
3560 }
3561 }
3562 other_ambiguous_imports.clear();
3563}
3564
3565void MDCache::disambiguate_my_imports()
3566{
3567 dout(10) << "disambiguate_my_imports" << dendl;
3568
3569 if (!mds->is_resolve()) {
3570 assert(my_ambiguous_imports.empty());
3571 return;
3572 }
3573
3574 disambiguate_other_imports();
3575
3576 // my ambiguous imports
3577 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3578 while (!my_ambiguous_imports.empty()) {
3579 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3580
3581 CDir *dir = get_dirfrag(q->first);
3582 assert(dir);
3583
3584 if (dir->authority() != me_ambig) {
3585 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3586 cancel_ambiguous_import(dir);
3587
3588 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3589
3590 // subtree may have been swallowed by another node claiming dir
3591 // as their own.
3592 CDir *root = get_subtree_root(dir);
3593 if (root != dir)
3594 dout(10) << " subtree root is " << *root << dendl;
3595 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3596 try_trim_non_auth_subtree(root);
3597 } else {
3598 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3599 finish_ambiguous_import(q->first);
3600 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3601 }
3602 }
3603 assert(my_ambiguous_imports.empty());
3604 mds->mdlog->flush();
3605
3606 // verify all my subtrees are unambiguous!
3607 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3608 p != subtrees.end();
3609 ++p) {
3610 CDir *dir = p->first;
3611 if (dir->is_ambiguous_dir_auth()) {
3612 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3613 }
3614 assert(!dir->is_ambiguous_dir_auth());
3615 }
3616
3617 show_subtrees();
3618}
3619
3620
3621void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3622{
3623 assert(my_ambiguous_imports.count(base) == 0);
3624 my_ambiguous_imports[base] = bounds;
3625}
3626
3627
3628void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3629{
3630 // make a list
3631 vector<dirfrag_t> binos;
3632 for (set<CDir*>::iterator p = bounds.begin();
3633 p != bounds.end();
3634 ++p)
3635 binos.push_back((*p)->dirfrag());
3636
3637 // note: this can get called twice if the exporter fails during recovery
3638 if (my_ambiguous_imports.count(base->dirfrag()))
3639 my_ambiguous_imports.erase(base->dirfrag());
3640
3641 add_ambiguous_import(base->dirfrag(), binos);
3642}
3643
3644void MDCache::cancel_ambiguous_import(CDir *dir)
3645{
3646 dirfrag_t df = dir->dirfrag();
3647 assert(my_ambiguous_imports.count(df));
3648 dout(10) << "cancel_ambiguous_import " << df
3649 << " bounds " << my_ambiguous_imports[df]
3650 << " " << *dir
3651 << dendl;
3652 my_ambiguous_imports.erase(df);
3653}
3654
3655void MDCache::finish_ambiguous_import(dirfrag_t df)
3656{
3657 assert(my_ambiguous_imports.count(df));
3658 vector<dirfrag_t> bounds;
3659 bounds.swap(my_ambiguous_imports[df]);
3660 my_ambiguous_imports.erase(df);
3661
3662 dout(10) << "finish_ambiguous_import " << df
3663 << " bounds " << bounds
3664 << dendl;
3665 CDir *dir = get_dirfrag(df);
3666 assert(dir);
3667
3668 // adjust dir_auth, import maps
3669 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3670 try_subtree_merge(dir);
3671}
3672
3673void MDCache::remove_inode_recursive(CInode *in)
3674{
3675 dout(10) << "remove_inode_recursive " << *in << dendl;
3676 list<CDir*> ls;
3677 in->get_dirfrags(ls);
3678 list<CDir*>::iterator p = ls.begin();
3679 while (p != ls.end()) {
3680 CDir *subdir = *p++;
3681
3682 dout(10) << " removing dirfrag " << subdir << dendl;
3683 auto it = subdir->items.begin();
3684 while (it != subdir->items.end()) {
3685 CDentry *dn = it->second;
3686 ++it;
3687 CDentry::linkage_t *dnl = dn->get_linkage();
3688 if (dnl->is_primary()) {
3689 CInode *tin = dnl->get_inode();
31f18b77 3690 subdir->unlink_inode(dn, false);
3691 remove_inode_recursive(tin);
3692 }
3693 subdir->remove_dentry(dn);
3694 }
3695
3696 if (subdir->is_subtree_root())
3697 remove_subtree(subdir);
3698 in->close_dirfrag(subdir->dirfrag().frag);
3699 }
3700 remove_inode(in);
3701}
3702
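 // returns true ("abort") as soon as anything under this inode cannot be expired
 // yet -- a nested subtree, a primary child that still has linkage elsewhere, or a
 // dentry that is not yet expireable; otherwise every visited dentry is trimmed
 // into the per-rank expire map and false is returned.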
3703bool MDCache::expire_recursive(
3704 CInode *in,
3705 map<mds_rank_t, MCacheExpire*>& expiremap)
3706{
3707 assert(!in->is_auth());
3708
3709 dout(10) << __func__ << ":" << *in << dendl;
3710
3711 // Recurse into any dirfrags beneath this inode
3712 list<CDir*> ls;
3713 in->get_dirfrags(ls);
3714 for (auto subdir : ls) {
3715 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3716 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3717 return true;
3718 }
3719
3720 for (auto &it : subdir->items) {
3721 CDentry *dn = it.second;
3722 CDentry::linkage_t *dnl = dn->get_linkage();
3723 if (dnl->is_primary()) {
3724 CInode *tin = dnl->get_inode();
3725
3726 /* Remote strays with linkage (i.e. hardlinks) should not be
3727 * expired, because they may be the target of
3728 * a rename() as the owning MDS shuts down */
3729 if (!tin->is_stray() && tin->inode.nlink) {
3730 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3731 return true;
3732 }
3733
3734 const bool abort = expire_recursive(tin, expiremap);
3735 if (abort) {
3736 return true;
3737 }
3738 }
3739 if (dn->lru_is_expireable()) {
3740 trim_dentry(dn, expiremap);
3741 } else {
3742 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3743 return true;
3744 }
3745 }
3746 }
3747
3748 return false;
3749}
3750
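 // sweep the whole inode map and recursively drop any non-base inode that no longer
 // has a parent dentry (typically left dangling by updates undone during resolve).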
3751void MDCache::trim_unlinked_inodes()
3752{
3753 dout(7) << "trim_unlinked_inodes" << dendl;
3754 list<CInode*> q;
94b18763 3755 for (auto &p : inode_map) {
b32b8144 3756 CInode *in = p.second;
3757 if (in->get_parent_dn() == NULL && !in->is_base()) {
3758 dout(7) << " will trim from " << *in << dendl;
3759 q.push_back(in);
3760 }
3761 }
3762 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3763 remove_inode_recursive(*p);
3764}
3765
3766/** recalc_auth_bits()
3767 * once subtree auth is disambiguated, we need to adjust all the
3768 * auth and dirty bits in our cache before moving on.
3769 */
3770void MDCache::recalc_auth_bits(bool replay)
3771{
3772 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3773
3774 if (root) {
3775 root->inode_auth.first = mds->mdsmap->get_root();
3776 bool auth = mds->get_nodeid() == root->inode_auth.first;
3777 if (auth) {
3778 root->state_set(CInode::STATE_AUTH);
3779 } else {
3780 root->state_clear(CInode::STATE_AUTH);
3781 if (!replay)
3782 root->state_set(CInode::STATE_REJOINING);
3783 }
3784 }
3785
3786 set<CInode*> subtree_inodes;
3787 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3788 p != subtrees.end();
3789 ++p) {
3790 if (p->first->dir_auth.first == mds->get_nodeid())
3791 subtree_inodes.insert(p->first->inode);
3792 }
3793
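 // walk each subtree from its root dirfrag: set or clear the AUTH bit on every
 // dirfrag, dentry and inode beneath it.  non-auth items are additionally marked
 // REJOINING (unless replaying) and have their dirty state cleared; empty non-auth
 // dirfrags are simply closed.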
3794 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3795 p != subtrees.end();
3796 ++p) {
3797 if (p->first->inode->is_mdsdir()) {
3798 CInode *in = p->first->inode;
3799 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3800 if (auth) {
3801 in->state_set(CInode::STATE_AUTH);
3802 } else {
3803 in->state_clear(CInode::STATE_AUTH);
3804 if (!replay)
3805 in->state_set(CInode::STATE_REJOINING);
3806 }
3807 }
3808
3809 list<CDir*> dfq; // dirfrag queue
3810 dfq.push_back(p->first);
3811
3812 bool auth = p->first->authority().first == mds->get_nodeid();
3813 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3814
3815 while (!dfq.empty()) {
3816 CDir *dir = dfq.front();
3817 dfq.pop_front();
3818
3819 // dir
3820 if (auth) {
3821 dir->state_set(CDir::STATE_AUTH);
3822 } else {
3823 dir->state_clear(CDir::STATE_AUTH);
3824 if (!replay) {
3825 // close empty non-auth dirfrag
3826 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3827 dir->inode->close_dirfrag(dir->get_frag());
3828 continue;
3829 }
3830 dir->state_set(CDir::STATE_REJOINING);
3831 dir->state_clear(CDir::STATE_COMPLETE);
3832 if (dir->is_dirty())
3833 dir->mark_clean();
3834 }
3835 }
3836
3837 // dentries in this dir
94b18763 3838 for (auto &p : dir->items) {
7c673cae 3839 // dn
94b18763 3840 CDentry *dn = p.second;
3841 CDentry::linkage_t *dnl = dn->get_linkage();
3842 if (auth) {
3843 dn->state_set(CDentry::STATE_AUTH);
3844 } else {
3845 dn->state_clear(CDentry::STATE_AUTH);
3846 if (!replay) {
3847 dn->state_set(CDentry::STATE_REJOINING);
3848 if (dn->is_dirty())
3849 dn->mark_clean();
3850 }
3851 }
3852
3853 if (dnl->is_primary()) {
3854 // inode
3855 CInode *in = dnl->get_inode();
3856 if (auth) {
3857 in->state_set(CInode::STATE_AUTH);
3858 } else {
3859 in->state_clear(CInode::STATE_AUTH);
3860 if (!replay) {
3861 in->state_set(CInode::STATE_REJOINING);
3862 if (in->is_dirty())
3863 in->mark_clean();
3864 if (in->is_dirty_parent())
3865 in->clear_dirty_parent();
3866 // avoid touching scatterlocks for our subtree roots!
3867 if (subtree_inodes.count(in) == 0)
3868 in->clear_scatter_dirty();
3869 }
3870 }
3871 // recurse?
3872 if (in->is_dir())
3873 in->get_nested_dirfrags(dfq);
3874 }
3875 }
3876 }
3877 }
3878
3879 show_subtrees();
3880 show_cache();
3881}
3882
3883
3884
3885// ===========================================================================
3886// REJOIN
3887
3888/*
3889 * notes on scatterlock recovery:
3890 *
3891 * - recovering inode replica sends scatterlock data for any subtree
3892 * roots (the only ones that are possibly dirty).
3893 *
3894 * - surviving auth incorporates any provided scatterlock data. any
3895 * pending gathers are then finished, as with the other lock types.
3896 *
3897 * that takes care of surviving auth + (recovering replica)*.
3898 *
3899 * - surviving replica sends strong_inode, which includes current
3900 * scatterlock state, AND any dirty scatterlock data. this
3901 * provides the recovering auth with everything it might need.
3902 *
3903 * - recovering auth must pick initial scatterlock state based on
3904 * (weak|strong) rejoins.
3905 * - always assimilate scatterlock data (it can't hurt)
3906 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3907 * - include base inode in ack for all inodes that saw scatterlock content
3908 *
3909 * also, for scatter gather,
3910 *
3911 * - auth increments {frag,r}stat.version on completion of any gather.
3912 *
3913 * - auth incorporates changes in a gather _only_ if the version
3914 * matches.
3915 *
3916 * - replica discards changes any time the scatterlock syncs, and
3917 * after recovery.
3918 */
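// illustrative sketch only (not actual MDCache code): assuming "SCATTER" maps to
// LOCK_MIX and "SYNC" to LOCK_SYNC, the recovering auth's choice above is roughly
//
//   bool any_replica_scattered = ...;  // gathered from the (weak|strong) rejoins
//   lock->set_state(any_replica_scattered ? LOCK_MIX : LOCK_SYNC);
//
// with the assimilated scatterlock data applied regardless of the state chosen.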
3919
3920void MDCache::dump_rejoin_status(Formatter *f) const
3921{
3922 f->open_object_section("rejoin_status");
3923 f->dump_stream("rejoin_gather") << rejoin_gather;
3924 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3925 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3926 f->close_section();
3927}
3928
3929void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3930{
3931 dout(10) << "rejoin_start" << dendl;
3932 assert(!rejoin_done);
3933 rejoin_done.reset(rejoin_done_);
3934
3935 rejoin_gather = recovery_set;
3936 // need to finish opening cap inodes before sending cache rejoins
3937 rejoin_gather.insert(mds->get_nodeid());
3938 process_imported_caps();
3939}
3940
3941/*
3942 * rejoin phase!
3943 *
3944 * this initiates rejoin. it should be called before we get any
3945 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3946 *
3947 * we start out by sending rejoins to everyone in the recovery set.
3948 *
3949 * if we are rejoining, send for all regions in our cache.
3950 * if we are active|stopping, send only to nodes that are rejoining.
3951 */
3952void MDCache::rejoin_send_rejoins()
3953{
3954 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3955
3956 if (rejoin_gather.count(mds->get_nodeid())) {
3957 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3958 rejoins_pending = true;
3959 return;
3960 }
3961 if (!resolve_gather.empty()) {
3962 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3963 << resolve_gather << ")" << dendl;
3964 rejoins_pending = true;
3965 return;
3966 }
3967
3968 assert(!migrator->is_importing());
3969 assert(!migrator->is_exporting());
3970
3971 if (!mds->is_rejoin()) {
3972 disambiguate_other_imports();
3973 }
3974
3975 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3976
3977
3978 // if i am rejoining, send a rejoin to everyone.
3979 // otherwise, just send to others who are rejoining.
3980 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3981 p != recovery_set.end();
3982 ++p) {
3983 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3984 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3985 if (mds->is_rejoin())
3986 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3987 else if (mds->mdsmap->is_rejoin(*p))
3988 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3989 }
3990
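 // a recovering (weak) sender also reports the client caps it is exporting back to
 // each target's auth inodes, along with each client's entity instance so the
 // target can reconstruct the cap import.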
3991 if (mds->is_rejoin()) {
3992 map<client_t, set<mds_rank_t> > client_exports;
3993 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
28e407b8 3994 mds_rank_t target = p->second.first;
3995 if (rejoins.count(target) == 0)
3996 continue;
3997 rejoins[target]->cap_exports[p->first] = p->second.second;
3998 for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
3999 client_exports[q->first].insert(target);
4000 }
4001 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
4002 p != client_exports.end();
4003 ++p) {
4004 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
4005 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
4006 rejoins[*q]->client_map[p->first] = inst;
4007 }
4008 }
4009
4010
4011 // check all subtrees
4012 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4013 p != subtrees.end();
4014 ++p) {
4015 CDir *dir = p->first;
4016 assert(dir->is_subtree_root());
4017 if (dir->is_ambiguous_dir_auth()) {
4018 // exporter is recovering, importer is survivor.
4019 assert(rejoins.count(dir->authority().first));
4020 assert(!rejoins.count(dir->authority().second));
4021 continue;
4022 }
4023
4024 // my subtree?
4025 if (dir->is_auth())
4026 continue; // skip my own regions!
4027
4028 mds_rank_t auth = dir->get_dir_auth().first;
4029 assert(auth >= 0);
4030 if (rejoins.count(auth) == 0)
4031 continue; // don't care about this node's subtrees
4032
4033 rejoin_walk(dir, rejoins[auth]);
4034 }
4035
4036 // rejoin root inodes, too
4037 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4038 p != rejoins.end();
4039 ++p) {
4040 if (mds->is_rejoin()) {
4041 // weak
4042 if (p->first == 0 && root) {
4043 p->second->add_weak_inode(root->vino());
4044 if (root->is_dirty_scattered()) {
4045 dout(10) << " sending scatterlock state on root " << *root << dendl;
4046 p->second->add_scatterlock_state(root);
4047 }
4048 }
4049 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4050 if (in)
4051 p->second->add_weak_inode(in->vino());
4052 }
4053 } else {
4054 // strong
4055 if (p->first == 0 && root) {
4056 p->second->add_strong_inode(root->vino(),
4057 root->get_replica_nonce(),
4058 root->get_caps_wanted(),
4059 root->filelock.get_state(),
4060 root->nestlock.get_state(),
4061 root->dirfragtreelock.get_state());
4062 root->state_set(CInode::STATE_REJOINING);
4063 if (root->is_dirty_scattered()) {
4064 dout(10) << " sending scatterlock state on root " << *root << dendl;
4065 p->second->add_scatterlock_state(root);
4066 }
4067 }
4068
4069 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4070 p->second->add_strong_inode(in->vino(),
4071 in->get_replica_nonce(),
4072 in->get_caps_wanted(),
4073 in->filelock.get_state(),
4074 in->nestlock.get_state(),
4075 in->dirfragtreelock.get_state());
4076 in->state_set(CInode::STATE_REJOINING);
4077 }
4078 }
4079 }
4080
4081 if (!mds->is_rejoin()) {
4082 // i am survivor. send strong rejoin.
4083 // note request remote_auth_pins, xlocks
4084 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4085 p != active_requests.end();
4086 ++p) {
4087 MDRequestRef& mdr = p->second;
4088 if (mdr->is_slave())
4089 continue;
4090 // auth pins
4091 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4092 q != mdr->remote_auth_pins.end();
4093 ++q) {
4094 if (!q->first->is_auth()) {
4095 assert(q->second == q->first->authority().first);
4096 if (rejoins.count(q->second) == 0) continue;
4097 MMDSCacheRejoin *rejoin = rejoins[q->second];
4098
4099 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4100 MDSCacheObjectInfo i;
4101 q->first->set_object_info(i);
4102 if (i.ino)
4103 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4104 else
4105 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4106
4107 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4108 mdr->more()->rename_inode == q->first)
4109 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4110 mdr->reqid, mdr->attempt);
4111 }
4112 }
4113 // xlocks
4114 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4115 q != mdr->xlocks.end();
4116 ++q) {
4117 if (!(*q)->get_parent()->is_auth()) {
4118 mds_rank_t who = (*q)->get_parent()->authority().first;
4119 if (rejoins.count(who) == 0) continue;
4120 MMDSCacheRejoin *rejoin = rejoins[who];
4121
4122 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4123 MDSCacheObjectInfo i;
4124 (*q)->get_parent()->set_object_info(i);
4125 if (i.ino)
4126 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4127 mdr->reqid, mdr->attempt);
4128 else
4129 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4130 mdr->reqid, mdr->attempt);
4131 }
4132 }
4133 // remote wrlocks
4134 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4135 q != mdr->remote_wrlocks.end();
4136 ++q) {
4137 mds_rank_t who = q->second;
4138 if (rejoins.count(who) == 0) continue;
4139 MMDSCacheRejoin *rejoin = rejoins[who];
4140
4141 dout(15) << " " << *mdr << " wrlock on " << q->second
4142 << " " << q->first->get_parent() << dendl;
4143 MDSCacheObjectInfo i;
4144 q->first->get_parent()->set_object_info(i);
4145 assert(i.ino);
4146 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4147 mdr->reqid, mdr->attempt);
4148 }
4149 }
4150 }
4151
4152 // send the messages
4153 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4154 p != rejoins.end();
4155 ++p) {
4156 assert(rejoin_sent.count(p->first) == 0);
4157 assert(rejoin_ack_gather.count(p->first) == 0);
4158 rejoin_sent.insert(p->first);
4159 rejoin_ack_gather.insert(p->first);
4160 mds->send_message_mds(p->second, p->first);
4161 }
4162 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4163 rejoins_pending = false;
4164
4165 // nothing?
28e407b8 4166 if (mds->is_rejoin() && rejoin_gather.empty()) {
4167 dout(10) << "nothing to rejoin" << dendl;
4168 rejoin_gather_finish();
4169 }
4170}
4171
4172
4173/**
4174 * rejoin_walk - build rejoin declarations for a subtree
4175 *
4176 * @param dir subtree root
4177 * @param rejoin rejoin message
4178 *
4179 * from a rejoining node:
4180 * weak dirfrag
4181 * weak dentries (w/ connectivity)
4182 *
4183 * from a surviving node:
4184 * strong dirfrag
4185 * strong dentries (no connectivity!)
4186 * strong inodes
4187 */
4188void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4189{
4190 dout(10) << "rejoin_walk " << *dir << dendl;
4191
4192 list<CDir*> nested; // finish this dir, then do nested items
4193
4194 if (mds->is_rejoin()) {
4195 // WEAK
4196 rejoin->add_weak_dirfrag(dir->dirfrag());
4197 for (auto &p : dir->items) {
4198 CDentry *dn = p.second;
4199 assert(dn->last == CEPH_NOSNAP);
4200 CDentry::linkage_t *dnl = dn->get_linkage();
4201 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4202 assert(dnl->is_primary());
4203 CInode *in = dnl->get_inode();
4204 assert(dnl->get_inode()->is_dir());
94b18763 4205 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
4206 in->get_nested_dirfrags(nested);
4207 if (in->is_dirty_scattered()) {
4208 dout(10) << " sending scatterlock state on " << *in << dendl;
4209 rejoin->add_scatterlock_state(in);
4210 }
4211 }
4212 } else {
4213 // STRONG
4214 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4215 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4216 dir->state_set(CDir::STATE_REJOINING);
4217
4218 for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
4219 CDentry *dn = it->second;
4220 CDentry::linkage_t *dnl = dn->get_linkage();
4221 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4222 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4223 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4224 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4225 dnl->is_remote() ? dnl->get_remote_d_type():0,
4226 dn->get_replica_nonce(),
4227 dn->lock.get_state());
4228 dn->state_set(CDentry::STATE_REJOINING);
4229 if (dnl->is_primary()) {
4230 CInode *in = dnl->get_inode();
4231 dout(15) << " add_strong_inode " << *in << dendl;
4232 rejoin->add_strong_inode(in->vino(),
4233 in->get_replica_nonce(),
4234 in->get_caps_wanted(),
4235 in->filelock.get_state(),
4236 in->nestlock.get_state(),
4237 in->dirfragtreelock.get_state());
4238 in->state_set(CInode::STATE_REJOINING);
4239 in->get_nested_dirfrags(nested);
4240 if (in->is_dirty_scattered()) {
4241 dout(10) << " sending scatterlock state on " << *in << dendl;
4242 rejoin->add_scatterlock_state(in);
4243 }
4244 }
4245 }
4246 }
4247
4248 // recurse into nested dirs
4249 for (list<CDir*>::iterator p = nested.begin();
4250 p != nested.end();
4251 ++p)
4252 rejoin_walk(*p, rejoin);
4253}
4254
4255
4256/*
4257 * i got a rejoin.
4258 * - reply with the lockstate
4259 *
4260 * if i am active|stopping,
4261 * - remove source from replica list for everything not referenced here.
4262 * This function puts the passed message before returning.
4263 */
4264void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4265{
4266 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4267 << " (" << m->get_payload().length() << " bytes)"
4268 << dendl;
4269
4270 switch (m->op) {
4271 case MMDSCacheRejoin::OP_WEAK:
4272 handle_cache_rejoin_weak(m);
4273 break;
4274 case MMDSCacheRejoin::OP_STRONG:
4275 handle_cache_rejoin_strong(m);
4276 break;
4277 case MMDSCacheRejoin::OP_ACK:
4278 handle_cache_rejoin_ack(m);
4279 break;
4280
4281 default:
4282 ceph_abort();
4283 }
4284 m->put();
4285}
4286
4287
4288/*
4289 * handle_cache_rejoin_weak
4290 *
4291 * the sender
4292 * - is recovering from their journal.
4293 * - may have incorrect (out of date) inode contents
4294 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4295 *
4296 * if the sender didn't trim_non_auth(), they
4297 * - may have incorrect (out of date) dentry/inode linkage
4298 * - may have deleted/purged inodes
4299 * and i may have to go to disk to get accurate inode contents. yuck.
4300 * This function DOES NOT put the passed message before returning
4301 */
4302void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4303{
4304 mds_rank_t from = mds_rank_t(weak->get_source().num());
4305
4306 // possible response(s)
4307 MMDSCacheRejoin *ack = 0; // if survivor
4308 set<vinodeno_t> acked_inodes; // if survivor
4309 set<SimpleLock *> gather_locks; // if survivor
4310 bool survivor = false; // am i a survivor?
4311
4312 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4313 survivor = true;
4314 dout(10) << "i am a survivor, and will ack immediately" << dendl;
4315 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4316
4317 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4318
4319 // check cap exports
4320 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4321 CInode *in = get_inode(p->first);
4322 assert(!in || in->is_auth());
4323 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4324 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4325 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4326 Capability::Import& im = imported_caps[p->first][q->first];
4327 if (cap) {
4328 im.cap_id = cap->get_cap_id();
4329 im.issue_seq = cap->get_last_seq();
4330 im.mseq = cap->get_mseq();
4331 } else {
4332 // all are zero
4333 }
4334 }
4335 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4336 }
4337
4338 ::encode(imported_caps, ack->imported_caps);
4339 } else {
4340 assert(mds->is_rejoin());
4341
4342 // we may have already received a strong rejoin from the sender.
4343 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4344 assert(gather_locks.empty());
4345
4346 // check cap exports.
4347 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4348
4349 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4350 CInode *in = get_inode(p->first);
b32b8144 4351 assert(!in || in->is_auth());
4352 // note
4353 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4354 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4355 cap_imports[p->first][q->first][from] = q->second;
4356 }
4357 }
4358 }
4359
4360 // assimilate any potentially dirty scatterlock state
4361 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4362 p != weak->inode_scatterlocks.end();
4363 ++p) {
4364 CInode *in = get_inode(p->first);
4365 assert(in);
4366 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4367 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4368 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4369 if (!survivor)
4370 rejoin_potential_updated_scatterlocks.insert(in);
4371 }
4372
4373 // recovering peer may send incorrect dirfrags here. we need to
4374 // infer which dirfrag they meant. the ack will include a
4375 // strong_dirfrag that will set them straight on the fragmentation.
4376
4377 // walk weak map
4378 set<CDir*> dirs_to_share;
4379 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4380 p != weak->weak_dirfrags.end();
4381 ++p) {
4382 CInode *diri = get_inode(p->ino);
4383 if (!diri)
4384 dout(0) << " missing dir ino " << p->ino << dendl;
4385 assert(diri);
4386
4387 list<frag_t> ls;
4388 if (diri->dirfragtree.is_leaf(p->frag)) {
4389 ls.push_back(p->frag);
4390 } else {
4391 diri->dirfragtree.get_leaves_under(p->frag, ls);
4392 if (ls.empty())
4393 ls.push_back(diri->dirfragtree[p->frag.value()]);
4394 }
4395 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4396 frag_t fg = *q;
4397 CDir *dir = diri->get_dirfrag(fg);
4398 if (!dir) {
4399 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4400 continue;
4401 }
4402 assert(dir);
4403 if (dirs_to_share.count(dir)) {
4404 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4405 } else {
4406 dirs_to_share.insert(dir);
4407 unsigned nonce = dir->add_replica(from);
4408 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4409 if (ack) {
4410 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4411 ack->add_dirfrag_base(dir);
4412 }
4413 }
4414 }
4415 }
4416
4417 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4418 p != weak->weak.end();
4419 ++p) {
4420 CInode *diri = get_inode(p->first);
4421 if (!diri)
4422 dout(0) << " missing dir ino " << p->first << dendl;
4423 assert(diri);
4424
4425 // weak dentries
4426 CDir *dir = 0;
4427 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4428 q != p->second.end();
4429 ++q) {
4430 // locate proper dirfrag.
4431 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4432 frag_t fg = diri->pick_dirfrag(q->first.name);
4433 if (!dir || dir->get_frag() != fg) {
4434 dir = diri->get_dirfrag(fg);
4435 if (!dir)
4436 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4437 assert(dir);
4438 assert(dirs_to_share.count(dir));
4439 }
4440
4441 // and dentry
4442 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4443 assert(dn);
4444 CDentry::linkage_t *dnl = dn->get_linkage();
4445 assert(dnl->is_primary());
4446
4447 if (survivor && dn->is_replica(from))
4448 dentry_remove_replica(dn, from, gather_locks);
4449 unsigned dnonce = dn->add_replica(from);
4450 dout(10) << " have " << *dn << dendl;
4451 if (ack)
94b18763 4452 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4453 dnl->get_inode()->ino(), inodeno_t(0), 0,
4454 dnonce, dn->lock.get_replica_state());
4455
4456 // inode
4457 CInode *in = dnl->get_inode();
4458 assert(in);
4459
4460 if (survivor && in->is_replica(from))
4461 inode_remove_replica(in, from, true, gather_locks);
4462 unsigned inonce = in->add_replica(from);
4463 dout(10) << " have " << *in << dendl;
4464
4465 // scatter the dirlock, just in case?
4466 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4467 in->filelock.set_state(LOCK_MIX);
4468
4469 if (ack) {
4470 acked_inodes.insert(in->vino());
4471 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4472 bufferlist bl;
4473 in->_encode_locks_state_for_rejoin(bl, from);
4474 ack->add_inode_locks(in, inonce, bl);
4475 }
4476 }
4477 }
4478
4479 // weak base inodes? (root, stray, etc.)
4480 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4481 p != weak->weak_inodes.end();
4482 ++p) {
4483 CInode *in = get_inode(*p);
4484 assert(in); // hmm fixme wrt stray?
4485 if (survivor && in->is_replica(from))
4486 inode_remove_replica(in, from, true, gather_locks);
4487 unsigned inonce = in->add_replica(from);
4488 dout(10) << " have base " << *in << dendl;
4489
4490 if (ack) {
4491 acked_inodes.insert(in->vino());
4492 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4493 bufferlist bl;
4494 in->_encode_locks_state_for_rejoin(bl, from);
4495 ack->add_inode_locks(in, inonce, bl);
4496 }
4497 }
4498
4499 assert(rejoin_gather.count(from));
4500 rejoin_gather.erase(from);
4501 if (survivor) {
4502 // survivor. do everything now.
4503 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4504 p != weak->inode_scatterlocks.end();
4505 ++p) {
4506 CInode *in = get_inode(p->first);
4507 assert(in);
4508 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4509 acked_inodes.insert(in->vino());
4510 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4511 }
4512
4513 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4514 mds->send_message(ack, weak->get_connection());
4515
4516 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4517 if (!(*p)->is_stable())
4518 mds->locker->eval_gather(*p);
4519 }
4520 } else {
4521 // done?
28e407b8 4522 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4523 rejoin_gather_finish();
4524 } else {
4525 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4526 }
4527 }
4528}
4529
4530/*
4531 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4532 *
4533 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4534 * ack, the replica dne, and we can remove it from our replica maps.
4535 */
4536void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4537 set<vinodeno_t>& acked_inodes,
4538 set<SimpleLock *>& gather_locks)
4539{
4540 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4541
b32b8144 4542 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4543 // inode?
4544 if (in->is_auth() &&
4545 in->is_replica(from) &&
b32b8144 4546 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4547 inode_remove_replica(in, from, false, gather_locks);
4548 dout(10) << " rem " << *in << dendl;
4549 }
4550
4551 if (!in->is_dir())
4552 return;
4553
4554 list<CDir*> dfs;
4555 in->get_dirfrags(dfs);
4556 for (list<CDir*>::iterator p = dfs.begin();
4557 p != dfs.end();
4558 ++p) {
4559 CDir *dir = *p;
4560 if (!dir->is_auth())
4561 continue;
7c673cae 4562
181888fb 4563 if (dir->is_replica(from) &&
4564 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4565 dir->remove_replica(from);
4566 dout(10) << " rem " << *dir << dendl;
4567 }
4568
4569 // dentries
4570 for (auto &p : dir->items) {
4571 CDentry *dn = p.second;
4572
4573 if (dn->is_replica(from) &&
4574 (ack == NULL ||
4575 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
94b18763 4576 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
4577 dentry_remove_replica(dn, from, gather_locks);
4578 dout(10) << " rem " << *dn << dendl;
4579 }
4580 }
4581 }
4582 };
4583
94b18763 4584 for (auto &p : inode_map)
b32b8144 4585 scour_func(p.second);
94b18763 4586 for (auto &p : snap_inode_map)
b32b8144 4587 scour_func(p.second);
4588}
4589
4590
4591CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4592{
4593 CInode *in = new CInode(this, true, 1, last);
4594 in->inode.ino = ino;
4595 in->state_set(CInode::STATE_REJOINUNDEF);
4596 add_inode(in);
4597 rejoin_undef_inodes.insert(in);
4598 dout(10) << " invented " << *in << dendl;
4599 return in;
4600}
4601
4602CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4603{
4604 CInode *in = get_inode(df.ino);
4605 if (!in)
4606 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4607 if (!in->is_dir()) {
4608 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4609 in->inode.mode = S_IFDIR;
4610 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4611 }
4612 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4613 dir->state_set(CDir::STATE_REJOINUNDEF);
4614 rejoin_undef_dirfrags.insert(dir);
4615 dout(10) << " invented " << *dir << dendl;
4616 return dir;
4617}
4618
4619/* This function DOES NOT put the passed message before returning */
4620void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4621{
4622 mds_rank_t from = mds_rank_t(strong->get_source().num());
4623
4624 // only a recovering node will get a strong rejoin.
4625 assert(mds->is_rejoin());
4626
4627 // assimilate any potentially dirty scatterlock state
4628 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4629 p != strong->inode_scatterlocks.end();
4630 ++p) {
4631 CInode *in = get_inode(p->first);
4632 assert(in);
4633 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4634 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4635 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4636 rejoin_potential_updated_scatterlocks.insert(in);
4637 }
4638
4639 rejoin_unlinked_inodes[from].clear();
4640
4641 // surviving peer may send incorrect dirfrag here (maybe they didn't
4642 // get the fragment notify, or maybe we rolled back?). we need to
4643 // infer the right frag and get them with the program. somehow.
4644 // we don't normally send ACK.. so we'll need to bundle this with
4645 // MISSING or something.
4646
4647 // strong dirfrags/dentries.
4648 // also process auth_pins, xlocks.
4649 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4650 p != strong->strong_dirfrags.end();
4651 ++p) {
4652 CInode *diri = get_inode(p->first.ino);
4653 if (!diri)
4654 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4655 CDir *dir = diri->get_dirfrag(p->first.frag);
4656 bool refragged = false;
4657 if (dir) {
4658 dout(10) << " have " << *dir << dendl;
4659 } else {
4660 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4661 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4662 else if (diri->dirfragtree.is_leaf(p->first.frag))
4663 dir = rejoin_invent_dirfrag(p->first);
4664 }
4665 if (dir) {
4666 dir->add_replica(from, p->second.nonce);
4667 dir->dir_rep = p->second.dir_rep;
4668 } else {
4669 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4670 list<frag_t> ls;
4671 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4672 if (ls.empty())
4673 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4674 dout(10) << " maps to frag(s) " << ls << dendl;
4675 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4676 CDir *dir = diri->get_dirfrag(*q);
4677 if (!dir)
4678 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4679 else
4680 dout(10) << " have(approx) " << *dir << dendl;
4681 dir->add_replica(from, p->second.nonce);
4682 dir->dir_rep = p->second.dir_rep;
4683 }
4684 refragged = true;
4685 }
4686
4687 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4688 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4689 q != dmap.end();
4690 ++q) {
4691 CDentry *dn;
4692 if (!refragged)
4693 dn = dir->lookup(q->first.name, q->first.snapid);
4694 else {
4695 frag_t fg = diri->pick_dirfrag(q->first.name);
4696 dir = diri->get_dirfrag(fg);
4697 assert(dir);
4698 dn = dir->lookup(q->first.name, q->first.snapid);
4699 }
4700 if (!dn) {
4701 if (q->second.is_remote()) {
4702 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4703 q->second.first, q->first.snapid);
4704 } else if (q->second.is_null()) {
4705 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4706 } else {
4707 CInode *in = get_inode(q->second.ino, q->first.snapid);
4708 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4709 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4710 }
4711 dout(10) << " invented " << *dn << dendl;
4712 }
4713 CDentry::linkage_t *dnl = dn->get_linkage();
4714
4715 // dn auth_pin?
4716 if (strong->authpinned_dentries.count(p->first) &&
4717 strong->authpinned_dentries[p->first].count(q->first)) {
4718 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4719 r != strong->authpinned_dentries[p->first][q->first].end();
4720 ++r) {
4721 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4722
4723 // get/create slave mdrequest
4724 MDRequestRef mdr;
4725 if (have_request(r->reqid))
4726 mdr = request_get(r->reqid);
4727 else
4728 mdr = request_start_slave(r->reqid, r->attempt, strong);
4729 mdr->auth_pin(dn);
4730 }
4731 }
4732
4733 // dn xlock?
4734 if (strong->xlocked_dentries.count(p->first) &&
4735 strong->xlocked_dentries[p->first].count(q->first)) {
4736 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4737 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4738 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4739 assert(mdr->is_auth_pinned(dn));
4740 if (!mdr->xlocks.count(&dn->versionlock)) {
4741 assert(dn->versionlock.can_xlock_local());
4742 dn->versionlock.get_xlock(mdr, mdr->get_client());
4743 mdr->xlocks.insert(&dn->versionlock);
4744 mdr->locks.insert(&dn->versionlock);
4745 }
4746 if (dn->lock.is_stable())
4747 dn->auth_pin(&dn->lock);
4748 dn->lock.set_state(LOCK_XLOCK);
4749 dn->lock.get_xlock(mdr, mdr->get_client());
4750 mdr->xlocks.insert(&dn->lock);
4751 mdr->locks.insert(&dn->lock);
4752 }
4753
4754 dn->add_replica(from, q->second.nonce);
4755 dout(10) << " have " << *dn << dendl;
4756
4757 if (dnl->is_primary()) {
4758 if (q->second.is_primary()) {
4759 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4760 // the survivor missed MDentryUnlink+MDentryLink messages ?
4761 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4762 CInode *in = get_inode(q->second.ino, q->first.snapid);
4763 assert(in);
4764 assert(in->get_parent_dn());
4765 rejoin_unlinked_inodes[from].insert(in);
4766 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4767 }
4768 } else {
4769 // the survivor missed MDentryLink message ?
4770 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4771 dout(7) << " sender doesn't have primary dentry" << dendl;
4772 }
4773 } else {
4774 if (q->second.is_primary()) {
4775 // the survivor missed MDentryUnlink message ?
4776 CInode *in = get_inode(q->second.ino, q->first.snapid);
4777 assert(in);
4778 assert(in->get_parent_dn());
4779 rejoin_unlinked_inodes[from].insert(in);
4780 dout(7) << " sender has primary dentry but we don't" << dendl;
4781 }
4782 }
4783 }
4784 }
4785
4786 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4787 p != strong->strong_inodes.end();
4788 ++p) {
4789 CInode *in = get_inode(p->first);
4790 assert(in);
4791 in->add_replica(from, p->second.nonce);
4792 dout(10) << " have " << *in << dendl;
4793
4794 MMDSCacheRejoin::inode_strong &is = p->second;
4795
4796 // caps_wanted
4797 if (is.caps_wanted) {
4798 in->mds_caps_wanted[from] = is.caps_wanted;
4799 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4800 << " on " << *in << dendl;
4801 }
4802
4803 // scatterlocks?
4804 // infer state from replica state:
4805 // * go to MIX if they might have wrlocks
4806 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4807 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4808 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4809 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4810
4811 // auth pin?
4812 if (strong->authpinned_inodes.count(in->vino())) {
4813 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4814 r != strong->authpinned_inodes[in->vino()].end();
4815 ++r) {
4816 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4817
4818 // get/create slave mdrequest
4819 MDRequestRef mdr;
4820 if (have_request(r->reqid))
4821 mdr = request_get(r->reqid);
4822 else
4823 mdr = request_start_slave(r->reqid, r->attempt, strong);
4824 if (strong->frozen_authpin_inodes.count(in->vino())) {
4825 assert(!in->get_num_auth_pins());
4826 mdr->freeze_auth_pin(in);
4827 } else {
4828 assert(!in->is_frozen_auth_pin());
4829 }
4830 mdr->auth_pin(in);
4831 }
4832 }
4833 // xlock(s)?
4834 if (strong->xlocked_inodes.count(in->vino())) {
4835 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4836 q != strong->xlocked_inodes[in->vino()].end();
4837 ++q) {
4838 SimpleLock *lock = in->get_lock(q->first);
4839 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4840 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4841 assert(mdr->is_auth_pinned(in));
4842 if (!mdr->xlocks.count(&in->versionlock)) {
4843 assert(in->versionlock.can_xlock_local());
4844 in->versionlock.get_xlock(mdr, mdr->get_client());
4845 mdr->xlocks.insert(&in->versionlock);
4846 mdr->locks.insert(&in->versionlock);
4847 }
4848 if (lock->is_stable())
4849 in->auth_pin(lock);
4850 lock->set_state(LOCK_XLOCK);
4851 if (lock == &in->filelock)
4852 in->loner_cap = -1;
4853 lock->get_xlock(mdr, mdr->get_client());
4854 mdr->xlocks.insert(lock);
4855 mdr->locks.insert(lock);
4856 }
4857 }
4858 }
4859 // wrlock(s)?
4860 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4861 p != strong->wrlocked_inodes.end();
4862 ++p) {
4863 CInode *in = get_inode(p->first);
4864 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4865 q != p->second.end();
4866 ++q) {
4867 SimpleLock *lock = in->get_lock(q->first);
4868 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4869 r != q->second.end();
4870 ++r) {
4871 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4872 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4873 if (in->is_auth())
4874 assert(mdr->is_auth_pinned(in));
4875 lock->set_state(LOCK_MIX);
4876 if (lock == &in->filelock)
4877 in->loner_cap = -1;
4878 lock->get_wrlock(true);
4879 mdr->wrlocks.insert(lock);
4880 mdr->locks.insert(lock);
4881 }
4882 }
4883 }
4884
4885 // done?
4886 assert(rejoin_gather.count(from));
4887 rejoin_gather.erase(from);
28e407b8 4888 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4889 rejoin_gather_finish();
4890 } else {
4891 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4892 }
4893}
4894
4895/* This function DOES NOT put the passed message before returning */
4896void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4897{
4898 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4899 mds_rank_t from = mds_rank_t(ack->get_source().num());
4900
4901 assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4902 bool survivor = !mds->is_rejoin();
4903
4904 // for sending cache expire message
4905 set<CInode*> isolated_inodes;
4906 set<CInode*> refragged_inodes;
4907
4908 // dirs
4909 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4910 p != ack->strong_dirfrags.end();
4911 ++p) {
4912 // we may have had incorrect dir fragmentation; refragment based
4913 // on what the auth tells us.
4914 CDir *dir = get_dirfrag(p->first);
4915 if (!dir) {
4916 dir = get_force_dirfrag(p->first, false);
4917 if (dir)
4918 refragged_inodes.insert(dir->get_inode());
4919 }
4920 if (!dir) {
4921 CInode *diri = get_inode(p->first.ino);
4922 if (!diri) {
4923 // barebones inode; the full inode loop below will clean up.
4924 diri = new CInode(this, false);
4925 diri->inode.ino = p->first.ino;
4926 diri->inode.mode = S_IFDIR;
4927 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4928 add_inode(diri);
4929 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4930 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4931 dout(10) << " add inode " << *diri << dendl;
4932 } else {
4933 diri->inode_auth = CDIR_AUTH_DEFAULT;
4934 isolated_inodes.insert(diri);
4935 dout(10) << " unconnected dirfrag " << p->first << dendl;
4936 }
4937 }
4938 // barebones dirfrag; the full dirfrag loop below will clean up.
4939 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4940 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4941 (dir->authority() != CDIR_AUTH_UNDEF &&
4942 dir->authority().first != from))
4943 adjust_subtree_auth(dir, from);
4944 dout(10) << " add dirfrag " << *dir << dendl;
4945 }
4946
4947 dir->set_replica_nonce(p->second.nonce);
4948 dir->state_clear(CDir::STATE_REJOINING);
4949 dout(10) << " got " << *dir << dendl;
4950
4951 // dentries
4952 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4953 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4954 q != dmap.end();
4955 ++q) {
4956 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4957 if(!dn)
4958 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4959
4960 CDentry::linkage_t *dnl = dn->get_linkage();
4961
4962 assert(dn->last == q->first.snapid);
4963 if (dn->first != q->second.first) {
4964 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4965 dn->first = q->second.first;
4966 }
4967
4968 // may have bad linkage if we missed dentry link/unlink messages
4969 if (dnl->is_primary()) {
4970 CInode *in = dnl->get_inode();
4971 if (!q->second.is_primary() ||
4972 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4973 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4974 dir->unlink_inode(dn);
4975 }
4976 } else if (dnl->is_remote()) {
4977 if (!q->second.is_remote() ||
4978 q->second.remote_ino != dnl->get_remote_ino() ||
4979 q->second.remote_d_type != dnl->get_remote_d_type()) {
4980 dout(10) << " had bad linkage for " << *dn << dendl;
4981 dir->unlink_inode(dn);
4982 }
4983 } else {
4984 if (!q->second.is_null())
4985 dout(10) << " had bad linkage for " << *dn << dendl;
4986 }
4987
4988 // hmm, did we have the proper linkage here?
4989 if (dnl->is_null() && !q->second.is_null()) {
4990 if (q->second.is_remote()) {
4991 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4992 } else {
4993 CInode *in = get_inode(q->second.ino, q->first.snapid);
4994 if (!in) {
4995 // barebones inode; assume it's dir, the full inode loop below will clean up.
4996 in = new CInode(this, false, q->second.first, q->first.snapid);
4997 in->inode.ino = q->second.ino;
4998 in->inode.mode = S_IFDIR;
4999 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
5000 add_inode(in);
5001 dout(10) << " add inode " << *in << dendl;
5002 } else if (in->get_parent_dn()) {
5003 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5004 << ", unlinking " << *in << dendl;
5005 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5006 }
5007 dn->dir->link_primary_inode(dn, in);
5008 isolated_inodes.erase(in);
5009 }
5010 }
5011
5012 dn->set_replica_nonce(q->second.nonce);
b32b8144 5013 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
5014 dn->state_clear(CDentry::STATE_REJOINING);
5015 dout(10) << " got " << *dn << dendl;
5016 }
5017 }
5018
5019 for (set<CInode*>::iterator p = refragged_inodes.begin();
5020 p != refragged_inodes.end();
5021 ++p) {
5022 list<CDir*> ls;
5023 (*p)->get_nested_dirfrags(ls);
5024 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5025 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5026 continue;
5027 assert((*q)->get_num_any() == 0);
5028 (*p)->close_dirfrag((*q)->get_frag());
5029 }
5030 }
5031
5032 // full dirfrags
5033 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5034 p != ack->dirfrag_bases.end();
5035 ++p) {
5036 CDir *dir = get_dirfrag(p->first);
5037 assert(dir);
5038 bufferlist::iterator q = p->second.begin();
5039 dir->_decode_base(q);
5040 dout(10) << " got dir replica " << *dir << dendl;
5041 }
5042
5043 // full inodes
5044 bufferlist::iterator p = ack->inode_base.begin();
5045 while (!p.end()) {
5046 inodeno_t ino;
5047 snapid_t last;
5048 bufferlist basebl;
5049 ::decode(ino, p);
5050 ::decode(last, p);
5051 ::decode(basebl, p);
5052 CInode *in = get_inode(ino, last);
5053 assert(in);
5054 bufferlist::iterator q = basebl.begin();
5055 in->_decode_base(q);
5056 dout(10) << " got inode base " << *in << dendl;
5057 }
5058
5059 // inodes
5060 p = ack->inode_locks.begin();
5061 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5062 while (!p.end()) {
5063 inodeno_t ino;
5064 snapid_t last;
5065 __u32 nonce;
5066 bufferlist lockbl;
5067 ::decode(ino, p);
5068 ::decode(last, p);
5069 ::decode(nonce, p);
5070 ::decode(lockbl, p);
5071
5072 CInode *in = get_inode(ino, last);
5073 assert(in);
5074 in->set_replica_nonce(nonce);
5075 bufferlist::iterator q = lockbl.begin();
b32b8144 5076 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5077 in->state_clear(CInode::STATE_REJOINING);
5078 dout(10) << " got inode locks " << *in << dendl;
5079 }
5080
5081 // FIXME: This can happen if an entire subtree, together with the inode its subtree root
5082 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5083 assert(isolated_inodes.empty());
5084
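 // the ack tells us which of our exported caps the surviving auth has imported:
 // notify each affected client that its cap has moved (a stale CEPH_CAP_OP_EXPORT
 // pointing at the new peer) and drop the corresponding export records.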
5085 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5086 bufferlist::iterator bp = ack->imported_caps.begin();
5087 ::decode(peer_imported, bp);
5088
5089 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5090 p != peer_imported.end();
5091 ++p) {
28e407b8
AA
5092 auto& ex = cap_exports.at(p->first);
5093 assert(ex.first == from);
7c673cae
FG
5094 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5095 q != p->second.end();
5096 ++q) {
28e407b8
AA
5097 auto r = ex.second.find(q->first);
5098 assert(r != ex.second.end());
7c673cae
FG
5099
5100 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5101 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5102 if (!session) {
5103 dout(10) << " no session for client." << p->first << dendl;
5104 ex.second.erase(r);
5105 continue;
5106 }
7c673cae
FG
5107
5108 // mark client caps stale.
5109 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5110 r->second.capinfo.cap_id, 0,
7c673cae
FG
5111 mds->get_osd_epoch_barrier());
5112 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5113 (q->second.cap_id > 0 ? from : -1), 0);
5114 mds->send_message_client_counted(m, session);
5115
28e407b8 5116 ex.second.erase(r);
7c673cae 5117 }
28e407b8 5118 assert(ex.second.empty());
7c673cae
FG
5119 }
5120
5121 // done?
5122 assert(rejoin_ack_gather.count(from));
5123 rejoin_ack_gather.erase(from);
b32b8144 5124 if (!survivor) {
7c673cae
FG
5125
5126 if (rejoin_gather.empty()) {
5127 // eval unstable scatter locks after all wrlocks are rejoined.
5128 while (!rejoin_eval_locks.empty()) {
5129 SimpleLock *lock = rejoin_eval_locks.front();
5130 rejoin_eval_locks.pop_front();
5131 if (!lock->is_stable())
5132 mds->locker->eval_gather(lock);
5133 }
5134 }
5135
5136 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5137 rejoin_ack_gather.empty()) {
5138 // finally, kickstart past snap parent opens
5139 open_snap_parents();
5140 } else {
5141 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5142 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5143 }
5144 } else {
5145 // survivor.
5146 mds->queue_waiters(rejoin_waiters);
5147 }
5148}
5149
5150/**
5151 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5152 *
5153 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5154 * messages that clean these guys up...
5155 */
5156void MDCache::rejoin_trim_undef_inodes()
5157{
5158 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5159
5160 while (!rejoin_undef_inodes.empty()) {
5161 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5162 CInode *in = *p;
5163 rejoin_undef_inodes.erase(p);
5164
5165 in->clear_replica_map();
5166
5167 // close out dirfrags
5168 if (in->is_dir()) {
5169 list<CDir*> dfls;
5170 in->get_dirfrags(dfls);
5171 for (list<CDir*>::iterator p = dfls.begin();
5172 p != dfls.end();
5173 ++p) {
5174 CDir *dir = *p;
5175 dir->clear_replica_map();
5176
94b18763
FG
5177 for (auto &p : dir->items) {
5178 CDentry *dn = p.second;
7c673cae
FG
5179 dn->clear_replica_map();
5180
5181 dout(10) << " trimming " << *dn << dendl;
5182 dir->remove_dentry(dn);
5183 }
5184
5185 dout(10) << " trimming " << *dir << dendl;
5186 in->close_dirfrag(dir->dirfrag().frag);
5187 }
5188 }
5189
5190 CDentry *dn = in->get_parent_dn();
5191 if (dn) {
5192 dn->clear_replica_map();
5193 dout(10) << " trimming " << *dn << dendl;
5194 dn->dir->remove_dentry(dn);
5195 } else {
5196 dout(10) << " trimming " << *in << dendl;
5197 remove_inode(in);
5198 }
5199 }
5200
5201 assert(rejoin_undef_inodes.empty());
5202}
5203
5204void MDCache::rejoin_gather_finish()
5205{
5206 dout(10) << "rejoin_gather_finish" << dendl;
5207 assert(mds->is_rejoin());
28e407b8 5208 assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5209
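  // The two helpers below may need to do async work (fetching undefined
  // dirfrags, opening missing inodes or sessions); if so they return true
  // and this function is re-entered once that work completes.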
5210 if (open_undef_inodes_dirfrags())
5211 return;
5212
5213 if (process_imported_caps())
5214 return;
5215
5216 choose_lock_states_and_reconnect_caps();
5217
5218 identify_files_to_recover();
5219 rejoin_send_acks();
5220
5221 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5222 rejoin_ack_gather.erase(mds->get_nodeid());
5223
5224 // did we already get our acks too?
5225 if (rejoin_ack_gather.empty()) {
5226 // finally, kickstart past snap parent opens
5227 open_snap_parents();
5228 }
5229}
5230
5231class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5232 inodeno_t ino;
5233public:
5234 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5235 void finish(int r) override {
5236 mdcache->rejoin_open_ino_finish(ino, r);
5237 }
5238};
5239
5240void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5241{
5242 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5243
5244 if (ret < 0) {
5245 cap_imports_missing.insert(ino);
5246 } else if (ret == mds->get_nodeid()) {
5247 assert(get_inode(ino));
5248 } else {
5249 auto p = cap_imports.find(ino);
5250 assert(p != cap_imports.end());
5251 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5252 assert(q->second.count(MDS_RANK_NONE));
5253 assert(q->second.size() == 1);
5254 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5255 }
5256 cap_imports.erase(p);
5257 }
5258
5259 assert(cap_imports_num_opening > 0);
5260 cap_imports_num_opening--;
5261
5262 if (cap_imports_num_opening == 0) {
5263 if (rejoin_gather.empty())
5264 rejoin_gather_finish();
5265 else if (rejoin_gather.count(mds->get_nodeid()))
5266 process_imported_caps();
5267 }
5268}
5269
5270class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5271public:
28e407b8
AA
5272 map<client_t,pair<Session*,uint64_t> > session_map;
5273 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae
FG
5274 void finish(int r) override {
5275 assert(r == 0);
28e407b8 5276 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5277 }
5278};
5279
28e407b8 5280void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5281{
5282 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5283 mds->server->finish_force_open_sessions(session_map);
5284 rejoin_session_map.swap(session_map);
7c673cae
FG
5285 if (rejoin_gather.empty())
5286 rejoin_gather_finish();
5287}
5288
5289bool MDCache::process_imported_caps()
5290{
5291 dout(10) << "process_imported_caps" << dendl;
5292
5293 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5294 CInode *in = get_inode(p->first);
5295 if (in) {
5296 assert(in->is_auth());
5297 cap_imports_missing.erase(p->first);
5298 continue;
5299 }
5300 if (cap_imports_missing.count(p->first) > 0)
5301 continue;
5302
5303 cap_imports_num_opening++;
5304 dout(10) << " opening missing ino " << p->first << dendl;
5305 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
28e407b8
AA
5306 if (!(cap_imports_num_opening % 1000))
5307 mds->heartbeat_reset();
7c673cae
FG
5308 }
5309
5310 if (cap_imports_num_opening > 0)
5311 return true;
5312
5313 // called by rejoin_gather_finish() ?
5314 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
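    // First force-open sessions for clients recorded during reconnect and
    // journal them (ESessions) before rebuilding caps; we resume from the
    // log-flush callback once the sessions are persisted.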
5315 if (!rejoin_client_map.empty() &&
5316 rejoin_session_map.empty()) {
5317 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5318 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5319 finish->session_map);
5320 mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
5321 mds->mdlog->flush();
5322 rejoin_client_map.clear();
5323 return true;
7c673cae 5324 }
7c673cae
FG
5325
5326 // process caps that were exported by slave rename
5327 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5328 p != rejoin_slave_exports.end();
5329 ++p) {
5330 CInode *in = get_inode(p->first);
5331 assert(in);
5332 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5333 q != p->second.second.end();
5334 ++q) {
28e407b8
AA
5335 auto r = rejoin_session_map.find(q->first);
5336 if (r == rejoin_session_map.end())
5337 continue;
7c673cae 5338
28e407b8 5339 Session *session = r->second.first;
7c673cae
FG
5340 Capability *cap = in->get_client_cap(q->first);
5341 if (!cap)
5342 cap = in->add_client_cap(q->first, session);
5343 cap->merge(q->second, true);
5344
5345 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5346 assert(cap->get_last_seq() == im.issue_seq);
5347 assert(cap->get_mseq() == im.mseq);
5348 cap->set_cap_id(im.cap_id);
5349 // send cap import because we assigned a new cap ID
5350 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5351 p->second.first, CEPH_CAP_FLAG_AUTH);
5352 }
5353 }
5354 rejoin_slave_exports.clear();
5355 rejoin_imported_caps.clear();
5356
5357 // process cap imports
5358 // ino -> client -> frommds -> capex
5359 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5360 CInode *in = get_inode(p->first);
5361 if (!in) {
5362 dout(10) << " still missing ino " << p->first
5363 << ", will try again after replayed client requests" << dendl;
5364 ++p;
5365 continue;
5366 }
5367 assert(in->is_auth());
5368 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5369 Session *session;
5370 {
5371 auto r = rejoin_session_map.find(q->first);
5372 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5373 }
5374
7c673cae 5375 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5376 if (!session) {
5377 if (r->first >= 0)
5378 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5379 continue;
5380 }
5381
7c673cae
FG
5382 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5383 add_reconnected_cap(q->first, in->ino(), r->second);
5384 if (r->first >= 0) {
5385 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5386 cap->inc_mseq();
5387 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5388
5389 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5390 im.cap_id = cap->get_cap_id();
5391 im.issue_seq = cap->get_last_seq();
5392 im.mseq = cap->get_mseq();
5393 }
5394 }
5395 }
5396 cap_imports.erase(p++); // remove and move on
5397 }
5398 } else {
5399 trim_non_auth();
5400
28e407b8 5401 assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5402 rejoin_gather.erase(mds->get_nodeid());
28e407b8 5403 assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5404 maybe_send_pending_rejoins();
7c673cae
FG
5405 }
5406 return false;
5407}
5408
5409void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5410{
5411 // are this realm's parents fully open?
5412 if (realm->have_past_parents_open()) {
5413 dout(10) << " have past snap parents for realm " << *realm
5414 << " on " << *realm->inode << dendl;
5415 if (reconnect) {
5416 // finish off client snaprealm reconnects?
5417 auto p = reconnected_snaprealms.find(realm->inode->ino());
5418 if (p != reconnected_snaprealms.end()) {
5419 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5420 finish_snaprealm_reconnect(q->first, realm, q->second);
5421 reconnected_snaprealms.erase(p);
5422 }
5423 }
5424 } else {
5425 if (!missing_snap_parents.count(realm->inode)) {
5426 dout(10) << " MISSING past snap parents for realm " << *realm
5427 << " on " << *realm->inode << dendl;
5428 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5429 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5430 } else {
5431 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5432 << " on " << *realm->inode << dendl;
5433 }
5434 }
5435}
5436
5437void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5438 client_t client, snapid_t snap_follows)
5439{
5440 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5441
5442 const set<snapid_t>& snaps = realm->get_snaps();
5443 snapid_t follows = snap_follows;
5444
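  // Walk the snapped past versions of this inode from just after snap_follows
  // up toward the head inode: wrlock the cap-related locks so the client's
  // pending snapflush can be gathered, and record which snapids still need a
  // flush.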
5445 while (true) {
5446 CInode *in = pick_inode_snap(head_in, follows);
5447 if (in == head_in)
5448 break;
5449 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5450
5451 /* TODO: we can check the reconnected/flushing caps to find
5452 * which locks need gathering */
5453 for (int i = 0; i < num_cinode_locks; i++) {
5454 int lockid = cinode_lock_info[i].lock;
5455 SimpleLock *lock = in->get_lock(lockid);
5456 assert(lock);
5457 in->client_snap_caps[lockid].insert(client);
5458 in->auth_pin(lock);
5459 lock->set_state(LOCK_SNAP_SYNC);
5460 lock->get_wrlock(true);
5461 }
5462
5463 for (auto p = snaps.lower_bound(in->first);
5464 p != snaps.end() && *p <= in->last;
5465 ++p) {
5466 head_in->add_need_snapflush(in, *p, client);
5467 }
5468
5469 follows = in->last;
5470 }
5471}
5472
5473/*
5474 * choose lock states based on reconnected caps
5475 */
5476void MDCache::choose_lock_states_and_reconnect_caps()
5477{
5478 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5479
5480 map<client_t,MClientSnap*> splits;
5481
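  // Walk every head inode: pick lock states from the dirty caps that clients
  // reconnected, and queue snaprealm split messages for any client whose cap
  // is attached to the wrong realm.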
b32b8144
FG
5482 for (auto i : inode_map) {
5483 CInode *in = i.second;
7c673cae
FG
5484
5485 if (in->last != CEPH_NOSNAP)
5486 continue;
5487
5488 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5489 in->mark_dirty_rstat();
5490
7c673cae 5491 int dirty_caps = 0;
b32b8144 5492 auto p = reconnected_caps.find(in->ino());
7c673cae
FG
5493 if (p != reconnected_caps.end()) {
5494 for (const auto &it : p->second)
5495 dirty_caps |= it.second.dirty_caps;
5496 }
5497 in->choose_lock_states(dirty_caps);
5498 dout(15) << " chose lock states on " << *in << dendl;
5499
5500 SnapRealm *realm = in->find_snaprealm();
5501
5502 check_realm_past_parents(realm, realm == in->snaprealm);
5503
5504 if (p != reconnected_caps.end()) {
5505 bool missing_snap_parent = false;
5506 // also, make sure client's cap is in the correct snaprealm.
5507 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5508 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5509 if (realm->have_past_parents_open()) {
5510 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5511 } else {
5512 missing_snap_parent = true;
5513 }
5514 }
5515
5516 if (q->second.realm_ino == realm->inode->ino()) {
5517 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5518 } else {
5519 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5520 << " != " << realm->inode->ino() << dendl;
5521 if (realm->have_past_parents_open()) {
5522 // ok, include in a split message _now_.
5523 prepare_realm_split(realm, q->first, in->ino(), splits);
5524 } else {
5525 // send the split later.
5526 missing_snap_parent = true;
5527 }
5528 }
5529 }
5530 if (missing_snap_parent)
5531 missing_snap_parents[realm->inode].insert(in);
5532 }
5533 }
5534
5535 send_snaps(splits);
5536}
5537
5538void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5539 map<client_t,MClientSnap*>& splits)
5540{
5541 MClientSnap *snap;
5542 if (splits.count(client) == 0) {
5543 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5544 snap->head.split = realm->inode->ino();
5545 realm->build_snap_trace(snap->bl);
5546
5547 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5548 p != realm->open_children.end();
5549 ++p)
5550 snap->split_realms.push_back((*p)->inode->ino());
5551
5552 } else
5553 snap = splits[client];
5554 snap->split_inos.push_back(ino);
5555}
5556
5557void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5558{
5559 dout(10) << "send_snaps" << dendl;
5560
5561 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5562 p != splits.end();
5563 ++p) {
5564 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5565 if (session) {
5566 dout(10) << " client." << p->first
5567 << " split " << p->second->head.split
5568 << " inos " << p->second->split_inos
5569 << dendl;
5570 mds->send_message_client_counted(p->second, session);
5571 } else {
5572 dout(10) << " no session for client." << p->first << dendl;
5573 p->second->put();
5574 }
5575 }
5576 splits.clear();
5577}
5578
5579
5580/*
5581 * remove any items from logsegment open_file lists that don't have
5582 * any caps
5583 */
5584void MDCache::clean_open_file_lists()
5585{
5586 dout(10) << "clean_open_file_lists" << dendl;
5587
5588 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5589 p != mds->mdlog->segments.end();
5590 ++p) {
5591 LogSegment *ls = p->second;
5592
5593 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5594 while (!q.end()) {
5595 CInode *in = *q;
5596 ++q;
5597 if (in->last == CEPH_NOSNAP) {
5598 if (!in->is_any_caps_wanted()) {
5599 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5600 in->item_open_file.remove_myself();
5601 }
5602 } else if (in->last != CEPH_NOSNAP) {
5603 if (in->client_snap_caps.empty()) {
5604 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5605 in->item_open_file.remove_myself();
5606 }
5607 }
5608 }
5609 }
5610}
5611
5612
5613
5614Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5615{
5616 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5617 << " on " << *in << dendl;
5618 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5619 if (!session) {
5620 dout(10) << " no session for client." << client << dendl;
5621 return NULL;
5622 }
5623
5624 Capability *cap = in->reconnect_cap(client, icr, session);
5625
5626 if (frommds >= 0) {
5627 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5628 cap->inc_mseq();
5629 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5630 }
5631
5632 return cap;
5633}
5634
5635void MDCache::export_remaining_imported_caps()
5636{
5637 dout(10) << "export_remaining_imported_caps" << dendl;
5638
5639 stringstream warn_str;
5640
5641 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5642 warn_str << " ino " << p->first << "\n";
5643 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5644 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5645 if (session) {
5646 // mark client caps stale.
5647 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5648 stale->set_cap_peer(0, 0, 0, -1, 0);
5649 mds->send_message_client_counted(stale, q->first);
5650 }
5651 }
5652
5653 mds->heartbeat_reset();
5654 }
5655
5656 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5657 p != cap_reconnect_waiters.end();
5658 ++p)
5659 mds->queue_waiters(p->second);
5660
5661 cap_imports.clear();
5662 cap_reconnect_waiters.clear();
5663
5664 if (warn_str.peek() != EOF) {
5665 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5666 mds->clog->warn(warn_str);
5667 }
5668}
5669
5670void MDCache::try_reconnect_cap(CInode *in, Session *session)
5671{
5672 client_t client = session->info.get_client();
5673 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5674 if (rc) {
5675 in->reconnect_cap(client, *rc, session);
5676 dout(10) << "try_reconnect_cap client." << client
5677 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5678 << " issue " << ccap_string(rc->capinfo.issued)
5679 << " on " << *in << dendl;
5680 remove_replay_cap_reconnect(in->ino(), client);
5681
5682 if (in->is_replicated()) {
5683 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5684 } else {
5685 int dirty_caps = 0;
5686 auto p = reconnected_caps.find(in->ino());
5687 if (p != reconnected_caps.end()) {
5688 auto q = p->second.find(client);
5689 if (q != p->second.end())
5690 dirty_caps = q->second.dirty_caps;
5691 }
5692 in->choose_lock_states(dirty_caps);
5693 dout(15) << " chose lock states on " << *in << dendl;
5694 }
5695
5696 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5697 cap_reconnect_waiters.find(in->ino());
5698 if (it != cap_reconnect_waiters.end()) {
5699 mds->queue_waiters(it->second);
5700 cap_reconnect_waiters.erase(it);
5701 }
5702 }
5703}
5704
5705
5706
5707// -------
5708// cap imports and delayed snap parent opens
5709
5710void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5711 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5712 int peer, int p_flags)
5713{
5714 client_t client = session->info.inst.name.num();
5715 SnapRealm *realm = in->find_snaprealm();
5716 if (realm->have_past_parents_open()) {
5717 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5718 if (cap->get_last_seq() == 0) // reconnected cap
5719 cap->inc_last_seq();
5720 cap->set_last_issue();
5721 cap->set_last_issue_stamp(ceph_clock_now());
5722 cap->clear_new();
5723 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5724 in->ino(),
5725 realm->inode->ino(),
5726 cap->get_cap_id(), cap->get_last_seq(),
5727 cap->pending(), cap->wanted(), 0,
5728 cap->get_mseq(), mds->get_osd_epoch_barrier());
5729 in->encode_cap_message(reap, cap);
5730 realm->build_snap_trace(reap->snapbl);
5731 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5732 mds->send_message_client_counted(reap, session);
5733 } else {
5734 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5735 << cap->get_mseq() << " on " << *in << dendl;
5736 in->auth_pin(this);
5737 cap->inc_suppress();
5738 delayed_imported_caps[client].insert(in);
5739 missing_snap_parents[in].size();
5740 }
5741}
5742
5743void MDCache::do_delayed_cap_imports()
5744{
5745 dout(10) << "do_delayed_cap_imports" << dendl;
5746
5747 assert(delayed_imported_caps.empty());
5748}
5749
5750struct C_MDC_OpenSnapParents : public MDCacheContext {
5751 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5752 void finish(int r) override {
5753 mdcache->open_snap_parents();
5754 }
5755};
5756
5757void MDCache::open_snap_parents()
5758{
5759 dout(10) << "open_snap_parents" << dendl;
5760
5761 map<client_t,MClientSnap*> splits;
5762 MDSGatherBuilder gather(g_ceph_context);
5763
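  // For each inode still missing past snap parents, try to open them.  Once a
  // realm's parents are open, rebuild snapflush state and realm splits for the
  // child inodes recorded against it; otherwise retry when the gather finishes.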
5764 auto p = missing_snap_parents.begin();
5765 while (p != missing_snap_parents.end()) {
5766 CInode *in = p->first;
5767 assert(in->snaprealm);
5768 if (in->snaprealm->open_parents(gather.new_sub())) {
5769 dout(10) << " past parents now open on " << *in << dendl;
5770
5771 for (CInode *child : p->second) {
5772 auto q = reconnected_caps.find(child->ino());
5773 assert(q != reconnected_caps.end());
5774 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5775 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5776 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5777 }
5778 // make sure client's cap is in the correct snaprealm.
5779 if (r->second.realm_ino != in->ino()) {
5780 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5781 }
5782 }
5783 }
5784
5785 missing_snap_parents.erase(p++);
5786
5787 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5788
5789 // finish off client snaprealm reconnects?
5790 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5791 if (q != reconnected_snaprealms.end()) {
5792 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5793 r != q->second.end();
5794 ++r)
5795 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5796 reconnected_snaprealms.erase(q);
5797 }
5798 } else {
5799 dout(10) << " opening past parents on " << *in << dendl;
5800 ++p;
5801 }
5802 }
5803
5804 send_snaps(splits);
5805
5806 if (gather.has_subs()) {
5807 dout(10) << "open_snap_parents - waiting for "
5808 << gather.num_subs_remaining() << dendl;
5809 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5810 gather.activate();
5811 } else {
5812 if (!reconnected_snaprealms.empty()) {
5813 stringstream warn_str;
5814 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5815 p != reconnected_snaprealms.end();
5816 ++p) {
5817 warn_str << " unconnected snaprealm " << p->first << "\n";
5818 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5819 q != p->second.end();
5820 ++q)
5821 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5822 }
5823 mds->clog->warn() << "open_snap_parents has:";
5824 mds->clog->warn(warn_str);
5825 }
5826 assert(rejoin_waiters.empty());
5827 assert(missing_snap_parents.empty());
5828 dout(10) << "open_snap_parents - all open" << dendl;
5829 do_delayed_cap_imports();
5830
5831 assert(rejoin_done);
5832 rejoin_done.release()->complete(0);
5833 reconnected_caps.clear();
5834 }
5835}
5836
5837bool MDCache::open_undef_inodes_dirfrags()
5838{
5839 dout(10) << "open_undef_inodes_dirfrags "
5840 << rejoin_undef_inodes.size() << " inodes "
5841 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5842
5843 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5844
5845 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5846 p != rejoin_undef_inodes.end();
5847 ++p) {
5848 CInode *in = *p;
5849 assert(!in->is_base());
5850 fetch_queue.insert(in->get_parent_dir());
5851 }
5852
5853 if (fetch_queue.empty())
5854 return false;
5855
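  // Fetch every dirfrag that should define the still-undefined inodes; when
  // the fetches complete, re-enter rejoin_gather_finish() if nothing else is
  // outstanding.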
28e407b8
AA
5856 MDSGatherBuilder gather(g_ceph_context,
5857 new MDSInternalContextWrapper(mds,
5858 new FunctionContext([this](int r) {
5859 if (rejoin_gather.empty())
5860 rejoin_gather_finish();
5861 })
5862 )
5863 );
5864
7c673cae
FG
5865 for (set<CDir*>::iterator p = fetch_queue.begin();
5866 p != fetch_queue.end();
5867 ++p) {
5868 CDir *dir = *p;
5869 CInode *diri = dir->get_inode();
5870 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5871 continue;
5872 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5873 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5874 dir->fetch(gather.new_sub());
5875 }
5876 assert(gather.has_subs());
5877 gather.activate();
5878 return true;
5879}
5880
5881void MDCache::opened_undef_inode(CInode *in) {
5882 dout(10) << "opened_undef_inode " << *in << dendl;
5883 rejoin_undef_inodes.erase(in);
5884 if (in->is_dir()) {
5885 // FIXME: re-hash dentries if necessary
5886 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5887 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5888 CDir *dir = in->get_dirfrag(frag_t());
5889 assert(dir);
5890 rejoin_undef_dirfrags.erase(dir);
5891 in->force_dirfrags();
5892 list<CDir*> ls;
5893 in->get_dirfrags(ls);
5894 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5895 rejoin_undef_dirfrags.insert(*p);
5896 }
5897 }
5898}
5899
5900void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5901{
5902 if (seq < realm->get_newest_seq()) {
5903 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5904 << realm->get_newest_seq()
5905 << " on " << *realm << dendl;
5906 // send an update
5907 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5908 if (session) {
5909 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5910 realm->build_snap_trace(snap->bl);
5911 mds->send_message_client_counted(snap, session);
5912 } else {
5913 dout(10) << " ...or not, no session for this client!" << dendl;
5914 }
5915 } else {
5916 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5917 << " on " << *realm << dendl;
5918 }
5919}
5920
5921
5922
5923void MDCache::rejoin_send_acks()
5924{
5925 dout(7) << "rejoin_send_acks" << dendl;
5926
5927 // replicate stray
5928 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5929 p != rejoin_unlinked_inodes.end();
5930 ++p) {
5931 for (set<CInode*>::iterator q = p->second.begin();
5932 q != p->second.end();
5933 ++q) {
5934 CInode *in = *q;
5935 dout(7) << " unlinked inode " << *in << dendl;
5936 // inode expired
5937 if (!in->is_replica(p->first))
5938 continue;
5939 while (1) {
5940 CDentry *dn = in->get_parent_dn();
5941 if (dn->is_replica(p->first))
5942 break;
5943 dn->add_replica(p->first);
5944 CDir *dir = dn->get_dir();
5945 if (dir->is_replica(p->first))
5946 break;
5947 dir->add_replica(p->first);
5948 in = dir->get_inode();
5949 if (in->is_replica(p->first))
5950 break;
224ce89b 5951 in->add_replica(p->first);
7c673cae
FG
5952 if (in->is_base())
5953 break;
5954 }
5955 }
5956 }
5957 rejoin_unlinked_inodes.clear();
5958
5959 // send acks to everyone in the recovery set
31f18b77 5960 map<mds_rank_t,MMDSCacheRejoin*> acks;
7c673cae
FG
5961 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5962 p != recovery_set.end();
31f18b77
FG
5963 ++p) {
5964 if (rejoin_ack_sent.count(*p))
5965 continue;
5966 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5967 }
5968
5969 rejoin_ack_sent = recovery_set;
7c673cae
FG
5970
5971 // walk subtrees
5972 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5973 p != subtrees.end();
5974 ++p) {
5975 CDir *dir = p->first;
5976 if (!dir->is_auth())
5977 continue;
5978 dout(10) << "subtree " << *dir << dendl;
5979
5980 // auth items in this subtree
5981 list<CDir*> dq;
5982 dq.push_back(dir);
5983
5984 while (!dq.empty()) {
5985 CDir *dir = dq.front();
5986 dq.pop_front();
5987
5988 // dir
181888fb
FG
5989 for (auto &r : dir->get_replicas()) {
5990 auto it = acks.find(r.first);
31f18b77
FG
5991 if (it == acks.end())
5992 continue;
181888fb 5993 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 5994 it->second->add_dirfrag_base(dir);
7c673cae
FG
5995 }
5996
94b18763
FG
5997 for (auto &p : dir->items) {
5998 CDentry *dn = p.second;
7c673cae
FG
5999 CDentry::linkage_t *dnl = dn->get_linkage();
6000
6001 // inode
6002 CInode *in = NULL;
6003 if (dnl->is_primary())
6004 in = dnl->get_inode();
6005
6006 // dentry
181888fb
FG
6007 for (auto &r : dn->get_replicas()) {
6008 auto it = acks.find(r.first);
31f18b77
FG
6009 if (it == acks.end())
6010 continue;
94b18763 6011 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6012 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6013 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6014 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6015 ++r.second,
7c673cae
FG
6016 dn->lock.get_replica_state());
 6017 // peer missed the MDentryLink message?
181888fb
FG
6018 if (in && !in->is_replica(r.first))
6019 in->add_replica(r.first);
7c673cae
FG
6020 }
6021
6022 if (!in)
6023 continue;
6024
181888fb
FG
6025 for (auto &r : in->get_replicas()) {
6026 auto it = acks.find(r.first);
31f18b77
FG
6027 if (it == acks.end())
6028 continue;
6029 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6030 bufferlist bl;
181888fb
FG
6031 in->_encode_locks_state_for_rejoin(bl, r.first);
6032 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6033 }
6034
6035 // subdirs in this subtree?
6036 in->get_nested_dirfrags(dq);
6037 }
6038 }
6039 }
6040
6041 // base inodes too
6042 if (root && root->is_auth())
181888fb
FG
6043 for (auto &r : root->get_replicas()) {
6044 auto it = acks.find(r.first);
31f18b77
FG
6045 if (it == acks.end())
6046 continue;
6047 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6048 bufferlist bl;
181888fb
FG
6049 root->_encode_locks_state_for_rejoin(bl, r.first);
6050 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6051 }
6052 if (myin)
181888fb
FG
6053 for (auto &r : myin->get_replicas()) {
6054 auto it = acks.find(r.first);
31f18b77
FG
6055 if (it == acks.end())
6056 continue;
6057 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6058 bufferlist bl;
181888fb
FG
6059 myin->_encode_locks_state_for_rejoin(bl, r.first);
6060 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6061 }
6062
6063 // include inode base for any inodes whose scatterlocks may have updated
6064 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6065 p != rejoin_potential_updated_scatterlocks.end();
6066 ++p) {
6067 CInode *in = *p;
181888fb
FG
6068 for (const auto &r : in->get_replicas()) {
6069 auto it = acks.find(r.first);
31f18b77
FG
6070 if (it == acks.end())
6071 continue;
6072 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6073 }
7c673cae
FG
6074 }
6075
6076 // send acks
31f18b77 6077 for (auto p = acks.begin(); p != acks.end(); ++p) {
7c673cae
FG
6078 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6079 mds->send_message_mds(p->second, p->first);
6080 }
6081
6082 rejoin_imported_caps.clear();
6083}
6084
c07f9fc5
FG
6085class C_MDC_ReIssueCaps : public MDCacheContext {
6086 CInode *in;
6087public:
6088 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6089 MDCacheContext(mdc), in(i)
6090 {
6091 in->get(CInode::PIN_PTRWAITER);
6092 }
6093 void finish(int r) override {
6094 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6095 mdcache->mds->locker->issue_caps(in);
6096 in->put(CInode::PIN_PTRWAITER);
6097 }
6098};
7c673cae
FG
6099
6100void MDCache::reissue_all_caps()
6101{
6102 dout(10) << "reissue_all_caps" << dendl;
6103
94b18763 6104 for (auto &p : inode_map) {
b32b8144 6105 CInode *in = p.second;
7c673cae 6106 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6107 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6108 if (in->is_frozen_inode()) {
6109 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6110 continue;
6111 }
7c673cae
FG
6112 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6113 mds->locker->issue_caps(in);
6114 }
6115 }
6116}
6117
6118
6119// ===============================================================================
6120
6121struct C_MDC_QueuedCow : public MDCacheContext {
6122 CInode *in;
6123 MutationRef mut;
6124 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6125 MDCacheContext(mdc), in(i), mut(m) {}
6126 void finish(int r) override {
6127 mdcache->_queued_file_recover_cow(in, mut);
6128 }
6129};
6130
6131
6132void MDCache::queue_file_recover(CInode *in)
6133{
6134 dout(10) << "queue_file_recover " << *in << dendl;
6135 assert(in->is_auth());
6136
6137 // cow?
6138 /*
6139 SnapRealm *realm = in->find_snaprealm();
6140 set<snapid_t> s = realm->get_snaps();
6141 while (!s.empty() && *s.begin() < in->first)
6142 s.erase(s.begin());
6143 while (!s.empty() && *s.rbegin() > in->last)
6144 s.erase(*s.rbegin());
6145 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6146 if (s.size() > 1) {
94b18763 6147 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6148 pi->version = in->pre_dirty();
6149
6150 auto mut(std::make_shared<MutationImpl>());
6151 mut->ls = mds->mdlog->get_current_segment();
6152 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6153 mds->mdlog->start_entry(le);
6154 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6155
6156 s.erase(*s.begin());
6157 while (!s.empty()) {
6158 snapid_t snapid = *s.begin();
6159 CInode *cow_inode = 0;
6160 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6161 assert(cow_inode);
6162 recovery_queue.enqueue(cow_inode);
6163 s.erase(*s.begin());
6164 }
6165
6166 in->parent->first = in->first;
6167 le->metablob.add_primary_dentry(in->parent, in, true);
6168 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6169 mds->mdlog->flush();
6170 }
6171 */
6172
6173 recovery_queue.enqueue(in);
6174}
6175
6176void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6177{
6178 in->pop_and_dirty_projected_inode(mut->ls);
6179 mut->apply();
6180 mds->locker->drop_locks(mut.get());
6181 mut->cleanup();
6182}
6183
6184
6185/*
6186 * called after recovery to recover file sizes for previously opened (for write)
6187 * files. that is, those where max_size > size.
6188 */
6189void MDCache::identify_files_to_recover()
6190{
6191 dout(10) << "identify_files_to_recover" << dendl;
94b18763 6192 for (auto &p : inode_map) {
b32b8144 6193 CInode *in = p.second;
7c673cae
FG
6194 if (!in->is_auth())
6195 continue;
6196
6197 if (in->last != CEPH_NOSNAP)
6198 continue;
6199
6200 // Only normal files need file size recovery
6201 if (!in->is_file()) {
6202 continue;
6203 }
6204
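    // If any client with a writeable range no longer holds a cap, the true
    // file size is unknown and the file must go through size recovery.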
6205 bool recover = false;
6206 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6207 p != in->inode.client_ranges.end();
6208 ++p) {
6209 Capability *cap = in->get_client_cap(p->first);
6210 if (!cap) {
6211 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6212 recover = true;
6213 break;
6214 }
6215 }
6216
6217 if (recover) {
6218 if (in->filelock.is_stable()) {
6219 in->auth_pin(&in->filelock);
6220 } else {
6221 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6222 }
6223 in->filelock.set_state(LOCK_PRE_SCAN);
6224 rejoin_recover_q.push_back(in);
6225 } else {
6226 rejoin_check_q.push_back(in);
6227 }
6228 }
6229}
6230
6231void MDCache::start_files_to_recover()
6232{
6233 for (CInode *in : rejoin_check_q) {
6234 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6235 mds->locker->issue_caps(in);
6236 mds->locker->check_inode_max_size(in);
6237 }
6238 rejoin_check_q.clear();
6239 for (CInode *in : rejoin_recover_q) {
6240 mds->locker->file_recover(&in->filelock);
6241 }
6242 if (!rejoin_recover_q.empty()) {
6243 rejoin_recover_q.clear();
6244 do_file_recover();
6245 }
6246}
6247
6248void MDCache::do_file_recover()
6249{
6250 recovery_queue.advance();
6251}
6252
6253// ===============================================================================
6254
6255
6256// ----------------------------
6257// truncate
6258
6259class C_MDC_RetryTruncate : public MDCacheContext {
6260 CInode *in;
6261 LogSegment *ls;
6262public:
6263 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6264 MDCacheContext(c), in(i), ls(l) {}
6265 void finish(int r) override {
6266 mdcache->_truncate_inode(in, ls);
6267 }
6268};
6269
6270void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6271{
94b18763 6272 auto pi = in->get_projected_inode();
7c673cae
FG
6273 dout(10) << "truncate_inode "
6274 << pi->truncate_from << " -> " << pi->truncate_size
6275 << " on " << *in
6276 << dendl;
6277
6278 ls->truncating_inodes.insert(in);
6279 in->get(CInode::PIN_TRUNCATING);
6280 in->auth_pin(this);
6281
6282 if (!in->client_need_snapflush.empty() &&
6283 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6284 assert(in->filelock.is_xlocked());
6285 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6286 mds->locker->issue_caps(in);
6287 return;
6288 }
6289
6290 _truncate_inode(in, ls);
6291}
6292
6293struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6294 CInode *in;
6295 LogSegment *ls;
6296 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6297 MDCacheIOContext(c), in(i), ls(l) {}
6298 void finish(int r) override {
6299 assert(r == 0 || r == -ENOENT);
6300 mdcache->truncate_inode_finish(in, ls);
6301 }
6302};
6303
6304void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6305{
94b18763 6306 auto pi = &in->inode;
7c673cae
FG
6307 dout(10) << "_truncate_inode "
6308 << pi->truncate_from << " -> " << pi->truncate_size
6309 << " on " << *in << dendl;
6310
6311 assert(pi->is_truncating());
6312 assert(pi->truncate_size < (1ULL << 63));
6313 assert(pi->truncate_from < (1ULL << 63));
6314 assert(pi->truncate_size < pi->truncate_from);
6315
6316
6317 SnapRealm *realm = in->find_snaprealm();
6318 SnapContext nullsnap;
6319 const SnapContext *snapc;
6320 if (realm) {
6321 dout(10) << " realm " << *realm << dendl;
6322 snapc = &realm->get_snap_context();
6323 } else {
6324 dout(10) << " NO realm, using null context" << dendl;
6325 snapc = &nullsnap;
6326 assert(in->last == CEPH_NOSNAP);
6327 }
6328 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6329 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6330 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6331 pi->truncate_seq, ceph::real_time::min(), 0,
6332 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6333 mds->finisher));
6334}
6335
6336struct C_MDC_TruncateLogged : public MDCacheLogContext {
6337 CInode *in;
6338 MutationRef mut;
6339 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6340 MDCacheLogContext(m), in(i), mut(mu) {}
6341 void finish(int r) override {
6342 mdcache->truncate_inode_logged(in, mut);
6343 }
6344};
6345
6346void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6347{
6348 dout(10) << "truncate_inode_finish " << *in << dendl;
6349
6350 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6351 assert(p != ls->truncating_inodes.end());
6352 ls->truncating_inodes.erase(p);
6353
6354 // update
94b18763
FG
6355 auto &pi = in->project_inode();
6356 pi.inode.version = in->pre_dirty();
6357 pi.inode.truncate_from = 0;
6358 pi.inode.truncate_pending--;
7c673cae
FG
6359
6360 MutationRef mut(new MutationImpl());
6361 mut->ls = mds->mdlog->get_current_segment();
6362 mut->add_projected_inode(in);
6363
6364 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6365 mds->mdlog->start_entry(le);
6366 CDentry *dn = in->get_projected_parent_dn();
6367 le->metablob.add_dir_context(dn->get_dir());
6368 le->metablob.add_primary_dentry(dn, in, true);
6369 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6370
6371 journal_dirty_inode(mut.get(), &le->metablob, in);
6372 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6373
6374 // flush immediately if there are readers/writers waiting
6375 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6376 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6377 mds->mdlog->flush();
6378}
6379
6380void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6381{
6382 dout(10) << "truncate_inode_logged " << *in << dendl;
6383 mut->apply();
6384 mds->locker->drop_locks(mut.get());
6385 mut->cleanup();
6386
6387 in->put(CInode::PIN_TRUNCATING);
6388 in->auth_unpin(this);
6389
6390 list<MDSInternalContextBase*> waiters;
6391 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6392 mds->queue_waiters(waiters);
6393}
6394
6395
6396void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6397{
6398 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6399 << ls->seq << "/" << ls->offset << dendl;
6400 ls->truncating_inodes.insert(in);
6401 in->get(CInode::PIN_TRUNCATING);
6402}
6403
6404void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6405{
6406 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6407 << ls->seq << "/" << ls->offset << dendl;
6408 // if we have the logseg the truncate started in, it must be in our list.
6409 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6410 assert(p != ls->truncating_inodes.end());
6411 ls->truncating_inodes.erase(p);
6412 in->put(CInode::PIN_TRUNCATING);
6413}
6414
6415void MDCache::start_recovered_truncates()
6416{
6417 dout(10) << "start_recovered_truncates" << dendl;
6418 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6419 p != mds->mdlog->segments.end();
6420 ++p) {
6421 LogSegment *ls = p->second;
6422 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6423 q != ls->truncating_inodes.end();
6424 ++q) {
6425 CInode *in = *q;
6426 in->auth_pin(this);
6427
6428 if (!in->client_need_snapflush.empty() &&
6429 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6430 assert(in->filelock.is_stable());
6431 in->filelock.set_state(LOCK_XLOCKDONE);
6432 in->auth_pin(&in->filelock);
6433 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6434 // start_files_to_recover will revoke caps
6435 continue;
6436 }
6437 _truncate_inode(in, ls);
6438 }
6439 }
6440}
6441
6442
6443
6444
6445
6446
6447// ================================================================================
6448// cache trimming
6449
181888fb
FG
6450void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6451{
7c673cae 6452 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6453 std::vector<CDentry *> unexpirables;
6454 uint64_t trimmed = 0;
6455
6456 dout(7) << "trim_lru trimming " << count
6457 << " items from LRU"
6458 << " size=" << lru.lru_get_size()
6459 << " mid=" << lru.lru_get_top()
6460 << " pintail=" << lru.lru_get_pintail()
6461 << " pinned=" << lru.lru_get_num_pinned()
6462 << dendl;
7c673cae 6463
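  // Drain the bottom LRU first; dentries that cannot be trimmed right now are
  // reinserted below.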
31f18b77
FG
6464 for (;;) {
6465 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6466 if (!dn)
6467 break;
6468 if (trim_dentry(dn, expiremap)) {
6469 unexpirables.push_back(dn);
181888fb
FG
6470 } else {
6471 trimmed++;
31f18b77
FG
6472 }
6473 }
6474
181888fb 6475 for (auto &dn : unexpirables) {
31f18b77 6476 bottom_lru.lru_insert_mid(dn);
181888fb 6477 }
31f18b77
FG
6478 unexpirables.clear();
6479
181888fb
FG
 6480 // trim dentries from the LRU while the cache is too full or until count dentries have been trimmed
6481 while (cache_toofull() || count > 0) {
7c673cae
FG
6482 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6483 if (!dn) {
6484 break;
6485 }
7c673cae 6486 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6487 dn->get_linkage()->inode->item_open_file.is_on_list())) {
7c673cae 6488 unexpirables.push_back(dn);
181888fb
FG
6489 } else if (trim_dentry(dn, expiremap)) {
6490 unexpirables.push_back(dn);
6491 } else {
6492 trimmed++;
3efd9988 6493 if (count > 0) count--;
7c673cae
FG
6494 }
6495 }
181888fb
FG
6496
6497 for (auto &dn : unexpirables) {
31f18b77 6498 lru.lru_insert_mid(dn);
181888fb 6499 }
31f18b77 6500 unexpirables.clear();
7c673cae 6501
181888fb
FG
6502 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6503}
6504
6505/*
6506 * note: only called while MDS is active or stopping... NOT during recovery.
6507 * however, we may expire a replica whose authority is recovering.
6508 *
 6509 * @param count is the number of dentries to try to expire
6510 */
6511bool MDCache::trim(uint64_t count)
6512{
6513 uint64_t used = cache_size();
6514 uint64_t limit = cache_limit_memory();
6515 map<mds_rank_t, MCacheExpire*> expiremap;
6516
6517 dout(7) << "trim bytes_used=" << bytes2str(used)
6518 << " limit=" << bytes2str(limit)
6519 << " reservation=" << cache_reservation()
6520 << "% count=" << count << dendl;
6521
6522 // process delayed eval_stray()
6523 stray_manager.advance_delayed();
6524
6525 trim_lru(count, expiremap);
6526
7c673cae 6527 // trim non-auth, non-bound subtrees
181888fb 6528 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6529 CDir *dir = p->first;
6530 ++p;
31f18b77
FG
6531 CInode *diri = dir->get_inode();
6532 if (dir->is_auth()) {
6533 if (!diri->is_auth() && !diri->is_base() &&
6534 dir->get_num_head_items() == 0) {
6535 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6536 !(mds->is_active() || mds->is_stopping()) ||
31f18b77
FG
6537 dir->is_freezing() || dir->is_frozen())
6538 continue;
6539
6540 migrator->export_empty_import(dir);
6541 }
6542 } else {
6543 if (!diri->is_auth()) {
6544 if (dir->get_num_ref() > 1) // only subtree pin
6545 continue;
6546 list<CDir*> ls;
6547 diri->get_subtree_dirfrags(ls);
6548 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6549 continue;
6550
 6551 // don't trim a subtree root if its auth MDS is recovering.
 6552 // This simplifies the cache rejoin code.
6553 if (dir->is_subtree_root() &&
6554 rejoin_ack_gather.count(dir->get_dir_auth().first))
6555 continue;
7c673cae 6556 trim_dirfrag(dir, 0, expiremap);
31f18b77 6557 }
7c673cae
FG
6558 }
6559 }
6560
6561 // trim root?
181888fb 6562 if (mds->is_stopping() && root) {
7c673cae
FG
6563 list<CDir*> ls;
6564 root->get_dirfrags(ls);
6565 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6566 CDir *dir = *p;
6567 if (dir->get_num_ref() == 1) // subtree pin
6568 trim_dirfrag(dir, 0, expiremap);
6569 }
6570 if (root->get_num_ref() == 0)
6571 trim_inode(0, root, 0, expiremap);
6572 }
6573
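  // For every other rank that is stopping, try to expire our replicas under
  // its mdsdir so that rank can finish shutting down.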
6574 std::set<mds_rank_t> stopping;
6575 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6576 stopping.erase(mds->get_nodeid());
6577 for (auto rank : stopping) {
6578 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6579 if (!mdsdir_in)
6580 continue;
6581
6582 if (expiremap.count(rank) == 0) {
6583 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6584 }
6585
6586 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6587
6588 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6589 if (!aborted) {
6590 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6591 list<CDir*> ls;
6592 mdsdir_in->get_dirfrags(ls);
6593 for (auto dir : ls) {
6594 if (dir->get_num_ref() == 1) // subtree pin
6595 trim_dirfrag(dir, dir, expiremap);
6596 }
6597 if (mdsdir_in->get_num_ref() == 0)
6598 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6599 } else {
6600 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6601 }
6602 }
6603
 6604 // Other ranks' base inodes (when I'm stopping)
181888fb 6605 if (mds->is_stopping()) {
7c673cae
FG
6606 for (set<CInode*>::iterator p = base_inodes.begin();
6607 p != base_inodes.end(); ++p) {
6608 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6609 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6610 if ((*p)->get_num_ref() == 0) {
6611 trim_inode(NULL, *p, NULL, expiremap);
6612 }
6613 }
6614 }
6615 }
6616
6617 // send any expire messages
6618 send_expire_messages(expiremap);
6619
6620 return true;
6621}
6622
6623void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6624{
6625 // send expires
6626 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6627 it != expiremap.end();
6628 ++it) {
6629 if (mds->is_cluster_degraded() &&
6630 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6631 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6632 rejoin_sent.count(it->first) == 0))) {
6633 it->second->put();
6634 continue;
6635 }
6636 dout(7) << "sending cache_expire to " << it->first << dendl;
6637 mds->send_message_mds(it->second, it->first);
6638 }
6639}
6640
6641
6642bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6643{
6644 dout(12) << "trim_dentry " << *dn << dendl;
6645
6646 CDentry::linkage_t *dnl = dn->get_linkage();
6647
6648 CDir *dir = dn->get_dir();
6649 assert(dir);
6650
6651 CDir *con = get_subtree_root(dir);
6652 if (con)
6653 dout(12) << " in container " << *con << dendl;
6654 else {
6655 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6656 assert(dn->is_auth());
6657 }
6658
 6659 // If a replica dentry is not readable, it's likely we will receive an
 6660 // MDentryLink/MDentryUnlink message soon (it's possible we first receive
 6661 // an MDentryUnlink message, then an MDentryLink message).
 6662 // An MDentryLink message only replicates an inode, so we should avoid
 6663 // trimming the inode's parent dentry, because unconnected replicas are
 6664 // problematic for subtree migration.
6665 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6666 !dn->get_dir()->get_inode()->is_stray())
6667 return true;
6668
6669 // adjust the dir state
 6670 // NOTE: we can safely remove a clean, null dentry without affecting
6671 // directory completeness.
6672 // (check this _before_ we unlink the inode, below!)
6673 bool clear_complete = false;
6674 if (!(dnl->is_null() && dn->is_clean()))
6675 clear_complete = true;
6676
6677 // unlink the dentry
6678 if (dnl->is_remote()) {
6679 // just unlink.
31f18b77 6680 dir->unlink_inode(dn, false);
7c673cae
FG
6681 } else if (dnl->is_primary()) {
6682 // expire the inode, too.
6683 CInode *in = dnl->get_inode();
6684 assert(in);
6685 if (trim_inode(dn, in, con, expiremap))
6686 return true; // purging stray instead of trimming
6687 } else {
6688 assert(dnl->is_null());
6689 }
6690
6691 if (!dn->is_auth()) {
6692 // notify dentry authority.
6693 mds_authority_t auth = dn->authority();
6694
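    // Send an expire to both halves of the dentry's authority (there can be
    // two during a migration); skip ourselves, and send nothing while the
    // containing subtree is being imported.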
6695 for (int p=0; p<2; p++) {
6696 mds_rank_t a = auth.first;
6697 if (p) a = auth.second;
6698 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6699 if (mds->get_nodeid() == auth.second &&
6700 con->is_importing()) break; // don't send any expire while importing.
6701 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6702
6703 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6704 assert(a != mds->get_nodeid());
6705 if (expiremap.count(a) == 0)
6706 expiremap[a] = new MCacheExpire(mds->get_nodeid());
94b18763 6707 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6708 }
6709 }
6710
6711 // remove dentry
6712 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6713 dir->add_to_bloom(dn);
6714 dir->remove_dentry(dn);
6715
6716 if (clear_complete)
6717 dir->state_clear(CDir::STATE_COMPLETE);
6718
7c673cae
FG
6719 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6720 return false;
6721}
6722
6723
6724void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6725{
6726 dout(15) << "trim_dirfrag " << *dir << dendl;
6727
6728 if (dir->is_subtree_root()) {
6729 assert(!dir->is_auth() ||
6730 (!dir->is_replicated() && dir->inode->is_base()));
6731 remove_subtree(dir); // remove from subtree map
6732 }
6733 assert(dir->get_num_ref() == 0);
6734
6735 CInode *in = dir->get_inode();
6736
6737 if (!dir->is_auth()) {
6738 mds_authority_t auth = dir->authority();
6739
6740 // was this an auth delegation? (if so, slightly modified container)
6741 dirfrag_t condf;
6742 if (dir->is_subtree_root()) {
6743 dout(12) << " subtree root, container is " << *dir << dendl;
6744 con = dir;
6745 condf = dir->dirfrag();
6746 } else {
6747 condf = con->dirfrag();
6748 }
6749
6750 for (int p=0; p<2; p++) {
6751 mds_rank_t a = auth.first;
6752 if (p) a = auth.second;
6753 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6754 if (mds->get_nodeid() == auth.second &&
6755 con->is_importing()) break; // don't send any expire while importing.
6756 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6757
6758 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6759 assert(a != mds->get_nodeid());
6760 if (expiremap.count(a) == 0)
6761 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6762 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6763 }
6764 }
6765
6766 in->close_dirfrag(dir->dirfrag().frag);
6767}
6768
6769/**
6770 * Try trimming an inode from the cache
6771 *
6772 * @return true if the inode is still in cache, else false if it was trimmed
6773 */
6774bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6775{
6776 dout(15) << "trim_inode " << *in << dendl;
6777 assert(in->get_num_ref() == 0);
6778
6779 if (in->is_dir()) {
6780 // If replica inode's dirfragtreelock is not readable, it's likely
6781 // some dirfrags of the inode are being fragmented and we will receive
6782 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6783 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
 6784 // This is because unconnected replicas are problematic for
6785 // subtree migration.
6786 //
28e407b8 6787 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
7c673cae 6788 return true;
28e407b8 6789 }
7c673cae
FG
6790
6791 // DIR
6792 list<CDir*> dfls;
6793 in->get_dirfrags(dfls);
6794 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6795 CDir *dir = *p;
6796 assert(!dir->is_subtree_root());
6797 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6798 }
6799 }
6800
6801 // INODE
6802 if (in->is_auth()) {
6803 // eval stray after closing dirfrags
6804 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6805 maybe_eval_stray(in);
6806 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6807 return true;
6808 }
6809 } else {
6810 mds_authority_t auth = in->authority();
6811
6812 dirfrag_t df;
6813 if (con)
6814 df = con->dirfrag();
6815 else
6816 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6817
6818 for (int p=0; p<2; p++) {
6819 mds_rank_t a = auth.first;
6820 if (p) a = auth.second;
6821 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6822 if (con && mds->get_nodeid() == auth.second &&
6823 con->is_importing()) break; // don't send any expire while importing.
6824 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6825
6826 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6827 assert(a != mds->get_nodeid());
6828 if (expiremap.count(a) == 0)
6829 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6830 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6831 }
6832 }
6833
6834 /*
6835 if (in->is_auth()) {
6836 if (in->hack_accessed)
6837 mds->logger->inc("outt");
6838 else {
6839 mds->logger->inc("outut");
6840 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6841 }
6842 }
6843 */
6844
6845 // unlink
6846 if (dn)
31f18b77 6847 dn->get_dir()->unlink_inode(dn, false);
7c673cae
FG
6848 remove_inode(in);
6849 return false;
6850}
6851
6852
6853/**
6854 * trim_non_auth - remove any non-auth items from our cache
6855 *
6856 * this reduces the amount of non-auth metadata in our cache, reducing the
6857 * load incurred by the rejoin phase.
6858 *
6859 * the only non-auth items that remain are those that are needed to
6860 * attach our own subtrees to the root.
6861 *
6862 * when we are done, all dentries will be in the top bit of the lru.
6863 *
6864 * why we have to do this:
 6865 * we may not have accurate linkage for non-auth items, which means we will
 6866 * not know which subtree an item falls into and cannot be sure to declare it
 6867 * to the correct authority.
6868 */
6869void MDCache::trim_non_auth()
6870{
6871 dout(7) << "trim_non_auth" << dendl;
6872
6873 // temporarily pin all subtree roots
6874 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6875 p != subtrees.end();
6876 ++p)
6877 p->first->get(CDir::PIN_SUBTREETEMP);
6878
31f18b77 6879 list<CDentry*> auth_list;
7c673cae
FG
6880
6881 // trim non-auth items from the lru
31f18b77
FG
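  // Drain bottom_lru (preferred-trim dentries) first, then the main lru; auth
  // dentries encountered along the way are set aside in auth_list and
  // reinserted into the appropriate lru afterwards.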
6882 for (;;) {
6883 CDentry *dn = NULL;
6884 if (bottom_lru.lru_get_size() > 0)
6885 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6886 if (!dn && lru.lru_get_size() > 0)
6887 dn = static_cast<CDentry*>(lru.lru_expire());
6888 if (!dn)
6889 break;
6890
7c673cae
FG
6891 CDentry::linkage_t *dnl = dn->get_linkage();
6892
6893 if (dn->is_auth()) {
6894 // add back into lru (at the top)
31f18b77 6895 auth_list.push_back(dn);
7c673cae
FG
6896
6897 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6898 dn->unlink_remote(dnl);
7c673cae
FG
6899 } else {
6900 // non-auth. expire.
6901 CDir *dir = dn->get_dir();
6902 assert(dir);
6903
6904 // unlink the dentry
6905 dout(10) << " removing " << *dn << dendl;
6906 if (dnl->is_remote()) {
31f18b77 6907 dir->unlink_inode(dn, false);
7c673cae
FG
6908 }
6909 else if (dnl->is_primary()) {
6910 CInode *in = dnl->get_inode();
6911 dout(10) << " removing " << *in << dendl;
6912 list<CDir*> ls;
6913 in->get_dirfrags(ls);
6914 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6915 CDir *subdir = *p;
6916 assert(!subdir->is_subtree_root());
6917 in->close_dirfrag(subdir->dirfrag().frag);
6918 }
31f18b77 6919 dir->unlink_inode(dn, false);
7c673cae
FG
6920 remove_inode(in);
6921 }
6922 else {
6923 assert(dnl->is_null());
6924 }
6925
6926 assert(!dir->has_bloom());
6927 dir->remove_dentry(dn);
6928 // adjust the dir state
6929 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6930 // close empty non-auth dirfrag
6931 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6932 dir->inode->close_dirfrag(dir->get_frag());
6933 }
6934 }
6935
31f18b77
FG
6936 for (auto dn : auth_list) {
6937 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6938 bottom_lru.lru_insert_mid(dn);
6939 else
6940 lru.lru_insert_top(dn);
6941 }
6942
7c673cae
FG
6943 // move everything in the pintail to the top bit of the lru.
6944 lru.lru_touch_entire_pintail();
6945
6946 // unpin all subtrees
6947 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6948 p != subtrees.end();
6949 ++p)
6950 p->first->put(CDir::PIN_SUBTREETEMP);
6951
31f18b77
FG
6952 if (lru.lru_get_size() == 0 &&
6953 bottom_lru.lru_get_size() == 0) {
7c673cae 6954 // root, stray, etc.?
b32b8144 6955 auto p = inode_map.begin();
7c673cae 6956 while (p != inode_map.end()) {
7c673cae 6957 CInode *in = p->second;
b32b8144 6958 ++p;
7c673cae
FG
6959 if (!in->is_auth()) {
6960 list<CDir*> ls;
6961 in->get_dirfrags(ls);
6962 for (list<CDir*>::iterator p = ls.begin();
6963 p != ls.end();
6964 ++p) {
6965 dout(10) << " removing " << **p << dendl;
6966 assert((*p)->get_num_ref() == 1); // SUBTREE
6967 remove_subtree((*p));
6968 in->close_dirfrag((*p)->dirfrag().frag);
6969 }
6970 dout(10) << " removing " << *in << dendl;
6971 assert(!in->get_parent_dn());
6972 assert(in->get_num_ref() == 0);
6973 remove_inode(in);
6974 }
7c673cae
FG
6975 }
6976 }
6977
6978 show_subtrees();
6979}
6980
6981/**
6982 * Recursively trim the subtree rooted at directory to remove all
6983 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6984 * of those links. This is used to clear invalid data out of the cache.
6985 * Note that it doesn't clear the passed-in directory, since that's not
6986 * always safe.
6987 */
6988bool MDCache::trim_non_auth_subtree(CDir *dir)
6989{
6990 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6991
6992 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6993
94b18763
FG
6994 auto j = dir->begin();
6995 auto i = j;
7c673cae
FG
6996 while (j != dir->end()) {
6997 i = j++;
6998 CDentry *dn = i->second;
6999 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7000 CDentry::linkage_t *dnl = dn->get_linkage();
7001 if (dnl->is_primary()) { // check for subdirectories, etc
7002 CInode *in = dnl->get_inode();
7003 bool keep_inode = false;
7004 if (in->is_dir()) {
7005 list<CDir*> subdirs;
7006 in->get_dirfrags(subdirs);
7007 for (list<CDir*>::iterator subdir = subdirs.begin();
7008 subdir != subdirs.end();
7009 ++subdir) {
7010 if ((*subdir)->is_subtree_root()) {
7011 keep_inode = true;
7012 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7013 } else {
7014 if (trim_non_auth_subtree(*subdir))
7015 keep_inode = true;
7016 else {
7017 in->close_dirfrag((*subdir)->get_frag());
7018 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7019 }
7020 }
7021 }
7022
7023 }
7024 if (!keep_inode) { // remove it!
 7025 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
31f18b77 7026 dir->unlink_inode(dn, false);
7c673cae
FG
7027 remove_inode(in);
7028 assert(!dir->has_bloom());
7029 dir->remove_dentry(dn);
7030 } else {
7031 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7032 dn->state_clear(CDentry::STATE_AUTH);
7033 in->state_clear(CInode::STATE_AUTH);
7034 }
7035 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7036 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7037 } else { // just remove it
7038 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7039 if (dnl->is_remote())
31f18b77 7040 dir->unlink_inode(dn, false);
7c673cae
FG
7041 dir->remove_dentry(dn);
7042 }
7043 }
7044 dir->state_clear(CDir::STATE_AUTH);
7045 /**
7046 * We've now checked all our children and deleted those that need it.
7047 * Now return to caller, and tell them if *we're* a keeper.
7048 */
7049 return keep_dir || dir->get_num_any();
7050}
7051
7052/*
7053 * during replay, when we determine a subtree is no longer ours, we
7054 * try to trim it from our cache. because subtrees must be connected
7055 * to the root, the fact that we can trim this tree may mean that our
7056 * children or parents can also be trimmed.
7057 */
7058void MDCache::try_trim_non_auth_subtree(CDir *dir)
7059{
7060 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7061
7062 // can we now trim child subtrees?
7063 set<CDir*> bounds;
7064 get_subtree_bounds(dir, bounds);
7065 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7066 CDir *bd = *p;
7067 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7068 bd->get_num_any() == 0 && // and empty
7069 can_trim_non_auth_dirfrag(bd)) {
7070 CInode *bi = bd->get_inode();
7071 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7072 remove_subtree(bd);
7073 bd->mark_clean();
7074 bi->close_dirfrag(bd->get_frag());
7075 }
7076 }
7077
7078 if (trim_non_auth_subtree(dir)) {
7079 // keep
7080 try_subtree_merge(dir);
7081 } else {
7082 // can we trim this subtree (and possibly our ancestors) too?
7083 while (true) {
7084 CInode *diri = dir->get_inode();
7085 if (diri->is_base()) {
7086 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7087 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7088 remove_subtree(dir);
7089 dir->mark_clean();
7090 diri->close_dirfrag(dir->get_frag());
7091
7092 dout(10) << " removing " << *diri << dendl;
7093 assert(!diri->get_parent_dn());
7094 assert(diri->get_num_ref() == 0);
7095 remove_inode(diri);
7096 }
7097 break;
7098 }
7099
7100 CDir *psub = get_subtree_root(diri->get_parent_dir());
7101 dout(10) << " parent subtree is " << *psub << dendl;
7102 if (psub->get_dir_auth().first == mds->get_nodeid())
7103 break; // we are auth, keep.
7104
7105 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7106 remove_subtree(dir);
7107 dir->mark_clean();
7108 diri->close_dirfrag(dir->get_frag());
7109
7110 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7111 if (trim_non_auth_subtree(psub))
7112 break;
7113 dir = psub;
7114 }
7115 }
7116
7117 show_subtrees();
7118}
7119
7120void MDCache::standby_trim_segment(LogSegment *ls)
7121{
7122 ls->new_dirfrags.clear_list();
7123 ls->open_files.clear_list();
7124
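  // Each mark_clean()/remove_dirty()/clear_dirty_parent() call below detaches
  // the front item from the segment's dirty list, so these loops drain the
  // lists rather than spin.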
7125 while (!ls->dirty_dirfrags.empty()) {
7126 CDir *dir = ls->dirty_dirfrags.front();
7127 dir->mark_clean();
7128 }
7129 while (!ls->dirty_inodes.empty()) {
7130 CInode *in = ls->dirty_inodes.front();
7131 in->mark_clean();
7132 }
7133 while (!ls->dirty_dentries.empty()) {
7134 CDentry *dn = ls->dirty_dentries.front();
7135 dn->mark_clean();
7136 }
7137 while (!ls->dirty_parent_inodes.empty()) {
7138 CInode *in = ls->dirty_parent_inodes.front();
7139 in->clear_dirty_parent();
7140 }
7141 while (!ls->dirty_dirfrag_dir.empty()) {
7142 CInode *in = ls->dirty_dirfrag_dir.front();
7143 in->filelock.remove_dirty();
7144 }
7145 while (!ls->dirty_dirfrag_nest.empty()) {
7146 CInode *in = ls->dirty_dirfrag_nest.front();
7147 in->nestlock.remove_dirty();
7148 }
7149 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7150 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7151 in->dirfragtreelock.remove_dirty();
7152 }
7153}
7154
7155/* This function DOES put the passed message before returning */
7156void MDCache::handle_cache_expire(MCacheExpire *m)
7157{
7158 mds_rank_t from = mds_rank_t(m->get_from());
7159
7160 dout(7) << "cache_expire from mds." << from << dendl;
7161
7162 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7163 m->put();
7164 return;
7165 }
7166
7167 set<SimpleLock *> gather_locks;
7168 // loop over realms
7169 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7170 p != m->realms.end();
7171 ++p) {
7172 // check container?
7173 if (p->first.ino > 0) {
7174 CInode *expired_inode = get_inode(p->first.ino);
7175 assert(expired_inode); // we had better have this.
7176 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7177 assert(parent_dir);
7178
7179 int export_state = -1;
7180 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7181 export_state = migrator->get_export_state(parent_dir);
7182 assert(export_state >= 0);
7183 }
7184
7185 if (!parent_dir->is_auth() ||
7186 (export_state != -1 &&
7187 ((export_state == Migrator::EXPORT_WARNING &&
7188 migrator->export_has_warned(parent_dir,from)) ||
7189 export_state == Migrator::EXPORT_EXPORTING ||
7190 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7191 (export_state == Migrator::EXPORT_NOTIFYING &&
7192 !migrator->export_has_notified(parent_dir,from))))) {
7193
7194 // not auth.
7195 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7196 assert(parent_dir->is_frozen_tree_root());
7197
7198 // make a message container
7199 if (delayed_expire[parent_dir].count(from) == 0)
7200 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7201
7202 // merge these expires into it
7203 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7204 continue;
7205 }
7206 assert(export_state <= Migrator::EXPORT_PREPPING ||
7207 (export_state == Migrator::EXPORT_WARNING &&
7208 !migrator->export_has_warned(parent_dir, from)));
7209
7210 dout(7) << "expires for " << *parent_dir << dendl;
7211 } else {
7212 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7213 }
7214
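    // For each object below, the expire is only honoured if the peer's nonce
    // matches our current replica nonce for that peer; a stale nonce means we
    // re-replicated the object after the peer queued this expire, so we ignore it.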
7215 // INODES
7216 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7217 it != p->second.inodes.end();
7218 ++it) {
7219 CInode *in = get_inode(it->first);
7220 unsigned nonce = it->second;
7221
7222 if (!in) {
7223 dout(0) << " inode expire on " << it->first << " from " << from
7224 << ", don't have it" << dendl;
7225 assert(in);
7226 }
7227 assert(in->is_auth());
7228 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7229
7230 // check nonce
7231 if (nonce == in->get_replica_nonce(from)) {
7232 // remove from our cached_by
7233 dout(7) << " inode expire on " << *in << " from mds." << from
7234 << " cached_by was " << in->get_replicas() << dendl;
7235 inode_remove_replica(in, from, false, gather_locks);
7236 }
7237 else {
7238 // this is an old nonce, ignore expire.
7239 dout(7) << " inode expire on " << *in << " from mds." << from
7240 << " with old nonce " << nonce
7241 << " (current " << in->get_replica_nonce(from) << "), dropping"
7242 << dendl;
7243 }
7244 }
7245
7246 // DIRS
7247 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7248 it != p->second.dirs.end();
7249 ++it) {
7250 CDir *dir = get_dirfrag(it->first);
7251 unsigned nonce = it->second;
7252
7253 if (!dir) {
7254 CInode *diri = get_inode(it->first.ino);
7255 if (diri) {
7256 if (mds->is_rejoin() &&
7257 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7258 !diri->is_replica(from)) {
7259 list<CDir*> ls;
7260 diri->get_nested_dirfrags(ls);
7261 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7262 << " while rejoining, inode isn't replicated" << dendl;
7263 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7264 dir = *q;
7265 if (dir->is_replica(from)) {
7266 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7267 dir->remove_replica(from);
7268 }
7269 }
7270 continue;
7271 }
7272 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7273 if (other) {
7274 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7275 << " have " << *other << ", mismatched frags, dropping" << dendl;
7276 continue;
7277 }
7278 }
7279 dout(0) << " dir expire on " << it->first << " from " << from
7280 << ", don't have it" << dendl;
7281 assert(dir);
7282 }
7283 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7284
7285 assert(dir->is_auth());
7286
7287 // check nonce
7288 if (nonce == dir->get_replica_nonce(from)) {
7289 // remove from our cached_by
7290 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7291 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7292 dir->remove_replica(from);
7293 }
7294 else {
7295 // this is an old nonce, ignore expire.
7296 dout(7) << " dir expire on " << *dir << " from mds." << from
7297 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7298 << "), dropping" << dendl;
7299 }
7300 }
7301
7302 // DENTRIES
7303 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7304 pd != p->second.dentries.end();
7305 ++pd) {
7306 dout(10) << " dn expires in dir " << pd->first << dendl;
7307 CInode *diri = get_inode(pd->first.ino);
7308 assert(diri);
7309 CDir *dir = diri->get_dirfrag(pd->first.frag);
7310
7311 if (!dir) {
7312 dout(0) << " dn expires on " << pd->first << " from " << from
7313 << ", must have refragmented" << dendl;
7314 } else {
7315 assert(dir->is_auth());
7316 }
7317
7318 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7319 p != pd->second.end();
7320 ++p) {
7321 unsigned nonce = p->second;
7322 CDentry *dn;
7323
7324 if (dir) {
7325 dn = dir->lookup(p->first.first, p->first.second);
7326 } else {
7327 // which dirfrag for this dentry?
7328 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7329 assert(dir);
7330 assert(dir->is_auth());
7331 dn = dir->lookup(p->first.first, p->first.second);
7332 }
7333
7334 if (!dn) {
7335 if (dir)
7336 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7337 else
7338 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7339 }
7340 assert(dn);
7341
7342 if (nonce == dn->get_replica_nonce(from)) {
7343 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7344 dentry_remove_replica(dn, from, gather_locks);
7345 }
7346 else {
7347 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7348 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7349 << "), dropping" << dendl;
7350 }
7351 }
7352 }
7353 }
7354
7355 // done
7356 m->put();
7357
7358 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7359 if (!(*p)->is_stable())
7360 mds->locker->eval_gather(*p);
7361 }
7362}
7363
7364void MDCache::process_delayed_expire(CDir *dir)
7365{
7366 dout(7) << "process_delayed_expire on " << *dir << dendl;
7367 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7368 p != delayed_expire[dir].end();
7369 ++p)
7370 handle_cache_expire(p->second);
7371 delayed_expire.erase(dir);
7372}
7373
7374void MDCache::discard_delayed_expire(CDir *dir)
7375{
7376 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7377 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7378 p != delayed_expire[dir].end();
7379 ++p)
7380 p->second->put();
7381 delayed_expire.erase(dir);
7382}
7383
7384void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7385 set<SimpleLock *>& gather_locks)
7386{
7387 in->remove_replica(from);
7388 in->mds_caps_wanted.erase(from);
7389
7390 // note: this code calls _eval more often than it needs to!
7391 // fix lock
7392 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7393 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7394 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7395 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7396 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7397 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7398
7399 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7400 // Don't remove the recovering mds from lock's gathering list because
7401 // it may hold rejoined wrlocks.
7402 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7403 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7404 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7405}
7406
7407void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7408{
7409 dn->remove_replica(from);
7410
7411 // fix lock
7412 if (dn->lock.remove_replica(from))
7413 gather_locks.insert(&dn->lock);
7414
 7415 // Replicated strays might now be eligible for purge
7416 CDentry::linkage_t *dnl = dn->get_linkage();
7417 if (dnl->is_primary()) {
7418 maybe_eval_stray(dnl->get_inode());
7419 }
7420}
7421
7422void MDCache::trim_client_leases()
7423{
7424 utime_t now = ceph_clock_now();
7425
7426 dout(10) << "trim_client_leases" << dendl;
7427
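  // Leases in each pool's list are kept in expiry order, so we can stop at the
  // first lease whose ttl has not yet passed.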
7428 for (int pool=0; pool<client_lease_pools; pool++) {
7429 int before = client_leases[pool].size();
7430 if (client_leases[pool].empty())
7431 continue;
7432
7433 while (!client_leases[pool].empty()) {
7434 ClientLease *r = client_leases[pool].front();
7435 if (r->ttl > now) break;
7436 CDentry *dn = static_cast<CDentry*>(r->parent);
7437 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7438 dn->remove_client_lease(r, mds->locker);
7439 }
7440 int after = client_leases[pool].size();
7441 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7442 << (before-after) << " leases, " << after << " left" << dendl;
7443 }
7444}
7445
7446
7447void MDCache::check_memory_usage()
7448{
7449 static MemoryModel mm(g_ceph_context);
7450 static MemoryModel::snap last;
7451 mm.sample(&last);
7452 static MemoryModel::snap baseline = last;
7453
7454 // check client caps
b32b8144 7455 assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7456 double caps_per_inode = 0.0;
7c673cae 7457 if (CInode::count())
181888fb 7458 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae
FG
7459
7460 dout(2) << "check_memory_usage"
7461 << " total " << last.get_total()
7462 << ", rss " << last.get_rss()
7463 << ", heap " << last.get_heap()
7464 << ", baseline " << baseline.get_heap()
7465 << ", buffers " << (buffer::get_total_alloc() >> 10)
7466 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7467 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7468 << dendl;
7469
c07f9fc5 7470 mds->update_mlogger();
7c673cae
FG
7471 mds->mlogger->set(l_mdm_rss, last.get_rss());
7472 mds->mlogger->set(l_mdm_heap, last.get_heap());
7473
181888fb
FG
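  // Cache is over its configured limit: ask clients to release caps so that
  // pinned inodes become trimmable again.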
7474 if (cache_toofull()) {
7475 last_recall_state = ceph_clock_now();
7476 mds->server->recall_client_state();
7c673cae
FG
7477 }
7478
7479 // If the cache size had exceeded its limit, but we're back in bounds
7480 // now, free any unused pool memory so that our memory usage isn't
7481 // permanently bloated.
181888fb 7482 if (exceeded_size_limit && !cache_toofull()) {
7c673cae
FG
7483 // Only do this once we are back in bounds: otherwise the releases would
7484 // slow down whatever process caused us to exceed bounds to begin with
7485 if (ceph_using_tcmalloc()) {
7486 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7487 << dendl;
7488 ceph_heap_release_free_memory();
7489 }
7490 exceeded_size_limit = false;
7491 }
7492}
7493
7494
7495
7496// =========================================================================================
7497// shutdown
7498
7499class C_MDC_ShutdownCheck : public MDCacheContext {
7500public:
7501 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7502 void finish(int) override {
7503 mdcache->shutdown_check();
7504 }
7505};
7506
7507void MDCache::shutdown_check()
7508{
7509 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7510
7511 // cache
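  // Temporarily raise debug_mds to 10 so show_cache() output is emitted, then
  // restore the previous value.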
7512 char old_val[32] = { 0 };
7513 char *o = old_val;
7514 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7515 g_conf->set_val("debug_mds", "10");
7516 g_conf->apply_changes(NULL);
7517 show_cache();
7518 g_conf->set_val("debug_mds", old_val);
7519 g_conf->apply_changes(NULL);
7520 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7521
7522 // this
31f18b77 7523 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7524 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7525
7526
7527 if (mds->objecter->is_active()) {
7528 dout(0) << "objecter still active" << dendl;
7529 mds->objecter->dump_active();
7530 }
7531}
7532
7533
7534void MDCache::shutdown_start()
7535{
7536 dout(2) << "shutdown_start" << dendl;
7537
7538 if (g_conf->mds_shutdown_check)
7539 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7540
7541 // g_conf->debug_mds = 10;
7542}
7543
7544
7545
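// Returns true once the cache is fully shut down; false means more work
// remains and the caller should invoke shutdown_pass() again later.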
7546bool MDCache::shutdown_pass()
7547{
7548 dout(7) << "shutdown_pass" << dendl;
7549
7550 if (mds->is_stopped()) {
7551 dout(7) << " already shut down" << dendl;
7552 show_cache();
7553 show_subtrees();
7554 return true;
7555 }
7556
7557 // empty stray dir
28e407b8 7558 bool strays_all_exported = shutdown_export_strays();
7c673cae
FG
7559
7560 // trim cache
181888fb 7561 trim(UINT64_MAX);
31f18b77 7562 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7563
28e407b8 7564 // If we are not rank 0, export all auth subtrees to another active rank (usually rank 0)
7c673cae
FG
7565 int num_auth_subtree = 0;
7566 if (!subtrees.empty() &&
28e407b8 7567 mds->get_nodeid() != 0) {
7c673cae
FG
7568 dout(7) << "looking for subtrees to export to mds0" << dendl;
7569 list<CDir*> ls;
7570 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7571 it != subtrees.end();
7572 ++it) {
7573 CDir *dir = it->first;
7574 if (dir->get_inode()->is_mdsdir())
7575 continue;
7576 if (dir->is_auth()) {
7577 num_auth_subtree++;
7578 if (dir->is_frozen() ||
7579 dir->is_freezing() ||
7580 dir->is_ambiguous_dir_auth() ||
7581 dir->state_test(CDir::STATE_EXPORTING))
7582 continue;
7583 ls.push_back(dir);
7584 }
7585 }
28e407b8
AA
7586
7587 migrator->clear_export_queue();
7c673cae
FG
7588 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7589 CDir *dir = *p;
7590 mds_rank_t dest = dir->get_inode()->authority().first;
7591 if (dest > 0 && !mds->mdsmap->is_active(dest))
7592 dest = 0;
7593 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7594 migrator->export_dir_nicely(dir, dest);
7595 }
7596 }
7597
28e407b8
AA
7598 if (!strays_all_exported) {
7599 dout(7) << "waiting for strays to migrate" << dendl;
7600 return false;
7601 }
7602
7c673cae 7603 if (num_auth_subtree > 0) {
28e407b8 7604 assert(mds->get_nodeid() > 0);
7c673cae
FG
7605 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7606 show_subtrees();
7607 return false;
7608 }
7609
7610 // close out any sessions (and open files!) before we try to trim the log, etc.
7611 if (mds->sessionmap.have_unclosed_sessions()) {
7612 if (!mds->server->terminating_sessions)
7613 mds->server->terminate_sessions();
7614 return false;
7615 }
7616
28e407b8
AA
7617 // Fully trim the log so that all objects in cache are clean and may be
 7618 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
 7619 // trim the log in a way that guarantees the cache eventually becomes clean.
7620 mds->mdlog->trim(0);
7621 if (mds->mdlog->get_num_segments() > 1) {
7622 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7623 return false;
7624 }
7625
7626 // drop our reference to our stray dir inode
7627 for (int i = 0; i < NUM_STRAY; ++i) {
7628 if (strays[i] &&
7629 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7630 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7631 strays[i]->put(CInode::PIN_STRAY);
7632 strays[i]->put_stickydirs();
7633 }
7634 }
7635
7c673cae
FG
7636 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7637 if (mydir && !mydir->is_subtree_root())
7638 mydir = NULL;
7639
7640 // subtrees map not empty yet?
7641 if (subtrees.size() > (mydir ? 1 : 0)) {
7642 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7643 show_subtrees();
7644 migrator->show_importing();
7645 migrator->show_exporting();
7646 if (!migrator->is_importing() && !migrator->is_exporting())
7647 show_cache();
7648 return false;
7649 }
7650 assert(!migrator->is_exporting());
7651 assert(!migrator->is_importing());
7652
181888fb
FG
7653 if ((myin && myin->is_auth_pinned()) ||
7654 (mydir && mydir->is_auth_pinned())) {
7655 dout(7) << "still have auth pinned objects" << dendl;
7656 return false;
7657 }
7658
7c673cae
FG
7659 // (only do this once!)
7660 if (!mds->mdlog->is_capped()) {
7661 dout(7) << "capping the log" << dendl;
7662 mds->mdlog->cap();
7663 mds->mdlog->trim();
7664 }
7665
7666 if (!mds->mdlog->empty()) {
7667 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7668 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7669 return false;
7670 }
7671
7672 if (!did_shutdown_log_cap) {
7673 // flush journal header
7674 dout(7) << "writing header for (now-empty) journal" << dendl;
7675 assert(mds->mdlog->empty());
7676 mds->mdlog->write_head(0);
7677 // NOTE: filer active checker below will block us until this completes.
7678 did_shutdown_log_cap = true;
7679 return false;
7680 }
7681
7682 // filer active?
7683 if (mds->objecter->is_active()) {
7684 dout(7) << "objecter still active" << dendl;
7685 mds->objecter->dump_active();
7686 return false;
7687 }
7688
7689 // trim what we can from the cache
31f18b77
FG
7690 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7691 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7692 show_cache();
7693 //dump();
7694 return false;
7695 }
31f18b77
FG
7696
7697 // make mydir subtree go away
7698 if (mydir) {
7699 if (mydir->get_num_ref() > 1) { // subtree pin
7700 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7701 show_cache();
7702 return false;
7703 }
7704
7705 remove_subtree(mydir);
7706 myin->close_dirfrag(mydir->get_frag());
7707 }
7708 assert(subtrees.empty());
7709
1adf2230 7710 if (myin) {
31f18b77 7711 remove_inode(myin);
1adf2230
AA
7712 assert(!myin);
7713 }
7714
7c673cae
FG
7715 // done!
7716 dout(2) << "shutdown done." << dendl;
7717 return true;
7718}
7719
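// Rank 0 keeps its strays; other ranks migrate their stray dentries to rank 0
// and return true only once every stray dirfrag is complete and empty of
// entries still needing fetch or migration.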
7720bool MDCache::shutdown_export_strays()
7721{
7722 if (mds->get_nodeid() == 0)
7723 return true;
7724
7725 dout(10) << "shutdown_export_strays" << dendl;
7726
7727 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7728
7729 bool done = true;
7730
7731 list<CDir*> dfs;
7732 for (int i = 0; i < NUM_STRAY; ++i) {
28e407b8
AA
7733 if (!strays[i] ||
7734 !strays[i]->state_test(CInode::STATE_STRAYPINNED))
7c673cae 7735 continue;
7c673cae
FG
7736 strays[i]->get_dirfrags(dfs);
7737 }
7738
7739 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7740 dfs_i != dfs.end(); ++dfs_i)
7741 {
7742 CDir *dir = *dfs_i;
7743
7744 if (!dir->is_complete()) {
7745 dir->fetch(0);
7746 done = false;
7747 if (!mds0_active)
7748 break;
7749 }
7750
94b18763
FG
7751 for (auto &p : dir->items) {
7752 CDentry *dn = p.second;
28e407b8 7753 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7c673cae
FG
7754 if (dnl->is_null())
7755 continue;
7756 done = false;
7757 if (!mds0_active)
7758 break;
7759
7760 if (dn->state_test(CDentry::STATE_PURGING)) {
7761 // Don't try to migrate anything that is actually
7762 // being purged right now
7763 continue;
7764 }
7765
7766 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7767 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7768 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7769 } else {
7770 dout(10) << "already exporting " << *dn << dendl;
7771 }
7772 }
7773 }
7774
7775 return done;
7776}
7777
7778// ========= messaging ==============
7779
7780/* This function DOES put the passed message before returning */
7781void MDCache::dispatch(Message *m)
7782{
7783 switch (m->get_type()) {
7784
7785 // RESOLVE
7786 case MSG_MDS_RESOLVE:
7787 handle_resolve(static_cast<MMDSResolve*>(m));
7788 break;
7789 case MSG_MDS_RESOLVEACK:
7790 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7791 break;
7792
7793 // REJOIN
7794 case MSG_MDS_CACHEREJOIN:
7795 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7796 break;
7797
7798 case MSG_MDS_DISCOVER:
7799 handle_discover(static_cast<MDiscover*>(m));
7800 break;
7801 case MSG_MDS_DISCOVERREPLY:
7802 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7803 break;
7804
7805 case MSG_MDS_DIRUPDATE:
7806 handle_dir_update(static_cast<MDirUpdate*>(m));
7807 break;
7808
7809 case MSG_MDS_CACHEEXPIRE:
7810 handle_cache_expire(static_cast<MCacheExpire*>(m));
7811 break;
7812
7813 case MSG_MDS_DENTRYLINK:
7814 handle_dentry_link(static_cast<MDentryLink*>(m));
7815 break;
7816 case MSG_MDS_DENTRYUNLINK:
7817 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7818 break;
7819
7820 case MSG_MDS_FRAGMENTNOTIFY:
7821 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7822 break;
7823
7824 case MSG_MDS_FINDINO:
7825 handle_find_ino(static_cast<MMDSFindIno *>(m));
7826 break;
7827 case MSG_MDS_FINDINOREPLY:
7828 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7829 break;
7830
7831 case MSG_MDS_OPENINO:
7832 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7833 break;
7834 case MSG_MDS_OPENINOREPLY:
7835 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7836 break;
7837
7838 default:
7839 derr << "cache unknown message " << m->get_type() << dendl;
7840 assert(0 == "cache unknown message");
7841 }
7842}
7843
7844MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7845{
7846 if (mdr) {
7847 dout(20) << "_get_waiter retryrequest" << dendl;
7848 return new C_MDS_RetryRequest(this, mdr);
7849 } else if (req) {
7850 dout(20) << "_get_waiter retrymessage" << dendl;
7851 return new C_MDS_RetryMessage(mds, req);
7852 } else {
7853 return fin;
7854 }
7855}
7856
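// Return value convention: 0 = success, 1 = the caller has been queued as a
// waiter (retry later), 2 = the request was forwarded to another MDS,
// negative errno on failure.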
7857int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7858 const filepath& path, // what
7859 vector<CDentry*> *pdnvec, // result
7860 CInode **pin,
7861 int onfail)
7862{
7863 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7864 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7865 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
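  // onfail selects the fallback when metadata isn't local: DISCOVER (and
  // DISCOVERXLOCK, which additionally tolerates a null dentry at the tail)
  // fetch replicas from the auth MDS, while FORWARD redirects the request
  // to the auth MDS.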
7866
7867 assert(mdr || req || fin);
7868 assert(!forward || mdr || req); // forward requires a request
7869
7870 snapid_t snapid = CEPH_NOSNAP;
7871 if (mdr)
7872 mdr->snapid = snapid;
7873
7874 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7875
7876 if (mds->logger) mds->logger->inc(l_mds_traverse);
7877
7878 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7879 CInode *cur = get_inode(path.get_ino());
7880 if (cur == NULL) {
7881 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7882 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7883 else {
7884 //ceph_abort(); // hrm.. broken
7885 return -ESTALE;
7886 }
7887 return 1;
7888 }
7889 if (cur->state_test(CInode::STATE_PURGING))
7890 return -ESTALE;
7891
7892 // make sure snaprealm are open...
7893 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7894 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7895 return 1;
7896 }
7897
7898 // start trace
7899 if (pdnvec)
7900 pdnvec->clear();
7901 if (pin)
7902 *pin = cur;
7903
7904 unsigned depth = 0;
7905 while (depth < path.depth()) {
7906 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7907 << "' snapid " << snapid << dendl;
7908
7909 if (!cur->is_dir()) {
7910 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7911 return -ENOTDIR;
7912 }
7913
7914 // walk into snapdir?
7915 if (path[depth].length() == 0) {
7916 dout(10) << "traverse: snapdir" << dendl;
7917 if (!mdr)
7918 return -EINVAL;
7919 snapid = CEPH_SNAPDIR;
7920 mdr->snapid = snapid;
7921 depth++;
7922 continue;
7923 }
7924 // walk thru snapdir?
7925 if (snapid == CEPH_SNAPDIR) {
7926 if (!mdr)
7927 return -EINVAL;
7928 SnapRealm *realm = cur->find_snaprealm();
7929 snapid = realm->resolve_snapname(path[depth], cur->ino());
7930 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7931 if (!snapid)
7932 return -ENOENT;
7933 mdr->snapid = snapid;
7934 depth++;
7935 continue;
7936 }
7937
7938 // open dir
7939 frag_t fg = cur->pick_dirfrag(path[depth]);
7940 CDir *curdir = cur->get_dirfrag(fg);
7941 if (!curdir) {
7942 if (cur->is_auth()) {
7943 // parent dir frozen_dir?
7944 if (cur->is_frozen()) {
7945 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7946 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7947 return 1;
7948 }
7949 curdir = cur->get_or_open_dirfrag(this, fg);
7950 } else {
7951 // discover?
7952 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7953 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7954 null_okay);
7955 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7956 return 1;
7957 }
7958 }
7959 assert(curdir);
7960
7961#ifdef MDS_VERIFY_FRAGSTAT
7962 if (curdir->is_complete())
7963 curdir->verify_fragstat();
7964#endif
7965
7966 // frozen?
7967 /*
7968 if (curdir->is_frozen()) {
7969 // doh!
7970 // FIXME: traverse is allowed?
7971 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7972 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7973 if (onfinish) delete onfinish;
7974 return 1;
7975 }
7976 */
7977
7978 // Before doing dirfrag->dn lookup, compare with DamageTable's
7979 // record of which dentries were unreadable
7980 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7981 dout(4) << "traverse: stopped lookup at damaged dentry "
7982 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7983 return -EIO;
7984 }
7985
7986 // dentry
7987 CDentry *dn = curdir->lookup(path[depth], snapid);
7988 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7989
7990 // null and last_bit and xlocked by me?
7991 if (dnl && dnl->is_null() && null_okay) {
7992 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7993 if (pdnvec)
7994 pdnvec->push_back(dn);
7995 if (pin)
7996 *pin = 0;
7997 break; // done!
7998 }
7999
8000 if (dnl &&
8001 dn->lock.is_xlocked() &&
8002 dn->lock.get_xlock_by() != mdr &&
8003 !dn->lock.can_read(client) &&
8004 (dnl->is_null() || forward)) {
8005 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
8006 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8007 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
8008 mds->mdlog->flush();
8009 return 1;
8010 }
8011
8012 // can we conclude ENOENT?
8013 if (dnl && dnl->is_null()) {
8014 if (dn->lock.can_read(client) ||
8015 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8016 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8017 if (pdnvec) {
8018 if (depth == path.depth() - 1)
8019 pdnvec->push_back(dn);
8020 else
 8021 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8022 }
8023 return -ENOENT;
8024 } else {
8025 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
8026 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8027 return 1;
8028 }
8029 }
8030
8031 if (dnl && !dnl->is_null()) {
8032 CInode *in = dnl->get_inode();
8033
8034 // do we have inode?
8035 if (!in) {
8036 assert(dnl->is_remote());
8037 // do i have it?
8038 in = get_inode(dnl->get_remote_ino());
8039 if (in) {
8040 dout(7) << "linking in remote in " << *in << dendl;
8041 dn->link_remote(dnl, in);
8042 } else {
8043 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8044 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8045 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8046 dout(4) << "traverse: remote dentry points to damaged ino "
8047 << *dn << dendl;
8048 return -EIO;
8049 }
8050 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8051 (null_okay && depth == path.depth() - 1));
8052 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8053 return 1;
8054 }
8055 }
8056
8057 cur = in;
8058 // make sure snaprealm are open...
8059 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8060 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8061 return 1;
8062 }
8063
8064 // add to trace, continue.
8065 touch_inode(cur);
8066 if (pdnvec)
8067 pdnvec->push_back(dn);
8068 if (pin)
8069 *pin = cur;
8070 depth++;
8071 continue;
8072 }
8073
8074
8075 // MISS. dentry doesn't exist.
8076 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8077
8078 if (curdir->is_auth()) {
8079 // dentry is mine.
8080 if (curdir->is_complete() ||
8081 (snapid == CEPH_NOSNAP &&
8082 curdir->has_bloom() &&
8083 !curdir->is_in_bloom(path[depth]))){
8084 // file not found
8085 if (pdnvec) {
8086 // instantiate a null dn?
8087 if (depth < path.depth()-1){
8088 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8089 dn = NULL;
8090 } else if (dn) {
8091 ceph_abort(); // should have fallen out in ->is_null() check above
8092 } else if (curdir->is_frozen()) {
8093 dout(20) << " not adding null to frozen dir " << dendl;
8094 } else if (snapid < CEPH_MAXSNAP) {
8095 dout(20) << " not adding null for snapid " << snapid << dendl;
8096 } else {
8097 // create a null dentry
8098 dn = curdir->add_null_dentry(path[depth]);
8099 dout(20) << " added null " << *dn << dendl;
8100 }
8101 if (dn)
8102 pdnvec->push_back(dn);
8103 else
 8104 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8105 }
8106 return -ENOENT;
8107 } else {
8108
8109 // Check DamageTable for missing fragments before trying to fetch
8110 // this
8111 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8112 dout(4) << "traverse: damaged dirfrag " << *curdir
8113 << ", blocking fetch" << dendl;
8114 return -EIO;
8115 }
8116
8117 // directory isn't complete; reload
8118 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8119 touch_inode(cur);
8120 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8121 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8122 return 1;
8123 }
8124 } else {
8125 // dirfrag/dentry is not mine.
8126 mds_authority_t dauth = curdir->authority();
8127
8128 if (forward &&
8129 snapid && mdr && mdr->client_request &&
8130 (int)depth < mdr->client_request->get_num_fwd()) {
8131 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8132 << " < fwd " << mdr->client_request->get_num_fwd()
8133 << ", discovering instead of forwarding" << dendl;
8134 discover = true;
8135 }
8136
8137 if ((discover || null_okay)) {
8138 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8139 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8140 null_okay);
8141 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8142 return 1;
8143 }
8144 if (forward) {
8145 // forward
8146 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8147
8148 if (curdir->is_ambiguous_auth()) {
8149 // wait
8150 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8151 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8152 return 1;
8153 }
8154
8155 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8156
8157 if (mdr)
8158 request_forward(mdr, dauth.first);
8159 else
8160 mds->forward_message_mds(req, dauth.first);
8161
8162 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8163 assert(fin == NULL);
8164 return 2;
8165 }
8166 }
8167
8168 ceph_abort(); // i shouldn't get here
8169 }
8170
8171 // success.
8172 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8173 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8174 if (mdr)
8175 assert(mdr->snapid == snapid);
8176 return 0;
8177}
8178
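// Purely in-memory variant of path_traverse: returns the inode only if every
// path component is already in cache, otherwise NULL (no I/O, no waiting).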
8179CInode *MDCache::cache_traverse(const filepath& fp)
8180{
8181 dout(10) << "cache_traverse " << fp << dendl;
8182
8183 CInode *in;
8184 if (fp.get_ino())
8185 in = get_inode(fp.get_ino());
8186 else
8187 in = root;
8188 if (!in)
8189 return NULL;
8190
8191 for (unsigned i = 0; i < fp.depth(); i++) {
94b18763 8192 boost::string_view dname = fp[i];
7c673cae
FG
8193 frag_t fg = in->pick_dirfrag(dname);
8194 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8195 CDir *curdir = in->get_dirfrag(fg);
8196 if (!curdir)
8197 return NULL;
8198 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8199 if (!dn)
8200 return NULL;
8201 in = dn->get_linkage()->get_inode();
8202 if (!in)
8203 return NULL;
8204 }
8205 dout(10) << " got " << *in << dendl;
8206 return in;
8207}
8208
8209
8210/**
 8211 * open_remote_dirfrag -- open up a remote dirfrag
8212 *
8213 * @param diri base inode
8214 * @param approxfg approximate fragment.
8215 * @param fin completion callback
8216 */
8217void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8218{
8219 dout(10) << "open_remote_dir on " << *diri << dendl;
7c673cae
FG
8220 assert(diri->is_dir());
8221 assert(!diri->is_auth());
8222 assert(diri->get_dirfrag(approxfg) == 0);
8223
224ce89b 8224 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8225}
8226
8227
8228/**
8229 * get_dentry_inode - get or open inode
8230 *
8231 * @param dn the dentry
8232 * @param mdr current request
8233 *
8234 * will return inode for primary, or link up/open up remote link's inode as necessary.
8235 * If it's not available right now, puts mdr on wait list and returns null.
8236 */
8237CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8238{
8239 CDentry::linkage_t *dnl;
8240 if (projected)
8241 dnl = dn->get_projected_linkage();
8242 else
8243 dnl = dn->get_linkage();
8244
8245 assert(!dnl->is_null());
8246
8247 if (dnl->is_primary())
8248 return dnl->inode;
8249
8250 assert(dnl->is_remote());
8251 CInode *in = get_inode(dnl->get_remote_ino());
8252 if (in) {
8253 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8254 dn->link_remote(dnl, in);
8255 return in;
8256 } else {
8257 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8258 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8259 return 0;
8260 }
8261}
8262
8263struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8264 CDentry *dn;
8265 inodeno_t ino;
8266 MDSInternalContextBase *onfinish;
8267 bool want_xlocked;
8268 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
31f18b77
FG
8269 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8270 dn->get(MDSCacheObject::PIN_PTRWAITER);
8271 }
7c673cae
FG
8272 void finish(int r) override {
8273 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8274 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8275 }
8276};
8277
8278void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8279{
8280 dout(10) << "open_remote_dentry " << *dn << dendl;
8281 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8282 inodeno_t ino = dnl->get_remote_ino();
8283 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8284 open_ino(ino, pool,
8285 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8286}
8287
8288void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8289 bool want_xlocked, int r)
8290{
8291 if (r < 0) {
31f18b77
FG
8292 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8293 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8294 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8295 dn->state_set(CDentry::STATE_BADREMOTEINO);
8296
8297 std::string path;
8298 CDir *dir = dn->get_dir();
8299 if (dir) {
31f18b77 8300 dir->get_inode()->make_path_string(path);
94b18763
FG
8301 path += "/";
8302 path += std::string(dn->get_name());
7c673cae
FG
8303 }
8304
31f18b77 8305 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8306 if (fatal) {
31f18b77
FG
8307 mds->damaged();
8308 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8309 }
31f18b77
FG
8310 } else {
8311 r = 0;
8312 }
7c673cae
FG
8313 }
8314 fin->complete(r < 0 ? r : 0);
8315}
8316
8317
8318void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8319{
8320 // empty trace if we're a base inode
8321 if (in->is_base())
8322 return;
8323
8324 CInode *parent = in->get_parent_inode();
8325 assert(parent);
8326 make_trace(trace, parent);
8327
8328 CDentry *dn = in->get_parent_dn();
8329 dout(15) << "make_trace adding " << *dn << dendl;
8330 trace.push_back(dn);
8331}
8332
8333
8334// -------------------------------------------------------------------------------
8335// Open inode by inode number
8336
8337class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8338 inodeno_t ino;
8339 public:
8340 bufferlist bl;
8341 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8342 MDCacheIOContext(c), ino(i) {}
8343 void finish(int r) override {
8344 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8345 }
8346};
8347
8348struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8349 inodeno_t ino;
8350 MMDSOpenIno *msg;
8351 bool parent;
8352 public:
8353 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8354 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8355 void finish(int r) override {
8356 if (r < 0 && !parent)
8357 r = -EAGAIN;
8358 if (msg) {
8359 mdcache->handle_open_ino(msg, r);
8360 return;
8361 }
8362 assert(mdcache->opening_inodes.count(ino));
8363 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8364 }
8365};
8366
8367struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8368 inodeno_t ino;
8369 public:
8370 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8371 void finish(int r) override {
8372 mdcache->_open_ino_parent_opened(ino, r);
8373 }
8374};
8375
8376void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8377{
8378 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8379
8380 assert(opening_inodes.count(ino));
8381 open_ino_info_t& info = opening_inodes[ino];
8382
8383 CInode *in = get_inode(ino);
8384 if (in) {
8385 dout(10) << " found cached " << *in << dendl;
8386 open_ino_finish(ino, info, in->authority().first);
8387 return;
8388 }
8389
8390 inode_backtrace_t backtrace;
8391 if (err == 0) {
8392 try {
8393 ::decode(backtrace, bl);
8394 } catch (const buffer::error &decode_exc) {
 8395 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8396 << std::dec << ": " << decode_exc << dendl;
8397 open_ino_finish(ino, info, -EIO);
8398 return;
8399 }
8400 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8401 dout(10) << " old object in pool " << info.pool
8402 << ", retrying pool " << backtrace.pool << dendl;
8403 info.pool = backtrace.pool;
8404 C_IO_MDC_OpenInoBacktraceFetched *fin =
8405 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8406 fetch_backtrace(ino, info.pool, fin->bl,
8407 new C_OnFinisher(fin, mds->finisher));
8408 return;
8409 }
8410 } else if (err == -ENOENT) {
8411 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8412 if (info.pool != meta_pool) {
8413 dout(10) << " no object in pool " << info.pool
8414 << ", retrying pool " << meta_pool << dendl;
8415 info.pool = meta_pool;
8416 C_IO_MDC_OpenInoBacktraceFetched *fin =
8417 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8418 fetch_backtrace(ino, info.pool, fin->bl,
8419 new C_OnFinisher(fin, mds->finisher));
8420 return;
8421 }
8422 err = 0; // backtrace.ancestors.empty() is checked below
8423 }
8424
8425 if (err == 0) {
8426 if (backtrace.ancestors.empty()) {
8427 dout(10) << " got empty backtrace " << dendl;
8428 err = -EIO;
8429 } else if (!info.ancestors.empty()) {
8430 if (info.ancestors[0] == backtrace.ancestors[0]) {
8431 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8432 err = -EINVAL;
8433 } else {
8434 info.last_err = 0;
8435 }
8436 }
8437 }
8438 if (err) {
8439 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8440 if (info.last_err)
8441 err = info.last_err;
8442 open_ino_finish(ino, info, err);
8443 return;
8444 }
8445
8446 dout(10) << " got backtrace " << backtrace << dendl;
8447 info.ancestors = backtrace.ancestors;
8448
8449 _open_ino_traverse_dir(ino, info, 0);
8450}
8451
8452void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8453{
8454 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8455
8456 assert(opening_inodes.count(ino));
8457 open_ino_info_t& info = opening_inodes[ino];
8458
8459 CInode *in = get_inode(ino);
8460 if (in) {
8461 dout(10) << " found cached " << *in << dendl;
8462 open_ino_finish(ino, info, in->authority().first);
8463 return;
8464 }
8465
8466 if (ret == mds->get_nodeid()) {
8467 _open_ino_traverse_dir(ino, info, 0);
8468 } else {
8469 if (ret >= 0) {
8470 mds_rank_t checked_rank = mds_rank_t(ret);
8471 info.check_peers = true;
8472 info.auth_hint = checked_rank;
8473 info.checked.erase(checked_rank);
8474 }
8475 do_open_ino(ino, info, ret);
8476 }
8477}
8478
8479void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8480{
8481 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8482
8483 CInode *in = get_inode(ino);
8484 if (in) {
8485 dout(10) << " found cached " << *in << dendl;
8486 open_ino_finish(ino, info, in->authority().first);
8487 return;
8488 }
8489
8490 if (ret) {
8491 do_open_ino(ino, info, ret);
8492 return;
8493 }
8494
8495 mds_rank_t hint = info.auth_hint;
8496 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8497 info.discover, info.want_xlocked, &hint);
8498 if (ret > 0)
8499 return;
8500 if (hint != mds->get_nodeid())
8501 info.auth_hint = hint;
8502 do_open_ino(ino, info, ret);
8503}
8504
8505void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8506{
8507 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8508 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8509 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8510}
8511
8512int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8513 vector<inode_backpointer_t>& ancestors,
8514 bool discover, bool want_xlocked, mds_rank_t *hint)
8515{
8516 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8517 int err = 0;
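  // Walk the ancestor chain recorded in the backtrace (closest parent first),
  // opening or discovering each dirfrag along the way; errors are only
  // reported for the immediate parent (i == 0), otherwise the caller falls
  // back to other strategies.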
8518 for (unsigned i = 0; i < ancestors.size(); i++) {
8519 CInode *diri = get_inode(ancestors[i].dirino);
8520
8521 if (!diri) {
8522 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8523 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8524 return 1;
8525 }
8526 continue;
8527 }
8528
8529 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8530 CDir *dir = diri->get_parent_dir();
8531 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8532 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8533 dir = dir->get_inode()->get_parent_dir();
8534 _open_ino_fetch_dir(ino, m, dir, i == 0);
8535 return 1;
8536 }
8537
8538 if (!diri->is_dir()) {
8539 dout(10) << " " << *diri << " is not dir" << dendl;
8540 if (i == 0)
8541 err = -ENOTDIR;
8542 break;
8543 }
8544
8545 string &name = ancestors[i].dname;
8546 frag_t fg = diri->pick_dirfrag(name);
8547 CDir *dir = diri->get_dirfrag(fg);
8548 if (!dir) {
8549 if (diri->is_auth()) {
8550 if (diri->is_frozen()) {
8551 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8552 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8553 return 1;
8554 }
8555 dir = diri->get_or_open_dirfrag(this, fg);
8556 } else if (discover) {
8557 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8558 return 1;
8559 }
8560 }
8561 if (dir) {
8562 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8563 CDentry *dn = dir->lookup(name);
8564 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8565 if (dir->is_auth()) {
8566 if (dnl && dnl->is_primary() &&
8567 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8568 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8569 _open_ino_fetch_dir(ino, m, dir, i == 0);
8570 return 1;
8571 }
8572
8573 if (!dnl && !dir->is_complete() &&
8574 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8575 dout(10) << " fetching incomplete " << *dir << dendl;
8576 _open_ino_fetch_dir(ino, m, dir, i == 0);
8577 return 1;
8578 }
8579
8580 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8581 if (i == 0)
8582 err = -ENOENT;
8583 } else if (discover) {
8584 if (!dnl) {
8585 filepath path(name, 0);
8586 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8587 (i == 0 && want_xlocked));
8588 return 1;
8589 }
8590 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8591 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8592 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8593 return 1;
8594 }
8595 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8596 if (i == 0)
8597 err = -ENOENT;
8598 }
8599 }
8600 if (hint && i == 0)
8601 *hint = dir ? dir->authority().first : diri->authority().first;
8602 break;
8603 }
8604 return err;
8605}
8606
8607void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8608{
8609 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8610
8611 list<MDSInternalContextBase*> waiters;
8612 waiters.swap(info.waiters);
8613 opening_inodes.erase(ino);
8614 finish_contexts(g_ceph_context, waiters, ret);
8615}
8616
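/*
 * Summary of the branches in do_open_ino() below: on a hard error the
 * lookup state is reset and we fall back to re-checking peers; otherwise
 * we either query another active MDS rank (do_open_ino_peer), fetch the
 * inode's backtrace object from its data pool, or recursively open the
 * parent directory inode named by the first backtrace ancestor.
 */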
8617void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8618{
8619 if (err < 0 && err != -EAGAIN) {
8620 info.checked.clear();
8621 info.checking = MDS_RANK_NONE;
8622 info.check_peers = true;
8623 info.fetch_backtrace = true;
8624 if (info.discover) {
8625 info.discover = false;
8626 info.ancestors.clear();
8627 }
8628 if (err != -ENOENT && err != -ENOTDIR)
8629 info.last_err = err;
8630 }
8631
8632 if (info.check_peers || info.discover) {
8633 if (info.discover) {
8634 // got backtrace from peer, but failed to find inode. re-check peers
8635 info.discover = false;
8636 info.ancestors.clear();
8637 info.checked.clear();
8638 }
8639 info.check_peers = false;
8640 info.checking = MDS_RANK_NONE;
8641 do_open_ino_peer(ino, info);
8642 } else if (info.fetch_backtrace) {
8643 info.check_peers = true;
8644 info.fetch_backtrace = false;
8645 info.checking = mds->get_nodeid();
8646 info.checked.clear();
8647 C_IO_MDC_OpenInoBacktraceFetched *fin =
8648 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8649 fetch_backtrace(ino, info.pool, fin->bl,
8650 new C_OnFinisher(fin, mds->finisher));
8651 } else {
8652 assert(!info.ancestors.empty());
8653 info.checking = mds->get_nodeid();
8654 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8655 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8656 }
8657}
8658
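/*
 * do_open_ino_peer() picks the next MDS rank to ask about the ino: the
 * auth hint if that rank is active, otherwise the first active rank not
 * yet checked.  If every active peer has been checked, the result feeds
 * back into do_open_ino(); if some ranks are not yet active we simply
 * wait (kick_open_ino_peers() restarts us later).
 */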
8659void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8660{
8661 set<mds_rank_t> all, active;
8662 mds->mdsmap->get_mds_set(all);
8663 if (mds->get_state() == MDSMap::STATE_REJOIN)
8664 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
8665 else
8666 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
8667
8668 dout(10) << "do_open_ino_peer " << ino << " active " << active
8669 << " all " << all << " checked " << info.checked << dendl;
8670
8671 mds_rank_t peer = MDS_RANK_NONE;
8672 if (info.auth_hint >= 0) {
8673 if (active.count(info.auth_hint)) {
8674 peer = info.auth_hint;
8675 info.auth_hint = MDS_RANK_NONE;
8676 }
8677 } else {
8678 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8679 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8680 peer = *p;
8681 break;
8682 }
8683 }
8684 if (peer < 0) {
8685 all.erase(mds->get_nodeid());
8686 if (all != info.checked) {
8687 dout(10) << " waiting for more peers to be active" << dendl;
8688 } else {
8689 dout(10) << " all MDS peers have been checked " << dendl;
8690 do_open_ino(ino, info, 0);
8691 }
8692 } else {
8693 info.checking = peer;
8694 vector<inode_backpointer_t> *pa = NULL;
8695 // got backtrace from peer or backtrace just fetched
8696 if (info.discover || !info.fetch_backtrace)
8697 pa = &info.ancestors;
8698 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8699 }
8700}
8701
8702void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8703{
8704 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8705 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8706 m->put();
8707 return;
8708 }
8709
8710 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8711
8712 inodeno_t ino = m->ino;
8713 MMDSOpenInoReply *reply;
8714 CInode *in = get_inode(ino);
8715 if (in) {
8716 dout(10) << " have " << *in << dendl;
8717 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8718 if (in->is_auth()) {
8719 touch_inode(in);
8720 while (1) {
8721 CDentry *pdn = in->get_parent_dn();
8722 if (!pdn)
8723 break;
8724 CInode *diri = pdn->get_dir()->get_inode();
8725 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
8726 in->inode.version));
8727 in = diri;
8728 }
8729 } else {
8730 reply->hint = in->authority().first;
8731 }
8732 } else if (err < 0) {
8733 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8734 } else {
8735 mds_rank_t hint = MDS_RANK_NONE;
8736 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8737 if (ret > 0)
8738 return;
8739 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8740 }
8741 m->get_connection()->send_message(reply);
8742 m->put();
8743}
8744
8745void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8746{
8747 dout(10) << "handle_open_ino_reply " << *m << dendl;
8748
8749 inodeno_t ino = m->ino;
8750 mds_rank_t from = mds_rank_t(m->get_source().num());
8751 auto it = opening_inodes.find(ino);
8752 if (it != opening_inodes.end() && it->second.checking == from) {
8753 open_ino_info_t& info = it->second;
8754 info.checking = MDS_RANK_NONE;
8755 info.checked.insert(from);
8756
8757 CInode *in = get_inode(ino);
8758 if (in) {
8759 dout(10) << " found cached " << *in << dendl;
8760 open_ino_finish(ino, info, in->authority().first);
8761 } else if (!m->ancestors.empty()) {
8762 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8763 if (!info.want_replica) {
8764 open_ino_finish(ino, info, from);
8765 m->put();
8766 return;
8767 }
8768
8769 info.ancestors = m->ancestors;
8770 info.auth_hint = from;
8771 info.checking = mds->get_nodeid();
8772 info.discover = true;
8773 _open_ino_traverse_dir(ino, info, 0);
8774 } else if (m->error) {
8775 dout(10) << " error " << m->error << " from mds." << from << dendl;
8776 do_open_ino(ino, info, m->error);
8777 } else {
8778 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8779 info.auth_hint = m->hint;
8780 info.checked.erase(m->hint);
8781 }
8782 do_open_ino_peer(ino, info);
8783 }
8784 }
8785 m->put();
8786}
8787
8788void MDCache::kick_open_ino_peers(mds_rank_t who)
8789{
8790 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8791
8792 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8793 p != opening_inodes.end();
8794 ++p) {
8795 open_ino_info_t& info = p->second;
8796 if (info.checking == who) {
8797 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8798 info.checking = MDS_RANK_NONE;
8799 do_open_ino_peer(p->first, info);
8800 } else if (info.checking == MDS_RANK_NONE) {
8801 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8802 do_open_ino_peer(p->first, info);
8803 }
8804 }
8805}
8806
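/*
 * Public entry point: bring inode 'ino' into cache and complete 'fin'.
 * 'pool' is where the backtrace object lives (the default file layout's
 * pool if negative).  Concurrent callers for the same ino share a single
 * open_ino_info_t; want_replica/want_xlocked can only widen a request
 * that is already in flight.
 */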
8807void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8808 bool want_replica, bool want_xlocked)
8809{
8810 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8811 << want_replica << dendl;
8812
8813 if (opening_inodes.count(ino)) {
8814 open_ino_info_t& info = opening_inodes[ino];
8815 if (want_replica) {
8816 info.want_replica = true;
8817 if (want_xlocked && !info.want_xlocked) {
8818 if (!info.ancestors.empty()) {
8819 CInode *diri = get_inode(info.ancestors[0].dirino);
8820 if (diri) {
8821 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8822 CDir *dir = diri->get_dirfrag(fg);
8823 if (dir && !dir->is_auth()) {
8824 filepath path(info.ancestors[0].dname, 0);
8825 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8826 }
8827 }
8828 }
8829 info.want_xlocked = true;
8830 }
8831 }
8832 info.waiters.push_back(fin);
8833 } else {
8834 open_ino_info_t& info = opening_inodes[ino];
8835 info.want_replica = want_replica;
8836 info.want_xlocked = want_xlocked;
8837 info.tid = ++open_ino_last_tid;
8838 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8839 info.waiters.push_back(fin);
8840 do_open_ino(ino, info, 0);
8841 }
8842}
8843
8844/* ---------------------------- */
8845
8846/*
8847 * Search for a given inode on MDS peers; optionally start with the given node.
8848
8849
8850 TODO
8851 - recover from mds node failure / recovery
8852 - traverse path
8853
8854 */
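/*
 * A usage sketch (not taken from this file): a caller that only needs to
 * learn whether some other rank knows the inode could do
 *
 *   find_ino_peers(ino, new C_MDS_RetryRequest(this, mdr), hint);
 *
 * and retry once the context completes, or give up on -ESTALE when no
 * peer has it.
 */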
8855void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8856{
8857 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8858 CInode *in = get_inode(ino);
8859 if (in && in->state_test(CInode::STATE_PURGING)) {
8860 c->complete(-ESTALE);
8861 return;
8862 }
8863 assert(!in);
8864
8865 ceph_tid_t tid = ++find_ino_peer_last_tid;
8866 find_ino_peer_info_t& fip = find_ino_peer[tid];
8867 fip.ino = ino;
8868 fip.tid = tid;
8869 fip.fin = c;
8870 fip.hint = hint;
8871 _do_find_ino_peer(fip);
8872}
8873
8874void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8875{
8876 set<mds_rank_t> all, active;
8877 mds->mdsmap->get_mds_set(all);
8878 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
8879
8880 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8881 << " active " << active << " all " << all
8882 << " checked " << fip.checked
8883 << dendl;
8884
8885 mds_rank_t m = MDS_RANK_NONE;
8886 if (fip.hint >= 0) {
8887 m = fip.hint;
8888 fip.hint = MDS_RANK_NONE;
8889 } else {
8890 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8891 if (*p != mds->get_nodeid() &&
8892 fip.checked.count(*p) == 0) {
8893 m = *p;
8894 break;
8895 }
8896 }
8897 if (m == MDS_RANK_NONE) {
8898 all.erase(mds->get_nodeid());
8899 if (all != fip.checked) {
8900 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8901 } else {
8902 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8903 fip.fin->complete(-ESTALE);
8904 find_ino_peer.erase(fip.tid);
8905 }
8906 } else {
8907 fip.checking = m;
8908 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8909 }
8910}
8911
8912void MDCache::handle_find_ino(MMDSFindIno *m)
8913{
8914 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8915 m->put();
8916 return;
8917 }
8918
8919 dout(10) << "handle_find_ino " << *m << dendl;
8920 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8921 CInode *in = get_inode(m->ino);
8922 if (in) {
8923 in->make_path(r->path);
8924 dout(10) << " have " << r->path << " " << *in << dendl;
8925 }
8926 m->get_connection()->send_message(r);
8927 m->put();
8928}
8929
8930
8931void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8932{
8933 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8934 if (p != find_ino_peer.end()) {
8935 dout(10) << "handle_find_ino_reply " << *m << dendl;
8936 find_ino_peer_info_t& fip = p->second;
8937
8938 // success?
8939 if (get_inode(fip.ino)) {
8940 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8941 mds->queue_waiter(fip.fin);
8942 find_ino_peer.erase(p);
8943 m->put();
8944 return;
8945 }
8946
8947 mds_rank_t from = mds_rank_t(m->get_source().num());
8948 if (fip.checking == from)
8949 fip.checking = MDS_RANK_NONE;
8950 fip.checked.insert(from);
8951
8952 if (!m->path.empty()) {
8953 // we got a path!
8954 vector<CDentry*> trace;
8955 MDRequestRef null_ref;
8956 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8957 if (r > 0)
8958 return;
8959 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8960 << ", retrying" << dendl;
8961 fip.checked.clear();
8962 _do_find_ino_peer(fip);
8963 } else {
8964 // nope, continue.
8965 _do_find_ino_peer(fip);
8966 }
8967 } else {
8968 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8969 }
8970 m->put();
8971}
8972
8973void MDCache::kick_find_ino_peers(mds_rank_t who)
8974{
8975 // find_ino_peers requests we should move on from
8976 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8977 p != find_ino_peer.end();
8978 ++p) {
8979 find_ino_peer_info_t& fip = p->second;
8980 if (fip.checking == who) {
8981 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8982 fip.checking = MDS_RANK_NONE;
8983 _do_find_ino_peer(fip);
8984 } else if (fip.checking == MDS_RANK_NONE) {
8985 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8986 _do_find_ino_peer(fip);
8987 }
8988 }
8989}
8990
8991/* ---------------------------- */
8992
8993int MDCache::get_num_client_requests()
8994{
8995 int count = 0;
8996 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8997 p != active_requests.end();
8998 ++p) {
8999 MDRequestRef& mdr = p->second;
9000 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9001 count++;
9002 }
9003 return count;
9004}
9005
9006/* This function takes over the reference to the passed Message */
9007MDRequestRef MDCache::request_start(MClientRequest *req)
9008{
9009 // did we win a forward race against a slave?
9010 if (active_requests.count(req->get_reqid())) {
9011 MDRequestRef& mdr = active_requests[req->get_reqid()];
9012 assert(mdr);
9013 if (mdr->is_slave()) {
9014 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9015 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9016 } else {
9017 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9018 req->put();
9019 }
9020 return MDRequestRef();
9021 }
9022
9023 // register new client request
9024 MDRequestImpl::Params params;
9025 params.reqid = req->get_reqid();
9026 params.attempt = req->get_num_fwd();
9027 params.client_req = req;
9028 params.initiated = req->get_recv_stamp();
9029 params.throttled = req->get_throttle_stamp();
9030 params.all_read = req->get_recv_complete_stamp();
9031 params.dispatched = req->get_dispatch_stamp();
9032
9033 MDRequestRef mdr =
9034 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9035 active_requests[params.reqid] = mdr;
9036 mdr->set_op_stamp(req->get_stamp());
9037 dout(7) << "request_start " << *mdr << dendl;
9038 return mdr;
9039}
9040
9041MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9042{
9043 int by = m->get_source().num();
9044 MDRequestImpl::Params params;
9045 params.reqid = ri;
9046 params.attempt = attempt;
9047 params.triggering_slave_req = m;
9048 params.slave_to = by;
9049 params.initiated = m->get_recv_stamp();
9050 params.throttled = m->get_throttle_stamp();
9051 params.all_read = m->get_recv_complete_stamp();
9052 params.dispatched = m->get_dispatch_stamp();
9053 MDRequestRef mdr =
9054 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9055 assert(active_requests.count(mdr->reqid) == 0);
9056 active_requests[mdr->reqid] = mdr;
9057 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9058 return mdr;
9059}
9060
9061MDRequestRef MDCache::request_start_internal(int op)
9062{
9063 MDRequestImpl::Params params;
9064 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9065 params.reqid.tid = mds->issue_tid();
9066 params.initiated = ceph_clock_now();
9067 params.internal_op = op;
9068 MDRequestRef mdr =
9069 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9070
9071 assert(active_requests.count(mdr->reqid) == 0);
9072 active_requests[mdr->reqid] = mdr;
9073 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9074 return mdr;
9075}
9076
9077MDRequestRef MDCache::request_get(metareqid_t rid)
9078{
9079 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9080 assert(p != active_requests.end());
9081 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9082 return p->second;
9083}
9084
9085void MDCache::request_finish(MDRequestRef& mdr)
9086{
9087 dout(7) << "request_finish " << *mdr << dendl;
9088 mdr->mark_event("finishing request");
9089
9090 // slave finisher?
9091 if (mdr->has_more() && mdr->more()->slave_commit) {
9092 Context *fin = mdr->more()->slave_commit;
9093 mdr->more()->slave_commit = 0;
9094 int ret;
9095 if (mdr->aborted) {
9096 mdr->aborted = false;
9097 ret = -1;
9098 mdr->more()->slave_rolling_back = true;
9099 } else {
9100 ret = 0;
9101 mdr->committing = true;
9102 }
9103 fin->complete(ret); // this must re-call request_finish.
9104 return;
9105 }
9106
9107 switch(mdr->internal_op) {
9108 case CEPH_MDS_OP_FRAGMENTDIR:
9109 logger->inc(l_mdss_ireq_fragmentdir);
9110 break;
9111 case CEPH_MDS_OP_EXPORTDIR:
9112 logger->inc(l_mdss_ireq_exportdir);
9113 break;
9114 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9115 logger->inc(l_mdss_ireq_enqueue_scrub);
9116 break;
9117 case CEPH_MDS_OP_FLUSH:
9118 logger->inc(l_mdss_ireq_flush);
9119 break;
9120 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9121 logger->inc(l_mdss_ireq_fragstats);
9122 break;
9123 case CEPH_MDS_OP_REPAIR_INODESTATS:
9124 logger->inc(l_mdss_ireq_inodestats);
9125 break;
9126 }
9127
9128 request_cleanup(mdr);
9129}
9130
9131
9132void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9133{
9134 mdr->mark_event("forwarding request");
9135 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9136 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9137 << *mdr->client_request << dendl;
9138 mds->forward_message_mds(mdr->client_request, who);
9139 mdr->client_request = 0;
9140 if (mds->logger) mds->logger->inc(l_mds_forward);
9141 } else if (mdr->internal_op >= 0) {
9142 dout(10) << "request_forward on internal op; cancelling" << dendl;
9143 mdr->internal_op_finish->complete(-EXDEV);
9144 } else {
9145 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9146 << " was from mds" << dendl;
9147 }
9148 request_cleanup(mdr);
9149}
9150
9151
9152void MDCache::dispatch_request(MDRequestRef& mdr)
9153{
9154 if (mdr->client_request) {
9155 mds->server->dispatch_client_request(mdr);
9156 } else if (mdr->slave_request) {
9157 mds->server->dispatch_slave_request(mdr);
9158 } else {
9159 switch (mdr->internal_op) {
9160 case CEPH_MDS_OP_FRAGMENTDIR:
9161 dispatch_fragment_dir(mdr);
9162 break;
9163 case CEPH_MDS_OP_EXPORTDIR:
9164 migrator->dispatch_export_dir(mdr, 0);
9165 break;
9166 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9167 enqueue_scrub_work(mdr);
9168 break;
9169 case CEPH_MDS_OP_FLUSH:
9170 flush_dentry_work(mdr);
9171 break;
9172 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9173 repair_dirfrag_stats_work(mdr);
9174 break;
9175 case CEPH_MDS_OP_REPAIR_INODESTATS:
9176 repair_inode_stats_work(mdr);
9177 break;
9178 default:
9179 ceph_abort();
9180 }
9181 }
9182}
9183
9184
9185void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9186{
9187 if (!mdr->has_more())
9188 return;
9189
9190 // clean up slaves
9191 // (will implicitly drop remote dn pins)
9192 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9193 p != mdr->more()->slaves.end();
9194 ++p) {
9195 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9196 MMDSSlaveRequest::OP_FINISH);
9197
9198 if (mdr->killed && !mdr->committing) {
9199 r->mark_abort();
9200 } else if (mdr->more()->srcdn_auth_mds == *p &&
9201 mdr->more()->inode_import.length() > 0) {
9202 // information about rename imported caps
9203 r->inode_export.claim(mdr->more()->inode_import);
9204 }
9205
9206 mds->send_message_mds(r, *p);
9207 }
9208
9209 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9210 * implicitly. Note that we don't call the finishers -- there shouldn't
9211 * be any on a remote lock and the request finish wakes up all
9212 * the waiters anyway! */
9213 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9214 while (p != mdr->xlocks.end()) {
9215 if ((*p)->get_parent()->is_auth())
9216 ++p;
9217 else {
9218 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9219 << " on " << *(*p)->get_parent() << dendl;
9220 (*p)->put_xlock();
9221 mdr->locks.erase(*p);
9222 mdr->xlocks.erase(p++);
9223 }
9224 }
9225
9226 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9227 while (q != mdr->remote_wrlocks.end()) {
9228 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9229 << " on mds." << q->second
9230 << " on " << *(q->first)->get_parent() << dendl;
9231 mdr->locks.erase(q->first);
9232 mdr->remote_wrlocks.erase(q++);
9233 }
9234
9235 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9236 * leaving them in can cause double-notifies as
9237 * this function can get called more than once */
9238}
9239
9240void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9241{
9242 request_drop_foreign_locks(mdr);
9243 mds->locker->drop_non_rdlocks(mdr.get());
9244}
9245
9246void MDCache::request_drop_locks(MDRequestRef& mdr)
9247{
9248 request_drop_foreign_locks(mdr);
9249 mds->locker->drop_locks(mdr.get());
9250}
9251
9252void MDCache::request_cleanup(MDRequestRef& mdr)
9253{
9254 dout(15) << "request_cleanup " << *mdr << dendl;
9255
9256 if (mdr->has_more()) {
9257 if (mdr->more()->is_ambiguous_auth)
9258 mdr->clear_ambiguous_auth();
9259 if (!mdr->more()->waiting_for_finish.empty())
9260 mds->queue_waiters(mdr->more()->waiting_for_finish);
9261 }
9262
9263 request_drop_locks(mdr);
9264
9265 // drop (local) auth pins
9266 mdr->drop_local_auth_pins();
9267
9268 // drop stickydirs
9269 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9270 p != mdr->stickydirs.end();
9271 ++p)
9272 (*p)->put_stickydirs();
9273
9274 mds->locker->kick_cap_releases(mdr);
9275
9276 // drop cache pins
9277 mdr->drop_pins();
9278
9279 // remove from session
9280 mdr->item_session_request.remove_myself();
9281
9282 // remove from map
9283 active_requests.erase(mdr->reqid);
9284
9285 if (mds->logger)
9286 log_stat();
9287
9288 mdr->mark_event("cleaned up request");
9289}
9290
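/*
 * request_kill() tears down an abandoned request.  Note the early return
 * below: once slave MDSs have witnessed or are preparing updates for this
 * request we cannot simply discard it, so it is either flagged 'aborted'
 * (if locking has not finished) or allowed to run to completion.
 */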
9291void MDCache::request_kill(MDRequestRef& mdr)
9292{
9293 // rolling back slave requests is tricky; just let the request proceed.
9294 if (mdr->has_more() &&
9295 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9296 if (!mdr->done_locking) {
9297 assert(mdr->more()->witnessed.empty());
9298 mdr->aborted = true;
9299 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9300 } else {
9301 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9302 }
9303
9304 assert(mdr->used_prealloc_ino == 0);
9305 assert(mdr->prealloc_inos.empty());
9306
9307 mdr->session = NULL;
9308 mdr->item_session_request.remove_myself();
9309 return;
9310 }
9311
9312 mdr->killed = true;
9313 mdr->mark_event("killing request");
9314
9315 if (mdr->committing) {
9316 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9317 } else {
9318 dout(10) << "request_kill " << *mdr << dendl;
9319 request_cleanup(mdr);
9320 }
9321}
9322
9323// -------------------------------------------------------------------------------
9324// SNAPREALMS
9325
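/*
 * snaprealm_create() below runs in two phases: first reserve a transaction
 * id and snap sequence from the snap table (prepare_create_realm), then
 * journal an EUpdate; _snaprealm_create_finish() applies the projected
 * inode, commits the table transaction, opens the realm and notifies
 * clients via do_realm_invalidate_and_update_notify().
 */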
9326struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9327 MDRequestRef mdr;
9328 MutationRef mut;
9329 CInode *in;
9330 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9331 MutationRef& mu, CInode *i) :
9332 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9333 void finish(int r) override {
9334 mdcache->_snaprealm_create_finish(mdr, mut, in);
9335 }
9336};
9337
9338void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9339{
9340 dout(10) << "snaprealm_create " << *in << dendl;
9341 assert(!in->snaprealm);
9342
9343 // allocate an id..
9344 if (!mdr->more()->stid) {
9345 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9346 new C_MDS_RetryRequest(this, mdr));
9347 return;
9348 }
9349
9350 MutationRef mut(new MutationImpl());
9351 mut->ls = mds->mdlog->get_current_segment();
9352 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9353 mds->mdlog->start_entry(le);
9354
9355 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9356
9357 auto &pi = in->project_inode(false, true);
9358 pi.inode.version = in->pre_dirty();
9359 pi.inode.rstat.rsnaprealms++;
9360
9361 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9362 snapid_t seq;
9363 ::decode(seq, p);
9364
9365 auto &newsnap = *pi.snapnode;
9366 newsnap.created = seq;
9367 newsnap.seq = seq;
9368 newsnap.last_created = seq;
9369
9370 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9371 journal_cow_inode(mut, &le->metablob, in);
9372 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9373
9374 mds->server->submit_mdlog_entry(le,
9375 new C_MDC_snaprealm_create_finish(this, mdr,
9376 mut, in),
9377 mdr, __func__);
9378 mds->mdlog->flush();
9379}
9380
9381
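/*
 * After a realm changes, invalidate the cached snap sets of the realm and
 * all open descendants and (unless 'nosend') send one MClientSnap per
 * client holding caps in any of those realms.  For a SPLIT the message
 * carries the inodes/realms that are moving; for UPDATE/DESTROY the walk
 * also covers open_past_children so stale past parents are refreshed.
 */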
9382void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9383{
9384 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9385
9386 vector<inodeno_t> split_inos;
9387 vector<inodeno_t> split_realms;
9388
9389 if (snapop == CEPH_SNAP_OP_SPLIT) {
9390 // notify clients of update|split
9391 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9392 !p.end(); ++p)
9393 split_inos.push_back((*p)->ino());
9394
9395 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9396 p != in->snaprealm->open_children.end();
9397 ++p)
9398 split_realms.push_back((*p)->inode->ino());
9399 }
9400
9401 bufferlist snapbl;
9402 in->snaprealm->build_snap_trace(snapbl);
9403
9404 set<SnapRealm*> past_children;
9405 map<client_t, MClientSnap*> updates;
9406 list<SnapRealm*> q;
9407 q.push_back(in->snaprealm);
9408 while (!q.empty()) {
9409 SnapRealm *realm = q.front();
9410 q.pop_front();
9411
9412 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9413 realm->invalidate_cached_snaps();
9414
9415 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9416 p != realm->client_caps.end();
9417 ++p) {
9418 assert(!p->second->empty());
9419 if (!nosend && updates.count(p->first) == 0) {
9420 MClientSnap *update = new MClientSnap(snapop);
9421 update->head.split = in->ino();
9422 update->split_inos = split_inos;
9423 update->split_realms = split_realms;
9424 update->bl = snapbl;
9425 updates[p->first] = update;
9426 }
9427 }
9428
9429 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9430 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9431 p != realm->open_past_children.end();
9432 ++p)
9433 past_children.insert(*p);
9434 }
9435
9436 // notify for active children, too.
9437 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9438 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9439 p != realm->open_children.end();
9440 ++p)
9441 q.push_back(*p);
9442 }
9443
9444 if (!nosend)
9445 send_snaps(updates);
9446
9447 // notify past children and their descendants if we update/delete old snapshots
9448 for (set<SnapRealm*>::iterator p = past_children.begin();
9449 p != past_children.end();
9450 ++p)
9451 q.push_back(*p);
9452
9453 while (!q.empty()) {
9454 SnapRealm *realm = q.front();
9455 q.pop_front();
9456
9457 realm->invalidate_cached_snaps();
9458
9459 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9460 p != realm->open_children.end();
9461 ++p) {
9462 if (past_children.count(*p) == 0)
9463 q.push_back(*p);
9464 }
9465
9466 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9467 p != realm->open_past_children.end();
9468 ++p) {
9469 if (past_children.count(*p) == 0) {
9470 q.push_back(*p);
9471 past_children.insert(*p);
9472 }
9473 }
9474 }
9475
9476 if (snapop == CEPH_SNAP_OP_DESTROY) {
9477 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9478 for (set<SnapRealm*>::iterator p = past_children.begin();
9479 p != past_children.end();
9480 ++p)
9481 maybe_eval_stray((*p)->inode, true);
9482 }
9483}
9484
9485void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9486{
9487 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9488
9489 // apply
9490 in->pop_and_dirty_projected_inode(mut->ls);
9491 mut->apply();
9492 mds->locker->drop_locks(mut.get());
9493 mut->cleanup();
9494
9495 // tell table we've committed
9496 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9497
9498 // create
9499 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9500 snapid_t seq;
9501 ::decode(seq, p);
9502
9503 in->open_snaprealm();
9504 in->snaprealm->srnode.seq = seq;
9505 in->snaprealm->srnode.created = seq;
9506 bool ok = in->snaprealm->_open_parents(NULL);
9507 assert(ok);
9508
9509 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9510
9511 /*
9512 static int count = 5;
9513 if (--count == 0)
9514 ceph_abort(); // hack test test **********
9515 */
9516
9517 // done.
9518 mdr->more()->stid = 0; // caller will likely need to reuse this
9519 dispatch_request(mdr);
9520}
9521
9522
9523// -------------------------------------------------------------------------------
9524// STRAYS
9525
9526struct C_MDC_RetryScanStray : public MDCacheContext {
9527 dirfrag_t next;
9528 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9529 void finish(int r) override {
9530 mdcache->scan_stray_dir(next);
9531 }
9532};
9533
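/*
 * scan_stray_dir() walks the dirfrags of the stray directories starting at
 * 'next', fetching any frag that is not yet complete (re-entering via
 * C_MDC_RetryScanStray), marking every dentry STRAY, flagging nlink==0
 * inodes as ORPHAN and calling maybe_eval_stray() on primary links so they
 * can be re-evaluated for purging.
 */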
9534void MDCache::scan_stray_dir(dirfrag_t next)
9535{
9536 dout(10) << "scan_stray_dir " << next << dendl;
9537
9538 list<CDir*> ls;
9539 for (int i = 0; i < NUM_STRAY; ++i) {
9540 if (strays[i]->ino() < next.ino)
9541 continue;
9542 strays[i]->get_dirfrags(ls);
9543 }
9544
9545 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9546 CDir *dir = *p;
9547 if (dir->dirfrag() < next)
9548 continue;
9549 if (!dir->is_complete()) {
9550 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9551 return;
9552 }
9553 for (auto &p : dir->items) {
9554 CDentry *dn = p.second;
9555 dn->state_set(CDentry::STATE_STRAY);
9556 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9557 if (dnl->is_primary()) {
9558 CInode *in = dnl->get_inode();
9559 if (in->inode.nlink == 0)
9560 in->state_set(CInode::STATE_ORPHAN);
9561 maybe_eval_stray(in);
9562 }
9563 }
9564 }
9565}
9566
9567void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9568{
9569 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9570 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9571}
9572
9573
9574
9575
9576
9577// ========================================================================================
9578// DISCOVER
9579/*
9580
9581 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9582 to the parent metadata object in the cache (pinning it).
9583
9584 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9585
9586*/
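/*
 Rough message flow, as implemented below:

   discover_base_ino() / discover_dir_frag() / discover_path()
       -> _create_discover() + _send_discover()      (MDiscover to the auth)
   handle_discover()        on the auth: replicate a [dir, dentry, inode]
                            trace into an MDiscoverReply
   handle_discover_reply()  on the requester: add_replica_dir/dentry/inode,
                            then wake the waiters registered above.
*/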
9587
9588void MDCache::_send_discover(discover_info_t& d)
9589{
9590 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9591 d.want_base_dir, d.want_xlocked);
9592 dis->set_tid(d.tid);
9593 mds->send_message_mds(dis, d.mds);
9594}
9595
9596void MDCache::discover_base_ino(inodeno_t want_ino,
9597 MDSInternalContextBase *onfinish,
9598 mds_rank_t from)
9599{
9600 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9601 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9602 discover_info_t& d = _create_discover(from);
9603 d.ino = want_ino;
9604 _send_discover(d);
9605 }
9606 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9607}
9608
9609
9610void MDCache::discover_dir_frag(CInode *base,
9611 frag_t approx_fg,
9612 MDSInternalContextBase *onfinish,
9613 mds_rank_t from)
9614{
9615 if (from < 0)
9616 from = base->authority().first;
9617
9618 dirfrag_t df(base->ino(), approx_fg);
9619 dout(7) << "discover_dir_frag " << df
9620 << " from mds." << from << dendl;
9621
9622 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9623 discover_info_t& d = _create_discover(from);
9624 d.pin_base(base);
9625 d.ino = base->ino();
9626 d.frag = approx_fg;
9627 d.want_base_dir = true;
9628 _send_discover(d);
9629 }
9630
9631 if (onfinish)
9632 base->add_dir_waiter(approx_fg, onfinish);
9633}
9634
9635struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9636 CInode *base;
9637 snapid_t snapid;
9638 filepath path;
9639 mds_rank_t from;
9640 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9641 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9642 void finish(int r) override {
9643 mdcache->discover_path(base, snapid, path, 0, from);
9644 }
9645};
9646
9647void MDCache::discover_path(CInode *base,
9648 snapid_t snap,
9649 filepath want_path,
9650 MDSInternalContextBase *onfinish,
9651 bool want_xlocked,
9652 mds_rank_t from)
9653{
9654 if (from < 0)
9655 from = base->authority().first;
9656
9657 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9658 << (want_xlocked ? " want_xlocked":"")
9659 << dendl;
9660
9661 if (base->is_ambiguous_auth()) {
9662 dout(10) << " waiting for single auth on " << *base << dendl;
9663 if (!onfinish)
9664 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9665 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9666 return;
9667 } else if (from == mds->get_nodeid()) {
9668 list<MDSInternalContextBase*> finished;
9669 base->take_waiting(CInode::WAIT_DIR, finished);
9670 mds->queue_waiters(finished);
9671 return;
9672 }
9673
9674 frag_t fg = base->pick_dirfrag(want_path[0]);
9675 if ((want_xlocked && want_path.depth() == 1) ||
9676 !base->is_waiting_for_dir(fg) || !onfinish) {
9677 discover_info_t& d = _create_discover(from);
9678 d.ino = base->ino();
9679 d.pin_base(base);
9680 d.frag = fg;
9681 d.snap = snap;
9682 d.want_path = want_path;
9683 d.want_base_dir = true;
9684 d.want_xlocked = want_xlocked;
9685 _send_discover(d);
9686 }
9687
9688 // register + wait
9689 if (onfinish)
9690 base->add_dir_waiter(fg, onfinish);
9691}
9692
9693struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9694 CDir *base;
9695 snapid_t snapid;
9696 filepath path;
9697 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9698 MDCacheContext(c), base(b), snapid(s), path(p) {}
9699 void finish(int r) override {
9700 mdcache->discover_path(base, snapid, path, 0);
9701 }
9702};
9703
9704void MDCache::discover_path(CDir *base,
9705 snapid_t snap,
9706 filepath want_path,
9707 MDSInternalContextBase *onfinish,
9708 bool want_xlocked)
9709{
9710 mds_rank_t from = base->authority().first;
9711
9712 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9713 << (want_xlocked ? " want_xlocked":"")
9714 << dendl;
9715
9716 if (base->is_ambiguous_auth()) {
9717 dout(7) << " waiting for single auth on " << *base << dendl;
9718 if (!onfinish)
9719 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9720 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9721 return;
9722 } else if (from == mds->get_nodeid()) {
9723 list<MDSInternalContextBase*> finished;
9724 base->take_sub_waiting(finished);
9725 mds->queue_waiters(finished);
9726 return;
9727 }
9728
9729 if ((want_xlocked && want_path.depth() == 1) ||
9730 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9731 discover_info_t& d = _create_discover(from);
9732 d.ino = base->ino();
9733 d.pin_base(base->inode);
9734 d.frag = base->get_frag();
9735 d.snap = snap;
9736 d.want_path = want_path;
9737 d.want_base_dir = false;
9738 d.want_xlocked = want_xlocked;
9739 _send_discover(d);
9740 }
9741
9742 // register + wait
9743 if (onfinish)
9744 base->add_dentry_waiter(want_path[0], snap, onfinish);
9745}
9746
9747void MDCache::kick_discovers(mds_rank_t who)
9748{
9749 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9750 p != discovers.end();
9751 ++p) {
9752 if (p->second.mds != who)
9753 continue;
9754 _send_discover(p->second);
9755 }
9756}
9757
9758
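/*
 * handle_discover() walks the requested path on the auth MDS, appending a
 * dir/dentry/inode triple per path component to the reply.  It stops (or
 * blocks and retries the message) at anything non-auth, frozen, xlocked or
 * incomplete, and sets hint/error flags so the requester can re-issue the
 * discover elsewhere.
 */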
9759/* This function DOES put the passed message before returning */
9760void MDCache::handle_discover(MDiscover *dis)
9761{
9762 mds_rank_t whoami = mds->get_nodeid();
9763 mds_rank_t from = mds_rank_t(dis->get_source().num());
9764
9765 assert(from != whoami);
9766
9767 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9768 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9769 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9770 dis->put();
9771 return;
9772 }
9773
9774 // proceed if the requester is in the REJOIN stage; the request is from parallel_fetch().
9775 // delay processing requests from survivors because we may not yet have chosen lock states.
9776 if (!mds->mdsmap->is_rejoin(from)) {
9777 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9778 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9779 return;
9780 }
9781 }
9782
9783
9784 CInode *cur = 0;
9785 MDiscoverReply *reply = new MDiscoverReply(dis);
9786
9787 snapid_t snapid = dis->get_snapid();
9788
9789 // get started.
9790 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9791 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9792 // wants root
9793 dout(7) << "handle_discover from mds." << from
9794 << " wants base + " << dis->get_want().get_path()
9795 << " snap " << snapid
9796 << dendl;
9797
9798 cur = get_inode(dis->get_base_ino());
9799 assert(cur);
9800
9801 // add root
9802 reply->starts_with = MDiscoverReply::INODE;
9803 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9804 dout(10) << "added base " << *cur << dendl;
9805 }
9806 else {
9807 // there's a base inode
9808 cur = get_inode(dis->get_base_ino(), snapid);
9809 if (!cur && snapid != CEPH_NOSNAP) {
9810 cur = get_inode(dis->get_base_ino());
9811 if (cur && !cur->is_multiversion())
9812 cur = NULL; // nope!
9813 }
9814
9815 if (!cur) {
9816 dout(7) << "handle_discover mds." << from
9817 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9818 << dendl;
9819 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9820 reply->set_error_dentry(dis->get_dentry(0));
9821 reply->set_flag_error_dir();
9822 } else if (dis->wants_base_dir()) {
9823 dout(7) << "handle_discover mds." << from
9824 << " wants basedir+" << dis->get_want().get_path()
9825 << " has " << *cur
9826 << dendl;
9827 } else {
9828 dout(7) << "handle_discover mds." << from
9829 << " wants " << dis->get_want().get_path()
9830 << " has " << *cur
9831 << dendl;
9832 }
9833 }
9834
9835 assert(reply);
9836
9837 // add content
9838 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9839 for (unsigned i = 0;
9840 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9841 i++) {
9842
9843 // -- figure out the dir
9844
9845 // is *cur even a dir at all?
9846 if (!cur->is_dir()) {
9847 dout(7) << *cur << " not a dir" << dendl;
9848 reply->set_flag_error_dir();
9849 break;
9850 }
9851
9852 // pick frag
9853 frag_t fg;
9854 if (dis->get_want().depth()) {
9855 // dentry specifies
9856 fg = cur->pick_dirfrag(dis->get_dentry(i));
9857 } else {
9858 // requester explicitly specified the frag
9859 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9860 fg = dis->get_base_dir_frag();
9861 if (!cur->dirfragtree.is_leaf(fg))
9862 fg = cur->dirfragtree[fg.value()];
9863 }
9864 CDir *curdir = cur->get_dirfrag(fg);
9865
9866 if ((!curdir && !cur->is_auth()) ||
9867 (curdir && !curdir->is_auth())) {
9868
9869 /* before:
9870 * ONLY set flag if empty!!
9871 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9872 * resulting in duplicate discovers in flight,
9873 * which can wreak havoc when discovering rename srcdn (which may move)
9874 */
9875
9876 if (reply->is_empty()) {
9877 // only hint if empty.
9878 // someday this could be better, but right now the waiter logic isn't smart enough.
9879
9880 // hint
9881 if (curdir) {
9882 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9883 reply->set_dir_auth_hint(curdir->authority().first);
9884 } else {
9885 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9886 << *cur << dendl;
9887 reply->set_dir_auth_hint(cur->authority().first);
9888 }
9889
9890 // note error dentry, if any
9891 // NOTE: important, as it allows requester to issue an equivalent discover
9892 // to whomever we hint at.
9893 if (dis->get_want().depth() > i)
9894 reply->set_error_dentry(dis->get_dentry(i));
9895 }
9896
9897 break;
9898 }
9899
9900 if (!curdir) { // open dir?
9901 if (cur->is_frozen()) {
9902 if (!reply->is_empty()) {
9903 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9904 break;
9905 }
9906 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9907 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9908 reply->put();
9909 return;
9910 }
9911 curdir = cur->get_or_open_dirfrag(this, fg);
9912 } else if (curdir->is_frozen_tree() ||
9913 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9914 if (!reply->is_empty()) {
9915 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9916 break;
9917 }
9918 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9919 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9920 reply->set_flag_error_dir();
9921 break;
9922 }
9923 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9924 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9925 reply->put();
9926 return;
9927 }
9928
9929 // add dir
9930 if (curdir->get_version() == 0) {
9931 // fetch newly opened dir
9932 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9933 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9934 // make sure the base frag is correct, though, in case there was a refragment since the
9935 // original request was sent.
9936 reply->set_base_dir_frag(curdir->get_frag());
9937 } else {
9938 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9939 if (!reply->trace.length())
9940 reply->starts_with = MDiscoverReply::DIR;
9941 replicate_dir(curdir, from, reply->trace);
9942 dout(7) << "handle_discover added dir " << *curdir << dendl;
9943 }
9944
9945 // lookup
9946 CDentry *dn = 0;
9947 if (curdir->get_version() == 0) {
9948 // fetch newly opened dir
9949 assert(!curdir->has_bloom());
9950 } else if (dis->get_want().depth() > 0) {
9951 // lookup dentry
9952 dn = curdir->lookup(dis->get_dentry(i), snapid);
9953 } else
9954 break; // done!
9955
9956 // incomplete dir?
9957 if (!dn) {
9958 if (!curdir->is_complete() &&
9959 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9960 // readdir
9961 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9962 if (reply->is_empty()) {
9963 // fetch and wait
9964 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9965 dis->wants_base_dir() && curdir->get_version() == 0);
9966 reply->put();
9967 return;
9968 } else {
9969 // initiate fetch, but send what we have so far
9970 curdir->fetch(0);
9971 break;
9972 }
9973 }
9974
9975 // send null dentry
9976 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9977 << *curdir << dendl;
9978 dn = curdir->add_null_dentry(dis->get_dentry(i));
9979 }
9980 assert(dn);
9981
9982 // don't add replica to purging dentry/inode
9983 if (dn->state_test(CDentry::STATE_PURGING)) {
9984 if (reply->is_empty())
9985 reply->set_flag_error_dn(dis->get_dentry(i));
9986 break;
9987 }
9988
9989 CDentry::linkage_t *dnl = dn->get_linkage();
9990
9991 // xlocked dentry?
9992 // ...always block on non-tail items (they are unrelated)
9993 // ...allow xlocked tail discovery _only_ if explicitly requested
9994 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9995 if (dn->lock.is_xlocked()) {
9996 // is this the last (tail) item in the discover traversal?
9997 if (tailitem && dis->wants_xlocked()) {
9998 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9999 } else if (reply->is_empty()) {
10000 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10001 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
10002 reply->put();
10003 return;
10004 } else {
10005 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10006 break;
10007 }
10008 }
10009
10010 // frozen inode?
10011 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10012 if (tailitem && dis->wants_xlocked()) {
10013 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10014 } else if (reply->is_empty()) {
10015 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10016 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10017 reply->put();
10018 return;
10019 } else {
10020 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10021 break;
10022 }
10023 }
10024
10025 // add dentry
10026 if (!reply->trace.length())
10027 reply->starts_with = MDiscoverReply::DENTRY;
10028 replicate_dentry(dn, from, reply->trace);
10029 dout(7) << "handle_discover added dentry " << *dn << dendl;
10030
10031 if (!dnl->is_primary()) break; // stop on null or remote link.
10032
10033 // add inode
10034 CInode *next = dnl->get_inode();
10035 assert(next->is_auth());
10036
10037 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10038 dout(7) << "handle_discover added inode " << *next << dendl;
10039
10040 // descend, keep going.
10041 cur = next;
10042 continue;
10043 }
10044
10045 // how did we do?
10046 assert(!reply->is_empty());
10047 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10048 mds->send_message(reply, dis->get_connection());
10049
10050 dis->put();
10051}
10052
10053/* This function DOES put the passed message before returning */
10054void MDCache::handle_discover_reply(MDiscoverReply *m)
10055{
10056 /*
10057 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10058 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10059 m->put();
10060 return;
10061 }
10062 */
10063 dout(7) << "discover_reply " << *m << dendl;
10064 if (m->is_flag_error_dir())
10065 dout(7) << " flag error, dir" << dendl;
10066 if (m->is_flag_error_dn())
10067 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10068
10069 list<MDSInternalContextBase*> finished, error;
10070 mds_rank_t from = mds_rank_t(m->get_source().num());
10071
10072 // starting point
10073 CInode *cur = get_inode(m->get_base_ino());
10074 bufferlist::iterator p = m->trace.begin();
10075
10076 int next = m->starts_with;
10077
10078 // decrement discover counters
10079 if (m->get_tid()) {
10080 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10081 if (p != discovers.end()) {
10082 dout(10) << " found tid " << m->get_tid() << dendl;
10083 discovers.erase(p);
10084 } else {
10085 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10086 }
10087 }
10088
10089 // discover may start with an inode
10090 if (!p.end() && next == MDiscoverReply::INODE) {
10091 cur = add_replica_inode(p, NULL, finished);
10092 dout(7) << "discover_reply got base inode " << *cur << dendl;
10093 assert(cur->is_base());
10094
10095 next = MDiscoverReply::DIR;
10096
10097 // take waiters?
10098 if (cur->is_base() &&
10099 waiting_for_base_ino[from].count(cur->ino())) {
10100 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10101 waiting_for_base_ino[from].erase(cur->ino());
10102 }
10103 }
10104 assert(cur);
10105
10106 // loop over discover results.
10107 // indexes follow each ([[dir] dentry] inode)
10108 // can start, end with any type.
10109 while (!p.end()) {
10110 // dir
10111 frag_t fg;
10112 CDir *curdir = 0;
10113 if (next == MDiscoverReply::DIR) {
10114 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10115 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10116 assert(m->get_wanted_base_dir());
10117 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10118 }
10119 } else {
10120 // note: this can only happen the first time around this loop.
10121 if (p.end() && m->is_flag_error_dn()) {
10122 fg = cur->pick_dirfrag(m->get_error_dentry());
10123 curdir = cur->get_dirfrag(fg);
10124 } else
10125 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10126 }
10127
10128 if (p.end())
10129 break;
10130
10131 // dentry
10132 CDentry *dn = add_replica_dentry(p, curdir, finished);
10133
10134 if (p.end())
10135 break;
10136
10137 // inode
10138 cur = add_replica_inode(p, dn, finished);
10139
10140 next = MDiscoverReply::DIR;
10141 }
10142
10143 // dir error?
10144 // or dir_auth hint?
10145 if (m->is_flag_error_dir() && !cur->is_dir()) {
10146 // not a dir.
10147 cur->take_waiting(CInode::WAIT_DIR, error);
10148 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10149 mds_rank_t who = m->get_dir_auth_hint();
10150 if (who == mds->get_nodeid()) who = -1;
10151 if (who >= 0)
10152 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10153
10154
10155 if (m->get_wanted_base_dir()) {
10156 frag_t fg = m->get_base_dir_frag();
10157 CDir *dir = cur->get_dirfrag(fg);
10158
10159 if (cur->is_waiting_for_dir(fg)) {
10160 if (cur->is_auth())
10161 cur->take_waiting(CInode::WAIT_DIR, finished);
10162 else if (dir || !cur->dirfragtree.is_leaf(fg))
10163 cur->take_dir_waiting(fg, finished);
10164 else
10165 discover_dir_frag(cur, fg, 0, who);
10166 } else
10167 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10168 }
10169
10170 // try again?
10171 if (m->get_error_dentry().length()) {
10172 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10173 CDir *dir = cur->get_dirfrag(fg);
10174 // wanted a dentry
10175 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10176 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10177 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10178 m->get_wanted_snapid(), finished);
10179 } else {
10180 filepath relpath(m->get_error_dentry(), 0);
10181 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10182 }
10183 } else
10184 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10185 << m->get_error_dentry() << dendl;
10186 }
10187 } else if (m->is_flag_error_dn()) {
10188 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10189 CDir *dir = cur->get_dirfrag(fg);
10190 if (dir) {
10191 if (dir->is_auth()) {
10192 dir->take_sub_waiting(finished);
10193 } else {
10194 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10195 m->get_wanted_snapid(), error);
10196 }
10197 }
10198 }
10199
10200 // waiters
10201 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10202 mds->queue_waiters(finished);
10203
10204 // done
10205 m->put();
10206}
10207
10208
10209
10210// ----------------------------
10211// REPLICAS
10212
10213
10214void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10215{
10216 dirfrag_t df = dir->dirfrag();
10217 ::encode(df, bl);
10218 dir->encode_replica(to, bl);
10219}
10220
10221void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10222{
10223 ::encode(dn->get_name(), bl);
10224 ::encode(dn->last, bl);
10225 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10226}
10227
10228void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10229 uint64_t features)
10230{
10231 ::encode(in->inode.ino, bl); // bleh, minor asymmetry here
10232 ::encode(in->last, bl);
10233 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10234}
10235
10236CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10237 list<MDSInternalContextBase*>& finished)
10238{
10239 dirfrag_t df;
10240 ::decode(df, p);
10241
10242 assert(diri->ino() == df.ino);
10243
10244 // add it (_replica_)
10245 CDir *dir = diri->get_dirfrag(df.frag);
10246
10247 if (dir) {
10248 // had replica. update w/ new nonce.
10249 dir->decode_replica(p);
10250 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10251 } else {
10252 // force frag to leaf in the diri tree
10253 if (!diri->dirfragtree.is_leaf(df.frag)) {
10254 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10255 << diri->dirfragtree << dendl;
10256 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10257 }
10258
10259 // add replica.
10260 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10261 dir->decode_replica(p);
10262
10263 // is this a dir_auth delegation boundary?
10264 if (from != diri->authority().first ||
10265 diri->is_ambiguous_auth() ||
10266 diri->is_base())
10267 adjust_subtree_auth(dir, from);
10268
10269 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10270
10271 // get waiters
10272 diri->take_dir_waiting(df.frag, finished);
10273 }
10274
10275 return dir;
10276}
10277
10278CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10279{
10280 string name;
10281 snapid_t last;
10282 ::decode(name, p);
10283 ::decode(last, p);
10284
10285 CDentry *dn = dir->lookup(name, last);
10286
10287 // have it?
10288 if (dn) {
10289 dn->decode_replica(p, false);
10290 dout(7) << "add_replica_dentry had " << *dn << dendl;
10291 } else {
10292 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10293 dn->decode_replica(p, true);
10294 dout(7) << "add_replica_dentry added " << *dn << dendl;
10295 }
10296
10297 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10298
10299 return dn;
10300}
10301
10302CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10303{
10304 inodeno_t ino;
10305 snapid_t last;
10306 ::decode(ino, p);
10307 ::decode(last, p);
10308 CInode *in = get_inode(ino, last);
10309 if (!in) {
10310 in = new CInode(this, false, 1, last);
10311 in->decode_replica(p, true);
10312 add_inode(in);
10313 if (in->ino() == MDS_INO_ROOT)
10314 in->inode_auth.first = 0;
10315 else if (in->is_mdsdir())
10316 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10317 dout(10) << "add_replica_inode added " << *in << dendl;
10318 if (dn) {
10319 assert(dn->get_linkage()->is_null());
10320 dn->dir->link_primary_inode(dn, in);
10321 }
10322 } else {
10323 in->decode_replica(p, false);
10324 dout(10) << "add_replica_inode had " << *in << dendl;
10325 }
10326
10327 if (dn) {
10328 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10329 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10330 }
10331
10332 return in;
10333}
10334
10335
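/*
 * replicate_stray() and add_replica_stray() must stay symmetric: the
 * encode/decode order is mdsdir inode, the mdsdir dirfrag holding the stray
 * directory's dentry, that dentry, the stray directory inode, the stray
 * dirfrag, and finally the stray dentry itself.
 */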
10336void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10337{
10338 uint64_t features = mds->mdsmap->get_up_features();
10339 replicate_inode(get_myin(), who, bl, features);
10340 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10341 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10342 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10343 replicate_dir(straydn->get_dir(), who, bl);
10344 replicate_dentry(straydn, who, bl);
10345}
10346
10347CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10348{
10349 list<MDSInternalContextBase*> finished;
10350 bufferlist::iterator p = bl.begin();
10351
10352 CInode *mdsin = add_replica_inode(p, NULL, finished);
10353 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10354 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10355 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10356 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10357 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10358 if (!finished.empty())
10359 mds->queue_waiters(finished);
10360
10361 return straydn;
10362}
10363
10364
10365int MDCache::send_dir_updates(CDir *dir, bool bcast)
10366{
10367 // this is an FYI, re: replication
10368
10369 set<mds_rank_t> who;
10370 if (bcast) {
10371 mds->get_mds_map()->get_active_mds_set(who);
10372 } else {
181888fb
FG
10373 for (const auto &p : dir->get_replicas()) {
10374 who.insert(p.first);
10375 }
7c673cae
FG
10376 }
10377
10378 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10379
10380 filepath path;
10381 dir->inode->make_path(path);
10382
10383 mds_rank_t whoami = mds->get_nodeid();
10384 for (set<mds_rank_t>::iterator it = who.begin();
10385 it != who.end();
10386 ++it) {
10387 if (*it == whoami) continue;
10388 //if (*it == except) continue;
10389 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10390
94b18763
FG
10391 std::set<int32_t> s;
10392 for (const auto &r : dir->dir_rep_by) {
10393 s.insert(r);
10394 }
7c673cae
FG
10395 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10396 dir->dirfrag(),
10397 dir->dir_rep,
94b18763 10398 s,
7c673cae
FG
10399 path,
10400 bcast),
10401 *it);
10402 }
10403
10404 return 0;
10405}
10406
10407/* This function DOES put the passed message before returning */
10408void MDCache::handle_dir_update(MDirUpdate *m)
10409{
224ce89b
WB
10410 dirfrag_t df = m->get_dirfrag();
10411 CDir *dir = get_dirfrag(df);
7c673cae 10412 if (!dir) {
224ce89b 10413 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10414
10415 // discover it?
10416 if (m->should_discover()) {
10417 // only try once!
10418 // this is key to avoid a fragtree update race, among other things.
224ce89b 10419 m->inc_tried_discover();
7c673cae
FG
10420 vector<CDentry*> trace;
10421 CInode *in;
10422 filepath path = m->get_path();
10423 dout(5) << "trying discover on dir_update for " << path << dendl;
10424 MDRequestRef null_ref;
10425 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10426 if (r > 0)
10427 return;
224ce89b
WB
10428 if (r == 0 &&
10429 in->ino() == df.ino &&
10430 in->get_approx_dirfrag(df.frag) == NULL) {
10431 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10432 return;
10433 }
7c673cae
FG
10434 }
10435
10436 m->put();
10437 return;
10438 }
10439
224ce89b
WB
10440 if (!m->has_tried_discover()) {
10441    // Update if it already exists. Otherwise it got updated by discover reply.
10442 dout(5) << "dir_update on " << *dir << dendl;
10443 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10444 dir->dir_rep_by.clear();
10445 for (const auto &e : m->get_dir_rep_by()) {
10446 dir->dir_rep_by.insert(e);
10447 }
224ce89b
WB
10448 }
10449
7c673cae
FG
10450 // done
10451 m->put();
10452}
10453
10454
10455
10456
10457
10458// LINK
10459
10460void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10461{
10462 dout(7) << "send_dentry_link " << *dn << dendl;
10463
10464 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10465 for (const auto &p : dn->get_replicas()) {
7c673cae 10466 // don't tell (rename) witnesses; they already know
181888fb 10467 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10468 continue;
181888fb
FG
10469 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10470 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10471 rejoin_gather.count(p.first)))
7c673cae
FG
10472 continue;
10473 CDentry::linkage_t *dnl = dn->get_linkage();
10474 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
94b18763 10475 dn->get_name(), dnl->is_primary());
7c673cae
FG
10476 if (dnl->is_primary()) {
10477 dout(10) << " primary " << *dnl->get_inode() << dendl;
181888fb 10478 replicate_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10479 mds->mdsmap->get_up_features());
10480 } else if (dnl->is_remote()) {
10481 inodeno_t ino = dnl->get_remote_ino();
10482 __u8 d_type = dnl->get_remote_d_type();
10483 dout(10) << " remote " << ino << " " << d_type << dendl;
10484 ::encode(ino, m->bl);
10485 ::encode(d_type, m->bl);
10486 } else
10487 ceph_abort(); // aie, bad caller!
181888fb 10488 mds->send_message_mds(m, p.first);
7c673cae
FG
10489 }
10490}
10491
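// Editorial note: the payload built in send_dentry_link() above is decoded
// symmetrically by handle_dentry_link() below -- a replicated inode for
// primary links, or (ino, d_type) for remote links.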
10492/* This function DOES put the passed message before returning */
10493void MDCache::handle_dentry_link(MDentryLink *m)
10494{
10495
10496 CDentry *dn = NULL;
10497 CDir *dir = get_dirfrag(m->get_dirfrag());
10498 if (!dir) {
10499 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10500 } else {
10501 dn = dir->lookup(m->get_dn());
10502 if (!dn) {
10503 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10504 } else {
10505 dout(7) << "handle_dentry_link on " << *dn << dendl;
10506 CDentry::linkage_t *dnl = dn->get_linkage();
10507
10508 assert(!dn->is_auth());
10509 assert(dnl->is_null());
10510 }
10511 }
10512
10513 bufferlist::iterator p = m->bl.begin();
10514 list<MDSInternalContextBase*> finished;
10515 if (dn) {
10516 if (m->get_is_primary()) {
10517 // primary link.
10518 add_replica_inode(p, dn, finished);
10519 } else {
10520 // remote link, easy enough.
10521 inodeno_t ino;
10522 __u8 d_type;
10523 ::decode(ino, p);
10524 ::decode(d_type, p);
10525 dir->link_remote_inode(dn, ino, d_type);
10526 }
10527 } else {
10528 ceph_abort();
10529 }
10530
10531 if (!finished.empty())
10532 mds->queue_waiters(finished);
10533
10534 m->put();
10535 return;
10536}
10537
10538
10539// UNLINK
10540
10541void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10542{
10543 dout(10) << "send_dentry_unlink " << *dn << dendl;
10544 // share unlink news with replicas
10545 set<mds_rank_t> replicas;
10546 dn->list_replicas(replicas);
10547 if (straydn)
10548 straydn->list_replicas(replicas);
10549 for (set<mds_rank_t>::iterator it = replicas.begin();
10550 it != replicas.end();
10551 ++it) {
10552 // don't tell (rmdir) witnesses; they already know
10553 if (mdr.get() && mdr->more()->witnessed.count(*it))
10554 continue;
10555
10556 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10557 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10558 rejoin_gather.count(*it)))
10559 continue;
10560
94b18763 10561 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
7c673cae
FG
10562 if (straydn)
10563 replicate_stray(straydn, *it, unlink->straybl);
10564 mds->send_message_mds(unlink, *it);
10565 }
10566}
10567
10568/* This function DOES put the passed message before returning */
10569void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10570{
10571 // straydn
10572 CDentry *straydn = NULL;
10573 if (m->straybl.length())
10574 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10575
10576 CDir *dir = get_dirfrag(m->get_dirfrag());
10577 if (!dir) {
10578 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10579 } else {
10580 CDentry *dn = dir->lookup(m->get_dn());
10581 if (!dn) {
10582 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10583 } else {
10584 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10585 CDentry::linkage_t *dnl = dn->get_linkage();
10586
10587 // open inode?
10588 if (dnl->is_primary()) {
10589 CInode *in = dnl->get_inode();
10590 dn->dir->unlink_inode(dn);
10591 assert(straydn);
10592 straydn->dir->link_primary_inode(straydn, in);
10593
10594 // in->first is lazily updated on replica; drag it forward so
10595 // that we always keep it in sync with the dnq
10596 assert(straydn->first >= in->first);
10597 in->first = straydn->first;
10598
10599 // update subtree map?
10600 if (in->is_dir())
10601 adjust_subtree_after_rename(in, dir, false);
10602
10603 // send caps to auth (if we're not already)
10604 if (in->is_any_caps() &&
10605 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10606 migrator->export_caps(in);
10607
7c673cae
FG
10608 straydn = NULL;
10609 } else {
10610 assert(!straydn);
10611 assert(dnl->is_remote());
10612 dn->dir->unlink_inode(dn);
10613 }
10614 assert(dnl->is_null());
7c673cae
FG
10615 }
10616 }
10617
10618 // race with trim_dentry()
10619 if (straydn) {
10620 assert(straydn->get_num_ref() == 0);
10621 assert(straydn->get_linkage()->is_null());
10622 map<mds_rank_t, MCacheExpire*> expiremap;
10623 trim_dentry(straydn, expiremap);
10624 send_expire_messages(expiremap);
10625 }
10626
10627 m->put();
10628 return;
10629}
10630
10631
10632
10633
10634
10635
10636// ===================================================================
10637
10638
10639
10640// ===================================================================
10641// FRAGMENT
10642
10643
10644/**
10645 * adjust_dir_fragments -- adjust fragmentation for a directory
10646 *
10647 * @param diri directory inode
10648 * @param basefrag base fragment
10649 * @param bits bit adjustment. positive for split, negative for merge.
10650 */
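// Usage sketch (editorial, not part of the original source): splitting the
// root fragment of a CInode *diri into 2^3 = 8 children might look like
//
//   list<CDir*> resultfrags;
//   list<MDSInternalContextBase*> waiters;
//   adjust_dir_fragments(diri, frag_t(), 3, resultfrags, waiters, false);
//   mds->queue_waiters(waiters);  // the caller is responsible for the waiters
//
// A negative `bits` would instead merge the frags under basefrag back together.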
10651void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10652 list<CDir*>& resultfrags,
10653 list<MDSInternalContextBase*>& waiters,
10654 bool replay)
10655{
10656 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10657 << " on " << *diri << dendl;
10658
10659 list<CDir*> srcfrags;
10660 diri->get_dirfrags_under(basefrag, srcfrags);
10661
10662 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10663}
10664
10665CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10666{
10667 CDir *dir = diri->get_dirfrag(fg);
10668 if (dir)
10669 return dir;
10670
10671 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10672
10673 list<CDir*> src, result;
10674 list<MDSInternalContextBase*> waiters;
10675
10676 // split a parent?
10677 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10678 while (1) {
10679 CDir *pdir = diri->get_dirfrag(parent);
10680 if (pdir) {
10681 int split = fg.bits() - parent.bits();
10682 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10683 src.push_back(pdir);
10684 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10685 dir = diri->get_dirfrag(fg);
10686 if (dir) {
10687 dout(10) << "force_dir_fragment result " << *dir << dendl;
10688 break;
10689 }
10690 }
10691 if (parent == frag_t())
10692 break;
10693 frag_t last = parent;
10694 parent = parent.parent();
10695 dout(10) << " " << last << " parent is " << parent << dendl;
10696 }
10697
10698 if (!dir) {
10699 // hoover up things under fg?
10700 diri->get_dirfrags_under(fg, src);
10701 if (src.empty()) {
10702 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10703 } else {
10704 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10705 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10706 dir = result.front();
10707 dout(10) << "force_dir_fragment result " << *dir << dendl;
10708 }
10709 }
10710 if (!replay)
10711 mds->queue_waiters(waiters);
10712 return dir;
10713}
10714
10715void MDCache::adjust_dir_fragments(CInode *diri,
10716 list<CDir*>& srcfrags,
10717 frag_t basefrag, int bits,
10718 list<CDir*>& resultfrags,
10719 list<MDSInternalContextBase*>& waiters,
10720 bool replay)
10721{
10722 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10723 << " srcfrags " << srcfrags
10724 << " on " << *diri << dendl;
10725
10726 // adjust fragtree
10727 // yuck. we may have discovered the inode while it was being fragmented.
10728 if (!diri->dirfragtree.is_leaf(basefrag))
10729 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10730
10731 if (bits > 0)
10732 diri->dirfragtree.split(basefrag, bits);
10733 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10734
10735 if (srcfrags.empty())
10736 return;
10737
10738 // split
10739 CDir *parent_dir = diri->get_parent_dir();
10740 CDir *parent_subtree = 0;
10741 if (parent_dir)
10742 parent_subtree = get_subtree_root(parent_dir);
10743
10744 if (bits > 0) {
10745 // SPLIT
10746 assert(srcfrags.size() == 1);
10747 CDir *dir = srcfrags.front();
10748
10749 dir->split(bits, resultfrags, waiters, replay);
10750
10751 // did i change the subtree map?
10752 if (dir->is_subtree_root()) {
10753 // new frags are now separate subtrees
10754 for (list<CDir*>::iterator p = resultfrags.begin();
10755 p != resultfrags.end();
10756 ++p)
10757 subtrees[*p].clear(); // new frag is now its own subtree
10758
10759 // was i a bound?
10760 if (parent_subtree) {
10761 assert(subtrees[parent_subtree].count(dir));
10762 subtrees[parent_subtree].erase(dir);
10763 for (list<CDir*>::iterator p = resultfrags.begin();
10764 p != resultfrags.end();
10765 ++p) {
10766 assert((*p)->is_subtree_root());
10767 subtrees[parent_subtree].insert(*p);
10768 }
10769 }
10770
10771 // adjust my bounds.
10772 set<CDir*> bounds;
10773 bounds.swap(subtrees[dir]);
10774 subtrees.erase(dir);
10775 for (set<CDir*>::iterator p = bounds.begin();
10776 p != bounds.end();
10777 ++p) {
10778 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10779 subtrees[frag].insert(*p);
10780 }
10781
10782 show_subtrees(10);
7c673cae
FG
10783 }
10784
10785 diri->close_dirfrag(dir->get_frag());
10786
10787 } else {
10788 // MERGE
10789
10790 // are my constituent bits subtrees? if so, i will be too.
10791 // (it's all or none, actually.)
31f18b77
FG
10792 bool any_subtree = false;
10793 for (CDir *dir : srcfrags) {
7c673cae 10794 if (dir->is_subtree_root()) {
31f18b77
FG
10795 any_subtree = true;
10796 break;
10797 }
10798 }
10799 set<CDir*> new_bounds;
10800 if (any_subtree) {
10801 for (CDir *dir : srcfrags) {
10802 	// this simplifies the code that finds subtrees underneath the dirfrag
10803 if (!dir->is_subtree_root()) {
10804 dir->state_set(CDir::STATE_AUXSUBTREE);
10805 adjust_subtree_auth(dir, mds->get_nodeid());
10806 }
10807 }
10808
10809 for (CDir *dir : srcfrags) {
10810 assert(dir->is_subtree_root());
7c673cae 10811 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
10812 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10813 set<CDir*>::iterator r = q->second.begin();
10814 while (r != subtrees[dir].end()) {
10815 new_bounds.insert(*r);
10816 subtrees[dir].erase(r++);
10817 }
10818 subtrees.erase(q);
31f18b77 10819
7c673cae
FG
10820 // remove myself as my parent's bound
10821 if (parent_subtree)
10822 subtrees[parent_subtree].erase(dir);
10823 }
10824 }
10825
10826 // merge
10827 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10828 f->merge(srcfrags, waiters, replay);
7c673cae 10829
31f18b77 10830 if (any_subtree) {
7c673cae
FG
10831 assert(f->is_subtree_root());
10832 subtrees[f].swap(new_bounds);
10833 if (parent_subtree)
10834 subtrees[parent_subtree].insert(f);
10835
10836 show_subtrees(10);
10837 }
10838
10839 resultfrags.push_back(f);
10840 }
10841}
10842
10843
10844class C_MDC_FragmentFrozen : public MDSInternalContext {
10845 MDCache *mdcache;
10846 MDRequestRef mdr;
10847public:
10848 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10849 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10850 void finish(int r) override {
10851 mdcache->fragment_frozen(mdr, r);
10852 }
10853};
10854
10855bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10856{
10857 if (is_readonly()) {
10858 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10859 return false;
10860 }
10861 if (mds->is_cluster_degraded()) {
10862 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10863 return false;
10864 }
10865 if (diri->get_parent_dir() &&
10866 diri->get_parent_dir()->get_inode()->is_stray()) {
10867 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10868 return false;
10869 }
10870 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10871 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10872 return false;
10873 }
10874
10875 if (diri->scrub_is_in_progress()) {
10876 dout(7) << "can_fragment: scrub in progress" << dendl;
10877 return false;
10878 }
10879
10880 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10881 CDir *dir = *p;
10882 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10883 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10884 return false;
10885 }
10886 if (!dir->is_auth()) {
10887 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10888 return false;
10889 }
10890 if (dir->is_bad()) {
10891 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10892 return false;
10893 }
10894 if (dir->is_frozen() ||
10895 dir->is_freezing()) {
10896 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10897 return false;
10898 }
10899 }
10900
10901 return true;
10902}
10903
10904void MDCache::split_dir(CDir *dir, int bits)
10905{
10906 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10907 assert(dir->is_auth());
10908 CInode *diri = dir->inode;
10909
10910 list<CDir*> dirs;
10911 dirs.push_back(dir);
10912
10913 if (!can_fragment(diri, dirs)) {
10914 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10915 return;
10916 }
10917
31f18b77
FG
10918 if (dir->frag.bits() + bits > 24) {
10919 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10920 return;
10921 }
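  // (editorial note: frag_t stores the fragment value in 24 bits, so splits
  //  deeper than 24 bits cannot be represented -- hence the cap above)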
10922
7c673cae
FG
10923 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10924 mdr->more()->fragment_base = dir->dirfrag();
10925
10926 assert(fragments.count(dir->dirfrag()) == 0);
10927 fragment_info_t& info = fragments[dir->dirfrag()];
10928 info.mdr = mdr;
10929 info.dirs.push_back(dir);
10930 info.bits = bits;
10931 info.last_cum_auth_pins_change = ceph_clock_now();
10932
10933 fragment_freeze_dirs(dirs);
10934 // initial mark+complete pass
10935 fragment_mark_and_complete(mdr);
10936}
10937
10938void MDCache::merge_dir(CInode *diri, frag_t frag)
10939{
10940 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10941
10942 list<CDir*> dirs;
10943 if (!diri->get_dirfrags_under(frag, dirs)) {
10944 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10945 return;
10946 }
10947
10948 if (diri->dirfragtree.is_leaf(frag)) {
10949 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10950 return;
10951 }
10952
10953 if (!can_fragment(diri, dirs))
10954 return;
10955
10956 CDir *first = dirs.front();
10957 int bits = first->get_frag().bits() - frag.bits();
10958   dout(10) << " we are merging by " << bits << " bits" << dendl;
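  // Worked example (editorial): if the existing leaves under frag_t() are
  // 00*, 01*, 10* and 11* (2 bits each) and we merge to frag = frag_t()
  // (0 bits), then bits = 2 - 0 = 2 and the fragment_info below records -2.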
10959
10960 dirfrag_t basedirfrag(diri->ino(), frag);
10961 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10962 mdr->more()->fragment_base = basedirfrag;
10963
10964 assert(fragments.count(basedirfrag) == 0);
10965 fragment_info_t& info = fragments[basedirfrag];
10966 info.mdr = mdr;
10967 info.dirs = dirs;
10968 info.bits = -bits;
10969 info.last_cum_auth_pins_change = ceph_clock_now();
10970
10971 fragment_freeze_dirs(dirs);
10972 // initial mark+complete pass
10973 fragment_mark_and_complete(mdr);
10974}
10975
10976void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10977{
10978 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10979 CDir *dir = *p;
10980 dir->auth_pin(dir); // until we mark and complete them
10981 dir->state_set(CDir::STATE_FRAGMENTING);
10982 dir->freeze_dir();
10983 assert(dir->is_freezing_dir());
10984 }
10985}
10986
10987class C_MDC_FragmentMarking : public MDCacheContext {
10988 MDRequestRef mdr;
10989public:
10990 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10991 void finish(int r) override {
10992 mdcache->fragment_mark_and_complete(mdr);
10993 }
10994};
10995
10996void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10997{
10998 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10999 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11000 if (it == fragments.end() || it->second.mdr != mdr) {
11001 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11002 request_finish(mdr);
11003 return;
11004 }
11005
11006 fragment_info_t& info = it->second;
11007 CInode *diri = info.dirs.front()->get_inode();
11008 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11009
11010 MDSGatherBuilder gather(g_ceph_context);
11011
11012 for (list<CDir*>::iterator p = info.dirs.begin();
11013 p != info.dirs.end();
11014 ++p) {
11015 CDir *dir = *p;
11016
11017 bool ready = true;
11018 if (!dir->is_complete()) {
11019 dout(15) << " fetching incomplete " << *dir << dendl;
11020 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11021 ready = false;
11022 } else if (dir->get_frag() == frag_t()) {
11023      // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
11024      // the operation. To avoid CDir::fetch() complaining about a missing object,
11025      // we commit the new dirfrag first.
11026 if (dir->state_test(CDir::STATE_CREATING)) {
11027 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11028 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11029 ready = false;
11030 } else if (dir->is_new()) {
11031 dout(15) << " committing new " << *dir << dendl;
11032 assert(dir->is_dirty());
11033 dir->commit(0, gather.new_sub(), true);
11034 ready = false;
11035 }
11036 }
11037 if (!ready)
11038 continue;
11039
11040 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11041 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11042 for (auto &p : dir->items) {
11043 CDentry *dn = p.second;
7c673cae
FG
11044 dn->get(CDentry::PIN_FRAGMENTING);
11045 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11046 dn->state_set(CDentry::STATE_FRAGMENTING);
11047 }
11048 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11049 dir->auth_unpin(dir);
11050 } else {
11051 dout(15) << " already marked " << *dir << dendl;
11052 }
11053 }
11054 if (gather.has_subs()) {
11055 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11056 gather.activate();
11057 return;
11058 }
11059
11060 for (list<CDir*>::iterator p = info.dirs.begin();
11061 p != info.dirs.end();
11062 ++p) {
11063 CDir *dir = *p;
11064 if (!dir->is_frozen_dir()) {
11065 assert(dir->is_freezing_dir());
11066 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11067 }
11068 }
11069 if (gather.has_subs()) {
11070 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11071 gather.activate();
11072 // flush log so that request auth_pins are retired
11073 mds->mdlog->flush();
11074 return;
11075 }
11076
11077 fragment_frozen(mdr, 0);
11078}
11079
11080void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11081{
11082 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11083 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11084 CDir *dir = *p;
11085 dout(10) << " frag " << *dir << dendl;
11086
11087 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11088 dir->state_clear(CDir::STATE_FRAGMENTING);
11089
11090 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11091 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11092
94b18763
FG
11093 for (auto &p : dir->items) {
11094 CDentry *dn = p.second;
7c673cae
FG
11095 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11096 dn->state_clear(CDentry::STATE_FRAGMENTING);
11097 dn->put(CDentry::PIN_FRAGMENTING);
11098 }
11099 } else {
11100 dir->auth_unpin(dir);
11101 }
11102
11103 dir->unfreeze_dir();
11104 }
11105}
11106
11107bool MDCache::fragment_are_all_frozen(CDir *dir)
11108{
11109 assert(dir->is_frozen_dir());
11110 map<dirfrag_t,fragment_info_t>::iterator p;
11111 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11112 p != fragments.end() && p->first.ino == dir->ino();
11113 ++p) {
11114 if (p->first.frag.contains(dir->get_frag()))
11115 return p->second.all_frozen;
11116 }
11117 ceph_abort();
11118 return false;
11119}
11120
11121void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11122{
11123 map<dirfrag_t,fragment_info_t>::iterator p;
11124 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11125 p != fragments.end() && p->first.ino == dir->ino();
11126 ++p) {
11127 if (p->first.frag.contains(dir->get_frag())) {
11128 p->second.num_remote_waiters++;
11129 return;
11130 }
11131 }
11132 ceph_abort();
11133}
11134
11135void MDCache::find_stale_fragment_freeze()
11136{
11137 dout(10) << "find_stale_fragment_freeze" << dendl;
11138 // see comment in Migrator::find_stale_export_freeze()
11139 utime_t now = ceph_clock_now();
11140 utime_t cutoff = now;
11141 cutoff -= g_conf->mds_freeze_tree_timeout;
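  // i.e. a pending fragment whose cumulative auth_pin count has not changed
  // within the last mds_freeze_tree_timeout seconds is treated as stale and
  // becomes a candidate for cancellation below (if it is blocking remote
  // waiters or a freezing parent).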
11142
11143 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11144 p != fragments.end(); ) {
11145 dirfrag_t df = p->first;
11146 fragment_info_t& info = p->second;
11147 ++p;
11148 if (info.all_frozen)
11149 continue;
11150 CDir *dir;
11151 int total_auth_pins = 0;
11152 for (list<CDir*>::iterator q = info.dirs.begin();
11153 q != info.dirs.end();
11154 ++q) {
11155 dir = *q;
11156 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11157 total_auth_pins = -1;
11158 break;
11159 }
11160 if (dir->is_frozen_dir())
11161 continue;
11162 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11163 }
11164 if (total_auth_pins < 0)
11165 continue;
11166 if (info.last_cum_auth_pins != total_auth_pins) {
11167 info.last_cum_auth_pins = total_auth_pins;
11168 info.last_cum_auth_pins_change = now;
11169 continue;
11170 }
11171 if (info.last_cum_auth_pins_change >= cutoff)
11172 continue;
11173 dir = info.dirs.front();
11174 if (info.num_remote_waiters > 0 ||
11175 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11176 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11177 list<CDir*> dirs;
11178 info.dirs.swap(dirs);
11179 fragments.erase(df);
11180 fragment_unmark_unfreeze_dirs(dirs);
11181 }
11182 }
11183}
11184
11185class C_MDC_FragmentPrep : public MDCacheLogContext {
11186 MDRequestRef mdr;
11187public:
11188 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11189 void finish(int r) override {
11190 mdcache->_fragment_logged(mdr);
11191 }
11192};
11193
11194class C_MDC_FragmentStore : public MDCacheContext {
11195 MDRequestRef mdr;
11196public:
11197 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11198 void finish(int r) override {
11199 mdcache->_fragment_stored(mdr);
11200 }
11201};
11202
11203class C_MDC_FragmentCommit : public MDCacheLogContext {
11204 dirfrag_t basedirfrag;
11205 list<CDir*> resultfrags;
11206public:
11207 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11208 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11209 void finish(int r) override {
11210 mdcache->_fragment_committed(basedirfrag, resultfrags);
11211 }
11212};
11213
11214class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11215 dirfrag_t basedirfrag;
11216 list<CDir*> resultfrags;
11217public:
11218 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11219 MDCacheIOContext(m), basedirfrag(f) {
11220 resultfrags.swap(l);
11221 }
11222 void finish(int r) override {
11223 assert(r == 0 || r == -ENOENT);
11224 mdcache->_fragment_finish(basedirfrag, resultfrags);
11225 }
11226};
11227
11228void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11229{
11230 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11231 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11232 if (it == fragments.end() || it->second.mdr != mdr) {
11233 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11234 request_finish(mdr);
11235 return;
11236 }
11237
11238 assert(r == 0);
11239 fragment_info_t& info = it->second;
11240 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11241 << " on " << info.dirs.front()->get_inode() << dendl;
11242
11243 info.all_frozen = true;
11244 dispatch_fragment_dir(mdr);
11245}
11246
11247void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11248{
11249 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11250 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11251 if (it == fragments.end() || it->second.mdr != mdr) {
11252 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11253 request_finish(mdr);
11254 return;
11255 }
11256
11257 fragment_info_t& info = it->second;
11258 CInode *diri = info.dirs.front()->get_inode();
11259
11260 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11261 << " on " << *diri << dendl;
11262 if (!mdr->aborted) {
11263 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11264 wrlocks.insert(&diri->dirfragtreelock);
11265 // prevent a racing gather on any other scatterlocks too
11266 wrlocks.insert(&diri->nestlock);
11267 wrlocks.insert(&diri->filelock);
11268 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11269 if (!mdr->aborted)
11270 return;
11271 }
11272
11273 if (mdr->aborted) {
11274 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11275 << info.dirs.front()->dirfrag() << dendl;
11276 if (info.bits > 0)
11277 mds->balancer->queue_split(info.dirs.front(), false);
11278 else
11279 mds->balancer->queue_merge(info.dirs.front());
11280 fragment_unmark_unfreeze_dirs(info.dirs);
11281 fragments.erase(it);
11282 request_finish(mdr);
11283 return;
11284 }
11285
11286 mdr->ls = mds->mdlog->get_current_segment();
11287 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11288 mds->mdlog->start_entry(le);
11289
11290 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11291 CDir *dir = *p;
11292 dirfrag_rollback rollback;
11293 rollback.fnode = dir->fnode;
11294 le->add_orig_frag(dir->get_frag(), &rollback);
11295 }
11296
11297 // refragment
11298 list<MDSInternalContextBase*> waiters;
11299 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11300 info.resultfrags, waiters, false);
11301 if (g_conf->mds_debug_frag)
11302 diri->verify_dirfrags();
11303 mds->queue_waiters(waiters);
11304
11305 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11306 assert(!diri->dirfragtree.is_leaf(*p));
11307
11308 le->metablob.add_dir_context(*info.resultfrags.begin());
11309 for (list<CDir*>::iterator p = info.resultfrags.begin();
11310 p != info.resultfrags.end();
11311 ++p) {
11312 if (diri->is_auth()) {
11313 le->metablob.add_fragmented_dir(*p, false, false);
11314 } else {
11315 (*p)->state_set(CDir::STATE_DIRTYDFT);
11316 le->metablob.add_fragmented_dir(*p, false, true);
11317 }
11318 }
11319
11320 // dft lock
11321 if (diri->is_auth()) {
11322 // journal dirfragtree
94b18763
FG
11323 auto &pi = diri->project_inode();
11324 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11325 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11326 } else {
11327 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11328 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11329 mdr->add_updated_lock(&diri->dirfragtreelock);
11330 }
11331
11332 /*
11333 // filelock
11334 mds->locker->mark_updated_scatterlock(&diri->filelock);
11335 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11336 mut->add_updated_lock(&diri->filelock);
11337
11338 // dirlock
11339 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11340 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11341 mut->add_updated_lock(&diri->nestlock);
11342 */
11343
11344 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11345 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11346 mdr, __func__);
11347 mds->mdlog->flush();
11348}
11349
11350void MDCache::_fragment_logged(MDRequestRef& mdr)
11351{
11352 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11353 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11354 assert(it != fragments.end());
11355 fragment_info_t &info = it->second;
11356 CInode *diri = info.resultfrags.front()->get_inode();
11357
11358 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11359 << " on " << *diri << dendl;
11360
11361 if (diri->is_auth())
11362 diri->pop_and_dirty_projected_inode(mdr->ls);
11363
11364 mdr->apply(); // mark scatterlock
11365
11366 // store resulting frags
11367 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11368
11369 for (list<CDir*>::iterator p = info.resultfrags.begin();
11370 p != info.resultfrags.end();
11371 ++p) {
11372 CDir *dir = *p;
11373 dout(10) << " storing result frag " << *dir << dendl;
11374
11375 // freeze and store them too
11376 dir->auth_pin(this);
11377 dir->state_set(CDir::STATE_FRAGMENTING);
11378 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11379 }
11380
11381 gather.activate();
11382}
11383
11384void MDCache::_fragment_stored(MDRequestRef& mdr)
11385{
11386 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11387 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11388 assert(it != fragments.end());
11389 fragment_info_t &info = it->second;
11390 CInode *diri = info.resultfrags.front()->get_inode();
11391
11392 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11393 << " on " << *diri << dendl;
11394
11395 // tell peers
11396 CDir *first = *info.resultfrags.begin();
181888fb
FG
11397 for (const auto &p : first->get_replicas()) {
11398 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11399 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11400 rejoin_gather.count(p.first)))
7c673cae
FG
11401 continue;
11402
11403 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11404
11405 // freshly replicate new dirs to peers
11406 for (list<CDir*>::iterator q = info.resultfrags.begin();
11407 q != info.resultfrags.end();
11408 ++q)
181888fb 11409 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11410
181888fb 11411 mds->send_message_mds(notify, p.first);
7c673cae
FG
11412 }
11413
11414 // journal commit
11415 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11416 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11417 info.resultfrags));
11418
11419 mds->locker->drop_locks(mdr.get());
11420
11421 // unfreeze resulting frags
11422 for (list<CDir*>::iterator p = info.resultfrags.begin();
11423 p != info.resultfrags.end();
11424 ++p) {
11425 CDir *dir = *p;
11426 dout(10) << " result frag " << *dir << dendl;
11427
94b18763
FG
11428 for (auto &p : dir->items) {
11429 CDentry *dn = p.second;
7c673cae
FG
11430 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11431 dn->state_clear(CDentry::STATE_FRAGMENTING);
11432 dn->put(CDentry::PIN_FRAGMENTING);
11433 }
11434
11435 // unfreeze
11436 dir->unfreeze_dir();
11437 }
11438
11439 fragments.erase(it);
11440 request_finish(mdr);
11441}
11442
11443void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11444{
11445 dout(10) << "fragment_committed " << basedirfrag << dendl;
11446 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11447 assert(it != uncommitted_fragments.end());
11448 ufragment &uf = it->second;
11449
11450 // remove old frags
11451 C_GatherBuilder gather(
11452 g_ceph_context,
11453 new C_OnFinisher(
11454 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11455 mds->finisher));
11456
11457 SnapContext nullsnapc;
11458 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11459 for (list<frag_t>::iterator p = uf.old_frags.begin();
11460 p != uf.old_frags.end();
11461 ++p) {
11462 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11463 ObjectOperation op;
11464 if (*p == frag_t()) {
11465 // backtrace object
11466 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11467 op.truncate(0);
11468 op.omap_clear();
11469 } else {
11470 dout(10) << " removing orphan dirfrag " << oid << dendl;
11471 op.remove();
11472 }
11473 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11474 ceph::real_clock::now(),
11475 0, gather.new_sub());
11476 }
11477
11478 assert(gather.has_subs());
11479 gather.activate();
11480}
11481
11482void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11483{
11484   dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11485 << resultfrags.size() << dendl;
11486 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11487 assert(it != uncommitted_fragments.end());
11488 ufragment &uf = it->second;
11489
11490 // unmark & auth_unpin
11491 for (const auto &dir : resultfrags) {
11492 dir->state_clear(CDir::STATE_FRAGMENTING);
11493 dir->auth_unpin(this);
11494
11495 // In case the resulting fragments are beyond the split size,
11496 // we might need to split them again right away (they could
11497 // have been taking inserts between unfreezing and getting
11498 // here)
11499 mds->balancer->maybe_fragment(dir, false);
11500 }
11501
11502 if (mds->logger) {
11503 if (resultfrags.size() > 1) {
11504 mds->logger->inc(l_mds_dir_split);
11505 } else {
11506 mds->logger->inc(l_mds_dir_merge);
11507 }
11508 }
11509
11510 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11511 mds->mdlog->start_submit_entry(le);
11512
11513 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11514}
11515
11516/* This function DOES put the passed message before returning */
11517void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11518{
11519 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11520
11521 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11522 notify->put();
11523 return;
11524 }
11525
11526 CInode *diri = get_inode(notify->get_ino());
11527 if (diri) {
11528 frag_t base = notify->get_basefrag();
11529 int bits = notify->get_bits();
11530
11531/*
11532 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11533 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11534 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11535 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11536 notify->put();
11537 return;
11538 }
11539*/
11540
11541 // refragment
11542 list<MDSInternalContextBase*> waiters;
11543 list<CDir*> resultfrags;
11544 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11545 if (g_conf->mds_debug_frag)
11546 diri->verify_dirfrags();
11547
11548 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11549 diri->take_dir_waiting((*p)->get_frag(), waiters);
11550
11551    // add the newly replicated dirfrag values
11552 bufferlist::iterator p = notify->basebl.begin();
11553 while (!p.end())
11554 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11555
11556 mds->queue_waiters(waiters);
11557 } else {
11558 ceph_abort();
11559 }
11560
11561 notify->put();
11562}
11563
11564void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11565 LogSegment *ls, bufferlist *rollback)
11566{
11567 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11568 assert(!uncommitted_fragments.count(basedirfrag));
11569 ufragment& uf = uncommitted_fragments[basedirfrag];
11570 uf.old_frags = old_frags;
11571 uf.bits = bits;
11572 uf.ls = ls;
11573 ls->uncommitted_fragments.insert(basedirfrag);
11574 if (rollback)
11575 uf.rollback.swap(*rollback);
11576}
11577
11578void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11579{
11580 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11581 << " op " << EFragment::op_name(op) << dendl;
11582 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11583 if (it != uncommitted_fragments.end()) {
11584 ufragment& uf = it->second;
11585 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11586 uf.committed = true;
11587 } else {
11588 uf.ls->uncommitted_fragments.erase(basedirfrag);
11589 mds->queue_waiters(uf.waiters);
11590 uncommitted_fragments.erase(it);
11591 }
11592 }
11593}
11594
11595void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11596{
11597 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11598 << " old_frags (" << old_frags << ")" << dendl;
11599 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11600 if (it != uncommitted_fragments.end()) {
11601 ufragment& uf = it->second;
11602 if (!uf.old_frags.empty()) {
11603 uf.old_frags.swap(old_frags);
11604 uf.committed = true;
11605 } else {
11606 uf.ls->uncommitted_fragments.erase(basedirfrag);
11607 uncommitted_fragments.erase(it);
11608 }
11609 }
11610}
11611
11612void MDCache::rollback_uncommitted_fragments()
11613{
11614 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11615 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11616 p != uncommitted_fragments.end();
11617 ++p) {
11618 ufragment &uf = p->second;
11619 CInode *diri = get_inode(p->first.ino);
11620 assert(diri);
11621
11622 if (uf.committed) {
11623 list<CDir*> frags;
11624 diri->get_dirfrags_under(p->first.frag, frags);
11625 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11626 CDir *dir = *q;
11627 dir->auth_pin(this);
11628 dir->state_set(CDir::STATE_FRAGMENTING);
11629 }
11630 _fragment_committed(p->first, frags);
11631 continue;
11632 }
11633
11634 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11635
11636 LogSegment *ls = mds->mdlog->get_current_segment();
11637 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11638 mds->mdlog->start_entry(le);
11639 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11640
11641 list<frag_t> old_frags;
11642 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11643
11644 list<CDir*> resultfrags;
11645 if (uf.old_frags.empty()) {
11646 // created by old format EFragment
11647 list<MDSInternalContextBase*> waiters;
11648 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11649 } else {
11650 bufferlist::iterator bp = uf.rollback.begin();
11651 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11652 CDir *dir = force_dir_fragment(diri, *q);
11653 resultfrags.push_back(dir);
11654
11655 dirfrag_rollback rollback;
11656 ::decode(rollback, bp);
11657
11658 dir->set_version(rollback.fnode.version);
11659 dir->fnode = rollback.fnode;
11660
11661 dir->_mark_dirty(ls);
11662
11663 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11664 dout(10) << " dirty nestinfo on " << *dir << dendl;
11665 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11666 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11667 }
11668 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11669 dout(10) << " dirty fragstat on " << *dir << dendl;
11670 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11671 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11672 }
11673
11674 le->add_orig_frag(dir->get_frag());
11675 le->metablob.add_dir_context(dir);
11676 if (diri_auth) {
11677 le->metablob.add_fragmented_dir(dir, true, false);
11678 } else {
11679 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11680 dir->state_set(CDir::STATE_DIRTYDFT);
11681 le->metablob.add_fragmented_dir(dir, true, true);
11682 }
11683 }
11684 }
11685
11686 if (diri_auth) {
94b18763
FG
11687 auto &pi = diri->project_inode();
11688 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11689 diri->pop_and_dirty_projected_inode(ls); // hacky
11690 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11691 } else {
11692 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11693 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11694 }
11695
11696 if (g_conf->mds_debug_frag)
11697 diri->verify_dirfrags();
11698
11699 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11700 assert(!diri->dirfragtree.is_leaf(*q));
11701
11702 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11703 CDir *dir = *q;
11704 dir->auth_pin(this);
11705 dir->state_set(CDir::STATE_FRAGMENTING);
11706 }
11707
11708 mds->mdlog->submit_entry(le);
11709
11710 uf.old_frags.swap(old_frags);
11711 _fragment_committed(p->first, resultfrags);
11712 }
11713}
11714
11715void MDCache::force_readonly()
11716{
11717 if (is_readonly())
11718 return;
11719
11720 dout(1) << "force file system read-only" << dendl;
11721 mds->clog->warn() << "force file system read-only";
11722
11723 set_readonly();
11724
11725 mds->server->force_clients_readonly();
11726
11727 // revoke write caps
94b18763 11728 for (auto &p : inode_map) {
b32b8144 11729 CInode *in = p.second;
7c673cae
FG
11730 if (in->is_head())
11731 mds->locker->eval(in, CEPH_CAP_LOCKS);
11732 }
11733
11734 mds->mdlog->flush();
11735}
11736
11737
11738// ==============================================================
11739// debug crap
11740
11741void MDCache::show_subtrees(int dbl)
11742{
11743 if (g_conf->mds_thrash_exports)
11744 dbl += 15;
11745
11746 //dout(10) << "show_subtrees" << dendl;
11747
11748 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11749 return; // i won't print anything.
11750
11751 if (subtrees.empty()) {
11752 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11753 return;
11754 }
11755
11756 // root frags
11757 list<CDir*> basefrags;
11758 for (set<CInode*>::iterator p = base_inodes.begin();
11759 p != base_inodes.end();
11760 ++p)
11761 (*p)->get_dirfrags(basefrags);
11762 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11763 dout(15) << "show_subtrees" << dendl;
11764
11765 // queue stuff
11766 list<pair<CDir*,int> > q;
11767 string indent;
11768 set<CDir*> seen;
11769
11770 // calc max depth
11771 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11772 q.push_back(pair<CDir*,int>(*p, 0));
11773
11774 set<CDir*> subtrees_seen;
11775
11776 int depth = 0;
11777 while (!q.empty()) {
11778 CDir *dir = q.front().first;
11779 int d = q.front().second;
11780 q.pop_front();
11781
11782 if (subtrees.count(dir) == 0) continue;
11783
11784 subtrees_seen.insert(dir);
11785
11786 if (d > depth) depth = d;
11787
11788 // sanity check
11789 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11790 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11791 assert(seen.count(dir) == 0);
11792 seen.insert(dir);
11793
11794 // nested items?
11795 if (!subtrees[dir].empty()) {
11796 for (set<CDir*>::iterator p = subtrees[dir].begin();
11797 p != subtrees[dir].end();
11798 ++p) {
11799 //dout(25) << " saw sub " << **p << dendl;
11800 q.push_front(pair<CDir*,int>(*p, d+1));
11801 }
11802 }
11803 }
11804
11805
11806 // print tree
11807 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11808 q.push_back(pair<CDir*,int>(*p, 0));
11809
11810 while (!q.empty()) {
11811 CDir *dir = q.front().first;
11812 int d = q.front().second;
11813 q.pop_front();
11814
11815 if (subtrees.count(dir) == 0) continue;
11816
11817 // adjust indenter
11818 while ((unsigned)d < indent.size())
11819 indent.resize(d);
11820
11821 // pad
11822 string pad = "______________________________________";
11823 pad.resize(depth*2+1-indent.size());
11824 if (!subtrees[dir].empty())
11825 pad[0] = '.'; // parent
11826
11827
11828 string auth;
11829 if (dir->is_auth())
11830 auth = "auth ";
11831 else
11832 auth = " rep ";
11833
11834 char s[10];
11835 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11836 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11837 else
11838 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11839
11840 // print
11841 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11842
11843 if (dir->ino() == MDS_INO_ROOT)
11844 assert(dir->inode == root);
11845 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11846 assert(dir->inode == myin);
11847 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11848 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11849
11850 // nested items?
11851 if (!subtrees[dir].empty()) {
11852 // more at my level?
11853 if (!q.empty() && q.front().second == d)
11854 indent += "| ";
11855 else
11856 indent += " ";
11857
11858 for (set<CDir*>::iterator p = subtrees[dir].begin();
11859 p != subtrees[dir].end();
11860 ++p)
11861 q.push_front(pair<CDir*,int>(*p, d+2));
11862 }
11863 }
11864
11865 // verify there isn't stray crap in subtree map
11866 int lost = 0;
11867 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11868 p != subtrees.end();
11869 ++p) {
11870 if (subtrees_seen.count(p->first)) continue;
11871 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11872 lost++;
11873 }
11874 assert(lost == 0);
11875}
11876
7c673cae
FG
11877void MDCache::show_cache()
11878{
11879 dout(7) << "show_cache" << dendl;
b32b8144
FG
11880
11881 auto show_func = [this](CInode *in) {
7c673cae 11882 // unlinked?
b32b8144
FG
11883 if (!in->parent)
11884 dout(7) << " unlinked " << *in << dendl;
11885
7c673cae
FG
11886 // dirfrags?
11887 list<CDir*> dfs;
b32b8144 11888 in->get_dirfrags(dfs);
7c673cae
FG
11889 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11890 CDir *dir = *p;
11891 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 11892
94b18763
FG
11893 for (auto &p : dir->items) {
11894 CDentry *dn = p.second;
7c673cae
FG
11895 dout(7) << " dentry " << *dn << dendl;
11896 CDentry::linkage_t *dnl = dn->get_linkage();
11897 if (dnl->is_primary() && dnl->get_inode())
11898 dout(7) << " inode " << *dnl->get_inode() << dendl;
11899 }
11900 }
b32b8144
FG
11901 };
11902
94b18763 11903 for (auto &p : inode_map)
b32b8144 11904 show_func(p.second);
94b18763 11905 for (auto &p : snap_inode_map)
b32b8144 11906 show_func(p.second);
7c673cae
FG
11907}
11908
181888fb
FG
11909int MDCache::cache_status(Formatter *f)
11910{
11911 f->open_object_section("cache");
11912
11913 f->open_object_section("pool");
11914 mempool::get_pool(mempool::mds_co::id).dump(f);
11915 f->close_section();
11916
11917 f->close_section();
11918 return 0;
11919}
11920
94b18763 11921int MDCache::dump_cache(boost::string_view file_name)
7c673cae 11922{
94b18763 11923 return dump_cache(file_name, NULL);
7c673cae
FG
11924}
11925
31f18b77 11926int MDCache::dump_cache(Formatter *f)
7c673cae 11927{
94b18763 11928 return dump_cache(boost::string_view(""), f);
7c673cae
FG
11929}
11930
94b18763 11931int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
7c673cae 11932{
94b18763 11933 return dump_cache(boost::string_view(""), f, dump_root, depth);
7c673cae
FG
11934}
11935
11936/**
11937 * Dump the metadata cache, either to a Formatter, if
11938 * provided, else to a plain text file.
11939 */
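// Usage sketch (editorial): dump_cache("cachedump.txt") writes a plain-text
// dump to the named file, while dump_cache(f) emits structured output via the
// Formatter; with an empty file name and no Formatter, a default
// "cachedump.<epoch>.mds<rank>" file is created (see below).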
94b18763
FG
11940int MDCache::dump_cache(boost::string_view fn, Formatter *f,
11941 boost::string_view dump_root, int depth)
7c673cae
FG
11942{
11943 int r = 0;
11944 int fd = -1;
11945
11946 if (f) {
11947 f->open_array_section("inodes");
11948 } else {
94b18763
FG
11949 char path[PATH_MAX] = "";
11950 if (fn.length()) {
11951 snprintf(path, sizeof path, "%s", fn.data());
11952 } else {
11953 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
11954 }
11955
94b18763 11956 dout(1) << "dump_cache to " << path << dendl;
7c673cae 11957
94b18763 11958 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL, 0600);
7c673cae 11959 if (fd < 0) {
94b18763 11960 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 11961 return errno;
7c673cae
FG
11962 }
11963 }
11964
b32b8144
FG
11965 auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
11966 int r;
7c673cae
FG
11967 if (!dump_root.empty()) {
11968 string ipath;
11969 if (in->is_root())
11970 ipath = "/";
11971 else
11972 in->make_path_string(ipath);
11973
11974 if (dump_root.length() > ipath.length() ||
11975 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
b32b8144 11976 return 0;
7c673cae
FG
11977
11978 if (depth >= 0 &&
11979 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
b32b8144 11980 return 0;
7c673cae
FG
11981 }
11982
11983 if (f) {
11984 f->open_object_section("inode");
11985 in->dump(f);
11986 } else {
11987 ostringstream ss;
11988 ss << *in << std::endl;
11989 std::string s = ss.str();
11990 r = safe_write(fd, s.c_str(), s.length());
b32b8144
FG
11991 if (r < 0)
11992 return r;
7c673cae
FG
11993 }
11994
11995 list<CDir*> dfs;
11996 in->get_dirfrags(dfs);
11997 if (f) {
11998 f->open_array_section("dirfrags");
11999 }
12000 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12001 CDir *dir = *p;
12002 if (f) {
12003 f->open_object_section("dir");
12004 dir->dump(f);
12005 } else {
12006 ostringstream tt;
12007 tt << " " << *dir << std::endl;
12008 string t = tt.str();
12009 r = safe_write(fd, t.c_str(), t.length());
b32b8144
FG
12010 if (r < 0)
12011 return r;
7c673cae
FG
12012 }
12013
12014 if (f) {
12015 f->open_array_section("dentries");
12016 }
94b18763
FG
12017 for (auto &p : dir->items) {
12018 CDentry *dn = p.second;
7c673cae
FG
12019 if (f) {
12020 f->open_object_section("dentry");
12021 dn->dump(f);
12022 f->close_section();
12023 } else {
12024 ostringstream uu;
12025 uu << " " << *dn << std::endl;
12026 string u = uu.str();
12027 r = safe_write(fd, u.c_str(), u.length());
b32b8144
FG
12028 if (r < 0)
12029 return r;
7c673cae
FG
12030 }
12031 }
12032 if (f) {
12033 f->close_section(); //dentries
12034 }
12035 dir->check_rstats();
12036 if (f) {
12037 f->close_section(); //dir
12038 }
12039 }
12040 if (f) {
12041 f->close_section(); // dirfrags
12042 }
12043
12044 if (f) {
12045 f->close_section(); // inode
12046 }
b32b8144
FG
12047 return 1;
12048 };
12049
94b18763 12050 for (auto &p : inode_map) {
b32b8144
FG
12051 r = dump_func(p.second);
12052 if (r < 0)
12053 goto out;
12054 }
94b18763 12055 for (auto &p : snap_inode_map) {
b32b8144
FG
12056 r = dump_func(p.second);
12057 if (r < 0)
12058 goto out;
7c673cae 12059 }
b32b8144 12060 r = 0;
7c673cae
FG
12061
12062 out:
12063 if (f) {
12064 f->close_section(); // inodes
12065 } else {
12066 ::close(fd);
12067 }
31f18b77 12068 return r;
7c673cae
FG
12069}
12070
12071
12072
12073C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12074 : MDSInternalContext(c->mds), cache(c), mdr(r)
12075{}
12076
12077void C_MDS_RetryRequest::finish(int r)
12078{
12079 mdr->retry++;
12080 cache->dispatch_request(mdr);
12081}
12082
12083
12084class C_MDS_EnqueueScrub : public Context
12085{
12086 Formatter *formatter;
12087 Context *on_finish;
12088public:
12089 ScrubHeaderRef header;
12090 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12091 formatter(f), on_finish(fin), header(nullptr) {}
12092
12093 Context *take_finisher() {
12094 Context *fin = on_finish;
12095 on_finish = NULL;
12096 return fin;
12097 }
12098
12099 void finish(int r) override {
12100 if (r < 0) { // we failed the lookup or something; dump ourselves
12101 formatter->open_object_section("results");
12102 formatter->dump_int("return_code", r);
12103 formatter->close_section(); // results
12104 }
12105 if (on_finish)
12106 on_finish->complete(r);
12107 }
12108};
12109
12110void MDCache::enqueue_scrub(
94b18763
FG
12111 boost::string_view path,
12112 boost::string_view tag,
7c673cae
FG
12113 bool force, bool recursive, bool repair,
12114 Formatter *f, Context *fin)
12115{
12116   dout(10) << __func__ << " " << path << dendl;
12117 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
94b18763 12118 filepath fp(path);
7c673cae
FG
12119 mdr->set_filepath(fp);
12120
12121 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12122 cs->header = std::make_shared<ScrubHeader>(
12123 tag, force, recursive, repair, f);
12124
12125 mdr->internal_op_finish = cs;
12126 enqueue_scrub_work(mdr);
1adf2230
AA
12127
12128 // Since recursive scrub is asynchronous, dump minimal output
12129 // so as not to upset CLI tools.
12130 if (recursive) {
12131 f->open_object_section("results");
12132 f->close_section(); // results
12133 }
7c673cae
FG
12134}
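
// enqueue_scrub() is normally reached from the MDS admin socket.  Assuming the
// Luminous-era command name and scrubop spellings, a recursive repairing scrub
// of a subtree would be requested with something like:
//
//   ceph daemon mds.<id> scrub_path /some/dir recursive repair
//
// The empty "results" object emitted above is what lets such CLI invocations
// return immediately even though the recursive scrub completes asynchronously.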
12135
12136void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12137{
12138 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12139 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12140 if (NULL == in)
12141 return;
12142
12143 // TODO: Remove this restriction
12144 assert(in->is_auth());
12145
12146 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12147 if (!locked)
12148 return;
12149
12150 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12151 ScrubHeaderRef &header = cs->header;
12152
12153 // Cannot scrub the same dentry twice at the same time
12154 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12155 mds->server->respond_to_request(mdr, -EBUSY);
12156 return;
12157 } else {
12158 in->scrub_info();
12159 }
12160
12161 header->set_origin(in);
12162
b32b8144
FG
12163 Context *fin = nullptr;
12164 if (!header->get_recursive()) {
12165 fin = cs->take_finisher();
12166 }
12167
12168 // If the scrub did some repair, then flush the journal at the end of
12169 // the scrub. Otherwise, in cases such as rewriting a backtrace, the
12170 // on-disk state will still look damaged.
28e407b8
AA
12171 auto scrub_finish = new FunctionContext([this, header, fin](int r){
12172 if (!header->get_repaired()) {
12173 if (fin)
12174 fin->complete(r);
12175 return;
12176 }
12177
12178 auto flush_finish = new FunctionContext([this, fin](int r){
12179 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12180 mds->mdlog->trim_all();
12181
12182 if (fin) {
12183 MDSGatherBuilder gather(g_ceph_context);
12184 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12185 for (auto logseg : expiring_segments)
12186 logseg->wait_for_expiry(gather.new_sub());
12187 assert(gather.has_subs());
12188 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12189 gather.activate();
b32b8144 12190 }
28e407b8
AA
12191 });
12192
12193 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12194 mds->mdlog->start_new_segment();
12195 mds->mdlog->flush();
12196 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12197 });
12198
7c673cae 12199 if (!header->get_recursive()) {
7c673cae 12200 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12201 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12202 } else {
12203 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12204 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12205 }
7c673cae
FG
12206
12207 mds->server->respond_to_request(mdr, 0);
12208 return;
12209}
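
// When the scrub repaired something, the completion chain built above runs in
// this order: start_new_segment() + flush() persist the repairs, wait_for_safe()
// fires flush_finish, trim_all() expires the old segments, and the caller's
// finisher runs only once every expiring segment has signalled wait_for_expiry().
// The caller therefore observes completion only after the repaired state is
// durable and the stale journal segments are gone.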
12210
12211struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12212 MDRequestRef mdr;
12213 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12214 MDCacheLogContext(c), mdr(m) {}
12215 void finish(int r) override {
12216 mdr->apply();
12217 get_mds()->server->respond_to_request(mdr, r);
12218 }
12219};
12220
12221void MDCache::repair_dirfrag_stats(CDir *dir)
12222{
12223 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12224 mdr->pin(dir);
12225 mdr->internal_op_private = dir;
12226 mdr->internal_op_finish = new C_MDSInternalNoop;
12227 repair_dirfrag_stats_work(mdr);
12228}
12229
12230void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12231{
12232 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12233 dout(10) << __func__ << " " << *dir << dendl;
12234
12235 if (!dir->is_auth()) {
12236 mds->server->respond_to_request(mdr, -ESTALE);
12237 return;
12238 }
12239
12240 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12241 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12242
7c673cae
FG
12243 mds->locker->drop_locks(mdr.get());
12244 mdr->drop_local_auth_pins();
224ce89b
WB
12245 if (!mdr->remote_auth_pins.empty())
12246 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12247 return;
12248 }
12249
12250 mdr->auth_pin(dir);
12251
12252 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12253 CInode *diri = dir->inode;
12254 rdlocks.insert(&diri->dirfragtreelock);
12255 wrlocks.insert(&diri->nestlock);
12256 wrlocks.insert(&diri->filelock);
12257 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12258 return;
12259
12260 if (!dir->is_complete()) {
12261 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12262 return;
12263 }
12264
12265 frag_info_t frag_info;
12266 nest_info_t nest_info;
94b18763 12267 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12268 CDentry *dn = it->second;
12269 if (dn->last != CEPH_NOSNAP)
12270 continue;
12271 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12272 if (dnl->is_primary()) {
12273 CInode *in = dnl->get_inode();
12274 nest_info.add(in->get_projected_inode()->accounted_rstat);
12275 if (in->is_dir())
12276 frag_info.nsubdirs++;
12277 else
12278 frag_info.nfiles++;
12279 } else if (dnl->is_remote())
12280 frag_info.nfiles++;
12281 }
12282
12283 fnode_t *pf = dir->get_projected_fnode();
12284 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12285 bool good_rstat = nest_info.same_sums(pf->rstat);
12286 if (good_fragstat && good_rstat) {
12287 dout(10) << __func__ << " no corruption found" << dendl;
12288 mds->server->respond_to_request(mdr, 0);
12289 return;
12290 }
12291
12292 pf = dir->project_fnode();
12293 pf->version = dir->pre_dirty();
12294 mdr->add_projected_fnode(dir);
12295
12296 mdr->ls = mds->mdlog->get_current_segment();
12297 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12298 mds->mdlog->start_entry(le);
12299
12300 if (!good_fragstat) {
12301 if (pf->fragstat.mtime > frag_info.mtime)
12302 frag_info.mtime = pf->fragstat.mtime;
12303 if (pf->fragstat.change_attr > frag_info.change_attr)
12304 frag_info.change_attr = pf->fragstat.change_attr;
12305 pf->fragstat = frag_info;
12306 mds->locker->mark_updated_scatterlock(&diri->filelock);
12307 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12308 mdr->add_updated_lock(&diri->filelock);
12309 }
12310
12311 if (!good_rstat) {
12312 if (pf->rstat.rctime > nest_info.rctime)
12313 nest_info.rctime = pf->rstat.rctime;
12314 pf->rstat = nest_info;
12315 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12316 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12317 mdr->add_updated_lock(&diri->nestlock);
12318 }
12319
12320 le->metablob.add_dir_context(dir);
12321 le->metablob.add_dir(dir, true);
12322
12323 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12324}
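
// A small worked example of the recomputation above: for a dirfrag holding two
// plain files and one subdirectory (all primary links), the rebuilt fragstat is
// {nfiles = 2, nsubdirs = 1}, and the rebuilt rstat is the sum of the three
// inodes' accounted_rstat (remote dentries only contribute to nfiles).  If both
// stats already match the projected fnode the request returns without
// journalling anything; otherwise only the mismatching stat is overwritten
// before the dirfrag is journalled.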
12325
12326void MDCache::repair_inode_stats(CInode *diri)
12327{
12328 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12329 mdr->pin(diri);
12330 mdr->internal_op_private = diri;
12331 mdr->internal_op_finish = new C_MDSInternalNoop;
12332 repair_inode_stats_work(mdr);
12333}
12334
12335void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12336{
12337 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12338 dout(10) << __func__ << " " << *diri << dendl;
12339
12340 if (!diri->is_auth()) {
12341 mds->server->respond_to_request(mdr, -ESTALE);
12342 return;
12343 }
12344 if (!diri->is_dir()) {
12345 mds->server->respond_to_request(mdr, -ENOTDIR);
12346 return;
12347 }
12348
12349 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12350 std::list<frag_t> frags;
12351
12352 if (mdr->ls) // already marked filelock/nestlock dirty ?
12353 goto do_rdlocks;
12354
12355 rdlocks.insert(&diri->dirfragtreelock);
12356 wrlocks.insert(&diri->nestlock);
12357 wrlocks.insert(&diri->filelock);
12358 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12359 return;
12360
12361 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12362 // the scatter-gather process, which will fix any fragstat/rstat errors.
12363 diri->dirfragtree.get_leaves(frags);
12364 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12365 CDir *dir = diri->get_dirfrag(*p);
12366 if (!dir) {
12367 assert(mdr->is_auth_pinned(diri));
12368 dir = diri->get_or_open_dirfrag(this, *p);
12369 }
12370 if (dir->get_version() == 0) {
12371 assert(dir->is_auth());
12372 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12373 return;
12374 }
12375 }
12376
12377 diri->state_set(CInode::STATE_REPAIRSTATS);
12378 mdr->ls = mds->mdlog->get_current_segment();
12379 mds->locker->mark_updated_scatterlock(&diri->filelock);
12380 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12381 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12382 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12383
12384 mds->locker->drop_locks(mdr.get());
12385
12386do_rdlocks:
12387 // force the scatter-gather process
12388 rdlocks.insert(&diri->dirfragtreelock);
12389 rdlocks.insert(&diri->nestlock);
12390 rdlocks.insert(&diri->filelock);
12391 wrlocks.clear();
12392 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12393 return;
12394
12395 diri->state_clear(CInode::STATE_REPAIRSTATS);
12396
12397 frag_info_t dir_info;
12398 nest_info_t nest_info;
12399 nest_info.rsubdirs++; // it gets one to account for self
12400
12401 diri->dirfragtree.get_leaves(frags);
12402 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12403 CDir *dir = diri->get_dirfrag(*p);
12404 assert(dir);
12405 assert(dir->get_version() > 0);
12406 dir_info.add(dir->fnode.accounted_fragstat);
12407 nest_info.add(dir->fnode.accounted_rstat);
12408 }
12409
12410 if (!dir_info.same_sums(diri->inode.dirstat) ||
12411 !nest_info.same_sums(diri->inode.rstat)) {
12412 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12413 << *diri << dendl;
12414 }
12415
12416 mds->server->respond_to_request(mdr, 0);
12417}
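
// Note on the "+1 for self" above: a directory's rstat.rsubdirs includes the
// directory itself, so a directory whose only child is a subdirectory holding
// one file settles at rstat.rsubdirs == 2 and rstat.rfiles == 1 once the
// scatter-gather completes.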
12418
94b18763 12419void MDCache::flush_dentry(boost::string_view path, Context *fin)
7c673cae
FG
12420{
12421 if (is_readonly()) {
12422 dout(10) << __func__ << ": read-only FS" << dendl;
12423 fin->complete(-EROFS);
12424 return;
12425 }
12426 dout(10) << "flush_dentry " << path << dendl;
12427 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 12428 filepath fp(path);
7c673cae
FG
12429 mdr->set_filepath(fp);
12430 mdr->internal_op_finish = fin;
12431 flush_dentry_work(mdr);
12432}
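
// flush_dentry() sits behind the admin-socket "flush_path" command (assuming
// the Luminous-era command name), e.g.
//
//   ceph daemon mds.<id> flush_path /some/file
//
// which forces the named dentry's dirty metadata to be written back before the
// supplied Context completes.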
12433
12434class C_FinishIOMDR : public MDSInternalContextBase {
12435protected:
12436 MDSRank *mds;
12437 MDRequestRef mdr;
12438 MDSRank *get_mds() override { return mds; }
12439public:
12440 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12441 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12442};
12443
12444void MDCache::flush_dentry_work(MDRequestRef& mdr)
12445{
12446 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12447 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12448 if (NULL == in)
12449 return;
12450
12451 // TODO: Is this necessary? Fix it if so
12452 assert(in->is_auth());
12453 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12454 if (!locked)
12455 return;
12456 in->flush(new C_FinishIOMDR(mds, mdr));
12457}
12458
12459
12460/**
12461 * Initialize performance counters with global perfcounter
12462 * collection.
12463 */
12464void MDCache::register_perfcounters()
12465{
12466 PerfCountersBuilder pcb(g_ceph_context,
12467 "mds_cache", l_mdc_first, l_mdc_last);
12468
12469 /* Stray/purge statistics */
12470 pcb.add_u64(l_mdc_num_strays, "num_strays",
c07f9fc5 12471 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
12472 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12473 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12474
12475 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12476 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12477 "Stray dentries enqueued for purge");
12478 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12479 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12480
12481
12482 /* Recovery queue statistics */
12483 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12484 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
c07f9fc5 12485 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
12486 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12487 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12488 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
c07f9fc5 12489 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 12490
d2e6a577
FG
12491 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12492 "Internal Request type enqueue scrub");
12493 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12494 "Internal Request type export dir");
12495 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12496 "Internal Request type flush");
12497 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12498 "Internal Request type fragmentdir");
12499 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12500 "Internal Request type frag stats");
12501 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12502 "Internal Request type inode stats");
12503
7c673cae
FG
12504 logger.reset(pcb.create_perf_counters());
12505 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12506 recovery_queue.set_logger(logger.get());
12507 stray_manager.set_logger(logger.get());
12508}
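
// The counters registered here live in the "mds_cache" logger, so (assuming the
// standard admin-socket plumbing) they can be inspected at runtime with:
//
//   ceph daemon mds.<id> perf dump
//
// and reading the "mds_cache" section of the output; the PRIO_INTERESTING
// entries are the ones given elevated priority for external reporting.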
12509
12510void MDCache::activate_stray_manager()
12511{
12512 if (open) {
12513 stray_manager.activate();
12514 } else {
12515 wait_for_open(
12516 new MDSInternalContextWrapper(mds,
12517 new FunctionContext([this](int r){
12518 stray_manager.activate();
12519 })
12520 )
12521 );
12522 }
12523}
12524
12525/**
12526 * Call this when putting references to an inode/dentry or
12527 * when attempting to trim it.
12528 *
12529 * If this inode is no longer linked by anyone, and this MDS
12530 * rank holds the primary dentry, and that dentry is in a stray
12531 * directory, then give up the dentry to the StrayManager, never
12532 * to be seen again by MDCache.
12533 *
12534 * @param delay if true, then purgeable inodes are stashed until
12535 * the next trim(), rather than being purged right
12536 * away.
12537 */
12538void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
12539 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12540 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12541 return;
224ce89b 12542
7c673cae
FG
12543 CDentry *dn = in->get_projected_parent_dn();
12544
12545 if (dn->state_test(CDentry::STATE_PURGING)) {
12546 /* We have already entered the purging process, no need
12547 * to re-evaluate this inode. */
12548 return;
12549 }
12550
12551 if (dn->get_projected_linkage()->is_primary() &&
12552 dn->get_dir()->get_inode()->is_stray()) {
12553 stray_manager.eval_stray(dn, delay);
12554 }
12555}
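
// Typical usage, following the contract described above:
//
//   maybe_eval_stray(in);        // after putting a reference: eligible strays purge right away
//   maybe_eval_stray(in, true);  // from trim paths: defer purging until the next trim()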
12556
31f18b77
FG
12557void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12558 dout(10) << __func__ << " " << *diri << dendl;
12559 assert(diri->get_projected_parent_dir()->inode->is_stray());
12560 list<CDir*> ls;
12561 diri->get_dirfrags(ls);
94b18763 12562 for (auto &p : ls) {
31f18b77
FG
12563 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12564 p->try_remove_dentries_for_stray();
12565 }
12566 if (!diri->snaprealm) {
12567 if (diri->is_auth())
12568 diri->clear_dirty_rstat();
12569 diri->clear_scatter_dirty();
12570 }
12571}
12572