ceph/src/mds/MDCache.cc
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
16#include <fstream>
17#include <iostream>
18#include <sstream>
19#include <string>
20#include <boost/utility/string_view.hpp>
21#include <map>
22
23#include "MDCache.h"
24#include "MDSRank.h"
25#include "Server.h"
26#include "Locker.h"
27#include "MDLog.h"
28#include "MDBalancer.h"
29#include "Migrator.h"
30#include "ScrubStack.h"
31
32#include "SnapClient.h"
33
34#include "MDSMap.h"
35
36#include "CInode.h"
37#include "CDir.h"
38
39#include "Mutation.h"
40
41#include "include/ceph_fs.h"
42#include "include/filepath.h"
43#include "include/util.h"
44
45#include "msg/Message.h"
46#include "msg/Messenger.h"
47
48#include "common/MemoryModel.h"
49#include "common/errno.h"
50#include "common/perf_counters.h"
51#include "common/safe_io.h"
52
53#include "osdc/Journaler.h"
54#include "osdc/Filer.h"
55
56#include "events/ESubtreeMap.h"
57#include "events/EUpdate.h"
58#include "events/ESlaveUpdate.h"
59#include "events/EImportFinish.h"
60#include "events/EFragment.h"
61#include "events/ECommitted.h"
62#include "events/ESessions.h"
63
64#include "messages/MGenericMessage.h"
65
66#include "messages/MMDSResolve.h"
67#include "messages/MMDSResolveAck.h"
68#include "messages/MMDSCacheRejoin.h"
69
70#include "messages/MDiscover.h"
71#include "messages/MDiscoverReply.h"
72
73//#include "messages/MInodeUpdate.h"
74#include "messages/MDirUpdate.h"
75#include "messages/MCacheExpire.h"
76
77#include "messages/MInodeFileCaps.h"
78
79#include "messages/MLock.h"
80#include "messages/MDentryLink.h"
81#include "messages/MDentryUnlink.h"
82
83#include "messages/MMDSFindIno.h"
84#include "messages/MMDSFindInoReply.h"
85
86#include "messages/MMDSOpenIno.h"
87#include "messages/MMDSOpenInoReply.h"
88
89#include "messages/MClientRequest.h"
90#include "messages/MClientCaps.h"
91#include "messages/MClientSnap.h"
92#include "messages/MClientQuota.h"
93
94#include "messages/MMDSSlaveRequest.h"
95
96#include "messages/MMDSFragmentNotify.h"
97
98#include "messages/MGatherCaps.h"
99
100#include "InoTable.h"
101
102#include "common/Timer.h"
103
104#include "perfglue/heap_profiler.h"
105
106using namespace std;
107
108#include "common/config.h"
109#include "include/assert.h"
110
111#define dout_context g_ceph_context
112#define dout_subsys ceph_subsys_mds
113#undef dout_prefix
114#define dout_prefix _prefix(_dout, mds)
115static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
116 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
117}
118
119set<int> SimpleLock::empty_gather_set;
120
121
122/**
123 * All non-I/O contexts that require a reference
124 * to an MDCache instance descend from this.
125 */
126class MDCacheContext : public virtual MDSInternalContextBase {
127protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
131 assert(mdcache != NULL);
132 return mdcache->mds;
133 }
134public:
135 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
136};
137
138
139/**
140 * Only for contexts called back from an I/O completion
141 *
142 * Note: duplication of members wrt MDCacheContext, because
143 * it's the lesser of two evils compared with introducing
144 * yet another piece of (multiple) inheritance.
145 */
146class MDCacheIOContext : public virtual MDSIOContextBase {
147protected:
148 MDCache *mdcache;
149 MDSRank *get_mds() override
150 {
151 assert(mdcache != NULL);
152 return mdcache->mds;
153 }
154public:
155 explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
156};
157
158class MDCacheLogContext : public virtual MDSLogContextBase {
159protected:
160 MDCache *mdcache;
161 MDSRank *get_mds() override
162 {
163 assert(mdcache != NULL);
164 return mdcache->mds;
165 }
166public:
167 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
168};
169
170MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
171 mds(m),
172 filer(m->objecter, m->finisher),
173 exceeded_size_limit(false),
174 recovery_queue(m),
175 stray_manager(m, purge_queue_)
176{
177 migrator.reset(new Migrator(mds, this));
178 root = NULL;
179 myin = NULL;
180 readonly = false;
181
182 stray_index = 0;
183 for (int i = 0; i < NUM_STRAY; ++i) {
184 strays[i] = NULL;
185 }
186
187 num_shadow_inodes = 0;
188 num_inodes_with_caps = 0;
189
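  // max bytes per CDir commit: mds_dir_max_commit_size (MiB) if set,
  // otherwise 90% of osd_max_write_size (MiB), both shifted into bytes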
190 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
191 (g_conf->mds_dir_max_commit_size << 20) :
192 (0.9 *(g_conf->osd_max_write_size << 20));
193
194 discover_last_tid = 0;
195 open_ino_last_tid = 0;
196 find_ino_peer_last_tid = 0;
197
198 last_cap_id = 0;
199
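  // client lease durations, in seconds, used when issuing client dentry leases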
200 client_lease_durations[0] = 5.0;
201 client_lease_durations[1] = 30.0;
202 client_lease_durations[2] = 300.0;
203
204 resolves_pending = false;
205 rejoins_pending = false;
206 cap_imports_num_opening = 0;
207
208 opening_root = open = false;
209 lru.lru_set_midpoint(cache_mid());
210
211 bottom_lru.lru_set_midpoint(0);
212
213 decayrate.set_halflife(g_conf->mds_decay_halflife);
214
215 did_shutdown_log_cap = false;
216}
217
218MDCache::~MDCache()
219{
220 if (logger) {
221 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
222 }
223}
224
225
226
227void MDCache::log_stat()
228{
229 mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
230 mds->logger->set(l_mds_inodes, lru.lru_get_size());
231 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
232 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
233 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
234 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
235 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
236 mds->logger->set(l_mds_caps, Capability::count());
237}
238
239
240//
241
242bool MDCache::shutdown()
243{
244 if (lru.lru_get_size() > 0) {
245 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
246 //show_cache();
247 show_subtrees();
248 //dump();
249 }
250 return true;
251}
252
253
254// ====================================================================
255// some inode functions
256
257void MDCache::add_inode(CInode *in)
258{
259 // add to lru, inode map
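  // head inodes (last == CEPH_NOSNAP) are keyed by ino alone; snapped
  // inodes are keyed by (ino, last) in the separate snap_inode_map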
260 if (in->last == CEPH_NOSNAP) {
261 auto &p = inode_map[in->ino()];
262 assert(!p); // should be no dup inos!
263 p = in;
264 } else {
265 auto &p = snap_inode_map[in->vino()];
266 assert(!p); // should be no dup inos!
267 p = in;
268 }
269
270 if (in->ino() < MDS_INO_SYSTEM_BASE) {
271 if (in->ino() == MDS_INO_ROOT)
272 root = in;
273 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
274 myin = in;
275 else if (in->is_stray()) {
276 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
277 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
278 }
279 }
280 if (in->is_base())
281 base_inodes.insert(in);
282 }
283
284 if (cache_toofull()) {
285 exceeded_size_limit = true;
286 }
287}
288
289void MDCache::remove_inode(CInode *o)
290{
291 dout(14) << "remove_inode " << *o << dendl;
292
293 if (o->get_parent_dn()) {
294 // FIXME: multiple parents?
295 CDentry *dn = o->get_parent_dn();
296 assert(!dn->is_dirty());
297 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
298 }
299
300 if (o->is_dirty())
301 o->mark_clean();
302 if (o->is_dirty_parent())
303 o->clear_dirty_parent();
304
305 o->clear_scatter_dirty();
306
307 o->item_open_file.remove_myself();
308
309 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
310 export_pin_queue.erase(o);
311
312 // remove from inode map
313 if (o->last == CEPH_NOSNAP)
314 inode_map.erase(o->ino());
315 else
316 snap_inode_map.erase(o->vino());
317
318 if (o->ino() < MDS_INO_SYSTEM_BASE) {
319 if (o == root) root = 0;
320 if (o == myin) myin = 0;
321 if (o->is_stray()) {
322 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
323 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
324 }
325 }
326 if (o->is_base())
327 base_inodes.erase(o);
328 }
329
330 // delete it
331 assert(o->get_num_ref() == 0);
332 delete o;
333}
334
335file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
336{
337 file_layout_t result = file_layout_t::get_default();
338 result.pool_id = mdsmap.get_first_data_pool();
339 return result;
340}
341
342file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
343{
344 file_layout_t result = file_layout_t::get_default();
345 result.pool_id = mdsmap.get_metadata_pool();
346 if (g_conf->mds_log_segment_size > 0) {
347 result.object_size = g_conf->mds_log_segment_size;
348 result.stripe_unit = g_conf->mds_log_segment_size;
349 }
350 return result;
351}
352
353void MDCache::init_layouts()
354{
355 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
356 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
357}
358
359void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
360 int mode) const
361{
362 in->inode.ino = ino;
363 in->inode.version = 1;
364 in->inode.xattr_version = 1;
365 in->inode.mode = 0500 | mode;
366 in->inode.size = 0;
367 in->inode.ctime =
368 in->inode.mtime =
369 in->inode.btime = ceph_clock_now();
370 in->inode.nlink = 1;
371 in->inode.truncate_size = -1ull;
372 in->inode.change_attr = 0;
373 in->inode.export_pin = MDS_RANK_NONE;
374
375 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
376 if (in->inode.is_dir()) {
377 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
378 ++in->inode.rstat.rsubdirs;
379 } else {
380 in->inode.layout = default_file_layout;
381 ++in->inode.rstat.rfiles;
382 }
383 in->inode.accounted_rstat = in->inode.rstat;
384
385 if (in->is_base()) {
386 if (in->is_root())
387 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
388 else
389 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
390 in->open_snaprealm(); // empty snaprealm
391 assert(!in->snaprealm->parent); // created its own
392 in->snaprealm->srnode.seq = 1;
393 }
394}
395
396CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
397{
398 dout(0) << "creating system inode with ino:" << ino << dendl;
399 CInode *in = new CInode(this);
400 create_unlinked_system_inode(in, ino, mode);
401 add_inode(in);
402 return in;
403}
404
405CInode *MDCache::create_root_inode()
406{
407 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
408 i->inode.uid = g_conf->mds_root_ino_uid;
409 i->inode.gid = g_conf->mds_root_ino_gid;
410 i->inode.layout = default_file_layout;
411 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
412 return i;
413}
414
415void MDCache::create_empty_hierarchy(MDSGather *gather)
416{
417 // create root dir
418 CInode *root = create_root_inode();
419
420 // force empty root dir
421 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
422 adjust_subtree_auth(rootdir, mds->get_nodeid());
423 rootdir->dir_rep = CDir::REP_ALL; //NONE;
424
425 rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
426 rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
427
428 root->inode.dirstat = rootdir->fnode.fragstat;
429 root->inode.rstat = rootdir->fnode.rstat;
430 ++root->inode.rstat.rsubdirs;
431 root->inode.accounted_rstat = root->inode.rstat;
432
433 rootdir->mark_complete();
434 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
435 rootdir->commit(0, gather->new_sub());
436
437 root->mark_clean();
438 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
439 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
440 root->flush(gather->new_sub());
441}
442
443void MDCache::create_mydir_hierarchy(MDSGather *gather)
444{
445 // create mds dir
446 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
447
448 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
449 adjust_subtree_auth(mydir, mds->get_nodeid());
450
451 LogSegment *ls = mds->mdlog->get_current_segment();
452
453 // stray dir
454 for (int i = 0; i < NUM_STRAY; ++i) {
455 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
456 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
457 stringstream name;
458 name << "stray" << i;
459 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
460 sdn->_mark_dirty(mds->mdlog->get_current_segment());
461
462 stray->inode.dirstat = straydir->fnode.fragstat;
463
464 mydir->fnode.rstat.add(stray->inode.rstat);
465 mydir->fnode.fragstat.nsubdirs++;
466 // save them
467 straydir->mark_complete();
468 straydir->mark_dirty(straydir->pre_dirty(), ls);
469 straydir->commit(0, gather->new_sub());
470 stray->mark_dirty_parent(ls, true);
471 stray->store_backtrace(gather->new_sub());
472 }
473
474 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
475 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
476
477 myin->inode.dirstat = mydir->fnode.fragstat;
478 myin->inode.rstat = mydir->fnode.rstat;
479 ++myin->inode.rstat.rsubdirs;
480 myin->inode.accounted_rstat = myin->inode.rstat;
481
482 mydir->mark_complete();
483 mydir->mark_dirty(mydir->pre_dirty(), ls);
484 mydir->commit(0, gather->new_sub());
485
486 myin->store(gather->new_sub());
487}
488
489struct C_MDC_CreateSystemFile : public MDCacheLogContext {
490 MutationRef mut;
491 CDentry *dn;
492 version_t dpv;
493 MDSInternalContextBase *fin;
494 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
495 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
496 void finish(int r) override {
497 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
498 }
499};
500
501void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
502{
503 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
504 CDentry *dn = dir->add_null_dentry(name);
505
506 dn->push_projected_linkage(in);
507 version_t dpv = dn->pre_dirty();
508
509 CDir *mdir = 0;
510 if (in->inode.is_dir()) {
511 in->inode.rstat.rsubdirs = 1;
512
513 mdir = in->get_or_open_dirfrag(this, frag_t());
514 mdir->mark_complete();
515 mdir->pre_dirty();
516 } else
517 in->inode.rstat.rfiles = 1;
518 in->inode.version = dn->pre_dirty();
519
520 SnapRealm *realm = dir->get_inode()->find_snaprealm();
521 dn->first = in->first = realm->get_newest_seq() + 1;
522
523 MutationRef mut(new MutationImpl());
524
525 // force some locks. hacky.
526 mds->locker->wrlock_force(&dir->inode->filelock, mut);
527 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
528
529 mut->ls = mds->mdlog->get_current_segment();
530 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
531 mds->mdlog->start_entry(le);
532
533 if (!in->is_mdsdir()) {
534 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
535 le->metablob.add_primary_dentry(dn, in, true);
536 } else {
537 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
538 journal_dirty_inode(mut.get(), &le->metablob, in);
539 dn->push_projected_linkage(in->ino(), in->d_type());
540 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
541 le->metablob.add_root(true, in);
542 }
543 if (mdir)
544 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
545
546 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
547 mds->mdlog->flush();
548}
549
550void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
551{
552 dout(10) << "_create_system_file_finish " << *dn << dendl;
553
554 dn->pop_projected_linkage();
555 dn->mark_dirty(dpv, mut->ls);
556
557 CInode *in = dn->get_linkage()->get_inode();
558 in->inode.version--;
559 in->mark_dirty(in->inode.version + 1, mut->ls);
560
561 if (in->inode.is_dir()) {
562 CDir *dir = in->get_dirfrag(frag_t());
563 assert(dir);
564 dir->mark_dirty(1, mut->ls);
565 dir->mark_new(mut->ls);
566 }
567
568 mut->apply();
569 mds->locker->drop_locks(mut.get());
570 mut->cleanup();
571
572 fin->complete(0);
573
574 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
575 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
576}
577
578
579
580struct C_MDS_RetryOpenRoot : public MDSInternalContext {
581 MDCache *cache;
582 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
583 void finish(int r) override {
584 if (r < 0) {
585 // If we can't open root, something disastrous has happened: mark
586 // this rank damaged for operator intervention. Note that
587 // it is not okay to call suicide() here because we are in
588 // a Finisher callback.
589 cache->mds->damaged();
590 ceph_abort(); // damaged should never return
591 } else {
592 cache->open_root();
593 }
594 }
595};
596
597void MDCache::open_root_inode(MDSInternalContextBase *c)
598{
599 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
600 CInode *in;
601 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
602 in->fetch(c);
603 } else {
604 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
605 }
606}
607
608void MDCache::open_mydir_inode(MDSInternalContextBase *c)
609{
610 MDSGatherBuilder gather(g_ceph_context);
611
612 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
613 in->fetch(gather.new_sub());
614
615 gather.set_finisher(c);
616 gather.activate();
617}
618
619void MDCache::open_mydir_frag(MDSInternalContextBase *c)
620{
621 open_mydir_inode(
622 new MDSInternalContextWrapper(mds,
623 new FunctionContext([this, c](int r) {
624 if (r < 0) {
625 c->complete(r);
626 return;
627 }
628 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
629 assert(mydir);
630 adjust_subtree_auth(mydir, mds->get_nodeid());
631 mydir->fetch(c);
632 })
633 )
634 );
635}
636
637void MDCache::open_root()
638{
639 dout(10) << "open_root" << dendl;
640
641 if (!root) {
642 open_root_inode(new C_MDS_RetryOpenRoot(this));
643 return;
644 }
645 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
646 assert(root->is_auth());
647 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
648 assert(rootdir);
649 if (!rootdir->is_subtree_root())
650 adjust_subtree_auth(rootdir, mds->get_nodeid());
651 if (!rootdir->is_complete()) {
652 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
653 return;
654 }
655 } else {
656 assert(!root->is_auth());
657 CDir *rootdir = root->get_dirfrag(frag_t());
658 if (!rootdir) {
659 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
660 return;
661 }
662 }
663
664 if (!myin) {
665 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
666 in->fetch(new C_MDS_RetryOpenRoot(this));
667 return;
668 }
669 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
670 assert(mydir);
671 adjust_subtree_auth(mydir, mds->get_nodeid());
672
673 populate_mydir();
674}
675
676void MDCache::populate_mydir()
677{
678 assert(myin);
679 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
680 assert(mydir);
681
682 dout(10) << "populate_mydir " << *mydir << dendl;
683
684 if (!mydir->is_complete()) {
685 mydir->fetch(new C_MDS_RetryOpenRoot(this));
686 return;
687 }
688
689 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
690 // A missing dirfrag, we will recreate it. Before that, we must dirty
691 // it before dirtying any of the strays we create within it.
692 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
693 "recreating it now";
694 LogSegment *ls = mds->mdlog->get_current_segment();
695 mydir->state_clear(CDir::STATE_BADFRAG);
696 mydir->mark_complete();
697 mydir->mark_dirty(mydir->pre_dirty(), ls);
698 }
699
700 // open or create stray
701 uint64_t num_strays = 0;
702 for (int i = 0; i < NUM_STRAY; ++i) {
703 stringstream name;
704 name << "stray" << i;
705 CDentry *straydn = mydir->lookup(name.str());
706
707 // allow for older fs's with stray instead of stray0
708 if (straydn == NULL && i == 0)
709 straydn = mydir->lookup("stray");
710
711 if (!straydn || !straydn->get_linkage()->get_inode()) {
712 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
713 new C_MDS_RetryOpenRoot(this));
714 return;
715 }
716 assert(straydn);
717 assert(strays[i]);
718 // we make multiple passes through this method; make sure we only pin each stray once.
719 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
720 strays[i]->get(CInode::PIN_STRAY);
721 strays[i]->state_set(CInode::STATE_STRAYPINNED);
722 strays[i]->get_stickydirs();
723 }
724 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
725
726 // open all frags
727 list<frag_t> ls;
728 strays[i]->dirfragtree.get_leaves(ls);
729 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
730 frag_t fg = *p;
731 CDir *dir = strays[i]->get_dirfrag(fg);
732 if (!dir) {
733 dir = strays[i]->get_or_open_dirfrag(this, fg);
734 }
735
736 // DamageTable applies special handling to strays: it will
737 // have damaged() us out if one is damaged.
738 assert(!dir->state_test(CDir::STATE_BADFRAG));
739
740 if (dir->get_version() == 0) {
741 dir->fetch(new C_MDS_RetryOpenRoot(this));
742 return;
743 }
744
745 if (dir->get_frag_size() > 0)
746 num_strays += dir->get_frag_size();
747 }
748 }
749
750 stray_manager.set_num_strays(num_strays);
751
752 // okay!
753 dout(10) << "populate_mydir done" << dendl;
754 assert(!open);
755 open = true;
756 mds->queue_waiters(waiting_for_open);
757
758 scan_stray_dir();
759}
760
761void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
762{
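  // an mdsdir ino encodes its owning rank in the low bits, so masking with
  // MAX_MDS-1 recovers the auth mds to discover from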
763 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
764}
765
766CDir *MDCache::get_stray_dir(CInode *in)
767{
768 string straydname;
769 in->name_stray_dentry(straydname);
770
771 CInode *strayi = get_stray();
772 assert(strayi);
773 frag_t fg = strayi->pick_dirfrag(straydname);
774 CDir *straydir = strayi->get_dirfrag(fg);
775 assert(straydir);
776 return straydir;
777}
778
779CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
780{
781 CDir *straydir = get_stray_dir(in);
782 string straydname;
783 in->name_stray_dentry(straydname);
784 CDentry *straydn = straydir->lookup(straydname);
785 if (!straydn) {
786 straydn = straydir->add_null_dentry(straydname);
787 straydn->mark_new();
788 } else {
789 assert(straydn->get_projected_linkage()->is_null());
790 }
791
792 straydn->state_set(CDentry::STATE_STRAY);
793 return straydn;
794}
795
796
797
798MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
799{
800 // inode?
801 if (info.ino)
802 return get_inode(info.ino, info.snapid);
803
804 // dir or dentry.
805 CDir *dir = get_dirfrag(info.dirfrag);
806 if (!dir) return 0;
807
808 if (info.dname.length())
809 return dir->lookup(info.dname, info.snapid);
810 else
811 return dir;
812}
813
814
815
816
817// ====================================================================
818// subtree management
819
820void MDCache::list_subtrees(list<CDir*>& ls)
821{
822 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
823 p != subtrees.end();
824 ++p)
825 ls.push_back(p->first);
826}
827
828/*
829 * adjust the dir_auth of a subtree.
830 * merge with parent and/or child subtrees, if it is appropriate.
831 * merge can ONLY happen if both parent and child have unambiguous auth.
832 */
833void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
834{
835 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
836 << " on " << *dir << dendl;
837
838 show_subtrees();
839
840 CDir *root;
841 if (dir->inode->is_base()) {
842 root = dir; // bootstrap hack.
843 if (subtrees.count(root) == 0) {
844 subtrees[root];
845 root->get(CDir::PIN_SUBTREE);
846 }
847 } else {
848 root = get_subtree_root(dir); // subtree root
849 }
850 assert(root);
851 assert(subtrees.count(root));
852 dout(7) << " current root is " << *root << dendl;
853
854 if (root == dir) {
855 // i am already a subtree.
856 dir->set_dir_auth(auth);
857 } else {
858 // i am a new subtree.
859 dout(10) << " new subtree at " << *dir << dendl;
860 assert(subtrees.count(dir) == 0);
861 subtrees[dir]; // create empty subtree bounds list for me.
862 dir->get(CDir::PIN_SUBTREE);
863
864 // set dir_auth
865 dir->set_dir_auth(auth);
866
867 // move items nested beneath me, under me.
868 set<CDir*>::iterator p = subtrees[root].begin();
869 while (p != subtrees[root].end()) {
870 set<CDir*>::iterator next = p;
871 ++next;
872 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
873 // move under me
874 dout(10) << " claiming child bound " << **p << dendl;
875 subtrees[dir].insert(*p);
876 subtrees[root].erase(p);
877 }
878 p = next;
879 }
880
881 // i am a bound of the parent subtree.
882 subtrees[root].insert(dir);
883
884 // i am now the subtree root.
885 root = dir;
886
887 // adjust recursive pop counters
888 if (adjust_pop && dir->is_auth()) {
889 utime_t now = ceph_clock_now();
890 CDir *p = dir->get_parent_dir();
891 while (p) {
892 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
893 if (p->is_subtree_root()) break;
894 p = p->inode->get_parent_dir();
895 }
896 }
897 }
898
899 show_subtrees();
900}
901
902
903void MDCache::try_subtree_merge(CDir *dir)
904{
905 dout(7) << "try_subtree_merge " << *dir << dendl;
906 // record my old bounds
907 auto oldbounds = subtrees.at(dir);
908
909 set<CInode*> to_eval;
910 // try merge at my root
911 try_subtree_merge_at(dir, &to_eval);
912
913 // try merge at my old bounds
914 for (auto bound : oldbounds)
915 try_subtree_merge_at(bound, &to_eval);
916
917 if (!(mds->is_any_replay() || mds->is_resolve())) {
918 for(auto in : to_eval)
919 eval_subtree_root(in);
920 }
921}
922
923class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
924 CInode *in;
925 MutationRef mut;
926public:
927 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
928 void finish(int r) override {
929 mdcache->subtree_merge_writebehind_finish(in, mut);
930 }
931};
932
933void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
934{
935 dout(10) << "try_subtree_merge_at " << *dir << dendl;
936
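  // a subtree can only merge into its parent if its auth is unambiguous and
  // it isn't pinned as an export bound or auxiliary subtree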
937 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
938 dir->state_test(CDir::STATE_EXPORTBOUND) ||
939 dir->state_test(CDir::STATE_AUXSUBTREE))
940 return;
941
942 auto it = subtrees.find(dir);
943 assert(it != subtrees.end());
944
945 // merge with parent?
946 CDir *parent = dir;
947 if (!dir->inode->is_base())
948 parent = get_subtree_root(dir->get_parent_dir());
949
950 if (parent != dir && // we have a parent,
951 parent->dir_auth == dir->dir_auth) { // auth matches,
952 // merge with parent.
953 dout(10) << " subtree merge at " << *dir << dendl;
954 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
955
956 // move our bounds under the parent
957 subtrees[parent].insert(it->second.begin(), it->second.end());
958
959 // we are no longer a subtree or bound
960 dir->put(CDir::PIN_SUBTREE);
961 subtrees.erase(it);
962 subtrees[parent].erase(dir);
963
964 // adjust popularity?
965 if (adjust_pop && dir->is_auth()) {
966 utime_t now = ceph_clock_now();
967 CDir *cur = dir;
968 CDir *p = dir->get_parent_dir();
969 while (p) {
970 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
971 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
972 if (p->is_subtree_root()) break;
973 cur = p;
974 p = p->inode->get_parent_dir();
975 }
976 }
977
978 if (to_eval && dir->get_inode()->is_auth())
979 to_eval->insert(dir->get_inode());
980
981 show_subtrees(15);
982 }
983}
984
985void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
986{
987 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
988 in->pop_and_dirty_projected_inode(mut->ls);
989
990 mut->apply();
991 mds->locker->drop_locks(mut.get());
992 mut->cleanup();
993
994 in->auth_unpin(this);
995}
996
997void MDCache::eval_subtree_root(CInode *diri)
998{
999 // evaluate subtree inode filelock?
1000 // (we should scatter the filelock on subtree bounds)
1001 assert(diri->is_auth());
1002 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1003}
1004
1005
1006void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
1007{
1008 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1009 << " on " << *dir
1010 << " bounds " << bounds
1011 << dendl;
1012
1013 show_subtrees();
1014
1015 CDir *root;
1016 if (dir->ino() == MDS_INO_ROOT) {
1017 root = dir; // bootstrap hack.
1018 if (subtrees.count(root) == 0) {
1019 subtrees[root];
1020 root->get(CDir::PIN_SUBTREE);
1021 }
1022 } else {
1023 root = get_subtree_root(dir); // subtree root
1024 }
1025 assert(root);
1026 assert(subtrees.count(root));
1027 dout(7) << " current root is " << *root << dendl;
1028
1029 mds_authority_t oldauth = dir->authority();
1030
1031 if (root == dir) {
1032 // i am already a subtree.
1033 dir->set_dir_auth(auth);
1034 } else {
1035 // i am a new subtree.
1036 dout(10) << " new subtree at " << *dir << dendl;
1037 assert(subtrees.count(dir) == 0);
1038 subtrees[dir]; // create empty subtree bounds list for me.
1039 dir->get(CDir::PIN_SUBTREE);
1040
1041 // set dir_auth
1042 dir->set_dir_auth(auth);
1043
1044 // move items nested beneath me, under me.
1045 set<CDir*>::iterator p = subtrees[root].begin();
1046 while (p != subtrees[root].end()) {
1047 set<CDir*>::iterator next = p;
1048 ++next;
1049 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1050 // move under me
1051 dout(10) << " claiming child bound " << **p << dendl;
1052 subtrees[dir].insert(*p);
1053 subtrees[root].erase(p);
1054 }
1055 p = next;
1056 }
1057
1058 // i am a bound of the parent subtree.
1059 subtrees[root].insert(dir);
1060
1061 // i am now the subtree root.
1062 root = dir;
1063 }
1064
1065 set<CInode*> to_eval;
1066
1067 // verify/adjust bounds.
1068 // - these may be new, or
1069 // - beneath existing ambiguous bounds (which will be collapsed),
1070 // - but NOT beneath unambiguous bounds.
1071 for (set<CDir*>::iterator p = bounds.begin();
1072 p != bounds.end();
1073 ++p) {
1074 CDir *bound = *p;
1075
1076 // new bound?
1077 if (subtrees[dir].count(bound) == 0) {
1078 if (get_subtree_root(bound) == dir) {
1079 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1080 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1081 }
1082 else {
1083 dout(10) << " want bound " << *bound << dendl;
1084 CDir *t = get_subtree_root(bound->get_parent_dir());
1085 if (subtrees[t].count(bound) == 0) {
1086 assert(t != dir);
1087 dout(10) << " new bound " << *bound << dendl;
1088 adjust_subtree_auth(bound, t->authority());
1089 }
1090 // make sure it's nested beneath ambiguous subtree(s)
1091 while (1) {
1092 while (subtrees[dir].count(t) == 0)
1093 t = get_subtree_root(t->get_parent_dir());
1094 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1095 adjust_subtree_auth(t, auth);
1096 try_subtree_merge_at(t, &to_eval);
1097 t = get_subtree_root(bound->get_parent_dir());
1098 if (t == dir) break;
1099 }
1100 }
1101 }
1102 else {
1103 dout(10) << " already have bound " << *bound << dendl;
1104 }
1105 }
1106 // merge stray bounds?
1107 while (!subtrees[dir].empty()) {
1108 set<CDir*> copy = subtrees[dir];
1109 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1110 if (bounds.count(*p) == 0) {
1111 CDir *stray = *p;
1112 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1113 adjust_subtree_auth(stray, auth);
1114 try_subtree_merge_at(stray, &to_eval);
1115 }
1116 }
1117 // swallowing subtree may add new subtree bounds
1118 if (copy == subtrees[dir])
1119 break;
1120 }
1121
1122 // bound should now match.
1123 verify_subtree_bounds(dir, bounds);
1124
1125 show_subtrees();
1126
1127 if (!(mds->is_any_replay() || mds->is_resolve())) {
1128 for(auto in : to_eval)
1129 eval_subtree_root(in);
1130 }
1131}
1132
1133
1134/*
1135 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1136 * fragmentation as necessary to get an equivalent bounding set. That is, only
1137 * split if one of our frags spans the provided bounding set. Never merge.
1138 */
1139void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1140{
1141 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1142
1143 // sort by ino
1144 map<inodeno_t, fragset_t> byino;
1145 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1146 byino[p->ino].insert(p->frag);
1147 dout(10) << " by ino: " << byino << dendl;
1148
1149 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1150 CInode *diri = get_inode(p->first);
1151 if (!diri)
1152 continue;
1153 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1154
1155 fragtree_t tmpdft;
1156 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1157 tmpdft.force_to_leaf(g_ceph_context, *q);
1158
1159 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1160 frag_t fg = *q;
1161 list<frag_t> fgls;
1162 diri->dirfragtree.get_leaves_under(fg, fgls);
1163 if (fgls.empty()) {
1164 bool all = true;
1165 frag_t approx_fg = diri->dirfragtree[fg.value()];
1166 list<frag_t> ls;
1167 tmpdft.get_leaves_under(approx_fg, ls);
1168 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1169 if (p->second.get().count(*r) == 0) {
1170 // not bound, so the resolve message is from auth MDS of the dirfrag
1171 force_dir_fragment(diri, *r);
1172 all = false;
1173 }
1174 }
1175 if (all)
1176 fgls.push_back(approx_fg);
1177 else
1178 diri->dirfragtree.get_leaves_under(fg, fgls);
1179 }
1180 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1181 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1182 CDir *dir = diri->get_dirfrag(*r);
1183 if (dir)
1184 bounds.insert(dir);
1185 }
1186 }
1187 }
1188}
1189
1190void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1191{
1192 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1193 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1194
1195 set<CDir*> bounds;
1196 get_force_dirfrag_bound_set(bound_dfs, bounds);
1197 adjust_bounded_subtree_auth(dir, bounds, auth);
1198}
1199
1200void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1201{
1202 dout(10) << "map_dirfrag_set " << dfs << dendl;
1203
1204 // group by inode
1205 map<inodeno_t, fragset_t> ino_fragset;
1206 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1207 ino_fragset[p->ino].insert(p->frag);
1208
1209 // get frags
1210 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1211 p != ino_fragset.end();
1212 ++p) {
1213 CInode *in = get_inode(p->first);
1214 if (!in)
1215 continue;
1216
1217 list<frag_t> fglist;
1218 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1219 in->dirfragtree.get_leaves_under(*q, fglist);
1220
1221 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1222 << " on " << *in << dendl;
1223
1224 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1225 CDir *dir = in->get_dirfrag(*q);
1226 if (dir)
1227 result.insert(dir);
1228 }
1229 }
1230}
1231
1232
1233
1234CDir *MDCache::get_subtree_root(CDir *dir)
1235{
1236 // find the underlying dir that delegates (or is about to delegate) auth
1237 while (true) {
1238 if (dir->is_subtree_root())
1239 return dir;
1240 dir = dir->get_inode()->get_parent_dir();
1241 if (!dir)
1242 return 0; // none
1243 }
1244}
1245
1246CDir *MDCache::get_projected_subtree_root(CDir *dir)
1247{
1248 // find the underlying dir that delegates (or is about to delegate) auth
1249 while (true) {
1250 if (dir->is_subtree_root())
1251 return dir;
1252 dir = dir->get_inode()->get_projected_parent_dir();
1253 if (!dir)
1254 return 0; // none
1255 }
1256}
1257
1258void MDCache::remove_subtree(CDir *dir)
1259{
1260 dout(10) << "remove_subtree " << *dir << dendl;
1261 assert(subtrees.count(dir));
1262 assert(subtrees[dir].empty());
1263 subtrees.erase(dir);
1264 dir->put(CDir::PIN_SUBTREE);
1265 if (dir->get_parent_dir()) {
1266 CDir *p = get_subtree_root(dir->get_parent_dir());
1267 assert(subtrees[p].count(dir));
1268 subtrees[p].erase(dir);
1269 }
1270}
1271
1272void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1273{
1274 assert(subtrees.count(dir));
1275 bounds = subtrees[dir];
1276}
1277
1278void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1279{
1280 if (subtrees.count(dir)) {
1281 // just copy them, dir is a subtree.
1282 get_subtree_bounds(dir, bounds);
1283 } else {
1284 // find them
1285 CDir *root = get_subtree_root(dir);
1286 for (set<CDir*>::iterator p = subtrees[root].begin();
1287 p != subtrees[root].end();
1288 ++p) {
1289 CDir *t = *p;
1290 while (t != root) {
1291 t = t->get_parent_dir();
1292 assert(t);
1293 if (t == dir) {
1294 bounds.insert(*p);
1295 continue;
1296 }
1297 }
1298 }
1299 }
1300}
1301
1302void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1303{
1304 // for debugging only.
1305 assert(subtrees.count(dir));
1306 if (bounds != subtrees[dir]) {
1307 dout(0) << "verify_subtree_bounds failed" << dendl;
1308 set<CDir*> b = bounds;
1309 for (auto &cd : subtrees[dir]) {
1310 if (bounds.count(cd)) {
1311 b.erase(cd);
1312 continue;
1313 }
1314 dout(0) << " missing bound " << *cd << dendl;
1315 }
1316 for (const auto &cd : b)
1317 dout(0) << " extra bound " << *cd << dendl;
1318 }
1319 assert(bounds == subtrees[dir]);
1320}
1321
1322void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1323{
1324 // for debugging only.
1325 assert(subtrees.count(dir));
1326
1327 // make sure that any bounds i do have are properly noted as such.
1328 int failed = 0;
1329 for (const auto &fg : bounds) {
1330 CDir *bd = get_dirfrag(fg);
1331 if (!bd) continue;
1332 if (subtrees[dir].count(bd) == 0) {
1333 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1334 failed++;
1335 }
1336 }
1337 assert(failed == 0);
1338}
1339
1340void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1341{
1342 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1343 << " to " << *newdir << dendl;
1344 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1345}
1346
1347void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1348{
1349 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1350
1351 //show_subtrees();
1352 utime_t now = ceph_clock_now();
1353
1354 CDir *newdir = diri->get_parent_dir();
1355
1356 if (pop) {
1357 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1358 assert(p != projected_subtree_renames.end());
1359 assert(!p->second.empty());
1360 assert(p->second.front().first == olddir);
1361 assert(p->second.front().second == newdir);
1362 p->second.pop_front();
1363 if (p->second.empty())
1364 projected_subtree_renames.erase(p);
1365 }
1366
1367 // adjust subtree
1368 list<CDir*> dfls;
1369 // make sure subtree dirfrags are at the front of the list
1370 diri->get_subtree_dirfrags(dfls);
1371 diri->get_nested_dirfrags(dfls);
1372 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1373 CDir *dir = *p;
1374
1375 dout(10) << "dirfrag " << *dir << dendl;
1376 CDir *oldparent = get_subtree_root(olddir);
1377 dout(10) << " old parent " << *oldparent << dendl;
1378 CDir *newparent = get_subtree_root(newdir);
1379 dout(10) << " new parent " << *newparent << dendl;
1380
1381 if (olddir != newdir)
1382 mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
1383
1384 if (oldparent == newparent) {
1385 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1386 } else if (dir->is_subtree_root()) {
1387 // children are fine. change parent.
1388 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1389 assert(subtrees[oldparent].count(dir));
1390 subtrees[oldparent].erase(dir);
1391 assert(subtrees.count(newparent));
1392 subtrees[newparent].insert(dir);
1393 // caller is responsible for 'eval diri'
1394 try_subtree_merge_at(dir, NULL, false);
1395 } else {
1396 // mid-subtree.
1397
1398 // see if any old bounds move to the new parent.
1399 list<CDir*> tomove;
1400 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1401 p != subtrees[oldparent].end();
1402 ++p) {
1403 CDir *bound = *p;
1404 CDir *broot = get_subtree_root(bound->get_parent_dir());
1405 if (broot != oldparent) {
1406 assert(broot == newparent);
1407 tomove.push_back(bound);
1408 }
1409 }
1410 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1411 CDir *bound = *p;
1412 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1413 subtrees[oldparent].erase(bound);
1414 subtrees[newparent].insert(bound);
1415 }
1416
1417 // did auth change?
1418 if (oldparent->authority() != newparent->authority()) {
1419 adjust_subtree_auth(dir, oldparent->authority(), false);
1420 // caller is responsible for 'eval diri'
1421 try_subtree_merge_at(dir, NULL, false);
1422 }
1423 }
1424
1425 if (olddir != newdir)
1426 mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
1427 }
1428
1429 show_subtrees();
1430}
1431
1432
1433void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1434{
1435 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1436 p != subtrees.end();
1437 ++p) {
1438 CDir *root = p->first;
1439 if (root->is_full_dir_auth())
1440 s.insert(root);
1441 }
1442}
1443void MDCache::get_auth_subtrees(set<CDir*>& s)
1444{
1445 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1446 p != subtrees.end();
1447 ++p) {
1448 CDir *root = p->first;
1449 if (root->is_auth())
1450 s.insert(root);
1451 }
1452}
1453
1454
1455// count.
1456
1457int MDCache::num_subtrees()
1458{
1459 return subtrees.size();
1460}
1461
1462int MDCache::num_subtrees_fullauth()
1463{
1464 int n = 0;
1465 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1466 p != subtrees.end();
1467 ++p) {
1468 CDir *root = p->first;
1469 if (root->is_full_dir_auth())
1470 n++;
1471 }
1472 return n;
1473}
1474
1475int MDCache::num_subtrees_fullnonauth()
1476{
1477 int n = 0;
1478 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1479 p != subtrees.end();
1480 ++p) {
1481 CDir *root = p->first;
1482 if (root->is_full_dir_nonauth())
1483 n++;
1484 }
1485 return n;
1486}
1487
1488
1489
1490// ===================================
1491// journal and snap/cow helpers
1492
1493
1494/*
1495 * find first inode in cache that follows given snapid. otherwise, return current.
1496 */
1497CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1498{
1499 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1500 assert(in->last == CEPH_NOSNAP);
1501
1502 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1503 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1504 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1505 in = p->second;
1506 }
1507
1508 return in;
1509}
1510
1511
1512/*
1513 * note: i'm currently cheating wrt dirty and inode.version on cow
1514 * items. instead of doing a full dir predirty, i just take the
1515 * original item's version, and set the dirty flag (via
1516 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1517 * means a special case in the dir commit clean sweep assertions.
1518 * bah.
1519 */
1520CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1521{
1522 assert(last >= in->first);
1523
1524 CInode *oldin = new CInode(this, true, in->first, last);
1525 oldin->inode = *in->get_previous_projected_inode();
1526 oldin->symlink = in->symlink;
1527 oldin->xattrs = *in->get_previous_projected_xattrs();
1528 oldin->inode.trim_client_ranges(last);
1529
1530 if (in->first < in->oldest_snap)
1531 in->oldest_snap = in->first;
1532
1533 in->first = last+1;
1534
1535 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1536 add_inode(oldin);
1537
1538 if (in->last != CEPH_NOSNAP) {
1539 CInode *head_in = get_inode(in->ino());
1540 assert(head_in);
1541 if (head_in->split_need_snapflush(oldin, in)) {
1542 oldin->client_snap_caps = in->client_snap_caps;
1543 for (const auto &p : in->client_snap_caps) {
1544 SimpleLock *lock = oldin->get_lock(p.first);
1545 assert(lock);
1546 for (const auto &q : p.second) {
1547 oldin->auth_pin(lock);
1548 lock->set_state(LOCK_SNAP_SYNC); // gathering
1549 lock->get_wrlock(true);
1550 (void)q; /* unused */
1551 }
1552 }
1553 }
1554 return oldin;
1555 }
1556
1557 if (!in->client_caps.empty()) {
1558 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1559 // clone caps?
1560 for (auto &p : in->client_caps) {
1561 client_t client = p.first;
1562 Capability *cap = p.second;
1563 int issued = cap->issued();
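 // only clients holding writable caps that have not yet flushed past this
 // snap (client_follows < last) need snap cap state recorded on oldin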
1564 if ((issued & CEPH_CAP_ANY_WR) &&
1565 cap->client_follows < last) {
1566 // note in oldin
1567 for (int i = 0; i < num_cinode_locks; i++) {
1568 if (issued & cinode_lock_info[i].wr_caps) {
1569 int lockid = cinode_lock_info[i].lock;
1570 SimpleLock *lock = oldin->get_lock(lockid);
1571 assert(lock);
1572 oldin->client_snap_caps[lockid].insert(client);
1573 oldin->auth_pin(lock);
1574 lock->set_state(LOCK_SNAP_SYNC); // gathering
1575 lock->get_wrlock(true);
1576 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1577 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1578 }
1579 }
1580 cap->client_follows = last;
1581
1582 // we need snapflushes for any intervening snaps
1583 dout(10) << " snaps " << snaps << dendl;
1584 for (auto q = snaps.lower_bound(oldin->first);
1585 q != snaps.end() && *q <= last;
1586 ++q) {
1587 in->add_need_snapflush(oldin, *q, client);
1588 }
1589 } else {
1590 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1591 }
1592 }
1593 }
1594 return oldin;
1595}
1596
1597void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1598 CDentry *dn, snapid_t follows,
1599 CInode **pcow_inode, CDentry::linkage_t *dnl)
1600{
1601 if (!dn) {
1602 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1603 return;
1604 }
1605 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1606 assert(dn->is_auth());
1607
1608 // nothing to cow on a null dentry, fix caller
1609 if (!dnl)
1610 dnl = dn->get_projected_linkage();
1611 assert(!dnl->is_null());
1612
1613 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1614 // multiversion inode.
1615 CInode *in = dnl->get_inode();
1616 SnapRealm *realm = NULL;
1617
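    // dn is not the inode's projected parent dentry (e.g. the old dentry of a
    // rename in progress), so judge snaps against the dentry's directory realm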
1618 if (in->get_projected_parent_dn() != dn) {
1619 assert(follows == CEPH_NOSNAP);
1620 realm = dn->dir->inode->find_snaprealm();
1621 snapid_t dir_follows = realm->get_newest_snap();
1622
1623 if (dir_follows+1 > dn->first) {
1624 snapid_t oldfirst = dn->first;
1625 dn->first = dir_follows+1;
1626 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1627 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
1628 oldfirst, dir_follows);
1629 olddn->pre_dirty();
1630 dout(10) << " olddn " << *olddn << dendl;
1631 metablob->add_remote_dentry(olddn, true);
1632 mut->add_cow_dentry(olddn);
1633 // FIXME: adjust link count here? hmm.
1634
1635 if (dir_follows+1 > in->first)
1636 in->cow_old_inode(dir_follows, false);
1637 }
1638 }
1639
1640 if (in->snaprealm) {
1641 realm = in->snaprealm;
1642 follows = realm->get_newest_seq();
1643 } else
1644 follows = dir_follows;
1645 } else {
1646 realm = in->find_snaprealm();
1647 if (follows == CEPH_NOSNAP)
1648 follows = realm->get_newest_seq();
1649 }
1650
1651 // already cloned?
1652 if (follows < in->first) {
1653 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1654 return;
1655 }
1656
1657 if (!realm->has_snaps_in_range(in->first, follows)) {
1658 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1659 in->first = follows + 1;
1660 return;
1661 }
1662
1663 in->cow_old_inode(follows, false);
1664
1665 } else {
1666 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1667 if (follows == CEPH_NOSNAP)
1668 follows = realm->get_newest_seq();
1669
1670 // already cloned?
1671 if (follows < dn->first) {
1672 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1673 return;
1674 }
1675
1676 // update dn.first before adding old dentry to cdir's map
1677 snapid_t oldfirst = dn->first;
1678 dn->first = follows+1;
1679
1680 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1681
1682 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1683 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1684 if (in)
1685 in->first = follows+1;
1686 return;
1687 }
1688
1689 dout(10) << " dn " << *dn << dendl;
1690 if (in) {
1691 CInode *oldin = cow_inode(in, follows);
1692 mut->add_cow_inode(oldin);
1693 if (pcow_inode)
1694 *pcow_inode = oldin;
1695 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
1696 oldin->inode.version = olddn->pre_dirty();
1697 dout(10) << " olddn " << *olddn << dendl;
1698 bool need_snapflush = !oldin->client_snap_caps.empty();
1699 if (need_snapflush)
1700 mut->ls->open_files.push_back(&oldin->item_open_file);
1701 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1702 mut->add_cow_dentry(olddn);
1703 } else {
1704 assert(dnl->is_remote());
1705 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
1706 oldfirst, follows);
1707 olddn->pre_dirty();
1708 dout(10) << " olddn " << *olddn << dendl;
1709 metablob->add_remote_dentry(olddn, true);
1710 mut->add_cow_dentry(olddn);
1711 }
1712 }
1713}
1714
1715
1716void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1717 CInode *in, snapid_t follows,
1718 CInode **pcow_inode)
1719{
1720 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1721 CDentry *dn = in->get_projected_parent_dn();
1722 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1723}
1724
1725void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1726{
1727 if (in->is_base()) {
1728 metablob->add_root(true, in, in->get_projected_inode());
1729 } else {
1730 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1731 follows = in->first - 1;
1732 CDentry *dn = in->get_projected_parent_dn();
1733 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1734 journal_cow_dentry(mut, metablob, dn, follows);
1735 if (in->get_projected_inode()->is_backtrace_updated()) {
1736 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1737 in->get_previous_projected_inode()->layout.pool_id;
1738 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1739 } else {
1740 metablob->add_primary_dentry(dn, in, true);
1741 }
1742 }
1743}
1744
1745
1746
1747// nested ---------------------------------------------------------------
1748
1749void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1750 int linkunlink, SnapRealm *prealm)
1751{
1752 CDentry *parentdn = cur->get_projected_parent_dn();
1753 CInode::mempool_inode *curi = cur->get_projected_inode();
1754
1755 if (cur->first > first)
1756 first = cur->first;
1757
1758 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1759 << " " << *cur << dendl;
1760 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1761 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1762
1763 /*
1764 * FIXME. this incompletely propagates rstats to _old_ parents
1765 * (i.e. shortly after a directory rename). but we need full
1766 * blown hard link backpointers to make this work properly...
1767 */
1768 snapid_t floor = parentdn->first;
1769 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1770
1771 if (!prealm)
1772 prealm = parent->inode->find_snaprealm();
1773 const set<snapid_t> snaps = prealm->get_snaps();
1774
1775 if (cur->last != CEPH_NOSNAP) {
1776 assert(cur->dirty_old_rstats.empty());
1777 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1778 if (q == snaps.end() || *q > cur->last)
1779 return;
1780 }
1781
1782 if (cur->last >= floor) {
1783 bool update = true;
1784 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1785 // rename src inode is not projected in the slave rename prep case. so we should
1786 // avoid updating the inode.
1787 assert(linkunlink < 0);
1788 assert(cur->is_frozen_inode());
1789 update = false;
1790 }
1791 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1792 linkunlink, update);
1793 }
1794
1795 if (g_conf->mds_snap_rstat) {
1796 for (const auto &p : cur->dirty_old_rstats) {
1797 auto &old = cur->old_inodes[p];
1798 snapid_t ofirst = std::max(old.first, floor);
1799 auto it = snaps.lower_bound(ofirst);
1800 if (it == snaps.end() || *it > p)
1801 continue;
1802 if (p >= floor)
1803 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
1804 }
1805 }
1806 cur->dirty_old_rstats.clear();
1807}
1808
1809
1810void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
1811 CDir *parent, int linkunlink, bool update_inode)
1812{
1813 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1814 dout(20) << " inode rstat " << inode.rstat << dendl;
1815 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
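  // delta is this inode's contribution to the parent frag's rstat: the full
  // rstat on link, minus accounted_rstat on unlink, or the unaccounted difference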
1816 nest_info_t delta;
1817 if (linkunlink == 0) {
1818 delta.add(inode.rstat);
1819 delta.sub(inode.accounted_rstat);
1820 } else if (linkunlink < 0) {
1821 delta.sub(inode.accounted_rstat);
1822 } else {
1823 delta.add(inode.rstat);
1824 }
1825 dout(20) << " delta " << delta << dendl;
1826
1827 if (update_inode)
1828 inode.accounted_rstat = inode.rstat;
1829
1830 while (last >= ofirst) {
1831 /*
1832 * pick fnode version to update. at each iteration, we want to
1833 * pick a segment ending in 'last' to update. split as necessary
1834 * to make that work. then, adjust first up so that we only
1835 * update one segment at a time. then loop to cover the whole
1836 * [ofirst,last] interval.
1837 */
1838 nest_info_t *prstat;
1839 snapid_t first;
1840 fnode_t *pf = parent->get_projected_fnode();
1841 if (last == CEPH_NOSNAP) {
1842 if (g_conf->mds_snap_rstat)
1843 first = MAX(ofirst, parent->first);
1844 else
1845 first = parent->first;
1846 prstat = &pf->rstat;
1847 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1848
1849 if (first > parent->first &&
1850 !(pf->rstat == pf->accounted_rstat)) {
1851 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1852 << parent->first << "," << (first-1) << "] "
1853 << " " << *prstat << "/" << pf->accounted_rstat
1854 << dendl;
1855 parent->dirty_old_rstat[first-1].first = parent->first;
1856 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1857 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1858 }
1859 parent->first = first;
1860 } else if (!g_conf->mds_snap_rstat) {
1861 // drop snapshots' rstats
1862 break;
1863 } else if (last >= parent->first) {
1864 first = parent->first;
1865 parent->dirty_old_rstat[last].first = first;
1866 parent->dirty_old_rstat[last].rstat = pf->rstat;
1867 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1868 prstat = &parent->dirty_old_rstat[last].rstat;
1869 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1870 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1871 } else {
1872 // be careful, dirty_old_rstat is a _sparse_ map.
1873 // sorry, this is ugly.
1874 first = ofirst;
1875
1876 // find any intersection with last
94b18763
FG
1877 auto it = parent->dirty_old_rstat.lower_bound(last);
1878 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1879 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1880 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1881 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1882 first = parent->dirty_old_rstat.rbegin()->first+1;
1883 }
1884 } else {
94b18763
FG
1885 // *it last is >= last
1886 if (it->second.first <= last) {
1887 // *it intersects [first,last]
1888 if (it->second.first < first) {
1889 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1890 parent->dirty_old_rstat[first-1] = it->second;
1891 it->second.first = first;
7c673cae 1892 }
94b18763
FG
1893 if (it->second.first > first)
1894 first = it->second.first;
1895 if (last < it->first) {
1896 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1897 parent->dirty_old_rstat[last] = it->second;
1898 it->second.first = last+1;
7c673cae
FG
1899 }
1900 } else {
94b18763
FG
1901 // *it is to the _right_ of [first,last]
1902 it = parent->dirty_old_rstat.lower_bound(first);
1903 // new *it last is >= first
1904 if (it->second.first <= last && // new *it isn't also to the right, and
1905 it->first >= first) { // it intersects our first bit,
1906 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1907 first = it->first+1;
7c673cae
FG
1908 }
1909 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1910 }
1911 }
1912 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1913 parent->dirty_old_rstat[last].first = first;
1914 prstat = &parent->dirty_old_rstat[last].rstat;
1915 }
1916
1917 // apply
1918 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1919 assert(last >= first);
1920 prstat->add(delta);
1921 if (update_inode)
1922 inode.accounted_rstat = inode.rstat;
1923 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1924
1925 last = first-1;
1926 }
1927}
1928
1929void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1930 snapid_t ofirst, snapid_t last,
1931 CInode *pin, bool cow_head)
1932{
1933 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1934 dout(20) << " frag rstat " << rstat << dendl;
1935 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1936 nest_info_t delta = rstat;
1937 delta.sub(accounted_rstat);
1938 dout(20) << " delta " << delta << dendl;
1939
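  // walk [ofirst,last] from newest to oldest; each pass updates exactly one
  // segment (the head inode, or one entry in old_inodes, cowing/splitting as
  // needed), then steps to the next older segment via last = first-1.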
1940 while (last >= ofirst) {
94b18763 1941 CInode::mempool_inode *pi;
7c673cae
FG
1942 snapid_t first;
1943 if (last == pin->last) {
1944 pi = pin->get_projected_inode();
1945 first = MAX(ofirst, pin->first);
1946 if (first > pin->first) {
94b18763 1947 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1948 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1949 }
1950 } else {
1951 if (last >= pin->first) {
1952 first = pin->first;
1953 pin->cow_old_inode(last, cow_head);
1954 } else {
1955 // our life is easier here because old_inodes is not sparse
1956 // (although it may not begin at snapid 1)
94b18763
FG
1957 auto it = pin->old_inodes.lower_bound(last);
1958 if (it == pin->old_inodes.end()) {
7c673cae
FG
1959 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1960 break;
1961 }
94b18763 1962 first = it->second.first;
7c673cae 1963 if (first > last) {
94b18763 1964 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1965 //assert(p == pin->old_inodes.begin());
1966 break;
1967 }
94b18763
FG
1968 if (it->first > last) {
1969 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1970 << (last+1) << "," << it->first << "]" << dendl;
1971 pin->old_inodes[last] = it->second;
1972 it->second.first = last+1;
1973 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1974 }
1975 }
1976 if (first < ofirst) {
1977 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1978 << first << "," << ofirst-1 << "]" << dendl;
1979 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1980 pin->dirty_old_rstats.insert(ofirst-1);
1981 pin->old_inodes[last].first = first = ofirst;
1982 }
1983 pi = &pin->old_inodes[last].inode;
1984 pin->dirty_old_rstats.insert(last);
1985 }
1986 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1987 pi->rstat.add(delta);
1988 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1989
1990 last = first-1;
1991 }
1992}
1993
28e407b8 1994void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
7c673cae
FG
1995{
1996 if (!in->is_auth() || in->is_frozen())
1997 return;
1998
94b18763 1999 auto i = in->get_projected_inode();
7c673cae
FG
2000
2001 if (!i->quota.is_enable())
2002 return;
2003
2004 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
2005 it != in->client_caps.end();
2006 ++it) {
2007 Session *session = mds->get_session(it->first);
2008 if (!session || !session->connection ||
2009 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
2010 continue;
2011
2012 Capability *cap = it->second;
28e407b8
AA
2013
2014 if (exclude_ct >= 0 && exclude_ct != it->first)
2015 goto update;
2016
7c673cae
FG
2017 if (cap->last_rbytes == i->rstat.rbytes &&
2018 cap->last_rsize == i->rstat.rsize())
2019 continue;
2020
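    // heuristics: push a quota update when usage reaches the configured limit,
    // or when the change since the last value we reported exceeds ~1/16 of the
    // remaining headroom, so clients close to the quota get refreshed sooner.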
2021 if (i->quota.max_files > 0) {
2022 if (i->rstat.rsize() >= i->quota.max_files)
2023 goto update;
2024
2025 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2026 abs(cap->last_rsize - i->rstat.rsize()))
2027 goto update;
2028 }
2029
2030 if (i->quota.max_bytes > 0) {
2031 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2032 goto update;
2033
2034 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2035 abs(cap->last_rbytes - i->rstat.rbytes))
2036 goto update;
2037 }
2038
2039 continue;
2040
2041update:
2042 cap->last_rsize = i->rstat.rsize();
2043 cap->last_rbytes = i->rstat.rbytes;
2044
2045 MClientQuota *msg = new MClientQuota();
2046 msg->ino = in->ino();
2047 msg->rstat = i->rstat;
2048 msg->quota = i->quota;
2049 mds->send_message_client_counted(msg, session->connection);
2050 }
181888fb 2051 for (const auto &it : in->get_replicas()) {
7c673cae
FG
2052 MGatherCaps *msg = new MGatherCaps;
2053 msg->ino = in->ino();
181888fb 2054 mds->send_message_mds(msg, it.first);
7c673cae
FG
2055 }
2056}
2057
2058/*
2059 * NOTE: we _have_ to delay the scatter if we are called during a
2060 * rejoin, because we can't twiddle locks between when the
2061 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2062 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2063 * (no requests), and a survivor acks immediately. _except_ that
2064 * during rejoin_(weak|strong) processing, we may complete a lock
2065 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2066 * scatterlock state in that case or the lock states will get out of
2067 * sync between the auth and replica.
2068 *
2069 * the simple solution is to never do the scatter here. instead, put
2070 * the scatterlock on a list if it isn't already wrlockable. this is
2071 * probably the best plan anyway, since we avoid too many
2072 * scatters/locks under normal usage.
2073 */
2074/*
2075 * some notes on dirlock/nestlock scatterlock semantics:
2076 *
2077 * the fragstat (dirlock) will never be updated without
2078 * dirlock+nestlock wrlock held by the caller.
2079 *
2080 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2081 * data is pushed up the tree. this could be changed with some
2082 * restructuring here, but in its current form we ensure that the
2083 * fragstat+rstat _always_ reflect an accurate summation over the dir
2084 * frag, which is nice. and, we only need to track frags that need to
2085 * be nudged (and not inodes with pending rstat changes that need to
2086 * be pushed into the frag). a consequence of this is that the
2087 * accounted_rstat on scatterlock sync may not match our current
2088 * rstat. this is normal and expected.
2089 */
2090void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2091 CInode *in, CDir *parent,
2092 int flags, int linkunlink,
2093 snapid_t cfollows)
2094{
2095 bool primary_dn = flags & PREDIRTY_PRIMARY;
2096 bool do_parent_mtime = flags & PREDIRTY_DIR;
2097 bool shallow = flags & PREDIRTY_SHALLOW;
2098
2099 assert(mds->mdlog->entry_is_open());
2100
2101 // make sure stamp is set
2102 if (mut->get_mds_stamp() == utime_t())
2103 mut->set_mds_stamp(ceph_clock_now());
2104
2105 if (in->is_base())
2106 return;
2107
2108 dout(10) << "predirty_journal_parents"
2109 << (do_parent_mtime ? " do_parent_mtime":"")
2110 << " linkunlink=" << linkunlink
2111 << (primary_dn ? " primary_dn":" remote_dn")
2112 << (shallow ? " SHALLOW":"")
2113 << " follows " << cfollows
2114 << " " << *in << dendl;
2115
2116 if (!parent) {
2117 assert(primary_dn);
2118 parent = in->get_projected_parent_dn()->get_dir();
2119 }
2120
2121 if (flags == 0 && linkunlink == 0) {
2122 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2123 blob->add_dir_context(parent);
2124 return;
2125 }
2126
2127 // build list of inodes to wrlock, dirty, and update
2128 list<CInode*> lsi;
2129 CInode *cur = in;
2130 CDentry *parentdn = NULL;
2131 bool first = true;
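  // walk from the inode up toward the root: fold the child's stats into the
  // parent dirfrag's projected fnode, then the dirfrag's stats into the parent
  // inode, until we reach a base inode, an ancestor we aren't auth for or
  // can't lock/pin, or the dirstat propagation rate limit.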
2132 while (parent) {
2133 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2134 assert(parent->is_auth());
2135
2136 // opportunistically adjust parent dirfrag
2137 CInode *pin = parent->get_inode();
2138
2139 // inode -> dirfrag
2140 mut->auth_pin(parent);
2141 mut->add_projected_fnode(parent);
2142
2143 fnode_t *pf = parent->project_fnode();
2144 pf->version = parent->pre_dirty();
2145
2146 if (do_parent_mtime || linkunlink) {
2147 assert(mut->wrlocks.count(&pin->filelock));
2148 assert(mut->wrlocks.count(&pin->nestlock));
2149 assert(cfollows == CEPH_NOSNAP);
2150
2151 // update stale fragstat/rstat?
2152 parent->resync_accounted_fragstat();
2153 parent->resync_accounted_rstat();
2154
2155 if (do_parent_mtime) {
2156 pf->fragstat.mtime = mut->get_op_stamp();
2157 pf->fragstat.change_attr++;
2158 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2159 if (pf->fragstat.mtime > pf->rstat.rctime) {
2160 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2161 pf->rstat.rctime = pf->fragstat.mtime;
2162 } else {
2163 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2164 }
2165 }
2166 if (linkunlink) {
2167 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2168 if (in->is_dir()) {
2169 pf->fragstat.nsubdirs += linkunlink;
2170 //pf->rstat.rsubdirs += linkunlink;
2171 } else {
2172 pf->fragstat.nfiles += linkunlink;
2173 //pf->rstat.rfiles += linkunlink;
2174 }
2175 }
2176 }
2177
2178 // rstat
2179 if (!primary_dn) {
2180 // don't update parent this pass
2181 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2182 pin->versionlock.can_wrlock())) {
2183 dout(20) << " unwritable parent nestlock " << pin->nestlock
2184 << ", marking dirty rstat on " << *cur << dendl;
2185 cur->mark_dirty_rstat();
2186 } else {
2187 // if we don't hold a wrlock reference on this nestlock, take one,
2188 // because we are about to write into the dirfrag fnode and that needs
2189 // to commit before the lock can cycle.
2190 if (linkunlink) {
2191 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2192 }
2193
2194 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2195 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2196 mds->locker->wrlock_force(&pin->nestlock, mut);
2197 }
2198
2199      // now we can project the inode rstat diff into the dirfrag
2200 SnapRealm *prealm = pin->find_snaprealm();
2201
2202 snapid_t follows = cfollows;
2203 if (follows == CEPH_NOSNAP)
2204 follows = prealm->get_newest_seq();
2205
2206 snapid_t first = follows+1;
2207
2208 // first, if the frag is stale, bring it back in sync.
2209 parent->resync_accounted_rstat();
2210
2211 // now push inode rstats into frag
2212 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2213 cur->clear_dirty_rstat();
2214 }
2215
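    // decide whether to keep walking up: stop if the parent inode is not auth
    // (or can't be auth-pinned), if we propagated recently enough to hit the
    // mds_dirstat_min_interval rate limit, or if the nestlock/versionlock
    // can't be taken without blocking; in that case just mark the scatterlocks
    // dirty so the update gets propagated later.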
2216 bool stop = false;
2217 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2218 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2219 stop = true;
2220 }
2221
2222 // delay propagating until later?
2223 if (!stop && !first &&
2224 g_conf->mds_dirstat_min_interval > 0) {
2225 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2226 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2227 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2228 << " < " << g_conf->mds_dirstat_min_interval
2229 << ", stopping" << dendl;
2230 stop = true;
2231 } else {
2232 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2233 }
2234 }
2235
2236 // can cast only because i'm passing nowait=true in the sole user
2237 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2238 if (!stop &&
2239 mut->wrlocks.count(&pin->nestlock) == 0 &&
2240 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2241 //true
2242 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2243 )) { // ** do not initiate.. see above comment **
2244 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2245 << " on " << *pin << dendl;
2246 stop = true;
2247 }
2248 if (stop) {
2249 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2250 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2251 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2252 mut->add_updated_lock(&pin->nestlock);
2253 if (do_parent_mtime || linkunlink) {
2254 mds->locker->mark_updated_scatterlock(&pin->filelock);
2255 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2256 mut->add_updated_lock(&pin->filelock);
2257 }
2258 break;
2259 }
2260 if (!mut->wrlocks.count(&pin->versionlock))
2261 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2262
2263 assert(mut->wrlocks.count(&pin->nestlock) ||
2264 mut->is_slave());
2265
2266 pin->last_dirstat_prop = mut->get_mds_stamp();
2267
2268 // dirfrag -> diri
2269 mut->auth_pin(pin);
2270 mut->add_projected_inode(pin);
2271 lsi.push_front(pin);
2272
2273 pin->pre_cow_old_inode(); // avoid cow mayhem!
2274
94b18763
FG
2275 auto &pi = pin->project_inode();
2276 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2277
2278 // dirstat
2279 if (do_parent_mtime || linkunlink) {
2280 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2281 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2282 bool touched_mtime = false, touched_chattr = false;
94b18763 2283 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2284 pf->accounted_fragstat = pf->fragstat;
2285 if (touched_mtime)
94b18763 2286 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2287 if (touched_chattr)
94b18763
FG
2288 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2289 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2290
2291 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2292 if (pi.inode.dirstat.size() < 0)
7c673cae 2293 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
94b18763 2294 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2295 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2296 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2297 << ", dirfrag has " << pf->fragstat;
2298
2299 // trust the dirfrag for now
94b18763 2300 pi.inode.dirstat = pf->fragstat;
7c673cae
FG
2301
2302 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2303 }
2304 }
2305 }
2306
2307 /*
2308 * the rule here is to follow the _oldest_ parent with dirty rstat
2309 * data. if we don't propagate all data, we add ourselves to the
2310 * nudge list. that way all rstat data will (eventually) get
2311 * pushed up the tree.
2312 *
2313 * actually, no. for now, silently drop rstats for old parents. we need
2314 * hard link backpointers to do the above properly.
2315 */
2316
2317 // stop?
2318 if (pin->is_base())
2319 break;
2320 parentdn = pin->get_projected_parent_dn();
2321 assert(parentdn);
2322
2323 // rstat
2324 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2325
2326 // first, if the frag is stale, bring it back in sync.
2327 parent->resync_accounted_rstat();
2328
2329 if (g_conf->mds_snap_rstat) {
94b18763
FG
2330 for (auto &p : parent->dirty_old_rstat) {
2331 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2332 p.first, pin, true);
2333 }
7c673cae
FG
2334 }
2335 parent->dirty_old_rstat.clear();
2336 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2337
2338 pf->accounted_rstat = pf->rstat;
2339
2340 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2341 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2342 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2343 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2344 << ", dirfrag has " << pf->rstat;
2345
2346 // trust the dirfrag for now
94b18763 2347 pi.inode.rstat = pf->rstat;
7c673cae
FG
2348
2349 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2350 }
2351 }
2352
2353 parent->check_rstats();
2354 broadcast_quota_to_client(pin);
2355 // next parent!
2356 cur = pin;
2357 parent = parentdn->get_dir();
2358 linkunlink = 0;
2359 do_parent_mtime = false;
2360 primary_dn = true;
2361 first = false;
2362 }
2363
2364 // now, stick it in the blob
2365 assert(parent);
2366 assert(parent->is_auth());
2367 blob->add_dir_context(parent);
2368 blob->add_dir(parent, true);
2369 for (list<CInode*>::iterator p = lsi.begin();
2370 p != lsi.end();
2371 ++p) {
2372 CInode *cur = *p;
2373 journal_dirty_inode(mut.get(), blob, cur);
2374 }
2375
2376}
2377
2378
2379
2380
2381
2382// ===================================
2383// slave requests
2384
2385
2386/*
2387 * some handlers for master requests with slaves. we need to make
2388 * sure slaves journal commits before we forget we mastered them and
2389 * remove them from the uncommitted_masters map (used during recovery
2390 * to commit|abort slaves).
2391 */
2392struct C_MDC_CommittedMaster : public MDCacheLogContext {
2393 metareqid_t reqid;
2394 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2395 void finish(int r) override {
2396 mdcache->_logged_master_commit(reqid);
2397 }
2398};
2399
2400void MDCache::log_master_commit(metareqid_t reqid)
2401{
2402 dout(10) << "log_master_commit " << reqid << dendl;
2403 uncommitted_masters[reqid].committing = true;
2404 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2405 new C_MDC_CommittedMaster(this, reqid));
2406}
2407
2408void MDCache::_logged_master_commit(metareqid_t reqid)
2409{
2410 dout(10) << "_logged_master_commit " << reqid << dendl;
2411 assert(uncommitted_masters.count(reqid));
2412 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2413 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2414 uncommitted_masters.erase(reqid);
2415}
2416
2417// while active...
2418
2419void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2420{
2421 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2422 assert(uncommitted_masters.count(r));
2423 uncommitted_masters[r].slaves.erase(from);
2424 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2425 log_master_commit(r);
2426}
2427
2428void MDCache::logged_master_update(metareqid_t reqid)
2429{
2430 dout(10) << "logged_master_update " << reqid << dendl;
2431 assert(uncommitted_masters.count(reqid));
2432 uncommitted_masters[reqid].safe = true;
2433 if (pending_masters.count(reqid)) {
2434 pending_masters.erase(reqid);
2435 if (pending_masters.empty())
2436 process_delayed_resolve();
2437 }
2438}
2439
2440/*
2441 * Master may crash after receiving all slaves' commit acks, but before journalling
2442 * the final commit. Slaves may crash after journalling the slave commit, but before
2443 * sending commit ack to the master. Commit masters with no uncommitted slave when
2444 * resolve finishes.
2445 */
2446void MDCache::finish_committed_masters()
2447{
2448 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2449 p != uncommitted_masters.end();
2450 ++p) {
2451 p->second.recovering = false;
2452 if (!p->second.committing && p->second.slaves.empty()) {
2453 dout(10) << "finish_committed_masters " << p->first << dendl;
2454 log_master_commit(p->first);
2455 }
2456 }
2457}
2458
2459/*
2460 * at end of resolve... we must journal a commit|abort for all slave
2461 * updates, before moving on.
2462 *
2463 * this is so that the master can safely journal ECommitted on ops it
2464 * masters when it reaches up:active (all other recovering nodes must
2465 * complete resolve before that happens).
2466 */
2467struct C_MDC_SlaveCommit : public MDCacheLogContext {
2468 mds_rank_t from;
2469 metareqid_t reqid;
2470 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2471 void finish(int r) override {
2472 mdcache->_logged_slave_commit(from, reqid);
2473 }
2474};
2475
2476void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2477{
2478 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2479
2480 // send a message
2481 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2482 mds->send_message_mds(req, from);
2483}
2484
2485
2486
2487
2488
2489
2490// ====================================================================
2491// import map, recovery
2492
2493void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2494 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2495{
2496 if (subtrees.count(oldparent)) {
2497 vector<dirfrag_t>& v = subtrees[oldparent];
2498 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2499 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2500 if (*it == df) {
2501 v.erase(it);
2502 break;
2503 }
2504 }
2505 if (subtrees.count(newparent)) {
2506 vector<dirfrag_t>& v = subtrees[newparent];
2507 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2508 v.push_back(df);
2509 }
2510}
2511
2512ESubtreeMap *MDCache::create_subtree_map()
2513{
2514 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2515 << num_subtrees_fullauth() << " fullauth"
2516 << dendl;
2517
2518 show_subtrees();
2519
2520 ESubtreeMap *le = new ESubtreeMap();
2521 mds->mdlog->_start_entry(le);
2522
2523 map<dirfrag_t, CDir*> dirs_to_add;
2524
2525 if (myin) {
2526 CDir* mydir = myin->get_dirfrag(frag_t());
2527 dirs_to_add[mydir->dirfrag()] = mydir;
2528 }
2529
2530 // include all auth subtrees, and their bounds.
2531 // and a spanning tree to tie it to the root.
2532 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2533 p != subtrees.end();
2534 ++p) {
2535 CDir *dir = p->first;
2536
2537 // journal subtree as "ours" if we are
2538 // me, -2
2539 // me, me
2540 // me, !me (may be importing and ambiguous!)
2541
2542 // so not
2543 // !me, *
2544 if (dir->get_dir_auth().first != mds->get_nodeid())
2545 continue;
2546
2547 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2548 my_ambiguous_imports.count(dir->dirfrag())) {
2549 dout(15) << " ambig subtree " << *dir << dendl;
2550 le->ambiguous_subtrees.insert(dir->dirfrag());
2551 } else {
2552 dout(15) << " subtree " << *dir << dendl;
2553 }
2554
2555 dirs_to_add[dir->dirfrag()] = dir;
2556 le->subtrees[dir->dirfrag()].clear();
2557
2558
2559 // bounds
2560 for (set<CDir*>::iterator q = p->second.begin();
2561 q != p->second.end();
2562 ++q) {
2563 CDir *bound = *q;
2564 dout(15) << " subtree bound " << *bound << dendl;
2565 dirs_to_add[bound->dirfrag()] = bound;
2566 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2567 }
2568 }
2569
2570 // apply projected renames
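  // renames that are journaled but not yet applied in memory can move a
  // subtree under a different parent; rewrite the bounds as if those renames
  // had taken effect so the journaled map matches what replay will see.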
2571 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2572 p != projected_subtree_renames.end();
2573 ++p) {
2574 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2575 CInode *diri = p->first;
2576 CDir *olddir = q->first;
2577 CDir *newdir = q->second;
2578 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2579
2580 list<CDir*> dfls;
2581 diri->get_dirfrags(dfls);
2582 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2583 CDir *dir = *p;
2584 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2585 CDir *oldparent = get_projected_subtree_root(olddir);
2586 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2587 CDir *newparent = get_projected_subtree_root(newdir);
2588 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2589
2590 if (oldparent == newparent) {
2591 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2592 << oldparent->dirfrag() << dendl;
2593 continue;
2594 }
2595
2596 if (dir->is_subtree_root()) {
2597 if (le->subtrees.count(newparent->dirfrag()) &&
2598 oldparent->get_dir_auth() != newparent->get_dir_auth())
2599 dirs_to_add[dir->dirfrag()] = dir;
2600 // children are fine. change parent.
2601 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2602 le->subtrees);
2603 } else {
2604 // mid-subtree.
2605
2606 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2607 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2608 // if oldparent is auth, subtree is mine; include it.
2609 if (le->subtrees.count(oldparent->dirfrag())) {
2610 dirs_to_add[dir->dirfrag()] = dir;
2611 le->subtrees[dir->dirfrag()].clear();
2612 }
2613 // if newparent is auth, subtree is a new bound
2614 if (le->subtrees.count(newparent->dirfrag())) {
2615 dirs_to_add[dir->dirfrag()] = dir;
2616 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2617 }
2618 newparent = dir;
2619 }
2620
2621 // see if any old bounds move to the new parent.
2622 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2623 p != subtrees[oldparent].end();
2624 ++p) {
2625 CDir *bound = *p;
2626 if (dir->contains(bound->get_parent_dir()))
2627 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2628 le->subtrees);
2629 }
2630 }
2631 }
2632 }
2633 }
2634
2635 // simplify the journaled map. our in memory map may have more
2636 // subtrees than needed due to migrations that are just getting
2637 // started or just completing. but on replay, the "live" map will
2638 // be simple and we can do a straight comparison.
2639 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2640 if (le->ambiguous_subtrees.count(p->first))
2641 continue;
2642 unsigned i = 0;
2643 while (i < p->second.size()) {
2644 dirfrag_t b = p->second[i];
2645 if (le->subtrees.count(b) &&
2646 le->ambiguous_subtrees.count(b) == 0) {
2647 vector<dirfrag_t>& bb = le->subtrees[b];
2648 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2649 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2650 p->second.push_back(*r);
2651 dirs_to_add.erase(b);
2652 le->subtrees.erase(b);
2653 p->second.erase(p->second.begin() + i);
2654 } else {
2655 ++i;
2656 }
2657 }
2658 }
2659
94b18763 2660 for (auto &p : dirs_to_add) {
7c673cae
FG
2661 CDir *dir = p.second;
2662 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2663 le->metablob.add_dir(dir, false);
2664 }
2665
2666 dout(15) << " subtrees " << le->subtrees << dendl;
2667 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2668
2669 //le->metablob.print(cout);
2670 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2671 return le;
2672}
2673
2674void MDCache::dump_resolve_status(Formatter *f) const
2675{
2676 f->open_object_section("resolve_status");
2677 f->dump_stream("resolve_gather") << resolve_gather;
2678  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2679 f->close_section();
2680}
2681
2682void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2683{
2684 dout(10) << "resolve_start" << dendl;
2685 assert(!resolve_done);
2686 resolve_done.reset(resolve_done_);
2687
2688 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2689 // if we don't have the root dir, adjust it to UNKNOWN. during
2690    // resolve we want mds0 to explicitly claim the portion of it that
2691 // it owns, so that anything beyond its bounds get left as
2692 // unknown.
2693 CDir *rootdir = root->get_dirfrag(frag_t());
2694 if (rootdir)
2695 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2696 }
2697 resolve_gather = recovery_set;
2698}
2699
2700void MDCache::send_resolves()
2701{
2702 send_slave_resolves();
2703 if (!resolve_ack_gather.empty()) {
2704 dout(10) << "send_resolves still waiting for resolve ack from ("
2705 << resolve_ack_gather << ")" << dendl;
2706 return;
2707 }
2708 if (!need_resolve_rollback.empty()) {
2709 dout(10) << "send_resolves still waiting for rollback to commit on ("
2710 << need_resolve_rollback << ")" << dendl;
2711 return;
2712 }
2713 send_subtree_resolves();
2714}
2715
2716void MDCache::send_slave_resolves()
2717{
2718 dout(10) << "send_slave_resolves" << dendl;
2719
2720 map<mds_rank_t, MMDSResolve*> resolves;
2721
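  // while in resolve ourselves, report every uncommitted slave update we have
  // journaled; as a survivor, only report slave requests whose master is
  // currently resolving or whose update is known to be ambiguous.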
2722 if (mds->is_resolve()) {
2723 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2724 p != uncommitted_slave_updates.end();
2725 ++p) {
2726 resolves[p->first] = new MMDSResolve;
2727 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2728 q != p->second.end();
2729 ++q) {
2730 dout(10) << " including uncommitted " << q->first << dendl;
2731 resolves[p->first]->add_slave_request(q->first, false);
2732 }
2733 }
2734 } else {
2735 set<mds_rank_t> resolve_set;
2736 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2737 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2738 p != active_requests.end();
2739 ++p) {
2740 MDRequestRef& mdr = p->second;
2741 if (!mdr->is_slave())
2742 continue;
2743 if (!mdr->slave_did_prepare() && !mdr->committing) {
2744 continue;
2745 }
2746 mds_rank_t master = mdr->slave_to_mds;
2747 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2748 dout(10) << " including uncommitted " << *mdr << dendl;
2749 if (!resolves.count(master))
2750 resolves[master] = new MMDSResolve;
2751 if (!mdr->committing &&
2752 mdr->has_more() && mdr->more()->is_inode_exporter) {
2753 // re-send cap exports
2754 CInode *in = mdr->more()->rename_inode;
2755 map<client_t, Capability::Export> cap_map;
2756 in->export_client_caps(cap_map);
2757 bufferlist bl;
2758 ::encode(in->ino(), bl);
2759 ::encode(cap_map, bl);
2760 resolves[master]->add_slave_request(p->first, bl);
2761 } else {
2762 resolves[master]->add_slave_request(p->first, mdr->committing);
2763 }
2764 }
2765 }
2766 }
2767
2768 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2769 p != resolves.end();
2770 ++p) {
2771 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2772 mds->send_message_mds(p->second, p->first);
2773 resolve_ack_gather.insert(p->first);
2774 }
2775}
2776
2777void MDCache::send_subtree_resolves()
2778{
2779 dout(10) << "send_subtree_resolves" << dendl;
2780
2781 if (migrator->is_exporting() || migrator->is_importing()) {
2782 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2783 migrator->show_importing();
2784 migrator->show_exporting();
2785 resolves_pending = true;
2786 return; // not now
2787 }
2788
2789 map<mds_rank_t, MMDSResolve*> resolves;
2790 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2791 p != recovery_set.end();
2792 ++p) {
2793 if (*p == mds->get_nodeid())
2794 continue;
2795 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2796 resolves[*p] = new MMDSResolve;
2797 }
2798
2799 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2800 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2801
2802 // known
2803 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2804 p != subtrees.end();
2805 ++p) {
2806 CDir *dir = p->first;
2807
2808 // only our subtrees
2809 if (dir->authority().first != mds->get_nodeid())
2810 continue;
2811
2812 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2813 continue; // we'll add it below
2814
2815 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2816 // ambiguous (mid-import)
2817 set<CDir*> bounds;
2818 get_subtree_bounds(dir, bounds);
2819 vector<dirfrag_t> dfls;
2820 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2821 dfls.push_back((*q)->dirfrag());
2822
2823 my_ambig_imports[dir->dirfrag()] = dfls;
2824 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2825 } else {
2826 // not ambiguous.
2827 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2828 q != resolves.end();
2829 ++q)
2830 resolves[q->first]->add_subtree(dir->dirfrag());
2831 // bounds too
2832 vector<dirfrag_t> dfls;
2833 for (set<CDir*>::iterator q = subtrees[dir].begin();
2834 q != subtrees[dir].end();
2835 ++q) {
2836 CDir *bound = *q;
2837 dfls.push_back(bound->dirfrag());
2838 }
2839
2840 my_subtrees[dir->dirfrag()] = dfls;
2841 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2842 }
2843 }
2844
2845 // ambiguous
2846 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2847 p != my_ambiguous_imports.end();
2848 ++p) {
2849 my_ambig_imports[p->first] = p->second;
2850 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2851 }
2852
2853 // simplify the claimed subtree.
2854 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2855 unsigned i = 0;
2856 while (i < p->second.size()) {
2857 dirfrag_t b = p->second[i];
2858 if (my_subtrees.count(b)) {
2859 vector<dirfrag_t>& bb = my_subtrees[b];
2860 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2861 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2862 p->second.push_back(*r);
2863 my_subtrees.erase(b);
2864 p->second.erase(p->second.begin() + i);
2865 } else {
2866 ++i;
2867 }
2868 }
2869 }
2870
2871 // send
2872 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2873 p != resolves.end();
2874 ++p) {
2875 MMDSResolve* m = p->second;
2876 m->subtrees = my_subtrees;
2877 m->ambiguous_imports = my_ambig_imports;
2878    dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2879 mds->send_message_mds(m, p->first);
2880 }
2881 resolves_pending = false;
2882}
2883
2884void MDCache::handle_mds_failure(mds_rank_t who)
2885{
2886 dout(7) << "handle_mds_failure mds." << who << dendl;
2887
2888 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2889
2890 resolve_gather.insert(who);
2891 discard_delayed_resolve(who);
2892 ambiguous_slave_updates.erase(who);
2893
2894 rejoin_gather.insert(who);
2895 rejoin_sent.erase(who); // i need to send another
31f18b77 2896 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2897 rejoin_ack_gather.erase(who); // i'll need/get another.
2898
2899 dout(10) << " resolve_gather " << resolve_gather << dendl;
2900 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2901 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2902 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2903 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2904
2905
2906 // tell the migrator too.
2907 migrator->handle_mds_failure_or_stop(who);
2908
224ce89b
WB
2909 // tell the balancer too.
2910 mds->balancer->handle_mds_failure(who);
2911
7c673cae
FG
2912 // clean up any requests slave to/from this node
2913 list<MDRequestRef> finish;
2914 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2915 p != active_requests.end();
2916 ++p) {
2917 MDRequestRef& mdr = p->second;
2918 // slave to the failed node?
2919 if (mdr->slave_to_mds == who) {
2920 if (mdr->slave_did_prepare()) {
2921 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2922 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2923 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2924
2925 if (!mdr->more()->waiting_on_slave.empty()) {
2926 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2927 // will rollback, no need to wait
2928 if (mdr->slave_request) {
2929 mdr->slave_request->put();
2930 mdr->slave_request = 0;
2931 }
2932 mdr->more()->waiting_on_slave.clear();
2933 }
2934 } else if (!mdr->committing) {
2935 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2936 if (mdr->slave_request || mdr->slave_rolling_back())
2937 mdr->aborted = true;
2938 else
2939 finish.push_back(mdr);
2940 }
2941 }
2942
2943 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2944 if (mdr->more()->waiting_on_slave.count(who)) {
2945 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2946	dout(10) << " slave request " << *mdr << " no longer need rename notify ack from mds."
2947 << who << dendl;
2948 mdr->more()->waiting_on_slave.erase(who);
2949 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2950 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2951 }
2952
2953 if (mdr->more()->srcdn_auth_mds == who &&
2954 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2955	// rename srcdn's auth mds failed, resolve even though I'm a survivor.
2956 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2957 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2958 }
31f18b77
FG
2959 } else if (mdr->slave_request) {
2960 MMDSSlaveRequest *slave_req = mdr->slave_request;
2961 // FIXME: Slave rename request can arrive after we notice mds failure.
2962 // This can cause mds to crash (does not affect integrity of FS).
2963 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2964 slave_req->srcdn_auth == who)
2965 slave_req->mark_interrupted();
7c673cae
FG
2966 }
2967
2968 // failed node is slave?
2969 if (mdr->is_master() && !mdr->committing) {
2970 if (mdr->more()->srcdn_auth_mds == who) {
2971 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2972 << who << " to recover" << dendl;
2973 assert(mdr->more()->witnessed.count(who) == 0);
2974 if (mdr->more()->is_ambiguous_auth)
2975 mdr->clear_ambiguous_auth();
2976 // rename srcdn's auth mds failed, all witnesses will rollback
2977 mdr->more()->witnessed.clear();
2978 pending_masters.erase(p->first);
2979 }
2980
2981 if (mdr->more()->witnessed.count(who)) {
2982 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2983 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2984 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2985 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2986 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2987 // until either the request is committing or the slave also fails.
2988 assert(mdr->more()->waiting_on_slave.size() == 1);
2989 pending_masters.insert(p->first);
2990 } else {
2991 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2992 << who << " to recover" << dendl;
2993 if (srcdn_auth >= 0)
2994 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2995
2996 // discard this peer's prepare (if any)
2997 mdr->more()->witnessed.erase(who);
2998 }
2999 }
3000
3001 if (mdr->more()->waiting_on_slave.count(who)) {
3002 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
3003 << " to recover" << dendl;
3004 // retry request when peer recovers
3005 mdr->more()->waiting_on_slave.erase(who);
3006 if (mdr->more()->waiting_on_slave.empty())
3007 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3008 }
3009
3010 if (mdr->locking && mdr->locking_target_mds == who)
3011 mdr->finish_locking(mdr->locking);
3012 }
3013 }
3014
3015 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3016 p != uncommitted_masters.end();
3017 ++p) {
3018 // The failed MDS may have already committed the slave update
3019 if (p->second.slaves.count(who)) {
3020 p->second.recovering = true;
3021 p->second.slaves.erase(who);
3022 }
3023 }
3024
3025 while (!finish.empty()) {
3026 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3027 request_finish(finish.front());
3028 finish.pop_front();
3029 }
3030
3031 kick_find_ino_peers(who);
3032 kick_open_ino_peers(who);
3033
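  // cancel fragment operations that have not actually started fragmenting yet;
  // ones already past that point are left to run to completion.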
3034 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3035 p != fragments.end(); ) {
3036 dirfrag_t df = p->first;
3037 fragment_info_t& info = p->second;
3038 ++p;
3039 if (info.is_fragmenting())
3040 continue;
3041 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3042 list<CDir*> dirs;
3043 info.dirs.swap(dirs);
3044 fragments.erase(df);
3045 fragment_unmark_unfreeze_dirs(dirs);
3046 }
3047
3048 // MDCache::shutdown_export_strays() always exports strays to mds.0
3049 if (who == mds_rank_t(0))
3050 shutdown_exported_strays.clear();
3051
3052 show_subtrees();
3053}
3054
3055/*
3056 * handle_mds_recovery - called on another node's transition
3057 * from resolve -> active.
3058 */
3059void MDCache::handle_mds_recovery(mds_rank_t who)
3060{
3061 dout(7) << "handle_mds_recovery mds." << who << dendl;
3062
3063 // exclude all discover waiters. kick_discovers() will do the job
3064 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3065 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3066
3067 list<MDSInternalContextBase*> waiters;
3068
3069 // wake up any waiters in their subtrees
3070 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3071 p != subtrees.end();
3072 ++p) {
3073 CDir *dir = p->first;
3074
3075 if (dir->authority().first != who ||
3076 dir->authority().second == mds->get_nodeid())
3077 continue;
3078 assert(!dir->is_auth());
3079
3080 // wake any waiters
3081 list<CDir*> q;
3082 q.push_back(dir);
3083
3084 while (!q.empty()) {
3085 CDir *d = q.front();
3086 q.pop_front();
3087 d->take_waiting(d_mask, waiters);
3088
3089 // inode waiters too
94b18763
FG
3090 for (auto &p : d->items) {
3091 CDentry *dn = p.second;
7c673cae
FG
3092 CDentry::linkage_t *dnl = dn->get_linkage();
3093 if (dnl->is_primary()) {
3094 dnl->get_inode()->take_waiting(i_mask, waiters);
3095
3096 // recurse?
3097 list<CDir*> ls;
3098 dnl->get_inode()->get_dirfrags(ls);
3099 for (list<CDir*>::iterator p = ls.begin();
3100 p != ls.end();
3101 ++p) {
3102 CDir *subdir = *p;
3103 if (!subdir->is_subtree_root())
3104 q.push_back(subdir);
3105 }
3106 }
3107 }
3108 }
3109 }
3110
3111 kick_open_ino_peers(who);
3112 kick_find_ino_peers(who);
3113
3114 // queue them up.
3115 mds->queue_waiters(waiters);
3116}
3117
3118void MDCache::set_recovery_set(set<mds_rank_t>& s)
3119{
3120 dout(7) << "set_recovery_set " << s << dendl;
3121 recovery_set = s;
3122}
3123
3124
3125/*
3126 * during resolve state, we share resolves to determine who
3127 * is authoritative for which trees. we expect to get a resolve
3128 * from _everyone_ in the recovery_set (the mds cluster at the time of
3129 * the first failure).
3130 *
3131 * This function puts the passed message before returning
3132 */
3133void MDCache::handle_resolve(MMDSResolve *m)
3134{
3135 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3136 mds_rank_t from = mds_rank_t(m->get_source().num());
3137
3138 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3139 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3140 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3141 return;
3142 }
3143 // wait until we reach the resolve stage!
3144 m->put();
3145 return;
3146 }
3147
3148 discard_delayed_resolve(from);
3149
3150 // ambiguous slave requests?
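  // the sender lists slave updates it prepared on our behalf as master; reply
  // with COMMIT for each one still in uncommitted_masters and ABORT for the rest.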
3151 if (!m->slave_requests.empty()) {
3152 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3153 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3154 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3155 assert(!p->second.committing);
3156 pending_masters.insert(p->first);
3157 }
3158 }
3159
3160 if (!pending_masters.empty()) {
3161 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3162 delayed_resolve[from] = m;
3163 return;
3164 }
3165 }
3166
3167 MMDSResolveAck *ack = new MMDSResolveAck;
3168 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3169 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3170 // COMMIT
3171 if (p->second.committing) {
3172 // already committing, waiting for the OP_COMMITTED slave reply
3173 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3174 } else {
3175 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3176 ack->add_commit(p->first);
3177 }
3178 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3179
3180 if (p->second.inode_caps.length() > 0) {
3181 // slave wants to export caps (rename)
3182 assert(mds->is_resolve());
3183
3184 inodeno_t ino;
3185 map<client_t,Capability::Export> cap_exports;
3186 bufferlist::iterator q = p->second.inode_caps.begin();
3187 ::decode(ino, q);
3188 ::decode(cap_exports, q);
3189
3190 assert(get_inode(ino));
3191
3192 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3193 q != cap_exports.end();
3194 ++q) {
3195 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3196 im.cap_id = ++last_cap_id; // assign a new cap ID
3197 im.issue_seq = 1;
3198 im.mseq = q->second.mseq;
28e407b8
AA
3199
3200 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3201 if (session)
3202 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3203 }
3204
3205 // will process these caps in rejoin stage
3206 rejoin_slave_exports[ino].first = from;
3207 rejoin_slave_exports[ino].second.swap(cap_exports);
3208
3209 // send information of imported caps back to slave
3210 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3211 }
3212 } else {
3213 // ABORT
3214 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3215 assert(!p->second.committing);
3216 ack->add_abort(p->first);
3217 }
3218 }
3219 mds->send_message(ack, m->get_connection());
3220 m->put();
3221 return;
3222 }
3223
3224 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3225 dout(10) << "delay processing subtree resolve" << dendl;
3226 delayed_resolve[from] = m;
3227 return;
3228 }
3229
3230 bool survivor = false;
3231 // am i a surviving ambiguous importer?
3232 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3233 survivor = true;
3234 // check for any import success/failure (from this node)
3235 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3236 while (p != my_ambiguous_imports.end()) {
3237 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3238 ++next;
3239 CDir *dir = get_dirfrag(p->first);
3240 assert(dir);
3241 dout(10) << "checking ambiguous import " << *dir << dendl;
3242 if (migrator->is_importing(dir->dirfrag()) &&
3243 migrator->get_import_peer(dir->dirfrag()) == from) {
3244 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3245
3246 // check if sender claims the subtree
3247 bool claimed_by_sender = false;
3248 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3249 q != m->subtrees.end();
3250 ++q) {
3251 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3252 CDir *base = get_force_dirfrag(q->first, false);
3253 if (!base || !base->contains(dir))
3254 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3255
3256 bool inside = true;
3257 set<CDir*> bounds;
3258 get_force_dirfrag_bound_set(q->second, bounds);
3259 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3260 CDir *bound = *p;
3261 if (bound->contains(dir)) {
3262 inside = false; // nope, bound is dir or parent of dir, not inside.
3263 break;
3264 }
3265 }
3266 if (inside)
3267 claimed_by_sender = true;
3268 }
3269
3270 my_ambiguous_imports.erase(p); // no longer ambiguous.
3271 if (claimed_by_sender) {
3272 dout(7) << "ambiguous import failed on " << *dir << dendl;
3273 migrator->import_reverse(dir);
3274 } else {
3275 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3276 migrator->import_finish(dir, true);
3277 }
3278 }
3279 p = next;
3280 }
3281 }
3282
3283 // update my dir_auth values
3284  // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3285 // migrations between other nodes)
3286 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3287 pi != m->subtrees.end();
3288 ++pi) {
3289 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3290 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3291 if (!dir)
3292 continue;
3293 adjust_bounded_subtree_auth(dir, pi->second, from);
3294 try_subtree_merge(dir);
3295 }
3296
3297 show_subtrees();
3298
3299 // note ambiguous imports too
3300 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3301 pi != m->ambiguous_imports.end();
3302 ++pi) {
3303 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3304 other_ambiguous_imports[from][pi->first].swap( pi->second );
3305 }
3306
3307 // did i get them all?
3308 resolve_gather.erase(from);
3309
3310 maybe_resolve_finish();
3311
3312 m->put();
3313}
3314
3315void MDCache::process_delayed_resolve()
3316{
3317 dout(10) << "process_delayed_resolve" << dendl;
3318 map<mds_rank_t, MMDSResolve*> tmp;
3319 tmp.swap(delayed_resolve);
3320 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3321 handle_resolve(p->second);
3322}
3323
3324void MDCache::discard_delayed_resolve(mds_rank_t who)
3325{
3326 if (delayed_resolve.count(who)) {
3327 delayed_resolve[who]->put();
3328 delayed_resolve.erase(who);
3329 }
3330}
3331
3332void MDCache::maybe_resolve_finish()
3333{
3334 assert(resolve_ack_gather.empty());
3335 assert(need_resolve_rollback.empty());
3336
3337 if (!resolve_gather.empty()) {
3338 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3339 << resolve_gather << ")" << dendl;
3340 return;
3341 }
3342
3343 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3344 disambiguate_my_imports();
3345 finish_committed_masters();
3346
3347 if (resolve_done) {
3348 assert(mds->is_resolve());
3349 trim_unlinked_inodes();
3350 recalc_auth_bits(false);
3351 resolve_done.release()->complete(0);
3352 } else {
3353 maybe_send_pending_rejoins();
3354 }
3355}
3356
3357/* This function puts the passed message before returning */
3358void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3359{
3360 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3361 mds_rank_t from = mds_rank_t(ack->get_source().num());
3362
3363 if (!resolve_ack_gather.count(from) ||
3364 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3365 ack->put();
3366 return;
3367 }
3368
3369 if (ambiguous_slave_updates.count(from)) {
3370 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3371 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3372 }
3373
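  // commits: if the update is ambiguous just clear that state; otherwise either
  // journal an ESlaveUpdate COMMIT (when we are in resolve) or finish the still
  // active slave request.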
3374 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3375 p != ack->commit.end();
3376 ++p) {
3377 dout(10) << " commit on slave " << p->first << dendl;
3378
3379 if (ambiguous_slave_updates.count(from)) {
3380 remove_ambiguous_slave_update(p->first, from);
3381 continue;
3382 }
3383
3384 if (mds->is_resolve()) {
3385 // replay
3386 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3387 assert(su);
3388
3389 // log commit
3390 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3391 ESlaveUpdate::OP_COMMIT, su->origop),
3392 new C_MDC_SlaveCommit(this, from, p->first));
3393 mds->mdlog->flush();
3394
3395 finish_uncommitted_slave_update(p->first, from);
3396 } else {
3397 MDRequestRef mdr = request_get(p->first);
3398 // information about master imported caps
3399 if (p->second.length() > 0)
3400 mdr->more()->inode_import.claim(p->second);
3401
3402 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3403 request_finish(mdr);
3404 }
3405 }
3406
3407 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3408 p != ack->abort.end();
3409 ++p) {
3410 dout(10) << " abort on slave " << *p << dendl;
3411
3412 if (mds->is_resolve()) {
3413 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3414 assert(su);
3415
3416 // perform rollback (and journal a rollback entry)
3417 // note: this will hold up the resolve a bit, until the rollback entries journal.
3418 MDRequestRef null_ref;
3419 switch (su->origop) {
3420 case ESlaveUpdate::LINK:
3421 mds->server->do_link_rollback(su->rollback, from, null_ref);
3422 break;
3423 case ESlaveUpdate::RENAME:
3424 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3425 break;
3426 case ESlaveUpdate::RMDIR:
3427 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3428 break;
3429 default:
3430 ceph_abort();
3431 }
3432 } else {
3433 MDRequestRef mdr = request_get(*p);
3434 mdr->aborted = true;
3435 if (mdr->slave_request) {
3436 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3437 add_rollback(*p, from);
3438 } else {
3439 request_finish(mdr);
3440 }
3441 }
3442 }
3443
3444 if (!ambiguous_slave_updates.count(from))
3445 resolve_ack_gather.erase(from);
3446 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3447 send_subtree_resolves();
3448 process_delayed_resolve();
3449 }
3450
3451 ack->put();
3452}
3453
3454void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3455{
3456 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3457 uncommitted_slave_updates[master][reqid] = su;
3458 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3459 uncommitted_slave_rename_olddir[*p]++;
3460 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3461 uncommitted_slave_unlink[*p]++;
3462}
3463
3464void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3465{
3466 assert(uncommitted_slave_updates[master].count(reqid));
3467 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3468
3469 uncommitted_slave_updates[master].erase(reqid);
3470 if (uncommitted_slave_updates[master].empty())
3471 uncommitted_slave_updates.erase(master);
3472 // discard the non-auth subtree we renamed out of
3473 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3474 CInode *diri = *p;
3475 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3476 assert(it != uncommitted_slave_rename_olddir.end());
3477 it->second--;
3478 if (it->second == 0) {
3479 uncommitted_slave_rename_olddir.erase(it);
3480 list<CDir*> ls;
3481 diri->get_dirfrags(ls);
3482 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3483 CDir *root = get_subtree_root(*q);
3484 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3485 try_trim_non_auth_subtree(root);
3486 if (*q != root)
3487 break;
3488 }
3489 }
3490 } else
3491 assert(it->second > 0);
3492 }
3493 // remove the inodes that were unlinked by the slave update
3494 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3495 CInode *in = *p;
3496 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3497 assert(it != uncommitted_slave_unlink.end());
3498 it->second--;
3499 if (it->second == 0) {
3500 uncommitted_slave_unlink.erase(it);
3501 if (!in->get_projected_parent_dn())
3502 mds->mdcache->remove_inode_recursive(in);
3503 } else
3504 assert(it->second > 0);
3505 }
3506 delete su;
3507}
3508
3509MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3510{
3511
3512 MDSlaveUpdate* su = NULL;
3513 if (uncommitted_slave_updates.count(master) &&
3514 uncommitted_slave_updates[master].count(reqid)) {
3515 su = uncommitted_slave_updates[master][reqid];
3516 assert(su);
3517 }
3518 return su;
3519}
3520
3521void MDCache::finish_rollback(metareqid_t reqid) {
3522 assert(need_resolve_rollback.count(reqid));
3523 if (mds->is_resolve())
3524 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3525 need_resolve_rollback.erase(reqid);
3526 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3527 send_subtree_resolves();
3528 process_delayed_resolve();
3529 }
3530}
3531
3532void MDCache::disambiguate_other_imports()
3533{
3534 dout(10) << "disambiguate_other_imports" << dendl;
3535
3536 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3537 // other nodes' ambiguous imports
3538 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3539 p != other_ambiguous_imports.end();
3540 ++p) {
3541 mds_rank_t who = p->first;
3542 dout(10) << "ambiguous imports for mds." << who << dendl;
3543
3544 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3545 q != p->second.end();
3546 ++q) {
3547 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3548 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3549 CDir *dir = get_force_dirfrag(q->first, recovering);
3550 if (!dir) continue;
3551
3552 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3553 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3554 dout(10) << " mds." << who << " did import " << *dir << dendl;
3555 adjust_bounded_subtree_auth(dir, q->second, who);
3556 try_subtree_merge(dir);
3557 } else {
3558 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3559 }
3560 }
3561 }
3562 other_ambiguous_imports.clear();
3563}
3564
3565void MDCache::disambiguate_my_imports()
3566{
3567 dout(10) << "disambiguate_my_imports" << dendl;
3568
3569 if (!mds->is_resolve()) {
3570 assert(my_ambiguous_imports.empty());
3571 return;
3572 }
3573
3574 disambiguate_other_imports();
3575
3576 // my ambiguous imports
3577 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3578 while (!my_ambiguous_imports.empty()) {
3579 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3580
3581 CDir *dir = get_dirfrag(q->first);
3582 assert(dir);
3583
3584 if (dir->authority() != me_ambig) {
3585 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3586 cancel_ambiguous_import(dir);
3587
3588 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3589
3590 // subtree may have been swallowed by another node claiming dir
3591 // as their own.
3592 CDir *root = get_subtree_root(dir);
3593 if (root != dir)
3594 dout(10) << " subtree root is " << *root << dendl;
3595 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3596 try_trim_non_auth_subtree(root);
3597 } else {
3598 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3599 finish_ambiguous_import(q->first);
3600 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3601 }
3602 }
3603 assert(my_ambiguous_imports.empty());
3604 mds->mdlog->flush();
3605
3606 // verify all my subtrees are unambiguous!
3607 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3608 p != subtrees.end();
3609 ++p) {
3610 CDir *dir = p->first;
3611 if (dir->is_ambiguous_dir_auth()) {
3612 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3613 }
3614 assert(!dir->is_ambiguous_dir_auth());
3615 }
3616
3617 show_subtrees();
3618}
3619
3620
3621void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3622{
3623 assert(my_ambiguous_imports.count(base) == 0);
3624 my_ambiguous_imports[base] = bounds;
3625}
3626
3627
3628void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3629{
3630 // make a list
3631 vector<dirfrag_t> binos;
3632 for (set<CDir*>::iterator p = bounds.begin();
3633 p != bounds.end();
3634 ++p)
3635 binos.push_back((*p)->dirfrag());
3636
3637 // note: this can get called twice if the exporter fails during recovery
3638 if (my_ambiguous_imports.count(base->dirfrag()))
3639 my_ambiguous_imports.erase(base->dirfrag());
3640
3641 add_ambiguous_import(base->dirfrag(), binos);
3642}
3643
3644void MDCache::cancel_ambiguous_import(CDir *dir)
3645{
3646 dirfrag_t df = dir->dirfrag();
3647 assert(my_ambiguous_imports.count(df));
3648 dout(10) << "cancel_ambiguous_import " << df
3649 << " bounds " << my_ambiguous_imports[df]
3650 << " " << *dir
3651 << dendl;
3652 my_ambiguous_imports.erase(df);
3653}
3654
3655void MDCache::finish_ambiguous_import(dirfrag_t df)
3656{
3657 assert(my_ambiguous_imports.count(df));
3658 vector<dirfrag_t> bounds;
3659 bounds.swap(my_ambiguous_imports[df]);
3660 my_ambiguous_imports.erase(df);
3661
3662 dout(10) << "finish_ambiguous_import " << df
3663 << " bounds " << bounds
3664 << dendl;
3665 CDir *dir = get_dirfrag(df);
3666 assert(dir);
3667
3668 // adjust dir_auth, import maps
3669 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3670 try_subtree_merge(dir);
3671}
3672
3673void MDCache::remove_inode_recursive(CInode *in)
3674{
3675 dout(10) << "remove_inode_recursive " << *in << dendl;
3676 list<CDir*> ls;
3677 in->get_dirfrags(ls);
3678 list<CDir*>::iterator p = ls.begin();
3679 while (p != ls.end()) {
3680 CDir *subdir = *p++;
3681
3682 dout(10) << " removing dirfrag " << subdir << dendl;
94b18763
FG
3683 auto it = subdir->items.begin();
3684 while (it != subdir->items.end()) {
3685 CDentry *dn = it->second;
3686 ++it;
7c673cae
FG
3687 CDentry::linkage_t *dnl = dn->get_linkage();
3688 if (dnl->is_primary()) {
3689 CInode *tin = dnl->get_inode();
31f18b77 3690 subdir->unlink_inode(dn, false);
7c673cae
FG
3691 remove_inode_recursive(tin);
3692 }
3693 subdir->remove_dentry(dn);
3694 }
3695
3696 if (subdir->is_subtree_root())
3697 remove_subtree(subdir);
3698 in->close_dirfrag(subdir->dirfrag().frag);
3699 }
3700 remove_inode(in);
3701}
3702
3703bool MDCache::expire_recursive(
3704 CInode *in,
3705 map<mds_rank_t, MCacheExpire*>& expiremap)
3706{
3707 assert(!in->is_auth());
3708
3709 dout(10) << __func__ << ":" << *in << dendl;
3710
3711 // Recurse into any dirfrags beneath this inode
3712 list<CDir*> ls;
3713 in->get_dirfrags(ls);
3714 for (auto subdir : ls) {
3715 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3716 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3717 return true;
3718 }
3719
3720 for (auto &it : subdir->items) {
3721 CDentry *dn = it.second;
3722 CDentry::linkage_t *dnl = dn->get_linkage();
3723 if (dnl->is_primary()) {
3724 CInode *tin = dnl->get_inode();
3725
3726 /* Remote strays with linkage (i.e. hardlinks) should not be
3727 * expired, because they may be the target of
3728 * a rename() as the owning MDS shuts down */
3729 if (!tin->is_stray() && tin->inode.nlink) {
3730 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3731 return true;
3732 }
3733
3734 const bool abort = expire_recursive(tin, expiremap);
3735 if (abort) {
3736 return true;
3737 }
3738 }
3739 if (dn->lru_is_expireable()) {
3740 trim_dentry(dn, expiremap);
3741 } else {
3742 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3743 return true;
3744 }
3745 }
3746 }
3747
3748 return false;
3749}
3750
3751void MDCache::trim_unlinked_inodes()
3752{
3753 dout(7) << "trim_unlinked_inodes" << dendl;
3754 list<CInode*> q;
94b18763 3755 for (auto &p : inode_map) {
b32b8144 3756 CInode *in = p.second;
7c673cae
FG
3757 if (in->get_parent_dn() == NULL && !in->is_base()) {
3758 dout(7) << " will trim from " << *in << dendl;
3759 q.push_back(in);
3760 }
3761 }
3762 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3763 remove_inode_recursive(*p);
3764}
3765
3766/** recalc_auth_bits()
3767 * once subtree auth is disambiguated, we need to adjust all the
3768 * auth and dirty bits in our cache before moving on.
3769 */
3770void MDCache::recalc_auth_bits(bool replay)
3771{
3772 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3773
3774 if (root) {
3775 root->inode_auth.first = mds->mdsmap->get_root();
3776 bool auth = mds->get_nodeid() == root->inode_auth.first;
3777 if (auth) {
3778 root->state_set(CInode::STATE_AUTH);
3779 } else {
3780 root->state_clear(CInode::STATE_AUTH);
3781 if (!replay)
3782 root->state_set(CInode::STATE_REJOINING);
3783 }
3784 }
3785
3786 set<CInode*> subtree_inodes;
3787 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3788 p != subtrees.end();
3789 ++p) {
3790 if (p->first->dir_auth.first == mds->get_nodeid())
3791 subtree_inodes.insert(p->first->inode);
3792 }
3793
3794 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3795 p != subtrees.end();
3796 ++p) {
3797 if (p->first->inode->is_mdsdir()) {
3798 CInode *in = p->first->inode;
3799 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3800 if (auth) {
3801 in->state_set(CInode::STATE_AUTH);
3802 } else {
3803 in->state_clear(CInode::STATE_AUTH);
3804 if (!replay)
3805 in->state_set(CInode::STATE_REJOINING);
3806 }
3807 }
3808
3809 list<CDir*> dfq; // dirfrag queue
3810 dfq.push_back(p->first);
3811
3812 bool auth = p->first->authority().first == mds->get_nodeid();
3813 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3814
3815 while (!dfq.empty()) {
3816 CDir *dir = dfq.front();
3817 dfq.pop_front();
3818
3819 // dir
3820 if (auth) {
3821 dir->state_set(CDir::STATE_AUTH);
3822 } else {
3823 dir->state_clear(CDir::STATE_AUTH);
3824 if (!replay) {
3825 // close empty non-auth dirfrag
3826 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3827 dir->inode->close_dirfrag(dir->get_frag());
3828 continue;
3829 }
3830 dir->state_set(CDir::STATE_REJOINING);
3831 dir->state_clear(CDir::STATE_COMPLETE);
3832 if (dir->is_dirty())
3833 dir->mark_clean();
3834 }
3835 }
3836
3837 // dentries in this dir
94b18763 3838 for (auto &p : dir->items) {
7c673cae 3839 // dn
94b18763 3840 CDentry *dn = p.second;
7c673cae
FG
3841 CDentry::linkage_t *dnl = dn->get_linkage();
3842 if (auth) {
3843 dn->state_set(CDentry::STATE_AUTH);
3844 } else {
3845 dn->state_clear(CDentry::STATE_AUTH);
3846 if (!replay) {
3847 dn->state_set(CDentry::STATE_REJOINING);
3848 if (dn->is_dirty())
3849 dn->mark_clean();
3850 }
3851 }
3852
3853 if (dnl->is_primary()) {
3854 // inode
3855 CInode *in = dnl->get_inode();
3856 if (auth) {
3857 in->state_set(CInode::STATE_AUTH);
3858 } else {
3859 in->state_clear(CInode::STATE_AUTH);
3860 if (!replay) {
3861 in->state_set(CInode::STATE_REJOINING);
3862 if (in->is_dirty())
3863 in->mark_clean();
3864 if (in->is_dirty_parent())
3865 in->clear_dirty_parent();
3866 // avoid touching scatterlocks for our subtree roots!
3867 if (subtree_inodes.count(in) == 0)
3868 in->clear_scatter_dirty();
3869 }
3870 }
3871 // recurse?
3872 if (in->is_dir())
3873 in->get_nested_dirfrags(dfq);
3874 }
3875 }
3876 }
3877 }
3878
3879 show_subtrees();
3880 show_cache();
3881}
3882
3883
3884
3885// ===========================================================================
3886// REJOIN
3887
3888/*
3889 * notes on scatterlock recovery:
3890 *
3891 * - recovering inode replica sends scatterlock data for any subtree
3892 * roots (the only ones that are possibly dirty).
3893 *
3894 * - surviving auth incorporates any provided scatterlock data. any
3895 * pending gathers are then finished, as with the other lock types.
3896 *
3897 * that takes care of surviving auth + (recovering replica)*.
3898 *
3899 * - surviving replica sends strong_inode, which includes current
3900 * scatterlock state, AND any dirty scatterlock data. this
3901 * provides the recovering auth with everything it might need.
3902 *
3903 * - recovering auth must pick initial scatterlock state based on
3904 * (weak|strong) rejoins.
3905 * - always assimilate scatterlock data (it can't hurt)
3906 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3907 * - include base inode in ack for all inodes that saw scatterlock content
3908 *
3909 * also, for scatter gather,
3910 *
3911 * - auth increments {frag,r}stat.version on completion of any gather.
3912 *
3913 * - auth incorporates changes in a gather _only_ if the version
3914 * matches.
3915 *
3916 * - replica discards changes any time the scatterlock syncs, and
3917 * after recovery.
3918 */
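/*
 * illustrative sketch only (hypothetical names, not the real accessors): the
 * version rule above amounts to
 *
 *   if (gathered.version == current_stat.version)  // produced against the current version
 *     fold_in(gathered);                           // accept the delta
 *   // else: stale delta, discard it
 *
 * i.e. the auth only folds in scatter-gathered frag/rstat deltas whose version
 * matches the one it handed out.
 */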
3919
3920void MDCache::dump_rejoin_status(Formatter *f) const
3921{
3922 f->open_object_section("rejoin_status");
3923 f->dump_stream("rejoin_gather") << rejoin_gather;
3924 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3925 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3926 f->close_section();
3927}
3928
3929void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3930{
3931 dout(10) << "rejoin_start" << dendl;
3932 assert(!rejoin_done);
3933 rejoin_done.reset(rejoin_done_);
3934
3935 rejoin_gather = recovery_set;
3936 // need to finish opening cap inodes before sending cache rejoins
3937 rejoin_gather.insert(mds->get_nodeid());
3938 process_imported_caps();
3939}
3940
3941/*
3942 * rejoin phase!
3943 *
3944 * this initiates rejoin. it should be called before we get any
3945 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3946 *
3947 * we start out by sending rejoins to everyone in the recovery set.
3948 *
3949 * if we are rejoining, send for all regions in our cache.
3950 * if we are active|stopping, send only to nodes that are rejoining.
3951 */
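// (a rejoining sender builds OP_WEAK messages, a surviving sender builds
//  OP_STRONG ones; see the rejoins map setup below and rejoin_walk() for the
//  per-subtree payload.)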
3952void MDCache::rejoin_send_rejoins()
3953{
3954 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3955
3956 if (rejoin_gather.count(mds->get_nodeid())) {
3957 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3958 rejoins_pending = true;
3959 return;
3960 }
3961 if (!resolve_gather.empty()) {
3962 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3963 << resolve_gather << ")" << dendl;
3964 rejoins_pending = true;
3965 return;
3966 }
3967
3968 assert(!migrator->is_importing());
3969 assert(!migrator->is_exporting());
3970
3971 if (!mds->is_rejoin()) {
3972 disambiguate_other_imports();
3973 }
3974
3975 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3976
3977
3978 // if i am rejoining, send a rejoin to everyone.
3979 // otherwise, just send to others who are rejoining.
3980 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3981 p != recovery_set.end();
3982 ++p) {
3983 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3984 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3985 if (mds->is_rejoin())
3986 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3987 else if (mds->mdsmap->is_rejoin(*p))
3988 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3989 }
3990
3991 if (mds->is_rejoin()) {
3992 map<client_t, set<mds_rank_t> > client_exports;
3993 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
28e407b8 3994 mds_rank_t target = p->second.first;
7c673cae
FG
3995 if (rejoins.count(target) == 0)
3996 continue;
28e407b8
AA
3997 rejoins[target]->cap_exports[p->first] = p->second.second;
3998 for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
7c673cae
FG
3999 client_exports[q->first].insert(target);
4000 }
4001 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
4002 p != client_exports.end();
4003 ++p) {
4004 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
4005 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
4006 rejoins[*q]->client_map[p->first] = inst;
4007 }
4008 }
4009
4010
4011 // check all subtrees
4012 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4013 p != subtrees.end();
4014 ++p) {
4015 CDir *dir = p->first;
4016 assert(dir->is_subtree_root());
4017 if (dir->is_ambiguous_dir_auth()) {
4018 // exporter is recovering, importer is survivor.
4019 assert(rejoins.count(dir->authority().first));
4020 assert(!rejoins.count(dir->authority().second));
4021 continue;
4022 }
4023
4024 // my subtree?
4025 if (dir->is_auth())
4026 continue; // skip my own regions!
4027
4028 mds_rank_t auth = dir->get_dir_auth().first;
4029 assert(auth >= 0);
4030 if (rejoins.count(auth) == 0)
4031 continue; // don't care about this node's subtrees
4032
4033 rejoin_walk(dir, rejoins[auth]);
4034 }
4035
4036 // rejoin root inodes, too
4037 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4038 p != rejoins.end();
4039 ++p) {
4040 if (mds->is_rejoin()) {
4041 // weak
4042 if (p->first == 0 && root) {
4043 p->second->add_weak_inode(root->vino());
4044 if (root->is_dirty_scattered()) {
4045 dout(10) << " sending scatterlock state on root " << *root << dendl;
4046 p->second->add_scatterlock_state(root);
4047 }
4048 }
4049 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4050 if (in)
4051 p->second->add_weak_inode(in->vino());
4052 }
4053 } else {
4054 // strong
4055 if (p->first == 0 && root) {
4056 p->second->add_strong_inode(root->vino(),
4057 root->get_replica_nonce(),
4058 root->get_caps_wanted(),
4059 root->filelock.get_state(),
4060 root->nestlock.get_state(),
4061 root->dirfragtreelock.get_state());
4062 root->state_set(CInode::STATE_REJOINING);
4063 if (root->is_dirty_scattered()) {
4064 dout(10) << " sending scatterlock state on root " << *root << dendl;
4065 p->second->add_scatterlock_state(root);
4066 }
4067 }
4068
4069 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4070 p->second->add_strong_inode(in->vino(),
4071 in->get_replica_nonce(),
4072 in->get_caps_wanted(),
4073 in->filelock.get_state(),
4074 in->nestlock.get_state(),
4075 in->dirfragtreelock.get_state());
4076 in->state_set(CInode::STATE_REJOINING);
4077 }
4078 }
4079 }
4080
4081 if (!mds->is_rejoin()) {
4082 // i am survivor. send strong rejoin.
4083 // note request remote_auth_pins, xlocks
4084 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4085 p != active_requests.end();
4086 ++p) {
4087 MDRequestRef& mdr = p->second;
4088 if (mdr->is_slave())
4089 continue;
4090 // auth pins
4091 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4092 q != mdr->remote_auth_pins.end();
4093 ++q) {
4094 if (!q->first->is_auth()) {
4095 assert(q->second == q->first->authority().first);
4096 if (rejoins.count(q->second) == 0) continue;
4097 MMDSCacheRejoin *rejoin = rejoins[q->second];
4098
4099 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4100 MDSCacheObjectInfo i;
4101 q->first->set_object_info(i);
4102 if (i.ino)
4103 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4104 else
4105 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4106
4107 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4108 mdr->more()->rename_inode == q->first)
4109 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4110 mdr->reqid, mdr->attempt);
4111 }
4112 }
4113 // xlocks
4114 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4115 q != mdr->xlocks.end();
4116 ++q) {
4117 if (!(*q)->get_parent()->is_auth()) {
4118 mds_rank_t who = (*q)->get_parent()->authority().first;
4119 if (rejoins.count(who) == 0) continue;
4120 MMDSCacheRejoin *rejoin = rejoins[who];
4121
4122 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4123 MDSCacheObjectInfo i;
4124 (*q)->get_parent()->set_object_info(i);
4125 if (i.ino)
4126 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4127 mdr->reqid, mdr->attempt);
4128 else
4129 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4130 mdr->reqid, mdr->attempt);
4131 }
4132 }
4133 // remote wrlocks
4134 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4135 q != mdr->remote_wrlocks.end();
4136 ++q) {
4137 mds_rank_t who = q->second;
4138 if (rejoins.count(who) == 0) continue;
4139 MMDSCacheRejoin *rejoin = rejoins[who];
4140
4141 dout(15) << " " << *mdr << " wrlock on " << q->second
4142 << " " << q->first->get_parent() << dendl;
4143 MDSCacheObjectInfo i;
4144 q->first->get_parent()->set_object_info(i);
4145 assert(i.ino);
4146 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4147 mdr->reqid, mdr->attempt);
4148 }
4149 }
4150 }
4151
4152 // send the messages
4153 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4154 p != rejoins.end();
4155 ++p) {
4156 assert(rejoin_sent.count(p->first) == 0);
4157 assert(rejoin_ack_gather.count(p->first) == 0);
4158 rejoin_sent.insert(p->first);
4159 rejoin_ack_gather.insert(p->first);
4160 mds->send_message_mds(p->second, p->first);
4161 }
4162 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4163 rejoins_pending = false;
4164
4165 // nothing?
28e407b8 4166 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4167 dout(10) << "nothing to rejoin" << dendl;
4168 rejoin_gather_finish();
4169 }
4170}
4171
4172
4173/**
4174 * rejoin_walk - build rejoin declarations for a subtree
4175 *
4176 * @param dir subtree root
4177 * @param rejoin rejoin message
4178 *
4179 * from a rejoining node:
4180 * weak dirfrag
4181 * weak dentries (w/ connectivity)
4182 *
4183 * from a surviving node:
4184 * strong dirfrag
4185 * strong dentries (no connectivity!)
4186 * strong inodes
4187 */
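// (the weak walk below only declares primary dentries whose inodes are
//  directories, which is what keeps the declared dirfrags connected; the
//  strong walk declares every dentry together with its nonce and lock state.)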
4188void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4189{
4190 dout(10) << "rejoin_walk " << *dir << dendl;
4191
4192 list<CDir*> nested; // finish this dir, then do nested items
4193
4194 if (mds->is_rejoin()) {
4195 // WEAK
4196 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4197 for (auto &p : dir->items) {
4198 CDentry *dn = p.second;
4199 assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4200 CDentry::linkage_t *dnl = dn->get_linkage();
4201 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4202 assert(dnl->is_primary());
4203 CInode *in = dnl->get_inode();
4204 assert(dnl->get_inode()->is_dir());
94b18763 4205 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
7c673cae
FG
4206 in->get_nested_dirfrags(nested);
4207 if (in->is_dirty_scattered()) {
4208 dout(10) << " sending scatterlock state on " << *in << dendl;
4209 rejoin->add_scatterlock_state(in);
4210 }
4211 }
4212 } else {
4213 // STRONG
4214 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4215 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4216 dir->state_set(CDir::STATE_REJOINING);
4217
94b18763
FG
4218 for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
4219 CDentry *dn = it->second;
7c673cae
FG
4220 CDentry::linkage_t *dnl = dn->get_linkage();
4221 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4222 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4223 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4224 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4225 dnl->is_remote() ? dnl->get_remote_d_type():0,
4226 dn->get_replica_nonce(),
4227 dn->lock.get_state());
4228 dn->state_set(CDentry::STATE_REJOINING);
4229 if (dnl->is_primary()) {
4230 CInode *in = dnl->get_inode();
4231 dout(15) << " add_strong_inode " << *in << dendl;
4232 rejoin->add_strong_inode(in->vino(),
4233 in->get_replica_nonce(),
4234 in->get_caps_wanted(),
4235 in->filelock.get_state(),
4236 in->nestlock.get_state(),
4237 in->dirfragtreelock.get_state());
4238 in->state_set(CInode::STATE_REJOINING);
4239 in->get_nested_dirfrags(nested);
4240 if (in->is_dirty_scattered()) {
4241 dout(10) << " sending scatterlock state on " << *in << dendl;
4242 rejoin->add_scatterlock_state(in);
4243 }
4244 }
4245 }
4246 }
4247
4248 // recurse into nested dirs
4249 for (list<CDir*>::iterator p = nested.begin();
4250 p != nested.end();
4251 ++p)
4252 rejoin_walk(*p, rejoin);
4253}
4254
4255
4256/*
4257 * i got a rejoin.
4258 * - reply with the lockstate
4259 *
4260 * if i am active|stopping,
4261 * - remove source from replica list for everything not referenced here.
4262 * This function puts the passed message before returning.
4263 */
4264void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4265{
4266 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4267 << " (" << m->get_payload().length() << " bytes)"
4268 << dendl;
4269
4270 switch (m->op) {
4271 case MMDSCacheRejoin::OP_WEAK:
4272 handle_cache_rejoin_weak(m);
4273 break;
4274 case MMDSCacheRejoin::OP_STRONG:
4275 handle_cache_rejoin_strong(m);
4276 break;
4277 case MMDSCacheRejoin::OP_ACK:
4278 handle_cache_rejoin_ack(m);
4279 break;
4280
4281 default:
4282 ceph_abort();
4283 }
4284 m->put();
4285}
4286
4287
4288/*
4289 * handle_cache_rejoin_weak
4290 *
4291 * the sender
4292 * - is recovering from their journal.
4293 * - may have incorrect (out of date) inode contents
4294 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4295 *
4296 * if the sender didn't trim_non_auth(), they
4297 * - may have incorrect (out of date) dentry/inode linkage
4298 * - may have deleted/purged inodes
4299 * and i may have to go to disk to get accurate inode contents. yuck.
4300 * This function DOES NOT put the passed message before returning
4301 */
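// Two paths below: a survivor builds an OP_ACK reply immediately (claiming
// exported caps and scouring stale replicas as it goes), while a rejoining
// recipient just records the cap imports and scatterlock state and defers
// acking until its own rejoin gather completes.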
4302void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4303{
4304 mds_rank_t from = mds_rank_t(weak->get_source().num());
4305
4306 // possible response(s)
4307 MMDSCacheRejoin *ack = 0; // if survivor
4308 set<vinodeno_t> acked_inodes; // if survivor
4309 set<SimpleLock *> gather_locks; // if survivor
4310 bool survivor = false; // am i a survivor?
4311
4312 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4313 survivor = true;
4314 dout(10) << "i am a survivor, and will ack immediately" << dendl;
4315 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4316
4317 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4318
4319 // check cap exports
4320 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4321 CInode *in = get_inode(p->first);
4322 assert(!in || in->is_auth());
4323 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4324 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4325 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4326 Capability::Import& im = imported_caps[p->first][q->first];
4327 if (cap) {
4328 im.cap_id = cap->get_cap_id();
4329 im.issue_seq = cap->get_last_seq();
4330 im.mseq = cap->get_mseq();
4331 } else {
4332 // all are zero
4333 }
4334 }
4335 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4336 }
4337
4338 ::encode(imported_caps, ack->imported_caps);
4339 } else {
4340 assert(mds->is_rejoin());
4341
4342 // we may have already received a strong rejoin from the sender.
4343 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4344 assert(gather_locks.empty());
4345
4346 // check cap exports.
4347 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4348
4349 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4350 CInode *in = get_inode(p->first);
b32b8144 4351 assert(!in || in->is_auth());
7c673cae
FG
4352 // note
4353 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4354 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4355 cap_imports[p->first][q->first][from] = q->second;
4356 }
4357 }
4358 }
4359
4360 // assimilate any potentially dirty scatterlock state
4361 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4362 p != weak->inode_scatterlocks.end();
4363 ++p) {
4364 CInode *in = get_inode(p->first);
4365 assert(in);
4366 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4367 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4368 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4369 if (!survivor)
4370 rejoin_potential_updated_scatterlocks.insert(in);
4371 }
4372
4373 // recovering peer may send incorrect dirfrags here. we need to
4374 // infer which dirfrag they meant. the ack will include a
4375 // strong_dirfrag that will set them straight on the fragmentation.
4376
4377 // walk weak map
4378 set<CDir*> dirs_to_share;
4379 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4380 p != weak->weak_dirfrags.end();
4381 ++p) {
4382 CInode *diri = get_inode(p->ino);
4383 if (!diri)
4384 dout(0) << " missing dir ino " << p->ino << dendl;
4385 assert(diri);
4386
4387 list<frag_t> ls;
4388 if (diri->dirfragtree.is_leaf(p->frag)) {
4389 ls.push_back(p->frag);
4390 } else {
4391 diri->dirfragtree.get_leaves_under(p->frag, ls);
4392 if (ls.empty())
4393 ls.push_back(diri->dirfragtree[p->frag.value()]);
4394 }
4395 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4396 frag_t fg = *q;
4397 CDir *dir = diri->get_dirfrag(fg);
4398 if (!dir) {
4399 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4400 continue;
4401 }
4402 assert(dir);
4403 if (dirs_to_share.count(dir)) {
4404 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4405 } else {
4406 dirs_to_share.insert(dir);
4407 unsigned nonce = dir->add_replica(from);
4408 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4409 if (ack) {
4410 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4411 ack->add_dirfrag_base(dir);
4412 }
4413 }
4414 }
4415 }
4416
4417 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4418 p != weak->weak.end();
4419 ++p) {
4420 CInode *diri = get_inode(p->first);
4421 if (!diri)
4422 dout(0) << " missing dir ino " << p->first << dendl;
4423 assert(diri);
4424
4425 // weak dentries
4426 CDir *dir = 0;
4427 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4428 q != p->second.end();
4429 ++q) {
4430 // locate proper dirfrag.
4431 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4432 frag_t fg = diri->pick_dirfrag(q->first.name);
4433 if (!dir || dir->get_frag() != fg) {
4434 dir = diri->get_dirfrag(fg);
4435 if (!dir)
4436 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4437 assert(dir);
4438 assert(dirs_to_share.count(dir));
4439 }
4440
4441 // and dentry
4442 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4443 assert(dn);
4444 CDentry::linkage_t *dnl = dn->get_linkage();
4445 assert(dnl->is_primary());
4446
4447 if (survivor && dn->is_replica(from))
4448 dentry_remove_replica(dn, from, gather_locks);
4449 unsigned dnonce = dn->add_replica(from);
4450 dout(10) << " have " << *dn << dendl;
4451 if (ack)
94b18763 4452 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4453 dnl->get_inode()->ino(), inodeno_t(0), 0,
4454 dnonce, dn->lock.get_replica_state());
4455
4456 // inode
4457 CInode *in = dnl->get_inode();
4458 assert(in);
4459
4460 if (survivor && in->is_replica(from))
4461 inode_remove_replica(in, from, true, gather_locks);
4462 unsigned inonce = in->add_replica(from);
4463 dout(10) << " have " << *in << dendl;
4464
4465 // scatter the dirlock, just in case?
4466 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4467 in->filelock.set_state(LOCK_MIX);
4468
4469 if (ack) {
4470 acked_inodes.insert(in->vino());
4471 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4472 bufferlist bl;
4473 in->_encode_locks_state_for_rejoin(bl, from);
4474 ack->add_inode_locks(in, inonce, bl);
4475 }
4476 }
4477 }
4478
4479 // weak base inodes? (root, stray, etc.)
4480 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4481 p != weak->weak_inodes.end();
4482 ++p) {
4483 CInode *in = get_inode(*p);
4484 assert(in); // hmm fixme wrt stray?
4485 if (survivor && in->is_replica(from))
4486 inode_remove_replica(in, from, true, gather_locks);
4487 unsigned inonce = in->add_replica(from);
4488 dout(10) << " have base " << *in << dendl;
4489
4490 if (ack) {
4491 acked_inodes.insert(in->vino());
4492 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4493 bufferlist bl;
4494 in->_encode_locks_state_for_rejoin(bl, from);
4495 ack->add_inode_locks(in, inonce, bl);
4496 }
4497 }
4498
4499 assert(rejoin_gather.count(from));
4500 rejoin_gather.erase(from);
4501 if (survivor) {
4502 // survivor. do everything now.
4503 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4504 p != weak->inode_scatterlocks.end();
4505 ++p) {
4506 CInode *in = get_inode(p->first);
4507 assert(in);
4508 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4509 acked_inodes.insert(in->vino());
4510 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4511 }
4512
4513 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4514 mds->send_message(ack, weak->get_connection());
4515
4516 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4517 if (!(*p)->is_stable())
4518 mds->locker->eval_gather(*p);
4519 }
4520 } else {
4521 // done?
28e407b8 4522 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4523 rejoin_gather_finish();
4524 } else {
4525 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4526 }
4527 }
4528}
4529
7c673cae
FG
4530/*
4531 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4532 *
4533 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4534 * ack, the replica does not exist, and we can remove it from our replica maps.
4535 */
4536void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4537 set<vinodeno_t>& acked_inodes,
4538 set<SimpleLock *>& gather_locks)
4539{
4540 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4541
b32b8144 4542 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4543 // inode?
4544 if (in->is_auth() &&
4545 in->is_replica(from) &&
b32b8144 4546 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4547 inode_remove_replica(in, from, false, gather_locks);
4548 dout(10) << " rem " << *in << dendl;
4549 }
4550
b32b8144
FG
4551 if (!in->is_dir())
4552 return;
7c673cae
FG
4553
4554 list<CDir*> dfs;
4555 in->get_dirfrags(dfs);
4556 for (list<CDir*>::iterator p = dfs.begin();
4557 p != dfs.end();
4558 ++p) {
4559 CDir *dir = *p;
181888fb
FG
4560 if (!dir->is_auth())
4561 continue;
7c673cae 4562
181888fb 4563 if (dir->is_replica(from) &&
7c673cae
FG
4564 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4565 dir->remove_replica(from);
4566 dout(10) << " rem " << *dir << dendl;
4567 }
4568
4569 // dentries
94b18763
FG
4570 for (auto &p : dir->items) {
4571 CDentry *dn = p.second;
7c673cae
FG
4572
4573 if (dn->is_replica(from) &&
4574 (ack == NULL ||
4575 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
94b18763 4576 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
7c673cae
FG
4577 dentry_remove_replica(dn, from, gather_locks);
4578 dout(10) << " rem " << *dn << dendl;
4579 }
4580 }
4581 }
b32b8144
FG
4582 };
4583
94b18763 4584 for (auto &p : inode_map)
b32b8144 4585 scour_func(p.second);
94b18763 4586 for (auto &p : snap_inode_map)
b32b8144 4587 scour_func(p.second);
7c673cae
FG
4588}
4589
4590
4591CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4592{
4593 CInode *in = new CInode(this, true, 1, last);
4594 in->inode.ino = ino;
4595 in->state_set(CInode::STATE_REJOINUNDEF);
4596 add_inode(in);
4597 rejoin_undef_inodes.insert(in);
4598 dout(10) << " invented " << *in << dendl;
4599 return in;
4600}
4601
4602CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4603{
4604 CInode *in = get_inode(df.ino);
4605 if (!in)
4606 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4607 if (!in->is_dir()) {
4608 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4609 in->inode.mode = S_IFDIR;
4610 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4611 }
4612 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4613 dir->state_set(CDir::STATE_REJOINUNDEF);
4614 rejoin_undef_dirfrags.insert(dir);
4615 dout(10) << " invented " << *dir << dendl;
4616 return dir;
4617}
4618
4619/* This function DOES NOT put the passed message before returning */
4620void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4621{
4622 mds_rank_t from = mds_rank_t(strong->get_source().num());
4623
4624 // only a recovering node will get a strong rejoin.
4625 assert(mds->is_rejoin());
4626
4627 // assimilate any potentially dirty scatterlock state
4628 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4629 p != strong->inode_scatterlocks.end();
4630 ++p) {
4631 CInode *in = get_inode(p->first);
4632 assert(in);
4633 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4634 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4635 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4636 rejoin_potential_updated_scatterlocks.insert(in);
4637 }
4638
4639 rejoin_unlinked_inodes[from].clear();
4640
4641 // surviving peer may send incorrect dirfrag here (maybe they didn't
4642 // get the fragment notify, or maybe we rolled back?). we need to
4643 // infer the right frag and get them with the program. somehow.
4644 // we don't normally send ACK.. so we'll need to bundle this with
4645 // MISSING or something.
4646
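  // if a declared frag no longer matches our dirfragtree, we map it onto the
  // covering leaf frags instead (the 'refragged' case below) and replicate
  // those.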
4647 // strong dirfrags/dentries.
4648 // also process auth_pins, xlocks.
4649 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4650 p != strong->strong_dirfrags.end();
4651 ++p) {
4652 CInode *diri = get_inode(p->first.ino);
4653 if (!diri)
4654 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4655 CDir *dir = diri->get_dirfrag(p->first.frag);
4656 bool refragged = false;
4657 if (dir) {
4658 dout(10) << " have " << *dir << dendl;
4659 } else {
4660 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4661 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4662 else if (diri->dirfragtree.is_leaf(p->first.frag))
4663 dir = rejoin_invent_dirfrag(p->first);
4664 }
4665 if (dir) {
4666 dir->add_replica(from, p->second.nonce);
4667 dir->dir_rep = p->second.dir_rep;
4668 } else {
4669 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4670 list<frag_t> ls;
4671 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4672 if (ls.empty())
4673 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4674 dout(10) << " maps to frag(s) " << ls << dendl;
4675 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4676 CDir *dir = diri->get_dirfrag(*q);
4677 if (!dir)
4678 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4679 else
4680 dout(10) << " have(approx) " << *dir << dendl;
4681 dir->add_replica(from, p->second.nonce);
4682 dir->dir_rep = p->second.dir_rep;
4683 }
4684 refragged = true;
4685 }
4686
4687 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4688 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4689 q != dmap.end();
4690 ++q) {
4691 CDentry *dn;
4692 if (!refragged)
4693 dn = dir->lookup(q->first.name, q->first.snapid);
4694 else {
4695 frag_t fg = diri->pick_dirfrag(q->first.name);
4696 dir = diri->get_dirfrag(fg);
4697 assert(dir);
4698 dn = dir->lookup(q->first.name, q->first.snapid);
4699 }
4700 if (!dn) {
4701 if (q->second.is_remote()) {
4702 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4703 q->second.first, q->first.snapid);
4704 } else if (q->second.is_null()) {
4705 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4706 } else {
4707 CInode *in = get_inode(q->second.ino, q->first.snapid);
4708 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4709 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4710 }
4711 dout(10) << " invented " << *dn << dendl;
4712 }
4713 CDentry::linkage_t *dnl = dn->get_linkage();
4714
4715 // dn auth_pin?
4716 if (strong->authpinned_dentries.count(p->first) &&
4717 strong->authpinned_dentries[p->first].count(q->first)) {
4718 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4719 r != strong->authpinned_dentries[p->first][q->first].end();
4720 ++r) {
4721 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4722
4723 // get/create slave mdrequest
4724 MDRequestRef mdr;
4725 if (have_request(r->reqid))
4726 mdr = request_get(r->reqid);
4727 else
4728 mdr = request_start_slave(r->reqid, r->attempt, strong);
4729 mdr->auth_pin(dn);
4730 }
4731 }
4732
4733 // dn xlock?
4734 if (strong->xlocked_dentries.count(p->first) &&
4735 strong->xlocked_dentries[p->first].count(q->first)) {
4736 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4737 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4738 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4739 assert(mdr->is_auth_pinned(dn));
4740 if (!mdr->xlocks.count(&dn->versionlock)) {
4741 assert(dn->versionlock.can_xlock_local());
4742 dn->versionlock.get_xlock(mdr, mdr->get_client());
4743 mdr->xlocks.insert(&dn->versionlock);
4744 mdr->locks.insert(&dn->versionlock);
4745 }
4746 if (dn->lock.is_stable())
4747 dn->auth_pin(&dn->lock);
4748 dn->lock.set_state(LOCK_XLOCK);
4749 dn->lock.get_xlock(mdr, mdr->get_client());
4750 mdr->xlocks.insert(&dn->lock);
4751 mdr->locks.insert(&dn->lock);
4752 }
4753
4754 dn->add_replica(from, q->second.nonce);
4755 dout(10) << " have " << *dn << dendl;
4756
4757 if (dnl->is_primary()) {
4758 if (q->second.is_primary()) {
4759 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4760 // the survivor missed MDentryUnlink+MDentryLink messages ?
4761 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4762 CInode *in = get_inode(q->second.ino, q->first.snapid);
4763 assert(in);
4764 assert(in->get_parent_dn());
4765 rejoin_unlinked_inodes[from].insert(in);
4766 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4767 }
4768 } else {
4769 // the survivor missed MDentryLink message ?
4770 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4771 dout(7) << " sender doesn't have primary dentry" << dendl;
4772 }
4773 } else {
4774 if (q->second.is_primary()) {
4775 // the survivor missed MDentryUnlink message ?
4776 CInode *in = get_inode(q->second.ino, q->first.snapid);
4777 assert(in);
4778 assert(in->get_parent_dn());
4779 rejoin_unlinked_inodes[from].insert(in);
4780 dout(7) << " sender has primary dentry but we don't" << dendl;
4781 }
4782 }
4783 }
4784 }
4785
4786 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4787 p != strong->strong_inodes.end();
4788 ++p) {
4789 CInode *in = get_inode(p->first);
4790 assert(in);
4791 in->add_replica(from, p->second.nonce);
4792 dout(10) << " have " << *in << dendl;
4793
4794 MMDSCacheRejoin::inode_strong &is = p->second;
4795
4796 // caps_wanted
4797 if (is.caps_wanted) {
4798 in->mds_caps_wanted[from] = is.caps_wanted;
4799 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4800 << " on " << *in << dendl;
4801 }
4802
4803 // scatterlocks?
4804 // infer state from replica state:
4805 // * go to MIX if they might have wrlocks
4806 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4807 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4808 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4809 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4810
4811 // auth pin?
4812 if (strong->authpinned_inodes.count(in->vino())) {
4813 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4814 r != strong->authpinned_inodes[in->vino()].end();
4815 ++r) {
4816 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4817
4818 // get/create slave mdrequest
4819 MDRequestRef mdr;
4820 if (have_request(r->reqid))
4821 mdr = request_get(r->reqid);
4822 else
4823 mdr = request_start_slave(r->reqid, r->attempt, strong);
4824 if (strong->frozen_authpin_inodes.count(in->vino())) {
4825 assert(!in->get_num_auth_pins());
4826 mdr->freeze_auth_pin(in);
4827 } else {
4828 assert(!in->is_frozen_auth_pin());
4829 }
4830 mdr->auth_pin(in);
4831 }
4832 }
4833 // xlock(s)?
4834 if (strong->xlocked_inodes.count(in->vino())) {
4835 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4836 q != strong->xlocked_inodes[in->vino()].end();
4837 ++q) {
4838 SimpleLock *lock = in->get_lock(q->first);
4839 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4840 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4841 assert(mdr->is_auth_pinned(in));
4842 if (!mdr->xlocks.count(&in->versionlock)) {
4843 assert(in->versionlock.can_xlock_local());
4844 in->versionlock.get_xlock(mdr, mdr->get_client());
4845 mdr->xlocks.insert(&in->versionlock);
4846 mdr->locks.insert(&in->versionlock);
4847 }
4848 if (lock->is_stable())
4849 in->auth_pin(lock);
4850 lock->set_state(LOCK_XLOCK);
4851 if (lock == &in->filelock)
4852 in->loner_cap = -1;
4853 lock->get_xlock(mdr, mdr->get_client());
4854 mdr->xlocks.insert(lock);
4855 mdr->locks.insert(lock);
4856 }
4857 }
4858 }
4859 // wrlock(s)?
4860 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4861 p != strong->wrlocked_inodes.end();
4862 ++p) {
4863 CInode *in = get_inode(p->first);
4864 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4865 q != p->second.end();
4866 ++q) {
4867 SimpleLock *lock = in->get_lock(q->first);
4868 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4869 r != q->second.end();
4870 ++r) {
4871 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4872 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4873 if (in->is_auth())
4874 assert(mdr->is_auth_pinned(in));
4875 lock->set_state(LOCK_MIX);
4876 if (lock == &in->filelock)
4877 in->loner_cap = -1;
4878 lock->get_wrlock(true);
4879 mdr->wrlocks.insert(lock);
4880 mdr->locks.insert(lock);
4881 }
4882 }
4883 }
4884
4885 // done?
4886 assert(rejoin_gather.count(from));
4887 rejoin_gather.erase(from);
28e407b8 4888 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4889 rejoin_gather_finish();
4890 } else {
4891 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4892 }
4893}
4894
4895/* This function DOES NOT put the passed message before returning */
4896void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4897{
4898 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4899 mds_rank_t from = mds_rank_t(ack->get_source().num());
4900
b32b8144
FG
4901 assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4902 bool survivor = !mds->is_rejoin();
4903
7c673cae
FG
4904 // for sending cache expire message
4905 set<CInode*> isolated_inodes;
4906 set<CInode*> refragged_inodes;
4907
4908 // dirs
4909 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4910 p != ack->strong_dirfrags.end();
4911 ++p) {
4912 // we may have had incorrect dir fragmentation; refragment based
4913 // on what the auth tells us.
4914 CDir *dir = get_dirfrag(p->first);
4915 if (!dir) {
4916 dir = get_force_dirfrag(p->first, false);
4917 if (dir)
4918 refragged_inodes.insert(dir->get_inode());
4919 }
4920 if (!dir) {
4921 CInode *diri = get_inode(p->first.ino);
4922 if (!diri) {
4923 // barebones inode; the full inode loop below will clean up.
4924 diri = new CInode(this, false);
4925 diri->inode.ino = p->first.ino;
4926 diri->inode.mode = S_IFDIR;
4927 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4928 add_inode(diri);
4929 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4930 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4931 dout(10) << " add inode " << *diri << dendl;
4932 } else {
4933 diri->inode_auth = CDIR_AUTH_DEFAULT;
4934 isolated_inodes.insert(diri);
4935 dout(10) << " unconnected dirfrag " << p->first << dendl;
4936 }
4937 }
4938 // barebones dirfrag; the full dirfrag loop below will clean up.
4939 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4940 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4941 (dir->authority() != CDIR_AUTH_UNDEF &&
4942 dir->authority().first != from))
4943 adjust_subtree_auth(dir, from);
4944 dout(10) << " add dirfrag " << *dir << dendl;
4945 }
4946
4947 dir->set_replica_nonce(p->second.nonce);
4948 dir->state_clear(CDir::STATE_REJOINING);
4949 dout(10) << " got " << *dir << dendl;
4950
4951 // dentries
4952 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4953 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4954 q != dmap.end();
4955 ++q) {
4956 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4957 if(!dn)
4958 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4959
4960 CDentry::linkage_t *dnl = dn->get_linkage();
4961
4962 assert(dn->last == q->first.snapid);
4963 if (dn->first != q->second.first) {
4964 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4965 dn->first = q->second.first;
4966 }
4967
4968 // may have bad linkage if we missed dentry link/unlink messages
4969 if (dnl->is_primary()) {
4970 CInode *in = dnl->get_inode();
4971 if (!q->second.is_primary() ||
4972 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4973 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4974 dir->unlink_inode(dn);
4975 }
4976 } else if (dnl->is_remote()) {
4977 if (!q->second.is_remote() ||
4978 q->second.remote_ino != dnl->get_remote_ino() ||
4979 q->second.remote_d_type != dnl->get_remote_d_type()) {
4980 dout(10) << " had bad linkage for " << *dn << dendl;
4981 dir->unlink_inode(dn);
4982 }
4983 } else {
4984 if (!q->second.is_null())
4985 dout(10) << " had bad linkage for " << *dn << dendl;
4986 }
4987
4988 // hmm, did we have the proper linkage here?
4989 if (dnl->is_null() && !q->second.is_null()) {
4990 if (q->second.is_remote()) {
4991 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4992 } else {
4993 CInode *in = get_inode(q->second.ino, q->first.snapid);
4994 if (!in) {
4995 // barebones inode; assume it's a dir, the full inode loop below will clean up.
4996 in = new CInode(this, false, q->second.first, q->first.snapid);
4997 in->inode.ino = q->second.ino;
4998 in->inode.mode = S_IFDIR;
4999 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
5000 add_inode(in);
5001 dout(10) << " add inode " << *in << dendl;
5002 } else if (in->get_parent_dn()) {
5003 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5004 << ", unlinking " << *in << dendl;
5005 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5006 }
5007 dn->dir->link_primary_inode(dn, in);
5008 isolated_inodes.erase(in);
5009 }
5010 }
5011
5012 dn->set_replica_nonce(q->second.nonce);
b32b8144 5013 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
7c673cae
FG
5014 dn->state_clear(CDentry::STATE_REJOINING);
5015 dout(10) << " got " << *dn << dendl;
5016 }
5017 }
5018
5019 for (set<CInode*>::iterator p = refragged_inodes.begin();
5020 p != refragged_inodes.end();
5021 ++p) {
5022 list<CDir*> ls;
5023 (*p)->get_nested_dirfrags(ls);
5024 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5025 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5026 continue;
5027 assert((*q)->get_num_any() == 0);
5028 (*p)->close_dirfrag((*q)->get_frag());
5029 }
5030 }
5031
5032 // full dirfrags
5033 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5034 p != ack->dirfrag_bases.end();
5035 ++p) {
5036 CDir *dir = get_dirfrag(p->first);
5037 assert(dir);
5038 bufferlist::iterator q = p->second.begin();
5039 dir->_decode_base(q);
5040 dout(10) << " got dir replica " << *dir << dendl;
5041 }
5042
5043 // full inodes
5044 bufferlist::iterator p = ack->inode_base.begin();
5045 while (!p.end()) {
5046 inodeno_t ino;
5047 snapid_t last;
5048 bufferlist basebl;
5049 ::decode(ino, p);
5050 ::decode(last, p);
5051 ::decode(basebl, p);
5052 CInode *in = get_inode(ino, last);
5053 assert(in);
5054 bufferlist::iterator q = basebl.begin();
5055 in->_decode_base(q);
5056 dout(10) << " got inode base " << *in << dendl;
5057 }
5058
5059 // inodes
5060 p = ack->inode_locks.begin();
5061 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5062 while (!p.end()) {
5063 inodeno_t ino;
5064 snapid_t last;
5065 __u32 nonce;
5066 bufferlist lockbl;
5067 ::decode(ino, p);
5068 ::decode(last, p);
5069 ::decode(nonce, p);
5070 ::decode(lockbl, p);
5071
5072 CInode *in = get_inode(ino, last);
5073 assert(in);
5074 in->set_replica_nonce(nonce);
5075 bufferlist::iterator q = lockbl.begin();
b32b8144 5076 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5077 in->state_clear(CInode::STATE_REJOINING);
5078 dout(10) << " got inode locks " << *in << dendl;
5079 }
5080
5081 // FIXME: This can happen if an entire subtree, together with the inode the subtree
5082 // root belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5083 assert(isolated_inodes.empty());
5084
5085 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5086 bufferlist::iterator bp = ack->imported_caps.begin();
5087 ::decode(peer_imported, bp);
5088
5089 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5090 p != peer_imported.end();
5091 ++p) {
28e407b8
AA
5092 auto& ex = cap_exports.at(p->first);
5093 assert(ex.first == from);
7c673cae
FG
5094 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5095 q != p->second.end();
5096 ++q) {
28e407b8
AA
5097 auto r = ex.second.find(q->first);
5098 assert(r != ex.second.end());
7c673cae
FG
5099
5100 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5101 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5102 if (!session) {
5103 dout(10) << " no session for client." << p->first << dendl;
5104 ex.second.erase(r);
5105 continue;
5106 }
7c673cae
FG
5107
5108 // mark client caps stale.
5109 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5110 r->second.capinfo.cap_id, 0,
7c673cae
FG
5111 mds->get_osd_epoch_barrier());
5112 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5113 (q->second.cap_id > 0 ? from : -1), 0);
5114 mds->send_message_client_counted(m, session);
5115
28e407b8 5116 ex.second.erase(r);
7c673cae 5117 }
28e407b8 5118 assert(ex.second.empty());
7c673cae
FG
5119 }
5120
5121 // done?
5122 assert(rejoin_ack_gather.count(from));
5123 rejoin_ack_gather.erase(from);
b32b8144 5124 if (!survivor) {
7c673cae
FG
5125
5126 if (rejoin_gather.empty()) {
5127 // eval unstable scatter locks after all wrlocks are rejoined.
5128 while (!rejoin_eval_locks.empty()) {
5129 SimpleLock *lock = rejoin_eval_locks.front();
5130 rejoin_eval_locks.pop_front();
5131 if (!lock->is_stable())
5132 mds->locker->eval_gather(lock);
5133 }
5134 }
5135
5136 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5137 rejoin_ack_gather.empty()) {
5138 // finally, kickstart past snap parent opens
5139 open_snap_parents();
5140 } else {
5141 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5142 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5143 }
5144 } else {
5145 // survivor.
5146 mds->queue_waiters(rejoin_waiters);
5147 }
5148}
5149
5150/**
5151 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5152 *
5153 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5154 * messages that clean these guys up...
5155 */
5156void MDCache::rejoin_trim_undef_inodes()
5157{
5158 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5159
5160 while (!rejoin_undef_inodes.empty()) {
5161 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5162 CInode *in = *p;
5163 rejoin_undef_inodes.erase(p);
5164
5165 in->clear_replica_map();
5166
5167 // close out dirfrags
5168 if (in->is_dir()) {
5169 list<CDir*> dfls;
5170 in->get_dirfrags(dfls);
5171 for (list<CDir*>::iterator p = dfls.begin();
5172 p != dfls.end();
5173 ++p) {
5174 CDir *dir = *p;
5175 dir->clear_replica_map();
5176
94b18763
FG
5177 for (auto &p : dir->items) {
5178 CDentry *dn = p.second;
7c673cae
FG
5179 dn->clear_replica_map();
5180
5181 dout(10) << " trimming " << *dn << dendl;
5182 dir->remove_dentry(dn);
5183 }
5184
5185 dout(10) << " trimming " << *dir << dendl;
5186 in->close_dirfrag(dir->dirfrag().frag);
5187 }
5188 }
5189
5190 CDentry *dn = in->get_parent_dn();
5191 if (dn) {
5192 dn->clear_replica_map();
5193 dout(10) << " trimming " << *dn << dendl;
5194 dn->dir->remove_dentry(dn);
5195 } else {
5196 dout(10) << " trimming " << *in << dendl;
5197 remove_inode(in);
5198 }
5199 }
5200
5201 assert(rejoin_undef_inodes.empty());
5202}
5203
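/*
 * rejoin_gather_finish() -- run once all rejoin messages have been gathered on a
 * rejoining MDS.  May return early (and be re-invoked later) while undef
 * inodes/dirfrags are still being fetched or missing cap-import inodes are still
 * being opened.  Otherwise it chooses lock states, reconnects caps, identifies
 * files needing size recovery, sends rejoin acks, and, if all acks are already
 * in, moves on to opening past snap parents.
 */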
5204void MDCache::rejoin_gather_finish()
5205{
5206 dout(10) << "rejoin_gather_finish" << dendl;
5207 assert(mds->is_rejoin());
28e407b8 5208 assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5209
5210 if (open_undef_inodes_dirfrags())
5211 return;
5212
5213 if (process_imported_caps())
5214 return;
5215
5216 choose_lock_states_and_reconnect_caps();
5217
5218 identify_files_to_recover();
5219 rejoin_send_acks();
5220
5221 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5222 rejoin_ack_gather.erase(mds->get_nodeid());
5223
5224 // did we already get our acks too?
5225 if (rejoin_ack_gather.empty()) {
5226 // finally, kickstart past snap parent opens
5227 open_snap_parents();
5228 }
5229}
5230
5231class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5232 inodeno_t ino;
5233public:
5234 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5235 void finish(int r) override {
5236 mdcache->rejoin_open_ino_finish(ino, r);
5237 }
5238};
5239
5240void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5241{
5242 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5243
5244 if (ret < 0) {
5245 cap_imports_missing.insert(ino);
5246 } else if (ret == mds->get_nodeid()) {
5247 assert(get_inode(ino));
5248 } else {
5249 auto p = cap_imports.find(ino);
5250 assert(p != cap_imports.end());
5251 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5252 assert(q->second.count(MDS_RANK_NONE));
5253 assert(q->second.size() == 1);
5254 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5255 }
5256 cap_imports.erase(p);
5257 }
5258
5259 assert(cap_imports_num_opening > 0);
5260 cap_imports_num_opening--;
5261
5262 if (cap_imports_num_opening == 0) {
5263 if (rejoin_gather.empty())
5264 rejoin_gather_finish();
5265 else if (rejoin_gather.count(mds->get_nodeid()))
5266 process_imported_caps();
5267 }
5268}
5269
5270class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5271public:
28e407b8
AA
5272 map<client_t,pair<Session*,uint64_t> > session_map;
5273 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae
FG
5274 void finish(int r) override {
5275 assert(r == 0);
28e407b8 5276 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5277 }
5278};
5279
28e407b8 5280void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5281{
5282 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5283 mds->server->finish_force_open_sessions(session_map);
5284 rejoin_session_map.swap(session_map);
7c673cae
FG
5285 if (rejoin_gather.empty())
5286 rejoin_gather_finish();
5287}
5288
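/*
 * process_imported_caps() -- handle client caps gathered during reconnect.
 * Opens any inodes we hold cap imports for but do not yet have in cache, and
 * force-opens client sessions if needed; returns true while that work is still
 * in flight.  Otherwise, merges caps exported by slave renames, reconnects the
 * remaining imported caps, and clears them.  If our own rejoin is still
 * pending, it instead trims non-auth cache and sends pending rejoins.
 */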
5289bool MDCache::process_imported_caps()
5290{
5291 dout(10) << "process_imported_caps" << dendl;
5292
5293 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5294 CInode *in = get_inode(p->first);
5295 if (in) {
5296 assert(in->is_auth());
5297 cap_imports_missing.erase(p->first);
5298 continue;
5299 }
5300 if (cap_imports_missing.count(p->first) > 0)
5301 continue;
5302
5303 cap_imports_num_opening++;
5304 dout(10) << " opening missing ino " << p->first << dendl;
5305 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
28e407b8
AA
5306 if (!(cap_imports_num_opening % 1000))
5307 mds->heartbeat_reset();
7c673cae
FG
5308 }
5309
5310 if (cap_imports_num_opening > 0)
5311 return true;
5312
5313 // called by rejoin_gather_finish() ?
5314 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5315 if (!rejoin_client_map.empty() &&
5316 rejoin_session_map.empty()) {
5317 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5318 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5319 finish->session_map);
5320 mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
5321 mds->mdlog->flush();
5322 rejoin_client_map.clear();
5323 return true;
7c673cae 5324 }
7c673cae
FG
5325
5326 // process caps that were exported by slave rename
5327 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5328 p != rejoin_slave_exports.end();
5329 ++p) {
5330 CInode *in = get_inode(p->first);
5331 assert(in);
5332 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5333 q != p->second.second.end();
5334 ++q) {
28e407b8
AA
5335 auto r = rejoin_session_map.find(q->first);
5336 if (r == rejoin_session_map.end())
5337 continue;
7c673cae 5338
28e407b8 5339 Session *session = r->second.first;
7c673cae
FG
5340 Capability *cap = in->get_client_cap(q->first);
5341 if (!cap)
5342 cap = in->add_client_cap(q->first, session);
5343 cap->merge(q->second, true);
5344
5345 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5346 assert(cap->get_last_seq() == im.issue_seq);
5347 assert(cap->get_mseq() == im.mseq);
5348 cap->set_cap_id(im.cap_id);
5349 // send cap import because we assigned a new cap ID
5350 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5351 p->second.first, CEPH_CAP_FLAG_AUTH);
5352 }
5353 }
5354 rejoin_slave_exports.clear();
5355 rejoin_imported_caps.clear();
5356
5357 // process cap imports
5358 // ino -> client -> frommds -> capex
5359 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5360 CInode *in = get_inode(p->first);
5361 if (!in) {
5362 dout(10) << " still missing ino " << p->first
5363 << ", will try again after replayed client requests" << dendl;
5364 ++p;
5365 continue;
5366 }
5367 assert(in->is_auth());
5368 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5369 Session *session;
5370 {
5371 auto r = rejoin_session_map.find(q->first);
5372 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5373 }
5374
7c673cae 5375 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5376 if (!session) {
5377 if (r->first >= 0)
5378 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5379 continue;
5380 }
5381
7c673cae
FG
5382 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5383 add_reconnected_cap(q->first, in->ino(), r->second);
5384 if (r->first >= 0) {
5385 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5386 cap->inc_mseq();
5387 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5388
5389 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5390 im.cap_id = cap->get_cap_id();
5391 im.issue_seq = cap->get_last_seq();
5392 im.mseq = cap->get_mseq();
5393 }
5394 }
5395 }
5396 cap_imports.erase(p++); // remove and move on
5397 }
5398 } else {
5399 trim_non_auth();
5400
28e407b8 5401 assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5402 rejoin_gather.erase(mds->get_nodeid());
28e407b8 5403 assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5404 maybe_send_pending_rejoins();
7c673cae
FG
5405 }
5406 return false;
5407}
5408
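// check_realm_past_parents() -- if this realm's past parents are already open,
// optionally finish any pending client snaprealm reconnects for it; otherwise
// pin the realm inode and queue it in missing_snap_parents so that
// open_snap_parents() can open them later.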
5409void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5410{
5411 // are this realm's parents fully open?
5412 if (realm->have_past_parents_open()) {
5413 dout(10) << " have past snap parents for realm " << *realm
5414 << " on " << *realm->inode << dendl;
5415 if (reconnect) {
5416 // finish off client snaprealm reconnects?
5417 auto p = reconnected_snaprealms.find(realm->inode->ino());
5418 if (p != reconnected_snaprealms.end()) {
5419 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5420 finish_snaprealm_reconnect(q->first, realm, q->second);
5421 reconnected_snaprealms.erase(p);
5422 }
5423 }
5424 } else {
5425 if (!missing_snap_parents.count(realm->inode)) {
5426 dout(10) << " MISSING past snap parents for realm " << *realm
5427 << " on " << *realm->inode << dendl;
5428 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5429 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5430 } else {
5431 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5432 << " on " << *realm->inode << dendl;
5433 }
5434 }
5435}
5436
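// rebuild_need_snapflush() -- for each past snapshot inode between snap_follows
// and the head, put the relevant locks into LOCK_SNAP_SYNC with a wrlock held
// and record that a snapflush is expected from this client, so the head inode
// gathers the flush before those locks can move again.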
5437void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5438 client_t client, snapid_t snap_follows)
5439{
5440 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5441
5442 const set<snapid_t>& snaps = realm->get_snaps();
5443 snapid_t follows = snap_follows;
5444
5445 while (true) {
5446 CInode *in = pick_inode_snap(head_in, follows);
5447 if (in == head_in)
5448 break;
5449 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5450
5451 /* TODO: we can check the reconnected/flushing caps to find
5452 * which locks need gathering */
5453 for (int i = 0; i < num_cinode_locks; i++) {
5454 int lockid = cinode_lock_info[i].lock;
5455 SimpleLock *lock = in->get_lock(lockid);
5456 assert(lock);
5457 in->client_snap_caps[lockid].insert(client);
5458 in->auth_pin(lock);
5459 lock->set_state(LOCK_SNAP_SYNC);
5460 lock->get_wrlock(true);
5461 }
5462
5463 for (auto p = snaps.lower_bound(in->first);
5464 p != snaps.end() && *p <= in->last;
5465 ++p) {
5466 head_in->add_need_snapflush(in, *p, client);
5467 }
5468
5469 follows = in->last;
5470 }
5471}
5472
5473/*
5474 * choose lock states based on reconnected caps
5475 */
5476void MDCache::choose_lock_states_and_reconnect_caps()
5477{
5478 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5479
5480 map<client_t,MClientSnap*> splits;
5481
b32b8144
FG
5482 for (auto i : inode_map) {
5483 CInode *in = i.second;
7c673cae
FG
5484
5485 if (in->last != CEPH_NOSNAP)
5486 continue;
5487
5488 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5489 in->mark_dirty_rstat();
5490
7c673cae 5491 int dirty_caps = 0;
b32b8144 5492 auto p = reconnected_caps.find(in->ino());
7c673cae
FG
5493 if (p != reconnected_caps.end()) {
5494 for (const auto &it : p->second)
5495 dirty_caps |= it.second.dirty_caps;
5496 }
5497 in->choose_lock_states(dirty_caps);
5498 dout(15) << " chose lock states on " << *in << dendl;
5499
5500 SnapRealm *realm = in->find_snaprealm();
5501
5502 check_realm_past_parents(realm, realm == in->snaprealm);
5503
5504 if (p != reconnected_caps.end()) {
5505 bool missing_snap_parent = false;
5506 // also, make sure client's cap is in the correct snaprealm.
5507 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5508 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5509 if (realm->have_past_parents_open()) {
5510 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5511 } else {
5512 missing_snap_parent = true;
5513 }
5514 }
5515
5516 if (q->second.realm_ino == realm->inode->ino()) {
5517 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5518 } else {
5519 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5520 << " != " << realm->inode->ino() << dendl;
5521 if (realm->have_past_parents_open()) {
5522 // ok, include in a split message _now_.
5523 prepare_realm_split(realm, q->first, in->ino(), splits);
5524 } else {
5525 // send the split later.
5526 missing_snap_parent = true;
5527 }
5528 }
5529 }
5530 if (missing_snap_parent)
5531 missing_snap_parents[realm->inode].insert(in);
5532 }
5533 }
5534
5535 send_snaps(splits);
5536}
5537
5538void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5539 map<client_t,MClientSnap*>& splits)
5540{
5541 MClientSnap *snap;
5542 if (splits.count(client) == 0) {
5543 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5544 snap->head.split = realm->inode->ino();
5545 realm->build_snap_trace(snap->bl);
5546
5547 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5548 p != realm->open_children.end();
5549 ++p)
5550 snap->split_realms.push_back((*p)->inode->ino());
5551
5552 } else
5553 snap = splits[client];
5554 snap->split_inos.push_back(ino);
5555}
5556
5557void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5558{
5559 dout(10) << "send_snaps" << dendl;
5560
5561 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5562 p != splits.end();
5563 ++p) {
5564 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5565 if (session) {
5566 dout(10) << " client." << p->first
5567 << " split " << p->second->head.split
5568 << " inos " << p->second->split_inos
5569 << dendl;
5570 mds->send_message_client_counted(p->second, session);
5571 } else {
5572 dout(10) << " no session for client." << p->first << dendl;
5573 p->second->put();
5574 }
5575 }
5576 splits.clear();
5577}
5578
5579
5580/*
5581 * remove any items from logsegment open_file lists that don't have
5582 * any caps
5583 */
5584void MDCache::clean_open_file_lists()
5585{
5586 dout(10) << "clean_open_file_lists" << dendl;
5587
5588 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5589 p != mds->mdlog->segments.end();
5590 ++p) {
5591 LogSegment *ls = p->second;
5592
5593 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5594 while (!q.end()) {
5595 CInode *in = *q;
5596 ++q;
5597 if (in->last == CEPH_NOSNAP) {
5598 if (!in->is_any_caps_wanted()) {
5599 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5600 in->item_open_file.remove_myself();
5601 }
5602 } else if (in->last != CEPH_NOSNAP) {
5603 if (in->client_snap_caps.empty()) {
5604 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5605 in->item_open_file.remove_myself();
5606 }
5607 }
5608 }
5609 }
5610}
5611
5612
5613
5614Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5615{
5616 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5617 << " on " << *in << dendl;
5618 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5619 if (!session) {
5620 dout(10) << " no session for client." << client << dendl;
5621 return NULL;
5622 }
5623
5624 Capability *cap = in->reconnect_cap(client, icr, session);
5625
5626 if (frommds >= 0) {
5627 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5628 cap->inc_mseq();
5629 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5630 }
5631
5632 return cap;
5633}
5634
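// export_remaining_imported_caps() -- give up on cap imports whose inodes never
// turned up: tell each client (via a CEPH_CAP_OP_EXPORT message with no new
// peer) that its caps are gone, wake any cap_reconnect waiters, and log a
// cluster warning listing the affected inodes.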
5635void MDCache::export_remaining_imported_caps()
5636{
5637 dout(10) << "export_remaining_imported_caps" << dendl;
5638
5639 stringstream warn_str;
5640
5641 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5642 warn_str << " ino " << p->first << "\n";
5643 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5644 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5645 if (session) {
5646 // mark client caps stale.
5647 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5648 stale->set_cap_peer(0, 0, 0, -1, 0);
5649 mds->send_message_client_counted(stale, q->first);
5650 }
5651 }
5652
5653 mds->heartbeat_reset();
5654 }
5655
5656 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5657 p != cap_reconnect_waiters.end();
5658 ++p)
5659 mds->queue_waiters(p->second);
5660
5661 cap_imports.clear();
5662 cap_reconnect_waiters.clear();
5663
5664 if (warn_str.peek() != EOF) {
5665 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5666 mds->clog->warn(warn_str);
5667 }
5668}
5669
5670void MDCache::try_reconnect_cap(CInode *in, Session *session)
5671{
5672 client_t client = session->info.get_client();
5673 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5674 if (rc) {
5675 in->reconnect_cap(client, *rc, session);
5676 dout(10) << "try_reconnect_cap client." << client
5677 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5678 << " issue " << ccap_string(rc->capinfo.issued)
5679 << " on " << *in << dendl;
5680 remove_replay_cap_reconnect(in->ino(), client);
5681
5682 if (in->is_replicated()) {
5683 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5684 } else {
5685 int dirty_caps = 0;
5686 auto p = reconnected_caps.find(in->ino());
5687 if (p != reconnected_caps.end()) {
5688 auto q = p->second.find(client);
5689 if (q != p->second.end())
5690 dirty_caps = q->second.dirty_caps;
5691 }
5692 in->choose_lock_states(dirty_caps);
5693 dout(15) << " chose lock states on " << *in << dendl;
5694 }
5695
5696 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5697 cap_reconnect_waiters.find(in->ino());
5698 if (it != cap_reconnect_waiters.end()) {
5699 mds->queue_waiters(it->second);
5700 cap_reconnect_waiters.erase(it);
5701 }
5702 }
5703}
5704
5705
5706
5707// -------
5708// cap imports and delayed snap parent opens
5709
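// do_cap_import() -- send the client a CEPH_CAP_OP_IMPORT for a reconnected or
// migrated cap, including the current snap trace.  If the realm's past snap
// parents are not open yet, auth-pin the inode, suppress the cap, and park it
// in delayed_imported_caps until open_snap_parents() completes.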
5710void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5711 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5712 int peer, int p_flags)
5713{
5714 client_t client = session->info.inst.name.num();
5715 SnapRealm *realm = in->find_snaprealm();
5716 if (realm->have_past_parents_open()) {
5717 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5718 if (cap->get_last_seq() == 0) // reconnected cap
5719 cap->inc_last_seq();
5720 cap->set_last_issue();
5721 cap->set_last_issue_stamp(ceph_clock_now());
5722 cap->clear_new();
5723 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5724 in->ino(),
5725 realm->inode->ino(),
5726 cap->get_cap_id(), cap->get_last_seq(),
5727 cap->pending(), cap->wanted(), 0,
5728 cap->get_mseq(), mds->get_osd_epoch_barrier());
5729 in->encode_cap_message(reap, cap);
5730 realm->build_snap_trace(reap->snapbl);
5731 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5732 mds->send_message_client_counted(reap, session);
5733 } else {
5734 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5735 << cap->get_mseq() << " on " << *in << dendl;
5736 in->auth_pin(this);
5737 cap->inc_suppress();
5738 delayed_imported_caps[client].insert(in);
5739 missing_snap_parents[in].size();
5740 }
5741}
5742
5743void MDCache::do_delayed_cap_imports()
5744{
5745 dout(10) << "do_delayed_cap_imports" << dendl;
5746
5747 assert(delayed_imported_caps.empty());
5748}
5749
5750struct C_MDC_OpenSnapParents : public MDCacheContext {
5751 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5752 void finish(int r) override {
5753 mdcache->open_snap_parents();
5754 }
5755};
5756
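// open_snap_parents() -- try to open past snap parents for every realm inode in
// missing_snap_parents.  For realms that become fully open, rebuild any needed
// snapflushes, prepare realm splits, and finish client snaprealm reconnects;
// the rest wait on a gather that re-invokes this method.  Once everything is
// open, run the delayed cap imports and complete rejoin_done.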
5757void MDCache::open_snap_parents()
5758{
5759 dout(10) << "open_snap_parents" << dendl;
5760
5761 map<client_t,MClientSnap*> splits;
5762 MDSGatherBuilder gather(g_ceph_context);
5763
5764 auto p = missing_snap_parents.begin();
5765 while (p != missing_snap_parents.end()) {
5766 CInode *in = p->first;
5767 assert(in->snaprealm);
5768 if (in->snaprealm->open_parents(gather.new_sub())) {
5769 dout(10) << " past parents now open on " << *in << dendl;
5770
5771 for (CInode *child : p->second) {
5772 auto q = reconnected_caps.find(child->ino());
5773 assert(q != reconnected_caps.end());
5774 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5775 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5776 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5777 }
5778 // make sure client's cap is in the correct snaprealm.
5779 if (r->second.realm_ino != in->ino()) {
5780 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5781 }
5782 }
5783 }
5784
5785 missing_snap_parents.erase(p++);
5786
5787 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5788
5789 // finish off client snaprealm reconnects?
5790 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5791 if (q != reconnected_snaprealms.end()) {
5792 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5793 r != q->second.end();
5794 ++r)
5795 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5796 reconnected_snaprealms.erase(q);
5797 }
5798 } else {
5799 dout(10) << " opening past parents on " << *in << dendl;
5800 ++p;
5801 }
5802 }
5803
5804 send_snaps(splits);
5805
5806 if (gather.has_subs()) {
5807 dout(10) << "open_snap_parents - waiting for "
5808 << gather.num_subs_remaining() << dendl;
5809 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5810 gather.activate();
5811 } else {
5812 if (!reconnected_snaprealms.empty()) {
5813 stringstream warn_str;
5814 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5815 p != reconnected_snaprealms.end();
5816 ++p) {
5817 warn_str << " unconnected snaprealm " << p->first << "\n";
5818 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5819 q != p->second.end();
5820 ++q)
5821 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5822 }
5823 mds->clog->warn() << "open_snap_parents has:";
5824 mds->clog->warn(warn_str);
5825 }
5826 assert(rejoin_waiters.empty());
5827 assert(missing_snap_parents.empty());
5828 dout(10) << "open_snap_parents - all open" << dendl;
5829 do_delayed_cap_imports();
5830
5831 assert(rejoin_done);
5832 rejoin_done.release()->complete(0);
5833 reconnected_caps.clear();
5834 }
5835}
5836
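// open_undef_inodes_dirfrags() -- queue fetches for the dirfrags flagged
// REJOINUNDEF and for the parent dirfrags of REJOINUNDEF inodes (dirfrags under
// still-undefined inodes are skipped).  Returns true if any fetches were
// issued; rejoin_gather_finish() is re-run when they complete, provided the
// rejoin gather is done.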
5837bool MDCache::open_undef_inodes_dirfrags()
5838{
5839 dout(10) << "open_undef_inodes_dirfrags "
5840 << rejoin_undef_inodes.size() << " inodes "
5841 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5842
5843 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5844
5845 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5846 p != rejoin_undef_inodes.end();
5847 ++p) {
5848 CInode *in = *p;
5849 assert(!in->is_base());
5850 fetch_queue.insert(in->get_parent_dir());
5851 }
5852
5853 if (fetch_queue.empty())
5854 return false;
5855
28e407b8
AA
5856 MDSGatherBuilder gather(g_ceph_context,
5857 new MDSInternalContextWrapper(mds,
5858 new FunctionContext([this](int r) {
5859 if (rejoin_gather.empty())
5860 rejoin_gather_finish();
5861 })
5862 )
5863 );
5864
7c673cae
FG
5865 for (set<CDir*>::iterator p = fetch_queue.begin();
5866 p != fetch_queue.end();
5867 ++p) {
5868 CDir *dir = *p;
5869 CInode *diri = dir->get_inode();
5870 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5871 continue;
5872 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5873 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5874 dir->fetch(gather.new_sub());
5875 }
5876 assert(gather.has_subs());
5877 gather.activate();
5878 return true;
5879}
5880
5881void MDCache::opened_undef_inode(CInode *in) {
5882 dout(10) << "opened_undef_inode " << *in << dendl;
5883 rejoin_undef_inodes.erase(in);
5884 if (in->is_dir()) {
5885 // FIXME: re-hash dentries if necessary
5886 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5887 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5888 CDir *dir = in->get_dirfrag(frag_t());
5889 assert(dir);
5890 rejoin_undef_dirfrags.erase(dir);
5891 in->force_dirfrags();
5892 list<CDir*> ls;
5893 in->get_dirfrags(ls);
5894 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5895 rejoin_undef_dirfrags.insert(*p);
5896 }
5897 }
5898}
5899
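// finish_snaprealm_reconnect() -- if the client reconnected with a snap seq
// older than the realm's newest, send it a CEPH_SNAP_OP_UPDATE carrying the
// current snap trace; otherwise nothing needs to be done.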
5900void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5901{
5902 if (seq < realm->get_newest_seq()) {
5903 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5904 << realm->get_newest_seq()
5905 << " on " << *realm << dendl;
5906 // send an update
5907 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5908 if (session) {
5909 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5910 realm->build_snap_trace(snap->bl);
5911 mds->send_message_client_counted(snap, session);
5912 } else {
5913 dout(10) << " ...or not, no session for this client!" << dendl;
5914 }
5915 } else {
5916 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5917 << " on " << *realm << dendl;
5918 }
5919}
5920
5921
5922
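// rejoin_send_acks() -- build an MMDSCacheRejoin OP_ACK for every rank in the
// recovery set we have not acked yet.  First replicate unlinked (stray) inodes
// back to their replica holders, then walk our auth subtrees adding strong
// dirfrags, strong dentries and inode bases/locks for each replica, plus the
// base inodes and any inodes whose scatterlocks may have changed, and finally
// send each ack along with the imported-caps map for that rank.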
5923void MDCache::rejoin_send_acks()
5924{
5925 dout(7) << "rejoin_send_acks" << dendl;
5926
5927 // replicate stray
5928 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5929 p != rejoin_unlinked_inodes.end();
5930 ++p) {
5931 for (set<CInode*>::iterator q = p->second.begin();
5932 q != p->second.end();
5933 ++q) {
5934 CInode *in = *q;
5935 dout(7) << " unlinked inode " << *in << dendl;
5936 // inode expired
5937 if (!in->is_replica(p->first))
5938 continue;
5939 while (1) {
5940 CDentry *dn = in->get_parent_dn();
5941 if (dn->is_replica(p->first))
5942 break;
5943 dn->add_replica(p->first);
5944 CDir *dir = dn->get_dir();
5945 if (dir->is_replica(p->first))
5946 break;
5947 dir->add_replica(p->first);
5948 in = dir->get_inode();
5949 if (in->is_replica(p->first))
5950 break;
224ce89b 5951 in->add_replica(p->first);
7c673cae
FG
5952 if (in->is_base())
5953 break;
5954 }
5955 }
5956 }
5957 rejoin_unlinked_inodes.clear();
5958
5959 // send acks to everyone in the recovery set
31f18b77 5960 map<mds_rank_t,MMDSCacheRejoin*> acks;
7c673cae
FG
5961 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5962 p != recovery_set.end();
31f18b77
FG
5963 ++p) {
5964 if (rejoin_ack_sent.count(*p))
5965 continue;
5966 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5967 }
5968
5969 rejoin_ack_sent = recovery_set;
7c673cae
FG
5970
5971 // walk subtrees
5972 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5973 p != subtrees.end();
5974 ++p) {
5975 CDir *dir = p->first;
5976 if (!dir->is_auth())
5977 continue;
5978 dout(10) << "subtree " << *dir << dendl;
5979
5980 // auth items in this subtree
5981 list<CDir*> dq;
5982 dq.push_back(dir);
5983
5984 while (!dq.empty()) {
5985 CDir *dir = dq.front();
5986 dq.pop_front();
5987
5988 // dir
181888fb
FG
5989 for (auto &r : dir->get_replicas()) {
5990 auto it = acks.find(r.first);
31f18b77
FG
5991 if (it == acks.end())
5992 continue;
181888fb 5993 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 5994 it->second->add_dirfrag_base(dir);
7c673cae
FG
5995 }
5996
94b18763
FG
5997 for (auto &p : dir->items) {
5998 CDentry *dn = p.second;
7c673cae
FG
5999 CDentry::linkage_t *dnl = dn->get_linkage();
6000
6001 // inode
6002 CInode *in = NULL;
6003 if (dnl->is_primary())
6004 in = dnl->get_inode();
6005
6006 // dentry
181888fb
FG
6007 for (auto &r : dn->get_replicas()) {
6008 auto it = acks.find(r.first);
31f18b77
FG
6009 if (it == acks.end())
6010 continue;
94b18763 6011 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6012 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6013 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6014 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6015 ++r.second,
7c673cae
FG
6016 dn->lock.get_replica_state());
6017 // peer missed MDentrylink message ?
181888fb
FG
6018 if (in && !in->is_replica(r.first))
6019 in->add_replica(r.first);
7c673cae
FG
6020 }
6021
6022 if (!in)
6023 continue;
6024
181888fb
FG
6025 for (auto &r : in->get_replicas()) {
6026 auto it = acks.find(r.first);
31f18b77
FG
6027 if (it == acks.end())
6028 continue;
6029 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6030 bufferlist bl;
181888fb
FG
6031 in->_encode_locks_state_for_rejoin(bl, r.first);
6032 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6033 }
6034
6035 // subdirs in this subtree?
6036 in->get_nested_dirfrags(dq);
6037 }
6038 }
6039 }
6040
6041 // base inodes too
6042 if (root && root->is_auth())
181888fb
FG
6043 for (auto &r : root->get_replicas()) {
6044 auto it = acks.find(r.first);
31f18b77
FG
6045 if (it == acks.end())
6046 continue;
6047 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6048 bufferlist bl;
181888fb
FG
6049 root->_encode_locks_state_for_rejoin(bl, r.first);
6050 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6051 }
6052 if (myin)
181888fb
FG
6053 for (auto &r : myin->get_replicas()) {
6054 auto it = acks.find(r.first);
31f18b77
FG
6055 if (it == acks.end())
6056 continue;
6057 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6058 bufferlist bl;
181888fb
FG
6059 myin->_encode_locks_state_for_rejoin(bl, r.first);
6060 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6061 }
6062
6063 // include inode base for any inodes whose scatterlocks may have updated
6064 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6065 p != rejoin_potential_updated_scatterlocks.end();
6066 ++p) {
6067 CInode *in = *p;
181888fb
FG
6068 for (const auto &r : in->get_replicas()) {
6069 auto it = acks.find(r.first);
31f18b77
FG
6070 if (it == acks.end())
6071 continue;
6072 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6073 }
7c673cae
FG
6074 }
6075
6076 // send acks
31f18b77 6077 for (auto p = acks.begin(); p != acks.end(); ++p) {
7c673cae
FG
6078 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6079 mds->send_message_mds(p->second, p->first);
6080 }
6081
6082 rejoin_imported_caps.clear();
6083}
6084
c07f9fc5
FG
6085class C_MDC_ReIssueCaps : public MDCacheContext {
6086 CInode *in;
6087public:
6088 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6089 MDCacheContext(mdc), in(i)
6090 {
6091 in->get(CInode::PIN_PTRWAITER);
6092 }
6093 void finish(int r) override {
6094 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6095 mdcache->mds->locker->issue_caps(in);
6096 in->put(CInode::PIN_PTRWAITER);
6097 }
6098};
7c673cae
FG
6099
6100void MDCache::reissue_all_caps()
6101{
6102 dout(10) << "reissue_all_caps" << dendl;
6103
94b18763 6104 for (auto &p : inode_map) {
b32b8144 6105 CInode *in = p.second;
7c673cae 6106 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6107 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6108 if (in->is_frozen_inode()) {
6109 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6110 continue;
6111 }
7c673cae
FG
6112 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6113 mds->locker->issue_caps(in);
6114 }
6115 }
6116}
6117
6118
6119// ===============================================================================
6120
6121struct C_MDC_QueuedCow : public MDCacheContext {
6122 CInode *in;
6123 MutationRef mut;
6124 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6125 MDCacheContext(mdc), in(i), mut(m) {}
6126 void finish(int r) override {
6127 mdcache->_queued_file_recover_cow(in, mut);
6128 }
6129};
6130
6131
6132void MDCache::queue_file_recover(CInode *in)
6133{
6134 dout(10) << "queue_file_recover " << *in << dendl;
6135 assert(in->is_auth());
6136
6137 // cow?
6138 /*
6139 SnapRealm *realm = in->find_snaprealm();
6140 set<snapid_t> s = realm->get_snaps();
6141 while (!s.empty() && *s.begin() < in->first)
6142 s.erase(s.begin());
6143 while (!s.empty() && *s.rbegin() > in->last)
6144 s.erase(*s.rbegin());
6145 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6146 if (s.size() > 1) {
94b18763 6147 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6148 pi->version = in->pre_dirty();
6149
6150 auto mut(std::make_shared<MutationImpl>());
6151 mut->ls = mds->mdlog->get_current_segment();
6152 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6153 mds->mdlog->start_entry(le);
6154 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6155
6156 s.erase(*s.begin());
6157 while (!s.empty()) {
6158 snapid_t snapid = *s.begin();
6159 CInode *cow_inode = 0;
6160 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6161 assert(cow_inode);
6162 recovery_queue.enqueue(cow_inode);
6163 s.erase(*s.begin());
6164 }
6165
6166 in->parent->first = in->first;
6167 le->metablob.add_primary_dentry(in->parent, in, true);
6168 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6169 mds->mdlog->flush();
6170 }
6171 */
6172
6173 recovery_queue.enqueue(in);
6174}
6175
6176void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6177{
6178 in->pop_and_dirty_projected_inode(mut->ls);
6179 mut->apply();
6180 mds->locker->drop_locks(mut.get());
6181 mut->cleanup();
6182}
6183
6184
6185/*
6186 * called after recovery to recover file sizes for previously opened (for write)
6187 * files. that is, those where max_size > size.
6188 */
6189void MDCache::identify_files_to_recover()
6190{
6191 dout(10) << "identify_files_to_recover" << dendl;
94b18763 6192 for (auto &p : inode_map) {
b32b8144 6193 CInode *in = p.second;
7c673cae
FG
6194 if (!in->is_auth())
6195 continue;
6196
6197 if (in->last != CEPH_NOSNAP)
6198 continue;
6199
6200 // Only normal files need file size recovery
6201 if (!in->is_file()) {
6202 continue;
6203 }
6204
6205 bool recover = false;
6206 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6207 p != in->inode.client_ranges.end();
6208 ++p) {
6209 Capability *cap = in->get_client_cap(p->first);
6210 if (!cap) {
6211 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6212 recover = true;
6213 break;
6214 }
6215 }
6216
6217 if (recover) {
6218 if (in->filelock.is_stable()) {
6219 in->auth_pin(&in->filelock);
6220 } else {
6221 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6222 }
6223 in->filelock.set_state(LOCK_PRE_SCAN);
6224 rejoin_recover_q.push_back(in);
6225 } else {
6226 rejoin_check_q.push_back(in);
6227 }
6228 }
6229}
6230
6231void MDCache::start_files_to_recover()
6232{
6233 for (CInode *in : rejoin_check_q) {
6234 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6235 mds->locker->issue_caps(in);
6236 mds->locker->check_inode_max_size(in);
6237 }
6238 rejoin_check_q.clear();
6239 for (CInode *in : rejoin_recover_q) {
6240 mds->locker->file_recover(&in->filelock);
6241 }
6242 if (!rejoin_recover_q.empty()) {
6243 rejoin_recover_q.clear();
6244 do_file_recover();
6245 }
6246}
6247
6248void MDCache::do_file_recover()
6249{
6250 recovery_queue.advance();
6251}
6252
6253// ===============================================================================
6254
6255
6256// ----------------------------
6257// truncate
6258
6259class C_MDC_RetryTruncate : public MDCacheContext {
6260 CInode *in;
6261 LogSegment *ls;
6262public:
6263 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6264 MDCacheContext(c), in(i), ls(l) {}
6265 void finish(int r) override {
6266 mdcache->_truncate_inode(in, ls);
6267 }
6268};
6269
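// truncate_inode() -- record the inode in the log segment's truncating set and
// auth-pin it.  If a client still needs to snapflush buffered data, defer the
// actual truncate until the xlock's snap sync completes; otherwise go straight
// to _truncate_inode(), which issues the object truncation via the Filer.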
6270void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6271{
94b18763 6272 auto pi = in->get_projected_inode();
7c673cae
FG
6273 dout(10) << "truncate_inode "
6274 << pi->truncate_from << " -> " << pi->truncate_size
6275 << " on " << *in
6276 << dendl;
6277
6278 ls->truncating_inodes.insert(in);
6279 in->get(CInode::PIN_TRUNCATING);
6280 in->auth_pin(this);
6281
6282 if (!in->client_need_snapflush.empty() &&
6283 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6284 assert(in->filelock.is_xlocked());
6285 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6286 mds->locker->issue_caps(in);
6287 return;
6288 }
6289
6290 _truncate_inode(in, ls);
6291}
6292
6293struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6294 CInode *in;
6295 LogSegment *ls;
6296 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6297 MDCacheIOContext(c), in(i), ls(l) {}
6298 void finish(int r) override {
6299 assert(r == 0 || r == -ENOENT);
6300 mdcache->truncate_inode_finish(in, ls);
6301 }
6302};
6303
6304void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6305{
94b18763 6306 auto pi = &in->inode;
7c673cae
FG
6307 dout(10) << "_truncate_inode "
6308 << pi->truncate_from << " -> " << pi->truncate_size
6309 << " on " << *in << dendl;
6310
6311 assert(pi->is_truncating());
6312 assert(pi->truncate_size < (1ULL << 63));
6313 assert(pi->truncate_from < (1ULL << 63));
6314 assert(pi->truncate_size < pi->truncate_from);
6315
6316
6317 SnapRealm *realm = in->find_snaprealm();
6318 SnapContext nullsnap;
6319 const SnapContext *snapc;
6320 if (realm) {
6321 dout(10) << " realm " << *realm << dendl;
6322 snapc = &realm->get_snap_context();
6323 } else {
6324 dout(10) << " NO realm, using null context" << dendl;
6325 snapc = &nullsnap;
6326 assert(in->last == CEPH_NOSNAP);
6327 }
6328 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6329 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6330 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6331 pi->truncate_seq, ceph::real_time::min(), 0,
6332 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6333 mds->finisher));
6334}
6335
6336struct C_MDC_TruncateLogged : public MDCacheLogContext {
6337 CInode *in;
6338 MutationRef mut;
6339 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6340 MDCacheLogContext(m), in(i), mut(mu) {}
6341 void finish(int r) override {
6342 mdcache->truncate_inode_logged(in, mut);
6343 }
6344};
6345
6346void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6347{
6348 dout(10) << "truncate_inode_finish " << *in << dendl;
6349
6350 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6351 assert(p != ls->truncating_inodes.end());
6352 ls->truncating_inodes.erase(p);
6353
6354 // update
94b18763
FG
6355 auto &pi = in->project_inode();
6356 pi.inode.version = in->pre_dirty();
6357 pi.inode.truncate_from = 0;
6358 pi.inode.truncate_pending--;
7c673cae
FG
6359
6360 MutationRef mut(new MutationImpl());
6361 mut->ls = mds->mdlog->get_current_segment();
6362 mut->add_projected_inode(in);
6363
6364 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6365 mds->mdlog->start_entry(le);
6366 CDentry *dn = in->get_projected_parent_dn();
6367 le->metablob.add_dir_context(dn->get_dir());
6368 le->metablob.add_primary_dentry(dn, in, true);
6369 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6370
6371 journal_dirty_inode(mut.get(), &le->metablob, in);
6372 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6373
6374 // flush immediately if there are readers/writers waiting
6375 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6376 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6377 mds->mdlog->flush();
6378}
6379
6380void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6381{
6382 dout(10) << "truncate_inode_logged " << *in << dendl;
6383 mut->apply();
6384 mds->locker->drop_locks(mut.get());
6385 mut->cleanup();
6386
6387 in->put(CInode::PIN_TRUNCATING);
6388 in->auth_unpin(this);
6389
6390 list<MDSInternalContextBase*> waiters;
6391 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6392 mds->queue_waiters(waiters);
6393}
6394
6395
6396void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6397{
6398 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6399 << ls->seq << "/" << ls->offset << dendl;
6400 ls->truncating_inodes.insert(in);
6401 in->get(CInode::PIN_TRUNCATING);
6402}
6403
6404void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6405{
6406 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6407 << ls->seq << "/" << ls->offset << dendl;
6408 // if we have the logseg the truncate started in, it must be in our list.
6409 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6410 assert(p != ls->truncating_inodes.end());
6411 ls->truncating_inodes.erase(p);
6412 in->put(CInode::PIN_TRUNCATING);
6413}
6414
6415void MDCache::start_recovered_truncates()
6416{
6417 dout(10) << "start_recovered_truncates" << dendl;
6418 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6419 p != mds->mdlog->segments.end();
6420 ++p) {
6421 LogSegment *ls = p->second;
6422 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6423 q != ls->truncating_inodes.end();
6424 ++q) {
6425 CInode *in = *q;
6426 in->auth_pin(this);
6427
6428 if (!in->client_need_snapflush.empty() &&
6429 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6430 assert(in->filelock.is_stable());
6431 in->filelock.set_state(LOCK_XLOCKDONE);
6432 in->auth_pin(&in->filelock);
6433 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6434 // start_files_to_recover will revoke caps
6435 continue;
6436 }
6437 _truncate_inode(in, ls);
6438 }
6439 }
6440}
6441
6442
6443
6444
6445
6446
6447// ================================================================================
6448// cache trimming
6449
181888fb
FG
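// trim_lru() -- expire dentries from the cache LRUs, filling expiremap with
// MCacheExpire notifications for replica authorities.  bottom_lru is drained
// first, then the main LRU is trimmed until 'count' dentries have gone and the
// cache is no longer over its memory limit; entries that cannot be expired are
// reinserted.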
6450void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6451{
7c673cae 6452 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6453 std::vector<CDentry *> unexpirables;
6454 uint64_t trimmed = 0;
6455
6456 dout(7) << "trim_lru trimming " << count
6457 << " items from LRU"
6458 << " size=" << lru.lru_get_size()
6459 << " mid=" << lru.lru_get_top()
6460 << " pintail=" << lru.lru_get_pintail()
6461 << " pinned=" << lru.lru_get_num_pinned()
6462 << dendl;
7c673cae 6463
31f18b77
FG
6464 for (;;) {
6465 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6466 if (!dn)
6467 break;
6468 if (trim_dentry(dn, expiremap)) {
6469 unexpirables.push_back(dn);
181888fb
FG
6470 } else {
6471 trimmed++;
31f18b77
FG
6472 }
6473 }
6474
181888fb 6475 for (auto &dn : unexpirables) {
31f18b77 6476 bottom_lru.lru_insert_mid(dn);
181888fb 6477 }
31f18b77
FG
6478 unexpirables.clear();
6479
181888fb
FG
 6480 // trim dentries from the LRU until count is reached and the cache is no longer too full
6481 while (cache_toofull() || count > 0) {
7c673cae
FG
6482 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6483 if (!dn) {
6484 break;
6485 }
7c673cae 6486 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6487 dn->get_linkage()->inode->item_open_file.is_on_list())) {
7c673cae 6488 unexpirables.push_back(dn);
181888fb
FG
6489 } else if (trim_dentry(dn, expiremap)) {
6490 unexpirables.push_back(dn);
6491 } else {
6492 trimmed++;
3efd9988 6493 if (count > 0) count--;
7c673cae
FG
6494 }
6495 }
181888fb
FG
6496
6497 for (auto &dn : unexpirables) {
31f18b77 6498 lru.lru_insert_mid(dn);
181888fb 6499 }
31f18b77 6500 unexpirables.clear();
7c673cae 6501
181888fb
FG
6502 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6503}
6504
6505/*
6506 * note: only called while MDS is active or stopping... NOT during recovery.
6507 * however, we may expire a replica whose authority is recovering.
6508 *
6509 * @param count is number of dentries to try to expire
6510 */
6511bool MDCache::trim(uint64_t count)
6512{
6513 uint64_t used = cache_size();
6514 uint64_t limit = cache_limit_memory();
6515 map<mds_rank_t, MCacheExpire*> expiremap;
6516
6517 dout(7) << "trim bytes_used=" << bytes2str(used)
6518 << " limit=" << bytes2str(limit)
6519 << " reservation=" << cache_reservation()
6520 << "% count=" << count << dendl;
6521
6522 // process delayed eval_stray()
6523 stray_manager.advance_delayed();
6524
6525 trim_lru(count, expiremap);
6526
7c673cae 6527 // trim non-auth, non-bound subtrees
181888fb 6528 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6529 CDir *dir = p->first;
6530 ++p;
31f18b77
FG
6531 CInode *diri = dir->get_inode();
6532 if (dir->is_auth()) {
6533 if (!diri->is_auth() && !diri->is_base() &&
6534 dir->get_num_head_items() == 0) {
6535 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6536 !(mds->is_active() || mds->is_stopping()) ||
31f18b77
FG
6537 dir->is_freezing() || dir->is_frozen())
6538 continue;
6539
6540 migrator->export_empty_import(dir);
6541 }
6542 } else {
6543 if (!diri->is_auth()) {
6544 if (dir->get_num_ref() > 1) // only subtree pin
6545 continue;
6546 list<CDir*> ls;
6547 diri->get_subtree_dirfrags(ls);
6548 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6549 continue;
6550
6551 // don't trim subtree root if its auth MDS is recovering.
 6552 // This simplifies the cache rejoin code.
6553 if (dir->is_subtree_root() &&
6554 rejoin_ack_gather.count(dir->get_dir_auth().first))
6555 continue;
7c673cae 6556 trim_dirfrag(dir, 0, expiremap);
31f18b77 6557 }
7c673cae
FG
6558 }
6559 }
6560
6561 // trim root?
181888fb 6562 if (mds->is_stopping() && root) {
7c673cae
FG
6563 list<CDir*> ls;
6564 root->get_dirfrags(ls);
6565 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6566 CDir *dir = *p;
6567 if (dir->get_num_ref() == 1) // subtree pin
6568 trim_dirfrag(dir, 0, expiremap);
6569 }
6570 if (root->get_num_ref() == 0)
6571 trim_inode(0, root, 0, expiremap);
6572 }
6573
6574 std::set<mds_rank_t> stopping;
6575 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6576 stopping.erase(mds->get_nodeid());
6577 for (auto rank : stopping) {
6578 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6579 if (!mdsdir_in)
6580 continue;
6581
6582 if (expiremap.count(rank) == 0) {
6583 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6584 }
6585
6586 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6587
6588 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6589 if (!aborted) {
6590 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6591 list<CDir*> ls;
6592 mdsdir_in->get_dirfrags(ls);
6593 for (auto dir : ls) {
6594 if (dir->get_num_ref() == 1) // subtree pin
6595 trim_dirfrag(dir, dir, expiremap);
6596 }
6597 if (mdsdir_in->get_num_ref() == 0)
6598 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6599 } else {
6600 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6601 }
6602 }
6603
6604 // Other rank's base inodes (when I'm stopping)
181888fb 6605 if (mds->is_stopping()) {
7c673cae
FG
6606 for (set<CInode*>::iterator p = base_inodes.begin();
6607 p != base_inodes.end(); ++p) {
6608 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6609 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6610 if ((*p)->get_num_ref() == 0) {
6611 trim_inode(NULL, *p, NULL, expiremap);
6612 }
6613 }
6614 }
6615 }
6616
6617 // send any expire messages
6618 send_expire_messages(expiremap);
6619
6620 return true;
6621}
6622
6623void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6624{
6625 // send expires
6626 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6627 it != expiremap.end();
6628 ++it) {
6629 if (mds->is_cluster_degraded() &&
6630 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6631 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6632 rejoin_sent.count(it->first) == 0))) {
6633 it->second->put();
6634 continue;
6635 }
6636 dout(7) << "sending cache_expire to " << it->first << dendl;
6637 mds->send_message_mds(it->second, it->first);
6638 }
6639}
6640
6641
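// trim_dentry() -- try to expire a single dentry (and its primary inode, if
// any).  Returns true if it must stay in cache (e.g. an unreadable replica
// dentry, or a stray being purged instead of trimmed), false once it has been
// removed; for non-auth dentries the authority is notified through expiremap.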
6642bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6643{
6644 dout(12) << "trim_dentry " << *dn << dendl;
6645
6646 CDentry::linkage_t *dnl = dn->get_linkage();
6647
6648 CDir *dir = dn->get_dir();
6649 assert(dir);
6650
6651 CDir *con = get_subtree_root(dir);
6652 if (con)
6653 dout(12) << " in container " << *con << dendl;
6654 else {
6655 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6656 assert(dn->is_auth());
6657 }
6658
 6659 // If a replica dentry is not readable, it's likely we will receive an
 6660 // MDentryLink/MDentryUnlink message soon (it's possible we first
 6661 // receive an MDentryUnlink message, then an MDentryLink message).
 6662 // An MDentryLink message only replicates an inode, so we should
 6663 // avoid trimming the inode's parent dentry. This is because
 6664 // unconnected replicas are problematic for subtree migration.
6665 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6666 !dn->get_dir()->get_inode()->is_stray())
6667 return true;
6668
6669 // adjust the dir state
 6670 // NOTE: we can safely remove a clean, null dentry without affecting
6671 // directory completeness.
6672 // (check this _before_ we unlink the inode, below!)
6673 bool clear_complete = false;
6674 if (!(dnl->is_null() && dn->is_clean()))
6675 clear_complete = true;
6676
6677 // unlink the dentry
6678 if (dnl->is_remote()) {
6679 // just unlink.
31f18b77 6680 dir->unlink_inode(dn, false);
7c673cae
FG
6681 } else if (dnl->is_primary()) {
6682 // expire the inode, too.
6683 CInode *in = dnl->get_inode();
6684 assert(in);
6685 if (trim_inode(dn, in, con, expiremap))
6686 return true; // purging stray instead of trimming
6687 } else {
6688 assert(dnl->is_null());
6689 }
6690
6691 if (!dn->is_auth()) {
6692 // notify dentry authority.
6693 mds_authority_t auth = dn->authority();
6694
6695 for (int p=0; p<2; p++) {
6696 mds_rank_t a = auth.first;
6697 if (p) a = auth.second;
6698 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6699 if (mds->get_nodeid() == auth.second &&
6700 con->is_importing()) break; // don't send any expire while importing.
6701 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6702
6703 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6704 assert(a != mds->get_nodeid());
6705 if (expiremap.count(a) == 0)
6706 expiremap[a] = new MCacheExpire(mds->get_nodeid());
94b18763 6707 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6708 }
6709 }
6710
6711 // remove dentry
6712 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6713 dir->add_to_bloom(dn);
6714 dir->remove_dentry(dn);
6715
6716 if (clear_complete)
6717 dir->state_clear(CDir::STATE_COMPLETE);
6718
7c673cae
FG
6719 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6720 return false;
6721}
6722
6723
6724void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6725{
6726 dout(15) << "trim_dirfrag " << *dir << dendl;
6727
6728 if (dir->is_subtree_root()) {
6729 assert(!dir->is_auth() ||
6730 (!dir->is_replicated() && dir->inode->is_base()));
6731 remove_subtree(dir); // remove from subtree map
6732 }
6733 assert(dir->get_num_ref() == 0);
6734
6735 CInode *in = dir->get_inode();
6736
6737 if (!dir->is_auth()) {
6738 mds_authority_t auth = dir->authority();
6739
6740 // was this an auth delegation? (if so, slightly modified container)
6741 dirfrag_t condf;
6742 if (dir->is_subtree_root()) {
6743 dout(12) << " subtree root, container is " << *dir << dendl;
6744 con = dir;
6745 condf = dir->dirfrag();
6746 } else {
6747 condf = con->dirfrag();
6748 }
6749
6750 for (int p=0; p<2; p++) {
6751 mds_rank_t a = auth.first;
6752 if (p) a = auth.second;
6753 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6754 if (mds->get_nodeid() == auth.second &&
6755 con->is_importing()) break; // don't send any expire while importing.
6756 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6757
6758 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6759 assert(a != mds->get_nodeid());
6760 if (expiremap.count(a) == 0)
6761 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6762 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6763 }
6764 }
6765
6766 in->close_dirfrag(dir->dirfrag().frag);
6767}
6768
6769/**
6770 * Try trimming an inode from the cache
6771 *
6772 * @return true if the inode is still in cache, else false if it was trimmed
6773 */
6774bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6775{
6776 dout(15) << "trim_inode " << *in << dendl;
6777 assert(in->get_num_ref() == 0);
6778
6779 if (in->is_dir()) {
6780 // If replica inode's dirfragtreelock is not readable, it's likely
6781 // some dirfrags of the inode are being fragmented and we will receive
6782 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6783 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
 6784 // This is because unconnected replicas are problematic for
6785 // subtree migration.
6786 //
28e407b8 6787 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
7c673cae 6788 return true;
28e407b8 6789 }
7c673cae
FG
6790
6791 // DIR
6792 list<CDir*> dfls;
6793 in->get_dirfrags(dfls);
6794 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6795 CDir *dir = *p;
6796 assert(!dir->is_subtree_root());
6797 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6798 }
6799 }
6800
6801 // INODE
6802 if (in->is_auth()) {
6803 // eval stray after closing dirfrags
6804 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6805 maybe_eval_stray(in);
6806 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6807 return true;
6808 }
6809 } else {
6810 mds_authority_t auth = in->authority();
6811
6812 dirfrag_t df;
6813 if (con)
6814 df = con->dirfrag();
6815 else
6816 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6817
6818 for (int p=0; p<2; p++) {
6819 mds_rank_t a = auth.first;
6820 if (p) a = auth.second;
6821 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6822 if (con && mds->get_nodeid() == auth.second &&
6823 con->is_importing()) break; // don't send any expire while importing.
6824 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6825
6826 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6827 assert(a != mds->get_nodeid());
6828 if (expiremap.count(a) == 0)
6829 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6830 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6831 }
6832 }
6833
6834 /*
6835 if (in->is_auth()) {
6836 if (in->hack_accessed)
6837 mds->logger->inc("outt");
6838 else {
6839 mds->logger->inc("outut");
6840 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6841 }
6842 }
6843 */
6844
6845 // unlink
6846 if (dn)
31f18b77 6847 dn->get_dir()->unlink_inode(dn, false);
6848 remove_inode(in);
6849 return false;
6850}
6851
6852
6853/**
6854 * trim_non_auth - remove any non-auth items from our cache
6855 *
6856 * this reduces the amount of non-auth metadata in our cache, reducing the
6857 * load incurred by the rejoin phase.
6858 *
6859 * the only non-auth items that remain are those that are needed to
6860 * attach our own subtrees to the root.
6861 *
6862 * when we are done, all dentries will be in the top bit of the lru.
6863 *
6864 * why we have to do this:
 6865 * we may not have accurate linkage for non-auth items, which means we will
 6866 * not know which subtree an item falls into and cannot be sure to declare it
 6867 * to the correct authority.
6868 */
6869void MDCache::trim_non_auth()
6870{
6871 dout(7) << "trim_non_auth" << dendl;
6872
6873 // temporarily pin all subtree roots
6874 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6875 p != subtrees.end();
6876 ++p)
6877 p->first->get(CDir::PIN_SUBTREETEMP);
6878
31f18b77 6879 list<CDentry*> auth_list;
6880
6881 // trim non-auth items from the lru
6882 for (;;) {
6883 CDentry *dn = NULL;
6884 if (bottom_lru.lru_get_size() > 0)
6885 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6886 if (!dn && lru.lru_get_size() > 0)
6887 dn = static_cast<CDentry*>(lru.lru_expire());
6888 if (!dn)
6889 break;
6890
6891 CDentry::linkage_t *dnl = dn->get_linkage();
6892
6893 if (dn->is_auth()) {
6894 // add back into lru (at the top)
31f18b77 6895 auth_list.push_back(dn);
6896
6897 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6898 dn->unlink_remote(dnl);
6899 } else {
6900 // non-auth. expire.
6901 CDir *dir = dn->get_dir();
6902 assert(dir);
6903
6904 // unlink the dentry
6905 dout(10) << " removing " << *dn << dendl;
6906 if (dnl->is_remote()) {
31f18b77 6907 dir->unlink_inode(dn, false);
6908 }
6909 else if (dnl->is_primary()) {
6910 CInode *in = dnl->get_inode();
6911 dout(10) << " removing " << *in << dendl;
6912 list<CDir*> ls;
6913 in->get_dirfrags(ls);
6914 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6915 CDir *subdir = *p;
6916 assert(!subdir->is_subtree_root());
6917 in->close_dirfrag(subdir->dirfrag().frag);
6918 }
31f18b77 6919 dir->unlink_inode(dn, false);
6920 remove_inode(in);
6921 }
6922 else {
6923 assert(dnl->is_null());
6924 }
6925
6926 assert(!dir->has_bloom());
6927 dir->remove_dentry(dn);
6928 // adjust the dir state
6929 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6930 // close empty non-auth dirfrag
6931 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6932 dir->inode->close_dirfrag(dir->get_frag());
6933 }
6934 }
6935
6936 for (auto dn : auth_list) {
6937 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6938 bottom_lru.lru_insert_mid(dn);
6939 else
6940 lru.lru_insert_top(dn);
6941 }
6942
6943 // move everything in the pintail to the top bit of the lru.
6944 lru.lru_touch_entire_pintail();
6945
6946 // unpin all subtrees
6947 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6948 p != subtrees.end();
6949 ++p)
6950 p->first->put(CDir::PIN_SUBTREETEMP);
6951
6952 if (lru.lru_get_size() == 0 &&
6953 bottom_lru.lru_get_size() == 0) {
7c673cae 6954 // root, stray, etc.?
b32b8144 6955 auto p = inode_map.begin();
7c673cae 6956 while (p != inode_map.end()) {
7c673cae 6957 CInode *in = p->second;
b32b8144 6958 ++p;
6959 if (!in->is_auth()) {
6960 list<CDir*> ls;
6961 in->get_dirfrags(ls);
6962 for (list<CDir*>::iterator p = ls.begin();
6963 p != ls.end();
6964 ++p) {
6965 dout(10) << " removing " << **p << dendl;
6966 assert((*p)->get_num_ref() == 1); // SUBTREE
6967 remove_subtree((*p));
6968 in->close_dirfrag((*p)->dirfrag().frag);
6969 }
6970 dout(10) << " removing " << *in << dendl;
6971 assert(!in->get_parent_dn());
6972 assert(in->get_num_ref() == 0);
6973 remove_inode(in);
6974 }
6975 }
6976 }
6977
6978 show_subtrees();
6979}
6980
6981/**
6982 * Recursively trim the subtree rooted at directory to remove all
6983 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6984 * of those links. This is used to clear invalid data out of the cache.
6985 * Note that it doesn't clear the passed-in directory, since that's not
6986 * always safe.
6987 */
6988bool MDCache::trim_non_auth_subtree(CDir *dir)
6989{
6990 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6991
6992 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6993
6994 auto j = dir->begin();
6995 auto i = j;
6996 while (j != dir->end()) {
6997 i = j++;
6998 CDentry *dn = i->second;
6999 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7000 CDentry::linkage_t *dnl = dn->get_linkage();
7001 if (dnl->is_primary()) { // check for subdirectories, etc
7002 CInode *in = dnl->get_inode();
7003 bool keep_inode = false;
7004 if (in->is_dir()) {
7005 list<CDir*> subdirs;
7006 in->get_dirfrags(subdirs);
7007 for (list<CDir*>::iterator subdir = subdirs.begin();
7008 subdir != subdirs.end();
7009 ++subdir) {
7010 if ((*subdir)->is_subtree_root()) {
7011 keep_inode = true;
7012 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7013 } else {
7014 if (trim_non_auth_subtree(*subdir))
7015 keep_inode = true;
7016 else {
7017 in->close_dirfrag((*subdir)->get_frag());
7018 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7019 }
7020 }
7021 }
7022
7023 }
7024 if (!keep_inode) { // remove it!
 7025 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
31f18b77 7026 dir->unlink_inode(dn, false);
7027 remove_inode(in);
7028 assert(!dir->has_bloom());
7029 dir->remove_dentry(dn);
7030 } else {
7031 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7032 dn->state_clear(CDentry::STATE_AUTH);
7033 in->state_clear(CInode::STATE_AUTH);
7034 }
7035 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7036 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7037 } else { // just remove it
7038 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7039 if (dnl->is_remote())
31f18b77 7040 dir->unlink_inode(dn, false);
7041 dir->remove_dentry(dn);
7042 }
7043 }
7044 dir->state_clear(CDir::STATE_AUTH);
7045 /**
7046 * We've now checked all our children and deleted those that need it.
7047 * Now return to caller, and tell them if *we're* a keeper.
7048 */
7049 return keep_dir || dir->get_num_any();
7050}
7051
7052/*
7053 * during replay, when we determine a subtree is no longer ours, we
7054 * try to trim it from our cache. because subtrees must be connected
7055 * to the root, the fact that we can trim this tree may mean that our
7056 * children or parents can also be trimmed.
7057 */
7058void MDCache::try_trim_non_auth_subtree(CDir *dir)
7059{
7060 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7061
7062 // can we now trim child subtrees?
7063 set<CDir*> bounds;
7064 get_subtree_bounds(dir, bounds);
7065 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7066 CDir *bd = *p;
7067 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7068 bd->get_num_any() == 0 && // and empty
7069 can_trim_non_auth_dirfrag(bd)) {
7070 CInode *bi = bd->get_inode();
7071 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7072 remove_subtree(bd);
7073 bd->mark_clean();
7074 bi->close_dirfrag(bd->get_frag());
7075 }
7076 }
7077
7078 if (trim_non_auth_subtree(dir)) {
7079 // keep
7080 try_subtree_merge(dir);
7081 } else {
7082 // can we trim this subtree (and possibly our ancestors) too?
7083 while (true) {
7084 CInode *diri = dir->get_inode();
7085 if (diri->is_base()) {
7086 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7087 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7088 remove_subtree(dir);
7089 dir->mark_clean();
7090 diri->close_dirfrag(dir->get_frag());
7091
7092 dout(10) << " removing " << *diri << dendl;
7093 assert(!diri->get_parent_dn());
7094 assert(diri->get_num_ref() == 0);
7095 remove_inode(diri);
7096 }
7097 break;
7098 }
7099
7100 CDir *psub = get_subtree_root(diri->get_parent_dir());
7101 dout(10) << " parent subtree is " << *psub << dendl;
7102 if (psub->get_dir_auth().first == mds->get_nodeid())
7103 break; // we are auth, keep.
7104
7105 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7106 remove_subtree(dir);
7107 dir->mark_clean();
7108 diri->close_dirfrag(dir->get_frag());
7109
7110 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7111 if (trim_non_auth_subtree(psub))
7112 break;
7113 dir = psub;
7114 }
7115 }
7116
7117 show_subtrees();
7118}
7119
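/**
 * standby_trim_segment - drop dirty state pinned by a trimmed log segment
 *
 * Used during standby replay: when a segment is trimmed we simply mark the
 * dirfrags, inodes, dentries and scatterlocks it pinned clean again, since a
 * standby never writes anything back itself.
 */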
7120void MDCache::standby_trim_segment(LogSegment *ls)
7121{
7122 ls->new_dirfrags.clear_list();
7123 ls->open_files.clear_list();
7124
7125 while (!ls->dirty_dirfrags.empty()) {
7126 CDir *dir = ls->dirty_dirfrags.front();
7127 dir->mark_clean();
7128 }
7129 while (!ls->dirty_inodes.empty()) {
7130 CInode *in = ls->dirty_inodes.front();
7131 in->mark_clean();
7132 }
7133 while (!ls->dirty_dentries.empty()) {
7134 CDentry *dn = ls->dirty_dentries.front();
7135 dn->mark_clean();
7136 }
7137 while (!ls->dirty_parent_inodes.empty()) {
7138 CInode *in = ls->dirty_parent_inodes.front();
7139 in->clear_dirty_parent();
7140 }
7141 while (!ls->dirty_dirfrag_dir.empty()) {
7142 CInode *in = ls->dirty_dirfrag_dir.front();
7143 in->filelock.remove_dirty();
7144 }
7145 while (!ls->dirty_dirfrag_nest.empty()) {
7146 CInode *in = ls->dirty_dirfrag_nest.front();
7147 in->nestlock.remove_dirty();
7148 }
7149 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7150 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7151 in->dirfragtreelock.remove_dirty();
7152 }
7153}
7154
7155/* This function DOES put the passed message before returning */
7156void MDCache::handle_cache_expire(MCacheExpire *m)
7157{
7158 mds_rank_t from = mds_rank_t(m->get_from());
7159
7160 dout(7) << "cache_expire from mds." << from << dendl;
7161
7162 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7163 m->put();
7164 return;
7165 }
7166
7167 set<SimpleLock *> gather_locks;
7168 // loop over realms
7169 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7170 p != m->realms.end();
7171 ++p) {
7172 // check container?
7173 if (p->first.ino > 0) {
7174 CInode *expired_inode = get_inode(p->first.ino);
7175 assert(expired_inode); // we had better have this.
7176 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7177 assert(parent_dir);
7178
7179 int export_state = -1;
7180 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7181 export_state = migrator->get_export_state(parent_dir);
7182 assert(export_state >= 0);
7183 }
7184
7185 if (!parent_dir->is_auth() ||
7186 (export_state != -1 &&
7187 ((export_state == Migrator::EXPORT_WARNING &&
7188 migrator->export_has_warned(parent_dir,from)) ||
7189 export_state == Migrator::EXPORT_EXPORTING ||
7190 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7191 (export_state == Migrator::EXPORT_NOTIFYING &&
7192 !migrator->export_has_notified(parent_dir,from))))) {
7193
7194 // not auth.
7195 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7196 assert(parent_dir->is_frozen_tree_root());
7197
7198 // make a message container
7199 if (delayed_expire[parent_dir].count(from) == 0)
7200 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7201
7202 // merge these expires into it
7203 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7204 continue;
7205 }
7206 assert(export_state <= Migrator::EXPORT_PREPPING ||
7207 (export_state == Migrator::EXPORT_WARNING &&
7208 !migrator->export_has_warned(parent_dir, from)));
7209
7210 dout(7) << "expires for " << *parent_dir << dendl;
7211 } else {
7212 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7213 }
7214
7215 // INODES
7216 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7217 it != p->second.inodes.end();
7218 ++it) {
7219 CInode *in = get_inode(it->first);
7220 unsigned nonce = it->second;
7221
7222 if (!in) {
7223 dout(0) << " inode expire on " << it->first << " from " << from
7224 << ", don't have it" << dendl;
7225 assert(in);
7226 }
7227 assert(in->is_auth());
7228 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7229
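      // A mismatched (old) nonce means we re-replicated the inode to this
      // peer after it sent the expire, so the expire refers to a stale
      // replica and must be ignored.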
7230 // check nonce
7231 if (nonce == in->get_replica_nonce(from)) {
7232 // remove from our cached_by
7233 dout(7) << " inode expire on " << *in << " from mds." << from
7234 << " cached_by was " << in->get_replicas() << dendl;
7235 inode_remove_replica(in, from, false, gather_locks);
7236 }
7237 else {
7238 // this is an old nonce, ignore expire.
7239 dout(7) << " inode expire on " << *in << " from mds." << from
7240 << " with old nonce " << nonce
7241 << " (current " << in->get_replica_nonce(from) << "), dropping"
7242 << dendl;
7243 }
7244 }
7245
7246 // DIRS
7247 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7248 it != p->second.dirs.end();
7249 ++it) {
7250 CDir *dir = get_dirfrag(it->first);
7251 unsigned nonce = it->second;
7252
7253 if (!dir) {
7254 CInode *diri = get_inode(it->first.ino);
7255 if (diri) {
7256 if (mds->is_rejoin() &&
7257 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7258 !diri->is_replica(from)) {
7259 list<CDir*> ls;
7260 diri->get_nested_dirfrags(ls);
7261 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7262 << " while rejoining, inode isn't replicated" << dendl;
7263 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7264 dir = *q;
7265 if (dir->is_replica(from)) {
7266 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7267 dir->remove_replica(from);
7268 }
7269 }
7270 continue;
7271 }
7272 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7273 if (other) {
7274 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7275 << " have " << *other << ", mismatched frags, dropping" << dendl;
7276 continue;
7277 }
7278 }
7279 dout(0) << " dir expire on " << it->first << " from " << from
7280 << ", don't have it" << dendl;
7281 assert(dir);
7282 }
7283 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7284
7285 assert(dir->is_auth());
7286
7287 // check nonce
7288 if (nonce == dir->get_replica_nonce(from)) {
7289 // remove from our cached_by
7290 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7291 << " replicas was " << dir->get_replicas() << dendl;
7292 dir->remove_replica(from);
7293 }
7294 else {
7295 // this is an old nonce, ignore expire.
7296 dout(7) << " dir expire on " << *dir << " from mds." << from
7297 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7298 << "), dropping" << dendl;
7299 }
7300 }
7301
7302 // DENTRIES
7303 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7304 pd != p->second.dentries.end();
7305 ++pd) {
7306 dout(10) << " dn expires in dir " << pd->first << dendl;
7307 CInode *diri = get_inode(pd->first.ino);
7308 assert(diri);
7309 CDir *dir = diri->get_dirfrag(pd->first.frag);
7310
7311 if (!dir) {
7312 dout(0) << " dn expires on " << pd->first << " from " << from
7313 << ", must have refragmented" << dendl;
7314 } else {
7315 assert(dir->is_auth());
7316 }
7317
7318 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7319 p != pd->second.end();
7320 ++p) {
7321 unsigned nonce = p->second;
7322 CDentry *dn;
7323
7324 if (dir) {
7325 dn = dir->lookup(p->first.first, p->first.second);
7326 } else {
7327 // which dirfrag for this dentry?
7328 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7329 assert(dir);
7330 assert(dir->is_auth());
7331 dn = dir->lookup(p->first.first, p->first.second);
7332 }
7333
7334 if (!dn) {
7335 if (dir)
7336 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7337 else
7338 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7339 }
7340 assert(dn);
7341
7342 if (nonce == dn->get_replica_nonce(from)) {
7343 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7344 dentry_remove_replica(dn, from, gather_locks);
7345 }
7346 else {
7347 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7348 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7349 << "), dropping" << dendl;
7350 }
7351 }
7352 }
7353 }
7354
7355 // done
7356 m->put();
7357
7358 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7359 if (!(*p)->is_stable())
7360 mds->locker->eval_gather(*p);
7361 }
7362}
7363
7364void MDCache::process_delayed_expire(CDir *dir)
7365{
7366 dout(7) << "process_delayed_expire on " << *dir << dendl;
7367 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7368 p != delayed_expire[dir].end();
7369 ++p)
7370 handle_cache_expire(p->second);
7371 delayed_expire.erase(dir);
7372}
7373
7374void MDCache::discard_delayed_expire(CDir *dir)
7375{
7376 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7377 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7378 p != delayed_expire[dir].end();
7379 ++p)
7380 p->second->put();
7381 delayed_expire.erase(dir);
7382}
7383
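/**
 * inode_remove_replica - strip 'from' from an inode's replica list
 *
 * Also removes 'from' from each of the inode's lock gather sets; any lock
 * that may now be able to finish gathering is added to gather_locks so the
 * caller can eval it once the whole expire has been processed.
 */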
7384void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7385 set<SimpleLock *>& gather_locks)
7386{
7387 in->remove_replica(from);
7388 in->mds_caps_wanted.erase(from);
7389
7390 // note: this code calls _eval more often than it needs to!
7391 // fix lock
7392 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7393 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7394 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7395 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7396 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7397 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7398
 7399 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
 7400 // don't remove the recovering mds from the lock's gathering list, because
 7401 // it may hold rejoined wrlocks.
7402 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7403 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7404 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7405}
7406
7407void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7408{
7409 dn->remove_replica(from);
7410
7411 // fix lock
7412 if (dn->lock.remove_replica(from))
7413 gather_locks.insert(&dn->lock);
7414
 7415 // Replicated strays might now be eligible for purge
7416 CDentry::linkage_t *dnl = dn->get_linkage();
7417 if (dnl->is_primary()) {
7418 maybe_eval_stray(dnl->get_inode());
7419 }
7420}
7421
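/**
 * trim_client_leases - expire client dentry leases whose ttl has passed
 *
 * Each pool's lease list is kept in expiry order, so we can stop scanning at
 * the first lease that is still valid.
 */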
7422void MDCache::trim_client_leases()
7423{
7424 utime_t now = ceph_clock_now();
7425
7426 dout(10) << "trim_client_leases" << dendl;
7427
7428 for (int pool=0; pool<client_lease_pools; pool++) {
7429 int before = client_leases[pool].size();
7430 if (client_leases[pool].empty())
7431 continue;
7432
7433 while (!client_leases[pool].empty()) {
7434 ClientLease *r = client_leases[pool].front();
7435 if (r->ttl > now) break;
7436 CDentry *dn = static_cast<CDentry*>(r->parent);
7437 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7438 dn->remove_client_lease(r, mds->locker);
7439 }
7440 int after = client_leases[pool].size();
7441 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7442 << (before-after) << " leases, " << after << " left" << dendl;
7443 }
7444}
7445
7446
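/**
 * check_memory_usage - sample memory usage and react to cache pressure
 *
 * Updates the mds memory perf counters, asks clients to release caps when
 * the cache is over its configured limit, and (when built with tcmalloc)
 * returns freed heap pages to the OS once we are back within bounds.
 */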
7447void MDCache::check_memory_usage()
7448{
7449 static MemoryModel mm(g_ceph_context);
7450 static MemoryModel::snap last;
7451 mm.sample(&last);
7452 static MemoryModel::snap baseline = last;
7453
7454 // check client caps
b32b8144 7455 assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7456 double caps_per_inode = 0.0;
7c673cae 7457 if (CInode::count())
181888fb 7458 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7459
7460 dout(2) << "check_memory_usage"
7461 << " total " << last.get_total()
7462 << ", rss " << last.get_rss()
7463 << ", heap " << last.get_heap()
7464 << ", baseline " << baseline.get_heap()
7465 << ", buffers " << (buffer::get_total_alloc() >> 10)
7466 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7467 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7468 << dendl;
7469
c07f9fc5 7470 mds->update_mlogger();
7471 mds->mlogger->set(l_mdm_rss, last.get_rss());
7472 mds->mlogger->set(l_mdm_heap, last.get_heap());
7473
7474 if (cache_toofull()) {
7475 last_recall_state = ceph_clock_now();
7476 mds->server->recall_client_state();
7477 }
7478
7479 // If the cache size had exceeded its limit, but we're back in bounds
7480 // now, free any unused pool memory so that our memory usage isn't
7481 // permanently bloated.
181888fb 7482 if (exceeded_size_limit && !cache_toofull()) {
7483 // Only do this once we are back in bounds: otherwise the releases would
7484 // slow down whatever process caused us to exceed bounds to begin with
7485 if (ceph_using_tcmalloc()) {
7486 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7487 << dendl;
7488 ceph_heap_release_free_memory();
7489 }
7490 exceeded_size_limit = false;
7491 }
7492}
7493
7494
7495
7496// =========================================================================================
7497// shutdown
7498
7499class C_MDC_ShutdownCheck : public MDCacheContext {
7500public:
7501 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7502 void finish(int) override {
7503 mdcache->shutdown_check();
7504 }
7505};
7506
7507void MDCache::shutdown_check()
7508{
7509 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7510
7511 // cache
7512 char old_val[32] = { 0 };
7513 char *o = old_val;
7514 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7515 g_conf->set_val("debug_mds", "10");
7516 g_conf->apply_changes(NULL);
7517 show_cache();
7518 g_conf->set_val("debug_mds", old_val);
7519 g_conf->apply_changes(NULL);
7520 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7521
7522 // this
31f18b77 7523 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7524 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7525
7526
7527 if (mds->objecter->is_active()) {
7528 dout(0) << "objecter still active" << dendl;
7529 mds->objecter->dump_active();
7530 }
7531}
7532
7533
7534void MDCache::shutdown_start()
7535{
7536 dout(2) << "shutdown_start" << dendl;
7537
7538 if (g_conf->mds_shutdown_check)
7539 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7540
7541 // g_conf->debug_mds = 10;
7542}
7543
7544
7545
7546bool MDCache::shutdown_pass()
7547{
7548 dout(7) << "shutdown_pass" << dendl;
7549
7550 if (mds->is_stopped()) {
7551 dout(7) << " already shut down" << dendl;
7552 show_cache();
7553 show_subtrees();
7554 return true;
7555 }
7556
7557 // empty stray dir
28e407b8 7558 bool strays_all_exported = shutdown_export_strays();
7559
7560 // trim cache
181888fb 7561 trim(UINT64_MAX);
31f18b77 7562 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7563
28e407b8 7564 // If we are not rank 0, export all auth subtrees to another active MDS (usually rank 0)
7565 int num_auth_subtree = 0;
7566 if (!subtrees.empty() &&
28e407b8 7567 mds->get_nodeid() != 0) {
7568 dout(7) << "looking for subtrees to export to mds0" << dendl;
7569 list<CDir*> ls;
7570 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7571 it != subtrees.end();
7572 ++it) {
7573 CDir *dir = it->first;
7574 if (dir->get_inode()->is_mdsdir())
7575 continue;
7576 if (dir->is_auth()) {
7577 num_auth_subtree++;
7578 if (dir->is_frozen() ||
7579 dir->is_freezing() ||
7580 dir->is_ambiguous_dir_auth() ||
7581 dir->state_test(CDir::STATE_EXPORTING))
7582 continue;
7583 ls.push_back(dir);
7584 }
7585 }
7586
7587 migrator->clear_export_queue();
7588 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7589 CDir *dir = *p;
7590 mds_rank_t dest = dir->get_inode()->authority().first;
7591 if (dest > 0 && !mds->mdsmap->is_active(dest))
7592 dest = 0;
7593 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7594 migrator->export_dir_nicely(dir, dest);
7595 }
7596 }
7597
7598 if (!strays_all_exported) {
7599 dout(7) << "waiting for strays to migrate" << dendl;
7600 return false;
7601 }
7602
7c673cae 7603 if (num_auth_subtree > 0) {
28e407b8 7604 assert(mds->get_nodeid() > 0);
7605 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7606 show_subtrees();
7607 return false;
7608 }
7609
7610 // close out any sessions (and open files!) before we try to trim the log, etc.
7611 if (mds->sessionmap.have_unclosed_sessions()) {
7612 if (!mds->server->terminating_sessions)
7613 mds->server->terminate_sessions();
7614 return false;
7615 }
7616
7617 // Fully trim the log so that all objects in cache are clean and may be
7618 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7619 // trim the log such that the cache eventually becomes clean.
7620 mds->mdlog->trim(0);
7621 if (mds->mdlog->get_num_segments() > 1) {
7622 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7623 return false;
7624 }
7625
7626 // drop our reference to our stray dir inode
7627 for (int i = 0; i < NUM_STRAY; ++i) {
7628 if (strays[i] &&
7629 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7630 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7631 strays[i]->put(CInode::PIN_STRAY);
7632 strays[i]->put_stickydirs();
7633 }
7634 }
7635
7636 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7637 if (mydir && !mydir->is_subtree_root())
7638 mydir = NULL;
7639
7640 // subtrees map not empty yet?
7641 if (subtrees.size() > (mydir ? 1 : 0)) {
7642 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7643 show_subtrees();
7644 migrator->show_importing();
7645 migrator->show_exporting();
7646 if (!migrator->is_importing() && !migrator->is_exporting())
7647 show_cache();
7648 return false;
7649 }
7650 assert(!migrator->is_exporting());
7651 assert(!migrator->is_importing());
7652
7653 if ((myin && myin->is_auth_pinned()) ||
7654 (mydir && mydir->is_auth_pinned())) {
7655 dout(7) << "still have auth pinned objects" << dendl;
7656 return false;
7657 }
7658
7659 // (only do this once!)
7660 if (!mds->mdlog->is_capped()) {
7661 dout(7) << "capping the log" << dendl;
7662 mds->mdlog->cap();
7663 mds->mdlog->trim();
7664 }
7665
7666 if (!mds->mdlog->empty()) {
7667 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7668 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7669 return false;
7670 }
7671
7672 if (!did_shutdown_log_cap) {
7673 // flush journal header
7674 dout(7) << "writing header for (now-empty) journal" << dendl;
7675 assert(mds->mdlog->empty());
7676 mds->mdlog->write_head(0);
7677 // NOTE: filer active checker below will block us until this completes.
7678 did_shutdown_log_cap = true;
7679 return false;
7680 }
7681
7682 // filer active?
7683 if (mds->objecter->is_active()) {
7684 dout(7) << "objecter still active" << dendl;
7685 mds->objecter->dump_active();
7686 return false;
7687 }
7688
7689 // trim what we can from the cache
7690 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7691 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7692 show_cache();
7693 //dump();
7694 return false;
7695 }
7696
7697 // make mydir subtree go away
7698 if (mydir) {
7699 if (mydir->get_num_ref() > 1) { // subtree pin
7700 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7701 show_cache();
7702 return false;
7703 }
7704
7705 remove_subtree(mydir);
7706 myin->close_dirfrag(mydir->get_frag());
7707 }
7708 assert(subtrees.empty());
7709
7710 if (myin)
7711 remove_inode(myin);
7712
7713 // done!
7714 dout(2) << "shutdown done." << dendl;
7715 return true;
7716}
7717
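/**
 * shutdown_export_strays - migrate our stray directory contents to rank 0
 *
 * Returns true once every stray dentry is either purging or has been handed
 * off; returns false, after queueing the necessary fetches and migrations,
 * while work remains.
 */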
7718bool MDCache::shutdown_export_strays()
7719{
7720 if (mds->get_nodeid() == 0)
7721 return true;
7722
7723 dout(10) << "shutdown_export_strays" << dendl;
7724
7725 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7726
7727 bool done = true;
7728
7729 list<CDir*> dfs;
7730 for (int i = 0; i < NUM_STRAY; ++i) {
7731 if (!strays[i] ||
7732 !strays[i]->state_test(CInode::STATE_STRAYPINNED))
7c673cae 7733 continue;
7734 strays[i]->get_dirfrags(dfs);
7735 }
7736
7737 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7738 dfs_i != dfs.end(); ++dfs_i)
7739 {
7740 CDir *dir = *dfs_i;
7741
7742 if (!dir->is_complete()) {
7743 dir->fetch(0);
7744 done = false;
7745 if (!mds0_active)
7746 break;
7747 }
7748
7749 for (auto &p : dir->items) {
7750 CDentry *dn = p.second;
28e407b8 7751 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7752 if (dnl->is_null())
7753 continue;
7754 done = false;
7755 if (!mds0_active)
7756 break;
7757
7758 if (dn->state_test(CDentry::STATE_PURGING)) {
7759 // Don't try to migrate anything that is actually
7760 // being purged right now
7761 continue;
7762 }
7763
7764 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7765 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7766 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7767 } else {
7768 dout(10) << "already exporting " << *dn << dendl;
7769 }
7770 }
7771 }
7772
7773 return done;
7774}
7775
7776// ========= messaging ==============
7777
7778/* This function DOES put the passed message before returning */
7779void MDCache::dispatch(Message *m)
7780{
7781 switch (m->get_type()) {
7782
7783 // RESOLVE
7784 case MSG_MDS_RESOLVE:
7785 handle_resolve(static_cast<MMDSResolve*>(m));
7786 break;
7787 case MSG_MDS_RESOLVEACK:
7788 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7789 break;
7790
7791 // REJOIN
7792 case MSG_MDS_CACHEREJOIN:
7793 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7794 break;
7795
7796 case MSG_MDS_DISCOVER:
7797 handle_discover(static_cast<MDiscover*>(m));
7798 break;
7799 case MSG_MDS_DISCOVERREPLY:
7800 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7801 break;
7802
7803 case MSG_MDS_DIRUPDATE:
7804 handle_dir_update(static_cast<MDirUpdate*>(m));
7805 break;
7806
7807 case MSG_MDS_CACHEEXPIRE:
7808 handle_cache_expire(static_cast<MCacheExpire*>(m));
7809 break;
7810
7811 case MSG_MDS_DENTRYLINK:
7812 handle_dentry_link(static_cast<MDentryLink*>(m));
7813 break;
7814 case MSG_MDS_DENTRYUNLINK:
7815 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7816 break;
7817
7818 case MSG_MDS_FRAGMENTNOTIFY:
7819 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7820 break;
7821
7822 case MSG_MDS_FINDINO:
7823 handle_find_ino(static_cast<MMDSFindIno *>(m));
7824 break;
7825 case MSG_MDS_FINDINOREPLY:
7826 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7827 break;
7828
7829 case MSG_MDS_OPENINO:
7830 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7831 break;
7832 case MSG_MDS_OPENINOREPLY:
7833 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7834 break;
7835
7836 default:
7837 derr << "cache unknown message " << m->get_type() << dendl;
7838 assert(0 == "cache unknown message");
7839 }
7840}
7841
7842MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7843{
7844 if (mdr) {
7845 dout(20) << "_get_waiter retryrequest" << dendl;
7846 return new C_MDS_RetryRequest(this, mdr);
7847 } else if (req) {
7848 dout(20) << "_get_waiter retrymessage" << dendl;
7849 return new C_MDS_RetryMessage(mds, req);
7850 } else {
7851 return fin;
7852 }
7853}
7854
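/**
 * path_traverse - walk 'path' from its base inode, fetching/discovering as needed
 *
 * Return values:
 *   <0 : traverse error (-ESTALE, -ENOTDIR, -ENOENT, -EIO, -EINVAL)
 *    0 : success; *pin points at the target inode
 *    1 : blocked; a waiter was queued via _get_waiter()
 *    2 : the request was forwarded to another mds (MDS_TRAVERSE_FORWARD only)
 */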
7855int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7856 const filepath& path, // what
7857 vector<CDentry*> *pdnvec, // result
7858 CInode **pin,
7859 int onfail)
7860{
7861 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7862 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7863 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7864
7865 assert(mdr || req || fin);
7866 assert(!forward || mdr || req); // forward requires a request
7867
7868 snapid_t snapid = CEPH_NOSNAP;
7869 if (mdr)
7870 mdr->snapid = snapid;
7871
7872 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7873
7874 if (mds->logger) mds->logger->inc(l_mds_traverse);
7875
7876 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7877 CInode *cur = get_inode(path.get_ino());
7878 if (cur == NULL) {
7879 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7880 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7881 else {
7882 //ceph_abort(); // hrm.. broken
7883 return -ESTALE;
7884 }
7885 return 1;
7886 }
7887 if (cur->state_test(CInode::STATE_PURGING))
7888 return -ESTALE;
7889
7890 // make sure snaprealm are open...
7891 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7892 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7893 return 1;
7894 }
7895
7896 // start trace
7897 if (pdnvec)
7898 pdnvec->clear();
7899 if (pin)
7900 *pin = cur;
7901
7902 unsigned depth = 0;
7903 while (depth < path.depth()) {
7904 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7905 << "' snapid " << snapid << dendl;
7906
7907 if (!cur->is_dir()) {
7908 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7909 return -ENOTDIR;
7910 }
7911
7912 // walk into snapdir?
7913 if (path[depth].length() == 0) {
7914 dout(10) << "traverse: snapdir" << dendl;
7915 if (!mdr)
7916 return -EINVAL;
7917 snapid = CEPH_SNAPDIR;
7918 mdr->snapid = snapid;
7919 depth++;
7920 continue;
7921 }
7922 // walk thru snapdir?
7923 if (snapid == CEPH_SNAPDIR) {
7924 if (!mdr)
7925 return -EINVAL;
7926 SnapRealm *realm = cur->find_snaprealm();
7927 snapid = realm->resolve_snapname(path[depth], cur->ino());
7928 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7929 if (!snapid)
7930 return -ENOENT;
7931 mdr->snapid = snapid;
7932 depth++;
7933 continue;
7934 }
7935
7936 // open dir
7937 frag_t fg = cur->pick_dirfrag(path[depth]);
7938 CDir *curdir = cur->get_dirfrag(fg);
7939 if (!curdir) {
7940 if (cur->is_auth()) {
7941 // parent dir frozen_dir?
7942 if (cur->is_frozen()) {
7943 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7944 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7945 return 1;
7946 }
7947 curdir = cur->get_or_open_dirfrag(this, fg);
7948 } else {
7949 // discover?
7950 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7951 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7952 null_okay);
7953 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7954 return 1;
7955 }
7956 }
7957 assert(curdir);
7958
7959#ifdef MDS_VERIFY_FRAGSTAT
7960 if (curdir->is_complete())
7961 curdir->verify_fragstat();
7962#endif
7963
7964 // frozen?
7965 /*
7966 if (curdir->is_frozen()) {
7967 // doh!
7968 // FIXME: traverse is allowed?
7969 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7970 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7971 if (onfinish) delete onfinish;
7972 return 1;
7973 }
7974 */
7975
7976 // Before doing dirfrag->dn lookup, compare with DamageTable's
7977 // record of which dentries were unreadable
7978 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7979 dout(4) << "traverse: stopped lookup at damaged dentry "
7980 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7981 return -EIO;
7982 }
7983
7984 // dentry
7985 CDentry *dn = curdir->lookup(path[depth], snapid);
7986 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7987
7988 // null and last_bit and xlocked by me?
7989 if (dnl && dnl->is_null() && null_okay) {
7990 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7991 if (pdnvec)
7992 pdnvec->push_back(dn);
7993 if (pin)
7994 *pin = 0;
7995 break; // done!
7996 }
7997
7998 if (dnl &&
7999 dn->lock.is_xlocked() &&
8000 dn->lock.get_xlock_by() != mdr &&
8001 !dn->lock.can_read(client) &&
8002 (dnl->is_null() || forward)) {
8003 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
8004 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8005 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
8006 mds->mdlog->flush();
8007 return 1;
8008 }
8009
8010 // can we conclude ENOENT?
8011 if (dnl && dnl->is_null()) {
8012 if (dn->lock.can_read(client) ||
8013 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8014 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8015 if (pdnvec) {
8016 if (depth == path.depth() - 1)
8017 pdnvec->push_back(dn);
8018 else
8019 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8020 }
8021 return -ENOENT;
8022 } else {
8023 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
8024 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8025 return 1;
8026 }
8027 }
8028
8029 if (dnl && !dnl->is_null()) {
8030 CInode *in = dnl->get_inode();
8031
8032 // do we have inode?
8033 if (!in) {
8034 assert(dnl->is_remote());
8035 // do i have it?
8036 in = get_inode(dnl->get_remote_ino());
8037 if (in) {
8038 dout(7) << "linking in remote in " << *in << dendl;
8039 dn->link_remote(dnl, in);
8040 } else {
8041 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8042 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8043 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8044 dout(4) << "traverse: remote dentry points to damaged ino "
8045 << *dn << dendl;
8046 return -EIO;
8047 }
8048 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8049 (null_okay && depth == path.depth() - 1));
8050 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8051 return 1;
8052 }
8053 }
8054
8055 cur = in;
8056 // make sure snaprealm are open...
8057 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8058 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8059 return 1;
8060 }
8061
8062 // add to trace, continue.
8063 touch_inode(cur);
8064 if (pdnvec)
8065 pdnvec->push_back(dn);
8066 if (pin)
8067 *pin = cur;
8068 depth++;
8069 continue;
8070 }
8071
8072
8073 // MISS. dentry doesn't exist.
8074 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8075
8076 if (curdir->is_auth()) {
8077 // dentry is mine.
8078 if (curdir->is_complete() ||
8079 (snapid == CEPH_NOSNAP &&
8080 curdir->has_bloom() &&
8081 !curdir->is_in_bloom(path[depth]))){
8082 // file not found
8083 if (pdnvec) {
8084 // instantiate a null dn?
8085 if (depth < path.depth()-1){
8086 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8087 dn = NULL;
8088 } else if (dn) {
8089 ceph_abort(); // should have fallen out in ->is_null() check above
8090 } else if (curdir->is_frozen()) {
8091 dout(20) << " not adding null to frozen dir " << dendl;
8092 } else if (snapid < CEPH_MAXSNAP) {
8093 dout(20) << " not adding null for snapid " << snapid << dendl;
8094 } else {
8095 // create a null dentry
8096 dn = curdir->add_null_dentry(path[depth]);
8097 dout(20) << " added null " << *dn << dendl;
8098 }
8099 if (dn)
8100 pdnvec->push_back(dn);
8101 else
8102 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8103 }
8104 return -ENOENT;
8105 } else {
8106
8107 // Check DamageTable for missing fragments before trying to fetch
8108 // this
8109 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8110 dout(4) << "traverse: damaged dirfrag " << *curdir
8111 << ", blocking fetch" << dendl;
8112 return -EIO;
8113 }
8114
8115 // directory isn't complete; reload
8116 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8117 touch_inode(cur);
8118 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8119 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8120 return 1;
8121 }
8122 } else {
8123 // dirfrag/dentry is not mine.
8124 mds_authority_t dauth = curdir->authority();
8125
8126 if (forward &&
8127 snapid && mdr && mdr->client_request &&
8128 (int)depth < mdr->client_request->get_num_fwd()) {
8129 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8130 << " < fwd " << mdr->client_request->get_num_fwd()
8131 << ", discovering instead of forwarding" << dendl;
8132 discover = true;
8133 }
8134
8135 if ((discover || null_okay)) {
8136 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8137 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8138 null_okay);
8139 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8140 return 1;
8141 }
8142 if (forward) {
8143 // forward
8144 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8145
8146 if (curdir->is_ambiguous_auth()) {
8147 // wait
8148 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8149 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8150 return 1;
8151 }
8152
8153 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8154
8155 if (mdr)
8156 request_forward(mdr, dauth.first);
8157 else
8158 mds->forward_message_mds(req, dauth.first);
8159
8160 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8161 assert(fin == NULL);
8162 return 2;
8163 }
8164 }
8165
8166 ceph_abort(); // i shouldn't get here
8167 }
8168
8169 // success.
8170 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8171 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8172 if (mdr)
8173 assert(mdr->snapid == snapid);
8174 return 0;
8175}
8176
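/**
 * cache_traverse - resolve a path using only what is already in cache
 *
 * No locks are taken and no I/O is issued; returns NULL if any path
 * component (or its inode) is not present in cache.
 */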
8177CInode *MDCache::cache_traverse(const filepath& fp)
8178{
8179 dout(10) << "cache_traverse " << fp << dendl;
8180
8181 CInode *in;
8182 if (fp.get_ino())
8183 in = get_inode(fp.get_ino());
8184 else
8185 in = root;
8186 if (!in)
8187 return NULL;
8188
8189 for (unsigned i = 0; i < fp.depth(); i++) {
94b18763 8190 boost::string_view dname = fp[i];
8191 frag_t fg = in->pick_dirfrag(dname);
8192 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8193 CDir *curdir = in->get_dirfrag(fg);
8194 if (!curdir)
8195 return NULL;
8196 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8197 if (!dn)
8198 return NULL;
8199 in = dn->get_linkage()->get_inode();
8200 if (!in)
8201 return NULL;
8202 }
8203 dout(10) << " got " << *in << dendl;
8204 return in;
8205}
8206
8207
8208/**
 8209 * open_remote_dirfrag -- open up a remote dirfrag
8210 *
8211 * @param diri base inode
8212 * @param approxfg approximate fragment.
8213 * @param fin completion callback
8214 */
8215void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8216{
8217 dout(10) << "open_remote_dir on " << *diri << dendl;
8218 assert(diri->is_dir());
8219 assert(!diri->is_auth());
8220 assert(diri->get_dirfrag(approxfg) == 0);
8221
224ce89b 8222 discover_dir_frag(diri, approxfg, fin);
8223}
8224
8225
8226/**
8227 * get_dentry_inode - get or open inode
8228 *
8229 * @param dn the dentry
8230 * @param mdr current request
8231 *
8232 * will return inode for primary, or link up/open up remote link's inode as necessary.
8233 * If it's not available right now, puts mdr on wait list and returns null.
8234 */
8235CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8236{
8237 CDentry::linkage_t *dnl;
8238 if (projected)
8239 dnl = dn->get_projected_linkage();
8240 else
8241 dnl = dn->get_linkage();
8242
8243 assert(!dnl->is_null());
8244
8245 if (dnl->is_primary())
8246 return dnl->inode;
8247
8248 assert(dnl->is_remote());
8249 CInode *in = get_inode(dnl->get_remote_ino());
8250 if (in) {
8251 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8252 dn->link_remote(dnl, in);
8253 return in;
8254 } else {
8255 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8256 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8257 return 0;
8258 }
8259}
8260
8261struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8262 CDentry *dn;
8263 inodeno_t ino;
8264 MDSInternalContextBase *onfinish;
8265 bool want_xlocked;
8266 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8267 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8268 dn->get(MDSCacheObject::PIN_PTRWAITER);
8269 }
8270 void finish(int r) override {
8271 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8272 dn->put(MDSCacheObject::PIN_PTRWAITER);
8273 }
8274};
8275
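/**
 * open_remote_dentry - open the inode a remote (hard-linked) dentry points to
 *
 * Directories have their backtrace in the metadata pool; for other inode
 * types we pass -1 so open_ino chooses which pool to probe itself.
 */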
8276void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8277{
8278 dout(10) << "open_remote_dentry " << *dn << dendl;
8279 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8280 inodeno_t ino = dnl->get_remote_ino();
8281 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8282 open_ino(ino, pool,
8283 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8284}
8285
8286void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8287 bool want_xlocked, int r)
8288{
8289 if (r < 0) {
8290 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8291 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8292 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8293 dn->state_set(CDentry::STATE_BADREMOTEINO);
8294
8295 std::string path;
8296 CDir *dir = dn->get_dir();
8297 if (dir) {
31f18b77 8298 dir->get_inode()->make_path_string(path);
8299 path += "/";
8300 path += std::string(dn->get_name());
8301 }
8302
31f18b77 8303 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8304 if (fatal) {
8305 mds->damaged();
8306 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8307 }
8308 } else {
8309 r = 0;
8310 }
8311 }
8312 fin->complete(r < 0 ? r : 0);
8313}
8314
8315
8316void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8317{
8318 // empty trace if we're a base inode
8319 if (in->is_base())
8320 return;
8321
8322 CInode *parent = in->get_parent_inode();
8323 assert(parent);
8324 make_trace(trace, parent);
8325
8326 CDentry *dn = in->get_parent_dn();
8327 dout(15) << "make_trace adding " << *dn << dendl;
8328 trace.push_back(dn);
8329}
8330
8331
8332// -------------------------------------------------------------------------------
8333// Open inode by inode number
8334
8335class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8336 inodeno_t ino;
8337 public:
8338 bufferlist bl;
8339 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8340 MDCacheIOContext(c), ino(i) {}
8341 void finish(int r) override {
8342 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8343 }
8344};
8345
8346struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8347 inodeno_t ino;
8348 MMDSOpenIno *msg;
8349 bool parent;
8350 public:
8351 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8352 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8353 void finish(int r) override {
8354 if (r < 0 && !parent)
8355 r = -EAGAIN;
8356 if (msg) {
8357 mdcache->handle_open_ino(msg, r);
8358 return;
8359 }
8360 assert(mdcache->opening_inodes.count(ino));
8361 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8362 }
8363};
8364
8365struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8366 inodeno_t ino;
8367 public:
8368 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8369 void finish(int r) override {
8370 mdcache->_open_ino_parent_opened(ino, r);
8371 }
8372};
8373
8374void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8375{
8376 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8377
8378 assert(opening_inodes.count(ino));
8379 open_ino_info_t& info = opening_inodes[ino];
8380
8381 CInode *in = get_inode(ino);
8382 if (in) {
8383 dout(10) << " found cached " << *in << dendl;
8384 open_ino_finish(ino, info, in->authority().first);
8385 return;
8386 }
8387
8388 inode_backtrace_t backtrace;
8389 if (err == 0) {
8390 try {
8391 ::decode(backtrace, bl);
8392 } catch (const buffer::error &decode_exc) {
 8393 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8394 << std::dec << ": " << decode_exc << dendl;
8395 open_ino_finish(ino, info, -EIO);
8396 return;
8397 }
8398 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8399 dout(10) << " old object in pool " << info.pool
8400 << ", retrying pool " << backtrace.pool << dendl;
8401 info.pool = backtrace.pool;
8402 C_IO_MDC_OpenInoBacktraceFetched *fin =
8403 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8404 fetch_backtrace(ino, info.pool, fin->bl,
8405 new C_OnFinisher(fin, mds->finisher));
8406 return;
8407 }
8408 } else if (err == -ENOENT) {
8409 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8410 if (info.pool != meta_pool) {
8411 dout(10) << " no object in pool " << info.pool
8412 << ", retrying pool " << meta_pool << dendl;
8413 info.pool = meta_pool;
8414 C_IO_MDC_OpenInoBacktraceFetched *fin =
8415 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8416 fetch_backtrace(ino, info.pool, fin->bl,
8417 new C_OnFinisher(fin, mds->finisher));
8418 return;
8419 }
8420 err = 0; // backtrace.ancestors.empty() is checked below
8421 }
8422
8423 if (err == 0) {
8424 if (backtrace.ancestors.empty()) {
8425 dout(10) << " got empty backtrace " << dendl;
8426 err = -EIO;
8427 } else if (!info.ancestors.empty()) {
8428 if (info.ancestors[0] == backtrace.ancestors[0]) {
8429 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8430 err = -EINVAL;
8431 } else {
8432 info.last_err = 0;
8433 }
8434 }
8435 }
8436 if (err) {
8437 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8438 if (info.last_err)
8439 err = info.last_err;
8440 open_ino_finish(ino, info, err);
8441 return;
8442 }
8443
8444 dout(10) << " got backtrace " << backtrace << dendl;
8445 info.ancestors = backtrace.ancestors;
8446
8447 _open_ino_traverse_dir(ino, info, 0);
8448}
8449
8450void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8451{
8452 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8453
8454 assert(opening_inodes.count(ino));
8455 open_ino_info_t& info = opening_inodes[ino];
8456
8457 CInode *in = get_inode(ino);
8458 if (in) {
8459 dout(10) << " found cached " << *in << dendl;
8460 open_ino_finish(ino, info, in->authority().first);
8461 return;
8462 }
8463
8464 if (ret == mds->get_nodeid()) {
8465 _open_ino_traverse_dir(ino, info, 0);
8466 } else {
8467 if (ret >= 0) {
8468 mds_rank_t checked_rank = mds_rank_t(ret);
8469 info.check_peers = true;
8470 info.auth_hint = checked_rank;
8471 info.checked.erase(checked_rank);
8472 }
8473 do_open_ino(ino, info, ret);
8474 }
8475}
8476
8477void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8478{
8479 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8480
8481 CInode *in = get_inode(ino);
8482 if (in) {
8483 dout(10) << " found cached " << *in << dendl;
8484 open_ino_finish(ino, info, in->authority().first);
8485 return;
8486 }
8487
8488 if (ret) {
8489 do_open_ino(ino, info, ret);
8490 return;
8491 }
8492
8493 mds_rank_t hint = info.auth_hint;
8494 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8495 info.discover, info.want_xlocked, &hint);
8496 if (ret > 0)
8497 return;
8498 if (hint != mds->get_nodeid())
8499 info.auth_hint = hint;
8500 do_open_ino(ino, info, ret);
8501}
8502
8503void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8504{
8505 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8506 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8507 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8508}
8509
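/**
 * open_ino_traverse_dir - walk the ancestor chain recorded in a backtrace
 *
 * Opens (or discovers) each ancestor dirfrag in turn.  Returns 1 if a fetch
 * or discover was queued and we are waiting, 0 when there is nothing more we
 * can do locally, or -ENOENT/-ENOTDIR when the immediate parent shows the
 * inode cannot be there.  If 'hint' is non-null it is set to the apparent
 * authority of the first ancestor dirfrag.
 */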
8510int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8511 vector<inode_backpointer_t>& ancestors,
8512 bool discover, bool want_xlocked, mds_rank_t *hint)
8513{
8514 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8515 int err = 0;
8516 for (unsigned i = 0; i < ancestors.size(); i++) {
8517 CInode *diri = get_inode(ancestors[i].dirino);
8518
8519 if (!diri) {
8520 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8521 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8522 return 1;
8523 }
8524 continue;
8525 }
8526
8527 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8528 CDir *dir = diri->get_parent_dir();
8529 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8530 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8531 dir = dir->get_inode()->get_parent_dir();
8532 _open_ino_fetch_dir(ino, m, dir, i == 0);
8533 return 1;
8534 }
8535
8536 if (!diri->is_dir()) {
8537 dout(10) << " " << *diri << " is not dir" << dendl;
8538 if (i == 0)
8539 err = -ENOTDIR;
8540 break;
8541 }
8542
8543 string &name = ancestors[i].dname;
8544 frag_t fg = diri->pick_dirfrag(name);
8545 CDir *dir = diri->get_dirfrag(fg);
8546 if (!dir) {
8547 if (diri->is_auth()) {
8548 if (diri->is_frozen()) {
8549 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8550 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8551 return 1;
8552 }
8553 dir = diri->get_or_open_dirfrag(this, fg);
8554 } else if (discover) {
8555 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8556 return 1;
8557 }
8558 }
8559 if (dir) {
8560 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8561 CDentry *dn = dir->lookup(name);
8562 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8563 if (dir->is_auth()) {
8564 if (dnl && dnl->is_primary() &&
8565 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8566 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8567 _open_ino_fetch_dir(ino, m, dir, i == 0);
8568 return 1;
8569 }
8570
8571 if (!dnl && !dir->is_complete() &&
8572 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8573 dout(10) << " fetching incomplete " << *dir << dendl;
8574 _open_ino_fetch_dir(ino, m, dir, i == 0);
8575 return 1;
8576 }
8577
8578 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8579 if (i == 0)
8580 err = -ENOENT;
8581 } else if (discover) {
8582 if (!dnl) {
8583 filepath path(name, 0);
8584 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8585 (i == 0 && want_xlocked));
8586 return 1;
8587 }
8588 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8589 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8590 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8591 return 1;
8592 }
8593 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8594 if (i == 0)
8595 err = -ENOENT;
8596 }
8597 }
8598 if (hint && i == 0)
8599 *hint = dir ? dir->authority().first : diri->authority().first;
8600 break;
8601 }
8602 return err;
8603}
8604
8605void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8606{
8607 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8608
8609 list<MDSInternalContextBase*> waiters;
8610 waiters.swap(info.waiters);
8611 opening_inodes.erase(ino);
8612 finish_contexts(g_ceph_context, waiters, ret);
8613}
8614
8615void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8616{
8617 if (err < 0 && err != -EAGAIN) {
8618 info.checked.clear();
8619 info.checking = MDS_RANK_NONE;
8620 info.check_peers = true;
8621 info.fetch_backtrace = true;
8622 if (info.discover) {
8623 info.discover = false;
8624 info.ancestors.clear();
8625 }
8626 if (err != -ENOENT && err != -ENOTDIR)
8627 info.last_err = err;
8628 }
8629
8630 if (info.check_peers || info.discover) {
8631 if (info.discover) {
8632 // got backtrace from peer, but failed to find inode. re-check peers
8633 info.discover = false;
8634 info.ancestors.clear();
8635 info.checked.clear();
8636 }
8637 info.check_peers = false;
8638 info.checking = MDS_RANK_NONE;
8639 do_open_ino_peer(ino, info);
8640 } else if (info.fetch_backtrace) {
8641 info.check_peers = true;
8642 info.fetch_backtrace = false;
8643 info.checking = mds->get_nodeid();
8644 info.checked.clear();
8645 C_IO_MDC_OpenInoBacktraceFetched *fin =
8646 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8647 fetch_backtrace(ino, info.pool, fin->bl,
8648 new C_OnFinisher(fin, mds->finisher));
8649 } else {
8650 assert(!info.ancestors.empty());
8651 info.checking = mds->get_nodeid();
8652 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8653 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8654 }
8655}
8656
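/*
 * Pick the next MDS to ask about this ino: prefer info.auth_hint if that
 * rank is active, otherwise take the first active rank we have not yet
 * checked. If every other rank has already been checked, fall back to
 * do_open_ino(); if some ranks are simply not active yet, wait. The
 * ancestors vector is only attached to the MMDSOpenIno message when we
 * already hold a backtrace (info.discover set, or the backtrace fetch done).
 */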
8657void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8658{
8659 set<mds_rank_t> all, active;
8660 mds->mdsmap->get_mds_set(all);
8661 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8662 if (mds->get_state() == MDSMap::STATE_REJOIN)
8663 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8664
8665 dout(10) << "do_open_ino_peer " << ino << " active " << active
8666 << " all " << all << " checked " << info.checked << dendl;
8667
8668 mds_rank_t peer = MDS_RANK_NONE;
8669 if (info.auth_hint >= 0) {
8670 if (active.count(info.auth_hint)) {
8671 peer = info.auth_hint;
8672 info.auth_hint = MDS_RANK_NONE;
8673 }
8674 } else {
8675 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8676 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8677 peer = *p;
8678 break;
8679 }
8680 }
8681 if (peer < 0) {
8682 all.erase(mds->get_nodeid());
8683 if (all != info.checked) {
8684 dout(10) << " waiting for more peers to be active" << dendl;
8685 } else {
8686 dout(10) << " all MDS peers have been checked " << dendl;
8687 do_open_ino(ino, info, 0);
8688 }
8689 } else {
8690 info.checking = peer;
8691 vector<inode_backpointer_t> *pa = NULL;
8692 // got backtrace from peer or backtrace just fetched
8693 if (info.discover || !info.fetch_backtrace)
8694 pa = &info.ancestors;
8695 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8696 }
8697}
8698
8699void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8700{
8701 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8702 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8703 m->put();
8704 return;
8705 }
8706
8707 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8708
8709 inodeno_t ino = m->ino;
8710 MMDSOpenInoReply *reply;
8711 CInode *in = get_inode(ino);
8712 if (in) {
8713 dout(10) << " have " << *in << dendl;
8714 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8715 if (in->is_auth()) {
8716 touch_inode(in);
8717 while (1) {
8718 CDentry *pdn = in->get_parent_dn();
8719 if (!pdn)
8720 break;
8721 CInode *diri = pdn->get_dir()->get_inode();
8722	reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
8723 in->inode.version));
8724 in = diri;
8725 }
8726 } else {
8727 reply->hint = in->authority().first;
8728 }
8729 } else if (err < 0) {
8730 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8731 } else {
8732 mds_rank_t hint = MDS_RANK_NONE;
8733 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8734 if (ret > 0)
8735 return;
8736 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8737 }
8738 m->get_connection()->send_message(reply);
8739 m->put();
8740}
8741
8742void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8743{
8744 dout(10) << "handle_open_ino_reply " << *m << dendl;
8745
8746 inodeno_t ino = m->ino;
8747 mds_rank_t from = mds_rank_t(m->get_source().num());
8748 auto it = opening_inodes.find(ino);
8749 if (it != opening_inodes.end() && it->second.checking == from) {
8750 open_ino_info_t& info = it->second;
8751 info.checking = MDS_RANK_NONE;
8752 info.checked.insert(from);
8753
8754 CInode *in = get_inode(ino);
8755 if (in) {
8756 dout(10) << " found cached " << *in << dendl;
8757 open_ino_finish(ino, info, in->authority().first);
8758 } else if (!m->ancestors.empty()) {
8759 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8760 if (!info.want_replica) {
8761 open_ino_finish(ino, info, from);
8762 m->put();
8763 return;
8764 }
8765
8766 info.ancestors = m->ancestors;
8767 info.auth_hint = from;
8768 info.checking = mds->get_nodeid();
8769 info.discover = true;
8770 _open_ino_traverse_dir(ino, info, 0);
8771 } else if (m->error) {
8772 dout(10) << " error " << m->error << " from mds." << from << dendl;
8773 do_open_ino(ino, info, m->error);
8774 } else {
8775 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8776 info.auth_hint = m->hint;
8777 info.checked.erase(m->hint);
8778 }
8779 do_open_ino_peer(ino, info);
8780 }
8781 }
8782 m->put();
8783}
8784
8785void MDCache::kick_open_ino_peers(mds_rank_t who)
8786{
8787 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8788
8789 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8790 p != opening_inodes.end();
8791 ++p) {
8792 open_ino_info_t& info = p->second;
8793 if (info.checking == who) {
8794 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8795 info.checking = MDS_RANK_NONE;
8796 do_open_ino_peer(p->first, info);
8797 } else if (info.checking == MDS_RANK_NONE) {
8798 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8799 do_open_ino_peer(p->first, info);
8800 }
8801 }
8802}
8803
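/*
 * Entry point for loading an inode by number. Concurrent open_ino() calls
 * for the same ino are merged through the opening_inodes map: later callers
 * only append their completion context to info.waiters (possibly upgrading
 * want_replica/want_xlocked). The waiters are eventually completed by
 * open_ino_finish() with the authoritative mds rank (>= 0) on success, or a
 * negative error code on failure.
 */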
8804void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8805 bool want_replica, bool want_xlocked)
8806{
8807 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8808 << want_replica << dendl;
8809
8810 if (opening_inodes.count(ino)) {
8811 open_ino_info_t& info = opening_inodes[ino];
8812 if (want_replica) {
8813 info.want_replica = true;
8814 if (want_xlocked && !info.want_xlocked) {
8815 if (!info.ancestors.empty()) {
8816 CInode *diri = get_inode(info.ancestors[0].dirino);
8817 if (diri) {
8818 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8819 CDir *dir = diri->get_dirfrag(fg);
8820 if (dir && !dir->is_auth()) {
8821 filepath path(info.ancestors[0].dname, 0);
8822 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8823 }
8824 }
8825 }
8826 info.want_xlocked = true;
8827 }
8828 }
8829 info.waiters.push_back(fin);
8830 } else {
8831 open_ino_info_t& info = opening_inodes[ino];
8832 info.want_replica = want_replica;
8833 info.want_xlocked = want_xlocked;
8834 info.tid = ++open_ino_last_tid;
8835 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8836 info.waiters.push_back(fin);
8837 do_open_ino(ino, info, 0);
8838 }
8839}
8840
8841/* ---------------------------- */
8842
8843/*
8844 * search for a given inode on MDS peers. optionally start with the given node.
8845
8846
8847 TODO
8848 - recover from mds node failure, recovery
8849 - traverse path
8850
8851 */
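/*
 * Rough usage sketch (names are placeholders): a caller that only knows an
 * inode number and wants to be re-driven once a peer has told us about it
 * might do something like
 *
 *   mdcache->find_ino_peers(ino, new C_MDS_RetryRequest(mdcache, mdr));
 *
 * where `ino` and `mdr` belong to the caller. The context is completed
 * (with 0) once get_inode(ino) succeeds after a peer reply, or with -ESTALE
 * if every active peer has been asked without success.
 */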
8852void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8853{
8854 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8855 CInode *in = get_inode(ino);
8856 if (in && in->state_test(CInode::STATE_PURGING)) {
8857 c->complete(-ESTALE);
8858 return;
8859 }
8860 assert(!in);
8861
8862 ceph_tid_t tid = ++find_ino_peer_last_tid;
8863 find_ino_peer_info_t& fip = find_ino_peer[tid];
8864 fip.ino = ino;
8865 fip.tid = tid;
8866 fip.fin = c;
8867 fip.hint = hint;
8868 _do_find_ino_peer(fip);
8869}
8870
8871void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8872{
8873 set<mds_rank_t> all, active;
8874 mds->mdsmap->get_mds_set(all);
8875 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8876
8877 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8878 << " active " << active << " all " << all
8879 << " checked " << fip.checked
8880 << dendl;
8881
8882 mds_rank_t m = MDS_RANK_NONE;
8883 if (fip.hint >= 0) {
8884 m = fip.hint;
8885 fip.hint = MDS_RANK_NONE;
8886 } else {
8887 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8888 if (*p != mds->get_nodeid() &&
8889 fip.checked.count(*p) == 0) {
8890 m = *p;
8891 break;
8892 }
8893 }
8894 if (m == MDS_RANK_NONE) {
8895 all.erase(mds->get_nodeid());
8896 if (all != fip.checked) {
8897 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8898 } else {
8899 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8900 fip.fin->complete(-ESTALE);
8901 find_ino_peer.erase(fip.tid);
8902 }
8903 } else {
8904 fip.checking = m;
8905 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8906 }
8907}
8908
8909void MDCache::handle_find_ino(MMDSFindIno *m)
8910{
8911 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8912 m->put();
8913 return;
8914 }
8915
8916 dout(10) << "handle_find_ino " << *m << dendl;
8917 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8918 CInode *in = get_inode(m->ino);
8919 if (in) {
8920 in->make_path(r->path);
8921 dout(10) << " have " << r->path << " " << *in << dendl;
8922 }
8923 m->get_connection()->send_message(r);
8924 m->put();
8925}
8926
8927
8928void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8929{
8930 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8931 if (p != find_ino_peer.end()) {
8932 dout(10) << "handle_find_ino_reply " << *m << dendl;
8933 find_ino_peer_info_t& fip = p->second;
8934
8935 // success?
8936 if (get_inode(fip.ino)) {
8937 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8938 mds->queue_waiter(fip.fin);
8939 find_ino_peer.erase(p);
8940 m->put();
8941 return;
8942 }
8943
8944 mds_rank_t from = mds_rank_t(m->get_source().num());
8945 if (fip.checking == from)
8946 fip.checking = MDS_RANK_NONE;
8947 fip.checked.insert(from);
8948
8949 if (!m->path.empty()) {
8950 // we got a path!
8951 vector<CDentry*> trace;
8952 MDRequestRef null_ref;
8953 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8954 if (r > 0)
8955 return;
8956 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8957 << ", retrying" << dendl;
8958 fip.checked.clear();
8959 _do_find_ino_peer(fip);
8960 } else {
8961 // nope, continue.
8962 _do_find_ino_peer(fip);
8963 }
8964 } else {
8965 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8966 }
8967 m->put();
8968}
8969
8970void MDCache::kick_find_ino_peers(mds_rank_t who)
8971{
8972 // find_ino_peers requests we should move on from
8973 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8974 p != find_ino_peer.end();
8975 ++p) {
8976 find_ino_peer_info_t& fip = p->second;
8977 if (fip.checking == who) {
8978 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8979 fip.checking = MDS_RANK_NONE;
8980 _do_find_ino_peer(fip);
8981 } else if (fip.checking == MDS_RANK_NONE) {
8982 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8983 _do_find_ino_peer(fip);
8984 }
8985 }
8986}
8987
8988/* ---------------------------- */
8989
8990int MDCache::get_num_client_requests()
8991{
8992 int count = 0;
8993 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8994 p != active_requests.end();
8995 ++p) {
8996 MDRequestRef& mdr = p->second;
8997 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8998 count++;
8999 }
9000 return count;
9001}
9002
9003/* This function takes over the reference to the passed Message */
9004MDRequestRef MDCache::request_start(MClientRequest *req)
9005{
9006 // did we win a forward race against a slave?
9007 if (active_requests.count(req->get_reqid())) {
9008 MDRequestRef& mdr = active_requests[req->get_reqid()];
9009 assert(mdr);
9010 if (mdr->is_slave()) {
9011 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9012 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9013 } else {
9014 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9015 req->put();
9016 }
9017 return MDRequestRef();
9018 }
9019
9020 // register new client request
9021 MDRequestImpl::Params params;
9022 params.reqid = req->get_reqid();
9023 params.attempt = req->get_num_fwd();
9024 params.client_req = req;
9025 params.initiated = req->get_recv_stamp();
9026 params.throttled = req->get_throttle_stamp();
9027 params.all_read = req->get_recv_complete_stamp();
9028 params.dispatched = req->get_dispatch_stamp();
9029
9030 MDRequestRef mdr =
9031 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9032 active_requests[params.reqid] = mdr;
9033 mdr->set_op_stamp(req->get_stamp());
9034 dout(7) << "request_start " << *mdr << dendl;
9035 return mdr;
9036}
9037
9038MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9039{
9040 int by = m->get_source().num();
9041 MDRequestImpl::Params params;
9042 params.reqid = ri;
9043 params.attempt = attempt;
9044 params.triggering_slave_req = m;
9045 params.slave_to = by;
9046 params.initiated = m->get_recv_stamp();
9047 params.throttled = m->get_throttle_stamp();
9048 params.all_read = m->get_recv_complete_stamp();
9049 params.dispatched = m->get_dispatch_stamp();
9050 MDRequestRef mdr =
9051 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9052 assert(active_requests.count(mdr->reqid) == 0);
9053 active_requests[mdr->reqid] = mdr;
9054 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9055 return mdr;
9056}
9057
9058MDRequestRef MDCache::request_start_internal(int op)
9059{
9060 MDRequestImpl::Params params;
9061 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9062 params.reqid.tid = mds->issue_tid();
9063 params.initiated = ceph_clock_now();
9064 params.internal_op = op;
9065 MDRequestRef mdr =
9066 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9067
9068 assert(active_requests.count(mdr->reqid) == 0);
9069 active_requests[mdr->reqid] = mdr;
9070 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9071 return mdr;
9072}
9073
9074MDRequestRef MDCache::request_get(metareqid_t rid)
9075{
9076 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9077 assert(p != active_requests.end());
9078 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9079 return p->second;
9080}
9081
9082void MDCache::request_finish(MDRequestRef& mdr)
9083{
9084 dout(7) << "request_finish " << *mdr << dendl;
9085 mdr->mark_event("finishing request");
9086
9087 // slave finisher?
9088 if (mdr->has_more() && mdr->more()->slave_commit) {
9089 Context *fin = mdr->more()->slave_commit;
9090 mdr->more()->slave_commit = 0;
9091 int ret;
9092 if (mdr->aborted) {
9093 mdr->aborted = false;
9094 ret = -1;
9095 mdr->more()->slave_rolling_back = true;
9096 } else {
9097 ret = 0;
9098 mdr->committing = true;
9099 }
9100 fin->complete(ret); // this must re-call request_finish.
9101 return;
9102 }
9103
9104 switch(mdr->internal_op) {
9105 case CEPH_MDS_OP_FRAGMENTDIR:
9106 logger->inc(l_mdss_ireq_fragmentdir);
9107 break;
9108 case CEPH_MDS_OP_EXPORTDIR:
9109 logger->inc(l_mdss_ireq_exportdir);
9110 break;
9111 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9112 logger->inc(l_mdss_ireq_enqueue_scrub);
9113 break;
9114 case CEPH_MDS_OP_FLUSH:
9115 logger->inc(l_mdss_ireq_flush);
9116 break;
9117 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9118 logger->inc(l_mdss_ireq_fragstats);
9119 break;
9120 case CEPH_MDS_OP_REPAIR_INODESTATS:
9121 logger->inc(l_mdss_ireq_inodestats);
9122 break;
9123 }
9124
9125 request_cleanup(mdr);
9126}
9127
9128
9129void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9130{
9131 mdr->mark_event("forwarding request");
9132 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9133 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9134 << *mdr->client_request << dendl;
9135 mds->forward_message_mds(mdr->client_request, who);
9136 mdr->client_request = 0;
9137 if (mds->logger) mds->logger->inc(l_mds_forward);
9138 } else if (mdr->internal_op >= 0) {
9139 dout(10) << "request_forward on internal op; cancelling" << dendl;
9140 mdr->internal_op_finish->complete(-EXDEV);
9141 } else {
9142 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9143 << " was from mds" << dendl;
9144 }
9145 request_cleanup(mdr);
9146}
9147
9148
9149void MDCache::dispatch_request(MDRequestRef& mdr)
9150{
9151 if (mdr->client_request) {
9152 mds->server->dispatch_client_request(mdr);
9153 } else if (mdr->slave_request) {
9154 mds->server->dispatch_slave_request(mdr);
9155 } else {
9156 switch (mdr->internal_op) {
9157 case CEPH_MDS_OP_FRAGMENTDIR:
9158 dispatch_fragment_dir(mdr);
9159 break;
9160 case CEPH_MDS_OP_EXPORTDIR:
9161 migrator->dispatch_export_dir(mdr, 0);
9162 break;
9163 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9164 enqueue_scrub_work(mdr);
9165 break;
9166 case CEPH_MDS_OP_FLUSH:
9167 flush_dentry_work(mdr);
9168 break;
9169 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9170 repair_dirfrag_stats_work(mdr);
9171 break;
9172 case CEPH_MDS_OP_REPAIR_INODESTATS:
9173 repair_inode_stats_work(mdr);
9174 break;
9175 default:
9176 ceph_abort();
9177 }
9178 }
9179}
9180
9181
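/*
 * Send an OP_FINISH slave request to every rank this mdr involved (marked as
 * an abort if the request was killed before committing), then strip the
 * xlocks and remote wrlocks held on objects we are not auth for. OP_FINISH
 * drops them implicitly on the remote side, so only the local bookkeeping is
 * removed here.
 */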
9182void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9183{
9184 if (!mdr->has_more())
9185 return;
9186
9187 // clean up slaves
9188 // (will implicitly drop remote dn pins)
9189 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9190 p != mdr->more()->slaves.end();
9191 ++p) {
9192 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9193 MMDSSlaveRequest::OP_FINISH);
9194
9195 if (mdr->killed && !mdr->committing) {
9196 r->mark_abort();
9197 } else if (mdr->more()->srcdn_auth_mds == *p &&
9198 mdr->more()->inode_import.length() > 0) {
9199 // information about rename imported caps
9200 r->inode_export.claim(mdr->more()->inode_import);
9201 }
9202
9203 mds->send_message_mds(r, *p);
9204 }
9205
9206 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9207 * implicitly. Note that we don't call the finishers -- there shouldn't
9208 * be any on a remote lock and the request finish wakes up all
9209 * the waiters anyway! */
9210 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9211 while (p != mdr->xlocks.end()) {
9212 if ((*p)->get_parent()->is_auth())
9213 ++p;
9214 else {
9215 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9216 << " on " << *(*p)->get_parent() << dendl;
9217 (*p)->put_xlock();
9218 mdr->locks.erase(*p);
9219 mdr->xlocks.erase(p++);
9220 }
9221 }
9222
9223 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9224 while (q != mdr->remote_wrlocks.end()) {
9225 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9226 << " on mds." << q->second
9227 << " on " << *(q->first)->get_parent() << dendl;
9228 mdr->locks.erase(q->first);
9229 mdr->remote_wrlocks.erase(q++);
9230 }
9231
9232 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9233 * leaving them in can cause double-notifies as
9234 * this function can get called more than once */
9235}
9236
9237void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9238{
9239 request_drop_foreign_locks(mdr);
9240 mds->locker->drop_non_rdlocks(mdr.get());
9241}
9242
9243void MDCache::request_drop_locks(MDRequestRef& mdr)
9244{
9245 request_drop_foreign_locks(mdr);
9246 mds->locker->drop_locks(mdr.get());
9247}
9248
9249void MDCache::request_cleanup(MDRequestRef& mdr)
9250{
9251 dout(15) << "request_cleanup " << *mdr << dendl;
9252
9253 if (mdr->has_more()) {
9254 if (mdr->more()->is_ambiguous_auth)
9255 mdr->clear_ambiguous_auth();
9256 if (!mdr->more()->waiting_for_finish.empty())
9257 mds->queue_waiters(mdr->more()->waiting_for_finish);
9258 }
9259
9260 request_drop_locks(mdr);
9261
9262 // drop (local) auth pins
9263 mdr->drop_local_auth_pins();
9264
9265 // drop stickydirs
9266 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9267 p != mdr->stickydirs.end();
9268 ++p)
9269 (*p)->put_stickydirs();
9270
9271 mds->locker->kick_cap_releases(mdr);
9272
9273 // drop cache pins
9274 mdr->drop_pins();
9275
9276 // remove from session
9277 mdr->item_session_request.remove_myself();
9278
9279 // remove from map
9280 active_requests.erase(mdr->reqid);
9281
9282 if (mds->logger)
9283 log_stat();
9284
9285 mdr->mark_event("cleaned up request");
9286}
9287
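/*
 * Abort a request. If slave requests are already in flight (witnessed or
 * waiting_on_slave non-empty) the mdr cannot simply be torn down: it is
 * either flagged aborted (when locking has not finished) or left to proceed,
 * and only detached from its session. Otherwise it is marked killed and
 * cleaned up immediately unless it is already committing.
 */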
9288void MDCache::request_kill(MDRequestRef& mdr)
9289{
9290  // rolling back slave requests is tricky. just let the request proceed.
9291  if (mdr->has_more() &&
9292      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9293 if (!mdr->done_locking) {
9294 assert(mdr->more()->witnessed.empty());
9295 mdr->aborted = true;
9296 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9297 } else {
9298 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9299 }
9300
9301 assert(mdr->used_prealloc_ino == 0);
9302 assert(mdr->prealloc_inos.empty());
9303
9304 mdr->session = NULL;
9305 mdr->item_session_request.remove_myself();
9306 return;
9307 }
9308
9309 mdr->killed = true;
9310 mdr->mark_event("killing request");
9311
9312 if (mdr->committing) {
9313 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9314 } else {
9315 dout(10) << "request_kill " << *mdr << dendl;
9316 request_cleanup(mdr);
9317 }
9318}
9319
9320// -------------------------------------------------------------------------------
9321// SNAPREALMS
9322
9323struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9324 MDRequestRef mdr;
9325 MutationRef mut;
9326 CInode *in;
9327 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9328 MutationRef& mu, CInode *i) :
9329 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9330 void finish(int r) override {
9331 mdcache->_snaprealm_create_finish(mdr, mut, in);
9332 }
9333};
9334
9335void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9336{
9337 dout(10) << "snaprealm_create " << *in << dendl;
9338 assert(!in->snaprealm);
9339
9340 // allocate an id..
9341 if (!mdr->more()->stid) {
9342 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9343 new C_MDS_RetryRequest(this, mdr));
9344 return;
9345 }
9346
9347 MutationRef mut(new MutationImpl());
9348 mut->ls = mds->mdlog->get_current_segment();
9349 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9350 mds->mdlog->start_entry(le);
9351
9352 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9353
9354 auto &pi = in->project_inode(false, true);
9355 pi.inode.version = in->pre_dirty();
9356 pi.inode.rstat.rsnaprealms++;
9357
9358 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9359 snapid_t seq;
9360 ::decode(seq, p);
9361
9362 auto &newsnap = *pi.snapnode;
9363 newsnap.created = seq;
9364 newsnap.seq = seq;
9365 newsnap.last_created = seq;
9366
9367 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9368 journal_cow_inode(mut, &le->metablob, in);
9369 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9370
9371 mds->server->submit_mdlog_entry(le,
9372 new C_MDC_snaprealm_create_finish(this, mdr,
9373 mut, in),
9374 mdr, __func__);
9375 mds->mdlog->flush();
9376}
9377
9378
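/*
 * Walk the snaprealm hierarchy rooted at in->snaprealm. Every realm (and,
 * for UPDATE/DESTROY, past children as well) has its cached snap set
 * invalidated, and one MClientSnap per client holding caps in the tree is
 * assembled and sent unless nosend is set. For a SPLIT, the inodes and child
 * realms being split out are listed so clients can move caps to the new
 * realm; for DESTROY, stray inodes under past children are re-evaluated.
 */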
9379void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9380{
9381 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9382
9383 vector<inodeno_t> split_inos;
9384 vector<inodeno_t> split_realms;
9385
9386 if (snapop == CEPH_SNAP_OP_SPLIT) {
9387 // notify clients of update|split
9388 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9389 !p.end(); ++p)
9390 split_inos.push_back((*p)->ino());
9391
9392 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9393 p != in->snaprealm->open_children.end();
9394 ++p)
9395 split_realms.push_back((*p)->inode->ino());
9396 }
9397
9398 bufferlist snapbl;
9399 in->snaprealm->build_snap_trace(snapbl);
9400
9401 set<SnapRealm*> past_children;
9402 map<client_t, MClientSnap*> updates;
9403 list<SnapRealm*> q;
9404 q.push_back(in->snaprealm);
9405 while (!q.empty()) {
9406 SnapRealm *realm = q.front();
9407 q.pop_front();
9408
9409 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9410 realm->invalidate_cached_snaps();
9411
9412 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9413 p != realm->client_caps.end();
9414 ++p) {
9415 assert(!p->second->empty());
9416 if (!nosend && updates.count(p->first) == 0) {
9417 MClientSnap *update = new MClientSnap(snapop);
9418 update->head.split = in->ino();
9419 update->split_inos = split_inos;
9420 update->split_realms = split_realms;
9421 update->bl = snapbl;
9422 updates[p->first] = update;
9423 }
9424 }
9425
9426 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9427 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9428 p != realm->open_past_children.end();
9429 ++p)
9430 past_children.insert(*p);
9431 }
9432
9433 // notify for active children, too.
9434 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9435 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9436 p != realm->open_children.end();
9437 ++p)
9438 q.push_back(*p);
9439 }
9440
9441 if (!nosend)
9442 send_snaps(updates);
9443
9444 // notify past children and their descendants if we update/delete old snapshots
9445 for (set<SnapRealm*>::iterator p = past_children.begin();
9446 p != past_children.end();
9447 ++p)
9448 q.push_back(*p);
9449
9450 while (!q.empty()) {
9451 SnapRealm *realm = q.front();
9452 q.pop_front();
9453
9454 realm->invalidate_cached_snaps();
9455
9456 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9457 p != realm->open_children.end();
9458 ++p) {
9459 if (past_children.count(*p) == 0)
9460 q.push_back(*p);
9461 }
9462
9463 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9464 p != realm->open_past_children.end();
9465 ++p) {
9466 if (past_children.count(*p) == 0) {
9467 q.push_back(*p);
9468 past_children.insert(*p);
9469 }
9470 }
9471 }
9472
9473 if (snapop == CEPH_SNAP_OP_DESTROY) {
9474 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9475 for (set<SnapRealm*>::iterator p = past_children.begin();
9476 p != past_children.end();
9477 ++p)
9478 maybe_eval_stray((*p)->inode, true);
9479 }
9480}
9481
9482void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9483{
9484 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9485
9486 // apply
9487 in->pop_and_dirty_projected_inode(mut->ls);
9488 mut->apply();
9489 mds->locker->drop_locks(mut.get());
9490 mut->cleanup();
9491
9492 // tell table we've committed
9493 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9494
9495 // create
9496 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9497 snapid_t seq;
9498 ::decode(seq, p);
9499
9500 in->open_snaprealm();
9501 in->snaprealm->srnode.seq = seq;
9502 in->snaprealm->srnode.created = seq;
9503 bool ok = in->snaprealm->_open_parents(NULL);
9504 assert(ok);
9505
9506 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9507
9508 /*
9509 static int count = 5;
9510 if (--count == 0)
9511 ceph_abort(); // hack test test **********
9512 */
9513
9514 // done.
9515 mdr->more()->stid = 0; // caller will likely need to reuse this
9516 dispatch_request(mdr);
9517}
9518
9519
9520// -------------------------------------------------------------------------------
9521// STRAYS
9522
9523struct C_MDC_RetryScanStray : public MDCacheContext {
9524 dirfrag_t next;
9525 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9526 void finish(int r) override {
9527 mdcache->scan_stray_dir(next);
9528 }
9529};
9530
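/*
 * Walk the dirfrags of the stray directories, starting at `next` (the resume
 * point when an incomplete dirfrag forces a fetch and a retry via
 * C_MDC_RetryScanStray). Each dentry is tagged STATE_STRAY; primary inodes
 * with nlink == 0 are marked ORPHAN and handed to maybe_eval_stray() for
 * possible purging.
 */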
9531void MDCache::scan_stray_dir(dirfrag_t next)
9532{
9533 dout(10) << "scan_stray_dir " << next << dendl;
9534
9535 list<CDir*> ls;
9536 for (int i = 0; i < NUM_STRAY; ++i) {
9537 if (strays[i]->ino() < next.ino)
9538 continue;
9539 strays[i]->get_dirfrags(ls);
9540 }
9541
9542 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9543 CDir *dir = *p;
9544 if (dir->dirfrag() < next)
9545 continue;
9546 if (!dir->is_complete()) {
9547 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9548 return;
9549 }
9550 for (auto &p : dir->items) {
9551 CDentry *dn = p.second;
9552 dn->state_set(CDentry::STATE_STRAY);
9553 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9554 if (dnl->is_primary()) {
9555 CInode *in = dnl->get_inode();
9556 if (in->inode.nlink == 0)
9557 in->state_set(CInode::STATE_ORPHAN);
9558 maybe_eval_stray(in);
9559 }
9560 }
9561 }
9562}
9563
9564void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9565{
9566 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9567 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9568}
9569
9570
9571
9572
9573
9574// ========================================================================================
9575// DISCOVER
9576/*
9577
9578 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9579 to the parent metadata object in the cache (pinning it).
9580
9581 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9582
9583*/
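/*
 * Rough flow sketch: the requesting side registers a discover_info_t keyed
 * by tid and a waiter on the parent object; discover_path() does
 * approximately
 *
 *   discover_info_t& d = _create_discover(from);  // fresh tid, tracked in discovers
 *   d.pin_base(base);                             // pin the parent in cache
 *   // ... fill in ino/frag/snap/want_path/want_xlocked ...
 *   _send_discover(d);                            // MDiscover to d.mds
 *   base->add_dir_waiter(fg, onfinish);           // woken by the reply
 *
 * handle_discover_reply() erases the tid (so duplicate replies are ignored),
 * adds the replicas it carries, and queues the waiters.
 */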
9584
9585void MDCache::_send_discover(discover_info_t& d)
9586{
9587 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9588 d.want_base_dir, d.want_xlocked);
9589 dis->set_tid(d.tid);
9590 mds->send_message_mds(dis, d.mds);
9591}
9592
9593void MDCache::discover_base_ino(inodeno_t want_ino,
9594 MDSInternalContextBase *onfinish,
9595 mds_rank_t from)
9596{
9597 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9598 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9599 discover_info_t& d = _create_discover(from);
9600 d.ino = want_ino;
9601 _send_discover(d);
9602 }
9603 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9604}
9605
9606
9607void MDCache::discover_dir_frag(CInode *base,
9608 frag_t approx_fg,
9609 MDSInternalContextBase *onfinish,
9610 mds_rank_t from)
9611{
9612 if (from < 0)
9613 from = base->authority().first;
9614
9615 dirfrag_t df(base->ino(), approx_fg);
9616 dout(7) << "discover_dir_frag " << df
9617 << " from mds." << from << dendl;
9618
9619 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9620 discover_info_t& d = _create_discover(from);
9621 d.pin_base(base);
9622 d.ino = base->ino();
9623 d.frag = approx_fg;
9624 d.want_base_dir = true;
9625 _send_discover(d);
9626 }
9627
9628 if (onfinish)
9629 base->add_dir_waiter(approx_fg, onfinish);
9630}
9631
9632struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9633 CInode *base;
9634 snapid_t snapid;
9635 filepath path;
9636 mds_rank_t from;
9637 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9638 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9639 void finish(int r) override {
9640 mdcache->discover_path(base, snapid, path, 0, from);
9641 }
9642};
9643
9644void MDCache::discover_path(CInode *base,
9645 snapid_t snap,
9646 filepath want_path,
9647 MDSInternalContextBase *onfinish,
9648 bool want_xlocked,
9649 mds_rank_t from)
9650{
9651 if (from < 0)
9652 from = base->authority().first;
9653
9654 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9655 << (want_xlocked ? " want_xlocked":"")
9656 << dendl;
9657
9658 if (base->is_ambiguous_auth()) {
9659 dout(10) << " waiting for single auth on " << *base << dendl;
9660 if (!onfinish)
9661 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9662 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9663 return;
9664 } else if (from == mds->get_nodeid()) {
9665 list<MDSInternalContextBase*> finished;
9666 base->take_waiting(CInode::WAIT_DIR, finished);
9667 mds->queue_waiters(finished);
9668 return;
9669 }
9670
9671 frag_t fg = base->pick_dirfrag(want_path[0]);
9672 if ((want_xlocked && want_path.depth() == 1) ||
9673 !base->is_waiting_for_dir(fg) || !onfinish) {
9674 discover_info_t& d = _create_discover(from);
9675 d.ino = base->ino();
9676 d.pin_base(base);
9677 d.frag = fg;
9678 d.snap = snap;
9679 d.want_path = want_path;
9680 d.want_base_dir = true;
9681 d.want_xlocked = want_xlocked;
9682 _send_discover(d);
9683 }
9684
9685 // register + wait
9686 if (onfinish)
9687 base->add_dir_waiter(fg, onfinish);
9688}
9689
9690struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9691 CDir *base;
9692 snapid_t snapid;
9693 filepath path;
9694 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9695 MDCacheContext(c), base(b), snapid(s), path(p) {}
9696 void finish(int r) override {
9697 mdcache->discover_path(base, snapid, path, 0);
9698 }
9699};
9700
9701void MDCache::discover_path(CDir *base,
9702 snapid_t snap,
9703 filepath want_path,
9704 MDSInternalContextBase *onfinish,
9705 bool want_xlocked)
9706{
9707 mds_rank_t from = base->authority().first;
9708
9709 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9710 << (want_xlocked ? " want_xlocked":"")
9711 << dendl;
9712
9713 if (base->is_ambiguous_auth()) {
9714 dout(7) << " waiting for single auth on " << *base << dendl;
9715 if (!onfinish)
9716 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9717 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9718 return;
9719 } else if (from == mds->get_nodeid()) {
9720 list<MDSInternalContextBase*> finished;
9721 base->take_sub_waiting(finished);
9722 mds->queue_waiters(finished);
9723 return;
9724 }
9725
9726 if ((want_xlocked && want_path.depth() == 1) ||
9727 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9728 discover_info_t& d = _create_discover(from);
9729 d.ino = base->ino();
9730    d.pin_base(base->inode);
9731 d.frag = base->get_frag();
9732 d.snap = snap;
9733 d.want_path = want_path;
9734 d.want_base_dir = false;
9735 d.want_xlocked = want_xlocked;
9736 _send_discover(d);
9737 }
9738
9739 // register + wait
9740 if (onfinish)
9741 base->add_dentry_waiter(want_path[0], snap, onfinish);
9742}
9743
9744void MDCache::kick_discovers(mds_rank_t who)
9745{
9746 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9747 p != discovers.end();
9748 ++p) {
9749 if (p->second.mds != who)
9750 continue;
9751 _send_discover(p->second);
9752 }
9753}
9754
9755
9756/* This function DOES put the passed message before returning */
9757void MDCache::handle_discover(MDiscover *dis)
9758{
9759 mds_rank_t whoami = mds->get_nodeid();
9760 mds_rank_t from = mds_rank_t(dis->get_source().num());
9761
9762 assert(from != whoami);
9763
9764 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9765 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9766	mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9767 dis->put();
9768 return;
9769 }
9770
9771    // proceed if the requester is in the REJOIN stage, i.e. the request is from parallel_fetch().
9772    // delay processing requests from survivors because we may not yet have chosen lock states.
9773 if (!mds->mdsmap->is_rejoin(from)) {
9774 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9775 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9776 return;
9777 }
9778 }
9779
9780
9781 CInode *cur = 0;
9782 MDiscoverReply *reply = new MDiscoverReply(dis);
9783
9784 snapid_t snapid = dis->get_snapid();
9785
9786 // get started.
9787 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9788 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9789 // wants root
9790 dout(7) << "handle_discover from mds." << from
9791 << " wants base + " << dis->get_want().get_path()
9792 << " snap " << snapid
9793 << dendl;
9794
9795 cur = get_inode(dis->get_base_ino());
9796 assert(cur);
9797
9798 // add root
9799 reply->starts_with = MDiscoverReply::INODE;
9800 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9801 dout(10) << "added base " << *cur << dendl;
9802 }
9803 else {
9804 // there's a base inode
9805 cur = get_inode(dis->get_base_ino(), snapid);
9806 if (!cur && snapid != CEPH_NOSNAP) {
9807 cur = get_inode(dis->get_base_ino());
9808 if (cur && !cur->is_multiversion())
9809 cur = NULL; // nope!
9810 }
9811
9812 if (!cur) {
9813 dout(7) << "handle_discover mds." << from
9814 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9815 << dendl;
9816 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9817 reply->set_error_dentry(dis->get_dentry(0));
9818 reply->set_flag_error_dir();
9819 } else if (dis->wants_base_dir()) {
9820 dout(7) << "handle_discover mds." << from
9821 << " wants basedir+" << dis->get_want().get_path()
9822 << " has " << *cur
9823 << dendl;
9824 } else {
9825 dout(7) << "handle_discover mds." << from
9826 << " wants " << dis->get_want().get_path()
9827 << " has " << *cur
9828 << dendl;
9829 }
9830 }
9831
9832 assert(reply);
9833
9834 // add content
9835 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9836 for (unsigned i = 0;
9837 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9838 i++) {
9839
9840 // -- figure out the dir
9841
9842 // is *cur even a dir at all?
9843 if (!cur->is_dir()) {
9844 dout(7) << *cur << " not a dir" << dendl;
9845 reply->set_flag_error_dir();
9846 break;
9847 }
9848
9849 // pick frag
9850 frag_t fg;
9851 if (dis->get_want().depth()) {
9852 // dentry specifies
9853 fg = cur->pick_dirfrag(dis->get_dentry(i));
9854 } else {
9855      // requester explicitly specified the frag
9856 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9857 fg = dis->get_base_dir_frag();
9858 if (!cur->dirfragtree.is_leaf(fg))
9859 fg = cur->dirfragtree[fg.value()];
9860 }
9861 CDir *curdir = cur->get_dirfrag(fg);
9862
9863 if ((!curdir && !cur->is_auth()) ||
9864 (curdir && !curdir->is_auth())) {
9865
9866 /* before:
9867 * ONLY set flag if empty!!
9868 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9869 * resulting in duplicate discovers in flight,
9870 * which can wreak havoc when discovering rename srcdn (which may move)
9871 */
9872
9873 if (reply->is_empty()) {
9874 // only hint if empty.
9875 // someday this could be better, but right now the waiter logic isn't smart enough.
9876
9877 // hint
9878 if (curdir) {
9879 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9880 reply->set_dir_auth_hint(curdir->authority().first);
9881 } else {
9882 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9883 << *cur << dendl;
9884 reply->set_dir_auth_hint(cur->authority().first);
9885 }
9886
9887 // note error dentry, if any
9888 // NOTE: important, as it allows requester to issue an equivalent discover
9889 // to whomever we hint at.
9890 if (dis->get_want().depth() > i)
9891 reply->set_error_dentry(dis->get_dentry(i));
9892 }
9893
9894 break;
9895 }
9896
9897 if (!curdir) { // open dir?
9898 if (cur->is_frozen()) {
9899 if (!reply->is_empty()) {
9900 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9901 break;
9902 }
9903 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9904 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9905 reply->put();
9906 return;
9907 }
9908 curdir = cur->get_or_open_dirfrag(this, fg);
9909 } else if (curdir->is_frozen_tree() ||
9910 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9911 if (!reply->is_empty()) {
9912 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9913 break;
9914 }
9915 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9916 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9917 reply->set_flag_error_dir();
9918 break;
9919 }
9920 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9921 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9922 reply->put();
9923 return;
9924 }
9925
9926 // add dir
9927 if (curdir->get_version() == 0) {
9928 // fetch newly opened dir
9929 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9930 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9931      // make sure the base frag is correct, though, in case there was a refragment since the
9932 // original request was sent.
9933 reply->set_base_dir_frag(curdir->get_frag());
9934 } else {
9935 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9936 if (!reply->trace.length())
9937 reply->starts_with = MDiscoverReply::DIR;
9938 replicate_dir(curdir, from, reply->trace);
9939 dout(7) << "handle_discover added dir " << *curdir << dendl;
9940 }
9941
9942 // lookup
9943 CDentry *dn = 0;
9944 if (curdir->get_version() == 0) {
9945 // fetch newly opened dir
9946      assert(!curdir->has_bloom());
9947 } else if (dis->get_want().depth() > 0) {
9948 // lookup dentry
9949 dn = curdir->lookup(dis->get_dentry(i), snapid);
9950 } else
9951 break; // done!
9952
9953 // incomplete dir?
9954 if (!dn) {
9955 if (!curdir->is_complete() &&
9956 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9957 // readdir
9958 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9959 if (reply->is_empty()) {
9960 // fetch and wait
9961 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9962 dis->wants_base_dir() && curdir->get_version() == 0);
9963 reply->put();
9964 return;
9965 } else {
9966 // initiate fetch, but send what we have so far
9967 curdir->fetch(0);
9968 break;
9969 }
9970 }
9971
9972 // send null dentry
9973 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9974 << *curdir << dendl;
9975 dn = curdir->add_null_dentry(dis->get_dentry(i));
9976 }
9977 assert(dn);
9978
9979 // don't add replica to purging dentry/inode
9980 if (dn->state_test(CDentry::STATE_PURGING)) {
9981 if (reply->is_empty())
9982 reply->set_flag_error_dn(dis->get_dentry(i));
9983 break;
9984 }
9985
9986 CDentry::linkage_t *dnl = dn->get_linkage();
9987
9988 // xlocked dentry?
9989 // ...always block on non-tail items (they are unrelated)
9990    // ...allow xlocked tail discovery _only_ if explicitly requested
9991 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9992 if (dn->lock.is_xlocked()) {
9993 // is this the last (tail) item in the discover traversal?
9994 if (tailitem && dis->wants_xlocked()) {
9995 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9996 } else if (reply->is_empty()) {
9997 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9998 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9999 reply->put();
10000 return;
10001 } else {
10002 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10003 break;
10004 }
10005 }
10006
10007 // frozen inode?
10008 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10009 if (tailitem && dis->wants_xlocked()) {
10010 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10011 } else if (reply->is_empty()) {
10012 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10013 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10014 reply->put();
10015 return;
10016 } else {
10017 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10018 break;
10019 }
10020 }
10021
10022 // add dentry
10023 if (!reply->trace.length())
10024 reply->starts_with = MDiscoverReply::DENTRY;
10025 replicate_dentry(dn, from, reply->trace);
10026 dout(7) << "handle_discover added dentry " << *dn << dendl;
10027
10028 if (!dnl->is_primary()) break; // stop on null or remote link.
10029
10030 // add inode
10031 CInode *next = dnl->get_inode();
10032 assert(next->is_auth());
10033
10034 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10035 dout(7) << "handle_discover added inode " << *next << dendl;
10036
10037 // descend, keep going.
10038 cur = next;
10039 continue;
10040 }
10041
10042 // how did we do?
10043 assert(!reply->is_empty());
10044 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10045 mds->send_message(reply, dis->get_connection());
10046
10047 dis->put();
10048}
10049
10050/* This function DOES put the passed message before returning */
10051void MDCache::handle_discover_reply(MDiscoverReply *m)
10052{
10053 /*
10054 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10055 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10056 m->put();
10057 return;
10058 }
10059 */
10060 dout(7) << "discover_reply " << *m << dendl;
10061 if (m->is_flag_error_dir())
10062 dout(7) << " flag error, dir" << dendl;
10063 if (m->is_flag_error_dn())
10064 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10065
10066 list<MDSInternalContextBase*> finished, error;
10067 mds_rank_t from = mds_rank_t(m->get_source().num());
10068
10069 // starting point
10070 CInode *cur = get_inode(m->get_base_ino());
10071 bufferlist::iterator p = m->trace.begin();
10072
10073 int next = m->starts_with;
10074
10075 // decrement discover counters
10076 if (m->get_tid()) {
10077 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10078 if (p != discovers.end()) {
10079 dout(10) << " found tid " << m->get_tid() << dendl;
10080 discovers.erase(p);
10081 } else {
10082 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10083 }
10084 }
10085
10086 // discover may start with an inode
10087 if (!p.end() && next == MDiscoverReply::INODE) {
10088 cur = add_replica_inode(p, NULL, finished);
10089 dout(7) << "discover_reply got base inode " << *cur << dendl;
10090 assert(cur->is_base());
10091
10092 next = MDiscoverReply::DIR;
10093
10094 // take waiters?
10095 if (cur->is_base() &&
10096 waiting_for_base_ino[from].count(cur->ino())) {
10097 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10098 waiting_for_base_ino[from].erase(cur->ino());
10099 }
10100 }
10101 assert(cur);
10102
10103 // loop over discover results.
10104 // indexes follow each ([[dir] dentry] inode)
10105 // can start, end with any type.
10106 while (!p.end()) {
10107 // dir
10108 frag_t fg;
10109 CDir *curdir = 0;
10110 if (next == MDiscoverReply::DIR) {
10111 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10112 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10113 assert(m->get_wanted_base_dir());
10114 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10115 }
10116 } else {
10117      // note: this can only happen on our first pass around this loop.
10118 if (p.end() && m->is_flag_error_dn()) {
10119 fg = cur->pick_dirfrag(m->get_error_dentry());
10120 curdir = cur->get_dirfrag(fg);
10121 } else
10122 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10123 }
10124
10125 if (p.end())
10126 break;
10127
10128 // dentry
10129 CDentry *dn = add_replica_dentry(p, curdir, finished);
10130
10131 if (p.end())
10132 break;
10133
10134 // inode
10135 cur = add_replica_inode(p, dn, finished);
10136
10137 next = MDiscoverReply::DIR;
10138 }
10139
10140 // dir error?
10141 // or dir_auth hint?
10142 if (m->is_flag_error_dir() && !cur->is_dir()) {
10143 // not a dir.
10144 cur->take_waiting(CInode::WAIT_DIR, error);
10145 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10146 mds_rank_t who = m->get_dir_auth_hint();
10147 if (who == mds->get_nodeid()) who = -1;
10148 if (who >= 0)
10149 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10150
10151
10152 if (m->get_wanted_base_dir()) {
10153 frag_t fg = m->get_base_dir_frag();
10154 CDir *dir = cur->get_dirfrag(fg);
10155
10156 if (cur->is_waiting_for_dir(fg)) {
10157 if (cur->is_auth())
10158 cur->take_waiting(CInode::WAIT_DIR, finished);
10159 else if (dir || !cur->dirfragtree.is_leaf(fg))
10160 cur->take_dir_waiting(fg, finished);
10161 else
10162 discover_dir_frag(cur, fg, 0, who);
10163 } else
10164 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10165 }
10166
10167 // try again?
10168 if (m->get_error_dentry().length()) {
10169 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10170 CDir *dir = cur->get_dirfrag(fg);
10171 // wanted a dentry
10172 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10173 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10174 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10175 m->get_wanted_snapid(), finished);
10176 } else {
10177 filepath relpath(m->get_error_dentry(), 0);
10178 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10179 }
10180 } else
10181 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10182 << m->get_error_dentry() << dendl;
10183 }
10184 } else if (m->is_flag_error_dn()) {
10185 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10186 CDir *dir = cur->get_dirfrag(fg);
10187 if (dir) {
10188 if (dir->is_auth()) {
10189 dir->take_sub_waiting(finished);
10190 } else {
10191 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10192 m->get_wanted_snapid(), error);
10193 }
10194 }
10195 }
10196
10197 // waiters
10198 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10199 mds->queue_waiters(finished);
10200
10201 // done
10202 m->put();
10203}
10204
10205
10206
10207// ----------------------------
10208// REPLICAS
10209
10210
10211void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10212{
10213 dirfrag_t df = dir->dirfrag();
10214 ::encode(df, bl);
10215 dir->encode_replica(to, bl);
10216}
10217
10218void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10219{
10220  ::encode(dn->get_name(), bl);
10221 ::encode(dn->last, bl);
10222 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10223}
10224
10225void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10226 uint64_t features)
10227{
10228  ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
10229 ::encode(in->last, bl);
10230 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10231}
10232
10233CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10234 list<MDSInternalContextBase*>& finished)
10235{
10236 dirfrag_t df;
10237 ::decode(df, p);
10238
10239 assert(diri->ino() == df.ino);
10240
10241 // add it (_replica_)
10242 CDir *dir = diri->get_dirfrag(df.frag);
10243
10244 if (dir) {
10245 // had replica. update w/ new nonce.
10246 dir->decode_replica(p);
10247 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10248 } else {
10249 // force frag to leaf in the diri tree
10250 if (!diri->dirfragtree.is_leaf(df.frag)) {
10251 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10252 << diri->dirfragtree << dendl;
10253 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10254 }
10255
10256 // add replica.
10257 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10258 dir->decode_replica(p);
10259
10260 // is this a dir_auth delegation boundary?
10261 if (from != diri->authority().first ||
10262 diri->is_ambiguous_auth() ||
10263 diri->is_base())
10264 adjust_subtree_auth(dir, from);
10265
10266 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10267
10268 // get waiters
10269 diri->take_dir_waiting(df.frag, finished);
10270 }
10271
10272 return dir;
10273}
10274
10275CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10276{
10277 string name;
10278 snapid_t last;
10279 ::decode(name, p);
10280 ::decode(last, p);
10281
10282 CDentry *dn = dir->lookup(name, last);
10283
10284 // have it?
10285 if (dn) {
10286 dn->decode_replica(p, false);
10287 dout(7) << "add_replica_dentry had " << *dn << dendl;
10288 } else {
10289 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10290 dn->decode_replica(p, true);
10291 dout(7) << "add_replica_dentry added " << *dn << dendl;
10292 }
10293
10294 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10295
10296 return dn;
10297}
10298
10299CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10300{
10301 inodeno_t ino;
10302 snapid_t last;
10303 ::decode(ino, p);
10304 ::decode(last, p);
10305 CInode *in = get_inode(ino, last);
10306 if (!in) {
10307 in = new CInode(this, false, 1, last);
10308 in->decode_replica(p, true);
10309 add_inode(in);
10310 if (in->ino() == MDS_INO_ROOT)
10311 in->inode_auth.first = 0;
10312 else if (in->is_mdsdir())
10313 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10314 dout(10) << "add_replica_inode added " << *in << dendl;
10315 if (dn) {
10316 assert(dn->get_linkage()->is_null());
10317 dn->dir->link_primary_inode(dn, in);
10318 }
10319 } else {
10320 in->decode_replica(p, false);
10321 dout(10) << "add_replica_inode had " << *in << dendl;
10322 }
10323
10324 if (dn) {
10325 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10326 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10327 }
10328
10329 return in;
10330}
10331
10332
10333void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10334{
10335 uint64_t features = mds->mdsmap->get_up_features();
10336 replicate_inode(get_myin(), who, bl, features);
10337 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10338 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10339 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10340 replicate_dir(straydn->get_dir(), who, bl);
10341 replicate_dentry(straydn, who, bl);
10342}
10343
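/*
 * Decode, in the order replicate_stray() encoded them, the chain
 * mdsdir inode -> mdsdir dirfrag -> stray-directory dentry -> stray-directory
 * inode -> stray dirfrag -> stray dentry, and return the replicated stray
 * dentry.
 */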
10344CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10345{
10346 list<MDSInternalContextBase*> finished;
10347 bufferlist::iterator p = bl.begin();
10348
10349 CInode *mdsin = add_replica_inode(p, NULL, finished);
10350 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10351 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10352 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10353 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10354 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10355 if (!finished.empty())
10356 mds->queue_waiters(finished);
10357
10358 return straydn;
10359}
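// Note (reader aid): the decode order above must stay in lockstep with the
// encode order in replicate_stray(): mdsdir inode, mdsdir dirfrag, stray-dir
// dentry, stray-dir inode, stray dirfrag, and finally the stray dentry
// itself.  Any change to one side has to be mirrored in the other.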
10360
10361
10362int MDCache::send_dir_updates(CDir *dir, bool bcast)
10363{
10364 // this is an FYI, re: replication
10365
10366 set<mds_rank_t> who;
10367 if (bcast) {
10368 mds->get_mds_map()->get_active_mds_set(who);
10369 } else {
181888fb
FG
10370 for (const auto &p : dir->get_replicas()) {
10371 who.insert(p.first);
10372 }
7c673cae
FG
10373 }
10374
10375 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10376
10377 filepath path;
10378 dir->inode->make_path(path);
10379
10380 mds_rank_t whoami = mds->get_nodeid();
10381 for (set<mds_rank_t>::iterator it = who.begin();
10382 it != who.end();
10383 ++it) {
10384 if (*it == whoami) continue;
10385 //if (*it == except) continue;
10386 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10387
94b18763
FG
10388 std::set<int32_t> s;
10389 for (const auto &r : dir->dir_rep_by) {
10390 s.insert(r);
10391 }
7c673cae
FG
10392 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10393 dir->dirfrag(),
10394 dir->dir_rep,
94b18763 10395 s,
7c673cae
FG
10396 path,
10397 bcast),
10398 *it);
10399 }
10400
10401 return 0;
10402}
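// With bcast the update goes to every active MDS, carrying the full path so
// that a receiver without the dirfrag can discover it; otherwise only the
// current replica holders are told.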
10403
10404/* This function DOES put the passed message before returning */
10405void MDCache::handle_dir_update(MDirUpdate *m)
10406{
224ce89b
WB
10407 dirfrag_t df = m->get_dirfrag();
10408 CDir *dir = get_dirfrag(df);
7c673cae 10409 if (!dir) {
224ce89b 10410 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10411
10412 // discover it?
10413 if (m->should_discover()) {
10414 // only try once!
10415 // this is key to avoid a fragtree update race, among other things.
224ce89b 10416 m->inc_tried_discover();
7c673cae
FG
10417 vector<CDentry*> trace;
10418 CInode *in;
10419 filepath path = m->get_path();
10420 dout(5) << "trying discover on dir_update for " << path << dendl;
10421 MDRequestRef null_ref;
10422 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10423 if (r > 0)
10424 return;
224ce89b
WB
10425 if (r == 0 &&
10426 in->ino() == df.ino &&
10427 in->get_approx_dirfrag(df.frag) == NULL) {
10428 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10429 return;
10430 }
7c673cae
FG
10431 }
10432
10433 m->put();
10434 return;
10435 }
10436
224ce89b
WB
10437 if (!m->has_tried_discover()) {
10438 // Update if it already exists. Otherwise it got updated by discover reply.
10439 dout(5) << "dir_update on " << *dir << dendl;
10440 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10441 dir->dir_rep_by.clear();
10442 for (const auto &e : m->get_dir_rep_by()) {
10443 dir->dir_rep_by.insert(e);
10444 }
224ce89b
WB
10445 }
10446
7c673cae
FG
10447 // done
10448 m->put();
10449}
10450
10451
10452
10453
10454
10455// LINK
10456
10457void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10458{
10459 dout(7) << "send_dentry_link " << *dn << dendl;
10460
10461 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10462 for (const auto &p : dn->get_replicas()) {
7c673cae 10463 // don't tell (rename) witnesses; they already know
181888fb 10464 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10465 continue;
181888fb
FG
10466 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10467 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10468 rejoin_gather.count(p.first)))
7c673cae
FG
10469 continue;
10470 CDentry::linkage_t *dnl = dn->get_linkage();
10471 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
94b18763 10472 dn->get_name(), dnl->is_primary());
7c673cae
FG
10473 if (dnl->is_primary()) {
10474 dout(10) << " primary " << *dnl->get_inode() << dendl;
181888fb 10475 replicate_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10476 mds->mdsmap->get_up_features());
10477 } else if (dnl->is_remote()) {
10478 inodeno_t ino = dnl->get_remote_ino();
10479 __u8 d_type = dnl->get_remote_d_type();
10480 dout(10) << " remote " << ino << " " << d_type << dendl;
10481 ::encode(ino, m->bl);
10482 ::encode(d_type, m->bl);
10483 } else
10484 ceph_abort(); // aie, bad caller!
181888fb 10485 mds->send_message_mds(m, p.first);
7c673cae
FG
10486 }
10487}
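// The message payload mirrors what handle_dentry_link() expects below: a full
// inode replica for primary links, or just (ino, d_type) for remote links.
// Peers that have not finished rejoin are skipped.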
10488
10489/* This function DOES put the passed message before returning */
10490void MDCache::handle_dentry_link(MDentryLink *m)
10491{
10492
10493 CDentry *dn = NULL;
10494 CDir *dir = get_dirfrag(m->get_dirfrag());
10495 if (!dir) {
10496 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10497 } else {
10498 dn = dir->lookup(m->get_dn());
10499 if (!dn) {
10500 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10501 } else {
10502 dout(7) << "handle_dentry_link on " << *dn << dendl;
10503 CDentry::linkage_t *dnl = dn->get_linkage();
10504
10505 assert(!dn->is_auth());
10506 assert(dnl->is_null());
10507 }
10508 }
10509
10510 bufferlist::iterator p = m->bl.begin();
10511 list<MDSInternalContextBase*> finished;
10512 if (dn) {
10513 if (m->get_is_primary()) {
10514 // primary link.
10515 add_replica_inode(p, dn, finished);
10516 } else {
10517 // remote link, easy enough.
10518 inodeno_t ino;
10519 __u8 d_type;
10520 ::decode(ino, p);
10521 ::decode(d_type, p);
10522 dir->link_remote_inode(dn, ino, d_type);
10523 }
10524 } else {
10525 ceph_abort();
10526 }
10527
10528 if (!finished.empty())
10529 mds->queue_waiters(finished);
10530
10531 m->put();
10532 return;
10533}
10534
10535
10536// UNLINK
10537
10538void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10539{
10540 dout(10) << "send_dentry_unlink " << *dn << dendl;
10541 // share unlink news with replicas
10542 set<mds_rank_t> replicas;
10543 dn->list_replicas(replicas);
10544 if (straydn)
10545 straydn->list_replicas(replicas);
10546 for (set<mds_rank_t>::iterator it = replicas.begin();
10547 it != replicas.end();
10548 ++it) {
10549 // don't tell (rmdir) witnesses; they already know
10550 if (mdr.get() && mdr->more()->witnessed.count(*it))
10551 continue;
10552
10553 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10554 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10555 rejoin_gather.count(*it)))
10556 continue;
10557
94b18763 10558 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
7c673cae
FG
10559 if (straydn)
10560 replicate_stray(straydn, *it, unlink->straybl);
10561 mds->send_message_mds(unlink, *it);
10562 }
10563}
10564
10565/* This function DOES put the passed message before returning */
10566void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10567{
10568 // straydn
10569 CDentry *straydn = NULL;
10570 if (m->straybl.length())
10571 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10572
10573 CDir *dir = get_dirfrag(m->get_dirfrag());
10574 if (!dir) {
10575 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10576 } else {
10577 CDentry *dn = dir->lookup(m->get_dn());
10578 if (!dn) {
10579 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10580 } else {
10581 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10582 CDentry::linkage_t *dnl = dn->get_linkage();
10583
10584 // open inode?
10585 if (dnl->is_primary()) {
10586 CInode *in = dnl->get_inode();
10587 dn->dir->unlink_inode(dn);
10588 assert(straydn);
10589 straydn->dir->link_primary_inode(straydn, in);
10590
10591 // in->first is lazily updated on replica; drag it forward so
10592 // that we always keep it in sync with the dentry
10593 assert(straydn->first >= in->first);
10594 in->first = straydn->first;
10595
10596 // update subtree map?
10597 if (in->is_dir())
10598 adjust_subtree_after_rename(in, dir, false);
10599
10600 // send caps to auth (if we're not already)
10601 if (in->is_any_caps() &&
10602 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10603 migrator->export_caps(in);
10604
7c673cae
FG
10605 straydn = NULL;
10606 } else {
10607 assert(!straydn);
10608 assert(dnl->is_remote());
10609 dn->dir->unlink_inode(dn);
10610 }
10611 assert(dnl->is_null());
7c673cae
FG
10612 }
10613 }
10614
10615 // race with trim_dentry()
10616 if (straydn) {
10617 assert(straydn->get_num_ref() == 0);
10618 assert(straydn->get_linkage()->is_null());
10619 map<mds_rank_t, MCacheExpire*> expiremap;
10620 trim_dentry(straydn, expiremap);
10621 send_expire_messages(expiremap);
10622 }
10623
10624 m->put();
10625 return;
10626}
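// On a replica, unlinking a primary dentry relinks the inode under the
// replicated stray dentry (dragging in->first forward to match), adjusts the
// subtree map for directories, and pushes any client caps back to the auth
// MDS; a stray replica that ends up unused is trimmed again right away.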
10627
10628
10629
10630
10631
10632
10633// ===================================================================
10634
10635
10636
10637// ===================================================================
10638// FRAGMENT
10639
10640
10641/**
10642 * adjust_dir_fragments -- adjust fragmentation for a directory
10643 *
10644 * @param diri directory inode
10645 * @param basefrag base fragment
10646 * @param bits bit adjustment. positive for split, negative for merge.
10647 */
10648void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10649 list<CDir*>& resultfrags,
10650 list<MDSInternalContextBase*>& waiters,
10651 bool replay)
10652{
10653 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10654 << " on " << *diri << dendl;
10655
10656 list<CDir*> srcfrags;
10657 diri->get_dirfrags_under(basefrag, srcfrags);
10658
10659 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10660}
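// Rough example of the call (a sketch using this file's own API, not code
// that runs here):
//   adjust_dir_fragments(diri, frag_t(), 2, resultfrags, waiters, false);
// replaces the dirfrags covering the whole directory with the four 2-bit
// children of the root fragment; a later call with bits = -2 on that base
// merges them back.  A split by N bits always yields 2^N sibling dirfrags.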
10661
10662CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10663{
10664 CDir *dir = diri->get_dirfrag(fg);
10665 if (dir)
10666 return dir;
10667
10668 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10669
10670 list<CDir*> src, result;
10671 list<MDSInternalContextBase*> waiters;
10672
10673 // split a parent?
10674 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10675 while (1) {
10676 CDir *pdir = diri->get_dirfrag(parent);
10677 if (pdir) {
10678 int split = fg.bits() - parent.bits();
10679 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10680 src.push_back(pdir);
10681 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10682 dir = diri->get_dirfrag(fg);
10683 if (dir) {
10684 dout(10) << "force_dir_fragment result " << *dir << dendl;
10685 break;
10686 }
10687 }
10688 if (parent == frag_t())
10689 break;
10690 frag_t last = parent;
10691 parent = parent.parent();
10692 dout(10) << " " << last << " parent is " << parent << dendl;
10693 }
10694
10695 if (!dir) {
10696 // hoover up things under fg?
10697 diri->get_dirfrags_under(fg, src);
10698 if (src.empty()) {
10699 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10700 } else {
10701 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10702 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10703 dir = result.front();
10704 dout(10) << "force_dir_fragment result " << *dir << dendl;
10705 }
10706 }
10707 if (!replay)
10708 mds->queue_waiters(waiters);
10709 return dir;
10710}
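// force_dir_fragment() makes the requested fragment exist in the cache
// (used, for example, by the rollback path below): it first tries to split
// an existing ancestor dirfrag down to fg, and failing that merges whatever
// dirfrags already sit under fg.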
10711
10712void MDCache::adjust_dir_fragments(CInode *diri,
10713 list<CDir*>& srcfrags,
10714 frag_t basefrag, int bits,
10715 list<CDir*>& resultfrags,
10716 list<MDSInternalContextBase*>& waiters,
10717 bool replay)
10718{
10719 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10720 << " srcfrags " << srcfrags
10721 << " on " << *diri << dendl;
10722
10723 // adjust fragtree
10724 // yuck. we may have discovered the inode while it was being fragmented.
10725 if (!diri->dirfragtree.is_leaf(basefrag))
10726 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10727
10728 if (bits > 0)
10729 diri->dirfragtree.split(basefrag, bits);
10730 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10731
10732 if (srcfrags.empty())
10733 return;
10734
10735 // split
10736 CDir *parent_dir = diri->get_parent_dir();
10737 CDir *parent_subtree = 0;
10738 if (parent_dir)
10739 parent_subtree = get_subtree_root(parent_dir);
10740
10741 if (bits > 0) {
10742 // SPLIT
10743 assert(srcfrags.size() == 1);
10744 CDir *dir = srcfrags.front();
10745
10746 dir->split(bits, resultfrags, waiters, replay);
10747
10748 // did i change the subtree map?
10749 if (dir->is_subtree_root()) {
10750 // new frags are now separate subtrees
10751 for (list<CDir*>::iterator p = resultfrags.begin();
10752 p != resultfrags.end();
10753 ++p)
10754 subtrees[*p].clear(); // new frag is now its own subtree
10755
10756 // was i a bound?
10757 if (parent_subtree) {
10758 assert(subtrees[parent_subtree].count(dir));
10759 subtrees[parent_subtree].erase(dir);
10760 for (list<CDir*>::iterator p = resultfrags.begin();
10761 p != resultfrags.end();
10762 ++p) {
10763 assert((*p)->is_subtree_root());
10764 subtrees[parent_subtree].insert(*p);
10765 }
10766 }
10767
10768 // adjust my bounds.
10769 set<CDir*> bounds;
10770 bounds.swap(subtrees[dir]);
10771 subtrees.erase(dir);
10772 for (set<CDir*>::iterator p = bounds.begin();
10773 p != bounds.end();
10774 ++p) {
10775 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10776 subtrees[frag].insert(*p);
10777 }
10778
10779 show_subtrees(10);
10780
10781 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10782 dir->dir_auth = CDIR_AUTH_DEFAULT;
10783 }
10784
10785 diri->close_dirfrag(dir->get_frag());
10786
10787 } else {
10788 // MERGE
10789
10790 // are my constituent bits subtrees? if so, i will be too.
10791 // (it's all or none, actually.)
31f18b77
FG
10792 bool any_subtree = false;
10793 for (CDir *dir : srcfrags) {
7c673cae 10794 if (dir->is_subtree_root()) {
31f18b77
FG
10795 any_subtree = true;
10796 break;
10797 }
10798 }
10799 set<CDir*> new_bounds;
10800 if (any_subtree) {
10801 for (CDir *dir : srcfrags) {
10802 // this simplifies the code that finds subtrees underneath the dirfrag
10803 if (!dir->is_subtree_root()) {
10804 dir->state_set(CDir::STATE_AUXSUBTREE);
10805 adjust_subtree_auth(dir, mds->get_nodeid());
10806 }
10807 }
10808
10809 for (CDir *dir : srcfrags) {
10810 assert(dir->is_subtree_root());
7c673cae 10811 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
10812 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10813 set<CDir*>::iterator r = q->second.begin();
10814 while (r != subtrees[dir].end()) {
10815 new_bounds.insert(*r);
10816 subtrees[dir].erase(r++);
10817 }
10818 subtrees.erase(q);
31f18b77 10819
7c673cae
FG
10820 // remove myself as my parent's bound
10821 if (parent_subtree)
10822 subtrees[parent_subtree].erase(dir);
10823 }
10824 }
10825
10826 // merge
10827 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10828 f->merge(srcfrags, waiters, replay);
7c673cae 10829
31f18b77 10830 if (any_subtree) {
7c673cae
FG
10831 assert(f->is_subtree_root());
10832 subtrees[f].swap(new_bounds);
10833 if (parent_subtree)
10834 subtrees[parent_subtree].insert(f);
10835
10836 show_subtrees(10);
10837 }
10838
10839 resultfrags.push_back(f);
10840 }
10841}
10842
10843
10844class C_MDC_FragmentFrozen : public MDSInternalContext {
10845 MDCache *mdcache;
10846 MDRequestRef mdr;
10847public:
10848 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10849 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10850 void finish(int r) override {
10851 mdcache->fragment_frozen(mdr, r);
10852 }
10853};
10854
10855bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10856{
10857 if (is_readonly()) {
10858 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10859 return false;
10860 }
10861 if (mds->is_cluster_degraded()) {
10862 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10863 return false;
10864 }
10865 if (diri->get_parent_dir() &&
10866 diri->get_parent_dir()->get_inode()->is_stray()) {
10867 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10868 return false;
10869 }
10870 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10871 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10872 return false;
10873 }
10874
10875 if (diri->scrub_is_in_progress()) {
10876 dout(7) << "can_fragment: scrub in progress" << dendl;
10877 return false;
10878 }
10879
10880 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10881 CDir *dir = *p;
10882 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10883 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10884 return false;
10885 }
10886 if (!dir->is_auth()) {
10887 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10888 return false;
10889 }
10890 if (dir->is_bad()) {
10891 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10892 return false;
10893 }
10894 if (dir->is_frozen() ||
10895 dir->is_freezing()) {
10896 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10897 return false;
10898 }
10899 }
10900
10901 return true;
10902}
10903
10904void MDCache::split_dir(CDir *dir, int bits)
10905{
10906 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10907 assert(dir->is_auth());
10908 CInode *diri = dir->inode;
10909
10910 list<CDir*> dirs;
10911 dirs.push_back(dir);
10912
10913 if (!can_fragment(diri, dirs)) {
10914 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10915 return;
10916 }
10917
31f18b77
FG
10918 if (dir->frag.bits() + bits > 24) {
10919 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10920 return;
10921 }
10922
7c673cae
FG
10923 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10924 mdr->more()->fragment_base = dir->dirfrag();
10925
10926 assert(fragments.count(dir->dirfrag()) == 0);
10927 fragment_info_t& info = fragments[dir->dirfrag()];
10928 info.mdr = mdr;
10929 info.dirs.push_back(dir);
10930 info.bits = bits;
10931 info.last_cum_auth_pins_change = ceph_clock_now();
10932
10933 fragment_freeze_dirs(dirs);
10934 // initial mark+complete pass
10935 fragment_mark_and_complete(mdr);
10936}
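// Note the cap above: a dirfrag's total fragmentation depth is limited to
// 24 bits here, so split requests that would exceed it are simply dropped.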
10937
10938void MDCache::merge_dir(CInode *diri, frag_t frag)
10939{
10940 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10941
10942 list<CDir*> dirs;
10943 if (!diri->get_dirfrags_under(frag, dirs)) {
10944 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10945 return;
10946 }
10947
10948 if (diri->dirfragtree.is_leaf(frag)) {
10949 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10950 return;
10951 }
10952
10953 if (!can_fragment(diri, dirs))
10954 return;
10955
10956 CDir *first = dirs.front();
10957 int bits = first->get_frag().bits() - frag.bits();
10958 dout(10) << " we are merging by " << bits << " bits" << dendl;
10959
10960 dirfrag_t basedirfrag(diri->ino(), frag);
10961 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10962 mdr->more()->fragment_base = basedirfrag;
10963
10964 assert(fragments.count(basedirfrag) == 0);
10965 fragment_info_t& info = fragments[basedirfrag];
10966 info.mdr = mdr;
10967 info.dirs = dirs;
10968 info.bits = -bits;
10969 info.last_cum_auth_pins_change = ceph_clock_now();
10970
10971 fragment_freeze_dirs(dirs);
10972 // initial mark+complete pass
10973 fragment_mark_and_complete(mdr);
10974}
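// Both split_dir() and merge_dir() kick off the same multi-stage pipeline:
// fragment_freeze_dirs() -> fragment_mark_and_complete() -> (all frozen)
// fragment_frozen() -> dispatch_fragment_dir() -> _fragment_logged() ->
// _fragment_stored() -> _fragment_committed() -> _fragment_finish(),
// with the in-flight state tracked in fragments[basedirfrag].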
10975
10976void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10977{
10978 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10979 CDir *dir = *p;
10980 dir->auth_pin(dir); // until we mark and complete them
10981 dir->state_set(CDir::STATE_FRAGMENTING);
10982 dir->freeze_dir();
10983 assert(dir->is_freezing_dir());
10984 }
10985}
10986
10987class C_MDC_FragmentMarking : public MDCacheContext {
10988 MDRequestRef mdr;
10989public:
10990 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10991 void finish(int r) override {
10992 mdcache->fragment_mark_and_complete(mdr);
10993 }
10994};
10995
10996void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10997{
10998 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10999 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11000 if (it == fragments.end() || it->second.mdr != mdr) {
11001 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11002 request_finish(mdr);
11003 return;
11004 }
11005
11006 fragment_info_t& info = it->second;
11007 CInode *diri = info.dirs.front()->get_inode();
11008 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11009
11010 MDSGatherBuilder gather(g_ceph_context);
11011
11012 for (list<CDir*>::iterator p = info.dirs.begin();
11013 p != info.dirs.end();
11014 ++p) {
11015 CDir *dir = *p;
11016
11017 bool ready = true;
11018 if (!dir->is_complete()) {
11019 dout(15) << " fetching incomplete " << *dir << dendl;
11020 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11021 ready = false;
11022 } else if (dir->get_frag() == frag_t()) {
11023 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11024 // the operation. To avoid CDir::fetch() complaining about missing object,
11025 // we commit new dirfrag first.
11026 if (dir->state_test(CDir::STATE_CREATING)) {
11027 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11028 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11029 ready = false;
11030 } else if (dir->is_new()) {
11031 dout(15) << " committing new " << *dir << dendl;
11032 assert(dir->is_dirty());
11033 dir->commit(0, gather.new_sub(), true);
11034 ready = false;
11035 }
11036 }
11037 if (!ready)
11038 continue;
11039
11040 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11041 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11042 for (auto &p : dir->items) {
11043 CDentry *dn = p.second;
7c673cae
FG
11044 dn->get(CDentry::PIN_FRAGMENTING);
11045 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11046 dn->state_set(CDentry::STATE_FRAGMENTING);
11047 }
11048 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11049 dir->auth_unpin(dir);
11050 } else {
11051 dout(15) << " already marked " << *dir << dendl;
11052 }
11053 }
11054 if (gather.has_subs()) {
11055 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11056 gather.activate();
11057 return;
11058 }
11059
11060 for (list<CDir*>::iterator p = info.dirs.begin();
11061 p != info.dirs.end();
11062 ++p) {
11063 CDir *dir = *p;
11064 if (!dir->is_frozen_dir()) {
11065 assert(dir->is_freezing_dir());
11066 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11067 }
11068 }
11069 if (gather.has_subs()) {
11070 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11071 gather.activate();
11072 // flush log so that request auth_pins are retired
11073 mds->mdlog->flush();
11074 return;
11075 }
11076
11077 fragment_frozen(mdr, 0);
11078}
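// Two gather phases above: the first fetches (or commits) any dirfrag that is
// not yet complete and pins every dentry with PIN_FRAGMENTING; once all dirs
// are marked, the second simply waits for the freezes started in
// fragment_freeze_dirs() to finish before fragment_frozen() runs.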
11079
11080void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11081{
11082 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11083 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11084 CDir *dir = *p;
11085 dout(10) << " frag " << *dir << dendl;
11086
11087 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11088 dir->state_clear(CDir::STATE_FRAGMENTING);
11089
11090 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11091 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11092
94b18763
FG
11093 for (auto &p : dir->items) {
11094 CDentry *dn = p.second;
7c673cae
FG
11095 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11096 dn->state_clear(CDentry::STATE_FRAGMENTING);
11097 dn->put(CDentry::PIN_FRAGMENTING);
11098 }
11099 } else {
11100 dir->auth_unpin(dir);
11101 }
11102
11103 dir->unfreeze_dir();
11104 }
11105}
11106
11107bool MDCache::fragment_are_all_frozen(CDir *dir)
11108{
11109 assert(dir->is_frozen_dir());
11110 map<dirfrag_t,fragment_info_t>::iterator p;
11111 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11112 p != fragments.end() && p->first.ino == dir->ino();
11113 ++p) {
11114 if (p->first.frag.contains(dir->get_frag()))
11115 return p->second.all_frozen;
11116 }
11117 ceph_abort();
11118 return false;
11119}
11120
11121void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11122{
11123 map<dirfrag_t,fragment_info_t>::iterator p;
11124 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11125 p != fragments.end() && p->first.ino == dir->ino();
11126 ++p) {
11127 if (p->first.frag.contains(dir->get_frag())) {
11128 p->second.num_remote_waiters++;
11129 return;
11130 }
11131 }
11132 ceph_abort();
11133}
11134
11135void MDCache::find_stale_fragment_freeze()
11136{
11137 dout(10) << "find_stale_fragment_freeze" << dendl;
11138 // see comment in Migrator::find_stale_export_freeze()
11139 utime_t now = ceph_clock_now();
11140 utime_t cutoff = now;
11141 cutoff -= g_conf->mds_freeze_tree_timeout;
11142
11143 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11144 p != fragments.end(); ) {
11145 dirfrag_t df = p->first;
11146 fragment_info_t& info = p->second;
11147 ++p;
11148 if (info.all_frozen)
11149 continue;
11150 CDir *dir;
11151 int total_auth_pins = 0;
11152 for (list<CDir*>::iterator q = info.dirs.begin();
11153 q != info.dirs.end();
11154 ++q) {
11155 dir = *q;
11156 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11157 total_auth_pins = -1;
11158 break;
11159 }
11160 if (dir->is_frozen_dir())
11161 continue;
11162 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11163 }
11164 if (total_auth_pins < 0)
11165 continue;
11166 if (info.last_cum_auth_pins != total_auth_pins) {
11167 info.last_cum_auth_pins = total_auth_pins;
11168 info.last_cum_auth_pins_change = now;
11169 continue;
11170 }
11171 if (info.last_cum_auth_pins_change >= cutoff)
11172 continue;
11173 dir = info.dirs.front();
11174 if (info.num_remote_waiters > 0 ||
11175 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11176 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11177 list<CDir*> dirs;
11178 info.dirs.swap(dirs);
11179 fragments.erase(df);
11180 fragment_unmark_unfreeze_dirs(dirs);
11181 }
11182 }
11183}
11184
11185class C_MDC_FragmentPrep : public MDCacheLogContext {
11186 MDRequestRef mdr;
11187public:
11188 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11189 void finish(int r) override {
11190 mdcache->_fragment_logged(mdr);
11191 }
11192};
11193
11194class C_MDC_FragmentStore : public MDCacheContext {
11195 MDRequestRef mdr;
11196public:
11197 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11198 void finish(int r) override {
11199 mdcache->_fragment_stored(mdr);
11200 }
11201};
11202
11203class C_MDC_FragmentCommit : public MDCacheLogContext {
11204 dirfrag_t basedirfrag;
11205 list<CDir*> resultfrags;
11206public:
11207 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11208 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11209 void finish(int r) override {
11210 mdcache->_fragment_committed(basedirfrag, resultfrags);
11211 }
11212};
11213
11214class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11215 dirfrag_t basedirfrag;
11216 list<CDir*> resultfrags;
11217public:
11218 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11219 MDCacheIOContext(m), basedirfrag(f) {
11220 resultfrags.swap(l);
11221 }
11222 void finish(int r) override {
11223 assert(r == 0 || r == -ENOENT);
11224 mdcache->_fragment_finish(basedirfrag, resultfrags);
11225 }
11226};
11227
11228void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11229{
11230 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11231 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11232 if (it == fragments.end() || it->second.mdr != mdr) {
11233 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11234 request_finish(mdr);
11235 return;
11236 }
11237
11238 assert(r == 0);
11239 fragment_info_t& info = it->second;
11240 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11241 << " on " << info.dirs.front()->get_inode() << dendl;
11242
11243 info.all_frozen = true;
11244 dispatch_fragment_dir(mdr);
11245}
11246
11247void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11248{
11249 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11250 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11251 if (it == fragments.end() || it->second.mdr != mdr) {
11252 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11253 request_finish(mdr);
11254 return;
11255 }
11256
11257 fragment_info_t& info = it->second;
11258 CInode *diri = info.dirs.front()->get_inode();
11259
11260 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11261 << " on " << *diri << dendl;
11262 if (!mdr->aborted) {
11263 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11264 wrlocks.insert(&diri->dirfragtreelock);
11265 // prevent a racing gather on any other scatterlocks too
11266 wrlocks.insert(&diri->nestlock);
11267 wrlocks.insert(&diri->filelock);
11268 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11269 if (!mdr->aborted)
11270 return;
11271 }
11272
11273 if (mdr->aborted) {
11274 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11275 << info.dirs.front()->dirfrag() << dendl;
11276 if (info.bits > 0)
11277 mds->balancer->queue_split(info.dirs.front(), false);
11278 else
11279 mds->balancer->queue_merge(info.dirs.front());
11280 fragment_unmark_unfreeze_dirs(info.dirs);
11281 fragments.erase(it);
11282 request_finish(mdr);
11283 return;
11284 }
11285
11286 mdr->ls = mds->mdlog->get_current_segment();
11287 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11288 mds->mdlog->start_entry(le);
11289
11290 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11291 CDir *dir = *p;
11292 dirfrag_rollback rollback;
11293 rollback.fnode = dir->fnode;
11294 le->add_orig_frag(dir->get_frag(), &rollback);
11295 }
11296
11297 // refragment
11298 list<MDSInternalContextBase*> waiters;
11299 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11300 info.resultfrags, waiters, false);
11301 if (g_conf->mds_debug_frag)
11302 diri->verify_dirfrags();
11303 mds->queue_waiters(waiters);
11304
11305 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11306 assert(!diri->dirfragtree.is_leaf(*p));
11307
11308 le->metablob.add_dir_context(*info.resultfrags.begin());
11309 for (list<CDir*>::iterator p = info.resultfrags.begin();
11310 p != info.resultfrags.end();
11311 ++p) {
11312 if (diri->is_auth()) {
11313 le->metablob.add_fragmented_dir(*p, false, false);
11314 } else {
11315 (*p)->state_set(CDir::STATE_DIRTYDFT);
11316 le->metablob.add_fragmented_dir(*p, false, true);
11317 }
11318 }
11319
11320 // dft lock
11321 if (diri->is_auth()) {
11322 // journal dirfragtree
94b18763
FG
11323 auto &pi = diri->project_inode();
11324 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11325 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11326 } else {
11327 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11328 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11329 mdr->add_updated_lock(&diri->dirfragtreelock);
11330 }
11331
11332 /*
11333 // filelock
11334 mds->locker->mark_updated_scatterlock(&diri->filelock);
11335 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11336 mut->add_updated_lock(&diri->filelock);
11337
11338 // dirlock
11339 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11340 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11341 mut->add_updated_lock(&diri->nestlock);
11342 */
11343
11344 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11345 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11346 mdr, __func__);
11347 mds->mdlog->flush();
11348}
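// The EFragment OP_PREPARE entry journals a rollback copy of each source
// frag's fnode and registers the operation via add_uncommitted_fragment(),
// so an MDS restart before OP_COMMIT can undo the refragmentation (see
// rollback_uncommitted_fragments() below).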
11349
11350void MDCache::_fragment_logged(MDRequestRef& mdr)
11351{
11352 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11353 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11354 assert(it != fragments.end());
11355 fragment_info_t &info = it->second;
11356 CInode *diri = info.resultfrags.front()->get_inode();
11357
11358 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11359 << " on " << *diri << dendl;
11360
11361 if (diri->is_auth())
11362 diri->pop_and_dirty_projected_inode(mdr->ls);
11363
11364 mdr->apply(); // mark scatterlock
11365
11366 // store resulting frags
11367 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11368
11369 for (list<CDir*>::iterator p = info.resultfrags.begin();
11370 p != info.resultfrags.end();
11371 ++p) {
11372 CDir *dir = *p;
11373 dout(10) << " storing result frag " << *dir << dendl;
11374
11375 // freeze and store them too
11376 dir->auth_pin(this);
11377 dir->state_set(CDir::STATE_FRAGMENTING);
11378 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11379 }
11380
11381 gather.activate();
11382}
11383
11384void MDCache::_fragment_stored(MDRequestRef& mdr)
11385{
11386 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11387 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11388 assert(it != fragments.end());
11389 fragment_info_t &info = it->second;
11390 CInode *diri = info.resultfrags.front()->get_inode();
11391
11392 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11393 << " on " << *diri << dendl;
11394
11395 // tell peers
11396 CDir *first = *info.resultfrags.begin();
181888fb
FG
11397 for (const auto &p : first->get_replicas()) {
11398 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11399 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11400 rejoin_gather.count(p.first)))
7c673cae
FG
11401 continue;
11402
11403 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11404
11405 // freshly replicate new dirs to peers
11406 for (list<CDir*>::iterator q = info.resultfrags.begin();
11407 q != info.resultfrags.end();
11408 ++q)
181888fb 11409 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11410
181888fb 11411 mds->send_message_mds(notify, p.first);
7c673cae
FG
11412 }
11413
11414 // journal commit
11415 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11416 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11417 info.resultfrags));
11418
11419 mds->locker->drop_locks(mdr.get());
11420
11421 // unfreeze resulting frags
11422 for (list<CDir*>::iterator p = info.resultfrags.begin();
11423 p != info.resultfrags.end();
11424 ++p) {
11425 CDir *dir = *p;
11426 dout(10) << " result frag " << *dir << dendl;
11427
94b18763
FG
11428 for (auto &p : dir->items) {
11429 CDentry *dn = p.second;
7c673cae
FG
11430 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11431 dn->state_clear(CDentry::STATE_FRAGMENTING);
11432 dn->put(CDentry::PIN_FRAGMENTING);
11433 }
11434
11435 // unfreeze
11436 dir->unfreeze_dir();
11437 }
11438
11439 fragments.erase(it);
11440 request_finish(mdr);
11441}
11442
11443void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11444{
11445 dout(10) << "fragment_committed " << basedirfrag << dendl;
11446 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11447 assert(it != uncommitted_fragments.end());
11448 ufragment &uf = it->second;
11449
11450 // remove old frags
11451 C_GatherBuilder gather(
11452 g_ceph_context,
11453 new C_OnFinisher(
11454 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11455 mds->finisher));
11456
11457 SnapContext nullsnapc;
11458 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11459 for (list<frag_t>::iterator p = uf.old_frags.begin();
11460 p != uf.old_frags.end();
11461 ++p) {
11462 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11463 ObjectOperation op;
11464 if (*p == frag_t()) {
11465 // backtrace object
11466 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11467 op.truncate(0);
11468 op.omap_clear();
11469 } else {
11470 dout(10) << " removing orphan dirfrag " << oid << dendl;
11471 op.remove();
11472 }
11473 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11474 ceph::real_clock::now(),
11475 0, gather.new_sub());
11476 }
11477
11478 assert(gather.has_subs());
11479 gather.activate();
11480}
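// Once OP_COMMIT is journaled, the old dirfrag objects are deleted from the
// metadata pool; the base object (frag_t()) is only truncated and has its
// omap cleared because it also carries the inode backtrace.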
11481
11482void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11483{
11484 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11485 << resultfrags.size() << dendl;
11486 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11487 assert(it != uncommitted_fragments.end());
11488 ufragment &uf = it->second;
11489
11490 // unmark & auth_unpin
11491 for (const auto &dir : resultfrags) {
11492 dir->state_clear(CDir::STATE_FRAGMENTING);
11493 dir->auth_unpin(this);
11494
11495 // In case the resulting fragments are beyond the split size,
11496 // we might need to split them again right away (they could
11497 // have been taking inserts between unfreezing and getting
11498 // here)
11499 mds->balancer->maybe_fragment(dir, false);
11500 }
11501
11502 if (mds->logger) {
11503 if (resultfrags.size() > 1) {
11504 mds->logger->inc(l_mds_dir_split);
11505 } else {
11506 mds->logger->inc(l_mds_dir_merge);
11507 }
11508 }
11509
11510 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11511 mds->mdlog->start_submit_entry(le);
11512
11513 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11514}
11515
11516/* This function DOES put the passed message before returning */
11517void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11518{
11519 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11520
11521 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11522 notify->put();
11523 return;
11524 }
11525
11526 CInode *diri = get_inode(notify->get_ino());
11527 if (diri) {
11528 frag_t base = notify->get_basefrag();
11529 int bits = notify->get_bits();
11530
11531/*
11532 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11533 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11534 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11535 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11536 notify->put();
11537 return;
11538 }
11539*/
11540
11541 // refragment
11542 list<MDSInternalContextBase*> waiters;
11543 list<CDir*> resultfrags;
11544 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11545 if (g_conf->mds_debug_frag)
11546 diri->verify_dirfrags();
11547
11548 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11549 diri->take_dir_waiting((*p)->get_frag(), waiters);
11550
11551 // add new replica dirs values
11552 bufferlist::iterator p = notify->basebl.begin();
11553 while (!p.end())
11554 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11555
11556 mds->queue_waiters(waiters);
11557 } else {
11558 ceph_abort();
11559 }
11560
11561 notify->put();
11562}
11563
11564void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11565 LogSegment *ls, bufferlist *rollback)
11566{
11567 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11568 assert(!uncommitted_fragments.count(basedirfrag));
11569 ufragment& uf = uncommitted_fragments[basedirfrag];
11570 uf.old_frags = old_frags;
11571 uf.bits = bits;
11572 uf.ls = ls;
11573 ls->uncommitted_fragments.insert(basedirfrag);
11574 if (rollback)
11575 uf.rollback.swap(*rollback);
11576}
11577
11578void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11579{
11580 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11581 << " op " << EFragment::op_name(op) << dendl;
11582 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11583 if (it != uncommitted_fragments.end()) {
11584 ufragment& uf = it->second;
11585 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11586 uf.committed = true;
11587 } else {
11588 uf.ls->uncommitted_fragments.erase(basedirfrag);
11589 mds->queue_waiters(uf.waiters);
11590 uncommitted_fragments.erase(it);
11591 }
11592 }
11593}
11594
11595void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11596{
11597 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11598 << " old_frags (" << old_frags << ")" << dendl;
11599 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11600 if (it != uncommitted_fragments.end()) {
11601 ufragment& uf = it->second;
11602 if (!uf.old_frags.empty()) {
11603 uf.old_frags.swap(old_frags);
11604 uf.committed = true;
11605 } else {
11606 uf.ls->uncommitted_fragments.erase(basedirfrag);
11607 uncommitted_fragments.erase(it);
11608 }
11609 }
11610}
11611
11612void MDCache::rollback_uncommitted_fragments()
11613{
11614 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11615 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11616 p != uncommitted_fragments.end();
11617 ++p) {
11618 ufragment &uf = p->second;
11619 CInode *diri = get_inode(p->first.ino);
11620 assert(diri);
11621
11622 if (uf.committed) {
11623 list<CDir*> frags;
11624 diri->get_dirfrags_under(p->first.frag, frags);
11625 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11626 CDir *dir = *q;
11627 dir->auth_pin(this);
11628 dir->state_set(CDir::STATE_FRAGMENTING);
11629 }
11630 _fragment_committed(p->first, frags);
11631 continue;
11632 }
11633
11634 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11635
11636 LogSegment *ls = mds->mdlog->get_current_segment();
11637 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11638 mds->mdlog->start_entry(le);
11639 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11640
11641 list<frag_t> old_frags;
11642 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11643
11644 list<CDir*> resultfrags;
11645 if (uf.old_frags.empty()) {
11646 // created by old format EFragment
11647 list<MDSInternalContextBase*> waiters;
11648 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11649 } else {
11650 bufferlist::iterator bp = uf.rollback.begin();
11651 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11652 CDir *dir = force_dir_fragment(diri, *q);
11653 resultfrags.push_back(dir);
11654
11655 dirfrag_rollback rollback;
11656 ::decode(rollback, bp);
11657
11658 dir->set_version(rollback.fnode.version);
11659 dir->fnode = rollback.fnode;
11660
11661 dir->_mark_dirty(ls);
11662
11663 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11664 dout(10) << " dirty nestinfo on " << *dir << dendl;
11665 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11666 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11667 }
11668 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11669 dout(10) << " dirty fragstat on " << *dir << dendl;
11670 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11671 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11672 }
11673
11674 le->add_orig_frag(dir->get_frag());
11675 le->metablob.add_dir_context(dir);
11676 if (diri_auth) {
11677 le->metablob.add_fragmented_dir(dir, true, false);
11678 } else {
11679 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11680 dir->state_set(CDir::STATE_DIRTYDFT);
11681 le->metablob.add_fragmented_dir(dir, true, true);
11682 }
11683 }
11684 }
11685
11686 if (diri_auth) {
94b18763
FG
11687 auto &pi = diri->project_inode();
11688 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11689 diri->pop_and_dirty_projected_inode(ls); // hacky
11690 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11691 } else {
11692 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11693 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11694 }
11695
11696 if (g_conf->mds_debug_frag)
11697 diri->verify_dirfrags();
11698
11699 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11700 assert(!diri->dirfragtree.is_leaf(*q));
11701
11702 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11703 CDir *dir = *q;
11704 dir->auth_pin(this);
11705 dir->state_set(CDir::STATE_FRAGMENTING);
11706 }
11707
11708 mds->mdlog->submit_entry(le);
11709
11710 uf.old_frags.swap(old_frags);
11711 _fragment_committed(p->first, resultfrags);
11712 }
11713}
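// Run on recovery: fragment operations that already reached OP_COMMIT just
// redo the old-object removal, while uncommitted ones restore each original
// frag's fnode from the journaled rollback data, journal an EFragment
// OP_ROLLBACK, and finally delete the now-orphaned dirfrag objects.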
11714
11715void MDCache::force_readonly()
11716{
11717 if (is_readonly())
11718 return;
11719
11720 dout(1) << "force file system read-only" << dendl;
11721 mds->clog->warn() << "force file system read-only";
11722
11723 set_readonly();
11724
11725 mds->server->force_clients_readonly();
11726
11727 // revoke write caps
94b18763 11728 for (auto &p : inode_map) {
b32b8144 11729 CInode *in = p.second;
7c673cae
FG
11730 if (in->is_head())
11731 mds->locker->eval(in, CEPH_CAP_LOCKS);
11732 }
11733
11734 mds->mdlog->flush();
11735}
11736
11737
11738// ==============================================================
11739// debug crap
11740
11741void MDCache::show_subtrees(int dbl)
11742{
11743 if (g_conf->mds_thrash_exports)
11744 dbl += 15;
11745
11746 //dout(10) << "show_subtrees" << dendl;
11747
11748 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11749 return; // i won't print anything.
11750
11751 if (subtrees.empty()) {
11752 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11753 return;
11754 }
11755
11756 // root frags
11757 list<CDir*> basefrags;
11758 for (set<CInode*>::iterator p = base_inodes.begin();
11759 p != base_inodes.end();
11760 ++p)
11761 (*p)->get_dirfrags(basefrags);
11762 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11763 dout(15) << "show_subtrees" << dendl;
11764
11765 // queue stuff
11766 list<pair<CDir*,int> > q;
11767 string indent;
11768 set<CDir*> seen;
11769
11770 // calc max depth
11771 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11772 q.push_back(pair<CDir*,int>(*p, 0));
11773
11774 set<CDir*> subtrees_seen;
11775
11776 int depth = 0;
11777 while (!q.empty()) {
11778 CDir *dir = q.front().first;
11779 int d = q.front().second;
11780 q.pop_front();
11781
11782 if (subtrees.count(dir) == 0) continue;
11783
11784 subtrees_seen.insert(dir);
11785
11786 if (d > depth) depth = d;
11787
11788 // sanity check
11789 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11790 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11791 assert(seen.count(dir) == 0);
11792 seen.insert(dir);
11793
11794 // nested items?
11795 if (!subtrees[dir].empty()) {
11796 for (set<CDir*>::iterator p = subtrees[dir].begin();
11797 p != subtrees[dir].end();
11798 ++p) {
11799 //dout(25) << " saw sub " << **p << dendl;
11800 q.push_front(pair<CDir*,int>(*p, d+1));
11801 }
11802 }
11803 }
11804
11805
11806 // print tree
11807 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11808 q.push_back(pair<CDir*,int>(*p, 0));
11809
11810 while (!q.empty()) {
11811 CDir *dir = q.front().first;
11812 int d = q.front().second;
11813 q.pop_front();
11814
11815 if (subtrees.count(dir) == 0) continue;
11816
11817 // adjust indenter
11818 while ((unsigned)d < indent.size())
11819 indent.resize(d);
11820
11821 // pad
11822 string pad = "______________________________________";
11823 pad.resize(depth*2+1-indent.size());
11824 if (!subtrees[dir].empty())
11825 pad[0] = '.'; // parent
11826
11827
11828 string auth;
11829 if (dir->is_auth())
11830 auth = "auth ";
11831 else
11832 auth = " rep ";
11833
11834 char s[10];
11835 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11836 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11837 else
11838 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11839
11840 // print
11841 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11842
11843 if (dir->ino() == MDS_INO_ROOT)
11844 assert(dir->inode == root);
11845 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11846 assert(dir->inode == myin);
11847 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11848 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11849
11850 // nested items?
11851 if (!subtrees[dir].empty()) {
11852 // more at my level?
11853 if (!q.empty() && q.front().second == d)
11854 indent += "| ";
11855 else
11856 indent += " ";
11857
11858 for (set<CDir*>::iterator p = subtrees[dir].begin();
11859 p != subtrees[dir].end();
11860 ++p)
11861 q.push_front(pair<CDir*,int>(*p, d+2));
11862 }
11863 }
11864
11865 // verify there isn't stray crap in subtree map
11866 int lost = 0;
11867 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11868 p != subtrees.end();
11869 ++p) {
11870 if (subtrees_seen.count(p->first)) continue;
11871 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11872 lost++;
11873 }
11874 assert(lost == 0);
11875}
11876
7c673cae
FG
11877void MDCache::show_cache()
11878{
11879 dout(7) << "show_cache" << dendl;
b32b8144
FG
11880
11881 auto show_func = [this](CInode *in) {
7c673cae 11882 // unlinked?
b32b8144
FG
11883 if (!in->parent)
11884 dout(7) << " unlinked " << *in << dendl;
11885
7c673cae
FG
11886 // dirfrags?
11887 list<CDir*> dfs;
b32b8144 11888 in->get_dirfrags(dfs);
7c673cae
FG
11889 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11890 CDir *dir = *p;
11891 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 11892
94b18763
FG
11893 for (auto &p : dir->items) {
11894 CDentry *dn = p.second;
7c673cae
FG
11895 dout(7) << " dentry " << *dn << dendl;
11896 CDentry::linkage_t *dnl = dn->get_linkage();
11897 if (dnl->is_primary() && dnl->get_inode())
11898 dout(7) << " inode " << *dnl->get_inode() << dendl;
11899 }
11900 }
b32b8144
FG
11901 };
11902
94b18763 11903 for (auto &p : inode_map)
b32b8144 11904 show_func(p.second);
94b18763 11905 for (auto &p : snap_inode_map)
b32b8144 11906 show_func(p.second);
7c673cae
FG
11907}
11908
181888fb
FG
11909int MDCache::cache_status(Formatter *f)
11910{
11911 f->open_object_section("cache");
11912
11913 f->open_object_section("pool");
11914 mempool::get_pool(mempool::mds_co::id).dump(f);
11915 f->close_section();
11916
11917 f->close_section();
11918 return 0;
11919}
11920
94b18763 11921int MDCache::dump_cache(boost::string_view file_name)
7c673cae 11922{
94b18763 11923 return dump_cache(file_name, NULL);
7c673cae
FG
11924}
11925
31f18b77 11926int MDCache::dump_cache(Formatter *f)
7c673cae 11927{
94b18763 11928 return dump_cache(boost::string_view(""), f);
7c673cae
FG
11929}
11930
94b18763 11931int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
7c673cae 11932{
94b18763 11933 return dump_cache(boost::string_view(""), f, dump_root, depth);
7c673cae
FG
11934}
11935
11936/**
11937 * Dump the metadata cache, either to a Formatter, if
11938 * provided, else to a plain text file.
11939 */
94b18763
FG
11940int MDCache::dump_cache(boost::string_view fn, Formatter *f,
11941 boost::string_view dump_root, int depth)
7c673cae
FG
11942{
11943 int r = 0;
11944 int fd = -1;
11945
11946 if (f) {
11947 f->open_array_section("inodes");
11948 } else {
94b18763
FG
11949 char path[PATH_MAX] = "";
11950 if (fn.length()) {
11951 snprintf(path, sizeof path, "%s", fn.data());
11952 } else {
11953 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
11954 }
11955
94b18763 11956 dout(1) << "dump_cache to " << path << dendl;
7c673cae 11957
94b18763 11958 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL, 0600);
7c673cae 11959 if (fd < 0) {
94b18763 11960 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 11961 return errno;
7c673cae
FG
11962 }
11963 }
11964
b32b8144
FG
11965 auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
11966 int r;
7c673cae
FG
11967 if (!dump_root.empty()) {
11968 string ipath;
11969 if (in->is_root())
11970 ipath = "/";
11971 else
11972 in->make_path_string(ipath);
11973
11974 if (dump_root.length() > ipath.length() ||
11975 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
b32b8144 11976 return 0;
7c673cae
FG
11977
11978 if (depth >= 0 &&
11979 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
b32b8144 11980 return 0;
7c673cae
FG
11981 }
11982
11983 if (f) {
11984 f->open_object_section("inode");
11985 in->dump(f);
11986 } else {
11987 ostringstream ss;
11988 ss << *in << std::endl;
11989 std::string s = ss.str();
11990 r = safe_write(fd, s.c_str(), s.length());
b32b8144
FG
11991 if (r < 0)
11992 return r;
7c673cae
FG
11993 }
11994
11995 list<CDir*> dfs;
11996 in->get_dirfrags(dfs);
11997 if (f) {
11998 f->open_array_section("dirfrags");
11999 }
12000 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12001 CDir *dir = *p;
12002 if (f) {
12003 f->open_object_section("dir");
12004 dir->dump(f);
12005 } else {
12006 ostringstream tt;
12007 tt << " " << *dir << std::endl;
12008 string t = tt.str();
12009 r = safe_write(fd, t.c_str(), t.length());
b32b8144
FG
12010 if (r < 0)
12011 return r;
7c673cae
FG
12012 }
12013
12014 if (f) {
12015 f->open_array_section("dentries");
12016 }
94b18763
FG
12017 for (auto &p : dir->items) {
12018 CDentry *dn = p.second;
7c673cae
FG
12019 if (f) {
12020 f->open_object_section("dentry");
12021 dn->dump(f);
12022 f->close_section();
12023 } else {
12024 ostringstream uu;
12025 uu << " " << *dn << std::endl;
12026 string u = uu.str();
12027 r = safe_write(fd, u.c_str(), u.length());
b32b8144
FG
12028 if (r < 0)
12029 return r;
7c673cae
FG
12030 }
12031 }
12032 if (f) {
12033 f->close_section(); //dentries
12034 }
12035 dir->check_rstats();
12036 if (f) {
12037 f->close_section(); //dir
12038 }
12039 }
12040 if (f) {
12041 f->close_section(); // dirfrags
12042 }
12043
12044 if (f) {
12045 f->close_section(); // inode
12046 }
b32b8144
FG
12047 return 1;
12048 };
12049
94b18763 12050 for (auto &p : inode_map) {
b32b8144
FG
12051 r = dump_func(p.second);
12052 if (r < 0)
12053 goto out;
12054 }
94b18763 12055 for (auto &p : snap_inode_map) {
b32b8144
FG
12056 r = dump_func(p.second);
12057 if (r < 0)
12058 goto out;
7c673cae 12059 }
b32b8144 12060 r = 0;
7c673cae
FG
12061
12062 out:
12063 if (f) {
12064 f->close_section(); // inodes
12065 } else {
12066 ::close(fd);
12067 }
31f18b77 12068 return r;
7c673cae
FG
12069}
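// Typically reached from the admin socket (e.g. "ceph daemon mds.<id> dump
// cache [path]"): with a Formatter the cache is dumped as JSON, otherwise it
// is written as plain text to the given file (or to a
// cachedump.<epoch>.mds<rank> file) on the MDS host.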
12070
12071
12072
12073C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12074 : MDSInternalContext(c->mds), cache(c), mdr(r)
12075{}
12076
12077void C_MDS_RetryRequest::finish(int r)
12078{
12079 mdr->retry++;
12080 cache->dispatch_request(mdr);
12081}
12082
12083
12084class C_MDS_EnqueueScrub : public Context
12085{
12086 Formatter *formatter;
12087 Context *on_finish;
12088public:
12089 ScrubHeaderRef header;
12090 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12091 formatter(f), on_finish(fin), header(nullptr) {}
12092
12093 Context *take_finisher() {
12094 Context *fin = on_finish;
12095 on_finish = NULL;
12096 return fin;
12097 }
12098
12099 void finish(int r) override {
12100 if (r < 0) { // we failed the lookup or something; dump ourselves
12101 formatter->open_object_section("results");
12102 formatter->dump_int("return_code", r);
12103 formatter->close_section(); // results
12104 }
12105 if (on_finish)
12106 on_finish->complete(r);
12107 }
12108};
12109
12110void MDCache::enqueue_scrub(
94b18763
FG
12111 boost::string_view path,
12112 boost::string_view tag,
7c673cae
FG
12113 bool force, bool recursive, bool repair,
12114 Formatter *f, Context *fin)
12115{
12116 dout(10) << __func__ << " " << path << dendl;
12117 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
94b18763 12118 filepath fp(path);
7c673cae
FG
12119 mdr->set_filepath(fp);
12120
12121 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12122 cs->header = std::make_shared<ScrubHeader>(
12123 tag, force, recursive, repair, f);
12124
12125 mdr->internal_op_finish = cs;
12126 enqueue_scrub_work(mdr);
12127}
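// Editorial note (hedged): scrubs are typically requested through the MDS
// admin socket, e.g. `ceph daemon mds.<name> scrub_path <path> recursive repair`;
// the force/recursive/repair flags map onto the booleans passed in above,
// though exact command syntax may differ by release.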
12128
12129void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12130{
12131 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12132 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12133 if (NULL == in)
12134 return;
12135
12136 // TODO: Remove this restriction
12137 assert(in->is_auth());
12138
12139 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12140 if (!locked)
12141 return;
12142
12143 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12144 ScrubHeaderRef &header = cs->header;
12145
12146 // Cannot scrub the same dentry twice at the same time.
12147 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12148 mds->server->respond_to_request(mdr, -EBUSY);
12149 return;
12150 } else {
12151 in->scrub_info();
12152 }
12153
12154 header->set_origin(in);
12155
b32b8144
FG
12156 Context *fin = nullptr;
12157 if (!header->get_recursive()) {
12158 fin = cs->take_finisher();
12159 }
12160
12161 // If the scrub did some repair, then flush the journal at the end of
12162 // the scrub; otherwise, in the case of e.g. rewriting a backtrace,
12163 // the on-disk state will still look damaged.
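  // Completion chain built below: scrub_finish runs when the scrub completes;
  // if repairs were made it starts a new log segment, flushes the journal,
  // then (via flush_finish) trims all expirable segments and waits for them
  // to expire before finally invoking the caller's finisher.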
28e407b8
AA
12164 auto scrub_finish = new FunctionContext([this, header, fin](int r){
12165 if (!header->get_repaired()) {
12166 if (fin)
12167 fin->complete(r);
12168 return;
12169 }
12170
12171 auto flush_finish = new FunctionContext([this, fin](int r){
12172 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12173 mds->mdlog->trim_all();
12174
12175 if (fin) {
12176 MDSGatherBuilder gather(g_ceph_context);
12177 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12178 for (auto logseg : expiring_segments)
12179 logseg->wait_for_expiry(gather.new_sub());
12180 assert(gather.has_subs());
12181 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12182 gather.activate();
b32b8144 12183 }
28e407b8
AA
12184 });
12185
12186 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12187 mds->mdlog->start_new_segment();
12188 mds->mdlog->flush();
12189 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12190 });
12191
7c673cae 12192 if (!header->get_recursive()) {
7c673cae 12193 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12194 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12195 } else {
12196 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12197 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12198 }
7c673cae
FG
12199
12200 mds->server->respond_to_request(mdr, 0);
12201 return;
12202}
12203
12204struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12205 MDRequestRef mdr;
12206 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12207 MDCacheLogContext(c), mdr(m) {}
12208 void finish(int r) override {
12209 mdr->apply();
12210 get_mds()->server->respond_to_request(mdr, r);
12211 }
12212};
12213
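// Start an internal request that re-derives a single dirfrag's fragstat and
// rstat from its in-memory dentries and journals the corrected values (see
// repair_dirfrag_stats_work below).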
12214void MDCache::repair_dirfrag_stats(CDir *dir)
12215{
12216 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12217 mdr->pin(dir);
12218 mdr->internal_op_private = dir;
12219 mdr->internal_op_finish = new C_MDSInternalNoop;
12220 repair_dirfrag_stats_work(mdr);
12221}
12222
12223void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12224{
12225 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12226 dout(10) << __func__ << " " << *dir << dendl;
12227
12228 if (!dir->is_auth()) {
12229 mds->server->respond_to_request(mdr, -ESTALE);
12230 return;
12231 }
12232
12233 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12234 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12235
7c673cae
FG
12236 mds->locker->drop_locks(mdr.get());
12237 mdr->drop_local_auth_pins();
224ce89b
WB
12238 if (!mdr->remote_auth_pins.empty())
12239 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12240 return;
12241 }
12242
12243 mdr->auth_pin(dir);
12244
12245 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12246 CInode *diri = dir->inode;
12247 rdlocks.insert(&diri->dirfragtreelock);
12248 wrlocks.insert(&diri->nestlock);
12249 wrlocks.insert(&diri->filelock);
12250 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12251 return;
12252
12253 if (!dir->is_complete()) {
12254 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12255 return;
12256 }
12257
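  // Recompute this dirfrag's stats from scratch: walk every head dentry,
  // counting files/subdirectories into frag_info and accumulating the
  // children's accounted rstats into nest_info, then compare against the
  // projected fnode below.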
12258 frag_info_t frag_info;
12259 nest_info_t nest_info;
94b18763 12260 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12261 CDentry *dn = it->second;
12262 if (dn->last != CEPH_NOSNAP)
12263 continue;
12264 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12265 if (dnl->is_primary()) {
12266 CInode *in = dnl->get_inode();
12267 nest_info.add(in->get_projected_inode()->accounted_rstat);
12268 if (in->is_dir())
12269 frag_info.nsubdirs++;
12270 else
12271 frag_info.nfiles++;
12272 } else if (dnl->is_remote())
12273 frag_info.nfiles++;
12274 }
12275
12276 fnode_t *pf = dir->get_projected_fnode();
12277 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12278 bool good_rstat = nest_info.same_sums(pf->rstat);
12279 if (good_fragstat && good_rstat) {
12280 dout(10) << __func__ << " no corruption found" << dendl;
12281 mds->server->respond_to_request(mdr, 0);
12282 return;
12283 }
12284
12285 pf = dir->project_fnode();
12286 pf->version = dir->pre_dirty();
12287 mdr->add_projected_fnode(dir);
12288
12289 mdr->ls = mds->mdlog->get_current_segment();
12290 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12291 mds->mdlog->start_entry(le);
12292
12293 if (!good_fragstat) {
12294 if (pf->fragstat.mtime > frag_info.mtime)
12295 frag_info.mtime = pf->fragstat.mtime;
12296 if (pf->fragstat.change_attr > frag_info.change_attr)
12297 frag_info.change_attr = pf->fragstat.change_attr;
12298 pf->fragstat = frag_info;
12299 mds->locker->mark_updated_scatterlock(&diri->filelock);
12300 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12301 mdr->add_updated_lock(&diri->filelock);
12302 }
12303
12304 if (!good_rstat) {
12305 if (pf->rstat.rctime > nest_info.rctime)
12306 nest_info.rctime = pf->rstat.rctime;
12307 pf->rstat = nest_info;
12308 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12309 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12310 mdr->add_updated_lock(&diri->nestlock);
12311 }
12312
12313 le->metablob.add_dir_context(dir);
12314 le->metablob.add_dir(dir, true);
12315
12316 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12317}
12318
12319void MDCache::repair_inode_stats(CInode *diri)
12320{
12321 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12322 mdr->pin(diri);
12323 mdr->internal_op_private = diri;
12324 mdr->internal_op_finish = new C_MDSInternalNoop;
12325 repair_inode_stats_work(mdr);
12326}
12327
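// Two-phase repair: first, under wrlocks, make sure every dirfrag is loaded
// and mark the filelock/nestlock scatterlocks dirty; then re-acquire rdlocks,
// which forces a scatter-gather that folds the per-dirfrag accounted stats
// back into the inode's dirstat/rstat, and finally verify the sums match.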
12328void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12329{
12330 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12331 dout(10) << __func__ << " " << *diri << dendl;
12332
12333 if (!diri->is_auth()) {
12334 mds->server->respond_to_request(mdr, -ESTALE);
12335 return;
12336 }
12337 if (!diri->is_dir()) {
12338 mds->server->respond_to_request(mdr, -ENOTDIR);
12339 return;
12340 }
12341
12342 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12343 std::list<frag_t> frags;
12344
12345 if (mdr->ls) // already marked filelock/nestlock dirty ?
12346 goto do_rdlocks;
12347
12348 rdlocks.insert(&diri->dirfragtreelock);
12349 wrlocks.insert(&diri->nestlock);
12350 wrlocks.insert(&diri->filelock);
12351 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12352 return;
12353
12354 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12355 // the scatter-gather process, which will fix any fragstat/rstat errors.
12356 diri->dirfragtree.get_leaves(frags);
12357 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12358 CDir *dir = diri->get_dirfrag(*p);
12359 if (!dir) {
12360 assert(mdr->is_auth_pinned(diri));
12361 dir = diri->get_or_open_dirfrag(this, *p);
12362 }
12363 if (dir->get_version() == 0) {
12364 assert(dir->is_auth());
12365 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12366 return;
12367 }
12368 }
12369
12370 diri->state_set(CInode::STATE_REPAIRSTATS);
12371 mdr->ls = mds->mdlog->get_current_segment();
12372 mds->locker->mark_updated_scatterlock(&diri->filelock);
12373 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12374 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12375 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12376
12377 mds->locker->drop_locks(mdr.get());
12378
12379do_rdlocks:
12380 // force the scatter-gather process
12381 rdlocks.insert(&diri->dirfragtreelock);
12382 rdlocks.insert(&diri->nestlock);
12383 rdlocks.insert(&diri->filelock);
12384 wrlocks.clear();
12385 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12386 return;
12387
12388 diri->state_clear(CInode::STATE_REPAIRSTATS);
12389
12390 frag_info_t dir_info;
12391 nest_info_t nest_info;
12392 nest_info.rsubdirs++; // account for the directory itself
12393
12394 diri->dirfragtree.get_leaves(frags);
12395 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12396 CDir *dir = diri->get_dirfrag(*p);
12397 assert(dir);
12398 assert(dir->get_version() > 0);
12399 dir_info.add(dir->fnode.accounted_fragstat);
12400 nest_info.add(dir->fnode.accounted_rstat);
12401 }
12402
12403 if (!dir_info.same_sums(diri->inode.dirstat) ||
12404 !nest_info.same_sums(diri->inode.rstat)) {
12405 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12406 << *diri << dendl;
12407 }
12408
12409 mds->server->respond_to_request(mdr, 0);
12410}
12411
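// Write back the metadata for a single dentry/inode. Editorial note (hedged):
// this is normally driven from the MDS admin socket's flush_path command,
// e.g. `ceph daemon mds.<name> flush_path <path>`; command names may vary by
// release.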
94b18763 12412void MDCache::flush_dentry(boost::string_view path, Context *fin)
7c673cae
FG
12413{
12414 if (is_readonly()) {
12415 dout(10) << __func__ << ": read-only FS" << dendl;
12416 fin->complete(-EROFS);
12417 return;
12418 }
12419 dout(10) << "flush_dentry " << path << dendl;
12420 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 12421 filepath fp(path);
7c673cae
FG
12422 mdr->set_filepath(fp);
12423 mdr->internal_op_finish = fin;
12424 flush_dentry_work(mdr);
12425}
12426
12427class C_FinishIOMDR : public MDSInternalContextBase {
12428protected:
12429 MDSRank *mds;
12430 MDRequestRef mdr;
12431 MDSRank *get_mds() override { return mds; }
12432public:
12433 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12434 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12435};
12436
12437void MDCache::flush_dentry_work(MDRequestRef& mdr)
12438{
12439 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12440 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12441 if (NULL == in)
12442 return;
12443
12444 // TODO: Is this necessary? Fix it if so
12445 assert(in->is_auth());
12446 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12447 if (!locked)
12448 return;
12449 in->flush(new C_FinishIOMDR(mds, mdr));
12450}
12451
12452
12453/**
12454 * Initialize performance counters with global perfcounter
12455 * collection.
12456 */
12457void MDCache::register_perfcounters()
12458{
12459 PerfCountersBuilder pcb(g_ceph_context,
12460 "mds_cache", l_mdc_first, l_mdc_last);
12461
12462 /* Stray/purge statistics */
12463 pcb.add_u64(l_mdc_num_strays, "num_strays",
c07f9fc5 12464 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
12465 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12466 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12467
12468 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12469 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12470 "Stray dentries enqueued for purge");
12471 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12472 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12473
12474
12475 /* Recovery queue statistics */
12476 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12477 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
c07f9fc5 12478 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
12479 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12480 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12481 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
c07f9fc5 12482 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 12483
d2e6a577
FG
12484 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12485 "Internal Request type enqueue scrub");
12486 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12487 "Internal Request type export dir");
12488 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12489 "Internal Request type flush");
12490 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12491 "Internal Request type fragmentdir");
12492 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12493 "Internal Request type frag stats");
12494 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12495 "Internal Request type inode stats");
12496
7c673cae
FG
12497 logger.reset(pcb.create_perf_counters());
12498 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12499 recovery_queue.set_logger(logger.get());
12500 stray_manager.set_logger(logger.get());
12501}
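// Editorial note (hedged): these counters appear under the "mds_cache"
// section of the daemon's perf output, e.g. `ceph daemon mds.<name> perf dump`
// will report num_strays, strays_created, recovery_started and the other
// fields registered above; exact output formatting depends on the release.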
12502
12503void MDCache::activate_stray_manager()
12504{
12505 if (open) {
12506 stray_manager.activate();
12507 } else {
12508 wait_for_open(
12509 new MDSInternalContextWrapper(mds,
12510 new FunctionContext([this](int r){
12511 stray_manager.activate();
12512 })
12513 )
12514 );
12515 }
12516}
12517
12518/**
12519 * Call this when putting references to an inode/dentry or
12520 * when attempting to trim it.
12521 *
12522 * If this inode is no longer linked by anyone, and this MDS
12523 * rank holds the primary dentry, and that dentry is in a stray
12524 * directory, then give up the dentry to the StrayManager, never
12525 * to be seen again by MDCache.
12526 *
12527 * @param delay if true, then purgeable inodes are stashed until
12528 * the next trim(), rather than being purged right
12529 * away.
12530 */
12531void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
12532 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12533 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12534 return;
224ce89b 12535
7c673cae
FG
12536 CDentry *dn = in->get_projected_parent_dn();
12537
12538 if (dn->state_test(CDentry::STATE_PURGING)) {
12539 /* We have already entered the purging process; no need
12540 * to re-evaluate this inode. */
12541 return;
12542 }
12543
12544 if (dn->get_projected_linkage()->is_primary() &&
12545 dn->get_dir()->get_inode()->is_stray()) {
12546 stray_manager.eval_stray(dn, delay);
12547 }
12548}
12549
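// For a stray directory inode: drop the dentries of every auth, non-frozen
// dirfrag and, if the inode has no snaprealm, clear its dirty rstat and
// scatter-dirty state as well.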
31f18b77
FG
12550void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12551 dout(10) << __func__ << " " << *diri << dendl;
12552 assert(diri->get_projected_parent_dir()->inode->is_stray());
12553 list<CDir*> ls;
12554 diri->get_dirfrags(ls);
94b18763 12555 for (auto &p : ls) {
31f18b77
FG
12556 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12557 p->try_remove_dentries_for_stray();
12558 }
12559 if (!diri->snaprealm) {
12560 if (diri->is_auth())
12561 diri->clear_dirty_rstat();
12562 diri->clear_scatter_dirty();
12563 }
12564}
12565