// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <errno.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <map>

#include "MDCache.h"
#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDLog.h"
#include "MDBalancer.h"
#include "Migrator.h"
#include "ScrubStack.h"

#include "SnapClient.h"

#include "MDSMap.h"

#include "CInode.h"
#include "CDir.h"

#include "Mutation.h"

#include "include/ceph_fs.h"
#include "include/filepath.h"
#include "include/util.h"

#include "msg/Message.h"
#include "msg/Messenger.h"

#include "common/MemoryModel.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/safe_io.h"

#include "osdc/Journaler.h"
#include "osdc/Filer.h"

#include "events/ESubtreeMap.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/EImportFinish.h"
#include "events/EFragment.h"
#include "events/ECommitted.h"
#include "events/ESessions.h"

#include "messages/MGenericMessage.h"

#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSCacheRejoin.h"

#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"

//#include "messages/MInodeUpdate.h"
#include "messages/MDirUpdate.h"
#include "messages/MCacheExpire.h"

#include "messages/MInodeFileCaps.h"

#include "messages/MLock.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"

#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"

#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"

#include "messages/MClientRequest.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"
#include "messages/MClientQuota.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MMDSFragmentNotify.h"

#include "messages/MGatherCaps.h"

#include "InoTable.h"

#include "common/Timer.h"

#include "perfglue/heap_profiler.h"

using namespace std;

#include "common/config.h"
#include "include/assert.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, mds)
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}

set<int> SimpleLock::empty_gather_set;


/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSInternalContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};

/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
};

class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};

MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  filer(m->objecter, m->finisher),
  exceeded_size_limit(false),
  recovery_queue(m),
  stray_manager(m, purge_queue_)
{
  migrator.reset(new Migrator(mds, this));
  root = NULL;
  myin = NULL;
  readonly = false;

  stray_index = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    strays[i] = NULL;
  }

  num_inodes_with_caps = 0;

  max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
    (g_conf->mds_dir_max_commit_size << 20) :
    (0.9 *(g_conf->osd_max_write_size << 20));

  discover_last_tid = 0;
  open_ino_last_tid = 0;
  find_ino_peer_last_tid = 0;

  last_cap_id = 0;

  client_lease_durations[0] = 5.0;
  client_lease_durations[1] = 30.0;
  client_lease_durations[2] = 300.0;

  resolves_pending = false;
  rejoins_pending = false;
  cap_imports_num_opening = 0;

  opening_root = open = false;
  lru.lru_set_midpoint(cache_mid());

  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf->mds_decay_halflife);

  did_shutdown_log_cap = false;
}

MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}



void MDCache::log_stat()
{
  mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
}


//

bool MDCache::shutdown()
{
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
    //show_cache();
    show_subtrees();
    //dump();
  }
  return true;
}


// ====================================================================
// some inode functions

void MDCache::add_inode(CInode *in)
{
  // add to lru, inode map
  assert(inode_map.count(in->vino()) == 0); // should be no dup inos!
  inode_map[ in->vino() ] = in;

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == MDS_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }

  if (cache_toofull()) {
    exceeded_size_limit = true;
  }
}

void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
  }

  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  // remove from inode map
  inode_map.erase(o->vino());

  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  assert(o->get_num_ref() == 0);
  delete o;
}

file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_first_data_pool();
  return result;
}

file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_metadata_pool();
  if (g_conf->mds_log_segment_size > 0) {
    result.object_size = g_conf->mds_log_segment_size;
    result.stripe_unit = g_conf->mds_log_segment_size;
  }
  return result;
}

void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}

void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;
  in->inode.size = 0;
  in->inode.ctime =
    in->inode.mtime =
    in->inode.btime = ceph_clock_now();
  in->inode.nlink = 1;
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
    ++in->inode.rstat.rsubdirs;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm(); // empty snaprealm
    assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}

CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}

CInode *MDCache::create_root_inode()
{
  CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
  i->inode.uid = g_conf->mds_root_ino_uid;
  i->inode.gid = g_conf->mds_root_ino_gid;
  i->inode.layout = default_file_layout;
  i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
  return i;
}

void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL; //NONE;

  rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
  rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;

  root->inode.dirstat = rootdir->fnode.fragstat;
  root->inode.rstat = rootdir->fnode.rstat;
  ++root->inode.rstat.rsubdirs;
  root->inode.accounted_rstat = root->inode.rstat;

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->store(gather->new_sub());
}

void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->_mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}

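// Log-commit callback for _create_system_file(): runs once the EUpdate has
// been journaled and applies the projected dentry/inode state.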
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;
  CDentry *dn;
  version_t dpv;
  MDSInternalContextBase *fin;
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};

void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}

void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->inode.version--;
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    assert(dir);
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}


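// Re-enters open_root() once an asynchronous fetch completes; on error the
// rank is marked damaged rather than calling suicide() from a Finisher
// callback.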
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention. Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort(); // damaged should never return
    } else {
      cache->open_root();
    }
  }
};

void MDCache::open_root_inode(MDSInternalContextBase *c)
{
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    CInode *in;
    in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
    in->fetch(c);
  } else {
    discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
  }
}

void MDCache::open_mydir_inode(MDSInternalContextBase *c)
{
  MDSGatherBuilder gather(g_ceph_context);

  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
  in->fetch(gather.new_sub());

  gather.set_finisher(c);
  gather.activate();
}

void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}

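// Bring mydir and the per-rank stray directories fully into cache, creating
// any that are missing; re-enters via C_MDS_RetryOpenRoot until everything is
// loaded, then marks the cache open and kicks off the stray scan.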
void MDCache::populate_mydir()
{
  assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it. Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }
    assert(straydn);
    assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    list<frag_t> ls;
    strays[i]->dirfragtree.get_leaves(ls);
    for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
      frag_t fg = *p;
      CDir *dir = strays[i]->get_dirfrag(fg);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, fg);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  stray_manager.set_num_strays(num_strays);

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  scan_stray_dir();
}

void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}

CDir *MDCache::get_stray_dir(CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CInode *strayi = get_stray();
  assert(strayi);
  frag_t fg = strayi->pick_dirfrag(straydname);
  CDir *straydir = strayi->get_dirfrag(fg);
  assert(straydir);
  return straydir;
}

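// Look up this inode's dentry in the appropriate stray dirfrag, adding a
// fresh null dentry (marked STATE_STRAY) if one does not exist yet.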
CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
{
  CDir *straydir = get_stray_dir(in);
  string straydname;
  in->name_stray_dentry(straydname);
  CDentry *straydn = straydir->lookup(straydname);
  if (!straydn) {
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
  } else {
    assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  return straydn;
}



MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}




// ====================================================================
// subtree management

void MDCache::list_subtrees(list<CDir*>& ls)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    ls.push_back(p->first);
}

/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  show_subtrees();

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir; // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir); // subtree root
  }
  assert(root);
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir]; // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}

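// Try to fold this subtree root and each of its former bounds back into the
// surrounding subtree; any auth subtree roots touched are re-evaluated for
// lock state unless we are still replaying or resolving.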
void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  assert(subtrees.count(dir));
  set<CDir*> oldbounds = subtrees[dir];

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for (auto in : to_eval)
      eval_subtree_root(in);
  }
}

class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};

void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;
  assert(subtrees.count(dir));

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir && // we have a parent,
      parent->dir_auth == dir->dir_auth && // auth matches,
      dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous,
      !dir->state_test(CDir::STATE_EXPORTBOUND) && // not an exportbound,
      !dir->state_test(CDir::STATE_AUXSUBTREE)) { // not aux subtree
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    for (set<CDir*>::iterator p = subtrees[dir].begin();
         p != subtrees[dir].end();
         ++p)
      subtrees[parent].insert(*p);

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(dir);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}

void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->auth_unpin(this);
}

void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  // (we should scatter the filelock on subtree bounds)
  assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}


void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir; // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir); // subtree root
  }
  assert(root);
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir]; // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bound = *p;

    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
      }
      else {
        dout(10) << " want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          assert(t != dir);
          dout(10) << " new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << " swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << " swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for (auto in : to_eval)
      eval_subtree_root(in);
  }
}


/*
 * return a set of CDir*'s that correspond to the given bound set. Only adjust
 * fragmentation as necessary to get an equivalent bounding set. That is, only
 * split if one of our frags spans the provided bounding set. Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    byino[p->ino].insert(p->frag);
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
      frag_t fg = *q;
      list<frag_t> fgls;
      diri->dirfragtree.get_leaves_under(fg, fgls);
      if (fgls.empty()) {
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        list<frag_t> ls;
        tmpdft.get_leaves_under(approx_fg, ls);
        for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
          if (p->second.get().count(*r) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, *r);
            all = false;
          }
        }
        if (all)
          fgls.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, fgls);
      }
      dout(10) << " frag " << fg << " contains " << fgls << dendl;
      for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
        CDir *dir = diri->get_dirfrag(*r);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}

void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}

void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) << "map_dirfrag_set " << dfs << dendl;

  // group by inode
  map<inodeno_t, fragset_t> ino_fragset;
  for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    ino_fragset[p->ino].insert(p->frag);

  // get frags
  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
       p != ino_fragset.end();
       ++p) {
    CInode *in = get_inode(p->first);
    if (!in)
      continue;

    list<frag_t> fglist;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      in->dirfragtree.get_leaves_under(*q, fglist);

    dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
             << " on " << *in << dendl;

    for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
      CDir *dir = in->get_dirfrag(*q);
      if (dir)
        result.insert(dir);
    }
  }
}



CDir *MDCache::get_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_parent_dir();
    if (!dir)
      return 0; // none
  }
}

CDir *MDCache::get_projected_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_projected_parent_dir();
    if (!dir)
      return 0; // none
  }
}

void MDCache::remove_subtree(CDir *dir)
{
  dout(10) << "remove_subtree " << *dir << dendl;
  assert(subtrees.count(dir));
  assert(subtrees[dir].empty());
  subtrees.erase(dir);
  dir->put(CDir::PIN_SUBTREE);
  if (dir->get_parent_dir()) {
    CDir *p = get_subtree_root(dir->get_parent_dir());
    assert(subtrees[p].count(dir));
    subtrees[p].erase(dir);
  }
}

void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  assert(subtrees.count(dir));
  bounds = subtrees[dir];
}

void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  if (subtrees.count(dir)) {
    // just copy them, dir is a subtree.
    get_subtree_bounds(dir, bounds);
  } else {
    // find them
    CDir *root = get_subtree_root(dir);
    for (set<CDir*>::iterator p = subtrees[root].begin();
         p != subtrees[root].end();
         ++p) {
      CDir *t = *p;
      while (t != root) {
        t = t->get_parent_dir();
        assert(t);
        if (t == dir) {
          bounds.insert(*p);
          continue;
        }
      }
    }
  }
}

void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));
  if (bounds != subtrees[dir]) {
    dout(0) << "verify_subtree_bounds failed" << dendl;
    set<CDir*> b = bounds;
    for (auto &cd : subtrees[dir]) {
      if (bounds.count(cd)) {
        b.erase(cd);
        continue;
      }
      dout(0) << " missing bound " << *cd << dendl;
    }
    for (const auto &cd : b)
      dout(0) << " extra bound " << *cd << dendl;
  }
  assert(bounds == subtrees[dir]);
}

void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));

  // make sure that any bounds i do have are properly noted as such.
  int failed = 0;
  for (const auto &fg : bounds) {
    CDir *bd = get_dirfrag(fg);
    if (!bd) continue;
    if (subtrees[dir].count(bd) == 0) {
      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
      failed++;
    }
  }
  assert(failed == 0);
}

void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
{
  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
           << " to " << *newdir << dendl;
  projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
}
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  //show_subtrees();

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    assert(p != projected_subtree_renames.end());
    assert(!p->second.empty());
    assert(p->second.front().first == olddir);
    assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust subtree
  list<CDir*> dfls;
  // make sure subtree dirfrags are at the front of the list
  diri->get_subtree_dirfrags(dfls);
  diri->get_nested_dirfrags(dfls);
  for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
    CDir *dir = *p;

    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
      continue;
    }

    if (dir->is_subtree_root()) {
      // children are fine. change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      assert(subtrees[oldparent].count(dir));
      subtrees[oldparent].erase(dir);
      assert(subtrees.count(newparent));
      subtrees[newparent].insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      list<CDir*> tomove;
      for (set<CDir*>::iterator p = subtrees[oldparent].begin();
           p != subtrees[oldparent].end();
           ++p) {
        CDir *bound = *p;
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
        CDir *bound = *p;
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        subtrees[oldparent].erase(bound);
        subtrees[newparent].insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority());
        // caller is responsible for 'eval diri'
        try_subtree_merge_at(dir, NULL);
      }
    }
  }

  show_subtrees();
}


void MDCache::get_fullauth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      s.insert(root);
  }
}
void MDCache::get_auth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_auth())
      s.insert(root);
  }
}


// count.

int MDCache::num_subtrees()
{
  return subtrees.size();
}

int MDCache::num_subtrees_fullauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      n++;
  }
  return n;
}

int MDCache::num_subtrees_fullnonauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_nonauth())
      n++;
  }
  return n;
}



// ===================================
// journal and snap/cow helpers


/*
 * find first inode in cache that follows given snapid. otherwise, return current.
 */
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
  assert(in->last == CEPH_NOSNAP);

  SnapRealm *realm = in->find_snaprealm();
  const set<snapid_t>& snaps = realm->get_snaps();
  dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
  dout(10) << " snaps " << snaps << dendl;

  if (snaps.empty())
    return in;

  for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows); // first item > follows
       p != snaps.end();
       ++p) {
    CInode *t = get_inode(in->ino(), *p);
    if (t) {
      in = t;
      dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl;
      break;
    }
  }
  return in;
}


/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items. instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  assert(last >= in->first);

  SnapRealm *realm = in->find_snaprealm();
  const set<snapid_t>& snaps = realm->get_snaps();

  // make sure snap inode's last match existing snapshots.
  // MDCache::pick_inode_snap() requires this.
  snapid_t last_snap = last;
  if (snaps.count(last) == 0) {
    set<snapid_t>::const_iterator p = snaps.upper_bound(last);
    if (p != snaps.begin()) {
      --p;
      if (*p >= in->first)
        last_snap = *p;
    }
  }

  CInode *oldin = new CInode(this, true, in->first, last_snap);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->symlink = in->symlink;
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    CInode *head_in = get_inode(in->ino());
    assert(head_in);
    if (head_in->split_need_snapflush(oldin, in)) {
      oldin->client_snap_caps = in->client_snap_caps;
      for (compact_map<int,set<client_t> >::iterator p = in->client_snap_caps.begin();
           p != in->client_snap_caps.end();
           ++p) {
        SimpleLock *lock = oldin->get_lock(p->first);
        assert(lock);
        for (auto q = p->second.begin(); q != p->second.end(); ++q) {
          oldin->auth_pin(lock);
          lock->set_state(LOCK_SNAP_SYNC); // gathering
          lock->get_wrlock(true);
        }
      }
    }
    return oldin;
  }

  // clone caps?
  for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
       p != in->client_caps.end();
       ++p) {
    client_t client = p->first;
    Capability *cap = p->second;
    int issued = cap->issued();
    if ((issued & CEPH_CAP_ANY_WR) &&
        cap->client_follows < last) {
      // note in oldin
      for (int i = 0; i < num_cinode_locks; i++) {
        if (issued & cinode_lock_info[i].wr_caps) {
          int lockid = cinode_lock_info[i].lock;
          SimpleLock *lock = oldin->get_lock(lockid);
          assert(lock);
          oldin->client_snap_caps[lockid].insert(client);
          oldin->auth_pin(lock);
          lock->set_state(LOCK_SNAP_SYNC); // gathering
          lock->get_wrlock(true);
          dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
                   << " wrlock lock " << *lock << " on " << *oldin << dendl;
        }
      }
      cap->client_follows = last;

      // we need snapflushes for any intervening snaps
      dout(10) << " snaps " << snaps << dendl;
      for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
           q != snaps.end() && *q <= last;
           ++q) {
        in->add_need_snapflush(oldin, *q, client);
      }
    } else {
      dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
    }
  }

  return oldin;
}

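// Journal the copy-on-write of a dentry (and, for a primary link, its inode)
// before it is changed past 'follows'; does nothing if the affected
// [first, follows] range contains no snapshots.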
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  assert(!dnl->is_null());

  if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
    // multiversion inode.
    CInode *in = dnl->get_inode();
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = realm->get_newest_snap();

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          olddn->pre_dirty();
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here? hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, false);
        }
      }

      if (in->snaprealm) {
        realm = in->snaprealm;
        follows = realm->get_newest_seq();
      } else
        follows = dir_follows;
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP)
        follows = realm->get_newest_seq();
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, false);

  } else {
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP)
      follows = realm->get_newest_seq();

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << " dn " << *dn << dendl;
    if (in) {
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush)
        mut->ls->open_files.push_back(&oldin->item_open_file);
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}


void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
                                CInode *in, snapid_t follows,
                                CInode **pcow_inode)
{
  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
  CDentry *dn = in->get_projected_parent_dn();
  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
}

void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  if (in->is_base()) {
    metablob->add_root(true, in, in->get_projected_inode());
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows = in->first - 1;
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}



// nested ---------------------------------------------------------------

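// Fold this inode's (projected) rstat delta into the parent dirfrag's
// projected fnode, splitting the update per snapshot range; old_inode rstats
// are only propagated when mds_snap_rstat is enabled.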
void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  inode_t *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME. this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename). but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // the rename source inode is not projected in the slave rename prep
      // case, so we should avoid updating the inode here.
1774 assert(linkunlink < 0);
1775 assert(cur->is_frozen_inode());
1776 update = false;
1777 }
1778 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1779 linkunlink, update);
1780 }
1781
1782 if (g_conf->mds_snap_rstat) {
1783 for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
1784 p != cur->dirty_old_rstats.end();
1785 ++p) {
1786 old_inode_t& old = cur->old_inodes[*p];
1787 snapid_t ofirst = MAX(old.first, floor);
1788 set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
1789 if (q == snaps.end() || *q > *p)
1790 continue;
1791 if (*p >= floor)
1792 _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
1793 }
1794 }
1795 cur->dirty_old_rstats.clear();
1796}
1797
1798
1799void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
1800 CDir *parent, int linkunlink, bool update_inode)
1801{
1802 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1803 dout(20) << " inode rstat " << inode.rstat << dendl;
1804 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1805 nest_info_t delta;
1806 if (linkunlink == 0) {
1807 delta.add(inode.rstat);
1808 delta.sub(inode.accounted_rstat);
1809 } else if (linkunlink < 0) {
1810 delta.sub(inode.accounted_rstat);
1811 } else {
1812 delta.add(inode.rstat);
1813 }
1814 dout(20) << " delta " << delta << dendl;
1815
1816 if (update_inode)
1817 inode.accounted_rstat = inode.rstat;
1818
1819 while (last >= ofirst) {
1820 /*
1821 * pick fnode version to update. at each iteration, we want to
1822 * pick a segment ending in 'last' to update. split as necessary
1823 * to make that work. then, adjust first up so that we only
1824 * update one segment at a time. then loop to cover the whole
1825 * [ofirst,last] interval.
1826 */
1827 nest_info_t *prstat;
1828 snapid_t first;
1829 fnode_t *pf = parent->get_projected_fnode();
1830 if (last == CEPH_NOSNAP) {
1831 if (g_conf->mds_snap_rstat)
1832 first = MAX(ofirst, parent->first);
1833 else
1834 first = parent->first;
1835 prstat = &pf->rstat;
1836 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1837
1838 if (first > parent->first &&
1839 !(pf->rstat == pf->accounted_rstat)) {
1840 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1841 << parent->first << "," << (first-1) << "] "
1842 << " " << *prstat << "/" << pf->accounted_rstat
1843 << dendl;
1844 parent->dirty_old_rstat[first-1].first = parent->first;
1845 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1846 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1847 }
1848 parent->first = first;
1849 } else if (!g_conf->mds_snap_rstat) {
1850 // drop snapshots' rstats
1851 break;
1852 } else if (last >= parent->first) {
1853 first = parent->first;
1854 parent->dirty_old_rstat[last].first = first;
1855 parent->dirty_old_rstat[last].rstat = pf->rstat;
1856 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1857 prstat = &parent->dirty_old_rstat[last].rstat;
1858 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1859 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1860 } else {
1861 // be careful, dirty_old_rstat is a _sparse_ map.
1862 // sorry, this is ugly.
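// roughly three cases follow: no entry ends at or after 'last' (just start
// our new segment after the newest existing one if they would overlap); an
// entry straddles [first,last] (split off its left/right pieces that fall
// outside the interval); or the nearest such entry lies entirely to the
// right (keep 'first' clear of anything overlapping the low end).  all three
// paths finish with a fresh dirty_old_rstat[last] segment that takes the
// delta below.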
1863 first = ofirst;
1864
1865 // find any intersection with last
1866 compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
1867 if (p == parent->dirty_old_rstat.end()) {
1868 dout(20) << " no dirty_old_rstat with last >= " << last << dendl;
1869 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1870 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1871 first = parent->dirty_old_rstat.rbegin()->first+1;
1872 }
1873 } else {
1874 // *p last is >= last
1875 if (p->second.first <= last) {
1876 // *p intersects [first,last]
1877 if (p->second.first < first) {
1878 dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
1879 parent->dirty_old_rstat[first-1] = p->second;
1880 p->second.first = first;
1881 }
1882 if (p->second.first > first)
1883 first = p->second.first;
1884 if (last < p->first) {
1885 dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
1886 parent->dirty_old_rstat[last] = p->second;
1887 p->second.first = last+1;
1888 }
1889 } else {
1890 // *p is to the _right_ of [first,last]
1891 p = parent->dirty_old_rstat.lower_bound(first);
1892 // new *p last is >= first
1893 if (p->second.first <= last && // new *p isn't also to the right, and
1894 p->first >= first) { // it intersects our first bit,
1895 dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
1896 first = p->first+1;
1897 }
1898 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1899 }
1900 }
1901 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1902 parent->dirty_old_rstat[last].first = first;
1903 prstat = &parent->dirty_old_rstat[last].rstat;
1904 }
1905
1906 // apply
1907 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1908 assert(last >= first);
1909 prstat->add(delta);
1910 if (update_inode)
1911 inode.accounted_rstat = inode.rstat;
1912 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1913
1914 last = first-1;
1915 }
1916}
1917
1918void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1919 snapid_t ofirst, snapid_t last,
1920 CInode *pin, bool cow_head)
1921{
1922 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1923 dout(20) << " frag rstat " << rstat << dendl;
1924 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1925 nest_info_t delta = rstat;
1926 delta.sub(accounted_rstat);
1927 dout(20) << " delta " << delta << dendl;
1928
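// walk [ofirst,last] newest-to-oldest: each pass picks either the projected
// head inode (when last == pin->last) or the old_inode covering 'last',
// splitting old_inodes as needed so the touched segment lies entirely inside
// [ofirst,last], adds the delta to its rstat, then steps down to the next
// older segment.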
1929 while (last >= ofirst) {
1930 inode_t *pi;
1931 snapid_t first;
1932 if (last == pin->last) {
1933 pi = pin->get_projected_inode();
1934 first = MAX(ofirst, pin->first);
1935 if (first > pin->first) {
1936 old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
1937 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1938 }
1939 } else {
1940 if (last >= pin->first) {
1941 first = pin->first;
1942 pin->cow_old_inode(last, cow_head);
1943 } else {
1944 // our life is easier here because old_inodes is not sparse
1945 // (although it may not begin at snapid 1)
1946 compact_map<snapid_t,old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
1947 if (p == pin->old_inodes.end()) {
1948 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1949 break;
1950 }
1951 first = p->second.first;
1952 if (first > last) {
1953 dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
1954 //assert(p == pin->old_inodes.begin());
1955 break;
1956 }
1957 if (p->first > last) {
1958 dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
1959 << (last+1) << "," << p->first << "]" << dendl;
1960 pin->old_inodes[last] = p->second;
1961 p->second.first = last+1;
1962 pin->dirty_old_rstats.insert(p->first);
1963 }
1964 }
1965 if (first < ofirst) {
1966 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1967 << first << "," << ofirst-1 << "]" << dendl;
1968 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1969 pin->dirty_old_rstats.insert(ofirst-1);
1970 pin->old_inodes[last].first = first = ofirst;
1971 }
1972 pi = &pin->old_inodes[last].inode;
1973 pin->dirty_old_rstats.insert(last);
1974 }
1975 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1976 pi->rstat.add(delta);
1977 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1978
1979 last = first-1;
1980 }
1981}
1982
1983void MDCache::broadcast_quota_to_client(CInode *in)
1984{
1985 if (!in->is_auth() || in->is_frozen())
1986 return;
1987
1988 inode_t *i = in->get_projected_inode();
1989
1990 if (!i->quota.is_enable())
1991 return;
1992
1993 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
1994 it != in->client_caps.end();
1995 ++it) {
1996 Session *session = mds->get_session(it->first);
1997 if (!session || !session->connection ||
1998 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
1999 continue;
2000
2001 Capability *cap = it->second;
2002 if (cap->last_rbytes == i->rstat.rbytes &&
2003 cap->last_rsize == i->rstat.rsize())
2004 continue;
2005
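// decide whether this client needs a fresh MClientQuota.  roughly: notify
// once usage reaches the file limit or comes within 1/8 of the byte limit,
// and otherwise only when the change since the last notification exceeds
// about 1/16 of the remaining headroom.  e.g. with max_bytes = 1 GiB the
// client starts getting updates once rbytes crosses ~896 MiB.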
2006 if (i->quota.max_files > 0) {
2007 if (i->rstat.rsize() >= i->quota.max_files)
2008 goto update;
2009
2010 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2011 abs(cap->last_rsize - i->rstat.rsize()))
2012 goto update;
2013 }
2014
2015 if (i->quota.max_bytes > 0) {
2016 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2017 goto update;
2018
2019 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2020 abs(cap->last_rbytes - i->rstat.rbytes))
2021 goto update;
2022 }
2023
2024 continue;
2025
2026update:
2027 cap->last_rsize = i->rstat.rsize();
2028 cap->last_rbytes = i->rstat.rbytes;
2029
2030 MClientQuota *msg = new MClientQuota();
2031 msg->ino = in->ino();
2032 msg->rstat = i->rstat;
2033 msg->quota = i->quota;
2034 mds->send_message_client_counted(msg, session->connection);
2035 }
181888fb 2036 for (const auto &it : in->get_replicas()) {
7c673cae
FG
2037 MGatherCaps *msg = new MGatherCaps;
2038 msg->ino = in->ino();
181888fb 2039 mds->send_message_mds(msg, it.first);
7c673cae
FG
2040 }
2041}
2042
2043/*
2044 * NOTE: we _have_ to delay the scatter if we are called during a
2045 * rejoin, because we can't twiddle locks between when the
2046 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2047 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2048 * (no requests), and a survivor acks immediately. _except_ that
2049 * during rejoin_(weak|strong) processing, we may complete a lock
2050 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2051 * scatterlock state in that case or the lock states will get out of
2052 * sync between the auth and replica.
2053 *
2054 * the simple solution is to never do the scatter here. instead, put
2055 * the scatterlock on a list if it isn't already wrlockable. this is
2056 * probably the best plan anyway, since we avoid too many
2057 * scatters/locks under normal usage.
2058 */
2059/*
2060 * some notes on dirlock/nestlock scatterlock semantics:
2061 *
2062 * the fragstat (dirlock) will never be updated without
2063 * dirlock+nestlock wrlock held by the caller.
2064 *
2065 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2066 * data is pushed up the tree. this could be changed with some
2067 * restructuring here, but in its current form we ensure that the
2068 * fragstat+rstat _always_ reflect an accurate summation over the dir
2069 * frag, which is nice. and, we only need to track frags that need to
2070 * be nudged (and not inodes with pending rstat changes that need to
2071 * be pushed into the frag). a consequence of this is that the
2072 * accounted_rstat on scatterlock sync may not match our current
2073 * rstat. this is normal and expected.
2074 */
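/*
 * predirty_journal_parents() below is the entry point for the above: starting
 * at the dirtied inode it walks up the projected parent chain, folding each
 * inode's rstat delta into the parent dirfrag's fnode and each dirfrag's
 * fragstat/rstat into the parent inode, journaling the projected parents as
 * it goes.  the walk stops at a base inode, at a non-auth or unauthpinnable
 * ancestor, when the nestlock can't be wrlocked without initiating a scatter,
 * or when mds_dirstat_min_interval hasn't elapsed yet.
 */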
2075void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2076 CInode *in, CDir *parent,
2077 int flags, int linkunlink,
2078 snapid_t cfollows)
2079{
2080 bool primary_dn = flags & PREDIRTY_PRIMARY;
2081 bool do_parent_mtime = flags & PREDIRTY_DIR;
2082 bool shallow = flags & PREDIRTY_SHALLOW;
2083
2084 assert(mds->mdlog->entry_is_open());
2085
2086 // make sure stamp is set
2087 if (mut->get_mds_stamp() == utime_t())
2088 mut->set_mds_stamp(ceph_clock_now());
2089
2090 if (in->is_base())
2091 return;
2092
2093 dout(10) << "predirty_journal_parents"
2094 << (do_parent_mtime ? " do_parent_mtime":"")
2095 << " linkunlink=" << linkunlink
2096 << (primary_dn ? " primary_dn":" remote_dn")
2097 << (shallow ? " SHALLOW":"")
2098 << " follows " << cfollows
2099 << " " << *in << dendl;
2100
2101 if (!parent) {
2102 assert(primary_dn);
2103 parent = in->get_projected_parent_dn()->get_dir();
2104 }
2105
2106 if (flags == 0 && linkunlink == 0) {
2107 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2108 blob->add_dir_context(parent);
2109 return;
2110 }
2111
2112 // build list of inodes to wrlock, dirty, and update
2113 list<CInode*> lsi;
2114 CInode *cur = in;
2115 CDentry *parentdn = NULL;
2116 bool first = true;
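// each pass of this loop handles one inode -> parent-dirfrag link: project
// the parent fnode, apply any fragstat/mtime change, push the inode's rstat
// delta into the dirfrag, then (unless one of the stop conditions hits)
// project the parent inode itself, fold the dirfrag stats into it, and keep
// climbing with cur = parent inode.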
2117 while (parent) {
2118 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2119 assert(parent->is_auth());
2120
2121 // opportunistically adjust parent dirfrag
2122 CInode *pin = parent->get_inode();
2123
2124 // inode -> dirfrag
2125 mut->auth_pin(parent);
2126 mut->add_projected_fnode(parent);
2127
2128 fnode_t *pf = parent->project_fnode();
2129 pf->version = parent->pre_dirty();
2130
2131 if (do_parent_mtime || linkunlink) {
2132 assert(mut->wrlocks.count(&pin->filelock));
2133 assert(mut->wrlocks.count(&pin->nestlock));
2134 assert(cfollows == CEPH_NOSNAP);
2135
2136 // update stale fragstat/rstat?
2137 parent->resync_accounted_fragstat();
2138 parent->resync_accounted_rstat();
2139
2140 if (do_parent_mtime) {
2141 pf->fragstat.mtime = mut->get_op_stamp();
2142 pf->fragstat.change_attr++;
2143 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2144 if (pf->fragstat.mtime > pf->rstat.rctime) {
2145 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2146 pf->rstat.rctime = pf->fragstat.mtime;
2147 } else {
2148 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2149 }
2150 }
2151 if (linkunlink) {
2152 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2153 if (in->is_dir()) {
2154 pf->fragstat.nsubdirs += linkunlink;
2155 //pf->rstat.rsubdirs += linkunlink;
2156 } else {
2157 pf->fragstat.nfiles += linkunlink;
2158 //pf->rstat.rfiles += linkunlink;
2159 }
2160 }
2161 }
2162
2163 // rstat
2164 if (!primary_dn) {
2165 // don't update parent this pass
2166 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2167 pin->versionlock.can_wrlock())) {
2168 dout(20) << " unwritable parent nestlock " << pin->nestlock
2169 << ", marking dirty rstat on " << *cur << dendl;
2170 cur->mark_dirty_rstat();
2171 } else {
2172 // if we don't hold a wrlock reference on this nestlock, take one,
2173 // because we are about to write into the dirfrag fnode and that needs
2174 // to commit before the lock can cycle.
2175 if (linkunlink) {
2176 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2177 }
2178
2179 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2180 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2181 mds->locker->wrlock_force(&pin->nestlock, mut);
2182 }
2183
2184 // now we can project the inode's rstat diff into the dirfrag
2185 SnapRealm *prealm = pin->find_snaprealm();
2186
2187 snapid_t follows = cfollows;
2188 if (follows == CEPH_NOSNAP)
2189 follows = prealm->get_newest_seq();
2190
2191 snapid_t first = follows+1;
2192
2193 // first, if the frag is stale, bring it back in sync.
2194 parent->resync_accounted_rstat();
2195
2196 // now push inode rstats into frag
2197 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2198 cur->clear_dirty_rstat();
2199 }
2200
2201 bool stop = false;
2202 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2203 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2204 stop = true;
2205 }
2206
2207 // delay propagating until later?
2208 if (!stop && !first &&
2209 g_conf->mds_dirstat_min_interval > 0) {
2210 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2211 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2212 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2213 << " < " << g_conf->mds_dirstat_min_interval
2214 << ", stopping" << dendl;
2215 stop = true;
2216 } else {
2217 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2218 }
2219 }
2220
2221 // can cast only because i'm passing nowait=true in the sole user
2222 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2223 if (!stop &&
2224 mut->wrlocks.count(&pin->nestlock) == 0 &&
2225 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2226 //true
2227 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2228 )) { // ** do not initiate.. see above comment **
2229 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2230 << " on " << *pin << dendl;
2231 stop = true;
2232 }
2233 if (stop) {
2234 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2235 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2236 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2237 mut->add_updated_lock(&pin->nestlock);
2238 if (do_parent_mtime || linkunlink) {
2239 mds->locker->mark_updated_scatterlock(&pin->filelock);
2240 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2241 mut->add_updated_lock(&pin->filelock);
2242 }
2243 break;
2244 }
2245 if (!mut->wrlocks.count(&pin->versionlock))
2246 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2247
2248 assert(mut->wrlocks.count(&pin->nestlock) ||
2249 mut->is_slave());
2250
2251 pin->last_dirstat_prop = mut->get_mds_stamp();
2252
2253 // dirfrag -> diri
2254 mut->auth_pin(pin);
2255 mut->add_projected_inode(pin);
2256 lsi.push_front(pin);
2257
2258 pin->pre_cow_old_inode(); // avoid cow mayhem!
2259
2260 inode_t *pi = pin->project_inode();
2261 pi->version = pin->pre_dirty();
2262
2263 // dirstat
2264 if (do_parent_mtime || linkunlink) {
2265 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2266 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2267 bool touched_mtime = false, touched_chattr = false;
2268 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2269 pf->accounted_fragstat = pf->fragstat;
2270 if (touched_mtime)
2271 pi->mtime = pi->ctime = pi->dirstat.mtime;
2272 if (touched_chattr)
2273 pi->change_attr = pi->dirstat.change_attr;
2274 dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
2275
2276 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2277 if (pi->dirstat.size() < 0)
2278 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
2279 if (pi->dirstat.size() != pf->fragstat.size()) {
2280 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2281 << parent->dirfrag() << ", inode has " << pi->dirstat
2282 << ", dirfrag has " << pf->fragstat;
2283
2284 // trust the dirfrag for now
2285 pi->dirstat = pf->fragstat;
2286
2287 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2288 }
2289 }
2290 }
2291
2292 /*
2293 * the rule here is to follow the _oldest_ parent with dirty rstat
2294 * data. if we don't propagate all data, we add ourselves to the
2295 * nudge list. that way all rstat data will (eventually) get
2296 * pushed up the tree.
2297 *
2298 * actually, no. for now, silently drop rstats for old parents. we need
2299 * hard link backpointers to do the above properly.
2300 */
2301
2302 // stop?
2303 if (pin->is_base())
2304 break;
2305 parentdn = pin->get_projected_parent_dn();
2306 assert(parentdn);
2307
2308 // rstat
2309 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2310
2311 // first, if the frag is stale, bring it back in sync.
2312 parent->resync_accounted_rstat();
2313
2314 if (g_conf->mds_snap_rstat) {
2315 for (compact_map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
2316 p != parent->dirty_old_rstat.end();
2317 ++p)
2318 project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first,
2319 p->first, pin, true);//false);
2320 }
2321 parent->dirty_old_rstat.clear();
2322 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2323
2324 pf->accounted_rstat = pf->rstat;
2325
2326 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2327 if (pi->rstat.rbytes != pf->rstat.rbytes) {
2328 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2329 << parent->dirfrag() << ", inode has " << pi->rstat
2330 << ", dirfrag has " << pf->rstat;
2331
2332 // trust the dirfrag for now
2333 pi->rstat = pf->rstat;
2334
2335 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2336 }
2337 }
2338
2339 parent->check_rstats();
2340 broadcast_quota_to_client(pin);
2341 // next parent!
2342 cur = pin;
2343 parent = parentdn->get_dir();
2344 linkunlink = 0;
2345 do_parent_mtime = false;
2346 primary_dn = true;
2347 first = false;
2348 }
2349
2350 // now, stick it in the blob
2351 assert(parent);
2352 assert(parent->is_auth());
2353 blob->add_dir_context(parent);
2354 blob->add_dir(parent, true);
2355 for (list<CInode*>::iterator p = lsi.begin();
2356 p != lsi.end();
2357 ++p) {
2358 CInode *cur = *p;
2359 journal_dirty_inode(mut.get(), blob, cur);
2360 }
2361
2362}
2363
2364
2365
2366
2367
2368// ===================================
2369// slave requests
2370
2371
2372/*
2373 * some handlers for master requests with slaves. we need to make
2374 * sure slaves journal commits before we forget we mastered them and
2375 * remove them from the uncommitted_masters map (used during recovery
2376 * to commit|abort slaves).
2377 */
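/*
 * in short (as implemented below): committed_master_slave() erases each slave
 * from the umaster entry as its OP_COMMITTED ack arrives; once the set is
 * empty, log_master_commit() journals an ECommitted event and
 * _logged_master_commit() finally drops the entry and wakes any waiters.
 */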
2378struct C_MDC_CommittedMaster : public MDCacheLogContext {
2379 metareqid_t reqid;
2380 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2381 void finish(int r) override {
2382 mdcache->_logged_master_commit(reqid);
2383 }
2384};
2385
2386void MDCache::log_master_commit(metareqid_t reqid)
2387{
2388 dout(10) << "log_master_commit " << reqid << dendl;
2389 uncommitted_masters[reqid].committing = true;
2390 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2391 new C_MDC_CommittedMaster(this, reqid));
2392}
2393
2394void MDCache::_logged_master_commit(metareqid_t reqid)
2395{
2396 dout(10) << "_logged_master_commit " << reqid << dendl;
2397 assert(uncommitted_masters.count(reqid));
2398 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2399 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2400 uncommitted_masters.erase(reqid);
2401}
2402
2403// while active...
2404
2405void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2406{
2407 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2408 assert(uncommitted_masters.count(r));
2409 uncommitted_masters[r].slaves.erase(from);
2410 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2411 log_master_commit(r);
2412}
2413
2414void MDCache::logged_master_update(metareqid_t reqid)
2415{
2416 dout(10) << "logged_master_update " << reqid << dendl;
2417 assert(uncommitted_masters.count(reqid));
2418 uncommitted_masters[reqid].safe = true;
2419 if (pending_masters.count(reqid)) {
2420 pending_masters.erase(reqid);
2421 if (pending_masters.empty())
2422 process_delayed_resolve();
2423 }
2424}
2425
2426/*
2427 * Master may crash after receiving all slaves' commit acks, but before journalling
2428 * the final commit. Slaves may crash after journalling the slave commit, but before
2429 * sending commit ack to the master. Commit masters with no uncommitted slave when
2430 * resolve finishes.
2431 */
2432void MDCache::finish_committed_masters()
2433{
2434 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2435 p != uncommitted_masters.end();
2436 ++p) {
2437 p->second.recovering = false;
2438 if (!p->second.committing && p->second.slaves.empty()) {
2439 dout(10) << "finish_committed_masters " << p->first << dendl;
2440 log_master_commit(p->first);
2441 }
2442 }
2443}
2444
2445/*
2446 * at end of resolve... we must journal a commit|abort for all slave
2447 * updates, before moving on.
2448 *
2449 * this is so that the master can safely journal ECommitted on ops it
2450 * masters when it reaches up:active (all other recovering nodes must
2451 * complete resolve before that happens).
2452 */
2453struct C_MDC_SlaveCommit : public MDCacheLogContext {
2454 mds_rank_t from;
2455 metareqid_t reqid;
2456 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2457 void finish(int r) override {
2458 mdcache->_logged_slave_commit(from, reqid);
2459 }
2460};
2461
2462void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2463{
2464 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2465
2466 // send a message
2467 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2468 mds->send_message_mds(req, from);
2469}
2470
2471
2472
2473
2474
2475
2476// ====================================================================
2477// import map, recovery
2478
2479void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2480 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2481{
2482 if (subtrees.count(oldparent)) {
2483 vector<dirfrag_t>& v = subtrees[oldparent];
2484 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2485 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2486 if (*it == df) {
2487 v.erase(it);
2488 break;
2489 }
2490 }
2491 if (subtrees.count(newparent)) {
2492 vector<dirfrag_t>& v = subtrees[newparent];
2493 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2494 v.push_back(df);
2495 }
2496}
2497
2498ESubtreeMap *MDCache::create_subtree_map()
2499{
2500 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2501 << num_subtrees_fullauth() << " fullauth"
2502 << dendl;
2503
2504 show_subtrees();
2505
2506 ESubtreeMap *le = new ESubtreeMap();
2507 mds->mdlog->_start_entry(le);
2508
2509 map<dirfrag_t, CDir*> dirs_to_add;
2510
2511 if (myin) {
2512 CDir* mydir = myin->get_dirfrag(frag_t());
2513 dirs_to_add[mydir->dirfrag()] = mydir;
2514 }
2515
2516 // include all auth subtrees, and their bounds.
2517 // and a spanning tree to tie it to the root.
2518 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2519 p != subtrees.end();
2520 ++p) {
2521 CDir *dir = p->first;
2522
2523 // journal subtree as "ours" if we are
2524 // me, -2
2525 // me, me
2526 // me, !me (may be importing and ambiguous!)
2527
2528 // so not
2529 // !me, *
2530 if (dir->get_dir_auth().first != mds->get_nodeid())
2531 continue;
2532
2533 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2534 my_ambiguous_imports.count(dir->dirfrag())) {
2535 dout(15) << " ambig subtree " << *dir << dendl;
2536 le->ambiguous_subtrees.insert(dir->dirfrag());
2537 } else {
2538 dout(15) << " subtree " << *dir << dendl;
2539 }
2540
2541 dirs_to_add[dir->dirfrag()] = dir;
2542 le->subtrees[dir->dirfrag()].clear();
2543
2544
2545 // bounds
2546 for (set<CDir*>::iterator q = p->second.begin();
2547 q != p->second.end();
2548 ++q) {
2549 CDir *bound = *q;
2550 dout(15) << " subtree bound " << *bound << dendl;
2551 dirs_to_add[bound->dirfrag()] = bound;
2552 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2553 }
2554 }
2555
2556 // apply projected renames
2557 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2558 p != projected_subtree_renames.end();
2559 ++p) {
2560 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2561 CInode *diri = p->first;
2562 CDir *olddir = q->first;
2563 CDir *newdir = q->second;
2564 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2565
2566 list<CDir*> dfls;
2567 diri->get_dirfrags(dfls);
2568 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2569 CDir *dir = *p;
2570 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2571 CDir *oldparent = get_projected_subtree_root(olddir);
2572 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2573 CDir *newparent = get_projected_subtree_root(newdir);
2574 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2575
2576 if (oldparent == newparent) {
2577 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2578 << oldparent->dirfrag() << dendl;
2579 continue;
2580 }
2581
2582 if (dir->is_subtree_root()) {
2583 if (le->subtrees.count(newparent->dirfrag()) &&
2584 oldparent->get_dir_auth() != newparent->get_dir_auth())
2585 dirs_to_add[dir->dirfrag()] = dir;
2586 // children are fine. change parent.
2587 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2588 le->subtrees);
2589 } else {
2590 // mid-subtree.
2591
2592 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2593 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2594 // if oldparent is auth, subtree is mine; include it.
2595 if (le->subtrees.count(oldparent->dirfrag())) {
2596 dirs_to_add[dir->dirfrag()] = dir;
2597 le->subtrees[dir->dirfrag()].clear();
2598 }
2599 // if newparent is auth, subtree is a new bound
2600 if (le->subtrees.count(newparent->dirfrag())) {
2601 dirs_to_add[dir->dirfrag()] = dir;
2602 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2603 }
2604 newparent = dir;
2605 }
2606
2607 // see if any old bounds move to the new parent.
2608 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2609 p != subtrees[oldparent].end();
2610 ++p) {
2611 CDir *bound = *p;
2612 if (dir->contains(bound->get_parent_dir()))
2613 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2614 le->subtrees);
2615 }
2616 }
2617 }
2618 }
2619 }
2620
2621 // simplify the journaled map. our in memory map may have more
2622 // subtrees than needed due to migrations that are just getting
2623 // started or just completing. but on replay, the "live" map will
2624 // be simple and we can do a straight comparison.
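// e.g. if the journaled map so far has A -> [B] and B -> [C] (and B is not
// ambiguous), B is swallowed into A, leaving A -> [C]; replay then sees the
// same minimal set of subtrees the live map would have.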
2625 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2626 if (le->ambiguous_subtrees.count(p->first))
2627 continue;
2628 unsigned i = 0;
2629 while (i < p->second.size()) {
2630 dirfrag_t b = p->second[i];
2631 if (le->subtrees.count(b) &&
2632 le->ambiguous_subtrees.count(b) == 0) {
2633 vector<dirfrag_t>& bb = le->subtrees[b];
2634 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2635 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2636 p->second.push_back(*r);
2637 dirs_to_add.erase(b);
2638 le->subtrees.erase(b);
2639 p->second.erase(p->second.begin() + i);
2640 } else {
2641 ++i;
2642 }
2643 }
2644 }
2645
2646 for (auto p : dirs_to_add) {
2647 CDir *dir = p.second;
2648 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2649 le->metablob.add_dir(dir, false);
2650 }
2651
2652 dout(15) << " subtrees " << le->subtrees << dendl;
2653 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2654
2655 //le->metablob.print(cout);
2656 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2657 return le;
2658}
2659
2660void MDCache::dump_resolve_status(Formatter *f) const
2661{
2662 f->open_object_section("resolve_status");
2663 f->dump_stream("resolve_gather") << resolve_gather;
2664 f->dump_stream("resolve_ack_gather") << resolve_gather;
2665 f->close_section();
2666}
2667
2668void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2669{
2670 dout(10) << "resolve_start" << dendl;
2671 assert(!resolve_done);
2672 resolve_done.reset(resolve_done_);
2673
2674 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2675 // if we don't have the root dir, adjust it to UNKNOWN. during
2676 // resolve we want mds0 to explicitly claim the portion of it that
2677 // it owns, so that anything beyond its bounds gets left as
2678 // unknown.
2679 CDir *rootdir = root->get_dirfrag(frag_t());
2680 if (rootdir)
2681 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2682 }
2683 resolve_gather = recovery_set;
2684}
2685
2686void MDCache::send_resolves()
2687{
2688 send_slave_resolves();
2689 if (!resolve_ack_gather.empty()) {
2690 dout(10) << "send_resolves still waiting for resolve ack from ("
2691 << resolve_ack_gather << ")" << dendl;
2692 return;
2693 }
2694 if (!need_resolve_rollback.empty()) {
2695 dout(10) << "send_resolves still waiting for rollback to commit on ("
2696 << need_resolve_rollback << ")" << dendl;
2697 return;
2698 }
2699 send_subtree_resolves();
2700}
2701
2702void MDCache::send_slave_resolves()
2703{
2704 dout(10) << "send_slave_resolves" << dendl;
2705
2706 map<mds_rank_t, MMDSResolve*> resolves;
2707
2708 if (mds->is_resolve()) {
2709 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2710 p != uncommitted_slave_updates.end();
2711 ++p) {
2712 resolves[p->first] = new MMDSResolve;
2713 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2714 q != p->second.end();
2715 ++q) {
2716 dout(10) << " including uncommitted " << q->first << dendl;
2717 resolves[p->first]->add_slave_request(q->first, false);
2718 }
2719 }
2720 } else {
2721 set<mds_rank_t> resolve_set;
2722 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2723 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2724 p != active_requests.end();
2725 ++p) {
2726 MDRequestRef& mdr = p->second;
2727 if (!mdr->is_slave())
2728 continue;
2729 if (!mdr->slave_did_prepare() && !mdr->committing) {
2730 continue;
2731 }
2732 mds_rank_t master = mdr->slave_to_mds;
2733 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2734 dout(10) << " including uncommitted " << *mdr << dendl;
2735 if (!resolves.count(master))
2736 resolves[master] = new MMDSResolve;
2737 if (!mdr->committing &&
2738 mdr->has_more() && mdr->more()->is_inode_exporter) {
2739 // re-send cap exports
2740 CInode *in = mdr->more()->rename_inode;
2741 map<client_t, Capability::Export> cap_map;
2742 in->export_client_caps(cap_map);
2743 bufferlist bl;
2744 ::encode(in->ino(), bl);
2745 ::encode(cap_map, bl);
2746 resolves[master]->add_slave_request(p->first, bl);
2747 } else {
2748 resolves[master]->add_slave_request(p->first, mdr->committing);
2749 }
2750 }
2751 }
2752 }
2753
2754 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2755 p != resolves.end();
2756 ++p) {
2757 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2758 mds->send_message_mds(p->second, p->first);
2759 resolve_ack_gather.insert(p->first);
2760 }
2761}
2762
2763void MDCache::send_subtree_resolves()
2764{
2765 dout(10) << "send_subtree_resolves" << dendl;
2766
2767 if (migrator->is_exporting() || migrator->is_importing()) {
2768 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2769 migrator->show_importing();
2770 migrator->show_exporting();
2771 resolves_pending = true;
2772 return; // not now
2773 }
2774
2775 map<mds_rank_t, MMDSResolve*> resolves;
2776 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2777 p != recovery_set.end();
2778 ++p) {
2779 if (*p == mds->get_nodeid())
2780 continue;
2781 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2782 resolves[*p] = new MMDSResolve;
2783 }
2784
2785 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2786 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2787
2788 // known
2789 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2790 p != subtrees.end();
2791 ++p) {
2792 CDir *dir = p->first;
2793
2794 // only our subtrees
2795 if (dir->authority().first != mds->get_nodeid())
2796 continue;
2797
2798 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2799 continue; // we'll add it below
2800
2801 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2802 // ambiguous (mid-import)
2803 set<CDir*> bounds;
2804 get_subtree_bounds(dir, bounds);
2805 vector<dirfrag_t> dfls;
2806 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2807 dfls.push_back((*q)->dirfrag());
2808
2809 my_ambig_imports[dir->dirfrag()] = dfls;
2810 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2811 } else {
2812 // not ambiguous.
2813 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2814 q != resolves.end();
2815 ++q)
2816 resolves[q->first]->add_subtree(dir->dirfrag());
2817 // bounds too
2818 vector<dirfrag_t> dfls;
2819 for (set<CDir*>::iterator q = subtrees[dir].begin();
2820 q != subtrees[dir].end();
2821 ++q) {
2822 CDir *bound = *q;
2823 dfls.push_back(bound->dirfrag());
2824 }
2825
2826 my_subtrees[dir->dirfrag()] = dfls;
2827 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2828 }
2829 }
2830
2831 // ambiguous
2832 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2833 p != my_ambiguous_imports.end();
2834 ++p) {
2835 my_ambig_imports[p->first] = p->second;
2836 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2837 }
2838
2839 // simplify the claimed subtree.
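// (same folding as in create_subtree_map(): a claimed subtree that is also a
// bound of another claimed subtree gets swallowed by its parent, e.g.
// A -> [B], B -> [C] becomes A -> [C].)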
2840 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2841 unsigned i = 0;
2842 while (i < p->second.size()) {
2843 dirfrag_t b = p->second[i];
2844 if (my_subtrees.count(b)) {
2845 vector<dirfrag_t>& bb = my_subtrees[b];
2846 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2847 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2848 p->second.push_back(*r);
2849 my_subtrees.erase(b);
2850 p->second.erase(p->second.begin() + i);
2851 } else {
2852 ++i;
2853 }
2854 }
2855 }
2856
2857 // send
2858 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2859 p != resolves.end();
2860 ++p) {
2861 MMDSResolve* m = p->second;
2862 m->subtrees = my_subtrees;
2863 m->ambiguous_imports = my_ambig_imports;
2864 dout(10) << "sending subtee resolve to mds." << p->first << dendl;
2865 mds->send_message_mds(m, p->first);
2866 }
2867 resolves_pending = false;
2868}
2869
2870void MDCache::handle_mds_failure(mds_rank_t who)
2871{
2872 dout(7) << "handle_mds_failure mds." << who << dendl;
2873
2874 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2875
2876 resolve_gather.insert(who);
2877 discard_delayed_resolve(who);
2878 ambiguous_slave_updates.erase(who);
2879
2880 rejoin_gather.insert(who);
2881 rejoin_sent.erase(who); // i need to send another
31f18b77 2882 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2883 rejoin_ack_gather.erase(who); // i'll need/get another.
2884
2885 dout(10) << " resolve_gather " << resolve_gather << dendl;
2886 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2887 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2888 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2889 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2890
2891
2892 // tell the migrator too.
2893 migrator->handle_mds_failure_or_stop(who);
2894
224ce89b
WB
2895 // tell the balancer too.
2896 mds->balancer->handle_mds_failure(who);
2897
7c673cae
FG
2898 // clean up any requests slave to/from this node
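// broadly three cases per request: we are slave to the failed master (resolve
// or finish it), we are slave and were waiting on the failed peer (stop
// waiting), or we are master and the failed node was a witness/slave (drop or
// re-wait for its prepare).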
2899 list<MDRequestRef> finish;
2900 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2901 p != active_requests.end();
2902 ++p) {
2903 MDRequestRef& mdr = p->second;
2904 // slave to the failed node?
2905 if (mdr->slave_to_mds == who) {
2906 if (mdr->slave_did_prepare()) {
2907 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2908 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2909 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2910
2911 if (!mdr->more()->waiting_on_slave.empty()) {
2912 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2913 // will rollback, no need to wait
2914 if (mdr->slave_request) {
2915 mdr->slave_request->put();
2916 mdr->slave_request = 0;
2917 }
2918 mdr->more()->waiting_on_slave.clear();
2919 }
2920 } else if (!mdr->committing) {
2921 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2922 if (mdr->slave_request || mdr->slave_rolling_back())
2923 mdr->aborted = true;
2924 else
2925 finish.push_back(mdr);
2926 }
2927 }
2928
2929 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2930 if (mdr->more()->waiting_on_slave.count(who)) {
2931 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2932 dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
2933 << who << dendl;
2934 mdr->more()->waiting_on_slave.erase(who);
2935 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2936 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2937 }
2938
2939 if (mdr->more()->srcdn_auth_mds == who &&
2940 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2941 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2942 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2943 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2944 }
31f18b77
FG
2945 } else if (mdr->slave_request) {
2946 MMDSSlaveRequest *slave_req = mdr->slave_request;
2947 // FIXME: Slave rename request can arrive after we notice mds failure.
2948 // This can cause mds to crash (does not affect integrity of FS).
2949 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2950 slave_req->srcdn_auth == who)
2951 slave_req->mark_interrupted();
7c673cae
FG
2952 }
2953
2954 // failed node is slave?
2955 if (mdr->is_master() && !mdr->committing) {
2956 if (mdr->more()->srcdn_auth_mds == who) {
2957 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2958 << who << " to recover" << dendl;
2959 assert(mdr->more()->witnessed.count(who) == 0);
2960 if (mdr->more()->is_ambiguous_auth)
2961 mdr->clear_ambiguous_auth();
2962 // rename srcdn's auth mds failed, all witnesses will rollback
2963 mdr->more()->witnessed.clear();
2964 pending_masters.erase(p->first);
2965 }
2966
2967 if (mdr->more()->witnessed.count(who)) {
2968 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2969 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2970 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2971 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2972 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2973 // until either the request is committing or the slave also fails.
2974 assert(mdr->more()->waiting_on_slave.size() == 1);
2975 pending_masters.insert(p->first);
2976 } else {
2977 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2978 << who << " to recover" << dendl;
2979 if (srcdn_auth >= 0)
2980 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2981
2982 // discard this peer's prepare (if any)
2983 mdr->more()->witnessed.erase(who);
2984 }
2985 }
2986
2987 if (mdr->more()->waiting_on_slave.count(who)) {
2988 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2989 << " to recover" << dendl;
2990 // retry request when peer recovers
2991 mdr->more()->waiting_on_slave.erase(who);
2992 if (mdr->more()->waiting_on_slave.empty())
2993 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2994 }
2995
2996 if (mdr->locking && mdr->locking_target_mds == who)
2997 mdr->finish_locking(mdr->locking);
2998 }
2999 }
3000
3001 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3002 p != uncommitted_masters.end();
3003 ++p) {
3004 // The failed MDS may have already committed the slave update
3005 if (p->second.slaves.count(who)) {
3006 p->second.recovering = true;
3007 p->second.slaves.erase(who);
3008 }
3009 }
3010
3011 while (!finish.empty()) {
3012 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3013 request_finish(finish.front());
3014 finish.pop_front();
3015 }
3016
3017 kick_find_ino_peers(who);
3018 kick_open_ino_peers(who);
3019
3020 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3021 p != fragments.end(); ) {
3022 dirfrag_t df = p->first;
3023 fragment_info_t& info = p->second;
3024 ++p;
3025 if (info.is_fragmenting())
3026 continue;
3027 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3028 list<CDir*> dirs;
3029 info.dirs.swap(dirs);
3030 fragments.erase(df);
3031 fragment_unmark_unfreeze_dirs(dirs);
3032 }
3033
3034 // MDCache::shutdown_export_strays() always exports strays to mds.0
3035 if (who == mds_rank_t(0))
3036 shutdown_exported_strays.clear();
3037
3038 show_subtrees();
3039}
3040
3041/*
3042 * handle_mds_recovery - called on another node's transition
3043 * from resolve -> active.
3044 */
3045void MDCache::handle_mds_recovery(mds_rank_t who)
3046{
3047 dout(7) << "handle_mds_recovery mds." << who << dendl;
3048
3049 // exclude all discover waiters. kick_discovers() will do the job
3050 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3051 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3052
3053 list<MDSInternalContextBase*> waiters;
3054
3055 // wake up any waiters in their subtrees
3056 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3057 p != subtrees.end();
3058 ++p) {
3059 CDir *dir = p->first;
3060
3061 if (dir->authority().first != who ||
3062 dir->authority().second == mds->get_nodeid())
3063 continue;
3064 assert(!dir->is_auth());
3065
3066 // wake any waiters
3067 list<CDir*> q;
3068 q.push_back(dir);
3069
3070 while (!q.empty()) {
3071 CDir *d = q.front();
3072 q.pop_front();
3073 d->take_waiting(d_mask, waiters);
3074
3075 // inode waiters too
3076 for (CDir::map_t::iterator p = d->items.begin();
3077 p != d->items.end();
3078 ++p) {
3079 CDentry *dn = p->second;
3080 CDentry::linkage_t *dnl = dn->get_linkage();
3081 if (dnl->is_primary()) {
3082 dnl->get_inode()->take_waiting(i_mask, waiters);
3083
3084 // recurse?
3085 list<CDir*> ls;
3086 dnl->get_inode()->get_dirfrags(ls);
3087 for (list<CDir*>::iterator p = ls.begin();
3088 p != ls.end();
3089 ++p) {
3090 CDir *subdir = *p;
3091 if (!subdir->is_subtree_root())
3092 q.push_back(subdir);
3093 }
3094 }
3095 }
3096 }
3097 }
3098
3099 kick_open_ino_peers(who);
3100 kick_find_ino_peers(who);
3101
3102 // queue them up.
3103 mds->queue_waiters(waiters);
3104}
3105
3106void MDCache::set_recovery_set(set<mds_rank_t>& s)
3107{
3108 dout(7) << "set_recovery_set " << s << dendl;
3109 recovery_set = s;
3110}
3111
3112
3113/*
3114 * during resolve state, we share resolves to determine who
3115 * is authoritative for which trees. we expect to get an resolve
3116 * from _everyone_ in the recovery_set (the mds cluster at the time of
3117 * the first failure).
3118 *
3119 * This function puts the passed message before returning
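 *
 * roughly: first settle any ambiguous slave requests (replying with a
 * commit/abort MMDSResolveAck), then, if we are a survivor, decide the fate
 * of our own ambiguous imports against the sender's claims, and finally adopt
 * the sender's subtree and ambiguous-import lists before checking whether the
 * resolve gather is complete.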
3120 */
3121void MDCache::handle_resolve(MMDSResolve *m)
3122{
3123 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3124 mds_rank_t from = mds_rank_t(m->get_source().num());
3125
3126 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3127 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3128 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3129 return;
3130 }
3131 // wait until we reach the resolve stage!
3132 m->put();
3133 return;
3134 }
3135
3136 discard_delayed_resolve(from);
3137
3138 // ambiguous slave requests?
3139 if (!m->slave_requests.empty()) {
3140 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3141 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3142 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3143 assert(!p->second.committing);
3144 pending_masters.insert(p->first);
3145 }
3146 }
3147
3148 if (!pending_masters.empty()) {
3149 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3150 delayed_resolve[from] = m;
3151 return;
3152 }
3153 }
3154
3155 MMDSResolveAck *ack = new MMDSResolveAck;
3156 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3157 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3158 // COMMIT
3159 if (p->second.committing) {
3160 // already committing, waiting for the OP_COMMITTED slave reply
3161 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3162 } else {
3163 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3164 ack->add_commit(p->first);
3165 }
3166 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3167
3168 if (p->second.inode_caps.length() > 0) {
3169 // slave wants to export caps (rename)
3170 assert(mds->is_resolve());
3171
3172 inodeno_t ino;
3173 map<client_t,Capability::Export> cap_exports;
3174 bufferlist::iterator q = p->second.inode_caps.begin();
3175 ::decode(ino, q);
3176 ::decode(cap_exports, q);
3177
3178 assert(get_inode(ino));
3179
3180 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3181 q != cap_exports.end();
3182 ++q) {
3183 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3184 im.cap_id = ++last_cap_id; // assign a new cap ID
3185 im.issue_seq = 1;
3186 im.mseq = q->second.mseq;
3187 }
3188
3189 // will process these caps in rejoin stage
3190 rejoin_slave_exports[ino].first = from;
3191 rejoin_slave_exports[ino].second.swap(cap_exports);
3192
3193 // send information of imported caps back to slave
3194 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3195 }
3196 } else {
3197 // ABORT
3198 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3199 assert(!p->second.committing);
3200 ack->add_abort(p->first);
3201 }
3202 }
3203 mds->send_message(ack, m->get_connection());
3204 m->put();
3205 return;
3206 }
3207
3208 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3209 dout(10) << "delay processing subtree resolve" << dendl;
3210 delayed_resolve[from] = m;
3211 return;
3212 }
3213
3214 bool survivor = false;
3215 // am i a surviving ambiguous importer?
3216 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3217 survivor = true;
3218 // check for any import success/failure (from this node)
3219 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3220 while (p != my_ambiguous_imports.end()) {
3221 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3222 ++next;
3223 CDir *dir = get_dirfrag(p->first);
3224 assert(dir);
3225 dout(10) << "checking ambiguous import " << *dir << dendl;
3226 if (migrator->is_importing(dir->dirfrag()) &&
3227 migrator->get_import_peer(dir->dirfrag()) == from) {
3228 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3229
3230 // check if sender claims the subtree
3231 bool claimed_by_sender = false;
3232 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3233 q != m->subtrees.end();
3234 ++q) {
3235 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3236 CDir *base = get_force_dirfrag(q->first, false);
3237 if (!base || !base->contains(dir))
3238 continue; // base is not dir or an ancestor of dir, so it clearly doesn't claim dir.
3239
3240 bool inside = true;
3241 set<CDir*> bounds;
3242 get_force_dirfrag_bound_set(q->second, bounds);
3243 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3244 CDir *bound = *p;
3245 if (bound->contains(dir)) {
3246 inside = false; // nope, bound is dir or parent of dir, not inside.
3247 break;
3248 }
3249 }
3250 if (inside)
3251 claimed_by_sender = true;
3252 }
3253
3254 my_ambiguous_imports.erase(p); // no longer ambiguous.
3255 if (claimed_by_sender) {
3256 dout(7) << "ambiguous import failed on " << *dir << dendl;
3257 migrator->import_reverse(dir);
3258 } else {
3259 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3260 migrator->import_finish(dir, true);
3261 }
3262 }
3263 p = next;
3264 }
3265 }
3266
3267 // update my dir_auth values
3268 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3269 // migrations between other nodes)
3270 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3271 pi != m->subtrees.end();
3272 ++pi) {
3273 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3274 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3275 if (!dir)
3276 continue;
3277 adjust_bounded_subtree_auth(dir, pi->second, from);
3278 try_subtree_merge(dir);
3279 }
3280
3281 show_subtrees();
3282
3283 // note ambiguous imports too
3284 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3285 pi != m->ambiguous_imports.end();
3286 ++pi) {
3287 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3288 other_ambiguous_imports[from][pi->first].swap( pi->second );
3289 }
3290
3291 // did i get them all?
3292 resolve_gather.erase(from);
3293
3294 maybe_resolve_finish();
3295
3296 m->put();
3297}
3298
3299void MDCache::process_delayed_resolve()
3300{
3301 dout(10) << "process_delayed_resolve" << dendl;
3302 map<mds_rank_t, MMDSResolve*> tmp;
3303 tmp.swap(delayed_resolve);
3304 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3305 handle_resolve(p->second);
3306}
3307
3308void MDCache::discard_delayed_resolve(mds_rank_t who)
3309{
3310 if (delayed_resolve.count(who)) {
3311 delayed_resolve[who]->put();
3312 delayed_resolve.erase(who);
3313 }
3314}
3315
3316void MDCache::maybe_resolve_finish()
3317{
3318 assert(resolve_ack_gather.empty());
3319 assert(need_resolve_rollback.empty());
3320
3321 if (!resolve_gather.empty()) {
3322 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3323 << resolve_gather << ")" << dendl;
3324 return;
3325 }
3326
3327 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3328 disambiguate_my_imports();
3329 finish_committed_masters();
3330
3331 if (resolve_done) {
3332 assert(mds->is_resolve());
3333 trim_unlinked_inodes();
3334 recalc_auth_bits(false);
3335 resolve_done.release()->complete(0);
3336 } else {
3337 maybe_send_pending_rejoins();
3338 }
3339}
3340
3341 /* This function puts the passed message before returning */
3342void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3343{
3344 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3345 mds_rank_t from = mds_rank_t(ack->get_source().num());
3346
3347 if (!resolve_ack_gather.count(from) ||
3348 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3349 ack->put();
3350 return;
3351 }
3352
3353 if (ambiguous_slave_updates.count(from)) {
3354 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3355 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3356 }
3357
3358 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3359 p != ack->commit.end();
3360 ++p) {
3361 dout(10) << " commit on slave " << p->first << dendl;
3362
3363 if (ambiguous_slave_updates.count(from)) {
3364 remove_ambiguous_slave_update(p->first, from);
3365 continue;
3366 }
3367
3368 if (mds->is_resolve()) {
3369 // replay
3370 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3371 assert(su);
3372
3373 // log commit
3374 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3375 ESlaveUpdate::OP_COMMIT, su->origop),
3376 new C_MDC_SlaveCommit(this, from, p->first));
3377 mds->mdlog->flush();
3378
3379 finish_uncommitted_slave_update(p->first, from);
3380 } else {
3381 MDRequestRef mdr = request_get(p->first);
3382 // information about master imported caps
3383 if (p->second.length() > 0)
3384 mdr->more()->inode_import.claim(p->second);
3385
3386 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3387 request_finish(mdr);
3388 }
3389 }
3390
3391 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3392 p != ack->abort.end();
3393 ++p) {
3394 dout(10) << " abort on slave " << *p << dendl;
3395
3396 if (mds->is_resolve()) {
3397 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3398 assert(su);
3399
3400 // perform rollback (and journal a rollback entry)
3401 // note: this will hold up the resolve a bit, until the rollback entries journal.
3402 MDRequestRef null_ref;
3403 switch (su->origop) {
3404 case ESlaveUpdate::LINK:
3405 mds->server->do_link_rollback(su->rollback, from, null_ref);
3406 break;
3407 case ESlaveUpdate::RENAME:
3408 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3409 break;
3410 case ESlaveUpdate::RMDIR:
3411 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3412 break;
3413 default:
3414 ceph_abort();
3415 }
3416 } else {
3417 MDRequestRef mdr = request_get(*p);
3418 mdr->aborted = true;
3419 if (mdr->slave_request) {
3420 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3421 add_rollback(*p, from);
3422 } else {
3423 request_finish(mdr);
3424 }
3425 }
3426 }
3427
3428 if (!ambiguous_slave_updates.count(from))
3429 resolve_ack_gather.erase(from);
3430 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3431 send_subtree_resolves();
3432 process_delayed_resolve();
3433 }
3434
3435 ack->put();
3436}
3437
3438void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3439{
3440 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3441 uncommitted_slave_updates[master][reqid] = su;
3442 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3443 uncommitted_slave_rename_olddir[*p]++;
3444 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3445 uncommitted_slave_unlink[*p]++;
3446}
3447
3448void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3449{
3450 assert(uncommitted_slave_updates[master].count(reqid));
3451 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3452
3453 uncommitted_slave_updates[master].erase(reqid);
3454 if (uncommitted_slave_updates[master].empty())
3455 uncommitted_slave_updates.erase(master);
3456 // discard the non-auth subtree we renamed out of
3457 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3458 CInode *diri = *p;
3459 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3460 assert(it != uncommitted_slave_rename_olddir.end());
3461 it->second--;
3462 if (it->second == 0) {
3463 uncommitted_slave_rename_olddir.erase(it);
3464 list<CDir*> ls;
3465 diri->get_dirfrags(ls);
3466 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3467 CDir *root = get_subtree_root(*q);
3468 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3469 try_trim_non_auth_subtree(root);
3470 if (*q != root)
3471 break;
3472 }
3473 }
3474 } else
3475 assert(it->second > 0);
3476 }
3477 // remove the inodes that were unlinked by the slave update
3478 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3479 CInode *in = *p;
3480 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3481 assert(it != uncommitted_slave_unlink.end());
3482 it->second--;
3483 if (it->second == 0) {
3484 uncommitted_slave_unlink.erase(it);
3485 if (!in->get_projected_parent_dn())
3486 mds->mdcache->remove_inode_recursive(in);
3487 } else
3488 assert(it->second > 0);
3489 }
3490 delete su;
3491}
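// ---------------------------------------------------------------------------
// [editorial sketch] add_uncommitted_slave_update() and
// finish_uncommitted_slave_update() above keep uncommitted_slave_rename_olddir
// and uncommitted_slave_unlink as per-inode reference counts: bump on record,
// decrement on commit/rollback, erase (and clean up) once the count hits zero.
// A minimal, self-contained illustration of that pattern follows; FakeInode,
// pending_unlink and the helper names are hypothetical simplifications, not
// MDCache API.
#if 0
#include <cassert>
#include <map>

struct FakeInode {};                        // stand-in for CInode

std::map<FakeInode*, int> pending_unlink;   // stand-in for uncommitted_slave_unlink

void note_slave_unlink(FakeInode *in) {
  pending_unlink[in]++;                     // one count per uncommitted slave update
}

void finish_slave_unlink(FakeInode *in) {
  auto it = pending_unlink.find(in);
  assert(it != pending_unlink.end() && it->second > 0);
  if (--it->second == 0)
    pending_unlink.erase(it);               // last update resolved; safe to clean up
}
#endif
// ---------------------------------------------------------------------------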
3492
3493MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3494{
3495
3496 MDSlaveUpdate* su = NULL;
3497 if (uncommitted_slave_updates.count(master) &&
3498 uncommitted_slave_updates[master].count(reqid)) {
3499 su = uncommitted_slave_updates[master][reqid];
3500 assert(su);
3501 }
3502 return su;
3503}
3504
3505void MDCache::finish_rollback(metareqid_t reqid) {
3506 assert(need_resolve_rollback.count(reqid));
3507 if (mds->is_resolve())
3508 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3509 need_resolve_rollback.erase(reqid);
3510 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3511 send_subtree_resolves();
3512 process_delayed_resolve();
3513 }
3514}
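// ---------------------------------------------------------------------------
// [editorial sketch] the resolve-ack path and finish_rollback() above share a
// gather/barrier shape: each completed ack or rollback removes itself from a
// pending set, and subtree resolves are sent only once both sets are empty.
// A hedged, self-contained sketch of that shape; the names below
// (pending_acks, pending_rollbacks, maybe_advance) are illustrative, not
// MDCache members.
#if 0
#include <iostream>
#include <set>

std::set<int> pending_acks;        // stand-in for resolve_ack_gather (peer ranks)
std::set<int> pending_rollbacks;   // stand-in for need_resolve_rollback (request ids)

void maybe_advance() {
  if (pending_acks.empty() && pending_rollbacks.empty())
    std::cout << "all acks and rollbacks done -> send subtree resolves\n";
}

void on_resolve_ack(int from)    { pending_acks.erase(from);       maybe_advance(); }
void on_rollback_done(int reqid) { pending_rollbacks.erase(reqid); maybe_advance(); }
#endif
// ---------------------------------------------------------------------------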
3515
3516void MDCache::disambiguate_other_imports()
3517{
3518 dout(10) << "disambiguate_other_imports" << dendl;
3519
3520 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3521 // other nodes' ambiguous imports
3522 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3523 p != other_ambiguous_imports.end();
3524 ++p) {
3525 mds_rank_t who = p->first;
3526 dout(10) << "ambiguous imports for mds." << who << dendl;
3527
3528 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3529 q != p->second.end();
3530 ++q) {
3531 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3532 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3533 CDir *dir = get_force_dirfrag(q->first, recovering);
3534 if (!dir) continue;
3535
3536 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3537 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3538 dout(10) << " mds." << who << " did import " << *dir << dendl;
3539 adjust_bounded_subtree_auth(dir, q->second, who);
3540 try_subtree_merge(dir);
3541 } else {
3542 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3543 }
3544 }
3545 }
3546 other_ambiguous_imports.clear();
3547}
3548
3549void MDCache::disambiguate_my_imports()
3550{
3551 dout(10) << "disambiguate_my_imports" << dendl;
3552
3553 if (!mds->is_resolve()) {
3554 assert(my_ambiguous_imports.empty());
3555 return;
3556 }
3557
3558 disambiguate_other_imports();
3559
3560 // my ambiguous imports
3561 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3562 while (!my_ambiguous_imports.empty()) {
3563 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3564
3565 CDir *dir = get_dirfrag(q->first);
3566 assert(dir);
3567
3568 if (dir->authority() != me_ambig) {
3569 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3570 cancel_ambiguous_import(dir);
3571
3572 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3573
3574 // subtree may have been swallowed by another node claiming dir
3575 // as their own.
3576 CDir *root = get_subtree_root(dir);
3577 if (root != dir)
3578 dout(10) << " subtree root is " << *root << dendl;
3579 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3580 try_trim_non_auth_subtree(root);
3581 } else {
3582 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3583 finish_ambiguous_import(q->first);
3584 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3585 }
3586 }
3587 assert(my_ambiguous_imports.empty());
3588 mds->mdlog->flush();
3589
3590 // verify all my subtrees are unambiguous!
3591 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3592 p != subtrees.end();
3593 ++p) {
3594 CDir *dir = p->first;
3595 if (dir->is_ambiguous_dir_auth()) {
3596 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3597 }
3598 assert(!dir->is_ambiguous_dir_auth());
3599 }
3600
3601 show_subtrees();
3602}
3603
3604
3605void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3606{
3607 assert(my_ambiguous_imports.count(base) == 0);
3608 my_ambiguous_imports[base] = bounds;
3609}
3610
3611
3612void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3613{
3614 // make a list
3615 vector<dirfrag_t> binos;
3616 for (set<CDir*>::iterator p = bounds.begin();
3617 p != bounds.end();
3618 ++p)
3619 binos.push_back((*p)->dirfrag());
3620
3621 // note: this can get called twice if the exporter fails during recovery
3622 if (my_ambiguous_imports.count(base->dirfrag()))
3623 my_ambiguous_imports.erase(base->dirfrag());
3624
3625 add_ambiguous_import(base->dirfrag(), binos);
3626}
3627
3628void MDCache::cancel_ambiguous_import(CDir *dir)
3629{
3630 dirfrag_t df = dir->dirfrag();
3631 assert(my_ambiguous_imports.count(df));
3632 dout(10) << "cancel_ambiguous_import " << df
3633 << " bounds " << my_ambiguous_imports[df]
3634 << " " << *dir
3635 << dendl;
3636 my_ambiguous_imports.erase(df);
3637}
3638
3639void MDCache::finish_ambiguous_import(dirfrag_t df)
3640{
3641 assert(my_ambiguous_imports.count(df));
3642 vector<dirfrag_t> bounds;
3643 bounds.swap(my_ambiguous_imports[df]);
3644 my_ambiguous_imports.erase(df);
3645
3646 dout(10) << "finish_ambiguous_import " << df
3647 << " bounds " << bounds
3648 << dendl;
3649 CDir *dir = get_dirfrag(df);
3650 assert(dir);
3651
3652 // adjust dir_auth, import maps
3653 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3654 try_subtree_merge(dir);
3655}
3656
3657void MDCache::remove_inode_recursive(CInode *in)
3658{
3659 dout(10) << "remove_inode_recursive " << *in << dendl;
3660 list<CDir*> ls;
3661 in->get_dirfrags(ls);
3662 list<CDir*>::iterator p = ls.begin();
3663 while (p != ls.end()) {
3664 CDir *subdir = *p++;
3665
3666 dout(10) << " removing dirfrag " << subdir << dendl;
3667 CDir::map_t::iterator q = subdir->items.begin();
3668 while (q != subdir->items.end()) {
3669 CDentry *dn = q->second;
3670 ++q;
3671 CDentry::linkage_t *dnl = dn->get_linkage();
3672 if (dnl->is_primary()) {
3673 CInode *tin = dnl->get_inode();
3674	subdir->unlink_inode(dn, false);
3675	remove_inode_recursive(tin);
3676 }
3677 subdir->remove_dentry(dn);
3678 }
3679
3680 if (subdir->is_subtree_root())
3681 remove_subtree(subdir);
3682 in->close_dirfrag(subdir->dirfrag().frag);
3683 }
3684 remove_inode(in);
3685}
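// ---------------------------------------------------------------------------
// [editorial sketch] remove_inode_recursive() advances each iterator before
// unlinking or removing the element it points at, which is what keeps the
// walk valid while entries are erased underneath it. A tiny illustration of
// that idiom with an ordinary std::map (hypothetical helper, not MDCache code):
#if 0
#include <map>
#include <string>

void drop_all(std::map<std::string, int>& items) {
  auto q = items.begin();
  while (q != items.end()) {
    auto cur = q++;        // advance first; erase() only invalidates 'cur'
    items.erase(cur);
  }
}
#endif
// ---------------------------------------------------------------------------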
3686
3687bool MDCache::expire_recursive(
3688 CInode *in,
3689 map<mds_rank_t, MCacheExpire*>& expiremap)
3690{
3691 assert(!in->is_auth());
3692
3693 dout(10) << __func__ << ":" << *in << dendl;
3694
3695 // Recurse into any dirfrags beneath this inode
3696 list<CDir*> ls;
3697 in->get_dirfrags(ls);
3698 for (auto subdir : ls) {
3699 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3700 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3701 return true;
3702 }
3703
3704 for (auto &it : subdir->items) {
3705 CDentry *dn = it.second;
3706 CDentry::linkage_t *dnl = dn->get_linkage();
3707 if (dnl->is_primary()) {
3708 CInode *tin = dnl->get_inode();
3709
3710 /* Remote strays with linkage (i.e. hardlinks) should not be
3711 * expired, because they may be the target of
3712 * a rename() as the owning MDS shuts down */
3713 if (!tin->is_stray() && tin->inode.nlink) {
3714 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3715 return true;
3716 }
3717
3718 const bool abort = expire_recursive(tin, expiremap);
3719 if (abort) {
3720 return true;
3721 }
3722 }
3723 if (dn->lru_is_expireable()) {
3724 trim_dentry(dn, expiremap);
3725 } else {
3726 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3727 return true;
3728 }
3729 }
3730 }
3731
3732 return false;
3733}
3734
3735void MDCache::trim_unlinked_inodes()
3736{
3737 dout(7) << "trim_unlinked_inodes" << dendl;
3738 list<CInode*> q;
3739 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
3740 p != inode_map.end();
3741 ++p) {
3742 CInode *in = p->second;
3743 if (in->get_parent_dn() == NULL && !in->is_base()) {
3744 dout(7) << " will trim from " << *in << dendl;
3745 q.push_back(in);
3746 }
3747 }
3748 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3749 remove_inode_recursive(*p);
3750}
3751
3752/** recalc_auth_bits()
3753 * once subtree auth is disambiguated, we need to adjust all the
3754 * auth and dirty bits in our cache before moving on.
3755 */
3756void MDCache::recalc_auth_bits(bool replay)
3757{
3758 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3759
3760 if (root) {
3761 root->inode_auth.first = mds->mdsmap->get_root();
3762 bool auth = mds->get_nodeid() == root->inode_auth.first;
3763 if (auth) {
3764 root->state_set(CInode::STATE_AUTH);
3765 } else {
3766 root->state_clear(CInode::STATE_AUTH);
3767 if (!replay)
3768 root->state_set(CInode::STATE_REJOINING);
3769 }
3770 }
3771
3772 set<CInode*> subtree_inodes;
3773 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3774 p != subtrees.end();
3775 ++p) {
3776 if (p->first->dir_auth.first == mds->get_nodeid())
3777 subtree_inodes.insert(p->first->inode);
3778 }
3779
3780 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3781 p != subtrees.end();
3782 ++p) {
3783 if (p->first->inode->is_mdsdir()) {
3784 CInode *in = p->first->inode;
3785 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3786 if (auth) {
3787 in->state_set(CInode::STATE_AUTH);
3788 } else {
3789 in->state_clear(CInode::STATE_AUTH);
3790 if (!replay)
3791 in->state_set(CInode::STATE_REJOINING);
3792 }
3793 }
3794
3795 list<CDir*> dfq; // dirfrag queue
3796 dfq.push_back(p->first);
3797
3798 bool auth = p->first->authority().first == mds->get_nodeid();
3799 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3800
3801 while (!dfq.empty()) {
3802 CDir *dir = dfq.front();
3803 dfq.pop_front();
3804
3805 // dir
3806 if (auth) {
3807 dir->state_set(CDir::STATE_AUTH);
3808 } else {
3809 dir->state_clear(CDir::STATE_AUTH);
3810 if (!replay) {
3811 // close empty non-auth dirfrag
3812 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3813 dir->inode->close_dirfrag(dir->get_frag());
3814 continue;
3815 }
3816 dir->state_set(CDir::STATE_REJOINING);
3817 dir->state_clear(CDir::STATE_COMPLETE);
3818 if (dir->is_dirty())
3819 dir->mark_clean();
3820 }
3821 }
3822
3823 // dentries in this dir
3824 for (CDir::map_t::iterator q = dir->items.begin();
3825 q != dir->items.end();
3826 ++q) {
3827 // dn
3828 CDentry *dn = q->second;
3829 CDentry::linkage_t *dnl = dn->get_linkage();
3830 if (auth) {
3831 dn->state_set(CDentry::STATE_AUTH);
3832 } else {
3833 dn->state_clear(CDentry::STATE_AUTH);
3834 if (!replay) {
3835 dn->state_set(CDentry::STATE_REJOINING);
3836 if (dn->is_dirty())
3837 dn->mark_clean();
3838 }
3839 }
3840
3841 if (dnl->is_primary()) {
3842 // inode
3843 CInode *in = dnl->get_inode();
3844 if (auth) {
3845 in->state_set(CInode::STATE_AUTH);
3846 } else {
3847 in->state_clear(CInode::STATE_AUTH);
3848 if (!replay) {
3849 in->state_set(CInode::STATE_REJOINING);
3850 if (in->is_dirty())
3851 in->mark_clean();
3852 if (in->is_dirty_parent())
3853 in->clear_dirty_parent();
3854 // avoid touching scatterlocks for our subtree roots!
3855 if (subtree_inodes.count(in) == 0)
3856 in->clear_scatter_dirty();
3857 }
3858 }
3859 // recurse?
3860 if (in->is_dir())
3861 in->get_nested_dirfrags(dfq);
3862 }
3863 }
3864 }
3865 }
3866
3867 show_subtrees();
3868 show_cache();
3869}
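// ---------------------------------------------------------------------------
// [editorial sketch] recalc_auth_bits() walks each subtree breadth-first with
// an explicit dirfrag queue, pushing nested dirfrags as it visits primary
// inodes, rather than recursing on the call stack. A stripped-down sketch of
// that traversal shape; Node and walk_subtree are hypothetical, not CDir/CInode.
#if 0
#include <list>
#include <vector>

struct Node {
  std::vector<Node*> children;
  bool visited = false;
};

void walk_subtree(Node *root) {
  std::list<Node*> q;
  q.push_back(root);
  while (!q.empty()) {
    Node *n = q.front();
    q.pop_front();
    n->visited = true;            // the real code adjusts auth/dirty bits here
    for (Node *c : n->children)
      q.push_back(c);             // descend via the queue, not the call stack
  }
}
#endif
// ---------------------------------------------------------------------------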
3870
3871
3872
3873// ===========================================================================
3874// REJOIN
3875
3876/*
3877 * notes on scatterlock recovery:
3878 *
3879 * - recovering inode replica sends scatterlock data for any subtree
3880 * roots (the only ones that are possibly dirty).
3881 *
3882 * - surviving auth incorporates any provided scatterlock data. any
3883 * pending gathers are then finished, as with the other lock types.
3884 *
3885 * that takes care of surviving auth + (recovering replica)*.
3886 *
3887 * - surviving replica sends strong_inode, which includes current
3888 * scatterlock state, AND any dirty scatterlock data. this
3889 * provides the recovering auth with everything it might need.
3890 *
3891 * - recovering auth must pick initial scatterlock state based on
3892 * (weak|strong) rejoins.
3893 * - always assimilate scatterlock data (it can't hurt)
3894 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3895 * - include base inode in ack for all inodes that saw scatterlock content
3896 *
3897 * also, for scatter gather,
3898 *
3899 * - auth increments {frag,r}stat.version on completion of any gather.
3900 *
3901 * - auth incorporates changes in a gather _only_ if the version
3902 * matches.
3903 *
3904 * - replica discards changes any time the scatterlock syncs, and
3905 * after recovery.
3906 */
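// ---------------------------------------------------------------------------
// [editorial sketch] the scatter-gather rule above -- the auth incorporates a
// gathered change only when its version matches, and bumps the version when
// the gather completes -- can be pictured with a toy example. GatheredStat and
// AuthStat below are hypothetical simplifications, not the real frag/rstat
// accounting.
#if 0
#include <cstdint>

struct GatheredStat {
  uint64_t version;   // version the replica accumulated its delta against
  int64_t  delta;     // replica's pending change
};

struct AuthStat {
  uint64_t version = 0;
  int64_t  value = 0;

  void finish_gather(const GatheredStat &g) {
    if (g.version == version)   // stale deltas are simply discarded
      value += g.delta;
    ++version;                  // completing the gather bumps the version
  }
};
#endif
// ---------------------------------------------------------------------------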
3907
3908void MDCache::dump_rejoin_status(Formatter *f) const
3909{
3910 f->open_object_section("rejoin_status");
3911 f->dump_stream("rejoin_gather") << rejoin_gather;
3912 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3913 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3914 f->close_section();
3915}
3916
3917void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3918{
3919 dout(10) << "rejoin_start" << dendl;
3920 assert(!rejoin_done);
3921 rejoin_done.reset(rejoin_done_);
3922
3923 rejoin_gather = recovery_set;
3924  // need to finish opening cap inodes before sending cache rejoins
3925 rejoin_gather.insert(mds->get_nodeid());
3926 process_imported_caps();
3927}
3928
3929/*
3930 * rejoin phase!
3931 *
3932 * this initiates rejoin. it should be called before we get any
3933 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3934 *
3935 * we start out by sending rejoins to everyone in the recovery set.
3936 *
3937 * if we are rejoin, send for all regions in our cache.
3938 * if we are active|stopping, send only to nodes that are rejoining.
3939 */
3940void MDCache::rejoin_send_rejoins()
3941{
3942 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3943
3944 if (rejoin_gather.count(mds->get_nodeid())) {
3945 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3946 rejoins_pending = true;
3947 return;
3948 }
3949 if (!resolve_gather.empty()) {
3950 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3951 << resolve_gather << ")" << dendl;
3952 rejoins_pending = true;
3953 return;
3954 }
3955
3956 assert(!migrator->is_importing());
3957 assert(!migrator->is_exporting());
3958
3959 if (!mds->is_rejoin()) {
3960 disambiguate_other_imports();
3961 }
3962
3963 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3964
3965
3966 // if i am rejoining, send a rejoin to everyone.
3967 // otherwise, just send to others who are rejoining.
3968 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3969 p != recovery_set.end();
3970 ++p) {
3971 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3972 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3973 if (mds->is_rejoin())
3974 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
3975 else if (mds->mdsmap->is_rejoin(*p))
3976 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
3977 }
3978
3979 if (mds->is_rejoin()) {
3980 map<client_t, set<mds_rank_t> > client_exports;
3981 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
3982 assert(cap_export_targets.count(p->first));
3983 mds_rank_t target = cap_export_targets[p->first];
3984 if (rejoins.count(target) == 0)
3985 continue;
3986 rejoins[target]->cap_exports[p->first] = p->second;
3987 for (auto q = p->second.begin(); q != p->second.end(); ++q)
3988 client_exports[q->first].insert(target);
3989 }
3990 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
3991 p != client_exports.end();
3992 ++p) {
3993 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
3994 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
3995 rejoins[*q]->client_map[p->first] = inst;
3996 }
3997 }
3998
3999
4000 // check all subtrees
4001 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4002 p != subtrees.end();
4003 ++p) {
4004 CDir *dir = p->first;
4005 assert(dir->is_subtree_root());
4006 if (dir->is_ambiguous_dir_auth()) {
4007 // exporter is recovering, importer is survivor.
4008 assert(rejoins.count(dir->authority().first));
4009 assert(!rejoins.count(dir->authority().second));
4010 continue;
4011 }
4012
4013 // my subtree?
4014 if (dir->is_auth())
4015 continue; // skip my own regions!
4016
4017 mds_rank_t auth = dir->get_dir_auth().first;
4018 assert(auth >= 0);
4019 if (rejoins.count(auth) == 0)
4020 continue; // don't care about this node's subtrees
4021
4022 rejoin_walk(dir, rejoins[auth]);
4023 }
4024
4025 // rejoin root inodes, too
4026 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4027 p != rejoins.end();
4028 ++p) {
4029 if (mds->is_rejoin()) {
4030 // weak
4031 if (p->first == 0 && root) {
4032 p->second->add_weak_inode(root->vino());
4033 if (root->is_dirty_scattered()) {
4034 dout(10) << " sending scatterlock state on root " << *root << dendl;
4035 p->second->add_scatterlock_state(root);
4036 }
4037 }
4038 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4039 if (in)
4040 p->second->add_weak_inode(in->vino());
4041 }
4042 } else {
4043 // strong
4044 if (p->first == 0 && root) {
4045 p->second->add_strong_inode(root->vino(),
4046 root->get_replica_nonce(),
4047 root->get_caps_wanted(),
4048 root->filelock.get_state(),
4049 root->nestlock.get_state(),
4050 root->dirfragtreelock.get_state());
4051 root->state_set(CInode::STATE_REJOINING);
4052 if (root->is_dirty_scattered()) {
4053 dout(10) << " sending scatterlock state on root " << *root << dendl;
4054 p->second->add_scatterlock_state(root);
4055 }
4056 }
4057
4058 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4059 p->second->add_strong_inode(in->vino(),
4060 in->get_replica_nonce(),
4061 in->get_caps_wanted(),
4062 in->filelock.get_state(),
4063 in->nestlock.get_state(),
4064 in->dirfragtreelock.get_state());
4065 in->state_set(CInode::STATE_REJOINING);
4066 }
4067 }
4068 }
4069
4070 if (!mds->is_rejoin()) {
4071 // i am survivor. send strong rejoin.
4072 // note request remote_auth_pins, xlocks
4073 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4074 p != active_requests.end();
4075 ++p) {
4076 MDRequestRef& mdr = p->second;
4077 if (mdr->is_slave())
4078 continue;
4079 // auth pins
4080 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4081 q != mdr->remote_auth_pins.end();
4082 ++q) {
4083 if (!q->first->is_auth()) {
4084 assert(q->second == q->first->authority().first);
4085 if (rejoins.count(q->second) == 0) continue;
4086 MMDSCacheRejoin *rejoin = rejoins[q->second];
4087
4088 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4089 MDSCacheObjectInfo i;
4090 q->first->set_object_info(i);
4091 if (i.ino)
4092 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4093 else
4094 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4095
4096 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4097 mdr->more()->rename_inode == q->first)
4098 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4099 mdr->reqid, mdr->attempt);
4100 }
4101 }
4102 // xlocks
4103 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4104 q != mdr->xlocks.end();
4105 ++q) {
4106 if (!(*q)->get_parent()->is_auth()) {
4107 mds_rank_t who = (*q)->get_parent()->authority().first;
4108 if (rejoins.count(who) == 0) continue;
4109 MMDSCacheRejoin *rejoin = rejoins[who];
4110
4111 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4112 MDSCacheObjectInfo i;
4113 (*q)->get_parent()->set_object_info(i);
4114 if (i.ino)
4115 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4116 mdr->reqid, mdr->attempt);
4117 else
4118 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4119 mdr->reqid, mdr->attempt);
4120 }
4121 }
4122 // remote wrlocks
4123 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4124 q != mdr->remote_wrlocks.end();
4125 ++q) {
4126 mds_rank_t who = q->second;
4127 if (rejoins.count(who) == 0) continue;
4128 MMDSCacheRejoin *rejoin = rejoins[who];
4129
4130 dout(15) << " " << *mdr << " wrlock on " << q->second
4131 << " " << q->first->get_parent() << dendl;
4132 MDSCacheObjectInfo i;
4133 q->first->get_parent()->set_object_info(i);
4134 assert(i.ino);
4135 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4136 mdr->reqid, mdr->attempt);
4137 }
4138 }
4139 }
4140
4141 // send the messages
4142 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4143 p != rejoins.end();
4144 ++p) {
4145 assert(rejoin_sent.count(p->first) == 0);
4146 assert(rejoin_ack_gather.count(p->first) == 0);
4147 rejoin_sent.insert(p->first);
4148 rejoin_ack_gather.insert(p->first);
4149 mds->send_message_mds(p->second, p->first);
4150 }
4151 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4152 rejoins_pending = false;
4153
4154 // nothing?
4155 if (mds->is_rejoin() && rejoins.empty()) {
4156 dout(10) << "nothing to rejoin" << dendl;
4157 rejoin_gather_finish();
4158 }
4159}
4160
4161
4162/**
4163 * rejoin_walk - build rejoin declarations for a subtree
4164 *
4165 * @param dir subtree root
4166 * @param rejoin rejoin message
4167 *
4168 * from a rejoining node:
4169 * weak dirfrag
4170 * weak dentries (w/ connectivity)
4171 *
4172 * from a surviving node:
4173 * strong dirfrag
4174 * strong dentries (no connectivity!)
4175 * strong inodes
4176 */
4177void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4178{
4179 dout(10) << "rejoin_walk " << *dir << dendl;
4180
4181 list<CDir*> nested; // finish this dir, then do nested items
4182
4183 if (mds->is_rejoin()) {
4184 // WEAK
4185 rejoin->add_weak_dirfrag(dir->dirfrag());
4186 for (CDir::map_t::iterator p = dir->items.begin();
4187 p != dir->items.end();
4188 ++p) {
4189 CDentry *dn = p->second;
4190 CDentry::linkage_t *dnl = dn->get_linkage();
4191 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4192 assert(dnl->is_primary());
4193 CInode *in = dnl->get_inode();
4194 assert(dnl->get_inode()->is_dir());
4195 rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino());
4196 in->get_nested_dirfrags(nested);
4197 if (in->is_dirty_scattered()) {
4198 dout(10) << " sending scatterlock state on " << *in << dendl;
4199 rejoin->add_scatterlock_state(in);
4200 }
4201 }
4202 } else {
4203 // STRONG
4204 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4205 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4206 dir->state_set(CDir::STATE_REJOINING);
4207
4208 for (CDir::map_t::iterator p = dir->items.begin();
4209 p != dir->items.end();
4210 ++p) {
4211 CDentry *dn = p->second;
4212 CDentry::linkage_t *dnl = dn->get_linkage();
4213 dout(15) << " add_strong_dentry " << *dn << dendl;
4214 rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4215 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4216 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4217 dnl->is_remote() ? dnl->get_remote_d_type():0,
4218 dn->get_replica_nonce(),
4219 dn->lock.get_state());
4220 dn->state_set(CDentry::STATE_REJOINING);
4221 if (dnl->is_primary()) {
4222 CInode *in = dnl->get_inode();
4223 dout(15) << " add_strong_inode " << *in << dendl;
4224 rejoin->add_strong_inode(in->vino(),
4225 in->get_replica_nonce(),
4226 in->get_caps_wanted(),
4227 in->filelock.get_state(),
4228 in->nestlock.get_state(),
4229 in->dirfragtreelock.get_state());
4230 in->state_set(CInode::STATE_REJOINING);
4231 in->get_nested_dirfrags(nested);
4232 if (in->is_dirty_scattered()) {
4233 dout(10) << " sending scatterlock state on " << *in << dendl;
4234 rejoin->add_scatterlock_state(in);
4235 }
4236 }
4237 }
4238 }
4239
4240 // recurse into nested dirs
4241 for (list<CDir*>::iterator p = nested.begin();
4242 p != nested.end();
4243 ++p)
4244 rejoin_walk(*p, rejoin);
4245}
4246
4247
4248/*
4249 * i got a rejoin.
4250 * - reply with the lockstate
4251 *
4252 * if i am active|stopping,
4253 * - remove source from replica list for everything not referenced here.
4254 * This function puts the passed message before returning.
4255 */
4256void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4257{
4258 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4259 << " (" << m->get_payload().length() << " bytes)"
4260 << dendl;
4261
4262 switch (m->op) {
4263 case MMDSCacheRejoin::OP_WEAK:
4264 handle_cache_rejoin_weak(m);
4265 break;
4266 case MMDSCacheRejoin::OP_STRONG:
4267 handle_cache_rejoin_strong(m);
4268 break;
4269 case MMDSCacheRejoin::OP_ACK:
4270 handle_cache_rejoin_ack(m);
4271 break;
4272
4273 default:
4274 ceph_abort();
4275 }
4276 m->put();
4277}
4278
4279
4280/*
4281 * handle_cache_rejoin_weak
4282 *
4283 * the sender
4284 * - is recovering from their journal.
4285 * - may have incorrect (out of date) inode contents
4286 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4287 *
4288 * if the sender didn't trim_non_auth(), they
4289 * - may have incorrect (out of date) dentry/inode linkage
4290 * - may have deleted/purged inodes
4291 * and i may have to go to disk to get accurate inode contents. yuck.
4292 * This function DOES NOT put the passed message before returning
4293 */
4294void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4295{
4296 mds_rank_t from = mds_rank_t(weak->get_source().num());
4297
4298 // possible response(s)
4299 MMDSCacheRejoin *ack = 0; // if survivor
4300 set<vinodeno_t> acked_inodes; // if survivor
4301 set<SimpleLock *> gather_locks; // if survivor
4302 bool survivor = false; // am i a survivor?
4303
4304 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4305 survivor = true;
4306     dout(10) << "i am a survivor, and will ack immediately" << dendl;
4307 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4308
4309 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4310
4311 // check cap exports
4312 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4313 CInode *in = get_inode(p->first);
4314 assert(!in || in->is_auth());
4315 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4316 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4317 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4318 Capability::Import& im = imported_caps[p->first][q->first];
4319 if (cap) {
4320 im.cap_id = cap->get_cap_id();
4321 im.issue_seq = cap->get_last_seq();
4322 im.mseq = cap->get_mseq();
4323 } else {
4324 // all are zero
4325 }
4326 }
4327 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4328 }
4329
4330 ::encode(imported_caps, ack->imported_caps);
4331 } else {
4332 assert(mds->is_rejoin());
4333
4334 // we may have already received a strong rejoin from the sender.
4335 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4336 assert(gather_locks.empty());
4337
4338 // check cap exports.
4339 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4340
4341 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4342 CInode *in = get_inode(p->first);
4343 assert(in && in->is_auth());
4344 // note
4345 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4346 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4347 cap_imports[p->first][q->first][from] = q->second;
4348 }
4349 }
4350 }
4351
4352 // assimilate any potentially dirty scatterlock state
4353 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4354 p != weak->inode_scatterlocks.end();
4355 ++p) {
4356 CInode *in = get_inode(p->first);
4357 assert(in);
4358 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4359 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4360 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4361 if (!survivor)
4362 rejoin_potential_updated_scatterlocks.insert(in);
4363 }
4364
4365 // recovering peer may send incorrect dirfrags here. we need to
4366 // infer which dirfrag they meant. the ack will include a
4367 // strong_dirfrag that will set them straight on the fragmentation.
4368
4369 // walk weak map
4370 set<CDir*> dirs_to_share;
4371 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4372 p != weak->weak_dirfrags.end();
4373 ++p) {
4374 CInode *diri = get_inode(p->ino);
4375 if (!diri)
4376 dout(0) << " missing dir ino " << p->ino << dendl;
4377 assert(diri);
4378
4379 list<frag_t> ls;
4380 if (diri->dirfragtree.is_leaf(p->frag)) {
4381 ls.push_back(p->frag);
4382 } else {
4383 diri->dirfragtree.get_leaves_under(p->frag, ls);
4384 if (ls.empty())
4385 ls.push_back(diri->dirfragtree[p->frag.value()]);
4386 }
4387 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4388 frag_t fg = *q;
4389 CDir *dir = diri->get_dirfrag(fg);
4390 if (!dir) {
4391 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4392 continue;
4393 }
4394 assert(dir);
4395 if (dirs_to_share.count(dir)) {
4396 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4397 } else {
4398 dirs_to_share.insert(dir);
4399 unsigned nonce = dir->add_replica(from);
4400 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4401 if (ack) {
4402 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4403 ack->add_dirfrag_base(dir);
4404 }
4405 }
4406 }
4407 }
4408
4409 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4410 p != weak->weak.end();
4411 ++p) {
4412 CInode *diri = get_inode(p->first);
4413 if (!diri)
4414 dout(0) << " missing dir ino " << p->first << dendl;
4415 assert(diri);
4416
4417 // weak dentries
4418 CDir *dir = 0;
4419 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4420 q != p->second.end();
4421 ++q) {
4422 // locate proper dirfrag.
4423 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4424 frag_t fg = diri->pick_dirfrag(q->first.name);
4425 if (!dir || dir->get_frag() != fg) {
4426 dir = diri->get_dirfrag(fg);
4427 if (!dir)
4428 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4429 assert(dir);
4430 assert(dirs_to_share.count(dir));
4431 }
4432
4433 // and dentry
4434 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4435 assert(dn);
4436 CDentry::linkage_t *dnl = dn->get_linkage();
4437 assert(dnl->is_primary());
4438
4439 if (survivor && dn->is_replica(from))
4440 dentry_remove_replica(dn, from, gather_locks);
4441 unsigned dnonce = dn->add_replica(from);
4442 dout(10) << " have " << *dn << dendl;
4443 if (ack)
4444 ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
4445 dnl->get_inode()->ino(), inodeno_t(0), 0,
4446 dnonce, dn->lock.get_replica_state());
4447
4448 // inode
4449 CInode *in = dnl->get_inode();
4450 assert(in);
4451
4452 if (survivor && in->is_replica(from))
4453 inode_remove_replica(in, from, true, gather_locks);
4454 unsigned inonce = in->add_replica(from);
4455 dout(10) << " have " << *in << dendl;
4456
4457 // scatter the dirlock, just in case?
4458 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4459 in->filelock.set_state(LOCK_MIX);
4460
4461 if (ack) {
4462 acked_inodes.insert(in->vino());
4463 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4464 bufferlist bl;
4465 in->_encode_locks_state_for_rejoin(bl, from);
4466 ack->add_inode_locks(in, inonce, bl);
4467 }
4468 }
4469 }
4470
4471 // weak base inodes? (root, stray, etc.)
4472 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4473 p != weak->weak_inodes.end();
4474 ++p) {
4475 CInode *in = get_inode(*p);
4476 assert(in); // hmm fixme wrt stray?
4477 if (survivor && in->is_replica(from))
4478 inode_remove_replica(in, from, true, gather_locks);
4479 unsigned inonce = in->add_replica(from);
4480 dout(10) << " have base " << *in << dendl;
4481
4482 if (ack) {
4483 acked_inodes.insert(in->vino());
4484 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4485 bufferlist bl;
4486 in->_encode_locks_state_for_rejoin(bl, from);
4487 ack->add_inode_locks(in, inonce, bl);
4488 }
4489 }
4490
4491 assert(rejoin_gather.count(from));
4492 rejoin_gather.erase(from);
4493 if (survivor) {
4494 // survivor. do everything now.
4495 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4496 p != weak->inode_scatterlocks.end();
4497 ++p) {
4498 CInode *in = get_inode(p->first);
4499 assert(in);
4500 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4501 acked_inodes.insert(in->vino());
4502 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4503 }
4504
4505 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4506 mds->send_message(ack, weak->get_connection());
4507
4508 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4509 if (!(*p)->is_stable())
4510 mds->locker->eval_gather(*p);
4511 }
4512 } else {
4513 // done?
4514 if (rejoin_gather.empty()) {
4515 rejoin_gather_finish();
4516 } else {
4517 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4518 }
4519 }
4520}
4521
4522class C_MDC_RejoinGatherFinish : public MDCacheContext {
4523public:
4524 explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {}
4525 void finish(int r) override {
4526 mdcache->rejoin_gather_finish();
4527 }
4528};
4529
4530/*
4531 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4532 *
4533 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4534 * ack, the replica dne, and we can remove it from our replica maps.
4535 */
4536void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4537 set<vinodeno_t>& acked_inodes,
4538 set<SimpleLock *>& gather_locks)
4539{
4540 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4541
4542 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
4543 p != inode_map.end();
4544 ++p) {
4545 CInode *in = p->second;
4546
4547 // inode?
4548 if (in->is_auth() &&
4549 in->is_replica(from) &&
4550 (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
4551 inode_remove_replica(in, from, false, gather_locks);
4552 dout(10) << " rem " << *in << dendl;
4553 }
4554
4555 if (!in->is_dir()) continue;
4556
4557 list<CDir*> dfs;
4558 in->get_dirfrags(dfs);
4559 for (list<CDir*>::iterator p = dfs.begin();
4560 p != dfs.end();
4561 ++p) {
4562 CDir *dir = *p;
181888fb
FG
4563 if (!dir->is_auth())
4564 continue;
7c673cae 4565
181888fb 4566 if (dir->is_replica(from) &&
7c673cae
FG
4567 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4568 dir->remove_replica(from);
4569 dout(10) << " rem " << *dir << dendl;
4570 }
4571
4572 // dentries
4573 for (CDir::map_t::iterator p = dir->items.begin();
4574 p != dir->items.end();
4575 ++p) {
4576 CDentry *dn = p->second;
4577
4578 if (dn->is_replica(from) &&
4579 (ack == NULL ||
4580 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
4581 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
4582 dentry_remove_replica(dn, from, gather_locks);
4583 dout(10) << " rem " << *dn << dendl;
4584 }
4585 }
4586 }
4587 }
4588}
4589
4590
4591CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4592{
4593 CInode *in = new CInode(this, true, 1, last);
4594 in->inode.ino = ino;
4595 in->state_set(CInode::STATE_REJOINUNDEF);
4596 add_inode(in);
4597 rejoin_undef_inodes.insert(in);
4598 dout(10) << " invented " << *in << dendl;
4599 return in;
4600}
4601
4602CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4603{
4604 CInode *in = get_inode(df.ino);
4605 if (!in)
4606 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4607 if (!in->is_dir()) {
4608 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4609 in->inode.mode = S_IFDIR;
4610 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4611 }
4612 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4613 dir->state_set(CDir::STATE_REJOINUNDEF);
4614 rejoin_undef_dirfrags.insert(dir);
4615 dout(10) << " invented " << *dir << dendl;
4616 return dir;
4617}
4618
4619 /* This function DOES NOT put the passed message before returning */
4620void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4621{
4622 mds_rank_t from = mds_rank_t(strong->get_source().num());
4623
4624 // only a recovering node will get a strong rejoin.
4625 assert(mds->is_rejoin());
4626
4627 // assimilate any potentially dirty scatterlock state
4628 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4629 p != strong->inode_scatterlocks.end();
4630 ++p) {
4631 CInode *in = get_inode(p->first);
4632 assert(in);
4633 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4634 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4635 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4636 rejoin_potential_updated_scatterlocks.insert(in);
4637 }
4638
4639 rejoin_unlinked_inodes[from].clear();
4640
4641 // surviving peer may send incorrect dirfrag here (maybe they didn't
4642 // get the fragment notify, or maybe we rolled back?). we need to
4643 // infer the right frag and get them with the program. somehow.
4644 // we don't normally send ACK.. so we'll need to bundle this with
4645 // MISSING or something.
4646
4647 // strong dirfrags/dentries.
4648 // also process auth_pins, xlocks.
4649 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4650 p != strong->strong_dirfrags.end();
4651 ++p) {
4652 CInode *diri = get_inode(p->first.ino);
4653 if (!diri)
4654 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4655 CDir *dir = diri->get_dirfrag(p->first.frag);
4656 bool refragged = false;
4657 if (dir) {
4658 dout(10) << " have " << *dir << dendl;
4659 } else {
4660 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4661 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4662 else if (diri->dirfragtree.is_leaf(p->first.frag))
4663 dir = rejoin_invent_dirfrag(p->first);
4664 }
4665 if (dir) {
4666 dir->add_replica(from, p->second.nonce);
4667 dir->dir_rep = p->second.dir_rep;
4668 } else {
4669 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4670 list<frag_t> ls;
4671 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4672 if (ls.empty())
4673 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4674 dout(10) << " maps to frag(s) " << ls << dendl;
4675 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4676 CDir *dir = diri->get_dirfrag(*q);
4677 if (!dir)
4678 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4679 else
4680 dout(10) << " have(approx) " << *dir << dendl;
4681 dir->add_replica(from, p->second.nonce);
4682 dir->dir_rep = p->second.dir_rep;
4683 }
4684 refragged = true;
4685 }
4686
4687 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4688 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4689 q != dmap.end();
4690 ++q) {
4691 CDentry *dn;
4692 if (!refragged)
4693 dn = dir->lookup(q->first.name, q->first.snapid);
4694 else {
4695 frag_t fg = diri->pick_dirfrag(q->first.name);
4696 dir = diri->get_dirfrag(fg);
4697 assert(dir);
4698 dn = dir->lookup(q->first.name, q->first.snapid);
4699 }
4700 if (!dn) {
4701 if (q->second.is_remote()) {
4702 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4703 q->second.first, q->first.snapid);
4704 } else if (q->second.is_null()) {
4705 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4706 } else {
4707 CInode *in = get_inode(q->second.ino, q->first.snapid);
4708 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4709 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4710 }
4711 dout(10) << " invented " << *dn << dendl;
4712 }
4713 CDentry::linkage_t *dnl = dn->get_linkage();
4714
4715 // dn auth_pin?
4716 if (strong->authpinned_dentries.count(p->first) &&
4717 strong->authpinned_dentries[p->first].count(q->first)) {
4718 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4719 r != strong->authpinned_dentries[p->first][q->first].end();
4720 ++r) {
4721 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4722
4723 // get/create slave mdrequest
4724 MDRequestRef mdr;
4725 if (have_request(r->reqid))
4726 mdr = request_get(r->reqid);
4727 else
4728 mdr = request_start_slave(r->reqid, r->attempt, strong);
4729 mdr->auth_pin(dn);
4730 }
4731 }
4732
4733 // dn xlock?
4734 if (strong->xlocked_dentries.count(p->first) &&
4735 strong->xlocked_dentries[p->first].count(q->first)) {
4736 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4737 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4738 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4739 assert(mdr->is_auth_pinned(dn));
4740 if (!mdr->xlocks.count(&dn->versionlock)) {
4741 assert(dn->versionlock.can_xlock_local());
4742 dn->versionlock.get_xlock(mdr, mdr->get_client());
4743 mdr->xlocks.insert(&dn->versionlock);
4744 mdr->locks.insert(&dn->versionlock);
4745 }
4746 if (dn->lock.is_stable())
4747 dn->auth_pin(&dn->lock);
4748 dn->lock.set_state(LOCK_XLOCK);
4749 dn->lock.get_xlock(mdr, mdr->get_client());
4750 mdr->xlocks.insert(&dn->lock);
4751 mdr->locks.insert(&dn->lock);
4752 }
4753
4754 dn->add_replica(from, q->second.nonce);
4755 dout(10) << " have " << *dn << dendl;
4756
4757 if (dnl->is_primary()) {
4758 if (q->second.is_primary()) {
4759 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4760 // the survivor missed MDentryUnlink+MDentryLink messages ?
4761 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4762 CInode *in = get_inode(q->second.ino, q->first.snapid);
4763 assert(in);
4764 assert(in->get_parent_dn());
4765 rejoin_unlinked_inodes[from].insert(in);
4766 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4767 }
4768 } else {
4769 // the survivor missed MDentryLink message ?
4770 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4771	  dout(7) << " sender doesn't have primary dentry" << dendl;
4772 }
4773 } else {
4774 if (q->second.is_primary()) {
4775 // the survivor missed MDentryUnlink message ?
4776 CInode *in = get_inode(q->second.ino, q->first.snapid);
4777 assert(in);
4778 assert(in->get_parent_dn());
4779 rejoin_unlinked_inodes[from].insert(in);
4780 dout(7) << " sender has primary dentry but we don't" << dendl;
4781 }
4782 }
4783 }
4784 }
4785
4786 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4787 p != strong->strong_inodes.end();
4788 ++p) {
4789 CInode *in = get_inode(p->first);
4790 assert(in);
4791 in->add_replica(from, p->second.nonce);
4792 dout(10) << " have " << *in << dendl;
4793
4794 MMDSCacheRejoin::inode_strong &is = p->second;
4795
4796 // caps_wanted
4797 if (is.caps_wanted) {
4798 in->mds_caps_wanted[from] = is.caps_wanted;
4799 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4800 << " on " << *in << dendl;
4801 }
4802
4803 // scatterlocks?
4804 // infer state from replica state:
4805 // * go to MIX if they might have wrlocks
4806 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4807 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4808 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4809 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4810
4811 // auth pin?
4812 if (strong->authpinned_inodes.count(in->vino())) {
4813 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4814 r != strong->authpinned_inodes[in->vino()].end();
4815 ++r) {
4816 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4817
4818 // get/create slave mdrequest
4819 MDRequestRef mdr;
4820 if (have_request(r->reqid))
4821 mdr = request_get(r->reqid);
4822 else
4823 mdr = request_start_slave(r->reqid, r->attempt, strong);
4824 if (strong->frozen_authpin_inodes.count(in->vino())) {
4825 assert(!in->get_num_auth_pins());
4826 mdr->freeze_auth_pin(in);
4827 } else {
4828 assert(!in->is_frozen_auth_pin());
4829 }
4830 mdr->auth_pin(in);
4831 }
4832 }
4833 // xlock(s)?
4834 if (strong->xlocked_inodes.count(in->vino())) {
4835 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4836 q != strong->xlocked_inodes[in->vino()].end();
4837 ++q) {
4838 SimpleLock *lock = in->get_lock(q->first);
4839 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4840 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4841 assert(mdr->is_auth_pinned(in));
4842 if (!mdr->xlocks.count(&in->versionlock)) {
4843 assert(in->versionlock.can_xlock_local());
4844 in->versionlock.get_xlock(mdr, mdr->get_client());
4845 mdr->xlocks.insert(&in->versionlock);
4846 mdr->locks.insert(&in->versionlock);
4847 }
4848 if (lock->is_stable())
4849 in->auth_pin(lock);
4850 lock->set_state(LOCK_XLOCK);
4851 if (lock == &in->filelock)
4852 in->loner_cap = -1;
4853 lock->get_xlock(mdr, mdr->get_client());
4854 mdr->xlocks.insert(lock);
4855 mdr->locks.insert(lock);
4856 }
4857 }
4858 }
4859 // wrlock(s)?
4860 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4861 p != strong->wrlocked_inodes.end();
4862 ++p) {
4863 CInode *in = get_inode(p->first);
4864 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4865 q != p->second.end();
4866 ++q) {
4867 SimpleLock *lock = in->get_lock(q->first);
4868 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4869 r != q->second.end();
4870 ++r) {
4871 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4872 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4873 if (in->is_auth())
4874 assert(mdr->is_auth_pinned(in));
4875 lock->set_state(LOCK_MIX);
4876 if (lock == &in->filelock)
4877 in->loner_cap = -1;
4878 lock->get_wrlock(true);
4879 mdr->wrlocks.insert(lock);
4880 mdr->locks.insert(lock);
4881 }
4882 }
4883 }
4884
4885 // done?
4886 assert(rejoin_gather.count(from));
4887 rejoin_gather.erase(from);
4888 if (rejoin_gather.empty()) {
4889 rejoin_gather_finish();
4890 } else {
4891 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4892 }
4893}
4894
4895 /* This function DOES NOT put the passed message before returning */
4896void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4897{
4898 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4899 mds_rank_t from = mds_rank_t(ack->get_source().num());
4900
4901 // for sending cache expire message
4902 set<CInode*> isolated_inodes;
4903 set<CInode*> refragged_inodes;
4904
4905 // dirs
4906 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4907 p != ack->strong_dirfrags.end();
4908 ++p) {
4909 // we may have had incorrect dir fragmentation; refragment based
4910     // on what the auth tells us.
4911 CDir *dir = get_dirfrag(p->first);
4912 if (!dir) {
4913 dir = get_force_dirfrag(p->first, false);
4914 if (dir)
4915 refragged_inodes.insert(dir->get_inode());
4916 }
4917 if (!dir) {
4918 CInode *diri = get_inode(p->first.ino);
4919 if (!diri) {
4920 // barebones inode; the full inode loop below will clean up.
4921 diri = new CInode(this, false);
4922 diri->inode.ino = p->first.ino;
4923 diri->inode.mode = S_IFDIR;
4924 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4925 add_inode(diri);
4926 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4927 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4928 dout(10) << " add inode " << *diri << dendl;
4929 } else {
4930 diri->inode_auth = CDIR_AUTH_DEFAULT;
4931 isolated_inodes.insert(diri);
4932 dout(10) << " unconnected dirfrag " << p->first << dendl;
4933 }
4934 }
4935 // barebones dirfrag; the full dirfrag loop below will clean up.
4936 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4937 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4938 (dir->authority() != CDIR_AUTH_UNDEF &&
4939 dir->authority().first != from))
4940 adjust_subtree_auth(dir, from);
4941 dout(10) << " add dirfrag " << *dir << dendl;
4942 }
4943
4944 dir->set_replica_nonce(p->second.nonce);
4945 dir->state_clear(CDir::STATE_REJOINING);
4946 dout(10) << " got " << *dir << dendl;
4947
4948 // dentries
4949 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4950 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4951 q != dmap.end();
4952 ++q) {
4953 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4954 if(!dn)
4955 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4956
4957 CDentry::linkage_t *dnl = dn->get_linkage();
4958
4959 assert(dn->last == q->first.snapid);
4960 if (dn->first != q->second.first) {
4961 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4962 dn->first = q->second.first;
4963 }
4964
4965 // may have bad linkage if we missed dentry link/unlink messages
4966 if (dnl->is_primary()) {
4967 CInode *in = dnl->get_inode();
4968 if (!q->second.is_primary() ||
4969 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4970 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4971 dir->unlink_inode(dn);
4972 }
4973 } else if (dnl->is_remote()) {
4974 if (!q->second.is_remote() ||
4975 q->second.remote_ino != dnl->get_remote_ino() ||
4976 q->second.remote_d_type != dnl->get_remote_d_type()) {
4977 dout(10) << " had bad linkage for " << *dn << dendl;
4978 dir->unlink_inode(dn);
4979 }
4980 } else {
4981 if (!q->second.is_null())
4982 dout(10) << " had bad linkage for " << *dn << dendl;
4983 }
4984
4985 // hmm, did we have the proper linkage here?
4986 if (dnl->is_null() && !q->second.is_null()) {
4987 if (q->second.is_remote()) {
4988 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
4989 } else {
4990 CInode *in = get_inode(q->second.ino, q->first.snapid);
4991 if (!in) {
4992 // barebones inode; assume it's dir, the full inode loop below will clean up.
4993 in = new CInode(this, false, q->second.first, q->first.snapid);
4994 in->inode.ino = q->second.ino;
4995 in->inode.mode = S_IFDIR;
4996 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4997 add_inode(in);
4998 dout(10) << " add inode " << *in << dendl;
4999 } else if (in->get_parent_dn()) {
5000 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5001 << ", unlinking " << *in << dendl;
5002 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5003 }
5004 dn->dir->link_primary_inode(dn, in);
5005 isolated_inodes.erase(in);
5006 }
5007 }
5008
5009 dn->set_replica_nonce(q->second.nonce);
5010 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
5011 dn->state_clear(CDentry::STATE_REJOINING);
5012 dout(10) << " got " << *dn << dendl;
5013 }
5014 }
5015
5016 for (set<CInode*>::iterator p = refragged_inodes.begin();
5017 p != refragged_inodes.end();
5018 ++p) {
5019 list<CDir*> ls;
5020 (*p)->get_nested_dirfrags(ls);
5021 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5022 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5023 continue;
5024 assert((*q)->get_num_any() == 0);
5025 (*p)->close_dirfrag((*q)->get_frag());
5026 }
5027 }
5028
5029 // full dirfrags
5030 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5031 p != ack->dirfrag_bases.end();
5032 ++p) {
5033 CDir *dir = get_dirfrag(p->first);
5034 assert(dir);
5035 bufferlist::iterator q = p->second.begin();
5036 dir->_decode_base(q);
5037 dout(10) << " got dir replica " << *dir << dendl;
5038 }
5039
5040 // full inodes
5041 bufferlist::iterator p = ack->inode_base.begin();
5042 while (!p.end()) {
5043 inodeno_t ino;
5044 snapid_t last;
5045 bufferlist basebl;
5046 ::decode(ino, p);
5047 ::decode(last, p);
5048 ::decode(basebl, p);
5049 CInode *in = get_inode(ino, last);
5050 assert(in);
5051 bufferlist::iterator q = basebl.begin();
5052 in->_decode_base(q);
5053 dout(10) << " got inode base " << *in << dendl;
5054 }
5055
5056 // inodes
5057 p = ack->inode_locks.begin();
5058 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5059 while (!p.end()) {
5060 inodeno_t ino;
5061 snapid_t last;
5062 __u32 nonce;
5063 bufferlist lockbl;
5064 ::decode(ino, p);
5065 ::decode(last, p);
5066 ::decode(nonce, p);
5067 ::decode(lockbl, p);
5068
5069 CInode *in = get_inode(ino, last);
5070 assert(in);
5071 in->set_replica_nonce(nonce);
5072 bufferlist::iterator q = lockbl.begin();
5073 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
5074 in->state_clear(CInode::STATE_REJOINING);
5075 dout(10) << " got inode locks " << *in << dendl;
5076 }
5077
5078 // FIXME: This can happen if entire subtree, together with the inode subtree root
5079 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5080 assert(isolated_inodes.empty());
5081
5082 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5083 bufferlist::iterator bp = ack->imported_caps.begin();
5084 ::decode(peer_imported, bp);
5085
5086 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5087 p != peer_imported.end();
5088 ++p) {
5089 assert(cap_exports.count(p->first));
5090 assert(cap_export_targets.count(p->first));
5091 assert(cap_export_targets[p->first] == from);
5092 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5093 q != p->second.end();
5094 ++q) {
5095 assert(cap_exports[p->first].count(q->first));
5096
5097 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5098 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5099 assert(session);
5100
5101 // mark client caps stale.
5102 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
5103 cap_exports[p->first][q->first].capinfo.cap_id, 0,
5104 mds->get_osd_epoch_barrier());
5105 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5106 (q->second.cap_id > 0 ? from : -1), 0);
5107 mds->send_message_client_counted(m, session);
5108
5109 cap_exports[p->first].erase(q->first);
5110 }
5111 assert(cap_exports[p->first].empty());
5112 }
5113
5114 // done?
5115 assert(rejoin_ack_gather.count(from));
5116 rejoin_ack_gather.erase(from);
5117 if (mds->is_rejoin()) {
5118
5119 if (rejoin_gather.empty()) {
5120 // eval unstable scatter locks after all wrlocks are rejoined.
5121 while (!rejoin_eval_locks.empty()) {
5122 SimpleLock *lock = rejoin_eval_locks.front();
5123 rejoin_eval_locks.pop_front();
5124 if (!lock->is_stable())
5125 mds->locker->eval_gather(lock);
5126 }
5127 }
5128
5129 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5130 rejoin_ack_gather.empty()) {
5131 // finally, kickstart past snap parent opens
5132 open_snap_parents();
5133 } else {
5134 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5135 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5136 }
5137 } else {
5138 // survivor.
5139 mds->queue_waiters(rejoin_waiters);
5140 }
5141}
5142
5143/**
5144 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5145 *
5146 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5147 * messages that clean these guys up...
5148 */
5149void MDCache::rejoin_trim_undef_inodes()
5150{
5151 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5152
5153 while (!rejoin_undef_inodes.empty()) {
5154 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5155 CInode *in = *p;
5156 rejoin_undef_inodes.erase(p);
5157
5158 in->clear_replica_map();
5159
5160 // close out dirfrags
5161 if (in->is_dir()) {
5162 list<CDir*> dfls;
5163 in->get_dirfrags(dfls);
5164 for (list<CDir*>::iterator p = dfls.begin();
5165 p != dfls.end();
5166 ++p) {
5167 CDir *dir = *p;
5168 dir->clear_replica_map();
5169
5170 for (CDir::map_t::iterator p = dir->items.begin();
5171 p != dir->items.end();
5172 ++p) {
5173 CDentry *dn = p->second;
5174 dn->clear_replica_map();
5175
5176 dout(10) << " trimming " << *dn << dendl;
5177 dir->remove_dentry(dn);
5178 }
5179
5180 dout(10) << " trimming " << *dir << dendl;
5181 in->close_dirfrag(dir->dirfrag().frag);
5182 }
5183 }
5184
5185 CDentry *dn = in->get_parent_dn();
5186 if (dn) {
5187 dn->clear_replica_map();
5188 dout(10) << " trimming " << *dn << dendl;
5189 dn->dir->remove_dentry(dn);
5190 } else {
5191 dout(10) << " trimming " << *in << dendl;
5192 remove_inode(in);
5193 }
5194 }
5195
5196 assert(rejoin_undef_inodes.empty());
5197}
5198
5199void MDCache::rejoin_gather_finish()
5200{
5201 dout(10) << "rejoin_gather_finish" << dendl;
5202 assert(mds->is_rejoin());
5203
5204 if (open_undef_inodes_dirfrags())
5205 return;
5206
5207 if (process_imported_caps())
5208 return;
5209
5210 choose_lock_states_and_reconnect_caps();
5211
5212 identify_files_to_recover();
5213 rejoin_send_acks();
5214
5215 // signal completion of fetches, rejoin_gather_finish, etc.
5216 assert(rejoin_ack_gather.count(mds->get_nodeid()));
5217 rejoin_ack_gather.erase(mds->get_nodeid());
5218
5219 // did we already get our acks too?
5220 if (rejoin_ack_gather.empty()) {
5221 // finally, kickstart past snap parent opens
5222 open_snap_parents();
5223 }
5224}
5225
5226class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5227 inodeno_t ino;
5228public:
5229 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5230 void finish(int r) override {
5231 mdcache->rejoin_open_ino_finish(ino, r);
5232 }
5233};
5234
5235void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5236{
5237 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5238
5239 if (ret < 0) {
5240 cap_imports_missing.insert(ino);
5241 } else if (ret == mds->get_nodeid()) {
5242 assert(get_inode(ino));
5243 } else {
5244 auto p = cap_imports.find(ino);
5245 assert(p != cap_imports.end());
5246 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5247 assert(q->second.count(MDS_RANK_NONE));
5248 assert(q->second.size() == 1);
5249 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5250 }
5251 cap_imports.erase(p);
5252 }
5253
5254 assert(cap_imports_num_opening > 0);
5255 cap_imports_num_opening--;
5256
5257 if (cap_imports_num_opening == 0) {
5258 if (rejoin_gather.empty())
5259 rejoin_gather_finish();
5260 else if (rejoin_gather.count(mds->get_nodeid()))
5261 process_imported_caps();
5262 }
5263}
5264
5265class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5266public:
5267 map<client_t,entity_inst_t> client_map;
5268 map<client_t,uint64_t> sseqmap;
5269
5270 C_MDC_RejoinSessionsOpened(MDCache *c, map<client_t,entity_inst_t>& cm) :
5271 MDCacheLogContext(c), client_map(cm) {}
5272 void finish(int r) override {
5273 assert(r == 0);
5274 mdcache->rejoin_open_sessions_finish(client_map, sseqmap);
5275 }
5276};
5277
5278void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
5279 map<client_t,uint64_t>& sseqmap)
5280{
5281 dout(10) << "rejoin_open_sessions_finish" << dendl;
5282 mds->server->finish_force_open_sessions(client_map, sseqmap);
5283 if (rejoin_gather.empty())
5284 rejoin_gather_finish();
5285}
5286
5287bool MDCache::process_imported_caps()
5288{
5289 dout(10) << "process_imported_caps" << dendl;
5290
5291 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5292 CInode *in = get_inode(p->first);
5293 if (in) {
5294 assert(in->is_auth());
5295 cap_imports_missing.erase(p->first);
5296 continue;
5297 }
5298 if (cap_imports_missing.count(p->first) > 0)
5299 continue;
5300
5301 cap_imports_num_opening++;
5302 dout(10) << " opening missing ino " << p->first << dendl;
5303 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5304 }
5305
5306 if (cap_imports_num_opening > 0)
5307 return true;
5308
5309 // called by rejoin_gather_finish() ?
5310 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5311 // are the sessions for imported caps all open?
5312 for (map<client_t,entity_inst_t>::iterator p = rejoin_client_map.begin();
5313 p != rejoin_client_map.end();
5314 ++p) {
5315 if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) {
5316 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map);
5317 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap);
5318 ESessions *le = new ESessions(pv, rejoin_client_map);
5319 mds->mdlog->start_submit_entry(le, finish);
5320 mds->mdlog->flush();
5321 rejoin_client_map.clear();
5322 return true;
5323 }
5324 }
5325 rejoin_client_map.clear();
5326
5327 // process caps that were exported by slave rename
5328 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5329 p != rejoin_slave_exports.end();
5330 ++p) {
5331 CInode *in = get_inode(p->first);
5332 assert(in);
5333 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5334 q != p->second.second.end();
5335 ++q) {
5336 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5337 assert(session);
5338
5339 Capability *cap = in->get_client_cap(q->first);
5340 if (!cap)
5341 cap = in->add_client_cap(q->first, session);
5342 cap->merge(q->second, true);
5343
5344 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5345 assert(cap->get_last_seq() == im.issue_seq);
5346 assert(cap->get_mseq() == im.mseq);
5347 cap->set_cap_id(im.cap_id);
5348 // send cap import because we assigned a new cap ID
5349 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5350 p->second.first, CEPH_CAP_FLAG_AUTH);
5351 }
5352 }
5353 rejoin_slave_exports.clear();
5354 rejoin_imported_caps.clear();
5355
5356 // process cap imports
5357 // ino -> client -> frommds -> capex
5358 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5359 CInode *in = get_inode(p->first);
5360 if (!in) {
5361 dout(10) << " still missing ino " << p->first
5362 << ", will try again after replayed client requests" << dendl;
5363 ++p;
5364 continue;
5365 }
5366 assert(in->is_auth());
5367 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5368 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5369 assert(session);
5370 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5371 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5372 add_reconnected_cap(q->first, in->ino(), r->second);
5373 if (r->first >= 0) {
5374 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5375 cap->inc_mseq();
5376 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5377
5378 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5379 im.cap_id = cap->get_cap_id();
5380 im.issue_seq = cap->get_last_seq();
5381 im.mseq = cap->get_mseq();
5382 }
5383 }
5384 }
5385 cap_imports.erase(p++); // remove and move on
5386 }
5387 } else {
5388 trim_non_auth();
5389
5390 rejoin_gather.erase(mds->get_nodeid());
5391 maybe_send_pending_rejoins();
5392
5393 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5394 rejoin_gather_finish();
5395 }
5396 return false;
5397}
5398
5399void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5400{
5401 // are this realm's parents fully open?
5402 if (realm->have_past_parents_open()) {
5403 dout(10) << " have past snap parents for realm " << *realm
5404 << " on " << *realm->inode << dendl;
5405 if (reconnect) {
5406 // finish off client snaprealm reconnects?
5407 auto p = reconnected_snaprealms.find(realm->inode->ino());
5408 if (p != reconnected_snaprealms.end()) {
5409 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5410 finish_snaprealm_reconnect(q->first, realm, q->second);
5411 reconnected_snaprealms.erase(p);
5412 }
5413 }
5414 } else {
5415 if (!missing_snap_parents.count(realm->inode)) {
5416 dout(10) << " MISSING past snap parents for realm " << *realm
5417 << " on " << *realm->inode << dendl;
5418 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5419 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5420 } else {
5421 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5422 << " on " << *realm->inode << dendl;
5423 }
5424 }
5425}
5426
5427void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5428 client_t client, snapid_t snap_follows)
5429{
5430 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5431
5432 const set<snapid_t>& snaps = realm->get_snaps();
5433 snapid_t follows = snap_follows;
5434
5435 while (true) {
5436 CInode *in = pick_inode_snap(head_in, follows);
5437 if (in == head_in)
5438 break;
5439 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5440
5441 /* TODO: we can check the reconnected/flushing caps to find
5442 * which locks need gathering */
5443 for (int i = 0; i < num_cinode_locks; i++) {
5444 int lockid = cinode_lock_info[i].lock;
5445 SimpleLock *lock = in->get_lock(lockid);
5446 assert(lock);
5447 in->client_snap_caps[lockid].insert(client);
5448 in->auth_pin(lock);
5449 lock->set_state(LOCK_SNAP_SYNC);
5450 lock->get_wrlock(true);
5451 }
5452
5453 for (auto p = snaps.lower_bound(in->first);
5454 p != snaps.end() && *p <= in->last;
5455 ++p) {
5456 head_in->add_need_snapflush(in, *p, client);
5457 }
5458
5459 follows = in->last;
5460 }
5461}
5462
5463/*
5464 * choose lock states based on reconnected caps
5465 */
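// Roughly: for every head inode, OR together the dirty caps reported by
// reconnecting clients and let CInode::choose_lock_states() pick compatible
// lock states; clients whose caps reference the wrong snaprealm get a
// CEPH_SNAP_OP_SPLIT message, sent now if the realm's past parents are open
// and deferred via missing_snap_parents otherwise.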
5466void MDCache::choose_lock_states_and_reconnect_caps()
5467{
5468 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5469
5470 map<client_t,MClientSnap*> splits;
5471
5472 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator i = inode_map.begin();
5473 i != inode_map.end();
5474 ++i) {
5475 CInode *in = i->second;
5476
5477 if (in->last != CEPH_NOSNAP)
5478 continue;
5479
5480 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5481 in->mark_dirty_rstat();
5482
5483 auto p = reconnected_caps.find(in->ino());
5484
5485 int dirty_caps = 0;
5486 if (p != reconnected_caps.end()) {
5487 for (const auto &it : p->second)
5488 dirty_caps |= it.second.dirty_caps;
5489 }
5490 in->choose_lock_states(dirty_caps);
5491 dout(15) << " chose lock states on " << *in << dendl;
5492
5493 SnapRealm *realm = in->find_snaprealm();
5494
5495 check_realm_past_parents(realm, realm == in->snaprealm);
5496
5497 if (p != reconnected_caps.end()) {
5498 bool missing_snap_parent = false;
5499 // also, make sure client's cap is in the correct snaprealm.
5500 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5501 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5502 if (realm->have_past_parents_open()) {
5503 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5504 } else {
5505 missing_snap_parent = true;
5506 }
5507 }
5508
5509 if (q->second.realm_ino == realm->inode->ino()) {
5510 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5511 } else {
5512 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5513 << " != " << realm->inode->ino() << dendl;
5514 if (realm->have_past_parents_open()) {
5515 // ok, include in a split message _now_.
5516 prepare_realm_split(realm, q->first, in->ino(), splits);
5517 } else {
5518 // send the split later.
5519 missing_snap_parent = true;
5520 }
5521 }
5522 }
5523 if (missing_snap_parent)
5524 missing_snap_parents[realm->inode].insert(in);
5525 }
5526 }
5527
5528 send_snaps(splits);
5529}
5530
5531void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5532 map<client_t,MClientSnap*>& splits)
5533{
5534 MClientSnap *snap;
5535 if (splits.count(client) == 0) {
5536 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5537 snap->head.split = realm->inode->ino();
5538 realm->build_snap_trace(snap->bl);
5539
5540 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5541 p != realm->open_children.end();
5542 ++p)
5543 snap->split_realms.push_back((*p)->inode->ino());
5544
5545 } else
5546 snap = splits[client];
5547 snap->split_inos.push_back(ino);
5548}
5549
5550void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5551{
5552 dout(10) << "send_snaps" << dendl;
5553
5554 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5555 p != splits.end();
5556 ++p) {
5557 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5558 if (session) {
5559 dout(10) << " client." << p->first
5560 << " split " << p->second->head.split
5561 << " inos " << p->second->split_inos
5562 << dendl;
5563 mds->send_message_client_counted(p->second, session);
5564 } else {
5565 dout(10) << " no session for client." << p->first << dendl;
5566 p->second->put();
5567 }
5568 }
5569 splits.clear();
5570}
5571
5572
5573/*
5574 * remove any items from logsegment open_file lists that don't have
5575 * any caps
5576 */
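// i.e. head inodes are unlisted once no client wants caps on them, and
// snapshotted inodes are unlisted once their pending snap flushes have drained.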
5577void MDCache::clean_open_file_lists()
5578{
5579 dout(10) << "clean_open_file_lists" << dendl;
5580
5581 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5582 p != mds->mdlog->segments.end();
5583 ++p) {
5584 LogSegment *ls = p->second;
5585
5586 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5587 while (!q.end()) {
5588 CInode *in = *q;
5589 ++q;
5590 if (in->last == CEPH_NOSNAP) {
5591 if (!in->is_any_caps_wanted()) {
5592 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5593 in->item_open_file.remove_myself();
5594 }
5595 } else if (in->last != CEPH_NOSNAP) {
5596 if (in->client_snap_caps.empty()) {
5597 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5598 in->item_open_file.remove_myself();
5599 }
5600 }
5601 }
5602 }
5603}
5604
5605
5606
5607Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5608{
5609 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5610 << " on " << *in << dendl;
5611 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5612 if (!session) {
5613 dout(10) << " no session for client." << client << dendl;
5614 return NULL;
5615 }
5616
5617 Capability *cap = in->reconnect_cap(client, icr, session);
5618
5619 if (frommds >= 0) {
5620 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5621 cap->inc_mseq();
5622 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5623 }
5624
5625 return cap;
5626}
5627
5628void MDCache::export_remaining_imported_caps()
5629{
5630 dout(10) << "export_remaining_imported_caps" << dendl;
5631
5632 stringstream warn_str;
5633
5634 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5635 warn_str << " ino " << p->first << "\n";
5636 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5637 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5638 if (session) {
5639 // mark client caps stale.
5640 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5641 stale->set_cap_peer(0, 0, 0, -1, 0);
5642 mds->send_message_client_counted(stale, q->first);
5643 }
5644 }
5645
5646 mds->heartbeat_reset();
5647 }
5648
5649 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5650 p != cap_reconnect_waiters.end();
5651 ++p)
5652 mds->queue_waiters(p->second);
5653
5654 cap_imports.clear();
5655 cap_reconnect_waiters.clear();
5656
5657 if (warn_str.peek() != EOF) {
5658 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5659 mds->clog->warn(warn_str);
5660 }
5661}
5662
5663void MDCache::try_reconnect_cap(CInode *in, Session *session)
5664{
5665 client_t client = session->info.get_client();
5666 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5667 if (rc) {
5668 in->reconnect_cap(client, *rc, session);
5669 dout(10) << "try_reconnect_cap client." << client
5670 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5671 << " issue " << ccap_string(rc->capinfo.issued)
5672 << " on " << *in << dendl;
5673 remove_replay_cap_reconnect(in->ino(), client);
5674
5675 if (in->is_replicated()) {
5676 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5677 } else {
5678 int dirty_caps = 0;
5679 auto p = reconnected_caps.find(in->ino());
5680 if (p != reconnected_caps.end()) {
5681 auto q = p->second.find(client);
5682 if (q != p->second.end())
5683 dirty_caps = q->second.dirty_caps;
5684 }
5685 in->choose_lock_states(dirty_caps);
5686 dout(15) << " chose lock states on " << *in << dendl;
5687 }
5688
5689 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5690 cap_reconnect_waiters.find(in->ino());
5691 if (it != cap_reconnect_waiters.end()) {
5692 mds->queue_waiters(it->second);
5693 cap_reconnect_waiters.erase(it);
5694 }
5695 }
5696}
5697
5698
5699
5700// -------
5701// cap imports and delayed snap parent opens
5702
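// do_cap_import() takes one of two paths: if the inode's snaprealm has all of
// its past parents open, the client is sent a CEPH_CAP_OP_IMPORT message right
// away; otherwise the cap is suppressed, the inode is auth-pinned, and the
// import is parked in delayed_imported_caps / missing_snap_parents.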
5703void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5704 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5705 int peer, int p_flags)
5706{
5707 client_t client = session->info.inst.name.num();
5708 SnapRealm *realm = in->find_snaprealm();
5709 if (realm->have_past_parents_open()) {
5710 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5711 if (cap->get_last_seq() == 0) // reconnected cap
5712 cap->inc_last_seq();
5713 cap->set_last_issue();
5714 cap->set_last_issue_stamp(ceph_clock_now());
5715 cap->clear_new();
5716 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5717 in->ino(),
5718 realm->inode->ino(),
5719 cap->get_cap_id(), cap->get_last_seq(),
5720 cap->pending(), cap->wanted(), 0,
5721 cap->get_mseq(), mds->get_osd_epoch_barrier());
5722 in->encode_cap_message(reap, cap);
5723 realm->build_snap_trace(reap->snapbl);
5724 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5725 mds->send_message_client_counted(reap, session);
5726 } else {
5727 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5728 << cap->get_mseq() << " on " << *in << dendl;
5729 in->auth_pin(this);
5730 cap->inc_suppress();
5731 delayed_imported_caps[client].insert(in);
5732 missing_snap_parents[in].size();
5733 }
5734}
5735
5736void MDCache::do_delayed_cap_imports()
5737{
5738 dout(10) << "do_delayed_cap_imports" << dendl;
5739
5740 assert(delayed_imported_caps.empty());
5741}
5742
5743struct C_MDC_OpenSnapParents : public MDCacheContext {
5744 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5745 void finish(int r) override {
5746 mdcache->open_snap_parents();
5747 }
5748};
5749
5750void MDCache::open_snap_parents()
5751{
5752 dout(10) << "open_snap_parents" << dendl;
5753
5754 map<client_t,MClientSnap*> splits;
5755 MDSGatherBuilder gather(g_ceph_context);
5756
5757 auto p = missing_snap_parents.begin();
5758 while (p != missing_snap_parents.end()) {
5759 CInode *in = p->first;
5760 assert(in->snaprealm);
5761 if (in->snaprealm->open_parents(gather.new_sub())) {
5762 dout(10) << " past parents now open on " << *in << dendl;
5763
5764 for (CInode *child : p->second) {
5765 auto q = reconnected_caps.find(child->ino());
5766 assert(q != reconnected_caps.end());
5767 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5768 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5769 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5770 }
5771 // make sure client's cap is in the correct snaprealm.
5772 if (r->second.realm_ino != in->ino()) {
5773 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5774 }
5775 }
5776 }
5777
5778 missing_snap_parents.erase(p++);
5779
5780 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5781
5782 // finish off client snaprealm reconnects?
5783 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5784 if (q != reconnected_snaprealms.end()) {
5785 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5786 r != q->second.end();
5787 ++r)
5788 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5789 reconnected_snaprealms.erase(q);
5790 }
5791 } else {
5792 dout(10) << " opening past parents on " << *in << dendl;
5793 ++p;
5794 }
5795 }
5796
5797 send_snaps(splits);
5798
5799 if (gather.has_subs()) {
5800 dout(10) << "open_snap_parents - waiting for "
5801 << gather.num_subs_remaining() << dendl;
5802 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5803 gather.activate();
5804 } else {
5805 if (!reconnected_snaprealms.empty()) {
5806 stringstream warn_str;
5807 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5808 p != reconnected_snaprealms.end();
5809 ++p) {
5810 warn_str << " unconnected snaprealm " << p->first << "\n";
5811 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5812 q != p->second.end();
5813 ++q)
5814 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5815 }
5816 mds->clog->warn() << "open_snap_parents has:";
5817 mds->clog->warn(warn_str);
5818 }
5819 assert(rejoin_waiters.empty());
5820 assert(missing_snap_parents.empty());
5821 dout(10) << "open_snap_parents - all open" << dendl;
5822 do_delayed_cap_imports();
5823
5824 assert(rejoin_done);
5825 rejoin_done.release()->complete(0);
5826 reconnected_caps.clear();
5827 }
5828}
5829
5830bool MDCache::open_undef_inodes_dirfrags()
5831{
5832 dout(10) << "open_undef_inodes_dirfrags "
5833 << rejoin_undef_inodes.size() << " inodes "
5834 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5835
5836 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5837
5838 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5839 p != rejoin_undef_inodes.end();
5840 ++p) {
5841 CInode *in = *p;
5842 assert(!in->is_base());
5843 fetch_queue.insert(in->get_parent_dir());
5844 }
5845
5846 if (fetch_queue.empty())
5847 return false;
5848
5849 MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
5850 for (set<CDir*>::iterator p = fetch_queue.begin();
5851 p != fetch_queue.end();
5852 ++p) {
5853 CDir *dir = *p;
5854 CInode *diri = dir->get_inode();
5855 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5856 continue;
5857 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5858 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5859 dir->fetch(gather.new_sub());
5860 }
5861 assert(gather.has_subs());
5862 gather.activate();
5863 return true;
5864}
5865
5866void MDCache::opened_undef_inode(CInode *in) {
5867 dout(10) << "opened_undef_inode " << *in << dendl;
5868 rejoin_undef_inodes.erase(in);
5869 if (in->is_dir()) {
5870 // FIXME: re-hash dentries if necessary
5871 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5872 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5873 CDir *dir = in->get_dirfrag(frag_t());
5874 assert(dir);
5875 rejoin_undef_dirfrags.erase(dir);
5876 in->force_dirfrags();
5877 list<CDir*> ls;
5878 in->get_dirfrags(ls);
5879 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5880 rejoin_undef_dirfrags.insert(*p);
5881 }
5882 }
5883}
5884
5885void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5886{
5887 if (seq < realm->get_newest_seq()) {
5888 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5889 << realm->get_newest_seq()
5890 << " on " << *realm << dendl;
5891 // send an update
5892 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5893 if (session) {
5894 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5895 realm->build_snap_trace(snap->bl);
5896 mds->send_message_client_counted(snap, session);
5897 } else {
5898 dout(10) << " ...or not, no session for this client!" << dendl;
5899 }
5900 } else {
5901 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5902 << " on " << *realm << dendl;
5903 }
5904}
5905
5906
5907
5908void MDCache::rejoin_send_acks()
5909{
5910 dout(7) << "rejoin_send_acks" << dendl;
5911
5912 // replicate stray
5913 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5914 p != rejoin_unlinked_inodes.end();
5915 ++p) {
5916 for (set<CInode*>::iterator q = p->second.begin();
5917 q != p->second.end();
5918 ++q) {
5919 CInode *in = *q;
5920 dout(7) << " unlinked inode " << *in << dendl;
5921 // inode expired
5922 if (!in->is_replica(p->first))
5923 continue;
5924 while (1) {
5925 CDentry *dn = in->get_parent_dn();
5926 if (dn->is_replica(p->first))
5927 break;
5928 dn->add_replica(p->first);
5929 CDir *dir = dn->get_dir();
5930 if (dir->is_replica(p->first))
5931 break;
5932 dir->add_replica(p->first);
5933 in = dir->get_inode();
5934 if (in->is_replica(p->first))
5935 break;
224ce89b 5936 in->add_replica(p->first);
5937 if (in->is_base())
5938 break;
5939 }
5940 }
5941 }
5942 rejoin_unlinked_inodes.clear();
5943
5944 // send acks to everyone in the recovery set
31f18b77 5945 map<mds_rank_t,MMDSCacheRejoin*> acks;
5946 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5947 p != recovery_set.end();
5948 ++p) {
5949 if (rejoin_ack_sent.count(*p))
5950 continue;
5951 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5952 }
5953
5954 rejoin_ack_sent = recovery_set;
5955
5956 // walk subtrees
5957 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5958 p != subtrees.end();
5959 ++p) {
5960 CDir *dir = p->first;
5961 if (!dir->is_auth())
5962 continue;
5963 dout(10) << "subtree " << *dir << dendl;
5964
5965 // auth items in this subtree
5966 list<CDir*> dq;
5967 dq.push_back(dir);
5968
5969 while (!dq.empty()) {
5970 CDir *dir = dq.front();
5971 dq.pop_front();
5972
5973 // dir
5974 for (auto &r : dir->get_replicas()) {
5975 auto it = acks.find(r.first);
5976 if (it == acks.end())
5977 continue;
181888fb 5978 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 5979 it->second->add_dirfrag_base(dir);
5980 }
5981
5982 for (CDir::map_t::iterator q = dir->items.begin();
5983 q != dir->items.end();
5984 ++q) {
5985 CDentry *dn = q->second;
5986 CDentry::linkage_t *dnl = dn->get_linkage();
5987
5988 // inode
5989 CInode *in = NULL;
5990 if (dnl->is_primary())
5991 in = dnl->get_inode();
5992
5993 // dentry
5994 for (auto &r : dn->get_replicas()) {
5995 auto it = acks.find(r.first);
5996 if (it == acks.end())
5997 continue;
5998 it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
5999 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6000 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6001 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6002 ++r.second,
6003 dn->lock.get_replica_state());
6004 // did the peer miss the MDentryLink message?
6005 if (in && !in->is_replica(r.first))
6006 in->add_replica(r.first);
6007 }
6008
6009 if (!in)
6010 continue;
6011
6012 for (auto &r : in->get_replicas()) {
6013 auto it = acks.find(r.first);
6014 if (it == acks.end())
6015 continue;
6016 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6017 bufferlist bl;
6018 in->_encode_locks_state_for_rejoin(bl, r.first);
6019 it->second->add_inode_locks(in, ++r.second, bl);
6020 }
6021
6022 // subdirs in this subtree?
6023 in->get_nested_dirfrags(dq);
6024 }
6025 }
6026 }
6027
6028 // base inodes too
6029 if (root && root->is_auth())
6030 for (auto &r : root->get_replicas()) {
6031 auto it = acks.find(r.first);
6032 if (it == acks.end())
6033 continue;
6034 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6035 bufferlist bl;
6036 root->_encode_locks_state_for_rejoin(bl, r.first);
6037 it->second->add_inode_locks(root, ++r.second, bl);
6038 }
6039 if (myin)
6040 for (auto &r : myin->get_replicas()) {
6041 auto it = acks.find(r.first);
6042 if (it == acks.end())
6043 continue;
6044 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6045 bufferlist bl;
6046 myin->_encode_locks_state_for_rejoin(bl, r.first);
6047 it->second->add_inode_locks(myin, ++r.second, bl);
6048 }
6049
6050 // include inode base for any inodes whose scatterlocks may have updated
6051 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6052 p != rejoin_potential_updated_scatterlocks.end();
6053 ++p) {
6054 CInode *in = *p;
6055 for (const auto &r : in->get_replicas()) {
6056 auto it = acks.find(r.first);
6057 if (it == acks.end())
6058 continue;
6059 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6060 }
6061 }
6062
6063 // send acks
31f18b77 6064 for (auto p = acks.begin(); p != acks.end(); ++p) {
6065 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6066 mds->send_message_mds(p->second, p->first);
6067 }
6068
6069 rejoin_imported_caps.clear();
6070}
6071
6072class C_MDC_ReIssueCaps : public MDCacheContext {
6073 CInode *in;
6074public:
6075 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6076 MDCacheContext(mdc), in(i)
6077 {
6078 in->get(CInode::PIN_PTRWAITER);
6079 }
6080 void finish(int r) override {
6081 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6082 mdcache->mds->locker->issue_caps(in);
6083 in->put(CInode::PIN_PTRWAITER);
6084 }
6085};
6086
6087void MDCache::reissue_all_caps()
6088{
6089 dout(10) << "reissue_all_caps" << dendl;
6090
6091 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6092 p != inode_map.end();
6093 ++p) {
6094 CInode *in = p->second;
6095 if (in->is_head() && in->is_any_caps()) {
6096 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6097 if (in->is_frozen_inode()) {
6098 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6099 continue;
6100 }
6101 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6102 mds->locker->issue_caps(in);
6103 }
6104 }
6105}
6106
6107
6108// ===============================================================================
6109
6110struct C_MDC_QueuedCow : public MDCacheContext {
6111 CInode *in;
6112 MutationRef mut;
6113 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6114 MDCacheContext(mdc), in(i), mut(m) {}
6115 void finish(int r) override {
6116 mdcache->_queued_file_recover_cow(in, mut);
6117 }
6118};
6119
6120
6121void MDCache::queue_file_recover(CInode *in)
6122{
6123 dout(10) << "queue_file_recover " << *in << dendl;
6124 assert(in->is_auth());
6125
6126 // cow?
6127 /*
6128 SnapRealm *realm = in->find_snaprealm();
6129 set<snapid_t> s = realm->get_snaps();
6130 while (!s.empty() && *s.begin() < in->first)
6131 s.erase(s.begin());
6132 while (!s.empty() && *s.rbegin() > in->last)
6133 s.erase(*s.rbegin());
6134 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6135 if (s.size() > 1) {
6136 inode_t *pi = in->project_inode();
6137 pi->version = in->pre_dirty();
6138
6139 auto mut(std::make_shared<MutationImpl>());
6140 mut->ls = mds->mdlog->get_current_segment();
6141 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6142 mds->mdlog->start_entry(le);
6143 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6144
6145 s.erase(*s.begin());
6146 while (!s.empty()) {
6147 snapid_t snapid = *s.begin();
6148 CInode *cow_inode = 0;
6149 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6150 assert(cow_inode);
6151 recovery_queue.enqueue(cow_inode);
6152 s.erase(*s.begin());
6153 }
6154
6155 in->parent->first = in->first;
6156 le->metablob.add_primary_dentry(in->parent, in, true);
6157 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6158 mds->mdlog->flush();
6159 }
6160 */
6161
6162 recovery_queue.enqueue(in);
6163}
6164
6165void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6166{
6167 in->pop_and_dirty_projected_inode(mut->ls);
6168 mut->apply();
6169 mds->locker->drop_locks(mut.get());
6170 mut->cleanup();
6171}
6172
6173
6174/*
6175 * called after recovery to recover file sizes for previously opened (for write)
6176 * files. that is, those where max_size > size.
6177 */
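// An inode needs size recovery when some client has a writeable range recorded
// in client_ranges but no longer holds a cap on the inode after reconnect;
// such inodes go to rejoin_recover_q (filelock -> LOCK_PRE_SCAN), everything
// else goes to rejoin_check_q.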
6178void MDCache::identify_files_to_recover()
6179{
6180 dout(10) << "identify_files_to_recover" << dendl;
6181 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6182 p != inode_map.end();
6183 ++p) {
6184 CInode *in = p->second;
6185 if (!in->is_auth())
6186 continue;
6187
6188 if (in->last != CEPH_NOSNAP)
6189 continue;
6190
6191 // Only normal files need file size recovery
6192 if (!in->is_file()) {
6193 continue;
6194 }
6195
6196 bool recover = false;
6197 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6198 p != in->inode.client_ranges.end();
6199 ++p) {
6200 Capability *cap = in->get_client_cap(p->first);
6201 if (!cap) {
6202 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6203 recover = true;
6204 break;
6205 }
6206 }
6207
6208 if (recover) {
6209 if (in->filelock.is_stable()) {
6210 in->auth_pin(&in->filelock);
6211 } else {
6212 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6213 }
6214 in->filelock.set_state(LOCK_PRE_SCAN);
6215 rejoin_recover_q.push_back(in);
6216 } else {
6217 rejoin_check_q.push_back(in);
6218 }
6219 }
6220}
6221
6222void MDCache::start_files_to_recover()
6223{
6224 for (CInode *in : rejoin_check_q) {
6225 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6226 mds->locker->issue_caps(in);
6227 mds->locker->check_inode_max_size(in);
6228 }
6229 rejoin_check_q.clear();
6230 for (CInode *in : rejoin_recover_q) {
6231 mds->locker->file_recover(&in->filelock);
6232 }
6233 if (!rejoin_recover_q.empty()) {
6234 rejoin_recover_q.clear();
6235 do_file_recover();
6236 }
6237}
6238
6239void MDCache::do_file_recover()
6240{
6241 recovery_queue.advance();
6242}
6243
6244// ===============================================================================
6245
6246
6247// ----------------------------
6248// truncate
6249
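// Truncate pipeline: truncate_inode() pins the inode and its log segment,
// _truncate_inode() issues the object-level truncate through the Filer, and
// C_IO_MDC_TruncateFinish -> truncate_inode_finish() -> truncate_inode_logged()
// journal the completion and drop the pins once the OSD side has finished.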
6250class C_MDC_RetryTruncate : public MDCacheContext {
6251 CInode *in;
6252 LogSegment *ls;
6253public:
6254 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6255 MDCacheContext(c), in(i), ls(l) {}
6256 void finish(int r) override {
6257 mdcache->_truncate_inode(in, ls);
6258 }
6259};
6260
6261void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6262{
6263 inode_t *pi = in->get_projected_inode();
6264 dout(10) << "truncate_inode "
6265 << pi->truncate_from << " -> " << pi->truncate_size
6266 << " on " << *in
6267 << dendl;
6268
6269 ls->truncating_inodes.insert(in);
6270 in->get(CInode::PIN_TRUNCATING);
6271 in->auth_pin(this);
6272
6273 if (!in->client_need_snapflush.empty() &&
6274 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6275 assert(in->filelock.is_xlocked());
6276 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6277 mds->locker->issue_caps(in);
6278 return;
6279 }
6280
6281 _truncate_inode(in, ls);
6282}
6283
6284struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6285 CInode *in;
6286 LogSegment *ls;
6287 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6288 MDCacheIOContext(c), in(i), ls(l) {}
6289 void finish(int r) override {
6290 assert(r == 0 || r == -ENOENT);
6291 mdcache->truncate_inode_finish(in, ls);
6292 }
6293};
6294
6295void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6296{
6297 inode_t *pi = &in->inode;
6298 dout(10) << "_truncate_inode "
6299 << pi->truncate_from << " -> " << pi->truncate_size
6300 << " on " << *in << dendl;
6301
6302 assert(pi->is_truncating());
6303 assert(pi->truncate_size < (1ULL << 63));
6304 assert(pi->truncate_from < (1ULL << 63));
6305 assert(pi->truncate_size < pi->truncate_from);
6306
6307
6308 SnapRealm *realm = in->find_snaprealm();
6309 SnapContext nullsnap;
6310 const SnapContext *snapc;
6311 if (realm) {
6312 dout(10) << " realm " << *realm << dendl;
6313 snapc = &realm->get_snap_context();
6314 } else {
6315 dout(10) << " NO realm, using null context" << dendl;
6316 snapc = &nullsnap;
6317 assert(in->last == CEPH_NOSNAP);
6318 }
6319 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6320 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6321 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6322 pi->truncate_seq, ceph::real_time::min(), 0,
6323 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6324 mds->finisher));
6325}
6326
6327struct C_MDC_TruncateLogged : public MDCacheLogContext {
6328 CInode *in;
6329 MutationRef mut;
6330 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6331 MDCacheLogContext(m), in(i), mut(mu) {}
6332 void finish(int r) override {
6333 mdcache->truncate_inode_logged(in, mut);
6334 }
6335};
6336
6337void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6338{
6339 dout(10) << "truncate_inode_finish " << *in << dendl;
6340
6341 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6342 assert(p != ls->truncating_inodes.end());
6343 ls->truncating_inodes.erase(p);
6344
6345 // update
6346 inode_t *pi = in->project_inode();
6347 pi->version = in->pre_dirty();
6348 pi->truncate_from = 0;
6349 pi->truncate_pending--;
6350
6351 MutationRef mut(new MutationImpl());
6352 mut->ls = mds->mdlog->get_current_segment();
6353 mut->add_projected_inode(in);
6354
6355 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6356 mds->mdlog->start_entry(le);
6357 CDentry *dn = in->get_projected_parent_dn();
6358 le->metablob.add_dir_context(dn->get_dir());
6359 le->metablob.add_primary_dentry(dn, in, true);
6360 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6361
6362 journal_dirty_inode(mut.get(), &le->metablob, in);
6363 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6364
6365 // flush immediately if there are readers/writers waiting
6366 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6367 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6368 mds->mdlog->flush();
6369}
6370
6371void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6372{
6373 dout(10) << "truncate_inode_logged " << *in << dendl;
6374 mut->apply();
6375 mds->locker->drop_locks(mut.get());
6376 mut->cleanup();
6377
6378 in->put(CInode::PIN_TRUNCATING);
6379 in->auth_unpin(this);
6380
6381 list<MDSInternalContextBase*> waiters;
6382 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6383 mds->queue_waiters(waiters);
6384}
6385
6386
6387void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6388{
6389 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6390 << ls->seq << "/" << ls->offset << dendl;
6391 ls->truncating_inodes.insert(in);
6392 in->get(CInode::PIN_TRUNCATING);
6393}
6394
6395void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6396{
6397 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6398 << ls->seq << "/" << ls->offset << dendl;
6399 // if we have the logseg the truncate started in, it must be in our list.
6400 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6401 assert(p != ls->truncating_inodes.end());
6402 ls->truncating_inodes.erase(p);
6403 in->put(CInode::PIN_TRUNCATING);
6404}
6405
6406void MDCache::start_recovered_truncates()
6407{
6408 dout(10) << "start_recovered_truncates" << dendl;
6409 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6410 p != mds->mdlog->segments.end();
6411 ++p) {
6412 LogSegment *ls = p->second;
6413 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6414 q != ls->truncating_inodes.end();
6415 ++q) {
6416 CInode *in = *q;
6417 in->auth_pin(this);
6418
6419 if (!in->client_need_snapflush.empty() &&
6420 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6421 assert(in->filelock.is_stable());
6422 in->filelock.set_state(LOCK_XLOCKDONE);
6423 in->auth_pin(&in->filelock);
6424 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6425 // start_files_to_recover will revoke caps
6426 continue;
6427 }
6428 _truncate_inode(in, ls);
6429 }
6430 }
6431}
6432
6433
6434
6435
6436
6437
6438// ================================================================================
6439// cache trimming
6440
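// trim_lru() first sweeps the bottom_lru, then expires dentries from the main
// lru while the cache is over its memory target or the requested count has not
// been exhausted; dentries that cannot be trimmed yet (or that standby replay
// still needs) are re-inserted at the LRU midpoint.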
6441void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6442{
7c673cae 6443 bool is_standby_replay = mds->is_standby_replay();
6444 std::vector<CDentry *> unexpirables;
6445 uint64_t trimmed = 0;
6446
6447 dout(7) << "trim_lru trimming " << count
6448 << " items from LRU"
6449 << " size=" << lru.lru_get_size()
6450 << " mid=" << lru.lru_get_top()
6451 << " pintail=" << lru.lru_get_pintail()
6452 << " pinned=" << lru.lru_get_num_pinned()
6453 << dendl;
7c673cae 6454
6455 for (;;) {
6456 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6457 if (!dn)
6458 break;
6459 if (trim_dentry(dn, expiremap)) {
6460 unexpirables.push_back(dn);
6461 } else {
6462 trimmed++;
6463 }
6464 }
6465
181888fb 6466 for (auto &dn : unexpirables) {
31f18b77 6467 bottom_lru.lru_insert_mid(dn);
181888fb 6468 }
6469 unexpirables.clear();
6470
6471 // trim dentries from the LRU until count is reached
6472 while (cache_toofull() || count > 0) {
6473 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6474 if (!dn) {
6475 break;
6476 }
7c673cae 6477 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6478 dn->get_linkage()->inode->item_open_file.is_on_list())) {
7c673cae 6479 unexpirables.push_back(dn);
6480 } else if (trim_dentry(dn, expiremap)) {
6481 unexpirables.push_back(dn);
6482 } else {
6483 trimmed++;
7c673cae 6484 }
181888fb 6485 count--;
7c673cae 6486 }
6487
6488 for (auto &dn : unexpirables) {
31f18b77 6489 lru.lru_insert_mid(dn);
181888fb 6490 }
31f18b77 6491 unexpirables.clear();
7c673cae 6492
6493 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6494}
6495
6496/*
6497 * note: only called while MDS is active or stopping... NOT during recovery.
6498 * however, we may expire a replica whose authority is recovering.
6499 *
6500 * @param count is number of dentries to try to expire
6501 */
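// trim() proceeds in phases: process delayed stray evaluation, trim the LRUs
// via trim_lru(), drop empty non-auth/non-bound subtrees, trim root and other
// ranks' mdsdir inodes when a rank is stopping, and finally send the batched
// MCacheExpire messages. A minimal, illustrative call (actual call sites and
// arguments live outside this file) might look like:
//
//   if (mdcache->cache_toofull())
//     mdcache->trim(0);   // let cache pressure, not a dentry count, drive it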
6502bool MDCache::trim(uint64_t count)
6503{
6504 uint64_t used = cache_size();
6505 uint64_t limit = cache_limit_memory();
6506 map<mds_rank_t, MCacheExpire*> expiremap;
6507
6508 dout(7) << "trim bytes_used=" << bytes2str(used)
6509 << " limit=" << bytes2str(limit)
6510 << " reservation=" << cache_reservation()
6511 << "% count=" << count << dendl;
6512
6513 // process delayed eval_stray()
6514 stray_manager.advance_delayed();
6515
6516 trim_lru(count, expiremap);
6517
7c673cae 6518 // trim non-auth, non-bound subtrees
181888fb 6519 for (auto p = subtrees.begin(); p != subtrees.end();) {
6520 CDir *dir = p->first;
6521 ++p;
6522 CInode *diri = dir->get_inode();
6523 if (dir->is_auth()) {
6524 if (!diri->is_auth() && !diri->is_base() &&
6525 dir->get_num_head_items() == 0) {
6526 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6527 !(mds->is_active() || mds->is_stopping()) ||
6528 dir->is_freezing() || dir->is_frozen())
6529 continue;
6530
6531 migrator->export_empty_import(dir);
6532 }
6533 } else {
6534 if (!diri->is_auth()) {
6535 if (dir->get_num_ref() > 1) // only subtree pin
6536 continue;
6537 list<CDir*> ls;
6538 diri->get_subtree_dirfrags(ls);
6539 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6540 continue;
6541
6542 // don't trim subtree root if its auth MDS is recovering.
6543 // This simplifies the cache rejoin code.
6544 if (dir->is_subtree_root() &&
6545 rejoin_ack_gather.count(dir->get_dir_auth().first))
6546 continue;
7c673cae 6547 trim_dirfrag(dir, 0, expiremap);
31f18b77 6548 }
6549 }
6550 }
6551
6552 // trim root?
181888fb 6553 if (mds->is_stopping() && root) {
6554 list<CDir*> ls;
6555 root->get_dirfrags(ls);
6556 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6557 CDir *dir = *p;
6558 if (dir->get_num_ref() == 1) // subtree pin
6559 trim_dirfrag(dir, 0, expiremap);
6560 }
6561 if (root->get_num_ref() == 0)
6562 trim_inode(0, root, 0, expiremap);
6563 }
6564
6565 std::set<mds_rank_t> stopping;
6566 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6567 stopping.erase(mds->get_nodeid());
6568 for (auto rank : stopping) {
6569 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6570 if (!mdsdir_in)
6571 continue;
6572
6573 if (expiremap.count(rank) == 0) {
6574 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6575 }
6576
6577 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;
6578
6579 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6580 if (!aborted) {
6581 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6582 list<CDir*> ls;
6583 mdsdir_in->get_dirfrags(ls);
6584 for (auto dir : ls) {
6585 if (dir->get_num_ref() == 1) // subtree pin
6586 trim_dirfrag(dir, dir, expiremap);
6587 }
6588 if (mdsdir_in->get_num_ref() == 0)
6589 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6590 } else {
6591 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6592 }
6593 }
6594
6595 // Other rank's base inodes (when I'm stopping)
181888fb 6596 if (mds->is_stopping()) {
6597 for (set<CInode*>::iterator p = base_inodes.begin();
6598 p != base_inodes.end(); ++p) {
6599 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6600 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6601 if ((*p)->get_num_ref() == 0) {
6602 trim_inode(NULL, *p, NULL, expiremap);
6603 }
6604 }
6605 }
6606 }
6607
6608 // send any expire messages
6609 send_expire_messages(expiremap);
6610
6611 return true;
6612}
6613
6614void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6615{
6616 // send expires
6617 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6618 it != expiremap.end();
6619 ++it) {
6620 if (mds->is_cluster_degraded() &&
6621 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6622 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6623 rejoin_sent.count(it->first) == 0))) {
6624 it->second->put();
6625 continue;
6626 }
6627 dout(7) << "sending cache_expire to " << it->first << dendl;
6628 mds->send_message_mds(it->second, it->first);
6629 }
6630}
6631
6632
6633bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6634{
6635 dout(12) << "trim_dentry " << *dn << dendl;
6636
6637 CDentry::linkage_t *dnl = dn->get_linkage();
6638
6639 CDir *dir = dn->get_dir();
6640 assert(dir);
6641
6642 CDir *con = get_subtree_root(dir);
6643 if (con)
6644 dout(12) << " in container " << *con << dendl;
6645 else {
6646 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6647 assert(dn->is_auth());
6648 }
6649
6650 // If a replica dentry is not readable, it's likely we will receive an
6651 // MDentryLink/MDentryUnlink message soon (it's possible we first
6652 // receive an MDentryUnlink message, then an MDentryLink message).
6653 // An MDentryLink message only replicates an inode, so we should
6654 // avoid trimming the inode's parent dentry, because unconnected
6655 // replicas are problematic for subtree migration.
6656 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6657 !dn->get_dir()->get_inode()->is_stray())
6658 return true;
6659
6660 // adjust the dir state
6661 // NOTE: we can safely remove a clean, null dentry without affecting
6662 // directory completeness.
6663 // (check this _before_ we unlink the inode, below!)
6664 bool clear_complete = false;
6665 if (!(dnl->is_null() && dn->is_clean()))
6666 clear_complete = true;
6667
6668 // unlink the dentry
6669 if (dnl->is_remote()) {
6670 // just unlink.
31f18b77 6671 dir->unlink_inode(dn, false);
6672 } else if (dnl->is_primary()) {
6673 // expire the inode, too.
6674 CInode *in = dnl->get_inode();
6675 assert(in);
6676 if (trim_inode(dn, in, con, expiremap))
6677 return true; // purging stray instead of trimming
6678 } else {
6679 assert(dnl->is_null());
6680 }
6681
6682 if (!dn->is_auth()) {
6683 // notify dentry authority.
6684 mds_authority_t auth = dn->authority();
6685
6686 for (int p=0; p<2; p++) {
6687 mds_rank_t a = auth.first;
6688 if (p) a = auth.second;
6689 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6690 if (mds->get_nodeid() == auth.second &&
6691 con->is_importing()) break; // don't send any expire while importing.
6692 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6693
6694 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6695 assert(a != mds->get_nodeid());
6696 if (expiremap.count(a) == 0)
6697 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6698 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce());
6699 }
6700 }
6701
6702 // remove dentry
6703 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6704 dir->add_to_bloom(dn);
6705 dir->remove_dentry(dn);
6706
6707 if (clear_complete)
6708 dir->state_clear(CDir::STATE_COMPLETE);
6709
6710 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6711 return false;
6712}
6713
6714
6715void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6716{
6717 dout(15) << "trim_dirfrag " << *dir << dendl;
6718
6719 if (dir->is_subtree_root()) {
6720 assert(!dir->is_auth() ||
6721 (!dir->is_replicated() && dir->inode->is_base()));
6722 remove_subtree(dir); // remove from subtree map
6723 }
6724 assert(dir->get_num_ref() == 0);
6725
6726 CInode *in = dir->get_inode();
6727
6728 if (!dir->is_auth()) {
6729 mds_authority_t auth = dir->authority();
6730
6731 // was this an auth delegation? (if so, slightly modified container)
6732 dirfrag_t condf;
6733 if (dir->is_subtree_root()) {
6734 dout(12) << " subtree root, container is " << *dir << dendl;
6735 con = dir;
6736 condf = dir->dirfrag();
6737 } else {
6738 condf = con->dirfrag();
6739 }
6740
6741 for (int p=0; p<2; p++) {
6742 mds_rank_t a = auth.first;
6743 if (p) a = auth.second;
6744 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6745 if (mds->get_nodeid() == auth.second &&
6746 con->is_importing()) break; // don't send any expire while importing.
6747 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6748
6749 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6750 assert(a != mds->get_nodeid());
6751 if (expiremap.count(a) == 0)
6752 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6753 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6754 }
6755 }
6756
6757 in->close_dirfrag(dir->dirfrag().frag);
6758}
6759
6760/**
6761 * Try trimming an inode from the cache
6762 *
6763 * @return true if the inode is still in cache, else false if it was trimmed
6764 */
6765bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6766{
6767 dout(15) << "trim_inode " << *in << dendl;
6768 assert(in->get_num_ref() == 0);
6769
6770 if (in->is_dir()) {
6771 // If replica inode's dirfragtreelock is not readable, it's likely
6772 // some dirfrags of the inode are being fragmented and we will receive
6773 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6774 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6775 // This is because unconnected replicas are problematic for
6776 // subtree migration.
6777 //
6778 if (!in->is_auth() && !in->dirfragtreelock.can_read(-1))
6779 return true;
6780
6781 // DIR
6782 list<CDir*> dfls;
6783 in->get_dirfrags(dfls);
6784 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6785 CDir *dir = *p;
6786 assert(!dir->is_subtree_root());
6787 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6788 }
6789 }
6790
6791 // INODE
6792 if (in->is_auth()) {
6793 // eval stray after closing dirfrags
6794 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6795 maybe_eval_stray(in);
6796 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6797 return true;
6798 }
6799 } else {
6800 mds_authority_t auth = in->authority();
6801
6802 dirfrag_t df;
6803 if (con)
6804 df = con->dirfrag();
6805 else
6806 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6807
6808 for (int p=0; p<2; p++) {
6809 mds_rank_t a = auth.first;
6810 if (p) a = auth.second;
6811 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6812 if (con && mds->get_nodeid() == auth.second &&
6813 con->is_importing()) break; // don't send any expire while importing.
6814 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6815
6816 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6817 assert(a != mds->get_nodeid());
6818 if (expiremap.count(a) == 0)
6819 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6820 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6821 }
6822 }
6823
6824 /*
6825 if (in->is_auth()) {
6826 if (in->hack_accessed)
6827 mds->logger->inc("outt");
6828 else {
6829 mds->logger->inc("outut");
6830 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6831 }
6832 }
6833 */
6834
6835 // unlink
6836 if (dn)
31f18b77 6837 dn->get_dir()->unlink_inode(dn, false);
6838 remove_inode(in);
6839 return false;
6840}
6841
6842
6843/**
6844 * trim_non_auth - remove any non-auth items from our cache
6845 *
6846 * this reduces the amount of non-auth metadata in our cache, reducing the
6847 * load incurred by the rejoin phase.
6848 *
6849 * the only non-auth items that remain are those that are needed to
6850 * attach our own subtrees to the root.
6851 *
6852 * when we are done, all dentries will be in the top bit of the lru.
6853 *
6854 * why we have to do this:
6855 * we may not have accurate linkage for non-auth items, which means we won't
6856 * know which subtree an item falls into, and cannot be sure to declare it to
6857 * the correct authority.
6858 */
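// Sketch of the algorithm below: temporarily pin every subtree root, expire
// everything from both LRUs, keep auth dentries (re-inserting them into the
// appropriate LRU afterwards) while unlinking and removing non-auth dentries,
// inodes and dirfrags, then unpin the subtree roots; if both LRUs end up empty,
// remaining non-auth base inodes are removed from the inode map as well.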
6859void MDCache::trim_non_auth()
6860{
6861 dout(7) << "trim_non_auth" << dendl;
6862
6863 // temporarily pin all subtree roots
6864 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6865 p != subtrees.end();
6866 ++p)
6867 p->first->get(CDir::PIN_SUBTREETEMP);
6868
31f18b77 6869 list<CDentry*> auth_list;
6870
6871 // trim non-auth items from the lru
6872 for (;;) {
6873 CDentry *dn = NULL;
6874 if (bottom_lru.lru_get_size() > 0)
6875 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6876 if (!dn && lru.lru_get_size() > 0)
6877 dn = static_cast<CDentry*>(lru.lru_expire());
6878 if (!dn)
6879 break;
6880
6881 CDentry::linkage_t *dnl = dn->get_linkage();
6882
6883 if (dn->is_auth()) {
6884 // add back into lru (at the top)
31f18b77 6885 auth_list.push_back(dn);
6886
6887 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6888 dn->unlink_remote(dnl);
6889 } else {
6890 // non-auth. expire.
6891 CDir *dir = dn->get_dir();
6892 assert(dir);
6893
6894 // unlink the dentry
6895 dout(10) << " removing " << *dn << dendl;
6896 if (dnl->is_remote()) {
31f18b77 6897 dir->unlink_inode(dn, false);
6898 }
6899 else if (dnl->is_primary()) {
6900 CInode *in = dnl->get_inode();
6901 dout(10) << " removing " << *in << dendl;
6902 list<CDir*> ls;
6903 in->get_dirfrags(ls);
6904 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6905 CDir *subdir = *p;
6906 assert(!subdir->is_subtree_root());
6907 in->close_dirfrag(subdir->dirfrag().frag);
6908 }
31f18b77 6909 dir->unlink_inode(dn, false);
7c673cae
FG
6910 remove_inode(in);
6911 }
6912 else {
6913 assert(dnl->is_null());
6914 }
6915
6916 assert(!dir->has_bloom());
6917 dir->remove_dentry(dn);
6918 // adjust the dir state
6919 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6920 // close empty non-auth dirfrag
6921 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6922 dir->inode->close_dirfrag(dir->get_frag());
6923 }
6924 }
6925
31f18b77
FG
6926 for (auto dn : auth_list) {
6927 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6928 bottom_lru.lru_insert_mid(dn);
6929 else
6930 lru.lru_insert_top(dn);
6931 }
6932
7c673cae
FG
6933 // move everything in the pintail to the top bit of the lru.
6934 lru.lru_touch_entire_pintail();
6935
6936 // unpin all subtrees
6937 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6938 p != subtrees.end();
6939 ++p)
6940 p->first->put(CDir::PIN_SUBTREETEMP);
6941
31f18b77
FG
6942 if (lru.lru_get_size() == 0 &&
6943 bottom_lru.lru_get_size() == 0) {
7c673cae
FG
6944 // root, stray, etc.?
6945 ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
6946 while (p != inode_map.end()) {
6947 ceph::unordered_map<vinodeno_t,CInode*>::iterator next = p;
6948 ++next;
6949 CInode *in = p->second;
6950 if (!in->is_auth()) {
6951 list<CDir*> ls;
6952 in->get_dirfrags(ls);
6953 for (list<CDir*>::iterator p = ls.begin();
6954 p != ls.end();
6955 ++p) {
6956 dout(10) << " removing " << **p << dendl;
6957 assert((*p)->get_num_ref() == 1); // SUBTREE
6958 remove_subtree((*p));
6959 in->close_dirfrag((*p)->dirfrag().frag);
6960 }
6961 dout(10) << " removing " << *in << dendl;
6962 assert(!in->get_parent_dn());
6963 assert(in->get_num_ref() == 0);
6964 remove_inode(in);
6965 }
6966 p = next;
6967 }
6968 }
6969
6970 show_subtrees();
6971}
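// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): a minimal, self-contained model of
// the trim loop above -- expire entries from the bottom LRU first, then the
// main LRU, setting aside the entries that must survive and re-inserting them
// at the top once the scan is done.  Entry and keep_me are hypothetical
// stand-ins for CDentry and the is_auth() test; the real code also unlinks
// inodes and closes dirfrags, which this sketch omits.
#include <deque>
#include <vector>

namespace trim_non_auth_sketch {

struct Entry { bool keep_me; };

inline void trim(std::deque<Entry*>& bottom_lru, std::deque<Entry*>& lru)
{
  std::vector<Entry*> keep;          // analogous to auth_list above
  for (;;) {
    Entry *e = nullptr;
    if (!bottom_lru.empty()) {       // expire the cheap entries first
      e = bottom_lru.back();
      bottom_lru.pop_back();
    } else if (!lru.empty()) {
      e = lru.back();
      lru.pop_back();
    }
    if (!e)
      break;
    if (e->keep_me)
      keep.push_back(e);             // set aside, like auth dentries
    else
      delete e;                      // drop it (real code unlinks/frees via remove_dentry)
  }
  for (Entry *e : keep)
    lru.push_front(e);               // re-insert kept entries at the top
}

} // namespace trim_non_auth_sketch
// ---------------------------------------------------------------------------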
6972
6973/**
6974 * Recursively trim the subtree rooted at directory to remove all
6975 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
6976 * of those links. This is used to clear invalid data out of the cache.
6977 * Note that it doesn't clear the passed-in directory, since that's not
6978 * always safe.
6979 */
6980bool MDCache::trim_non_auth_subtree(CDir *dir)
6981{
6982 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
6983
6984 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
6985
6986 CDir::map_t::iterator j = dir->begin();
6987 CDir::map_t::iterator i = j;
6988 while (j != dir->end()) {
6989 i = j++;
6990 CDentry *dn = i->second;
6991 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
6992 CDentry::linkage_t *dnl = dn->get_linkage();
6993 if (dnl->is_primary()) { // check for subdirectories, etc
6994 CInode *in = dnl->get_inode();
6995 bool keep_inode = false;
6996 if (in->is_dir()) {
6997 list<CDir*> subdirs;
6998 in->get_dirfrags(subdirs);
6999 for (list<CDir*>::iterator subdir = subdirs.begin();
7000 subdir != subdirs.end();
7001 ++subdir) {
7002 if ((*subdir)->is_subtree_root()) {
7003 keep_inode = true;
7004 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7005 } else {
7006 if (trim_non_auth_subtree(*subdir))
7007 keep_inode = true;
7008 else {
7009 in->close_dirfrag((*subdir)->get_frag());
7010 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7011 }
7012 }
7013 }
7014
7015 }
7016 if (!keep_inode) { // remove it!
7017	dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
31f18b77 7018 dir->unlink_inode(dn, false);
7c673cae
FG
7019 remove_inode(in);
7020 assert(!dir->has_bloom());
7021 dir->remove_dentry(dn);
7022 } else {
7023	dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
7024 dn->state_clear(CDentry::STATE_AUTH);
7025 in->state_clear(CInode::STATE_AUTH);
7026 }
7027 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7028 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7029 } else { // just remove it
7030 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7031 if (dnl->is_remote())
31f18b77 7032 dir->unlink_inode(dn, false);
7c673cae
FG
7033 dir->remove_dentry(dn);
7034 }
7035 }
7036 dir->state_clear(CDir::STATE_AUTH);
7037 /**
7038 * We've now checked all our children and deleted those that need it.
7039 * Now return to caller, and tell them if *we're* a keeper.
7040 */
7041 return keep_dir || dir->get_num_any();
7042}
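// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): trim_non_auth_subtree() is a
// post-order walk whose return value means "the caller must keep me" -- a
// directory survives if it must be kept (e.g. for a slave rollback) or if any
// child survived.  The toy recursion below (Node is a hypothetical type)
// mirrors that contract without the dentry/dirfrag bookkeeping.
#include <memory>
#include <vector>

namespace trim_subtree_sketch {

struct Node {
  bool must_keep = false;                    // e.g. a subtree root / rollback
  std::vector<std::unique_ptr<Node>> children;
};

// Returns true if 'n' must be kept; children that return false are erased.
inline bool trim(Node& n)
{
  bool keep = n.must_keep;
  auto& kids = n.children;
  for (auto it = kids.begin(); it != kids.end(); ) {
    if (trim(**it)) {
      keep = true;                           // a surviving child keeps us alive
      ++it;
    } else {
      it = kids.erase(it);                   // trim the whole child subtree
    }
  }
  return keep;
}

} // namespace trim_subtree_sketch
// ---------------------------------------------------------------------------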
7043
7044/*
7045 * during replay, when we determine a subtree is no longer ours, we
7046 * try to trim it from our cache. because subtrees must be connected
7047 * to the root, the fact that we can trim this tree may mean that our
7048 * children or parents can also be trimmed.
7049 */
7050void MDCache::try_trim_non_auth_subtree(CDir *dir)
7051{
7052	dout(10) << "try_trim_non_auth_subtree " << *dir << dendl;
7053
7054 // can we now trim child subtrees?
7055 set<CDir*> bounds;
7056 get_subtree_bounds(dir, bounds);
7057 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7058 CDir *bd = *p;
7059 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7060 bd->get_num_any() == 0 && // and empty
7061 can_trim_non_auth_dirfrag(bd)) {
7062 CInode *bi = bd->get_inode();
7063 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7064 remove_subtree(bd);
7065 bd->mark_clean();
7066 bi->close_dirfrag(bd->get_frag());
7067 }
7068 }
7069
7070 if (trim_non_auth_subtree(dir)) {
7071 // keep
7072 try_subtree_merge(dir);
7073 } else {
7074 // can we trim this subtree (and possibly our ancestors) too?
7075 while (true) {
7076 CInode *diri = dir->get_inode();
7077 if (diri->is_base()) {
7078 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7079 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7080 remove_subtree(dir);
7081 dir->mark_clean();
7082 diri->close_dirfrag(dir->get_frag());
7083
7084 dout(10) << " removing " << *diri << dendl;
7085 assert(!diri->get_parent_dn());
7086 assert(diri->get_num_ref() == 0);
7087 remove_inode(diri);
7088 }
7089 break;
7090 }
7091
7092 CDir *psub = get_subtree_root(diri->get_parent_dir());
7093 dout(10) << " parent subtree is " << *psub << dendl;
7094 if (psub->get_dir_auth().first == mds->get_nodeid())
7095 break; // we are auth, keep.
7096
7097 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7098 remove_subtree(dir);
7099 dir->mark_clean();
7100 diri->close_dirfrag(dir->get_frag());
7101
7102 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7103 if (trim_non_auth_subtree(psub))
7104 break;
7105 dir = psub;
7106 }
7107 }
7108
7109 show_subtrees();
7110}
7111
7112void MDCache::standby_trim_segment(LogSegment *ls)
7113{
7114 ls->new_dirfrags.clear_list();
7115 ls->open_files.clear_list();
7116
7117 while (!ls->dirty_dirfrags.empty()) {
7118 CDir *dir = ls->dirty_dirfrags.front();
7119 dir->mark_clean();
7120 }
7121 while (!ls->dirty_inodes.empty()) {
7122 CInode *in = ls->dirty_inodes.front();
7123 in->mark_clean();
7124 }
7125 while (!ls->dirty_dentries.empty()) {
7126 CDentry *dn = ls->dirty_dentries.front();
7127 dn->mark_clean();
7128 }
7129 while (!ls->dirty_parent_inodes.empty()) {
7130 CInode *in = ls->dirty_parent_inodes.front();
7131 in->clear_dirty_parent();
7132 }
7133 while (!ls->dirty_dirfrag_dir.empty()) {
7134 CInode *in = ls->dirty_dirfrag_dir.front();
7135 in->filelock.remove_dirty();
7136 }
7137 while (!ls->dirty_dirfrag_nest.empty()) {
7138 CInode *in = ls->dirty_dirfrag_nest.front();
7139 in->nestlock.remove_dirty();
7140 }
7141 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7142 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7143 in->dirfragtreelock.remove_dirty();
7144 }
7145}
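// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): the loops above look unbounded
// because the loop body never pops anything explicitly.  They terminate
// because mark_clean()/clear_dirty_parent()/remove_dirty() unlink the object
// from the segment's intrusive dirty list as a side effect.  The toy
// intrusive list below (DirtyList/Item are hypothetical names) shows the same
// pattern.
#include <cassert>
#include <list>

namespace standby_trim_sketch {

struct Item;
struct DirtyList { std::list<Item*> items; };

struct Item {
  DirtyList *on_list = nullptr;
  void mark_clean() {                // unlinks self, shrinking the list
    if (on_list) {
      on_list->items.remove(this);
      on_list = nullptr;
    }
  }
};

inline void drain(DirtyList& dl)
{
  while (!dl.items.empty())
    dl.items.front()->mark_clean();  // each iteration removes the front item
  assert(dl.items.empty());
}

} // namespace standby_trim_sketch
// ---------------------------------------------------------------------------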
7146
7147/* This function DOES put the passed message before returning */
7148void MDCache::handle_cache_expire(MCacheExpire *m)
7149{
7150 mds_rank_t from = mds_rank_t(m->get_from());
7151
7152 dout(7) << "cache_expire from mds." << from << dendl;
7153
7154 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7155 m->put();
7156 return;
7157 }
7158
7159 set<SimpleLock *> gather_locks;
7160 // loop over realms
7161 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7162 p != m->realms.end();
7163 ++p) {
7164 // check container?
7165 if (p->first.ino > 0) {
7166 CInode *expired_inode = get_inode(p->first.ino);
7167 assert(expired_inode); // we had better have this.
7168 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7169 assert(parent_dir);
7170
7171 int export_state = -1;
7172 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7173 export_state = migrator->get_export_state(parent_dir);
7174 assert(export_state >= 0);
7175 }
7176
7177 if (!parent_dir->is_auth() ||
7178 (export_state != -1 &&
7179 ((export_state == Migrator::EXPORT_WARNING &&
7180 migrator->export_has_warned(parent_dir,from)) ||
7181 export_state == Migrator::EXPORT_EXPORTING ||
7182 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7183 (export_state == Migrator::EXPORT_NOTIFYING &&
7184 !migrator->export_has_notified(parent_dir,from))))) {
7185
7186 // not auth.
7187 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7188 assert(parent_dir->is_frozen_tree_root());
7189
7190 // make a message container
7191 if (delayed_expire[parent_dir].count(from) == 0)
7192 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7193
7194 // merge these expires into it
7195 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7196 continue;
7197 }
7198 assert(export_state <= Migrator::EXPORT_PREPPING ||
7199 (export_state == Migrator::EXPORT_WARNING &&
7200 !migrator->export_has_warned(parent_dir, from)));
7201
7202 dout(7) << "expires for " << *parent_dir << dendl;
7203 } else {
7204 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7205 }
7206
7207 // INODES
7208 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7209 it != p->second.inodes.end();
7210 ++it) {
7211 CInode *in = get_inode(it->first);
7212 unsigned nonce = it->second;
7213
7214 if (!in) {
7215 dout(0) << " inode expire on " << it->first << " from " << from
7216 << ", don't have it" << dendl;
7217 assert(in);
7218 }
7219 assert(in->is_auth());
7220 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7221
7222 // check nonce
7223 if (nonce == in->get_replica_nonce(from)) {
7224 // remove from our cached_by
7225 dout(7) << " inode expire on " << *in << " from mds." << from
7226 << " cached_by was " << in->get_replicas() << dendl;
7227 inode_remove_replica(in, from, false, gather_locks);
7228 }
7229 else {
7230 // this is an old nonce, ignore expire.
7231 dout(7) << " inode expire on " << *in << " from mds." << from
7232 << " with old nonce " << nonce
7233 << " (current " << in->get_replica_nonce(from) << "), dropping"
7234 << dendl;
7235 }
7236 }
7237
7238 // DIRS
7239 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7240 it != p->second.dirs.end();
7241 ++it) {
7242 CDir *dir = get_dirfrag(it->first);
7243 unsigned nonce = it->second;
7244
7245 if (!dir) {
7246 CInode *diri = get_inode(it->first.ino);
7247 if (diri) {
7248 if (mds->is_rejoin() &&
7249 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7250 !diri->is_replica(from)) {
7251 list<CDir*> ls;
7252 diri->get_nested_dirfrags(ls);
7253 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7254 << " while rejoining, inode isn't replicated" << dendl;
7255 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7256 dir = *q;
7257 if (dir->is_replica(from)) {
7258 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7259 dir->remove_replica(from);
7260 }
7261 }
7262 continue;
7263 }
7264 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7265 if (other) {
7266 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7267 << " have " << *other << ", mismatched frags, dropping" << dendl;
7268 continue;
7269 }
7270 }
7271 dout(0) << " dir expire on " << it->first << " from " << from
7272 << ", don't have it" << dendl;
7273 assert(dir);
7274 }
7275 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7276
7277 assert(dir->is_auth());
7278
7279 // check nonce
7280 if (nonce == dir->get_replica_nonce(from)) {
7281 // remove from our cached_by
7282 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7283 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7284 dir->remove_replica(from);
7285 }
7286 else {
7287 // this is an old nonce, ignore expire.
7288 dout(7) << " dir expire on " << *dir << " from mds." << from
7289 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7290 << "), dropping" << dendl;
7291 }
7292 }
7293
7294 // DENTRIES
7295 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7296 pd != p->second.dentries.end();
7297 ++pd) {
7298 dout(10) << " dn expires in dir " << pd->first << dendl;
7299 CInode *diri = get_inode(pd->first.ino);
7300 assert(diri);
7301 CDir *dir = diri->get_dirfrag(pd->first.frag);
7302
7303 if (!dir) {
7304 dout(0) << " dn expires on " << pd->first << " from " << from
7305 << ", must have refragmented" << dendl;
7306 } else {
7307 assert(dir->is_auth());
7308 }
7309
7310 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7311 p != pd->second.end();
7312 ++p) {
7313 unsigned nonce = p->second;
7314 CDentry *dn;
7315
7316 if (dir) {
7317 dn = dir->lookup(p->first.first, p->first.second);
7318 } else {
7319 // which dirfrag for this dentry?
7320 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7321 assert(dir);
7322 assert(dir->is_auth());
7323 dn = dir->lookup(p->first.first, p->first.second);
7324 }
7325
7326 if (!dn) {
7327 if (dir)
7328 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7329 else
7330 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7331 }
7332 assert(dn);
7333
7334 if (nonce == dn->get_replica_nonce(from)) {
7335 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7336 dentry_remove_replica(dn, from, gather_locks);
7337 }
7338 else {
7339 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7340 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7341 << "), dropping" << dendl;
7342 }
7343 }
7344 }
7345 }
7346
7347 // done
7348 m->put();
7349
7350 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7351 if (!(*p)->is_stable())
7352 mds->locker->eval_gather(*p);
7353 }
7354}
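// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): the nonce checks above guard against
// a race where the sender re-acquired a replica after queueing the expire.
// Each replica carries a nonce; an expire only counts if it names the nonce
// currently on record, otherwise it refers to an older replica and is dropped.
// ReplicaMap below is a hypothetical stand-in for the per-object replica/nonce
// bookkeeping on CInode/CDir/CDentry.
#include <cstdint>
#include <map>

namespace expire_nonce_sketch {

using rank_t = int32_t;

struct ReplicaMap {
  std::map<rank_t, uint32_t> nonces;   // replica rank -> current nonce

  // Returns true if the expire was accepted (replica dropped),
  // false if the nonce was stale and the expire should be ignored.
  bool handle_expire(rank_t from, uint32_t nonce) {
    auto it = nonces.find(from);
    if (it == nonces.end() || it->second != nonce)
      return false;                    // old nonce: a newer replica exists
    nonces.erase(it);                  // matches: really drop the replica
    return true;
  }
};

} // namespace expire_nonce_sketch
// ---------------------------------------------------------------------------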
7355
7356void MDCache::process_delayed_expire(CDir *dir)
7357{
7358 dout(7) << "process_delayed_expire on " << *dir << dendl;
7359 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7360 p != delayed_expire[dir].end();
7361 ++p)
7362 handle_cache_expire(p->second);
7363 delayed_expire.erase(dir);
7364}
7365
7366void MDCache::discard_delayed_expire(CDir *dir)
7367{
7368 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7369 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7370 p != delayed_expire[dir].end();
7371 ++p)
7372 p->second->put();
7373 delayed_expire.erase(dir);
7374}
7375
7376void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7377 set<SimpleLock *>& gather_locks)
7378{
7379 in->remove_replica(from);
7380 in->mds_caps_wanted.erase(from);
7381
7382 // note: this code calls _eval more often than it needs to!
7383 // fix lock
7384 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7385 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7386 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7387 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7388 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7389 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7390
7391  // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
7392  // don't remove the recovering mds from the lock's gathering list,
7393  // because it may hold rejoined wrlocks.
7394 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7395 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7396 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7397}
7398
7399void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7400{
7401 dn->remove_replica(from);
7402
7403 // fix lock
7404 if (dn->lock.remove_replica(from))
7405 gather_locks.insert(&dn->lock);
7406
7407  // Replicated strays might now be eligible for purge
7408 CDentry::linkage_t *dnl = dn->get_linkage();
7409 if (dnl->is_primary()) {
7410 maybe_eval_stray(dnl->get_inode());
7411 }
7412}
7413
7414void MDCache::trim_client_leases()
7415{
7416 utime_t now = ceph_clock_now();
7417
7418 dout(10) << "trim_client_leases" << dendl;
7419
7420 for (int pool=0; pool<client_lease_pools; pool++) {
7421 int before = client_leases[pool].size();
7422 if (client_leases[pool].empty())
7423 continue;
7424
7425 while (!client_leases[pool].empty()) {
7426 ClientLease *r = client_leases[pool].front();
7427 if (r->ttl > now) break;
7428 CDentry *dn = static_cast<CDentry*>(r->parent);
7429 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7430 dn->remove_client_lease(r, mds->locker);
7431 }
7432 int after = client_leases[pool].size();
7433 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7434 << (before-after) << " leases, " << after << " left" << dendl;
7435 }
7436}
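// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): trim_client_leases() relies on each
// lease pool being ordered by expiry time, so it only has to pop from the
// front until it meets a lease whose ttl is still in the future.  The sketch
// below models that with a std::deque of hypothetical Lease records; the real
// code uses intrusive lists and removes leases via the Locker.
#include <deque>

namespace lease_trim_sketch {

struct Lease { double ttl; };          // absolute expiry time, in seconds

inline int trim_expired(std::deque<Lease>& pool, double now)
{
  int trimmed = 0;
  while (!pool.empty()) {
    if (pool.front().ttl > now)
      break;                           // oldest remaining lease is still valid,
                                       // so all later ones are too
    pool.pop_front();                  // expired: drop it
    ++trimmed;
  }
  return trimmed;
}

} // namespace lease_trim_sketch
// ---------------------------------------------------------------------------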
7437
7438
7439void MDCache::check_memory_usage()
7440{
7441 static MemoryModel mm(g_ceph_context);
7442 static MemoryModel::snap last;
7443 mm.sample(&last);
7444 static MemoryModel::snap baseline = last;
7445
7446 // check client caps
7447 assert(CInode::count() == inode_map.size());
181888fb 7448 double caps_per_inode = 0.0;
7c673cae 7449 if (CInode::count())
181888fb 7450 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae
FG
7451
7452 dout(2) << "check_memory_usage"
7453 << " total " << last.get_total()
7454 << ", rss " << last.get_rss()
7455 << ", heap " << last.get_heap()
7456 << ", baseline " << baseline.get_heap()
7457 << ", buffers " << (buffer::get_total_alloc() >> 10)
7458 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7459 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7460 << dendl;
7461
c07f9fc5 7462 mds->update_mlogger();
7c673cae
FG
7463 mds->mlogger->set(l_mdm_rss, last.get_rss());
7464 mds->mlogger->set(l_mdm_heap, last.get_heap());
7465
181888fb
FG
7466 if (cache_toofull()) {
7467 last_recall_state = ceph_clock_now();
7468 mds->server->recall_client_state();
7c673cae
FG
7469 }
7470
7471 // If the cache size had exceeded its limit, but we're back in bounds
7472 // now, free any unused pool memory so that our memory usage isn't
7473 // permanently bloated.
181888fb 7474 if (exceeded_size_limit && !cache_toofull()) {
7c673cae
FG
7475 // Only do this once we are back in bounds: otherwise the releases would
7476 // slow down whatever process caused us to exceed bounds to begin with
7477 if (ceph_using_tcmalloc()) {
7478 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7479 << dendl;
7480 ceph_heap_release_free_memory();
7481 }
7482 exceeded_size_limit = false;
7483 }
7484}
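// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): the tcmalloc release above is gated
// on having been over the limit *and* being back within bounds, so freeing
// unused heap never competes with whatever workload pushed us over.  A minimal
// model of that hysteresis; over_limit()/release() are hypothetical hooks, not
// real MDS calls.
namespace memory_hysteresis_sketch {

struct CacheMemory {
  bool exceeded = false;               // set when usage goes over the limit

  template <typename OverLimitFn, typename ReleaseFn>
  void maybe_release(OverLimitFn over_limit, ReleaseFn release) {
    if (over_limit()) {
      exceeded = true;                 // remember the excursion
    } else if (exceeded) {
      release();                       // only release once back in bounds
      exceeded = false;
    }
  }
};

} // namespace memory_hysteresis_sketch
// ---------------------------------------------------------------------------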
7485
7486
7487
7488// =========================================================================================
7489// shutdown
7490
7491class C_MDC_ShutdownCheck : public MDCacheContext {
7492public:
7493 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7494 void finish(int) override {
7495 mdcache->shutdown_check();
7496 }
7497};
7498
7499void MDCache::shutdown_check()
7500{
7501 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7502
7503 // cache
7504 char old_val[32] = { 0 };
7505 char *o = old_val;
7506 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7507 g_conf->set_val("debug_mds", "10");
7508 g_conf->apply_changes(NULL);
7509 show_cache();
7510 g_conf->set_val("debug_mds", old_val);
7511 g_conf->apply_changes(NULL);
7512 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7513
7514 // this
31f18b77 7515 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7516 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7517
7518
7519 if (mds->objecter->is_active()) {
7520 dout(0) << "objecter still active" << dendl;
7521 mds->objecter->dump_active();
7522 }
7523}
7524
7525
7526void MDCache::shutdown_start()
7527{
7528 dout(2) << "shutdown_start" << dendl;
7529
7530 if (g_conf->mds_shutdown_check)
7531 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7532
7533 // g_conf->debug_mds = 10;
7534}
7535
7536
7537
7538bool MDCache::shutdown_pass()
7539{
7540 dout(7) << "shutdown_pass" << dendl;
7541
7542 if (mds->is_stopped()) {
7543 dout(7) << " already shut down" << dendl;
7544 show_cache();
7545 show_subtrees();
7546 return true;
7547 }
7548
7549 // empty stray dir
7550 if (!shutdown_export_strays()) {
7551 dout(7) << "waiting for strays to migrate" << dendl;
7552 return false;
7553 }
7554
7555 // drop our reference to our stray dir inode
7556 for (int i = 0; i < NUM_STRAY; ++i) {
7557 if (strays[i] &&
7558 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7559 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7560 strays[i]->put(CInode::PIN_STRAY);
7561 strays[i]->put_stickydirs();
7562 }
7563 }
7564
7565 // trim cache
181888fb 7566 trim(UINT64_MAX);
31f18b77 7567 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7568
7569 // SUBTREES
7570 int num_auth_subtree = 0;
7571 if (!subtrees.empty() &&
7572 mds->get_nodeid() != 0 &&
7573 migrator->get_export_queue_size() == 0) {
7574 dout(7) << "looking for subtrees to export to mds0" << dendl;
7575 list<CDir*> ls;
7576 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7577 it != subtrees.end();
7578 ++it) {
7579 CDir *dir = it->first;
7580 if (dir->get_inode()->is_mdsdir())
7581 continue;
7582 if (dir->is_auth()) {
7583 num_auth_subtree++;
7584 if (dir->is_frozen() ||
7585 dir->is_freezing() ||
7586 dir->is_ambiguous_dir_auth() ||
7587 dir->state_test(CDir::STATE_EXPORTING))
7588 continue;
7589 ls.push_back(dir);
7590 }
7591 }
7592 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7593 CDir *dir = *p;
7594 mds_rank_t dest = dir->get_inode()->authority().first;
7595 if (dest > 0 && !mds->mdsmap->is_active(dest))
7596 dest = 0;
7597 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7598 migrator->export_dir_nicely(dir, dest);
7599 }
7600 }
7601
7602 if (num_auth_subtree > 0) {
7603 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7604 show_subtrees();
7605 return false;
7606 }
7607
7608 // close out any sessions (and open files!) before we try to trim the log, etc.
7609 if (mds->sessionmap.have_unclosed_sessions()) {
7610 if (!mds->server->terminating_sessions)
7611 mds->server->terminate_sessions();
7612 return false;
7613 }
7614
7615 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7616 if (mydir && !mydir->is_subtree_root())
7617 mydir = NULL;
7618
7619 // subtrees map not empty yet?
7620 if (subtrees.size() > (mydir ? 1 : 0)) {
7621 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7622 show_subtrees();
7623 migrator->show_importing();
7624 migrator->show_exporting();
7625 if (!migrator->is_importing() && !migrator->is_exporting())
7626 show_cache();
7627 return false;
7628 }
7629 assert(!migrator->is_exporting());
7630 assert(!migrator->is_importing());
7631
7c673cae
FG
7632 // flush what we can from the log
7633 mds->mdlog->trim(0);
7634 if (mds->mdlog->get_num_segments() > 1) {
7635 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7636 return false;
7637 }
7638
181888fb
FG
7639 if ((myin && myin->is_auth_pinned()) ||
7640 (mydir && mydir->is_auth_pinned())) {
7641 dout(7) << "still have auth pinned objects" << dendl;
7642 return false;
7643 }
7644
7c673cae
FG
7645 // (only do this once!)
7646 if (!mds->mdlog->is_capped()) {
7647 dout(7) << "capping the log" << dendl;
7648 mds->mdlog->cap();
7649 mds->mdlog->trim();
7650 }
7651
7652 if (!mds->mdlog->empty()) {
7653 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7654 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7655 return false;
7656 }
7657
7658 if (!did_shutdown_log_cap) {
7659 // flush journal header
7660 dout(7) << "writing header for (now-empty) journal" << dendl;
7661 assert(mds->mdlog->empty());
7662 mds->mdlog->write_head(0);
7663 // NOTE: filer active checker below will block us until this completes.
7664 did_shutdown_log_cap = true;
7665 return false;
7666 }
7667
7668 // filer active?
7669 if (mds->objecter->is_active()) {
7670 dout(7) << "objecter still active" << dendl;
7671 mds->objecter->dump_active();
7672 return false;
7673 }
7674
7675 // trim what we can from the cache
31f18b77
FG
7676 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7677 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7678 show_cache();
7679 //dump();
7680 return false;
7681 }
31f18b77
FG
7682
7683 // make mydir subtree go away
7684 if (mydir) {
7685 if (mydir->get_num_ref() > 1) { // subtree pin
7686 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7687 show_cache();
7688 return false;
7689 }
7690
7691 remove_subtree(mydir);
7692 myin->close_dirfrag(mydir->get_frag());
7693 }
7694 assert(subtrees.empty());
7695
7696 if (myin)
7697 remove_inode(myin);
7c673cae
FG
7698
7699 // done!
7700 dout(2) << "shutdown done." << dendl;
7701 return true;
7702}
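// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): shutdown_pass() is written as a
// polled state machine -- each call makes whatever progress it can and
// returns false until every stage (strays exported, subtrees handed off,
// sessions closed, log capped and flushed, cache empty) has completed.  The
// hypothetical driver below shows the shape of that pattern; in the MDS the
// polling is driven by the tick/timer machinery rather than a busy loop.
#include <functional>
#include <vector>

namespace staged_shutdown_sketch {

// Each stage returns true once it is finished; earlier stages gate later ones.
inline bool shutdown_pass(const std::vector<std::function<bool()>>& stages)
{
  for (const auto& stage : stages)
    if (!stage())
      return false;                    // not done yet; try again on next tick
  return true;                         // all stages complete
}

} // namespace staged_shutdown_sketch
// ---------------------------------------------------------------------------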
7703
7704bool MDCache::shutdown_export_strays()
7705{
7706 if (mds->get_nodeid() == 0)
7707 return true;
7708
7709 dout(10) << "shutdown_export_strays" << dendl;
7710
7711 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7712
7713 bool done = true;
7714
7715 list<CDir*> dfs;
7716 for (int i = 0; i < NUM_STRAY; ++i) {
7717 if (!strays[i]) {
7718 continue;
7719 }
7720 strays[i]->get_dirfrags(dfs);
7721 }
7722
7723 for (std::list<CDir*>::iterator dfs_i = dfs.begin();
7724 dfs_i != dfs.end(); ++dfs_i)
7725 {
7726 CDir *dir = *dfs_i;
7727
7728 if (!dir->is_complete()) {
7729 dir->fetch(0);
7730 done = false;
7731 if (!mds0_active)
7732 break;
7733 }
7734
7735 for (CDir::map_t::iterator p = dir->items.begin();
7736 p != dir->items.end();
7737 ++p) {
7738 CDentry *dn = p->second;
7739 CDentry::linkage_t *dnl = dn->get_linkage();
7740 if (dnl->is_null())
7741 continue;
7742 done = false;
7743 if (!mds0_active)
7744 break;
7745
7746 if (dn->state_test(CDentry::STATE_PURGING)) {
7747 // Don't try to migrate anything that is actually
7748 // being purged right now
7749 continue;
7750 }
7751
7752 if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
7753 shutdown_exported_strays.insert(dnl->get_inode()->ino());
7754 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7755 } else {
7756 dout(10) << "already exporting " << *dn << dendl;
7757 }
7758 }
7759 }
7760
7761 return done;
7762}
7763
7764// ========= messaging ==============
7765
7766/* This function DOES put the passed message before returning */
7767void MDCache::dispatch(Message *m)
7768{
7769 switch (m->get_type()) {
7770
7771 // RESOLVE
7772 case MSG_MDS_RESOLVE:
7773 handle_resolve(static_cast<MMDSResolve*>(m));
7774 break;
7775 case MSG_MDS_RESOLVEACK:
7776 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7777 break;
7778
7779 // REJOIN
7780 case MSG_MDS_CACHEREJOIN:
7781 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7782 break;
7783
7784 case MSG_MDS_DISCOVER:
7785 handle_discover(static_cast<MDiscover*>(m));
7786 break;
7787 case MSG_MDS_DISCOVERREPLY:
7788 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7789 break;
7790
7791 case MSG_MDS_DIRUPDATE:
7792 handle_dir_update(static_cast<MDirUpdate*>(m));
7793 break;
7794
7795 case MSG_MDS_CACHEEXPIRE:
7796 handle_cache_expire(static_cast<MCacheExpire*>(m));
7797 break;
7798
7799 case MSG_MDS_DENTRYLINK:
7800 handle_dentry_link(static_cast<MDentryLink*>(m));
7801 break;
7802 case MSG_MDS_DENTRYUNLINK:
7803 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7804 break;
7805
7806 case MSG_MDS_FRAGMENTNOTIFY:
7807 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7808 break;
7809
7810 case MSG_MDS_FINDINO:
7811 handle_find_ino(static_cast<MMDSFindIno *>(m));
7812 break;
7813 case MSG_MDS_FINDINOREPLY:
7814 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7815 break;
7816
7817 case MSG_MDS_OPENINO:
7818 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7819 break;
7820 case MSG_MDS_OPENINOREPLY:
7821 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7822 break;
7823
7824 default:
7825 derr << "cache unknown message " << m->get_type() << dendl;
7826 assert(0 == "cache unknown message");
7827 }
7828}
7829
7830MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7831{
7832 if (mdr) {
7833 dout(20) << "_get_waiter retryrequest" << dendl;
7834 return new C_MDS_RetryRequest(this, mdr);
7835 } else if (req) {
7836 dout(20) << "_get_waiter retrymessage" << dendl;
7837 return new C_MDS_RetryMessage(mds, req);
7838 } else {
7839 return fin;
7840 }
7841}
7842
7843int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7844 const filepath& path, // what
7845 vector<CDentry*> *pdnvec, // result
7846 CInode **pin,
7847 int onfail)
7848{
7849 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7850 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7851 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7852
7853 assert(mdr || req || fin);
7854 assert(!forward || mdr || req); // forward requires a request
7855
7856 snapid_t snapid = CEPH_NOSNAP;
7857 if (mdr)
7858 mdr->snapid = snapid;
7859
7860 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7861
7862 if (mds->logger) mds->logger->inc(l_mds_traverse);
7863
7864 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7865 CInode *cur = get_inode(path.get_ino());
7866 if (cur == NULL) {
7867 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7868 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7869 else {
7870 //ceph_abort(); // hrm.. broken
7871 return -ESTALE;
7872 }
7873 return 1;
7874 }
7875 if (cur->state_test(CInode::STATE_PURGING))
7876 return -ESTALE;
7877
7878  // make sure snaprealms are open...
7879 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7880 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7881 return 1;
7882 }
7883
7884 // start trace
7885 if (pdnvec)
7886 pdnvec->clear();
7887 if (pin)
7888 *pin = cur;
7889
7890 unsigned depth = 0;
7891 while (depth < path.depth()) {
7892 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
7893 << "' snapid " << snapid << dendl;
7894
7895 if (!cur->is_dir()) {
7896 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
7897 return -ENOTDIR;
7898 }
7899
7900 // walk into snapdir?
7901 if (path[depth].length() == 0) {
7902 dout(10) << "traverse: snapdir" << dendl;
7903 if (!mdr)
7904 return -EINVAL;
7905 snapid = CEPH_SNAPDIR;
7906 mdr->snapid = snapid;
7907 depth++;
7908 continue;
7909 }
7910 // walk thru snapdir?
7911 if (snapid == CEPH_SNAPDIR) {
7912 if (!mdr)
7913 return -EINVAL;
7914 SnapRealm *realm = cur->find_snaprealm();
7915 snapid = realm->resolve_snapname(path[depth], cur->ino());
7916 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
7917 if (!snapid)
7918 return -ENOENT;
7919 mdr->snapid = snapid;
7920 depth++;
7921 continue;
7922 }
7923
7924 // open dir
7925 frag_t fg = cur->pick_dirfrag(path[depth]);
7926 CDir *curdir = cur->get_dirfrag(fg);
7927 if (!curdir) {
7928 if (cur->is_auth()) {
7929 // parent dir frozen_dir?
7930 if (cur->is_frozen()) {
7931 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
7932 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7933 return 1;
7934 }
7935 curdir = cur->get_or_open_dirfrag(this, fg);
7936 } else {
7937 // discover?
7938 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
7939 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
7940 null_okay);
7941 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
7942 return 1;
7943 }
7944 }
7945 assert(curdir);
7946
7947#ifdef MDS_VERIFY_FRAGSTAT
7948 if (curdir->is_complete())
7949 curdir->verify_fragstat();
7950#endif
7951
7952 // frozen?
7953 /*
7954 if (curdir->is_frozen()) {
7955 // doh!
7956 // FIXME: traverse is allowed?
7957 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
7958 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
7959 if (onfinish) delete onfinish;
7960 return 1;
7961 }
7962 */
7963
7964 // Before doing dirfrag->dn lookup, compare with DamageTable's
7965 // record of which dentries were unreadable
7966 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
7967 dout(4) << "traverse: stopped lookup at damaged dentry "
7968 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
7969 return -EIO;
7970 }
7971
7972 // dentry
7973 CDentry *dn = curdir->lookup(path[depth], snapid);
7974 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
7975
7976    // null, last path component, and xlocked by me?
7977 if (dnl && dnl->is_null() && null_okay) {
7978 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
7979 if (pdnvec)
7980 pdnvec->push_back(dn);
7981 if (pin)
7982 *pin = 0;
7983 break; // done!
7984 }
7985
7986 if (dnl &&
7987 dn->lock.is_xlocked() &&
7988 dn->lock.get_xlock_by() != mdr &&
7989 !dn->lock.can_read(client) &&
7990 (dnl->is_null() || forward)) {
7991 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
7992 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
7993 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
7994 mds->mdlog->flush();
7995 return 1;
7996 }
7997
7998 // can we conclude ENOENT?
7999 if (dnl && dnl->is_null()) {
8000 if (dn->lock.can_read(client) ||
8001 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8002 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8003 if (pdnvec) {
8004 if (depth == path.depth() - 1)
8005 pdnvec->push_back(dn);
8006 else
8007 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8008 }
8009 return -ENOENT;
8010 } else {
8011 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
8012 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8013 return 1;
8014 }
8015 }
8016
8017 if (dnl && !dnl->is_null()) {
8018 CInode *in = dnl->get_inode();
8019
8020 // do we have inode?
8021 if (!in) {
8022 assert(dnl->is_remote());
8023 // do i have it?
8024 in = get_inode(dnl->get_remote_ino());
8025 if (in) {
8026 dout(7) << "linking in remote in " << *in << dendl;
8027 dn->link_remote(dnl, in);
8028 } else {
8029 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8030 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8031 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8032 dout(4) << "traverse: remote dentry points to damaged ino "
8033 << *dn << dendl;
8034 return -EIO;
8035 }
8036 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8037 (null_okay && depth == path.depth() - 1));
8038 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8039 return 1;
8040 }
8041 }
8042
8043 cur = in;
8044      // make sure snaprealms are open...
8045 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8046 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8047 return 1;
8048 }
8049
8050 // add to trace, continue.
8051 touch_inode(cur);
8052 if (pdnvec)
8053 pdnvec->push_back(dn);
8054 if (pin)
8055 *pin = cur;
8056 depth++;
8057 continue;
8058 }
8059
8060
8061 // MISS. dentry doesn't exist.
8062 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8063
8064 if (curdir->is_auth()) {
8065 // dentry is mine.
8066 if (curdir->is_complete() ||
8067 (snapid == CEPH_NOSNAP &&
8068 curdir->has_bloom() &&
8069 !curdir->is_in_bloom(path[depth]))){
8070 // file not found
8071 if (pdnvec) {
8072 // instantiate a null dn?
8073 if (depth < path.depth()-1){
8074 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8075 dn = NULL;
8076 } else if (dn) {
8077 ceph_abort(); // should have fallen out in ->is_null() check above
8078 } else if (curdir->is_frozen()) {
8079 dout(20) << " not adding null to frozen dir " << dendl;
8080 } else if (snapid < CEPH_MAXSNAP) {
8081 dout(20) << " not adding null for snapid " << snapid << dendl;
8082 } else {
8083 // create a null dentry
8084 dn = curdir->add_null_dentry(path[depth]);
8085 dout(20) << " added null " << *dn << dendl;
8086 }
8087 if (dn)
8088 pdnvec->push_back(dn);
8089 else
8090 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8091 }
8092 return -ENOENT;
8093 } else {
8094
8095 // Check DamageTable for missing fragments before trying to fetch
8096 // this
8097 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8098 dout(4) << "traverse: damaged dirfrag " << *curdir
8099 << ", blocking fetch" << dendl;
8100 return -EIO;
8101 }
8102
8103 // directory isn't complete; reload
8104 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8105 touch_inode(cur);
8106 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8107 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8108 return 1;
8109 }
8110 } else {
8111 // dirfrag/dentry is not mine.
8112 mds_authority_t dauth = curdir->authority();
8113
8114 if (forward &&
8115 snapid && mdr && mdr->client_request &&
8116 (int)depth < mdr->client_request->get_num_fwd()) {
8117 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8118 << " < fwd " << mdr->client_request->get_num_fwd()
8119 << ", discovering instead of forwarding" << dendl;
8120 discover = true;
8121 }
8122
8123 if ((discover || null_okay)) {
8124 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8125 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8126 null_okay);
8127 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8128 return 1;
8129 }
8130 if (forward) {
8131 // forward
8132 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8133
8134 if (curdir->is_ambiguous_auth()) {
8135 // wait
8136 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8137 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8138 return 1;
8139 }
8140
8141 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8142
8143 if (mdr)
8144 request_forward(mdr, dauth.first);
8145 else
8146 mds->forward_message_mds(req, dauth.first);
8147
8148 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8149 assert(fin == NULL);
8150 return 2;
8151 }
8152 }
8153
8154 ceph_abort(); // i shouldn't get here
8155 }
8156
8157 // success.
8158 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8159 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8160 if (mdr)
8161 assert(mdr->snapid == snapid);
8162 return 0;
8163}
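// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): path_traverse() uses a small return
// convention -- 0 means the full path resolved, a negative errno is a
// definitive failure, 1 means the traversal blocked and a waiter/retry
// context was queued, and 2 means the request was forwarded to another MDS.
// A caller therefore just returns on any non-zero result and lets the retry
// or forward machinery re-drive it, roughly like the hypothetical handler
// below (traverse/reply_error/go_on are illustrative names only).
namespace path_traverse_caller_sketch {

enum { TRAVERSE_DONE = 0, TRAVERSE_WAITING = 1, TRAVERSE_FORWARDED = 2 };

template <typename Traverse, typename ReplyError, typename Continue>
void handle_request(Traverse traverse, ReplyError reply_error, Continue go_on)
{
  int r = traverse();                  // stands in for mdcache->path_traverse(...)
  if (r > 0)
    return;                            // TRAVERSE_WAITING or TRAVERSE_FORWARDED:
                                       // the request will be re-driven later
  if (r < 0) {
    reply_error(r);                    // definitive failure, e.g. -ENOENT
    return;
  }
  go_on();                             // TRAVERSE_DONE: trace is complete
}

} // namespace path_traverse_caller_sketch
// ---------------------------------------------------------------------------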
8164
8165CInode *MDCache::cache_traverse(const filepath& fp)
8166{
8167 dout(10) << "cache_traverse " << fp << dendl;
8168
8169 CInode *in;
8170 if (fp.get_ino())
8171 in = get_inode(fp.get_ino());
8172 else
8173 in = root;
8174 if (!in)
8175 return NULL;
8176
8177 for (unsigned i = 0; i < fp.depth(); i++) {
8178 const string& dname = fp[i];
8179 frag_t fg = in->pick_dirfrag(dname);
8180 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8181 CDir *curdir = in->get_dirfrag(fg);
8182 if (!curdir)
8183 return NULL;
8184 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8185 if (!dn)
8186 return NULL;
8187 in = dn->get_linkage()->get_inode();
8188 if (!in)
8189 return NULL;
8190 }
8191 dout(10) << " got " << *in << dendl;
8192 return in;
8193}
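// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): unlike path_traverse(),
// cache_traverse() never blocks, discovers, or fetches -- it walks only what
// is already in memory and bails out with NULL on the first miss.  A minimal
// cache-only walk over a toy in-memory tree (Dir/Inode are hypothetical types
// with at most one dirfrag) looks like this:
#include <map>
#include <string>
#include <vector>

namespace cache_traverse_sketch {

struct Inode;
struct Dir { std::map<std::string, Inode*> entries; };
struct Inode { Dir *dir = nullptr; };

inline Inode *cache_traverse(Inode *root, const std::vector<std::string>& path)
{
  Inode *cur = root;
  for (const auto& name : path) {
    if (!cur || !cur->dir)
      return nullptr;                         // directory not cached: miss
    auto it = cur->dir->entries.find(name);
    if (it == cur->dir->entries.end())
      return nullptr;                         // dentry not cached: miss
    cur = it->second;
  }
  return cur;                                 // every component was in cache
}

} // namespace cache_traverse_sketch
// ---------------------------------------------------------------------------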
8194
8195
8196/**
8197 * open_remote_dirfrag -- open up a remote dirfrag
8198 *
8199 * @param diri base inode
8200 * @param approxfg approximate fragment.
8201 * @param fin completion callback
8202 */
8203void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8204{
8205  dout(10) << "open_remote_dirfrag on " << *diri << dendl;
7c673cae
FG
8206 assert(diri->is_dir());
8207 assert(!diri->is_auth());
8208 assert(diri->get_dirfrag(approxfg) == 0);
8209
224ce89b 8210 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8211}
8212
8213
8214/**
8215 * get_dentry_inode - get or open inode
8216 *
8217 * @param dn the dentry
8218 * @param mdr current request
8219 *
8220 * will return inode for primary, or link up/open up remote link's inode as necessary.
8221 * If it's not available right now, puts mdr on wait list and returns null.
8222 */
8223CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8224{
8225 CDentry::linkage_t *dnl;
8226 if (projected)
8227 dnl = dn->get_projected_linkage();
8228 else
8229 dnl = dn->get_linkage();
8230
8231 assert(!dnl->is_null());
8232
8233 if (dnl->is_primary())
8234 return dnl->inode;
8235
8236 assert(dnl->is_remote());
8237 CInode *in = get_inode(dnl->get_remote_ino());
8238 if (in) {
8239 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8240 dn->link_remote(dnl, in);
8241 return in;
8242 } else {
8243 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8244 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8245 return 0;
8246 }
8247}
8248
8249struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8250 CDentry *dn;
8251 inodeno_t ino;
8252 MDSInternalContextBase *onfinish;
8253 bool want_xlocked;
8254 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
31f18b77
FG
8255 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8256 dn->get(MDSCacheObject::PIN_PTRWAITER);
8257 }
7c673cae
FG
8258 void finish(int r) override {
8259 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8260 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8261 }
8262};
8263
8264void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8265{
8266 dout(10) << "open_remote_dentry " << *dn << dendl;
8267 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8268 inodeno_t ino = dnl->get_remote_ino();
8269 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8270 open_ino(ino, pool,
8271 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8272}
8273
8274void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8275 bool want_xlocked, int r)
8276{
8277 if (r < 0) {
31f18b77
FG
8278 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8279 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8280 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8281 dn->state_set(CDentry::STATE_BADREMOTEINO);
8282
8283 std::string path;
8284 CDir *dir = dn->get_dir();
8285 if (dir) {
31f18b77
FG
8286 dir->get_inode()->make_path_string(path);
8287 path = path + "/" + dn->get_name();
7c673cae
FG
8288 }
8289
31f18b77 8290 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8291 if (fatal) {
31f18b77
FG
8292 mds->damaged();
8293 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8294 }
31f18b77
FG
8295 } else {
8296 r = 0;
8297 }
7c673cae
FG
8298 }
8299 fin->complete(r < 0 ? r : 0);
8300}
8301
8302
8303void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8304{
8305 // empty trace if we're a base inode
8306 if (in->is_base())
8307 return;
8308
8309 CInode *parent = in->get_parent_inode();
8310 assert(parent);
8311 make_trace(trace, parent);
8312
8313 CDentry *dn = in->get_parent_dn();
8314 dout(15) << "make_trace adding " << *dn << dendl;
8315 trace.push_back(dn);
8316}
8317
8318
8319// -------------------------------------------------------------------------------
8320// Open inode by inode number
8321
8322class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8323 inodeno_t ino;
8324 public:
8325 bufferlist bl;
8326 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8327 MDCacheIOContext(c), ino(i) {}
8328 void finish(int r) override {
8329 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8330 }
8331};
8332
8333struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8334 inodeno_t ino;
8335 MMDSOpenIno *msg;
8336 bool parent;
8337 public:
8338 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8339 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8340 void finish(int r) override {
8341 if (r < 0 && !parent)
8342 r = -EAGAIN;
8343 if (msg) {
8344 mdcache->handle_open_ino(msg, r);
8345 return;
8346 }
8347 assert(mdcache->opening_inodes.count(ino));
8348 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8349 }
8350};
8351
8352struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8353 inodeno_t ino;
8354 public:
8355 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8356 void finish(int r) override {
8357 mdcache->_open_ino_parent_opened(ino, r);
8358 }
8359};
8360
8361void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8362{
8363 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8364
8365 assert(opening_inodes.count(ino));
8366 open_ino_info_t& info = opening_inodes[ino];
8367
8368 CInode *in = get_inode(ino);
8369 if (in) {
8370 dout(10) << " found cached " << *in << dendl;
8371 open_ino_finish(ino, info, in->authority().first);
8372 return;
8373 }
8374
8375 inode_backtrace_t backtrace;
8376 if (err == 0) {
8377 try {
8378 ::decode(backtrace, bl);
8379 } catch (const buffer::error &decode_exc) {
8380      derr << "corrupt backtrace on ino 0x" << std::hex << ino
8381 << std::dec << ": " << decode_exc << dendl;
8382 open_ino_finish(ino, info, -EIO);
8383 return;
8384 }
8385 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8386 dout(10) << " old object in pool " << info.pool
8387 << ", retrying pool " << backtrace.pool << dendl;
8388 info.pool = backtrace.pool;
8389 C_IO_MDC_OpenInoBacktraceFetched *fin =
8390 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8391 fetch_backtrace(ino, info.pool, fin->bl,
8392 new C_OnFinisher(fin, mds->finisher));
8393 return;
8394 }
8395 } else if (err == -ENOENT) {
8396 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8397 if (info.pool != meta_pool) {
8398 dout(10) << " no object in pool " << info.pool
8399 << ", retrying pool " << meta_pool << dendl;
8400 info.pool = meta_pool;
8401 C_IO_MDC_OpenInoBacktraceFetched *fin =
8402 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8403 fetch_backtrace(ino, info.pool, fin->bl,
8404 new C_OnFinisher(fin, mds->finisher));
8405 return;
8406 }
8407 err = 0; // backtrace.ancestors.empty() is checked below
8408 }
8409
8410 if (err == 0) {
8411 if (backtrace.ancestors.empty()) {
8412 dout(10) << " got empty backtrace " << dendl;
8413 err = -EIO;
8414 } else if (!info.ancestors.empty()) {
8415 if (info.ancestors[0] == backtrace.ancestors[0]) {
8416 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8417 err = -EINVAL;
8418 } else {
8419 info.last_err = 0;
8420 }
8421 }
8422 }
8423 if (err) {
8424 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8425 if (info.last_err)
8426 err = info.last_err;
8427 open_ino_finish(ino, info, err);
8428 return;
8429 }
8430
8431 dout(10) << " got backtrace " << backtrace << dendl;
8432 info.ancestors = backtrace.ancestors;
8433
8434 _open_ino_traverse_dir(ino, info, 0);
8435}
8436
8437void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8438{
8439 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8440
8441 assert(opening_inodes.count(ino));
8442 open_ino_info_t& info = opening_inodes[ino];
8443
8444 CInode *in = get_inode(ino);
8445 if (in) {
8446 dout(10) << " found cached " << *in << dendl;
8447 open_ino_finish(ino, info, in->authority().first);
8448 return;
8449 }
8450
8451 if (ret == mds->get_nodeid()) {
8452 _open_ino_traverse_dir(ino, info, 0);
8453 } else {
8454 if (ret >= 0) {
8455 mds_rank_t checked_rank = mds_rank_t(ret);
8456 info.check_peers = true;
8457 info.auth_hint = checked_rank;
8458 info.checked.erase(checked_rank);
8459 }
8460 do_open_ino(ino, info, ret);
8461 }
8462}
8463
8464void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8465{
8466 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8467
8468 CInode *in = get_inode(ino);
8469 if (in) {
8470 dout(10) << " found cached " << *in << dendl;
8471 open_ino_finish(ino, info, in->authority().first);
8472 return;
8473 }
8474
8475 if (ret) {
8476 do_open_ino(ino, info, ret);
8477 return;
8478 }
8479
8480 mds_rank_t hint = info.auth_hint;
8481 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8482 info.discover, info.want_xlocked, &hint);
8483 if (ret > 0)
8484 return;
8485 if (hint != mds->get_nodeid())
8486 info.auth_hint = hint;
8487 do_open_ino(ino, info, ret);
8488}
8489
8490void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8491{
8492 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8493 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8494 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8495}
8496
8497int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8498 vector<inode_backpointer_t>& ancestors,
8499 bool discover, bool want_xlocked, mds_rank_t *hint)
8500{
8501 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8502 int err = 0;
8503 for (unsigned i = 0; i < ancestors.size(); i++) {
8504 CInode *diri = get_inode(ancestors[i].dirino);
8505
8506 if (!diri) {
8507 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8508 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8509 return 1;
8510 }
8511 continue;
8512 }
8513
8514 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8515 CDir *dir = diri->get_parent_dir();
8516 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8517 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8518 dir = dir->get_inode()->get_parent_dir();
8519 _open_ino_fetch_dir(ino, m, dir, i == 0);
8520 return 1;
8521 }
8522
8523 if (!diri->is_dir()) {
8524 dout(10) << " " << *diri << " is not dir" << dendl;
8525 if (i == 0)
8526 err = -ENOTDIR;
8527 break;
8528 }
8529
8530 string &name = ancestors[i].dname;
8531 frag_t fg = diri->pick_dirfrag(name);
8532 CDir *dir = diri->get_dirfrag(fg);
8533 if (!dir) {
8534 if (diri->is_auth()) {
8535 if (diri->is_frozen()) {
8536 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8537 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8538 return 1;
8539 }
8540 dir = diri->get_or_open_dirfrag(this, fg);
8541 } else if (discover) {
8542 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8543 return 1;
8544 }
8545 }
8546 if (dir) {
8547 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8548 CDentry *dn = dir->lookup(name);
8549 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8550 if (dir->is_auth()) {
8551 if (dnl && dnl->is_primary() &&
8552 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8553 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8554 _open_ino_fetch_dir(ino, m, dir, i == 0);
8555 return 1;
8556 }
8557
8558 if (!dnl && !dir->is_complete() &&
8559 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8560 dout(10) << " fetching incomplete " << *dir << dendl;
8561 _open_ino_fetch_dir(ino, m, dir, i == 0);
8562 return 1;
8563 }
8564
8565 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8566 if (i == 0)
8567 err = -ENOENT;
8568 } else if (discover) {
8569 if (!dnl) {
8570 filepath path(name, 0);
8571 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8572 (i == 0 && want_xlocked));
8573 return 1;
8574 }
8575 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8576 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8577 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8578 return 1;
8579 }
8580 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8581 if (i == 0)
8582 err = -ENOENT;
8583 }
8584 }
8585 if (hint && i == 0)
8586 *hint = dir ? dir->authority().first : diri->authority().first;
8587 break;
8588 }
8589 return err;
8590}
8591
8592void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8593{
8594 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8595
8596 list<MDSInternalContextBase*> waiters;
8597 waiters.swap(info.waiters);
8598 opening_inodes.erase(ino);
8599 finish_contexts(g_ceph_context, waiters, ret);
8600}
8601
8602void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8603{
8604 if (err < 0 && err != -EAGAIN) {
8605 info.checked.clear();
7c673cae
FG
8606 info.checking = MDS_RANK_NONE;
8607 info.check_peers = true;
8608 info.fetch_backtrace = true;
8609 if (info.discover) {
8610 info.discover = false;
8611 info.ancestors.clear();
8612 }
8613 if (err != -ENOENT && err != -ENOTDIR)
8614 info.last_err = err;
8615 }
8616
d2e6a577
FG
8617 if (info.check_peers || info.discover) {
8618 if (info.discover) {
8619 // got backtrace from peer, but failed to find inode. re-check peers
8620 info.discover = false;
8621 info.ancestors.clear();
8622 info.checked.clear();
8623 }
7c673cae
FG
8624 info.check_peers = false;
8625 info.checking = MDS_RANK_NONE;
8626 do_open_ino_peer(ino, info);
8627 } else if (info.fetch_backtrace) {
8628 info.check_peers = true;
8629 info.fetch_backtrace = false;
8630 info.checking = mds->get_nodeid();
8631 info.checked.clear();
7c673cae
FG
8632 C_IO_MDC_OpenInoBacktraceFetched *fin =
8633 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8634 fetch_backtrace(ino, info.pool, fin->bl,
8635 new C_OnFinisher(fin, mds->finisher));
8636 } else {
8637 assert(!info.ancestors.empty());
8638 info.checking = mds->get_nodeid();
8639 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8640 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8641 }
8642}
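// ---------------------------------------------------------------------------
// Editor's sketch (not part of MDCache): do_open_ino() is the hub of a small
// retry state machine.  On each failure it falls back in a fixed order --
// re-query peer MDS ranks, then (re)fetch the on-disk backtrace, then walk the
// backtrace ancestors by opening the parent inode -- until one path yields the
// inode or every option is exhausted.  The enum/switch below is only a
// schematic of that ordering, not the real control flow.
namespace open_ino_sketch {

enum class Step { CheckPeers, FetchBacktrace, OpenParent, Fail };

inline Step next_step(bool check_peers, bool fetch_backtrace, bool have_ancestors)
{
  if (check_peers)
    return Step::CheckPeers;          // ask other ranks whether they have it
  if (fetch_backtrace)
    return Step::FetchBacktrace;      // read the backtrace object from the pool
  if (have_ancestors)
    return Step::OpenParent;          // open ancestors[0].dirino and traverse down
  return Step::Fail;
}

} // namespace open_ino_sketch
// ---------------------------------------------------------------------------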
8643
8644void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8645{
8646 set<mds_rank_t> all, active;
8647 mds->mdsmap->get_mds_set(all);
8648 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8649 if (mds->get_state() == MDSMap::STATE_REJOIN)
8650 mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);
8651
8652 dout(10) << "do_open_ino_peer " << ino << " active " << active
8653 << " all " << all << " checked " << info.checked << dendl;
8654
8655 mds_rank_t peer = MDS_RANK_NONE;
8656 if (info.auth_hint >= 0) {
8657 if (active.count(info.auth_hint)) {
8658 peer = info.auth_hint;
8659 info.auth_hint = MDS_RANK_NONE;
8660 }
8661 } else {
8662 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8663 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8664 peer = *p;
8665 break;
8666 }
8667 }
8668 if (peer < 0) {
d2e6a577
FG
8669 all.erase(mds->get_nodeid());
8670 if (all != info.checked) {
7c673cae
FG
8671 dout(10) << " waiting for more peers to be active" << dendl;
8672 } else {
8673 dout(10) << " all MDS peers have been checked " << dendl;
8674 do_open_ino(ino, info, 0);
8675 }
8676 } else {
8677 info.checking = peer;
8678 vector<inode_backpointer_t> *pa = NULL;
8679 // got backtrace from peer or backtrace just fetched
8680 if (info.discover || !info.fetch_backtrace)
8681 pa = &info.ancestors;
8682 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8683 }
8684}
8685
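// Peer-side handler: if we have the inode and are auth, reply with the ancestor chain
// built by walking parent dentries; if we only hold a replica, reply with an auth
// hint; otherwise traverse the ancestry supplied in the request (or return the error
// we were handed).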
8686void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8687{
8688 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8689 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8690 m->put();
8691 return;
8692 }
8693
8694 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8695
8696 inodeno_t ino = m->ino;
8697 MMDSOpenInoReply *reply;
8698 CInode *in = get_inode(ino);
8699 if (in) {
8700 dout(10) << " have " << *in << dendl;
8701 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8702 if (in->is_auth()) {
8703 touch_inode(in);
8704 while (1) {
8705 CDentry *pdn = in->get_parent_dn();
8706 if (!pdn)
8707 break;
8708 CInode *diri = pdn->get_dir()->get_inode();
8709 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name,
8710 in->inode.version));
8711 in = diri;
8712 }
8713 } else {
8714 reply->hint = in->authority().first;
8715 }
8716 } else if (err < 0) {
8717 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8718 } else {
8719 mds_rank_t hint = MDS_RANK_NONE;
8720 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8721 if (ret > 0)
8722 return;
8723 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8724 }
8725 m->get_connection()->send_message(reply);
8726 m->put();
8727}
8728
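// Only a reply from the rank we are currently checking is acted on. A cached inode
// finishes the open; a non-empty ancestor list finishes it too, unless a replica is
// wanted, in which case we start traversing the returned ancestry ourselves; an error
// feeds back into do_open_ino(); anything else (possibly carrying an auth hint) moves
// on to the next peer.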
8729void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8730{
8731 dout(10) << "handle_open_ino_reply " << *m << dendl;
8732
8733 inodeno_t ino = m->ino;
8734 mds_rank_t from = mds_rank_t(m->get_source().num());
8735 auto it = opening_inodes.find(ino);
8736 if (it != opening_inodes.end() && it->second.checking == from) {
8737 open_ino_info_t& info = it->second;
8738 info.checking = MDS_RANK_NONE;
8739 info.checked.insert(from);
8740
8741 CInode *in = get_inode(ino);
8742 if (in) {
8743 dout(10) << " found cached " << *in << dendl;
8744 open_ino_finish(ino, info, in->authority().first);
8745 } else if (!m->ancestors.empty()) {
8746 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8747 if (!info.want_replica) {
8748 open_ino_finish(ino, info, from);
8749 m->put();
8750 return;
8751 }
8752
8753 info.ancestors = m->ancestors;
8754 info.auth_hint = from;
8755 info.checking = mds->get_nodeid();
8756 info.discover = true;
8757 _open_ino_traverse_dir(ino, info, 0);
8758 } else if (m->error) {
8759 dout(10) << " error " << m->error << " from mds." << from << dendl;
8760 do_open_ino(ino, info, m->error);
8761 } else {
8762 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8763 info.auth_hint = m->hint;
8764 info.checked.erase(m->hint);
8765 }
8766 do_open_ino_peer(ino, info);
8767 }
8768 }
8769 m->put();
8770}
8771
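// Re-drive any open_ino request that was waiting on the given rank, or that currently
// has no rank being checked at all.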
8772void MDCache::kick_open_ino_peers(mds_rank_t who)
8773{
8774 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8775
8776 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8777 p != opening_inodes.end();
8778 ++p) {
8779 open_ino_info_t& info = p->second;
8780 if (info.checking == who) {
8781 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8782 info.checking = MDS_RANK_NONE;
8783 do_open_ino_peer(p->first, info);
8784 } else if (info.checking == MDS_RANK_NONE) {
8785 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8786 do_open_ino_peer(p->first, info);
8787 }
8788 }
8789}
8790
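// Entry point for opening an inode by number. Concurrent opens of the same ino are
// coalesced onto one open_ino_info_t entry; a fresh entry records the target pool
// (falling back to the default file layout's pool), allocates a tid and kicks off
// do_open_ino(). A sketch of a call, with a made-up retry context:
//   open_ino(ino, -1, new C_RetrySomething(this), false, false);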
8791void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8792 bool want_replica, bool want_xlocked)
8793{
8794 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8795 << want_replica << dendl;
8796
8797 if (opening_inodes.count(ino)) {
8798 open_ino_info_t& info = opening_inodes[ino];
8799 if (want_replica) {
8800 info.want_replica = true;
8801 if (want_xlocked && !info.want_xlocked) {
8802 if (!info.ancestors.empty()) {
8803 CInode *diri = get_inode(info.ancestors[0].dirino);
8804 if (diri) {
8805 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8806 CDir *dir = diri->get_dirfrag(fg);
8807 if (dir && !dir->is_auth()) {
8808 filepath path(info.ancestors[0].dname, 0);
8809 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8810 }
8811 }
8812 }
8813 info.want_xlocked = true;
8814 }
8815 }
8816 info.waiters.push_back(fin);
8817 } else {
8818 open_ino_info_t& info = opening_inodes[ino];
8819 info.want_replica = want_replica;
8820 info.want_xlocked = want_xlocked;
8821 info.tid = ++open_ino_last_tid;
8822 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8823 info.waiters.push_back(fin);
8824 do_open_ino(ino, info, 0);
8825 }
8826}
8827
8828/* ---------------------------- */
8829
8830/*
8831 * search for a given inode on MDS peers. optionally start with the given node.
8832
8833
8834 TODO
8835 - recover from MDS node failure / MDS recovery
8836 - traverse path
8837
8838 */
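// Sketch of a call (the retry context name is made up); note the inode must not
// already be in cache:
//   find_ino_peers(ino, new C_RetryLookup(this), MDS_RANK_NONE);
// The context completes once the inode has been found and cached, or with -ESTALE
// if no active peer knows the ino.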
8839void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8840{
8841 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
8842 assert(!have_inode(ino));
8843
8844 ceph_tid_t tid = ++find_ino_peer_last_tid;
8845 find_ino_peer_info_t& fip = find_ino_peer[tid];
8846 fip.ino = ino;
8847 fip.tid = tid;
8848 fip.fin = c;
8849 fip.hint = hint;
8850 _do_find_ino_peer(fip);
8851}
8852
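// Pick the next rank to query: the hint first if one was given, otherwise the first
// active rank not yet checked. Once every other rank has been checked the request
// fails with -ESTALE; otherwise an MMDSFindIno is sent and we wait for the reply.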
8853void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8854{
8855 set<mds_rank_t> all, active;
8856 mds->mdsmap->get_mds_set(all);
8857 mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
8858
8859 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8860 << " active " << active << " all " << all
8861 << " checked " << fip.checked
8862 << dendl;
8863
8864 mds_rank_t m = MDS_RANK_NONE;
8865 if (fip.hint >= 0) {
8866 m = fip.hint;
8867 fip.hint = MDS_RANK_NONE;
8868 } else {
8869 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8870 if (*p != mds->get_nodeid() &&
8871 fip.checked.count(*p) == 0) {
8872 m = *p;
8873 break;
8874 }
8875 }
8876 if (m == MDS_RANK_NONE) {
8877 all.erase(mds->get_nodeid());
8878 if (all != fip.checked) {
8879 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
8880 } else {
8881 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
8882 fip.fin->complete(-ESTALE);
8883 find_ino_peer.erase(fip.tid);
8884 }
8885 } else {
8886 fip.checking = m;
8887 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
8888 }
8889}
8890
8891void MDCache::handle_find_ino(MMDSFindIno *m)
8892{
8893 if (mds->get_state() < MDSMap::STATE_REJOIN) {
8894 m->put();
8895 return;
8896 }
8897
8898 dout(10) << "handle_find_ino " << *m << dendl;
8899 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
8900 CInode *in = get_inode(m->ino);
8901 if (in) {
8902 in->make_path(r->path);
8903 dout(10) << " have " << r->path << " " << *in << dendl;
8904 }
8905 m->get_connection()->send_message(r);
8906 m->put();
8907}
8908
8909
8910void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
8911{
8912 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
8913 if (p != find_ino_peer.end()) {
8914 dout(10) << "handle_find_ino_reply " << *m << dendl;
8915 find_ino_peer_info_t& fip = p->second;
8916
8917 // success?
8918 if (get_inode(fip.ino)) {
8919 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
8920 mds->queue_waiter(fip.fin);
8921 find_ino_peer.erase(p);
8922 m->put();
8923 return;
8924 }
8925
8926 mds_rank_t from = mds_rank_t(m->get_source().num());
8927 if (fip.checking == from)
8928 fip.checking = MDS_RANK_NONE;
8929 fip.checked.insert(from);
8930
8931 if (!m->path.empty()) {
8932 // we got a path!
8933 vector<CDentry*> trace;
8934 MDRequestRef null_ref;
8935 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
8936 if (r > 0)
8937 return;
8938 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
8939 << ", retrying" << dendl;
8940 fip.checked.clear();
8941 _do_find_ino_peer(fip);
8942 } else {
8943 // nope, continue.
8944 _do_find_ino_peer(fip);
8945 }
8946 } else {
8947 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
8948 }
8949 m->put();
8950}
8951
8952void MDCache::kick_find_ino_peers(mds_rank_t who)
8953{
8954 // find_ino_peers requests we should move on from
8955 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
8956 p != find_ino_peer.end();
8957 ++p) {
8958 find_ino_peer_info_t& fip = p->second;
8959 if (fip.checking == who) {
8960 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
8961 fip.checking = MDS_RANK_NONE;
8962 _do_find_ino_peer(fip);
8963 } else if (fip.checking == MDS_RANK_NONE) {
8964 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
8965 _do_find_ino_peer(fip);
8966 }
8967 }
8968}
8969
8970/* ---------------------------- */
8971
8972int MDCache::get_num_client_requests()
8973{
8974 int count = 0;
8975 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
8976 p != active_requests.end();
8977 ++p) {
8978 MDRequestRef& mdr = p->second;
8979 if (mdr->reqid.name.is_client() && !mdr->is_slave())
8980 count++;
8981 }
8982 return count;
8983}
8984
8985/* This function takes over the reference to the passed Message */
8986MDRequestRef MDCache::request_start(MClientRequest *req)
8987{
8988 // did we win a forward race against a slave?
8989 if (active_requests.count(req->get_reqid())) {
8990 MDRequestRef& mdr = active_requests[req->get_reqid()];
8991 assert(mdr);
8992 if (mdr->is_slave()) {
8993 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
8994 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
8995 } else {
8996 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
8997 req->put();
8998 }
8999 return MDRequestRef();
9000 }
9001
9002 // register new client request
9003 MDRequestImpl::Params params;
9004 params.reqid = req->get_reqid();
9005 params.attempt = req->get_num_fwd();
9006 params.client_req = req;
9007 params.initiated = req->get_recv_stamp();
9008 params.throttled = req->get_throttle_stamp();
9009 params.all_read = req->get_recv_complete_stamp();
9010 params.dispatched = req->get_dispatch_stamp();
9011
9012 MDRequestRef mdr =
9013 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9014 active_requests[params.reqid] = mdr;
9015 mdr->set_op_stamp(req->get_stamp());
9016 dout(7) << "request_start " << *mdr << dendl;
9017 return mdr;
9018}
9019
9020MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9021{
9022 int by = m->get_source().num();
9023 MDRequestImpl::Params params;
9024 params.reqid = ri;
9025 params.attempt = attempt;
9026 params.triggering_slave_req = m;
9027 params.slave_to = by;
9028 params.initiated = m->get_recv_stamp();
9029 params.throttled = m->get_throttle_stamp();
9030 params.all_read = m->get_recv_complete_stamp();
9031 params.dispatched = m->get_dispatch_stamp();
9032 MDRequestRef mdr =
9033 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9034 assert(active_requests.count(mdr->reqid) == 0);
9035 active_requests[mdr->reqid] = mdr;
9036 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9037 return mdr;
9038}
9039
9040MDRequestRef MDCache::request_start_internal(int op)
9041{
9042 MDRequestImpl::Params params;
9043 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9044 params.reqid.tid = mds->issue_tid();
9045 params.initiated = ceph_clock_now();
9046 params.internal_op = op;
9047 MDRequestRef mdr =
9048 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9049
9050 assert(active_requests.count(mdr->reqid) == 0);
9051 active_requests[mdr->reqid] = mdr;
9052 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9053 return mdr;
9054}
9055
9056MDRequestRef MDCache::request_get(metareqid_t rid)
9057{
9058 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9059 assert(p != active_requests.end());
9060 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9061 return p->second;
9062}
9063
9064void MDCache::request_finish(MDRequestRef& mdr)
9065{
9066 dout(7) << "request_finish " << *mdr << dendl;
9067 mdr->mark_event("finishing request");
9068
9069 // slave finisher?
9070 if (mdr->has_more() && mdr->more()->slave_commit) {
9071 Context *fin = mdr->more()->slave_commit;
9072 mdr->more()->slave_commit = 0;
9073 int ret;
9074 if (mdr->aborted) {
9075 mdr->aborted = false;
9076 ret = -1;
9077 mdr->more()->slave_rolling_back = true;
9078 } else {
9079 ret = 0;
9080 mdr->committing = true;
9081 }
9082 fin->complete(ret); // this must re-call request_finish.
9083 return;
9084 }
9085
9086 switch(mdr->internal_op) {
9087 case CEPH_MDS_OP_FRAGMENTDIR:
9088 logger->inc(l_mdss_ireq_fragmentdir);
9089 break;
9090 case CEPH_MDS_OP_EXPORTDIR:
9091 logger->inc(l_mdss_ireq_exportdir);
9092 break;
9093 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9094 logger->inc(l_mdss_ireq_enqueue_scrub);
9095 break;
9096 case CEPH_MDS_OP_FLUSH:
9097 logger->inc(l_mdss_ireq_flush);
9098 break;
9099 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9100 logger->inc(l_mdss_ireq_fragstats);
9101 break;
9102 case CEPH_MDS_OP_REPAIR_INODESTATS:
9103 logger->inc(l_mdss_ireq_inodestats);
9104 break;
9105 }
9106
9107 request_cleanup(mdr);
9108}
9109
9110
9111void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9112{
9113 mdr->mark_event("forwarding request");
9114 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9115 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9116 << *mdr->client_request << dendl;
9117 mds->forward_message_mds(mdr->client_request, who);
9118 mdr->client_request = 0;
9119 if (mds->logger) mds->logger->inc(l_mds_forward);
9120 } else if (mdr->internal_op >= 0) {
9121 dout(10) << "request_forward on internal op; cancelling" << dendl;
9122 mdr->internal_op_finish->complete(-EXDEV);
9123 } else {
9124 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9125 << " was from mds" << dendl;
9126 }
9127 request_cleanup(mdr);
9128}
9129
9130
9131void MDCache::dispatch_request(MDRequestRef& mdr)
9132{
9133 if (mdr->client_request) {
9134 mds->server->dispatch_client_request(mdr);
9135 } else if (mdr->slave_request) {
9136 mds->server->dispatch_slave_request(mdr);
9137 } else {
9138 switch (mdr->internal_op) {
9139 case CEPH_MDS_OP_FRAGMENTDIR:
9140 dispatch_fragment_dir(mdr);
9141 break;
9142 case CEPH_MDS_OP_EXPORTDIR:
9143 migrator->dispatch_export_dir(mdr, 0);
9144 break;
9145 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9146 enqueue_scrub_work(mdr);
9147 break;
9148 case CEPH_MDS_OP_FLUSH:
9149 flush_dentry_work(mdr);
9150 break;
9151 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9152 repair_dirfrag_stats_work(mdr);
9153 break;
9154 case CEPH_MDS_OP_REPAIR_INODESTATS:
9155 repair_inode_stats_work(mdr);
9156 break;
9157 default:
9158 ceph_abort();
9159 }
9160 }
9161}
9162
9163
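// Tell every slave of this request to finish (OP_FINISH), marking the message as an
// abort if the request was killed before committing, then forget our local record of
// foreign xlocks and remote wrlocks; the slaves drop those locks as part of OP_FINISH.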
9164void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9165{
9166 if (!mdr->has_more())
9167 return;
9168
9169 // clean up slaves
9170 // (will implicitly drop remote dn pins)
9171 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9172 p != mdr->more()->slaves.end();
9173 ++p) {
9174 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9175 MMDSSlaveRequest::OP_FINISH);
9176
9177 if (mdr->killed && !mdr->committing) {
9178 r->mark_abort();
9179 } else if (mdr->more()->srcdn_auth_mds == *p &&
9180 mdr->more()->inode_import.length() > 0) {
9181 // information about rename imported caps
9182 r->inode_export.claim(mdr->more()->inode_import);
9183 }
9184
9185 mds->send_message_mds(r, *p);
9186 }
9187
9188 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9189 * implicitly. Note that we don't call the finishers -- there shouldn't
9190 * be any on a remote lock and the request finish wakes up all
9191 * the waiters anyway! */
9192 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9193 while (p != mdr->xlocks.end()) {
9194 if ((*p)->get_parent()->is_auth())
9195 ++p;
9196 else {
9197 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9198 << " on " << *(*p)->get_parent() << dendl;
9199 (*p)->put_xlock();
9200 mdr->locks.erase(*p);
9201 mdr->xlocks.erase(p++);
9202 }
9203 }
9204
9205 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9206 while (q != mdr->remote_wrlocks.end()) {
9207 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9208 << " on mds." << q->second
9209 << " on " << *(q->first)->get_parent() << dendl;
9210 mdr->locks.erase(q->first);
9211 mdr->remote_wrlocks.erase(q++);
9212 }
9213
9214 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9215 * leaving them in can cause double-notifies as
9216 * this function can get called more than once */
9217}
9218
9219void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9220{
9221 request_drop_foreign_locks(mdr);
9222 mds->locker->drop_non_rdlocks(mdr.get());
9223}
9224
9225void MDCache::request_drop_locks(MDRequestRef& mdr)
9226{
9227 request_drop_foreign_locks(mdr);
9228 mds->locker->drop_locks(mdr.get());
9229}
9230
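// Tear down an MDRequest: wake anyone waiting for it to finish, drop all locks, local
// auth pins, sticky dirs and cache pins, detach it from its session and remove it
// from active_requests.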
9231void MDCache::request_cleanup(MDRequestRef& mdr)
9232{
9233 dout(15) << "request_cleanup " << *mdr << dendl;
9234
9235 if (mdr->has_more()) {
9236 if (mdr->more()->is_ambiguous_auth)
9237 mdr->clear_ambiguous_auth();
9238 if (!mdr->more()->waiting_for_finish.empty())
9239 mds->queue_waiters(mdr->more()->waiting_for_finish);
9240 }
9241
9242 request_drop_locks(mdr);
9243
9244 // drop (local) auth pins
9245 mdr->drop_local_auth_pins();
9246
9247 // drop stickydirs
9248 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9249 p != mdr->stickydirs.end();
9250 ++p)
9251 (*p)->put_stickydirs();
9252
9253 mds->locker->kick_cap_releases(mdr);
9254
9255 // drop cache pins
9256 mdr->drop_pins();
9257
9258 // remove from session
9259 mdr->item_session_request.remove_myself();
9260
9261 // remove from map
9262 active_requests.erase(mdr->reqid);
9263
9264 if (mds->logger)
9265 log_stat();
9266
9267 mdr->mark_event("cleaned up request");
9268}
9269
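// Kill a request. If slave requests were already issued after locking, rolling them
// back is not practical, so the request is only detached from its session and allowed
// to run to completion; otherwise it is marked killed and cleaned up (unless it is
// already committing).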
9270void MDCache::request_kill(MDRequestRef& mdr)
9271{
9272 // rollback slave requests is tricky. just let the request proceed.
9273 if (mdr->done_locking && mdr->has_more() &&
9274 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9275 dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl;
9276
9277 assert(mdr->used_prealloc_ino == 0);
9278 assert(mdr->prealloc_inos.empty());
9279
9280 mdr->session = NULL;
9281 mdr->item_session_request.remove_myself();
9282 return;
9283 }
9284
9285 mdr->killed = true;
9286 mdr->mark_event("killing request");
9287
9288 if (mdr->committing) {
9289 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9290 } else {
9291 dout(10) << "request_kill " << *mdr << dendl;
9292 request_cleanup(mdr);
9293 }
9294}
9295
9296// -------------------------------------------------------------------------------
9297// SNAPREALMS
9298
9299struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9300 MDRequestRef mdr;
9301 MutationRef mut;
9302 CInode *in;
9303 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9304 MutationRef& mu, CInode *i) :
9305 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9306 void finish(int r) override {
9307 mdcache->_snaprealm_create_finish(mdr, mut, in);
9308 }
9309};
9310
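// Realm creation is two-phase: first obtain a snapid/transaction id from the snap
// table client (retrying this request once it arrives), then journal an EUpdate that
// projects the inode and the new snaprealm, finishing in _snaprealm_create_finish().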
9311void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9312{
9313 dout(10) << "snaprealm_create " << *in << dendl;
9314 assert(!in->snaprealm);
9315
9316 // allocate an id..
9317 if (!mdr->more()->stid) {
9318 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9319 new C_MDS_RetryRequest(this, mdr));
9320 return;
9321 }
9322
9323 MutationRef mut(new MutationImpl());
9324 mut->ls = mds->mdlog->get_current_segment();
9325 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9326 mds->mdlog->start_entry(le);
9327
9328 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9329
9330 inode_t *pi = in->project_inode();
9331 pi->version = in->pre_dirty();
9332 pi->rstat.rsnaprealms++;
9333
9334 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9335 snapid_t seq;
9336 ::decode(seq, p);
9337
9338 sr_t *newsnap = in->project_snaprealm(seq);
9339 newsnap->seq = seq;
9340 newsnap->last_created = seq;
9341
9342 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9343 journal_cow_inode(mut, &le->metablob, in);
9344 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9345
9346 mds->server->submit_mdlog_entry(le,
9347 new C_MDC_snaprealm_create_finish(this, mdr,
9348 mut, in),
9349 mdr, __func__);
9350 mds->mdlog->flush();
9351}
9352
9353
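// Walk the realm and its open children, invalidating cached snaps and building one
// MClientSnap per client that holds caps in any affected realm. For UPDATE/DESTROY
// the walk also covers open past children, and for DESTROY their stray inodes are
// re-evaluated afterwards.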
9354void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9355{
9356 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9357
9358 vector<inodeno_t> split_inos;
9359 vector<inodeno_t> split_realms;
9360
9361 if (snapop == CEPH_SNAP_OP_SPLIT) {
9362 // notify clients of update|split
9363 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9364 !p.end(); ++p)
9365 split_inos.push_back((*p)->ino());
9366
9367 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9368 p != in->snaprealm->open_children.end();
9369 ++p)
9370 split_realms.push_back((*p)->inode->ino());
9371 }
9372
9373 bufferlist snapbl;
9374 in->snaprealm->build_snap_trace(snapbl);
9375
9376 set<SnapRealm*> past_children;
9377 map<client_t, MClientSnap*> updates;
9378 list<SnapRealm*> q;
9379 q.push_back(in->snaprealm);
9380 while (!q.empty()) {
9381 SnapRealm *realm = q.front();
9382 q.pop_front();
9383
9384 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9385 realm->invalidate_cached_snaps();
9386
9387 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9388 p != realm->client_caps.end();
9389 ++p) {
9390 assert(!p->second->empty());
9391 if (!nosend && updates.count(p->first) == 0) {
9392 MClientSnap *update = new MClientSnap(snapop);
9393 update->head.split = in->ino();
9394 update->split_inos = split_inos;
9395 update->split_realms = split_realms;
9396 update->bl = snapbl;
9397 updates[p->first] = update;
9398 }
9399 }
9400
9401 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9402 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9403 p != realm->open_past_children.end();
9404 ++p)
9405 past_children.insert(*p);
9406 }
9407
9408 // notify for active children, too.
9409 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9410 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9411 p != realm->open_children.end();
9412 ++p)
9413 q.push_back(*p);
9414 }
9415
9416 if (!nosend)
9417 send_snaps(updates);
9418
9419 // notify past children and their descendants if we update/delete old snapshots
9420 for (set<SnapRealm*>::iterator p = past_children.begin();
9421 p != past_children.end();
9422 ++p)
9423 q.push_back(*p);
9424
9425 while (!q.empty()) {
9426 SnapRealm *realm = q.front();
9427 q.pop_front();
9428
9429 realm->invalidate_cached_snaps();
9430
9431 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9432 p != realm->open_children.end();
9433 ++p) {
9434 if (past_children.count(*p) == 0)
9435 q.push_back(*p);
9436 }
9437
9438 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9439 p != realm->open_past_children.end();
9440 ++p) {
9441 if (past_children.count(*p) == 0) {
9442 q.push_back(*p);
9443 past_children.insert(*p);
9444 }
9445 }
9446 }
9447
9448 if (snapop == CEPH_SNAP_OP_DESTROY) {
9449 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9450 for (set<SnapRealm*>::iterator p = past_children.begin();
9451 p != past_children.end();
9452 ++p)
9453 maybe_eval_stray((*p)->inode, true);
9454 }
9455}
9456
9457void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9458{
9459 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9460
9461 // apply
9462 in->pop_and_dirty_projected_inode(mut->ls);
9463 mut->apply();
9464 mds->locker->drop_locks(mut.get());
9465 mut->cleanup();
9466
9467 // tell table we've committed
9468 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9469
9470 // create
9471 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9472 snapid_t seq;
9473 ::decode(seq, p);
9474
9475 in->open_snaprealm();
9476 in->snaprealm->srnode.seq = seq;
9477 in->snaprealm->srnode.created = seq;
9478 bool ok = in->snaprealm->_open_parents(NULL);
9479 assert(ok);
9480
9481 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9482
9483 /*
9484 static int count = 5;
9485 if (--count == 0)
9486 ceph_abort(); // hack test test **********
9487 */
9488
9489 // done.
9490 mdr->more()->stid = 0; // caller will likely need to reuse this
9491 dispatch_request(mdr);
9492}
9493
9494
9495// -------------------------------------------------------------------------------
9496// STRAYS
9497
9498struct C_MDC_RetryScanStray : public MDCacheContext {
9499 dirfrag_t next;
9500 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9501 void finish(int r) override {
9502 mdcache->scan_stray_dir(next);
9503 }
9504};
9505
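// Resumable scan of the stray directories starting at dirfrag 'next': incomplete
// dirfrags are fetched (retrying via C_MDC_RetryScanStray), each dentry is marked
// STRAY, primaries with nlink == 0 are flagged ORPHAN, and every primary inode is
// passed to maybe_eval_stray().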
9506void MDCache::scan_stray_dir(dirfrag_t next)
9507{
9508 dout(10) << "scan_stray_dir " << next << dendl;
9509
9510 list<CDir*> ls;
9511 for (int i = 0; i < NUM_STRAY; ++i) {
9512 if (strays[i]->ino() < next.ino)
9513 continue;
9514 strays[i]->get_dirfrags(ls);
9515 }
9516
9517 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9518 CDir *dir = *p;
9519 if (dir->dirfrag() < next)
9520 continue;
9521 if (!dir->is_complete()) {
9522 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9523 return;
9524 }
9525 for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
9526 CDentry *dn = q->second;
9527 dn->state_set(CDentry::STATE_STRAY);
9528 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9529 if (dnl->is_primary()) {
9530 CInode *in = dnl->get_inode();
9531 if (in->inode.nlink == 0)
9532 in->state_set(CInode::STATE_ORPHAN);
9533 maybe_eval_stray(in);
9534 }
9535 }
9536 }
9537}
9538
9539void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9540{
9541 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9542 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9543}
9544
9545
9546
9547
9548
9549// ========================================================================================
9550// DISCOVER
9551/*
9552
9553 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9554 to the parent metadata object in the cache (pinning it).
9555
9556 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9557
9558*/
9559
9560void MDCache::_send_discover(discover_info_t& d)
9561{
9562 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9563 d.want_base_dir, d.want_xlocked);
9564 dis->set_tid(d.tid);
9565 mds->send_message_mds(dis, d.mds);
9566}
9567
9568void MDCache::discover_base_ino(inodeno_t want_ino,
9569 MDSInternalContextBase *onfinish,
9570 mds_rank_t from)
9571{
9572 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9573 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9574 discover_info_t& d = _create_discover(from);
9575 d.ino = want_ino;
9576 _send_discover(d);
9577 }
9578 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9579}
9580
9581
9582void MDCache::discover_dir_frag(CInode *base,
9583 frag_t approx_fg,
9584 MDSInternalContextBase *onfinish,
9585 mds_rank_t from)
9586{
9587 if (from < 0)
9588 from = base->authority().first;
9589
9590 dirfrag_t df(base->ino(), approx_fg);
9591 dout(7) << "discover_dir_frag " << df
9592 << " from mds." << from << dendl;
9593
9594 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9595 discover_info_t& d = _create_discover(from);
9596 d.pin_base(base);
9597 d.ino = base->ino();
9598 d.frag = approx_fg;
9599 d.want_base_dir = true;
9600 _send_discover(d);
9601 }
9602
9603 if (onfinish)
9604 base->add_dir_waiter(approx_fg, onfinish);
9605}
9606
9607struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9608 CInode *base;
9609 snapid_t snapid;
9610 filepath path;
9611 mds_rank_t from;
9612 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9613 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9614 void finish(int r) override {
9615 mdcache->discover_path(base, snapid, path, 0, from);
9616 }
9617};
9618
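// Discover a path relative to an inode we hold. If the base has ambiguous auth, wait
// for that to resolve; if we are the auth ourselves, just requeue the dir waiters.
// Otherwise send a discover for the first path component (normally skipped when an
// equivalent discover is already pending) and park the waiter on the dirfrag.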
9619void MDCache::discover_path(CInode *base,
9620 snapid_t snap,
9621 filepath want_path,
9622 MDSInternalContextBase *onfinish,
9623 bool want_xlocked,
9624 mds_rank_t from)
9625{
9626 if (from < 0)
9627 from = base->authority().first;
9628
9629 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9630 << (want_xlocked ? " want_xlocked":"")
9631 << dendl;
9632
9633 if (base->is_ambiguous_auth()) {
9634 dout(10) << " waiting for single auth on " << *base << dendl;
9635 if (!onfinish)
9636 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9637 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9638 return;
9639 } else if (from == mds->get_nodeid()) {
9640 list<MDSInternalContextBase*> finished;
9641 base->take_waiting(CInode::WAIT_DIR, finished);
9642 mds->queue_waiters(finished);
9643 return;
9644 }
9645
9646 frag_t fg = base->pick_dirfrag(want_path[0]);
9647 if ((want_xlocked && want_path.depth() == 1) ||
9648 !base->is_waiting_for_dir(fg) || !onfinish) {
9649 discover_info_t& d = _create_discover(from);
9650 d.ino = base->ino();
9651 d.pin_base(base);
9652 d.frag = fg;
9653 d.snap = snap;
9654 d.want_path = want_path;
9655 d.want_base_dir = true;
9656 d.want_xlocked = want_xlocked;
9657 _send_discover(d);
9658 }
9659
9660 // register + wait
9661 if (onfinish)
9662 base->add_dir_waiter(fg, onfinish);
9663}
9664
9665struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9666 CDir *base;
9667 snapid_t snapid;
9668 filepath path;
9669 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9670 MDCacheContext(c), base(b), snapid(s), path(p) {}
9671 void finish(int r) override {
9672 mdcache->discover_path(base, snapid, path, 0);
9673 }
9674};
9675
9676void MDCache::discover_path(CDir *base,
9677 snapid_t snap,
9678 filepath want_path,
9679 MDSInternalContextBase *onfinish,
9680 bool want_xlocked)
9681{
9682 mds_rank_t from = base->authority().first;
9683
9684 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9685 << (want_xlocked ? " want_xlocked":"")
9686 << dendl;
9687
9688 if (base->is_ambiguous_auth()) {
9689 dout(7) << " waiting for single auth on " << *base << dendl;
9690 if (!onfinish)
9691 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9692 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9693 return;
9694 } else if (from == mds->get_nodeid()) {
9695 list<MDSInternalContextBase*> finished;
9696 base->take_sub_waiting(finished);
9697 mds->queue_waiters(finished);
9698 return;
9699 }
9700
9701 if ((want_xlocked && want_path.depth() == 1) ||
9702 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9703 discover_info_t& d = _create_discover(from);
9704 d.ino = base->ino();
9705 d.pin_base(base->inode);
9706 d.frag = base->get_frag();
9707 d.snap = snap;
9708 d.want_path = want_path;
9709 d.want_base_dir = false;
9710 d.want_xlocked = want_xlocked;
9711 _send_discover(d);
9712 }
9713
9714 // register + wait
9715 if (onfinish)
9716 base->add_dentry_waiter(want_path[0], snap, onfinish);
9717}
9718
9719void MDCache::kick_discovers(mds_rank_t who)
9720{
9721 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9722 p != discovers.end();
9723 ++p) {
9724 if (p->second.mds != who)
9725 continue;
9726 _send_discover(p->second);
9727 }
9728}
9729
9730
9731/* This function DOES put the passed message before returning */
9732void MDCache::handle_discover(MDiscover *dis)
9733{
9734 mds_rank_t whoami = mds->get_nodeid();
9735 mds_rank_t from = mds_rank_t(dis->get_source().num());
9736
9737 assert(from != whoami);
9738
9739 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9740 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9741 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
9742 dis->put();
9743 return;
9744 }
9745
9746 // proceed if the requester is in the REJOIN stage (the request comes from parallel_fetch()).
9747 // delay processing requests from survivors because we may not have chosen lock states yet.
9748 if (!mds->mdsmap->is_rejoin(from)) {
9749 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9750 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9751 return;
9752 }
9753 }
9754
9755
9756 CInode *cur = 0;
9757 MDiscoverReply *reply = new MDiscoverReply(dis);
9758
9759 snapid_t snapid = dis->get_snapid();
9760
9761 // get started.
9762 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9763 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9764 // wants root
9765 dout(7) << "handle_discover from mds." << from
9766 << " wants base + " << dis->get_want().get_path()
9767 << " snap " << snapid
9768 << dendl;
9769
9770 cur = get_inode(dis->get_base_ino());
9771 assert(cur);
9772
9773 // add root
9774 reply->starts_with = MDiscoverReply::INODE;
9775 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9776 dout(10) << "added base " << *cur << dendl;
9777 }
9778 else {
9779 // there's a base inode
9780 cur = get_inode(dis->get_base_ino(), snapid);
9781 if (!cur && snapid != CEPH_NOSNAP) {
9782 cur = get_inode(dis->get_base_ino());
9783 if (cur && !cur->is_multiversion())
9784 cur = NULL; // nope!
9785 }
9786
9787 if (!cur) {
9788 dout(7) << "handle_discover mds." << from
9789 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9790 << dendl;
9791 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9792 reply->set_error_dentry(dis->get_dentry(0));
9793 reply->set_flag_error_dir();
9794 } else if (dis->wants_base_dir()) {
9795 dout(7) << "handle_discover mds." << from
9796 << " wants basedir+" << dis->get_want().get_path()
9797 << " has " << *cur
9798 << dendl;
9799 } else {
9800 dout(7) << "handle_discover mds." << from
9801 << " wants " << dis->get_want().get_path()
9802 << " has " << *cur
9803 << dendl;
9804 }
9805 }
9806
9807 assert(reply);
9808
9809 // add content
9810 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9811 for (unsigned i = 0;
9812 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9813 i++) {
9814
9815 // -- figure out the dir
9816
9817 // is *cur even a dir at all?
9818 if (!cur->is_dir()) {
9819 dout(7) << *cur << " not a dir" << dendl;
9820 reply->set_flag_error_dir();
9821 break;
9822 }
9823
9824 // pick frag
9825 frag_t fg;
9826 if (dis->get_want().depth()) {
9827 // dentry specifies
9828 fg = cur->pick_dirfrag(dis->get_dentry(i));
9829 } else {
9830 // requester explicitly specified the frag
9831 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9832 fg = dis->get_base_dir_frag();
9833 if (!cur->dirfragtree.is_leaf(fg))
9834 fg = cur->dirfragtree[fg.value()];
9835 }
9836 CDir *curdir = cur->get_dirfrag(fg);
9837
9838 if ((!curdir && !cur->is_auth()) ||
9839 (curdir && !curdir->is_auth())) {
9840
9841 /* before:
9842 * ONLY set flag if empty!!
9843 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9844 * resulting in duplicate discovers in flight,
9845 * which can wreak havoc when discovering rename srcdn (which may move)
9846 */
9847
9848 if (reply->is_empty()) {
9849 // only hint if empty.
9850 // someday this could be better, but right now the waiter logic isn't smart enough.
9851
9852 // hint
9853 if (curdir) {
9854 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9855 reply->set_dir_auth_hint(curdir->authority().first);
9856 } else {
9857 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9858 << *cur << dendl;
9859 reply->set_dir_auth_hint(cur->authority().first);
9860 }
9861
9862 // note error dentry, if any
9863 // NOTE: important, as it allows requester to issue an equivalent discover
9864 // to whomever we hint at.
9865 if (dis->get_want().depth() > i)
9866 reply->set_error_dentry(dis->get_dentry(i));
9867 }
9868
9869 break;
9870 }
9871
9872 if (!curdir) { // open dir?
9873 if (cur->is_frozen()) {
9874 if (!reply->is_empty()) {
9875 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
9876 break;
9877 }
9878 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
9879 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9880 reply->put();
9881 return;
9882 }
9883 curdir = cur->get_or_open_dirfrag(this, fg);
9884 } else if (curdir->is_frozen_tree() ||
9885 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
9886 if (!reply->is_empty()) {
9887 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
9888 break;
9889 }
9890 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
9891 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
9892 reply->set_flag_error_dir();
9893 break;
9894 }
9895 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
9896 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9897 reply->put();
9898 return;
9899 }
9900
9901 // add dir
9902 if (curdir->get_version() == 0) {
9903 // fetch newly opened dir
9904 } else if (reply->is_empty() && !dis->wants_base_dir()) {
9905 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
9906 // make sure the base frag is correct, though, in case there was a refragment since the
9907 // original request was sent.
9908 reply->set_base_dir_frag(curdir->get_frag());
9909 } else {
9910 assert(!curdir->is_ambiguous_auth()); // would be frozen.
9911 if (!reply->trace.length())
9912 reply->starts_with = MDiscoverReply::DIR;
9913 replicate_dir(curdir, from, reply->trace);
9914 dout(7) << "handle_discover added dir " << *curdir << dendl;
9915 }
9916
9917 // lookup
9918 CDentry *dn = 0;
9919 if (curdir->get_version() == 0) {
9920 // fetch newly opened dir
9921 assert(!curdir->has_bloom());
9922 } else if (dis->get_want().depth() > 0) {
9923 // lookup dentry
9924 dn = curdir->lookup(dis->get_dentry(i), snapid);
9925 } else
9926 break; // done!
9927
9928 // incomplete dir?
9929 if (!dn) {
9930 if (!curdir->is_complete() &&
9931 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
9932 // readdir
9933 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
9934 if (reply->is_empty()) {
9935 // fetch and wait
9936 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
9937 dis->wants_base_dir() && curdir->get_version() == 0);
9938 reply->put();
9939 return;
9940 } else {
9941 // initiate fetch, but send what we have so far
9942 curdir->fetch(0);
9943 break;
9944 }
9945 }
9946
9947 // send null dentry
9948 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
9949 << *curdir << dendl;
9950 dn = curdir->add_null_dentry(dis->get_dentry(i));
9951 }
9952 assert(dn);
9953
9954 // don't add replica to purging dentry/inode
9955 if (dn->state_test(CDentry::STATE_PURGING)) {
9956 if (reply->is_empty())
9957 reply->set_flag_error_dn(dis->get_dentry(i));
9958 break;
9959 }
9960
9961 CDentry::linkage_t *dnl = dn->get_linkage();
9962
9963 // xlocked dentry?
9964 // ...always block on non-tail items (they are unrelated)
9965 // ...allow xlocked tail discovery _only_ if explicitly requested
9966 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
9967 if (dn->lock.is_xlocked()) {
9968 // is this the last (tail) item in the discover traversal?
9969 if (tailitem && dis->wants_xlocked()) {
9970 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
9971 } else if (reply->is_empty()) {
9972 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
9973 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
9974 reply->put();
9975 return;
9976 } else {
9977 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
9978 break;
9979 }
9980 }
9981
9982 // frozen inode?
9983 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9984 if (tailitem && dis->wants_xlocked()) {
9985 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
9986 } else if (reply->is_empty()) {
9987 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
9988 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
9989 reply->put();
9990 return;
9991 } else {
9992 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
9993 break;
9994 }
9995 }
9996
9997 // add dentry
9998 if (!reply->trace.length())
9999 reply->starts_with = MDiscoverReply::DENTRY;
10000 replicate_dentry(dn, from, reply->trace);
10001 dout(7) << "handle_discover added dentry " << *dn << dendl;
10002
10003 if (!dnl->is_primary()) break; // stop on null or remote link.
10004
10005 // add inode
10006 CInode *next = dnl->get_inode();
10007 assert(next->is_auth());
10008
10009 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10010 dout(7) << "handle_discover added inode " << *next << dendl;
10011
10012 // descend, keep going.
10013 cur = next;
10014 continue;
10015 }
10016
10017 // how did we do?
10018 assert(!reply->is_empty());
10019 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10020 mds->send_message(reply, dis->get_connection());
10021
10022 dis->put();
10023}
10024
10025/* This function DOES put the passed message before returning */
10026void MDCache::handle_discover_reply(MDiscoverReply *m)
10027{
10028 /*
10029 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10030 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10031 m->put();
10032 return;
10033 }
10034 */
10035 dout(7) << "discover_reply " << *m << dendl;
10036 if (m->is_flag_error_dir())
10037 dout(7) << " flag error, dir" << dendl;
10038 if (m->is_flag_error_dn())
10039 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10040
10041 list<MDSInternalContextBase*> finished, error;
10042 mds_rank_t from = mds_rank_t(m->get_source().num());
10043
10044 // starting point
10045 CInode *cur = get_inode(m->get_base_ino());
10046 bufferlist::iterator p = m->trace.begin();
10047
10048 int next = m->starts_with;
10049
10050 // decrement discover counters
10051 if (m->get_tid()) {
10052 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10053 if (p != discovers.end()) {
10054 dout(10) << " found tid " << m->get_tid() << dendl;
10055 discovers.erase(p);
10056 } else {
10057 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10058 }
10059 }
10060
10061 // discover may start with an inode
10062 if (!p.end() && next == MDiscoverReply::INODE) {
10063 cur = add_replica_inode(p, NULL, finished);
10064 dout(7) << "discover_reply got base inode " << *cur << dendl;
10065 assert(cur->is_base());
10066
10067 next = MDiscoverReply::DIR;
10068
10069 // take waiters?
10070 if (cur->is_base() &&
10071 waiting_for_base_ino[from].count(cur->ino())) {
10072 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10073 waiting_for_base_ino[from].erase(cur->ino());
10074 }
10075 }
10076 assert(cur);
10077
10078 // loop over discover results.
10079 // indexes follow each ([[dir] dentry] inode)
10080 // can start, end with any type.
10081 while (!p.end()) {
10082 // dir
10083 frag_t fg;
10084 CDir *curdir = 0;
10085 if (next == MDiscoverReply::DIR) {
10086 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10087 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10088 assert(m->get_wanted_base_dir());
10089 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10090 }
10091 } else {
10092 // note: this can only happen on our first pass around this loop.
10093 if (p.end() && m->is_flag_error_dn()) {
10094 fg = cur->pick_dirfrag(m->get_error_dentry());
10095 curdir = cur->get_dirfrag(fg);
10096 } else
10097 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10098 }
10099
10100 if (p.end())
10101 break;
10102
10103 // dentry
10104 CDentry *dn = add_replica_dentry(p, curdir, finished);
10105
10106 if (p.end())
10107 break;
10108
10109 // inode
10110 cur = add_replica_inode(p, dn, finished);
10111
10112 next = MDiscoverReply::DIR;
10113 }
10114
10115 // dir error?
10116 // or dir_auth hint?
10117 if (m->is_flag_error_dir() && !cur->is_dir()) {
10118 // not a dir.
10119 cur->take_waiting(CInode::WAIT_DIR, error);
10120 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10121 mds_rank_t who = m->get_dir_auth_hint();
10122 if (who == mds->get_nodeid()) who = -1;
10123 if (who >= 0)
10124 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10125
10126
10127 if (m->get_wanted_base_dir()) {
10128 frag_t fg = m->get_base_dir_frag();
10129 CDir *dir = cur->get_dirfrag(fg);
10130
10131 if (cur->is_waiting_for_dir(fg)) {
10132 if (cur->is_auth())
10133 cur->take_waiting(CInode::WAIT_DIR, finished);
10134 else if (dir || !cur->dirfragtree.is_leaf(fg))
10135 cur->take_dir_waiting(fg, finished);
10136 else
10137 discover_dir_frag(cur, fg, 0, who);
10138 } else
10139 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10140 }
10141
10142 // try again?
10143 if (m->get_error_dentry().length()) {
10144 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10145 CDir *dir = cur->get_dirfrag(fg);
10146 // wanted a dentry
10147 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10148 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10149 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10150 m->get_wanted_snapid(), finished);
10151 } else {
10152 filepath relpath(m->get_error_dentry(), 0);
10153 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10154 }
10155 } else
10156 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10157 << m->get_error_dentry() << dendl;
10158 }
10159 } else if (m->is_flag_error_dn()) {
10160 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10161 CDir *dir = cur->get_dirfrag(fg);
10162 if (dir) {
10163 if (dir->is_auth()) {
10164 dir->take_sub_waiting(finished);
10165 } else {
10166 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10167 m->get_wanted_snapid(), error);
10168 }
10169 }
10170 }
10171
10172 // waiters
10173 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10174 mds->queue_waiters(finished);
10175
10176 // done
10177 m->put();
10178}
10179
10180
10181
10182// ----------------------------
10183// REPLICAS
10184
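// Decode a replicated dirfrag under 'diri'. An existing replica is just refreshed
// with the new nonce; a new one may force the frag to a leaf of the fragtree, is
// added as a non-auth CDir, and becomes a subtree bound when its authority differs
// from the parent inode's.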
10185CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10186 list<MDSInternalContextBase*>& finished)
10187{
10188 dirfrag_t df;
10189 ::decode(df, p);
10190
10191 assert(diri->ino() == df.ino);
10192
10193 // add it (_replica_)
10194 CDir *dir = diri->get_dirfrag(df.frag);
10195
10196 if (dir) {
10197 // had replica. update w/ new nonce.
10198 dir->decode_replica(p);
10199 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10200 } else {
10201 // force frag to leaf in the diri tree
10202 if (!diri->dirfragtree.is_leaf(df.frag)) {
10203 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10204 << diri->dirfragtree << dendl;
10205 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10206 }
10207
10208 // add replica.
10209 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10210 dir->decode_replica(p);
10211
10212 // is this a dir_auth delegation boundary?
10213 if (from != diri->authority().first ||
10214 diri->is_ambiguous_auth() ||
10215 diri->is_base())
10216 adjust_subtree_auth(dir, from);
10217
10218 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10219
10220 // get waiters
10221 diri->take_dir_waiting(df.frag, finished);
10222 }
10223
10224 return dir;
10225}
10226
10227CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10228{
10229 string name;
10230 snapid_t last;
10231 ::decode(name, p);
10232 ::decode(last, p);
10233
10234 CDentry *dn = dir->lookup(name, last);
10235
10236 // have it?
10237 if (dn) {
10238 dn->decode_replica(p, false);
10239 dout(7) << "add_replica_dentry had " << *dn << dendl;
10240 } else {
10241 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10242 dn->decode_replica(p, true);
10243 dout(7) << "add_replica_dentry added " << *dn << dendl;
10244 }
10245
10246 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10247
10248 return dn;
10249}
10250
10251CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10252{
10253 inodeno_t ino;
10254 snapid_t last;
10255 ::decode(ino, p);
10256 ::decode(last, p);
10257 CInode *in = get_inode(ino, last);
10258 if (!in) {
10259 in = new CInode(this, false, 1, last);
10260 in->decode_replica(p, true);
10261 add_inode(in);
10262 if (in->ino() == MDS_INO_ROOT)
10263 in->inode_auth.first = 0;
10264 else if (in->is_mdsdir())
10265 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10266 dout(10) << "add_replica_inode added " << *in << dendl;
10267 if (dn) {
10268 assert(dn->get_linkage()->is_null());
10269 dn->dir->link_primary_inode(dn, in);
10270 }
10271 } else {
10272 in->decode_replica(p, false);
10273 dout(10) << "add_replica_inode had " << *in << dendl;
10274 }
10275
10276 if (dn) {
10277 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10278 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10279 }
10280
10281 return in;
10282}
10283
10284
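// Bundle everything a peer needs to replicate a stray dentry, in this order: our
// mdsdir inode, the mdsdir dirfrag, the stray directory's dentry and inode, the
// stray dirfrag, and finally the stray dentry itself. add_replica_stray() decodes
// in exactly the same order.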
10285void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10286{
10287 uint64_t features = mds->mdsmap->get_up_features();
10288 replicate_inode(get_myin(), who, bl, features);
10289 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10290 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10291 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10292 replicate_dir(straydn->get_dir(), who, bl);
10293 replicate_dentry(straydn, who, bl);
10294}
10295
10296CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10297{
10298 list<MDSInternalContextBase*> finished;
10299 bufferlist::iterator p = bl.begin();
10300
10301 CInode *mdsin = add_replica_inode(p, NULL, finished);
10302 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10303 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10304 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10305 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10306 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10307 if (!finished.empty())
10308 mds->queue_waiters(finished);
10309
10310 return straydn;
10311}
10312
10313
10314int MDCache::send_dir_updates(CDir *dir, bool bcast)
10315{
10316 // this is an FYI, re: replication
10317
10318 set<mds_rank_t> who;
10319 if (bcast) {
10320 mds->get_mds_map()->get_active_mds_set(who);
10321 } else {
10322 for (const auto &p : dir->get_replicas()) {
10323 who.insert(p.first);
10324 }
10325 }
10326
10327 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10328
10329 filepath path;
10330 dir->inode->make_path(path);
10331
10332 mds_rank_t whoami = mds->get_nodeid();
10333 for (set<mds_rank_t>::iterator it = who.begin();
10334 it != who.end();
10335 ++it) {
10336 if (*it == whoami) continue;
10337 //if (*it == except) continue;
10338 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10339
10340 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10341 dir->dirfrag(),
10342 dir->dir_rep,
10343 dir->dir_rep_by,
10344 path,
10345 bcast),
10346 *it);
10347 }
10348
10349 return 0;
10350}
10351
10352/* This function DOES put the passed message before returning */
10353void MDCache::handle_dir_update(MDirUpdate *m)
10354{
10355 dirfrag_t df = m->get_dirfrag();
10356 CDir *dir = get_dirfrag(df);
10357 if (!dir) {
10358 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10359
10360 // discover it?
10361 if (m->should_discover()) {
10362 // only try once!
10363 // this is key to avoid a fragtree update race, among other things.
10364 m->inc_tried_discover();
10365 vector<CDentry*> trace;
10366 CInode *in;
10367 filepath path = m->get_path();
10368 dout(5) << "trying discover on dir_update for " << path << dendl;
10369 MDRequestRef null_ref;
10370 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10371 if (r > 0)
10372 return;
10373 if (r == 0 &&
10374 in->ino() == df.ino &&
10375 in->get_approx_dirfrag(df.frag) == NULL) {
10376 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10377 return;
10378 }
10379 }
10380
10381 m->put();
10382 return;
10383 }
10384
10385 if (!m->has_tried_discover()) {
10386 // Update if it already existed. Otherwise it was updated by the discover reply.
10387 dout(5) << "dir_update on " << *dir << dendl;
10388 dir->dir_rep = m->get_dir_rep();
10389 dir->dir_rep_by = m->get_dir_rep_by();
10390 }
10391
10392 // done
10393 m->put();
10394}
10395
10396
10397
10398
10399
10400// LINK
10401
10402void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10403{
10404 dout(7) << "send_dentry_link " << *dn << dendl;
10405
10406 CDir *subtree = get_subtree_root(dn->get_dir());
10407 for (const auto &p : dn->get_replicas()) {
10408 // don't tell (rename) witnesses; they already know
10409 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10410 continue;
10411 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10412 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10413 rejoin_gather.count(p.first)))
7c673cae
FG
10414 continue;
10415 CDentry::linkage_t *dnl = dn->get_linkage();
10416 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
10417 dn->name, dnl->is_primary());
10418 if (dnl->is_primary()) {
10419 dout(10) << " primary " << *dnl->get_inode() << dendl;
10420 replicate_inode(dnl->get_inode(), p.first, m->bl,
10421 mds->mdsmap->get_up_features());
10422 } else if (dnl->is_remote()) {
10423 inodeno_t ino = dnl->get_remote_ino();
10424 __u8 d_type = dnl->get_remote_d_type();
10425 dout(10) << " remote " << ino << " " << d_type << dendl;
10426 ::encode(ino, m->bl);
10427 ::encode(d_type, m->bl);
10428 } else
10429 ceph_abort(); // aie, bad caller!
10430 mds->send_message_mds(m, p.first);
10431 }
10432}
10433
10434/* This function DOES put the passed message before returning */
10435void MDCache::handle_dentry_link(MDentryLink *m)
10436{
10437
10438 CDentry *dn = NULL;
10439 CDir *dir = get_dirfrag(m->get_dirfrag());
10440 if (!dir) {
10441 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10442 } else {
10443 dn = dir->lookup(m->get_dn());
10444 if (!dn) {
10445 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10446 } else {
10447 dout(7) << "handle_dentry_link on " << *dn << dendl;
10448 CDentry::linkage_t *dnl = dn->get_linkage();
10449
10450 assert(!dn->is_auth());
10451 assert(dnl->is_null());
10452 }
10453 }
10454
10455 bufferlist::iterator p = m->bl.begin();
10456 list<MDSInternalContextBase*> finished;
10457 if (dn) {
10458 if (m->get_is_primary()) {
10459 // primary link.
10460 add_replica_inode(p, dn, finished);
10461 } else {
10462 // remote link, easy enough.
10463 inodeno_t ino;
10464 __u8 d_type;
10465 ::decode(ino, p);
10466 ::decode(d_type, p);
10467 dir->link_remote_inode(dn, ino, d_type);
10468 }
10469 } else {
10470 ceph_abort();
10471 }
10472
10473 if (!finished.empty())
10474 mds->queue_waiters(finished);
10475
10476 m->put();
10477 return;
10478}
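// [Illustrative sketch, not part of the original MDCache.cc]
// send_dentry_link()/handle_dentry_link() above distinguish primary links
// (the whole inode is replicated) from remote links (only an ino and a d_type
// travel in the message). The toy below shows a remote-link payload round
// trip with a plain little-endian memcpy; the real code uses ::encode/::decode
// on a bufferlist, so the wire format here is a simplification.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // encode side: a remote link is just (ino, d_type)
  uint64_t ino = 0x10000000123ULL;                 // hypothetical inode number
  uint8_t d_type = 8;                              // DT_REG-style value, illustrative
  std::vector<uint8_t> bl(sizeof(ino) + sizeof(d_type));
  std::memcpy(bl.data(), &ino, sizeof(ino));
  std::memcpy(bl.data() + sizeof(ino), &d_type, sizeof(d_type));

  // decode side: the replica reads the pair back and links the remote inode
  uint64_t ino2;
  uint8_t d_type2;
  std::memcpy(&ino2, bl.data(), sizeof(ino2));
  std::memcpy(&d_type2, bl.data() + sizeof(ino2), sizeof(d_type2));
  std::cout << std::hex << "remote link to ino 0x" << ino2
            << std::dec << " d_type " << int(d_type2) << "\n";
  return 0;
}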
10479
10480
10481// UNLINK
10482
10483void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10484{
10485 dout(10) << "send_dentry_unlink " << *dn << dendl;
10486 // share unlink news with replicas
10487 set<mds_rank_t> replicas;
10488 dn->list_replicas(replicas);
10489 if (straydn)
10490 straydn->list_replicas(replicas);
10491 for (set<mds_rank_t>::iterator it = replicas.begin();
10492 it != replicas.end();
10493 ++it) {
10494 // don't tell (rmdir) witnesses; they already know
10495 if (mdr.get() && mdr->more()->witnessed.count(*it))
10496 continue;
10497
10498 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10499 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10500 rejoin_gather.count(*it)))
10501 continue;
10502
10503 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
10504 if (straydn)
10505 replicate_stray(straydn, *it, unlink->straybl);
10506 mds->send_message_mds(unlink, *it);
10507 }
10508}
10509
10510/* This function DOES put the passed message before returning */
10511void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10512{
10513 // straydn
10514 CDentry *straydn = NULL;
10515 if (m->straybl.length())
10516 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10517
10518 CDir *dir = get_dirfrag(m->get_dirfrag());
10519 if (!dir) {
10520 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10521 } else {
10522 CDentry *dn = dir->lookup(m->get_dn());
10523 if (!dn) {
10524 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10525 } else {
10526 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10527 CDentry::linkage_t *dnl = dn->get_linkage();
10528
10529 // open inode?
10530 if (dnl->is_primary()) {
10531 CInode *in = dnl->get_inode();
10532 dn->dir->unlink_inode(dn);
10533 assert(straydn);
10534 straydn->dir->link_primary_inode(straydn, in);
10535
10536 // in->first is lazily updated on replica; drag it forward so
10537 // that we always keep it in sync with the dnq
10538 assert(straydn->first >= in->first);
10539 in->first = straydn->first;
10540
10541 // update subtree map?
10542 if (in->is_dir())
10543 adjust_subtree_after_rename(in, dir, false);
10544
10545 // send caps to auth (if we're not already)
10546 if (in->is_any_caps() &&
10547 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10548 migrator->export_caps(in);
10549
7c673cae
FG
10550 straydn = NULL;
10551 } else {
10552 assert(!straydn);
10553 assert(dnl->is_remote());
10554 dn->dir->unlink_inode(dn);
10555 }
10556 assert(dnl->is_null());
7c673cae
FG
10557 }
10558 }
10559
10560 // race with trim_dentry()
10561 if (straydn) {
10562 assert(straydn->get_num_ref() == 0);
10563 assert(straydn->get_linkage()->is_null());
10564 map<mds_rank_t, MCacheExpire*> expiremap;
10565 trim_dentry(straydn, expiremap);
10566 send_expire_messages(expiremap);
10567 }
10568
10569 m->put();
10570 return;
10571}
10572
10573
10574
10575
10576
10577
10578// ===================================================================
10579
10580
10581
10582// ===================================================================
10583// FRAGMENT
10584
10585
10586/**
10587 * adjust_dir_fragments -- adjust fragmentation for a directory
10588 *
10589 * @param diri directory inode
10590 * @param basefrag base fragment
10591 * @param bits bit adjustment. positive for split, negative for merge.
10592 */
10593void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10594 list<CDir*>& resultfrags,
10595 list<MDSInternalContextBase*>& waiters,
10596 bool replay)
10597{
10598 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10599 << " on " << *diri << dendl;
10600
10601 list<CDir*> srcfrags;
10602 diri->get_dirfrags_under(basefrag, srcfrags);
10603
10604 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10605}
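// [Illustrative sketch, not part of the original MDCache.cc]
// As the header comment above says, a positive `bits` splits the base
// fragment into 2^bits children and a negative `bits` merges descendants
// back into the base. The toy below only demonstrates that arithmetic; it
// does not use the real frag_t API and the numbers are made up.
#include <cstdio>

int main() {
  int bits = 2;                   // positive: split
  if (bits > 0) {
    int nfrags = 1 << bits;       // splitting by 2 bits yields 4 result fragments
    for (int i = 0; i < nfrags; ++i)
      std::printf("result frag %d of %d under the base fragment\n", i + 1, nfrags);
  } else if (bits < 0) {
    std::printf("merge: collapse 2^%d sibling frags back into the base fragment\n", -bits);
  }
  return 0;
}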
10606
10607CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10608{
10609 CDir *dir = diri->get_dirfrag(fg);
10610 if (dir)
10611 return dir;
10612
10613 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10614
10615 list<CDir*> src, result;
10616 list<MDSInternalContextBase*> waiters;
10617
10618 // split a parent?
10619 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10620 while (1) {
10621 CDir *pdir = diri->get_dirfrag(parent);
10622 if (pdir) {
10623 int split = fg.bits() - parent.bits();
10624 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10625 src.push_back(pdir);
10626 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10627 dir = diri->get_dirfrag(fg);
10628 if (dir) {
10629 dout(10) << "force_dir_fragment result " << *dir << dendl;
10630 break;
10631 }
10632 }
10633 if (parent == frag_t())
10634 break;
10635 frag_t last = parent;
10636 parent = parent.parent();
10637 dout(10) << " " << last << " parent is " << parent << dendl;
10638 }
10639
10640 if (!dir) {
10641 // hoover up things under fg?
10642 diri->get_dirfrags_under(fg, src);
10643 if (src.empty()) {
10644 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10645 } else {
10646 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10647 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10648 dir = result.front();
10649 dout(10) << "force_dir_fragment result " << *dir << dendl;
10650 }
10651 }
10652 if (!replay)
10653 mds->queue_waiters(waiters);
10654 return dir;
10655}
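// [Illustrative sketch, not part of the original MDCache.cc]
// force_dir_fragment() above materialises a missing fragment either by
// splitting the nearest existing ancestor or by combining existing frags
// underneath it. The toy below models fragments as bit-prefix strings and
// shows only the ancestor walk; the set contents and the string encoding
// are invented for illustration and are not the frag_t representation.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> existing = {"0", "10"};   // hypothetical current leaf frags
  std::string fg = "101";                         // the frag we want to force

  std::string parent = fg;
  while (!parent.empty() && !existing.count(parent))
    parent.pop_back();                            // like parent = parent.parent()

  if (existing.count(parent))
    std::cout << "split ancestor '" << parent << "' by "
              << fg.size() - parent.size() << " bits to obtain '" << fg << "'\n";
  else
    std::cout << "no ancestor found; would combine frags under '" << fg << "'\n";
  return 0;
}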
10656
10657void MDCache::adjust_dir_fragments(CInode *diri,
10658 list<CDir*>& srcfrags,
10659 frag_t basefrag, int bits,
10660 list<CDir*>& resultfrags,
10661 list<MDSInternalContextBase*>& waiters,
10662 bool replay)
10663{
10664 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10665 << " srcfrags " << srcfrags
10666 << " on " << *diri << dendl;
10667
10668 // adjust fragtree
10669 // yuck. we may have discovered the inode while it was being fragmented.
10670 if (!diri->dirfragtree.is_leaf(basefrag))
10671 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10672
10673 if (bits > 0)
10674 diri->dirfragtree.split(basefrag, bits);
10675 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10676
10677 if (srcfrags.empty())
10678 return;
10679
10680 // split
10681 CDir *parent_dir = diri->get_parent_dir();
10682 CDir *parent_subtree = 0;
10683 if (parent_dir)
10684 parent_subtree = get_subtree_root(parent_dir);
10685
10686 if (bits > 0) {
10687 // SPLIT
10688 assert(srcfrags.size() == 1);
10689 CDir *dir = srcfrags.front();
10690
10691 dir->split(bits, resultfrags, waiters, replay);
10692
10693 // did i change the subtree map?
10694 if (dir->is_subtree_root()) {
10695 // new frags are now separate subtrees
10696 for (list<CDir*>::iterator p = resultfrags.begin();
10697 p != resultfrags.end();
10698 ++p)
10699 subtrees[*p].clear(); // new frag is now its own subtree
10700
10701 // was i a bound?
10702 if (parent_subtree) {
10703 assert(subtrees[parent_subtree].count(dir));
10704 subtrees[parent_subtree].erase(dir);
10705 for (list<CDir*>::iterator p = resultfrags.begin();
10706 p != resultfrags.end();
10707 ++p) {
10708 assert((*p)->is_subtree_root());
10709 subtrees[parent_subtree].insert(*p);
10710 }
10711 }
10712
10713 // adjust my bounds.
10714 set<CDir*> bounds;
10715 bounds.swap(subtrees[dir]);
10716 subtrees.erase(dir);
10717 for (set<CDir*>::iterator p = bounds.begin();
10718 p != bounds.end();
10719 ++p) {
10720 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10721 subtrees[frag].insert(*p);
10722 }
10723
10724 show_subtrees(10);
10725
10726 // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
10727 dir->dir_auth = CDIR_AUTH_DEFAULT;
10728 }
10729
10730 diri->close_dirfrag(dir->get_frag());
10731
10732 } else {
10733 // MERGE
10734
10735 // are my constituent bits subtrees? if so, i will be too.
10736 // (it's all or none, actually.)
31f18b77
FG
10737 bool any_subtree = false;
10738 for (CDir *dir : srcfrags) {
7c673cae 10739 if (dir->is_subtree_root()) {
31f18b77
FG
10740 any_subtree = true;
10741 break;
10742 }
10743 }
10744 set<CDir*> new_bounds;
10745 if (any_subtree) {
10746 for (CDir *dir : srcfrags) {
10747 // this simplifies the code that finds subtrees underneath the dirfrag
10748 if (!dir->is_subtree_root()) {
10749 dir->state_set(CDir::STATE_AUXSUBTREE);
10750 adjust_subtree_auth(dir, mds->get_nodeid());
10751 }
10752 }
10753
10754 for (CDir *dir : srcfrags) {
10755 assert(dir->is_subtree_root());
7c673cae 10756 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
10757 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10758 set<CDir*>::iterator r = q->second.begin();
10759 while (r != subtrees[dir].end()) {
10760 new_bounds.insert(*r);
10761 subtrees[dir].erase(r++);
10762 }
10763 subtrees.erase(q);
31f18b77 10764
7c673cae
FG
10765 // remove myself as my parent's bound
10766 if (parent_subtree)
10767 subtrees[parent_subtree].erase(dir);
10768 }
10769 }
10770
10771 // merge
10772 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10773 f->merge(srcfrags, waiters, replay);
7c673cae 10774
31f18b77 10775 if (any_subtree) {
7c673cae
FG
10776 assert(f->is_subtree_root());
10777 subtrees[f].swap(new_bounds);
10778 if (parent_subtree)
10779 subtrees[parent_subtree].insert(f);
10780
10781 show_subtrees(10);
10782 }
10783
10784 resultfrags.push_back(f);
10785 }
10786}
10787
10788
10789class C_MDC_FragmentFrozen : public MDSInternalContext {
10790 MDCache *mdcache;
10791 MDRequestRef mdr;
10792public:
10793 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10794 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10795 void finish(int r) override {
10796 mdcache->fragment_frozen(mdr, r);
10797 }
10798};
10799
10800bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10801{
10802 if (is_readonly()) {
10803 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10804 return false;
10805 }
10806 if (mds->is_cluster_degraded()) {
10807 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10808 return false;
10809 }
10810 if (diri->get_parent_dir() &&
10811 diri->get_parent_dir()->get_inode()->is_stray()) {
10812 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10813 return false;
10814 }
10815 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10816 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10817 return false;
10818 }
10819
10820 if (diri->scrub_is_in_progress()) {
10821 dout(7) << "can_fragment: scrub in progress" << dendl;
10822 return false;
10823 }
10824
10825 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10826 CDir *dir = *p;
10827 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10828 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10829 return false;
10830 }
10831 if (!dir->is_auth()) {
10832 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10833 return false;
10834 }
10835 if (dir->is_bad()) {
10836 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10837 return false;
10838 }
10839 if (dir->is_frozen() ||
10840 dir->is_freezing()) {
10841 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
10842 return false;
10843 }
10844 }
10845
10846 return true;
10847}
10848
10849void MDCache::split_dir(CDir *dir, int bits)
10850{
10851 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
10852 assert(dir->is_auth());
10853 CInode *diri = dir->inode;
10854
10855 list<CDir*> dirs;
10856 dirs.push_back(dir);
10857
10858 if (!can_fragment(diri, dirs)) {
10859 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
10860 return;
10861 }
10862
31f18b77
FG
10863 if (dir->frag.bits() + bits > 24) {
10864 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
10865 return;
10866 }
10867
7c673cae
FG
10868 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10869 mdr->more()->fragment_base = dir->dirfrag();
10870
10871 assert(fragments.count(dir->dirfrag()) == 0);
10872 fragment_info_t& info = fragments[dir->dirfrag()];
10873 info.mdr = mdr;
10874 info.dirs.push_back(dir);
10875 info.bits = bits;
10876 info.last_cum_auth_pins_change = ceph_clock_now();
10877
10878 fragment_freeze_dirs(dirs);
10879 // initial mark+complete pass
10880 fragment_mark_and_complete(mdr);
10881}
10882
10883void MDCache::merge_dir(CInode *diri, frag_t frag)
10884{
10885 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
10886
10887 list<CDir*> dirs;
10888 if (!diri->get_dirfrags_under(frag, dirs)) {
10889 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
10890 return;
10891 }
10892
10893 if (diri->dirfragtree.is_leaf(frag)) {
10894 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
10895 return;
10896 }
10897
10898 if (!can_fragment(diri, dirs))
10899 return;
10900
10901 CDir *first = dirs.front();
10902 int bits = first->get_frag().bits() - frag.bits();
10903 dout(10) << " we are merging by " << bits << " bits" << dendl;
10904
10905 dirfrag_t basedirfrag(diri->ino(), frag);
10906 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
10907 mdr->more()->fragment_base = basedirfrag;
10908
10909 assert(fragments.count(basedirfrag) == 0);
10910 fragment_info_t& info = fragments[basedirfrag];
10911 info.mdr = mdr;
10912 info.dirs = dirs;
10913 info.bits = -bits;
10914 info.last_cum_auth_pins_change = ceph_clock_now();
10915
10916 fragment_freeze_dirs(dirs);
10917 // initial mark+complete pass
10918 fragment_mark_and_complete(mdr);
10919}
10920
10921void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
10922{
10923 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10924 CDir *dir = *p;
10925 dir->auth_pin(dir); // until we mark and complete them
10926 dir->state_set(CDir::STATE_FRAGMENTING);
10927 dir->freeze_dir();
10928 assert(dir->is_freezing_dir());
10929 }
10930}
10931
10932class C_MDC_FragmentMarking : public MDCacheContext {
10933 MDRequestRef mdr;
10934public:
10935 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
10936 void finish(int r) override {
10937 mdcache->fragment_mark_and_complete(mdr);
10938 }
10939};
10940
10941void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
10942{
10943 dirfrag_t basedirfrag = mdr->more()->fragment_base;
10944 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
10945 if (it == fragments.end() || it->second.mdr != mdr) {
10946 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
10947 request_finish(mdr);
10948 return;
10949 }
10950
10951 fragment_info_t& info = it->second;
10952 CInode *diri = info.dirs.front()->get_inode();
10953 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
10954
10955 MDSGatherBuilder gather(g_ceph_context);
10956
10957 for (list<CDir*>::iterator p = info.dirs.begin();
10958 p != info.dirs.end();
10959 ++p) {
10960 CDir *dir = *p;
10961
10962 bool ready = true;
10963 if (!dir->is_complete()) {
10964 dout(15) << " fetching incomplete " << *dir << dendl;
10965 dir->fetch(gather.new_sub(), true); // ignore authpinnability
10966 ready = false;
10967 } else if (dir->get_frag() == frag_t()) {
10968 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
10969 // the operation. To avoid CDir::fetch() complaining about missing object,
10970 // we commit new dirfrag first.
10971 if (dir->state_test(CDir::STATE_CREATING)) {
10972 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
10973 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
10974 ready = false;
10975 } else if (dir->is_new()) {
10976 dout(15) << " committing new " << *dir << dendl;
10977 assert(dir->is_dirty());
10978 dir->commit(0, gather.new_sub(), true);
10979 ready = false;
10980 }
10981 }
10982 if (!ready)
10983 continue;
10984
10985 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
10986 dout(15) << " marking " << *dir << dendl;
10987 for (CDir::map_t::iterator p = dir->items.begin();
10988 p != dir->items.end();
10989 ++p) {
10990 CDentry *dn = p->second;
10991 dn->get(CDentry::PIN_FRAGMENTING);
10992 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
10993 dn->state_set(CDentry::STATE_FRAGMENTING);
10994 }
10995 dir->state_set(CDir::STATE_DNPINNEDFRAG);
10996 dir->auth_unpin(dir);
10997 } else {
10998 dout(15) << " already marked " << *dir << dendl;
10999 }
11000 }
11001 if (gather.has_subs()) {
11002 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11003 gather.activate();
11004 return;
11005 }
11006
11007 for (list<CDir*>::iterator p = info.dirs.begin();
11008 p != info.dirs.end();
11009 ++p) {
11010 CDir *dir = *p;
11011 if (!dir->is_frozen_dir()) {
11012 assert(dir->is_freezing_dir());
11013 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11014 }
11015 }
11016 if (gather.has_subs()) {
11017 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11018 gather.activate();
11019 // flush log so that request auth_pins are retired
11020 mds->mdlog->flush();
11021 return;
11022 }
11023
11024 fragment_frozen(mdr, 0);
11025}
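// [Illustrative sketch, not part of the original MDCache.cc]
// fragment_mark_and_complete() above leans on the gather/finisher pattern:
// start one sub-context per outstanding fetch/commit/freeze and run a single
// finisher once every sub has completed. The toy Gather below captures that
// idea in a few lines; it is not the MDSGatherBuilder API, just a model of it.
#include <functional>
#include <iostream>
#include <vector>

struct Gather {
  int pending = 0;
  std::function<void()> finisher;
  std::function<void()> new_sub() {
    ++pending;
    return [this] {
      if (--pending == 0 && finisher)
        finisher();                    // last sub completed: run the finisher
    };
  }
};

int main() {
  Gather gather;
  std::vector<std::function<void()>> completions;
  for (int i = 0; i < 3; ++i)
    completions.push_back(gather.new_sub());   // e.g. one per dirfrag being fetched
  gather.finisher = [] { std::cout << "all dirfrags ready, continue fragmenting\n"; };
  for (auto &c : completions)
    c();                                       // pretend the async operations finished
  return 0;
}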
11026
11027void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11028{
11029 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11030 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11031 CDir *dir = *p;
11032 dout(10) << " frag " << *dir << dendl;
11033
11034 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11035 dir->state_clear(CDir::STATE_FRAGMENTING);
11036
11037 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11038 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11039
11040 for (CDir::map_t::iterator p = dir->items.begin();
11041 p != dir->items.end();
11042 ++p) {
11043 CDentry *dn = p->second;
11044 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11045 dn->state_clear(CDentry::STATE_FRAGMENTING);
11046 dn->put(CDentry::PIN_FRAGMENTING);
11047 }
11048 } else {
11049 dir->auth_unpin(dir);
11050 }
11051
11052 dir->unfreeze_dir();
11053 }
11054}
11055
11056bool MDCache::fragment_are_all_frozen(CDir *dir)
11057{
11058 assert(dir->is_frozen_dir());
11059 map<dirfrag_t,fragment_info_t>::iterator p;
11060 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11061 p != fragments.end() && p->first.ino == dir->ino();
11062 ++p) {
11063 if (p->first.frag.contains(dir->get_frag()))
11064 return p->second.all_frozen;
11065 }
11066 ceph_abort();
11067 return false;
11068}
11069
11070void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11071{
11072 map<dirfrag_t,fragment_info_t>::iterator p;
11073 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11074 p != fragments.end() && p->first.ino == dir->ino();
11075 ++p) {
11076 if (p->first.frag.contains(dir->get_frag())) {
11077 p->second.num_remote_waiters++;
11078 return;
11079 }
11080 }
11081 ceph_abort();
11082}
11083
11084void MDCache::find_stale_fragment_freeze()
11085{
11086 dout(10) << "find_stale_fragment_freeze" << dendl;
11087 // see comment in Migrator::find_stale_export_freeze()
11088 utime_t now = ceph_clock_now();
11089 utime_t cutoff = now;
11090 cutoff -= g_conf->mds_freeze_tree_timeout;
11091
11092 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11093 p != fragments.end(); ) {
11094 dirfrag_t df = p->first;
11095 fragment_info_t& info = p->second;
11096 ++p;
11097 if (info.all_frozen)
11098 continue;
11099 CDir *dir;
11100 int total_auth_pins = 0;
11101 for (list<CDir*>::iterator q = info.dirs.begin();
11102 q != info.dirs.end();
11103 ++q) {
11104 dir = *q;
11105 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11106 total_auth_pins = -1;
11107 break;
11108 }
11109 if (dir->is_frozen_dir())
11110 continue;
11111 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11112 }
11113 if (total_auth_pins < 0)
11114 continue;
11115 if (info.last_cum_auth_pins != total_auth_pins) {
11116 info.last_cum_auth_pins = total_auth_pins;
11117 info.last_cum_auth_pins_change = now;
11118 continue;
11119 }
11120 if (info.last_cum_auth_pins_change >= cutoff)
11121 continue;
11122 dir = info.dirs.front();
11123 if (info.num_remote_waiters > 0 ||
11124 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11125 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11126 list<CDir*> dirs;
11127 info.dirs.swap(dirs);
11128 fragments.erase(df);
11129 fragment_unmark_unfreeze_dirs(dirs);
11130 }
11131 }
11132}
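// [Illustrative sketch, not part of the original MDCache.cc]
// find_stale_fragment_freeze() above cancels a fragment operation when the
// cumulative auth-pin count has not moved for longer than the freeze timeout.
// The toy below shows that cutoff arithmetic with std::chrono; the 30s/45s
// values are invented and merely stand in for mds_freeze_tree_timeout and the
// recorded last_cum_auth_pins_change time.
#include <chrono>
#include <iostream>

int main() {
  using clock = std::chrono::steady_clock;
  auto timeout = std::chrono::seconds(30);            // stands in for mds_freeze_tree_timeout
  auto now = clock::now();
  auto cutoff = now - timeout;
  auto last_change = now - std::chrono::seconds(45);  // pretend nothing progressed for 45s

  if (last_change < cutoff)
    std::cout << "freeze looks stale, cancel fragmenting and unfreeze the dirs\n";
  else
    std::cout << "still making progress, keep waiting\n";
  return 0;
}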
11133
11134class C_MDC_FragmentPrep : public MDCacheLogContext {
11135 MDRequestRef mdr;
11136public:
11137 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11138 void finish(int r) override {
11139 mdcache->_fragment_logged(mdr);
11140 }
11141};
11142
11143class C_MDC_FragmentStore : public MDCacheContext {
11144 MDRequestRef mdr;
11145public:
11146 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11147 void finish(int r) override {
11148 mdcache->_fragment_stored(mdr);
11149 }
11150};
11151
11152class C_MDC_FragmentCommit : public MDCacheLogContext {
11153 dirfrag_t basedirfrag;
11154 list<CDir*> resultfrags;
11155public:
11156 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11157 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11158 void finish(int r) override {
11159 mdcache->_fragment_committed(basedirfrag, resultfrags);
11160 }
11161};
11162
11163class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11164 dirfrag_t basedirfrag;
11165 list<CDir*> resultfrags;
11166public:
11167 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11168 MDCacheIOContext(m), basedirfrag(f) {
11169 resultfrags.swap(l);
11170 }
11171 void finish(int r) override {
11172 assert(r == 0 || r == -ENOENT);
11173 mdcache->_fragment_finish(basedirfrag, resultfrags);
11174 }
11175};
11176
11177void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11178{
11179 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11180 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11181 if (it == fragments.end() || it->second.mdr != mdr) {
11182 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11183 request_finish(mdr);
11184 return;
11185 }
11186
11187 assert(r == 0);
11188 fragment_info_t& info = it->second;
11189 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11190 << " on " << *info.dirs.front()->get_inode() << dendl;
11191
11192 info.all_frozen = true;
11193 dispatch_fragment_dir(mdr);
11194}
11195
11196void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11197{
11198 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11199 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11200 if (it == fragments.end() || it->second.mdr != mdr) {
11201 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11202 request_finish(mdr);
11203 return;
11204 }
11205
11206 fragment_info_t& info = it->second;
11207 CInode *diri = info.dirs.front()->get_inode();
11208
11209 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11210 << " on " << *diri << dendl;
11211 if (!mdr->aborted) {
11212 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11213 wrlocks.insert(&diri->dirfragtreelock);
11214 // prevent a racing gather on any other scatterlocks too
11215 wrlocks.insert(&diri->nestlock);
11216 wrlocks.insert(&diri->filelock);
11217 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11218 if (!mdr->aborted)
11219 return;
11220 }
11221
11222 if (mdr->aborted) {
11223 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11224 << info.dirs.front()->dirfrag() << dendl;
11225 if (info.bits > 0)
11226 mds->balancer->queue_split(info.dirs.front(), false);
11227 else
11228 mds->balancer->queue_merge(info.dirs.front());
11229 fragment_unmark_unfreeze_dirs(info.dirs);
11230 fragments.erase(it);
11231 request_finish(mdr);
11232 return;
11233 }
11234
11235 mdr->ls = mds->mdlog->get_current_segment();
11236 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11237 mds->mdlog->start_entry(le);
11238
11239 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11240 CDir *dir = *p;
11241 dirfrag_rollback rollback;
11242 rollback.fnode = dir->fnode;
11243 le->add_orig_frag(dir->get_frag(), &rollback);
11244 }
11245
11246 // refragment
11247 list<MDSInternalContextBase*> waiters;
11248 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11249 info.resultfrags, waiters, false);
11250 if (g_conf->mds_debug_frag)
11251 diri->verify_dirfrags();
11252 mds->queue_waiters(waiters);
11253
11254 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11255 assert(!diri->dirfragtree.is_leaf(*p));
11256
11257 le->metablob.add_dir_context(*info.resultfrags.begin());
11258 for (list<CDir*>::iterator p = info.resultfrags.begin();
11259 p != info.resultfrags.end();
11260 ++p) {
11261 if (diri->is_auth()) {
11262 le->metablob.add_fragmented_dir(*p, false, false);
11263 } else {
11264 (*p)->state_set(CDir::STATE_DIRTYDFT);
11265 le->metablob.add_fragmented_dir(*p, false, true);
11266 }
11267 }
11268
11269 // dft lock
11270 if (diri->is_auth()) {
11271 // journal dirfragtree
11272 inode_t *pi = diri->project_inode();
11273 pi->version = diri->pre_dirty();
11274 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11275 } else {
11276 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11277 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11278 mdr->add_updated_lock(&diri->dirfragtreelock);
11279 }
11280
11281 /*
11282 // filelock
11283 mds->locker->mark_updated_scatterlock(&diri->filelock);
11284 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11285 mut->add_updated_lock(&diri->filelock);
11286
11287 // dirlock
11288 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11289 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11290 mut->add_updated_lock(&diri->nestlock);
11291 */
11292
11293 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11294 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11295 mdr, __func__);
11296 mds->mdlog->flush();
11297}
11298
11299void MDCache::_fragment_logged(MDRequestRef& mdr)
11300{
11301 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11302 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11303 assert(it != fragments.end());
11304 fragment_info_t &info = it->second;
11305 CInode *diri = info.resultfrags.front()->get_inode();
11306
11307 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11308 << " on " << *diri << dendl;
11309
11310 if (diri->is_auth())
11311 diri->pop_and_dirty_projected_inode(mdr->ls);
11312
11313 mdr->apply(); // mark scatterlock
11314
11315 // store resulting frags
11316 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11317
11318 for (list<CDir*>::iterator p = info.resultfrags.begin();
11319 p != info.resultfrags.end();
11320 ++p) {
11321 CDir *dir = *p;
11322 dout(10) << " storing result frag " << *dir << dendl;
11323
11324 // freeze and store them too
11325 dir->auth_pin(this);
11326 dir->state_set(CDir::STATE_FRAGMENTING);
11327 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11328 }
11329
11330 gather.activate();
11331}
11332
11333void MDCache::_fragment_stored(MDRequestRef& mdr)
11334{
11335 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11336 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11337 assert(it != fragments.end());
11338 fragment_info_t &info = it->second;
11339 CInode *diri = info.resultfrags.front()->get_inode();
11340
11341 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11342 << " on " << *diri << dendl;
11343
11344 // tell peers
11345 CDir *first = *info.resultfrags.begin();
181888fb
FG
11346 for (const auto &p : first->get_replicas()) {
11347 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11348 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11349 rejoin_gather.count(p.first)))
7c673cae
FG
11350 continue;
11351
11352 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11353
11354 // freshly replicate new dirs to peers
11355 for (list<CDir*>::iterator q = info.resultfrags.begin();
11356 q != info.resultfrags.end();
11357 ++q)
181888fb 11358 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11359
181888fb 11360 mds->send_message_mds(notify, p.first);
7c673cae
FG
11361 }
11362
11363 // journal commit
11364 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11365 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11366 info.resultfrags));
11367
11368 mds->locker->drop_locks(mdr.get());
11369
11370 // unfreeze resulting frags
11371 for (list<CDir*>::iterator p = info.resultfrags.begin();
11372 p != info.resultfrags.end();
11373 ++p) {
11374 CDir *dir = *p;
11375 dout(10) << " result frag " << *dir << dendl;
11376
11377 for (CDir::map_t::iterator p = dir->items.begin();
11378 p != dir->items.end();
11379 ++p) {
11380 CDentry *dn = p->second;
11381 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11382 dn->state_clear(CDentry::STATE_FRAGMENTING);
11383 dn->put(CDentry::PIN_FRAGMENTING);
11384 }
11385
11386 // unfreeze
11387 dir->unfreeze_dir();
11388 }
11389
11390 fragments.erase(it);
11391 request_finish(mdr);
11392}
11393
11394void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11395{
11396 dout(10) << "fragment_committed " << basedirfrag << dendl;
11397 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11398 assert(it != uncommitted_fragments.end());
11399 ufragment &uf = it->second;
11400
11401 // remove old frags
11402 C_GatherBuilder gather(
11403 g_ceph_context,
11404 new C_OnFinisher(
11405 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11406 mds->finisher));
11407
11408 SnapContext nullsnapc;
11409 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11410 for (list<frag_t>::iterator p = uf.old_frags.begin();
11411 p != uf.old_frags.end();
11412 ++p) {
11413 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11414 ObjectOperation op;
11415 if (*p == frag_t()) {
11416 // backtrace object
11417 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11418 op.truncate(0);
11419 op.omap_clear();
11420 } else {
11421 dout(10) << " removing orphan dirfrag " << oid << dendl;
11422 op.remove();
11423 }
11424 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11425 ceph::real_clock::now(),
11426 0, gather.new_sub());
11427 }
11428
11429 assert(gather.has_subs());
11430 gather.activate();
11431}
11432
11433void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11434{
11435 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11436 << resultfrags.size() << dendl;
11437 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11438 assert(it != uncommitted_fragments.end());
11439 ufragment &uf = it->second;
11440
11441 // unmark & auth_unpin
11442 for (const auto &dir : resultfrags) {
11443 dir->state_clear(CDir::STATE_FRAGMENTING);
11444 dir->auth_unpin(this);
11445
11446 // In case the resulting fragments are beyond the split size,
11447 // we might need to split them again right away (they could
11448 // have been taking inserts between unfreezing and getting
11449 // here)
11450 mds->balancer->maybe_fragment(dir, false);
11451 }
11452
11453 if (mds->logger) {
11454 if (resultfrags.size() > 1) {
11455 mds->logger->inc(l_mds_dir_split);
11456 } else {
11457 mds->logger->inc(l_mds_dir_merge);
11458 }
11459 }
11460
11461 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11462 mds->mdlog->start_submit_entry(le);
11463
11464 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11465}
11466
11467/* This function DOES put the passed message before returning */
11468void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11469{
11470 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11471
11472 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11473 notify->put();
11474 return;
11475 }
11476
11477 CInode *diri = get_inode(notify->get_ino());
11478 if (diri) {
11479 frag_t base = notify->get_basefrag();
11480 int bits = notify->get_bits();
11481
11482/*
11483 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11484 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11485 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11486 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11487 notify->put();
11488 return;
11489 }
11490*/
11491
11492 // refragment
11493 list<MDSInternalContextBase*> waiters;
11494 list<CDir*> resultfrags;
11495 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11496 if (g_conf->mds_debug_frag)
11497 diri->verify_dirfrags();
11498
11499 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11500 diri->take_dir_waiting((*p)->get_frag(), waiters);
11501
11502 // add new replica dirs values
11503 bufferlist::iterator p = notify->basebl.begin();
11504 while (!p.end())
11505 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11506
11507 mds->queue_waiters(waiters);
11508 } else {
11509 ceph_abort();
11510 }
11511
11512 notify->put();
11513}
11514
11515void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11516 LogSegment *ls, bufferlist *rollback)
11517{
11518 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11519 assert(!uncommitted_fragments.count(basedirfrag));
11520 ufragment& uf = uncommitted_fragments[basedirfrag];
11521 uf.old_frags = old_frags;
11522 uf.bits = bits;
11523 uf.ls = ls;
11524 ls->uncommitted_fragments.insert(basedirfrag);
11525 if (rollback)
11526 uf.rollback.swap(*rollback);
11527}
11528
11529void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11530{
11531 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11532 << " op " << EFragment::op_name(op) << dendl;
11533 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11534 if (it != uncommitted_fragments.end()) {
11535 ufragment& uf = it->second;
11536 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11537 uf.committed = true;
11538 } else {
11539 uf.ls->uncommitted_fragments.erase(basedirfrag);
11540 mds->queue_waiters(uf.waiters);
11541 uncommitted_fragments.erase(it);
11542 }
11543 }
11544}
11545
11546void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11547{
11548 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11549 << " old_frags (" << old_frags << ")" << dendl;
11550 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11551 if (it != uncommitted_fragments.end()) {
11552 ufragment& uf = it->second;
11553 if (!uf.old_frags.empty()) {
11554 uf.old_frags.swap(old_frags);
11555 uf.committed = true;
11556 } else {
11557 uf.ls->uncommitted_fragments.erase(basedirfrag);
11558 uncommitted_fragments.erase(it);
11559 }
11560 }
11561}
11562
11563void MDCache::rollback_uncommitted_fragments()
11564{
11565 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11566 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11567 p != uncommitted_fragments.end();
11568 ++p) {
11569 ufragment &uf = p->second;
11570 CInode *diri = get_inode(p->first.ino);
11571 assert(diri);
11572
11573 if (uf.committed) {
11574 list<CDir*> frags;
11575 diri->get_dirfrags_under(p->first.frag, frags);
11576 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11577 CDir *dir = *q;
11578 dir->auth_pin(this);
11579 dir->state_set(CDir::STATE_FRAGMENTING);
11580 }
11581 _fragment_committed(p->first, frags);
11582 continue;
11583 }
11584
11585 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11586
11587 LogSegment *ls = mds->mdlog->get_current_segment();
11588 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11589 mds->mdlog->start_entry(le);
11590 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11591
11592 list<frag_t> old_frags;
11593 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11594
11595 list<CDir*> resultfrags;
11596 if (uf.old_frags.empty()) {
11597 // created by old format EFragment
11598 list<MDSInternalContextBase*> waiters;
11599 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11600 } else {
11601 bufferlist::iterator bp = uf.rollback.begin();
11602 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11603 CDir *dir = force_dir_fragment(diri, *q);
11604 resultfrags.push_back(dir);
11605
11606 dirfrag_rollback rollback;
11607 ::decode(rollback, bp);
11608
11609 dir->set_version(rollback.fnode.version);
11610 dir->fnode = rollback.fnode;
11611
11612 dir->_mark_dirty(ls);
11613
11614 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11615 dout(10) << " dirty nestinfo on " << *dir << dendl;
11616 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11617 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11618 }
11619 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11620 dout(10) << " dirty fragstat on " << *dir << dendl;
11621 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11622 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11623 }
11624
11625 le->add_orig_frag(dir->get_frag());
11626 le->metablob.add_dir_context(dir);
11627 if (diri_auth) {
11628 le->metablob.add_fragmented_dir(dir, true, false);
11629 } else {
11630 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11631 dir->state_set(CDir::STATE_DIRTYDFT);
11632 le->metablob.add_fragmented_dir(dir, true, true);
11633 }
11634 }
11635 }
11636
11637 if (diri_auth) {
11638 diri->project_inode()->version = diri->pre_dirty();
11639 diri->pop_and_dirty_projected_inode(ls); // hacky
11640 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11641 } else {
11642 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11643 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11644 }
11645
11646 if (g_conf->mds_debug_frag)
11647 diri->verify_dirfrags();
11648
11649 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11650 assert(!diri->dirfragtree.is_leaf(*q));
11651
11652 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11653 CDir *dir = *q;
11654 dir->auth_pin(this);
11655 dir->state_set(CDir::STATE_FRAGMENTING);
11656 }
11657
11658 mds->mdlog->submit_entry(le);
11659
11660 uf.old_frags.swap(old_frags);
11661 _fragment_committed(p->first, resultfrags);
11662 }
11663}
11664
11665void MDCache::force_readonly()
11666{
11667 if (is_readonly())
11668 return;
11669
11670 dout(1) << "force file system read-only" << dendl;
11671 mds->clog->warn() << "force file system read-only";
11672
11673 set_readonly();
11674
11675 mds->server->force_clients_readonly();
11676
11677 // revoke write caps
11678 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
11679 p != inode_map.end();
11680 ++p) {
11681 CInode *in = p->second;
11682 if (in->is_head())
11683 mds->locker->eval(in, CEPH_CAP_LOCKS);
11684 }
11685
11686 mds->mdlog->flush();
11687}
11688
11689
11690// ==============================================================
11691// debug crap
11692
11693void MDCache::show_subtrees(int dbl)
11694{
11695 if (g_conf->mds_thrash_exports)
11696 dbl += 15;
11697
11698 //dout(10) << "show_subtrees" << dendl;
11699
11700 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11701 return; // i won't print anything.
11702
11703 if (subtrees.empty()) {
11704 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11705 return;
11706 }
11707
11708 // root frags
11709 list<CDir*> basefrags;
11710 for (set<CInode*>::iterator p = base_inodes.begin();
11711 p != base_inodes.end();
11712 ++p)
11713 (*p)->get_dirfrags(basefrags);
11714 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11715 dout(15) << "show_subtrees" << dendl;
11716
11717 // queue stuff
11718 list<pair<CDir*,int> > q;
11719 string indent;
11720 set<CDir*> seen;
11721
11722 // calc max depth
11723 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11724 q.push_back(pair<CDir*,int>(*p, 0));
11725
11726 set<CDir*> subtrees_seen;
11727
11728 int depth = 0;
11729 while (!q.empty()) {
11730 CDir *dir = q.front().first;
11731 int d = q.front().second;
11732 q.pop_front();
11733
11734 if (subtrees.count(dir) == 0) continue;
11735
11736 subtrees_seen.insert(dir);
11737
11738 if (d > depth) depth = d;
11739
11740 // sanity check
11741 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11742 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11743 assert(seen.count(dir) == 0);
11744 seen.insert(dir);
11745
11746 // nested items?
11747 if (!subtrees[dir].empty()) {
11748 for (set<CDir*>::iterator p = subtrees[dir].begin();
11749 p != subtrees[dir].end();
11750 ++p) {
11751 //dout(25) << " saw sub " << **p << dendl;
11752 q.push_front(pair<CDir*,int>(*p, d+1));
11753 }
11754 }
11755 }
11756
11757
11758 // print tree
11759 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11760 q.push_back(pair<CDir*,int>(*p, 0));
11761
11762 while (!q.empty()) {
11763 CDir *dir = q.front().first;
11764 int d = q.front().second;
11765 q.pop_front();
11766
11767 if (subtrees.count(dir) == 0) continue;
11768
11769 // adjust indenter
11770 while ((unsigned)d < indent.size())
11771 indent.resize(d);
11772
11773 // pad
11774 string pad = "______________________________________";
11775 pad.resize(depth*2+1-indent.size());
11776 if (!subtrees[dir].empty())
11777 pad[0] = '.'; // parent
11778
11779
11780 string auth;
11781 if (dir->is_auth())
11782 auth = "auth ";
11783 else
11784 auth = " rep ";
11785
11786 char s[10];
11787 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11788 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11789 else
11790 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11791
11792 // print
11793 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11794
11795 if (dir->ino() == MDS_INO_ROOT)
11796 assert(dir->inode == root);
11797 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11798 assert(dir->inode == myin);
11799 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11800 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11801
11802 // nested items?
11803 if (!subtrees[dir].empty()) {
11804 // more at my level?
11805 if (!q.empty() && q.front().second == d)
11806 indent += "| ";
11807 else
11808 indent += " ";
11809
11810 for (set<CDir*>::iterator p = subtrees[dir].begin();
11811 p != subtrees[dir].end();
11812 ++p)
11813 q.push_front(pair<CDir*,int>(*p, d+2));
11814 }
11815 }
11816
11817 // verify there isn't stray crap in subtree map
11818 int lost = 0;
11819 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11820 p != subtrees.end();
11821 ++p) {
11822 if (subtrees_seen.count(p->first)) continue;
11823 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11824 lost++;
11825 }
11826 assert(lost == 0);
11827}
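// [Illustrative sketch, not part of the original MDCache.cc]
// show_subtrees() above walks the subtree map twice: once to find the maximum
// depth and once to print each subtree root indented by its depth, pushing
// children on the front of the queue so the walk stays depth-first. The toy
// below reproduces just the indented depth-first print; the names and the
// child lists are made up.
#include <deque>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // toy subtree forest: index 0 is the base dirfrag, children[] are its bounds
  std::vector<std::string> name = {"/", "/a", "/a/b", "/c"};
  std::vector<std::vector<int>> children = {{1, 3}, {2}, {}, {}};

  std::deque<std::pair<int, int>> q;   // (node, depth), like the (CDir*, depth) queue
  q.push_back({0, 0});
  while (!q.empty()) {
    int n = q.front().first;
    int d = q.front().second;
    q.pop_front();
    std::cout << std::string(d * 2, ' ') << "|_" << name[n] << "\n";
    for (int c : children[n])
      q.push_front({c, d + 1});        // push_front keeps the walk depth-first
  }
  return 0;
}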
11828
11829
11830void MDCache::show_cache()
11831{
11832 dout(7) << "show_cache" << dendl;
11833
11834 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11835 it != inode_map.end();
11836 ++it) {
11837 // unlinked?
11838 if (!it->second->parent)
11839 dout(7) << " unlinked " << *it->second << dendl;
11840
11841 // dirfrags?
11842 list<CDir*> dfs;
11843 it->second->get_dirfrags(dfs);
11844 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11845 CDir *dir = *p;
11846 dout(7) << " dirfrag " << *dir << dendl;
11847
11848 for (CDir::map_t::iterator p = dir->items.begin();
11849 p != dir->items.end();
11850 ++p) {
11851 CDentry *dn = p->second;
11852 dout(7) << " dentry " << *dn << dendl;
11853 CDentry::linkage_t *dnl = dn->get_linkage();
11854 if (dnl->is_primary() && dnl->get_inode())
11855 dout(7) << " inode " << *dnl->get_inode() << dendl;
11856 }
11857 }
11858 }
11859}
11860
181888fb
FG
11861int MDCache::cache_status(Formatter *f)
11862{
11863 f->open_object_section("cache");
11864
11865 f->open_object_section("pool");
11866 mempool::get_pool(mempool::mds_co::id).dump(f);
11867 f->close_section();
11868
11869 f->close_section();
11870 return 0;
11871}
11872
31f18b77 11873int MDCache::dump_cache(std::string const &file_name)
7c673cae 11874{
31f18b77 11875 return dump_cache(file_name.c_str(), NULL);
7c673cae
FG
11876}
11877
31f18b77 11878int MDCache::dump_cache(Formatter *f)
7c673cae 11879{
31f18b77 11880 return dump_cache(NULL, f);
7c673cae
FG
11881}
11882
31f18b77 11883int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f)
7c673cae 11884{
31f18b77 11885 return dump_cache(NULL, f, dump_root, depth);
7c673cae
FG
11886}
11887
11888/**
11889 * Dump the metadata cache, either to a Formatter, if
11890 * provided, else to a plain text file.
11891 */
31f18b77 11892int MDCache::dump_cache(const char *fn, Formatter *f,
7c673cae
FG
11893 const string& dump_root, int depth)
11894{
11895 int r = 0;
11896 int fd = -1;
11897
11898 if (f) {
11899 f->open_array_section("inodes");
11900 } else {
11901 char deffn[200];
11902 if (!fn) {
11903 snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
11904 fn = deffn;
11905 }
11906
11907 dout(1) << "dump_cache to " << fn << dendl;
11908
11909 fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600);
11910 if (fd < 0) {
11911 derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl;
31f18b77 11912 return errno;
7c673cae
FG
11913 }
11914 }
11915
11916 for (ceph::unordered_map<vinodeno_t,CInode*>::iterator it = inode_map.begin();
11917 it != inode_map.end();
11918 ++it) {
11919 CInode *in = it->second;
11920
11921 if (!dump_root.empty()) {
11922 string ipath;
11923 if (in->is_root())
11924 ipath = "/";
11925 else
11926 in->make_path_string(ipath);
11927
11928 if (dump_root.length() > ipath.length() ||
11929 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
11930 continue;
11931
11932 if (depth >= 0 &&
11933 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
11934 continue;
11935 }
11936
11937 if (f) {
11938 f->open_object_section("inode");
11939 in->dump(f);
11940 } else {
11941 ostringstream ss;
11942 ss << *in << std::endl;
11943 std::string s = ss.str();
11944 r = safe_write(fd, s.c_str(), s.length());
11945 if (r < 0) {
11946 goto out;
11947 }
11948 }
11949
11950 list<CDir*> dfs;
11951 in->get_dirfrags(dfs);
11952 if (f) {
11953 f->open_array_section("dirfrags");
11954 }
11955 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
11956 CDir *dir = *p;
11957 if (f) {
11958 f->open_object_section("dir");
11959 dir->dump(f);
11960 } else {
11961 ostringstream tt;
11962 tt << " " << *dir << std::endl;
11963 string t = tt.str();
11964 r = safe_write(fd, t.c_str(), t.length());
11965 if (r < 0) {
11966 goto out;
11967 }
11968 }
11969
11970 if (f) {
11971 f->open_array_section("dentries");
11972 }
11973 for (CDir::map_t::iterator q = dir->items.begin();
11974 q != dir->items.end();
11975 ++q) {
11976 CDentry *dn = q->second;
11977 if (f) {
11978 f->open_object_section("dentry");
11979 dn->dump(f);
11980 f->close_section();
11981 } else {
11982 ostringstream uu;
11983 uu << " " << *dn << std::endl;
11984 string u = uu.str();
11985 r = safe_write(fd, u.c_str(), u.length());
11986 if (r < 0) {
11987 goto out;
11988 }
11989 }
11990 }
11991 if (f) {
11992 f->close_section(); //dentries
11993 }
11994 dir->check_rstats();
11995 if (f) {
11996 f->close_section(); //dir
11997 }
11998 }
11999 if (f) {
12000 f->close_section(); // dirfrags
12001 }
12002
12003 if (f) {
12004 f->close_section(); // inode
12005 }
12006 }
12007
12008 out:
12009 if (f) {
12010 f->close_section(); // inodes
12011 } else {
12012 ::close(fd);
12013 }
31f18b77 12014 return r;
7c673cae
FG
12015}
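// [Illustrative sketch, not part of the original MDCache.cc]
// When no Formatter is supplied, dump_cache() above writes plain text to a
// freshly created file, using O_EXCL so an existing dump is never clobbered,
// and returns the errno on failure. The standalone toy below shows that
// open/write/close pattern; the file name and the sample line are invented
// for illustration.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>

int main() {
  const char *fn = "cachedump.example";                      // hypothetical output file
  int fd = ::open(fn, O_WRONLY | O_CREAT | O_EXCL, 0600);    // refuse to overwrite an old dump
  if (fd < 0) {
    std::fprintf(stderr, "failed to open %s: %s\n", fn, std::strerror(errno));
    return 1;
  }
  const char line[] = "[inode 0x1 ... ]\n";                  // stands in for one cache entry
  ssize_t r = ::write(fd, line, sizeof(line) - 1);
  ::close(fd);
  return r < 0 ? 1 : 0;
}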
12016
12017
12018
12019C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12020 : MDSInternalContext(c->mds), cache(c), mdr(r)
12021{}
12022
12023void C_MDS_RetryRequest::finish(int r)
12024{
12025 mdr->retry++;
12026 cache->dispatch_request(mdr);
12027}
12028
12029
12030class C_MDS_EnqueueScrub : public Context
12031{
12032 Formatter *formatter;
12033 Context *on_finish;
12034public:
12035 ScrubHeaderRef header;
12036 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12037 formatter(f), on_finish(fin), header(nullptr) {}
12038
12039 Context *take_finisher() {
12040 Context *fin = on_finish;
12041 on_finish = NULL;
12042 return fin;
12043 }
12044
12045 void finish(int r) override {
12046 if (r < 0) { // we failed the lookup or something; dump ourselves
12047 formatter->open_object_section("results");
12048 formatter->dump_int("return_code", r);
12049 formatter->close_section(); // results
12050 }
12051 if (on_finish)
12052 on_finish->complete(r);
12053 }
12054};
12055
12056void MDCache::enqueue_scrub(
12057 const string& path,
12058 const std::string &tag,
12059 bool force, bool recursive, bool repair,
12060 Formatter *f, Context *fin)
12061{
12062 dout(10) << __func__ << " " << path << dendl;
12063 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12064 filepath fp(path.c_str());
12065 mdr->set_filepath(fp);
12066
12067 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12068 cs->header = std::make_shared<ScrubHeader>(
12069 tag, force, recursive, repair, f);
12070
12071 mdr->internal_op_finish = cs;
12072 enqueue_scrub_work(mdr);
12073}
12074
12075void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12076{
12077 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12078 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12079 if (NULL == in)
12080 return;
12081
12082 // TODO: Remove this restriction
12083 assert(in->is_auth());
12084
12085 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12086 if (!locked)
12087 return;
12088
12089 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12090 ScrubHeaderRef &header = cs->header;
12091
12092 // Cannot scrub same dentry twice at same time
12093 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12094 mds->server->respond_to_request(mdr, -EBUSY);
12095 return;
12096 } else {
12097 in->scrub_info();
12098 }
12099
12100 header->set_origin(in);
12101
12102 // only set completion context for non-recursive scrub, because we don't
12103 // want to block asok caller on long running scrub
12104 if (!header->get_recursive()) {
12105 Context *fin = cs->take_finisher();
12106 mds->scrubstack->enqueue_inode_top(in, header,
12107 new MDSInternalContextWrapper(mds, fin));
12108 } else
12109 mds->scrubstack->enqueue_inode_bottom(in, header, NULL);
12110
12111 mds->server->respond_to_request(mdr, 0);
12112 return;
12113}
12114
12115struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12116 MDRequestRef mdr;
12117 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12118 MDCacheLogContext(c), mdr(m) {}
12119 void finish(int r) override {
12120 mdr->apply();
12121 get_mds()->server->respond_to_request(mdr, r);
12122 }
12123};
12124
12125void MDCache::repair_dirfrag_stats(CDir *dir)
12126{
12127 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12128 mdr->pin(dir);
12129 mdr->internal_op_private = dir;
12130 mdr->internal_op_finish = new C_MDSInternalNoop;
12131 repair_dirfrag_stats_work(mdr);
12132}
12133
12134void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12135{
12136 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12137 dout(10) << __func__ << " " << *dir << dendl;
12138
12139 if (!dir->is_auth()) {
12140 mds->server->respond_to_request(mdr, -ESTALE);
12141 return;
12142 }
12143
12144 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12145 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12146
7c673cae
FG
12147 mds->locker->drop_locks(mdr.get());
12148 mdr->drop_local_auth_pins();
224ce89b
WB
12149 if (!mdr->remote_auth_pins.empty())
12150 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12151 return;
12152 }
12153
12154 mdr->auth_pin(dir);
12155
12156 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12157 CInode *diri = dir->inode;
12158 rdlocks.insert(&diri->dirfragtreelock);
12159 wrlocks.insert(&diri->nestlock);
12160 wrlocks.insert(&diri->filelock);
12161 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12162 return;
12163
12164 if (!dir->is_complete()) {
12165 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12166 return;
12167 }
12168
12169 frag_info_t frag_info;
12170 nest_info_t nest_info;
12171 for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
12172 CDentry *dn = it->second;
12173 if (dn->last != CEPH_NOSNAP)
12174 continue;
12175 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12176 if (dnl->is_primary()) {
12177 CInode *in = dnl->get_inode();
12178 nest_info.add(in->get_projected_inode()->accounted_rstat);
12179 if (in->is_dir())
12180 frag_info.nsubdirs++;
12181 else
12182 frag_info.nfiles++;
12183 } else if (dnl->is_remote())
12184 frag_info.nfiles++;
12185 }
12186
12187 fnode_t *pf = dir->get_projected_fnode();
12188 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12189 bool good_rstat = nest_info.same_sums(pf->rstat);
12190 if (good_fragstat && good_rstat) {
12191 dout(10) << __func__ << " no corruption found" << dendl;
12192 mds->server->respond_to_request(mdr, 0);
12193 return;
12194 }
12195
12196 pf = dir->project_fnode();
12197 pf->version = dir->pre_dirty();
12198 mdr->add_projected_fnode(dir);
12199
12200 mdr->ls = mds->mdlog->get_current_segment();
12201 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12202 mds->mdlog->start_entry(le);
12203
12204 if (!good_fragstat) {
12205 if (pf->fragstat.mtime > frag_info.mtime)
12206 frag_info.mtime = pf->fragstat.mtime;
12207 if (pf->fragstat.change_attr > frag_info.change_attr)
12208 frag_info.change_attr = pf->fragstat.change_attr;
12209 pf->fragstat = frag_info;
12210 mds->locker->mark_updated_scatterlock(&diri->filelock);
12211 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12212 mdr->add_updated_lock(&diri->filelock);
12213 }
12214
12215 if (!good_rstat) {
12216 if (pf->rstat.rctime > nest_info.rctime)
12217 nest_info.rctime = pf->rstat.rctime;
12218 pf->rstat = nest_info;
12219 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12220 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12221 mdr->add_updated_lock(&diri->nestlock);
12222 }
12223
12224 le->metablob.add_dir_context(dir);
12225 le->metablob.add_dir(dir, true);
12226
12227 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12228}
12229
12230void MDCache::repair_inode_stats(CInode *diri)
12231{
12232 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12233 mdr->pin(diri);
12234 mdr->internal_op_private = diri;
12235 mdr->internal_op_finish = new C_MDSInternalNoop;
12236 repair_inode_stats_work(mdr);
12237}
12238
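// Repair dirstat/rstat on a directory inode in two passes: first take wrlocks,
// fetch every dirfrag and mark the filelock/nestlock scatterlocks dirty; then
// (do_rdlocks) take rdlocks, forcing a scatter-gather that re-accounts each
// dirfrag's stats into the inode, and finally verify that the summed accounted
// stats match the inode's dirstat/rstat.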
12239void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12240{
12241 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12242 dout(10) << __func__ << " " << *diri << dendl;
12243
12244 if (!diri->is_auth()) {
12245 mds->server->respond_to_request(mdr, -ESTALE);
12246 return;
12247 }
12248 if (!diri->is_dir()) {
12249 mds->server->respond_to_request(mdr, -ENOTDIR);
12250 return;
12251 }
12252
12253 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12254 std::list<frag_t> frags;
12255
12256 if (mdr->ls) // already marked filelock/nestlock dirty ?
12257 goto do_rdlocks;
12258
12259 rdlocks.insert(&diri->dirfragtreelock);
12260 wrlocks.insert(&diri->nestlock);
12261 wrlocks.insert(&diri->filelock);
12262 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12263 return;
12264
12265 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12266 // the scatter-gather process, which will fix any fragstat/rstat errors.
12267 diri->dirfragtree.get_leaves(frags);
12268 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12269 CDir *dir = diri->get_dirfrag(*p);
12270 if (!dir) {
12271 assert(mdr->is_auth_pinned(diri));
12272 dir = diri->get_or_open_dirfrag(this, *p);
12273 }
12274 if (dir->get_version() == 0) {
12275 assert(dir->is_auth());
12276 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12277 return;
12278 }
12279 }
12280
12281 diri->state_set(CInode::STATE_REPAIRSTATS);
12282 mdr->ls = mds->mdlog->get_current_segment();
12283 mds->locker->mark_updated_scatterlock(&diri->filelock);
12284 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12285 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12286 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12287
12288 mds->locker->drop_locks(mdr.get());
12289
12290do_rdlocks:
12291 // force the scatter-gather process
12292 rdlocks.insert(&diri->dirfragtreelock);
12293 rdlocks.insert(&diri->nestlock);
12294 rdlocks.insert(&diri->filelock);
12295 wrlocks.clear();
12296 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12297 return;
12298
12299 diri->state_clear(CInode::STATE_REPAIRSTATS);
12300
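  // Re-sum the per-dirfrag accounted stats (just refreshed by the
  // scatter-gather above) and check them against the inode's dirstat/rstat;
  // if they still disagree the repair did not converge, which is only logged.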
12301 frag_info_t dir_info;
12302 nest_info_t nest_info;
12303 nest_info.rsubdirs++; // it gets one to account for self
12304
12305 diri->dirfragtree.get_leaves(frags);
12306 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12307 CDir *dir = diri->get_dirfrag(*p);
12308 assert(dir);
12309 assert(dir->get_version() > 0);
12310 dir_info.add(dir->fnode.accounted_fragstat);
12311 nest_info.add(dir->fnode.accounted_rstat);
12312 }
12313
12314 if (!dir_info.same_sums(diri->inode.dirstat) ||
12315 !nest_info.same_sums(diri->inode.rstat)) {
12316 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12317 << *diri << dendl;
12318 }
12319
12320 mds->server->respond_to_request(mdr, 0);
12321}
12322
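// Flush the inode at 'path': start an internal CEPH_MDS_OP_FLUSH request,
// resolve the path, and flush the inode's dirty state, completing 'fin' with
// the result. Fails immediately with -EROFS when the filesystem is read-only.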
12323void MDCache::flush_dentry(const string& path, Context *fin)
12324{
12325 if (is_readonly()) {
12326 dout(10) << __func__ << ": read-only FS" << dendl;
12327 fin->complete(-EROFS);
12328 return;
12329 }
12330 dout(10) << "flush_dentry " << path << dendl;
12331 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
12332 filepath fp(path.c_str());
12333 mdr->set_filepath(fp);
12334 mdr->internal_op_finish = fin;
12335 flush_dentry_work(mdr);
12336}
12337
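// Generic completion for internal-op I/O: replies to the wrapped MDRequest
// with the I/O result when the operation finishes.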
12338class C_FinishIOMDR : public MDSInternalContextBase {
12339protected:
12340 MDSRank *mds;
12341 MDRequestRef mdr;
12342 MDSRank *get_mds() override { return mds; }
12343public:
12344 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12345 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12346};
12347
12348void MDCache::flush_dentry_work(MDRequestRef& mdr)
12349{
12350 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12351 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12352 if (NULL == in)
12353 return;
12354
12355 // TODO: Is this necessary? Fix it if so
12356 assert(in->is_auth());
12357 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12358 if (!locked)
12359 return;
12360 in->flush(new C_FinishIOMDR(mds, mdr));
12361}
12362
12363
12364/**
12365 * Initialize performance counters with global perfcounter
12366 * collection.
12367 */
12368void MDCache::register_perfcounters()
12369{
12370 PerfCountersBuilder pcb(g_ceph_context,
12371 "mds_cache", l_mdc_first, l_mdc_last);
12372
12373 /* Stray/purge statistics */
12374 pcb.add_u64(l_mdc_num_strays, "num_strays",
c07f9fc5 12375 "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
12376 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
12377 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
12378
12379 pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
12380 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
12381 "Stray dentries enqueued for purge");
12382 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
12383 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
12384
12385
12386 /* Recovery queue statistics */
12387 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
12388 pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
c07f9fc5 12389 "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
12390 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
12391 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
12392 pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
c07f9fc5 12393 "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
12394
12395 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
12396 "Internal Request type enqueue scrub");
12397 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
12398 "Internal Request type export dir");
12399 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
12400 "Internal Request type flush");
12401 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
12402 "Internal Request type fragmentdir");
12403 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
12404 "Internal Request type frag stats");
12405 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
12406 "Internal Request type inode stats");
12407
12408 logger.reset(pcb.create_perf_counters());
12409 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12410 recovery_queue.set_logger(logger.get());
12411 stray_manager.set_logger(logger.get());
12412}
12413
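// Activate the stray manager right away if the cache is already open;
// otherwise defer activation until the cache finishes opening.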
12414void MDCache::activate_stray_manager()
12415{
12416 if (open) {
12417 stray_manager.activate();
12418 } else {
12419 wait_for_open(
12420 new MDSInternalContextWrapper(mds,
12421 new FunctionContext([this](int r){
12422 stray_manager.activate();
12423 })
12424 )
12425 );
12426 }
12427}
12428
12429/**
12430 * Call this when putting references to an inode/dentry or
12431 * when attempting to trim it.
12432 *
12433 * If this inode is no longer linked by anyone, and this MDS
12434 * rank holds the primary dentry, and that dentry is in a stray
12435 * directory, then give up the dentry to the StrayManager, never
12436 * to be seen again by MDCache.
12437 *
12438 * @param delay if true, then purgeable inodes are stashed until
12439 * the next trim(), rather than being purged right
12440 * away.
12441 */
12442void MDCache::maybe_eval_stray(CInode *in, bool delay) {
12443 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12444 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12445 return;
12446
12447 CDentry *dn = in->get_projected_parent_dn();
12448
12449 if (dn->state_test(CDentry::STATE_PURGING)) {
12450 /* We have already entered the purging process, no need
12451 * to re-evaluate it. */
12452 return;
12453 }
12454
12455 if (dn->get_projected_linkage()->is_primary() &&
12456 dn->get_dir()->get_inode()->is_stray()) {
12457 stray_manager.eval_stray(dn, delay);
12458 }
12459}
12460
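// Clear dirty state on a directory inode whose parent dentry is a stray: drop
// removable dentries from its auth, non-frozen dirfrags and, when no snaprealm
// pins old state, clear its dirty rstat and scatter-dirty flags.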
12461void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12462 dout(10) << __func__ << " " << *diri << dendl;
12463 assert(diri->get_projected_parent_dir()->inode->is_stray());
12464 list<CDir*> ls;
12465 diri->get_dirfrags(ls);
12466 for (auto p : ls) {
12467 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12468 p->try_remove_dentries_for_stray();
12469 }
12470 if (!diri->snaprealm) {
12471 if (diri->is_auth())
12472 diri->clear_dirty_rstat();
12473 diri->clear_scatter_dirty();
12474 }
12475}
12476