1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
16#include <fstream>
17#include <iostream>
18#include <sstream>
19#include <string>
20#include <boost/utility/string_view.hpp>
21#include <map>
22
23#include "MDCache.h"
24#include "MDSRank.h"
25#include "Server.h"
26#include "Locker.h"
27#include "MDLog.h"
28#include "MDBalancer.h"
29#include "Migrator.h"
30#include "ScrubStack.h"
31
32#include "SnapClient.h"
33
34#include "MDSMap.h"
35
36#include "CInode.h"
37#include "CDir.h"
38
39#include "Mutation.h"
40
41#include "include/ceph_fs.h"
42#include "include/filepath.h"
43#include "include/util.h"
44
45#include "msg/Message.h"
46#include "msg/Messenger.h"
47
48#include "common/MemoryModel.h"
49#include "common/errno.h"
50#include "common/perf_counters.h"
51#include "common/safe_io.h"
52
53#include "osdc/Journaler.h"
54#include "osdc/Filer.h"
55
56#include "events/ESubtreeMap.h"
57#include "events/EUpdate.h"
58#include "events/ESlaveUpdate.h"
59#include "events/EImportFinish.h"
60#include "events/EFragment.h"
61#include "events/ECommitted.h"
62#include "events/ESessions.h"
63
64#include "messages/MGenericMessage.h"
65
66#include "messages/MMDSResolve.h"
67#include "messages/MMDSResolveAck.h"
68#include "messages/MMDSCacheRejoin.h"
69
70#include "messages/MDiscover.h"
71#include "messages/MDiscoverReply.h"
72
73//#include "messages/MInodeUpdate.h"
74#include "messages/MDirUpdate.h"
75#include "messages/MCacheExpire.h"
76
77#include "messages/MInodeFileCaps.h"
78
79#include "messages/MLock.h"
80#include "messages/MDentryLink.h"
81#include "messages/MDentryUnlink.h"
82
83#include "messages/MMDSFindIno.h"
84#include "messages/MMDSFindInoReply.h"
85
86#include "messages/MMDSOpenIno.h"
87#include "messages/MMDSOpenInoReply.h"
88
89#include "messages/MClientRequest.h"
90#include "messages/MClientCaps.h"
91#include "messages/MClientSnap.h"
92#include "messages/MClientQuota.h"
93
94#include "messages/MMDSSlaveRequest.h"
95
96#include "messages/MMDSFragmentNotify.h"
97
98#include "messages/MGatherCaps.h"
99
100#include "InoTable.h"
101
102#include "common/Timer.h"
103
104#include "perfglue/heap_profiler.h"
105
106using namespace std;
107
108#include "common/config.h"
109#include "include/assert.h"
110
111#define dout_context g_ceph_context
112#define dout_subsys ceph_subsys_mds
113#undef dout_prefix
114#define dout_prefix _prefix(_dout, mds)
115static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
116 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
117}
118
119set<int> SimpleLock::empty_gather_set;
120
121
122/**
123 * All non-I/O contexts that require a reference
124 * to an MDCache instance descend from this.
125 */
126class MDCacheContext : public virtual MDSInternalContextBase {
127protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
131 assert(mdcache != NULL);
132 return mdcache->mds;
133 }
134public:
135 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
136};
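// Illustrative sketch (editor addition, not part of the original source): a
// concrete context in the style used throughout this file derives from one of
// the MDCache*Context helpers and re-drives a pending operation from finish().
// The class name C_ExampleRetry below is hypothetical:
//
//   struct C_ExampleRetry : public MDCacheContext {
//     explicit C_ExampleRetry(MDCache *c) : MDCacheContext(c) {}
//     void finish(int r) override {
//       mdcache->open_root();   // re-drive a pending cache operation
//     }
//   };
//
// C_MDC_CreateSystemFile further down is a real instance of the same pattern,
// built on MDCacheLogContext.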
137
138
139/**
140 * Only for contexts called back from an I/O completion
141 *
142 * Note: duplication of members wrt MDCacheContext, because
143 * it's the lesser of two evils compared with introducing
144 * yet another piece of (multiple) inheritance.
145 */
146class MDCacheIOContext : public virtual MDSIOContextBase {
147protected:
148 MDCache *mdcache;
149 MDSRank *get_mds() override
150 {
151 assert(mdcache != NULL);
152 return mdcache->mds;
153 }
154public:
155 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
156 MDSIOContextBase(track), mdcache(mdc_) {}
157};
158
159class MDCacheLogContext : public virtual MDSLogContextBase {
160protected:
161 MDCache *mdcache;
162 MDSRank *get_mds() override
163 {
164 assert(mdcache != NULL);
165 return mdcache->mds;
166 }
167public:
168 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
169};
170
171MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
172 mds(m),
173 filer(m->objecter, m->finisher),
174 exceeded_size_limit(false),
175 recovery_queue(m),
176 stray_manager(m, purge_queue_)
177{
178 migrator.reset(new Migrator(mds, this));
179 root = NULL;
180 myin = NULL;
181 readonly = false;
182
183 stray_index = 0;
184 for (int i = 0; i < NUM_STRAY; ++i) {
185 strays[i] = NULL;
186 }
187
188 num_shadow_inodes = 0;
189 num_inodes_with_caps = 0;
190
191 max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
192 (g_conf->mds_dir_max_commit_size << 20) :
193 (0.9 *(g_conf->osd_max_write_size << 20));
194
195 discover_last_tid = 0;
196 open_ino_last_tid = 0;
197 find_ino_peer_last_tid = 0;
198
199 last_cap_id = 0;
200
201 client_lease_durations[0] = 5.0;
202 client_lease_durations[1] = 30.0;
203 client_lease_durations[2] = 300.0;
204
205 resolves_pending = false;
206 rejoins_pending = false;
207 cap_imports_num_opening = 0;
208
209 opening_root = open = false;
210
211 cache_inode_limit = g_conf->get_val<int64_t>("mds_cache_size");
212 cache_memory_limit = g_conf->get_val<uint64_t>("mds_cache_memory_limit");
213 cache_reservation = g_conf->get_val<double>("mds_cache_reservation");
214 cache_health_threshold = g_conf->get_val<double>("mds_health_cache_threshold");
215
216 lru.lru_set_midpoint(g_conf->get_val<double>("mds_cache_mid"));
217
218 bottom_lru.lru_set_midpoint(0);
219
220 decayrate.set_halflife(g_conf->mds_decay_halflife);
221
222 did_shutdown_log_cap = false;
223}
224
225MDCache::~MDCache()
226{
227 if (logger) {
228 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
229 }
230}
231
232void MDCache::handle_conf_change(const struct md_config_t *conf,
233 const std::set <std::string> &changed,
234 const MDSMap &mdsmap)
235{
236 if (changed.count("mds_cache_size"))
237 cache_inode_limit = g_conf->get_val<int64_t>("mds_cache_size");
238 if (changed.count("mds_cache_memory_limit"))
239 cache_memory_limit = g_conf->get_val<uint64_t>("mds_cache_memory_limit");
240 if (changed.count("mds_cache_reservation"))
241 cache_reservation = g_conf->get_val<double>("mds_cache_reservation");
242 if (changed.count("mds_health_cache_threshold"))
243 cache_health_threshold = g_conf->get_val<double>("mds_health_cache_threshold");
244 if (changed.count("mds_cache_mid"))
245 lru.lru_set_midpoint(g_conf->get_val<double>("mds_cache_mid"));
246
247 migrator->handle_conf_change(conf, changed, mdsmap);
248 mds->balancer->handle_conf_change(conf, changed, mdsmap);
249}
250
251void MDCache::log_stat()
252{
253 mds->logger->set(l_mds_inode_max, cache_inode_limit ? : INT_MAX);
254 mds->logger->set(l_mds_inodes, lru.lru_get_size());
255 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
256 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
257 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
258 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
259 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
260 mds->logger->set(l_mds_caps, Capability::count());
261}
262
263
264//
265
266bool MDCache::shutdown()
267{
268 if (lru.lru_get_size() > 0) {
269 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
270 //show_cache();
271 show_subtrees();
272 //dump();
273 }
274 return true;
275}
276
277
278// ====================================================================
279// some inode functions
280
281void MDCache::add_inode(CInode *in)
282{
283 // add to lru, inode map
284 if (in->last == CEPH_NOSNAP) {
285 auto &p = inode_map[in->ino()];
286 assert(!p); // should be no dup inos!
287 p = in;
288 } else {
289 auto &p = snap_inode_map[in->vino()];
290 assert(!p); // should be no dup inos!
291 p = in;
292 }
293
294 if (in->ino() < MDS_INO_SYSTEM_BASE) {
295 if (in->ino() == MDS_INO_ROOT)
296 root = in;
297 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
298 myin = in;
299 else if (in->is_stray()) {
300 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
301 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
302 }
303 }
304 if (in->is_base())
305 base_inodes.insert(in);
306 }
307
308 if (cache_toofull()) {
309 exceeded_size_limit = true;
310 }
311}
312
313void MDCache::remove_inode(CInode *o)
314{
315 dout(14) << "remove_inode " << *o << dendl;
316
317 if (o->get_parent_dn()) {
318 // FIXME: multiple parents?
319 CDentry *dn = o->get_parent_dn();
320 assert(!dn->is_dirty());
321 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
322 }
323
324 if (o->is_dirty())
325 o->mark_clean();
326 if (o->is_dirty_parent())
327 o->clear_dirty_parent();
328
329 o->clear_scatter_dirty();
330
331 o->item_open_file.remove_myself();
332
333 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
334 export_pin_queue.erase(o);
335
336 // remove from inode map
337 if (o->last == CEPH_NOSNAP)
338 inode_map.erase(o->ino());
339 else
340 snap_inode_map.erase(o->vino());
341
342 if (o->ino() < MDS_INO_SYSTEM_BASE) {
343 if (o == root) root = 0;
344 if (o == myin) myin = 0;
345 if (o->is_stray()) {
346 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
347 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
348 }
349 }
350 if (o->is_base())
351 base_inodes.erase(o);
352 }
353
354 // delete it
355 assert(o->get_num_ref() == 0);
356 delete o;
357}
358
359file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
360{
361 file_layout_t result = file_layout_t::get_default();
362 result.pool_id = mdsmap.get_first_data_pool();
363 return result;
364}
365
366file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
367{
368 file_layout_t result = file_layout_t::get_default();
369 result.pool_id = mdsmap.get_metadata_pool();
370 if (g_conf->mds_log_segment_size > 0) {
371 result.object_size = g_conf->mds_log_segment_size;
372 result.stripe_unit = g_conf->mds_log_segment_size;
373 }
374 return result;
375}
376
377void MDCache::init_layouts()
378{
379 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
380 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
381}
382
383void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
384 int mode) const
385{
386 in->inode.ino = ino;
387 in->inode.version = 1;
388 in->inode.xattr_version = 1;
389 in->inode.mode = 0500 | mode;
390 in->inode.size = 0;
391 in->inode.ctime =
392 in->inode.mtime =
393 in->inode.btime = ceph_clock_now();
394 in->inode.nlink = 1;
395 in->inode.truncate_size = -1ull;
396 in->inode.change_attr = 0;
397 in->inode.export_pin = MDS_RANK_NONE;
398
399 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
400 if (in->inode.is_dir()) {
401 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
402 in->inode.rstat.rsubdirs = 1; /* itself */
403 in->inode.rstat.rctime = in->inode.ctime;
404 } else {
405 in->inode.layout = default_file_layout;
406 ++in->inode.rstat.rfiles;
407 }
408 in->inode.accounted_rstat = in->inode.rstat;
409
410 if (in->is_base()) {
411 if (in->is_root())
412 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
413 else
414 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
415 in->open_snaprealm(); // empty snaprealm
416 assert(!in->snaprealm->parent); // created its own
417 in->snaprealm->srnode.seq = 1;
418 }
419}
420
421CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
422{
423 dout(0) << "creating system inode with ino:" << ino << dendl;
424 CInode *in = new CInode(this);
425 create_unlinked_system_inode(in, ino, mode);
426 add_inode(in);
427 return in;
428}
429
430CInode *MDCache::create_root_inode()
431{
432 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
433 i->inode.uid = g_conf->mds_root_ino_uid;
434 i->inode.gid = g_conf->mds_root_ino_gid;
435 i->inode.layout = default_file_layout;
436 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
437 return i;
438}
439
440void MDCache::create_empty_hierarchy(MDSGather *gather)
441{
442 // create root dir
443 CInode *root = create_root_inode();
444
445 // force empty root dir
446 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
447 adjust_subtree_auth(rootdir, mds->get_nodeid());
448 rootdir->dir_rep = CDir::REP_ALL; //NONE;
449
450 assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
451 assert(rootdir->fnode.fragstat == root->inode.dirstat);
452 assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
453 /* Do not update the rootdir rstat information of the fragment; rstat upkeep magic
454 * assumes version 0 is stale/invalid.
455 */
456
457 rootdir->mark_complete();
458 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
459 rootdir->commit(0, gather->new_sub());
460
461 root->mark_clean();
462 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
463 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
464 root->flush(gather->new_sub());
465}
466
467void MDCache::create_mydir_hierarchy(MDSGather *gather)
468{
469 // create mds dir
470 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
471
472 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
473 adjust_subtree_auth(mydir, mds->get_nodeid());
474
475 LogSegment *ls = mds->mdlog->get_current_segment();
476
477 // stray dir
478 for (int i = 0; i < NUM_STRAY; ++i) {
479 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
480 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
481 stringstream name;
482 name << "stray" << i;
483 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
484 sdn->_mark_dirty(mds->mdlog->get_current_segment());
485
486 stray->inode.dirstat = straydir->fnode.fragstat;
487
488 mydir->fnode.rstat.add(stray->inode.rstat);
489 mydir->fnode.fragstat.nsubdirs++;
490 // save them
491 straydir->mark_complete();
492 straydir->mark_dirty(straydir->pre_dirty(), ls);
493 straydir->commit(0, gather->new_sub());
494 stray->mark_dirty_parent(ls, true);
495 stray->store_backtrace(gather->new_sub());
496 }
497
498 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
499 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
500
501 myin->inode.dirstat = mydir->fnode.fragstat;
502 myin->inode.rstat = mydir->fnode.rstat;
503 ++myin->inode.rstat.rsubdirs;
504 myin->inode.accounted_rstat = myin->inode.rstat;
505
506 mydir->mark_complete();
507 mydir->mark_dirty(mydir->pre_dirty(), ls);
508 mydir->commit(0, gather->new_sub());
509
510 myin->store(gather->new_sub());
511}
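// Illustrative layout (editor addition, not part of the original source): for
// a given rank, the hierarchy built above roughly looks like
//
//   mdsdir (MDS_INO_MDSDIR(rank))
//     stray0 ... stray<NUM_STRAY-1>   -- one stray directory per slot
//
// with mydir's fragstat/rstat accounting for the stray subdirectories created
// in the loop above.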
512
513struct C_MDC_CreateSystemFile : public MDCacheLogContext {
514 MutationRef mut;
515 CDentry *dn;
516 version_t dpv;
517 MDSInternalContextBase *fin;
518 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
519 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
520 void finish(int r) override {
521 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
522 }
523};
524
525void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
526{
527 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
528 CDentry *dn = dir->add_null_dentry(name);
529
530 dn->push_projected_linkage(in);
531 version_t dpv = dn->pre_dirty();
532
533 CDir *mdir = 0;
534 if (in->inode.is_dir()) {
535 in->inode.rstat.rsubdirs = 1;
536
537 mdir = in->get_or_open_dirfrag(this, frag_t());
538 mdir->mark_complete();
539 mdir->pre_dirty();
540 } else
541 in->inode.rstat.rfiles = 1;
542 in->inode.version = dn->pre_dirty();
543
544 SnapRealm *realm = dir->get_inode()->find_snaprealm();
545 dn->first = in->first = realm->get_newest_seq() + 1;
546
547 MutationRef mut(new MutationImpl());
548
549 // force some locks. hacky.
550 mds->locker->wrlock_force(&dir->inode->filelock, mut);
551 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
552
553 mut->ls = mds->mdlog->get_current_segment();
554 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
555 mds->mdlog->start_entry(le);
556
557 if (!in->is_mdsdir()) {
558 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
559 le->metablob.add_primary_dentry(dn, in, true);
560 } else {
561 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
562 journal_dirty_inode(mut.get(), &le->metablob, in);
563 dn->push_projected_linkage(in->ino(), in->d_type());
564 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
565 le->metablob.add_root(true, in);
566 }
567 if (mdir)
568 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
569
570 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
571 mds->mdlog->flush();
572}
573
574void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
575{
576 dout(10) << "_create_system_file_finish " << *dn << dendl;
577
578 dn->pop_projected_linkage();
579 dn->mark_dirty(dpv, mut->ls);
580
581 CInode *in = dn->get_linkage()->get_inode();
582 in->inode.version--;
583 in->mark_dirty(in->inode.version + 1, mut->ls);
584
585 if (in->inode.is_dir()) {
586 CDir *dir = in->get_dirfrag(frag_t());
587 assert(dir);
588 dir->mark_dirty(1, mut->ls);
589 dir->mark_new(mut->ls);
590 }
591
592 mut->apply();
593 mds->locker->drop_locks(mut.get());
594 mut->cleanup();
595
596 fin->complete(0);
597
598 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
599 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
600}
601
602
603
604struct C_MDS_RetryOpenRoot : public MDSInternalContext {
605 MDCache *cache;
606 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
607 void finish(int r) override {
608 if (r < 0) {
609 // If we can't open root, something disastrous has happened: mark
610 // this rank damaged for operator intervention. Note that
611 // it is not okay to call suicide() here because we are in
612 // a Finisher callback.
613 cache->mds->damaged();
614 ceph_abort(); // damaged should never return
615 } else {
616 cache->open_root();
617 }
618 }
619};
620
621void MDCache::open_root_inode(MDSInternalContextBase *c)
622{
623 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
624 CInode *in;
625 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
626 in->fetch(c);
627 } else {
628 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
629 }
630}
631
632void MDCache::open_mydir_inode(MDSInternalContextBase *c)
633{
634 MDSGatherBuilder gather(g_ceph_context);
635
636 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
637 in->fetch(gather.new_sub());
638
639 gather.set_finisher(c);
640 gather.activate();
641}
642
643void MDCache::open_mydir_frag(MDSInternalContextBase *c)
644{
645 open_mydir_inode(
646 new MDSInternalContextWrapper(mds,
647 new FunctionContext([this, c](int r) {
648 if (r < 0) {
649 c->complete(r);
650 return;
651 }
652 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
653 assert(mydir);
654 adjust_subtree_auth(mydir, mds->get_nodeid());
655 mydir->fetch(c);
656 })
657 )
658 );
659}
660
661void MDCache::open_root()
662{
663 dout(10) << "open_root" << dendl;
664
665 if (!root) {
666 open_root_inode(new C_MDS_RetryOpenRoot(this));
667 return;
668 }
669 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
670 assert(root->is_auth());
671 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
672 assert(rootdir);
673 if (!rootdir->is_subtree_root())
674 adjust_subtree_auth(rootdir, mds->get_nodeid());
675 if (!rootdir->is_complete()) {
676 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
677 return;
678 }
679 } else {
680 assert(!root->is_auth());
681 CDir *rootdir = root->get_dirfrag(frag_t());
682 if (!rootdir) {
683 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
684 return;
685 }
686 }
687
688 if (!myin) {
689 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
690 in->fetch(new C_MDS_RetryOpenRoot(this));
691 return;
692 }
693 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
694 assert(mydir);
695 adjust_subtree_auth(mydir, mds->get_nodeid());
696
697 populate_mydir();
698}
699
700void MDCache::populate_mydir()
701{
702 assert(myin);
703 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
704 assert(mydir);
705
706 dout(10) << "populate_mydir " << *mydir << dendl;
707
708 if (!mydir->is_complete()) {
709 mydir->fetch(new C_MDS_RetryOpenRoot(this));
710 return;
711 }
712
713 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
714 // A missing dirfrag; we will recreate it, and we must mark it dirty
715 // before dirtying any of the strays we create within it.
716 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
717 "recreating it now";
718 LogSegment *ls = mds->mdlog->get_current_segment();
719 mydir->state_clear(CDir::STATE_BADFRAG);
720 mydir->mark_complete();
721 mydir->mark_dirty(mydir->pre_dirty(), ls);
722 }
723
724 // open or create stray
725 uint64_t num_strays = 0;
726 for (int i = 0; i < NUM_STRAY; ++i) {
727 stringstream name;
728 name << "stray" << i;
729 CDentry *straydn = mydir->lookup(name.str());
730
731 // allow for older fs's with stray instead of stray0
732 if (straydn == NULL && i == 0)
733 straydn = mydir->lookup("stray");
734
735 if (!straydn || !straydn->get_linkage()->get_inode()) {
736 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
737 new C_MDS_RetryOpenRoot(this));
738 return;
739 }
740 assert(straydn);
741 assert(strays[i]);
742 // we make multiple passes through this method; make sure we only pin each stray once.
743 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
744 strays[i]->get(CInode::PIN_STRAY);
745 strays[i]->state_set(CInode::STATE_STRAYPINNED);
746 strays[i]->get_stickydirs();
747 }
748 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
749
750 // open all frags
751 list<frag_t> ls;
752 strays[i]->dirfragtree.get_leaves(ls);
753 for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
754 frag_t fg = *p;
755 CDir *dir = strays[i]->get_dirfrag(fg);
756 if (!dir) {
757 dir = strays[i]->get_or_open_dirfrag(this, fg);
758 }
759
760 // DamageTable applies special handling to strays: it will
761 // have damaged() us out if one is damaged.
762 assert(!dir->state_test(CDir::STATE_BADFRAG));
763
764 if (dir->get_version() == 0) {
765 dir->fetch(new C_MDS_RetryOpenRoot(this));
766 return;
767 }
768
769 if (dir->get_frag_size() > 0)
770 num_strays += dir->get_frag_size();
771 }
772 }
773
774 stray_manager.set_num_strays(num_strays);
775
776 // okay!
777 dout(10) << "populate_mydir done" << dendl;
778 assert(!open);
779 open = true;
780 mds->queue_waiters(waiting_for_open);
781
782 scan_stray_dir();
783}
784
785void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
786{
787 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
788}
789
790CDir *MDCache::get_stray_dir(CInode *in)
791{
792 string straydname;
793 in->name_stray_dentry(straydname);
794
795 CInode *strayi = get_stray();
796 assert(strayi);
797 frag_t fg = strayi->pick_dirfrag(straydname);
798 CDir *straydir = strayi->get_dirfrag(fg);
799 assert(straydir);
800 return straydir;
801}
802
803CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
804{
805 CDir *straydir = get_stray_dir(in);
806 string straydname;
807 in->name_stray_dentry(straydname);
808 CDentry *straydn = straydir->lookup(straydname);
809 if (!straydn) {
810 straydn = straydir->add_null_dentry(straydname);
811 straydn->mark_new();
812 } else {
813 assert(straydn->get_projected_linkage()->is_null());
814 }
815
816 straydn->state_set(CDentry::STATE_STRAY);
817 return straydn;
818}
819
820
821
822MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
823{
824 // inode?
825 if (info.ino)
826 return get_inode(info.ino, info.snapid);
827
828 // dir or dentry.
829 CDir *dir = get_dirfrag(info.dirfrag);
830 if (!dir) return 0;
831
832 if (info.dname.length())
833 return dir->lookup(info.dname, info.snapid);
834 else
835 return dir;
836}
837
838
839
840
841// ====================================================================
842// subtree management
843
844void MDCache::list_subtrees(list<CDir*>& ls)
845{
846 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
847 p != subtrees.end();
848 ++p)
849 ls.push_back(p->first);
850}
851
852/*
853 * adjust the dir_auth of a subtree.
854 * merge with parent and/or child subtrees, if it is appropriate.
855 * merge can ONLY happen if both parent and child have unambiguous auth.
856 */
857void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
858{
859 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
860 << " on " << *dir << dendl;
861
862 show_subtrees();
863
864 CDir *root;
865 if (dir->inode->is_base()) {
866 root = dir; // bootstrap hack.
867 if (subtrees.count(root) == 0) {
868 subtrees[root];
869 root->get(CDir::PIN_SUBTREE);
870 }
871 } else {
872 root = get_subtree_root(dir); // subtree root
873 }
874 assert(root);
875 assert(subtrees.count(root));
876 dout(7) << " current root is " << *root << dendl;
877
878 if (root == dir) {
879 // i am already a subtree.
880 dir->set_dir_auth(auth);
881 } else {
882 // i am a new subtree.
883 dout(10) << " new subtree at " << *dir << dendl;
884 assert(subtrees.count(dir) == 0);
885 subtrees[dir]; // create empty subtree bounds list for me.
886 dir->get(CDir::PIN_SUBTREE);
887
888 // set dir_auth
889 dir->set_dir_auth(auth);
890
891 // move items nested beneath me, under me.
892 set<CDir*>::iterator p = subtrees[root].begin();
893 while (p != subtrees[root].end()) {
894 set<CDir*>::iterator next = p;
895 ++next;
896 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
897 // move under me
898 dout(10) << " claiming child bound " << **p << dendl;
899 subtrees[dir].insert(*p);
900 subtrees[root].erase(p);
901 }
902 p = next;
903 }
904
905 // i am a bound of the parent subtree.
906 subtrees[root].insert(dir);
907
908 // i am now the subtree root.
909 root = dir;
910
911 // adjust recursive pop counters
912 if (adjust_pop && dir->is_auth()) {
913 utime_t now = ceph_clock_now();
914 CDir *p = dir->get_parent_dir();
915 while (p) {
916 p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
917 if (p->is_subtree_root()) break;
918 p = p->inode->get_parent_dir();
919 }
920 }
921 }
922
923 show_subtrees();
924}
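// Illustrative sketch (editor addition, not part of the original source): the
// subtrees map pairs each subtree root dirfrag with the set of dirfrags that
// bound it. After the adjustment above claims child bounds, a possible state is
//
//   subtrees[rootdir] = { dir }         // dir became a bound of its parent tree
//   subtrees[dir]     = { childbound }  // bounds formerly nested under rootdir
//
// where 'rootdir', 'dir' and 'childbound' are hypothetical dirfrags used only
// for illustration.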
925
926
927void MDCache::try_subtree_merge(CDir *dir)
928{
929 dout(7) << "try_subtree_merge " << *dir << dendl;
930 // record my old bounds
931 auto oldbounds = subtrees.at(dir);
932
933 set<CInode*> to_eval;
934 // try merge at my root
935 try_subtree_merge_at(dir, &to_eval);
936
937 // try merge at my old bounds
938 for (auto bound : oldbounds)
939 try_subtree_merge_at(bound, &to_eval);
940
941 if (!(mds->is_any_replay() || mds->is_resolve())) {
942 for(auto in : to_eval)
943 eval_subtree_root(in);
944 }
945}
946
947class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
948 CInode *in;
949 MutationRef mut;
950public:
951 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
952 void finish(int r) override {
953 mdcache->subtree_merge_writebehind_finish(in, mut);
954 }
955};
956
957void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
958{
959 dout(10) << "try_subtree_merge_at " << *dir << dendl;
960
961 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
962 dir->state_test(CDir::STATE_EXPORTBOUND) ||
963 dir->state_test(CDir::STATE_AUXSUBTREE))
964 return;
965
966 auto it = subtrees.find(dir);
967 assert(it != subtrees.end());
968
969 // merge with parent?
970 CDir *parent = dir;
971 if (!dir->inode->is_base())
972 parent = get_subtree_root(dir->get_parent_dir());
973
974 if (parent != dir && // we have a parent,
975 parent->dir_auth == dir->dir_auth) { // auth matches,
976 // merge with parent.
977 dout(10) << " subtree merge at " << *dir << dendl;
978 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
979
980 // move our bounds under the parent
981 subtrees[parent].insert(it->second.begin(), it->second.end());
982
983 // we are no longer a subtree or bound
984 dir->put(CDir::PIN_SUBTREE);
985 subtrees.erase(it);
986 subtrees[parent].erase(dir);
987
988 // adjust popularity?
989 if (adjust_pop && dir->is_auth()) {
990 utime_t now = ceph_clock_now();
991 CDir *cur = dir;
992 CDir *p = dir->get_parent_dir();
993 while (p) {
994 p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
28e407b8 995 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
7c673cae 996 if (p->is_subtree_root()) break;
28e407b8 997 cur = p;
7c673cae
FG
998 p = p->inode->get_parent_dir();
999 }
1000 }
1001
1002 if (to_eval && dir->get_inode()->is_auth())
1003 to_eval->insert(dir->get_inode());
1004
1005 show_subtrees(15);
1006 }
1007}
1008
1009void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
1010{
1011 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
1012 in->pop_and_dirty_projected_inode(mut->ls);
1013
1014 mut->apply();
1015 mds->locker->drop_locks(mut.get());
1016 mut->cleanup();
1017
1018 in->auth_unpin(this);
1019}
1020
1021void MDCache::eval_subtree_root(CInode *diri)
1022{
1023 // evaluate subtree inode filelock?
1024 // (we should scatter the filelock on subtree bounds)
1025 assert(diri->is_auth());
1026 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1027}
1028
1029
1030void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
1031{
1032 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1033 << " on " << *dir
1034 << " bounds " << bounds
1035 << dendl;
1036
1037 show_subtrees();
1038
1039 CDir *root;
1040 if (dir->ino() == MDS_INO_ROOT) {
1041 root = dir; // bootstrap hack.
1042 if (subtrees.count(root) == 0) {
1043 subtrees[root];
1044 root->get(CDir::PIN_SUBTREE);
1045 }
1046 } else {
1047 root = get_subtree_root(dir); // subtree root
1048 }
1049 assert(root);
1050 assert(subtrees.count(root));
1051 dout(7) << " current root is " << *root << dendl;
1052
1053 mds_authority_t oldauth = dir->authority();
1054
1055 if (root == dir) {
1056 // i am already a subtree.
1057 dir->set_dir_auth(auth);
1058 } else {
1059 // i am a new subtree.
1060 dout(10) << " new subtree at " << *dir << dendl;
1061 assert(subtrees.count(dir) == 0);
1062 subtrees[dir]; // create empty subtree bounds list for me.
1063 dir->get(CDir::PIN_SUBTREE);
1064
1065 // set dir_auth
1066 dir->set_dir_auth(auth);
1067
1068 // move items nested beneath me, under me.
1069 set<CDir*>::iterator p = subtrees[root].begin();
1070 while (p != subtrees[root].end()) {
1071 set<CDir*>::iterator next = p;
1072 ++next;
1073 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1074 // move under me
1075 dout(10) << " claiming child bound " << **p << dendl;
1076 subtrees[dir].insert(*p);
1077 subtrees[root].erase(p);
1078 }
1079 p = next;
1080 }
1081
1082 // i am a bound of the parent subtree.
1083 subtrees[root].insert(dir);
1084
1085 // i am now the subtree root.
1086 root = dir;
1087 }
1088
1089 set<CInode*> to_eval;
1090
1091 // verify/adjust bounds.
1092 // - these may be new, or
1093 // - beneath existing ambiguous bounds (which will be collapsed),
1094 // - but NOT beneath unambiguous bounds.
1095 for (set<CDir*>::iterator p = bounds.begin();
1096 p != bounds.end();
1097 ++p) {
1098 CDir *bound = *p;
1099
1100 // new bound?
1101 if (subtrees[dir].count(bound) == 0) {
1102 if (get_subtree_root(bound) == dir) {
1103 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1104 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1105 }
1106 else {
1107 dout(10) << " want bound " << *bound << dendl;
1108 CDir *t = get_subtree_root(bound->get_parent_dir());
1109 if (subtrees[t].count(bound) == 0) {
1110 assert(t != dir);
1111 dout(10) << " new bound " << *bound << dendl;
1112 adjust_subtree_auth(bound, t->authority());
1113 }
1114 // make sure it's nested beneath ambiguous subtree(s)
1115 while (1) {
1116 while (subtrees[dir].count(t) == 0)
1117 t = get_subtree_root(t->get_parent_dir());
1118 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1119 adjust_subtree_auth(t, auth);
1120 try_subtree_merge_at(t, &to_eval);
1121 t = get_subtree_root(bound->get_parent_dir());
1122 if (t == dir) break;
1123 }
1124 }
1125 }
1126 else {
1127 dout(10) << " already have bound " << *bound << dendl;
1128 }
1129 }
1130 // merge stray bounds?
1131 while (!subtrees[dir].empty()) {
1132 set<CDir*> copy = subtrees[dir];
1133 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1134 if (bounds.count(*p) == 0) {
1135 CDir *stray = *p;
1136 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1137 adjust_subtree_auth(stray, auth);
1138 try_subtree_merge_at(stray, &to_eval);
1139 }
1140 }
1141 // swallowing subtree may add new subtree bounds
1142 if (copy == subtrees[dir])
1143 break;
1144 }
1145
1146 // bound should now match.
1147 verify_subtree_bounds(dir, bounds);
1148
1149 show_subtrees();
1150
1151 if (!(mds->is_any_replay() || mds->is_resolve())) {
1152 for(auto in : to_eval)
1153 eval_subtree_root(in);
1154 }
1155}
1156
1157
1158/*
1159 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1160 * fragmentation as necessary to get an equivalent bounding set. That is, only
1161 * split if one of our frags spans the provided bounding set. Never merge.
1162 */
1163void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1164{
1165 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1166
1167 // sort by ino
1168 map<inodeno_t, fragset_t> byino;
1169 for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1170 byino[p->ino].insert(p->frag);
1171 dout(10) << " by ino: " << byino << dendl;
1172
1173 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1174 CInode *diri = get_inode(p->first);
1175 if (!diri)
1176 continue;
1177 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1178
1179 fragtree_t tmpdft;
1180 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1181 tmpdft.force_to_leaf(g_ceph_context, *q);
1182
1183 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
1184 frag_t fg = *q;
1185 list<frag_t> fgls;
1186 diri->dirfragtree.get_leaves_under(fg, fgls);
1187 if (fgls.empty()) {
1188 bool all = true;
1189 frag_t approx_fg = diri->dirfragtree[fg.value()];
1190 list<frag_t> ls;
1191 tmpdft.get_leaves_under(approx_fg, ls);
1192 for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
1193 if (p->second.get().count(*r) == 0) {
1194 // not bound, so the resolve message is from auth MDS of the dirfrag
1195 force_dir_fragment(diri, *r);
1196 all = false;
1197 }
1198 }
1199 if (all)
1200 fgls.push_back(approx_fg);
1201 else
1202 diri->dirfragtree.get_leaves_under(fg, fgls);
1203 }
1204 dout(10) << " frag " << fg << " contains " << fgls << dendl;
1205 for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
1206 CDir *dir = diri->get_dirfrag(*r);
1207 if (dir)
1208 bounds.insert(dir);
1209 }
1210 }
1211 }
1212}
1213
1214void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
1215{
1216 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1217 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1218
1219 set<CDir*> bounds;
1220 get_force_dirfrag_bound_set(bound_dfs, bounds);
1221 adjust_bounded_subtree_auth(dir, bounds, auth);
1222}
1223
1224void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
1225{
1226 dout(10) << "map_dirfrag_set " << dfs << dendl;
1227
1228 // group by inode
1229 map<inodeno_t, fragset_t> ino_fragset;
1230 for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
1231 ino_fragset[p->ino].insert(p->frag);
1232
1233 // get frags
1234 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1235 p != ino_fragset.end();
1236 ++p) {
1237 CInode *in = get_inode(p->first);
1238 if (!in)
1239 continue;
1240
1241 list<frag_t> fglist;
1242 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1243 in->dirfragtree.get_leaves_under(*q, fglist);
1244
1245 dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
1246 << " on " << *in << dendl;
1247
1248 for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
1249 CDir *dir = in->get_dirfrag(*q);
1250 if (dir)
1251 result.insert(dir);
1252 }
1253 }
1254}
1255
1256
1257
1258CDir *MDCache::get_subtree_root(CDir *dir)
1259{
1260 // find the underlying dir that delegates (or is about to delegate) auth
1261 while (true) {
1262 if (dir->is_subtree_root())
1263 return dir;
1264 dir = dir->get_inode()->get_parent_dir();
1265 if (!dir)
1266 return 0; // none
1267 }
1268}
1269
1270CDir *MDCache::get_projected_subtree_root(CDir *dir)
1271{
1272 // find the underlying dir that delegates (or is about to delegate) auth
1273 while (true) {
1274 if (dir->is_subtree_root())
1275 return dir;
1276 dir = dir->get_inode()->get_projected_parent_dir();
1277 if (!dir)
1278 return 0; // none
1279 }
1280}
1281
1282void MDCache::remove_subtree(CDir *dir)
1283{
1284 dout(10) << "remove_subtree " << *dir << dendl;
1285 assert(subtrees.count(dir));
1286 assert(subtrees[dir].empty());
1287 subtrees.erase(dir);
1288 dir->put(CDir::PIN_SUBTREE);
1289 if (dir->get_parent_dir()) {
1290 CDir *p = get_subtree_root(dir->get_parent_dir());
1291 assert(subtrees[p].count(dir));
1292 subtrees[p].erase(dir);
1293 }
1294}
1295
1296void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1297{
1298 assert(subtrees.count(dir));
1299 bounds = subtrees[dir];
1300}
1301
1302void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1303{
1304 if (subtrees.count(dir)) {
1305 // just copy them, dir is a subtree.
1306 get_subtree_bounds(dir, bounds);
1307 } else {
1308 // find them
1309 CDir *root = get_subtree_root(dir);
1310 for (set<CDir*>::iterator p = subtrees[root].begin();
1311 p != subtrees[root].end();
1312 ++p) {
1313 CDir *t = *p;
1314 while (t != root) {
1315 t = t->get_parent_dir();
1316 assert(t);
1317 if (t == dir) {
1318 bounds.insert(*p);
1319 continue;
1320 }
1321 }
1322 }
1323 }
1324}
1325
1326void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1327{
1328 // for debugging only.
1329 assert(subtrees.count(dir));
1330 if (bounds != subtrees[dir]) {
1331 dout(0) << "verify_subtree_bounds failed" << dendl;
1332 set<CDir*> b = bounds;
1333 for (auto &cd : subtrees[dir]) {
1334 if (bounds.count(cd)) {
1335 b.erase(cd);
1336 continue;
1337 }
1338 dout(0) << " missing bound " << *cd << dendl;
1339 }
1340 for (const auto &cd : b)
1341 dout(0) << " extra bound " << *cd << dendl;
1342 }
1343 assert(bounds == subtrees[dir]);
1344}
1345
1346void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1347{
1348 // for debugging only.
1349 assert(subtrees.count(dir));
1350
1351 // make sure that any bounds i do have are properly noted as such.
1352 int failed = 0;
1353 for (const auto &fg : bounds) {
1354 CDir *bd = get_dirfrag(fg);
1355 if (!bd) continue;
1356 if (subtrees[dir].count(bd) == 0) {
1357 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1358 failed++;
1359 }
1360 }
1361 assert(failed == 0);
1362}
1363
1364void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1365{
1366 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1367 << " to " << *newdir << dendl;
1368 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1369}
1370
1371void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1372{
1373 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1374
1375 //show_subtrees();
1376 utime_t now = ceph_clock_now();
1377
1378 CDir *newdir = diri->get_parent_dir();
1379
1380 if (pop) {
1381 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1382 assert(p != projected_subtree_renames.end());
1383 assert(!p->second.empty());
1384 assert(p->second.front().first == olddir);
1385 assert(p->second.front().second == newdir);
1386 p->second.pop_front();
1387 if (p->second.empty())
1388 projected_subtree_renames.erase(p);
1389 }
1390
1391 // adjust subtree
1392 list<CDir*> dfls;
1393 // make sure subtree dirfrags are at the front of the list
1394 diri->get_subtree_dirfrags(dfls);
1395 diri->get_nested_dirfrags(dfls);
1396 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
1397 CDir *dir = *p;
1398
1399 dout(10) << "dirfrag " << *dir << dendl;
1400 CDir *oldparent = get_subtree_root(olddir);
1401 dout(10) << " old parent " << *oldparent << dendl;
1402 CDir *newparent = get_subtree_root(newdir);
1403 dout(10) << " new parent " << *newparent << dendl;
1404
1405 if (olddir != newdir)
1406 mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);
1407
1408 if (oldparent == newparent) {
1409 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1410 } else if (dir->is_subtree_root()) {
1411 // children are fine. change parent.
1412 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1413 assert(subtrees[oldparent].count(dir));
1414 subtrees[oldparent].erase(dir);
1415 assert(subtrees.count(newparent));
1416 subtrees[newparent].insert(dir);
1417 // caller is responsible for 'eval diri'
1418 try_subtree_merge_at(dir, NULL, false);
1419 } else {
1420 // mid-subtree.
1421
1422 // see if any old bounds move to the new parent.
1423 list<CDir*> tomove;
1424 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1425 p != subtrees[oldparent].end();
1426 ++p) {
1427 CDir *bound = *p;
1428 CDir *broot = get_subtree_root(bound->get_parent_dir());
1429 if (broot != oldparent) {
1430 assert(broot == newparent);
1431 tomove.push_back(bound);
1432 }
1433 }
1434 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1435 CDir *bound = *p;
1436 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1437 subtrees[oldparent].erase(bound);
1438 subtrees[newparent].insert(bound);
1439 }
1440
1441 // did auth change?
1442 if (oldparent->authority() != newparent->authority()) {
1443 adjust_subtree_auth(dir, oldparent->authority(), false);
1444 // caller is responsible for 'eval diri'
1445 try_subtree_merge_at(dir, NULL, false);
1446 }
1447 }
1448
1449 if (olddir != newdir)
1450 mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
1451 }
1452
1453 show_subtrees();
1454}
1455
1456
1457void MDCache::get_fullauth_subtrees(set<CDir*>& s)
1458{
1459 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1460 p != subtrees.end();
1461 ++p) {
1462 CDir *root = p->first;
1463 if (root->is_full_dir_auth())
1464 s.insert(root);
1465 }
1466}
1467void MDCache::get_auth_subtrees(set<CDir*>& s)
1468{
1469 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1470 p != subtrees.end();
1471 ++p) {
1472 CDir *root = p->first;
1473 if (root->is_auth())
1474 s.insert(root);
1475 }
1476}
1477
1478
1479// count.
1480
1481int MDCache::num_subtrees()
1482{
1483 return subtrees.size();
1484}
1485
1486int MDCache::num_subtrees_fullauth()
1487{
1488 int n = 0;
1489 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1490 p != subtrees.end();
1491 ++p) {
1492 CDir *root = p->first;
1493 if (root->is_full_dir_auth())
1494 n++;
1495 }
1496 return n;
1497}
1498
1499int MDCache::num_subtrees_fullnonauth()
1500{
1501 int n = 0;
1502 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
1503 p != subtrees.end();
1504 ++p) {
1505 CDir *root = p->first;
1506 if (root->is_full_dir_nonauth())
1507 n++;
1508 }
1509 return n;
1510}
1511
1512
1513
1514// ===================================
1515// journal and snap/cow helpers
1516
1517
1518/*
1519 * find first inode in cache that follows given snapid. otherwise, return current.
1520 */
1521CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1522{
1523 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1524 assert(in->last == CEPH_NOSNAP);
1525
1526 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1527 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1528 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1529 in = p->second;
1530 }
1531
1532 return in;
1533}
1534
1535
1536/*
1537 * note: i'm currently cheating wrt dirty and inode.version on cow
1538 * items. instead of doing a full dir predirty, i just take the
1539 * original item's version, and set the dirty flag (via
1540 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1541 * means a special case in the dir commit clean sweep assertions.
1542 * bah.
1543 */
1544CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1545{
1546 assert(last >= in->first);
1547
1548 CInode *oldin = new CInode(this, true, in->first, last);
1549 oldin->inode = *in->get_previous_projected_inode();
1550 oldin->symlink = in->symlink;
1551 oldin->xattrs = *in->get_previous_projected_xattrs();
1552 oldin->inode.trim_client_ranges(last);
1553
1554 if (in->first < in->oldest_snap)
1555 in->oldest_snap = in->first;
1556
1557 in->first = last+1;
1558
1559 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1560 add_inode(oldin);
1561
1562 if (in->last != CEPH_NOSNAP) {
1563 CInode *head_in = get_inode(in->ino());
1564 assert(head_in);
1565 if (head_in->split_need_snapflush(oldin, in)) {
1566 oldin->client_snap_caps = in->client_snap_caps;
1567 for (const auto &p : in->client_snap_caps) {
1568 SimpleLock *lock = oldin->get_lock(p.first);
1569 assert(lock);
1570 for (const auto &q : p.second) {
1571 oldin->auth_pin(lock);
1572 lock->set_state(LOCK_SNAP_SYNC); // gathering
1573 lock->get_wrlock(true);
1574 (void)q; /* unused */
1575 }
1576 }
1577 }
1578 return oldin;
1579 }
1580
1581 if (!in->client_caps.empty()) {
1582 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1583 // clone caps?
1584 for (auto &p : in->client_caps) {
1585 client_t client = p.first;
1586 Capability *cap = p.second;
1587 int issued = cap->issued();
1588 if ((issued & CEPH_CAP_ANY_WR) &&
1589 cap->client_follows < last) {
1590 // note in oldin
1591 for (int i = 0; i < num_cinode_locks; i++) {
1592 if (issued & cinode_lock_info[i].wr_caps) {
1593 int lockid = cinode_lock_info[i].lock;
1594 SimpleLock *lock = oldin->get_lock(lockid);
1595 assert(lock);
1596 oldin->client_snap_caps[lockid].insert(client);
1597 oldin->auth_pin(lock);
1598 lock->set_state(LOCK_SNAP_SYNC); // gathering
1599 lock->get_wrlock(true);
1600 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1601 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1602 }
1603 }
1604 cap->client_follows = last;
1605
1606 // we need snapflushes for any intervening snaps
1607 dout(10) << " snaps " << snaps << dendl;
1608 for (auto q = snaps.lower_bound(oldin->first);
1609 q != snaps.end() && *q <= last;
1610 ++q) {
1611 in->add_need_snapflush(oldin, *q, client);
1612 }
1613 } else {
1614 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1615 }
1616 }
1617 }
1618 return oldin;
1619}
1620
1621void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1622 CDentry *dn, snapid_t follows,
1623 CInode **pcow_inode, CDentry::linkage_t *dnl)
1624{
1625 if (!dn) {
1626 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1627 return;
1628 }
1629 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1630 assert(dn->is_auth());
1631
1632 // nothing to cow on a null dentry, fix caller
1633 if (!dnl)
1634 dnl = dn->get_projected_linkage();
1635 assert(!dnl->is_null());
1636
1637 if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
1638 // multiversion inode.
1639 CInode *in = dnl->get_inode();
1640 SnapRealm *realm = NULL;
1641
1642 if (in->get_projected_parent_dn() != dn) {
1643 assert(follows == CEPH_NOSNAP);
1644 realm = dn->dir->inode->find_snaprealm();
1645 snapid_t dir_follows = realm->get_newest_snap();
1646
1647 if (dir_follows+1 > dn->first) {
1648 snapid_t oldfirst = dn->first;
1649 dn->first = dir_follows+1;
1650 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1651 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
1652 oldfirst, dir_follows);
1653 olddn->pre_dirty();
1654 dout(10) << " olddn " << *olddn << dendl;
1655 metablob->add_remote_dentry(olddn, true);
1656 mut->add_cow_dentry(olddn);
1657 // FIXME: adjust link count here? hmm.
1658
1659 if (dir_follows+1 > in->first)
1660 in->cow_old_inode(dir_follows, false);
1661 }
1662 }
1663
1664 if (in->snaprealm) {
1665 realm = in->snaprealm;
1666 follows = realm->get_newest_seq();
1667 } else
1668 follows = dir_follows;
1669 } else {
1670 realm = in->find_snaprealm();
1671 if (follows == CEPH_NOSNAP)
1672 follows = realm->get_newest_seq();
1673 }
1674
1675 // already cloned?
1676 if (follows < in->first) {
1677 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1678 return;
1679 }
1680
1681 if (!realm->has_snaps_in_range(in->first, follows)) {
1682 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1683 in->first = follows + 1;
1684 return;
1685 }
1686
1687 in->cow_old_inode(follows, false);
1688
1689 } else {
1690 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1691 if (follows == CEPH_NOSNAP)
1692 follows = realm->get_newest_seq();
1693
1694 // already cloned?
1695 if (follows < dn->first) {
1696 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1697 return;
1698 }
1699
1700 // update dn.first before adding old dentry to cdir's map
1701 snapid_t oldfirst = dn->first;
1702 dn->first = follows+1;
1703
1704 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1705
1706 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1707 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1708 if (in)
1709 in->first = follows+1;
1710 return;
1711 }
1712
1713 dout(10) << " dn " << *dn << dendl;
1714 if (in) {
1715 CInode *oldin = cow_inode(in, follows);
1716 mut->add_cow_inode(oldin);
1717 if (pcow_inode)
1718 *pcow_inode = oldin;
1719 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
1720 oldin->inode.version = olddn->pre_dirty();
1721 dout(10) << " olddn " << *olddn << dendl;
1722 bool need_snapflush = !oldin->client_snap_caps.empty();
1723 if (need_snapflush)
1724 mut->ls->open_files.push_back(&oldin->item_open_file);
1725 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1726 mut->add_cow_dentry(olddn);
1727 } else {
1728 assert(dnl->is_remote());
1729 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
1730 oldfirst, follows);
1731 olddn->pre_dirty();
1732 dout(10) << " olddn " << *olddn << dendl;
1733 metablob->add_remote_dentry(olddn, true);
1734 mut->add_cow_dentry(olddn);
1735 }
1736 }
1737}
1738
1739
1740void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1741 CInode *in, snapid_t follows,
1742 CInode **pcow_inode)
1743{
1744 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1745 CDentry *dn = in->get_projected_parent_dn();
1746 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1747}
1748
1749void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1750{
1751 if (in->is_base()) {
1752 metablob->add_root(true, in, in->get_projected_inode());
1753 } else {
1754 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1755 follows = in->first - 1;
1756 CDentry *dn = in->get_projected_parent_dn();
1757 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1758 journal_cow_dentry(mut, metablob, dn, follows);
1759 if (in->get_projected_inode()->is_backtrace_updated()) {
1760 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1761 in->get_previous_projected_inode()->layout.pool_id;
1762 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1763 } else {
1764 metablob->add_primary_dentry(dn, in, true);
1765 }
1766 }
1767}
1768
1769
1770
1771// nested ---------------------------------------------------------------
1772
1773void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1774 int linkunlink, SnapRealm *prealm)
1775{
1776 CDentry *parentdn = cur->get_projected_parent_dn();
1777 CInode::mempool_inode *curi = cur->get_projected_inode();
1778
1779 if (cur->first > first)
1780 first = cur->first;
1781
1782 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1783 << " " << *cur << dendl;
1784 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1785 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1786
1787 /*
1788 * FIXME. this incompletely propagates rstats to _old_ parents
1789 * (i.e. shortly after a directory rename). but we need full
1790 * blown hard link backpointers to make this work properly...
1791 */
1792 snapid_t floor = parentdn->first;
1793 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1794
1795 if (!prealm)
1796 prealm = parent->inode->find_snaprealm();
1797 const set<snapid_t> snaps = prealm->get_snaps();
1798
1799 if (cur->last != CEPH_NOSNAP) {
1800 assert(cur->dirty_old_rstats.empty());
1801 set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
1802 if (q == snaps.end() || *q > cur->last)
1803 return;
1804 }
1805
1806 if (cur->last >= floor) {
1807 bool update = true;
1808 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1809 // the rename src inode is not projected in the slave rename prep case, so we should
1810 // avoid updating the inode.
1811 assert(linkunlink < 0);
1812 assert(cur->is_frozen_inode());
1813 update = false;
1814 }
1815 _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
1816 linkunlink, update);
1817 }
1818
1819 if (g_conf->mds_snap_rstat) {
1820 for (const auto &p : cur->dirty_old_rstats) {
1821 auto &old = cur->old_inodes[p];
1822 snapid_t ofirst = std::max(old.first, floor);
1823 auto it = snaps.lower_bound(ofirst);
1824 if (it == snaps.end() || *it > p)
1825 continue;
1826 if (p >= floor)
1827 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
7c673cae
FG
1828 }
1829 }
1830 cur->dirty_old_rstats.clear();
1831}
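/*
 * A minimal standalone sketch (illustrative only, not the MDS types) of the
 * snapshot-range check performed above: an old (snapped) inode's rstat is only
 * projected into the frag if some snapshot actually falls inside
 * [max(first, floor), cur->last].
 *
 *   bool range_has_snap(const std::set<snapid_t>& snaps,
 *                       snapid_t first, snapid_t last) {
 *     auto q = snaps.lower_bound(first);       // first snapshot >= first
 *     return q != snaps.end() && *q <= last;   // and it must not lie past last
 *   }
 *
 *   // e.g. snaps = {4, 9}, floor = 3, inode range [5, 8]:
 *   //   max(5, 3) == 5, the next snapshot is 9 > 8, so nothing is projected.
 */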
1832
1833
94b18763 1834void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
1835 CDir *parent, int linkunlink, bool update_inode)
1836{
1837 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1838 dout(20) << " inode rstat " << inode.rstat << dendl;
1839 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1840 nest_info_t delta;
1841 if (linkunlink == 0) {
1842 delta.add(inode.rstat);
1843 delta.sub(inode.accounted_rstat);
1844 } else if (linkunlink < 0) {
1845 delta.sub(inode.accounted_rstat);
1846 } else {
1847 delta.add(inode.rstat);
1848 }
1849 dout(20) << " delta " << delta << dendl;
1850
1851 if (update_inode)
1852 inode.accounted_rstat = inode.rstat;
1853
1854 while (last >= ofirst) {
1855 /*
1856 * pick fnode version to update. at each iteration, we want to
1857 * pick a segment ending in 'last' to update. split as necessary
1858 * to make that work. then, adjust first up so that we only
1859 * update one segment at a time. then loop to cover the whole
1860 * [ofirst,last] interval.
1861 */
1862 nest_info_t *prstat;
1863 snapid_t first;
1864 fnode_t *pf = parent->get_projected_fnode();
1865 if (last == CEPH_NOSNAP) {
1866 if (g_conf->mds_snap_rstat)
1867 first = MAX(ofirst, parent->first);
1868 else
1869 first = parent->first;
1870 prstat = &pf->rstat;
1871 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1872
1873 if (first > parent->first &&
1874 !(pf->rstat == pf->accounted_rstat)) {
1875 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1876 << parent->first << "," << (first-1) << "] "
1877 << " " << *prstat << "/" << pf->accounted_rstat
1878 << dendl;
1879 parent->dirty_old_rstat[first-1].first = parent->first;
1880 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1881 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1882 }
1883 parent->first = first;
1884 } else if (!g_conf->mds_snap_rstat) {
1885 // drop snapshots' rstats
1886 break;
1887 } else if (last >= parent->first) {
1888 first = parent->first;
1889 parent->dirty_old_rstat[last].first = first;
1890 parent->dirty_old_rstat[last].rstat = pf->rstat;
1891 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1892 prstat = &parent->dirty_old_rstat[last].rstat;
1893 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1894 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1895 } else {
1896 // be careful, dirty_old_rstat is a _sparse_ map.
1897 // sorry, this is ugly.
1898 first = ofirst;
1899
1900 // find any intersection with last
94b18763
FG
1901 auto it = parent->dirty_old_rstat.lower_bound(last);
1902 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1903 dout(20) << " no dirty_old_rstat entry with last >= " << last << dendl;
1904 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1905 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1906 first = parent->dirty_old_rstat.rbegin()->first+1;
1907 }
1908 } else {
94b18763
FG
1909 // *it last is >= last
1910 if (it->second.first <= last) {
1911 // *it intersects [first,last]
1912 if (it->second.first < first) {
1913 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1914 parent->dirty_old_rstat[first-1] = it->second;
1915 it->second.first = first;
7c673cae 1916 }
94b18763
FG
1917 if (it->second.first > first)
1918 first = it->second.first;
1919 if (last < it->first) {
1920 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1921 parent->dirty_old_rstat[last] = it->second;
1922 it->second.first = last+1;
7c673cae
FG
1923 }
1924 } else {
94b18763
FG
1925 // *it is to the _right_ of [first,last]
1926 it = parent->dirty_old_rstat.lower_bound(first);
1927 // new *it last is >= first
1928 if (it->second.first <= last && // new *it isn't also to the right, and
1929 it->first >= first) { // it intersects our first bit,
1930 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1931 first = it->first+1;
7c673cae
FG
1932 }
1933 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1934 }
1935 }
1936 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1937 parent->dirty_old_rstat[last].first = first;
1938 prstat = &parent->dirty_old_rstat[last].rstat;
1939 }
1940
1941 // apply
1942 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1943 assert(last >= first);
1944 prstat->add(delta);
1945 if (update_inode)
1946 inode.accounted_rstat = inode.rstat;
1947 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1948
1949 last = first-1;
1950 }
1951}
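/*
 * The loop above walks [ofirst,last] from newest to oldest, carving it into
 * segments that line up with the head fnode and the (sparse) dirty_old_rstat
 * map, splitting existing entries where they straddle a boundary.  A minimal
 * standalone sketch of that splitting idea on plain STL types (each map entry
 * is keyed by its range end and records its own start; illustrative only):
 *
 *   struct seg { snapid_t first; };
 *   // split the range containing 'cut' so that one entry ends exactly at 'cut'
 *   void split_at(std::map<snapid_t, seg>& ranges, snapid_t cut) {
 *     auto it = ranges.lower_bound(cut);            // first range with end >= cut
 *     if (it == ranges.end() || it->second.first > cut || it->first == cut)
 *       return;                                     // nothing straddles 'cut'
 *     ranges[cut] = it->second;                     // left piece  [first, cut]
 *     it->second.first = cut + 1;                   // right piece [cut+1, end]
 *   }
 */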
1952
1953void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1954 snapid_t ofirst, snapid_t last,
1955 CInode *pin, bool cow_head)
1956{
1957 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1958 dout(20) << " frag rstat " << rstat << dendl;
1959 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1960 nest_info_t delta = rstat;
1961 delta.sub(accounted_rstat);
1962 dout(20) << " delta " << delta << dendl;
1963
1964 while (last >= ofirst) {
94b18763 1965 CInode::mempool_inode *pi;
7c673cae
FG
1966 snapid_t first;
1967 if (last == pin->last) {
1968 pi = pin->get_projected_inode();
1969 first = MAX(ofirst, pin->first);
1970 if (first > pin->first) {
94b18763 1971 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1972 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1973 }
1974 } else {
1975 if (last >= pin->first) {
1976 first = pin->first;
1977 pin->cow_old_inode(last, cow_head);
1978 } else {
1979 // our life is easier here because old_inodes is not sparse
1980 // (although it may not begin at snapid 1)
94b18763
FG
1981 auto it = pin->old_inodes.lower_bound(last);
1982 if (it == pin->old_inodes.end()) {
7c673cae
FG
1983 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1984 break;
1985 }
94b18763 1986 first = it->second.first;
7c673cae 1987 if (first > last) {
94b18763 1988 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1989 //assert(p == pin->old_inodes.begin());
1990 break;
1991 }
94b18763
FG
1992 if (it->first > last) {
1993 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1994 << (last+1) << "," << it->first << "]" << dendl;
1995 pin->old_inodes[last] = it->second;
1996 it->second.first = last+1;
1997 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1998 }
1999 }
2000 if (first < ofirst) {
2001 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
2002 << first << "," << ofirst-1 << "]" << dendl;
2003 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
2004 pin->dirty_old_rstats.insert(ofirst-1);
2005 pin->old_inodes[last].first = first = ofirst;
2006 }
2007 pi = &pin->old_inodes[last].inode;
2008 pin->dirty_old_rstats.insert(last);
2009 }
2010 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
2011 pi->rstat.add(delta);
2012 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
2013
2014 last = first-1;
2015 }
2016}
2017
28e407b8 2018void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
7c673cae
FG
2019{
2020 if (!in->is_auth() || in->is_frozen())
2021 return;
2022
94b18763 2023 auto i = in->get_projected_inode();
7c673cae
FG
2024
2025 if (!i->quota.is_enable())
2026 return;
2027
2028 for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
2029 it != in->client_caps.end();
2030 ++it) {
2031 Session *session = mds->get_session(it->first);
2032 if (!session || !session->connection ||
2033 !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
2034 continue;
2035
2036 Capability *cap = it->second;
28e407b8
AA
2037
2038 if (exclude_ct >= 0 && exclude_ct != it->first)
2039 goto update;
2040
7c673cae
FG
2041 if (cap->last_rbytes == i->rstat.rbytes &&
2042 cap->last_rsize == i->rstat.rsize())
2043 continue;
2044
2045 if (i->quota.max_files > 0) {
2046 if (i->rstat.rsize() >= i->quota.max_files)
2047 goto update;
2048
2049 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2050 abs(cap->last_rsize - i->rstat.rsize()))
2051 goto update;
2052 }
2053
2054 if (i->quota.max_bytes > 0) {
2055 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2056 goto update;
2057
2058 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2059 abs(cap->last_rbytes - i->rstat.rbytes))
2060 goto update;
2061 }
2062
2063 continue;
2064
2065update:
2066 cap->last_rsize = i->rstat.rsize();
2067 cap->last_rbytes = i->rstat.rbytes;
2068
2069 MClientQuota *msg = new MClientQuota();
2070 msg->ino = in->ino();
2071 msg->rstat = i->rstat;
2072 msg->quota = i->quota;
2073 mds->send_message_client_counted(msg, session->connection);
2074 }
181888fb 2075 for (const auto &it : in->get_replicas()) {
7c673cae
FG
2076 MGatherCaps *msg = new MGatherCaps;
2077 msg->ino = in->ino();
181888fb 2078 mds->send_message_mds(msg, it.first);
7c673cae
FG
2079 }
2080}
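/*
 * The thresholds above keep quota updates infrequent until usage nears a
 * limit: a client is re-notified once rbytes passes 7/8 of max_bytes (or
 * rsize reaches max_files), or once usage has moved by more than 1/16 of the
 * previously observed distance to the limit.  A worked sketch with
 * hypothetical numbers:
 *
 *   // max_bytes   = 1024 MiB (quota)
 *   // last_rbytes =  100 MiB (usage when we last notified this client)
 *   // rbytes      =  180 MiB (usage now)
 *   //
 *   // soft limit: 1024 - (1024 >> 3) = 896 MiB  -> not reached yet
 *   // movement:   (1024 - 100) >> 4 ~= 58 MiB  <  |100 - 180| = 80 MiB
 *   //             -> resend MClientQuota to this client
 */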
2081
2082/*
2083 * NOTE: we _have_ to delay the scatter if we are called during a
2084 * rejoin, because we can't twiddle locks between when the
2085 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2086 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2087 * (no requests), and a survivor acks immediately. _except_ that
2088 * during rejoin_(weak|strong) processing, we may complete a lock
2089 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2090 * scatterlock state in that case or the lock states will get out of
2091 * sync between the auth and replica.
2092 *
2093 * the simple solution is to never do the scatter here. instead, put
2094 * the scatterlock on a list if it isn't already wrlockable. this is
2095 * probably the best plan anyway, since we avoid too many
2096 * scatters/locks under normal usage.
2097 */
2098/*
2099 * some notes on dirlock/nestlock scatterlock semantics:
2100 *
2101 * the fragstat (dirlock) will never be updated without
2102 * dirlock+nestlock wrlock held by the caller.
2103 *
2104 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2105 * data is pushed up the tree. this could be changed with some
2106 * restructuring here, but in its current form we ensure that the
2107 * fragstat+rstat _always_ reflect an accurate summation over the dir
2108 * frag, which is nice. and, we only need to track frags that need to
2109 * be nudged (and not inodes with pending rstat changes that need to
2110 * be pushed into the frag). a consequence of this is that the
2111 * accounted_rstat on scatterlock sync may not match our current
2112 * rstat. this is normal and expected.
2113 */
2114void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2115 CInode *in, CDir *parent,
2116 int flags, int linkunlink,
2117 snapid_t cfollows)
2118{
2119 bool primary_dn = flags & PREDIRTY_PRIMARY;
2120 bool do_parent_mtime = flags & PREDIRTY_DIR;
2121 bool shallow = flags & PREDIRTY_SHALLOW;
2122
2123 assert(mds->mdlog->entry_is_open());
2124
2125 // make sure stamp is set
2126 if (mut->get_mds_stamp() == utime_t())
2127 mut->set_mds_stamp(ceph_clock_now());
2128
2129 if (in->is_base())
2130 return;
2131
2132 dout(10) << "predirty_journal_parents"
2133 << (do_parent_mtime ? " do_parent_mtime":"")
2134 << " linkunlink=" << linkunlink
2135 << (primary_dn ? " primary_dn":" remote_dn")
2136 << (shallow ? " SHALLOW":"")
2137 << " follows " << cfollows
2138 << " " << *in << dendl;
2139
2140 if (!parent) {
2141 assert(primary_dn);
2142 parent = in->get_projected_parent_dn()->get_dir();
2143 }
2144
2145 if (flags == 0 && linkunlink == 0) {
2146 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2147 blob->add_dir_context(parent);
2148 return;
2149 }
2150
2151 // build list of inodes to wrlock, dirty, and update
2152 list<CInode*> lsi;
2153 CInode *cur = in;
2154 CDentry *parentdn = NULL;
2155 bool first = true;
2156 while (parent) {
2157 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2158 assert(parent->is_auth());
2159
2160 // opportunistically adjust parent dirfrag
2161 CInode *pin = parent->get_inode();
2162
2163 // inode -> dirfrag
2164 mut->auth_pin(parent);
2165 mut->add_projected_fnode(parent);
2166
2167 fnode_t *pf = parent->project_fnode();
2168 pf->version = parent->pre_dirty();
2169
2170 if (do_parent_mtime || linkunlink) {
2171 assert(mut->wrlocks.count(&pin->filelock));
2172 assert(mut->wrlocks.count(&pin->nestlock));
2173 assert(cfollows == CEPH_NOSNAP);
2174
2175 // update stale fragstat/rstat?
2176 parent->resync_accounted_fragstat();
2177 parent->resync_accounted_rstat();
2178
2179 if (do_parent_mtime) {
2180 pf->fragstat.mtime = mut->get_op_stamp();
2181 pf->fragstat.change_attr++;
2182 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << *parent << dendl;
2183 if (pf->fragstat.mtime > pf->rstat.rctime) {
2184 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2185 pf->rstat.rctime = pf->fragstat.mtime;
2186 } else {
2187 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2188 }
2189 }
2190 if (linkunlink) {
2191 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2192 if (in->is_dir()) {
2193 pf->fragstat.nsubdirs += linkunlink;
2194 //pf->rstat.rsubdirs += linkunlink;
2195 } else {
2196 pf->fragstat.nfiles += linkunlink;
2197 //pf->rstat.rfiles += linkunlink;
2198 }
2199 }
2200 }
2201
2202 // rstat
2203 if (!primary_dn) {
2204 // don't update parent this pass
2205 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2206 pin->versionlock.can_wrlock())) {
2207 dout(20) << " unwritable parent nestlock " << pin->nestlock
2208 << ", marking dirty rstat on " << *cur << dendl;
2209 cur->mark_dirty_rstat();
2210 } else {
2211 // if we don't hold a wrlock reference on this nestlock, take one,
2212 // because we are about to write into the dirfrag fnode and that needs
2213 // to commit before the lock can cycle.
2214 if (linkunlink) {
2215 assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2216 }
2217
2218 if (mut->wrlocks.count(&pin->nestlock) == 0) {
2219 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2220 mds->locker->wrlock_force(&pin->nestlock, mut);
2221 }
2222
2223 // now we can project the inode rstat diff into the dirfrag
2224 SnapRealm *prealm = pin->find_snaprealm();
2225
2226 snapid_t follows = cfollows;
2227 if (follows == CEPH_NOSNAP)
2228 follows = prealm->get_newest_seq();
2229
2230 snapid_t first = follows+1;
2231
2232 // first, if the frag is stale, bring it back in sync.
2233 parent->resync_accounted_rstat();
2234
2235 // now push inode rstats into frag
2236 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2237 cur->clear_dirty_rstat();
2238 }
2239
2240 bool stop = false;
2241 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2242 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2243 stop = true;
2244 }
2245
2246 // delay propagating until later?
2247 if (!stop && !first &&
2248 g_conf->mds_dirstat_min_interval > 0) {
2249 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2250 if (since_last_prop < g_conf->mds_dirstat_min_interval) {
2251 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2252 << " < " << g_conf->mds_dirstat_min_interval
2253 << ", stopping" << dendl;
2254 stop = true;
2255 } else {
2256 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2257 }
2258 }
2259
2260 // can cast only because i'm passing nowait=true in the sole user
2261 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2262 if (!stop &&
2263 mut->wrlocks.count(&pin->nestlock) == 0 &&
2264 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2265 //true
2266 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2267 )) { // ** do not initiate.. see above comment **
2268 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2269 << " on " << *pin << dendl;
2270 stop = true;
2271 }
2272 if (stop) {
2273 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2274 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2275 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2276 mut->add_updated_lock(&pin->nestlock);
2277 if (do_parent_mtime || linkunlink) {
2278 mds->locker->mark_updated_scatterlock(&pin->filelock);
2279 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2280 mut->add_updated_lock(&pin->filelock);
2281 }
2282 break;
2283 }
2284 if (!mut->wrlocks.count(&pin->versionlock))
2285 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2286
2287 assert(mut->wrlocks.count(&pin->nestlock) ||
2288 mut->is_slave());
2289
2290 pin->last_dirstat_prop = mut->get_mds_stamp();
2291
2292 // dirfrag -> diri
2293 mut->auth_pin(pin);
2294 mut->add_projected_inode(pin);
2295 lsi.push_front(pin);
2296
2297 pin->pre_cow_old_inode(); // avoid cow mayhem!
2298
94b18763
FG
2299 auto &pi = pin->project_inode();
2300 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2301
2302 // dirstat
2303 if (do_parent_mtime || linkunlink) {
2304 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2305 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2306 bool touched_mtime = false, touched_chattr = false;
94b18763 2307 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2308 pf->accounted_fragstat = pf->fragstat;
2309 if (touched_mtime)
94b18763 2310 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2311 if (touched_chattr)
94b18763
FG
2312 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2313 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2314
2315 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2316 if (pi.inode.dirstat.size() < 0)
7c673cae 2317 assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
94b18763 2318 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2319 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2320 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2321 << ", dirfrag has " << pf->fragstat;
2322
2323 // trust the dirfrag for now
94b18763 2324 pi.inode.dirstat = pf->fragstat;
7c673cae
FG
2325
2326 assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
2327 }
2328 }
2329 }
2330
2331 /*
2332 * the rule here is to follow the _oldest_ parent with dirty rstat
2333 * data. if we don't propagate all data, we add ourselves to the
2334 * nudge list. that way all rstat data will (eventually) get
2335 * pushed up the tree.
2336 *
2337 * actually, no. for now, silently drop rstats for old parents. we need
2338 * hard link backpointers to do the above properly.
2339 */
2340
2341 // stop?
2342 if (pin->is_base())
2343 break;
2344 parentdn = pin->get_projected_parent_dn();
2345 assert(parentdn);
2346
2347 // rstat
2348 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2349
2350 // first, if the frag is stale, bring it back in sync.
2351 parent->resync_accounted_rstat();
2352
2353 if (g_conf->mds_snap_rstat) {
94b18763
FG
2354 for (auto &p : parent->dirty_old_rstat) {
2355 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2356 p.first, pin, true);
2357 }
7c673cae
FG
2358 }
2359 parent->dirty_old_rstat.clear();
2360 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2361
2362 pf->accounted_rstat = pf->rstat;
2363
2364 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2365 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2366 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2367 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2368 << ", dirfrag has " << pf->rstat;
2369
2370 // trust the dirfrag for now
94b18763 2371 pi.inode.rstat = pf->rstat;
7c673cae
FG
2372
2373 assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
2374 }
2375 }
2376
2377 parent->check_rstats();
2378 broadcast_quota_to_client(pin);
2379 // next parent!
2380 cur = pin;
2381 parent = parentdn->get_dir();
2382 linkunlink = 0;
2383 do_parent_mtime = false;
2384 primary_dn = true;
2385 first = false;
2386 }
2387
2388 // now, stick it in the blob
2389 assert(parent);
2390 assert(parent->is_auth());
2391 blob->add_dir_context(parent);
2392 blob->add_dir(parent, true);
2393 for (list<CInode*>::iterator p = lsi.begin();
2394 p != lsi.end();
2395 ++p) {
2396 CInode *cur = *p;
2397 journal_dirty_inode(mut.get(), blob, cur);
2398 }
2399
2400}
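/*
 * Callers generally pair this with journal_dirty_inode(): project and predirty
 * the change up the parent chain first, then journal the inode itself into the
 * same EMetaBlob.  A rough sketch of that calling pattern (simplified; 'cur',
 * 'mdr' and 'le' follow the naming conventions used elsewhere in this file,
 * and the exact flags vary per operation):
 *
 *   auto &pi = cur->project_inode();
 *   pi.inode.version = cur->pre_dirty();
 *   // ... apply the requested change to pi.inode ...
 *   mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
 *   mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
 */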
2401
2402
2403
2404
2405
2406// ===================================
2407// slave requests
2408
2409
2410/*
2411 * some handlers for master requests with slaves. we need to make
2412 * sure slaves journal commits before we forget we mastered them and
2413 * remove them from the uncommitted_masters map (used during recovery
2414 * to commit|abort slaves).
2415 */
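/*
 * A rough sketch of the commit handshake assumed here (message and event
 * names as used in this file):
 *
 *   master                                    slave
 *   ------                                    -----
 *   journal EUpdate; add reqid to
 *   uncommitted_masters
 *                                             journal ESlaveUpdate OP_PREPARE
 *   ... request completes ...
 *                                             journal ESlaveUpdate OP_COMMIT
 *                                       <---  MMDSSlaveRequest OP_COMMITTED
 *   committed_master_slave(): drop slave
 *   from the set; when the set is empty,
 *   log_master_commit() journals ECommitted
 */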
2416struct C_MDC_CommittedMaster : public MDCacheLogContext {
2417 metareqid_t reqid;
2418 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2419 void finish(int r) override {
2420 mdcache->_logged_master_commit(reqid);
2421 }
2422};
2423
2424void MDCache::log_master_commit(metareqid_t reqid)
2425{
2426 dout(10) << "log_master_commit " << reqid << dendl;
2427 uncommitted_masters[reqid].committing = true;
2428 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2429 new C_MDC_CommittedMaster(this, reqid));
2430}
2431
2432void MDCache::_logged_master_commit(metareqid_t reqid)
2433{
2434 dout(10) << "_logged_master_commit " << reqid << dendl;
2435 assert(uncommitted_masters.count(reqid));
2436 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2437 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2438 uncommitted_masters.erase(reqid);
2439}
2440
2441// while active...
2442
2443void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2444{
2445 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2446 assert(uncommitted_masters.count(r));
2447 uncommitted_masters[r].slaves.erase(from);
2448 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2449 log_master_commit(r);
2450}
2451
2452void MDCache::logged_master_update(metareqid_t reqid)
2453{
2454 dout(10) << "logged_master_update " << reqid << dendl;
2455 assert(uncommitted_masters.count(reqid));
2456 uncommitted_masters[reqid].safe = true;
2457 if (pending_masters.count(reqid)) {
2458 pending_masters.erase(reqid);
2459 if (pending_masters.empty())
2460 process_delayed_resolve();
2461 }
2462}
2463
2464/*
2465 * Master may crash after receiving all slaves' commit acks, but before journalling
2466 * the final commit. Slaves may crash after journalling the slave commit, but before
2467 * sending commit ack to the master. Commit masters with no uncommitted slave when
2468 * resolve finishes.
2469 */
2470void MDCache::finish_committed_masters()
2471{
2472 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2473 p != uncommitted_masters.end();
2474 ++p) {
2475 p->second.recovering = false;
2476 if (!p->second.committing && p->second.slaves.empty()) {
2477 dout(10) << "finish_committed_masters " << p->first << dendl;
2478 log_master_commit(p->first);
2479 }
2480 }
2481}
2482
2483/*
2484 * at end of resolve... we must journal a commit|abort for all slave
2485 * updates, before moving on.
2486 *
2487 * this is so that the master can safely journal ECommitted on ops it
2488 * masters when it reaches up:active (all other recovering nodes must
2489 * complete resolve before that happens).
2490 */
2491struct C_MDC_SlaveCommit : public MDCacheLogContext {
2492 mds_rank_t from;
2493 metareqid_t reqid;
2494 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2495 void finish(int r) override {
2496 mdcache->_logged_slave_commit(from, reqid);
2497 }
2498};
2499
2500void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2501{
2502 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2503
2504 // send a message
2505 MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2506 mds->send_message_mds(req, from);
2507}
2508
2509
2510
2511
2512
2513
2514// ====================================================================
2515// import map, recovery
2516
2517void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2518 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2519{
2520 if (subtrees.count(oldparent)) {
2521 vector<dirfrag_t>& v = subtrees[oldparent];
2522 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2523 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2524 if (*it == df) {
2525 v.erase(it);
2526 break;
2527 }
2528 }
2529 if (subtrees.count(newparent)) {
2530 vector<dirfrag_t>& v = subtrees[newparent];
2531 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2532 v.push_back(df);
2533 }
2534}
2535
2536ESubtreeMap *MDCache::create_subtree_map()
2537{
2538 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2539 << num_subtrees_fullauth() << " fullauth"
2540 << dendl;
2541
2542 show_subtrees();
2543
2544 ESubtreeMap *le = new ESubtreeMap();
2545 mds->mdlog->_start_entry(le);
2546
2547 map<dirfrag_t, CDir*> dirs_to_add;
2548
2549 if (myin) {
2550 CDir* mydir = myin->get_dirfrag(frag_t());
2551 dirs_to_add[mydir->dirfrag()] = mydir;
2552 }
2553
2554 // include all auth subtrees, and their bounds.
2555 // and a spanning tree to tie it to the root.
2556 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2557 p != subtrees.end();
2558 ++p) {
2559 CDir *dir = p->first;
2560
2561 // journal subtree as "ours" if we are
2562 // me, -2
2563 // me, me
2564 // me, !me (may be importing and ambiguous!)
2565
2566 // so not
2567 // !me, *
2568 if (dir->get_dir_auth().first != mds->get_nodeid())
2569 continue;
2570
2571 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2572 my_ambiguous_imports.count(dir->dirfrag())) {
2573 dout(15) << " ambig subtree " << *dir << dendl;
2574 le->ambiguous_subtrees.insert(dir->dirfrag());
2575 } else {
2576 dout(15) << " subtree " << *dir << dendl;
2577 }
2578
2579 dirs_to_add[dir->dirfrag()] = dir;
2580 le->subtrees[dir->dirfrag()].clear();
2581
2582
2583 // bounds
2584 for (set<CDir*>::iterator q = p->second.begin();
2585 q != p->second.end();
2586 ++q) {
2587 CDir *bound = *q;
2588 dout(15) << " subtree bound " << *bound << dendl;
2589 dirs_to_add[bound->dirfrag()] = bound;
2590 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2591 }
2592 }
2593
2594 // apply projected renames
2595 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2596 p != projected_subtree_renames.end();
2597 ++p) {
2598 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2599 CInode *diri = p->first;
2600 CDir *olddir = q->first;
2601 CDir *newdir = q->second;
2602 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2603
2604 list<CDir*> dfls;
2605 diri->get_dirfrags(dfls);
2606 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2607 CDir *dir = *p;
2608 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2609 CDir *oldparent = get_projected_subtree_root(olddir);
2610 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2611 CDir *newparent = get_projected_subtree_root(newdir);
2612 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2613
2614 if (oldparent == newparent) {
2615 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2616 << oldparent->dirfrag() << dendl;
2617 continue;
2618 }
2619
2620 if (dir->is_subtree_root()) {
2621 if (le->subtrees.count(newparent->dirfrag()) &&
2622 oldparent->get_dir_auth() != newparent->get_dir_auth())
2623 dirs_to_add[dir->dirfrag()] = dir;
2624 // children are fine. change parent.
2625 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2626 le->subtrees);
2627 } else {
2628 // mid-subtree.
2629
2630 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2631 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2632 // if oldparent is auth, subtree is mine; include it.
2633 if (le->subtrees.count(oldparent->dirfrag())) {
2634 dirs_to_add[dir->dirfrag()] = dir;
2635 le->subtrees[dir->dirfrag()].clear();
2636 }
2637 // if newparent is auth, subtree is a new bound
2638 if (le->subtrees.count(newparent->dirfrag())) {
2639 dirs_to_add[dir->dirfrag()] = dir;
2640 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2641 }
2642 newparent = dir;
2643 }
2644
2645 // see if any old bounds move to the new parent.
2646 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2647 p != subtrees[oldparent].end();
2648 ++p) {
2649 CDir *bound = *p;
2650 if (dir->contains(bound->get_parent_dir()))
2651 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2652 le->subtrees);
2653 }
2654 }
2655 }
2656 }
2657 }
2658
2659 // simplify the journaled map. our in memory map may have more
2660 // subtrees than needed due to migrations that are just getting
2661 // started or just completing. but on replay, the "live" map will
2662 // be simple and we can do a straight comparison.
2663 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2664 if (le->ambiguous_subtrees.count(p->first))
2665 continue;
2666 unsigned i = 0;
2667 while (i < p->second.size()) {
2668 dirfrag_t b = p->second[i];
2669 if (le->subtrees.count(b) &&
2670 le->ambiguous_subtrees.count(b) == 0) {
2671 vector<dirfrag_t>& bb = le->subtrees[b];
2672 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2673 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2674 p->second.push_back(*r);
2675 dirs_to_add.erase(b);
2676 le->subtrees.erase(b);
2677 p->second.erase(p->second.begin() + i);
2678 } else {
2679 ++i;
2680 }
2681 }
2682 }
2683
94b18763 2684 for (auto &p : dirs_to_add) {
7c673cae
FG
2685 CDir *dir = p.second;
2686 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2687 le->metablob.add_dir(dir, false);
2688 }
2689
2690 dout(15) << " subtrees " << le->subtrees << dendl;
2691 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2692
2693 //le->metablob.print(cout);
2694 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2695 return le;
2696}
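/*
 * A standalone sketch (plain STL types, illustrative only) of the
 * simplification step above: if one of my bounds is itself an (unambiguous)
 * journaled subtree, absorb its bounds and drop it, so the replayed map only
 * contains the outermost subtrees.  The real code additionally skips
 * ambiguous subtrees and keeps dirs_to_add in sync.
 *
 *   void simplify(map<dirfrag_t, vector<dirfrag_t> >& subtrees) {
 *     for (auto p = subtrees.begin(); p != subtrees.end(); ++p) {
 *       vector<dirfrag_t>& bounds = p->second;
 *       for (unsigned i = 0; i < bounds.size(); ) {
 *         auto nested = subtrees.find(bounds[i]);
 *         if (nested != subtrees.end() && nested != p) {
 *           bounds.insert(bounds.end(), nested->second.begin(), nested->second.end());
 *           subtrees.erase(nested);
 *           bounds.erase(bounds.begin() + i);
 *         } else {
 *           ++i;
 *         }
 *       }
 *     }
 *   }
 */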
2697
2698void MDCache::dump_resolve_status(Formatter *f) const
2699{
2700 f->open_object_section("resolve_status");
2701 f->dump_stream("resolve_gather") << resolve_gather;
2702 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2703 f->close_section();
2704}
2705
2706void MDCache::resolve_start(MDSInternalContext *resolve_done_)
2707{
2708 dout(10) << "resolve_start" << dendl;
2709 assert(!resolve_done);
2710 resolve_done.reset(resolve_done_);
2711
2712 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2713 // if we don't have the root dir, adjust it to UNKNOWN. during
2714 // resolve we want mds0 to explicitly claim the portion of it that
2715 // it owns, so that anything beyond its bounds get left as
2716 // unknown.
2717 CDir *rootdir = root->get_dirfrag(frag_t());
2718 if (rootdir)
2719 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2720 }
2721 resolve_gather = recovery_set;
2722}
2723
2724void MDCache::send_resolves()
2725{
2726 send_slave_resolves();
2727 if (!resolve_ack_gather.empty()) {
2728 dout(10) << "send_resolves still waiting for resolve ack from ("
2729 << resolve_ack_gather << ")" << dendl;
2730 return;
2731 }
2732 if (!need_resolve_rollback.empty()) {
2733 dout(10) << "send_resolves still waiting for rollback to commit on ("
2734 << need_resolve_rollback << ")" << dendl;
2735 return;
2736 }
2737 send_subtree_resolves();
2738}
2739
2740void MDCache::send_slave_resolves()
2741{
2742 dout(10) << "send_slave_resolves" << dendl;
2743
2744 map<mds_rank_t, MMDSResolve*> resolves;
2745
2746 if (mds->is_resolve()) {
2747 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2748 p != uncommitted_slave_updates.end();
2749 ++p) {
2750 resolves[p->first] = new MMDSResolve;
2751 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2752 q != p->second.end();
2753 ++q) {
2754 dout(10) << " including uncommitted " << q->first << dendl;
2755 resolves[p->first]->add_slave_request(q->first, false);
2756 }
2757 }
2758 } else {
2759 set<mds_rank_t> resolve_set;
2760 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2761 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2762 p != active_requests.end();
2763 ++p) {
2764 MDRequestRef& mdr = p->second;
2765 if (!mdr->is_slave())
2766 continue;
2767 if (!mdr->slave_did_prepare() && !mdr->committing) {
2768 continue;
2769 }
2770 mds_rank_t master = mdr->slave_to_mds;
2771 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2772 dout(10) << " including uncommitted " << *mdr << dendl;
2773 if (!resolves.count(master))
2774 resolves[master] = new MMDSResolve;
2775 if (!mdr->committing &&
2776 mdr->has_more() && mdr->more()->is_inode_exporter) {
2777 // re-send cap exports
2778 CInode *in = mdr->more()->rename_inode;
2779 map<client_t, Capability::Export> cap_map;
2780 in->export_client_caps(cap_map);
2781 bufferlist bl;
2782 ::encode(in->ino(), bl);
2783 ::encode(cap_map, bl);
2784 resolves[master]->add_slave_request(p->first, bl);
2785 } else {
2786 resolves[master]->add_slave_request(p->first, mdr->committing);
2787 }
2788 }
2789 }
2790 }
2791
2792 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2793 p != resolves.end();
2794 ++p) {
2795 dout(10) << "sending slave resolve to mds." << p->first << dendl;
2796 mds->send_message_mds(p->second, p->first);
2797 resolve_ack_gather.insert(p->first);
2798 }
2799}
2800
2801void MDCache::send_subtree_resolves()
2802{
2803 dout(10) << "send_subtree_resolves" << dendl;
2804
2805 if (migrator->is_exporting() || migrator->is_importing()) {
2806 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2807 migrator->show_importing();
2808 migrator->show_exporting();
2809 resolves_pending = true;
2810 return; // not now
2811 }
2812
2813 map<mds_rank_t, MMDSResolve*> resolves;
2814 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2815 p != recovery_set.end();
2816 ++p) {
2817 if (*p == mds->get_nodeid())
2818 continue;
2819 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2820 resolves[*p] = new MMDSResolve;
2821 }
2822
2823 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2824 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2825
2826 // known
2827 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2828 p != subtrees.end();
2829 ++p) {
2830 CDir *dir = p->first;
2831
2832 // only our subtrees
2833 if (dir->authority().first != mds->get_nodeid())
2834 continue;
2835
2836 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2837 continue; // we'll add it below
2838
2839 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2840 // ambiguous (mid-import)
2841 set<CDir*> bounds;
2842 get_subtree_bounds(dir, bounds);
2843 vector<dirfrag_t> dfls;
2844 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2845 dfls.push_back((*q)->dirfrag());
2846
2847 my_ambig_imports[dir->dirfrag()] = dfls;
2848 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2849 } else {
2850 // not ambiguous.
2851 for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
2852 q != resolves.end();
2853 ++q)
2854 resolves[q->first]->add_subtree(dir->dirfrag());
2855 // bounds too
2856 vector<dirfrag_t> dfls;
2857 for (set<CDir*>::iterator q = subtrees[dir].begin();
2858 q != subtrees[dir].end();
2859 ++q) {
2860 CDir *bound = *q;
2861 dfls.push_back(bound->dirfrag());
2862 }
2863
2864 my_subtrees[dir->dirfrag()] = dfls;
2865 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2866 }
2867 }
2868
2869 // ambiguous
2870 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2871 p != my_ambiguous_imports.end();
2872 ++p) {
2873 my_ambig_imports[p->first] = p->second;
2874 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2875 }
2876
2877 // simplify the claimed subtree.
2878 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2879 unsigned i = 0;
2880 while (i < p->second.size()) {
2881 dirfrag_t b = p->second[i];
2882 if (my_subtrees.count(b)) {
2883 vector<dirfrag_t>& bb = my_subtrees[b];
2884 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2885 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2886 p->second.push_back(*r);
2887 my_subtrees.erase(b);
2888 p->second.erase(p->second.begin() + i);
2889 } else {
2890 ++i;
2891 }
2892 }
2893 }
2894
2895 // send
2896 for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
2897 p != resolves.end();
2898 ++p) {
2899 MMDSResolve* m = p->second;
2900 m->subtrees = my_subtrees;
2901 m->ambiguous_imports = my_ambig_imports;
2902 dout(10) << "sending subtree resolve to mds." << p->first << dendl;
2903 mds->send_message_mds(m, p->first);
2904 }
2905 resolves_pending = false;
2906}
2907
2908void MDCache::handle_mds_failure(mds_rank_t who)
2909{
2910 dout(7) << "handle_mds_failure mds." << who << dendl;
2911
2912 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2913
2914 resolve_gather.insert(who);
2915 discard_delayed_resolve(who);
2916 ambiguous_slave_updates.erase(who);
2917
2918 rejoin_gather.insert(who);
2919 rejoin_sent.erase(who); // i need to send another
31f18b77 2920 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2921 rejoin_ack_gather.erase(who); // i'll need/get another.
2922
2923 dout(10) << " resolve_gather " << resolve_gather << dendl;
2924 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2925 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2926 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2927 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2928
2929
2930 // tell the migrator too.
2931 migrator->handle_mds_failure_or_stop(who);
2932
224ce89b
WB
2933 // tell the balancer too.
2934 mds->balancer->handle_mds_failure(who);
2935
7c673cae
FG
2936 // clean up any requests slave to/from this node
2937 list<MDRequestRef> finish;
2938 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2939 p != active_requests.end();
2940 ++p) {
2941 MDRequestRef& mdr = p->second;
2942 // slave to the failed node?
2943 if (mdr->slave_to_mds == who) {
2944 if (mdr->slave_did_prepare()) {
2945 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2946 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2947 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2948
2949 if (!mdr->more()->waiting_on_slave.empty()) {
2950 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2951 // will rollback, no need to wait
91327a77 2952 mdr->reset_slave_request();
7c673cae
FG
2953 mdr->more()->waiting_on_slave.clear();
2954 }
2955 } else if (!mdr->committing) {
2956 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2957 if (mdr->slave_request || mdr->slave_rolling_back())
2958 mdr->aborted = true;
2959 else
2960 finish.push_back(mdr);
2961 }
2962 }
2963
2964 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2965 if (mdr->more()->waiting_on_slave.count(who)) {
2966 assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2967 dout(10) << " slave request " << *mdr << " no longer need rename notify ack from mds."
2968 << who << dendl;
2969 mdr->more()->waiting_on_slave.erase(who);
2970 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2971 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2972 }
2973
2974 if (mdr->more()->srcdn_auth_mds == who &&
2975 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2976 // rename srcdn's auth mds failed, resolve even if I'm a survivor.
2977 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2978 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2979 }
31f18b77
FG
2980 } else if (mdr->slave_request) {
2981 MMDSSlaveRequest *slave_req = mdr->slave_request;
2982 // FIXME: Slave rename request can arrive after we notice mds failure.
2983 // This can cause mds to crash (does not affect integrity of FS).
2984 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2985 slave_req->srcdn_auth == who)
2986 slave_req->mark_interrupted();
7c673cae
FG
2987 }
2988
2989 // failed node is slave?
2990 if (mdr->is_master() && !mdr->committing) {
2991 if (mdr->more()->srcdn_auth_mds == who) {
2992 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2993 << who << " to recover" << dendl;
2994 assert(mdr->more()->witnessed.count(who) == 0);
2995 if (mdr->more()->is_ambiguous_auth)
2996 mdr->clear_ambiguous_auth();
2997 // rename srcdn's auth mds failed, all witnesses will rollback
2998 mdr->more()->witnessed.clear();
2999 pending_masters.erase(p->first);
3000 }
3001
3002 if (mdr->more()->witnessed.count(who)) {
3003 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
3004 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
3005 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
3006 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
3007 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
3008 // until either the request is committing or the slave also fails.
3009 assert(mdr->more()->waiting_on_slave.size() == 1);
3010 pending_masters.insert(p->first);
3011 } else {
3012 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
3013 << who << " to recover" << dendl;
3014 if (srcdn_auth >= 0)
3015 assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
3016
3017 // discard this peer's prepare (if any)
3018 mdr->more()->witnessed.erase(who);
3019 }
3020 }
3021
3022 if (mdr->more()->waiting_on_slave.count(who)) {
3023 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
3024 << " to recover" << dendl;
3025 // retry request when peer recovers
3026 mdr->more()->waiting_on_slave.erase(who);
3027 if (mdr->more()->waiting_on_slave.empty())
3028 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3029 }
3030
3031 if (mdr->locking && mdr->locking_target_mds == who)
3032 mdr->finish_locking(mdr->locking);
3033 }
3034 }
3035
3036 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3037 p != uncommitted_masters.end();
3038 ++p) {
3039 // The failed MDS may have already committed the slave update
3040 if (p->second.slaves.count(who)) {
3041 p->second.recovering = true;
3042 p->second.slaves.erase(who);
3043 }
3044 }
3045
3046 while (!finish.empty()) {
3047 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3048 request_finish(finish.front());
3049 finish.pop_front();
3050 }
3051
3052 kick_find_ino_peers(who);
3053 kick_open_ino_peers(who);
3054
3055 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3056 p != fragments.end(); ) {
3057 dirfrag_t df = p->first;
3058 fragment_info_t& info = p->second;
3059 ++p;
3060 if (info.is_fragmenting())
3061 continue;
3062 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3063 list<CDir*> dirs;
3064 info.dirs.swap(dirs);
3065 fragments.erase(df);
3066 fragment_unmark_unfreeze_dirs(dirs);
3067 }
3068
3069 // MDCache::shutdown_export_strays() always exports strays to mds.0
3070 if (who == mds_rank_t(0))
f64942e4 3071 shutdown_exporting_strays.clear();
7c673cae
FG
3072
3073 show_subtrees();
3074}
3075
3076/*
3077 * handle_mds_recovery - called on another node's transition
3078 * from resolve -> active.
3079 */
3080void MDCache::handle_mds_recovery(mds_rank_t who)
3081{
3082 dout(7) << "handle_mds_recovery mds." << who << dendl;
3083
3084 // exclude all discover waiters. kick_discovers() will do the job
3085 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3086 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3087
3088 list<MDSInternalContextBase*> waiters;
3089
3090 // wake up any waiters in their subtrees
3091 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3092 p != subtrees.end();
3093 ++p) {
3094 CDir *dir = p->first;
3095
3096 if (dir->authority().first != who ||
3097 dir->authority().second == mds->get_nodeid())
3098 continue;
3099 assert(!dir->is_auth());
3100
3101 // wake any waiters
3102 list<CDir*> q;
3103 q.push_back(dir);
3104
3105 while (!q.empty()) {
3106 CDir *d = q.front();
3107 q.pop_front();
3108 d->take_waiting(d_mask, waiters);
3109
3110 // inode waiters too
94b18763
FG
3111 for (auto &p : d->items) {
3112 CDentry *dn = p.second;
7c673cae
FG
3113 CDentry::linkage_t *dnl = dn->get_linkage();
3114 if (dnl->is_primary()) {
3115 dnl->get_inode()->take_waiting(i_mask, waiters);
3116
3117 // recurse?
3118 list<CDir*> ls;
3119 dnl->get_inode()->get_dirfrags(ls);
3120 for (list<CDir*>::iterator p = ls.begin();
3121 p != ls.end();
3122 ++p) {
3123 CDir *subdir = *p;
3124 if (!subdir->is_subtree_root())
3125 q.push_back(subdir);
3126 }
3127 }
3128 }
3129 }
3130 }
3131
3132 kick_open_ino_peers(who);
3133 kick_find_ino_peers(who);
3134
3135 // queue them up.
3136 mds->queue_waiters(waiters);
3137}
3138
3139void MDCache::set_recovery_set(set<mds_rank_t>& s)
3140{
3141 dout(7) << "set_recovery_set " << s << dendl;
3142 recovery_set = s;
3143}
3144
3145
3146/*
3147 * during resolve state, we share resolves to determine who
3148 * is authoritative for which trees. we expect to get a resolve
3149 * from _everyone_ in the recovery_set (the mds cluster at the time of
3150 * the first failure).
3151 *
3152 * This function puts the passed message before returning
3153 */
3154void MDCache::handle_resolve(MMDSResolve *m)
3155{
3156 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3157 mds_rank_t from = mds_rank_t(m->get_source().num());
3158
3159 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3160 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3161 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3162 return;
3163 }
3164 // wait until we reach the resolve stage!
3165 m->put();
3166 return;
3167 }
3168
3169 discard_delayed_resolve(from);
3170
3171 // ambiguous slave requests?
3172 if (!m->slave_requests.empty()) {
3173 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3174 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3175 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3176 assert(!p->second.committing);
3177 pending_masters.insert(p->first);
3178 }
3179 }
3180
3181 if (!pending_masters.empty()) {
3182 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3183 delayed_resolve[from] = m;
3184 return;
3185 }
3186 }
3187
3188 MMDSResolveAck *ack = new MMDSResolveAck;
3189 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3190 if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) {
3191 // COMMIT
3192 if (p->second.committing) {
3193 // already committing, waiting for the OP_COMMITTED slave reply
3194 dout(10) << " already committing slave request " << *p << " noop "<< dendl;
3195 } else {
3196 dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
3197 ack->add_commit(p->first);
3198 }
3199 uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3200
3201 if (p->second.inode_caps.length() > 0) {
3202 // slave wants to export caps (rename)
3203 assert(mds->is_resolve());
3204
3205 inodeno_t ino;
3206 map<client_t,Capability::Export> cap_exports;
3207 bufferlist::iterator q = p->second.inode_caps.begin();
3208 ::decode(ino, q);
3209 ::decode(cap_exports, q);
3210
3211 assert(get_inode(ino));
3212
3213 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3214 q != cap_exports.end();
3215 ++q) {
3216 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3217 im.cap_id = ++last_cap_id; // assign a new cap ID
3218 im.issue_seq = 1;
3219 im.mseq = q->second.mseq;
28e407b8
AA
3220
3221 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3222 if (session)
3223 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3224 }
3225
3226 // will process these caps in rejoin stage
3227 rejoin_slave_exports[ino].first = from;
3228 rejoin_slave_exports[ino].second.swap(cap_exports);
3229
3230 // send information of imported caps back to slave
3231 ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
3232 }
3233 } else {
3234 // ABORT
3235 dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
3236 assert(!p->second.committing);
3237 ack->add_abort(p->first);
3238 }
3239 }
3240 mds->send_message(ack, m->get_connection());
3241 m->put();
3242 return;
3243 }
3244
3245 if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
3246 dout(10) << "delay processing subtree resolve" << dendl;
3247 delayed_resolve[from] = m;
3248 return;
3249 }
3250
3251 bool survivor = false;
3252 // am i a surviving ambiguous importer?
3253 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3254 survivor = true;
3255 // check for any import success/failure (from this node)
3256 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3257 while (p != my_ambiguous_imports.end()) {
3258 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3259 ++next;
3260 CDir *dir = get_dirfrag(p->first);
3261 assert(dir);
3262 dout(10) << "checking ambiguous import " << *dir << dendl;
3263 if (migrator->is_importing(dir->dirfrag()) &&
3264 migrator->get_import_peer(dir->dirfrag()) == from) {
3265 assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3266
3267 // check if sender claims the subtree
3268 bool claimed_by_sender = false;
3269 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
3270 q != m->subtrees.end();
3271 ++q) {
3272 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3273 CDir *base = get_force_dirfrag(q->first, false);
3274 if (!base || !base->contains(dir))
3275 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3276
3277 bool inside = true;
3278 set<CDir*> bounds;
3279 get_force_dirfrag_bound_set(q->second, bounds);
3280 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3281 CDir *bound = *p;
3282 if (bound->contains(dir)) {
3283 inside = false; // nope, bound is dir or parent of dir, not inside.
3284 break;
3285 }
3286 }
3287 if (inside)
3288 claimed_by_sender = true;
3289 }
3290
3291 my_ambiguous_imports.erase(p); // no longer ambiguous.
3292 if (claimed_by_sender) {
3293 dout(7) << "ambiguous import failed on " << *dir << dendl;
3294 migrator->import_reverse(dir);
3295 } else {
3296 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3297 migrator->import_finish(dir, true);
3298 }
3299 }
3300 p = next;
3301 }
3302 }
3303
3304 // update my dir_auth values
3305 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3306 // migrations between other nodes)
3307 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
3308 pi != m->subtrees.end();
3309 ++pi) {
3310 dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
3311 CDir *dir = get_force_dirfrag(pi->first, !survivor);
3312 if (!dir)
3313 continue;
3314 adjust_bounded_subtree_auth(dir, pi->second, from);
3315 try_subtree_merge(dir);
3316 }
3317
3318 show_subtrees();
3319
3320 // note ambiguous imports too
3321 for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
3322 pi != m->ambiguous_imports.end();
3323 ++pi) {
3324 dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
3325 other_ambiguous_imports[from][pi->first].swap( pi->second );
3326 }
3327
3328 // did i get them all?
3329 resolve_gather.erase(from);
3330
3331 maybe_resolve_finish();
3332
3333 m->put();
3334}
3335
3336void MDCache::process_delayed_resolve()
3337{
3338 dout(10) << "process_delayed_resolve" << dendl;
3339 map<mds_rank_t, MMDSResolve*> tmp;
3340 tmp.swap(delayed_resolve);
3341 for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
3342 handle_resolve(p->second);
3343}
3344
3345void MDCache::discard_delayed_resolve(mds_rank_t who)
3346{
3347 if (delayed_resolve.count(who)) {
3348 delayed_resolve[who]->put();
3349 delayed_resolve.erase(who);
3350 }
3351}
3352
3353void MDCache::maybe_resolve_finish()
3354{
3355 assert(resolve_ack_gather.empty());
3356 assert(need_resolve_rollback.empty());
3357
3358 if (!resolve_gather.empty()) {
3359 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3360 << resolve_gather << ")" << dendl;
3361 return;
3362 }
3363
3364 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3365 disambiguate_my_imports();
3366 finish_committed_masters();
3367
3368 if (resolve_done) {
3369 assert(mds->is_resolve());
3370 trim_unlinked_inodes();
3371 recalc_auth_bits(false);
3372 resolve_done.release()->complete(0);
3373 } else {
3374 maybe_send_pending_rejoins();
3375 }
3376}
3377
3378/* This function puts the passed message before returning */
3379void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
3380{
3381 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3382 mds_rank_t from = mds_rank_t(ack->get_source().num());
3383
3384 if (!resolve_ack_gather.count(from) ||
3385 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3386 ack->put();
3387 return;
3388 }
3389
3390 if (ambiguous_slave_updates.count(from)) {
3391 assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3392 assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3393 }
3394
3395 for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
3396 p != ack->commit.end();
3397 ++p) {
3398 dout(10) << " commit on slave " << p->first << dendl;
3399
3400 if (ambiguous_slave_updates.count(from)) {
3401 remove_ambiguous_slave_update(p->first, from);
3402 continue;
3403 }
3404
3405 if (mds->is_resolve()) {
3406 // replay
3407 MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
3408 assert(su);
3409
3410 // log commit
3411 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
3412 ESlaveUpdate::OP_COMMIT, su->origop),
3413 new C_MDC_SlaveCommit(this, from, p->first));
3414 mds->mdlog->flush();
3415
3416 finish_uncommitted_slave_update(p->first, from);
3417 } else {
3418 MDRequestRef mdr = request_get(p->first);
3419 // information about master imported caps
3420 if (p->second.length() > 0)
3421 mdr->more()->inode_import.claim(p->second);
3422
3423 assert(mdr->slave_request == 0); // shouldn't be doing anything!
3424 request_finish(mdr);
3425 }
3426 }
3427
3428 for (vector<metareqid_t>::iterator p = ack->abort.begin();
3429 p != ack->abort.end();
3430 ++p) {
3431 dout(10) << " abort on slave " << *p << dendl;
3432
3433 if (mds->is_resolve()) {
3434 MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
3435 assert(su);
3436
3437 // perform rollback (and journal a rollback entry)
3438 // note: this will hold up the resolve a bit, until the rollback entries journal.
3439 MDRequestRef null_ref;
3440 switch (su->origop) {
3441 case ESlaveUpdate::LINK:
3442 mds->server->do_link_rollback(su->rollback, from, null_ref);
3443 break;
3444 case ESlaveUpdate::RENAME:
3445 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3446 break;
3447 case ESlaveUpdate::RMDIR:
3448 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3449 break;
3450 default:
3451 ceph_abort();
3452 }
3453 } else {
3454 MDRequestRef mdr = request_get(*p);
3455 mdr->aborted = true;
3456 if (mdr->slave_request) {
3457 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3458 add_rollback(*p, from);
3459 } else {
3460 request_finish(mdr);
3461 }
3462 }
3463 }
3464
3465 if (!ambiguous_slave_updates.count(from))
3466 resolve_ack_gather.erase(from);
3467 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3468 send_subtree_resolves();
3469 process_delayed_resolve();
3470 }
3471
3472 ack->put();
3473}
3474
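// Remember a prepared-but-uncommitted slave update, and bump per-object
// counters for the old dirs and unlinked inodes it touched; the counters are
// dropped again in finish_uncommitted_slave_update().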
3475void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3476{
3477 assert(uncommitted_slave_updates[master].count(reqid) == 0);
3478 uncommitted_slave_updates[master][reqid] = su;
3479 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3480 uncommitted_slave_rename_olddir[*p]++;
3481 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3482 uncommitted_slave_unlink[*p]++;
3483}
3484
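// Retire a slave update once it is committed or rolled back: release the
// per-object counters taken above, trimming renamed-out-of non-auth subtrees
// and removing unlinked inodes whose counters reach zero.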
3485void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3486{
3487 assert(uncommitted_slave_updates[master].count(reqid));
3488 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3489
3490 uncommitted_slave_updates[master].erase(reqid);
3491 if (uncommitted_slave_updates[master].empty())
3492 uncommitted_slave_updates.erase(master);
3493 // discard the non-auth subtree we renamed out of
3494 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3495 CInode *diri = *p;
3496 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3497 assert(it != uncommitted_slave_rename_olddir.end());
3498 it->second--;
3499 if (it->second == 0) {
3500 uncommitted_slave_rename_olddir.erase(it);
3501 list<CDir*> ls;
3502 diri->get_dirfrags(ls);
3503 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3504 CDir *root = get_subtree_root(*q);
3505 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3506 try_trim_non_auth_subtree(root);
3507 if (*q != root)
3508 break;
3509 }
3510 }
3511 } else
3512 assert(it->second > 0);
3513 }
3514 // remove the inodes that were unlinked by the slave update
3515 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3516 CInode *in = *p;
3517 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3518 assert(it != uncommitted_slave_unlink.end());
3519 it->second--;
3520 if (it->second == 0) {
3521 uncommitted_slave_unlink.erase(it);
3522 if (!in->get_projected_parent_dn())
3523 mds->mdcache->remove_inode_recursive(in);
3524 } else
3525 assert(it->second > 0);
3526 }
3527 delete su;
3528}
3529
3530MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3531{
3532
3533 MDSlaveUpdate* su = NULL;
3534 if (uncommitted_slave_updates.count(master) &&
3535 uncommitted_slave_updates[master].count(reqid)) {
3536 su = uncommitted_slave_updates[master][reqid];
3537 assert(su);
3538 }
3539 return su;
3540}
3541
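// Note that the rollback for this request has completed; once no rollbacks
// or resolve acks remain outstanding, resume the resolve flow.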
3542void MDCache::finish_rollback(metareqid_t reqid) {
3543 assert(need_resolve_rollback.count(reqid));
3544 if (mds->is_resolve())
3545 finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
3546 need_resolve_rollback.erase(reqid);
3547 if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
3548 send_subtree_resolves();
3549 process_delayed_resolve();
3550 }
3551}
3552
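// Settle ambiguous imports claimed by other ranks: if the dirfrag's auth is
// still ambiguous or undefined locally, accept the claimant as its new auth.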
3553void MDCache::disambiguate_other_imports()
3554{
3555 dout(10) << "disambiguate_other_imports" << dendl;
3556
3557 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3558 // other nodes' ambiguous imports
3559 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3560 p != other_ambiguous_imports.end();
3561 ++p) {
3562 mds_rank_t who = p->first;
3563 dout(10) << "ambiguous imports for mds." << who << dendl;
3564
3565 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3566 q != p->second.end();
3567 ++q) {
3568 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3569 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3570 CDir *dir = get_force_dirfrag(q->first, recovering);
3571 if (!dir) continue;
3572
3573 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3574 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3575 dout(10) << " mds." << who << " did import " << *dir << dendl;
3576 adjust_bounded_subtree_auth(dir, q->second, who);
3577 try_subtree_merge(dir);
3578 } else {
3579 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3580 }
3581 }
3582 }
3583 other_ambiguous_imports.clear();
3584}
3585
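// Settle our own ambiguous imports (resolve state only): finish the ones no
// other rank ended up claiming, cancel the rest, and journal an EImportFinish
// for each either way.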
3586void MDCache::disambiguate_my_imports()
3587{
3588 dout(10) << "disambiguate_my_imports" << dendl;
3589
3590 if (!mds->is_resolve()) {
3591 assert(my_ambiguous_imports.empty());
3592 return;
3593 }
3594
3595 disambiguate_other_imports();
3596
3597 // my ambiguous imports
3598 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3599 while (!my_ambiguous_imports.empty()) {
3600 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3601
3602 CDir *dir = get_dirfrag(q->first);
3603 assert(dir);
3604
3605 if (dir->authority() != me_ambig) {
3606 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3607 cancel_ambiguous_import(dir);
3608
3609 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3610
3611 // subtree may have been swallowed by another node claiming dir
3612 // as their own.
3613 CDir *root = get_subtree_root(dir);
3614 if (root != dir)
3615 dout(10) << " subtree root is " << *root << dendl;
3616 assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3617 try_trim_non_auth_subtree(root);
3618 } else {
3619 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3620 finish_ambiguous_import(q->first);
3621 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3622 }
3623 }
3624 assert(my_ambiguous_imports.empty());
3625 mds->mdlog->flush();
3626
3627 // verify all my subtrees are unambiguous!
3628 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3629 p != subtrees.end();
3630 ++p) {
3631 CDir *dir = p->first;
3632 if (dir->is_ambiguous_dir_auth()) {
3633 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3634 }
3635 assert(!dir->is_ambiguous_dir_auth());
3636 }
3637
3638 show_subtrees();
3639}
3640
3641
3642void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3643{
3644 assert(my_ambiguous_imports.count(base) == 0);
3645 my_ambiguous_imports[base] = bounds;
3646}
3647
3648
3649void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3650{
3651 // make a list
3652 vector<dirfrag_t> binos;
3653 for (set<CDir*>::iterator p = bounds.begin();
3654 p != bounds.end();
3655 ++p)
3656 binos.push_back((*p)->dirfrag());
3657
3658 // note: this can get called twice if the exporter fails during recovery
3659 if (my_ambiguous_imports.count(base->dirfrag()))
3660 my_ambiguous_imports.erase(base->dirfrag());
3661
3662 add_ambiguous_import(base->dirfrag(), binos);
3663}
3664
3665void MDCache::cancel_ambiguous_import(CDir *dir)
3666{
3667 dirfrag_t df = dir->dirfrag();
3668 assert(my_ambiguous_imports.count(df));
3669 dout(10) << "cancel_ambiguous_import " << df
3670 << " bounds " << my_ambiguous_imports[df]
3671 << " " << *dir
3672 << dendl;
3673 my_ambiguous_imports.erase(df);
3674}
3675
3676void MDCache::finish_ambiguous_import(dirfrag_t df)
3677{
3678 assert(my_ambiguous_imports.count(df));
3679 vector<dirfrag_t> bounds;
3680 bounds.swap(my_ambiguous_imports[df]);
3681 my_ambiguous_imports.erase(df);
3682
3683 dout(10) << "finish_ambiguous_import " << df
3684 << " bounds " << bounds
3685 << dendl;
3686 CDir *dir = get_dirfrag(df);
3687 assert(dir);
3688
3689 // adjust dir_auth, import maps
3690 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3691 try_subtree_merge(dir);
3692}
3693
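// Tear down an inode and everything beneath it: unlink and remove every
// dentry in each dirfrag (recursing into primary children), drop the
// dirfrags, then remove the inode itself.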
3694void MDCache::remove_inode_recursive(CInode *in)
3695{
3696 dout(10) << "remove_inode_recursive " << *in << dendl;
3697 list<CDir*> ls;
3698 in->get_dirfrags(ls);
3699 list<CDir*>::iterator p = ls.begin();
3700 while (p != ls.end()) {
3701 CDir *subdir = *p++;
3702
3703 dout(10) << " removing dirfrag " << subdir << dendl;
94b18763
FG
3704 auto it = subdir->items.begin();
3705 while (it != subdir->items.end()) {
3706 CDentry *dn = it->second;
3707 ++it;
7c673cae
FG
3708 CDentry::linkage_t *dnl = dn->get_linkage();
3709 if (dnl->is_primary()) {
3710 CInode *tin = dnl->get_inode();
31f18b77 3711 subdir->unlink_inode(dn, false);
7c673cae
FG
3712 remove_inode_recursive(tin);
3713 }
3714 subdir->remove_dentry(dn);
3715 }
3716
3717 if (subdir->is_subtree_root())
3718 remove_subtree(subdir);
3719 in->close_dirfrag(subdir->dirfrag().frag);
3720 }
3721 remove_inode(in);
3722}
3723
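// Walk a non-auth (stray) subtree and trim every expireable dentry,
// accumulating expire messages in expiremap; returns true if something
// beneath it cannot be expired yet (caller should abort).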
3724bool MDCache::expire_recursive(
3725 CInode *in,
3726 map<mds_rank_t, MCacheExpire*>& expiremap)
3727{
3728 assert(!in->is_auth());
3729
3730 dout(10) << __func__ << ":" << *in << dendl;
3731
3732 // Recurse into any dirfrags beneath this inode
3733 list<CDir*> ls;
3734 in->get_dirfrags(ls);
3735 for (auto subdir : ls) {
3736 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3737 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3738 return true;
3739 }
3740
3741 for (auto &it : subdir->items) {
3742 CDentry *dn = it.second;
3743 CDentry::linkage_t *dnl = dn->get_linkage();
3744 if (dnl->is_primary()) {
3745 CInode *tin = dnl->get_inode();
3746
3747 /* Remote strays with linkage (i.e. hardlinks) should not be
3748 * expired, because they may be the target of
3749 * a rename() as the owning MDS shuts down */
3750 if (!tin->is_stray() && tin->inode.nlink) {
3751 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3752 return true;
3753 }
3754
3755 const bool abort = expire_recursive(tin, expiremap);
3756 if (abort) {
3757 return true;
3758 }
3759 }
3760 if (dn->lru_is_expireable()) {
3761 trim_dentry(dn, expiremap);
3762 } else {
3763 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3764 return true;
3765 }
3766 }
3767 }
3768
3769 return false;
3770}
3771
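// Drop cached inodes that are neither base inodes nor attached to a parent
// dentry (run after resolve completes; see maybe_resolve_finish).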
3772void MDCache::trim_unlinked_inodes()
3773{
3774 dout(7) << "trim_unlinked_inodes" << dendl;
3775 list<CInode*> q;
94b18763 3776 for (auto &p : inode_map) {
b32b8144 3777 CInode *in = p.second;
7c673cae
FG
3778 if (in->get_parent_dn() == NULL && !in->is_base()) {
3779 dout(7) << " will trim from " << *in << dendl;
3780 q.push_back(in);
3781 }
3782 }
3783 for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
3784 remove_inode_recursive(*p);
3785}
3786
3787/** recalc_auth_bits()
3788 * once subtree auth is disambiguated, we need to adjust all the
3789 * auth and dirty bits in our cache before moving on.
3790 */
3791void MDCache::recalc_auth_bits(bool replay)
3792{
3793 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3794
3795 if (root) {
3796 root->inode_auth.first = mds->mdsmap->get_root();
3797 bool auth = mds->get_nodeid() == root->inode_auth.first;
3798 if (auth) {
3799 root->state_set(CInode::STATE_AUTH);
3800 } else {
3801 root->state_clear(CInode::STATE_AUTH);
3802 if (!replay)
3803 root->state_set(CInode::STATE_REJOINING);
3804 }
3805 }
3806
3807 set<CInode*> subtree_inodes;
3808 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3809 p != subtrees.end();
3810 ++p) {
3811 if (p->first->dir_auth.first == mds->get_nodeid())
3812 subtree_inodes.insert(p->first->inode);
3813 }
3814
3815 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3816 p != subtrees.end();
3817 ++p) {
3818 if (p->first->inode->is_mdsdir()) {
3819 CInode *in = p->first->inode;
3820 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3821 if (auth) {
3822 in->state_set(CInode::STATE_AUTH);
3823 } else {
3824 in->state_clear(CInode::STATE_AUTH);
3825 if (!replay)
3826 in->state_set(CInode::STATE_REJOINING);
3827 }
3828 }
3829
3830 list<CDir*> dfq; // dirfrag queue
3831 dfq.push_back(p->first);
3832
3833 bool auth = p->first->authority().first == mds->get_nodeid();
3834 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3835
3836 while (!dfq.empty()) {
3837 CDir *dir = dfq.front();
3838 dfq.pop_front();
3839
3840 // dir
3841 if (auth) {
3842 dir->state_set(CDir::STATE_AUTH);
3843 } else {
3844 dir->state_clear(CDir::STATE_AUTH);
3845 if (!replay) {
3846 // close empty non-auth dirfrag
3847 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3848 dir->inode->close_dirfrag(dir->get_frag());
3849 continue;
3850 }
3851 dir->state_set(CDir::STATE_REJOINING);
3852 dir->state_clear(CDir::STATE_COMPLETE);
3853 if (dir->is_dirty())
3854 dir->mark_clean();
3855 }
3856 }
3857
3858 // dentries in this dir
94b18763 3859 for (auto &p : dir->items) {
7c673cae 3860 // dn
94b18763 3861 CDentry *dn = p.second;
7c673cae
FG
3862 CDentry::linkage_t *dnl = dn->get_linkage();
3863 if (auth) {
3864 dn->state_set(CDentry::STATE_AUTH);
3865 } else {
3866 dn->state_clear(CDentry::STATE_AUTH);
3867 if (!replay) {
3868 dn->state_set(CDentry::STATE_REJOINING);
3869 if (dn->is_dirty())
3870 dn->mark_clean();
3871 }
3872 }
3873
3874 if (dnl->is_primary()) {
3875 // inode
3876 CInode *in = dnl->get_inode();
3877 if (auth) {
3878 in->state_set(CInode::STATE_AUTH);
3879 } else {
3880 in->state_clear(CInode::STATE_AUTH);
3881 if (!replay) {
3882 in->state_set(CInode::STATE_REJOINING);
3883 if (in->is_dirty())
3884 in->mark_clean();
3885 if (in->is_dirty_parent())
3886 in->clear_dirty_parent();
3887 // avoid touching scatterlocks for our subtree roots!
3888 if (subtree_inodes.count(in) == 0)
3889 in->clear_scatter_dirty();
3890 }
3891 }
3892 // recurse?
3893 if (in->is_dir())
3894 in->get_nested_dirfrags(dfq);
3895 }
3896 }
3897 }
3898 }
3899
3900 show_subtrees();
3901 show_cache();
3902}
3903
3904
3905
3906// ===========================================================================
3907// REJOIN
3908
3909/*
3910 * notes on scatterlock recovery:
3911 *
3912 * - recovering inode replica sends scatterlock data for any subtree
3913 * roots (the only ones that are possibly dirty).
3914 *
3915 * - surviving auth incorporates any provided scatterlock data. any
3916 * pending gathers are then finished, as with the other lock types.
3917 *
3918 * that takes care of surviving auth + (recovering replica)*.
3919 *
3920 * - surviving replica sends strong_inode, which includes current
3921 * scatterlock state, AND any dirty scatterlock data. this
3922 * provides the recovering auth with everything it might need.
3923 *
3924 * - recovering auth must pick initial scatterlock state based on
3925 * (weak|strong) rejoins.
3926 * - always assimilate scatterlock data (it can't hurt)
3927 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3928 * - include base inode in ack for all inodes that saw scatterlock content
3929 *
3930 * also, for scatter gather,
3931 *
3932 * - auth increments {frag,r}stat.version on completion of any gather.
3933 *
3934 * - auth incorporates changes in a gather _only_ if the version
3935 * matches.
3936 *
3937 * - replica discards changes any time the scatterlock syncs, and
3938 * after recovery.
3939 */
3940
3941void MDCache::dump_rejoin_status(Formatter *f) const
3942{
3943 f->open_object_section("rejoin_status");
3944 f->dump_stream("rejoin_gather") << rejoin_gather;
3945 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3946 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3947 f->close_section();
3948}
3949
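// Begin the rejoin stage: gather rejoins from every rank in the recovery set,
// plus ourselves, since our own cap-import inodes must be opened first.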
3950void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
3951{
3952 dout(10) << "rejoin_start" << dendl;
3953 assert(!rejoin_done);
3954 rejoin_done.reset(rejoin_done_);
3955
3956 rejoin_gather = recovery_set;
3957 // need to finish opening cap inodes before sending cache rejoins
3958 rejoin_gather.insert(mds->get_nodeid());
3959 process_imported_caps();
3960}
3961
3962/*
3963 * rejoin phase!
3964 *
3965 * this initiates rejoin. it should be called before we get any
3966 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3967 *
3968 * we start out by sending rejoins to everyone in the recovery set.
3969 *
3970 * if we are rejoining, send for all regions in our cache.
3971 * if we are active|stopping, send only to nodes that are rejoining.
3972 */
3973void MDCache::rejoin_send_rejoins()
3974{
3975 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3976
3977 if (rejoin_gather.count(mds->get_nodeid())) {
3978 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3979 rejoins_pending = true;
3980 return;
3981 }
3982 if (!resolve_gather.empty()) {
3983 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3984 << resolve_gather << ")" << dendl;
3985 rejoins_pending = true;
3986 return;
3987 }
3988
3989 assert(!migrator->is_importing());
3990 assert(!migrator->is_exporting());
3991
3992 if (!mds->is_rejoin()) {
3993 disambiguate_other_imports();
3994 }
3995
3996 map<mds_rank_t, MMDSCacheRejoin*> rejoins;
3997
3998
3999 // if i am rejoining, send a rejoin to everyone.
4000 // otherwise, just send to others who are rejoining.
4001 for (set<mds_rank_t>::iterator p = recovery_set.begin();
4002 p != recovery_set.end();
4003 ++p) {
4004 if (*p == mds->get_nodeid()) continue; // nothing to myself!
4005 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
4006 if (mds->is_rejoin())
4007 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
4008 else if (mds->mdsmap->is_rejoin(*p))
4009 rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
4010 }
4011
4012 if (mds->is_rejoin()) {
4013 map<client_t, set<mds_rank_t> > client_exports;
4014 for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
28e407b8 4015 mds_rank_t target = p->second.first;
7c673cae
FG
4016 if (rejoins.count(target) == 0)
4017 continue;
28e407b8
AA
4018 rejoins[target]->cap_exports[p->first] = p->second.second;
4019 for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
7c673cae
FG
4020 client_exports[q->first].insert(target);
4021 }
4022 for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
4023 p != client_exports.end();
4024 ++p) {
4025 entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
4026 for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
4027 rejoins[*q]->client_map[p->first] = inst;
4028 }
4029 }
4030
4031
4032 // check all subtrees
4033 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4034 p != subtrees.end();
4035 ++p) {
4036 CDir *dir = p->first;
4037 assert(dir->is_subtree_root());
4038 if (dir->is_ambiguous_dir_auth()) {
4039 // exporter is recovering, importer is survivor.
4040 assert(rejoins.count(dir->authority().first));
4041 assert(!rejoins.count(dir->authority().second));
4042 continue;
4043 }
4044
4045 // my subtree?
4046 if (dir->is_auth())
4047 continue; // skip my own regions!
4048
4049 mds_rank_t auth = dir->get_dir_auth().first;
4050 assert(auth >= 0);
4051 if (rejoins.count(auth) == 0)
4052 continue; // don't care about this node's subtrees
4053
4054 rejoin_walk(dir, rejoins[auth]);
4055 }
4056
4057 // rejoin root inodes, too
4058 for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
4059 p != rejoins.end();
4060 ++p) {
4061 if (mds->is_rejoin()) {
4062 // weak
4063 if (p->first == 0 && root) {
4064 p->second->add_weak_inode(root->vino());
4065 if (root->is_dirty_scattered()) {
4066 dout(10) << " sending scatterlock state on root " << *root << dendl;
4067 p->second->add_scatterlock_state(root);
4068 }
4069 }
4070 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4071 if (in)
4072 p->second->add_weak_inode(in->vino());
4073 }
4074 } else {
4075 // strong
4076 if (p->first == 0 && root) {
4077 p->second->add_strong_inode(root->vino(),
4078 root->get_replica_nonce(),
4079 root->get_caps_wanted(),
4080 root->filelock.get_state(),
4081 root->nestlock.get_state(),
4082 root->dirfragtreelock.get_state());
4083 root->state_set(CInode::STATE_REJOINING);
4084 if (root->is_dirty_scattered()) {
4085 dout(10) << " sending scatterlock state on root " << *root << dendl;
4086 p->second->add_scatterlock_state(root);
4087 }
4088 }
4089
4090 if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
4091 p->second->add_strong_inode(in->vino(),
4092 in->get_replica_nonce(),
4093 in->get_caps_wanted(),
4094 in->filelock.get_state(),
4095 in->nestlock.get_state(),
4096 in->dirfragtreelock.get_state());
4097 in->state_set(CInode::STATE_REJOINING);
4098 }
4099 }
4100 }
4101
4102 if (!mds->is_rejoin()) {
4103 // i am survivor. send strong rejoin.
4104 // note request remote_auth_pins, xlocks
4105 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4106 p != active_requests.end();
4107 ++p) {
4108 MDRequestRef& mdr = p->second;
4109 if (mdr->is_slave())
4110 continue;
4111 // auth pins
4112 for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
4113 q != mdr->remote_auth_pins.end();
4114 ++q) {
4115 if (!q->first->is_auth()) {
4116 assert(q->second == q->first->authority().first);
4117 if (rejoins.count(q->second) == 0) continue;
4118 MMDSCacheRejoin *rejoin = rejoins[q->second];
4119
4120 dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
4121 MDSCacheObjectInfo i;
4122 q->first->set_object_info(i);
4123 if (i.ino)
4124 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4125 else
4126 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4127
4128 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4129 mdr->more()->rename_inode == q->first)
4130 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4131 mdr->reqid, mdr->attempt);
4132 }
4133 }
4134 // xlocks
4135 for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
4136 q != mdr->xlocks.end();
4137 ++q) {
4138 if (!(*q)->get_parent()->is_auth()) {
4139 mds_rank_t who = (*q)->get_parent()->authority().first;
4140 if (rejoins.count(who) == 0) continue;
4141 MMDSCacheRejoin *rejoin = rejoins[who];
4142
4143 dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
4144 MDSCacheObjectInfo i;
4145 (*q)->get_parent()->set_object_info(i);
4146 if (i.ino)
4147 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
4148 mdr->reqid, mdr->attempt);
4149 else
4150 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4151 mdr->reqid, mdr->attempt);
4152 }
4153 }
4154 // remote wrlocks
4155 for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
4156 q != mdr->remote_wrlocks.end();
4157 ++q) {
4158 mds_rank_t who = q->second;
4159 if (rejoins.count(who) == 0) continue;
4160 MMDSCacheRejoin *rejoin = rejoins[who];
4161
4162 dout(15) << " " << *mdr << " wrlock on " << q->second
4163 << " " << q->first->get_parent() << dendl;
4164 MDSCacheObjectInfo i;
4165 q->first->get_parent()->set_object_info(i);
4166 assert(i.ino);
4167 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
4168 mdr->reqid, mdr->attempt);
4169 }
4170 }
4171 }
4172
4173 // send the messages
4174 for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
4175 p != rejoins.end();
4176 ++p) {
4177 assert(rejoin_sent.count(p->first) == 0);
4178 assert(rejoin_ack_gather.count(p->first) == 0);
4179 rejoin_sent.insert(p->first);
4180 rejoin_ack_gather.insert(p->first);
4181 mds->send_message_mds(p->second, p->first);
4182 }
4183 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4184 rejoins_pending = false;
4185
4186 // nothing?
28e407b8 4187 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4188 dout(10) << "nothing to rejoin" << dendl;
4189 rejoin_gather_finish();
4190 }
4191}
4192
4193
4194/**
4195 * rejoin_walk - build rejoin declarations for a subtree
4196 *
4197 * @param dir subtree root
4198 * @param rejoin rejoin message
4199 *
4200 * from a rejoining node:
4201 * weak dirfrag
4202 * weak dentries (w/ connectivity)
4203 *
4204 * from a surviving node:
4205 * strong dirfrag
4206 * strong dentries (no connectivity!)
4207 * strong inodes
4208 */
4209void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
4210{
4211 dout(10) << "rejoin_walk " << *dir << dendl;
4212
4213 list<CDir*> nested; // finish this dir, then do nested items
4214
4215 if (mds->is_rejoin()) {
4216 // WEAK
4217 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4218 for (auto &p : dir->items) {
4219 CDentry *dn = p.second;
4220 assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4221 CDentry::linkage_t *dnl = dn->get_linkage();
4222 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4223 assert(dnl->is_primary());
4224 CInode *in = dnl->get_inode();
4225 assert(dnl->get_inode()->is_dir());
94b18763 4226 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
7c673cae
FG
4227 in->get_nested_dirfrags(nested);
4228 if (in->is_dirty_scattered()) {
4229 dout(10) << " sending scatterlock state on " << *in << dendl;
4230 rejoin->add_scatterlock_state(in);
4231 }
4232 }
4233 } else {
4234 // STRONG
4235 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4236 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4237 dir->state_set(CDir::STATE_REJOINING);
4238
94b18763
FG
4239 for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
4240 CDentry *dn = it->second;
7c673cae
FG
4241 CDentry::linkage_t *dnl = dn->get_linkage();
4242 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4243 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4244 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4245 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4246 dnl->is_remote() ? dnl->get_remote_d_type():0,
4247 dn->get_replica_nonce(),
4248 dn->lock.get_state());
4249 dn->state_set(CDentry::STATE_REJOINING);
4250 if (dnl->is_primary()) {
4251 CInode *in = dnl->get_inode();
4252 dout(15) << " add_strong_inode " << *in << dendl;
4253 rejoin->add_strong_inode(in->vino(),
4254 in->get_replica_nonce(),
4255 in->get_caps_wanted(),
4256 in->filelock.get_state(),
4257 in->nestlock.get_state(),
4258 in->dirfragtreelock.get_state());
4259 in->state_set(CInode::STATE_REJOINING);
4260 in->get_nested_dirfrags(nested);
4261 if (in->is_dirty_scattered()) {
4262 dout(10) << " sending scatterlock state on " << *in << dendl;
4263 rejoin->add_scatterlock_state(in);
4264 }
4265 }
4266 }
4267 }
4268
4269 // recurse into nested dirs
4270 for (list<CDir*>::iterator p = nested.begin();
4271 p != nested.end();
4272 ++p)
4273 rejoin_walk(*p, rejoin);
4274}
4275
4276
4277/*
4278 * i got a rejoin.
4279 * - reply with the lockstate
4280 *
4281 * if i am active|stopping,
4282 * - remove source from replica list for everything not referenced here.
4283 * This function puts the passed message before returning.
4284 */
4285void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
4286{
4287 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4288 << " (" << m->get_payload().length() << " bytes)"
4289 << dendl;
4290
4291 switch (m->op) {
4292 case MMDSCacheRejoin::OP_WEAK:
4293 handle_cache_rejoin_weak(m);
4294 break;
4295 case MMDSCacheRejoin::OP_STRONG:
4296 handle_cache_rejoin_strong(m);
4297 break;
4298 case MMDSCacheRejoin::OP_ACK:
4299 handle_cache_rejoin_ack(m);
4300 break;
4301
4302 default:
4303 ceph_abort();
4304 }
4305 m->put();
4306}
4307
4308
4309/*
4310 * handle_cache_rejoin_weak
4311 *
4312 * the sender
4313 * - is recovering from their journal.
4314 * - may have incorrect (out of date) inode contents
4315 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4316 *
4317 * if the sender didn't trim_non_auth(), they
4318 * - may have incorrect (out of date) dentry/inode linkage
4319 * - may have deleted/purged inodes
4320 * and i may have to go to disk to get accurate inode contents. yuck.
4321 * This function DOES NOT put the passed message before returning
4322 */
4323void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
4324{
4325 mds_rank_t from = mds_rank_t(weak->get_source().num());
4326
4327 // possible response(s)
4328 MMDSCacheRejoin *ack = 0; // if survivor
4329 set<vinodeno_t> acked_inodes; // if survivor
4330 set<SimpleLock *> gather_locks; // if survivor
4331 bool survivor = false; // am i a survivor?
4332
4333 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4334 survivor = true;
4335 dout(10) << "i am a survivor, and will ack immediately" << dendl;
4336 ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
4337
4338 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4339
4340 // check cap exports
4341 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4342 CInode *in = get_inode(p->first);
4343 assert(!in || in->is_auth());
4344 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4345 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4346 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4347 Capability::Import& im = imported_caps[p->first][q->first];
4348 if (cap) {
4349 im.cap_id = cap->get_cap_id();
4350 im.issue_seq = cap->get_last_seq();
4351 im.mseq = cap->get_mseq();
4352 } else {
4353 // all are zero
4354 }
4355 }
4356 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4357 }
4358
4359 ::encode(imported_caps, ack->imported_caps);
4360 } else {
4361 assert(mds->is_rejoin());
4362
4363 // we may have already received a strong rejoin from the sender.
4364 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4365 assert(gather_locks.empty());
4366
4367 // check cap exports.
4368 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4369
4370 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4371 CInode *in = get_inode(p->first);
b32b8144 4372 assert(!in || in->is_auth());
7c673cae
FG
4373 // note
4374 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4375 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4376 cap_imports[p->first][q->first][from] = q->second;
4377 }
4378 }
4379 }
4380
4381 // assimilate any potentially dirty scatterlock state
4382 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4383 p != weak->inode_scatterlocks.end();
4384 ++p) {
4385 CInode *in = get_inode(p->first);
4386 assert(in);
4387 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4388 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4389 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4390 if (!survivor)
4391 rejoin_potential_updated_scatterlocks.insert(in);
4392 }
4393
4394 // recovering peer may send incorrect dirfrags here. we need to
4395 // infer which dirfrag they meant. the ack will include a
4396 // strong_dirfrag that will set them straight on the fragmentation.
4397
4398 // walk weak map
4399 set<CDir*> dirs_to_share;
4400 for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
4401 p != weak->weak_dirfrags.end();
4402 ++p) {
4403 CInode *diri = get_inode(p->ino);
4404 if (!diri)
4405 dout(0) << " missing dir ino " << p->ino << dendl;
4406 assert(diri);
4407
4408 list<frag_t> ls;
4409 if (diri->dirfragtree.is_leaf(p->frag)) {
4410 ls.push_back(p->frag);
4411 } else {
4412 diri->dirfragtree.get_leaves_under(p->frag, ls);
4413 if (ls.empty())
4414 ls.push_back(diri->dirfragtree[p->frag.value()]);
4415 }
4416 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4417 frag_t fg = *q;
4418 CDir *dir = diri->get_dirfrag(fg);
4419 if (!dir) {
4420 dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
4421 continue;
4422 }
4423 assert(dir);
4424 if (dirs_to_share.count(dir)) {
4425 dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
4426 } else {
4427 dirs_to_share.insert(dir);
4428 unsigned nonce = dir->add_replica(from);
4429 dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
4430 if (ack) {
4431 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4432 ack->add_dirfrag_base(dir);
4433 }
4434 }
4435 }
4436 }
4437
4438 for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
4439 p != weak->weak.end();
4440 ++p) {
4441 CInode *diri = get_inode(p->first);
4442 if (!diri)
4443 dout(0) << " missing dir ino " << p->first << dendl;
4444 assert(diri);
4445
4446 // weak dentries
4447 CDir *dir = 0;
4448 for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
4449 q != p->second.end();
4450 ++q) {
4451 // locate proper dirfrag.
4452 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4453 frag_t fg = diri->pick_dirfrag(q->first.name);
4454 if (!dir || dir->get_frag() != fg) {
4455 dir = diri->get_dirfrag(fg);
4456 if (!dir)
4457 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4458 assert(dir);
4459 assert(dirs_to_share.count(dir));
4460 }
4461
4462 // and dentry
4463 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4464 assert(dn);
4465 CDentry::linkage_t *dnl = dn->get_linkage();
4466 assert(dnl->is_primary());
4467
4468 if (survivor && dn->is_replica(from))
4469 dentry_remove_replica(dn, from, gather_locks);
4470 unsigned dnonce = dn->add_replica(from);
4471 dout(10) << " have " << *dn << dendl;
4472 if (ack)
94b18763 4473 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4474 dnl->get_inode()->ino(), inodeno_t(0), 0,
4475 dnonce, dn->lock.get_replica_state());
4476
4477 // inode
4478 CInode *in = dnl->get_inode();
4479 assert(in);
4480
4481 if (survivor && in->is_replica(from))
4482 inode_remove_replica(in, from, true, gather_locks);
4483 unsigned inonce = in->add_replica(from);
4484 dout(10) << " have " << *in << dendl;
4485
4486 // scatter the dirlock, just in case?
4487 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4488 in->filelock.set_state(LOCK_MIX);
4489
4490 if (ack) {
4491 acked_inodes.insert(in->vino());
4492 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4493 bufferlist bl;
4494 in->_encode_locks_state_for_rejoin(bl, from);
4495 ack->add_inode_locks(in, inonce, bl);
4496 }
4497 }
4498 }
4499
4500 // weak base inodes? (root, stray, etc.)
4501 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4502 p != weak->weak_inodes.end();
4503 ++p) {
4504 CInode *in = get_inode(*p);
4505 assert(in); // hmm fixme wrt stray?
4506 if (survivor && in->is_replica(from))
4507 inode_remove_replica(in, from, true, gather_locks);
4508 unsigned inonce = in->add_replica(from);
4509 dout(10) << " have base " << *in << dendl;
4510
4511 if (ack) {
4512 acked_inodes.insert(in->vino());
4513 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4514 bufferlist bl;
4515 in->_encode_locks_state_for_rejoin(bl, from);
4516 ack->add_inode_locks(in, inonce, bl);
4517 }
4518 }
4519
4520 assert(rejoin_gather.count(from));
4521 rejoin_gather.erase(from);
4522 if (survivor) {
4523 // survivor. do everything now.
4524 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
4525 p != weak->inode_scatterlocks.end();
4526 ++p) {
4527 CInode *in = get_inode(p->first);
4528 assert(in);
4529 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4530 acked_inodes.insert(in->vino());
4531 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4532 }
4533
4534 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4535 mds->send_message(ack, weak->get_connection());
4536
4537 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4538 if (!(*p)->is_stable())
4539 mds->locker->eval_gather(*p);
4540 }
4541 } else {
4542 // done?
28e407b8 4543 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4544 rejoin_gather_finish();
4545 } else {
4546 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4547 }
4548 }
4549}
4550
7c673cae
FG
4551/*
4552 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4553 *
4554 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4555 * ack, the replica dne, and we can remove it from our replica maps.
4556 */
4557void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
4558 set<vinodeno_t>& acked_inodes,
4559 set<SimpleLock *>& gather_locks)
4560{
4561 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4562
b32b8144 4563 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4564 // inode?
4565 if (in->is_auth() &&
4566 in->is_replica(from) &&
b32b8144 4567 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4568 inode_remove_replica(in, from, false, gather_locks);
4569 dout(10) << " rem " << *in << dendl;
4570 }
4571
b32b8144
FG
4572 if (!in->is_dir())
4573 return;
7c673cae
FG
4574
4575 list<CDir*> dfs;
4576 in->get_dirfrags(dfs);
4577 for (list<CDir*>::iterator p = dfs.begin();
4578 p != dfs.end();
4579 ++p) {
4580 CDir *dir = *p;
181888fb
FG
4581 if (!dir->is_auth())
4582 continue;
7c673cae 4583
181888fb 4584 if (dir->is_replica(from) &&
7c673cae
FG
4585 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4586 dir->remove_replica(from);
4587 dout(10) << " rem " << *dir << dendl;
4588 }
4589
4590 // dentries
94b18763
FG
4591 for (auto &p : dir->items) {
4592 CDentry *dn = p.second;
7c673cae
FG
4593
4594 if (dn->is_replica(from) &&
4595 (ack == NULL ||
4596 ack->strong_dentries.count(dir->dirfrag()) == 0 ||
94b18763 4597 ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
7c673cae
FG
4598 dentry_remove_replica(dn, from, gather_locks);
4599 dout(10) << " rem " << *dn << dendl;
4600 }
4601 }
4602 }
b32b8144
FG
4603 };
4604
94b18763 4605 for (auto &p : inode_map)
b32b8144 4606 scour_func(p.second);
94b18763 4607 for (auto &p : snap_inode_map)
b32b8144 4608 scour_func(p.second);
7c673cae
FG
4609}
4610
4611
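// Fabricate a placeholder (REJOINUNDEF) inode for an ino that a rejoin
// references but we do not have in cache; it is resolved or trimmed later.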
4612CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4613{
4614 CInode *in = new CInode(this, true, 1, last);
4615 in->inode.ino = ino;
4616 in->state_set(CInode::STATE_REJOINUNDEF);
4617 add_inode(in);
4618 rejoin_undef_inodes.insert(in);
4619 dout(10) << " invented " << *in << dendl;
4620 return in;
4621}
4622
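// Likewise fabricate a placeholder dirfrag (and inode, if needed) for a
// dirfrag that a rejoin references but we do not have.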
4623CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4624{
4625 CInode *in = get_inode(df.ino);
4626 if (!in)
4627 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4628 if (!in->is_dir()) {
4629 assert(in->state_test(CInode::STATE_REJOINUNDEF));
4630 in->inode.mode = S_IFDIR;
4631 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4632 }
4633 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4634 dir->state_set(CDir::STATE_REJOINUNDEF);
4635 rejoin_undef_dirfrags.insert(dir);
4636 dout(10) << " invented " << *dir << dendl;
4637 return dir;
4638}
4639
4640/* This function DOES NOT put the passed message before returning */
4641void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
4642{
4643 mds_rank_t from = mds_rank_t(strong->get_source().num());
4644
4645 // only a recovering node will get a strong rejoin.
4646 assert(mds->is_rejoin());
4647
4648 // assimilate any potentially dirty scatterlock state
4649 for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
4650 p != strong->inode_scatterlocks.end();
4651 ++p) {
4652 CInode *in = get_inode(p->first);
4653 assert(in);
4654 in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
4655 in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
4656 in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
4657 rejoin_potential_updated_scatterlocks.insert(in);
4658 }
4659
4660 rejoin_unlinked_inodes[from].clear();
4661
4662 // surviving peer may send incorrect dirfrag here (maybe they didn't
4663 // get the fragment notify, or maybe we rolled back?). we need to
4664 // infer the right frag and get them with the program. somehow.
4665 // we don't normally send ACK.. so we'll need to bundle this with
4666 // MISSING or something.
4667
4668 // strong dirfrags/dentries.
4669 // also process auth_pins, xlocks.
4670 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
4671 p != strong->strong_dirfrags.end();
4672 ++p) {
4673 CInode *diri = get_inode(p->first.ino);
4674 if (!diri)
4675 diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
4676 CDir *dir = diri->get_dirfrag(p->first.frag);
4677 bool refragged = false;
4678 if (dir) {
4679 dout(10) << " have " << *dir << dendl;
4680 } else {
4681 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4682 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4683 else if (diri->dirfragtree.is_leaf(p->first.frag))
4684 dir = rejoin_invent_dirfrag(p->first);
4685 }
4686 if (dir) {
4687 dir->add_replica(from, p->second.nonce);
4688 dir->dir_rep = p->second.dir_rep;
4689 } else {
4690 dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
4691 list<frag_t> ls;
4692 diri->dirfragtree.get_leaves_under(p->first.frag, ls);
4693 if (ls.empty())
4694 ls.push_back(diri->dirfragtree[p->first.frag.value()]);
4695 dout(10) << " maps to frag(s) " << ls << dendl;
4696 for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
4697 CDir *dir = diri->get_dirfrag(*q);
4698 if (!dir)
4699 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
4700 else
4701 dout(10) << " have(approx) " << *dir << dendl;
4702 dir->add_replica(from, p->second.nonce);
4703 dir->dir_rep = p->second.dir_rep;
4704 }
4705 refragged = true;
4706 }
4707
4708 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
4709 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4710 q != dmap.end();
4711 ++q) {
4712 CDentry *dn;
4713 if (!refragged)
4714 dn = dir->lookup(q->first.name, q->first.snapid);
4715 else {
4716 frag_t fg = diri->pick_dirfrag(q->first.name);
4717 dir = diri->get_dirfrag(fg);
4718 assert(dir);
4719 dn = dir->lookup(q->first.name, q->first.snapid);
4720 }
4721 if (!dn) {
4722 if (q->second.is_remote()) {
4723 dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
4724 q->second.first, q->first.snapid);
4725 } else if (q->second.is_null()) {
4726 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4727 } else {
4728 CInode *in = get_inode(q->second.ino, q->first.snapid);
4729 if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
4730 dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
4731 }
4732 dout(10) << " invented " << *dn << dendl;
4733 }
4734 CDentry::linkage_t *dnl = dn->get_linkage();
4735
4736 // dn auth_pin?
4737 if (strong->authpinned_dentries.count(p->first) &&
4738 strong->authpinned_dentries[p->first].count(q->first)) {
4739 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
4740 r != strong->authpinned_dentries[p->first][q->first].end();
4741 ++r) {
4742 dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
4743
4744 // get/create slave mdrequest
4745 MDRequestRef mdr;
4746 if (have_request(r->reqid))
4747 mdr = request_get(r->reqid);
4748 else
4749 mdr = request_start_slave(r->reqid, r->attempt, strong);
4750 mdr->auth_pin(dn);
4751 }
4752 }
4753
4754 // dn xlock?
4755 if (strong->xlocked_dentries.count(p->first) &&
4756 strong->xlocked_dentries[p->first].count(q->first)) {
4757 MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
4758 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4759 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4760 assert(mdr->is_auth_pinned(dn));
4761 if (!mdr->xlocks.count(&dn->versionlock)) {
4762 assert(dn->versionlock.can_xlock_local());
4763 dn->versionlock.get_xlock(mdr, mdr->get_client());
4764 mdr->xlocks.insert(&dn->versionlock);
4765 mdr->locks.insert(&dn->versionlock);
4766 }
4767 if (dn->lock.is_stable())
4768 dn->auth_pin(&dn->lock);
4769 dn->lock.set_state(LOCK_XLOCK);
4770 dn->lock.get_xlock(mdr, mdr->get_client());
4771 mdr->xlocks.insert(&dn->lock);
4772 mdr->locks.insert(&dn->lock);
4773 }
4774
4775 dn->add_replica(from, q->second.nonce);
4776 dout(10) << " have " << *dn << dendl;
4777
4778 if (dnl->is_primary()) {
4779 if (q->second.is_primary()) {
4780 if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
4781 // the survivor missed MDentryUnlink+MDentryLink messages ?
4782 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4783 CInode *in = get_inode(q->second.ino, q->first.snapid);
4784 assert(in);
4785 assert(in->get_parent_dn());
4786 rejoin_unlinked_inodes[from].insert(in);
4787 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4788 }
4789 } else {
4790 // the survivor missed MDentryLink message ?
4791 assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4792 dout(7) << " sender doesn't have primary dentry" << dendl;
4793 }
4794 } else {
4795 if (q->second.is_primary()) {
4796 // the survivor missed MDentryUnlink message ?
4797 CInode *in = get_inode(q->second.ino, q->first.snapid);
4798 assert(in);
4799 assert(in->get_parent_dn());
4800 rejoin_unlinked_inodes[from].insert(in);
4801 dout(7) << " sender has primary dentry but we don't" << dendl;
4802 }
4803 }
4804 }
4805 }
4806
4807 for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
4808 p != strong->strong_inodes.end();
4809 ++p) {
4810 CInode *in = get_inode(p->first);
4811 assert(in);
4812 in->add_replica(from, p->second.nonce);
4813 dout(10) << " have " << *in << dendl;
4814
4815 MMDSCacheRejoin::inode_strong &is = p->second;
4816
4817 // caps_wanted
4818 if (is.caps_wanted) {
4819 in->mds_caps_wanted[from] = is.caps_wanted;
4820 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4821 << " on " << *in << dendl;
4822 }
4823
4824 // scatterlocks?
4825 // infer state from replica state:
4826 // * go to MIX if they might have wrlocks
4827 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4828 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4829 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4830 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4831
4832 // auth pin?
4833 if (strong->authpinned_inodes.count(in->vino())) {
4834 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
4835 r != strong->authpinned_inodes[in->vino()].end();
4836 ++r) {
4837 dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
4838
4839 // get/create slave mdrequest
4840 MDRequestRef mdr;
4841 if (have_request(r->reqid))
4842 mdr = request_get(r->reqid);
4843 else
4844 mdr = request_start_slave(r->reqid, r->attempt, strong);
4845 if (strong->frozen_authpin_inodes.count(in->vino())) {
4846 assert(!in->get_num_auth_pins());
4847 mdr->freeze_auth_pin(in);
4848 } else {
4849 assert(!in->is_frozen_auth_pin());
4850 }
4851 mdr->auth_pin(in);
4852 }
4853 }
4854 // xlock(s)?
4855 if (strong->xlocked_inodes.count(in->vino())) {
4856 for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
4857 q != strong->xlocked_inodes[in->vino()].end();
4858 ++q) {
4859 SimpleLock *lock = in->get_lock(q->first);
4860 dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
4861 MDRequestRef mdr = request_get(q->second.reqid); // should have this from auth_pin above.
4862 assert(mdr->is_auth_pinned(in));
4863 if (!mdr->xlocks.count(&in->versionlock)) {
4864 assert(in->versionlock.can_xlock_local());
4865 in->versionlock.get_xlock(mdr, mdr->get_client());
4866 mdr->xlocks.insert(&in->versionlock);
4867 mdr->locks.insert(&in->versionlock);
4868 }
4869 if (lock->is_stable())
4870 in->auth_pin(lock);
4871 lock->set_state(LOCK_XLOCK);
4872 if (lock == &in->filelock)
4873 in->loner_cap = -1;
4874 lock->get_xlock(mdr, mdr->get_client());
4875 mdr->xlocks.insert(lock);
4876 mdr->locks.insert(lock);
4877 }
4878 }
4879 }
4880 // wrlock(s)?
4881 for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
4882 p != strong->wrlocked_inodes.end();
4883 ++p) {
4884 CInode *in = get_inode(p->first);
4885 for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
4886 q != p->second.end();
4887 ++q) {
4888 SimpleLock *lock = in->get_lock(q->first);
4889 for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
4890 r != q->second.end();
4891 ++r) {
4892 dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
4893 MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above.
4894 if (in->is_auth())
4895 assert(mdr->is_auth_pinned(in));
4896 lock->set_state(LOCK_MIX);
4897 if (lock == &in->filelock)
4898 in->loner_cap = -1;
4899 lock->get_wrlock(true);
4900 mdr->wrlocks.insert(lock);
4901 mdr->locks.insert(lock);
4902 }
4903 }
4904 }
4905
4906 // done?
4907 assert(rejoin_gather.count(from));
4908 rejoin_gather.erase(from);
28e407b8 4909 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4910 rejoin_gather_finish();
4911 } else {
4912 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4913 }
4914}
4915
4916/* This function DOES NOT put the passed message before returning */
4917void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
4918{
4919 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4920 mds_rank_t from = mds_rank_t(ack->get_source().num());
4921
b32b8144
FG
4922 assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4923 bool survivor = !mds->is_rejoin();
4924
7c673cae
FG
4925 // for sending cache expire message
4926 set<CInode*> isolated_inodes;
4927 set<CInode*> refragged_inodes;
4928
4929 // dirs
4930 for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
4931 p != ack->strong_dirfrags.end();
4932 ++p) {
4933 // we may have had incorrect dir fragmentation; refragment based
4934 // on what the auth tells us.
4935 CDir *dir = get_dirfrag(p->first);
4936 if (!dir) {
4937 dir = get_force_dirfrag(p->first, false);
4938 if (dir)
4939 refragged_inodes.insert(dir->get_inode());
4940 }
4941 if (!dir) {
4942 CInode *diri = get_inode(p->first.ino);
4943 if (!diri) {
4944 // barebones inode; the full inode loop below will clean up.
4945 diri = new CInode(this, false);
4946 diri->inode.ino = p->first.ino;
4947 diri->inode.mode = S_IFDIR;
4948 diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
4949 add_inode(diri);
4950 if (MDS_INO_MDSDIR(from) == p->first.ino) {
4951 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4952 dout(10) << " add inode " << *diri << dendl;
4953 } else {
4954 diri->inode_auth = CDIR_AUTH_DEFAULT;
4955 isolated_inodes.insert(diri);
4956 dout(10) << " unconnected dirfrag " << p->first << dendl;
4957 }
4958 }
4959 // barebones dirfrag; the full dirfrag loop below will clean up.
4960 dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
4961 if (MDS_INO_MDSDIR(from) == p->first.ino ||
4962 (dir->authority() != CDIR_AUTH_UNDEF &&
4963 dir->authority().first != from))
4964 adjust_subtree_auth(dir, from);
4965 dout(10) << " add dirfrag " << *dir << dendl;
4966 }
4967
4968 dir->set_replica_nonce(p->second.nonce);
4969 dir->state_clear(CDir::STATE_REJOINING);
4970 dout(10) << " got " << *dir << dendl;
4971
4972 // dentries
4973 map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
4974 for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
4975 q != dmap.end();
4976 ++q) {
4977 CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
4978 if(!dn)
4979 dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
4980
4981 CDentry::linkage_t *dnl = dn->get_linkage();
4982
4983 assert(dn->last == q->first.snapid);
4984 if (dn->first != q->second.first) {
4985 dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
4986 dn->first = q->second.first;
4987 }
4988
4989 // may have bad linkage if we missed dentry link/unlink messages
4990 if (dnl->is_primary()) {
4991 CInode *in = dnl->get_inode();
4992 if (!q->second.is_primary() ||
4993 vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
4994 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4995 dir->unlink_inode(dn);
4996 }
4997 } else if (dnl->is_remote()) {
4998 if (!q->second.is_remote() ||
4999 q->second.remote_ino != dnl->get_remote_ino() ||
5000 q->second.remote_d_type != dnl->get_remote_d_type()) {
5001 dout(10) << " had bad linkage for " << *dn << dendl;
5002 dir->unlink_inode(dn);
5003 }
5004 } else {
5005 if (!q->second.is_null())
5006 dout(10) << " had bad linkage for " << *dn << dendl;
5007 }
5008
5009 // hmm, did we have the proper linkage here?
5010 if (dnl->is_null() && !q->second.is_null()) {
5011 if (q->second.is_remote()) {
5012 dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
5013 } else {
5014 CInode *in = get_inode(q->second.ino, q->first.snapid);
5015 if (!in) {
5016 // barebones inode; assume it's a dir, the full inode loop below will clean up.
5017 in = new CInode(this, false, q->second.first, q->first.snapid);
5018 in->inode.ino = q->second.ino;
5019 in->inode.mode = S_IFDIR;
5020 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
5021 add_inode(in);
5022 dout(10) << " add inode " << *in << dendl;
5023 } else if (in->get_parent_dn()) {
5024 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5025 << ", unlinking " << *in << dendl;
5026 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5027 }
5028 dn->dir->link_primary_inode(dn, in);
5029 isolated_inodes.erase(in);
5030 }
5031 }
5032
5033 dn->set_replica_nonce(q->second.nonce);
b32b8144 5034 dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
7c673cae
FG
5035 dn->state_clear(CDentry::STATE_REJOINING);
5036 dout(10) << " got " << *dn << dendl;
5037 }
5038 }
5039
5040 for (set<CInode*>::iterator p = refragged_inodes.begin();
5041 p != refragged_inodes.end();
5042 ++p) {
5043 list<CDir*> ls;
5044 (*p)->get_nested_dirfrags(ls);
5045 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5046 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5047 continue;
5048 assert((*q)->get_num_any() == 0);
5049 (*p)->close_dirfrag((*q)->get_frag());
5050 }
5051 }
5052
5053 // full dirfrags
5054 for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
5055 p != ack->dirfrag_bases.end();
5056 ++p) {
5057 CDir *dir = get_dirfrag(p->first);
5058 assert(dir);
5059 bufferlist::iterator q = p->second.begin();
5060 dir->_decode_base(q);
5061 dout(10) << " got dir replica " << *dir << dendl;
5062 }
5063
5064 // full inodes
5065 bufferlist::iterator p = ack->inode_base.begin();
5066 while (!p.end()) {
5067 inodeno_t ino;
5068 snapid_t last;
5069 bufferlist basebl;
5070 ::decode(ino, p);
5071 ::decode(last, p);
5072 ::decode(basebl, p);
5073 CInode *in = get_inode(ino, last);
5074 assert(in);
5075 bufferlist::iterator q = basebl.begin();
5076 in->_decode_base(q);
5077 dout(10) << " got inode base " << *in << dendl;
5078 }
5079
5080 // inodes
5081 p = ack->inode_locks.begin();
5082 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5083 while (!p.end()) {
5084 inodeno_t ino;
5085 snapid_t last;
5086 __u32 nonce;
5087 bufferlist lockbl;
5088 ::decode(ino, p);
5089 ::decode(last, p);
5090 ::decode(nonce, p);
5091 ::decode(lockbl, p);
5092
5093 CInode *in = get_inode(ino, last);
5094 assert(in);
5095 in->set_replica_nonce(nonce);
5096 bufferlist::iterator q = lockbl.begin();
b32b8144 5097 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5098 in->state_clear(CInode::STATE_REJOINING);
5099 dout(10) << " got inode locks " << *in << dendl;
5100 }
5101
5102 // FIXME: This can happen if an entire subtree, together with the inode its subtree root
5103 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
5104 assert(isolated_inodes.empty());
5105
5106 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5107 bufferlist::iterator bp = ack->imported_caps.begin();
5108 ::decode(peer_imported, bp);
5109
5110 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5111 p != peer_imported.end();
5112 ++p) {
28e407b8
AA
5113 auto& ex = cap_exports.at(p->first);
5114 assert(ex.first == from);
7c673cae
FG
5115 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5116 q != p->second.end();
5117 ++q) {
28e407b8
AA
5118 auto r = ex.second.find(q->first);
5119 assert(r != ex.second.end());
7c673cae
FG
5120
5121 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5122 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5123 if (!session) {
5124 dout(10) << " no session for client." << q->first << dendl;
5125 ex.second.erase(r);
5126 continue;
5127 }
7c673cae
FG
5128
5129 // mark client caps stale.
5130 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5131 r->second.capinfo.cap_id, 0,
7c673cae
FG
5132 mds->get_osd_epoch_barrier());
5133 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5134 (q->second.cap_id > 0 ? from : -1), 0);
5135 mds->send_message_client_counted(m, session);
5136
28e407b8 5137 ex.second.erase(r);
7c673cae 5138 }
28e407b8 5139 assert(ex.second.empty());
7c673cae
FG
5140 }
5141
5142 // done?
5143 assert(rejoin_ack_gather.count(from));
5144 rejoin_ack_gather.erase(from);
b32b8144 5145 if (!survivor) {
7c673cae
FG
5146
5147 if (rejoin_gather.empty()) {
5148 // eval unstable scatter locks after all wrlocks are rejoined.
5149 while (!rejoin_eval_locks.empty()) {
5150 SimpleLock *lock = rejoin_eval_locks.front();
5151 rejoin_eval_locks.pop_front();
5152 if (!lock->is_stable())
5153 mds->locker->eval_gather(lock);
5154 }
5155 }
5156
5157 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5158 rejoin_ack_gather.empty()) {
5159 // finally, kickstart past snap parent opens
5160 open_snap_parents();
5161 } else {
5162 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5163 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5164 }
5165 } else {
5166 // survivor.
5167 mds->queue_waiters(rejoin_waiters);
5168 }
5169}
5170
5171/**
5172 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5173 *
5174 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5175 * messages that clean these guys up...
5176 */
5177void MDCache::rejoin_trim_undef_inodes()
5178{
5179 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5180
5181 while (!rejoin_undef_inodes.empty()) {
5182 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5183 CInode *in = *p;
5184 rejoin_undef_inodes.erase(p);
5185
5186 in->clear_replica_map();
5187
5188 // close out dirfrags
5189 if (in->is_dir()) {
5190 list<CDir*> dfls;
5191 in->get_dirfrags(dfls);
5192 for (list<CDir*>::iterator p = dfls.begin();
5193 p != dfls.end();
5194 ++p) {
5195 CDir *dir = *p;
5196 dir->clear_replica_map();
5197
94b18763
FG
5198 for (auto &p : dir->items) {
5199 CDentry *dn = p.second;
7c673cae
FG
5200 dn->clear_replica_map();
5201
5202 dout(10) << " trimming " << *dn << dendl;
5203 dir->remove_dentry(dn);
5204 }
5205
5206 dout(10) << " trimming " << *dir << dendl;
5207 in->close_dirfrag(dir->dirfrag().frag);
5208 }
5209 }
5210
5211 CDentry *dn = in->get_parent_dn();
5212 if (dn) {
5213 dn->clear_replica_map();
5214 dout(10) << " trimming " << *dn << dendl;
5215 dn->dir->remove_dentry(dn);
5216 } else {
5217 dout(10) << " trimming " << *in << dendl;
5218 remove_inode(in);
5219 }
5220 }
5221
5222 assert(rejoin_undef_inodes.empty());
5223}
5224
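/*
 * Run once rejoin_gather is empty (all peer rejoins have been processed).
 * Each early return below means asynchronous work is still outstanding
 * (fetching undef dirfrags, opening inodes for imported caps, or
 * force-opening client sessions); the completion callbacks re-enter this
 * function when that work finishes.
 */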
5225void MDCache::rejoin_gather_finish()
5226{
5227 dout(10) << "rejoin_gather_finish" << dendl;
5228 assert(mds->is_rejoin());
28e407b8 5229 assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5230
5231 if (open_undef_inodes_dirfrags())
5232 return;
5233
5234 if (process_imported_caps())
5235 return;
5236
5237 choose_lock_states_and_reconnect_caps();
5238
5239 identify_files_to_recover();
5240 rejoin_send_acks();
5241
5242 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5243 rejoin_ack_gather.erase(mds->get_nodeid());
5244
5245 // did we already get our acks too?
5246 if (rejoin_ack_gather.empty()) {
5247 // finally, kickstart past snap parent opens
5248 open_snap_parents();
5249 }
5250}
5251
5252class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5253 inodeno_t ino;
5254public:
5255 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5256 void finish(int r) override {
5257 mdcache->rejoin_open_ino_finish(ino, r);
5258 }
5259};
5260
5261void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5262{
5263 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5264
5265 if (ret < 0) {
5266 cap_imports_missing.insert(ino);
5267 } else if (ret == mds->get_nodeid()) {
5268 assert(get_inode(ino));
5269 } else {
5270 auto p = cap_imports.find(ino);
5271 assert(p != cap_imports.end());
5272 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5273 assert(q->second.count(MDS_RANK_NONE));
5274 assert(q->second.size() == 1);
5275 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5276 }
5277 cap_imports.erase(p);
5278 }
5279
5280 assert(cap_imports_num_opening > 0);
5281 cap_imports_num_opening--;
5282
5283 if (cap_imports_num_opening == 0) {
5284 if (rejoin_gather.empty())
5285 rejoin_gather_finish();
5286 else if (rejoin_gather.count(mds->get_nodeid()))
5287 process_imported_caps();
5288 }
5289}
5290
5291class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5292public:
28e407b8
AA
5293 map<client_t,pair<Session*,uint64_t> > session_map;
5294 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae
FG
5295 void finish(int r) override {
5296 assert(r == 0);
28e407b8 5297 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5298 }
5299};
5300
28e407b8 5301void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5302{
5303 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5304 mds->server->finish_force_open_sessions(session_map);
5305 rejoin_session_map.swap(session_map);
7c673cae
FG
5306 if (rejoin_gather.empty())
5307 rejoin_gather_finish();
5308}
5309
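/*
 * Returns true if the caller must wait: we are still opening inodes for
 * reconnected caps, or we are journalling force-opened client sessions.
 * Once nothing is pending, caps exported by slave renames and the
 * reconnected caps in cap_imports are attached to their inodes (during
 * the initial rejoin pass we instead trim non-auth metadata and send any
 * pending rejoins).
 */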
5310bool MDCache::process_imported_caps()
5311{
5312 dout(10) << "process_imported_caps" << dendl;
5313
5314 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5315 CInode *in = get_inode(p->first);
5316 if (in) {
5317 assert(in->is_auth());
5318 cap_imports_missing.erase(p->first);
5319 continue;
5320 }
5321 if (cap_imports_missing.count(p->first) > 0)
5322 continue;
5323
5324 cap_imports_num_opening++;
5325 dout(10) << " opening missing ino " << p->first << dendl;
5326 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
28e407b8
AA
5327 if (!(cap_imports_num_opening % 1000))
5328 mds->heartbeat_reset();
7c673cae
FG
5329 }
5330
5331 if (cap_imports_num_opening > 0)
5332 return true;
5333
5334 // called by rejoin_gather_finish() ?
5335 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5336 if (!rejoin_client_map.empty() &&
5337 rejoin_session_map.empty()) {
5338 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5339 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5340 finish->session_map);
5341 mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
5342 mds->mdlog->flush();
5343 rejoin_client_map.clear();
5344 return true;
7c673cae 5345 }
7c673cae
FG
5346
5347 // process caps that were exported by slave rename
5348 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5349 p != rejoin_slave_exports.end();
5350 ++p) {
5351 CInode *in = get_inode(p->first);
5352 assert(in);
5353 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5354 q != p->second.second.end();
5355 ++q) {
28e407b8
AA
5356 auto r = rejoin_session_map.find(q->first);
5357 if (r == rejoin_session_map.end())
5358 continue;
7c673cae 5359
28e407b8 5360 Session *session = r->second.first;
7c673cae
FG
5361 Capability *cap = in->get_client_cap(q->first);
5362 if (!cap)
5363 cap = in->add_client_cap(q->first, session);
5364 cap->merge(q->second, true);
5365
5366 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5367 assert(cap->get_last_seq() == im.issue_seq);
5368 assert(cap->get_mseq() == im.mseq);
5369 cap->set_cap_id(im.cap_id);
5370 // send cap import because we assigned a new cap ID
5371 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5372 p->second.first, CEPH_CAP_FLAG_AUTH);
5373 }
5374 }
5375 rejoin_slave_exports.clear();
5376 rejoin_imported_caps.clear();
5377
5378 // process cap imports
5379 // ino -> client -> frommds -> capex
5380 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5381 CInode *in = get_inode(p->first);
5382 if (!in) {
5383 dout(10) << " still missing ino " << p->first
5384 << ", will try again after replayed client requests" << dendl;
5385 ++p;
5386 continue;
5387 }
5388 assert(in->is_auth());
5389 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5390 Session *session;
5391 {
5392 auto r = rejoin_session_map.find(q->first);
5393 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5394 }
5395
7c673cae 5396 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5397 if (!session) {
5398 if (r->first >= 0)
5399 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5400 continue;
5401 }
5402
7c673cae
FG
5403 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5404 add_reconnected_cap(q->first, in->ino(), r->second);
5405 if (r->first >= 0) {
5406 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5407 cap->inc_mseq();
5408 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5409
5410 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5411 im.cap_id = cap->get_cap_id();
5412 im.issue_seq = cap->get_last_seq();
5413 im.mseq = cap->get_mseq();
5414 }
5415 }
5416 }
5417 cap_imports.erase(p++); // remove and move on
5418 }
5419 } else {
5420 trim_non_auth();
5421
28e407b8 5422 assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5423 rejoin_gather.erase(mds->get_nodeid());
28e407b8 5424 assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5425 maybe_send_pending_rejoins();
7c673cae
FG
5426 }
5427 return false;
5428}
5429
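/*
 * If this realm's past parents are already open we can finish any pending
 * client snaprealm reconnects for it; otherwise pin the realm inode and
 * queue it in missing_snap_parents for open_snap_parents() to deal with.
 */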
5430void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
5431{
5432 // are this realm's parents fully open?
5433 if (realm->have_past_parents_open()) {
5434 dout(10) << " have past snap parents for realm " << *realm
5435 << " on " << *realm->inode << dendl;
5436 if (reconnect) {
5437 // finish off client snaprealm reconnects?
5438 auto p = reconnected_snaprealms.find(realm->inode->ino());
5439 if (p != reconnected_snaprealms.end()) {
5440 for (auto q = p->second.begin(); q != p->second.end(); ++q)
5441 finish_snaprealm_reconnect(q->first, realm, q->second);
5442 reconnected_snaprealms.erase(p);
5443 }
5444 }
5445 } else {
5446 if (!missing_snap_parents.count(realm->inode)) {
5447 dout(10) << " MISSING past snap parents for realm " << *realm
5448 << " on " << *realm->inode << dendl;
5449 realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
5450 missing_snap_parents[realm->inode].size(); // just to get it into the map!
5451 } else {
5452 dout(10) << " (already) MISSING past snap parents for realm " << *realm
5453 << " on " << *realm->inode << dendl;
5454 }
5455 }
5456}
5457
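/*
 * For each snapped (non-head) inode newer than snap_follows, put the
 * relevant cinode locks into LOCK_SNAP_SYNC with a wrlock held and record
 * on head_in which snapids still need a snapflush from this client.
 */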
5458void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5459 client_t client, snapid_t snap_follows)
5460{
5461 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5462
5463 const set<snapid_t>& snaps = realm->get_snaps();
5464 snapid_t follows = snap_follows;
5465
5466 while (true) {
5467 CInode *in = pick_inode_snap(head_in, follows);
5468 if (in == head_in)
5469 break;
5470 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5471
5472 /* TODO: we can check the reconnected/flushing caps to find
5473 * which locks need gathering */
5474 for (int i = 0; i < num_cinode_locks; i++) {
5475 int lockid = cinode_lock_info[i].lock;
5476 SimpleLock *lock = in->get_lock(lockid);
5477 assert(lock);
5478 in->client_snap_caps[lockid].insert(client);
5479 in->auth_pin(lock);
5480 lock->set_state(LOCK_SNAP_SYNC);
5481 lock->get_wrlock(true);
5482 }
5483
5484 for (auto p = snaps.lower_bound(in->first);
5485 p != snaps.end() && *p <= in->last;
5486 ++p) {
5487 head_in->add_need_snapflush(in, *p, client);
5488 }
5489
5490 follows = in->last;
5491 }
5492}
5493
5494/*
5495 * choose lock states based on reconnected caps
5496 */
5497void MDCache::choose_lock_states_and_reconnect_caps()
5498{
5499 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5500
5501 map<client_t,MClientSnap*> splits;
5502
b32b8144
FG
5503 for (auto i : inode_map) {
5504 CInode *in = i.second;
7c673cae
FG
5505
5506 if (in->last != CEPH_NOSNAP)
5507 continue;
5508
5509 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5510 in->mark_dirty_rstat();
5511
7c673cae 5512 int dirty_caps = 0;
b32b8144 5513 auto p = reconnected_caps.find(in->ino());
7c673cae
FG
5514 if (p != reconnected_caps.end()) {
5515 for (const auto &it : p->second)
5516 dirty_caps |= it.second.dirty_caps;
5517 }
5518 in->choose_lock_states(dirty_caps);
5519 dout(15) << " chose lock states on " << *in << dendl;
5520
5521 SnapRealm *realm = in->find_snaprealm();
5522
5523 check_realm_past_parents(realm, realm == in->snaprealm);
5524
5525 if (p != reconnected_caps.end()) {
5526 bool missing_snap_parent = false;
5527 // also, make sure client's cap is in the correct snaprealm.
5528 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5529 if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
5530 if (realm->have_past_parents_open()) {
5531 rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
5532 } else {
5533 missing_snap_parent = true;
5534 }
5535 }
5536
5537 if (q->second.realm_ino == realm->inode->ino()) {
5538 dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
5539 } else {
5540 dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino
5541 << " != " << realm->inode->ino() << dendl;
5542 if (realm->have_past_parents_open()) {
5543 // ok, include in a split message _now_.
5544 prepare_realm_split(realm, q->first, in->ino(), splits);
5545 } else {
5546 // send the split later.
5547 missing_snap_parent = true;
5548 }
5549 }
5550 }
5551 if (missing_snap_parent)
5552 missing_snap_parents[realm->inode].insert(in);
5553 }
5554 }
5555
5556 send_snaps(splits);
5557}
5558
5559void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5560 map<client_t,MClientSnap*>& splits)
5561{
5562 MClientSnap *snap;
5563 if (splits.count(client) == 0) {
5564 splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
5565 snap->head.split = realm->inode->ino();
5566 realm->build_snap_trace(snap->bl);
5567
5568 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5569 p != realm->open_children.end();
5570 ++p)
5571 snap->split_realms.push_back((*p)->inode->ino());
5572
5573 } else
5574 snap = splits[client];
5575 snap->split_inos.push_back(ino);
5576}
5577
5578void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
5579{
5580 dout(10) << "send_snaps" << dendl;
5581
5582 for (map<client_t,MClientSnap*>::iterator p = splits.begin();
5583 p != splits.end();
5584 ++p) {
5585 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
5586 if (session) {
5587 dout(10) << " client." << p->first
5588 << " split " << p->second->head.split
5589 << " inos " << p->second->split_inos
5590 << dendl;
5591 mds->send_message_client_counted(p->second, session);
5592 } else {
5593 dout(10) << " no session for client." << p->first << dendl;
5594 p->second->put();
5595 }
5596 }
5597 splits.clear();
5598}
5599
5600
5601/*
5602 * remove any items from logsegment open_file lists that don't have
5603 * any caps
5604 */
5605void MDCache::clean_open_file_lists()
5606{
5607 dout(10) << "clean_open_file_lists" << dendl;
5608
5609 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5610 p != mds->mdlog->segments.end();
5611 ++p) {
5612 LogSegment *ls = p->second;
5613
5614 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5615 while (!q.end()) {
5616 CInode *in = *q;
5617 ++q;
5618 if (in->last == CEPH_NOSNAP) {
5619 if (!in->is_any_caps_wanted()) {
5620 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5621 in->item_open_file.remove_myself();
5622 }
5623 } else if (in->last != CEPH_NOSNAP) {
5624 if (in->client_snap_caps.empty()) {
5625 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5626 in->item_open_file.remove_myself();
5627 }
5628 }
5629 }
5630 }
5631}
5632
5633
5634
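/*
 * Attach a reconnected client cap to an inode we are auth for.  If the
 * cap came from another rank (frommds >= 0), immediately send the client
 * a cap IMPORT naming the old peer so it can switch over.
 */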
5635Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5636{
5637 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5638 << " on " << *in << dendl;
5639 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5640 if (!session) {
5641 dout(10) << " no session for client." << client << dendl;
5642 return NULL;
5643 }
5644
5645 Capability *cap = in->reconnect_cap(client, icr, session);
5646
5647 if (frommds >= 0) {
5648 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5649 cap->inc_mseq();
5650 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5651 }
5652
5653 return cap;
5654}
5655
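/*
 * Any entries still left in cap_imports refer to inodes we never managed
 * to open.  Mark the owning clients' caps stale (CEPH_CAP_OP_EXPORT with
 * no importing peer), wake anyone waiting on a cap reconnect, and log a
 * cluster warning listing the missing inodes.
 */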
5656void MDCache::export_remaining_imported_caps()
5657{
5658 dout(10) << "export_remaining_imported_caps" << dendl;
5659
5660 stringstream warn_str;
5661
5662 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5663 warn_str << " ino " << p->first << "\n";
5664 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5665 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5666 if (session) {
5667 // mark client caps stale.
5668 MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
5669 stale->set_cap_peer(0, 0, 0, -1, 0);
5670 mds->send_message_client_counted(stale, q->first);
5671 }
5672 }
5673
5674 mds->heartbeat_reset();
5675 }
5676
5677 for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
5678 p != cap_reconnect_waiters.end();
5679 ++p)
5680 mds->queue_waiters(p->second);
5681
5682 cap_imports.clear();
5683 cap_reconnect_waiters.clear();
5684
5685 if (warn_str.peek() != EOF) {
5686 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5687 mds->clog->warn(warn_str);
5688 }
5689}
5690
5691void MDCache::try_reconnect_cap(CInode *in, Session *session)
5692{
5693 client_t client = session->info.get_client();
5694 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5695 if (rc) {
5696 in->reconnect_cap(client, *rc, session);
5697 dout(10) << "try_reconnect_cap client." << client
5698 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5699 << " issue " << ccap_string(rc->capinfo.issued)
5700 << " on " << *in << dendl;
5701 remove_replay_cap_reconnect(in->ino(), client);
5702
5703 if (in->is_replicated()) {
5704 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5705 } else {
5706 int dirty_caps = 0;
5707 auto p = reconnected_caps.find(in->ino());
5708 if (p != reconnected_caps.end()) {
5709 auto q = p->second.find(client);
5710 if (q != p->second.end())
5711 dirty_caps = q->second.dirty_caps;
5712 }
5713 in->choose_lock_states(dirty_caps);
5714 dout(15) << " chose lock states on " << *in << dendl;
5715 }
5716
5717 map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
5718 cap_reconnect_waiters.find(in->ino());
5719 if (it != cap_reconnect_waiters.end()) {
5720 mds->queue_waiters(it->second);
5721 cap_reconnect_waiters.erase(it);
5722 }
5723 }
5724}
5725
5726
5727
5728// -------
5729// cap imports and delayed snap parent opens
5730
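/*
 * Send the client a CEPH_CAP_OP_IMPORT for a reconnected or migrated cap.
 * If the inode's snaprealm still has unopened past parents we cannot
 * build a complete snap trace, so the cap is suppressed and the import
 * deferred via delayed_imported_caps / missing_snap_parents instead.
 */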
5731void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5732 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5733 int peer, int p_flags)
5734{
5735 client_t client = session->info.inst.name.num();
5736 SnapRealm *realm = in->find_snaprealm();
5737 if (realm->have_past_parents_open()) {
5738 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5739 if (cap->get_last_seq() == 0) // reconnected cap
5740 cap->inc_last_seq();
5741 cap->set_last_issue();
5742 cap->set_last_issue_stamp(ceph_clock_now());
5743 cap->clear_new();
5744 MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
5745 in->ino(),
5746 realm->inode->ino(),
5747 cap->get_cap_id(), cap->get_last_seq(),
5748 cap->pending(), cap->wanted(), 0,
5749 cap->get_mseq(), mds->get_osd_epoch_barrier());
5750 in->encode_cap_message(reap, cap);
5751 realm->build_snap_trace(reap->snapbl);
5752 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5753 mds->send_message_client_counted(reap, session);
5754 } else {
5755 dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
5756 << cap->get_mseq() << " on " << *in << dendl;
5757 in->auth_pin(this);
5758 cap->inc_suppress();
5759 delayed_imported_caps[client].insert(in);
5760 missing_snap_parents[in].size();
5761 }
5762}
5763
5764void MDCache::do_delayed_cap_imports()
5765{
5766 dout(10) << "do_delayed_cap_imports" << dendl;
5767
5768 assert(delayed_imported_caps.empty());
5769}
5770
5771struct C_MDC_OpenSnapParents : public MDCacheContext {
5772 explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
5773 void finish(int r) override {
5774 mdcache->open_snap_parents();
5775 }
5776};
5777
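/*
 * Try to open the past parents of every snaprealm recorded in
 * missing_snap_parents.  Realms that open immediately are finished off
 * here (snapflush rebuild, realm splits, snaprealm reconnects); the rest
 * are gathered and this function re-runs when those opens complete.  Once
 * everything is open, rejoin_done is completed.
 */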
5778void MDCache::open_snap_parents()
5779{
5780 dout(10) << "open_snap_parents" << dendl;
5781
5782 map<client_t,MClientSnap*> splits;
5783 MDSGatherBuilder gather(g_ceph_context);
5784
5785 auto p = missing_snap_parents.begin();
5786 while (p != missing_snap_parents.end()) {
5787 CInode *in = p->first;
5788 assert(in->snaprealm);
5789 if (in->snaprealm->open_parents(gather.new_sub())) {
5790 dout(10) << " past parents now open on " << *in << dendl;
5791
5792 for (CInode *child : p->second) {
5793 auto q = reconnected_caps.find(child->ino());
5794 assert(q != reconnected_caps.end());
5795 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5796 if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
5797 rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
5798 }
5799 // make sure client's cap is in the correct snaprealm.
5800 if (r->second.realm_ino != in->ino()) {
5801 prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
5802 }
5803 }
5804 }
5805
5806 missing_snap_parents.erase(p++);
5807
5808 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5809
5810 // finish off client snaprealm reconnects?
5811 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5812 if (q != reconnected_snaprealms.end()) {
5813 for (map<client_t,snapid_t>::iterator r = q->second.begin();
5814 r != q->second.end();
5815 ++r)
5816 finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
5817 reconnected_snaprealms.erase(q);
5818 }
5819 } else {
5820 dout(10) << " opening past parents on " << *in << dendl;
5821 ++p;
5822 }
5823 }
5824
5825 send_snaps(splits);
5826
5827 if (gather.has_subs()) {
5828 dout(10) << "open_snap_parents - waiting for "
5829 << gather.num_subs_remaining() << dendl;
5830 gather.set_finisher(new C_MDC_OpenSnapParents(this));
5831 gather.activate();
5832 } else {
5833 if (!reconnected_snaprealms.empty()) {
5834 stringstream warn_str;
5835 for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
5836 p != reconnected_snaprealms.end();
5837 ++p) {
5838 warn_str << " unconnected snaprealm " << p->first << "\n";
5839 for (map<client_t,snapid_t>::iterator q = p->second.begin();
5840 q != p->second.end();
5841 ++q)
5842 warn_str << " client." << q->first << " snapid " << q->second << "\n";
5843 }
5844 mds->clog->warn() << "open_snap_parents has:";
5845 mds->clog->warn(warn_str);
5846 }
5847 assert(rejoin_waiters.empty());
5848 assert(missing_snap_parents.empty());
5849 dout(10) << "open_snap_parents - all open" << dendl;
5850 do_delayed_cap_imports();
5851
5852 assert(rejoin_done);
5853 rejoin_done.release()->complete(0);
5854 reconnected_caps.clear();
5855 }
5856}
5857
5858bool MDCache::open_undef_inodes_dirfrags()
5859{
5860 dout(10) << "open_undef_inodes_dirfrags "
5861 << rejoin_undef_inodes.size() << " inodes "
5862 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5863
5864 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5865
5866 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5867 p != rejoin_undef_inodes.end();
5868 ++p) {
5869 CInode *in = *p;
5870 assert(!in->is_base());
5871 fetch_queue.insert(in->get_parent_dir());
5872 }
5873
5874 if (fetch_queue.empty())
5875 return false;
5876
28e407b8
AA
5877 MDSGatherBuilder gather(g_ceph_context,
5878 new MDSInternalContextWrapper(mds,
5879 new FunctionContext([this](int r) {
5880 if (rejoin_gather.empty())
5881 rejoin_gather_finish();
5882 })
5883 )
5884 );
5885
7c673cae
FG
5886 for (set<CDir*>::iterator p = fetch_queue.begin();
5887 p != fetch_queue.end();
5888 ++p) {
5889 CDir *dir = *p;
5890 CInode *diri = dir->get_inode();
5891 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5892 continue;
5893 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5894 assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5895 dir->fetch(gather.new_sub());
5896 }
5897 assert(gather.has_subs());
5898 gather.activate();
5899 return true;
5900}
5901
5902void MDCache::opened_undef_inode(CInode *in) {
5903 dout(10) << "opened_undef_inode " << *in << dendl;
5904 rejoin_undef_inodes.erase(in);
5905 if (in->is_dir()) {
5906 // FIXME: re-hash dentries if necessary
5907 assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
5908 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5909 CDir *dir = in->get_dirfrag(frag_t());
5910 assert(dir);
5911 rejoin_undef_dirfrags.erase(dir);
5912 in->force_dirfrags();
5913 list<CDir*> ls;
5914 in->get_dirfrags(ls);
5915 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5916 rejoin_undef_dirfrags.insert(*p);
5917 }
5918 }
5919}
5920
5921void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
5922{
5923 if (seq < realm->get_newest_seq()) {
5924 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5925 << realm->get_newest_seq()
5926 << " on " << *realm << dendl;
5927 // send an update
5928 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5929 if (session) {
5930 MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
5931 realm->build_snap_trace(snap->bl);
5932 mds->send_message_client_counted(snap, session);
5933 } else {
5934 dout(10) << " ...or not, no session for this client!" << dendl;
5935 }
5936 } else {
5937 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5938 << " on " << *realm << dendl;
5939 }
5940}
5941
5942
5943
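/*
 * Build one MMDSCacheRejoin(OP_ACK) per recovering peer and fill it by
 * walking our auth subtrees: strong dirfrags, strong dentries and inode
 * bases/locks for everything the peer replicates, bumping replica nonces
 * as we go.  Each ack also carries the caps we imported on that peer's
 * behalf (rejoin_imported_caps).
 */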
5944void MDCache::rejoin_send_acks()
5945{
5946 dout(7) << "rejoin_send_acks" << dendl;
5947
5948 // replicate stray
5949 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5950 p != rejoin_unlinked_inodes.end();
5951 ++p) {
5952 for (set<CInode*>::iterator q = p->second.begin();
5953 q != p->second.end();
5954 ++q) {
5955 CInode *in = *q;
5956 dout(7) << " unlinked inode " << *in << dendl;
5957 // inode expired
5958 if (!in->is_replica(p->first))
5959 continue;
5960 while (1) {
5961 CDentry *dn = in->get_parent_dn();
5962 if (dn->is_replica(p->first))
5963 break;
5964 dn->add_replica(p->first);
5965 CDir *dir = dn->get_dir();
5966 if (dir->is_replica(p->first))
5967 break;
5968 dir->add_replica(p->first);
5969 in = dir->get_inode();
5970 if (in->is_replica(p->first))
5971 break;
224ce89b 5972 in->add_replica(p->first);
7c673cae
FG
5973 if (in->is_base())
5974 break;
5975 }
5976 }
5977 }
5978 rejoin_unlinked_inodes.clear();
5979
5980 // send acks to everyone in the recovery set
31f18b77 5981 map<mds_rank_t,MMDSCacheRejoin*> acks;
7c673cae
FG
5982 for (set<mds_rank_t>::iterator p = recovery_set.begin();
5983 p != recovery_set.end();
31f18b77
FG
5984 ++p) {
5985 if (rejoin_ack_sent.count(*p))
5986 continue;
5987 acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
5988 }
5989
5990 rejoin_ack_sent = recovery_set;
7c673cae
FG
5991
5992 // walk subtrees
5993 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
5994 p != subtrees.end();
5995 ++p) {
5996 CDir *dir = p->first;
5997 if (!dir->is_auth())
5998 continue;
5999 dout(10) << "subtree " << *dir << dendl;
6000
6001 // auth items in this subtree
6002 list<CDir*> dq;
6003 dq.push_back(dir);
6004
6005 while (!dq.empty()) {
6006 CDir *dir = dq.front();
6007 dq.pop_front();
6008
6009 // dir
181888fb
FG
6010 for (auto &r : dir->get_replicas()) {
6011 auto it = acks.find(r.first);
31f18b77
FG
6012 if (it == acks.end())
6013 continue;
181888fb 6014 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 6015 it->second->add_dirfrag_base(dir);
7c673cae
FG
6016 }
6017
94b18763
FG
6018 for (auto &p : dir->items) {
6019 CDentry *dn = p.second;
7c673cae
FG
6020 CDentry::linkage_t *dnl = dn->get_linkage();
6021
6022 // inode
6023 CInode *in = NULL;
6024 if (dnl->is_primary())
6025 in = dnl->get_inode();
6026
6027 // dentry
181888fb
FG
6028 for (auto &r : dn->get_replicas()) {
6029 auto it = acks.find(r.first);
31f18b77
FG
6030 if (it == acks.end())
6031 continue;
94b18763 6032 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6033 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6034 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6035 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6036 ++r.second,
7c673cae
FG
6037 dn->lock.get_replica_state());
6038 // peer missed MDentrylink message ?
181888fb
FG
6039 if (in && !in->is_replica(r.first))
6040 in->add_replica(r.first);
7c673cae
FG
6041 }
6042
6043 if (!in)
6044 continue;
6045
181888fb
FG
6046 for (auto &r : in->get_replicas()) {
6047 auto it = acks.find(r.first);
31f18b77
FG
6048 if (it == acks.end())
6049 continue;
6050 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6051 bufferlist bl;
181888fb
FG
6052 in->_encode_locks_state_for_rejoin(bl, r.first);
6053 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6054 }
6055
6056 // subdirs in this subtree?
6057 in->get_nested_dirfrags(dq);
6058 }
6059 }
6060 }
6061
6062 // base inodes too
6063 if (root && root->is_auth())
181888fb
FG
6064 for (auto &r : root->get_replicas()) {
6065 auto it = acks.find(r.first);
31f18b77
FG
6066 if (it == acks.end())
6067 continue;
6068 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6069 bufferlist bl;
181888fb
FG
6070 root->_encode_locks_state_for_rejoin(bl, r.first);
6071 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6072 }
6073 if (myin)
181888fb
FG
6074 for (auto &r : myin->get_replicas()) {
6075 auto it = acks.find(r.first);
31f18b77
FG
6076 if (it == acks.end())
6077 continue;
6078 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6079 bufferlist bl;
181888fb
FG
6080 myin->_encode_locks_state_for_rejoin(bl, r.first);
6081 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6082 }
6083
6084 // include inode base for any inodes whose scatterlocks may have updated
6085 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6086 p != rejoin_potential_updated_scatterlocks.end();
6087 ++p) {
6088 CInode *in = *p;
181888fb
FG
6089 for (const auto &r : in->get_replicas()) {
6090 auto it = acks.find(r.first);
31f18b77
FG
6091 if (it == acks.end())
6092 continue;
6093 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6094 }
7c673cae
FG
6095 }
6096
6097 // send acks
31f18b77 6098 for (auto p = acks.begin(); p != acks.end(); ++p) {
7c673cae
FG
6099 ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6100 mds->send_message_mds(p->second, p->first);
6101 }
6102
6103 rejoin_imported_caps.clear();
6104}
6105
c07f9fc5
FG
6106class C_MDC_ReIssueCaps : public MDCacheContext {
6107 CInode *in;
6108public:
6109 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6110 MDCacheContext(mdc), in(i)
6111 {
6112 in->get(CInode::PIN_PTRWAITER);
6113 }
6114 void finish(int r) override {
6115 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6116 mdcache->mds->locker->issue_caps(in);
6117 in->put(CInode::PIN_PTRWAITER);
6118 }
6119};
7c673cae
FG
6120
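/*
 * Re-evaluate lock state for every head inode that has client caps and
 * issue whatever can now be granted; frozen inodes are retried once they
 * unfreeze.
 */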
6121void MDCache::reissue_all_caps()
6122{
6123 dout(10) << "reissue_all_caps" << dendl;
6124
94b18763 6125 for (auto &p : inode_map) {
b32b8144 6126 CInode *in = p.second;
7c673cae 6127 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6128 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6129 if (in->is_frozen_inode()) {
6130 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6131 continue;
6132 }
7c673cae
FG
6133 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6134 mds->locker->issue_caps(in);
6135 }
6136 }
6137}
6138
6139
6140// ===============================================================================
6141
6142struct C_MDC_QueuedCow : public MDCacheContext {
6143 CInode *in;
6144 MutationRef mut;
6145 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6146 MDCacheContext(mdc), in(i), mut(m) {}
6147 void finish(int r) override {
6148 mdcache->_queued_file_recover_cow(in, mut);
6149 }
6150};
6151
6152
6153void MDCache::queue_file_recover(CInode *in)
6154{
6155 dout(10) << "queue_file_recover " << *in << dendl;
6156 assert(in->is_auth());
6157
6158 // cow?
6159 /*
6160 SnapRealm *realm = in->find_snaprealm();
6161 set<snapid_t> s = realm->get_snaps();
6162 while (!s.empty() && *s.begin() < in->first)
6163 s.erase(s.begin());
6164 while (!s.empty() && *s.rbegin() > in->last)
6165 s.erase(*s.rbegin());
6166 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6167 if (s.size() > 1) {
94b18763 6168 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6169 pi->version = in->pre_dirty();
6170
6171 auto mut(std::make_shared<MutationImpl>());
6172 mut->ls = mds->mdlog->get_current_segment();
6173 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6174 mds->mdlog->start_entry(le);
6175 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6176
6177 s.erase(*s.begin());
6178 while (!s.empty()) {
6179 snapid_t snapid = *s.begin();
6180 CInode *cow_inode = 0;
6181 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6182 assert(cow_inode);
6183 recovery_queue.enqueue(cow_inode);
6184 s.erase(*s.begin());
6185 }
6186
6187 in->parent->first = in->first;
6188 le->metablob.add_primary_dentry(in->parent, in, true);
6189 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6190 mds->mdlog->flush();
6191 }
6192 */
6193
6194 recovery_queue.enqueue(in);
6195}
6196
6197void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6198{
6199 in->pop_and_dirty_projected_inode(mut->ls);
6200 mut->apply();
6201 mds->locker->drop_locks(mut.get());
6202 mut->cleanup();
6203}
6204
6205
6206/*
6207 * called after recovery to recover file sizes for previously opened (for write)
6208 * files. that is, those where max_size > size.
6209 */
6210void MDCache::identify_files_to_recover()
6211{
6212 dout(10) << "identify_files_to_recover" << dendl;
94b18763 6213 for (auto &p : inode_map) {
b32b8144 6214 CInode *in = p.second;
7c673cae
FG
6215 if (!in->is_auth())
6216 continue;
6217
6218 if (in->last != CEPH_NOSNAP)
6219 continue;
6220
6221 // Only normal files need file size recovery
6222 if (!in->is_file()) {
6223 continue;
6224 }
6225
6226 bool recover = false;
6227 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6228 p != in->inode.client_ranges.end();
6229 ++p) {
6230 Capability *cap = in->get_client_cap(p->first);
6231 if (!cap) {
6232 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6233 recover = true;
6234 break;
6235 }
6236 }
6237
6238 if (recover) {
6239 if (in->filelock.is_stable()) {
6240 in->auth_pin(&in->filelock);
6241 } else {
6242 assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6243 }
6244 in->filelock.set_state(LOCK_PRE_SCAN);
6245 rejoin_recover_q.push_back(in);
6246 } else {
6247 rejoin_check_q.push_back(in);
6248 }
6249 }
6250}
6251
6252void MDCache::start_files_to_recover()
6253{
6254 for (CInode *in : rejoin_check_q) {
6255 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6256 mds->locker->issue_caps(in);
6257 mds->locker->check_inode_max_size(in);
6258 }
6259 rejoin_check_q.clear();
6260 for (CInode *in : rejoin_recover_q) {
6261 mds->locker->file_recover(&in->filelock);
6262 }
6263 if (!rejoin_recover_q.empty()) {
6264 rejoin_recover_q.clear();
6265 do_file_recover();
6266 }
6267}
6268
6269void MDCache::do_file_recover()
6270{
6271 recovery_queue.advance();
6272}
6273
6274// ===============================================================================
6275
6276
6277// ----------------------------
6278// truncate
6279
6280class C_MDC_RetryTruncate : public MDCacheContext {
6281 CInode *in;
6282 LogSegment *ls;
6283public:
6284 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6285 MDCacheContext(c), in(i), ls(l) {}
6286 void finish(int r) override {
6287 mdcache->_truncate_inode(in, ls);
6288 }
6289};
6290
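/*
 * Begin an asynchronous truncate: register the inode with the log
 * segment, and if clients still need to snapflush buffered data, defer
 * until that completes.  _truncate_inode() then asks the OSDs to punch
 * out [truncate_size, truncate_from) and truncate_inode_finish() journals
 * the result.
 */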
6291void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6292{
94b18763 6293 auto pi = in->get_projected_inode();
7c673cae
FG
6294 dout(10) << "truncate_inode "
6295 << pi->truncate_from << " -> " << pi->truncate_size
6296 << " on " << *in
6297 << dendl;
6298
6299 ls->truncating_inodes.insert(in);
6300 in->get(CInode::PIN_TRUNCATING);
6301 in->auth_pin(this);
6302
6303 if (!in->client_need_snapflush.empty() &&
6304 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6305 assert(in->filelock.is_xlocked());
6306 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6307 mds->locker->issue_caps(in);
6308 return;
6309 }
6310
6311 _truncate_inode(in, ls);
6312}
6313
6314struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6315 CInode *in;
6316 LogSegment *ls;
6317 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
91327a77
AA
6318 MDCacheIOContext(c, false), in(i), ls(l) {
6319 }
7c673cae
FG
6320 void finish(int r) override {
6321 assert(r == 0 || r == -ENOENT);
6322 mdcache->truncate_inode_finish(in, ls);
6323 }
91327a77
AA
6324 void print(ostream& out) const override {
6325 out << "file_truncate(" << in->ino() << ")";
6326 }
7c673cae
FG
6327};
6328
6329void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6330{
94b18763 6331 auto pi = &in->inode;
7c673cae
FG
6332 dout(10) << "_truncate_inode "
6333 << pi->truncate_from << " -> " << pi->truncate_size
6334 << " on " << *in << dendl;
6335
6336 assert(pi->is_truncating());
6337 assert(pi->truncate_size < (1ULL << 63));
6338 assert(pi->truncate_from < (1ULL << 63));
6339 assert(pi->truncate_size < pi->truncate_from);
6340
6341
6342 SnapRealm *realm = in->find_snaprealm();
6343 SnapContext nullsnap;
6344 const SnapContext *snapc;
6345 if (realm) {
6346 dout(10) << " realm " << *realm << dendl;
6347 snapc = &realm->get_snap_context();
6348 } else {
6349 dout(10) << " NO realm, using null context" << dendl;
6350 snapc = &nullsnap;
6351 assert(in->last == CEPH_NOSNAP);
6352 }
6353 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6354 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6355 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6356 pi->truncate_seq, ceph::real_time::min(), 0,
6357 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6358 mds->finisher));
6359}
6360
6361struct C_MDC_TruncateLogged : public MDCacheLogContext {
6362 CInode *in;
6363 MutationRef mut;
6364 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6365 MDCacheLogContext(m), in(i), mut(mu) {}
6366 void finish(int r) override {
6367 mdcache->truncate_inode_logged(in, mut);
6368 }
6369};
6370
6371void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6372{
6373 dout(10) << "truncate_inode_finish " << *in << dendl;
6374
6375 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6376 assert(p != ls->truncating_inodes.end());
6377 ls->truncating_inodes.erase(p);
6378
6379 // update
94b18763
FG
6380 auto &pi = in->project_inode();
6381 pi.inode.version = in->pre_dirty();
6382 pi.inode.truncate_from = 0;
6383 pi.inode.truncate_pending--;
7c673cae
FG
6384
6385 MutationRef mut(new MutationImpl());
6386 mut->ls = mds->mdlog->get_current_segment();
6387 mut->add_projected_inode(in);
6388
6389 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6390 mds->mdlog->start_entry(le);
6391 CDentry *dn = in->get_projected_parent_dn();
6392 le->metablob.add_dir_context(dn->get_dir());
6393 le->metablob.add_primary_dentry(dn, in, true);
6394 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6395
6396 journal_dirty_inode(mut.get(), &le->metablob, in);
6397 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6398
6399 // flush immediately if there are readers/writers waiting
6400 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6401 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6402 mds->mdlog->flush();
6403}
6404
6405void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6406{
6407 dout(10) << "truncate_inode_logged " << *in << dendl;
6408 mut->apply();
6409 mds->locker->drop_locks(mut.get());
6410 mut->cleanup();
6411
6412 in->put(CInode::PIN_TRUNCATING);
6413 in->auth_unpin(this);
6414
6415 list<MDSInternalContextBase*> waiters;
6416 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6417 mds->queue_waiters(waiters);
6418}
6419
6420
6421void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6422{
6423 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6424 << ls->seq << "/" << ls->offset << dendl;
6425 ls->truncating_inodes.insert(in);
6426 in->get(CInode::PIN_TRUNCATING);
6427}
6428
6429void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6430{
6431 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6432 << ls->seq << "/" << ls->offset << dendl;
6433 // if we have the logseg the truncate started in, it must be in our list.
6434 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6435 assert(p != ls->truncating_inodes.end());
6436 ls->truncating_inodes.erase(p);
6437 in->put(CInode::PIN_TRUNCATING);
6438}
6439
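/*
 * Restart truncates that were journalled but not completed before a
 * failure: every inode recorded in a log segment's truncating_inodes set
 * is auth-pinned and re-driven through _truncate_inode(), deferring those
 * that still await a client snapflush of buffered data.
 */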
6440void MDCache::start_recovered_truncates()
6441{
6442 dout(10) << "start_recovered_truncates" << dendl;
6443 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6444 p != mds->mdlog->segments.end();
6445 ++p) {
6446 LogSegment *ls = p->second;
6447 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6448 q != ls->truncating_inodes.end();
6449 ++q) {
6450 CInode *in = *q;
6451 in->auth_pin(this);
6452
6453 if (!in->client_need_snapflush.empty() &&
6454 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6455 assert(in->filelock.is_stable());
6456 in->filelock.set_state(LOCK_XLOCKDONE);
6457 in->auth_pin(&in->filelock);
6458 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6459 // start_files_to_recover will revoke caps
6460 continue;
6461 }
6462 _truncate_inode(in, ls);
6463 }
6464 }
6465}
6466
6467
6468
6469
6470
6471
6472// ================================================================================
6473// cache trimming
6474
181888fb
FG
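/*
 * Expire dentries from the bottom LRU (always fully drained) and from the
 * main LRU until the requested count has been trimmed and the cache is
 * back under its memory target.  Dentries that cannot be trimmed yet are
 * re-inserted mid-LRU; expire messages for replicas accumulate in
 * expiremap for the caller to send.
 */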
6475void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
6476{
7c673cae 6477 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6478 std::vector<CDentry *> unexpirables;
6479 uint64_t trimmed = 0;
6480
6481 dout(7) << "trim_lru trimming " << count
6482 << " items from LRU"
6483 << " size=" << lru.lru_get_size()
6484 << " mid=" << lru.lru_get_top()
6485 << " pintail=" << lru.lru_get_pintail()
6486 << " pinned=" << lru.lru_get_num_pinned()
6487 << dendl;
7c673cae 6488
31f18b77
FG
6489 for (;;) {
6490 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6491 if (!dn)
6492 break;
6493 if (trim_dentry(dn, expiremap)) {
6494 unexpirables.push_back(dn);
181888fb
FG
6495 } else {
6496 trimmed++;
31f18b77
FG
6497 }
6498 }
6499
181888fb 6500 for (auto &dn : unexpirables) {
31f18b77 6501 bottom_lru.lru_insert_mid(dn);
181888fb 6502 }
31f18b77
FG
6503 unexpirables.clear();
6504
181888fb
FG
6505 // trim dentries from the LRU until the requested count is reached and the cache is back under its memory target
6506 while (cache_toofull() || count > 0) {
7c673cae
FG
6507 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6508 if (!dn) {
6509 break;
6510 }
7c673cae 6511 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6512 dn->get_linkage()->inode->item_open_file.is_on_list())) {
7c673cae 6513 unexpirables.push_back(dn);
181888fb
FG
6514 } else if (trim_dentry(dn, expiremap)) {
6515 unexpirables.push_back(dn);
6516 } else {
6517 trimmed++;
3efd9988 6518 if (count > 0) count--;
7c673cae
FG
6519 }
6520 }
181888fb
FG
6521
6522 for (auto &dn : unexpirables) {
31f18b77 6523 lru.lru_insert_mid(dn);
181888fb 6524 }
31f18b77 6525 unexpirables.clear();
7c673cae 6526
181888fb
FG
6527 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6528}
6529
6530/*
6531 * note: only called while MDS is active or stopping... NOT during recovery.
6532 * however, we may expire a replica whose authority is recovering.
6533 *
6534 * @param count is number of dentries to try to expire
6535 */
6536bool MDCache::trim(uint64_t count)
6537{
6538 uint64_t used = cache_size();
91327a77 6539 uint64_t limit = cache_memory_limit;
181888fb
FG
6540 map<mds_rank_t, MCacheExpire*> expiremap;
6541
6542 dout(7) << "trim bytes_used=" << bytes2str(used)
6543 << " limit=" << bytes2str(limit)
91327a77 6544 << " reservation=" << cache_reservation
181888fb
FG
6545 << "% count=" << count << dendl;
6546
6547 // process delayed eval_stray()
6548 stray_manager.advance_delayed();
6549
6550 trim_lru(count, expiremap);
6551
7c673cae 6552 // trim non-auth, non-bound subtrees
181888fb 6553 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6554 CDir *dir = p->first;
6555 ++p;
31f18b77
FG
6556 CInode *diri = dir->get_inode();
6557 if (dir->is_auth()) {
6558 if (!diri->is_auth() && !diri->is_base() &&
6559 dir->get_num_head_items() == 0) {
6560 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6561 !(mds->is_active() || mds->is_stopping()) ||
31f18b77
FG
6562 dir->is_freezing() || dir->is_frozen())
6563 continue;
6564
6565 migrator->export_empty_import(dir);
6566 }
6567 } else {
6568 if (!diri->is_auth()) {
6569 if (dir->get_num_ref() > 1) // only subtree pin
6570 continue;
6571 list<CDir*> ls;
6572 diri->get_subtree_dirfrags(ls);
6573 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6574 continue;
6575
6576 // don't trim subtree root if its auth MDS is recovering.
6577 // This simplifies the cache rejoin code.
6578 if (dir->is_subtree_root() &&
6579 rejoin_ack_gather.count(dir->get_dir_auth().first))
6580 continue;
7c673cae 6581 trim_dirfrag(dir, 0, expiremap);
31f18b77 6582 }
7c673cae
FG
6583 }
6584 }
6585
6586 // trim root?
181888fb 6587 if (mds->is_stopping() && root) {
7c673cae
FG
6588 list<CDir*> ls;
6589 root->get_dirfrags(ls);
6590 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6591 CDir *dir = *p;
6592 if (dir->get_num_ref() == 1) // subtree pin
6593 trim_dirfrag(dir, 0, expiremap);
6594 }
6595 if (root->get_num_ref() == 0)
6596 trim_inode(0, root, 0, expiremap);
6597 }
6598
6599 std::set<mds_rank_t> stopping;
6600 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6601 stopping.erase(mds->get_nodeid());
6602 for (auto rank : stopping) {
6603 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6604 if (!mdsdir_in)
6605 continue;
6606
6607 if (expiremap.count(rank) == 0) {
6608 expiremap[rank] = new MCacheExpire(mds->get_nodeid());
6609 }
6610
6611 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6612
6613 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6614 if (!aborted) {
6615 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6616 list<CDir*> ls;
6617 mdsdir_in->get_dirfrags(ls);
6618 for (auto dir : ls) {
6619 if (dir->get_num_ref() == 1) // subtree pin
6620 trim_dirfrag(dir, dir, expiremap);
6621 }
6622 if (mdsdir_in->get_num_ref() == 0)
6623 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6624 } else {
6625 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6626 }
6627 }
6628
6629 // Other rank's base inodes (when I'm stopping)
181888fb 6630 if (mds->is_stopping()) {
7c673cae
FG
6631 for (set<CInode*>::iterator p = base_inodes.begin();
6632 p != base_inodes.end(); ++p) {
6633 if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
6634 dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
6635 if ((*p)->get_num_ref() == 0) {
6636 trim_inode(NULL, *p, NULL, expiremap);
6637 }
6638 }
6639 }
6640 }
6641
6642 // send any expire messages
6643 send_expire_messages(expiremap);
6644
6645 return true;
6646}
6647
6648void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
6649{
6650 // send expires
6651 for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
6652 it != expiremap.end();
6653 ++it) {
6654 if (mds->is_cluster_degraded() &&
6655 (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
6656 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
6657 rejoin_sent.count(it->first) == 0))) {
6658 it->second->put();
6659 continue;
6660 }
6661 dout(7) << "sending cache_expire to " << it->first << dendl;
6662 mds->send_message_mds(it->second, it->first);
6663 }
6664}
6665
6666
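/*
 * Try to expire a single dentry (and its primary inode, if any).  Returns
 * true if the dentry must stay in cache for now (unreadable replica,
 * stray being purged, ...), false once it has been removed.  For replicas
 * we record the expiry in expiremap so the authority learns this rank no
 * longer holds the replica.
 */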
6667bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
6668{
6669 dout(12) << "trim_dentry " << *dn << dendl;
6670
6671 CDentry::linkage_t *dnl = dn->get_linkage();
6672
6673 CDir *dir = dn->get_dir();
6674 assert(dir);
6675
6676 CDir *con = get_subtree_root(dir);
6677 if (con)
6678 dout(12) << " in container " << *con << dendl;
6679 else {
6680 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6681 assert(dn->is_auth());
6682 }
6683
6684 // If a replica dentry is not readable, it's likely we will receive an
6685 // MDentryLink/MDentryUnlink message soon (it's possible we first
6686 // receive an MDentryUnlink message, then an MDentryLink message).
6687 // An MDentryLink message only replicates an inode, so we should
6688 // avoid trimming the inode's parent dentry, because unconnected
6689 // replicas are problematic for subtree migration.
6690 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6691 !dn->get_dir()->get_inode()->is_stray())
6692 return true;
6693
6694 // adjust the dir state
6695 // NOTE: we can safely remove a clean, null dentry without affecting
6696 // directory completeness.
6697 // (check this _before_ we unlink the inode, below!)
6698 bool clear_complete = false;
6699 if (!(dnl->is_null() && dn->is_clean()))
6700 clear_complete = true;
6701
6702 // unlink the dentry
6703 if (dnl->is_remote()) {
6704 // just unlink.
31f18b77 6705 dir->unlink_inode(dn, false);
7c673cae
FG
6706 } else if (dnl->is_primary()) {
6707 // expire the inode, too.
6708 CInode *in = dnl->get_inode();
6709 assert(in);
6710 if (trim_inode(dn, in, con, expiremap))
6711 return true; // purging stray instead of trimming
6712 } else {
6713 assert(dnl->is_null());
6714 }
6715
6716 if (!dn->is_auth()) {
6717 // notify dentry authority.
6718 mds_authority_t auth = dn->authority();
6719
6720 for (int p=0; p<2; p++) {
6721 mds_rank_t a = auth.first;
6722 if (p) a = auth.second;
6723 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6724 if (mds->get_nodeid() == auth.second &&
6725 con->is_importing()) break; // don't send any expire while importing.
6726 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6727
6728 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6729 assert(a != mds->get_nodeid());
6730 if (expiremap.count(a) == 0)
6731 expiremap[a] = new MCacheExpire(mds->get_nodeid());
94b18763 6732 expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6733 }
6734 }
6735
6736 // remove dentry
6737 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6738 dir->add_to_bloom(dn);
6739 dir->remove_dentry(dn);
6740
6741 if (clear_complete)
6742 dir->state_clear(CDir::STATE_COMPLETE);
6743
7c673cae
FG
6744 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6745 return false;
6746}
6747
6748
6749void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6750{
6751 dout(15) << "trim_dirfrag " << *dir << dendl;
6752
6753 if (dir->is_subtree_root()) {
6754 assert(!dir->is_auth() ||
6755 (!dir->is_replicated() && dir->inode->is_base()));
6756 remove_subtree(dir); // remove from subtree map
6757 }
6758 assert(dir->get_num_ref() == 0);
6759
6760 CInode *in = dir->get_inode();
6761
6762 if (!dir->is_auth()) {
6763 mds_authority_t auth = dir->authority();
6764
6765 // was this an auth delegation? (if so, slightly modified container)
6766 dirfrag_t condf;
6767 if (dir->is_subtree_root()) {
6768 dout(12) << " subtree root, container is " << *dir << dendl;
6769 con = dir;
6770 condf = dir->dirfrag();
6771 } else {
6772 condf = con->dirfrag();
6773 }
6774
6775 for (int p=0; p<2; p++) {
6776 mds_rank_t a = auth.first;
6777 if (p) a = auth.second;
6778 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6779 if (mds->get_nodeid() == auth.second &&
6780 con->is_importing()) break; // don't send any expire while importing.
6781 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6782
6783 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6784 assert(a != mds->get_nodeid());
6785 if (expiremap.count(a) == 0)
6786 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6787 expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6788 }
6789 }
6790
6791 in->close_dirfrag(dir->dirfrag().frag);
6792}
6793
6794/**
6795 * Try trimming an inode from the cache
6796 *
6797 * @return true if the inode is still in cache, else false if it was trimmed
6798 */
6799bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
6800{
6801 dout(15) << "trim_inode " << *in << dendl;
6802 assert(in->get_num_ref() == 0);
6803
6804 if (in->is_dir()) {
6805 // If replica inode's dirfragtreelock is not readable, it's likely
6806 // some dirfrags of the inode are being fragmented and we will receive
6807 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6808 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6809 // This is because unconnected replicas are problematic for
6810 // subtree migration.
6811 //
28e407b8 6812 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
7c673cae 6813 return true;
28e407b8 6814 }
6815
6816 // DIR
6817 list<CDir*> dfls;
6818 in->get_dirfrags(dfls);
6819 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6820 CDir *dir = *p;
6821 assert(!dir->is_subtree_root());
6822 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6823 }
6824 }
6825
6826 // INODE
6827 if (in->is_auth()) {
6828 // eval stray after closing dirfrags
6829 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6830 maybe_eval_stray(in);
6831 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6832 return true;
6833 }
6834 } else {
6835 mds_authority_t auth = in->authority();
6836
6837 dirfrag_t df;
6838 if (con)
6839 df = con->dirfrag();
6840 else
6841 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6842
6843 for (int p=0; p<2; p++) {
6844 mds_rank_t a = auth.first;
6845 if (p) a = auth.second;
6846 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6847 if (con && mds->get_nodeid() == auth.second &&
6848 con->is_importing()) break; // don't send any expire while importing.
6849 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6850
6851 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
6852 assert(a != mds->get_nodeid());
6853 if (expiremap.count(a) == 0)
6854 expiremap[a] = new MCacheExpire(mds->get_nodeid());
6855 expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
6856 }
6857 }
6858
6859 /*
6860 if (in->is_auth()) {
6861 if (in->hack_accessed)
6862 mds->logger->inc("outt");
6863 else {
6864 mds->logger->inc("outut");
6865 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6866 }
6867 }
6868 */
6869
6870 // unlink
6871 if (dn)
31f18b77 6872 dn->get_dir()->unlink_inode(dn, false);
6873 remove_inode(in);
6874 return false;
6875}
6876
6877
6878/**
6879 * trim_non_auth - remove any non-auth items from our cache
6880 *
6881 * this reduces the amount of non-auth metadata in our cache, reducing the
6882 * load incurred by the rejoin phase.
6883 *
6884 * the only non-auth items that remain are those that are needed to
6885 * attach our own subtrees to the root.
6886 *
6887 * when we are done, all dentries will be in the top bit of the lru.
6888 *
6889 * why we have to do this:
6890 * we may not have accurate linkage for non-auth items, which means we will
6891 * not know which subtree an item falls into, and cannot be sure to declare it
6892 * to the correct authority.
6893 */
6894void MDCache::trim_non_auth()
6895{
6896 dout(7) << "trim_non_auth" << dendl;
6897
6898 // temporarily pin all subtree roots
6899 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6900 p != subtrees.end();
6901 ++p)
6902 p->first->get(CDir::PIN_SUBTREETEMP);
6903
31f18b77 6904 list<CDentry*> auth_list;
6905
6906 // trim non-auth items from the lru
6907 for (;;) {
6908 CDentry *dn = NULL;
6909 if (bottom_lru.lru_get_size() > 0)
6910 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6911 if (!dn && lru.lru_get_size() > 0)
6912 dn = static_cast<CDentry*>(lru.lru_expire());
6913 if (!dn)
6914 break;
6915
6916 CDentry::linkage_t *dnl = dn->get_linkage();
6917
6918 if (dn->is_auth()) {
6919 // add back into lru (at the top)
31f18b77 6920 auth_list.push_back(dn);
6921
6922 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6923 dn->unlink_remote(dnl);
6924 } else {
6925 // non-auth. expire.
6926 CDir *dir = dn->get_dir();
6927 assert(dir);
6928
6929 // unlink the dentry
6930 dout(10) << " removing " << *dn << dendl;
6931 if (dnl->is_remote()) {
31f18b77 6932 dir->unlink_inode(dn, false);
6933 }
6934 else if (dnl->is_primary()) {
6935 CInode *in = dnl->get_inode();
6936 dout(10) << " removing " << *in << dendl;
6937 list<CDir*> ls;
6938 in->get_dirfrags(ls);
6939 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6940 CDir *subdir = *p;
6941 assert(!subdir->is_subtree_root());
6942 in->close_dirfrag(subdir->dirfrag().frag);
6943 }
31f18b77 6944 dir->unlink_inode(dn, false);
6945 remove_inode(in);
6946 }
6947 else {
6948 assert(dnl->is_null());
6949 }
6950
6951 assert(!dir->has_bloom());
6952 dir->remove_dentry(dn);
6953 // adjust the dir state
6954 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
6955 // close empty non-auth dirfrag
6956 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
6957 dir->inode->close_dirfrag(dir->get_frag());
6958 }
6959 }
6960
6961 for (auto dn : auth_list) {
6962 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
6963 bottom_lru.lru_insert_mid(dn);
6964 else
6965 lru.lru_insert_top(dn);
6966 }
6967
6968 // move everything in the pintail to the top bit of the lru.
6969 lru.lru_touch_entire_pintail();
6970
6971 // unpin all subtrees
6972 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6973 p != subtrees.end();
6974 ++p)
6975 p->first->put(CDir::PIN_SUBTREETEMP);
6976
6977 if (lru.lru_get_size() == 0 &&
6978 bottom_lru.lru_get_size() == 0) {
7c673cae 6979 // root, stray, etc.?
b32b8144 6980 auto p = inode_map.begin();
7c673cae 6981 while (p != inode_map.end()) {
7c673cae 6982 CInode *in = p->second;
b32b8144 6983 ++p;
6984 if (!in->is_auth()) {
6985 list<CDir*> ls;
6986 in->get_dirfrags(ls);
6987 for (list<CDir*>::iterator p = ls.begin();
6988 p != ls.end();
6989 ++p) {
6990 dout(10) << " removing " << **p << dendl;
6991 assert((*p)->get_num_ref() == 1); // SUBTREE
6992 remove_subtree((*p));
6993 in->close_dirfrag((*p)->dirfrag().frag);
6994 }
6995 dout(10) << " removing " << *in << dendl;
6996 assert(!in->get_parent_dn());
6997 assert(in->get_num_ref() == 0);
6998 remove_inode(in);
6999 }
7000 }
7001 }
7002
7003 show_subtrees();
7004}
7005
7006/**
7007 * Recursively trim the subtree rooted at directory to remove all
7008 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7009 * of those links. This is used to clear invalid data out of the cache.
7010 * Note that it doesn't clear the passed-in directory, since that's not
7011 * always safe.
7012 */
7013bool MDCache::trim_non_auth_subtree(CDir *dir)
7014{
7015 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7016
7017 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7018
7019 auto j = dir->begin();
7020 auto i = j;
7021 while (j != dir->end()) {
7022 i = j++;
7023 CDentry *dn = i->second;
7024 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7025 CDentry::linkage_t *dnl = dn->get_linkage();
7026 if (dnl->is_primary()) { // check for subdirectories, etc
7027 CInode *in = dnl->get_inode();
7028 bool keep_inode = false;
7029 if (in->is_dir()) {
7030 list<CDir*> subdirs;
7031 in->get_dirfrags(subdirs);
7032 for (list<CDir*>::iterator subdir = subdirs.begin();
7033 subdir != subdirs.end();
7034 ++subdir) {
7035 if ((*subdir)->is_subtree_root()) {
7036 keep_inode = true;
7037 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7038 } else {
7039 if (trim_non_auth_subtree(*subdir))
7040 keep_inode = true;
7041 else {
7042 in->close_dirfrag((*subdir)->get_frag());
7043 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7044 }
7045 }
7046 }
7047
7048 }
7049 if (!keep_inode) { // remove it!
7050 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl;
31f18b77 7051 dir->unlink_inode(dn, false);
7052 remove_inode(in);
7053 assert(!dir->has_bloom());
7054 dir->remove_dentry(dn);
7055 } else {
7056 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7057 dn->state_clear(CDentry::STATE_AUTH);
7058 in->state_clear(CInode::STATE_AUTH);
7059 }
7060 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7061 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7062 } else { // just remove it
7063 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7064 if (dnl->is_remote())
31f18b77 7065 dir->unlink_inode(dn, false);
7066 dir->remove_dentry(dn);
7067 }
7068 }
7069 dir->state_clear(CDir::STATE_AUTH);
7070 /**
7071 * We've now checked all our children and deleted those that need it.
7072 * Now return to caller, and tell them if *we're* a keeper.
7073 */
7074 return keep_dir || dir->get_num_any();
7075}
7076
7077/*
7078 * during replay, when we determine a subtree is no longer ours, we
7079 * try to trim it from our cache. because subtrees must be connected
7080 * to the root, the fact that we can trim this tree may mean that our
7081 * children or parents can also be trimmed.
7082 */
7083void MDCache::try_trim_non_auth_subtree(CDir *dir)
7084{
7085 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7086
7087 // can we now trim child subtrees?
7088 set<CDir*> bounds;
7089 get_subtree_bounds(dir, bounds);
7090 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7091 CDir *bd = *p;
7092 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7093 bd->get_num_any() == 0 && // and empty
7094 can_trim_non_auth_dirfrag(bd)) {
7095 CInode *bi = bd->get_inode();
7096 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7097 remove_subtree(bd);
7098 bd->mark_clean();
7099 bi->close_dirfrag(bd->get_frag());
7100 }
7101 }
7102
7103 if (trim_non_auth_subtree(dir)) {
7104 // keep
7105 try_subtree_merge(dir);
7106 } else {
7107 // can we trim this subtree (and possibly our ancestors) too?
7108 while (true) {
7109 CInode *diri = dir->get_inode();
7110 if (diri->is_base()) {
7111 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7112 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7113 remove_subtree(dir);
7114 dir->mark_clean();
7115 diri->close_dirfrag(dir->get_frag());
7116
7117 dout(10) << " removing " << *diri << dendl;
7118 assert(!diri->get_parent_dn());
7119 assert(diri->get_num_ref() == 0);
7120 remove_inode(diri);
7121 }
7122 break;
7123 }
7124
7125 CDir *psub = get_subtree_root(diri->get_parent_dir());
7126 dout(10) << " parent subtree is " << *psub << dendl;
7127 if (psub->get_dir_auth().first == mds->get_nodeid())
7128 break; // we are auth, keep.
7129
7130 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7131 remove_subtree(dir);
7132 dir->mark_clean();
7133 diri->close_dirfrag(dir->get_frag());
7134
7135 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7136 if (trim_non_auth_subtree(psub))
7137 break;
7138 dir = psub;
7139 }
7140 }
7141
7142 show_subtrees();
7143}
7144
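// standby_trim_segment: used when a standby-replay daemon expires a log
// segment. Nothing is written back; the segment's dirty lists are simply
// drained and the objects marked clean so they can be trimmed normally.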
7145void MDCache::standby_trim_segment(LogSegment *ls)
7146{
7147 ls->new_dirfrags.clear_list();
7148 ls->open_files.clear_list();
7149
7150 while (!ls->dirty_dirfrags.empty()) {
7151 CDir *dir = ls->dirty_dirfrags.front();
7152 dir->mark_clean();
7153 }
7154 while (!ls->dirty_inodes.empty()) {
7155 CInode *in = ls->dirty_inodes.front();
7156 in->mark_clean();
7157 }
7158 while (!ls->dirty_dentries.empty()) {
7159 CDentry *dn = ls->dirty_dentries.front();
7160 dn->mark_clean();
7161 }
7162 while (!ls->dirty_parent_inodes.empty()) {
7163 CInode *in = ls->dirty_parent_inodes.front();
7164 in->clear_dirty_parent();
7165 }
7166 while (!ls->dirty_dirfrag_dir.empty()) {
7167 CInode *in = ls->dirty_dirfrag_dir.front();
7168 in->filelock.remove_dirty();
7169 }
7170 while (!ls->dirty_dirfrag_nest.empty()) {
7171 CInode *in = ls->dirty_dirfrag_nest.front();
7172 in->nestlock.remove_dirty();
7173 }
7174 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7175 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7176 in->dirfragtreelock.remove_dirty();
7177 }
7178}
7179
7180/* This function DOES put the passed message before returning */
7181void MDCache::handle_cache_expire(MCacheExpire *m)
7182{
7183 mds_rank_t from = mds_rank_t(m->get_from());
7184
7185 dout(7) << "cache_expire from mds." << from << dendl;
7186
7187 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7188 m->put();
7189 return;
7190 }
7191
7192 set<SimpleLock *> gather_locks;
7193 // loop over realms
7194 for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
7195 p != m->realms.end();
7196 ++p) {
7197 // check container?
7198 if (p->first.ino > 0) {
7199 CInode *expired_inode = get_inode(p->first.ino);
7200 assert(expired_inode); // we had better have this.
7201 CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
7202 assert(parent_dir);
7203
7204 int export_state = -1;
7205 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7206 export_state = migrator->get_export_state(parent_dir);
7207 assert(export_state >= 0);
7208 }
7209
7210 if (!parent_dir->is_auth() ||
7211 (export_state != -1 &&
7212 ((export_state == Migrator::EXPORT_WARNING &&
7213 migrator->export_has_warned(parent_dir,from)) ||
7214 export_state == Migrator::EXPORT_EXPORTING ||
7215 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7216 (export_state == Migrator::EXPORT_NOTIFYING &&
7217 !migrator->export_has_notified(parent_dir,from))))) {
7218
7219 // not auth.
7220 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7221 assert(parent_dir->is_frozen_tree_root());
7222
7223 // make a message container
7224 if (delayed_expire[parent_dir].count(from) == 0)
7225 delayed_expire[parent_dir][from] = new MCacheExpire(from);
7226
7227 // merge these expires into it
7228 delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
7229 continue;
7230 }
7231 assert(export_state <= Migrator::EXPORT_PREPPING ||
7232 (export_state == Migrator::EXPORT_WARNING &&
7233 !migrator->export_has_warned(parent_dir, from)));
7234
7235 dout(7) << "expires for " << *parent_dir << dendl;
7236 } else {
7237 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7238 }
7239
7240 // INODES
7241 for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
7242 it != p->second.inodes.end();
7243 ++it) {
7244 CInode *in = get_inode(it->first);
7245 unsigned nonce = it->second;
7246
7247 if (!in) {
7248 dout(0) << " inode expire on " << it->first << " from " << from
7249 << ", don't have it" << dendl;
7250 assert(in);
7251 }
7252 assert(in->is_auth());
7253 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7254
7255 // check nonce
7256 if (nonce == in->get_replica_nonce(from)) {
7257 // remove from our cached_by
7258 dout(7) << " inode expire on " << *in << " from mds." << from
7259 << " cached_by was " << in->get_replicas() << dendl;
7260 inode_remove_replica(in, from, false, gather_locks);
7261 }
7262 else {
7263 // this is an old nonce, ignore expire.
7264 dout(7) << " inode expire on " << *in << " from mds." << from
7265 << " with old nonce " << nonce
7266 << " (current " << in->get_replica_nonce(from) << "), dropping"
7267 << dendl;
7268 }
7269 }
7270
7271 // DIRS
7272 for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
7273 it != p->second.dirs.end();
7274 ++it) {
7275 CDir *dir = get_dirfrag(it->first);
7276 unsigned nonce = it->second;
7277
7278 if (!dir) {
7279 CInode *diri = get_inode(it->first.ino);
7280 if (diri) {
7281 if (mds->is_rejoin() &&
7282 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7283 !diri->is_replica(from)) {
7284 list<CDir*> ls;
7285 diri->get_nested_dirfrags(ls);
7286 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7287 << " while rejoining, inode isn't replicated" << dendl;
7288 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7289 dir = *q;
7290 if (dir->is_replica(from)) {
7291 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7292 dir->remove_replica(from);
7293 }
7294 }
7295 continue;
7296 }
7297 CDir *other = diri->get_approx_dirfrag(it->first.frag);
7298 if (other) {
7299 dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
7300 << " have " << *other << ", mismatched frags, dropping" << dendl;
7301 continue;
7302 }
7303 }
7304 dout(0) << " dir expire on " << it->first << " from " << from
7305 << ", don't have it" << dendl;
7306 assert(dir);
7307 }
7308 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7309
7310 assert(dir->is_auth());
7311
7312 // check nonce
7313 if (nonce == dir->get_replica_nonce(from)) {
7314 // remove from our cached_by
7315 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7316 << " replicas was " << dir->get_replicas() << dendl;
7317 dir->remove_replica(from);
7318 }
7319 else {
7320 // this is an old nonce, ignore expire.
7321 dout(7) << " dir expire on " << *dir << " from mds." << from
7322 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7323 << "), dropping" << dendl;
7324 }
7325 }
7326
7327 // DENTRIES
7328 for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
7329 pd != p->second.dentries.end();
7330 ++pd) {
7331 dout(10) << " dn expires in dir " << pd->first << dendl;
7332 CInode *diri = get_inode(pd->first.ino);
7333 assert(diri);
7334 CDir *dir = diri->get_dirfrag(pd->first.frag);
7335
7336 if (!dir) {
7337 dout(0) << " dn expires on " << pd->first << " from " << from
7338 << ", must have refragmented" << dendl;
7339 } else {
7340 assert(dir->is_auth());
7341 }
7342
7343 for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
7344 p != pd->second.end();
7345 ++p) {
7346 unsigned nonce = p->second;
7347 CDentry *dn;
7348
7349 if (dir) {
7350 dn = dir->lookup(p->first.first, p->first.second);
7351 } else {
7352 // which dirfrag for this dentry?
7353 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
7354 assert(dir);
7355 assert(dir->is_auth());
7356 dn = dir->lookup(p->first.first, p->first.second);
7357 }
7358
7359 if (!dn) {
7360 if (dir)
7361 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
7362 else
7363 dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
7364 }
7365 assert(dn);
7366
7367 if (nonce == dn->get_replica_nonce(from)) {
7368 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7369 dentry_remove_replica(dn, from, gather_locks);
7370 }
7371 else {
7372 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7373 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7374 << "), dropping" << dendl;
7375 }
7376 }
7377 }
7378 }
7379
7380 // done
7381 m->put();
7382
7383 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7384 if (!(*p)->is_stable())
7385 mds->locker->eval_gather(*p);
7386 }
7387}
7388
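// Replay any MCacheExpire messages that were postponed while this dir was in
// the middle of an export (see the delayed_expire handling above).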
7389void MDCache::process_delayed_expire(CDir *dir)
7390{
7391 dout(7) << "process_delayed_expire on " << *dir << dendl;
7392 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7393 p != delayed_expire[dir].end();
7394 ++p)
7395 handle_cache_expire(p->second);
7396 delayed_expire.erase(dir);
7397}
7398
7399void MDCache::discard_delayed_expire(CDir *dir)
7400{
7401 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7402 for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
7403 p != delayed_expire[dir].end();
7404 ++p)
7405 p->second->put();
7406 delayed_expire.erase(dir);
7407}
7408
7409void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7410 set<SimpleLock *>& gather_locks)
7411{
7412 in->remove_replica(from);
7413 in->mds_caps_wanted.erase(from);
7414
7415 // note: this code calls _eval more often than it needs to!
7416 // fix lock
7417 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7418 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7419 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7420 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7421 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7422 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7423
7424 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7425 // Don't remove the recovering mds from lock's gathering list because
7426 // it may hold rejoined wrlocks.
7427 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7428 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7429 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7430}
7431
7432void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7433{
7434 dn->remove_replica(from);
7435
7436 // fix lock
7437 if (dn->lock.remove_replica(from))
7438 gather_locks.insert(&dn->lock);
7439
7440 // Replicated strays might now be eligible for purge
7441 CDentry::linkage_t *dnl = dn->get_linkage();
7442 if (dnl->is_primary()) {
7443 maybe_eval_stray(dnl->get_inode());
7444 }
7445}
7446
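// trim_client_leases: revoke client dentry leases whose ttl has passed. Each
// pool's lease list is kept in expiry order, so we stop at the first lease
// that is still valid.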
7447void MDCache::trim_client_leases()
7448{
7449 utime_t now = ceph_clock_now();
7450
7451 dout(10) << "trim_client_leases" << dendl;
7452
7453 for (int pool=0; pool<client_lease_pools; pool++) {
7454 int before = client_leases[pool].size();
7455 if (client_leases[pool].empty())
7456 continue;
7457
7458 while (!client_leases[pool].empty()) {
7459 ClientLease *r = client_leases[pool].front();
7460 if (r->ttl > now) break;
7461 CDentry *dn = static_cast<CDentry*>(r->parent);
7462 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7463 dn->remove_client_lease(r, mds->locker);
7464 }
7465 int after = client_leases[pool].size();
7466 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7467 << (before-after) << " leases, " << after << " left" << dendl;
7468 }
7469}
7470
7471
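// check_memory_usage: sample process memory, update the cache perf counters,
// ask clients to release capabilities when the cache is too full, and (with
// tcmalloc) return freed heap to the OS once we are back under the limit.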
7472void MDCache::check_memory_usage()
7473{
7474 static MemoryModel mm(g_ceph_context);
7475 static MemoryModel::snap last;
7476 mm.sample(&last);
7477 static MemoryModel::snap baseline = last;
7478
7479 // check client caps
b32b8144 7480 assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7481 double caps_per_inode = 0.0;
7c673cae 7482 if (CInode::count())
181888fb 7483 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7484
7485 dout(2) << "check_memory_usage"
7486 << " total " << last.get_total()
7487 << ", rss " << last.get_rss()
7488 << ", heap " << last.get_heap()
7489 << ", baseline " << baseline.get_heap()
7490 << ", buffers " << (buffer::get_total_alloc() >> 10)
7491 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7492 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7493 << dendl;
7494
c07f9fc5 7495 mds->update_mlogger();
7496 mds->mlogger->set(l_mdm_rss, last.get_rss());
7497 mds->mlogger->set(l_mdm_heap, last.get_heap());
7498
181888fb 7499 if (cache_toofull()) {
91327a77 7500 last_recall_state = clock::now();
f64942e4 7501 mds->server->recall_client_state(-1.0, false, nullptr);
7502 }
7503
7504 // If the cache size had exceeded its limit, but we're back in bounds
7505 // now, free any unused pool memory so that our memory usage isn't
7506 // permanently bloated.
181888fb 7507 if (exceeded_size_limit && !cache_toofull()) {
7508 // Only do this once we are back in bounds: otherwise the releases would
7509 // slow down whatever process caused us to exceed bounds to begin with
7510 if (ceph_using_tcmalloc()) {
7511 dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
7512 << dendl;
7513 ceph_heap_release_free_memory();
7514 }
7515 exceeded_size_limit = false;
7516 }
7517}
7518
7519
7520
7521// =========================================================================================
7522// shutdown
7523
7524class C_MDC_ShutdownCheck : public MDCacheContext {
7525public:
7526 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7527 void finish(int) override {
7528 mdcache->shutdown_check();
7529 }
7530};
7531
7532void MDCache::shutdown_check()
7533{
7534 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7535
7536 // cache
7537 char old_val[32] = { 0 };
7538 char *o = old_val;
7539 g_conf->get_val("debug_mds", &o, sizeof(old_val));
7540 g_conf->set_val("debug_mds", "10");
7541 g_conf->apply_changes(NULL);
7542 show_cache();
7543 g_conf->set_val("debug_mds", old_val);
7544 g_conf->apply_changes(NULL);
7545 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7546
7547 // this
31f18b77 7548 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7549 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7550
7551
7552 if (mds->objecter->is_active()) {
7553 dout(0) << "objecter still active" << dendl;
7554 mds->objecter->dump_active();
7555 }
7556}
7557
7558
7559void MDCache::shutdown_start()
7560{
7561 dout(2) << "shutdown_start" << dendl;
7562
7563 if (g_conf->mds_shutdown_check)
7564 mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7565
7566 // g_conf->debug_mds = 10;
7567}
7568
7569
7570
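// shutdown_pass is called repeatedly while the rank is stopping. Each stage
// below returns false ("not finished yet, call again") until strays are
// exported, the cache is trimmed, auth subtrees are handed off, client
// sessions are closed, the journal is trimmed and capped, and no pins or
// replicas remain.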
7571bool MDCache::shutdown_pass()
7572{
7573 dout(7) << "shutdown_pass" << dendl;
7574
7575 if (mds->is_stopped()) {
7576 dout(7) << " already shut down" << dendl;
7577 show_cache();
7578 show_subtrees();
7579 return true;
7580 }
7581
7582 // empty stray dir
28e407b8 7583 bool strays_all_exported = shutdown_export_strays();
7584
7585 // trim cache
181888fb 7586 trim(UINT64_MAX);
31f18b77 7587 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7588
28e407b8 7589 // Export all auth subtrees to another active MDS (usually rank 0) if we are not rank 0
7590 int num_auth_subtree = 0;
7591 if (!subtrees.empty() &&
28e407b8 7592 mds->get_nodeid() != 0) {
7593 dout(7) << "looking for subtrees to export to mds0" << dendl;
7594 list<CDir*> ls;
7595 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7596 it != subtrees.end();
7597 ++it) {
7598 CDir *dir = it->first;
7599 if (dir->get_inode()->is_mdsdir())
7600 continue;
7601 if (dir->is_auth()) {
7602 num_auth_subtree++;
7603 if (dir->is_frozen() ||
7604 dir->is_freezing() ||
7605 dir->is_ambiguous_dir_auth() ||
7606 dir->state_test(CDir::STATE_EXPORTING))
7607 continue;
7608 ls.push_back(dir);
7609 }
7610 }
7611
7612 migrator->clear_export_queue();
7613 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7614 CDir *dir = *p;
7615 mds_rank_t dest = dir->get_inode()->authority().first;
7616 if (dest > 0 && !mds->mdsmap->is_active(dest))
7617 dest = 0;
7618 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7619 migrator->export_dir_nicely(dir, dest);
7620 }
7621 }
7622
7623 if (!strays_all_exported) {
7624 dout(7) << "waiting for strays to migrate" << dendl;
7625 return false;
7626 }
7627
7c673cae 7628 if (num_auth_subtree > 0) {
28e407b8 7629 assert(mds->get_nodeid() > 0);
7630 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7631 show_subtrees();
7632 return false;
7633 }
7634
7635 // close out any sessions (and open files!) before we try to trim the log, etc.
7636 if (mds->sessionmap.have_unclosed_sessions()) {
7637 if (!mds->server->terminating_sessions)
7638 mds->server->terminate_sessions();
7639 return false;
7640 }
7641
7642 // Fully trim the log so that all objects in cache are clean and may be
7643 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not trim
7644 // the log aggressively enough on its own for the cache to become clean.
7645 if (mds->mdlog->get_num_segments() > 0) {
7646 auto ls = mds->mdlog->get_current_segment();
7647 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7648 // Current segment contains events other than subtreemap or
7649 // there are dirty dirfrags (see CDir::log_mark_dirty())
7650 mds->mdlog->start_new_segment();
7651 mds->mdlog->flush();
7652 }
7653 }
7654 mds->mdlog->trim_all();
7655 if (mds->mdlog->get_num_segments() > 1) {
7656 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7657 return false;
7658 }
7659
7660 // drop our reference to our stray dir inode
7661 for (int i = 0; i < NUM_STRAY; ++i) {
7662 if (strays[i] &&
7663 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7664 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7665 strays[i]->put(CInode::PIN_STRAY);
7666 strays[i]->put_stickydirs();
7667 }
7668 }
7669
7670 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7671 if (mydir && !mydir->is_subtree_root())
7672 mydir = NULL;
7673
7674 // subtrees map not empty yet?
7675 if (subtrees.size() > (mydir ? 1 : 0)) {
7676 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7677 show_subtrees();
7678 migrator->show_importing();
7679 migrator->show_exporting();
7680 if (!migrator->is_importing() && !migrator->is_exporting())
7681 show_cache();
7682 return false;
7683 }
7684 assert(!migrator->is_exporting());
7685 assert(!migrator->is_importing());
7686
7687 // replicas may dirty scatter locks
7688 if (myin && myin->is_replicated()) {
7689 dout(7) << "still have replicated objects" << dendl;
7690 return false;
7691 }
7692
7693 if ((myin && myin->is_auth_pinned()) ||
7694 (mydir && mydir->is_auth_pinned())) {
7695 dout(7) << "still have auth pinned objects" << dendl;
7696 return false;
7697 }
7698
7699 // (only do this once!)
7700 if (!mds->mdlog->is_capped()) {
7701 dout(7) << "capping the log" << dendl;
7702 mds->mdlog->cap();
7703 }
7704
7705 if (!mds->mdlog->empty())
7706 mds->mdlog->trim(0);
7707
7708 if (!mds->mdlog->empty()) {
7709 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7710 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7711 return false;
7712 }
7713
7714 if (!did_shutdown_log_cap) {
7715 // flush journal header
7716 dout(7) << "writing header for (now-empty) journal" << dendl;
7717 assert(mds->mdlog->empty());
7718 mds->mdlog->write_head(0);
7719 // NOTE: filer active checker below will block us until this completes.
7720 did_shutdown_log_cap = true;
7721 return false;
7722 }
7723
7724 // filer active?
7725 if (mds->objecter->is_active()) {
7726 dout(7) << "objecter still active" << dendl;
7727 mds->objecter->dump_active();
7728 return false;
7729 }
7730
7731 // trim what we can from the cache
7732 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7733 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7734 show_cache();
7735 //dump();
7736 return false;
7737 }
7738
7739 // make mydir subtree go away
7740 if (mydir) {
7741 if (mydir->get_num_ref() > 1) { // subtree pin
7742 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7743 show_cache();
7744 return false;
7745 }
7746
7747 remove_subtree(mydir);
7748 myin->close_dirfrag(mydir->get_frag());
7749 }
7750 assert(subtrees.empty());
7751
1adf2230 7752 if (myin) {
31f18b77 7753 remove_inode(myin);
7754 assert(!myin);
7755 }
7756
7757 // done!
7758 dout(2) << "shutdown done." << dendl;
7759 return true;
7760}
7761
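// shutdown_export_strays: incrementally walk our stray directories and ask
// the stray manager to migrate each stray inode to rank 0, fetching
// incomplete dirfrags as needed. Progress is remembered in
// shutdown_export_next and the number of in-flight exports is bounded by
// MAX_EXPORTING. Returns true only once everything has been handed off.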
7762bool MDCache::shutdown_export_strays()
7763{
7764 static const unsigned MAX_EXPORTING = 100;
7765
7766 if (mds->get_nodeid() == 0)
7767 return true;
7768
7769 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7770 return false;
7771
7772 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7773 << " '" << shutdown_export_next.second << "'" << dendl;
7774
7775 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
f64942e4 7776 bool all_exported = false;
7c673cae 7777
7778again:
7779 auto next = shutdown_export_next;
7c673cae 7780
7c673cae 7781 for (int i = 0; i < NUM_STRAY; ++i) {
7782 CInode *strayi = strays[i];
7783 if (!strayi ||
7784 !strayi->state_test(CInode::STATE_STRAYPINNED))
7785 continue;
7786 if (strayi->ino() < next.first.ino)
7c673cae 7787 continue;
7c673cae 7788
7789 deque<CDir*> dfls;
7790 strayi->get_dirfrags(dfls);
7c673cae 7791
7792 while (!dfls.empty()) {
7793 CDir *dir = dfls.front();
7794 dfls.pop_front();
7795
7796 if (dir->dirfrag() < next.first)
7c673cae 7797 continue;
7798 if (next.first < dir->dirfrag()) {
7799 next.first = dir->dirfrag();
7800 next.second.clear();
7801 }
7802
7803 if (!dir->is_complete()) {
7804 MDSInternalContextBase *fin = nullptr;
7805 if (shutdown_exporting_strays.empty()) {
7806 fin = new MDSInternalContextWrapper(mds,
7807 new FunctionContext([this](int r) {
7808 shutdown_export_strays();
7809 })
7810 );
7811 }
7812 dir->fetch(fin);
7813 goto done;
7814 }
7815
7816 CDir::dentry_key_map::iterator it;
7817 if (next.second.empty()) {
7818 it = dir->begin();
7c673cae 7819 } else {
7820 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
7821 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
7c673cae 7822 }
7823
7824 for (; it != dir->end(); ++it) {
7825 CDentry *dn = it->second;
7826 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7827 if (dnl->is_null())
7828 continue;
7829
7830 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
7831 next.second = string(it->first.name);
7832 goto done;
7833 }
7834
7835 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
7836 if (!ret.second) {
7837 dout(10) << "already exporting/purging " << *dn << dendl;
7838 continue;
7839 }
7840
7841 // Don't try to migrate anything that is actually
7842 // being purged right now
7843 if (!dn->state_test(CDentry::STATE_PURGING))
7844 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7845
7846 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
7847 ++it;
7848 if (it != dir->end()) {
7849 next.second = string(it->first.name);
7850 } else {
7851 if (dfls.empty())
7852 next.first.ino.val++;
7853 else
7854 next.first = dfls.front()->dirfrag();
7855 next.second.clear();
7856 }
7857 goto done;
7858 }
7859 }
7860 }
7861 }
7862
7863 if (shutdown_exporting_strays.empty()) {
7864 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
7865 if (first_df < shutdown_export_next.first ||
7866 !shutdown_export_next.second.empty()) {
7867 shutdown_export_next.first = first_df;
7868 shutdown_export_next.second.clear();
7869 goto again;
7c673cae 7870 }
f64942e4 7871 all_exported = true;
7872 }
7873
7874done:
7875 shutdown_export_next = next;
7876 return all_exported;
7877}
7878
7879// ========= messaging ==============
7880
7881/* This function DOES put the passed message before returning */
7882void MDCache::dispatch(Message *m)
7883{
7884 switch (m->get_type()) {
7885
7886 // RESOLVE
7887 case MSG_MDS_RESOLVE:
7888 handle_resolve(static_cast<MMDSResolve*>(m));
7889 break;
7890 case MSG_MDS_RESOLVEACK:
7891 handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
7892 break;
7893
7894 // REJOIN
7895 case MSG_MDS_CACHEREJOIN:
7896 handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
7897 break;
7898
7899 case MSG_MDS_DISCOVER:
7900 handle_discover(static_cast<MDiscover*>(m));
7901 break;
7902 case MSG_MDS_DISCOVERREPLY:
7903 handle_discover_reply(static_cast<MDiscoverReply*>(m));
7904 break;
7905
7906 case MSG_MDS_DIRUPDATE:
7907 handle_dir_update(static_cast<MDirUpdate*>(m));
7908 break;
7909
7910 case MSG_MDS_CACHEEXPIRE:
7911 handle_cache_expire(static_cast<MCacheExpire*>(m));
7912 break;
7913
7914 case MSG_MDS_DENTRYLINK:
7915 handle_dentry_link(static_cast<MDentryLink*>(m));
7916 break;
7917 case MSG_MDS_DENTRYUNLINK:
7918 handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
7919 break;
7920
7921 case MSG_MDS_FRAGMENTNOTIFY:
7922 handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
7923 break;
7924
7925 case MSG_MDS_FINDINO:
7926 handle_find_ino(static_cast<MMDSFindIno *>(m));
7927 break;
7928 case MSG_MDS_FINDINOREPLY:
7929 handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
7930 break;
7931
7932 case MSG_MDS_OPENINO:
7933 handle_open_ino(static_cast<MMDSOpenIno *>(m));
7934 break;
7935 case MSG_MDS_OPENINOREPLY:
7936 handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
7937 break;
7938
7939 default:
7940 derr << "cache unknown message " << m->get_type() << dendl;
7941 assert(0 == "cache unknown message");
7942 }
7943}
7944
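// _get_waiter: build the continuation used when a traversal must block.
// Prefer retrying the MDRequest, then retrying the raw message, and fall
// back to the caller-supplied completion.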
7945MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
7946{
7947 if (mdr) {
7948 dout(20) << "_get_waiter retryrequest" << dendl;
7949 return new C_MDS_RetryRequest(this, mdr);
7950 } else if (req) {
7951 dout(20) << "_get_waiter retrymessage" << dendl;
7952 return new C_MDS_RetryMessage(mds, req);
7953 } else {
7954 return fin;
7955 }
7956}
7957
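// path_traverse walks 'path' starting from its base ino, one dentry at a
// time. Return values: 0 on success (*pin/*pdnvec filled in), 1 if we blocked
// and queued a waiter, 2 if the request was forwarded to another MDS, or a
// negative errno (e.g. -ENOENT, -ENOTDIR, -EIO, -ESTALE). The 'onfail' flag
// chooses between discovering, forwarding, or accepting a trailing null
// dentry.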
7958int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
7959 const filepath& path, // what
7960 vector<CDentry*> *pdnvec, // result
7961 CInode **pin,
7962 int onfail)
7963{
7964 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
7965 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
7966 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
7967
7968 assert(mdr || req || fin);
7969 assert(!forward || mdr || req); // forward requires a request
7970
7971 snapid_t snapid = CEPH_NOSNAP;
7972 if (mdr)
7973 mdr->snapid = snapid;
7974
7975 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
7976
7977 if (mds->logger) mds->logger->inc(l_mds_traverse);
7978
7979 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
7980 CInode *cur = get_inode(path.get_ino());
7981 if (cur == NULL) {
7982 if (MDS_INO_IS_MDSDIR(path.get_ino()))
7983 open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
7984 else {
7985 //ceph_abort(); // hrm.. broken
7986 return -ESTALE;
7987 }
7988 return 1;
7989 }
7990 if (cur->state_test(CInode::STATE_PURGING))
7991 return -ESTALE;
7992
7993 // make sure snaprealm are open...
7994 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
7995 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
7996 return 1;
7997 }
7998
7999 // start trace
8000 if (pdnvec)
8001 pdnvec->clear();
8002 if (pin)
8003 *pin = cur;
8004
8005 unsigned depth = 0;
8006 while (depth < path.depth()) {
8007 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8008 << "' snapid " << snapid << dendl;
8009
8010 if (!cur->is_dir()) {
8011 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8012 return -ENOTDIR;
8013 }
8014
8015 // walk into snapdir?
8016 if (path[depth].length() == 0) {
8017 dout(10) << "traverse: snapdir" << dendl;
8018 if (!mdr)
8019 return -EINVAL;
8020 snapid = CEPH_SNAPDIR;
8021 mdr->snapid = snapid;
8022 depth++;
8023 continue;
8024 }
8025 // walk thru snapdir?
8026 if (snapid == CEPH_SNAPDIR) {
8027 if (!mdr)
8028 return -EINVAL;
8029 SnapRealm *realm = cur->find_snaprealm();
8030 snapid = realm->resolve_snapname(path[depth], cur->ino());
8031 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
8032 if (!snapid)
8033 return -ENOENT;
8034 mdr->snapid = snapid;
8035 depth++;
8036 continue;
8037 }
8038
8039 // open dir
8040 frag_t fg = cur->pick_dirfrag(path[depth]);
8041 CDir *curdir = cur->get_dirfrag(fg);
8042 if (!curdir) {
8043 if (cur->is_auth()) {
8044 // parent dir frozen_dir?
8045 if (cur->is_frozen()) {
8046 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
8047 cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8048 return 1;
8049 }
8050 curdir = cur->get_or_open_dirfrag(this, fg);
8051 } else {
8052 // discover?
8053 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
8054 discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8055 null_okay);
8056 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8057 return 1;
8058 }
8059 }
8060 assert(curdir);
8061
8062#ifdef MDS_VERIFY_FRAGSTAT
8063 if (curdir->is_complete())
8064 curdir->verify_fragstat();
8065#endif
8066
8067 // frozen?
8068 /*
8069 if (curdir->is_frozen()) {
8070 // doh!
8071 // FIXME: traverse is allowed?
8072 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8073 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8074 if (onfinish) delete onfinish;
8075 return 1;
8076 }
8077 */
8078
8079 // Before doing dirfrag->dn lookup, compare with DamageTable's
8080 // record of which dentries were unreadable
8081 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8082 dout(4) << "traverse: stopped lookup at damaged dentry "
8083 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8084 return -EIO;
8085 }
8086
8087 // dentry
8088 CDentry *dn = curdir->lookup(path[depth], snapid);
8089 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
8090
8091 // null and last_bit and xlocked by me?
8092 if (dnl && dnl->is_null() && null_okay) {
8093 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
8094 if (pdnvec)
8095 pdnvec->push_back(dn);
8096 if (pin)
8097 *pin = 0;
8098 break; // done!
8099 }
8100
8101 if (dnl &&
8102 dn->lock.is_xlocked() &&
8103 dn->lock.get_xlock_by() != mdr &&
8104 !dn->lock.can_read(client) &&
8105 (dnl->is_null() || forward)) {
8106 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
8107 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8108 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
8109 mds->mdlog->flush();
8110 return 1;
8111 }
8112
8113 // can we conclude ENOENT?
8114 if (dnl && dnl->is_null()) {
8115 if (dn->lock.can_read(client) ||
8116 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8117 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8118 if (pdnvec) {
8119 if (depth == path.depth() - 1)
8120 pdnvec->push_back(dn);
8121 else
8122 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8123 }
8124 return -ENOENT;
8125 } else {
8126 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
8127 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
8128 return 1;
8129 }
8130 }
8131
8132 if (dnl && !dnl->is_null()) {
8133 CInode *in = dnl->get_inode();
8134
8135 // do we have inode?
8136 if (!in) {
8137 assert(dnl->is_remote());
8138 // do i have it?
8139 in = get_inode(dnl->get_remote_ino());
8140 if (in) {
8141 dout(7) << "linking in remote in " << *in << dendl;
8142 dn->link_remote(dnl, in);
8143 } else {
8144 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8145 assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8146 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8147 dout(4) << "traverse: remote dentry points to damaged ino "
8148 << *dn << dendl;
8149 return -EIO;
8150 }
8151 open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
8152 (null_okay && depth == path.depth() - 1));
8153 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8154 return 1;
8155 }
8156 }
8157
8158 cur = in;
8159 // make sure snaprealm are open...
8160 if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
8161 !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
8162 return 1;
8163 }
8164
8165 // add to trace, continue.
8166 touch_inode(cur);
8167 if (pdnvec)
8168 pdnvec->push_back(dn);
8169 if (pin)
8170 *pin = cur;
8171 depth++;
8172 continue;
8173 }
8174
8175
8176 // MISS. dentry doesn't exist.
8177 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8178
8179 if (curdir->is_auth()) {
8180 // dentry is mine.
8181 if (curdir->is_complete() ||
8182 (snapid == CEPH_NOSNAP &&
8183 curdir->has_bloom() &&
8184 !curdir->is_in_bloom(path[depth]))){
8185 // file not found
8186 if (pdnvec) {
8187 // instantiate a null dn?
8188 if (depth < path.depth()-1){
8189 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8190 dn = NULL;
8191 } else if (dn) {
8192 ceph_abort(); // should have fallen out in ->is_null() check above
8193 } else if (curdir->is_frozen()) {
8194 dout(20) << " not adding null to frozen dir " << dendl;
8195 } else if (snapid < CEPH_MAXSNAP) {
8196 dout(20) << " not adding null for snapid " << snapid << dendl;
8197 } else {
8198 // create a null dentry
8199 dn = curdir->add_null_dentry(path[depth]);
8200 dout(20) << " added null " << *dn << dendl;
8201 }
8202 if (dn)
8203 pdnvec->push_back(dn);
8204 else
8205 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8206 }
8207 return -ENOENT;
8208 } else {
8209
8210 // Check DamageTable for missing fragments before trying to fetch
8211 // this
8212 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8213 dout(4) << "traverse: damaged dirfrag " << *curdir
8214 << ", blocking fetch" << dendl;
8215 return -EIO;
8216 }
8217
8218 // directory isn't complete; reload
8219 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8220 touch_inode(cur);
8221 curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
8222 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8223 return 1;
8224 }
8225 } else {
8226 // dirfrag/dentry is not mine.
8227 mds_authority_t dauth = curdir->authority();
8228
8229 if (forward &&
8230 snapid && mdr && mdr->client_request &&
8231 (int)depth < mdr->client_request->get_num_fwd()) {
8232 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8233 << " < fwd " << mdr->client_request->get_num_fwd()
8234 << ", discovering instead of forwarding" << dendl;
8235 discover = true;
8236 }
8237
8238 if ((discover || null_okay)) {
8239 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8240 discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
8241 null_okay);
8242 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8243 return 1;
8244 }
8245 if (forward) {
8246 // forward
8247 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8248
8249 if (curdir->is_ambiguous_auth()) {
8250 // wait
8251 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8252 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
8253 return 1;
8254 }
8255
8256 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8257
8258 if (mdr)
8259 request_forward(mdr, dauth.first);
8260 else
8261 mds->forward_message_mds(req, dauth.first);
8262
8263 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8264 assert(fin == NULL);
8265 return 2;
8266 }
8267 }
8268
8269 ceph_abort(); // i shouldn't get here
8270 }
8271
8272 // success.
8273 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8274 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8275 if (mdr)
8276 assert(mdr->snapid == snapid);
8277 return 0;
8278}
8279
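// cache_traverse: like path_traverse, but purely in-memory. No locks are
// taken and no I/O is issued; returns NULL if any path component is not
// already in the cache.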
8280CInode *MDCache::cache_traverse(const filepath& fp)
8281{
8282 dout(10) << "cache_traverse " << fp << dendl;
8283
8284 CInode *in;
8285 if (fp.get_ino())
8286 in = get_inode(fp.get_ino());
8287 else
8288 in = root;
8289 if (!in)
8290 return NULL;
8291
8292 for (unsigned i = 0; i < fp.depth(); i++) {
94b18763 8293 boost::string_view dname = fp[i];
8294 frag_t fg = in->pick_dirfrag(dname);
8295 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8296 CDir *curdir = in->get_dirfrag(fg);
8297 if (!curdir)
8298 return NULL;
8299 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8300 if (!dn)
8301 return NULL;
8302 in = dn->get_linkage()->get_inode();
8303 if (!in)
8304 return NULL;
8305 }
8306 dout(10) << " got " << *in << dendl;
8307 return in;
8308}
8309
8310
8311/**
8312 * open_remote_dirfrag -- open up a remote dirfrag
8313 *
8314 * @param diri base inode
8315 * @param approxfg approximate fragment.
8316 * @param fin completion callback
8317 */
8318void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
8319{
8320 dout(10) << "open_remote_dir on " << *diri << dendl;
8321 assert(diri->is_dir());
8322 assert(!diri->is_auth());
8323 assert(diri->get_dirfrag(approxfg) == 0);
8324
224ce89b 8325 discover_dir_frag(diri, approxfg, fin);
8326}
8327
8328
8329/**
8330 * get_dentry_inode - get or open inode
8331 *
8332 * @param dn the dentry
8333 * @param mdr current request
8334 *
8335 * will return inode for primary, or link up/open up remote link's inode as necessary.
8336 * If it's not available right now, puts mdr on wait list and returns null.
8337 */
8338CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8339{
8340 CDentry::linkage_t *dnl;
8341 if (projected)
8342 dnl = dn->get_projected_linkage();
8343 else
8344 dnl = dn->get_linkage();
8345
8346 assert(!dnl->is_null());
8347
8348 if (dnl->is_primary())
8349 return dnl->inode;
8350
8351 assert(dnl->is_remote());
8352 CInode *in = get_inode(dnl->get_remote_ino());
8353 if (in) {
8354 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8355 dn->link_remote(dnl, in);
8356 return in;
8357 } else {
8358 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8359 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8360 return 0;
8361 }
8362}
8363
8364struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8365 CDentry *dn;
8366 inodeno_t ino;
8367 MDSInternalContextBase *onfinish;
8368 bool want_xlocked;
8369 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
8370 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8371 dn->get(MDSCacheObject::PIN_PTRWAITER);
8372 }
8373 void finish(int r) override {
8374 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8375 dn->put(MDSCacheObject::PIN_PTRWAITER);
8376 }
8377};
8378
8379void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
8380{
8381 dout(10) << "open_remote_dentry " << *dn << dendl;
8382 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8383 inodeno_t ino = dnl->get_remote_ino();
8384 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8385 open_ino(ino, pool,
8386 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8387}
8388
8389void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
8390 bool want_xlocked, int r)
8391{
8392 if (r < 0) {
8393 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8394 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8395 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8396 dn->state_set(CDentry::STATE_BADREMOTEINO);
8397
8398 std::string path;
8399 CDir *dir = dn->get_dir();
8400 if (dir) {
31f18b77 8401 dir->get_inode()->make_path_string(path);
8402 path += "/";
8403 path += std::string(dn->get_name());
8404 }
8405
31f18b77 8406 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8407 if (fatal) {
8408 mds->damaged();
8409 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8410 }
8411 } else {
8412 r = 0;
8413 }
8414 }
8415 fin->complete(r < 0 ? r : 0);
8416}
8417
8418
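// make_trace: build the chain of dentries from the root-most ancestor down
// to 'in' by recursing to the parent first; base inodes yield an empty trace.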
8419void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8420{
8421 // empty trace if we're a base inode
8422 if (in->is_base())
8423 return;
8424
8425 CInode *parent = in->get_parent_inode();
8426 assert(parent);
8427 make_trace(trace, parent);
8428
8429 CDentry *dn = in->get_parent_dn();
8430 dout(15) << "make_trace adding " << *dn << dendl;
8431 trace.push_back(dn);
8432}
8433
8434
8435// -------------------------------------------------------------------------------
8436// Open inode by inode number
8437
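// The open-by-ino machinery works by fetching the inode's backtrace object
// (first from the hinted pool, falling back to the metadata pool), decoding
// the ancestor chain, and then opening/fetching each ancestor directory in
// turn. If the inode still cannot be found locally, do_open_ino (not shown
// here) consults peer MDS ranks using the auth hint gathered along the way.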
8438class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8439 inodeno_t ino;
8440 public:
8441 bufferlist bl;
8442 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8443 MDCacheIOContext(c), ino(i) {}
8444 void finish(int r) override {
8445 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8446 }
8447 void print(ostream& out) const override {
8448 out << "openino_backtrace_fetch(" << ino << ")";
8449 }
8450};
8451
8452struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8453 inodeno_t ino;
8454 MMDSOpenIno *msg;
8455 bool parent;
8456 public:
8457 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
8458 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8459 void finish(int r) override {
8460 if (r < 0 && !parent)
8461 r = -EAGAIN;
8462 if (msg) {
8463 mdcache->handle_open_ino(msg, r);
8464 return;
8465 }
8466 assert(mdcache->opening_inodes.count(ino));
8467 mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
8468 }
8469};
8470
8471struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8472 inodeno_t ino;
8473 public:
8474 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8475 void finish(int r) override {
8476 mdcache->_open_ino_parent_opened(ino, r);
8477 }
8478};
8479
8480void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8481{
8482 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8483
8484 assert(opening_inodes.count(ino));
8485 open_ino_info_t& info = opening_inodes[ino];
8486
8487 CInode *in = get_inode(ino);
8488 if (in) {
8489 dout(10) << " found cached " << *in << dendl;
8490 open_ino_finish(ino, info, in->authority().first);
8491 return;
8492 }
8493
8494 inode_backtrace_t backtrace;
8495 if (err == 0) {
8496 try {
8497 ::decode(backtrace, bl);
8498 } catch (const buffer::error &decode_exc) {
8499 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8500 << std::dec << ": " << decode_exc << dendl;
8501 open_ino_finish(ino, info, -EIO);
8502 return;
8503 }
8504 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8505 dout(10) << " old object in pool " << info.pool
8506 << ", retrying pool " << backtrace.pool << dendl;
8507 info.pool = backtrace.pool;
8508 C_IO_MDC_OpenInoBacktraceFetched *fin =
8509 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8510 fetch_backtrace(ino, info.pool, fin->bl,
8511 new C_OnFinisher(fin, mds->finisher));
8512 return;
8513 }
8514 } else if (err == -ENOENT) {
8515 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8516 if (info.pool != meta_pool) {
8517 dout(10) << " no object in pool " << info.pool
8518 << ", retrying pool " << meta_pool << dendl;
8519 info.pool = meta_pool;
8520 C_IO_MDC_OpenInoBacktraceFetched *fin =
8521 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8522 fetch_backtrace(ino, info.pool, fin->bl,
8523 new C_OnFinisher(fin, mds->finisher));
8524 return;
8525 }
8526 err = 0; // backtrace.ancestors.empty() is checked below
8527 }
8528
8529 if (err == 0) {
8530 if (backtrace.ancestors.empty()) {
8531 dout(10) << " got empty backtrace " << dendl;
8532 err = -EIO;
8533 } else if (!info.ancestors.empty()) {
8534 if (info.ancestors[0] == backtrace.ancestors[0]) {
8535 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8536 err = -EINVAL;
8537 } else {
8538 info.last_err = 0;
8539 }
8540 }
8541 }
8542 if (err) {
8543 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8544 if (info.last_err)
8545 err = info.last_err;
8546 open_ino_finish(ino, info, err);
8547 return;
8548 }
8549
8550 dout(10) << " got backtrace " << backtrace << dendl;
8551 info.ancestors = backtrace.ancestors;
8552
8553 _open_ino_traverse_dir(ino, info, 0);
8554}
8555
8556void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8557{
8558 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8559
8560 assert(opening_inodes.count(ino));
8561 open_ino_info_t& info = opening_inodes[ino];
8562
8563 CInode *in = get_inode(ino);
8564 if (in) {
8565 dout(10) << " found cached " << *in << dendl;
8566 open_ino_finish(ino, info, in->authority().first);
8567 return;
8568 }
8569
8570 if (ret == mds->get_nodeid()) {
8571 _open_ino_traverse_dir(ino, info, 0);
8572 } else {
8573 if (ret >= 0) {
8574 mds_rank_t checked_rank = mds_rank_t(ret);
8575 info.check_peers = true;
8576 info.auth_hint = checked_rank;
8577 info.checked.erase(checked_rank);
8578 }
8579 do_open_ino(ino, info, ret);
8580 }
8581}
8582
8583void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8584{
8585 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8586
8587 CInode *in = get_inode(ino);
8588 if (in) {
8589 dout(10) << " found cached " << *in << dendl;
8590 open_ino_finish(ino, info, in->authority().first);
8591 return;
8592 }
8593
8594 if (ret) {
8595 do_open_ino(ino, info, ret);
8596 return;
8597 }
8598
8599 mds_rank_t hint = info.auth_hint;
8600 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8601 info.discover, info.want_xlocked, &hint);
8602 if (ret > 0)
8603 return;
8604 if (hint != mds->get_nodeid())
8605 info.auth_hint = hint;
8606 do_open_ino(ino, info, ret);
8607}
8608
8609void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
8610{
8611 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8612 assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8613 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8614}
8615
8616int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
8617 vector<inode_backpointer_t>& ancestors,
8618 bool discover, bool want_xlocked, mds_rank_t *hint)
8619{
8620 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8621 int err = 0;
8622 for (unsigned i = 0; i < ancestors.size(); i++) {
8623 CInode *diri = get_inode(ancestors[i].dirino);
8624
8625 if (!diri) {
8626 if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
8627 open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8628 return 1;
8629 }
8630 continue;
8631 }
8632
8633 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8634 CDir *dir = diri->get_parent_dir();
8635 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8636 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8637 dir = dir->get_inode()->get_parent_dir();
8638 _open_ino_fetch_dir(ino, m, dir, i == 0);
8639 return 1;
8640 }
8641
8642 if (!diri->is_dir()) {
8643 dout(10) << " " << *diri << " is not dir" << dendl;
8644 if (i == 0)
8645 err = -ENOTDIR;
8646 break;
8647 }
8648
8649 string &name = ancestors[i].dname;
8650 frag_t fg = diri->pick_dirfrag(name);
8651 CDir *dir = diri->get_dirfrag(fg);
8652 if (!dir) {
8653 if (diri->is_auth()) {
8654 if (diri->is_frozen()) {
8655 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8656 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8657 return 1;
8658 }
8659 dir = diri->get_or_open_dirfrag(this, fg);
8660 } else if (discover) {
8661 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8662 return 1;
8663 }
8664 }
8665 if (dir) {
8666 inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
8667 CDentry *dn = dir->lookup(name);
8668 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8669 if (dir->is_auth()) {
8670 if (dnl && dnl->is_primary() &&
8671 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8672 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8673 _open_ino_fetch_dir(ino, m, dir, i == 0);
8674 return 1;
8675 }
8676
8677 if (!dnl && !dir->is_complete() &&
8678 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8679 dout(10) << " fetching incomplete " << *dir << dendl;
8680 _open_ino_fetch_dir(ino, m, dir, i == 0);
8681 return 1;
8682 }
8683
8684 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8685 if (i == 0)
8686 err = -ENOENT;
8687 } else if (discover) {
8688 if (!dnl) {
8689 filepath path(name, 0);
8690 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8691 (i == 0 && want_xlocked));
8692 return 1;
8693 }
8694 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8695 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8696 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8697 return 1;
8698 }
8699 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8700 if (i == 0)
8701 err = -ENOENT;
8702 }
8703 }
8704 if (hint && i == 0)
8705 *hint = dir ? dir->authority().first : diri->authority().first;
8706 break;
8707 }
8708 return err;
8709}
8710
8711void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8712{
8713 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8714
8715 list<MDSInternalContextBase*> waiters;
8716 waiters.swap(info.waiters);
8717 opening_inodes.erase(ino);
8718 finish_contexts(g_ceph_context, waiters, ret);
8719}
8720
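// Drives the open-by-ino lookup for a single inode: depending on the probe
// state we either (re)query peer MDS ranks, fetch the inode's backtrace
// object from RADOS, or open the parent directory named by the first
// backtrace ancestor. A hard error resets the state so the whole sequence
// can be retried.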
8721void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8722{
8723 if (err < 0 && err != -EAGAIN) {
8724 info.checked.clear();
7c673cae
FG
8725 info.checking = MDS_RANK_NONE;
8726 info.check_peers = true;
8727 info.fetch_backtrace = true;
8728 if (info.discover) {
8729 info.discover = false;
8730 info.ancestors.clear();
8731 }
8732 if (err != -ENOENT && err != -ENOTDIR)
8733 info.last_err = err;
8734 }
8735
d2e6a577
FG
8736 if (info.check_peers || info.discover) {
8737 if (info.discover) {
8738 // got backtrace from peer, but failed to find inode. re-check peers
8739 info.discover = false;
8740 info.ancestors.clear();
8741 info.checked.clear();
8742 }
7c673cae
FG
8743 info.check_peers = false;
8744 info.checking = MDS_RANK_NONE;
8745 do_open_ino_peer(ino, info);
8746 } else if (info.fetch_backtrace) {
8747 info.check_peers = true;
8748 info.fetch_backtrace = false;
8749 info.checking = mds->get_nodeid();
8750 info.checked.clear();
7c673cae
FG
8751 C_IO_MDC_OpenInoBacktraceFetched *fin =
8752 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8753 fetch_backtrace(ino, info.pool, fin->bl,
8754 new C_OnFinisher(fin, mds->finisher));
8755 } else {
8756 assert(!info.ancestors.empty());
8757 info.checking = mds->get_nodeid();
8758 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8759 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8760 }
8761}
8762
8763void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8764{
8765 set<mds_rank_t> all, active;
8766 mds->mdsmap->get_mds_set(all);
7c673cae 8767 if (mds->get_state() == MDSMap::STATE_REJOIN)
1adf2230
AA
8768 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
8769 else
8770 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
8771
8772 dout(10) << "do_open_ino_peer " << ino << " active " << active
8773 << " all " << all << " checked " << info.checked << dendl;
8774
8775 mds_rank_t peer = MDS_RANK_NONE;
8776 if (info.auth_hint >= 0) {
8777 if (active.count(info.auth_hint)) {
8778 peer = info.auth_hint;
8779 info.auth_hint = MDS_RANK_NONE;
8780 }
8781 } else {
8782 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8783 if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
8784 peer = *p;
8785 break;
8786 }
8787 }
8788 if (peer < 0) {
d2e6a577
FG
8789 all.erase(mds->get_nodeid());
8790 if (all != info.checked) {
7c673cae
FG
8791 dout(10) << " waiting for more peers to be active" << dendl;
8792 } else {
8793 dout(10) << " all MDS peers have been checked " << dendl;
8794 do_open_ino(ino, info, 0);
8795 }
8796 } else {
8797 info.checking = peer;
8798 vector<inode_backpointer_t> *pa = NULL;
8799 // got backtrace from peer or backtrace just fetched
8800 if (info.discover || !info.fetch_backtrace)
8801 pa = &info.ancestors;
8802 mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
8803 }
8804}
8805
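// Peer side of the open-by-ino protocol: if we have the inode and are auth,
// reply with its ancestry; if we only hold a replica, reply with an auth
// hint; otherwise walk the ancestors supplied in the message and reply with
// whatever hint or error that produces.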
8806void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
8807{
8808 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8809 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
8810 m->put();
8811 return;
8812 }
8813
8814 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8815
8816 inodeno_t ino = m->ino;
8817 MMDSOpenInoReply *reply;
8818 CInode *in = get_inode(ino);
8819 if (in) {
8820 dout(10) << " have " << *in << dendl;
8821 reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
8822 if (in->is_auth()) {
8823 touch_inode(in);
8824 while (1) {
8825 CDentry *pdn = in->get_parent_dn();
8826 if (!pdn)
8827 break;
8828 CInode *diri = pdn->get_dir()->get_inode();
94b18763 8829 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
7c673cae
FG
8830 in->inode.version));
8831 in = diri;
8832 }
8833 } else {
8834 reply->hint = in->authority().first;
8835 }
8836 } else if (err < 0) {
8837 reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
8838 } else {
8839 mds_rank_t hint = MDS_RANK_NONE;
8840 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8841 if (ret > 0)
8842 return;
8843 reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
8844 }
8845 m->get_connection()->send_message(reply);
8846 m->put();
8847}
8848
8849void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
8850{
8851 dout(10) << "handle_open_ino_reply " << *m << dendl;
8852
8853 inodeno_t ino = m->ino;
8854 mds_rank_t from = mds_rank_t(m->get_source().num());
8855 auto it = opening_inodes.find(ino);
8856 if (it != opening_inodes.end() && it->second.checking == from) {
8857 open_ino_info_t& info = it->second;
8858 info.checking = MDS_RANK_NONE;
8859 info.checked.insert(from);
8860
8861 CInode *in = get_inode(ino);
8862 if (in) {
8863 dout(10) << " found cached " << *in << dendl;
8864 open_ino_finish(ino, info, in->authority().first);
8865 } else if (!m->ancestors.empty()) {
8866 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8867 if (!info.want_replica) {
8868 open_ino_finish(ino, info, from);
8869 m->put();
8870 return;
8871 }
8872
8873 info.ancestors = m->ancestors;
8874 info.auth_hint = from;
8875 info.checking = mds->get_nodeid();
8876 info.discover = true;
8877 _open_ino_traverse_dir(ino, info, 0);
8878 } else if (m->error) {
8879 dout(10) << " error " << m->error << " from mds." << from << dendl;
8880 do_open_ino(ino, info, m->error);
8881 } else {
8882 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8883 info.auth_hint = m->hint;
8884 info.checked.erase(m->hint);
8885 }
8886 do_open_ino_peer(ino, info);
8887 }
8888 }
8889 m->put();
8890}
8891
8892void MDCache::kick_open_ino_peers(mds_rank_t who)
8893{
8894 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8895
8896 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8897 p != opening_inodes.end();
8898 ++p) {
8899 open_ino_info_t& info = p->second;
8900 if (info.checking == who) {
8901 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8902 info.checking = MDS_RANK_NONE;
8903 do_open_ino_peer(p->first, info);
8904 } else if (info.checking == MDS_RANK_NONE) {
8905 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8906 do_open_ino_peer(p->first, info);
8907 }
8908 }
8909}
8910
8911void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
8912 bool want_replica, bool want_xlocked)
8913{
8914 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8915 << want_replica << dendl;
8916
8917 if (opening_inodes.count(ino)) {
8918 open_ino_info_t& info = opening_inodes[ino];
8919 if (want_replica) {
8920 info.want_replica = true;
8921 if (want_xlocked && !info.want_xlocked) {
8922 if (!info.ancestors.empty()) {
8923 CInode *diri = get_inode(info.ancestors[0].dirino);
8924 if (diri) {
8925 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8926 CDir *dir = diri->get_dirfrag(fg);
8927 if (dir && !dir->is_auth()) {
8928 filepath path(info.ancestors[0].dname, 0);
8929 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8930 }
8931 }
8932 }
8933 info.want_xlocked = true;
8934 }
8935 }
8936 info.waiters.push_back(fin);
8937 } else {
8938 open_ino_info_t& info = opening_inodes[ino];
7c673cae
FG
8939 info.want_replica = want_replica;
8940 info.want_xlocked = want_xlocked;
8941 info.tid = ++open_ino_last_tid;
8942 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
8943 info.waiters.push_back(fin);
8944 do_open_ino(ino, info, 0);
8945 }
8946}
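// A minimal usage sketch (illustrative; `ino` and `mdr` are assumed to be
// supplied by the caller): open an inode by number and retry the original
// request once the inode is available in the local cache.
//
//   open_ino(ino, mds->mdsmap->get_metadata_pool(),
//            new C_MDS_RetryRequest(this, mdr), true /* want_replica */);
//
// The completion context is finished with the authoritative rank (>= 0) on
// success, or a negative error such as -EIO on failure.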
8947
8948/* ---------------------------- */
8949
8950/*
8951 * search for a given inode on MDS peers. optionally start with the given node.
8952
8953
8954 TODO
 8955  - recover from mds node failure / mds recovery
8956 - traverse path
8957
8958 */
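// Illustrative call (hypothetical; `in_ino` and `fin` come from the caller):
// probe the other active ranks for an inode we cannot open locally. `fin`
// is completed once the inode has been discovered into the cache, or with
// -ESTALE if every active peer has been checked without success.
//
//   find_ino_peers(in_ino, fin, MDS_RANK_NONE);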
8959void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
8960{
8961 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
b32b8144
FG
8962 CInode *in = get_inode(ino);
8963 if (in && in->state_test(CInode::STATE_PURGING)) {
8964 c->complete(-ESTALE);
8965 return;
8966 }
8967 assert(!in);
7c673cae
FG
8968
8969 ceph_tid_t tid = ++find_ino_peer_last_tid;
8970 find_ino_peer_info_t& fip = find_ino_peer[tid];
8971 fip.ino = ino;
8972 fip.tid = tid;
8973 fip.fin = c;
8974 fip.hint = hint;
7c673cae
FG
8975 _do_find_ino_peer(fip);
8976}
8977
8978void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
8979{
8980 set<mds_rank_t> all, active;
8981 mds->mdsmap->get_mds_set(all);
1adf2230 8982 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
8983
8984 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
8985 << " active " << active << " all " << all
8986 << " checked " << fip.checked
8987 << dendl;
8988
8989 mds_rank_t m = MDS_RANK_NONE;
8990 if (fip.hint >= 0) {
8991 m = fip.hint;
8992 fip.hint = MDS_RANK_NONE;
8993 } else {
8994 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
8995 if (*p != mds->get_nodeid() &&
8996 fip.checked.count(*p) == 0) {
8997 m = *p;
8998 break;
8999 }
9000 }
9001 if (m == MDS_RANK_NONE) {
d2e6a577
FG
9002 all.erase(mds->get_nodeid());
9003 if (all != fip.checked) {
7c673cae
FG
9004 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9005 } else {
9006 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9007 fip.fin->complete(-ESTALE);
9008 find_ino_peer.erase(fip.tid);
9009 }
9010 } else {
9011 fip.checking = m;
9012 mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
9013 }
9014}
9015
9016void MDCache::handle_find_ino(MMDSFindIno *m)
9017{
9018 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9019 m->put();
9020 return;
9021 }
9022
9023 dout(10) << "handle_find_ino " << *m << dendl;
9024 MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
9025 CInode *in = get_inode(m->ino);
9026 if (in) {
9027 in->make_path(r->path);
9028 dout(10) << " have " << r->path << " " << *in << dendl;
9029 }
9030 m->get_connection()->send_message(r);
9031 m->put();
9032}
9033
9034
9035void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
9036{
9037 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
9038 if (p != find_ino_peer.end()) {
9039 dout(10) << "handle_find_ino_reply " << *m << dendl;
9040 find_ino_peer_info_t& fip = p->second;
9041
9042 // success?
9043 if (get_inode(fip.ino)) {
9044 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9045 mds->queue_waiter(fip.fin);
9046 find_ino_peer.erase(p);
9047 m->put();
9048 return;
9049 }
9050
9051 mds_rank_t from = mds_rank_t(m->get_source().num());
9052 if (fip.checking == from)
9053 fip.checking = MDS_RANK_NONE;
9054 fip.checked.insert(from);
9055
9056 if (!m->path.empty()) {
9057 // we got a path!
9058 vector<CDentry*> trace;
9059 MDRequestRef null_ref;
9060 int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
9061 if (r > 0)
9062 return;
9063 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9064 << ", retrying" << dendl;
9065 fip.checked.clear();
9066 _do_find_ino_peer(fip);
9067 } else {
9068 // nope, continue.
9069 _do_find_ino_peer(fip);
9070 }
9071 } else {
9072 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9073 }
9074 m->put();
9075}
9076
9077void MDCache::kick_find_ino_peers(mds_rank_t who)
9078{
9079 // find_ino_peers requests we should move on from
9080 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9081 p != find_ino_peer.end();
9082 ++p) {
9083 find_ino_peer_info_t& fip = p->second;
9084 if (fip.checking == who) {
9085 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9086 fip.checking = MDS_RANK_NONE;
9087 _do_find_ino_peer(fip);
9088 } else if (fip.checking == MDS_RANK_NONE) {
9089 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9090 _do_find_ino_peer(fip);
9091 }
9092 }
9093}
9094
9095/* ---------------------------- */
9096
9097int MDCache::get_num_client_requests()
9098{
9099 int count = 0;
9100 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9101 p != active_requests.end();
9102 ++p) {
9103 MDRequestRef& mdr = p->second;
9104 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9105 count++;
9106 }
9107 return count;
9108}
9109
9110/* This function takes over the reference to the passed Message */
9111MDRequestRef MDCache::request_start(MClientRequest *req)
9112{
9113 // did we win a forward race against a slave?
9114 if (active_requests.count(req->get_reqid())) {
9115 MDRequestRef& mdr = active_requests[req->get_reqid()];
9116 assert(mdr);
9117 if (mdr->is_slave()) {
9118 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9119 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9120 } else {
9121 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9122 req->put();
9123 }
9124 return MDRequestRef();
9125 }
9126
9127 // register new client request
9128 MDRequestImpl::Params params;
9129 params.reqid = req->get_reqid();
9130 params.attempt = req->get_num_fwd();
9131 params.client_req = req;
9132 params.initiated = req->get_recv_stamp();
9133 params.throttled = req->get_throttle_stamp();
9134 params.all_read = req->get_recv_complete_stamp();
9135 params.dispatched = req->get_dispatch_stamp();
9136
9137 MDRequestRef mdr =
9138 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9139 active_requests[params.reqid] = mdr;
9140 mdr->set_op_stamp(req->get_stamp());
9141 dout(7) << "request_start " << *mdr << dendl;
9142 return mdr;
9143}
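// Request lifecycle, as implemented in this file: a request is registered
// here (or via request_start_slave()/request_start_internal()), executed
// through dispatch_request(), and torn down by request_finish() on
// completion, request_forward() when another rank should handle it, or
// request_kill() when it is aborted; these normally end in
// request_cleanup(), which drops locks, pins and the active_requests entry.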
9144
9145MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
9146{
9147 int by = m->get_source().num();
9148 MDRequestImpl::Params params;
9149 params.reqid = ri;
9150 params.attempt = attempt;
9151 params.triggering_slave_req = m;
9152 params.slave_to = by;
9153 params.initiated = m->get_recv_stamp();
9154 params.throttled = m->get_throttle_stamp();
9155 params.all_read = m->get_recv_complete_stamp();
9156 params.dispatched = m->get_dispatch_stamp();
9157 MDRequestRef mdr =
9158 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9159 assert(active_requests.count(mdr->reqid) == 0);
9160 active_requests[mdr->reqid] = mdr;
9161 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9162 return mdr;
9163}
9164
9165MDRequestRef MDCache::request_start_internal(int op)
9166{
91327a77 9167 utime_t now = ceph_clock_now();
7c673cae
FG
9168 MDRequestImpl::Params params;
9169 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9170 params.reqid.tid = mds->issue_tid();
91327a77
AA
9171 params.initiated = now;
9172 params.throttled = now;
9173 params.all_read = now;
9174 params.dispatched = now;
7c673cae
FG
9175 params.internal_op = op;
9176 MDRequestRef mdr =
9177 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
9178
9179 assert(active_requests.count(mdr->reqid) == 0);
9180 active_requests[mdr->reqid] = mdr;
9181 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9182 return mdr;
9183}
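// Illustrative sketch (hypothetical; a real caller also fills in the
// op-specific fields on `mdr` before dispatching): internal operations are
// started here and then pushed through the same dispatch path as client
// requests.
//
//   MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
//   // ... set the scrub path/flags on mdr ...
//   dispatch_request(mdr);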
9184
9185MDRequestRef MDCache::request_get(metareqid_t rid)
9186{
9187 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9188 assert(p != active_requests.end());
9189 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9190 return p->second;
9191}
9192
9193void MDCache::request_finish(MDRequestRef& mdr)
9194{
9195 dout(7) << "request_finish " << *mdr << dendl;
9196 mdr->mark_event("finishing request");
9197
9198 // slave finisher?
9199 if (mdr->has_more() && mdr->more()->slave_commit) {
9200 Context *fin = mdr->more()->slave_commit;
9201 mdr->more()->slave_commit = 0;
9202 int ret;
9203 if (mdr->aborted) {
9204 mdr->aborted = false;
9205 ret = -1;
9206 mdr->more()->slave_rolling_back = true;
9207 } else {
9208 ret = 0;
9209 mdr->committing = true;
9210 }
9211 fin->complete(ret); // this must re-call request_finish.
9212 return;
9213 }
9214
d2e6a577
FG
9215 switch(mdr->internal_op) {
9216 case CEPH_MDS_OP_FRAGMENTDIR:
9217 logger->inc(l_mdss_ireq_fragmentdir);
9218 break;
9219 case CEPH_MDS_OP_EXPORTDIR:
9220 logger->inc(l_mdss_ireq_exportdir);
9221 break;
9222 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9223 logger->inc(l_mdss_ireq_enqueue_scrub);
9224 break;
9225 case CEPH_MDS_OP_FLUSH:
9226 logger->inc(l_mdss_ireq_flush);
9227 break;
9228 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9229 logger->inc(l_mdss_ireq_fragstats);
9230 break;
9231 case CEPH_MDS_OP_REPAIR_INODESTATS:
9232 logger->inc(l_mdss_ireq_inodestats);
9233 break;
9234 }
9235
7c673cae
FG
9236 request_cleanup(mdr);
9237}
9238
9239
9240void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9241{
9242 mdr->mark_event("forwarding request");
9243 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9244 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9245 << *mdr->client_request << dendl;
91327a77 9246 mds->forward_message_mds(mdr->release_client_request(), who);
7c673cae
FG
9247 if (mds->logger) mds->logger->inc(l_mds_forward);
9248 } else if (mdr->internal_op >= 0) {
9249 dout(10) << "request_forward on internal op; cancelling" << dendl;
9250 mdr->internal_op_finish->complete(-EXDEV);
9251 } else {
9252 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9253 << " was from mds" << dendl;
9254 }
9255 request_cleanup(mdr);
9256}
9257
9258
9259void MDCache::dispatch_request(MDRequestRef& mdr)
9260{
9261 if (mdr->client_request) {
9262 mds->server->dispatch_client_request(mdr);
9263 } else if (mdr->slave_request) {
9264 mds->server->dispatch_slave_request(mdr);
9265 } else {
9266 switch (mdr->internal_op) {
9267 case CEPH_MDS_OP_FRAGMENTDIR:
9268 dispatch_fragment_dir(mdr);
9269 break;
9270 case CEPH_MDS_OP_EXPORTDIR:
9271 migrator->dispatch_export_dir(mdr, 0);
9272 break;
9273 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9274 enqueue_scrub_work(mdr);
9275 break;
9276 case CEPH_MDS_OP_FLUSH:
9277 flush_dentry_work(mdr);
9278 break;
9279 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9280 repair_dirfrag_stats_work(mdr);
9281 break;
9282 case CEPH_MDS_OP_REPAIR_INODESTATS:
9283 repair_inode_stats_work(mdr);
9284 break;
9285 default:
9286 ceph_abort();
9287 }
9288 }
9289}
9290
9291
9292void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9293{
9294 if (!mdr->has_more())
9295 return;
9296
9297 // clean up slaves
9298 // (will implicitly drop remote dn pins)
9299 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9300 p != mdr->more()->slaves.end();
9301 ++p) {
9302 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
9303 MMDSSlaveRequest::OP_FINISH);
9304
9305 if (mdr->killed && !mdr->committing) {
9306 r->mark_abort();
9307 } else if (mdr->more()->srcdn_auth_mds == *p &&
9308 mdr->more()->inode_import.length() > 0) {
9309 // information about rename imported caps
9310 r->inode_export.claim(mdr->more()->inode_import);
9311 }
9312
9313 mds->send_message_mds(r, *p);
9314 }
9315
9316 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9317 * implicitly. Note that we don't call the finishers -- there shouldn't
9318 * be any on a remote lock and the request finish wakes up all
9319 * the waiters anyway! */
9320 set<SimpleLock*>::iterator p = mdr->xlocks.begin();
9321 while (p != mdr->xlocks.end()) {
9322 if ((*p)->get_parent()->is_auth())
9323 ++p;
9324 else {
9325 dout(10) << "request_drop_foreign_locks forgetting lock " << **p
9326 << " on " << *(*p)->get_parent() << dendl;
9327 (*p)->put_xlock();
9328 mdr->locks.erase(*p);
9329 mdr->xlocks.erase(p++);
9330 }
9331 }
9332
9333 map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
9334 while (q != mdr->remote_wrlocks.end()) {
9335 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
9336 << " on mds." << q->second
9337 << " on " << *(q->first)->get_parent() << dendl;
9338 mdr->locks.erase(q->first);
9339 mdr->remote_wrlocks.erase(q++);
9340 }
9341
9342 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9343 * leaving them in can cause double-notifies as
9344 * this function can get called more than once */
9345}
9346
9347void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9348{
9349 request_drop_foreign_locks(mdr);
9350 mds->locker->drop_non_rdlocks(mdr.get());
9351}
9352
9353void MDCache::request_drop_locks(MDRequestRef& mdr)
9354{
9355 request_drop_foreign_locks(mdr);
9356 mds->locker->drop_locks(mdr.get());
9357}
9358
9359void MDCache::request_cleanup(MDRequestRef& mdr)
9360{
9361 dout(15) << "request_cleanup " << *mdr << dendl;
9362
9363 if (mdr->has_more()) {
9364 if (mdr->more()->is_ambiguous_auth)
9365 mdr->clear_ambiguous_auth();
9366 if (!mdr->more()->waiting_for_finish.empty())
9367 mds->queue_waiters(mdr->more()->waiting_for_finish);
9368 }
9369
9370 request_drop_locks(mdr);
9371
9372 // drop (local) auth pins
9373 mdr->drop_local_auth_pins();
9374
9375 // drop stickydirs
9376 for (set<CInode*>::iterator p = mdr->stickydirs.begin();
9377 p != mdr->stickydirs.end();
9378 ++p)
9379 (*p)->put_stickydirs();
9380
9381 mds->locker->kick_cap_releases(mdr);
9382
9383 // drop cache pins
9384 mdr->drop_pins();
9385
9386 // remove from session
9387 mdr->item_session_request.remove_myself();
9388
9389 // remove from map
9390 active_requests.erase(mdr->reqid);
9391
9392 if (mds->logger)
9393 log_stat();
9394
9395 mdr->mark_event("cleaned up request");
9396}
9397
9398void MDCache::request_kill(MDRequestRef& mdr)
9399{
 9400 // rolling back slave requests is tricky. just let the request proceed.
94b18763 9401 if (mdr->has_more() &&
7c673cae 9402 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
94b18763
FG
9403 if (!mdr->done_locking) {
9404 assert(mdr->more()->witnessed.empty());
9405 mdr->aborted = true;
9406 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9407 } else {
9408 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9409 }
7c673cae
FG
9410
9411 assert(mdr->used_prealloc_ino == 0);
9412 assert(mdr->prealloc_inos.empty());
9413
9414 mdr->session = NULL;
9415 mdr->item_session_request.remove_myself();
9416 return;
9417 }
9418
9419 mdr->killed = true;
9420 mdr->mark_event("killing request");
9421
9422 if (mdr->committing) {
9423 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9424 } else {
9425 dout(10) << "request_kill " << *mdr << dendl;
9426 request_cleanup(mdr);
9427 }
9428}
9429
9430// -------------------------------------------------------------------------------
9431// SNAPREALMS
9432
9433struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
9434 MDRequestRef mdr;
9435 MutationRef mut;
9436 CInode *in;
9437 C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
9438 MutationRef& mu, CInode *i) :
9439 MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
9440 void finish(int r) override {
9441 mdcache->_snaprealm_create_finish(mdr, mut, in);
9442 }
9443};
9444
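// Creating a snaprealm is a two-step exchange with the snap table: first
// prepare_create_realm() allocates the stid/seq (retrying this request when
// the reply arrives), then the projected inode and new srnode are journalled
// in an EUpdate whose completion (_snaprealm_create_finish below) applies
// the change, commits the table transaction and notifies clients of the
// split.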
9445void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
9446{
9447 dout(10) << "snaprealm_create " << *in << dendl;
9448 assert(!in->snaprealm);
9449
9450 // allocate an id..
9451 if (!mdr->more()->stid) {
9452 mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
9453 new C_MDS_RetryRequest(this, mdr));
9454 return;
9455 }
9456
9457 MutationRef mut(new MutationImpl());
9458 mut->ls = mds->mdlog->get_current_segment();
9459 EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
9460 mds->mdlog->start_entry(le);
9461
9462 le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);
9463
94b18763
FG
9464 auto &pi = in->project_inode(false, true);
9465 pi.inode.version = in->pre_dirty();
9466 pi.inode.rstat.rsnaprealms++;
7c673cae
FG
9467
9468 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9469 snapid_t seq;
9470 ::decode(seq, p);
9471
94b18763
FG
9472 auto &newsnap = *pi.snapnode;
9473 newsnap.created = seq;
9474 newsnap.seq = seq;
9475 newsnap.last_created = seq;
7c673cae
FG
9476
9477 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
9478 journal_cow_inode(mut, &le->metablob, in);
9479 le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9480
9481 mds->server->submit_mdlog_entry(le,
9482 new C_MDC_snaprealm_create_finish(this, mdr,
9483 mut, in),
9484 mdr, __func__);
9485 mds->mdlog->flush();
9486}
9487
9488
9489void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
9490{
9491 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9492
9493 vector<inodeno_t> split_inos;
9494 vector<inodeno_t> split_realms;
9495
9496 if (snapop == CEPH_SNAP_OP_SPLIT) {
9497 // notify clients of update|split
9498 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9499 !p.end(); ++p)
9500 split_inos.push_back((*p)->ino());
9501
9502 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9503 p != in->snaprealm->open_children.end();
9504 ++p)
9505 split_realms.push_back((*p)->inode->ino());
9506 }
9507
9508 bufferlist snapbl;
9509 in->snaprealm->build_snap_trace(snapbl);
9510
9511 set<SnapRealm*> past_children;
9512 map<client_t, MClientSnap*> updates;
9513 list<SnapRealm*> q;
9514 q.push_back(in->snaprealm);
9515 while (!q.empty()) {
9516 SnapRealm *realm = q.front();
9517 q.pop_front();
9518
9519 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9520 realm->invalidate_cached_snaps();
9521
9522 for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
9523 p != realm->client_caps.end();
9524 ++p) {
9525 assert(!p->second->empty());
9526 if (!nosend && updates.count(p->first) == 0) {
9527 MClientSnap *update = new MClientSnap(snapop);
9528 update->head.split = in->ino();
9529 update->split_inos = split_inos;
9530 update->split_realms = split_realms;
9531 update->bl = snapbl;
9532 updates[p->first] = update;
9533 }
9534 }
9535
9536 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9537 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9538 p != realm->open_past_children.end();
9539 ++p)
9540 past_children.insert(*p);
9541 }
9542
9543 // notify for active children, too.
9544 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9545 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9546 p != realm->open_children.end();
9547 ++p)
9548 q.push_back(*p);
9549 }
9550
9551 if (!nosend)
9552 send_snaps(updates);
9553
9554 // notify past children and their descendants if we update/delete old snapshots
9555 for (set<SnapRealm*>::iterator p = past_children.begin();
9556 p != past_children.end();
9557 ++p)
9558 q.push_back(*p);
9559
9560 while (!q.empty()) {
9561 SnapRealm *realm = q.front();
9562 q.pop_front();
9563
9564 realm->invalidate_cached_snaps();
9565
9566 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9567 p != realm->open_children.end();
9568 ++p) {
9569 if (past_children.count(*p) == 0)
9570 q.push_back(*p);
9571 }
9572
9573 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9574 p != realm->open_past_children.end();
9575 ++p) {
9576 if (past_children.count(*p) == 0) {
9577 q.push_back(*p);
9578 past_children.insert(*p);
9579 }
9580 }
9581 }
9582
9583 if (snapop == CEPH_SNAP_OP_DESTROY) {
9584 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9585 for (set<SnapRealm*>::iterator p = past_children.begin();
9586 p != past_children.end();
9587 ++p)
9588 maybe_eval_stray((*p)->inode, true);
9589 }
9590}
9591
9592void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
9593{
9594 dout(10) << "_snaprealm_create_finish " << *in << dendl;
9595
9596 // apply
9597 in->pop_and_dirty_projected_inode(mut->ls);
9598 mut->apply();
9599 mds->locker->drop_locks(mut.get());
9600 mut->cleanup();
9601
9602 // tell table we've committed
9603 mds->snapclient->commit(mdr->more()->stid, mut->ls);
9604
9605 // create
9606 bufferlist::iterator p = mdr->more()->snapidbl.begin();
9607 snapid_t seq;
9608 ::decode(seq, p);
9609
9610 in->open_snaprealm();
9611 in->snaprealm->srnode.seq = seq;
9612 in->snaprealm->srnode.created = seq;
9613 bool ok = in->snaprealm->_open_parents(NULL);
9614 assert(ok);
9615
9616 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);
9617
9618 /*
9619 static int count = 5;
9620 if (--count == 0)
9621 ceph_abort(); // hack test test **********
9622 */
9623
9624 // done.
9625 mdr->more()->stid = 0; // caller will likely need to reuse this
9626 dispatch_request(mdr);
9627}
9628
9629
9630// -------------------------------------------------------------------------------
9631// STRAYS
9632
9633struct C_MDC_RetryScanStray : public MDCacheContext {
9634 dirfrag_t next;
9635 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9636 void finish(int r) override {
9637 mdcache->scan_stray_dir(next);
9638 }
9639};
9640
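// Walk every dirfrag of this rank's stray directories, fetching incomplete
// frags as needed (resuming from `next` via C_MDC_RetryScanStray), marking
// each dentry STATE_STRAY and re-evaluating primary-linked inodes for purge.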
9641void MDCache::scan_stray_dir(dirfrag_t next)
9642{
9643 dout(10) << "scan_stray_dir " << next << dendl;
9644
9645 list<CDir*> ls;
9646 for (int i = 0; i < NUM_STRAY; ++i) {
9647 if (strays[i]->ino() < next.ino)
9648 continue;
9649 strays[i]->get_dirfrags(ls);
9650 }
9651
9652 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9653 CDir *dir = *p;
9654 if (dir->dirfrag() < next)
9655 continue;
9656 if (!dir->is_complete()) {
9657 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9658 return;
9659 }
94b18763
FG
9660 for (auto &p : dir->items) {
9661 CDentry *dn = p.second;
7c673cae
FG
9662 dn->state_set(CDentry::STATE_STRAY);
9663 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9664 if (dnl->is_primary()) {
9665 CInode *in = dnl->get_inode();
9666 if (in->inode.nlink == 0)
9667 in->state_set(CInode::STATE_ORPHAN);
9668 maybe_eval_stray(in);
9669 }
9670 }
9671 }
9672}
9673
7c673cae
FG
9674void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9675{
9676 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9677 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9678}
9679
9680
9681
9682
9683
9684// ========================================================================================
9685// DISCOVER
9686/*
9687
9688 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9689 to the parent metadata object in the cache (pinning it).
9690
9691 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9692
9693*/
9694
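// Illustrative example (hypothetical caller; `diri`, `fg` and `mdr` are
// assumed, and the defaulted `from` argument is used): replicating a remote
// dirfrag registers a tid-tracked discover with its auth MDS and parks the
// waiter on the base inode until the reply arrives:
//
//   discover_dir_frag(diri, fg, new C_MDS_RetryRequest(this, mdr));
//
// discover_path() works the same way, walking a dentry path below a base
// inode or dirfrag; want_xlocked additionally allows discovery of an
// xlocked tail dentry.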
9695void MDCache::_send_discover(discover_info_t& d)
9696{
9697 MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
9698 d.want_base_dir, d.want_xlocked);
9699 dis->set_tid(d.tid);
9700 mds->send_message_mds(dis, d.mds);
9701}
9702
9703void MDCache::discover_base_ino(inodeno_t want_ino,
9704 MDSInternalContextBase *onfinish,
9705 mds_rank_t from)
9706{
9707 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9708 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9709 discover_info_t& d = _create_discover(from);
9710 d.ino = want_ino;
9711 _send_discover(d);
9712 }
9713 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9714}
9715
9716
9717void MDCache::discover_dir_frag(CInode *base,
9718 frag_t approx_fg,
9719 MDSInternalContextBase *onfinish,
9720 mds_rank_t from)
9721{
9722 if (from < 0)
9723 from = base->authority().first;
9724
9725 dirfrag_t df(base->ino(), approx_fg);
9726 dout(7) << "discover_dir_frag " << df
9727 << " from mds." << from << dendl;
9728
9729 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9730 discover_info_t& d = _create_discover(from);
9731 d.pin_base(base);
9732 d.ino = base->ino();
9733 d.frag = approx_fg;
9734 d.want_base_dir = true;
9735 _send_discover(d);
9736 }
9737
9738 if (onfinish)
9739 base->add_dir_waiter(approx_fg, onfinish);
9740}
9741
9742struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9743 CInode *base;
9744 snapid_t snapid;
9745 filepath path;
9746 mds_rank_t from;
9747 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9748 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9749 void finish(int r) override {
9750 mdcache->discover_path(base, snapid, path, 0, from);
9751 }
9752};
9753
9754void MDCache::discover_path(CInode *base,
9755 snapid_t snap,
9756 filepath want_path,
9757 MDSInternalContextBase *onfinish,
9758 bool want_xlocked,
9759 mds_rank_t from)
9760{
9761 if (from < 0)
9762 from = base->authority().first;
9763
9764 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9765 << (want_xlocked ? " want_xlocked":"")
9766 << dendl;
9767
9768 if (base->is_ambiguous_auth()) {
9769 dout(10) << " waiting for single auth on " << *base << dendl;
9770 if (!onfinish)
9771 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9772 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9773 return;
9774 } else if (from == mds->get_nodeid()) {
9775 list<MDSInternalContextBase*> finished;
9776 base->take_waiting(CInode::WAIT_DIR, finished);
9777 mds->queue_waiters(finished);
9778 return;
9779 }
9780
9781 frag_t fg = base->pick_dirfrag(want_path[0]);
9782 if ((want_xlocked && want_path.depth() == 1) ||
9783 !base->is_waiting_for_dir(fg) || !onfinish) {
9784 discover_info_t& d = _create_discover(from);
9785 d.ino = base->ino();
9786 d.pin_base(base);
9787 d.frag = fg;
9788 d.snap = snap;
9789 d.want_path = want_path;
9790 d.want_base_dir = true;
9791 d.want_xlocked = want_xlocked;
9792 _send_discover(d);
9793 }
9794
9795 // register + wait
9796 if (onfinish)
9797 base->add_dir_waiter(fg, onfinish);
9798}
9799
9800struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9801 CDir *base;
9802 snapid_t snapid;
9803 filepath path;
9804 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9805 MDCacheContext(c), base(b), snapid(s), path(p) {}
9806 void finish(int r) override {
9807 mdcache->discover_path(base, snapid, path, 0);
9808 }
9809};
9810
9811void MDCache::discover_path(CDir *base,
9812 snapid_t snap,
9813 filepath want_path,
9814 MDSInternalContextBase *onfinish,
9815 bool want_xlocked)
9816{
9817 mds_rank_t from = base->authority().first;
9818
9819 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9820 << (want_xlocked ? " want_xlocked":"")
9821 << dendl;
9822
9823 if (base->is_ambiguous_auth()) {
9824 dout(7) << " waiting for single auth on " << *base << dendl;
9825 if (!onfinish)
9826 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9827 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9828 return;
9829 } else if (from == mds->get_nodeid()) {
9830 list<MDSInternalContextBase*> finished;
9831 base->take_sub_waiting(finished);
9832 mds->queue_waiters(finished);
9833 return;
9834 }
9835
9836 if ((want_xlocked && want_path.depth() == 1) ||
9837 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9838 discover_info_t& d = _create_discover(from);
9839 d.ino = base->ino();
31f18b77 9840 d.pin_base(base->inode);
7c673cae
FG
9841 d.frag = base->get_frag();
9842 d.snap = snap;
9843 d.want_path = want_path;
9844 d.want_base_dir = false;
9845 d.want_xlocked = want_xlocked;
9846 _send_discover(d);
9847 }
9848
9849 // register + wait
9850 if (onfinish)
9851 base->add_dentry_waiter(want_path[0], snap, onfinish);
9852}
9853
9854void MDCache::kick_discovers(mds_rank_t who)
9855{
9856 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9857 p != discovers.end();
9858 ++p) {
9859 if (p->second.mds != who)
9860 continue;
9861 _send_discover(p->second);
9862 }
9863}
9864
9865
9866/* This function DOES put the passed message before returning */
9867void MDCache::handle_discover(MDiscover *dis)
9868{
9869 mds_rank_t whoami = mds->get_nodeid();
9870 mds_rank_t from = mds_rank_t(dis->get_source().num());
9871
9872 assert(from != whoami);
9873
9874 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9875 if (mds->get_state() < MDSMap::STATE_REJOIN &&
d2e6a577 9876 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
9877 dis->put();
9878 return;
9879 }
9880
9881 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
9882 // delay processing request from survivor because we may not yet choose lock states.
9883 if (!mds->mdsmap->is_rejoin(from)) {
9884 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
9885 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9886 return;
9887 }
9888 }
9889
9890
9891 CInode *cur = 0;
9892 MDiscoverReply *reply = new MDiscoverReply(dis);
9893
9894 snapid_t snapid = dis->get_snapid();
9895
9896 // get started.
9897 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9898 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9899 // wants root
9900 dout(7) << "handle_discover from mds." << from
9901 << " wants base + " << dis->get_want().get_path()
9902 << " snap " << snapid
9903 << dendl;
9904
9905 cur = get_inode(dis->get_base_ino());
9906 assert(cur);
9907
9908 // add root
9909 reply->starts_with = MDiscoverReply::INODE;
9910 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9911 dout(10) << "added base " << *cur << dendl;
9912 }
9913 else {
9914 // there's a base inode
9915 cur = get_inode(dis->get_base_ino(), snapid);
9916 if (!cur && snapid != CEPH_NOSNAP) {
9917 cur = get_inode(dis->get_base_ino());
9918 if (cur && !cur->is_multiversion())
9919 cur = NULL; // nope!
9920 }
9921
9922 if (!cur) {
9923 dout(7) << "handle_discover mds." << from
9924 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9925 << dendl;
9926 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9927 reply->set_error_dentry(dis->get_dentry(0));
9928 reply->set_flag_error_dir();
9929 } else if (dis->wants_base_dir()) {
9930 dout(7) << "handle_discover mds." << from
9931 << " wants basedir+" << dis->get_want().get_path()
9932 << " has " << *cur
9933 << dendl;
9934 } else {
9935 dout(7) << "handle_discover mds." << from
9936 << " wants " << dis->get_want().get_path()
9937 << " has " << *cur
9938 << dendl;
9939 }
9940 }
9941
9942 assert(reply);
9943
9944 // add content
9945 // do some fidgeting to include a dir if they asked for the base dir, or just root.
9946 for (unsigned i = 0;
9947 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
9948 i++) {
9949
9950 // -- figure out the dir
9951
9952 // is *cur even a dir at all?
9953 if (!cur->is_dir()) {
9954 dout(7) << *cur << " not a dir" << dendl;
9955 reply->set_flag_error_dir();
9956 break;
9957 }
9958
9959 // pick frag
9960 frag_t fg;
9961 if (dis->get_want().depth()) {
9962 // dentry specifies
9963 fg = cur->pick_dirfrag(dis->get_dentry(i));
9964 } else {
 9965 // requester explicitly specified the frag
9966 assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
9967 fg = dis->get_base_dir_frag();
9968 if (!cur->dirfragtree.is_leaf(fg))
9969 fg = cur->dirfragtree[fg.value()];
9970 }
9971 CDir *curdir = cur->get_dirfrag(fg);
9972
9973 if ((!curdir && !cur->is_auth()) ||
9974 (curdir && !curdir->is_auth())) {
9975
9976 /* before:
9977 * ONLY set flag if empty!!
9978 * otherwise requester will wake up waiter(s) _and_ continue with discover,
9979 * resulting in duplicate discovers in flight,
9980 * which can wreak havoc when discovering rename srcdn (which may move)
9981 */
9982
9983 if (reply->is_empty()) {
9984 // only hint if empty.
9985 // someday this could be better, but right now the waiter logic isn't smart enough.
9986
9987 // hint
9988 if (curdir) {
9989 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
9990 reply->set_dir_auth_hint(curdir->authority().first);
9991 } else {
9992 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
9993 << *cur << dendl;
9994 reply->set_dir_auth_hint(cur->authority().first);
9995 }
9996
9997 // note error dentry, if any
9998 // NOTE: important, as it allows requester to issue an equivalent discover
9999 // to whomever we hint at.
10000 if (dis->get_want().depth() > i)
10001 reply->set_error_dentry(dis->get_dentry(i));
10002 }
10003
10004 break;
10005 }
10006
10007 if (!curdir) { // open dir?
10008 if (cur->is_frozen()) {
10009 if (!reply->is_empty()) {
10010 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10011 break;
10012 }
10013 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10014 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10015 reply->put();
10016 return;
10017 }
10018 curdir = cur->get_or_open_dirfrag(this, fg);
10019 } else if (curdir->is_frozen_tree() ||
10020 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
31f18b77
FG
10021 if (!reply->is_empty()) {
10022 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10023 break;
10024 }
7c673cae
FG
10025 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10026 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10027 reply->set_flag_error_dir();
10028 break;
10029 }
7c673cae
FG
10030 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10031 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10032 reply->put();
10033 return;
10034 }
10035
10036 // add dir
10037 if (curdir->get_version() == 0) {
10038 // fetch newly opened dir
10039 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10040 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10041 // make sure the base frag is correct, though, in case there was a refragment since the
10042 // original request was sent.
10043 reply->set_base_dir_frag(curdir->get_frag());
10044 } else {
10045 assert(!curdir->is_ambiguous_auth()); // would be frozen.
10046 if (!reply->trace.length())
10047 reply->starts_with = MDiscoverReply::DIR;
10048 replicate_dir(curdir, from, reply->trace);
10049 dout(7) << "handle_discover added dir " << *curdir << dendl;
10050 }
10051
10052 // lookup
10053 CDentry *dn = 0;
10054 if (curdir->get_version() == 0) {
10055 // fetch newly opened dir
31f18b77 10056 assert(!curdir->has_bloom());
7c673cae
FG
10057 } else if (dis->get_want().depth() > 0) {
10058 // lookup dentry
10059 dn = curdir->lookup(dis->get_dentry(i), snapid);
10060 } else
10061 break; // done!
10062
10063 // incomplete dir?
10064 if (!dn) {
31f18b77
FG
10065 if (!curdir->is_complete() &&
10066 (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
7c673cae
FG
10067 // readdir
10068 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10069 if (reply->is_empty()) {
10070 // fetch and wait
10071 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10072 dis->wants_base_dir() && curdir->get_version() == 0);
10073 reply->put();
10074 return;
10075 } else {
10076 // initiate fetch, but send what we have so far
10077 curdir->fetch(0);
10078 break;
10079 }
10080 }
10081
10082 // send null dentry
10083 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10084 << *curdir << dendl;
10085 dn = curdir->add_null_dentry(dis->get_dentry(i));
10086 }
10087 assert(dn);
10088
31f18b77
FG
10089 // don't add replica to purging dentry/inode
10090 if (dn->state_test(CDentry::STATE_PURGING)) {
10091 if (reply->is_empty())
10092 reply->set_flag_error_dn(dis->get_dentry(i));
10093 break;
10094 }
10095
7c673cae
FG
10096 CDentry::linkage_t *dnl = dn->get_linkage();
10097
10098 // xlocked dentry?
10099 // ...always block on non-tail items (they are unrelated)
10100 // ...allow xlocked tail discovery _only_ if explicitly requested
10101 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
10102 if (dn->lock.is_xlocked()) {
10103 // is this the last (tail) item in the discover traversal?
10104 if (tailitem && dis->wants_xlocked()) {
10105 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
10106 } else if (reply->is_empty()) {
10107 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10108 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
10109 reply->put();
10110 return;
10111 } else {
10112 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10113 break;
10114 }
10115 }
10116
10117 // frozen inode?
10118 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10119 if (tailitem && dis->wants_xlocked()) {
10120 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10121 } else if (reply->is_empty()) {
10122 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10123 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10124 reply->put();
10125 return;
10126 } else {
10127 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10128 break;
10129 }
10130 }
10131
10132 // add dentry
10133 if (!reply->trace.length())
10134 reply->starts_with = MDiscoverReply::DENTRY;
10135 replicate_dentry(dn, from, reply->trace);
10136 dout(7) << "handle_discover added dentry " << *dn << dendl;
10137
10138 if (!dnl->is_primary()) break; // stop on null or remote link.
10139
10140 // add inode
10141 CInode *next = dnl->get_inode();
10142 assert(next->is_auth());
10143
10144 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10145 dout(7) << "handle_discover added inode " << *next << dendl;
10146
10147 // descend, keep going.
10148 cur = next;
10149 continue;
10150 }
10151
10152 // how did we do?
10153 assert(!reply->is_empty());
10154 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10155 mds->send_message(reply, dis->get_connection());
10156
10157 dis->put();
10158}
10159
10160/* This function DOES put the passed message before returning */
10161void MDCache::handle_discover_reply(MDiscoverReply *m)
10162{
10163 /*
10164 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10165 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10166 m->put();
10167 return;
10168 }
10169 */
10170 dout(7) << "discover_reply " << *m << dendl;
10171 if (m->is_flag_error_dir())
10172 dout(7) << " flag error, dir" << dendl;
10173 if (m->is_flag_error_dn())
10174 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10175
10176 list<MDSInternalContextBase*> finished, error;
10177 mds_rank_t from = mds_rank_t(m->get_source().num());
10178
10179 // starting point
10180 CInode *cur = get_inode(m->get_base_ino());
10181 bufferlist::iterator p = m->trace.begin();
10182
10183 int next = m->starts_with;
10184
10185 // decrement discover counters
10186 if (m->get_tid()) {
10187 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10188 if (p != discovers.end()) {
10189 dout(10) << " found tid " << m->get_tid() << dendl;
10190 discovers.erase(p);
10191 } else {
10192 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10193 }
10194 }
10195
10196 // discover may start with an inode
10197 if (!p.end() && next == MDiscoverReply::INODE) {
10198 cur = add_replica_inode(p, NULL, finished);
10199 dout(7) << "discover_reply got base inode " << *cur << dendl;
10200 assert(cur->is_base());
10201
10202 next = MDiscoverReply::DIR;
10203
10204 // take waiters?
10205 if (cur->is_base() &&
10206 waiting_for_base_ino[from].count(cur->ino())) {
10207 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10208 waiting_for_base_ino[from].erase(cur->ino());
10209 }
10210 }
10211 assert(cur);
10212
10213 // loop over discover results.
10214 // indexes follow each ([[dir] dentry] inode)
10215 // can start, end with any type.
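  // e.g. a reply for dentry "foo" under a base inode, where the base
  // dirfrag was also wanted, decodes as DIR(base frag), DENTRY("foo"),
  // INODE(foo): starts_with identifies the first element's type, and each
  // inode in the trace is preceded by its dentry (and, when sent, its
  // containing dirfrag).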
10216 while (!p.end()) {
10217 // dir
10218 frag_t fg;
10219 CDir *curdir = 0;
10220 if (next == MDiscoverReply::DIR) {
10221 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10222 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10223 assert(m->get_wanted_base_dir());
10224 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10225 }
10226 } else {
10227 // note: this can only happen our first way around this loop.
10228 if (p.end() && m->is_flag_error_dn()) {
10229 fg = cur->pick_dirfrag(m->get_error_dentry());
10230 curdir = cur->get_dirfrag(fg);
10231 } else
10232 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10233 }
10234
10235 if (p.end())
10236 break;
10237
10238 // dentry
10239 CDentry *dn = add_replica_dentry(p, curdir, finished);
10240
10241 if (p.end())
10242 break;
10243
10244 // inode
10245 cur = add_replica_inode(p, dn, finished);
10246
10247 next = MDiscoverReply::DIR;
10248 }
10249
10250 // dir error?
10251 // or dir_auth hint?
10252 if (m->is_flag_error_dir() && !cur->is_dir()) {
10253 // not a dir.
10254 cur->take_waiting(CInode::WAIT_DIR, error);
10255 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10256 mds_rank_t who = m->get_dir_auth_hint();
10257 if (who == mds->get_nodeid()) who = -1;
10258 if (who >= 0)
10259 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10260
7c673cae
FG
10261
10262 if (m->get_wanted_base_dir()) {
31f18b77
FG
10263 frag_t fg = m->get_base_dir_frag();
10264 CDir *dir = cur->get_dirfrag(fg);
10265
7c673cae
FG
10266 if (cur->is_waiting_for_dir(fg)) {
10267 if (cur->is_auth())
10268 cur->take_waiting(CInode::WAIT_DIR, finished);
10269 else if (dir || !cur->dirfragtree.is_leaf(fg))
10270 cur->take_dir_waiting(fg, finished);
10271 else
10272 discover_dir_frag(cur, fg, 0, who);
10273 } else
10274 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10275 }
10276
10277 // try again?
10278 if (m->get_error_dentry().length()) {
31f18b77
FG
10279 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10280 CDir *dir = cur->get_dirfrag(fg);
7c673cae
FG
10281 // wanted a dentry
10282 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10283 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10284 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10285 m->get_wanted_snapid(), finished);
10286 } else {
10287 filepath relpath(m->get_error_dentry(), 0);
10288 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10289 }
10290 } else
10291 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10292 << m->get_error_dentry() << dendl;
10293 }
31f18b77
FG
10294 } else if (m->is_flag_error_dn()) {
10295 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10296 CDir *dir = cur->get_dirfrag(fg);
10297 if (dir) {
10298 if (dir->is_auth()) {
10299 dir->take_sub_waiting(finished);
10300 } else {
10301 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10302 m->get_wanted_snapid(), error);
10303 }
10304 }
7c673cae
FG
10305 }
10306
10307 // waiters
10308 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10309 mds->queue_waiters(finished);
10310
10311 // done
10312 m->put();
10313}
10314
10315
10316
10317// ----------------------------
10318// REPLICAS
10319
b32b8144
FG
10320
10321void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10322{
10323 dirfrag_t df = dir->dirfrag();
10324 ::encode(df, bl);
10325 dir->encode_replica(to, bl);
10326}
10327
10328void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10329{
94b18763 10330 ::encode(dn->get_name(), bl);
b32b8144
FG
10331 ::encode(dn->last, bl);
10332 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10333}
10334
10335void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10336 uint64_t features)
10337{
10338 ::encode(in->inode.ino, bl); // bleh, minor asymmetry here
10339 ::encode(in->last, bl);
10340 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10341}
10342
7c673cae
FG
10343CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
10344 list<MDSInternalContextBase*>& finished)
10345{
10346 dirfrag_t df;
10347 ::decode(df, p);
10348
10349 assert(diri->ino() == df.ino);
10350
10351 // add it (_replica_)
10352 CDir *dir = diri->get_dirfrag(df.frag);
10353
10354 if (dir) {
10355 // had replica. update w/ new nonce.
10356 dir->decode_replica(p);
10357 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10358 } else {
10359 // force frag to leaf in the diri tree
10360 if (!diri->dirfragtree.is_leaf(df.frag)) {
10361 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10362 << diri->dirfragtree << dendl;
10363 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10364 }
10365
10366 // add replica.
10367 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10368 dir->decode_replica(p);
10369
10370 // is this a dir_auth delegation boundary?
10371 if (from != diri->authority().first ||
10372 diri->is_ambiguous_auth() ||
10373 diri->is_base())
10374 adjust_subtree_auth(dir, from);
10375
10376 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10377
10378 // get waiters
10379 diri->take_dir_waiting(df.frag, finished);
10380 }
10381
10382 return dir;
10383}
10384
7c673cae
FG
10385CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
10386{
10387 string name;
10388 snapid_t last;
10389 ::decode(name, p);
10390 ::decode(last, p);
10391
10392 CDentry *dn = dir->lookup(name, last);
10393
10394 // have it?
10395 if (dn) {
10396 dn->decode_replica(p, false);
10397 dout(7) << "add_replica_dentry had " << *dn << dendl;
10398 } else {
10399 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10400 dn->decode_replica(p, true);
10401 dout(7) << "add_replica_dentry added " << *dn << dendl;
10402 }
10403
10404 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10405
10406 return dn;
10407}
10408
10409CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
10410{
10411 inodeno_t ino;
10412 snapid_t last;
10413 ::decode(ino, p);
10414 ::decode(last, p);
10415 CInode *in = get_inode(ino, last);
10416 if (!in) {
10417 in = new CInode(this, false, 1, last);
10418 in->decode_replica(p, true);
10419 add_inode(in);
10420 if (in->ino() == MDS_INO_ROOT)
10421 in->inode_auth.first = 0;
10422 else if (in->is_mdsdir())
10423 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10424 dout(10) << "add_replica_inode added " << *in << dendl;
10425 if (dn) {
10426 assert(dn->get_linkage()->is_null());
10427 dn->dir->link_primary_inode(dn, in);
10428 }
10429 } else {
10430 in->decode_replica(p, false);
10431 dout(10) << "add_replica_inode had " << *in << dendl;
10432 }
10433
10434 if (dn) {
10435 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10436 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10437 }
10438
10439 return in;
10440}
10441
10442
10443void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10444{
10445 uint64_t features = mds->mdsmap->get_up_features();
10446 replicate_inode(get_myin(), who, bl, features);
10447 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10448 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10449 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10450 replicate_dir(straydn->get_dir(), who, bl);
10451 replicate_dentry(straydn, who, bl);
10452}
10453
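// Note on the stray replication helpers: add_replica_stray() below decodes the
// exact sequence replicate_stray() above encodes -- my inode, the mdsdir
// dirfrag, the stray directory's dentry and inode, the stray dirfrag, and
// finally the stray dentry itself. Any change on one side must be mirrored on
// the other. Roughly (parameter names as in the two functions):
//
//   bufferlist bl;
//   replicate_stray(straydn, who, bl);          // sender side
//   CDentry *dn = add_replica_stray(bl, from);  // receiver side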
10454CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
10455{
10456 list<MDSInternalContextBase*> finished;
10457 bufferlist::iterator p = bl.begin();
10458
10459 CInode *mdsin = add_replica_inode(p, NULL, finished);
10460 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10461 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10462 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10463 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10464 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10465 if (!finished.empty())
10466 mds->queue_waiters(finished);
10467
10468 return straydn;
10469}
10470
10471
10472int MDCache::send_dir_updates(CDir *dir, bool bcast)
10473{
10474 // this is an FYI, re: replication
10475
10476 set<mds_rank_t> who;
10477 if (bcast) {
10478 mds->get_mds_map()->get_active_mds_set(who);
10479 } else {
181888fb
FG
10480 for (const auto &p : dir->get_replicas()) {
10481 who.insert(p.first);
10482 }
7c673cae
FG
10483 }
10484
10485 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10486
10487 filepath path;
10488 dir->inode->make_path(path);
10489
10490 mds_rank_t whoami = mds->get_nodeid();
10491 for (set<mds_rank_t>::iterator it = who.begin();
10492 it != who.end();
10493 ++it) {
10494 if (*it == whoami) continue;
10495 //if (*it == except) continue;
10496 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10497
94b18763
FG
10498 std::set<int32_t> s;
10499 for (const auto &r : dir->dir_rep_by) {
10500 s.insert(r);
10501 }
7c673cae
FG
10502 mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
10503 dir->dirfrag(),
10504 dir->dir_rep,
94b18763 10505 s,
7c673cae
FG
10506 path,
10507 bcast),
10508 *it);
10509 }
10510
10511 return 0;
10512}
10513
10514/* This function DOES put the passed message before returning */
10515void MDCache::handle_dir_update(MDirUpdate *m)
10516{
224ce89b
WB
10517 dirfrag_t df = m->get_dirfrag();
10518 CDir *dir = get_dirfrag(df);
7c673cae 10519 if (!dir) {
224ce89b 10520 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10521
10522 // discover it?
10523 if (m->should_discover()) {
10524 // only try once!
10525 // this is key to avoid a fragtree update race, among other things.
224ce89b 10526 m->inc_tried_discover();
7c673cae
FG
10527 vector<CDentry*> trace;
10528 CInode *in;
10529 filepath path = m->get_path();
10530 dout(5) << "trying discover on dir_update for " << path << dendl;
10531 MDRequestRef null_ref;
10532 int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
10533 if (r > 0)
10534 return;
224ce89b
WB
10535 if (r == 0 &&
10536 in->ino() == df.ino &&
10537 in->get_approx_dirfrag(df.frag) == NULL) {
10538 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10539 return;
10540 }
7c673cae
FG
10541 }
10542
10543 m->put();
10544 return;
10545 }
10546
224ce89b
WB
10547 if (!m->has_tried_discover()) {
10548 // Update if it already exists. Otherwise it got updated by the discover reply.
10549 dout(5) << "dir_update on " << *dir << dendl;
10550 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10551 dir->dir_rep_by.clear();
10552 for (const auto &e : m->get_dir_rep_by()) {
10553 dir->dir_rep_by.insert(e);
10554 }
224ce89b
WB
10555 }
10556
7c673cae
FG
10557 // done
10558 m->put();
10559}
10560
10561
10562
10563
10564
10565// LINK
10566
10567void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10568{
10569 dout(7) << "send_dentry_link " << *dn << dendl;
10570
10571 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10572 for (const auto &p : dn->get_replicas()) {
7c673cae 10573 // don't tell (rename) witnesses; they already know
181888fb 10574 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10575 continue;
181888fb
FG
10576 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10577 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10578 rejoin_gather.count(p.first)))
7c673cae
FG
10579 continue;
10580 CDentry::linkage_t *dnl = dn->get_linkage();
10581 MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
94b18763 10582 dn->get_name(), dnl->is_primary());
7c673cae
FG
10583 if (dnl->is_primary()) {
10584 dout(10) << " primary " << *dnl->get_inode() << dendl;
181888fb 10585 replicate_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10586 mds->mdsmap->get_up_features());
10587 } else if (dnl->is_remote()) {
10588 inodeno_t ino = dnl->get_remote_ino();
10589 __u8 d_type = dnl->get_remote_d_type();
10590 dout(10) << " remote " << ino << " " << d_type << dendl;
10591 ::encode(ino, m->bl);
10592 ::encode(d_type, m->bl);
10593 } else
10594 ceph_abort(); // aie, bad caller!
181888fb 10595 mds->send_message_mds(m, p.first);
7c673cae
FG
10596 }
10597}
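// Note: handle_dentry_link() below consumes this payload in the same order it
// is encoded here -- a replicated inode for a primary link, or (ino, d_type)
// for a remote link. Condensed from the handler below, the receiving side does
// roughly:
//
//   if (m->get_is_primary())
//     add_replica_inode(p, dn, finished);                  // primary: full replica inode
//   else {
//     ::decode(ino, p); ::decode(d_type, p);
//     dir->link_remote_inode(dn, ino, d_type);              // remote: just (ino, d_type)
//   }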
10598
10599/* This function DOES put the passed message before returning */
10600void MDCache::handle_dentry_link(MDentryLink *m)
10601{
10602
10603 CDentry *dn = NULL;
10604 CDir *dir = get_dirfrag(m->get_dirfrag());
10605 if (!dir) {
10606 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10607 } else {
10608 dn = dir->lookup(m->get_dn());
10609 if (!dn) {
10610 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10611 } else {
10612 dout(7) << "handle_dentry_link on " << *dn << dendl;
10613 CDentry::linkage_t *dnl = dn->get_linkage();
10614
10615 assert(!dn->is_auth());
10616 assert(dnl->is_null());
10617 }
10618 }
10619
10620 bufferlist::iterator p = m->bl.begin();
10621 list<MDSInternalContextBase*> finished;
10622 if (dn) {
10623 if (m->get_is_primary()) {
10624 // primary link.
10625 add_replica_inode(p, dn, finished);
10626 } else {
10627 // remote link, easy enough.
10628 inodeno_t ino;
10629 __u8 d_type;
10630 ::decode(ino, p);
10631 ::decode(d_type, p);
10632 dir->link_remote_inode(dn, ino, d_type);
10633 }
10634 } else {
10635 ceph_abort();
10636 }
10637
10638 if (!finished.empty())
10639 mds->queue_waiters(finished);
10640
10641 m->put();
10642 return;
10643}
10644
10645
10646// UNLINK
10647
10648void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10649{
10650 dout(10) << "send_dentry_unlink " << *dn << dendl;
10651 // share unlink news with replicas
10652 set<mds_rank_t> replicas;
10653 dn->list_replicas(replicas);
10654 if (straydn)
10655 straydn->list_replicas(replicas);
10656 for (set<mds_rank_t>::iterator it = replicas.begin();
10657 it != replicas.end();
10658 ++it) {
10659 // don't tell (rmdir) witnesses; they already know
10660 if (mdr.get() && mdr->more()->witnessed.count(*it))
10661 continue;
10662
10663 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10664 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10665 rejoin_gather.count(*it)))
10666 continue;
10667
94b18763 10668 MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
7c673cae
FG
10669 if (straydn)
10670 replicate_stray(straydn, *it, unlink->straybl);
10671 mds->send_message_mds(unlink, *it);
10672 }
10673}
10674
10675/* This function DOES put the passed message before returning */
10676void MDCache::handle_dentry_unlink(MDentryUnlink *m)
10677{
10678 // straydn
10679 CDentry *straydn = NULL;
10680 if (m->straybl.length())
10681 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10682
10683 CDir *dir = get_dirfrag(m->get_dirfrag());
10684 if (!dir) {
10685 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10686 } else {
10687 CDentry *dn = dir->lookup(m->get_dn());
10688 if (!dn) {
10689 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10690 } else {
10691 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10692 CDentry::linkage_t *dnl = dn->get_linkage();
10693
10694 // open inode?
10695 if (dnl->is_primary()) {
10696 CInode *in = dnl->get_inode();
10697 dn->dir->unlink_inode(dn);
10698 assert(straydn);
10699 straydn->dir->link_primary_inode(straydn, in);
10700
10701 // in->first is lazily updated on replica; drag it forward so
10702 // that we always keep it in sync with the dnq
10703 assert(straydn->first >= in->first);
10704 in->first = straydn->first;
10705
10706 // update subtree map?
10707 if (in->is_dir())
10708 adjust_subtree_after_rename(in, dir, false);
10709
10710 // send caps to auth (if we're not already)
10711 if (in->is_any_caps() &&
10712 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10713 migrator->export_caps(in);
10714
7c673cae
FG
10715 straydn = NULL;
10716 } else {
10717 assert(!straydn);
10718 assert(dnl->is_remote());
10719 dn->dir->unlink_inode(dn);
10720 }
10721 assert(dnl->is_null());
7c673cae
FG
10722 }
10723 }
10724
10725 // race with trim_dentry()
10726 if (straydn) {
10727 assert(straydn->get_num_ref() == 0);
10728 assert(straydn->get_linkage()->is_null());
10729 map<mds_rank_t, MCacheExpire*> expiremap;
10730 trim_dentry(straydn, expiremap);
10731 send_expire_messages(expiremap);
10732 }
10733
10734 m->put();
10735 return;
10736}
10737
10738
10739
10740
10741
10742
10743// ===================================================================
10744
10745
10746
10747// ===================================================================
10748// FRAGMENT
10749
10750
10751/**
10752 * adjust_dir_fragments -- adjust fragmentation for a directory
10753 *
10754 * @param diri directory inode
10755 * @param basefrag base fragment
10756 * @param bits bit adjustment. positive for split, negative for merge.
10757 */
10758void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10759 list<CDir*>& resultfrags,
10760 list<MDSInternalContextBase*>& waiters,
10761 bool replay)
10762{
10763 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10764 << " on " << *diri << dendl;
10765
10766 list<CDir*> srcfrags;
10767 diri->get_dirfrags_under(basefrag, srcfrags);
10768
10769 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10770}
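// A rough illustration of the bits parameter (values are illustrative only):
// splitting the base fragment by bits=2 replaces it with 2^2 = 4 child
// dirfrags, while a negative value merges every leaf currently under the base
// fragment back into a single dirfrag covering the base fragment itself.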
10771
10772CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10773{
10774 CDir *dir = diri->get_dirfrag(fg);
10775 if (dir)
10776 return dir;
10777
10778 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10779
10780 list<CDir*> src, result;
10781 list<MDSInternalContextBase*> waiters;
10782
10783 // split a parent?
10784 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10785 while (1) {
10786 CDir *pdir = diri->get_dirfrag(parent);
10787 if (pdir) {
10788 int split = fg.bits() - parent.bits();
10789 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10790 src.push_back(pdir);
10791 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10792 dir = diri->get_dirfrag(fg);
10793 if (dir) {
10794 dout(10) << "force_dir_fragment result " << *dir << dendl;
10795 break;
10796 }
10797 }
10798 if (parent == frag_t())
10799 break;
10800 frag_t last = parent;
10801 parent = parent.parent();
10802 dout(10) << " " << last << " parent is " << parent << dendl;
10803 }
10804
10805 if (!dir) {
10806 // hoover up things under fg?
10807 diri->get_dirfrags_under(fg, src);
10808 if (src.empty()) {
10809 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10810 } else {
10811 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10812 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10813 dir = result.front();
10814 dout(10) << "force_dir_fragment result " << *dir << dendl;
10815 }
10816 }
10817 if (!replay)
10818 mds->queue_waiters(waiters);
10819 return dir;
10820}
10821
10822void MDCache::adjust_dir_fragments(CInode *diri,
10823 list<CDir*>& srcfrags,
10824 frag_t basefrag, int bits,
10825 list<CDir*>& resultfrags,
10826 list<MDSInternalContextBase*>& waiters,
10827 bool replay)
10828{
10829 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10830 << " srcfrags " << srcfrags
10831 << " on " << *diri << dendl;
10832
10833 // adjust fragtree
10834 // yuck. we may have discovered the inode while it was being fragmented.
10835 if (!diri->dirfragtree.is_leaf(basefrag))
10836 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10837
10838 if (bits > 0)
10839 diri->dirfragtree.split(basefrag, bits);
10840 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10841
10842 if (srcfrags.empty())
10843 return;
10844
10845 // split
10846 CDir *parent_dir = diri->get_parent_dir();
10847 CDir *parent_subtree = 0;
10848 if (parent_dir)
10849 parent_subtree = get_subtree_root(parent_dir);
10850
10851 if (bits > 0) {
10852 // SPLIT
10853 assert(srcfrags.size() == 1);
10854 CDir *dir = srcfrags.front();
10855
10856 dir->split(bits, resultfrags, waiters, replay);
10857
10858 // did i change the subtree map?
10859 if (dir->is_subtree_root()) {
10860 // new frags are now separate subtrees
10861 for (list<CDir*>::iterator p = resultfrags.begin();
10862 p != resultfrags.end();
10863 ++p)
10864 subtrees[*p].clear(); // new frag is now its own subtree
10865
10866 // was i a bound?
10867 if (parent_subtree) {
10868 assert(subtrees[parent_subtree].count(dir));
10869 subtrees[parent_subtree].erase(dir);
10870 for (list<CDir*>::iterator p = resultfrags.begin();
10871 p != resultfrags.end();
10872 ++p) {
10873 assert((*p)->is_subtree_root());
10874 subtrees[parent_subtree].insert(*p);
10875 }
10876 }
10877
10878 // adjust my bounds.
10879 set<CDir*> bounds;
10880 bounds.swap(subtrees[dir]);
10881 subtrees.erase(dir);
10882 for (set<CDir*>::iterator p = bounds.begin();
10883 p != bounds.end();
10884 ++p) {
10885 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10886 subtrees[frag].insert(*p);
10887 }
10888
10889 show_subtrees(10);
7c673cae
FG
10890 }
10891
10892 diri->close_dirfrag(dir->get_frag());
10893
10894 } else {
10895 // MERGE
10896
10897 // are my constituent bits subtrees? if so, i will be too.
10898 // (it's all or none, actually.)
31f18b77
FG
10899 bool any_subtree = false;
10900 for (CDir *dir : srcfrags) {
7c673cae 10901 if (dir->is_subtree_root()) {
31f18b77
FG
10902 any_subtree = true;
10903 break;
10904 }
10905 }
10906 set<CDir*> new_bounds;
10907 if (any_subtree) {
10908 for (CDir *dir : srcfrags) {
10909 // this simplifies the code that finds subtrees underneath the dirfrag
10910 if (!dir->is_subtree_root()) {
10911 dir->state_set(CDir::STATE_AUXSUBTREE);
10912 adjust_subtree_auth(dir, mds->get_nodeid());
10913 }
10914 }
10915
10916 for (CDir *dir : srcfrags) {
10917 assert(dir->is_subtree_root());
7c673cae 10918 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
10919 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10920 set<CDir*>::iterator r = q->second.begin();
10921 while (r != subtrees[dir].end()) {
10922 new_bounds.insert(*r);
10923 subtrees[dir].erase(r++);
10924 }
10925 subtrees.erase(q);
31f18b77 10926
7c673cae
FG
10927 // remove myself as my parent's bound
10928 if (parent_subtree)
10929 subtrees[parent_subtree].erase(dir);
10930 }
10931 }
10932
10933 // merge
10934 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10935 f->merge(srcfrags, waiters, replay);
7c673cae 10936
31f18b77 10937 if (any_subtree) {
7c673cae
FG
10938 assert(f->is_subtree_root());
10939 subtrees[f].swap(new_bounds);
10940 if (parent_subtree)
10941 subtrees[parent_subtree].insert(f);
10942
10943 show_subtrees(10);
10944 }
10945
10946 resultfrags.push_back(f);
10947 }
10948}
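// Subtree bookkeeping above, in brief: on a split of a subtree root, each
// resulting fragment becomes its own subtree and replaces the old dirfrag as a
// bound of the parent subtree, and the old dirfrag's bounds are redistributed
// to the new roots; on a merge, the constituent fragments' bounds are gathered
// into new_bounds and attached to the merged dirfrag, which takes the old
// fragments' place as a bound of the parent subtree. show_subtrees() further
// down prints the resulting map for debugging.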
10949
10950
10951class C_MDC_FragmentFrozen : public MDSInternalContext {
10952 MDCache *mdcache;
10953 MDRequestRef mdr;
10954public:
10955 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
10956 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
10957 void finish(int r) override {
10958 mdcache->fragment_frozen(mdr, r);
10959 }
10960};
10961
10962bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
10963{
10964 if (is_readonly()) {
10965 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
10966 return false;
10967 }
10968 if (mds->is_cluster_degraded()) {
10969 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
10970 return false;
10971 }
10972 if (diri->get_parent_dir() &&
10973 diri->get_parent_dir()->get_inode()->is_stray()) {
10974 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
10975 return false;
10976 }
10977 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
10978 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
10979 return false;
10980 }
10981
10982 if (diri->scrub_is_in_progress()) {
10983 dout(7) << "can_fragment: scrub in progress" << dendl;
10984 return false;
10985 }
10986
10987 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
10988 CDir *dir = *p;
10989 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
10990 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
10991 return false;
10992 }
10993 if (!dir->is_auth()) {
10994 dout(7) << "can_fragment: not auth on " << *dir << dendl;
10995 return false;
10996 }
10997 if (dir->is_bad()) {
10998 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
10999 return false;
11000 }
11001 if (dir->is_frozen() ||
11002 dir->is_freezing()) {
11003 dout(7) << "can_fragment: can't split|merge, freezing|frozen. wait for other exports to finish first." << dendl;
11004 return false;
11005 }
11006 }
11007
11008 return true;
11009}
11010
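// Fragmentation pipeline, as implemented below: split_dir()/merge_dir() build a
// fragment_info_t and freeze the source dirfrags; fragment_mark_and_complete()
// fetches and pins them; fragment_frozen() -> dispatch_fragment_dir() takes the
// scatterlocks and journals EFragment::OP_PREPARE; _fragment_logged() stores the
// new dirfrags; _fragment_stored() notifies replicas and journals OP_COMMIT;
// _fragment_committed() removes the old dirfrag objects; _fragment_finish()
// journals OP_FINISH and unpins the result.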
11011void MDCache::split_dir(CDir *dir, int bits)
11012{
11013 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11014 assert(dir->is_auth());
11015 CInode *diri = dir->inode;
11016
11017 list<CDir*> dirs;
11018 dirs.push_back(dir);
11019
11020 if (!can_fragment(diri, dirs)) {
11021 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11022 return;
11023 }
11024
31f18b77
FG
11025 if (dir->frag.bits() + bits > 24) {
11026 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11027 return;
11028 }
11029
7c673cae
FG
11030 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11031 mdr->more()->fragment_base = dir->dirfrag();
11032
11033 assert(fragments.count(dir->dirfrag()) == 0);
11034 fragment_info_t& info = fragments[dir->dirfrag()];
11035 info.mdr = mdr;
11036 info.dirs.push_back(dir);
11037 info.bits = bits;
11038 info.last_cum_auth_pins_change = ceph_clock_now();
11039
11040 fragment_freeze_dirs(dirs);
11041 // initial mark+complete pass
11042 fragment_mark_and_complete(mdr);
11043}
11044
11045void MDCache::merge_dir(CInode *diri, frag_t frag)
11046{
11047 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11048
11049 list<CDir*> dirs;
11050 if (!diri->get_dirfrags_under(frag, dirs)) {
11051 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11052 return;
11053 }
11054
11055 if (diri->dirfragtree.is_leaf(frag)) {
11056 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11057 return;
11058 }
11059
11060 if (!can_fragment(diri, dirs))
11061 return;
11062
11063 CDir *first = dirs.front();
11064 int bits = first->get_frag().bits() - frag.bits();
11065 dout(10) << " we are merging by " << bits << " bits" << dendl;
11066
11067 dirfrag_t basedirfrag(diri->ino(), frag);
11068 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11069 mdr->more()->fragment_base = basedirfrag;
11070
11071 assert(fragments.count(basedirfrag) == 0);
11072 fragment_info_t& info = fragments[basedirfrag];
11073 info.mdr = mdr;
11074 info.dirs = dirs;
11075 info.bits = -bits;
11076 info.last_cum_auth_pins_change = ceph_clock_now();
11077
11078 fragment_freeze_dirs(dirs);
11079 // initial mark+complete pass
11080 fragment_mark_and_complete(mdr);
11081}
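// A rough example of the merge arithmetic above (illustrative numbers): if the
// first dirfrag under the target frag carries 2 more bits than the target, then
// bits = 2 and info.bits is recorded as -2; with a uniform fragtree that means
// 2^2 = 4 source dirfrags collapse into the single target fragment.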
11082
11083void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
11084{
11085 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11086 CDir *dir = *p;
11087 dir->auth_pin(dir); // until we mark and complete them
11088 dir->state_set(CDir::STATE_FRAGMENTING);
11089 dir->freeze_dir();
11090 assert(dir->is_freezing_dir());
11091 }
11092}
11093
11094class C_MDC_FragmentMarking : public MDCacheContext {
11095 MDRequestRef mdr;
11096public:
11097 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11098 void finish(int r) override {
11099 mdcache->fragment_mark_and_complete(mdr);
11100 }
11101};
11102
11103void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11104{
11105 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11106 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11107 if (it == fragments.end() || it->second.mdr != mdr) {
11108 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11109 request_finish(mdr);
11110 return;
11111 }
11112
11113 fragment_info_t& info = it->second;
11114 CInode *diri = info.dirs.front()->get_inode();
11115 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11116
11117 MDSGatherBuilder gather(g_ceph_context);
11118
11119 for (list<CDir*>::iterator p = info.dirs.begin();
11120 p != info.dirs.end();
11121 ++p) {
11122 CDir *dir = *p;
11123
11124 bool ready = true;
11125 if (!dir->is_complete()) {
11126 dout(15) << " fetching incomplete " << *dir << dendl;
11127 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11128 ready = false;
11129 } else if (dir->get_frag() == frag_t()) {
11130 // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
11131 // the operation. To avoid CDir::fetch() complaining about a missing object,
11132 // we commit new dirfrag first.
11133 if (dir->state_test(CDir::STATE_CREATING)) {
11134 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11135 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11136 ready = false;
11137 } else if (dir->is_new()) {
11138 dout(15) << " committing new " << *dir << dendl;
11139 assert(dir->is_dirty());
11140 dir->commit(0, gather.new_sub(), true);
11141 ready = false;
11142 }
11143 }
11144 if (!ready)
11145 continue;
11146
11147 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11148 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11149 for (auto &p : dir->items) {
11150 CDentry *dn = p.second;
7c673cae
FG
11151 dn->get(CDentry::PIN_FRAGMENTING);
11152 assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11153 dn->state_set(CDentry::STATE_FRAGMENTING);
11154 }
11155 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11156 dir->auth_unpin(dir);
11157 } else {
11158 dout(15) << " already marked " << *dir << dendl;
11159 }
11160 }
11161 if (gather.has_subs()) {
11162 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11163 gather.activate();
11164 return;
11165 }
11166
11167 for (list<CDir*>::iterator p = info.dirs.begin();
11168 p != info.dirs.end();
11169 ++p) {
11170 CDir *dir = *p;
11171 if (!dir->is_frozen_dir()) {
11172 assert(dir->is_freezing_dir());
11173 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11174 }
11175 }
11176 if (gather.has_subs()) {
11177 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11178 gather.activate();
11179 // flush log so that request auth_pins are retired
11180 mds->mdlog->flush();
11181 return;
11182 }
11183
11184 fragment_frozen(mdr, 0);
11185}
11186
11187void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11188{
11189 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11190 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11191 CDir *dir = *p;
11192 dout(10) << " frag " << *dir << dendl;
11193
11194 assert(dir->state_test(CDir::STATE_FRAGMENTING));
11195 dir->state_clear(CDir::STATE_FRAGMENTING);
11196
11197 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11198 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11199
94b18763
FG
11200 for (auto &p : dir->items) {
11201 CDentry *dn = p.second;
7c673cae
FG
11202 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11203 dn->state_clear(CDentry::STATE_FRAGMENTING);
11204 dn->put(CDentry::PIN_FRAGMENTING);
11205 }
11206 } else {
11207 dir->auth_unpin(dir);
11208 }
11209
11210 dir->unfreeze_dir();
11211 }
11212}
11213
11214bool MDCache::fragment_are_all_frozen(CDir *dir)
11215{
11216 assert(dir->is_frozen_dir());
11217 map<dirfrag_t,fragment_info_t>::iterator p;
11218 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11219 p != fragments.end() && p->first.ino == dir->ino();
11220 ++p) {
11221 if (p->first.frag.contains(dir->get_frag()))
11222 return p->second.all_frozen;
11223 }
11224 ceph_abort();
11225 return false;
11226}
11227
11228void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11229{
11230 map<dirfrag_t,fragment_info_t>::iterator p;
11231 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11232 p != fragments.end() && p->first.ino == dir->ino();
11233 ++p) {
11234 if (p->first.frag.contains(dir->get_frag())) {
11235 p->second.num_remote_waiters++;
11236 return;
11237 }
11238 }
11239 ceph_abort();
11240}
11241
11242void MDCache::find_stale_fragment_freeze()
11243{
11244 dout(10) << "find_stale_fragment_freeze" << dendl;
11245 // see comment in Migrator::find_stale_export_freeze()
11246 utime_t now = ceph_clock_now();
11247 utime_t cutoff = now;
11248 cutoff -= g_conf->mds_freeze_tree_timeout;
11249
11250 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11251 p != fragments.end(); ) {
11252 dirfrag_t df = p->first;
11253 fragment_info_t& info = p->second;
11254 ++p;
11255 if (info.all_frozen)
11256 continue;
11257 CDir *dir;
11258 int total_auth_pins = 0;
11259 for (list<CDir*>::iterator q = info.dirs.begin();
11260 q != info.dirs.end();
11261 ++q) {
11262 dir = *q;
11263 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11264 total_auth_pins = -1;
11265 break;
11266 }
11267 if (dir->is_frozen_dir())
11268 continue;
11269 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11270 }
11271 if (total_auth_pins < 0)
11272 continue;
11273 if (info.last_cum_auth_pins != total_auth_pins) {
11274 info.last_cum_auth_pins = total_auth_pins;
11275 info.last_cum_auth_pins_change = now;
11276 continue;
11277 }
11278 if (info.last_cum_auth_pins_change >= cutoff)
11279 continue;
11280 dir = info.dirs.front();
11281 if (info.num_remote_waiters > 0 ||
11282 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11283 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11284 list<CDir*> dirs;
11285 info.dirs.swap(dirs);
11286 fragments.erase(df);
11287 fragment_unmark_unfreeze_dirs(dirs);
11288 }
11289 }
11290}
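// A freeze is considered stale above when its cumulative auth-pin count has not
// changed for mds_freeze_tree_timeout seconds; it is only cancelled if a remote
// peer is waiting on it or the parent directory is itself freezing, since
// otherwise it is expected to complete on its own.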
11291
11292class C_MDC_FragmentPrep : public MDCacheLogContext {
11293 MDRequestRef mdr;
11294public:
11295 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11296 void finish(int r) override {
11297 mdcache->_fragment_logged(mdr);
11298 }
11299};
11300
11301class C_MDC_FragmentStore : public MDCacheContext {
11302 MDRequestRef mdr;
11303public:
11304 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11305 void finish(int r) override {
11306 mdcache->_fragment_stored(mdr);
11307 }
11308};
11309
11310class C_MDC_FragmentCommit : public MDCacheLogContext {
11311 dirfrag_t basedirfrag;
11312 list<CDir*> resultfrags;
11313public:
11314 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
11315 MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
11316 void finish(int r) override {
11317 mdcache->_fragment_committed(basedirfrag, resultfrags);
11318 }
11319};
11320
11321class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
11322 dirfrag_t basedirfrag;
11323 list<CDir*> resultfrags;
11324public:
11325 C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
11326 MDCacheIOContext(m), basedirfrag(f) {
11327 resultfrags.swap(l);
11328 }
11329 void finish(int r) override {
11330 assert(r == 0 || r == -ENOENT);
11331 mdcache->_fragment_finish(basedirfrag, resultfrags);
11332 }
91327a77
AA
11333 void print(ostream& out) const override {
11334 out << "dirfrags_commit(" << basedirfrag << ")";
11335 }
7c673cae
FG
11336};
11337
11338void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11339{
11340 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11341 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11342 if (it == fragments.end() || it->second.mdr != mdr) {
11343 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11344 request_finish(mdr);
11345 return;
11346 }
11347
11348 assert(r == 0);
11349 fragment_info_t& info = it->second;
11350 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11351 << " on " << info.dirs.front()->get_inode() << dendl;
11352
11353 info.all_frozen = true;
11354 dispatch_fragment_dir(mdr);
11355}
11356
11357void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11358{
11359 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11360 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11361 if (it == fragments.end() || it->second.mdr != mdr) {
11362 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11363 request_finish(mdr);
11364 return;
11365 }
11366
11367 fragment_info_t& info = it->second;
11368 CInode *diri = info.dirs.front()->get_inode();
11369
11370 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11371 << " on " << *diri << dendl;
11372 if (!mdr->aborted) {
11373 set<SimpleLock*> rdlocks, wrlocks, xlocks;
11374 wrlocks.insert(&diri->dirfragtreelock);
11375 // prevent a racing gather on any other scatterlocks too
11376 wrlocks.insert(&diri->nestlock);
11377 wrlocks.insert(&diri->filelock);
11378 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true))
11379 if (!mdr->aborted)
11380 return;
11381 }
11382
11383 if (mdr->aborted) {
11384 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11385 << info.dirs.front()->dirfrag() << dendl;
11386 if (info.bits > 0)
11387 mds->balancer->queue_split(info.dirs.front(), false);
11388 else
11389 mds->balancer->queue_merge(info.dirs.front());
11390 fragment_unmark_unfreeze_dirs(info.dirs);
11391 fragments.erase(it);
11392 request_finish(mdr);
11393 return;
11394 }
11395
11396 mdr->ls = mds->mdlog->get_current_segment();
11397 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11398 mds->mdlog->start_entry(le);
11399
11400 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11401 CDir *dir = *p;
11402 dirfrag_rollback rollback;
11403 rollback.fnode = dir->fnode;
11404 le->add_orig_frag(dir->get_frag(), &rollback);
11405 }
11406
11407 // refragment
11408 list<MDSInternalContextBase*> waiters;
11409 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11410 info.resultfrags, waiters, false);
11411 if (g_conf->mds_debug_frag)
11412 diri->verify_dirfrags();
11413 mds->queue_waiters(waiters);
11414
11415 for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
11416 assert(!diri->dirfragtree.is_leaf(*p));
11417
11418 le->metablob.add_dir_context(*info.resultfrags.begin());
11419 for (list<CDir*>::iterator p = info.resultfrags.begin();
11420 p != info.resultfrags.end();
11421 ++p) {
11422 if (diri->is_auth()) {
11423 le->metablob.add_fragmented_dir(*p, false, false);
11424 } else {
11425 (*p)->state_set(CDir::STATE_DIRTYDFT);
11426 le->metablob.add_fragmented_dir(*p, false, true);
11427 }
11428 }
11429
11430 // dft lock
11431 if (diri->is_auth()) {
11432 // journal dirfragtree
94b18763
FG
11433 auto &pi = diri->project_inode();
11434 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11435 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11436 } else {
11437 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11438 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11439 mdr->add_updated_lock(&diri->dirfragtreelock);
11440 }
11441
11442 /*
11443 // filelock
11444 mds->locker->mark_updated_scatterlock(&diri->filelock);
11445 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11446 mut->add_updated_lock(&diri->filelock);
11447
11448 // dirlock
11449 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11450 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11451 mut->add_updated_lock(&diri->nestlock);
11452 */
11453
11454 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11455 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11456 mdr, __func__);
11457 mds->mdlog->flush();
11458}
11459
11460void MDCache::_fragment_logged(MDRequestRef& mdr)
11461{
11462 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11463 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11464 assert(it != fragments.end());
11465 fragment_info_t &info = it->second;
11466 CInode *diri = info.resultfrags.front()->get_inode();
11467
11468 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11469 << " on " << *diri << dendl;
11470
11471 if (diri->is_auth())
11472 diri->pop_and_dirty_projected_inode(mdr->ls);
11473
11474 mdr->apply(); // mark scatterlock
11475
11476 // store resulting frags
11477 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11478
11479 for (list<CDir*>::iterator p = info.resultfrags.begin();
11480 p != info.resultfrags.end();
11481 ++p) {
11482 CDir *dir = *p;
11483 dout(10) << " storing result frag " << *dir << dendl;
11484
11485 // freeze and store them too
11486 dir->auth_pin(this);
11487 dir->state_set(CDir::STATE_FRAGMENTING);
11488 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11489 }
11490
11491 gather.activate();
11492}
11493
11494void MDCache::_fragment_stored(MDRequestRef& mdr)
11495{
11496 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11497 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11498 assert(it != fragments.end());
11499 fragment_info_t &info = it->second;
11500 CInode *diri = info.resultfrags.front()->get_inode();
11501
11502 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11503 << " on " << *diri << dendl;
11504
11505 // tell peers
11506 CDir *first = *info.resultfrags.begin();
181888fb
FG
11507 for (const auto &p : first->get_replicas()) {
11508 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11509 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11510 rejoin_gather.count(p.first)))
7c673cae
FG
11511 continue;
11512
11513 MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);
11514
11515 // freshly replicate new dirs to peers
11516 for (list<CDir*>::iterator q = info.resultfrags.begin();
11517 q != info.resultfrags.end();
11518 ++q)
181888fb 11519 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11520
181888fb 11521 mds->send_message_mds(notify, p.first);
7c673cae
FG
11522 }
11523
11524 // journal commit
11525 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11526 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
11527 info.resultfrags));
11528
11529 mds->locker->drop_locks(mdr.get());
11530
11531 // unfreeze resulting frags
11532 for (list<CDir*>::iterator p = info.resultfrags.begin();
11533 p != info.resultfrags.end();
11534 ++p) {
11535 CDir *dir = *p;
11536 dout(10) << " result frag " << *dir << dendl;
11537
94b18763
FG
11538 for (auto &p : dir->items) {
11539 CDentry *dn = p.second;
7c673cae
FG
11540 assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11541 dn->state_clear(CDentry::STATE_FRAGMENTING);
11542 dn->put(CDentry::PIN_FRAGMENTING);
11543 }
11544
11545 // unfreeze
11546 dir->unfreeze_dir();
11547 }
11548
11549 fragments.erase(it);
11550 request_finish(mdr);
11551}
11552
11553void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11554{
11555 dout(10) << "fragment_committed " << basedirfrag << dendl;
11556 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11557 assert(it != uncommitted_fragments.end());
11558 ufragment &uf = it->second;
11559
11560 // remove old frags
11561 C_GatherBuilder gather(
11562 g_ceph_context,
11563 new C_OnFinisher(
11564 new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
11565 mds->finisher));
11566
11567 SnapContext nullsnapc;
11568 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11569 for (list<frag_t>::iterator p = uf.old_frags.begin();
11570 p != uf.old_frags.end();
11571 ++p) {
11572 object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
11573 ObjectOperation op;
11574 if (*p == frag_t()) {
11575 // backtrace object
11576 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11577 op.truncate(0);
11578 op.omap_clear();
11579 } else {
11580 dout(10) << " removing orphan dirfrag " << oid << dendl;
11581 op.remove();
11582 }
11583 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11584 ceph::real_clock::now(),
11585 0, gather.new_sub());
11586 }
11587
11588 assert(gather.has_subs());
11589 gather.activate();
11590}
11591
11592void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
11593{
11594 dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
11595 << resultfrags.size() << dendl;
11596 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11597 assert(it != uncommitted_fragments.end());
11598 ufragment &uf = it->second;
11599
11600 // unmark & auth_unpin
11601 for (const auto &dir : resultfrags) {
11602 dir->state_clear(CDir::STATE_FRAGMENTING);
11603 dir->auth_unpin(this);
11604
11605 // In case the resulting fragments are beyond the split size,
11606 // we might need to split them again right away (they could
11607 // have been taking inserts between unfreezing and getting
11608 // here)
11609 mds->balancer->maybe_fragment(dir, false);
11610 }
11611
11612 if (mds->logger) {
11613 if (resultfrags.size() > 1) {
11614 mds->logger->inc(l_mds_dir_split);
11615 } else {
11616 mds->logger->inc(l_mds_dir_merge);
11617 }
11618 }
11619
11620 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
11621 mds->mdlog->start_submit_entry(le);
11622
11623 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11624}
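// Journal lifecycle of a refragmentation, as used above: EFragment::OP_PREPARE
// is logged before the dirfrags are rewritten, OP_COMMIT once the new dirfrags
// have been stored and replicas notified, and OP_FINISH after the old dirfrag
// objects are removed. OP_ROLLBACK (see rollback_uncommitted_fragments() below)
// undoes a prepare that never committed.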
11625
11626/* This function DOES put the passed message before returning */
11627void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
11628{
11629 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
11630
11631 if (mds->get_state() < MDSMap::STATE_REJOIN) {
11632 notify->put();
11633 return;
11634 }
11635
11636 CInode *diri = get_inode(notify->get_ino());
11637 if (diri) {
11638 frag_t base = notify->get_basefrag();
11639 int bits = notify->get_bits();
11640
11641/*
11642 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11643 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11644 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11645 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11646 notify->put();
11647 return;
11648 }
11649*/
11650
11651 // refragment
11652 list<MDSInternalContextBase*> waiters;
11653 list<CDir*> resultfrags;
11654 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11655 if (g_conf->mds_debug_frag)
11656 diri->verify_dirfrags();
11657
11658 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11659 diri->take_dir_waiting((*p)->get_frag(), waiters);
11660
11661 // add new replica dirs values
11662 bufferlist::iterator p = notify->basebl.begin();
11663 while (!p.end())
11664 add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);
11665
11666 mds->queue_waiters(waiters);
11667 } else {
11668 ceph_abort();
11669 }
11670
11671 notify->put();
11672}
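// Note: the basebl payload decoded above is produced by _fragment_stored() on
// the auth MDS, which calls replicate_dir() for each resulting fragment; the
// replica applies the same split/merge locally via adjust_dir_fragments() and
// then picks up fresh replicas of the resulting dirfrags, without journaling
// anything itself.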
11673
11674void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
11675 LogSegment *ls, bufferlist *rollback)
11676{
11677 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11678 assert(!uncommitted_fragments.count(basedirfrag));
11679 ufragment& uf = uncommitted_fragments[basedirfrag];
11680 uf.old_frags = old_frags;
11681 uf.bits = bits;
11682 uf.ls = ls;
11683 ls->uncommitted_fragments.insert(basedirfrag);
11684 if (rollback)
11685 uf.rollback.swap(*rollback);
11686}
11687
11688void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11689{
11690 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11691 << " op " << EFragment::op_name(op) << dendl;
11692 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11693 if (it != uncommitted_fragments.end()) {
11694 ufragment& uf = it->second;
11695 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11696 uf.committed = true;
11697 } else {
11698 uf.ls->uncommitted_fragments.erase(basedirfrag);
11699 mds->queue_waiters(uf.waiters);
11700 uncommitted_fragments.erase(it);
11701 }
11702 }
11703}
11704
11705void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
11706{
11707 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11708 << " old_frags (" << old_frags << ")" << dendl;
11709 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11710 if (it != uncommitted_fragments.end()) {
11711 ufragment& uf = it->second;
11712 if (!uf.old_frags.empty()) {
11713 uf.old_frags.swap(old_frags);
11714 uf.committed = true;
11715 } else {
11716 uf.ls->uncommitted_fragments.erase(basedirfrag);
11717 uncommitted_fragments.erase(it);
11718 }
11719 }
11720}
11721
11722void MDCache::rollback_uncommitted_fragments()
11723{
11724 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11725 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11726 p != uncommitted_fragments.end();
11727 ++p) {
11728 ufragment &uf = p->second;
11729 CInode *diri = get_inode(p->first.ino);
11730 assert(diri);
11731
11732 if (uf.committed) {
11733 list<CDir*> frags;
11734 diri->get_dirfrags_under(p->first.frag, frags);
11735 for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
11736 CDir *dir = *q;
11737 dir->auth_pin(this);
11738 dir->state_set(CDir::STATE_FRAGMENTING);
11739 }
11740 _fragment_committed(p->first, frags);
11741 continue;
11742 }
11743
11744 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11745
11746 LogSegment *ls = mds->mdlog->get_current_segment();
11747 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11748 mds->mdlog->start_entry(le);
11749 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11750
11751 list<frag_t> old_frags;
11752 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11753
11754 list<CDir*> resultfrags;
11755 if (uf.old_frags.empty()) {
11756 // created by old format EFragment
11757 list<MDSInternalContextBase*> waiters;
11758 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11759 } else {
11760 bufferlist::iterator bp = uf.rollback.begin();
11761 for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
11762 CDir *dir = force_dir_fragment(diri, *q);
11763 resultfrags.push_back(dir);
11764
11765 dirfrag_rollback rollback;
11766 ::decode(rollback, bp);
11767
11768 dir->set_version(rollback.fnode.version);
11769 dir->fnode = rollback.fnode;
11770
11771 dir->_mark_dirty(ls);
11772
11773 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11774 dout(10) << " dirty nestinfo on " << *dir << dendl;
11775 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11776 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11777 }
11778 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11779 dout(10) << " dirty fragstat on " << *dir << dendl;
11780 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11781 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11782 }
11783
11784 le->add_orig_frag(dir->get_frag());
11785 le->metablob.add_dir_context(dir);
11786 if (diri_auth) {
11787 le->metablob.add_fragmented_dir(dir, true, false);
11788 } else {
11789 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11790 dir->state_set(CDir::STATE_DIRTYDFT);
11791 le->metablob.add_fragmented_dir(dir, true, true);
11792 }
11793 }
11794 }
11795
11796 if (diri_auth) {
94b18763
FG
11797 auto &pi = diri->project_inode();
11798 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11799 diri->pop_and_dirty_projected_inode(ls); // hacky
11800 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11801 } else {
11802 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11803 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11804 }
11805
11806 if (g_conf->mds_debug_frag)
11807 diri->verify_dirfrags();
11808
11809 for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
11810 assert(!diri->dirfragtree.is_leaf(*q));
11811
11812 for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
11813 CDir *dir = *q;
11814 dir->auth_pin(this);
11815 dir->state_set(CDir::STATE_FRAGMENTING);
11816 }
11817
11818 mds->mdlog->submit_entry(le);
11819
11820 uf.old_frags.swap(old_frags);
11821 _fragment_committed(p->first, resultfrags);
11822 }
11823}
11824
11825void MDCache::force_readonly()
11826{
11827 if (is_readonly())
11828 return;
11829
11830 dout(1) << "force file system read-only" << dendl;
11831 mds->clog->warn() << "force file system read-only";
11832
11833 set_readonly();
11834
11835 mds->server->force_clients_readonly();
11836
11837 // revoke write caps
94b18763 11838 for (auto &p : inode_map) {
b32b8144 11839 CInode *in = p.second;
7c673cae
FG
11840 if (in->is_head())
11841 mds->locker->eval(in, CEPH_CAP_LOCKS);
11842 }
11843
11844 mds->mdlog->flush();
11845}
11846
11847
11848// ==============================================================
11849// debug crap
11850
11851void MDCache::show_subtrees(int dbl)
11852{
11853 if (g_conf->mds_thrash_exports)
11854 dbl += 15;
11855
11856 //dout(10) << "show_subtrees" << dendl;
11857
11858 if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
11859 return; // i won't print anything.
11860
11861 if (subtrees.empty()) {
11862 dout(dbl) << "show_subtrees - no subtrees" << dendl;
11863 return;
11864 }
11865
11866 // root frags
11867 list<CDir*> basefrags;
11868 for (set<CInode*>::iterator p = base_inodes.begin();
11869 p != base_inodes.end();
11870 ++p)
11871 (*p)->get_dirfrags(basefrags);
11872 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
11873 dout(15) << "show_subtrees" << dendl;
11874
11875 // queue stuff
11876 list<pair<CDir*,int> > q;
11877 string indent;
11878 set<CDir*> seen;
11879
11880 // calc max depth
11881 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11882 q.push_back(pair<CDir*,int>(*p, 0));
11883
11884 set<CDir*> subtrees_seen;
11885
11886 int depth = 0;
11887 while (!q.empty()) {
11888 CDir *dir = q.front().first;
11889 int d = q.front().second;
11890 q.pop_front();
11891
11892 if (subtrees.count(dir) == 0) continue;
11893
11894 subtrees_seen.insert(dir);
11895
11896 if (d > depth) depth = d;
11897
11898 // sanity check
11899 //dout(25) << "saw depth " << d << " " << *dir << dendl;
11900 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11901 assert(seen.count(dir) == 0);
11902 seen.insert(dir);
11903
11904 // nested items?
11905 if (!subtrees[dir].empty()) {
11906 for (set<CDir*>::iterator p = subtrees[dir].begin();
11907 p != subtrees[dir].end();
11908 ++p) {
11909 //dout(25) << " saw sub " << **p << dendl;
11910 q.push_front(pair<CDir*,int>(*p, d+1));
11911 }
11912 }
11913 }
11914
11915
11916 // print tree
11917 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
11918 q.push_back(pair<CDir*,int>(*p, 0));
11919
11920 while (!q.empty()) {
11921 CDir *dir = q.front().first;
11922 int d = q.front().second;
11923 q.pop_front();
11924
11925 if (subtrees.count(dir) == 0) continue;
11926
11927 // adjust indenter
11928 while ((unsigned)d < indent.size())
11929 indent.resize(d);
11930
11931 // pad
11932 string pad = "______________________________________";
11933 pad.resize(depth*2+1-indent.size());
11934 if (!subtrees[dir].empty())
11935 pad[0] = '.'; // parent
11936
11937
11938 string auth;
11939 if (dir->is_auth())
11940 auth = "auth ";
11941 else
11942 auth = " rep ";
11943
11944 char s[10];
11945 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
11946 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
11947 else
11948 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
11949
11950 // print
11951 dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;
11952
11953 if (dir->ino() == MDS_INO_ROOT)
11954 assert(dir->inode == root);
11955 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11956 assert(dir->inode == myin);
11957 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11958 assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
11959
11960 // nested items?
11961 if (!subtrees[dir].empty()) {
11962 // more at my level?
11963 if (!q.empty() && q.front().second == d)
11964 indent += "| ";
11965 else
11966 indent += " ";
11967
11968 for (set<CDir*>::iterator p = subtrees[dir].begin();
11969 p != subtrees[dir].end();
11970 ++p)
11971 q.push_front(pair<CDir*,int>(*p, d+2));
11972 }
11973 }
11974
11975 // verify there isn't stray crap in subtree map
11976 int lost = 0;
11977 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
11978 p != subtrees.end();
11979 ++p) {
11980 if (subtrees_seen.count(p->first)) continue;
11981 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
11982 lost++;
11983 }
11984 assert(lost == 0);
11985}
11986
7c673cae
FG
11987void MDCache::show_cache()
11988{
11989 dout(7) << "show_cache" << dendl;
b32b8144
FG
11990
11991 auto show_func = [this](CInode *in) {
7c673cae 11992 // unlinked?
b32b8144
FG
11993 if (!in->parent)
11994 dout(7) << " unlinked " << *in << dendl;
11995
7c673cae
FG
11996 // dirfrags?
11997 list<CDir*> dfs;
b32b8144 11998 in->get_dirfrags(dfs);
7c673cae
FG
11999 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12000 CDir *dir = *p;
12001 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 12002
94b18763
FG
12003 for (auto &p : dir->items) {
12004 CDentry *dn = p.second;
7c673cae
FG
12005 dout(7) << " dentry " << *dn << dendl;
12006 CDentry::linkage_t *dnl = dn->get_linkage();
12007 if (dnl->is_primary() && dnl->get_inode())
12008 dout(7) << " inode " << *dnl->get_inode() << dendl;
12009 }
12010 }
b32b8144
FG
12011 };
12012
94b18763 12013 for (auto &p : inode_map)
b32b8144 12014 show_func(p.second);
94b18763 12015 for (auto &p : snap_inode_map)
b32b8144 12016 show_func(p.second);
7c673cae
FG
12017}
12018
f64942e4 12019void MDCache::cache_status(Formatter *f)
181888fb
FG
12020{
12021 f->open_object_section("cache");
12022
12023 f->open_object_section("pool");
12024 mempool::get_pool(mempool::mds_co::id).dump(f);
12025 f->close_section();
12026
12027 f->close_section();
181888fb
FG
12028}
12029
94b18763 12030int MDCache::dump_cache(boost::string_view file_name)
7c673cae 12031{
94b18763 12032 return dump_cache(file_name, NULL);
7c673cae
FG
12033}
12034
31f18b77 12035int MDCache::dump_cache(Formatter *f)
7c673cae 12036{
94b18763 12037 return dump_cache(boost::string_view(""), f);
7c673cae
FG
12038}
12039
94b18763 12040int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
7c673cae 12041{
94b18763 12042 return dump_cache(boost::string_view(""), f, dump_root, depth);
7c673cae
FG
12043}
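// The overloads above all funnel into the full dump_cache() below: with no file
// name it writes a plain-text "cachedump.<epoch>.mds<rank>" file, and with a
// Formatter it emits structured output (presumably the path taken by the MDS
// admin-socket cache dump command).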
12044
12045/**
12046 * Dump the metadata cache, either to a Formatter (if
12047 * provided) or to a plain-text file.
12048 */
94b18763
FG
12049int MDCache::dump_cache(boost::string_view fn, Formatter *f,
12050 boost::string_view dump_root, int depth)
7c673cae
FG
12051{
12052 int r = 0;
f64942e4
AA
12053
12054  // Dumping a large cache may cause the MDS to hang or, worse, get killed,
12055  // so disallow the dump if the cache size exceeds the configured
12056  // threshold: 1G by default for a formatter dump, unlimited for a file
12057  // dump. The admin can raise these thresholds, but that is foot-shooting;
12058  // the options are meant for developers and are dangerous to tune.
12059  // TODO: remove this when fixed.
12060 uint64_t threshold = f ?
12061 g_conf->get_val<uint64_t>("mds_dump_cache_threshold_formatter") :
12062 g_conf->get_val<uint64_t>("mds_dump_cache_threshold_file");
12063
12064 if (threshold && cache_size() > threshold) {
12065 if (f) {
12066 std::stringstream ss;
12067 ss << "cache usage exceeds dump threshold";
12068 f->open_object_section("result");
12069 f->dump_string("error", ss.str());
12070 f->close_section();
12071 } else {
12072 derr << "cache usage exceeds dump threshold" << dendl;
12073 r = -EINVAL;
12074 }
12075 return r;
12076 }
12077
12078 r = 0;
7c673cae
FG
12079 int fd = -1;
12080
12081 if (f) {
12082 f->open_array_section("inodes");
12083 } else {
94b18763
FG
12084 char path[PATH_MAX] = "";
12085 if (fn.length()) {
12086       snprintf(path, sizeof path, "%.*s", int(fn.length()), fn.data()); // fn need not be NUL-terminated
12087 } else {
12088 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
12089 }
12090
94b18763 12091 dout(1) << "dump_cache to " << path << dendl;
7c673cae 12092
91327a77 12093 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
7c673cae 12094 if (fd < 0) {
94b18763 12095 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 12096       return -errno;
7c673cae
FG
12097 }
12098 }
12099
b32b8144
FG
12100 auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
12101 int r;
7c673cae
FG
12102 if (!dump_root.empty()) {
12103 string ipath;
12104 if (in->is_root())
12105 ipath = "/";
12106 else
12107 in->make_path_string(ipath);
12108
12109 if (dump_root.length() > ipath.length() ||
12110 !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
b32b8144 12111 return 0;
7c673cae
FG
12112
12113 if (depth >= 0 &&
12114 count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
b32b8144 12115 return 0;
7c673cae
FG
12116 }
12117
12118 if (f) {
12119 f->open_object_section("inode");
12120 in->dump(f);
12121 } else {
12122 ostringstream ss;
12123 ss << *in << std::endl;
12124 std::string s = ss.str();
12125 r = safe_write(fd, s.c_str(), s.length());
b32b8144
FG
12126 if (r < 0)
12127 return r;
7c673cae
FG
12128 }
12129
12130 list<CDir*> dfs;
12131 in->get_dirfrags(dfs);
12132 if (f) {
12133 f->open_array_section("dirfrags");
12134 }
12135 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12136 CDir *dir = *p;
12137 if (f) {
12138 f->open_object_section("dir");
12139 dir->dump(f);
12140 } else {
12141 ostringstream tt;
12142 tt << " " << *dir << std::endl;
12143 string t = tt.str();
12144 r = safe_write(fd, t.c_str(), t.length());
b32b8144
FG
12145 if (r < 0)
12146 return r;
7c673cae
FG
12147 }
12148
12149 if (f) {
12150 f->open_array_section("dentries");
12151 }
94b18763
FG
12152 for (auto &p : dir->items) {
12153 CDentry *dn = p.second;
7c673cae
FG
12154 if (f) {
12155 f->open_object_section("dentry");
12156 dn->dump(f);
12157 f->close_section();
12158 } else {
12159 ostringstream uu;
12160 uu << " " << *dn << std::endl;
12161 string u = uu.str();
12162 r = safe_write(fd, u.c_str(), u.length());
b32b8144
FG
12163 if (r < 0)
12164 return r;
7c673cae
FG
12165 }
12166 }
12167 if (f) {
12168 f->close_section(); //dentries
12169 }
12170 dir->check_rstats();
12171 if (f) {
12172 f->close_section(); //dir
12173 }
12174 }
12175 if (f) {
12176 f->close_section(); // dirfrags
12177 }
12178
12179 if (f) {
12180 f->close_section(); // inode
12181 }
b32b8144
FG
12182 return 1;
12183 };
12184
94b18763 12185 for (auto &p : inode_map) {
b32b8144
FG
12186 r = dump_func(p.second);
12187 if (r < 0)
12188 goto out;
12189 }
94b18763 12190 for (auto &p : snap_inode_map) {
b32b8144
FG
12191 r = dump_func(p.second);
12192 if (r < 0)
12193 goto out;
7c673cae 12194 }
b32b8144 12195 r = 0;
7c673cae
FG
12196
12197 out:
12198 if (f) {
12199 f->close_section(); // inodes
12200 } else {
12201 ::close(fd);
12202 }
31f18b77 12203 return r;
7c673cae
FG
12204}
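// A minimal caller sketch for the formatter path of dump_cache() (assumptions:
// `mdcache` is a valid MDCache*, the caller holds the appropriate MDS lock as
// the admin-socket handlers do, and JSONFormatter/flush() are the generic
// ceph Formatter helpers). When the cache exceeds the configured threshold the
// formatter receives a small error object instead of the inode dump and the
// return value stays 0; only the plain-file path returns an error in that case.
#if 0
static int example_dump_cache_json(MDCache *mdcache, std::ostream& out)
{
  JSONFormatter f(true);            // pretty-printed output
  int r = mdcache->dump_cache(&f);  // walk inode_map/snap_inode_map into f
  if (r == 0)
    f.flush(out);
  return r;
}
#endif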
12205
12206
12207
12208C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12209 : MDSInternalContext(c->mds), cache(c), mdr(r)
12210{}
12211
12212void C_MDS_RetryRequest::finish(int r)
12213{
12214 mdr->retry++;
12215 cache->dispatch_request(mdr);
12216}
12217
12218
12219class C_MDS_EnqueueScrub : public Context
12220{
12221 Formatter *formatter;
12222 Context *on_finish;
12223public:
12224 ScrubHeaderRef header;
12225 C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
12226 formatter(f), on_finish(fin), header(nullptr) {}
12227
12228 Context *take_finisher() {
12229 Context *fin = on_finish;
12230 on_finish = NULL;
12231 return fin;
12232 }
12233
12234 void finish(int r) override {
12235 if (r < 0) { // we failed the lookup or something; dump ourselves
12236 formatter->open_object_section("results");
12237 formatter->dump_int("return_code", r);
12238 formatter->close_section(); // results
12239 }
12240 if (on_finish)
12241 on_finish->complete(r);
12242 }
12243};
12244
12245void MDCache::enqueue_scrub(
94b18763
FG
12246 boost::string_view path,
12247 boost::string_view tag,
7c673cae
FG
12248 bool force, bool recursive, bool repair,
12249 Formatter *f, Context *fin)
12250{
12251   dout(10) << __func__ << " " << path << dendl;
12252 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
94b18763 12253 filepath fp(path);
7c673cae
FG
12254 mdr->set_filepath(fp);
12255
12256 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
12257 cs->header = std::make_shared<ScrubHeader>(
12258 tag, force, recursive, repair, f);
12259
12260 mdr->internal_op_finish = cs;
12261 enqueue_scrub_work(mdr);
1adf2230
AA
12262
12263   // Since a recursive scrub is asynchronous, dump only minimal output
12264   // here so that CLI tools are not confused by partial results.
12265 if (recursive) {
12266 f->open_object_section("results");
12267 f->close_section(); // results
12268 }
7c673cae
FG
12269}
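// A hedged call-site sketch for enqueue_scrub() (illustrative only; the path,
// tag and cleanup policy here are made up). Note, per the code above, that for
// a recursive scrub the finisher fires once the request has been enqueued and
// answered, while only the non-recursive path hands the finisher to the scrub
// stack via take_finisher(), so it fires after the scrub itself completes.
#if 0
  Formatter *f = new JSONFormatter(true);
  mdcache->enqueue_scrub("/some/path", "nightly-tag",
                         false /* force */, true /* recursive */,
                         false /* repair */, f,
                         new FunctionContext([f](int r) {
                           // r is the completion code of the enqueue request
                           delete f;
                         }));
#endif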
12270
12271void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12272{
12273 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12274 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12275 if (NULL == in)
12276 return;
12277
12278 // TODO: Remove this restriction
12279 assert(in->is_auth());
12280
12281 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12282 if (!locked)
12283 return;
12284
12285 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12286 ScrubHeaderRef &header = cs->header;
12287
12288 // Cannot scrub same dentry twice at same time
12289 if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
12290 mds->server->respond_to_request(mdr, -EBUSY);
12291 return;
12292 } else {
12293 in->scrub_info();
12294 }
12295
12296 header->set_origin(in);
12297
b32b8144
FG
12298 Context *fin = nullptr;
12299 if (!header->get_recursive()) {
12300 fin = cs->take_finisher();
12301 }
12302
12303   // If the scrub did some repair, then flush the journal at the end of
12304   // the scrub. Otherwise, after e.g. rewriting a backtrace, the on-disk
12305   // state would still look damaged.
28e407b8
AA
12306 auto scrub_finish = new FunctionContext([this, header, fin](int r){
12307 if (!header->get_repaired()) {
12308 if (fin)
12309 fin->complete(r);
12310 return;
12311 }
12312
12313 auto flush_finish = new FunctionContext([this, fin](int r){
12314 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12315 mds->mdlog->trim_all();
12316
12317 if (fin) {
12318 MDSGatherBuilder gather(g_ceph_context);
12319 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12320 for (auto logseg : expiring_segments)
12321 logseg->wait_for_expiry(gather.new_sub());
12322 assert(gather.has_subs());
12323 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12324 gather.activate();
b32b8144 12325 }
28e407b8
AA
12326 });
12327
12328 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12329 mds->mdlog->start_new_segment();
12330 mds->mdlog->flush();
12331 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12332 });
12333
7c673cae 12334 if (!header->get_recursive()) {
7c673cae 12335 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12336 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12337 } else {
12338 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12339 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12340 }
7c673cae
FG
12341
12342 mds->server->respond_to_request(mdr, 0);
12343 return;
12344}
12345
12346struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
12347 MDRequestRef mdr;
12348 C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
12349 MDCacheLogContext(c), mdr(m) {}
12350 void finish(int r) override {
12351 mdr->apply();
12352 get_mds()->server->respond_to_request(mdr, r);
12353 }
12354};
12355
12356void MDCache::repair_dirfrag_stats(CDir *dir)
12357{
12358 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12359 mdr->pin(dir);
12360 mdr->internal_op_private = dir;
12361 mdr->internal_op_finish = new C_MDSInternalNoop;
12362 repair_dirfrag_stats_work(mdr);
12363}
12364
12365void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12366{
12367 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12368 dout(10) << __func__ << " " << *dir << dendl;
12369
12370 if (!dir->is_auth()) {
12371 mds->server->respond_to_request(mdr, -ESTALE);
12372 return;
12373 }
12374
12375 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12376 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12377
7c673cae
FG
12378 mds->locker->drop_locks(mdr.get());
12379 mdr->drop_local_auth_pins();
224ce89b
WB
12380 if (!mdr->remote_auth_pins.empty())
12381 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12382 return;
12383 }
12384
12385 mdr->auth_pin(dir);
12386
12387 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12388 CInode *diri = dir->inode;
12389 rdlocks.insert(&diri->dirfragtreelock);
12390 wrlocks.insert(&diri->nestlock);
12391 wrlocks.insert(&diri->filelock);
12392 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12393 return;
12394
12395 if (!dir->is_complete()) {
12396 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12397 return;
12398 }
12399
12400 frag_info_t frag_info;
12401 nest_info_t nest_info;
94b18763 12402 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12403 CDentry *dn = it->second;
12404 if (dn->last != CEPH_NOSNAP)
12405 continue;
12406 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12407 if (dnl->is_primary()) {
12408 CInode *in = dnl->get_inode();
12409 nest_info.add(in->get_projected_inode()->accounted_rstat);
12410 if (in->is_dir())
12411 frag_info.nsubdirs++;
12412 else
12413 frag_info.nfiles++;
12414 } else if (dnl->is_remote())
12415 frag_info.nfiles++;
12416 }
12417
12418 fnode_t *pf = dir->get_projected_fnode();
12419 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12420 bool good_rstat = nest_info.same_sums(pf->rstat);
12421 if (good_fragstat && good_rstat) {
12422 dout(10) << __func__ << " no corruption found" << dendl;
12423 mds->server->respond_to_request(mdr, 0);
12424 return;
12425 }
12426
12427 pf = dir->project_fnode();
12428 pf->version = dir->pre_dirty();
12429 mdr->add_projected_fnode(dir);
12430
12431 mdr->ls = mds->mdlog->get_current_segment();
12432 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12433 mds->mdlog->start_entry(le);
12434
12435 if (!good_fragstat) {
12436 if (pf->fragstat.mtime > frag_info.mtime)
12437 frag_info.mtime = pf->fragstat.mtime;
12438 if (pf->fragstat.change_attr > frag_info.change_attr)
12439 frag_info.change_attr = pf->fragstat.change_attr;
12440 pf->fragstat = frag_info;
12441 mds->locker->mark_updated_scatterlock(&diri->filelock);
12442 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12443 mdr->add_updated_lock(&diri->filelock);
12444 }
12445
12446 if (!good_rstat) {
12447 if (pf->rstat.rctime > nest_info.rctime)
12448 nest_info.rctime = pf->rstat.rctime;
12449 pf->rstat = nest_info;
12450 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12451 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12452 mdr->add_updated_lock(&diri->nestlock);
12453 }
12454
12455 le->metablob.add_dir_context(dir);
12456 le->metablob.add_dir(dir, true);
12457
12458 mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
12459}
12460
12461void MDCache::repair_inode_stats(CInode *diri)
12462{
12463 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12464 mdr->pin(diri);
12465 mdr->internal_op_private = diri;
12466 mdr->internal_op_finish = new C_MDSInternalNoop;
12467 repair_inode_stats_work(mdr);
12468}
12469
12470void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12471{
12472 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12473 dout(10) << __func__ << " " << *diri << dendl;
12474
12475 if (!diri->is_auth()) {
12476 mds->server->respond_to_request(mdr, -ESTALE);
12477 return;
12478 }
12479 if (!diri->is_dir()) {
12480 mds->server->respond_to_request(mdr, -ENOTDIR);
12481 return;
12482 }
12483
12484 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12485 std::list<frag_t> frags;
12486
12487 if (mdr->ls) // already marked filelock/nestlock dirty ?
12488 goto do_rdlocks;
12489
12490 rdlocks.insert(&diri->dirfragtreelock);
12491 wrlocks.insert(&diri->nestlock);
12492 wrlocks.insert(&diri->filelock);
12493 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12494 return;
12495
12496   // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12497 // the scatter-gather process, which will fix any fragstat/rstat errors.
12498 diri->dirfragtree.get_leaves(frags);
12499 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12500 CDir *dir = diri->get_dirfrag(*p);
12501 if (!dir) {
12502 assert(mdr->is_auth_pinned(diri));
12503 dir = diri->get_or_open_dirfrag(this, *p);
12504 }
12505 if (dir->get_version() == 0) {
12506 assert(dir->is_auth());
12507 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12508 return;
12509 }
12510 }
12511
12512 diri->state_set(CInode::STATE_REPAIRSTATS);
12513 mdr->ls = mds->mdlog->get_current_segment();
12514 mds->locker->mark_updated_scatterlock(&diri->filelock);
12515 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12516 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12517 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12518
12519 mds->locker->drop_locks(mdr.get());
12520
12521do_rdlocks:
12522 // force the scatter-gather process
12523 rdlocks.insert(&diri->dirfragtreelock);
12524 rdlocks.insert(&diri->nestlock);
12525 rdlocks.insert(&diri->filelock);
12526 wrlocks.clear();
12527 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
12528 return;
12529
12530 diri->state_clear(CInode::STATE_REPAIRSTATS);
12531
12532 frag_info_t dir_info;
12533 nest_info_t nest_info;
12534 nest_info.rsubdirs++; // it gets one to account for self
12535
12536 diri->dirfragtree.get_leaves(frags);
12537 for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
12538 CDir *dir = diri->get_dirfrag(*p);
12539 assert(dir);
12540 assert(dir->get_version() > 0);
12541 dir_info.add(dir->fnode.accounted_fragstat);
12542 nest_info.add(dir->fnode.accounted_rstat);
12543 }
12544
12545 if (!dir_info.same_sums(diri->inode.dirstat) ||
12546 !nest_info.same_sums(diri->inode.rstat)) {
12547 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12548 << *diri << dendl;
12549 }
12550
12551 mds->server->respond_to_request(mdr, 0);
12552}
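// Design note on repair_inode_stats_work() (a summary of the flow above, not
// additional behaviour): the repair never writes corrected values into the
// inode directly. Pass 1 wrlocks filelock/nestlock, fetches every dirfrag and
// marks both scatterlocks dirty; pass 2 (do_rdlocks) rdlocks the same locks,
// which forces a scatter-gather cycle that folds each frag's
// accounted_fragstat/accounted_rstat back into the inode's dirstat/rstat. The
// final same_sums() check merely logs, at debug level 10, if the stats still
// disagree afterwards.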
12553
94b18763 12554void MDCache::flush_dentry(boost::string_view path, Context *fin)
7c673cae
FG
12555{
12556 if (is_readonly()) {
12557 dout(10) << __func__ << ": read-only FS" << dendl;
12558 fin->complete(-EROFS);
12559 return;
12560 }
12561 dout(10) << "flush_dentry " << path << dendl;
12562 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 12563 filepath fp(path);
7c673cae
FG
12564 mdr->set_filepath(fp);
12565 mdr->internal_op_finish = fin;
12566 flush_dentry_work(mdr);
12567}
12568
12569class C_FinishIOMDR : public MDSInternalContextBase {
12570protected:
12571 MDSRank *mds;
12572 MDRequestRef mdr;
12573 MDSRank *get_mds() override { return mds; }
12574public:
12575 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12576 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12577};
12578
12579void MDCache::flush_dentry_work(MDRequestRef& mdr)
12580{
12581 set<SimpleLock*> rdlocks, wrlocks, xlocks;
12582 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
12583 if (NULL == in)
12584 return;
12585
12586 // TODO: Is this necessary? Fix it if so
12587 assert(in->is_auth());
12588 bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
12589 if (!locked)
12590 return;
12591 in->flush(new C_FinishIOMDR(mds, mdr));
12592}
12593
12594
12595/**
12596 * Create the cache's performance counters and register them with the
12597 * global perfcounter collection.
12598 */
12599void MDCache::register_perfcounters()
12600{
91327a77
AA
12601 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
12602
12603 // Stray/purge statistics
12604 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
12605 PerfCountersBuilder::PRIO_INTERESTING);
12606 pcb.add_u64(l_mdc_num_recovering_enqueued,
12607 "num_recovering_enqueued", "Files waiting for recovery", "recy",
12608 PerfCountersBuilder::PRIO_INTERESTING);
12609 pcb.add_u64_counter(l_mdc_recovery_completed,
12610 "recovery_completed", "File recoveries completed", "recd",
12611 PerfCountersBuilder::PRIO_INTERESTING);
12612
12613 // useful recovery queue statistics
12614 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
12615 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
12616 "Files currently being recovered");
12617 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
12618 "Files waiting for recovery with elevated priority");
12619 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
12620 "File recoveries started");
12621
12622 // along with other stray dentries stats
12623 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
12624 "Stray dentries delayed");
12625 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
12626 "Stray dentries enqueuing for purge");
12627 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
12628 "Stray dentries created");
7c673cae 12629 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
91327a77
AA
12630 "Stray dentries enqueued for purge");
12631 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
12632 "Stray dentries reintegrated");
12633 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
12634 "Stray dentries migrated");
7c673cae 12635
91327a77 12636 // low prio internal request stats
d2e6a577 12637 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
91327a77 12638 "Internal Request type enqueue scrub");
d2e6a577 12639 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
91327a77 12640 "Internal Request type export dir");
d2e6a577 12641 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
91327a77 12642 "Internal Request type flush");
d2e6a577 12643 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
91327a77 12644 "Internal Request type fragmentdir");
d2e6a577 12645 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
91327a77 12646 "Internal Request type frag stats");
d2e6a577 12647 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
91327a77 12648 "Internal Request type inode stats");
d2e6a577 12649
7c673cae
FG
12650 logger.reset(pcb.create_perf_counters());
12651 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12652 recovery_queue.set_logger(logger.get());
12653 stray_manager.set_logger(logger.get());
12654}
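// A small usage sketch (illustrative; `n_strays` is a hypothetical local):
// gauges registered with add_u64() are updated with PerfCounters::set(), while
// counters registered with add_u64_counter() are bumped with ::inc().
#if 0
  logger->inc(l_mdc_strays_created);        // monotonically increasing counter
  logger->set(l_mdc_num_strays, n_strays);  // point-in-time gauge
#endif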
12655
12656void MDCache::activate_stray_manager()
12657{
12658 if (open) {
12659 stray_manager.activate();
12660 } else {
12661 wait_for_open(
12662 new MDSInternalContextWrapper(mds,
12663 new FunctionContext([this](int r){
12664 stray_manager.activate();
12665 })
12666 )
12667 );
12668 }
12669}
12670
12671/**
12672 * Call this when putting references to an inode/dentry or
12673 * when attempting to trim it.
12674 *
12675 * If this inode is no longer linked by anyone, and this MDS
12676 * rank holds the primary dentry, and that dentry is in a stray
12677 * directory, then give up the dentry to the StrayManager, never
12678 * to be seen again by MDCache.
12679 *
12680 * @param delay if true, then purgeable inodes are stashed until
12681 * the next trim(), rather than being purged right
12682 * away.
12683 */
12684void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
12685 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12686 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12687 return;
224ce89b 12688
7c673cae
FG
12689 CDentry *dn = in->get_projected_parent_dn();
12690
12691 if (dn->state_test(CDentry::STATE_PURGING)) {
12692 /* We have already entered the purging process, no need
12693 * to re-evaluate me ! */
12694 return;
12695 }
12696
12697 if (dn->get_projected_linkage()->is_primary() &&
12698 dn->get_dir()->get_inode()->is_stray()) {
12699 stray_manager.eval_stray(dn, delay);
12700 }
12701}
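// A hedged call-site sketch: per the comment above, this is called when a
// reference to an inode/dentry is put or when trimming; passing delay=true
// defers any purge to the next trim() pass instead of purging immediately.
#if 0
  maybe_eval_stray(in, true /* stash purgeable inodes until the next trim() */);
#endif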
12702
31f18b77
FG
12703void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12704 dout(10) << __func__ << " " << *diri << dendl;
12705 assert(diri->get_projected_parent_dir()->inode->is_stray());
12706 list<CDir*> ls;
12707 diri->get_dirfrags(ls);
94b18763 12708 for (auto &p : ls) {
31f18b77
FG
12709 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12710 p->try_remove_dentries_for_stray();
12711 }
12712 if (!diri->snaprealm) {
12713 if (diri->is_auth())
12714 diri->clear_dirty_rstat();
12715 diri->clear_scatter_dirty();
12716 }
12717}
12718