// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <errno.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <string_view>
#include <map>

#include "MDCache.h"
#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDLog.h"
#include "MDBalancer.h"
#include "Migrator.h"
#include "ScrubStack.h"

#include "SnapClient.h"

#include "MDSMap.h"

#include "CInode.h"
#include "CDir.h"

#include "Mutation.h"

#include "include/ceph_fs.h"
#include "include/filepath.h"
#include "include/util.h"

#include "messages/MClientCaps.h"

#include "msg/Message.h"
#include "msg/Messenger.h"

#include "common/MemoryModel.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/safe_io.h"

#include "osdc/Journaler.h"
#include "osdc/Filer.h"

#include "events/ESubtreeMap.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/EImportFinish.h"
#include "events/EFragment.h"
#include "events/ECommitted.h"
#include "events/ESessions.h"

#include "InoTable.h"

#include "common/Timer.h"

#include "perfglue/heap_profiler.h"


#include "common/config.h"
#include "include/ceph_assert.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, mds)
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}

set<int> SimpleLock::empty_gather_set;


/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSContext {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};


/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};

class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};

MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  filer(m->objecter, m->finisher),
  recovery_queue(m),
  stray_manager(m, purge_queue_),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")),
  open_file_table(m)
{
  migrator.reset(new Migrator(mds, this));

  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
    (g_conf()->mds_dir_max_commit_size << 20) :
    (0.9 *(g_conf()->osd_max_write_size << 20));

  cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

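  // Upkeep thread: roughly every mds_cache_trim_interval it takes mds_lock
  // (dropping upkeep_mutex first to respect the mds_lock -> upkeep_mutex
  // ordering), trims client leases and the cache, checks memory usage and
  // asks the Server to recall client state; roughly every
  // mds_cache_release_free_interval it releases free heap memory.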
  upkeeper = std::thread([this]() {
    std::unique_lock lock(upkeep_mutex);
    while (!upkeep_trim_shutdown.load()) {
      auto now = clock::now();
      auto since = now-upkeep_last_trim;
      auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
      if (since >= trim_interval*.90) {
        lock.unlock(); /* mds_lock -> upkeep_mutex */
        std::scoped_lock mds_lock(mds->mds_lock);
        lock.lock();
        if (upkeep_trim_shutdown.load())
          return;
        if (mds->is_cache_trimmable()) {
          dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
          trim_client_leases();
          trim();
          check_memory_usage();
          auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          mds->server->recall_client_state(nullptr, flags);
          upkeep_last_trim = clock::now();
          upkeep_last_trim = now = clock::now();
        } else {
          dout(10) << "cache not ready for trimming" << dendl;
        }
      } else {
        trim_interval -= since;
      }
      since = now-upkeep_last_release;
      auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
      if (since >= release_interval) {
        /* XXX not necessary once MDCache uses PriorityCache */
        dout(10) << "releasing free memory" << dendl;
        ceph_heap_release_free_memory();
        upkeep_last_release = clock::now();
      } else {
        release_interval -= since;
      }
      auto interval = std::min(release_interval, trim_interval);
      dout(20) << "upkeep thread waiting interval " << interval << dendl;
      upkeep_cvar.wait_for(lock, interval);
    }
  });
}

MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  if (upkeeper.joinable())
    upkeeper.join();
}

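// Refresh the cached cache-sizing tunables when their config options change,
// then pass the change set on to the migrator and balancer.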
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
  if (changed.count("mds_cache_size"))
    cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
  if (changed.count("mds_cache_memory_limit"))
    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  if (changed.count("mds_cache_reservation"))
    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  if (changed.count("mds_health_cache_threshold"))
    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  if (changed.count("mds_cache_mid"))
    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
  if (changed.count("mds_cache_trim_decay_rate")) {
    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
  }

  migrator->handle_conf_change(changed, mdsmap);
  mds->balancer->handle_conf_change(changed, mdsmap);
}

void MDCache::log_stat()
{
  mds->logger->set(l_mds_inode_max, cache_inode_limit ? : INT_MAX);
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
  if (root) {
    mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles);
    mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes);
    mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps);
  }
}


//

bool MDCache::shutdown()
{
  {
    std::scoped_lock lock(upkeep_mutex);
    upkeep_trim_shutdown = true;
    upkeep_cvar.notify_one();
  }
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
    //show_cache();
    show_subtrees();
    //dump();
  }
  return true;
}


// ====================================================================
// some inode functions

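// Head inodes (last == CEPH_NOSNAP) are indexed by ino in inode_map; snapped
// inodes are indexed by vinodeno_t in snap_inode_map. Base inodes (root,
// mdsdir) and this rank's stray inodes also get dedicated pointers.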
void MDCache::add_inode(CInode *in)
{
  // add to lru, inode map
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  }

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == MDS_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }

  if (cache_toofull()) {
    exceeded_size_limit = true;
  }
}

void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn);   // leave dentry ... FIXME?
  }

  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  // remove from inode map
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}

file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_first_data_pool();
  return result;
}

file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_metadata_pool();
  if (g_conf()->mds_log_segment_size > 0) {
    result.object_size = g_conf()->mds_log_segment_size;
    result.stripe_unit = g_conf()->mds_log_segment_size;
  }
  return result;
}

void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}

void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;
  in->inode.size = 0;
  in->inode.ctime =
    in->inode.mtime =
    in->inode.btime = ceph_clock_now();
  in->inode.nlink = 1;
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    in->inode.rstat.rsubdirs = 1; /* itself */
    in->inode.rstat.rctime = in->inode.ctime;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}

CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}

CInode *MDCache::create_root_inode()
{
  CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
  i->inode.uid = g_conf()->mds_root_ino_uid;
  i->inode.gid = g_conf()->mds_root_ino_gid;
  i->inode.layout = default_file_layout;
  i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
  return i;
}

void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
  ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
  ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
  /* Do not update rootdir rstat information of the fragment; rstat upkeep magic
   * assumes version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->mark_clean();
  root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->flush(gather->new_sub());
}

void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}

struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;
  CDentry *dn;
  version_t dpv;
  MDSContext *fin;
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};

void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}

void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->inode.version--;
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}



struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention. Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};

void MDCache::open_root_inode(MDSContext *c)
{
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    CInode *in;
    in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
    in->fetch(c);
  } else {
    discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
  }
}

void MDCache::open_mydir_inode(MDSContext *c)
{
  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
  in->fetch(c);
}

void MDCache::open_mydir_frag(MDSContext *c)
{
  open_mydir_inode(
    new MDSInternalContextWrapper(mds,
      new FunctionContext([this, c](int r) {
        if (r < 0) {
          c->complete(r);
          return;
        }
        CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
        ceph_assert(mydir);
        adjust_subtree_auth(mydir, mds->get_nodeid());
        mydir->fetch(c);
      })
    )
  );
}

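// open_root() is a retry-driven sequence: each missing piece (root inode,
// root dirfrag, mdsdir inode) is fetched or discovered asynchronously with
// C_MDS_RetryOpenRoot as the completion, which simply re-enters open_root()
// until everything is in cache and populate_mydir() can run.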
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}

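// populate_mydir() brings this rank's ~mdsN directory into cache: it fetches
// (or recreates) the mydir dirfrag, opens or creates the NUM_STRAY stray
// directories and all of their dirfrags, and counts the strays before
// activating the StrayManager. Like open_root(), it may be re-entered
// several times via C_MDS_RetryOpenRoot before it completes.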
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it. Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}

void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}

CDir *MDCache::get_stray_dir(CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CInode *strayi = get_stray();
  ceph_assert(strayi);
  frag_t fg = strayi->pick_dirfrag(straydname);
  CDir *straydir = strayi->get_dirfrag(fg);
  ceph_assert(straydir);
  return straydir;
}

CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
{
  CDir *straydir = get_stray_dir(in);
  string straydname;
  in->name_stray_dentry(straydname);
  CDentry *straydn = straydir->lookup(straydname);
  if (!straydn) {
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
  } else {
    ceph_assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  return straydn;
}



MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}




// ====================================================================
// subtree management

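// The 'subtrees' map records each subtree root dirfrag together with the set
// of dirfrags that bound it (the roots of the subtrees nested immediately
// beneath it). The routines below keep that map consistent as authority is
// delegated, merged back, and moved around by renames and migrations.
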
/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  show_subtrees();

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}


void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  // record my old bounds
  auto oldbounds = subtrees.at(dir);

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}

class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};

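// Collapse 'dir' back into its parent subtree when that is safe: its auth
// must be unambiguous, it must not be an export bound or auxiliary subtree,
// and the parent subtree must have the same dir_auth. On a merge the bounds
// of 'dir' are handed to the parent and, if requested, auth-subtree
// popularity is folded back up the tree.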
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                           // we have a parent,
      parent->dir_auth == dir->dir_auth) {       // auth matches,
    // merge with parent.
    dout(10) << "  subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(dir->pop_auth_subtree);
        p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
        cur = p;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}

void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->auth_unpin(this);
}

void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  //  (we should scatter the filelock on subtree bounds)
  ceph_assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}


void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << "  new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      }
      else {
        dout(10) << "  want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          ceph_assert(t != dir);
          dout(10) << "  new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << "  swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << "  already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << "  swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}


/*
 * return a set of CDir*'s that correspond to the given bound set.  Only adjust
 * fragmentation as necessary to get an equivalent bounding set.  That is, only
 * split if one of our frags spans the provided bounding set.  Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        frag_vec_t approx_leaves;
        tmpdft.get_leaves_under(approx_fg, approx_leaves);
        for (const auto& leaf : approx_leaves) {
          if (p->second.get().count(leaf) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, leaf);
            all = false;
          }
        }
        if (all)
          leaves.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, leaves);
      }
      dout(10) << " frag " << fg << " contains " << leaves << dendl;
      for (const auto& leaf : leaves) {
        CDir *dir = diri->get_dirfrag(leaf);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}

void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}

void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) << "map_dirfrag_set " << dfs << dendl;

  // group by inode
  map<inodeno_t, fragset_t> ino_fragset;
  for (const auto &df : dfs) {
    ino_fragset[df.ino].insert(df.frag);
  }

  // get frags
  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
       p != ino_fragset.end();
       ++p) {
    CInode *in = get_inode(p->first);
    if (!in)
      continue;

    frag_vec_t fgs;
    for (const auto& fg : p->second) {
      in->dirfragtree.get_leaves_under(fg, fgs);
    }

    dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
             << " on " << *in << dendl;

    for (const auto& fg : fgs) {
      CDir *dir = in->get_dirfrag(fg);
      if (dir)
        result.insert(dir);
    }
  }
}



CDir *MDCache::get_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_parent_dir();
    if (!dir)
      return 0;  // none
  }
}

CDir *MDCache::get_projected_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_projected_parent_dir();
    if (!dir)
      return 0;  // none
  }
}

void MDCache::remove_subtree(CDir *dir)
{
  dout(10) << "remove_subtree " << *dir << dendl;
  ceph_assert(subtrees.count(dir));
  ceph_assert(subtrees[dir].empty());
  subtrees.erase(dir);
  dir->put(CDir::PIN_SUBTREE);
  if (dir->get_parent_dir()) {
    CDir *p = get_subtree_root(dir->get_parent_dir());
    ceph_assert(subtrees[p].count(dir));
    subtrees[p].erase(dir);
  }
}

void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  ceph_assert(subtrees.count(dir));
  bounds = subtrees[dir];
}

void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  if (subtrees.count(dir)) {
    // just copy them, dir is a subtree.
    get_subtree_bounds(dir, bounds);
  } else {
    // find them
    CDir *root = get_subtree_root(dir);
    for (set<CDir*>::iterator p = subtrees[root].begin();
         p != subtrees[root].end();
         ++p) {
      CDir *t = *p;
      while (t != root) {
        t = t->get_parent_dir();
        ceph_assert(t);
        if (t == dir) {
          bounds.insert(*p);
          continue;
        }
      }
    }
  }
}

void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
{
  // for debugging only.
  ceph_assert(subtrees.count(dir));
  if (bounds != subtrees[dir]) {
    dout(0) << "verify_subtree_bounds failed" << dendl;
    set<CDir*> b = bounds;
    for (auto &cd : subtrees[dir]) {
      if (bounds.count(cd)) {
        b.erase(cd);
        continue;
      }
      dout(0) << "  missing bound " << *cd << dendl;
    }
    for (const auto &cd : b)
      dout(0) << "    extra bound " << *cd << dendl;
  }
  ceph_assert(bounds == subtrees[dir]);
}

void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
{
  // for debugging only.
  ceph_assert(subtrees.count(dir));

  // make sure that any bounds i do have are properly noted as such.
  int failed = 0;
  for (const auto &fg : bounds) {
    CDir *bd = get_dirfrag(fg);
    if (!bd) continue;
    if (subtrees[dir].count(bd) == 0) {
      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
      failed++;
    }
  }
  ceph_assert(failed == 0);
}

void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
{
  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
           << " to " << *newdir << dendl;
  projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
}

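// After a rename moves 'diri', re-home its dirfrags in the subtree map:
// subtree-root dirfrags move from the old parent subtree to the new one (and
// may merge), bounds that now hang off the new parent are transferred, and
// popularity is shifted via MDBalancer::adjust_pop_for_rename(). The 'pop'
// flag consumes the entry queued earlier by project_subtree_rename().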
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    ceph_assert(p != projected_subtree_renames.end());
    ceph_assert(!p->second.empty());
    ceph_assert(p->second.front().first == olddir);
    ceph_assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  vector<CDir*> dfls;

  // adjust total auth pin of freezing subtree
  if (olddir != newdir) {
    diri->get_nested_dirfrags(dfls);
    for (auto dir : dfls)
      olddir->adjust_freeze_after_rename(dir);
    dfls.clear();
  }

  // adjust subtree
  // make sure subtree dirfrags are at the front of the list
  diri->get_subtree_dirfrags(dfls);
  diri->get_nested_dirfrags(dfls);
  for (auto dir : dfls) {
    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, false);

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine. change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      ceph_assert(subtrees[oldparent].count(dir));
      subtrees[oldparent].erase(dir);
      ceph_assert(subtrees.count(newparent));
      subtrees[newparent].insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      list<CDir*> tomove;
      for (set<CDir*>::iterator p = subtrees[oldparent].begin();
           p != subtrees[oldparent].end();
           ++p) {
        CDir *bound = *p;
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          ceph_assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
        CDir *bound = *p;
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        subtrees[oldparent].erase(bound);
        subtrees[newparent].insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority(), false);
        // caller is responsible for 'eval diri'
        try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
  }

  show_subtrees();
}

// ===================================
// journal and snap/cow helpers


/*
 * find first inode in cache that follows given snapid.  otherwise, return current.
 */
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
  ceph_assert(in->last == CEPH_NOSNAP);

  auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
  if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
    dout(10) << "pick_inode_snap found " << *p->second << dendl;
    in = p->second;
  }

  return in;
}


/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items.  instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply().  that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
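// cow_inode() clones 'in' into a new CInode covering [in->first, last] so
// that the head inode can move on to last+1. For head inodes with writable
// client caps it also records which clients owe a snapflush and holds the
// affected locks on the clone in LOCK_SNAP_SYNC until those flushes arrive.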
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  ceph_assert(last >= in->first);

  CInode *oldin = new CInode(this, true, in->first, last);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->symlink = in->symlink;
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          if (lock->get_state() != LOCK_SNAP_SYNC) {
            ceph_assert(lock->is_stable());
            lock->set_state(LOCK_SNAP_SYNC);  // gathering
            oldin->auth_pin(lock);
          }
          lock->get_wrlock(true);
        }
      }
    }
    if (!ret.second) {
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
        MDSContext::vec finished;
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
          lock->put_wrlock();
          if (!lock->get_num_wrlocks()) {
            lock->set_state(LOCK_SYNC);
            lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
            in->auth_unpin(lock);
          }
        }
        mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
          cap->client_follows < last) {
        dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
        oldin->client_snap_caps.insert(client);
        cap->client_follows = last;

        // we need snapflushes for any intervening snaps
        dout(10) << "  snaps " << snaps << dendl;
        for (auto q = snaps.lower_bound(oldin->first);
             q != snaps.end() && *q <= last;
             ++q) {
          in->add_need_snapflush(oldin, *q, client);
        }
      } else {
        dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      for (int i = 0; i < num_cinode_locks; i++) {
        SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
        ceph_assert(lock);
        if (lock->get_state() != LOCK_SNAP_SYNC) {
          ceph_assert(lock->is_stable());
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          oldin->auth_pin(lock);
        }
        lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}

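// journal_cow_dentry() preserves the pre-snapshot state of a dentry (and,
// for a primary link, its inode) before it is mutated: multiversion inodes
// are cloned in place via cow_old_inode(), otherwise an old dentry covering
// [first, follows] is added and journalled. If no snapshot actually covers
// that range, the first marker is simply advanced and nothing is cloned.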
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          olddn->pre_dirty();
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here?  hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, cow_head);
        }
      }

      follows = dir_follows;
      if (in->snaprealm) {
        realm = in->snaprealm;
        ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
        follows = get_global_snaprealm()->get_newest_seq();
        ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << "    dn " << *dn << dendl;
    if (in) {
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
        mut->ls->open_files.push_back(&oldin->item_open_file);
        mds->locker->mark_need_snapflush_inode(oldin);
      }
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}


void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
                                CInode *in, snapid_t follows,
                                CInode **pcow_inode)
{
  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
  CDentry *dn = in->get_projected_parent_dn();
  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
}

void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  if (in->is_base()) {
    metablob->add_root(true, in);
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows = in->first - 1;
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
                        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}



// nested ---------------------------------------------------------------

1737void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1738 int linkunlink, SnapRealm *prealm)
1739{
1740 CDentry *parentdn = cur->get_projected_parent_dn();
94b18763 1741 CInode::mempool_inode *curi = cur->get_projected_inode();
7c673cae
FG
1742
1743 if (cur->first > first)
1744 first = cur->first;
1745
1746 dout(10) << "project_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1747 << " " << *cur << dendl;
1748 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1749 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1750
1751 /*
1752 * FIXME. this incompletely propagates rstats to _old_ parents
1753 * (i.e. shortly after a directory rename). but we need full
1754 * blown hard link backpointers to make this work properly...
1755 */
1756 snapid_t floor = parentdn->first;
1757 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1758
1759 if (!prealm)
1760 prealm = parent->inode->find_snaprealm();
1761 const set<snapid_t> snaps = prealm->get_snaps();
1762
1763 if (cur->last != CEPH_NOSNAP) {
11fdf7f2
TL
1764 ceph_assert(cur->dirty_old_rstats.empty());
1765 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
7c673cae
FG
1766 if (q == snaps.end() || *q > cur->last)
1767 return;
1768 }
1769
1770 if (cur->last >= floor) {
1771 bool update = true;
1772 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1773 // rename src inode is not projected in the slave rename prep case. so we should
1774 // avoid updating the inode.
11fdf7f2
TL
1775 ceph_assert(linkunlink < 0);
1776 ceph_assert(cur->is_frozen_inode());
7c673cae
FG
1777 update = false;
1778 }
11fdf7f2 1779 _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
7c673cae
FG
1780 linkunlink, update);
1781 }
1782
11fdf7f2 1783 if (g_conf()->mds_snap_rstat) {
94b18763
FG
1784 for (const auto &p : cur->dirty_old_rstats) {
1785 auto &old = cur->old_inodes[p];
1786 snapid_t ofirst = std::max(old.first, floor);
1787 auto it = snaps.lower_bound(ofirst);
1788 if (it == snaps.end() || *it > p)
7c673cae 1789 continue;
94b18763
FG
1790 if (p >= floor)
1791 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
7c673cae
FG
1792 }
1793 }
1794 cur->dirty_old_rstats.clear();
1795}
1796
1797
94b18763 1798void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
1799 CDir *parent, int linkunlink, bool update_inode)
1800{
1801 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1802 dout(20) << " inode rstat " << inode.rstat << dendl;
1803 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1804 nest_info_t delta;
1805 if (linkunlink == 0) {
1806 delta.add(inode.rstat);
1807 delta.sub(inode.accounted_rstat);
1808 } else if (linkunlink < 0) {
1809 delta.sub(inode.accounted_rstat);
1810 } else {
1811 delta.add(inode.rstat);
1812 }
1813 dout(20) << " delta " << delta << dendl;
1814
1815 if (update_inode)
1816 inode.accounted_rstat = inode.rstat;
1817
1818 while (last >= ofirst) {
1819 /*
1820 * pick fnode version to update. at each iteration, we want to
1821 * pick a segment ending in 'last' to update. split as necessary
1822 * to make that work. then, adjust first up so that we only
1823 * update one segment at a time. then loop to cover the whole
1824 * [ofirst,last] interval.
1825 */
1826 nest_info_t *prstat;
1827 snapid_t first;
1828 fnode_t *pf = parent->get_projected_fnode();
1829 if (last == CEPH_NOSNAP) {
11fdf7f2
TL
1830 if (g_conf()->mds_snap_rstat)
1831 first = std::max(ofirst, parent->first);
7c673cae
FG
1832 else
1833 first = parent->first;
1834 prstat = &pf->rstat;
1835 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1836
1837 if (first > parent->first &&
1838 !(pf->rstat == pf->accounted_rstat)) {
1839 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1840 << parent->first << "," << (first-1) << "] "
1841 << " " << *prstat << "/" << pf->accounted_rstat
1842 << dendl;
1843 parent->dirty_old_rstat[first-1].first = parent->first;
1844 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1845 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1846 }
1847 parent->first = first;
11fdf7f2 1848 } else if (!g_conf()->mds_snap_rstat) {
7c673cae
FG
1849 // drop snapshots' rstats
1850 break;
1851 } else if (last >= parent->first) {
1852 first = parent->first;
1853 parent->dirty_old_rstat[last].first = first;
1854 parent->dirty_old_rstat[last].rstat = pf->rstat;
1855 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1856 prstat = &parent->dirty_old_rstat[last].rstat;
1857 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1858 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1859 } else {
1860 // be careful, dirty_old_rstat is a _sparse_ map.
1861 // sorry, this is ugly.
1862 first = ofirst;
1863
1864 // find any intersection with last
94b18763
FG
1865 auto it = parent->dirty_old_rstat.lower_bound(last);
1866 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1867 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1868 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1869 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1870 first = parent->dirty_old_rstat.rbegin()->first+1;
1871 }
1872 } else {
94b18763
FG
1873 // *it last is >= last
1874 if (it->second.first <= last) {
1875 // *it intersects [first,last]
1876 if (it->second.first < first) {
1877 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1878 parent->dirty_old_rstat[first-1] = it->second;
1879 it->second.first = first;
7c673cae 1880 }
94b18763
FG
1881 if (it->second.first > first)
1882 first = it->second.first;
1883 if (last < it->first) {
1884 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1885 parent->dirty_old_rstat[last] = it->second;
1886 it->second.first = last+1;
7c673cae
FG
1887 }
1888 } else {
94b18763
FG
1889 // *it is to the _right_ of [first,last]
1890 it = parent->dirty_old_rstat.lower_bound(first);
1891 // new *it last is >= first
1892 if (it->second.first <= last && // new *it isn't also to the right, and
1893 it->first >= first) { // it intersects our first bit,
1894 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1895 first = it->first+1;
7c673cae
FG
1896 }
1897 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1898 }
1899 }
1900 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1901 parent->dirty_old_rstat[last].first = first;
1902 prstat = &parent->dirty_old_rstat[last].rstat;
1903 }
1904
1905 // apply
1906 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
11fdf7f2 1907 ceph_assert(last >= first);
7c673cae
FG
1908 prstat->add(delta);
1909 if (update_inode)
1910 inode.accounted_rstat = inode.rstat;
1911 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1912
1913 last = first-1;
1914 }
1915}
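
// Illustrative sketch (not part of MDCache.cc): the accounting identity the code
// above maintains for each [first,last] segment. Only the delta between rstat and
// accounted_rstat is pushed into the parent, so repeated propagation never
// double-counts. 'Nest' and the function below are invented stand-ins for
// nest_info_t and the real projection logic.
#include <cstdint>

struct Nest {
  int64_t rbytes = 0, rfiles = 0;
  void add(const Nest& o) { rbytes += o.rbytes; rfiles += o.rfiles; }
  void sub(const Nest& o) { rbytes -= o.rbytes; rfiles -= o.rfiles; }
};

// Push an inode's unaccounted rstat change into its parent dirfrag's rstat.
static void propagate(Nest& inode_rstat, Nest& inode_accounted, Nest& frag_rstat) {
  Nest delta = inode_rstat;       // what the inode currently claims
  delta.sub(inode_accounted);     // minus what the parent has already absorbed
  frag_rstat.add(delta);          // parent takes only the difference
  inode_accounted = inode_rstat;  // remember what has been pushed
}
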
1916
1917void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1918 snapid_t ofirst, snapid_t last,
1919 CInode *pin, bool cow_head)
1920{
1921 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1922 dout(20) << " frag rstat " << rstat << dendl;
1923 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1924 nest_info_t delta = rstat;
1925 delta.sub(accounted_rstat);
1926 dout(20) << " delta " << delta << dendl;
1927
1928 while (last >= ofirst) {
94b18763 1929 CInode::mempool_inode *pi;
7c673cae
FG
1930 snapid_t first;
1931 if (last == pin->last) {
1932 pi = pin->get_projected_inode();
11fdf7f2 1933 first = std::max(ofirst, pin->first);
7c673cae 1934 if (first > pin->first) {
94b18763 1935 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1936 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1937 }
1938 } else {
1939 if (last >= pin->first) {
1940 first = pin->first;
1941 pin->cow_old_inode(last, cow_head);
1942 } else {
1943 // our life is easier here because old_inodes is not sparse
1944 // (although it may not begin at snapid 1)
94b18763
FG
1945 auto it = pin->old_inodes.lower_bound(last);
1946 if (it == pin->old_inodes.end()) {
7c673cae
FG
1947 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1948 break;
1949 }
94b18763 1950 first = it->second.first;
7c673cae 1951 if (first > last) {
94b18763 1952 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1953 //assert(p == pin->old_inodes.begin());
1954 break;
1955 }
94b18763
FG
1956 if (it->first > last) {
1957 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1958 << (last+1) << "," << it->first << "]" << dendl;
1959 pin->old_inodes[last] = it->second;
1960 it->second.first = last+1;
1961 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1962 }
1963 }
1964 if (first < ofirst) {
1965 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1966 << first << "," << ofirst-1 << "]" << dendl;
1967 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1968 pin->dirty_old_rstats.insert(ofirst-1);
1969 pin->old_inodes[last].first = first = ofirst;
1970 }
1971 pi = &pin->old_inodes[last].inode;
1972 pin->dirty_old_rstats.insert(last);
1973 }
1974 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1975 pi->rstat.add(delta);
1976 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1977
1978 last = first-1;
1979 }
1980}
1981
a8e16298 1982void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
7c673cae 1983{
11fdf7f2
TL
1984 if (!(mds->is_active() || mds->is_stopping()))
1985 return;
1986
7c673cae
FG
1987 if (!in->is_auth() || in->is_frozen())
1988 return;
1989
94b18763 1990 auto i = in->get_projected_inode();
a8e16298
TL
1991
1992 if (!i->quota.is_enable() &&
1993 !quota_change)
7c673cae
FG
1994 return;
1995
11fdf7f2
TL
1996 // create snaprealm for quota inode (quota was set before mimic)
1997 if (!in->get_projected_srnode())
1998 mds->server->create_quota_realm(in);
7c673cae 1999
11fdf7f2
TL
2000 for (auto &p : in->client_caps) {
2001 Capability *cap = &p.second;
2002 if (cap->is_noquota())
2003 continue;
28e407b8 2004
11fdf7f2 2005 if (exclude_ct >= 0 && exclude_ct != p.first)
28e407b8
AA
2006 goto update;
2007
7c673cae
FG
2008 if (cap->last_rbytes == i->rstat.rbytes &&
2009 cap->last_rsize == i->rstat.rsize())
2010 continue;
2011
2012 if (i->quota.max_files > 0) {
2013 if (i->rstat.rsize() >= i->quota.max_files)
2014 goto update;
2015
2016 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2017 abs(cap->last_rsize - i->rstat.rsize()))
2018 goto update;
2019 }
2020
2021 if (i->quota.max_bytes > 0) {
2022 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2023 goto update;
2024
2025 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2026 abs(cap->last_rbytes - i->rstat.rbytes))
2027 goto update;
2028 }
2029
2030 continue;
2031
2032update:
2033 cap->last_rsize = i->rstat.rsize();
2034 cap->last_rbytes = i->rstat.rbytes;
2035
11fdf7f2 2036 auto msg = MClientQuota::create();
7c673cae
FG
2037 msg->ino = in->ino();
2038 msg->rstat = i->rstat;
2039 msg->quota = i->quota;
11fdf7f2 2040 mds->send_message_client_counted(msg, cap->get_session());
7c673cae 2041 }
181888fb 2042 for (const auto &it : in->get_replicas()) {
11fdf7f2 2043 auto msg = MGatherCaps::create();
7c673cae 2044 msg->ino = in->ino();
181888fb 2045 mds->send_message_mds(msg, it.first);
7c673cae
FG
2046 }
2047}
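
// Illustrative sketch (not part of MDCache.cc): the gist of the throttling rule used
// above when deciding whether a client needs a fresh quota/rstat message. A client is
// re-notified when usage reaches the limit, or when usage has moved by at least 1/16
// of the headroom that existed at the last report (the real byte check also fires
// once usage comes within 1/8 of the limit). All names below are invented.
#include <cstdint>

static bool should_report(int64_t last_reported, int64_t current, int64_t limit) {
  if (limit <= 0)
    return false;                               // no quota configured on this inode
  if (current >= limit)
    return true;                                // at or over the limit: always report
  auto iabs = [](int64_t v) { return v < 0 ? -v : v; };
  int64_t headroom_then = iabs(last_reported - limit);
  int64_t moved_since   = iabs(last_reported - current);
  return (headroom_then >> 4) < moved_since;    // moved >= 1/16 of the old headroom
}
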
2048
2049/*
2050 * NOTE: we _have_ to delay the scatter if we are called during a
2051 * rejoin, because we can't twiddle locks between when the
2052 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2053 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2054 * (no requests), and a survivor acks immediately. _except_ that
2055 * during rejoin_(weak|strong) processing, we may complete a lock
2056 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2057 * scatterlock state in that case or the lock states will get out of
2058 * sync between the auth and replica.
2059 *
2060 * the simple solution is to never do the scatter here. instead, put
2061 * the scatterlock on a list if it isn't already wrlockable. this is
2062 * probably the best plan anyway, since we avoid too many
2063 * scatters/locks under normal usage.
2064 */
2065/*
2066 * some notes on dirlock/nestlock scatterlock semantics:
2067 *
2068 * the fragstat (dirlock) will never be updated without
2069 * dirlock+nestlock wrlock held by the caller.
2070 *
2071 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2072 * data is pushed up the tree. this could be changed with some
2073 * restructuring here, but in its current form we ensure that the
2074 * fragstat+rstat _always_ reflect an accurate summation over the dir
2075 * frag, which is nice. and, we only need to track frags that need to
2076 * be nudged (and not inodes with pending rstat changes that need to
2077 * be pushed into the frag). a consequence of this is that the
2078 * accounted_rstat on scatterlock sync may not match our current
2079 * rstat. this is normal and expected.
2080 */
2081void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2082 CInode *in, CDir *parent,
2083 int flags, int linkunlink,
2084 snapid_t cfollows)
2085{
2086 bool primary_dn = flags & PREDIRTY_PRIMARY;
2087 bool do_parent_mtime = flags & PREDIRTY_DIR;
2088 bool shallow = flags & PREDIRTY_SHALLOW;
2089
11fdf7f2 2090 ceph_assert(mds->mdlog->entry_is_open());
7c673cae
FG
2091
2092 // make sure stamp is set
2093 if (mut->get_mds_stamp() == utime_t())
2094 mut->set_mds_stamp(ceph_clock_now());
2095
2096 if (in->is_base())
2097 return;
2098
2099 dout(10) << "predirty_journal_parents"
2100 << (do_parent_mtime ? " do_parent_mtime":"")
2101 << " linkunlink=" << linkunlink
2102 << (primary_dn ? " primary_dn":" remote_dn")
2103 << (shallow ? " SHALLOW":"")
2104 << " follows " << cfollows
2105 << " " << *in << dendl;
2106
2107 if (!parent) {
11fdf7f2 2108 ceph_assert(primary_dn);
7c673cae
FG
2109 parent = in->get_projected_parent_dn()->get_dir();
2110 }
2111
2112 if (flags == 0 && linkunlink == 0) {
2113 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2114 blob->add_dir_context(parent);
2115 return;
2116 }
2117
2118 // build list of inodes to wrlock, dirty, and update
2119 list<CInode*> lsi;
2120 CInode *cur = in;
2121 CDentry *parentdn = NULL;
2122 bool first = true;
2123 while (parent) {
2124 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
11fdf7f2 2125 ceph_assert(parent->is_auth());
7c673cae
FG
2126
2127 // opportunistically adjust parent dirfrag
2128 CInode *pin = parent->get_inode();
2129
2130 // inode -> dirfrag
2131 mut->auth_pin(parent);
2132 mut->add_projected_fnode(parent);
2133
2134 fnode_t *pf = parent->project_fnode();
2135 pf->version = parent->pre_dirty();
2136
2137 if (do_parent_mtime || linkunlink) {
11fdf7f2
TL
2138 ceph_assert(mut->is_wrlocked(&pin->filelock));
2139 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2140 ceph_assert(cfollows == CEPH_NOSNAP);
7c673cae
FG
2141
2142 // update stale fragstat/rstat?
2143 parent->resync_accounted_fragstat();
2144 parent->resync_accounted_rstat();
2145
2146 if (do_parent_mtime) {
2147 pf->fragstat.mtime = mut->get_op_stamp();
2148 pf->fragstat.change_attr++;
2149 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2150 if (pf->fragstat.mtime > pf->rstat.rctime) {
2151 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2152 pf->rstat.rctime = pf->fragstat.mtime;
2153 } else {
2154 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2155 }
2156 }
2157 if (linkunlink) {
2158 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2159 if (in->is_dir()) {
2160 pf->fragstat.nsubdirs += linkunlink;
2161 //pf->rstat.rsubdirs += linkunlink;
2162 } else {
2163 pf->fragstat.nfiles += linkunlink;
2164 //pf->rstat.rfiles += linkunlink;
2165 }
2166 }
2167 }
2168
2169 // rstat
2170 if (!primary_dn) {
2171 // don't update parent this pass
2172 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2173 pin->versionlock.can_wrlock())) {
2174 dout(20) << " unwritable parent nestlock " << pin->nestlock
2175 << ", marking dirty rstat on " << *cur << dendl;
2176 cur->mark_dirty_rstat();
2177 } else {
2178 // if we don't hold a wrlock reference on this nestlock, take one,
2179 // because we are about to write into the dirfrag fnode and that needs
2180 // to commit before the lock can cycle.
2181 if (linkunlink) {
11fdf7f2 2182 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
7c673cae
FG
2183 }
2184
11fdf7f2 2185 if (!mut->is_wrlocked(&pin->nestlock)) {
7c673cae
FG
2186 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2187 mds->locker->wrlock_force(&pin->nestlock, mut);
2188 }
2189
2190 // now we can project the inode rstat diff the dirfrag
2191 SnapRealm *prealm = pin->find_snaprealm();
2192
2193 snapid_t follows = cfollows;
2194 if (follows == CEPH_NOSNAP)
2195 follows = prealm->get_newest_seq();
2196
2197 snapid_t first = follows+1;
2198
2199 // first, if the frag is stale, bring it back in sync.
2200 parent->resync_accounted_rstat();
2201
2202 // now push inode rstats into frag
2203 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2204 cur->clear_dirty_rstat();
2205 }
2206
2207 bool stop = false;
2208 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2209 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2210 stop = true;
2211 }
2212
2213 // delay propagating until later?
2214 if (!stop && !first &&
11fdf7f2 2215 g_conf()->mds_dirstat_min_interval > 0) {
7c673cae 2216 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
11fdf7f2 2217 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
7c673cae 2218 dout(10) << "predirty_journal_parents last prop " << since_last_prop
11fdf7f2 2219 << " < " << g_conf()->mds_dirstat_min_interval
7c673cae
FG
2220 << ", stopping" << dendl;
2221 stop = true;
2222 } else {
2223 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2224 }
2225 }
2226
2227 // can cast only because i'm passing nowait=true in the sole user
2228 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2229 if (!stop &&
11fdf7f2 2230 !mut->is_wrlocked(&pin->nestlock) &&
7c673cae
FG
2231 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2232 //true
2233 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2234 )) { // ** do not initiate.. see above comment **
2235 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2236 << " on " << *pin << dendl;
2237 stop = true;
2238 }
2239 if (stop) {
2240 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2241 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2242 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2243 mut->add_updated_lock(&pin->nestlock);
2244 if (do_parent_mtime || linkunlink) {
2245 mds->locker->mark_updated_scatterlock(&pin->filelock);
2246 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2247 mut->add_updated_lock(&pin->filelock);
2248 }
2249 break;
2250 }
11fdf7f2 2251 if (!mut->is_wrlocked(&pin->versionlock))
7c673cae
FG
2252 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2253
11fdf7f2 2254 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
7c673cae
FG
2255
2256 pin->last_dirstat_prop = mut->get_mds_stamp();
2257
2258 // dirfrag -> diri
2259 mut->auth_pin(pin);
2260 mut->add_projected_inode(pin);
2261 lsi.push_front(pin);
2262
2263 pin->pre_cow_old_inode(); // avoid cow mayhem!
2264
94b18763
FG
2265 auto &pi = pin->project_inode();
2266 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2267
2268 // dirstat
2269 if (do_parent_mtime || linkunlink) {
2270 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2271 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2272 bool touched_mtime = false, touched_chattr = false;
94b18763 2273 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2274 pf->accounted_fragstat = pf->fragstat;
2275 if (touched_mtime)
94b18763 2276 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2277 if (touched_chattr)
94b18763
FG
2278 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2279 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2280
2281 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2282 if (pi.inode.dirstat.size() < 0)
11fdf7f2 2283 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
94b18763 2284 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2285 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2286 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2287 << ", dirfrag has " << pf->fragstat;
2288
2289 // trust the dirfrag for now
94b18763 2290 pi.inode.dirstat = pf->fragstat;
7c673cae 2291
11fdf7f2 2292 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
7c673cae
FG
2293 }
2294 }
2295 }
2296
2297 /*
2298 * the rule here is to follow the _oldest_ parent with dirty rstat
2299 * data. if we don't propagate all data, we add ourselves to the
2300 * nudge list. that way all rstat data will (eventually) get
2301 * pushed up the tree.
2302 *
2303 * actually, no. for now, silently drop rstats for old parents. we need
2304 * hard link backpointers to do the above properly.
2305 */
2306
2307 // stop?
2308 if (pin->is_base())
2309 break;
2310 parentdn = pin->get_projected_parent_dn();
11fdf7f2 2311 ceph_assert(parentdn);
7c673cae
FG
2312
2313 // rstat
2314 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2315
2316 // first, if the frag is stale, bring it back in sync.
2317 parent->resync_accounted_rstat();
2318
11fdf7f2 2319 if (g_conf()->mds_snap_rstat) {
94b18763
FG
2320 for (auto &p : parent->dirty_old_rstat) {
2321 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2322 p.first, pin, true);
2323 }
7c673cae
FG
2324 }
2325 parent->dirty_old_rstat.clear();
2326 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2327
2328 pf->accounted_rstat = pf->rstat;
2329
2330 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2331 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2332 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2333 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2334 << ", dirfrag has " << pf->rstat;
2335
2336 // trust the dirfrag for now
94b18763 2337 pi.inode.rstat = pf->rstat;
7c673cae 2338
11fdf7f2 2339 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
7c673cae
FG
2340 }
2341 }
2342
2343 parent->check_rstats();
2344 broadcast_quota_to_client(pin);
2345 // next parent!
2346 cur = pin;
2347 parent = parentdn->get_dir();
2348 linkunlink = 0;
2349 do_parent_mtime = false;
2350 primary_dn = true;
2351 first = false;
2352 }
2353
2354 // now, stick it in the blob
11fdf7f2
TL
2355 ceph_assert(parent);
2356 ceph_assert(parent->is_auth());
7c673cae
FG
2357 blob->add_dir_context(parent);
2358 blob->add_dir(parent, true);
2359 for (list<CInode*>::iterator p = lsi.begin();
2360 p != lsi.end();
2361 ++p) {
2362 CInode *cur = *p;
2363 journal_dirty_inode(mut.get(), blob, cur);
2364 }
2365
2366}
2367
2368
2369
2370
2371
2372// ===================================
2373// slave requests
2374
2375
2376/*
2377 * some handlers for master requests with slaves. we need to make
2378 * sure slaves journal commits before we forget we mastered them and
2379 * remove them from the uncommitted_masters map (used during recovery
2380 * to commit|abort slaves).
2381 */
2382struct C_MDC_CommittedMaster : public MDCacheLogContext {
2383 metareqid_t reqid;
2384 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2385 void finish(int r) override {
2386 mdcache->_logged_master_commit(reqid);
2387 }
2388};
2389
2390void MDCache::log_master_commit(metareqid_t reqid)
2391{
2392 dout(10) << "log_master_commit " << reqid << dendl;
2393 uncommitted_masters[reqid].committing = true;
2394 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2395 new C_MDC_CommittedMaster(this, reqid));
2396}
2397
2398void MDCache::_logged_master_commit(metareqid_t reqid)
2399{
2400 dout(10) << "_logged_master_commit " << reqid << dendl;
11fdf7f2 2401 ceph_assert(uncommitted_masters.count(reqid));
7c673cae
FG
2402 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2403 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2404 uncommitted_masters.erase(reqid);
2405}
2406
2407// while active...
2408
2409void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2410{
2411 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
11fdf7f2 2412 ceph_assert(uncommitted_masters.count(r));
7c673cae
FG
2413 uncommitted_masters[r].slaves.erase(from);
2414 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2415 log_master_commit(r);
2416}
2417
2418void MDCache::logged_master_update(metareqid_t reqid)
2419{
2420 dout(10) << "logged_master_update " << reqid << dendl;
11fdf7f2 2421 ceph_assert(uncommitted_masters.count(reqid));
7c673cae 2422 uncommitted_masters[reqid].safe = true;
11fdf7f2
TL
2423 auto p = pending_masters.find(reqid);
2424 if (p != pending_masters.end()) {
2425 pending_masters.erase(p);
7c673cae
FG
2426 if (pending_masters.empty())
2427 process_delayed_resolve();
2428 }
2429}
2430
2431/*
2432 * Master may crash after receiving all slaves' commit acks, but before journalling
2433 * the final commit. Slaves may crash after journalling the slave commit, but before
2434 * sending commit ack to the master. Commit masters with no uncommitted slave when
2435 * resolve finishes.
2436 */
2437void MDCache::finish_committed_masters()
2438{
2439 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2440 p != uncommitted_masters.end();
2441 ++p) {
2442 p->second.recovering = false;
2443 if (!p->second.committing && p->second.slaves.empty()) {
2444 dout(10) << "finish_committed_masters " << p->first << dendl;
2445 log_master_commit(p->first);
2446 }
2447 }
2448}
2449
2450/*
2451 * at end of resolve... we must journal a commit|abort for all slave
2452 * updates, before moving on.
2453 *
2454 * this is so that the master can safely journal ECommitted on ops it
2455 * masters when it reaches up:active (all other recovering nodes must
2456 * complete resolve before that happens).
2457 */
2458struct C_MDC_SlaveCommit : public MDCacheLogContext {
2459 mds_rank_t from;
2460 metareqid_t reqid;
2461 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2462 void finish(int r) override {
2463 mdcache->_logged_slave_commit(from, reqid);
2464 }
2465};
2466
2467void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2468{
2469 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2470
2471 // send a message
11fdf7f2 2472 auto req = MMDSSlaveRequest::create(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
7c673cae
FG
2473 mds->send_message_mds(req, from);
2474}
2475
2476
2477
2478
2479
2480
2481// ====================================================================
2482// import map, recovery
2483
2484void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2485 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2486{
2487 if (subtrees.count(oldparent)) {
2488 vector<dirfrag_t>& v = subtrees[oldparent];
2489 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2490 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2491 if (*it == df) {
2492 v.erase(it);
2493 break;
2494 }
2495 }
2496 if (subtrees.count(newparent)) {
2497 vector<dirfrag_t>& v = subtrees[newparent];
2498 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2499 v.push_back(df);
2500 }
2501}
2502
2503ESubtreeMap *MDCache::create_subtree_map()
2504{
2505 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2506 << num_subtrees_fullauth() << " fullauth"
2507 << dendl;
2508
2509 show_subtrees();
2510
2511 ESubtreeMap *le = new ESubtreeMap();
2512 mds->mdlog->_start_entry(le);
2513
2514 map<dirfrag_t, CDir*> dirs_to_add;
2515
2516 if (myin) {
2517 CDir* mydir = myin->get_dirfrag(frag_t());
2518 dirs_to_add[mydir->dirfrag()] = mydir;
2519 }
2520
2521 // include all auth subtrees, and their bounds.
2522 // and a spanning tree to tie it to the root.
2523 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2524 p != subtrees.end();
2525 ++p) {
2526 CDir *dir = p->first;
2527
2528 // journal subtree as "ours" if we are
2529 // me, -2
2530 // me, me
2531 // me, !me (may be importing and ambiguous!)
2532
2533 // so not
2534 // !me, *
2535 if (dir->get_dir_auth().first != mds->get_nodeid())
2536 continue;
2537
2538 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2539 my_ambiguous_imports.count(dir->dirfrag())) {
2540 dout(15) << " ambig subtree " << *dir << dendl;
2541 le->ambiguous_subtrees.insert(dir->dirfrag());
2542 } else {
2543 dout(15) << " subtree " << *dir << dendl;
2544 }
2545
2546 dirs_to_add[dir->dirfrag()] = dir;
2547 le->subtrees[dir->dirfrag()].clear();
2548
2549
2550 // bounds
2551 for (set<CDir*>::iterator q = p->second.begin();
2552 q != p->second.end();
2553 ++q) {
2554 CDir *bound = *q;
2555 dout(15) << " subtree bound " << *bound << dendl;
2556 dirs_to_add[bound->dirfrag()] = bound;
2557 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2558 }
2559 }
2560
2561 // apply projected renames
2562 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2563 p != projected_subtree_renames.end();
2564 ++p) {
2565 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2566 CInode *diri = p->first;
2567 CDir *olddir = q->first;
2568 CDir *newdir = q->second;
2569 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2570
2571 list<CDir*> dfls;
2572 diri->get_dirfrags(dfls);
2573 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2574 CDir *dir = *p;
2575 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2576 CDir *oldparent = get_projected_subtree_root(olddir);
2577 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2578 CDir *newparent = get_projected_subtree_root(newdir);
2579 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2580
2581 if (oldparent == newparent) {
2582 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2583 << oldparent->dirfrag() << dendl;
2584 continue;
2585 }
2586
2587 if (dir->is_subtree_root()) {
2588 if (le->subtrees.count(newparent->dirfrag()) &&
2589 oldparent->get_dir_auth() != newparent->get_dir_auth())
2590 dirs_to_add[dir->dirfrag()] = dir;
2591 // children are fine. change parent.
2592 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2593 le->subtrees);
2594 } else {
2595 // mid-subtree.
2596
2597 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2598 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2599 // if oldparent is auth, subtree is mine; include it.
2600 if (le->subtrees.count(oldparent->dirfrag())) {
2601 dirs_to_add[dir->dirfrag()] = dir;
2602 le->subtrees[dir->dirfrag()].clear();
2603 }
2604 // if newparent is auth, subtree is a new bound
2605 if (le->subtrees.count(newparent->dirfrag())) {
2606 dirs_to_add[dir->dirfrag()] = dir;
2607 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2608 }
2609 newparent = dir;
2610 }
2611
2612 // see if any old bounds move to the new parent.
2613 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2614 p != subtrees[oldparent].end();
2615 ++p) {
2616 CDir *bound = *p;
2617 if (dir->contains(bound->get_parent_dir()))
2618 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2619 le->subtrees);
2620 }
2621 }
2622 }
2623 }
2624 }
2625
2626 // simplify the journaled map. our in memory map may have more
2627 // subtrees than needed due to migrations that are just getting
2628 // started or just completing. but on replay, the "live" map will
2629 // be simple and we can do a straight comparison.
2630 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2631 if (le->ambiguous_subtrees.count(p->first))
2632 continue;
2633 unsigned i = 0;
2634 while (i < p->second.size()) {
2635 dirfrag_t b = p->second[i];
2636 if (le->subtrees.count(b) &&
2637 le->ambiguous_subtrees.count(b) == 0) {
2638 vector<dirfrag_t>& bb = le->subtrees[b];
2639 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2640 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2641 p->second.push_back(*r);
2642 dirs_to_add.erase(b);
2643 le->subtrees.erase(b);
2644 p->second.erase(p->second.begin() + i);
2645 } else {
2646 ++i;
2647 }
2648 }
2649 }
2650
94b18763 2651 for (auto &p : dirs_to_add) {
7c673cae
FG
2652 CDir *dir = p.second;
2653 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2654 le->metablob.add_dir(dir, false);
2655 }
2656
2657 dout(15) << " subtrees " << le->subtrees << dendl;
2658 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2659
2660 //le->metablob.print(cout);
2661 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2662 return le;
2663}
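
// Illustrative sketch (not part of MDCache.cc): the "swallow nested subtrees" pass
// used above (and again when building subtree resolves). If one claimed subtree
// appears as a bound of another, fold its bounds into the outer entry and drop it,
// so the journaled map stays minimal. Plain ints stand in for dirfrag_t, and the
// real code additionally skips ambiguous subtrees.
#include <map>
#include <vector>

static void simplify(std::map<int, std::vector<int>>& subtrees) {
  for (auto& [root, bounds] : subtrees) {
    for (size_t i = 0; i < bounds.size(); ) {
      int b = bounds[i];
      auto it = subtrees.find(b);
      if (it != subtrees.end() && b != root) {
        // 'b' is itself a claimed subtree: inherit its bounds, then erase it
        bounds.insert(bounds.end(), it->second.begin(), it->second.end());
        subtrees.erase(it);
        bounds.erase(bounds.begin() + i);
      } else {
        ++i;
      }
    }
  }
}
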
2664
2665void MDCache::dump_resolve_status(Formatter *f) const
2666{
2667 f->open_object_section("resolve_status");
2668 f->dump_stream("resolve_gather") << resolve_gather;
2669 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2670 f->close_section();
2671}
2672
11fdf7f2 2673void MDCache::resolve_start(MDSContext *resolve_done_)
7c673cae
FG
2674{
2675 dout(10) << "resolve_start" << dendl;
11fdf7f2 2676 ceph_assert(!resolve_done);
7c673cae
FG
2677 resolve_done.reset(resolve_done_);
2678
2679 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2680 // if we don't have the root dir, adjust it to UNKNOWN. during
2681 // resolve we want mds0 to explicitly claim the portion of it that
2682 // it owns, so that anything beyond its bounds get left as
2683 // unknown.
2684 CDir *rootdir = root->get_dirfrag(frag_t());
2685 if (rootdir)
2686 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2687 }
2688 resolve_gather = recovery_set;
11fdf7f2
TL
2689
2690 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
7c673cae
FG
2691}
2692
2693void MDCache::send_resolves()
2694{
2695 send_slave_resolves();
11fdf7f2
TL
2696
2697 if (!resolve_done) {
2698 // I'm survivor: refresh snap cache
2699 mds->snapclient->sync(
2700 new MDSInternalContextWrapper(mds,
2701 new FunctionContext([this](int r) {
2702 maybe_finish_slave_resolve();
2703 })
2704 )
2705 );
2706 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2707 return;
2708 }
7c673cae
FG
2709 if (!resolve_ack_gather.empty()) {
2710 dout(10) << "send_resolves still waiting for resolve ack from ("
2711 << resolve_ack_gather << ")" << dendl;
2712 return;
2713 }
11fdf7f2 2714 if (!resolve_need_rollback.empty()) {
7c673cae 2715 dout(10) << "send_resolves still waiting for rollback to commit on ("
11fdf7f2 2716 << resolve_need_rollback << ")" << dendl;
7c673cae
FG
2717 return;
2718 }
11fdf7f2 2719
7c673cae
FG
2720 send_subtree_resolves();
2721}
2722
2723void MDCache::send_slave_resolves()
2724{
2725 dout(10) << "send_slave_resolves" << dendl;
2726
11fdf7f2 2727 map<mds_rank_t, MMDSResolve::ref> resolves;
7c673cae
FG
2728
2729 if (mds->is_resolve()) {
2730 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2731 p != uncommitted_slave_updates.end();
2732 ++p) {
11fdf7f2 2733 resolves[p->first] = MMDSResolve::create();
7c673cae
FG
2734 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2735 q != p->second.end();
2736 ++q) {
2737 dout(10) << " including uncommitted " << q->first << dendl;
2738 resolves[p->first]->add_slave_request(q->first, false);
2739 }
2740 }
2741 } else {
2742 set<mds_rank_t> resolve_set;
2743 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2744 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2745 p != active_requests.end();
2746 ++p) {
2747 MDRequestRef& mdr = p->second;
2748 if (!mdr->is_slave())
2749 continue;
2750 if (!mdr->slave_did_prepare() && !mdr->committing) {
2751 continue;
2752 }
2753 mds_rank_t master = mdr->slave_to_mds;
2754 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2755 dout(10) << " including uncommitted " << *mdr << dendl;
2756 if (!resolves.count(master))
11fdf7f2 2757 resolves[master] = MMDSResolve::create();
7c673cae
FG
2758 if (!mdr->committing &&
2759 mdr->has_more() && mdr->more()->is_inode_exporter) {
2760 // re-send cap exports
2761 CInode *in = mdr->more()->rename_inode;
2762 map<client_t, Capability::Export> cap_map;
2763 in->export_client_caps(cap_map);
2764 bufferlist bl;
11fdf7f2
TL
2765 encode(in->ino(), bl);
2766 encode(cap_map, bl);
7c673cae
FG
2767 resolves[master]->add_slave_request(p->first, bl);
2768 } else {
2769 resolves[master]->add_slave_request(p->first, mdr->committing);
2770 }
2771 }
2772 }
2773 }
2774
11fdf7f2
TL
2775 for (auto &p : resolves) {
2776 dout(10) << "sending slave resolve to mds." << p.first << dendl;
2777 mds->send_message_mds(p.second, p.first);
2778 resolve_ack_gather.insert(p.first);
7c673cae
FG
2779 }
2780}
2781
2782void MDCache::send_subtree_resolves()
2783{
2784 dout(10) << "send_subtree_resolves" << dendl;
2785
2786 if (migrator->is_exporting() || migrator->is_importing()) {
2787 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2788 migrator->show_importing();
2789 migrator->show_exporting();
2790 resolves_pending = true;
2791 return; // not now
2792 }
2793
11fdf7f2 2794 map<mds_rank_t, MMDSResolve::ref> resolves;
7c673cae
FG
2795 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2796 p != recovery_set.end();
2797 ++p) {
2798 if (*p == mds->get_nodeid())
2799 continue;
2800 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
11fdf7f2 2801 resolves[*p] = MMDSResolve::create();
7c673cae
FG
2802 }
2803
2804 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2805 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2806
2807 // known
2808 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2809 p != subtrees.end();
2810 ++p) {
2811 CDir *dir = p->first;
2812
2813 // only our subtrees
2814 if (dir->authority().first != mds->get_nodeid())
2815 continue;
2816
2817 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2818 continue; // we'll add it below
2819
2820 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2821 // ambiguous (mid-import)
2822 set<CDir*> bounds;
2823 get_subtree_bounds(dir, bounds);
2824 vector<dirfrag_t> dfls;
2825 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2826 dfls.push_back((*q)->dirfrag());
2827
2828 my_ambig_imports[dir->dirfrag()] = dfls;
2829 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2830 } else {
2831 // not ambiguous.
11fdf7f2
TL
2832 for (auto &q : resolves) {
2833 resolves[q.first]->add_subtree(dir->dirfrag());
2834 }
7c673cae
FG
2835 // bounds too
2836 vector<dirfrag_t> dfls;
2837 for (set<CDir*>::iterator q = subtrees[dir].begin();
2838 q != subtrees[dir].end();
2839 ++q) {
2840 CDir *bound = *q;
2841 dfls.push_back(bound->dirfrag());
2842 }
2843
2844 my_subtrees[dir->dirfrag()] = dfls;
2845 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2846 }
2847 }
2848
2849 // ambiguous
2850 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2851 p != my_ambiguous_imports.end();
2852 ++p) {
2853 my_ambig_imports[p->first] = p->second;
2854 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2855 }
2856
2857 // simplify the claimed subtree.
2858 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2859 unsigned i = 0;
2860 while (i < p->second.size()) {
2861 dirfrag_t b = p->second[i];
2862 if (my_subtrees.count(b)) {
2863 vector<dirfrag_t>& bb = my_subtrees[b];
2864 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2865 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2866 p->second.push_back(*r);
2867 my_subtrees.erase(b);
2868 p->second.erase(p->second.begin() + i);
2869 } else {
2870 ++i;
2871 }
2872 }
2873 }
2874
2875 // send
11fdf7f2
TL
2876 for (auto &p : resolves) {
2877 const MMDSResolve::ref &m = p.second;
2878 if (mds->is_resolve()) {
2879 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2880 } else {
2881 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2882 }
7c673cae
FG
2883 m->subtrees = my_subtrees;
2884 m->ambiguous_imports = my_ambig_imports;
11fdf7f2
TL
2885 dout(10) << "sending subtree resolve to mds." << p.first << dendl;
2886 mds->send_message_mds(m, p.first);
7c673cae
FG
2887 }
2888 resolves_pending = false;
2889}
2890
11fdf7f2
TL
2891void MDCache::maybe_finish_slave_resolve() {
2892 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2893 // snap cache has been synced, or I'm in resolve state
2894 if (mds->snapclient->is_synced() || resolve_done)
2895 send_subtree_resolves();
2896 process_delayed_resolve();
2897 }
2898}
2899
7c673cae
FG
2900void MDCache::handle_mds_failure(mds_rank_t who)
2901{
2902 dout(7) << "handle_mds_failure mds." << who << dendl;
2903
2904 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2905
2906 resolve_gather.insert(who);
2907 discard_delayed_resolve(who);
2908 ambiguous_slave_updates.erase(who);
2909
2910 rejoin_gather.insert(who);
2911 rejoin_sent.erase(who); // i need to send another
31f18b77 2912 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2913 rejoin_ack_gather.erase(who); // i'll need/get another.
2914
2915 dout(10) << " resolve_gather " << resolve_gather << dendl;
2916 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2917 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2918 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2919 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2920
2921
2922 // tell the migrator too.
2923 migrator->handle_mds_failure_or_stop(who);
2924
224ce89b
WB
2925 // tell the balancer too.
2926 mds->balancer->handle_mds_failure(who);
2927
7c673cae
FG
2928 // clean up any requests slave to/from this node
2929 list<MDRequestRef> finish;
2930 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2931 p != active_requests.end();
2932 ++p) {
2933 MDRequestRef& mdr = p->second;
2934 // slave to the failed node?
2935 if (mdr->slave_to_mds == who) {
2936 if (mdr->slave_did_prepare()) {
2937 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2938 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2939 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2940
2941 if (!mdr->more()->waiting_on_slave.empty()) {
11fdf7f2 2942 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae 2943 // will rollback, no need to wait
91327a77 2944 mdr->reset_slave_request();
7c673cae
FG
2945 mdr->more()->waiting_on_slave.clear();
2946 }
2947 } else if (!mdr->committing) {
2948 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2949 if (mdr->slave_request || mdr->slave_rolling_back())
2950 mdr->aborted = true;
2951 else
2952 finish.push_back(mdr);
2953 }
2954 }
2955
2956 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2957 if (mdr->more()->waiting_on_slave.count(who)) {
11fdf7f2 2958 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae
FG
2959 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2960 << who << dendl;
2961 mdr->more()->waiting_on_slave.erase(who);
2962 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2963 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2964 }
2965
2966 if (mdr->more()->srcdn_auth_mds == who &&
2967 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2968 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2969 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2970 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2971 }
31f18b77 2972 } else if (mdr->slave_request) {
11fdf7f2 2973 const MMDSSlaveRequest::const_ref &slave_req = mdr->slave_request;
31f18b77
FG
2974 // FIXME: Slave rename request can arrive after we notice mds failure.
2975 // This can cause mds to crash (does not affect integrity of FS).
2976 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2977 slave_req->srcdn_auth == who)
2978 slave_req->mark_interrupted();
7c673cae
FG
2979 }
2980
2981 // failed node is slave?
2982 if (mdr->is_master() && !mdr->committing) {
2983 if (mdr->more()->srcdn_auth_mds == who) {
2984 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2985 << who << " to recover" << dendl;
11fdf7f2 2986 ceph_assert(mdr->more()->witnessed.count(who) == 0);
7c673cae
FG
2987 if (mdr->more()->is_ambiguous_auth)
2988 mdr->clear_ambiguous_auth();
2989 // rename srcdn's auth mds failed, all witnesses will rollback
2990 mdr->more()->witnessed.clear();
2991 pending_masters.erase(p->first);
2992 }
2993
2994 if (mdr->more()->witnessed.count(who)) {
2995 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2996 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2997 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2998 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2999 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
3000 // until either the request is committing or the slave also fails.
11fdf7f2 3001 ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
7c673cae
FG
3002 pending_masters.insert(p->first);
3003 } else {
3004 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
3005 << who << " to recover" << dendl;
3006 if (srcdn_auth >= 0)
11fdf7f2 3007 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
7c673cae
FG
3008
3009 // discard this peer's prepare (if any)
3010 mdr->more()->witnessed.erase(who);
3011 }
3012 }
3013
3014 if (mdr->more()->waiting_on_slave.count(who)) {
3015 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
3016 << " to recover" << dendl;
3017 // retry request when peer recovers
3018 mdr->more()->waiting_on_slave.erase(who);
3019 if (mdr->more()->waiting_on_slave.empty())
3020 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3021 }
3022
3023 if (mdr->locking && mdr->locking_target_mds == who)
3024 mdr->finish_locking(mdr->locking);
3025 }
3026 }
3027
3028 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3029 p != uncommitted_masters.end();
3030 ++p) {
3031 // The failed MDS may have already committed the slave update
3032 if (p->second.slaves.count(who)) {
3033 p->second.recovering = true;
3034 p->second.slaves.erase(who);
3035 }
3036 }
3037
3038 while (!finish.empty()) {
3039 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3040 request_finish(finish.front());
3041 finish.pop_front();
3042 }
3043
3044 kick_find_ino_peers(who);
3045 kick_open_ino_peers(who);
3046
3047 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3048 p != fragments.end(); ) {
3049 dirfrag_t df = p->first;
3050 fragment_info_t& info = p->second;
a8e16298
TL
3051
3052 if (info.is_fragmenting()) {
3053 if (info.notify_ack_waiting.erase(who) &&
3054 info.notify_ack_waiting.empty()) {
3055 fragment_drop_locks(info);
3056 fragment_maybe_finish(p++);
3057 } else {
3058 ++p;
3059 }
7c673cae 3060 continue;
a8e16298
TL
3061 }
3062
3063 ++p;
7c673cae
FG
3064 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3065 list<CDir*> dirs;
3066 info.dirs.swap(dirs);
3067 fragments.erase(df);
3068 fragment_unmark_unfreeze_dirs(dirs);
3069 }
3070
3071 // MDCache::shutdown_export_strays() always exports strays to mds.0
3072 if (who == mds_rank_t(0))
f64942e4 3073 shutdown_exporting_strays.clear();
7c673cae
FG
3074
3075 show_subtrees();
3076}
3077
3078/*
3079 * handle_mds_recovery - called on another node's transition
3080 * from resolve -> active.
3081 */
3082void MDCache::handle_mds_recovery(mds_rank_t who)
3083{
3084 dout(7) << "handle_mds_recovery mds." << who << dendl;
3085
3086 // exclude all discover waiters. kick_discovers() will do the job
3087 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3088 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3089
11fdf7f2 3090 MDSContext::vec waiters;
7c673cae
FG
3091
3092 // wake up any waiters in their subtrees
3093 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3094 p != subtrees.end();
3095 ++p) {
3096 CDir *dir = p->first;
3097
3098 if (dir->authority().first != who ||
3099 dir->authority().second == mds->get_nodeid())
3100 continue;
11fdf7f2 3101 ceph_assert(!dir->is_auth());
7c673cae
FG
3102
3103 // wake any waiters
3104 list<CDir*> q;
3105 q.push_back(dir);
3106
3107 while (!q.empty()) {
3108 CDir *d = q.front();
3109 q.pop_front();
3110 d->take_waiting(d_mask, waiters);
3111
3112 // inode waiters too
94b18763
FG
3113 for (auto &p : d->items) {
3114 CDentry *dn = p.second;
7c673cae
FG
3115 CDentry::linkage_t *dnl = dn->get_linkage();
3116 if (dnl->is_primary()) {
3117 dnl->get_inode()->take_waiting(i_mask, waiters);
3118
3119 // recurse?
3120 list<CDir*> ls;
3121 dnl->get_inode()->get_dirfrags(ls);
3122 for (list<CDir*>::iterator p = ls.begin();
3123 p != ls.end();
3124 ++p) {
3125 CDir *subdir = *p;
3126 if (!subdir->is_subtree_root())
3127 q.push_back(subdir);
3128 }
3129 }
3130 }
3131 }
3132 }
3133
3134 kick_open_ino_peers(who);
3135 kick_find_ino_peers(who);
3136
3137 // queue them up.
3138 mds->queue_waiters(waiters);
3139}
3140
3141void MDCache::set_recovery_set(set<mds_rank_t>& s)
3142{
3143 dout(7) << "set_recovery_set " << s << dendl;
3144 recovery_set = s;
3145}
3146
3147
3148/*
3149 * during resolve state, we share resolves to determine who
3150 * is authoritative for which trees. we expect to get a resolve
3151 * from _everyone_ in the recovery_set (the mds cluster at the time of
3152 * the first failure).
3153 *
3154 * This function puts the passed message before returning
3155 */
11fdf7f2 3156void MDCache::handle_resolve(const MMDSResolve::const_ref &m)
7c673cae
FG
3157{
3158 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3159 mds_rank_t from = mds_rank_t(m->get_source().num());
3160
3161 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3162 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3163 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3164 return;
3165 }
3166 // wait until we reach the resolve stage!
7c673cae
FG
3167 return;
3168 }
3169
3170 discard_delayed_resolve(from);
3171
3172 // ambiguous slave requests?
3173 if (!m->slave_requests.empty()) {
3174 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3175 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3176 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
11fdf7f2 3177 ceph_assert(!p->second.committing);
7c673cae
FG
3178 pending_masters.insert(p->first);
3179 }
3180 }
3181
3182 if (!pending_masters.empty()) {
3183 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3184 delayed_resolve[from] = m;
3185 return;
3186 }
3187 }
3188
11fdf7f2
TL
3189 auto ack = MMDSResolveAck::create();
3190 for (const auto &p : m->slave_requests) {
3191 if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
7c673cae 3192 // COMMIT
11fdf7f2 3193 if (p.second.committing) {
7c673cae 3194 // already committing, waiting for the OP_COMMITTED slave reply
11fdf7f2 3195 dout(10) << " already committing slave request " << p << " noop "<< dendl;
7c673cae 3196 } else {
11fdf7f2
TL
3197 dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
3198 ack->add_commit(p.first);
7c673cae 3199 }
11fdf7f2 3200 uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
7c673cae 3201
11fdf7f2 3202 if (p.second.inode_caps.length() > 0) {
7c673cae 3203 // slave wants to export caps (rename)
11fdf7f2 3204 ceph_assert(mds->is_resolve());
7c673cae
FG
3205
3206 inodeno_t ino;
3207 map<client_t,Capability::Export> cap_exports;
11fdf7f2
TL
3208 auto q = p.second.inode_caps.cbegin();
3209 decode(ino, q);
3210 decode(cap_exports, q);
7c673cae 3211
11fdf7f2 3212 ceph_assert(get_inode(ino));
7c673cae
FG
3213
3214 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3215 q != cap_exports.end();
3216 ++q) {
3217 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3218 im.cap_id = ++last_cap_id; // assign a new cap ID
3219 im.issue_seq = 1;
3220 im.mseq = q->second.mseq;
28e407b8
AA
3221
3222 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3223 if (session)
3224 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3225 }
3226
3227 // will process these caps in rejoin stage
3228 rejoin_slave_exports[ino].first = from;
3229 rejoin_slave_exports[ino].second.swap(cap_exports);
3230
3231 // send information of imported caps back to slave
11fdf7f2 3232 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
7c673cae
FG
3233 }
3234 } else {
3235 // ABORT
11fdf7f2
TL
3236 dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
3237 ceph_assert(!p.second.committing);
3238 ack->add_abort(p.first);
7c673cae
FG
3239 }
3240 }
3241 mds->send_message(ack, m->get_connection());
7c673cae
FG
3242 return;
3243 }
3244
11fdf7f2 3245 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
7c673cae
FG
3246 dout(10) << "delay processing subtree resolve" << dendl;
3247 delayed_resolve[from] = m;
3248 return;
3249 }
3250
3251 bool survivor = false;
3252 // am i a surviving ambiguous importer?
3253 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3254 survivor = true;
3255 // check for any import success/failure (from this node)
3256 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3257 while (p != my_ambiguous_imports.end()) {
3258 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3259 ++next;
3260 CDir *dir = get_dirfrag(p->first);
11fdf7f2 3261 ceph_assert(dir);
7c673cae
FG
3262 dout(10) << "checking ambiguous import " << *dir << dendl;
3263 if (migrator->is_importing(dir->dirfrag()) &&
3264 migrator->get_import_peer(dir->dirfrag()) == from) {
11fdf7f2 3265 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
7c673cae
FG
3266
3267 // check if sender claims the subtree
3268 bool claimed_by_sender = false;
11fdf7f2 3269 for (const auto &q : m->subtrees) {
7c673cae 3270 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
11fdf7f2 3271 CDir *base = get_force_dirfrag(q.first, false);
7c673cae
FG
3272 if (!base || !base->contains(dir))
3273	  continue;  // base is neither dir nor an ancestor of dir, so it clearly doesn't claim dir.
3274
3275 bool inside = true;
3276 set<CDir*> bounds;
11fdf7f2 3277 get_force_dirfrag_bound_set(q.second, bounds);
7c673cae
FG
3278 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3279 CDir *bound = *p;
3280 if (bound->contains(dir)) {
3281 inside = false; // nope, bound is dir or parent of dir, not inside.
3282 break;
3283 }
3284 }
3285 if (inside)
3286 claimed_by_sender = true;
3287 }
3288
3289 my_ambiguous_imports.erase(p); // no longer ambiguous.
3290 if (claimed_by_sender) {
3291 dout(7) << "ambiguous import failed on " << *dir << dendl;
3292 migrator->import_reverse(dir);
3293 } else {
3294 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3295 migrator->import_finish(dir, true);
3296 }
3297 }
3298 p = next;
3299 }
3300 }
3301
3302 // update my dir_auth values
3303  // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3304 // migrations between other nodes)
11fdf7f2
TL
3305 for (const auto& p : m->subtrees) {
3306 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3307 CDir *dir = get_force_dirfrag(p.first, !survivor);
7c673cae
FG
3308 if (!dir)
3309 continue;
11fdf7f2 3310 adjust_bounded_subtree_auth(dir, p.second, from);
7c673cae
FG
3311 try_subtree_merge(dir);
3312 }
3313
3314 show_subtrees();
3315
3316 // note ambiguous imports too
11fdf7f2
TL
3317 for (const auto& p : m->ambiguous_imports) {
3318 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3319 other_ambiguous_imports[from][p.first] = p.second;
3320 }
3321
3322  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
3323  // the snaptable cache from the snapserver. this way the snaptable cache gets synced among all mds
3324 for (const auto& p : m->table_clients) {
3325 dout(10) << " noting " << get_mdstable_name(p.type)
3326 << " pending_commits " << p.pending_commits << dendl;
3327 MDSTableClient *client = mds->get_table_client(p.type);
3328 for (const auto& q : p.pending_commits)
3329 client->notify_commit(q);
7c673cae
FG
3330 }
3331
3332 // did i get them all?
3333 resolve_gather.erase(from);
3334
3335 maybe_resolve_finish();
7c673cae
FG
3336}
3337
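// Re-run resolve handling for peers whose MMDSResolve messages were deferred
// above (pending master updates or in-progress rollbacks). The map is swapped
// into a local copy first so a message that gets re-delayed is queued cleanly.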
3338void MDCache::process_delayed_resolve()
3339{
3340 dout(10) << "process_delayed_resolve" << dendl;
11fdf7f2 3341 map<mds_rank_t, MMDSResolve::const_ref> tmp;
7c673cae 3342 tmp.swap(delayed_resolve);
11fdf7f2
TL
3343 for (auto &p : tmp) {
3344 handle_resolve(p.second);
3345 }
7c673cae
FG
3346}
3347
3348void MDCache::discard_delayed_resolve(mds_rank_t who)
3349{
11fdf7f2 3350 delayed_resolve.erase(who);
7c673cae
FG
3351}
3352
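// Once resolves have arrived from every rank in the recovery set (resolve_gather
// is empty), disambiguate our own imports and finish committed masters. A
// recovering rank then completes the resolve stage; a survivor just sends any
// pending rejoins.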
3353void MDCache::maybe_resolve_finish()
3354{
11fdf7f2
TL
3355 ceph_assert(resolve_ack_gather.empty());
3356 ceph_assert(resolve_need_rollback.empty());
7c673cae
FG
3357
3358 if (!resolve_gather.empty()) {
3359 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3360 << resolve_gather << ")" << dendl;
3361 return;
3362 }
3363
3364 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3365 disambiguate_my_imports();
3366 finish_committed_masters();
3367
3368 if (resolve_done) {
11fdf7f2 3369 ceph_assert(mds->is_resolve());
7c673cae
FG
3370 trim_unlinked_inodes();
3371 recalc_auth_bits(false);
3372 resolve_done.release()->complete(0);
3373 } else {
11fdf7f2 3374    // I am a survivor.
7c673cae
FG
3375 maybe_send_pending_rejoins();
3376 }
3377}
3378
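// The resolve ack from the master lists which of our slave updates it committed
// and which it aborted. In the resolve state, commits are journaled as
// ESlaveUpdate::OP_COMMIT and aborts are rolled back via the per-op rollback
// handlers; a survivor instead finishes (or marks aborted) the corresponding
// slave requests directly.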
11fdf7f2 3379void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref &ack)
7c673cae
FG
3380{
3381 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3382 mds_rank_t from = mds_rank_t(ack->get_source().num());
3383
3384 if (!resolve_ack_gather.count(from) ||
3385 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
7c673cae
FG
3386 return;
3387 }
3388
3389 if (ambiguous_slave_updates.count(from)) {
11fdf7f2
TL
3390 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3391 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
7c673cae
FG
3392 }
3393
11fdf7f2
TL
3394 for (const auto &p : ack->commit) {
3395 dout(10) << " commit on slave " << p.first << dendl;
7c673cae
FG
3396
3397 if (ambiguous_slave_updates.count(from)) {
11fdf7f2 3398 remove_ambiguous_slave_update(p.first, from);
7c673cae
FG
3399 continue;
3400 }
3401
3402 if (mds->is_resolve()) {
3403 // replay
11fdf7f2
TL
3404 MDSlaveUpdate *su = get_uncommitted_slave_update(p.first, from);
3405 ceph_assert(su);
7c673cae
FG
3406
3407 // log commit
11fdf7f2 3408 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
7c673cae 3409 ESlaveUpdate::OP_COMMIT, su->origop),
11fdf7f2 3410 new C_MDC_SlaveCommit(this, from, p.first));
7c673cae
FG
3411 mds->mdlog->flush();
3412
11fdf7f2 3413 finish_uncommitted_slave_update(p.first, from);
7c673cae 3414 } else {
11fdf7f2 3415 MDRequestRef mdr = request_get(p.first);
7c673cae 3416 // information about master imported caps
11fdf7f2
TL
3417 if (p.second.length() > 0)
3418 mdr->more()->inode_import.share(p.second);
7c673cae 3419
11fdf7f2 3420 ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
7c673cae
FG
3421 request_finish(mdr);
3422 }
3423 }
3424
11fdf7f2
TL
3425 for (const auto &metareq : ack->abort) {
3426 dout(10) << " abort on slave " << metareq << dendl;
7c673cae
FG
3427
3428 if (mds->is_resolve()) {
11fdf7f2
TL
3429 MDSlaveUpdate *su = get_uncommitted_slave_update(metareq, from);
3430 ceph_assert(su);
7c673cae
FG
3431
3432 // perform rollback (and journal a rollback entry)
3433 // note: this will hold up the resolve a bit, until the rollback entries journal.
3434 MDRequestRef null_ref;
3435 switch (su->origop) {
3436 case ESlaveUpdate::LINK:
3437 mds->server->do_link_rollback(su->rollback, from, null_ref);
3438 break;
3439 case ESlaveUpdate::RENAME:
3440 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3441 break;
3442 case ESlaveUpdate::RMDIR:
3443 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3444 break;
3445 default:
3446 ceph_abort();
3447 }
3448 } else {
11fdf7f2 3449 MDRequestRef mdr = request_get(metareq);
7c673cae
FG
3450 mdr->aborted = true;
3451 if (mdr->slave_request) {
3452 if (mdr->slave_did_prepare()) // journaling slave prepare ?
11fdf7f2 3453 add_rollback(metareq, from);
7c673cae
FG
3454 } else {
3455 request_finish(mdr);
3456 }
3457 }
3458 }
3459
11fdf7f2 3460 if (!ambiguous_slave_updates.count(from)) {
7c673cae 3461 resolve_ack_gather.erase(from);
11fdf7f2 3462 maybe_finish_slave_resolve();
7c673cae 3463 }
7c673cae
FG
3464}
3465
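// Track a slave update that has been journaled locally but not yet committed by
// the master. The per-inode counters record which old rename directories and
// unlinked inodes are still referenced by pending slave updates, so they are not
// trimmed prematurely.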
3466void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3467{
11fdf7f2 3468 ceph_assert(uncommitted_slave_updates[master].count(reqid) == 0);
7c673cae
FG
3469 uncommitted_slave_updates[master][reqid] = su;
3470 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3471 uncommitted_slave_rename_olddir[*p]++;
3472 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3473 uncommitted_slave_unlink[*p]++;
3474}
3475
3476void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3477{
11fdf7f2 3478 ceph_assert(uncommitted_slave_updates[master].count(reqid));
7c673cae
FG
3479 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3480
3481 uncommitted_slave_updates[master].erase(reqid);
3482 if (uncommitted_slave_updates[master].empty())
3483 uncommitted_slave_updates.erase(master);
3484 // discard the non-auth subtree we renamed out of
3485 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3486 CInode *diri = *p;
3487 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
11fdf7f2 3488 ceph_assert(it != uncommitted_slave_rename_olddir.end());
7c673cae
FG
3489 it->second--;
3490 if (it->second == 0) {
3491 uncommitted_slave_rename_olddir.erase(it);
3492 list<CDir*> ls;
3493 diri->get_dirfrags(ls);
3494 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3495 CDir *root = get_subtree_root(*q);
3496 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3497 try_trim_non_auth_subtree(root);
3498 if (*q != root)
3499 break;
3500 }
3501 }
3502 } else
11fdf7f2 3503 ceph_assert(it->second > 0);
7c673cae
FG
3504 }
3505  // remove the inodes that were unlinked by the slave update
3506 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3507 CInode *in = *p;
3508 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
11fdf7f2 3509 ceph_assert(it != uncommitted_slave_unlink.end());
7c673cae
FG
3510 it->second--;
3511 if (it->second == 0) {
3512 uncommitted_slave_unlink.erase(it);
3513 if (!in->get_projected_parent_dn())
3514 mds->mdcache->remove_inode_recursive(in);
3515 } else
11fdf7f2 3516 ceph_assert(it->second > 0);
7c673cae
FG
3517 }
3518 delete su;
3519}
3520
3521MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3522{
3523
3524 MDSlaveUpdate* su = NULL;
3525 if (uncommitted_slave_updates.count(master) &&
3526 uncommitted_slave_updates[master].count(reqid)) {
3527 su = uncommitted_slave_updates[master][reqid];
11fdf7f2 3528 ceph_assert(su);
7c673cae
FG
3529 }
3530 return su;
3531}
3532
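// Called once a rollback has been journaled. Drops the request from
// resolve_need_rollback, finishes the matching uncommitted slave update when we
// are in the resolve state, and rechecks whether the slave side of resolve can
// now complete.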
3533void MDCache::finish_rollback(metareqid_t reqid) {
11fdf7f2
TL
3534 auto p = resolve_need_rollback.find(reqid);
3535 ceph_assert(p != resolve_need_rollback.end());
7c673cae 3536 if (mds->is_resolve())
11fdf7f2
TL
3537 finish_uncommitted_slave_update(reqid, p->second);
3538 resolve_need_rollback.erase(p);
3539 maybe_finish_slave_resolve();
7c673cae
FG
3540}
3541
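// Apply the subtree claims other ranks reported in their resolves: if a claimed
// dirfrag is still ambiguous (or has undefined auth) locally, accept the
// claimant as its authority and try to merge subtrees.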
3542void MDCache::disambiguate_other_imports()
3543{
3544 dout(10) << "disambiguate_other_imports" << dendl;
3545
3546 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3547 // other nodes' ambiguous imports
3548 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3549 p != other_ambiguous_imports.end();
3550 ++p) {
3551 mds_rank_t who = p->first;
3552 dout(10) << "ambiguous imports for mds." << who << dendl;
3553
3554 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3555 q != p->second.end();
3556 ++q) {
3557 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3558 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3559 CDir *dir = get_force_dirfrag(q->first, recovering);
3560 if (!dir) continue;
3561
3562 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3563 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3564 dout(10) << " mds." << who << " did import " << *dir << dendl;
3565 adjust_bounded_subtree_auth(dir, q->second, who);
3566 try_subtree_merge(dir);
3567 } else {
3568 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3569 }
3570 }
3571 }
3572 other_ambiguous_imports.clear();
3573}
3574
3575void MDCache::disambiguate_my_imports()
3576{
3577 dout(10) << "disambiguate_my_imports" << dendl;
3578
3579 if (!mds->is_resolve()) {
11fdf7f2 3580 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3581 return;
3582 }
3583
3584 disambiguate_other_imports();
3585
3586 // my ambiguous imports
3587 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3588 while (!my_ambiguous_imports.empty()) {
3589 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3590
3591 CDir *dir = get_dirfrag(q->first);
11fdf7f2 3592 ceph_assert(dir);
7c673cae
FG
3593
3594 if (dir->authority() != me_ambig) {
3595 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3596 cancel_ambiguous_import(dir);
3597
3598 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3599
3600 // subtree may have been swallowed by another node claiming dir
3601 // as their own.
3602 CDir *root = get_subtree_root(dir);
3603 if (root != dir)
3604 dout(10) << " subtree root is " << *root << dendl;
11fdf7f2 3605 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
7c673cae
FG
3606 try_trim_non_auth_subtree(root);
3607 } else {
3608 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3609 finish_ambiguous_import(q->first);
3610 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3611 }
3612 }
11fdf7f2 3613 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3614 mds->mdlog->flush();
3615
3616 // verify all my subtrees are unambiguous!
3617 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3618 p != subtrees.end();
3619 ++p) {
3620 CDir *dir = p->first;
3621 if (dir->is_ambiguous_dir_auth()) {
3622 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3623 }
11fdf7f2 3624 ceph_assert(!dir->is_ambiguous_dir_auth());
7c673cae
FG
3625 }
3626
3627 show_subtrees();
3628}
3629
3630
3631void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3632{
11fdf7f2 3633 ceph_assert(my_ambiguous_imports.count(base) == 0);
7c673cae
FG
3634 my_ambiguous_imports[base] = bounds;
3635}
3636
3637
3638void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3639{
3640 // make a list
3641 vector<dirfrag_t> binos;
3642 for (set<CDir*>::iterator p = bounds.begin();
3643 p != bounds.end();
3644 ++p)
3645 binos.push_back((*p)->dirfrag());
3646
3647 // note: this can get called twice if the exporter fails during recovery
3648 if (my_ambiguous_imports.count(base->dirfrag()))
3649 my_ambiguous_imports.erase(base->dirfrag());
3650
3651 add_ambiguous_import(base->dirfrag(), binos);
3652}
3653
3654void MDCache::cancel_ambiguous_import(CDir *dir)
3655{
3656 dirfrag_t df = dir->dirfrag();
11fdf7f2 3657 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3658 dout(10) << "cancel_ambiguous_import " << df
3659 << " bounds " << my_ambiguous_imports[df]
3660 << " " << *dir
3661 << dendl;
3662 my_ambiguous_imports.erase(df);
3663}
3664
3665void MDCache::finish_ambiguous_import(dirfrag_t df)
3666{
11fdf7f2 3667 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3668 vector<dirfrag_t> bounds;
3669 bounds.swap(my_ambiguous_imports[df]);
3670 my_ambiguous_imports.erase(df);
3671
3672 dout(10) << "finish_ambiguous_import " << df
3673 << " bounds " << bounds
3674 << dendl;
3675 CDir *dir = get_dirfrag(df);
11fdf7f2 3676 ceph_assert(dir);
7c673cae
FG
3677
3678 // adjust dir_auth, import maps
3679 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3680 try_subtree_merge(dir);
3681}
3682
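// Remove an inode together with everything linked beneath it: unlink and drop
// every dentry in each dirfrag, recurse into primary-linked inodes, detach any
// subtree roots, then remove the inode itself.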
3683void MDCache::remove_inode_recursive(CInode *in)
3684{
3685 dout(10) << "remove_inode_recursive " << *in << dendl;
3686 list<CDir*> ls;
3687 in->get_dirfrags(ls);
3688 list<CDir*>::iterator p = ls.begin();
3689 while (p != ls.end()) {
3690 CDir *subdir = *p++;
3691
3692 dout(10) << " removing dirfrag " << subdir << dendl;
94b18763
FG
3693 auto it = subdir->items.begin();
3694 while (it != subdir->items.end()) {
3695 CDentry *dn = it->second;
3696 ++it;
7c673cae
FG
3697 CDentry::linkage_t *dnl = dn->get_linkage();
3698 if (dnl->is_primary()) {
3699 CInode *tin = dnl->get_inode();
31f18b77 3700 subdir->unlink_inode(dn, false);
7c673cae
FG
3701 remove_inode_recursive(tin);
3702 }
3703 subdir->remove_dentry(dn);
3704 }
3705
3706 if (subdir->is_subtree_root())
3707 remove_subtree(subdir);
3708 in->close_dirfrag(subdir->dirfrag().frag);
3709 }
3710 remove_inode(in);
3711}
3712
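// Returns true if something under this non-auth inode cannot be expired yet
// (a nested subtree root, a hard-linked target that may still be renamed, or a
// non-expireable dentry); otherwise trims the expireable dentries into
// expiremap and returns false.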
11fdf7f2 3713bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
7c673cae 3714{
11fdf7f2 3715 ceph_assert(!in->is_auth());
7c673cae
FG
3716
3717 dout(10) << __func__ << ":" << *in << dendl;
3718
3719 // Recurse into any dirfrags beneath this inode
3720 list<CDir*> ls;
3721 in->get_dirfrags(ls);
3722 for (auto subdir : ls) {
3723 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3724 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3725 return true;
3726 }
3727
3728 for (auto &it : subdir->items) {
3729 CDentry *dn = it.second;
3730 CDentry::linkage_t *dnl = dn->get_linkage();
3731 if (dnl->is_primary()) {
3732 CInode *tin = dnl->get_inode();
3733
3734 /* Remote strays with linkage (i.e. hardlinks) should not be
3735 * expired, because they may be the target of
3736 * a rename() as the owning MDS shuts down */
3737 if (!tin->is_stray() && tin->inode.nlink) {
3738 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3739 return true;
3740 }
3741
3742 const bool abort = expire_recursive(tin, expiremap);
3743 if (abort) {
3744 return true;
3745 }
3746 }
3747 if (dn->lru_is_expireable()) {
3748 trim_dentry(dn, expiremap);
3749 } else {
3750 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3751 return true;
3752 }
3753 }
3754 }
3755
3756 return false;
3757}
3758
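// After resolve, drop any non-base inodes that no longer have a parent dentry
// (they were unlinked while we were recovering), resetting the heartbeat every
// 1000 entries so a long scan does not trip the internal heartbeat timeout.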
3759void MDCache::trim_unlinked_inodes()
3760{
3761 dout(7) << "trim_unlinked_inodes" << dendl;
81eedcae
TL
3762 int count = 0;
3763 vector<CInode*> q;
94b18763 3764 for (auto &p : inode_map) {
b32b8144 3765 CInode *in = p.second;
7c673cae
FG
3766 if (in->get_parent_dn() == NULL && !in->is_base()) {
3767 dout(7) << " will trim from " << *in << dendl;
3768 q.push_back(in);
3769 }
81eedcae
TL
3770
3771 if (!(++count % 1000))
3772 mds->heartbeat_reset();
3773 }
3774
3775 for (auto& in : q) {
3776 remove_inode_recursive(in);
3777
3778 if (!(++count % 1000))
3779 mds->heartbeat_reset();
7c673cae 3780 }
7c673cae
FG
3781}
3782
3783/** recalc_auth_bits()
3784 * once subtree auth is disambiguated, we need to adjust all the
3785 * auth and dirty bits in our cache before moving on.
3786 */
3787void MDCache::recalc_auth_bits(bool replay)
3788{
3789 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3790
3791 if (root) {
3792 root->inode_auth.first = mds->mdsmap->get_root();
3793 bool auth = mds->get_nodeid() == root->inode_auth.first;
3794 if (auth) {
3795 root->state_set(CInode::STATE_AUTH);
3796 } else {
3797 root->state_clear(CInode::STATE_AUTH);
3798 if (!replay)
3799 root->state_set(CInode::STATE_REJOINING);
3800 }
3801 }
3802
3803 set<CInode*> subtree_inodes;
3804 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3805 p != subtrees.end();
3806 ++p) {
3807 if (p->first->dir_auth.first == mds->get_nodeid())
3808 subtree_inodes.insert(p->first->inode);
3809 }
3810
3811 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3812 p != subtrees.end();
3813 ++p) {
3814 if (p->first->inode->is_mdsdir()) {
3815 CInode *in = p->first->inode;
3816 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3817 if (auth) {
3818 in->state_set(CInode::STATE_AUTH);
3819 } else {
3820 in->state_clear(CInode::STATE_AUTH);
3821 if (!replay)
3822 in->state_set(CInode::STATE_REJOINING);
3823 }
3824 }
3825
3826 list<CDir*> dfq; // dirfrag queue
3827 dfq.push_back(p->first);
3828
3829 bool auth = p->first->authority().first == mds->get_nodeid();
3830 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3831
3832 while (!dfq.empty()) {
3833 CDir *dir = dfq.front();
3834 dfq.pop_front();
3835
3836 // dir
3837 if (auth) {
3838 dir->state_set(CDir::STATE_AUTH);
3839 } else {
3840 dir->state_clear(CDir::STATE_AUTH);
3841 if (!replay) {
3842 // close empty non-auth dirfrag
3843 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3844 dir->inode->close_dirfrag(dir->get_frag());
3845 continue;
3846 }
3847 dir->state_set(CDir::STATE_REJOINING);
3848 dir->state_clear(CDir::STATE_COMPLETE);
3849 if (dir->is_dirty())
3850 dir->mark_clean();
3851 }
3852 }
3853
3854 // dentries in this dir
94b18763 3855 for (auto &p : dir->items) {
7c673cae 3856 // dn
94b18763 3857 CDentry *dn = p.second;
7c673cae
FG
3858 CDentry::linkage_t *dnl = dn->get_linkage();
3859 if (auth) {
3860 dn->state_set(CDentry::STATE_AUTH);
3861 } else {
3862 dn->state_clear(CDentry::STATE_AUTH);
3863 if (!replay) {
3864 dn->state_set(CDentry::STATE_REJOINING);
3865 if (dn->is_dirty())
3866 dn->mark_clean();
3867 }
3868 }
3869
3870 if (dnl->is_primary()) {
3871 // inode
3872 CInode *in = dnl->get_inode();
3873 if (auth) {
3874 in->state_set(CInode::STATE_AUTH);
3875 } else {
3876 in->state_clear(CInode::STATE_AUTH);
3877 if (!replay) {
3878 in->state_set(CInode::STATE_REJOINING);
3879 if (in->is_dirty())
3880 in->mark_clean();
3881 if (in->is_dirty_parent())
3882 in->clear_dirty_parent();
3883 // avoid touching scatterlocks for our subtree roots!
3884 if (subtree_inodes.count(in) == 0)
3885 in->clear_scatter_dirty();
3886 }
3887 }
3888 // recurse?
3889 if (in->is_dir())
3890 in->get_nested_dirfrags(dfq);
3891 }
3892 }
3893 }
3894 }
3895
3896 show_subtrees();
3897 show_cache();
3898}
3899
3900
3901
3902// ===========================================================================
3903// REJOIN
3904
3905/*
3906 * notes on scatterlock recovery:
3907 *
3908 * - recovering inode replica sends scatterlock data for any subtree
3909 * roots (the only ones that are possibly dirty).
3910 *
3911 * - surviving auth incorporates any provided scatterlock data. any
3912 * pending gathers are then finished, as with the other lock types.
3913 *
3914 * that takes care of surviving auth + (recovering replica)*.
3915 *
3916 * - surviving replica sends strong_inode, which includes current
3917 * scatterlock state, AND any dirty scatterlock data. this
3918 * provides the recovering auth with everything it might need.
3919 *
3920 * - recovering auth must pick initial scatterlock state based on
3921 * (weak|strong) rejoins.
3922 * - always assimilate scatterlock data (it can't hurt)
3923 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3924 * - include base inode in ack for all inodes that saw scatterlock content
3925 *
3926 * also, for scatter gather,
3927 *
3928 * - auth increments {frag,r}stat.version on completion of any gather.
3929 *
3930 * - auth incorporates changes in a gather _only_ if the version
3931 * matches.
3932 *
3933 * - replica discards changes any time the scatterlock syncs, and
3934 * after recovery.
3935 */
3936
3937void MDCache::dump_rejoin_status(Formatter *f) const
3938{
3939 f->open_object_section("rejoin_status");
3940 f->dump_stream("rejoin_gather") << rejoin_gather;
3941 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3942 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3943 f->close_section();
3944}
3945
11fdf7f2 3946void MDCache::rejoin_start(MDSContext *rejoin_done_)
7c673cae
FG
3947{
3948 dout(10) << "rejoin_start" << dendl;
11fdf7f2 3949 ceph_assert(!rejoin_done);
7c673cae
FG
3950 rejoin_done.reset(rejoin_done_);
3951
3952 rejoin_gather = recovery_set;
3953  // need to finish opening cap inodes before sending cache rejoins
3954 rejoin_gather.insert(mds->get_nodeid());
3955 process_imported_caps();
3956}
3957
3958/*
3959 * rejoin phase!
3960 *
11fdf7f2 3961 * this initiates rejoin. it should be called before we get any
7c673cae
FG
3962 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3963 *
3964 * we start out by sending rejoins to everyone in the recovery set.
3965 *
3966 * if we are rejoining, send for all regions in our cache.
11fdf7f2 3967 * if we are active|stopping, send only to nodes that are rejoining.
7c673cae
FG
3968 */
3969void MDCache::rejoin_send_rejoins()
3970{
3971 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3972
3973 if (rejoin_gather.count(mds->get_nodeid())) {
3974 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3975 rejoins_pending = true;
3976 return;
3977 }
3978 if (!resolve_gather.empty()) {
3979 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3980 << resolve_gather << ")" << dendl;
3981 rejoins_pending = true;
3982 return;
3983 }
3984
11fdf7f2
TL
3985 ceph_assert(!migrator->is_importing());
3986 ceph_assert(!migrator->is_exporting());
7c673cae
FG
3987
3988 if (!mds->is_rejoin()) {
3989 disambiguate_other_imports();
3990 }
3991
11fdf7f2 3992 map<mds_rank_t, MMDSCacheRejoin::ref> rejoins;
7c673cae
FG
3993
3994
3995 // if i am rejoining, send a rejoin to everyone.
3996 // otherwise, just send to others who are rejoining.
3997 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3998 p != recovery_set.end();
3999 ++p) {
4000 if (*p == mds->get_nodeid()) continue; // nothing to myself!
4001 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
4002 if (mds->is_rejoin())
11fdf7f2 4003 rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK);
7c673cae 4004 else if (mds->mdsmap->is_rejoin(*p))
11fdf7f2 4005 rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG);
7c673cae
FG
4006 }
4007
4008 if (mds->is_rejoin()) {
11fdf7f2
TL
4009 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4010 for (auto& p : cap_exports) {
4011 mds_rank_t target = p.second.first;
7c673cae
FG
4012 if (rejoins.count(target) == 0)
4013 continue;
11fdf7f2
TL
4014 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4015 Session *session = nullptr;
4016 auto it = client_exports.find(q->first);
4017 if (it != client_exports.end()) {
4018 session = it->second.first;
4019 if (session)
4020 it->second.second.insert(target);
4021 } else {
4022 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4023 auto& r = client_exports[q->first];
4024 r.first = session;
4025 if (session)
4026 r.second.insert(target);
4027 }
4028 if (session) {
4029 ++q;
4030 } else {
4031 // remove reconnect with no session
4032 p.second.second.erase(q++);
4033 }
4034 }
4035 rejoins[target]->cap_exports[p.first] = p.second.second;
7c673cae 4036 }
11fdf7f2
TL
4037 for (auto& p : client_exports) {
4038 Session *session = p.second.first;
4039 for (auto& q : p.second.second) {
4040 auto rejoin = rejoins[q];
4041 rejoin->client_map[p.first] = session->info.inst;
4042 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4043 }
7c673cae
FG
4044 }
4045 }
4046
4047
4048 // check all subtrees
4049 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4050 p != subtrees.end();
4051 ++p) {
4052 CDir *dir = p->first;
11fdf7f2 4053 ceph_assert(dir->is_subtree_root());
7c673cae
FG
4054 if (dir->is_ambiguous_dir_auth()) {
4055 // exporter is recovering, importer is survivor.
11fdf7f2
TL
4056 ceph_assert(rejoins.count(dir->authority().first));
4057 ceph_assert(!rejoins.count(dir->authority().second));
7c673cae
FG
4058 continue;
4059 }
4060
4061 // my subtree?
4062 if (dir->is_auth())
4063 continue; // skip my own regions!
4064
4065 mds_rank_t auth = dir->get_dir_auth().first;
11fdf7f2 4066 ceph_assert(auth >= 0);
7c673cae
FG
4067 if (rejoins.count(auth) == 0)
4068 continue; // don't care about this node's subtrees
4069
4070 rejoin_walk(dir, rejoins[auth]);
4071 }
4072
4073 // rejoin root inodes, too
11fdf7f2 4074 for (auto &p : rejoins) {
7c673cae
FG
4075 if (mds->is_rejoin()) {
4076 // weak
11fdf7f2
TL
4077 if (p.first == 0 && root) {
4078 p.second->add_weak_inode(root->vino());
7c673cae
FG
4079 if (root->is_dirty_scattered()) {
4080 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4081 p.second->add_scatterlock_state(root);
7c673cae
FG
4082 }
4083 }
11fdf7f2 4084 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
7c673cae 4085 if (in)
11fdf7f2 4086 p.second->add_weak_inode(in->vino());
7c673cae
FG
4087 }
4088 } else {
4089 // strong
11fdf7f2
TL
4090 if (p.first == 0 && root) {
4091 p.second->add_strong_inode(root->vino(),
7c673cae
FG
4092 root->get_replica_nonce(),
4093 root->get_caps_wanted(),
4094 root->filelock.get_state(),
4095 root->nestlock.get_state(),
4096 root->dirfragtreelock.get_state());
4097 root->state_set(CInode::STATE_REJOINING);
4098 if (root->is_dirty_scattered()) {
4099 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4100 p.second->add_scatterlock_state(root);
7c673cae
FG
4101 }
4102 }
4103
11fdf7f2
TL
4104 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4105 p.second->add_strong_inode(in->vino(),
7c673cae
FG
4106 in->get_replica_nonce(),
4107 in->get_caps_wanted(),
4108 in->filelock.get_state(),
4109 in->nestlock.get_state(),
4110 in->dirfragtreelock.get_state());
4111 in->state_set(CInode::STATE_REJOINING);
4112 }
4113 }
4114 }
4115
4116 if (!mds->is_rejoin()) {
4117    // i am a survivor.  send strong rejoin.
4118 // note request remote_auth_pins, xlocks
4119 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4120 p != active_requests.end();
4121 ++p) {
4122 MDRequestRef& mdr = p->second;
4123 if (mdr->is_slave())
4124 continue;
4125 // auth pins
11fdf7f2
TL
4126 for (const auto& q : mdr->remote_auth_pins) {
4127 if (!q.first->is_auth()) {
4128 ceph_assert(q.second == q.first->authority().first);
4129 if (rejoins.count(q.second) == 0) continue;
4130 const MMDSCacheRejoin::ref &rejoin = rejoins[q.second];
7c673cae 4131
11fdf7f2 4132 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
7c673cae 4133 MDSCacheObjectInfo i;
11fdf7f2 4134 q.first->set_object_info(i);
7c673cae
FG
4135 if (i.ino)
4136 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4137 else
4138 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4139
4140 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
11fdf7f2 4141 mdr->more()->rename_inode == q.first)
7c673cae
FG
4142 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4143 mdr->reqid, mdr->attempt);
4144 }
4145 }
4146 // xlocks
11fdf7f2
TL
4147 for (const auto& q : mdr->locks) {
4148 auto lock = q.lock;
4149 auto obj = lock->get_parent();
4150 if (q.is_xlock() && !obj->is_auth()) {
4151 mds_rank_t who = obj->authority().first;
7c673cae 4152 if (rejoins.count(who) == 0) continue;
11fdf7f2 4153 const MMDSCacheRejoin::ref &rejoin = rejoins[who];
7c673cae 4154
11fdf7f2 4155 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
7c673cae 4156 MDSCacheObjectInfo i;
11fdf7f2 4157 obj->set_object_info(i);
7c673cae 4158 if (i.ino)
11fdf7f2 4159 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
7c673cae
FG
4160 mdr->reqid, mdr->attempt);
4161 else
4162 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4163 mdr->reqid, mdr->attempt);
11fdf7f2
TL
4164 } else if (q.is_remote_wrlock()) {
4165 mds_rank_t who = q.wrlock_target;
4166 if (rejoins.count(who) == 0) continue;
4167 const MMDSCacheRejoin::ref &rejoin = rejoins[who];
7c673cae 4168
11fdf7f2
TL
4169 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4170 MDSCacheObjectInfo i;
4171 obj->set_object_info(i);
4172 ceph_assert(i.ino);
4173 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4174 mdr->reqid, mdr->attempt);
4175 }
7c673cae
FG
4176 }
4177 }
4178 }
4179
4180 // send the messages
11fdf7f2
TL
4181 for (auto &p : rejoins) {
4182 ceph_assert(rejoin_sent.count(p.first) == 0);
4183 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4184 rejoin_sent.insert(p.first);
4185 rejoin_ack_gather.insert(p.first);
4186 mds->send_message_mds(p.second, p.first);
7c673cae
FG
4187 }
4188 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4189 rejoins_pending = false;
4190
4191 // nothing?
28e407b8 4192 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4193 dout(10) << "nothing to rejoin" << dendl;
4194 rejoin_gather_finish();
4195 }
4196}
4197
4198
4199/**
4200 * rejoin_walk - build rejoin declarations for a subtree
4201 *
4202 * @param dir subtree root
4203 * @param rejoin rejoin message
4204 *
4205 * from a rejoining node:
4206 * weak dirfrag
4207 * weak dentries (w/ connectivity)
4208 *
4209 * from a surviving node:
4210 * strong dirfrag
4211 * strong dentries (no connectivity!)
4212 * strong inodes
4213 */
11fdf7f2 4214void MDCache::rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin)
7c673cae
FG
4215{
4216 dout(10) << "rejoin_walk " << *dir << dendl;
4217
4218 list<CDir*> nested; // finish this dir, then do nested items
4219
4220 if (mds->is_rejoin()) {
4221 // WEAK
4222 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4223 for (auto &p : dir->items) {
4224 CDentry *dn = p.second;
11fdf7f2 4225 ceph_assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4226 CDentry::linkage_t *dnl = dn->get_linkage();
4227 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
11fdf7f2 4228 ceph_assert(dnl->is_primary());
7c673cae 4229 CInode *in = dnl->get_inode();
11fdf7f2 4230 ceph_assert(dnl->get_inode()->is_dir());
94b18763 4231 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
7c673cae
FG
4232 in->get_nested_dirfrags(nested);
4233 if (in->is_dirty_scattered()) {
4234 dout(10) << " sending scatterlock state on " << *in << dendl;
4235 rejoin->add_scatterlock_state(in);
4236 }
4237 }
4238 } else {
4239 // STRONG
4240 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4241 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4242 dir->state_set(CDir::STATE_REJOINING);
4243
11fdf7f2 4244 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
94b18763 4245 CDentry *dn = it->second;
11fdf7f2
TL
4246 ++it;
4247 dn->state_set(CDentry::STATE_REJOINING);
7c673cae 4248 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2
TL
4249 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4250
4251      // trim snap dentries, because they may have been pruned by
4252 // their auth mds (snap deleted)
4253 if (dn->last != CEPH_NOSNAP) {
4254 if (in && !in->remote_parents.empty()) {
4255 // unlink any stale remote snap dentry.
4256 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4257 CDentry *remote_dn = *it2;
4258 ++it2;
4259 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4260 remote_dn->unlink_remote(remote_dn->get_linkage());
4261 }
4262 }
4263 if (dn->lru_is_expireable()) {
4264 if (!dnl->is_null())
4265 dir->unlink_inode(dn, false);
4266 if (in)
4267 remove_inode(in);
4268 dir->remove_dentry(dn);
4269 continue;
4270 } else {
4271	  // Inventing a null/remote dentry shouldn't cause problems
4272 ceph_assert(!dnl->is_primary());
4273 }
4274 }
4275
7c673cae 4276 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4277 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4278 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4279 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4280 dnl->is_remote() ? dnl->get_remote_d_type():0,
4281 dn->get_replica_nonce(),
4282 dn->lock.get_state());
4283 dn->state_set(CDentry::STATE_REJOINING);
4284 if (dnl->is_primary()) {
4285 CInode *in = dnl->get_inode();
4286 dout(15) << " add_strong_inode " << *in << dendl;
4287 rejoin->add_strong_inode(in->vino(),
4288 in->get_replica_nonce(),
4289 in->get_caps_wanted(),
4290 in->filelock.get_state(),
4291 in->nestlock.get_state(),
4292 in->dirfragtreelock.get_state());
4293 in->state_set(CInode::STATE_REJOINING);
4294 in->get_nested_dirfrags(nested);
4295 if (in->is_dirty_scattered()) {
4296 dout(10) << " sending scatterlock state on " << *in << dendl;
4297 rejoin->add_scatterlock_state(in);
4298 }
4299 }
4300 }
4301 }
4302
4303 // recurse into nested dirs
4304 for (list<CDir*>::iterator p = nested.begin();
4305 p != nested.end();
4306 ++p)
4307 rejoin_walk(*p, rejoin);
4308}
4309
4310
4311/*
4312 * i got a rejoin.
4313 * - reply with the lockstate
4314 *
4315 * if i am active|stopping,
4316 * - remove source from replica list for everything not referenced here.
7c673cae 4317 */
11fdf7f2 4318void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m)
7c673cae
FG
4319{
4320 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4321 << " (" << m->get_payload().length() << " bytes)"
4322 << dendl;
4323
4324 switch (m->op) {
4325 case MMDSCacheRejoin::OP_WEAK:
4326 handle_cache_rejoin_weak(m);
4327 break;
4328 case MMDSCacheRejoin::OP_STRONG:
4329 handle_cache_rejoin_strong(m);
4330 break;
4331 case MMDSCacheRejoin::OP_ACK:
4332 handle_cache_rejoin_ack(m);
4333 break;
4334
4335 default:
4336 ceph_abort();
4337 }
7c673cae
FG
4338}
4339
4340
4341/*
4342 * handle_cache_rejoin_weak
4343 *
4344 * the sender
4345 * - is recovering from their journal.
4346 * - may have incorrect (out of date) inode contents
4347 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4348 *
4349 * if the sender didn't trim_non_auth(), they
4350 * - may have incorrect (out of date) dentry/inode linkage
4351 * - may have deleted/purged inodes
4352 * and i may have to go to disk to get accurate inode contents. yuck.
7c673cae 4353 */
11fdf7f2 4354void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &weak)
7c673cae
FG
4355{
4356 mds_rank_t from = mds_rank_t(weak->get_source().num());
4357
4358 // possible response(s)
11fdf7f2 4359 MMDSCacheRejoin::ref ack; // if survivor
7c673cae
FG
4360 set<vinodeno_t> acked_inodes; // if survivor
4361 set<SimpleLock *> gather_locks; // if survivor
4362 bool survivor = false; // am i a survivor?
4363
4364 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4365 survivor = true;
4366    dout(10) << "i am a survivor, and will ack immediately" << dendl;
11fdf7f2 4367 ack = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
7c673cae
FG
4368
4369 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4370
4371 // check cap exports
4372 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4373 CInode *in = get_inode(p->first);
11fdf7f2 4374 ceph_assert(!in || in->is_auth());
7c673cae
FG
4375 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4376 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4377 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4378 Capability::Import& im = imported_caps[p->first][q->first];
4379 if (cap) {
4380 im.cap_id = cap->get_cap_id();
4381 im.issue_seq = cap->get_last_seq();
4382 im.mseq = cap->get_mseq();
4383 } else {
4384 // all are zero
4385 }
4386 }
4387 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4388 }
4389
11fdf7f2 4390 encode(imported_caps, ack->imported_caps);
7c673cae 4391 } else {
11fdf7f2 4392 ceph_assert(mds->is_rejoin());
7c673cae
FG
4393
4394 // we may have already received a strong rejoin from the sender.
4395 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
11fdf7f2 4396 ceph_assert(gather_locks.empty());
7c673cae
FG
4397
4398 // check cap exports.
4399 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
11fdf7f2
TL
4400 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4401 weak->client_metadata_map.end());
7c673cae
FG
4402
4403 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4404 CInode *in = get_inode(p->first);
11fdf7f2 4405 ceph_assert(!in || in->is_auth());
7c673cae
FG
4406 // note
4407 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4408 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4409 cap_imports[p->first][q->first][from] = q->second;
4410 }
4411 }
4412 }
4413
4414 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4415 for (const auto &p : weak->inode_scatterlocks) {
4416 CInode *in = get_inode(p.first);
4417 ceph_assert(in);
4418 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4419 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4420 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4421 if (!survivor)
4422 rejoin_potential_updated_scatterlocks.insert(in);
4423 }
4424
4425 // recovering peer may send incorrect dirfrags here. we need to
4426 // infer which dirfrag they meant. the ack will include a
4427 // strong_dirfrag that will set them straight on the fragmentation.
4428
4429 // walk weak map
4430 set<CDir*> dirs_to_share;
11fdf7f2
TL
4431 for (const auto &p : weak->weak_dirfrags) {
4432 CInode *diri = get_inode(p.ino);
7c673cae 4433 if (!diri)
11fdf7f2
TL
4434 dout(0) << " missing dir ino " << p.ino << dendl;
4435 ceph_assert(diri);
7c673cae 4436
11fdf7f2
TL
4437 frag_vec_t leaves;
4438 if (diri->dirfragtree.is_leaf(p.frag)) {
4439 leaves.push_back(p.frag);
7c673cae 4440 } else {
11fdf7f2
TL
4441 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4442 if (leaves.empty())
4443 leaves.push_back(diri->dirfragtree[p.frag.value()]);
7c673cae 4444 }
11fdf7f2
TL
4445 for (const auto& leaf : leaves) {
4446 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4447 if (!dir) {
11fdf7f2 4448 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
7c673cae
FG
4449 continue;
4450 }
11fdf7f2 4451 ceph_assert(dir);
7c673cae 4452 if (dirs_to_share.count(dir)) {
11fdf7f2 4453 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4454 } else {
4455 dirs_to_share.insert(dir);
4456 unsigned nonce = dir->add_replica(from);
11fdf7f2 4457 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4458 if (ack) {
4459 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4460 ack->add_dirfrag_base(dir);
4461 }
4462 }
4463 }
4464 }
4465
11fdf7f2
TL
4466 for (const auto &p : weak->weak) {
4467 CInode *diri = get_inode(p.first);
7c673cae 4468 if (!diri)
11fdf7f2
TL
4469 dout(0) << " missing dir ino " << p.first << dendl;
4470 ceph_assert(diri);
7c673cae
FG
4471
4472 // weak dentries
4473 CDir *dir = 0;
11fdf7f2 4474 for (const auto &q : p.second) {
7c673cae
FG
4475 // locate proper dirfrag.
4476 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
11fdf7f2 4477 frag_t fg = diri->pick_dirfrag(q.first.name);
7c673cae
FG
4478 if (!dir || dir->get_frag() != fg) {
4479 dir = diri->get_dirfrag(fg);
4480 if (!dir)
4481 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
11fdf7f2
TL
4482 ceph_assert(dir);
4483 ceph_assert(dirs_to_share.count(dir));
7c673cae
FG
4484 }
4485
4486 // and dentry
11fdf7f2
TL
4487 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4488 ceph_assert(dn);
7c673cae 4489 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 4490 ceph_assert(dnl->is_primary());
7c673cae
FG
4491
4492 if (survivor && dn->is_replica(from))
4493 dentry_remove_replica(dn, from, gather_locks);
4494 unsigned dnonce = dn->add_replica(from);
4495 dout(10) << " have " << *dn << dendl;
4496 if (ack)
94b18763 4497 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4498 dnl->get_inode()->ino(), inodeno_t(0), 0,
4499 dnonce, dn->lock.get_replica_state());
4500
4501 // inode
4502 CInode *in = dnl->get_inode();
11fdf7f2 4503 ceph_assert(in);
7c673cae
FG
4504
4505 if (survivor && in->is_replica(from))
4506 inode_remove_replica(in, from, true, gather_locks);
4507 unsigned inonce = in->add_replica(from);
4508 dout(10) << " have " << *in << dendl;
4509
4510 // scatter the dirlock, just in case?
4511 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4512 in->filelock.set_state(LOCK_MIX);
4513
4514 if (ack) {
4515 acked_inodes.insert(in->vino());
4516 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4517 bufferlist bl;
4518 in->_encode_locks_state_for_rejoin(bl, from);
4519 ack->add_inode_locks(in, inonce, bl);
4520 }
4521 }
4522 }
4523
4524 // weak base inodes? (root, stray, etc.)
4525 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4526 p != weak->weak_inodes.end();
4527 ++p) {
4528 CInode *in = get_inode(*p);
11fdf7f2 4529 ceph_assert(in); // hmm fixme wrt stray?
7c673cae
FG
4530 if (survivor && in->is_replica(from))
4531 inode_remove_replica(in, from, true, gather_locks);
4532 unsigned inonce = in->add_replica(from);
4533 dout(10) << " have base " << *in << dendl;
4534
4535 if (ack) {
4536 acked_inodes.insert(in->vino());
4537 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4538 bufferlist bl;
4539 in->_encode_locks_state_for_rejoin(bl, from);
4540 ack->add_inode_locks(in, inonce, bl);
4541 }
4542 }
4543
11fdf7f2 4544 ceph_assert(rejoin_gather.count(from));
7c673cae
FG
4545 rejoin_gather.erase(from);
4546 if (survivor) {
4547 // survivor. do everything now.
11fdf7f2
TL
4548 for (const auto &p : weak->inode_scatterlocks) {
4549 CInode *in = get_inode(p.first);
4550 ceph_assert(in);
7c673cae
FG
4551 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4552 acked_inodes.insert(in->vino());
4553 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4554 }
4555
4556 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4557 mds->send_message(ack, weak->get_connection());
4558
4559 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4560 if (!(*p)->is_stable())
4561 mds->locker->eval_gather(*p);
4562 }
4563 } else {
4564 // done?
28e407b8 4565 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4566 rejoin_gather_finish();
4567 } else {
4568 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4569 }
4570 }
4571}
4572
7c673cae
FG
4573/*
4574 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4575 *
4576 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4577 * ack, the replica dne, and we can remove it from our replica maps.
4578 */
11fdf7f2 4579void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
7c673cae
FG
4580 set<vinodeno_t>& acked_inodes,
4581 set<SimpleLock *>& gather_locks)
4582{
4583 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4584
b32b8144 4585 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4586 // inode?
4587 if (in->is_auth() &&
4588 in->is_replica(from) &&
b32b8144 4589 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4590 inode_remove_replica(in, from, false, gather_locks);
4591 dout(10) << " rem " << *in << dendl;
4592 }
4593
b32b8144
FG
4594 if (!in->is_dir())
4595 return;
7c673cae
FG
4596
4597 list<CDir*> dfs;
4598 in->get_dirfrags(dfs);
4599 for (list<CDir*>::iterator p = dfs.begin();
4600 p != dfs.end();
4601 ++p) {
4602 CDir *dir = *p;
181888fb
FG
4603 if (!dir->is_auth())
4604 continue;
7c673cae 4605
181888fb 4606 if (dir->is_replica(from) &&
7c673cae
FG
4607 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4608 dir->remove_replica(from);
4609 dout(10) << " rem " << *dir << dendl;
4610 }
4611
4612 // dentries
94b18763
FG
4613 for (auto &p : dir->items) {
4614 CDentry *dn = p.second;
7c673cae 4615
11fdf7f2
TL
4616 if (dn->is_replica(from)) {
4617 if (ack) {
4618 const auto it = ack->strong_dentries.find(dir->dirfrag());
4619 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4620 continue;
4621 }
4622 }
7c673cae
FG
4623 dentry_remove_replica(dn, from, gather_locks);
4624 dout(10) << " rem " << *dn << dendl;
4625 }
4626 }
4627 }
b32b8144
FG
4628 };
4629
94b18763 4630 for (auto &p : inode_map)
b32b8144 4631 scour_func(p.second);
94b18763 4632 for (auto &p : snap_inode_map)
b32b8144 4633 scour_func(p.second);
7c673cae
FG
4634}
4635
4636
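// The two helpers below fabricate placeholder inodes/dirfrags (marked
// STATE_REJOINUNDEF) for objects a peer referenced that we do not have; their
// real contents are filled in later in the rejoin process.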
4637CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4638{
4639 CInode *in = new CInode(this, true, 1, last);
4640 in->inode.ino = ino;
4641 in->state_set(CInode::STATE_REJOINUNDEF);
4642 add_inode(in);
4643 rejoin_undef_inodes.insert(in);
4644 dout(10) << " invented " << *in << dendl;
4645 return in;
4646}
4647
4648CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4649{
4650 CInode *in = get_inode(df.ino);
4651 if (!in)
4652 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4653 if (!in->is_dir()) {
11fdf7f2 4654 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
7c673cae 4655 in->inode.mode = S_IFDIR;
11fdf7f2 4656 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
4657 }
4658 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4659 dir->state_set(CDir::STATE_REJOINUNDEF);
4660 rejoin_undef_dirfrags.insert(dir);
4661 dout(10) << " invented " << *dir << dendl;
4662 return dir;
4663}
4664
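// Handle a strong rejoin from a surviving peer: record its replicas of our
// dirfrags, dentries and inodes, re-establish the auth_pins, xlocks and wrlocks
// it held on our objects by recreating slave requests, and note any inodes
// whose linkage the survivor has out of date.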
11fdf7f2 4665void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &strong)
7c673cae
FG
4666{
4667 mds_rank_t from = mds_rank_t(strong->get_source().num());
4668
4669 // only a recovering node will get a strong rejoin.
a8e16298
TL
4670 if (!mds->is_rejoin()) {
4671 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4672 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4673 return;
4674 }
11fdf7f2 4675 ceph_abort_msg("got unexpected rejoin message during recovery");
a8e16298 4676 }
7c673cae
FG
4677
4678 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4679 for (const auto &p : strong->inode_scatterlocks) {
4680 CInode *in = get_inode(p.first);
4681 ceph_assert(in);
4682 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4683 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4684 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4685 rejoin_potential_updated_scatterlocks.insert(in);
4686 }
4687
4688 rejoin_unlinked_inodes[from].clear();
4689
4690 // surviving peer may send incorrect dirfrag here (maybe they didn't
4691 // get the fragment notify, or maybe we rolled back?). we need to
4692 // infer the right frag and get them with the program. somehow.
4693 // we don't normally send ACK.. so we'll need to bundle this with
4694 // MISSING or something.
4695
4696 // strong dirfrags/dentries.
4697 // also process auth_pins, xlocks.
11fdf7f2
TL
4698 for (const auto &p : strong->strong_dirfrags) {
4699 auto& dirfrag = p.first;
4700 CInode *diri = get_inode(dirfrag.ino);
7c673cae 4701 if (!diri)
11fdf7f2
TL
4702 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4703 CDir *dir = diri->get_dirfrag(dirfrag.frag);
7c673cae
FG
4704 bool refragged = false;
4705 if (dir) {
4706 dout(10) << " have " << *dir << dendl;
4707 } else {
4708 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4709 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
11fdf7f2
TL
4710 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4711 dir = rejoin_invent_dirfrag(dirfrag);
7c673cae
FG
4712 }
4713 if (dir) {
11fdf7f2
TL
4714 dir->add_replica(from, p.second.nonce);
4715 dir->dir_rep = p.second.dir_rep;
7c673cae 4716 } else {
11fdf7f2
TL
4717 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4718 frag_vec_t leaves;
4719 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4720 if (leaves.empty())
4721 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4722 dout(10) << " maps to frag(s) " << leaves << dendl;
4723 for (const auto& leaf : leaves) {
4724 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4725 if (!dir)
11fdf7f2 4726 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
7c673cae
FG
4727 else
4728 dout(10) << " have(approx) " << *dir << dendl;
11fdf7f2
TL
4729 dir->add_replica(from, p.second.nonce);
4730 dir->dir_rep = p.second.dir_rep;
7c673cae
FG
4731 }
4732 refragged = true;
4733 }
4734
11fdf7f2
TL
4735 const auto it = strong->strong_dentries.find(dirfrag);
4736 if (it != strong->strong_dentries.end()) {
4737 const map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = it->second;
4738 for (const auto &q : dmap) {
4739 const string_snap_t& ss = q.first;
4740 const MMDSCacheRejoin::dn_strong& d = q.second;
4741 CDentry *dn;
4742 if (!refragged)
4743 dn = dir->lookup(ss.name, ss.snapid);
4744 else {
4745 frag_t fg = diri->pick_dirfrag(ss.name);
4746 dir = diri->get_dirfrag(fg);
4747 ceph_assert(dir);
4748 dn = dir->lookup(ss.name, ss.snapid);
4749 }
4750 if (!dn) {
4751 if (d.is_remote()) {
4752 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4753 } else if (d.is_null()) {
4754 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4755 } else {
4756 CInode *in = get_inode(d.ino, ss.snapid);
4757 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4758 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4759 }
4760 dout(10) << " invented " << *dn << dendl;
4761 }
4762 CDentry::linkage_t *dnl = dn->get_linkage();
4763
4764 // dn auth_pin?
4765 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4766 if (pinned_it != strong->authpinned_dentries.end()) {
4767 const auto slave_reqid_it = pinned_it->second.find(ss);
4768 if (slave_reqid_it != pinned_it->second.end()) {
4769 for (const auto &r : slave_reqid_it->second) {
4770 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4771
4772 // get/create slave mdrequest
4773 MDRequestRef mdr;
4774 if (have_request(r.reqid))
4775 mdr = request_get(r.reqid);
4776 else
4777 mdr = request_start_slave(r.reqid, r.attempt, strong);
4778 mdr->auth_pin(dn);
4779 }
4780 }
7c673cae 4781 }
7c673cae 4782
11fdf7f2
TL
4783 // dn xlock?
4784 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4785 if (xlocked_it != strong->xlocked_dentries.end()) {
4786 const auto ss_req_it = xlocked_it->second.find(ss);
4787 if (ss_req_it != xlocked_it->second.end()) {
4788 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4789 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4790 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4791 ceph_assert(mdr->is_auth_pinned(dn));
4792 if (!mdr->is_xlocked(&dn->versionlock)) {
4793 ceph_assert(dn->versionlock.can_xlock_local());
4794 dn->versionlock.get_xlock(mdr, mdr->get_client());
4795 mdr->locks.emplace(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4796 }
4797 if (dn->lock.is_stable())
4798 dn->auth_pin(&dn->lock);
4799 dn->lock.set_state(LOCK_XLOCK);
4800 dn->lock.get_xlock(mdr, mdr->get_client());
4801 mdr->locks.emplace(&dn->lock, MutationImpl::LockOp::XLOCK);
4802 }
4803 }
7c673cae 4804
11fdf7f2
TL
4805 dn->add_replica(from, d.nonce);
4806 dout(10) << " have " << *dn << dendl;
4807
4808 if (dnl->is_primary()) {
4809 if (d.is_primary()) {
4810 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4811 // the survivor missed MDentryUnlink+MDentryLink messages ?
4812 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4813 CInode *in = get_inode(d.ino, ss.snapid);
4814 ceph_assert(in);
4815 ceph_assert(in->get_parent_dn());
4816 rejoin_unlinked_inodes[from].insert(in);
4817 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4818 }
4819 } else {
4820 // the survivor missed MDentryLink message ?
4821 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4822	  dout(7) << " sender doesn't have primary dentry" << dendl;
4823 }
4824 } else {
4825 if (d.is_primary()) {
4826 // the survivor missed MDentryUnlink message ?
4827 CInode *in = get_inode(d.ino, ss.snapid);
4828 ceph_assert(in);
4829 ceph_assert(in->get_parent_dn());
7c673cae 4830 rejoin_unlinked_inodes[from].insert(in);
11fdf7f2 4831 dout(7) << " sender has primary dentry but we don't" << dendl;
7c673cae 4832 }
11fdf7f2 4833 }
7c673cae
FG
4834 }
4835 }
4836 }
4837
11fdf7f2
TL
4838 for (const auto &p : strong->strong_inodes) {
4839 CInode *in = get_inode(p.first);
4840 ceph_assert(in);
4841 in->add_replica(from, p.second.nonce);
7c673cae
FG
4842 dout(10) << " have " << *in << dendl;
4843
11fdf7f2 4844 const MMDSCacheRejoin::inode_strong& is = p.second;
7c673cae
FG
4845
4846 // caps_wanted
4847 if (is.caps_wanted) {
11fdf7f2 4848 in->set_mds_caps_wanted(from, is.caps_wanted);
7c673cae
FG
4849 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4850 << " on " << *in << dendl;
4851 }
4852
4853 // scatterlocks?
4854 // infer state from replica state:
4855 // * go to MIX if they might have wrlocks
4856  //  * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4857 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4858 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4859 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4860
4861 // auth pin?
11fdf7f2
TL
4862 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4863 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4864 for (const auto& r : authpinned_inodes_it->second) {
4865 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
7c673cae
FG
4866
4867 // get/create slave mdrequest
4868 MDRequestRef mdr;
11fdf7f2
TL
4869 if (have_request(r.reqid))
4870 mdr = request_get(r.reqid);
7c673cae 4871 else
11fdf7f2 4872 mdr = request_start_slave(r.reqid, r.attempt, strong);
7c673cae 4873 if (strong->frozen_authpin_inodes.count(in->vino())) {
11fdf7f2 4874 ceph_assert(!in->get_num_auth_pins());
7c673cae
FG
4875 mdr->freeze_auth_pin(in);
4876 } else {
11fdf7f2 4877 ceph_assert(!in->is_frozen_auth_pin());
7c673cae
FG
4878 }
4879 mdr->auth_pin(in);
4880 }
4881 }
4882 // xlock(s)?
11fdf7f2
TL
4883 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4884 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4885 for (const auto &q : xlocked_inodes_it->second) {
4886 SimpleLock *lock = in->get_lock(q.first);
4887 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4888 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4889 ceph_assert(mdr->is_auth_pinned(in));
4890 if (!mdr->is_xlocked(&in->versionlock)) {
4891 ceph_assert(in->versionlock.can_xlock_local());
7c673cae 4892 in->versionlock.get_xlock(mdr, mdr->get_client());
11fdf7f2 4893 mdr->locks.emplace(&in->versionlock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4894 }
4895 if (lock->is_stable())
4896 in->auth_pin(lock);
4897 lock->set_state(LOCK_XLOCK);
4898 if (lock == &in->filelock)
4899 in->loner_cap = -1;
4900 lock->get_xlock(mdr, mdr->get_client());
11fdf7f2 4901 mdr->locks.emplace(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4902 }
4903 }
4904 }
4905 // wrlock(s)?
11fdf7f2
TL
4906 for (const auto &p : strong->wrlocked_inodes) {
4907 CInode *in = get_inode(p.first);
4908 for (const auto &q : p.second) {
4909 SimpleLock *lock = in->get_lock(q.first);
4910 for (const auto &r : q.second) {
4911 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4912 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
7c673cae 4913 if (in->is_auth())
11fdf7f2 4914 ceph_assert(mdr->is_auth_pinned(in));
7c673cae
FG
4915 lock->set_state(LOCK_MIX);
4916 if (lock == &in->filelock)
4917 in->loner_cap = -1;
4918 lock->get_wrlock(true);
11fdf7f2 4919 mdr->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
7c673cae
FG
4920 }
4921 }
4922 }
4923
4924 // done?
11fdf7f2 4925 ceph_assert(rejoin_gather.count(from));
7c673cae 4926 rejoin_gather.erase(from);
28e407b8 4927 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4928 rejoin_gather_finish();
4929 } else {
4930 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4931 }
4932}
4933
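/*
 * handle an OP_ACK cache rejoin from another mds: adopt its view of dirfrag
 * fragmentation, repair stale dentry linkage, decode the full inode bases and
 * lock states it sent, tell clients about caps we exported to it, and note
 * any snaprealms that changed so they can be opened or clients notified.
 */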
11fdf7f2 4934void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &ack)
7c673cae
FG
4935{
4936 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4937 mds_rank_t from = mds_rank_t(ack->get_source().num());
4938
11fdf7f2 4939 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
b32b8144
FG
4940 bool survivor = !mds->is_rejoin();
4941
7c673cae
FG
4942 // for sending cache expire message
4943 set<CInode*> isolated_inodes;
4944 set<CInode*> refragged_inodes;
11fdf7f2 4945 list<pair<CInode*,int> > updated_realms;
7c673cae
FG
4946
4947 // dirs
11fdf7f2 4948 for (const auto &p : ack->strong_dirfrags) {
7c673cae
FG
4949 // we may have had incorrect dir fragmentation; refragment based
 4950    // on what the auth tells us.
11fdf7f2 4951 CDir *dir = get_dirfrag(p.first);
7c673cae 4952 if (!dir) {
11fdf7f2 4953 dir = get_force_dirfrag(p.first, false);
7c673cae
FG
4954 if (dir)
4955 refragged_inodes.insert(dir->get_inode());
4956 }
4957 if (!dir) {
11fdf7f2 4958 CInode *diri = get_inode(p.first.ino);
7c673cae
FG
4959 if (!diri) {
4960 // barebones inode; the full inode loop below will clean up.
4961 diri = new CInode(this, false);
11fdf7f2 4962 diri->inode.ino = p.first.ino;
7c673cae 4963 diri->inode.mode = S_IFDIR;
11fdf7f2 4964 diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 4965 add_inode(diri);
11fdf7f2 4966 if (MDS_INO_MDSDIR(from) == p.first.ino) {
7c673cae
FG
4967 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4968 dout(10) << " add inode " << *diri << dendl;
4969 } else {
4970 diri->inode_auth = CDIR_AUTH_DEFAULT;
4971 isolated_inodes.insert(diri);
11fdf7f2 4972 dout(10) << " unconnected dirfrag " << p.first << dendl;
7c673cae
FG
4973 }
4974 }
4975 // barebones dirfrag; the full dirfrag loop below will clean up.
11fdf7f2
TL
4976 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4977 if (MDS_INO_MDSDIR(from) == p.first.ino ||
7c673cae
FG
4978 (dir->authority() != CDIR_AUTH_UNDEF &&
4979 dir->authority().first != from))
4980 adjust_subtree_auth(dir, from);
4981 dout(10) << " add dirfrag " << *dir << dendl;
4982 }
4983
11fdf7f2 4984 dir->set_replica_nonce(p.second.nonce);
7c673cae
FG
4985 dir->state_clear(CDir::STATE_REJOINING);
4986 dout(10) << " got " << *dir << dendl;
4987
4988 // dentries
11fdf7f2
TL
4989 auto it = ack->strong_dentries.find(p.first);
4990 if (it != ack->strong_dentries.end()) {
4991 for (const auto &q : it->second) {
4992 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4993 if(!dn)
4994 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
4995
4996 CDentry::linkage_t *dnl = dn->get_linkage();
4997
4998 ceph_assert(dn->last == q.first.snapid);
4999 if (dn->first != q.second.first) {
5000 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5001 dn->first = q.second.first;
5002 }
7c673cae 5003
11fdf7f2
TL
5004 // may have bad linkage if we missed dentry link/unlink messages
5005 if (dnl->is_primary()) {
5006 CInode *in = dnl->get_inode();
5007 if (!q.second.is_primary() ||
5008 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5009 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5010 dir->unlink_inode(dn);
5011 }
5012 } else if (dnl->is_remote()) {
5013 if (!q.second.is_remote() ||
5014 q.second.remote_ino != dnl->get_remote_ino() ||
5015 q.second.remote_d_type != dnl->get_remote_d_type()) {
5016 dout(10) << " had bad linkage for " << *dn << dendl;
5017 dir->unlink_inode(dn);
5018 }
5019 } else {
5020 if (!q.second.is_null())
5021 dout(10) << " had bad linkage for " << *dn << dendl;
5022 }
7c673cae 5023
11fdf7f2
TL
5024 // hmm, did we have the proper linkage here?
5025 if (dnl->is_null() && !q.second.is_null()) {
5026 if (q.second.is_remote()) {
5027 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5028 } else {
5029 CInode *in = get_inode(q.second.ino, q.first.snapid);
5030 if (!in) {
 5031	    // barebones inode; assume it's a dir, the full inode loop below will clean up.
5032 in = new CInode(this, false, q.second.first, q.first.snapid);
5033 in->inode.ino = q.second.ino;
5034 in->inode.mode = S_IFDIR;
5035 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5036 add_inode(in);
5037 dout(10) << " add inode " << *in << dendl;
5038 } else if (in->get_parent_dn()) {
5039 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5040 << ", unlinking " << *in << dendl;
5041 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5042 }
5043 dn->dir->link_primary_inode(dn, in);
5044 isolated_inodes.erase(in);
7c673cae 5045 }
11fdf7f2 5046 }
7c673cae 5047
11fdf7f2
TL
5048 dn->set_replica_nonce(q.second.nonce);
5049 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5050 dn->state_clear(CDentry::STATE_REJOINING);
5051 dout(10) << " got " << *dn << dendl;
5052 }
7c673cae
FG
5053 }
5054 }
5055
5056 for (set<CInode*>::iterator p = refragged_inodes.begin();
5057 p != refragged_inodes.end();
5058 ++p) {
5059 list<CDir*> ls;
5060 (*p)->get_nested_dirfrags(ls);
5061 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5062 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5063 continue;
11fdf7f2 5064 ceph_assert((*q)->get_num_any() == 0);
7c673cae
FG
5065 (*p)->close_dirfrag((*q)->get_frag());
5066 }
5067 }
5068
5069 // full dirfrags
11fdf7f2
TL
5070 for (const auto &p : ack->dirfrag_bases) {
5071 CDir *dir = get_dirfrag(p.first);
5072 ceph_assert(dir);
5073 auto q = p.second.cbegin();
7c673cae
FG
5074 dir->_decode_base(q);
5075 dout(10) << " got dir replica " << *dir << dendl;
5076 }
5077
5078 // full inodes
11fdf7f2 5079 auto p = ack->inode_base.cbegin();
7c673cae
FG
5080 while (!p.end()) {
5081 inodeno_t ino;
5082 snapid_t last;
5083 bufferlist basebl;
11fdf7f2
TL
5084 decode(ino, p);
5085 decode(last, p);
5086 decode(basebl, p);
7c673cae 5087 CInode *in = get_inode(ino, last);
11fdf7f2
TL
5088 ceph_assert(in);
5089 auto q = basebl.cbegin();
5090 snapid_t sseq = 0;
5091 if (in->snaprealm)
5092 sseq = in->snaprealm->srnode.seq;
7c673cae 5093 in->_decode_base(q);
11fdf7f2
TL
5094 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5095 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5096 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5097 }
7c673cae
FG
5098 dout(10) << " got inode base " << *in << dendl;
5099 }
5100
5101 // inodes
11fdf7f2 5102 p = ack->inode_locks.cbegin();
7c673cae
FG
5103 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5104 while (!p.end()) {
5105 inodeno_t ino;
5106 snapid_t last;
5107 __u32 nonce;
5108 bufferlist lockbl;
11fdf7f2
TL
5109 decode(ino, p);
5110 decode(last, p);
5111 decode(nonce, p);
5112 decode(lockbl, p);
7c673cae
FG
5113
5114 CInode *in = get_inode(ino, last);
11fdf7f2 5115 ceph_assert(in);
7c673cae 5116 in->set_replica_nonce(nonce);
11fdf7f2 5117 auto q = lockbl.cbegin();
b32b8144 5118 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5119 in->state_clear(CInode::STATE_REJOINING);
5120 dout(10) << " got inode locks " << *in << dendl;
5121 }
5122
 5123  // FIXME: This can happen if an entire subtree, together with the inode the subtree
11fdf7f2 5124  // root belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
11fdf7f2 5125 ceph_assert(isolated_inodes.empty());
7c673cae
FG
5126
5127 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
11fdf7f2
TL
5128 auto bp = ack->imported_caps.cbegin();
5129 decode(peer_imported, bp);
7c673cae
FG
5130
5131 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5132 p != peer_imported.end();
5133 ++p) {
28e407b8 5134 auto& ex = cap_exports.at(p->first);
11fdf7f2 5135 ceph_assert(ex.first == from);
7c673cae
FG
5136 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5137 q != p->second.end();
5138 ++q) {
28e407b8 5139 auto r = ex.second.find(q->first);
11fdf7f2 5140 ceph_assert(r != ex.second.end());
7c673cae
FG
5141
5142 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5143 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5144 if (!session) {
5145 dout(10) << " no session for client." << p->first << dendl;
5146 ex.second.erase(r);
5147 continue;
5148 }
7c673cae
FG
5149
5150 // mark client caps stale.
11fdf7f2 5151 auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5152 r->second.capinfo.cap_id, 0,
7c673cae
FG
5153 mds->get_osd_epoch_barrier());
5154 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5155 (q->second.cap_id > 0 ? from : -1), 0);
5156 mds->send_message_client_counted(m, session);
5157
28e407b8 5158 ex.second.erase(r);
7c673cae 5159 }
11fdf7f2
TL
5160 ceph_assert(ex.second.empty());
5161 }
5162
5163 for (auto p : updated_realms) {
5164 CInode *in = p.first;
5165 bool notify_clients;
5166 if (mds->is_rejoin()) {
5167 if (!rejoin_pending_snaprealms.count(in)) {
5168 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5169 rejoin_pending_snaprealms.insert(in);
5170 }
5171 notify_clients = false;
5172 } else {
5173 // notify clients if I'm survivor
5174 notify_clients = true;
5175 }
5176 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
7c673cae
FG
5177 }
5178
5179 // done?
11fdf7f2 5180 ceph_assert(rejoin_ack_gather.count(from));
7c673cae 5181 rejoin_ack_gather.erase(from);
b32b8144 5182 if (!survivor) {
7c673cae
FG
5183 if (rejoin_gather.empty()) {
5184 // eval unstable scatter locks after all wrlocks are rejoined.
5185 while (!rejoin_eval_locks.empty()) {
5186 SimpleLock *lock = rejoin_eval_locks.front();
5187 rejoin_eval_locks.pop_front();
5188 if (!lock->is_stable())
5189 mds->locker->eval_gather(lock);
5190 }
5191 }
5192
5193 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5194 rejoin_ack_gather.empty()) {
5195 // finally, kickstart past snap parent opens
11fdf7f2 5196 open_snaprealms();
7c673cae
FG
5197 } else {
5198 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5199 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5200 }
5201 } else {
5202 // survivor.
5203 mds->queue_waiters(rejoin_waiters);
5204 }
5205}
5206
5207/**
5208 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5209 *
5210 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5211 * messages that clean these guys up...
5212 */
5213void MDCache::rejoin_trim_undef_inodes()
5214{
5215 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5216
5217 while (!rejoin_undef_inodes.empty()) {
5218 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5219 CInode *in = *p;
5220 rejoin_undef_inodes.erase(p);
5221
5222 in->clear_replica_map();
5223
5224 // close out dirfrags
5225 if (in->is_dir()) {
5226 list<CDir*> dfls;
5227 in->get_dirfrags(dfls);
5228 for (list<CDir*>::iterator p = dfls.begin();
5229 p != dfls.end();
5230 ++p) {
5231 CDir *dir = *p;
5232 dir->clear_replica_map();
5233
94b18763
FG
5234 for (auto &p : dir->items) {
5235 CDentry *dn = p.second;
7c673cae
FG
5236 dn->clear_replica_map();
5237
5238 dout(10) << " trimming " << *dn << dendl;
5239 dir->remove_dentry(dn);
5240 }
5241
5242 dout(10) << " trimming " << *dir << dendl;
5243 in->close_dirfrag(dir->dirfrag().frag);
5244 }
5245 }
5246
5247 CDentry *dn = in->get_parent_dn();
5248 if (dn) {
5249 dn->clear_replica_map();
5250 dout(10) << " trimming " << *dn << dendl;
5251 dn->dir->remove_dentry(dn);
5252 } else {
5253 dout(10) << " trimming " << *in << dendl;
5254 remove_inode(in);
5255 }
5256 }
5257
11fdf7f2 5258 ceph_assert(rejoin_undef_inodes.empty());
7c673cae
FG
5259}
5260
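/*
 * run on the rejoining mds once rejoin messages from all recovering peers have
 * been processed: fetch undefined inodes/dirfrags, import reconnected caps,
 * choose lock states, queue file size recovery and send our rejoin acks; if no
 * acks are still outstanding, go on to open snaprealms.
 */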
5261void MDCache::rejoin_gather_finish()
5262{
5263 dout(10) << "rejoin_gather_finish" << dendl;
11fdf7f2
TL
5264 ceph_assert(mds->is_rejoin());
5265 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5266
5267 if (open_undef_inodes_dirfrags())
5268 return;
5269
5270 if (process_imported_caps())
5271 return;
5272
5273 choose_lock_states_and_reconnect_caps();
5274
5275 identify_files_to_recover();
5276 rejoin_send_acks();
5277
5278 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5279 rejoin_ack_gather.erase(mds->get_nodeid());
5280
5281 // did we already get our acks too?
5282 if (rejoin_ack_gather.empty()) {
11fdf7f2
TL
5283 // finally, open snaprealms
5284 open_snaprealms();
7c673cae
FG
5285 }
5286}
5287
5288class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5289 inodeno_t ino;
5290public:
5291 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5292 void finish(int r) override {
5293 mdcache->rejoin_open_ino_finish(ino, r);
5294 }
5295};
5296
5297void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5298{
5299 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5300
5301 if (ret < 0) {
5302 cap_imports_missing.insert(ino);
5303 } else if (ret == mds->get_nodeid()) {
11fdf7f2 5304 ceph_assert(get_inode(ino));
7c673cae
FG
5305 } else {
5306 auto p = cap_imports.find(ino);
11fdf7f2 5307 ceph_assert(p != cap_imports.end());
7c673cae 5308 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
11fdf7f2
TL
5309 ceph_assert(q->second.count(MDS_RANK_NONE));
5310 ceph_assert(q->second.size() == 1);
7c673cae
FG
5311 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5312 }
5313 cap_imports.erase(p);
5314 }
5315
11fdf7f2 5316 ceph_assert(cap_imports_num_opening > 0);
7c673cae
FG
5317 cap_imports_num_opening--;
5318
5319 if (cap_imports_num_opening == 0) {
5320 if (rejoin_gather.empty())
5321 rejoin_gather_finish();
5322 else if (rejoin_gather.count(mds->get_nodeid()))
5323 process_imported_caps();
5324 }
5325}
5326
5327class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5328public:
28e407b8
AA
5329 map<client_t,pair<Session*,uint64_t> > session_map;
5330 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae 5331 void finish(int r) override {
11fdf7f2 5332 ceph_assert(r == 0);
28e407b8 5333 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5334 }
5335};
5336
28e407b8 5337void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5338{
5339 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5340 mds->server->finish_force_open_sessions(session_map);
5341 rejoin_session_map.swap(session_map);
7c673cae
FG
5342 if (rejoin_gather.empty())
5343 rejoin_gather_finish();
5344}
5345
11fdf7f2
TL
5346void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5347{
5348 auto p = cap_imports.find(ino);
5349 if (p != cap_imports.end()) {
5350 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5351 if (ret < 0) {
5352 cap_imports_missing.insert(ino);
5353 } else if (ret != mds->get_nodeid()) {
5354 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5355 ceph_assert(q->second.count(MDS_RANK_NONE));
5356 ceph_assert(q->second.size() == 1);
5357 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5358 }
5359 cap_imports.erase(p);
5360 }
5361 }
5362}
5363
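/*
 * reconnect the client caps gathered during rejoin. returns true if we still
 * have to wait -- for the open file table prefetch, for missing inodes to be
 * opened, or for the forced session opens to be journaled -- and false once
 * every import has been processed or deferred.
 */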
7c673cae
FG
5364bool MDCache::process_imported_caps()
5365{
5366 dout(10) << "process_imported_caps" << dendl;
5367
11fdf7f2
TL
5368 if (!open_file_table.is_prefetched() &&
5369 open_file_table.prefetch_inodes()) {
5370 open_file_table.wait_for_prefetch(
5371 new MDSInternalContextWrapper(mds,
5372 new FunctionContext([this](int r) {
5373 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5374 process_imported_caps();
5375 })
5376 )
5377 );
5378 return true;
5379 }
5380
7c673cae
FG
5381 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5382 CInode *in = get_inode(p->first);
5383 if (in) {
11fdf7f2 5384 ceph_assert(in->is_auth());
7c673cae
FG
5385 cap_imports_missing.erase(p->first);
5386 continue;
5387 }
5388 if (cap_imports_missing.count(p->first) > 0)
5389 continue;
5390
5391 cap_imports_num_opening++;
5392 dout(10) << " opening missing ino " << p->first << dendl;
5393 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
28e407b8
AA
5394 if (!(cap_imports_num_opening % 1000))
5395 mds->heartbeat_reset();
7c673cae
FG
5396 }
5397
5398 if (cap_imports_num_opening > 0)
5399 return true;
5400
5401 // called by rejoin_gather_finish() ?
5402 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5403 if (!rejoin_client_map.empty() &&
5404 rejoin_session_map.empty()) {
5405 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5406 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
11fdf7f2 5407 rejoin_client_metadata_map,
28e407b8 5408 finish->session_map);
11fdf7f2
TL
5409 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5410 std::move(rejoin_client_metadata_map));
5411 mds->mdlog->start_submit_entry(le, finish);
28e407b8
AA
5412 mds->mdlog->flush();
5413 rejoin_client_map.clear();
11fdf7f2 5414 rejoin_client_metadata_map.clear();
28e407b8 5415 return true;
7c673cae 5416 }
7c673cae
FG
5417
5418 // process caps that were exported by slave rename
5419 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5420 p != rejoin_slave_exports.end();
5421 ++p) {
5422 CInode *in = get_inode(p->first);
11fdf7f2 5423 ceph_assert(in);
7c673cae
FG
5424 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5425 q != p->second.second.end();
5426 ++q) {
28e407b8
AA
5427 auto r = rejoin_session_map.find(q->first);
5428 if (r == rejoin_session_map.end())
5429 continue;
7c673cae 5430
28e407b8 5431 Session *session = r->second.first;
7c673cae 5432 Capability *cap = in->get_client_cap(q->first);
11fdf7f2 5433 if (!cap) {
7c673cae 5434 cap = in->add_client_cap(q->first, session);
11fdf7f2
TL
5435 // add empty item to reconnected_caps
5436 (void)reconnected_caps[p->first][q->first];
5437 }
7c673cae
FG
5438 cap->merge(q->second, true);
5439
5440 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
11fdf7f2
TL
5441 ceph_assert(cap->get_last_seq() == im.issue_seq);
5442 ceph_assert(cap->get_mseq() == im.mseq);
7c673cae
FG
5443 cap->set_cap_id(im.cap_id);
5444 // send cap import because we assigned a new cap ID
5445 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5446 p->second.first, CEPH_CAP_FLAG_AUTH);
5447 }
5448 }
5449 rejoin_slave_exports.clear();
5450 rejoin_imported_caps.clear();
5451
5452 // process cap imports
5453 // ino -> client -> frommds -> capex
5454 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5455 CInode *in = get_inode(p->first);
5456 if (!in) {
5457 dout(10) << " still missing ino " << p->first
5458 << ", will try again after replayed client requests" << dendl;
5459 ++p;
5460 continue;
5461 }
11fdf7f2 5462 ceph_assert(in->is_auth());
7c673cae 5463 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5464 Session *session;
5465 {
5466 auto r = rejoin_session_map.find(q->first);
5467 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5468 }
5469
7c673cae 5470 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5471 if (!session) {
5472 if (r->first >= 0)
5473 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5474 continue;
5475 }
5476
7c673cae
FG
5477 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5478 add_reconnected_cap(q->first, in->ino(), r->second);
5479 if (r->first >= 0) {
5480 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5481 cap->inc_mseq();
5482 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5483
5484 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5485 im.cap_id = cap->get_cap_id();
5486 im.issue_seq = cap->get_last_seq();
5487 im.mseq = cap->get_mseq();
5488 }
5489 }
5490 }
5491 cap_imports.erase(p++); // remove and move on
5492 }
5493 } else {
5494 trim_non_auth();
5495
11fdf7f2 5496 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5497 rejoin_gather.erase(mds->get_nodeid());
11fdf7f2 5498 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5499 maybe_send_pending_rejoins();
7c673cae
FG
5500 }
5501 return false;
5502}
5503
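/*
 * a reconnected cap reported dirty data for snaps following 'snap_follows'.
 * walk the older snapshotted inodes in that range and re-register the pending
 * snapflushes, wrlocking the inode locks so the flushes can be journaled later.
 */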
7c673cae
FG
5504void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5505 client_t client, snapid_t snap_follows)
5506{
5507 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5508
11fdf7f2
TL
5509 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5510 return;
5511
7c673cae
FG
5512 const set<snapid_t>& snaps = realm->get_snaps();
5513 snapid_t follows = snap_follows;
5514
5515 while (true) {
5516 CInode *in = pick_inode_snap(head_in, follows);
5517 if (in == head_in)
5518 break;
11fdf7f2
TL
5519
5520 bool need_snapflush = false;
5521 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5522 p != snaps.end() && *p <= in->last;
5523 ++p) {
5524 head_in->add_need_snapflush(in, *p, client);
5525 need_snapflush = true;
5526 }
5527 follows = in->last;
5528 if (!need_snapflush)
5529 continue;
5530
7c673cae
FG
5531 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5532
eafe8130
TL
5533 if (in->client_snap_caps.empty()) {
5534 for (int i = 0; i < num_cinode_locks; i++) {
5535 int lockid = cinode_lock_info[i].lock;
5536 SimpleLock *lock = in->get_lock(lockid);
5537 ceph_assert(lock);
5538 in->auth_pin(lock);
5539 lock->set_state(LOCK_SNAP_SYNC);
5540 lock->get_wrlock(true);
5541 }
7c673cae 5542 }
eafe8130 5543 in->client_snap_caps.insert(client);
11fdf7f2 5544 mds->locker->mark_need_snapflush_inode(in);
7c673cae
FG
5545 }
5546}
5547
5548/*
5549 * choose lock states based on reconnected caps
5550 */
5551void MDCache::choose_lock_states_and_reconnect_caps()
5552{
5553 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5554
81eedcae 5555 int count = 0;
11fdf7f2
TL
5556 for (auto p : inode_map) {
5557 CInode *in = p.second;
7c673cae
FG
5558 if (in->last != CEPH_NOSNAP)
5559 continue;
5560
5561 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5562 in->mark_dirty_rstat();
5563
7c673cae 5564 int dirty_caps = 0;
11fdf7f2
TL
5565 auto q = reconnected_caps.find(in->ino());
5566 if (q != reconnected_caps.end()) {
5567 for (const auto &it : q->second)
7c673cae
FG
5568 dirty_caps |= it.second.dirty_caps;
5569 }
5570 in->choose_lock_states(dirty_caps);
5571 dout(15) << " chose lock states on " << *in << dendl;
5572
11fdf7f2
TL
5573 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5574 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5575 rejoin_pending_snaprealms.insert(in);
7c673cae 5576 }
81eedcae
TL
5577
5578 if (!(++count % 1000))
5579 mds->heartbeat_reset();
11fdf7f2 5580 }
7c673cae
FG
5581}
5582
5583void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
11fdf7f2 5584 map<client_t,MClientSnap::ref>& splits)
7c673cae 5585{
11fdf7f2
TL
5586 MClientSnap::ref snap;
5587 auto it = splits.find(client);
5588 if (it != splits.end()) {
5589 snap = it->second;
5590 snap->head.op = CEPH_SNAP_OP_SPLIT;
5591 } else {
5592 snap = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
5593 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae 5594 snap->head.split = realm->inode->ino();
11fdf7f2 5595 snap->bl = realm->get_snap_trace();
7c673cae 5596
11fdf7f2
TL
5597 for (const auto& child : realm->open_children)
5598 snap->split_realms.push_back(child->inode->ino());
5599 }
7c673cae
FG
5600 snap->split_inos.push_back(ino);
5601}
5602
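/*
 * a snaprealm is being folded into parent_realm: build per-client SPLIT
 * messages that move every inode with caps (and every open child realm)
 * under the parent realm's snap trace.
 */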
11fdf7f2
TL
5603void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5604 map<client_t,MClientSnap::ref>& splits)
5605{
5606 ceph_assert(parent_realm);
5607
5608 vector<inodeno_t> split_inos;
5609 vector<inodeno_t> split_realms;
5610
5611 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5612 !p.end();
5613 ++p)
5614 split_inos.push_back((*p)->ino());
5615 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5616 p != realm->open_children.end();
5617 ++p)
5618 split_realms.push_back((*p)->inode->ino());
5619
5620 for (const auto& p : realm->client_caps) {
5621 ceph_assert(!p.second->empty());
5622 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5623 if (em.second) {
5624 auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
5625 update->head.split = parent_realm->inode->ino();
5626 update->split_inos = split_inos;
5627 update->split_realms = split_realms;
5628 update->bl = parent_realm->get_snap_trace();
5629 em.first->second = std::move(update);
5630 }
5631 }
5632}
5633
5634void MDCache::send_snaps(map<client_t,MClientSnap::ref>& splits)
7c673cae
FG
5635{
5636 dout(10) << "send_snaps" << dendl;
5637
11fdf7f2
TL
5638 for (auto &p : splits) {
5639 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
7c673cae 5640 if (session) {
11fdf7f2
TL
5641 dout(10) << " client." << p.first
5642 << " split " << p.second->head.split
5643 << " inos " << p.second->split_inos
7c673cae 5644 << dendl;
11fdf7f2 5645 mds->send_message_client_counted(p.second, session);
7c673cae 5646 } else {
11fdf7f2 5647 dout(10) << " no session for client." << p.first << dendl;
7c673cae
FG
5648 }
5649 }
5650 splits.clear();
5651}
5652
5653
5654/*
5655 * remove any items from logsegment open_file lists that don't have
5656 * any caps
5657 */
5658void MDCache::clean_open_file_lists()
5659{
5660 dout(10) << "clean_open_file_lists" << dendl;
5661
5662 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5663 p != mds->mdlog->segments.end();
5664 ++p) {
5665 LogSegment *ls = p->second;
5666
5667 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5668 while (!q.end()) {
5669 CInode *in = *q;
5670 ++q;
5671 if (in->last == CEPH_NOSNAP) {
11fdf7f2
TL
5672 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5673 in->item_open_file.remove_myself();
5674 } else {
7c673cae
FG
5675 if (in->client_snap_caps.empty()) {
5676 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5677 in->item_open_file.remove_myself();
5678 }
5679 }
5680 }
5681 }
5682}
5683
11fdf7f2
TL
5684void MDCache::dump_openfiles(Formatter *f)
5685{
5686 f->open_array_section("openfiles");
5687 for (auto p = mds->mdlog->segments.begin();
5688 p != mds->mdlog->segments.end();
5689 ++p) {
5690 LogSegment *ls = p->second;
5691
5692 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5693 while (!q.end()) {
5694 CInode *in = *q;
5695 ++q;
5696 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5697 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5698 continue;
5699 f->open_object_section("file");
5700 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5701 f->close_section();
5702 }
5703 }
5704 f->close_section();
5705}
7c673cae
FG
5706
5707Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5708{
5709 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5710 << " on " << *in << dendl;
5711 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5712 if (!session) {
5713 dout(10) << " no session for client." << client << dendl;
5714 return NULL;
5715 }
5716
5717 Capability *cap = in->reconnect_cap(client, icr, session);
5718
5719 if (frommds >= 0) {
5720 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5721 cap->inc_mseq();
5722 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5723 }
5724
5725 return cap;
5726}
5727
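/*
 * drop any cap imports that still could not be attached to an inode: send the
 * affected clients a stale CEPH_CAP_OP_EXPORT, wake up cap reconnect waiters,
 * and log the missing inodes as a cluster warning.
 */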
5728void MDCache::export_remaining_imported_caps()
5729{
5730 dout(10) << "export_remaining_imported_caps" << dendl;
5731
5732 stringstream warn_str;
5733
81eedcae 5734 int count = 0;
7c673cae
FG
5735 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5736 warn_str << " ino " << p->first << "\n";
5737 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5738 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5739 if (session) {
5740 // mark client caps stale.
11fdf7f2 5741 auto stale = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
7c673cae
FG
5742 stale->set_cap_peer(0, 0, 0, -1, 0);
5743 mds->send_message_client_counted(stale, q->first);
5744 }
5745 }
5746
81eedcae
TL
5747 if (!(++count % 1000))
5748 mds->heartbeat_reset();
7c673cae
FG
5749 }
5750
11fdf7f2 5751 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
7c673cae
FG
5752 p != cap_reconnect_waiters.end();
5753 ++p)
5754 mds->queue_waiters(p->second);
5755
5756 cap_imports.clear();
5757 cap_reconnect_waiters.clear();
5758
5759 if (warn_str.peek() != EOF) {
5760 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5761 mds->clog->warn(warn_str);
5762 }
5763}
5764
a8e16298 5765Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
7c673cae
FG
5766{
5767 client_t client = session->info.get_client();
a8e16298 5768 Capability *cap = nullptr;
7c673cae
FG
5769 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5770 if (rc) {
a8e16298 5771 cap = in->reconnect_cap(client, *rc, session);
7c673cae
FG
5772 dout(10) << "try_reconnect_cap client." << client
5773 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5774 << " issue " << ccap_string(rc->capinfo.issued)
5775 << " on " << *in << dendl;
5776 remove_replay_cap_reconnect(in->ino(), client);
5777
5778 if (in->is_replicated()) {
5779 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5780 } else {
5781 int dirty_caps = 0;
5782 auto p = reconnected_caps.find(in->ino());
5783 if (p != reconnected_caps.end()) {
5784 auto q = p->second.find(client);
5785 if (q != p->second.end())
5786 dirty_caps = q->second.dirty_caps;
5787 }
5788 in->choose_lock_states(dirty_caps);
5789 dout(15) << " chose lock states on " << *in << dendl;
5790 }
5791
11fdf7f2 5792 map<inodeno_t, MDSContext::vec >::iterator it =
7c673cae
FG
5793 cap_reconnect_waiters.find(in->ino());
5794 if (it != cap_reconnect_waiters.end()) {
5795 mds->queue_waiters(it->second);
5796 cap_reconnect_waiters.erase(it);
5797 }
5798 }
a8e16298 5799 return cap;
7c673cae
FG
5800}
5801
5802
5803
5804// -------
5805// cap imports and delayed snap parent opens
5806
5807void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5808 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5809 int peer, int p_flags)
5810{
7c673cae
FG
5811 SnapRealm *realm = in->find_snaprealm();
5812 if (realm->have_past_parents_open()) {
5813 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5814 if (cap->get_last_seq() == 0) // reconnected cap
5815 cap->inc_last_seq();
5816 cap->set_last_issue();
5817 cap->set_last_issue_stamp(ceph_clock_now());
5818 cap->clear_new();
11fdf7f2 5819 auto reap = MClientCaps::create(CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
7c673cae 5820 in->encode_cap_message(reap, cap);
11fdf7f2 5821 reap->snapbl = realm->get_snap_trace();
7c673cae
FG
5822 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5823 mds->send_message_client_counted(reap, session);
5824 } else {
11fdf7f2 5825 ceph_abort();
7c673cae
FG
5826 }
5827}
5828
5829void MDCache::do_delayed_cap_imports()
5830{
5831 dout(10) << "do_delayed_cap_imports" << dendl;
5832
11fdf7f2 5833 ceph_assert(delayed_imported_caps.empty());
7c673cae
FG
5834}
5835
11fdf7f2
TL
5836struct C_MDC_OpenSnapRealms : public MDCacheContext {
5837 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
7c673cae 5838 void finish(int r) override {
11fdf7f2 5839 mdcache->open_snaprealms();
7c673cae
FG
5840 }
5841};
5842
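/*
 * open past snap parents for every inode queued in rejoin_pending_snaprealms.
 * realms whose parents are already open get their client snaprealm reconnects
 * finished and any realm splits sent; the rest gather on open_parents() and
 * this function is retried when those opens complete.
 */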
11fdf7f2 5843void MDCache::open_snaprealms()
7c673cae 5844{
11fdf7f2 5845 dout(10) << "open_snaprealms" << dendl;
7c673cae 5846
7c673cae
FG
5847 MDSGatherBuilder gather(g_ceph_context);
5848
11fdf7f2
TL
5849 auto it = rejoin_pending_snaprealms.begin();
5850 while (it != rejoin_pending_snaprealms.end()) {
5851 CInode *in = *it;
5852 SnapRealm *realm = in->snaprealm;
5853 ceph_assert(realm);
5854 if (realm->have_past_parents_open() ||
5855 realm->open_parents(gather.new_sub())) {
7c673cae
FG
5856 dout(10) << " past parents now open on " << *in << dendl;
5857
11fdf7f2
TL
5858 map<client_t,MClientSnap::ref> splits;
5859 // finish off client snaprealm reconnects?
5860 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5861 if (q != reconnected_snaprealms.end()) {
5862 for (const auto& r : q->second)
5863 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5864 reconnected_snaprealms.erase(q);
5865 }
5866
5867 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5868 !p.end(); ++p) {
5869 CInode *child = *p;
7c673cae 5870 auto q = reconnected_caps.find(child->ino());
11fdf7f2 5871 ceph_assert(q != reconnected_caps.end());
7c673cae 5872 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
11fdf7f2
TL
5873 Capability *cap = child->get_client_cap(r->first);
5874 if (!cap)
5875 continue;
5876 if (r->second.snap_follows > 0) {
5877 if (r->second.snap_follows < child->first - 1) {
5878 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5879 } else if (r->second.snapflush) {
 5880	    // When processing a cap flush message that is re-sent, it's possible
 5881	    // that the sender has already released all WR caps. So we should
 5882	    // force MDCache::cow_inode() to set up CInode::client_need_snapflush.
5883 cap->mark_needsnapflush();
5884 }
7c673cae
FG
5885 }
5886 // make sure client's cap is in the correct snaprealm.
5887 if (r->second.realm_ino != in->ino()) {
11fdf7f2 5888 prepare_realm_split(realm, r->first, child->ino(), splits);
7c673cae
FG
5889 }
5890 }
5891 }
5892
11fdf7f2 5893 rejoin_pending_snaprealms.erase(it++);
7c673cae
FG
5894 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5895
11fdf7f2 5896 send_snaps(splits);
7c673cae
FG
5897 } else {
5898 dout(10) << " opening past parents on " << *in << dendl;
11fdf7f2 5899 ++it;
7c673cae
FG
5900 }
5901 }
5902
7c673cae 5903 if (gather.has_subs()) {
11fdf7f2
TL
5904 if (gather.num_subs_remaining() == 0) {
5905 // cleanup gather
5906 gather.set_finisher(new C_MDSInternalNoop);
5907 gather.activate();
5908 } else {
5909 // for multimds, must succeed the first time
5910 ceph_assert(recovery_set.empty());
5911
5912 dout(10) << "open_snaprealms - waiting for "
5913 << gather.num_subs_remaining() << dendl;
5914 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5915 gather.activate();
5916 return;
5917 }
5918 }
5919
5920 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5921
5922 if (!reconnected_snaprealms.empty()) {
5923 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5924 for (auto& p : reconnected_snaprealms) {
7c673cae 5925 stringstream warn_str;
11fdf7f2
TL
5926 warn_str << " " << p.first << " {";
5927 bool first = true;
5928 for (auto& q : p.second) {
5929 if (!first)
5930 warn_str << ", ";
5931 warn_str << "client." << q.first << "/" << q.second;
7c673cae 5932 }
11fdf7f2
TL
5933 warn_str << "}";
5934 dout(5) << warn_str.str() << dendl;
7c673cae 5935 }
7c673cae 5936 }
11fdf7f2
TL
5937 ceph_assert(rejoin_waiters.empty());
5938 ceph_assert(rejoin_pending_snaprealms.empty());
5939 dout(10) << "open_snaprealms - all open" << dendl;
5940 do_delayed_cap_imports();
5941
5942 ceph_assert(rejoin_done);
5943 rejoin_done.release()->complete(0);
5944 reconnected_caps.clear();
7c673cae
FG
5945}
5946
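/*
 * fetch the dirfrags flagged REJOINUNDEF, plus the parent dirfrags of any
 * undefined inodes. returns true if fetches were submitted; the gather
 * finisher re-drives rejoin_gather_finish() once they complete.
 */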
5947bool MDCache::open_undef_inodes_dirfrags()
5948{
5949 dout(10) << "open_undef_inodes_dirfrags "
5950 << rejoin_undef_inodes.size() << " inodes "
5951 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5952
5953 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5954
5955 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5956 p != rejoin_undef_inodes.end();
5957 ++p) {
5958 CInode *in = *p;
11fdf7f2 5959 ceph_assert(!in->is_base());
7c673cae
FG
5960 fetch_queue.insert(in->get_parent_dir());
5961 }
5962
5963 if (fetch_queue.empty())
5964 return false;
5965
28e407b8
AA
5966 MDSGatherBuilder gather(g_ceph_context,
5967 new MDSInternalContextWrapper(mds,
5968 new FunctionContext([this](int r) {
5969 if (rejoin_gather.empty())
5970 rejoin_gather_finish();
5971 })
5972 )
5973 );
5974
7c673cae
FG
5975 for (set<CDir*>::iterator p = fetch_queue.begin();
5976 p != fetch_queue.end();
5977 ++p) {
5978 CDir *dir = *p;
5979 CInode *diri = dir->get_inode();
5980 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5981 continue;
5982 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 5983 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
7c673cae
FG
5984 dir->fetch(gather.new_sub());
5985 }
11fdf7f2 5986 ceph_assert(gather.has_subs());
7c673cae
FG
5987 gather.activate();
5988 return true;
5989}
5990
5991void MDCache::opened_undef_inode(CInode *in) {
5992 dout(10) << "opened_undef_inode " << *in << dendl;
5993 rejoin_undef_inodes.erase(in);
5994 if (in->is_dir()) {
5995 // FIXME: re-hash dentries if necessary
11fdf7f2 5996 ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
7c673cae
FG
5997 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5998 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 5999 ceph_assert(dir);
7c673cae
FG
6000 rejoin_undef_dirfrags.erase(dir);
6001 in->force_dirfrags();
6002 list<CDir*> ls;
6003 in->get_dirfrags(ls);
6004 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
6005 rejoin_undef_dirfrags.insert(*p);
6006 }
6007 }
6008}
6009
11fdf7f2
TL
6010void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6011 map<client_t,MClientSnap::ref>& updates)
7c673cae
FG
6012{
6013 if (seq < realm->get_newest_seq()) {
6014 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
11fdf7f2
TL
6015 << realm->get_newest_seq() << " on " << *realm << dendl;
6016 auto snap = MClientSnap::create(CEPH_SNAP_OP_UPDATE);
6017 snap->bl = realm->get_snap_trace();
6018 for (const auto& child : realm->open_children)
6019 snap->split_realms.push_back(child->inode->ino());
6020 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae
FG
6021 } else {
6022 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6023 << " on " << *realm << dendl;
6024 }
6025}
6026
6027
6028
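/*
 * send an OP_ACK cache rejoin to every rank in the recovery set: first
 * replicate the stray parents of unlinked inodes the peers still hold, then
 * walk our auth subtrees adding strong dirfrags, dentries, inode bases and
 * lock states, along with the caps we imported from each peer.
 */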
6029void MDCache::rejoin_send_acks()
6030{
6031 dout(7) << "rejoin_send_acks" << dendl;
6032
6033 // replicate stray
6034 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
6035 p != rejoin_unlinked_inodes.end();
6036 ++p) {
6037 for (set<CInode*>::iterator q = p->second.begin();
6038 q != p->second.end();
6039 ++q) {
6040 CInode *in = *q;
6041 dout(7) << " unlinked inode " << *in << dendl;
6042 // inode expired
6043 if (!in->is_replica(p->first))
6044 continue;
6045 while (1) {
6046 CDentry *dn = in->get_parent_dn();
6047 if (dn->is_replica(p->first))
6048 break;
6049 dn->add_replica(p->first);
6050 CDir *dir = dn->get_dir();
6051 if (dir->is_replica(p->first))
6052 break;
6053 dir->add_replica(p->first);
6054 in = dir->get_inode();
6055 if (in->is_replica(p->first))
6056 break;
224ce89b 6057 in->add_replica(p->first);
7c673cae
FG
6058 if (in->is_base())
6059 break;
6060 }
6061 }
6062 }
6063 rejoin_unlinked_inodes.clear();
6064
6065 // send acks to everyone in the recovery set
11fdf7f2 6066 map<mds_rank_t,MMDSCacheRejoin::ref> acks;
7c673cae
FG
6067 for (set<mds_rank_t>::iterator p = recovery_set.begin();
6068 p != recovery_set.end();
31f18b77
FG
6069 ++p) {
6070 if (rejoin_ack_sent.count(*p))
6071 continue;
11fdf7f2 6072 acks[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
31f18b77
FG
6073 }
6074
6075 rejoin_ack_sent = recovery_set;
7c673cae
FG
6076
6077 // walk subtrees
6078 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
6079 p != subtrees.end();
6080 ++p) {
6081 CDir *dir = p->first;
6082 if (!dir->is_auth())
6083 continue;
6084 dout(10) << "subtree " << *dir << dendl;
6085
6086 // auth items in this subtree
6087 list<CDir*> dq;
6088 dq.push_back(dir);
6089
6090 while (!dq.empty()) {
6091 CDir *dir = dq.front();
6092 dq.pop_front();
6093
6094 // dir
181888fb
FG
6095 for (auto &r : dir->get_replicas()) {
6096 auto it = acks.find(r.first);
31f18b77
FG
6097 if (it == acks.end())
6098 continue;
181888fb 6099 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 6100 it->second->add_dirfrag_base(dir);
7c673cae
FG
6101 }
6102
94b18763
FG
6103 for (auto &p : dir->items) {
6104 CDentry *dn = p.second;
7c673cae
FG
6105 CDentry::linkage_t *dnl = dn->get_linkage();
6106
6107 // inode
6108 CInode *in = NULL;
6109 if (dnl->is_primary())
6110 in = dnl->get_inode();
6111
6112 // dentry
181888fb
FG
6113 for (auto &r : dn->get_replicas()) {
6114 auto it = acks.find(r.first);
31f18b77
FG
6115 if (it == acks.end())
6116 continue;
94b18763 6117 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6118 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6119 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6120 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6121 ++r.second,
7c673cae
FG
6122 dn->lock.get_replica_state());
 6123	// peer missed MDentryLink message ?
181888fb
FG
6124 if (in && !in->is_replica(r.first))
6125 in->add_replica(r.first);
7c673cae
FG
6126 }
6127
6128 if (!in)
6129 continue;
6130
181888fb
FG
6131 for (auto &r : in->get_replicas()) {
6132 auto it = acks.find(r.first);
31f18b77
FG
6133 if (it == acks.end())
6134 continue;
6135 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6136 bufferlist bl;
181888fb
FG
6137 in->_encode_locks_state_for_rejoin(bl, r.first);
6138 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6139 }
6140
6141 // subdirs in this subtree?
6142 in->get_nested_dirfrags(dq);
6143 }
6144 }
6145 }
6146
6147 // base inodes too
6148 if (root && root->is_auth())
181888fb
FG
6149 for (auto &r : root->get_replicas()) {
6150 auto it = acks.find(r.first);
31f18b77
FG
6151 if (it == acks.end())
6152 continue;
6153 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6154 bufferlist bl;
181888fb
FG
6155 root->_encode_locks_state_for_rejoin(bl, r.first);
6156 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6157 }
6158 if (myin)
181888fb
FG
6159 for (auto &r : myin->get_replicas()) {
6160 auto it = acks.find(r.first);
31f18b77
FG
6161 if (it == acks.end())
6162 continue;
6163 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6164 bufferlist bl;
181888fb
FG
6165 myin->_encode_locks_state_for_rejoin(bl, r.first);
6166 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6167 }
6168
6169 // include inode base for any inodes whose scatterlocks may have updated
6170 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6171 p != rejoin_potential_updated_scatterlocks.end();
6172 ++p) {
6173 CInode *in = *p;
181888fb
FG
6174 for (const auto &r : in->get_replicas()) {
6175 auto it = acks.find(r.first);
31f18b77
FG
6176 if (it == acks.end())
6177 continue;
6178 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6179 }
7c673cae
FG
6180 }
6181
6182 // send acks
31f18b77 6183 for (auto p = acks.begin(); p != acks.end(); ++p) {
11fdf7f2 6184 encode(rejoin_imported_caps[p->first], p->second->imported_caps);
7c673cae
FG
6185 mds->send_message_mds(p->second, p->first);
6186 }
6187
6188 rejoin_imported_caps.clear();
6189}
6190
c07f9fc5
FG
6191class C_MDC_ReIssueCaps : public MDCacheContext {
6192 CInode *in;
6193public:
6194 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6195 MDCacheContext(mdc), in(i)
6196 {
6197 in->get(CInode::PIN_PTRWAITER);
6198 }
6199 void finish(int r) override {
6200 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6201 mdcache->mds->locker->issue_caps(in);
6202 in->put(CInode::PIN_PTRWAITER);
6203 }
6204};
7c673cae
FG
6205
6206void MDCache::reissue_all_caps()
6207{
6208 dout(10) << "reissue_all_caps" << dendl;
6209
81eedcae 6210 int count = 0;
94b18763 6211 for (auto &p : inode_map) {
81eedcae 6212 int n = 1;
b32b8144 6213 CInode *in = p.second;
7c673cae 6214 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6215 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6216 if (in->is_frozen_inode()) {
6217 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6218 continue;
6219 }
7c673cae 6220 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
81eedcae 6221 n += mds->locker->issue_caps(in);
7c673cae 6222 }
81eedcae
TL
6223
6224 if ((count % 1000) + n >= 1000)
6225 mds->heartbeat_reset();
6226 count += n;
7c673cae
FG
6227 }
6228}
6229
6230
6231// ===============================================================================
6232
6233struct C_MDC_QueuedCow : public MDCacheContext {
6234 CInode *in;
6235 MutationRef mut;
6236 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6237 MDCacheContext(mdc), in(i), mut(m) {}
6238 void finish(int r) override {
6239 mdcache->_queued_file_recover_cow(in, mut);
6240 }
6241};
6242
6243
6244void MDCache::queue_file_recover(CInode *in)
6245{
6246 dout(10) << "queue_file_recover " << *in << dendl;
11fdf7f2 6247 ceph_assert(in->is_auth());
7c673cae
FG
6248
6249 // cow?
6250 /*
6251 SnapRealm *realm = in->find_snaprealm();
6252 set<snapid_t> s = realm->get_snaps();
6253 while (!s.empty() && *s.begin() < in->first)
6254 s.erase(s.begin());
6255 while (!s.empty() && *s.rbegin() > in->last)
6256 s.erase(*s.rbegin());
6257 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6258 if (s.size() > 1) {
94b18763 6259 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6260 pi->version = in->pre_dirty();
6261
6262 auto mut(std::make_shared<MutationImpl>());
6263 mut->ls = mds->mdlog->get_current_segment();
6264 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6265 mds->mdlog->start_entry(le);
6266 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6267
6268 s.erase(*s.begin());
6269 while (!s.empty()) {
6270 snapid_t snapid = *s.begin();
6271 CInode *cow_inode = 0;
6272 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
11fdf7f2 6273 ceph_assert(cow_inode);
7c673cae
FG
6274 recovery_queue.enqueue(cow_inode);
6275 s.erase(*s.begin());
6276 }
6277
6278 in->parent->first = in->first;
6279 le->metablob.add_primary_dentry(in->parent, in, true);
6280 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6281 mds->mdlog->flush();
6282 }
6283 */
6284
6285 recovery_queue.enqueue(in);
6286}
6287
6288void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6289{
6290 in->pop_and_dirty_projected_inode(mut->ls);
6291 mut->apply();
6292 mds->locker->drop_locks(mut.get());
6293 mut->cleanup();
6294}
6295
6296
6297/*
6298 * called after recovery to recover file sizes for previously opened (for write)
6299 * files. that is, those where max_size > size.
6300 */
6301void MDCache::identify_files_to_recover()
6302{
6303 dout(10) << "identify_files_to_recover" << dendl;
81eedcae 6304 int count = 0;
94b18763 6305 for (auto &p : inode_map) {
b32b8144 6306 CInode *in = p.second;
7c673cae
FG
6307 if (!in->is_auth())
6308 continue;
6309
6310 if (in->last != CEPH_NOSNAP)
6311 continue;
6312
6313 // Only normal files need file size recovery
6314 if (!in->is_file()) {
6315 continue;
6316 }
6317
6318 bool recover = false;
6319 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6320 p != in->inode.client_ranges.end();
6321 ++p) {
6322 Capability *cap = in->get_client_cap(p->first);
a8e16298
TL
6323 if (cap) {
6324 cap->mark_clientwriteable();
6325 } else {
7c673cae
FG
6326 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6327 recover = true;
6328 break;
6329 }
6330 }
6331
6332 if (recover) {
6333 if (in->filelock.is_stable()) {
6334 in->auth_pin(&in->filelock);
6335 } else {
11fdf7f2 6336 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
7c673cae
FG
6337 }
6338 in->filelock.set_state(LOCK_PRE_SCAN);
6339 rejoin_recover_q.push_back(in);
6340 } else {
6341 rejoin_check_q.push_back(in);
6342 }
81eedcae
TL
6343
6344 if (!(++count % 1000))
6345 mds->heartbeat_reset();
7c673cae
FG
6346 }
6347}
6348
6349void MDCache::start_files_to_recover()
6350{
6351 for (CInode *in : rejoin_check_q) {
6352 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6353 mds->locker->issue_caps(in);
6354 mds->locker->check_inode_max_size(in);
6355 }
6356 rejoin_check_q.clear();
6357 for (CInode *in : rejoin_recover_q) {
6358 mds->locker->file_recover(&in->filelock);
6359 }
6360 if (!rejoin_recover_q.empty()) {
6361 rejoin_recover_q.clear();
6362 do_file_recover();
6363 }
6364}
6365
6366void MDCache::do_file_recover()
6367{
6368 recovery_queue.advance();
6369}
6370
6371// ===============================================================================
6372
6373
6374// ----------------------------
6375// truncate
6376
6377class C_MDC_RetryTruncate : public MDCacheContext {
6378 CInode *in;
6379 LogSegment *ls;
6380public:
6381 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6382 MDCacheContext(c), in(i), ls(l) {}
6383 void finish(int r) override {
6384 mdcache->_truncate_inode(in, ls);
6385 }
6386};
6387
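/*
 * begin truncating an inode's objects: record it in the log segment's
 * truncating list and pin it, then either defer until pending snapflushes
 * drain (clients still hold buffered data) or issue the object truncate
 * right away via _truncate_inode().
 */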
6388void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6389{
94b18763 6390 auto pi = in->get_projected_inode();
7c673cae
FG
6391 dout(10) << "truncate_inode "
6392 << pi->truncate_from << " -> " << pi->truncate_size
6393 << " on " << *in
6394 << dendl;
6395
6396 ls->truncating_inodes.insert(in);
6397 in->get(CInode::PIN_TRUNCATING);
6398 in->auth_pin(this);
6399
6400 if (!in->client_need_snapflush.empty() &&
6401 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6402 ceph_assert(in->filelock.is_xlocked());
7c673cae
FG
6403 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6404 mds->locker->issue_caps(in);
6405 return;
6406 }
6407
6408 _truncate_inode(in, ls);
6409}
6410
6411struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6412 CInode *in;
6413 LogSegment *ls;
6414 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
91327a77
AA
6415 MDCacheIOContext(c, false), in(i), ls(l) {
6416 }
7c673cae 6417 void finish(int r) override {
11fdf7f2 6418 ceph_assert(r == 0 || r == -ENOENT);
7c673cae
FG
6419 mdcache->truncate_inode_finish(in, ls);
6420 }
91327a77
AA
6421 void print(ostream& out) const override {
6422 out << "file_truncate(" << in->ino() << ")";
6423 }
7c673cae
FG
6424};
6425
6426void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6427{
94b18763 6428 auto pi = &in->inode;
7c673cae
FG
6429 dout(10) << "_truncate_inode "
6430 << pi->truncate_from << " -> " << pi->truncate_size
6431 << " on " << *in << dendl;
6432
11fdf7f2
TL
6433 ceph_assert(pi->is_truncating());
6434 ceph_assert(pi->truncate_size < (1ULL << 63));
6435 ceph_assert(pi->truncate_from < (1ULL << 63));
6436 ceph_assert(pi->truncate_size < pi->truncate_from);
7c673cae
FG
6437
6438
6439 SnapRealm *realm = in->find_snaprealm();
6440 SnapContext nullsnap;
6441 const SnapContext *snapc;
6442 if (realm) {
6443 dout(10) << " realm " << *realm << dendl;
6444 snapc = &realm->get_snap_context();
6445 } else {
6446 dout(10) << " NO realm, using null context" << dendl;
6447 snapc = &nullsnap;
11fdf7f2 6448 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae
FG
6449 }
6450 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6451 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6452 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6453 pi->truncate_seq, ceph::real_time::min(), 0,
6454 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6455 mds->finisher));
6456}
6457
6458struct C_MDC_TruncateLogged : public MDCacheLogContext {
6459 CInode *in;
6460 MutationRef mut;
6461 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6462 MDCacheLogContext(m), in(i), mut(mu) {}
6463 void finish(int r) override {
6464 mdcache->truncate_inode_logged(in, mut);
6465 }
6466};
6467
6468void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6469{
6470 dout(10) << "truncate_inode_finish " << *in << dendl;
6471
6472 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6473 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6474 ls->truncating_inodes.erase(p);
6475
6476 // update
94b18763
FG
6477 auto &pi = in->project_inode();
6478 pi.inode.version = in->pre_dirty();
6479 pi.inode.truncate_from = 0;
6480 pi.inode.truncate_pending--;
7c673cae
FG
6481
6482 MutationRef mut(new MutationImpl());
6483 mut->ls = mds->mdlog->get_current_segment();
6484 mut->add_projected_inode(in);
6485
6486 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6487 mds->mdlog->start_entry(le);
6488 CDentry *dn = in->get_projected_parent_dn();
6489 le->metablob.add_dir_context(dn->get_dir());
6490 le->metablob.add_primary_dentry(dn, in, true);
6491 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6492
6493 journal_dirty_inode(mut.get(), &le->metablob, in);
6494 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6495
6496 // flush immediately if there are readers/writers waiting
6497 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6498 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6499 mds->mdlog->flush();
6500}
6501
6502void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6503{
6504 dout(10) << "truncate_inode_logged " << *in << dendl;
6505 mut->apply();
6506 mds->locker->drop_locks(mut.get());
6507 mut->cleanup();
6508
6509 in->put(CInode::PIN_TRUNCATING);
6510 in->auth_unpin(this);
6511
11fdf7f2 6512 MDSContext::vec waiters;
7c673cae
FG
6513 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6514 mds->queue_waiters(waiters);
6515}
6516
6517
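// add_recovered_truncate / remove_recovered_truncate maintain the per-segment
// set of inodes whose in-progress truncation was found during journal replay;
// start_recovered_truncates() restarts those truncates for every log segment.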
6518void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6519{
6520 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6521 << ls->seq << "/" << ls->offset << dendl;
6522 ls->truncating_inodes.insert(in);
6523 in->get(CInode::PIN_TRUNCATING);
6524}
6525
6526void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6527{
6528 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6529 << ls->seq << "/" << ls->offset << dendl;
6530 // if we have the logseg the truncate started in, it must be in our list.
6531 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6532 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6533 ls->truncating_inodes.erase(p);
6534 in->put(CInode::PIN_TRUNCATING);
6535}
6536
6537void MDCache::start_recovered_truncates()
6538{
6539 dout(10) << "start_recovered_truncates" << dendl;
6540 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6541 p != mds->mdlog->segments.end();
6542 ++p) {
6543 LogSegment *ls = p->second;
6544 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6545 q != ls->truncating_inodes.end();
6546 ++q) {
6547 CInode *in = *q;
6548 in->auth_pin(this);
6549
6550 if (!in->client_need_snapflush.empty() &&
6551 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6552 ceph_assert(in->filelock.is_stable());
7c673cae
FG
6553 in->filelock.set_state(LOCK_XLOCKDONE);
6554 in->auth_pin(&in->filelock);
6555 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6556 // start_files_to_recover will revoke caps
6557 continue;
6558 }
6559 _truncate_inode(in, ls);
6560 }
6561 }
6562}
6563
6564
6565
6566
6567
6568
6569// ================================================================================
6570// cache trimming
6571
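// trim_lru: expire up to 'count' dentries, draining bottom_lru before the
// main lru. Trimming is throttled by mds_cache_trim_threshold via the
// decaying trim_counter; dentries that cannot be expired are reinserted.
// Returns (throttled, number of dentries trimmed).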
11fdf7f2 6572std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
181888fb 6573{
7c673cae 6574 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6575 std::vector<CDentry *> unexpirables;
6576 uint64_t trimmed = 0;
6577
11fdf7f2 6578 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
a8e16298 6579
181888fb
FG
6580 dout(7) << "trim_lru trimming " << count
6581 << " items from LRU"
6582 << " size=" << lru.lru_get_size()
6583 << " mid=" << lru.lru_get_top()
6584 << " pintail=" << lru.lru_get_pintail()
6585 << " pinned=" << lru.lru_get_num_pinned()
6586 << dendl;
7c673cae 6587
11fdf7f2 6588 const uint64_t trim_counter_start = trim_counter.get();
a8e16298
TL
6589 bool throttled = false;
6590 while (1) {
6591 throttled |= trim_counter_start+trimmed >= trim_threshold;
6592 if (throttled) break;
31f18b77
FG
6593 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6594 if (!dn)
6595 break;
6596 if (trim_dentry(dn, expiremap)) {
6597 unexpirables.push_back(dn);
181888fb
FG
6598 } else {
6599 trimmed++;
31f18b77
FG
6600 }
6601 }
6602
181888fb 6603 for (auto &dn : unexpirables) {
31f18b77 6604 bottom_lru.lru_insert_mid(dn);
181888fb 6605 }
31f18b77
FG
6606 unexpirables.clear();
6607
181888fb 6608 // trim dentries from the LRU until count is reached
494da23a
TL
 6609 // if the mds is in standby_replay, also trim inodes which aren't in any log segment
6610 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
a8e16298
TL
6611 throttled |= trim_counter_start+trimmed >= trim_threshold;
6612 if (throttled) break;
7c673cae
FG
6613 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6614 if (!dn) {
6615 break;
6616 }
7c673cae 6617 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6618 dn->get_linkage()->inode->item_open_file.is_on_list())) {
494da23a
TL
6619 // we move the inodes that need to be trimmed to the end of the lru queue.
6620 // refer to MDCache::standby_trim_segment
6621 lru.lru_insert_bot(dn);
6622 break;
181888fb
FG
6623 } else if (trim_dentry(dn, expiremap)) {
6624 unexpirables.push_back(dn);
6625 } else {
6626 trimmed++;
3efd9988 6627 if (count > 0) count--;
7c673cae
FG
6628 }
6629 }
11fdf7f2 6630 trim_counter.hit(trimmed);
181888fb
FG
6631
6632 for (auto &dn : unexpirables) {
31f18b77 6633 lru.lru_insert_mid(dn);
181888fb 6634 }
31f18b77 6635 unexpirables.clear();
7c673cae 6636
181888fb 6637 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
a8e16298 6638 return std::pair<bool, uint64_t>(throttled, trimmed);
181888fb
FG
6639}
6640
6641/*
6642 * note: only called while MDS is active or stopping... NOT during recovery.
6643 * however, we may expire a replica whose authority is recovering.
6644 *
6645 * @param count is number of dentries to try to expire
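 * @return a pair of (whether trimming was throttled, number of items trimmed)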
6646 */
a8e16298 6647std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
181888fb
FG
6648{
6649 uint64_t used = cache_size();
91327a77 6650 uint64_t limit = cache_memory_limit;
11fdf7f2 6651 expiremap expiremap;
181888fb
FG
6652
6653 dout(7) << "trim bytes_used=" << bytes2str(used)
6654 << " limit=" << bytes2str(limit)
91327a77 6655 << " reservation=" << cache_reservation
181888fb
FG
6656 << "% count=" << count << dendl;
6657
6658 // process delayed eval_stray()
6659 stray_manager.advance_delayed();
6660
a8e16298
TL
6661 auto result = trim_lru(count, expiremap);
6662 auto& trimmed = result.second;
181888fb 6663
7c673cae 6664 // trim non-auth, non-bound subtrees
181888fb 6665 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6666 CDir *dir = p->first;
6667 ++p;
31f18b77
FG
6668 CInode *diri = dir->get_inode();
6669 if (dir->is_auth()) {
6670 if (!diri->is_auth() && !diri->is_base() &&
6671 dir->get_num_head_items() == 0) {
6672 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6673 !(mds->is_active() || mds->is_stopping()) ||
31f18b77
FG
6674 dir->is_freezing() || dir->is_frozen())
6675 continue;
6676
6677 migrator->export_empty_import(dir);
a8e16298 6678 ++trimmed;
31f18b77
FG
6679 }
6680 } else {
6681 if (!diri->is_auth()) {
6682 if (dir->get_num_ref() > 1) // only subtree pin
6683 continue;
6684 list<CDir*> ls;
6685 diri->get_subtree_dirfrags(ls);
6686 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6687 continue;
6688
6689 // don't trim subtree root if its auth MDS is recovering.
 6690 // This simplifies the cache rejoin code.
6691 if (dir->is_subtree_root() &&
6692 rejoin_ack_gather.count(dir->get_dir_auth().first))
6693 continue;
7c673cae 6694 trim_dirfrag(dir, 0, expiremap);
a8e16298 6695 ++trimmed;
31f18b77 6696 }
7c673cae
FG
6697 }
6698 }
6699
6700 // trim root?
181888fb 6701 if (mds->is_stopping() && root) {
7c673cae
FG
6702 list<CDir*> ls;
6703 root->get_dirfrags(ls);
6704 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6705 CDir *dir = *p;
a8e16298 6706 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6707 trim_dirfrag(dir, 0, expiremap);
a8e16298
TL
6708 ++trimmed;
6709 }
7c673cae 6710 }
a8e16298 6711 if (root->get_num_ref() == 0) {
7c673cae 6712 trim_inode(0, root, 0, expiremap);
a8e16298
TL
6713 ++trimmed;
6714 }
7c673cae
FG
6715 }
6716
6717 std::set<mds_rank_t> stopping;
6718 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6719 stopping.erase(mds->get_nodeid());
6720 for (auto rank : stopping) {
6721 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6722 if (!mdsdir_in)
6723 continue;
6724
11fdf7f2
TL
6725 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6726 if (em.second) {
6727 em.first->second = MCacheExpire::create(mds->get_nodeid());
7c673cae
FG
6728 }
6729
6730 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6731
6732 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6733 if (!aborted) {
6734 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6735 list<CDir*> ls;
6736 mdsdir_in->get_dirfrags(ls);
6737 for (auto dir : ls) {
a8e16298 6738 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6739 trim_dirfrag(dir, dir, expiremap);
a8e16298
TL
6740 ++trimmed;
6741 }
7c673cae 6742 }
a8e16298 6743 if (mdsdir_in->get_num_ref() == 0) {
7c673cae 6744 trim_inode(NULL, mdsdir_in, NULL, expiremap);
a8e16298
TL
6745 ++trimmed;
6746 }
7c673cae
FG
6747 } else {
6748 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6749 }
6750 }
6751
6752 // Other rank's base inodes (when I'm stopping)
181888fb 6753 if (mds->is_stopping()) {
7c673cae 6754 for (set<CInode*>::iterator p = base_inodes.begin();
11fdf7f2
TL
6755 p != base_inodes.end();) {
6756 CInode *base_in = *p;
6757 ++p;
6758 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6759 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6760 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6761 if (base_in->get_num_ref() == 0) {
6762 trim_inode(NULL, base_in, NULL, expiremap);
a8e16298 6763 ++trimmed;
7c673cae
FG
6764 }
6765 }
6766 }
6767 }
6768
6769 // send any expire messages
6770 send_expire_messages(expiremap);
6771
a8e16298 6772 return result;
7c673cae
FG
6773}
6774
11fdf7f2 6775void MDCache::send_expire_messages(expiremap& expiremap)
7c673cae
FG
6776{
6777 // send expires
11fdf7f2 6778 for (const auto &p : expiremap) {
7c673cae 6779 if (mds->is_cluster_degraded() &&
11fdf7f2
TL
6780 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6781 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6782 rejoin_sent.count(p.first) == 0))) {
7c673cae
FG
6783 continue;
6784 }
11fdf7f2
TL
6785 dout(7) << "sending cache_expire to " << p.first << dendl;
6786 mds->send_message_mds(p.second, p.first);
7c673cae 6787 }
11fdf7f2 6788 expiremap.clear();
7c673cae
FG
6789}
6790
6791
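// trim_dentry: attempt to expire a single dentry (and, for a primary link,
// its inode via trim_inode). Returns true if the dentry must be kept (e.g. an
// unreadable replica, or a stray that went to purge instead), false once it
// has been removed and the necessary expire messages queued in 'expiremap'.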
11fdf7f2 6792bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
7c673cae
FG
6793{
6794 dout(12) << "trim_dentry " << *dn << dendl;
6795
6796 CDentry::linkage_t *dnl = dn->get_linkage();
6797
6798 CDir *dir = dn->get_dir();
11fdf7f2 6799 ceph_assert(dir);
7c673cae
FG
6800
6801 CDir *con = get_subtree_root(dir);
6802 if (con)
6803 dout(12) << " in container " << *con << dendl;
6804 else {
6805 dout(12) << " no container; under a not-yet-linked dir" << dendl;
11fdf7f2 6806 ceph_assert(dn->is_auth());
7c673cae
FG
6807 }
6808
 6809 // If a replica dentry is not readable, it's likely we will receive an
 6810 // MDentryLink/MDentryUnlink message soon (it's possible we first
 6811 // receive an MDentryUnlink message, then an MDentryLink message).
 6812 // An MDentryLink message only replicates an inode, so we should
 6813 // avoid trimming the inode's parent dentry, because unconnected
 6814 // replicas are problematic for subtree migration.
6815 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6816 !dn->get_dir()->get_inode()->is_stray())
6817 return true;
6818
6819 // adjust the dir state
 6820 // NOTE: we can safely remove a clean, null dentry without affecting
6821 // directory completeness.
6822 // (check this _before_ we unlink the inode, below!)
6823 bool clear_complete = false;
6824 if (!(dnl->is_null() && dn->is_clean()))
6825 clear_complete = true;
6826
6827 // unlink the dentry
6828 if (dnl->is_remote()) {
6829 // just unlink.
31f18b77 6830 dir->unlink_inode(dn, false);
7c673cae
FG
6831 } else if (dnl->is_primary()) {
6832 // expire the inode, too.
6833 CInode *in = dnl->get_inode();
11fdf7f2 6834 ceph_assert(in);
7c673cae
FG
6835 if (trim_inode(dn, in, con, expiremap))
6836 return true; // purging stray instead of trimming
6837 } else {
11fdf7f2 6838 ceph_assert(dnl->is_null());
7c673cae
FG
6839 }
6840
6841 if (!dn->is_auth()) {
6842 // notify dentry authority.
6843 mds_authority_t auth = dn->authority();
6844
6845 for (int p=0; p<2; p++) {
6846 mds_rank_t a = auth.first;
6847 if (p) a = auth.second;
6848 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6849 if (mds->get_nodeid() == auth.second &&
6850 con->is_importing()) break; // don't send any expire while importing.
6851 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6852
6853 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
11fdf7f2
TL
6854 ceph_assert(a != mds->get_nodeid());
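      // get-or-create the expire message for mds.a: emplace() only constructs
      // a new entry when this rank has no MCacheExpire queued yet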
6855 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6856 if (em.second)
6857 em.first->second = MCacheExpire::create(mds->get_nodeid());
6858 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6859 }
6860 }
6861
6862 // remove dentry
6863 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6864 dir->add_to_bloom(dn);
6865 dir->remove_dentry(dn);
6866
6867 if (clear_complete)
6868 dir->state_clear(CDir::STATE_COMPLETE);
6869
7c673cae
FG
6870 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6871 return false;
6872}
6873
6874
11fdf7f2 6875void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
7c673cae
FG
6876{
6877 dout(15) << "trim_dirfrag " << *dir << dendl;
6878
6879 if (dir->is_subtree_root()) {
11fdf7f2 6880 ceph_assert(!dir->is_auth() ||
7c673cae
FG
6881 (!dir->is_replicated() && dir->inode->is_base()));
6882 remove_subtree(dir); // remove from subtree map
6883 }
11fdf7f2 6884 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
6885
6886 CInode *in = dir->get_inode();
6887
6888 if (!dir->is_auth()) {
6889 mds_authority_t auth = dir->authority();
6890
6891 // was this an auth delegation? (if so, slightly modified container)
6892 dirfrag_t condf;
6893 if (dir->is_subtree_root()) {
6894 dout(12) << " subtree root, container is " << *dir << dendl;
6895 con = dir;
6896 condf = dir->dirfrag();
6897 } else {
6898 condf = con->dirfrag();
6899 }
6900
6901 for (int p=0; p<2; p++) {
6902 mds_rank_t a = auth.first;
6903 if (p) a = auth.second;
6904 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6905 if (mds->get_nodeid() == auth.second &&
6906 con->is_importing()) break; // don't send any expire while importing.
6907 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6908
6909 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
11fdf7f2
TL
6910 ceph_assert(a != mds->get_nodeid());
6911 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6912 if (em.second)
6913 em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
6914 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
7c673cae
FG
6915 }
6916 }
6917
6918 in->close_dirfrag(dir->dirfrag().frag);
6919}
6920
6921/**
6922 * Try trimming an inode from the cache
6923 *
6924 * @return true if the inode is still in cache, else false if it was trimmed
6925 */
11fdf7f2 6926bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
7c673cae
FG
6927{
6928 dout(15) << "trim_inode " << *in << dendl;
11fdf7f2 6929 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
6930
6931 if (in->is_dir()) {
 6932 // If a replica inode's dirfragtreelock is not readable, it's likely
 6933 // some dirfrags of the inode are being fragmented and we will receive
 6934 // an MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
 6935 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
 6936 // This is because unconnected replicas are problematic for
 6937 // subtree migration.
6938 //
28e407b8 6939 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
7c673cae 6940 return true;
28e407b8 6941 }
7c673cae
FG
6942
6943 // DIR
6944 list<CDir*> dfls;
6945 in->get_dirfrags(dfls);
6946 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6947 CDir *dir = *p;
11fdf7f2 6948 ceph_assert(!dir->is_subtree_root());
7c673cae
FG
6949 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6950 }
6951 }
6952
6953 // INODE
6954 if (in->is_auth()) {
6955 // eval stray after closing dirfrags
6956 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6957 maybe_eval_stray(in);
6958 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6959 return true;
6960 }
6961 } else {
6962 mds_authority_t auth = in->authority();
6963
6964 dirfrag_t df;
6965 if (con)
6966 df = con->dirfrag();
6967 else
6968 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6969
6970 for (int p=0; p<2; p++) {
6971 mds_rank_t a = auth.first;
6972 if (p) a = auth.second;
6973 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6974 if (con && mds->get_nodeid() == auth.second &&
6975 con->is_importing()) break; // don't send any expire while importing.
6976 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6977
6978 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
11fdf7f2
TL
6979 ceph_assert(a != mds->get_nodeid());
6980 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6981 if (em.second)
6982 em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
6983 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7c673cae
FG
6984 }
6985 }
6986
6987 /*
6988 if (in->is_auth()) {
6989 if (in->hack_accessed)
6990 mds->logger->inc("outt");
6991 else {
6992 mds->logger->inc("outut");
6993 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6994 }
6995 }
6996 */
6997
6998 // unlink
6999 if (dn)
31f18b77 7000 dn->get_dir()->unlink_inode(dn, false);
7c673cae
FG
7001 remove_inode(in);
7002 return false;
7003}
7004
7005
7006/**
7007 * trim_non_auth - remove any non-auth items from our cache
7008 *
7009 * this reduces the amount of non-auth metadata in our cache, reducing the
7010 * load incurred by the rejoin phase.
7011 *
7012 * the only non-auth items that remain are those that are needed to
7013 * attach our own subtrees to the root.
7014 *
7015 * when we are done, all dentries will be in the top bit of the lru.
7016 *
7017 * why we have to do this:
 7018 * we may not have accurate linkage for non-auth items, which means we may
 7019 * not know which subtree an item falls into and cannot be sure to declare
 7020 * it to the correct authority.
7021 */
7022void MDCache::trim_non_auth()
7023{
7024 dout(7) << "trim_non_auth" << dendl;
7025
7026 // temporarily pin all subtree roots
7027 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7028 p != subtrees.end();
7029 ++p)
7030 p->first->get(CDir::PIN_SUBTREETEMP);
7031
31f18b77 7032 list<CDentry*> auth_list;
7c673cae
FG
7033
7034 // trim non-auth items from the lru
31f18b77
FG
7035 for (;;) {
7036 CDentry *dn = NULL;
7037 if (bottom_lru.lru_get_size() > 0)
7038 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7039 if (!dn && lru.lru_get_size() > 0)
7040 dn = static_cast<CDentry*>(lru.lru_expire());
7041 if (!dn)
7042 break;
7043
7c673cae
FG
7044 CDentry::linkage_t *dnl = dn->get_linkage();
7045
7046 if (dn->is_auth()) {
7047 // add back into lru (at the top)
31f18b77 7048 auth_list.push_back(dn);
7c673cae
FG
7049
7050 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7051 dn->unlink_remote(dnl);
7c673cae
FG
7052 } else {
7053 // non-auth. expire.
7054 CDir *dir = dn->get_dir();
11fdf7f2 7055 ceph_assert(dir);
7c673cae
FG
7056
7057 // unlink the dentry
7058 dout(10) << " removing " << *dn << dendl;
7059 if (dnl->is_remote()) {
31f18b77 7060 dir->unlink_inode(dn, false);
7c673cae
FG
7061 }
7062 else if (dnl->is_primary()) {
7063 CInode *in = dnl->get_inode();
7064 dout(10) << " removing " << *in << dendl;
7065 list<CDir*> ls;
7066 in->get_dirfrags(ls);
7067 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7068 CDir *subdir = *p;
11fdf7f2 7069 ceph_assert(!subdir->is_subtree_root());
7c673cae
FG
7070 in->close_dirfrag(subdir->dirfrag().frag);
7071 }
31f18b77 7072 dir->unlink_inode(dn, false);
7c673cae
FG
7073 remove_inode(in);
7074 }
7075 else {
11fdf7f2 7076 ceph_assert(dnl->is_null());
7c673cae
FG
7077 }
7078
11fdf7f2 7079 ceph_assert(!dir->has_bloom());
7c673cae
FG
7080 dir->remove_dentry(dn);
7081 // adjust the dir state
7082 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7083 // close empty non-auth dirfrag
7084 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7085 dir->inode->close_dirfrag(dir->get_frag());
7086 }
7087 }
7088
31f18b77
FG
7089 for (auto dn : auth_list) {
7090 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7091 bottom_lru.lru_insert_mid(dn);
7092 else
7093 lru.lru_insert_top(dn);
7094 }
7095
7c673cae
FG
7096 // move everything in the pintail to the top bit of the lru.
7097 lru.lru_touch_entire_pintail();
7098
7099 // unpin all subtrees
7100 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7101 p != subtrees.end();
7102 ++p)
7103 p->first->put(CDir::PIN_SUBTREETEMP);
7104
31f18b77
FG
7105 if (lru.lru_get_size() == 0 &&
7106 bottom_lru.lru_get_size() == 0) {
7c673cae 7107 // root, stray, etc.?
b32b8144 7108 auto p = inode_map.begin();
7c673cae 7109 while (p != inode_map.end()) {
7c673cae 7110 CInode *in = p->second;
b32b8144 7111 ++p;
7c673cae
FG
7112 if (!in->is_auth()) {
7113 list<CDir*> ls;
7114 in->get_dirfrags(ls);
7115 for (list<CDir*>::iterator p = ls.begin();
7116 p != ls.end();
7117 ++p) {
7118 dout(10) << " removing " << **p << dendl;
11fdf7f2 7119 ceph_assert((*p)->get_num_ref() == 1); // SUBTREE
7c673cae
FG
7120 remove_subtree((*p));
7121 in->close_dirfrag((*p)->dirfrag().frag);
7122 }
7123 dout(10) << " removing " << *in << dendl;
11fdf7f2
TL
7124 ceph_assert(!in->get_parent_dn());
7125 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7126 remove_inode(in);
7127 }
7c673cae
FG
7128 }
7129 }
7130
7131 show_subtrees();
7132}
7133
7134/**
7135 * Recursively trim the subtree rooted at directory to remove all
7136 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7137 * of those links. This is used to clear invalid data out of the cache.
7138 * Note that it doesn't clear the passed-in directory, since that's not
7139 * always safe.
7140 */
7141bool MDCache::trim_non_auth_subtree(CDir *dir)
7142{
7143 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7144
7145 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7146
94b18763
FG
7147 auto j = dir->begin();
7148 auto i = j;
7c673cae
FG
7149 while (j != dir->end()) {
7150 i = j++;
7151 CDentry *dn = i->second;
7152 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7153 CDentry::linkage_t *dnl = dn->get_linkage();
7154 if (dnl->is_primary()) { // check for subdirectories, etc
7155 CInode *in = dnl->get_inode();
7156 bool keep_inode = false;
7157 if (in->is_dir()) {
7158 list<CDir*> subdirs;
7159 in->get_dirfrags(subdirs);
7160 for (list<CDir*>::iterator subdir = subdirs.begin();
7161 subdir != subdirs.end();
7162 ++subdir) {
7163 if ((*subdir)->is_subtree_root()) {
7164 keep_inode = true;
7165 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7166 } else {
7167 if (trim_non_auth_subtree(*subdir))
7168 keep_inode = true;
7169 else {
7170 in->close_dirfrag((*subdir)->get_frag());
7171 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7172 }
7173 }
7174 }
7175
7176 }
7177 if (!keep_inode) { // remove it!
7178 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
31f18b77 7179 dir->unlink_inode(dn, false);
7c673cae 7180 remove_inode(in);
11fdf7f2 7181 ceph_assert(!dir->has_bloom());
7c673cae
FG
7182 dir->remove_dentry(dn);
7183 } else {
7184 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7185 dn->state_clear(CDentry::STATE_AUTH);
7186 in->state_clear(CInode::STATE_AUTH);
7187 }
7188 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7189 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7190 } else { // just remove it
7191 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7192 if (dnl->is_remote())
31f18b77 7193 dir->unlink_inode(dn, false);
7c673cae
FG
7194 dir->remove_dentry(dn);
7195 }
7196 }
7197 dir->state_clear(CDir::STATE_AUTH);
7198 /**
7199 * We've now checked all our children and deleted those that need it.
7200 * Now return to caller, and tell them if *we're* a keeper.
7201 */
7202 return keep_dir || dir->get_num_any();
7203}
7204
7205/*
7206 * during replay, when we determine a subtree is no longer ours, we
7207 * try to trim it from our cache. because subtrees must be connected
7208 * to the root, the fact that we can trim this tree may mean that our
7209 * children or parents can also be trimmed.
7210 */
7211void MDCache::try_trim_non_auth_subtree(CDir *dir)
7212{
7213 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7214
7215 // can we now trim child subtrees?
7216 set<CDir*> bounds;
7217 get_subtree_bounds(dir, bounds);
7218 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7219 CDir *bd = *p;
7220 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7221 bd->get_num_any() == 0 && // and empty
7222 can_trim_non_auth_dirfrag(bd)) {
7223 CInode *bi = bd->get_inode();
7224 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7225 remove_subtree(bd);
7226 bd->mark_clean();
7227 bi->close_dirfrag(bd->get_frag());
7228 }
7229 }
7230
7231 if (trim_non_auth_subtree(dir)) {
7232 // keep
7233 try_subtree_merge(dir);
7234 } else {
7235 // can we trim this subtree (and possibly our ancestors) too?
7236 while (true) {
7237 CInode *diri = dir->get_inode();
7238 if (diri->is_base()) {
7239 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7240 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7241 remove_subtree(dir);
7242 dir->mark_clean();
7243 diri->close_dirfrag(dir->get_frag());
7244
7245 dout(10) << " removing " << *diri << dendl;
11fdf7f2
TL
7246 ceph_assert(!diri->get_parent_dn());
7247 ceph_assert(diri->get_num_ref() == 0);
7c673cae
FG
7248 remove_inode(diri);
7249 }
7250 break;
7251 }
7252
7253 CDir *psub = get_subtree_root(diri->get_parent_dir());
7254 dout(10) << " parent subtree is " << *psub << dendl;
7255 if (psub->get_dir_auth().first == mds->get_nodeid())
7256 break; // we are auth, keep.
7257
7258 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7259 remove_subtree(dir);
7260 dir->mark_clean();
7261 diri->close_dirfrag(dir->get_frag());
7262
7263 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7264 if (trim_non_auth_subtree(psub))
7265 break;
7266 dir = psub;
7267 }
7268 }
7269
7270 show_subtrees();
7271}
7272
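// standby_trim_segment: called on a standby-replay MDS when a log segment is
// dropped. It marks the segment's dirty objects clean again and pushes any
// now-unreferenced dentries to the bottom of the LRU so the next trim() can
// expire them.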
7273void MDCache::standby_trim_segment(LogSegment *ls)
7274{
494da23a
TL
7275 auto try_trim_inode = [this](CInode *in) {
7276 if (in->get_num_ref() == 0 &&
7277 !in->item_open_file.is_on_list() &&
7278 in->parent != NULL &&
7279 in->parent->get_num_ref() == 0){
7280 touch_dentry_bottom(in->parent);
7281 }
7282 };
7283
7284 auto try_trim_dentry = [this](CDentry *dn) {
7285 if (dn->get_num_ref() > 0)
7286 return;
7287 auto in = dn->get_linkage()->inode;
7288 if(in && in->item_open_file.is_on_list())
7289 return;
7290 touch_dentry_bottom(dn);
7291 };
7292
7c673cae
FG
7293 ls->new_dirfrags.clear_list();
7294 ls->open_files.clear_list();
7295
7296 while (!ls->dirty_dirfrags.empty()) {
7297 CDir *dir = ls->dirty_dirfrags.front();
7298 dir->mark_clean();
494da23a
TL
7299 if (dir->inode)
7300 try_trim_inode(dir->inode);
7c673cae
FG
7301 }
7302 while (!ls->dirty_inodes.empty()) {
7303 CInode *in = ls->dirty_inodes.front();
7304 in->mark_clean();
494da23a 7305 try_trim_inode(in);
7c673cae
FG
7306 }
7307 while (!ls->dirty_dentries.empty()) {
7308 CDentry *dn = ls->dirty_dentries.front();
7309 dn->mark_clean();
494da23a 7310 try_trim_dentry(dn);
7c673cae
FG
7311 }
7312 while (!ls->dirty_parent_inodes.empty()) {
7313 CInode *in = ls->dirty_parent_inodes.front();
7314 in->clear_dirty_parent();
494da23a 7315 try_trim_inode(in);
7c673cae
FG
7316 }
7317 while (!ls->dirty_dirfrag_dir.empty()) {
7318 CInode *in = ls->dirty_dirfrag_dir.front();
7319 in->filelock.remove_dirty();
494da23a 7320 try_trim_inode(in);
7c673cae
FG
7321 }
7322 while (!ls->dirty_dirfrag_nest.empty()) {
7323 CInode *in = ls->dirty_dirfrag_nest.front();
7324 in->nestlock.remove_dirty();
494da23a 7325 try_trim_inode(in);
7c673cae
FG
7326 }
7327 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7328 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7329 in->dirfragtreelock.remove_dirty();
494da23a 7330 try_trim_inode(in);
7c673cae 7331 }
eafe8130
TL
7332 while (!ls->truncating_inodes.empty()) {
7333 auto it = ls->truncating_inodes.begin();
7334 CInode *in = *it;
7335 ls->truncating_inodes.erase(it);
7336 in->put(CInode::PIN_TRUNCATING);
7337 try_trim_inode(in);
7338 }
7c673cae
FG
7339}
7340
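// handle_cache_expire: process an MCacheExpire from another MDS. Each listed
// inode/dirfrag/dentry loses that rank as a replica, but only when the nonce
// matches (an old nonce means the peer re-acquired a replica after sending
// the expire, so it is ignored). Expires against a dirfrag that is mid-export
// are parked in delayed_expire until the export finishes or aborts.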
11fdf7f2 7341void MDCache::handle_cache_expire(const MCacheExpire::const_ref &m)
7c673cae
FG
7342{
7343 mds_rank_t from = mds_rank_t(m->get_from());
7344
7345 dout(7) << "cache_expire from mds." << from << dendl;
7346
7347 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
7348 return;
7349 }
7350
7351 set<SimpleLock *> gather_locks;
7352 // loop over realms
11fdf7f2 7353 for (const auto &p : m->realms) {
7c673cae 7354 // check container?
11fdf7f2
TL
7355 if (p.first.ino > 0) {
7356 CInode *expired_inode = get_inode(p.first.ino);
7357 ceph_assert(expired_inode); // we had better have this.
7358 CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
7359 ceph_assert(parent_dir);
7c673cae
FG
7360
7361 int export_state = -1;
7362 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7363 export_state = migrator->get_export_state(parent_dir);
11fdf7f2 7364 ceph_assert(export_state >= 0);
7c673cae
FG
7365 }
7366
7367 if (!parent_dir->is_auth() ||
7368 (export_state != -1 &&
7369 ((export_state == Migrator::EXPORT_WARNING &&
7370 migrator->export_has_warned(parent_dir,from)) ||
7371 export_state == Migrator::EXPORT_EXPORTING ||
7372 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7373 (export_state == Migrator::EXPORT_NOTIFYING &&
7374 !migrator->export_has_notified(parent_dir,from))))) {
7375
7376 // not auth.
7377 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
11fdf7f2 7378 ceph_assert(parent_dir->is_frozen_tree_root());
7c673cae
FG
7379
7380 // make a message container
11fdf7f2
TL
7381
7382 auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
7383 if (em.second)
7384 em.first->second = MCacheExpire::create(from); /* new */
7385
7c673cae 7386 // merge these expires into it
11fdf7f2 7387 em.first->second->add_realm(p.first, p.second);
7c673cae
FG
7388 continue;
7389 }
11fdf7f2 7390 ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
7c673cae
FG
7391 (export_state == Migrator::EXPORT_WARNING &&
7392 !migrator->export_has_warned(parent_dir, from)));
7393
7394 dout(7) << "expires for " << *parent_dir << dendl;
7395 } else {
7396 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7397 }
7398
7399 // INODES
11fdf7f2
TL
7400 for (const auto &q : p.second.inodes) {
7401 CInode *in = get_inode(q.first);
7402 unsigned nonce = q.second;
7c673cae
FG
7403
7404 if (!in) {
11fdf7f2 7405 dout(0) << " inode expire on " << q.first << " from " << from
7c673cae 7406 << ", don't have it" << dendl;
11fdf7f2 7407 ceph_assert(in);
7c673cae 7408 }
11fdf7f2 7409 ceph_assert(in->is_auth());
7c673cae
FG
7410 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7411
7412 // check nonce
7413 if (nonce == in->get_replica_nonce(from)) {
7414 // remove from our cached_by
7415 dout(7) << " inode expire on " << *in << " from mds." << from
7416 << " cached_by was " << in->get_replicas() << dendl;
7417 inode_remove_replica(in, from, false, gather_locks);
7418 }
7419 else {
7420 // this is an old nonce, ignore expire.
7421 dout(7) << " inode expire on " << *in << " from mds." << from
7422 << " with old nonce " << nonce
7423 << " (current " << in->get_replica_nonce(from) << "), dropping"
7424 << dendl;
7425 }
7426 }
7427
7428 // DIRS
11fdf7f2
TL
7429 for (const auto &q : p.second.dirs) {
7430 CDir *dir = get_dirfrag(q.first);
7431 unsigned nonce = q.second;
7c673cae
FG
7432
7433 if (!dir) {
11fdf7f2 7434 CInode *diri = get_inode(q.first.ino);
7c673cae
FG
7435 if (diri) {
7436 if (mds->is_rejoin() &&
7437 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7438 !diri->is_replica(from)) {
7439 list<CDir*> ls;
7440 diri->get_nested_dirfrags(ls);
11fdf7f2 7441 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7442 << " while rejoining, inode isn't replicated" << dendl;
7443 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7444 dir = *q;
7445 if (dir->is_replica(from)) {
7446 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7447 dir->remove_replica(from);
7448 }
7449 }
7450 continue;
7451 }
11fdf7f2 7452 CDir *other = diri->get_approx_dirfrag(q.first.frag);
7c673cae 7453 if (other) {
11fdf7f2 7454 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7455 << " have " << *other << ", mismatched frags, dropping" << dendl;
7456 continue;
7457 }
7458 }
11fdf7f2 7459 dout(0) << " dir expire on " << q.first << " from " << from
7c673cae 7460 << ", don't have it" << dendl;
11fdf7f2 7461 ceph_assert(dir);
7c673cae
FG
7462 }
7463 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7464
11fdf7f2 7465 ceph_assert(dir->is_auth());
7c673cae
FG
7466
7467 // check nonce
7468 if (nonce == dir->get_replica_nonce(from)) {
7469 // remove from our cached_by
7470 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7471 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7472 dir->remove_replica(from);
7473 }
7474 else {
7475 // this is an old nonce, ignore expire.
7476 dout(7) << " dir expire on " << *dir << " from mds." << from
7477 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7478 << "), dropping" << dendl;
7479 }
7480 }
7481
7482 // DENTRIES
11fdf7f2
TL
7483 for (const auto &pd : p.second.dentries) {
7484 dout(10) << " dn expires in dir " << pd.first << dendl;
7485 CInode *diri = get_inode(pd.first.ino);
7486 ceph_assert(diri);
7487 CDir *dir = diri->get_dirfrag(pd.first.frag);
7c673cae
FG
7488
7489 if (!dir) {
11fdf7f2 7490 dout(0) << " dn expires on " << pd.first << " from " << from
7c673cae
FG
7491 << ", must have refragmented" << dendl;
7492 } else {
11fdf7f2 7493 ceph_assert(dir->is_auth());
7c673cae
FG
7494 }
7495
11fdf7f2
TL
7496 for (const auto &p : pd.second) {
7497 unsigned nonce = p.second;
7c673cae
FG
7498 CDentry *dn;
7499
7500 if (dir) {
11fdf7f2 7501 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7502 } else {
7503 // which dirfrag for this dentry?
11fdf7f2
TL
7504 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
7505 ceph_assert(dir);
7506 ceph_assert(dir->is_auth());
7507 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7508 }
7509
7510 if (!dn) {
7511 if (dir)
11fdf7f2 7512 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
7c673cae 7513 else
11fdf7f2 7514 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
7c673cae 7515 }
11fdf7f2 7516 ceph_assert(dn);
7c673cae
FG
7517
7518 if (nonce == dn->get_replica_nonce(from)) {
7519 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7520 dentry_remove_replica(dn, from, gather_locks);
7521 }
7522 else {
7523 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7524 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7525 << "), dropping" << dendl;
7526 }
7527 }
7528 }
7529 }
7530
7c673cae
FG
7531 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7532 if (!(*p)->is_stable())
7533 mds->locker->eval_gather(*p);
7534 }
7535}
7536
7537void MDCache::process_delayed_expire(CDir *dir)
7538{
7539 dout(7) << "process_delayed_expire on " << *dir << dendl;
11fdf7f2
TL
7540 for (const auto &p : delayed_expire[dir]) {
7541 handle_cache_expire(p.second);
7542 }
7c673cae
FG
7543 delayed_expire.erase(dir);
7544}
7545
7546void MDCache::discard_delayed_expire(CDir *dir)
7547{
7548 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7c673cae
FG
7549 delayed_expire.erase(dir);
7550}
7551
7552void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7553 set<SimpleLock *>& gather_locks)
7554{
7555 in->remove_replica(from);
11fdf7f2 7556 in->set_mds_caps_wanted(from, 0);
7c673cae
FG
7557
7558 // note: this code calls _eval more often than it needs to!
7559 // fix lock
7560 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7561 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7562 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7563 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7564 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7565 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7566
7567 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7568 // Don't remove the recovering mds from lock's gathering list because
7569 // it may hold rejoined wrlocks.
7570 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7571 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7572 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7573}
7574
7575void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7576{
7577 dn->remove_replica(from);
7578
7579 // fix lock
7580 if (dn->lock.remove_replica(from))
7581 gather_locks.insert(&dn->lock);
7582
 7583 // Replicated strays might now be eligible for purge
11fdf7f2 7584 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7c673cae
FG
7585 if (dnl->is_primary()) {
7586 maybe_eval_stray(dnl->get_inode());
7587 }
7588}
7589
7590void MDCache::trim_client_leases()
7591{
7592 utime_t now = ceph_clock_now();
7593
7594 dout(10) << "trim_client_leases" << dendl;
7595
eafe8130
TL
7596 std::size_t pool = 0;
7597 for (const auto& list : client_leases) {
7598 pool += 1;
7599 if (list.empty())
7c673cae
FG
7600 continue;
7601
eafe8130
TL
7602 auto before = list.size();
7603 while (!list.empty()) {
7604 ClientLease *r = list.front();
7c673cae
FG
7605 if (r->ttl > now) break;
7606 CDentry *dn = static_cast<CDentry*>(r->parent);
7607 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7608 dn->remove_client_lease(r, mds->locker);
7609 }
eafe8130 7610 auto after = list.size();
7c673cae
FG
7611 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7612 << (before-after) << " leases, " << after << " left" << dendl;
7613 }
7614}
7615
7616
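// check_memory_usage: sample process memory, log cache and capability counts,
// ask the Server to recall client state when the cache is over its target,
// and (with tcmalloc) release free heap back to the OS once we drop back
// under the limit.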
7617void MDCache::check_memory_usage()
7618{
7619 static MemoryModel mm(g_ceph_context);
7620 static MemoryModel::snap last;
7621 mm.sample(&last);
7622 static MemoryModel::snap baseline = last;
7623
7624 // check client caps
11fdf7f2 7625 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7626 double caps_per_inode = 0.0;
7c673cae 7627 if (CInode::count())
181888fb 7628 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae 7629
a8e16298 7630 dout(2) << "Memory usage: "
7c673cae
FG
7631 << " total " << last.get_total()
7632 << ", rss " << last.get_rss()
7633 << ", heap " << last.get_heap()
7634 << ", baseline " << baseline.get_heap()
7c673cae
FG
7635 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7636 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7637 << dendl;
7638
c07f9fc5 7639 mds->update_mlogger();
7c673cae
FG
7640 mds->mlogger->set(l_mdm_rss, last.get_rss());
7641 mds->mlogger->set(l_mdm_heap, last.get_heap());
7642
181888fb 7643 if (cache_toofull()) {
92f5a8d4 7644 mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM);
7c673cae
FG
7645 }
7646
7647 // If the cache size had exceeded its limit, but we're back in bounds
7648 // now, free any unused pool memory so that our memory usage isn't
7649 // permanently bloated.
181888fb 7650 if (exceeded_size_limit && !cache_toofull()) {
7c673cae
FG
7651 // Only do this once we are back in bounds: otherwise the releases would
7652 // slow down whatever process caused us to exceed bounds to begin with
7653 if (ceph_using_tcmalloc()) {
a8e16298 7654 dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
7c673cae
FG
7655 << dendl;
7656 ceph_heap_release_free_memory();
7657 }
7658 exceeded_size_limit = false;
7659 }
7660}
7661
7662
7663
7664// =========================================================================================
7665// shutdown
7666
7667class C_MDC_ShutdownCheck : public MDCacheContext {
7668public:
7669 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7670 void finish(int) override {
7671 mdcache->shutdown_check();
7672 }
7673};
7674
7675void MDCache::shutdown_check()
7676{
7677 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7678
7679 // cache
7680 char old_val[32] = { 0 };
7681 char *o = old_val;
11fdf7f2
TL
7682 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7683 g_conf().set_val("debug_mds", "10");
7684 g_conf().apply_changes(nullptr);
7c673cae 7685 show_cache();
11fdf7f2
TL
7686 g_conf().set_val("debug_mds", old_val);
7687 g_conf().apply_changes(nullptr);
7688 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae
FG
7689
7690 // this
31f18b77 7691 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7692 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7693
7694
7695 if (mds->objecter->is_active()) {
7696 dout(0) << "objecter still active" << dendl;
7697 mds->objecter->dump_active();
7698 }
7699}
7700
7701
7702void MDCache::shutdown_start()
7703{
a8e16298 7704 dout(5) << "shutdown_start" << dendl;
7c673cae 7705
11fdf7f2
TL
7706 if (g_conf()->mds_shutdown_check)
7707 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae 7708
11fdf7f2 7709 // g_conf()->debug_mds = 10;
7c673cae
FG
7710}
7711
7712
7713
7714bool MDCache::shutdown_pass()
7715{
7716 dout(7) << "shutdown_pass" << dendl;
7717
7718 if (mds->is_stopped()) {
7719 dout(7) << " already shut down" << dendl;
7720 show_cache();
7721 show_subtrees();
7722 return true;
7723 }
7724
7725 // empty stray dir
28e407b8 7726 bool strays_all_exported = shutdown_export_strays();
7c673cae
FG
7727
7728 // trim cache
181888fb 7729 trim(UINT64_MAX);
31f18b77 7730 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7731
28e407b8 7732 // Export all subtrees to another active (usually rank 0) if not rank 0
7c673cae
FG
7733 int num_auth_subtree = 0;
7734 if (!subtrees.empty() &&
28e407b8 7735 mds->get_nodeid() != 0) {
7c673cae
FG
7736 dout(7) << "looking for subtrees to export to mds0" << dendl;
7737 list<CDir*> ls;
7738 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7739 it != subtrees.end();
7740 ++it) {
7741 CDir *dir = it->first;
7742 if (dir->get_inode()->is_mdsdir())
7743 continue;
7744 if (dir->is_auth()) {
7745 num_auth_subtree++;
7746 if (dir->is_frozen() ||
7747 dir->is_freezing() ||
7748 dir->is_ambiguous_dir_auth() ||
7749 dir->state_test(CDir::STATE_EXPORTING))
7750 continue;
7751 ls.push_back(dir);
7752 }
7753 }
28e407b8
AA
7754
7755 migrator->clear_export_queue();
7c673cae
FG
7756 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7757 CDir *dir = *p;
7758 mds_rank_t dest = dir->get_inode()->authority().first;
7759 if (dest > 0 && !mds->mdsmap->is_active(dest))
7760 dest = 0;
7761 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7762 migrator->export_dir_nicely(dir, dest);
7763 }
7764 }
7765
28e407b8
AA
7766 if (!strays_all_exported) {
7767 dout(7) << "waiting for strays to migrate" << dendl;
7768 return false;
7769 }
7770
7c673cae 7771 if (num_auth_subtree > 0) {
11fdf7f2 7772 ceph_assert(mds->get_nodeid() > 0);
7c673cae
FG
7773 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7774 show_subtrees();
7775 return false;
7776 }
7777
7778 // close out any sessions (and open files!) before we try to trim the log, etc.
7779 if (mds->sessionmap.have_unclosed_sessions()) {
7780 if (!mds->server->terminating_sessions)
7781 mds->server->terminate_sessions();
7782 return false;
7783 }
7784
28e407b8
AA
7785 // Fully trim the log so that all objects in cache are clean and may be
7786 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7787 // trim the log such that the cache eventually becomes clean.
f64942e4
AA
7788 if (mds->mdlog->get_num_segments() > 0) {
7789 auto ls = mds->mdlog->get_current_segment();
7790 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7791 // Current segment contains events other than subtreemap or
7792 // there are dirty dirfrags (see CDir::log_mark_dirty())
7793 mds->mdlog->start_new_segment();
7794 mds->mdlog->flush();
7795 }
7796 }
7797 mds->mdlog->trim_all();
28e407b8
AA
7798 if (mds->mdlog->get_num_segments() > 1) {
7799 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7800 return false;
7801 }
7802
7803 // drop our reference to our stray dir inode
7804 for (int i = 0; i < NUM_STRAY; ++i) {
7805 if (strays[i] &&
7806 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7807 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7808 strays[i]->put(CInode::PIN_STRAY);
7809 strays[i]->put_stickydirs();
7810 }
7811 }
7812
7c673cae
FG
7813 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7814 if (mydir && !mydir->is_subtree_root())
7815 mydir = NULL;
7816
7817 // subtrees map not empty yet?
7818 if (subtrees.size() > (mydir ? 1 : 0)) {
7819 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7820 show_subtrees();
7821 migrator->show_importing();
7822 migrator->show_exporting();
7823 if (!migrator->is_importing() && !migrator->is_exporting())
7824 show_cache();
7825 return false;
7826 }
11fdf7f2
TL
7827 ceph_assert(!migrator->is_exporting());
7828 ceph_assert(!migrator->is_importing());
7c673cae 7829
f64942e4
AA
7830 // replicas may dirty scatter locks
7831 if (myin && myin->is_replicated()) {
7832 dout(7) << "still have replicated objects" << dendl;
7833 return false;
7834 }
7835
11fdf7f2
TL
7836 if ((myin && myin->get_num_auth_pins()) ||
7837 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
181888fb
FG
7838 dout(7) << "still have auth pinned objects" << dendl;
7839 return false;
7840 }
7841
7c673cae
FG
7842 // (only do this once!)
7843 if (!mds->mdlog->is_capped()) {
7844 dout(7) << "capping the log" << dendl;
7845 mds->mdlog->cap();
7c673cae
FG
7846 }
7847
f64942e4
AA
7848 if (!mds->mdlog->empty())
7849 mds->mdlog->trim(0);
7850
7c673cae
FG
7851 if (!mds->mdlog->empty()) {
7852 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7853 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7854 return false;
7855 }
7856
7857 if (!did_shutdown_log_cap) {
7858 // flush journal header
7859 dout(7) << "writing header for (now-empty) journal" << dendl;
11fdf7f2 7860 ceph_assert(mds->mdlog->empty());
7c673cae
FG
7861 mds->mdlog->write_head(0);
7862 // NOTE: filer active checker below will block us until this completes.
7863 did_shutdown_log_cap = true;
7864 return false;
7865 }
7866
7867 // filer active?
7868 if (mds->objecter->is_active()) {
7869 dout(7) << "objecter still active" << dendl;
7870 mds->objecter->dump_active();
7871 return false;
7872 }
7873
7874 // trim what we can from the cache
31f18b77
FG
7875 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7876 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7877 show_cache();
7878 //dump();
7879 return false;
7880 }
31f18b77
FG
7881
7882 // make mydir subtree go away
7883 if (mydir) {
7884 if (mydir->get_num_ref() > 1) { // subtree pin
7885 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7886 show_cache();
7887 return false;
7888 }
7889
7890 remove_subtree(mydir);
7891 myin->close_dirfrag(mydir->get_frag());
7892 }
11fdf7f2 7893 ceph_assert(subtrees.empty());
31f18b77 7894
1adf2230 7895 if (myin) {
31f18b77 7896 remove_inode(myin);
11fdf7f2 7897 ceph_assert(!myin);
1adf2230
AA
7898 }
7899
11fdf7f2
TL
7900 if (global_snaprealm) {
7901 remove_inode(global_snaprealm->inode);
7902 global_snaprealm = nullptr;
7903 }
7904
7c673cae 7905 // done!
a8e16298 7906 dout(5) << "shutdown done." << dendl;
7c673cae
FG
7907 return true;
7908}
7909
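// shutdown_export_strays: on a non-zero rank that is shutting down, walk our
// stray dirfrags and migrate each stray dentry to rank 0, at most
// MAX_EXPORTING at a time; shutdown_export_next records where the scan left
// off. Returns true once everything has been exported (or is being purged).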
7910bool MDCache::shutdown_export_strays()
7911{
f64942e4
AA
7912 static const unsigned MAX_EXPORTING = 100;
7913
7c673cae
FG
7914 if (mds->get_nodeid() == 0)
7915 return true;
f64942e4
AA
7916
7917 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7918 return false;
7919
7920 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7921 << " '" << shutdown_export_next.second << "'" << dendl;
7c673cae
FG
7922
7923 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
f64942e4 7924 bool all_exported = false;
7c673cae 7925
f64942e4
AA
7926again:
7927 auto next = shutdown_export_next;
7c673cae 7928
7c673cae 7929 for (int i = 0; i < NUM_STRAY; ++i) {
f64942e4
AA
7930 CInode *strayi = strays[i];
7931 if (!strayi ||
7932 !strayi->state_test(CInode::STATE_STRAYPINNED))
7933 continue;
7934 if (strayi->ino() < next.first.ino)
7c673cae 7935 continue;
7c673cae 7936
f64942e4
AA
7937 deque<CDir*> dfls;
7938 strayi->get_dirfrags(dfls);
7c673cae 7939
f64942e4
AA
7940 while (!dfls.empty()) {
7941 CDir *dir = dfls.front();
7942 dfls.pop_front();
7943
7944 if (dir->dirfrag() < next.first)
7c673cae 7945 continue;
f64942e4
AA
7946 if (next.first < dir->dirfrag()) {
7947 next.first = dir->dirfrag();
7948 next.second.clear();
7949 }
7950
7951 if (!dir->is_complete()) {
11fdf7f2 7952 MDSContext *fin = nullptr;
f64942e4
AA
7953 if (shutdown_exporting_strays.empty()) {
7954 fin = new MDSInternalContextWrapper(mds,
7955 new FunctionContext([this](int r) {
7956 shutdown_export_strays();
7957 })
7958 );
7959 }
7960 dir->fetch(fin);
7961 goto done;
7c673cae
FG
7962 }
7963
f64942e4
AA
7964 CDir::dentry_key_map::iterator it;
7965 if (next.second.empty()) {
7966 it = dir->begin();
7c673cae 7967 } else {
f64942e4
AA
7968 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
7969 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
7c673cae 7970 }
f64942e4
AA
7971
7972 for (; it != dir->end(); ++it) {
7973 CDentry *dn = it->second;
7974 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7975 if (dnl->is_null())
7976 continue;
7977
7978 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
11fdf7f2 7979 next.second = it->first.name;
f64942e4
AA
7980 goto done;
7981 }
7982
7983 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
7984 if (!ret.second) {
7985 dout(10) << "already exporting/purging " << *dn << dendl;
7986 continue;
7987 }
7988
7989 // Don't try to migrate anything that is actually
7990 // being purged right now
7991 if (!dn->state_test(CDentry::STATE_PURGING))
7992 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7993
7994 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
7995 ++it;
7996 if (it != dir->end()) {
11fdf7f2 7997 next.second = it->first.name;
f64942e4
AA
7998 } else {
7999 if (dfls.empty())
8000 next.first.ino.val++;
8001 else
8002 next.first = dfls.front()->dirfrag();
8003 next.second.clear();
8004 }
8005 goto done;
8006 }
8007 }
8008 }
8009 }
8010
8011 if (shutdown_exporting_strays.empty()) {
8012 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
8013 if (first_df < shutdown_export_next.first ||
8014 !shutdown_export_next.second.empty()) {
8015 shutdown_export_next.first = first_df;
8016 shutdown_export_next.second.clear();
8017 goto again;
7c673cae 8018 }
f64942e4 8019 all_exported = true;
7c673cae
FG
8020 }
8021
f64942e4
AA
8022done:
8023 shutdown_export_next = next;
8024 return all_exported;
7c673cae
FG
8025}
8026
8027// ========= messaging ==============
8028
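// dispatch: route an incoming inter-MDS message to its handler by type;
// unknown message types abort.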
11fdf7f2 8029void MDCache::dispatch(const Message::const_ref &m)
7c673cae
FG
8030{
8031 switch (m->get_type()) {
8032
8033 // RESOLVE
8034 case MSG_MDS_RESOLVE:
11fdf7f2 8035 handle_resolve(MMDSResolve::msgref_cast(m));
7c673cae
FG
8036 break;
8037 case MSG_MDS_RESOLVEACK:
11fdf7f2 8038 handle_resolve_ack(MMDSResolveAck::msgref_cast(m));
7c673cae
FG
8039 break;
8040
8041 // REJOIN
8042 case MSG_MDS_CACHEREJOIN:
11fdf7f2 8043 handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m));
7c673cae
FG
8044 break;
8045
8046 case MSG_MDS_DISCOVER:
11fdf7f2 8047 handle_discover(MDiscover::msgref_cast(m));
7c673cae
FG
8048 break;
8049 case MSG_MDS_DISCOVERREPLY:
11fdf7f2 8050 handle_discover_reply(MDiscoverReply::msgref_cast(m));
7c673cae
FG
8051 break;
8052
8053 case MSG_MDS_DIRUPDATE:
11fdf7f2 8054 handle_dir_update(MDirUpdate::msgref_cast(m));
7c673cae
FG
8055 break;
8056
8057 case MSG_MDS_CACHEEXPIRE:
11fdf7f2 8058 handle_cache_expire(MCacheExpire::msgref_cast(m));
7c673cae
FG
8059 break;
8060
8061 case MSG_MDS_DENTRYLINK:
11fdf7f2 8062 handle_dentry_link(MDentryLink::msgref_cast(m));
7c673cae
FG
8063 break;
8064 case MSG_MDS_DENTRYUNLINK:
11fdf7f2 8065 handle_dentry_unlink(MDentryUnlink::msgref_cast(m));
7c673cae
FG
8066 break;
8067
8068 case MSG_MDS_FRAGMENTNOTIFY:
11fdf7f2 8069 handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m));
7c673cae 8070 break;
a8e16298 8071 case MSG_MDS_FRAGMENTNOTIFYACK:
11fdf7f2 8072 handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m));
a8e16298 8073 break;
7c673cae
FG
8074
8075 case MSG_MDS_FINDINO:
11fdf7f2 8076 handle_find_ino(MMDSFindIno::msgref_cast(m));
7c673cae
FG
8077 break;
8078 case MSG_MDS_FINDINOREPLY:
11fdf7f2 8079 handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m));
7c673cae
FG
8080 break;
8081
8082 case MSG_MDS_OPENINO:
11fdf7f2 8083 handle_open_ino(MMDSOpenIno::msgref_cast(m));
7c673cae
FG
8084 break;
8085 case MSG_MDS_OPENINOREPLY:
11fdf7f2
TL
8086 handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m));
8087 break;
8088
8089 case MSG_MDS_SNAPUPDATE:
8090 handle_snap_update(MMDSSnapUpdate::msgref_cast(m));
7c673cae
FG
8091 break;
8092
8093 default:
8094 derr << "cache unknown message " << m->get_type() << dendl;
11fdf7f2 8095 ceph_abort_msg("cache unknown message");
7c673cae
FG
8096 }
8097}
8098
11fdf7f2
TL
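// path_traverse: walk 'path' starting from its base ino, filling *pdnvec and
// *pin along the way. Returns a negative errno on failure (-ENOENT, -ENOTDIR,
// -ESTALE, ...), 1 if the traversal blocked and a waiter or discover was
// queued, and 0 on success; 'onfail' selects the discover/forward behaviour
// for dentries we do not have.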
8099int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // who
8100 const filepath& path, // what
7c673cae 8101 vector<CDentry*> *pdnvec, // result
11fdf7f2 8102 CInode **pin,
7c673cae
FG
8103 int onfail)
8104{
8105 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
8106 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
8107 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
8108
11fdf7f2 8109 ceph_assert(!forward || mdr); // forward requires a request
7c673cae
FG
8110
8111 snapid_t snapid = CEPH_NOSNAP;
8112 if (mdr)
8113 mdr->snapid = snapid;
8114
8115 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
8116
8117 if (mds->logger) mds->logger->inc(l_mds_traverse);
8118
8119 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8120 CInode *cur = get_inode(path.get_ino());
8121 if (cur == NULL) {
8122 if (MDS_INO_IS_MDSDIR(path.get_ino()))
11fdf7f2 8123 open_foreign_mdsdir(path.get_ino(), cf.build());
7c673cae
FG
8124 else {
8125 //ceph_abort(); // hrm.. broken
8126 return -ESTALE;
8127 }
8128 return 1;
8129 }
8130 if (cur->state_test(CInode::STATE_PURGING))
8131 return -ESTALE;
8132
8133 // make sure snaprealms are open...
11fdf7f2
TL
8134 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8135 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8136 return 1;
8137 }
8138
8139 // start trace
8140 if (pdnvec)
8141 pdnvec->clear();
8142 if (pin)
8143 *pin = cur;
8144
8145 unsigned depth = 0;
8146 while (depth < path.depth()) {
8147 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8148 << "' snapid " << snapid << dendl;
8149
8150 if (!cur->is_dir()) {
8151 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8152 return -ENOTDIR;
8153 }
8154
8155 // walk into snapdir?
8156 if (path[depth].length() == 0) {
8157 dout(10) << "traverse: snapdir" << dendl;
8158 if (!mdr)
8159 return -EINVAL;
8160 snapid = CEPH_SNAPDIR;
8161 mdr->snapid = snapid;
8162 depth++;
8163 continue;
8164 }
8165 // walk thru snapdir?
8166 if (snapid == CEPH_SNAPDIR) {
8167 if (!mdr)
8168 return -EINVAL;
8169 SnapRealm *realm = cur->find_snaprealm();
8170 snapid = realm->resolve_snapname(path[depth], cur->ino());
8171 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
11fdf7f2
TL
8172 if (!snapid) {
8173 CInode *t = cur;
8174 while (t) {
8175 // if snaplock isn't readable, it's possible that another mds is creating
8176 // a snapshot, but the snap update message hasn't been received.
8177 if (!t->snaplock.can_read(client)) {
8178 dout(10) << " non-readable snaplock on " << *t << dendl;
8179 t->snaplock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8180 return 1;
8181 }
8182 CDentry *pdn = t->get_projected_parent_dn();
8183 t = pdn ? pdn->get_dir()->get_inode() : NULL;
8184 }
7c673cae 8185 return -ENOENT;
11fdf7f2 8186 }
7c673cae
FG
8187 mdr->snapid = snapid;
8188 depth++;
8189 continue;
8190 }
8191
8192 // open dir
8193 frag_t fg = cur->pick_dirfrag(path[depth]);
8194 CDir *curdir = cur->get_dirfrag(fg);
8195 if (!curdir) {
8196 if (cur->is_auth()) {
8197 // parent dir frozen_dir?
8198 if (cur->is_frozen()) {
8199 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
11fdf7f2 8200 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
7c673cae
FG
8201 return 1;
8202 }
8203 curdir = cur->get_or_open_dirfrag(this, fg);
8204 } else {
8205 // discover?
8206 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
11fdf7f2 8207 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
7c673cae 8208 null_okay);
8209 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8210 return 1;
8211 }
8212 }
11fdf7f2 8213 ceph_assert(curdir);
7c673cae
FG
8214
8215#ifdef MDS_VERIFY_FRAGSTAT
8216 if (curdir->is_complete())
8217 curdir->verify_fragstat();
8218#endif
8219
8220 // frozen?
8221 /*
8222 if (curdir->is_frozen()) {
8223 // doh!
8224 // FIXME: traverse is allowed?
8225 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8226 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8227 if (onfinish) delete onfinish;
8228 return 1;
8229 }
8230 */
8231
8232 // Before doing dirfrag->dn lookup, compare with DamageTable's
8233 // record of which dentries were unreadable
8234 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8235 dout(4) << "traverse: stopped lookup at damaged dentry "
8236 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8237 return -EIO;
8238 }
8239
8240 // dentry
8241 CDentry *dn = curdir->lookup(path[depth], snapid);
8242 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
8243
8244 // null and last_bit and xlocked by me?
8245 if (dnl && dnl->is_null() && null_okay) {
8246 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
8247 if (pdnvec)
8248 pdnvec->push_back(dn);
8249 if (pin)
8250 *pin = 0;
8251 break; // done!
8252 }
8253
8254 if (dnl &&
8255 dn->lock.is_xlocked() &&
8256 dn->lock.get_xlock_by() != mdr &&
8257 !dn->lock.can_read(client) &&
8258 (dnl->is_null() || forward)) {
8259 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
11fdf7f2 8260 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
7c673cae
FG
8261 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
8262 mds->mdlog->flush();
8263 return 1;
8264 }
8265
8266 // can we conclude ENOENT?
8267 if (dnl && dnl->is_null()) {
8268 if (dn->lock.can_read(client) ||
8269 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8270 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8271 if (pdnvec) {
8272 if (depth == path.depth() - 1)
8273 pdnvec->push_back(dn);
8274 else
8275 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8276 }
8277 return -ENOENT;
8278 } else {
8279 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
11fdf7f2 8280 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
7c673cae
FG
8281 return 1;
8282 }
8283 }
8284
8285 if (dnl && !dnl->is_null()) {
8286 CInode *in = dnl->get_inode();
8287
8288 // do we have inode?
8289 if (!in) {
11fdf7f2 8290 ceph_assert(dnl->is_remote());
7c673cae
FG
8291 // do i have it?
8292 in = get_inode(dnl->get_remote_ino());
8293 if (in) {
8294 dout(7) << "linking in remote in " << *in << dendl;
8295 dn->link_remote(dnl, in);
8296 } else {
8297 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
11fdf7f2 8298 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7c673cae
FG
8299 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8300 dout(4) << "traverse: remote dentry points to damaged ino "
8301 << *dn << dendl;
8302 return -EIO;
8303 }
11fdf7f2 8304 open_remote_dentry(dn, true, cf.build(),
7c673cae 8305 (null_okay && depth == path.depth() - 1));
8306 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8307 return 1;
8308 }
8309 }
8310
8311 cur = in;
8312 // make sure snaprealms are open...
11fdf7f2
TL
8313 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8314 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8315 return 1;
8316 }
8317
8318 // add to trace, continue.
8319 touch_inode(cur);
8320 if (pdnvec)
8321 pdnvec->push_back(dn);
8322 if (pin)
8323 *pin = cur;
8324 depth++;
8325 continue;
8326 }
8327
8328
8329 // MISS. dentry doesn't exist.
8330 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8331
8332 if (curdir->is_auth()) {
8333 // dentry is mine.
8334 if (curdir->is_complete() ||
8335 (snapid == CEPH_NOSNAP &&
8336 curdir->has_bloom() &&
11fdf7f2 8337 !curdir->is_in_bloom(path[depth]))) {
7c673cae
FG
8338 // file not found
8339 if (pdnvec) {
8340 // instantiate a null dn?
8341 if (depth < path.depth()-1){
8342 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8343 dn = NULL;
8344 } else if (dn) {
8345 ceph_abort(); // should have fallen out in ->is_null() check above
8346 } else if (curdir->is_frozen()) {
8347 dout(20) << " not adding null to frozen dir " << dendl;
8348 } else if (snapid < CEPH_MAXSNAP) {
8349 dout(20) << " not adding null for snapid " << snapid << dendl;
8350 } else {
8351 // create a null dentry
8352 dn = curdir->add_null_dentry(path[depth]);
8353 dout(20) << " added null " << *dn << dendl;
8354 }
8355 if (dn)
8356 pdnvec->push_back(dn);
8357 else
8358 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8359 }
8360 return -ENOENT;
8361 } else {
8362
8363 // Check DamageTable for missing fragments before trying to fetch
8364 // this
8365 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8366 dout(4) << "traverse: damaged dirfrag " << *curdir
8367 << ", blocking fetch" << dendl;
8368 return -EIO;
8369 }
8370
8371 // directory isn't complete; reload
8372 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8373 touch_inode(cur);
11fdf7f2 8374 curdir->fetch(cf.build(), path[depth]);
7c673cae
FG
8375 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8376 return 1;
8377 }
8378 } else {
8379 // dirfrag/dentry is not mine.
8380 mds_authority_t dauth = curdir->authority();
8381
8382 if (forward &&
11fdf7f2 8383 mdr && mdr->client_request &&
7c673cae 8384 (int)depth < mdr->client_request->get_num_fwd()) {
8385 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8386 << " < fwd " << mdr->client_request->get_num_fwd()
8387 << ", discovering instead of forwarding" << dendl;
8388 discover = true;
8389 }
8390
8391 if ((discover || null_okay)) {
8392 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
11fdf7f2 8393 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
7c673cae 8394 null_okay);
8395 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8396 return 1;
8397 }
8398 if (forward) {
8399 // forward
8400 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8401
8402 if (curdir->is_ambiguous_auth()) {
8403 // wait
8404 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
11fdf7f2 8405 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
7c673cae
FG
8406 return 1;
8407 }
8408
8409 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
11fdf7f2
TL
8410
8411 request_forward(mdr, dauth.first);
8412
7c673cae 8413 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
7c673cae 8414 return 2;
11fdf7f2 8415 }
7c673cae 8416 }
11fdf7f2 8417
7c673cae
FG
8418 ceph_abort(); // i shouldn't get here
8419 }
8420
8421 // success.
8422 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8423 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8424 if (mdr)
11fdf7f2 8425 ceph_assert(mdr->snapid == snapid);
7c673cae
FG
8426 return 0;
8427}
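// A minimal usage sketch (illustrative, not part of the original source),
// modelled on the message-driven call in handle_find_ino_reply() below.
// Return codes follow the convention used above: 0 = traversal complete
// (trace/pin filled in), >0 = a waiter was queued and the caller will be
// retried, <0 = hard error such as -ENOENT, -ENOTDIR or -ESTALE.
//
//   CF_MDS_RetryMessageFactory cf(mds, m);   // re-queue 'm' when we can progress
//   MDRequestRef null_ref;
//   vector<CDentry*> trace;
//   CInode *in = nullptr;
//   int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
//   if (r > 0)
//     return;        // waiting; the message will be redelivered
//   if (r < 0) {
//     // handle -ENOENT / -ESTALE / -EIO ...
//   }
//   // r == 0: 'trace' holds the dentries and 'in' the final inode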
8428
8429CInode *MDCache::cache_traverse(const filepath& fp)
8430{
8431 dout(10) << "cache_traverse " << fp << dendl;
8432
8433 CInode *in;
8434 if (fp.get_ino())
8435 in = get_inode(fp.get_ino());
8436 else
8437 in = root;
8438 if (!in)
8439 return NULL;
8440
8441 for (unsigned i = 0; i < fp.depth(); i++) {
11fdf7f2 8442 std::string_view dname = fp[i];
7c673cae
FG
8443 frag_t fg = in->pick_dirfrag(dname);
8444 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8445 CDir *curdir = in->get_dirfrag(fg);
8446 if (!curdir)
8447 return NULL;
8448 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8449 if (!dn)
8450 return NULL;
8451 in = dn->get_linkage()->get_inode();
8452 if (!in)
8453 return NULL;
8454 }
8455 dout(10) << " got " << *in << dendl;
8456 return in;
8457}
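// Sketch (assumption, not original source): cache_traverse() is a purely
// in-memory lookup -- it never fetches, discovers or waits -- so a NULL
// return only means some component of the path is not cached on this rank.
//
//   if (CInode *in = cache_traverse(fp)) {
//     // fast path: every dentry of 'fp' was already in cache
//   } else {
//     // fall back to path_traverse(), which can discover/fetch/forward
//   }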
8458
8459
8460/**
8461 * open_remote_dir -- open up a remote dirfrag
8462 *
8463 * @param diri base inode
8464 * @param approxfg approximate fragment.
8465 * @param fin completion callback
8466 */
11fdf7f2 8467void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
7c673cae
FG
8468{
8469 dout(10) << "open_remote_dir on " << *diri << dendl;
11fdf7f2
TL
8470 ceph_assert(diri->is_dir());
8471 ceph_assert(!diri->is_auth());
8472 ceph_assert(diri->get_dirfrag(approxfg) == 0);
7c673cae 8473
224ce89b 8474 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8475}
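// Sketch (assumption): a typical caller holds a replica of 'diri' but not the
// dirfrag itself, so it kicks off a discover and retries the operation once
// the replica arrives:
//
//   if (!diri->is_auth() && !diri->get_dirfrag(fg)) {
//     open_remote_dirfrag(diri, fg, new C_MDS_RetryRequest(this, mdr));
//     return;
//   }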
8476
8477
8478/**
8479 * get_dentry_inode - get or open inode
8480 *
8481 * @param dn the dentry
8482 * @param mdr current request
8483 *
8484 * will return inode for primary, or link up/open up remote link's inode as necessary.
8485 * If it's not available right now, puts mdr on wait list and returns null.
8486 */
8487CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8488{
8489 CDentry::linkage_t *dnl;
8490 if (projected)
8491 dnl = dn->get_projected_linkage();
8492 else
8493 dnl = dn->get_linkage();
8494
11fdf7f2 8495 ceph_assert(!dnl->is_null());
7c673cae
FG
8496
8497 if (dnl->is_primary())
8498 return dnl->inode;
8499
11fdf7f2 8500 ceph_assert(dnl->is_remote());
7c673cae
FG
8501 CInode *in = get_inode(dnl->get_remote_ino());
8502 if (in) {
8503 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8504 dn->link_remote(dnl, in);
8505 return in;
8506 } else {
8507 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8508 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8509 return 0;
8510 }
8511}
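// Sketch (assumption): the usual call-site pattern, relying on the contract
// documented above -- a NULL return means 'mdr' was parked on a waiter list
// (via open_remote_dentry) and the caller should simply return and wait to
// be re-dispatched.
//
//   CInode *in = get_dentry_inode(dn, mdr, true /* projected */);
//   if (!in)
//     return;   // remote inode is being opened; mdr will be retried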
8512
8513struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8514 CDentry *dn;
8515 inodeno_t ino;
11fdf7f2 8516 MDSContext *onfinish;
7c673cae 8517 bool want_xlocked;
11fdf7f2 8518 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
31f18b77 8519 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8520 dn->get(MDSCacheObject::PIN_PTRWAITER);
8521 }
7c673cae
FG
8522 void finish(int r) override {
8523 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8524 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8525 }
8526};
8527
11fdf7f2 8528void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
7c673cae
FG
8529{
8530 dout(10) << "open_remote_dentry " << *dn << dendl;
8531 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8532 inodeno_t ino = dnl->get_remote_ino();
8533 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8534 open_ino(ino, pool,
8535 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8536}
8537
11fdf7f2 8538void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae 8539 bool want_xlocked, int r)
8540{
8541 if (r < 0) {
31f18b77
FG
8542 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8543 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8544 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8545 dn->state_set(CDentry::STATE_BADREMOTEINO);
8546
8547 std::string path;
8548 CDir *dir = dn->get_dir();
8549 if (dir) {
31f18b77 8550 dir->get_inode()->make_path_string(path);
94b18763 8551 path += "/";
11fdf7f2 8552 path += dn->get_name();
7c673cae
FG
8553 }
8554
31f18b77 8555 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8556 if (fatal) {
31f18b77
FG
8557 mds->damaged();
8558 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8559 }
31f18b77
FG
8560 } else {
8561 r = 0;
8562 }
7c673cae
FG
8563 }
8564 fin->complete(r < 0 ? r : 0);
8565}
8566
8567
8568void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8569{
8570 // empty trace if we're a base inode
8571 if (in->is_base())
8572 return;
8573
8574 CInode *parent = in->get_parent_inode();
11fdf7f2 8575 ceph_assert(parent);
7c673cae
FG
8576 make_trace(trace, parent);
8577
8578 CDentry *dn = in->get_parent_dn();
8579 dout(15) << "make_trace adding " << *dn << dendl;
8580 trace.push_back(dn);
8581}
8582
8583
8584// -------------------------------------------------------------------------------
8585// Open inode by inode number
8586
8587class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8588 inodeno_t ino;
8589 public:
8590 bufferlist bl;
8591 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8592 MDCacheIOContext(c), ino(i) {}
8593 void finish(int r) override {
8594 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8595 }
91327a77
AA
8596 void print(ostream& out) const override {
8597 out << "openino_backtrace_fetch(" << ino << ")";
8598 }
7c673cae
FG
8599};
8600
8601struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8602 inodeno_t ino;
11fdf7f2 8603 MMDSOpenIno::const_ref msg;
7c673cae
FG
8604 bool parent;
8605 public:
11fdf7f2 8606 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const MMDSOpenIno::const_ref &m, bool p) :
7c673cae 8607 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8608 void finish(int r) override {
8609 if (r < 0 && !parent)
8610 r = -EAGAIN;
8611 if (msg) {
8612 mdcache->handle_open_ino(msg, r);
8613 return;
8614 }
11fdf7f2
TL
8615 auto& info = mdcache->opening_inodes.at(ino);
8616 mdcache->_open_ino_traverse_dir(ino, info, r);
7c673cae
FG
8617 }
8618};
8619
8620struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8621 inodeno_t ino;
8622 public:
8623 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8624 void finish(int r) override {
8625 mdcache->_open_ino_parent_opened(ino, r);
8626 }
8627};
8628
8629void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8630{
8631 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8632
11fdf7f2 8633 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8634
8635 CInode *in = get_inode(ino);
8636 if (in) {
8637 dout(10) << " found cached " << *in << dendl;
8638 open_ino_finish(ino, info, in->authority().first);
8639 return;
8640 }
8641
8642 inode_backtrace_t backtrace;
8643 if (err == 0) {
8644 try {
11fdf7f2 8645 decode(backtrace, bl);
7c673cae
FG
8646 } catch (const buffer::error &decode_exc) {
8647 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8648 << std::dec << ": " << decode_exc << dendl;
8649 open_ino_finish(ino, info, -EIO);
8650 return;
8651 }
8652 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8653 dout(10) << " old object in pool " << info.pool
8654 << ", retrying pool " << backtrace.pool << dendl;
8655 info.pool = backtrace.pool;
8656 C_IO_MDC_OpenInoBacktraceFetched *fin =
8657 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8658 fetch_backtrace(ino, info.pool, fin->bl,
8659 new C_OnFinisher(fin, mds->finisher));
8660 return;
8661 }
8662 } else if (err == -ENOENT) {
8663 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8664 if (info.pool != meta_pool) {
8665 dout(10) << " no object in pool " << info.pool
8666 << ", retrying pool " << meta_pool << dendl;
8667 info.pool = meta_pool;
8668 C_IO_MDC_OpenInoBacktraceFetched *fin =
8669 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8670 fetch_backtrace(ino, info.pool, fin->bl,
8671 new C_OnFinisher(fin, mds->finisher));
8672 return;
8673 }
8674 err = 0; // backtrace.ancestors.empty() is checked below
8675 }
8676
8677 if (err == 0) {
8678 if (backtrace.ancestors.empty()) {
8679 dout(10) << " got empty backtrace " << dendl;
92f5a8d4 8680 err = -ESTALE;
7c673cae
FG
8681 } else if (!info.ancestors.empty()) {
8682 if (info.ancestors[0] == backtrace.ancestors[0]) {
8683 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8684 err = -EINVAL;
8685 } else {
8686 info.last_err = 0;
8687 }
8688 }
8689 }
8690 if (err) {
8691 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8692 if (info.last_err)
8693 err = info.last_err;
8694 open_ino_finish(ino, info, err);
8695 return;
8696 }
8697
8698 dout(10) << " got backtrace " << backtrace << dendl;
8699 info.ancestors = backtrace.ancestors;
8700
8701 _open_ino_traverse_dir(ino, info, 0);
8702}
8703
8704void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8705{
8706 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8707
11fdf7f2 8708 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8709
8710 CInode *in = get_inode(ino);
8711 if (in) {
8712 dout(10) << " found cached " << *in << dendl;
8713 open_ino_finish(ino, info, in->authority().first);
8714 return;
8715 }
8716
8717 if (ret == mds->get_nodeid()) {
8718 _open_ino_traverse_dir(ino, info, 0);
8719 } else {
8720 if (ret >= 0) {
8721 mds_rank_t checked_rank = mds_rank_t(ret);
8722 info.check_peers = true;
8723 info.auth_hint = checked_rank;
8724 info.checked.erase(checked_rank);
8725 }
8726 do_open_ino(ino, info, ret);
8727 }
8728}
8729
8730void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8731{
8732 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8733
8734 CInode *in = get_inode(ino);
8735 if (in) {
8736 dout(10) << " found cached " << *in << dendl;
8737 open_ino_finish(ino, info, in->authority().first);
8738 return;
8739 }
8740
8741 if (ret) {
8742 do_open_ino(ino, info, ret);
8743 return;
8744 }
8745
8746 mds_rank_t hint = info.auth_hint;
8747 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8748 info.discover, info.want_xlocked, &hint);
8749 if (ret > 0)
8750 return;
8751 if (hint != mds->get_nodeid())
8752 info.auth_hint = hint;
8753 do_open_ino(ino, info, ret);
8754}
8755
11fdf7f2 8756void MDCache::_open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent)
7c673cae
FG
8757{
8758 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 8759 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
7c673cae 8760 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
11fdf7f2
TL
8761 if (mds->logger)
8762 mds->logger->inc(l_mds_openino_dir_fetch);
7c673cae
FG
8763}
8764
11fdf7f2 8765int MDCache::open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
8766 const vector<inode_backpointer_t>& ancestors,
7c673cae 8767 bool discover, bool want_xlocked, mds_rank_t *hint)
8768{
8769 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8770 int err = 0;
8771 for (unsigned i = 0; i < ancestors.size(); i++) {
11fdf7f2
TL
8772 const auto& ancestor = ancestors.at(i);
8773 CInode *diri = get_inode(ancestor.dirino);
7c673cae
FG
8774
8775 if (!diri) {
11fdf7f2
TL
8776 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8777 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
7c673cae
FG
8778 return 1;
8779 }
8780 continue;
8781 }
8782
8783 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8784 CDir *dir = diri->get_parent_dir();
8785 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8786 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8787 dir = dir->get_inode()->get_parent_dir();
8788 _open_ino_fetch_dir(ino, m, dir, i == 0);
8789 return 1;
8790 }
8791
8792 if (!diri->is_dir()) {
8793 dout(10) << " " << *diri << " is not dir" << dendl;
8794 if (i == 0)
8795 err = -ENOTDIR;
8796 break;
8797 }
8798
11fdf7f2 8799 const string& name = ancestor.dname;
7c673cae
FG
8800 frag_t fg = diri->pick_dirfrag(name);
8801 CDir *dir = diri->get_dirfrag(fg);
8802 if (!dir) {
8803 if (diri->is_auth()) {
8804 if (diri->is_frozen()) {
8805 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8806 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8807 return 1;
8808 }
8809 dir = diri->get_or_open_dirfrag(this, fg);
8810 } else if (discover) {
8811 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8812 return 1;
8813 }
8814 }
8815 if (dir) {
11fdf7f2 8816 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
7c673cae
FG
8817 CDentry *dn = dir->lookup(name);
8818 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8819 if (dir->is_auth()) {
8820 if (dnl && dnl->is_primary() &&
8821 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8822 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8823 _open_ino_fetch_dir(ino, m, dir, i == 0);
8824 return 1;
8825 }
8826
8827 if (!dnl && !dir->is_complete() &&
8828 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8829 dout(10) << " fetching incomplete " << *dir << dendl;
8830 _open_ino_fetch_dir(ino, m, dir, i == 0);
8831 return 1;
8832 }
8833
8834 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8835 if (i == 0)
8836 err = -ENOENT;
8837 } else if (discover) {
8838 if (!dnl) {
8839 filepath path(name, 0);
8840 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8841 (i == 0 && want_xlocked));
8842 return 1;
8843 }
8844 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8845 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8846 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8847 return 1;
8848 }
8849 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8850 if (i == 0)
8851 err = -ENOENT;
8852 }
8853 }
8854 if (hint && i == 0)
8855 *hint = dir ? dir->authority().first : diri->authority().first;
8856 break;
8857 }
8858 return err;
8859}
8860
8861void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8862{
8863 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8864
11fdf7f2 8865 MDSContext::vec waiters;
7c673cae
FG
8866 waiters.swap(info.waiters);
8867 opening_inodes.erase(ino);
8868 finish_contexts(g_ceph_context, waiters, ret);
8869}
8870
8871void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8872{
8873 if (err < 0 && err != -EAGAIN) {
8874 info.checked.clear();
7c673cae
FG
8875 info.checking = MDS_RANK_NONE;
8876 info.check_peers = true;
8877 info.fetch_backtrace = true;
8878 if (info.discover) {
8879 info.discover = false;
8880 info.ancestors.clear();
8881 }
8882 if (err != -ENOENT && err != -ENOTDIR)
8883 info.last_err = err;
8884 }
8885
d2e6a577
FG
8886 if (info.check_peers || info.discover) {
8887 if (info.discover) {
8888 // got backtrace from peer, but failed to find inode. re-check peers
8889 info.discover = false;
8890 info.ancestors.clear();
8891 info.checked.clear();
8892 }
7c673cae
FG
8893 info.check_peers = false;
8894 info.checking = MDS_RANK_NONE;
8895 do_open_ino_peer(ino, info);
8896 } else if (info.fetch_backtrace) {
8897 info.check_peers = true;
8898 info.fetch_backtrace = false;
8899 info.checking = mds->get_nodeid();
8900 info.checked.clear();
7c673cae
FG
8901 C_IO_MDC_OpenInoBacktraceFetched *fin =
8902 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8903 fetch_backtrace(ino, info.pool, fin->bl,
8904 new C_OnFinisher(fin, mds->finisher));
8905 } else {
11fdf7f2 8906 ceph_assert(!info.ancestors.empty());
7c673cae
FG
8907 info.checking = mds->get_nodeid();
8908 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8909 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8910 }
8911}
8912
8913void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8914{
8915 set<mds_rank_t> all, active;
8916 mds->mdsmap->get_mds_set(all);
7c673cae 8917 if (mds->get_state() == MDSMap::STATE_REJOIN)
1adf2230
AA
8918 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
8919 else
8920 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
8921
8922 dout(10) << "do_open_ino_peer " << ino << " active " << active
8923 << " all " << all << " checked " << info.checked << dendl;
8924
11fdf7f2 8925 mds_rank_t whoami = mds->get_nodeid();
7c673cae 8926 mds_rank_t peer = MDS_RANK_NONE;
11fdf7f2 8927 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
7c673cae
FG
8928 if (active.count(info.auth_hint)) {
8929 peer = info.auth_hint;
8930 info.auth_hint = MDS_RANK_NONE;
8931 }
8932 } else {
8933 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
11fdf7f2 8934 if (*p != whoami && info.checked.count(*p) == 0) {
7c673cae
FG
8935 peer = *p;
8936 break;
8937 }
8938 }
8939 if (peer < 0) {
11fdf7f2 8940 all.erase(whoami);
d2e6a577 8941 if (all != info.checked) {
7c673cae
FG
8942 dout(10) << " waiting for more peers to be active" << dendl;
8943 } else {
8944 dout(10) << " all MDS peers have been checked " << dendl;
8945 do_open_ino(ino, info, 0);
8946 }
8947 } else {
8948 info.checking = peer;
8949 vector<inode_backpointer_t> *pa = NULL;
8950 // got backtrace from peer or backtrace just fetched
8951 if (info.discover || !info.fetch_backtrace)
8952 pa = &info.ancestors;
11fdf7f2
TL
8953 mds->send_message_mds(MMDSOpenIno::create(info.tid, ino, pa), peer);
8954 if (mds->logger)
8955 mds->logger->inc(l_mds_openino_peer_discover);
7c673cae
FG
8956 }
8957}
8958
11fdf7f2 8959void MDCache::handle_open_ino(const MMDSOpenIno::const_ref &m, int err)
7c673cae
FG
8960{
8961 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8962 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
8963 return;
8964 }
8965
8966 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8967
11fdf7f2 8968 auto from = mds_rank_t(m->get_source().num());
7c673cae 8969 inodeno_t ino = m->ino;
11fdf7f2 8970 MMDSOpenInoReply::ref reply;
7c673cae
FG
8971 CInode *in = get_inode(ino);
8972 if (in) {
8973 dout(10) << " have " << *in << dendl;
11fdf7f2 8974 reply = MMDSOpenInoReply::create(m->get_tid(), ino, mds_rank_t(0));
7c673cae
FG
8975 if (in->is_auth()) {
8976 touch_inode(in);
8977 while (1) {
8978 CDentry *pdn = in->get_parent_dn();
8979 if (!pdn)
8980 break;
8981 CInode *diri = pdn->get_dir()->get_inode();
94b18763 8982 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
7c673cae 8983 in->inode.version));
8984 in = diri;
8985 }
8986 } else {
8987 reply->hint = in->authority().first;
8988 }
8989 } else if (err < 0) {
11fdf7f2 8990 reply = MMDSOpenInoReply::create(m->get_tid(), ino, MDS_RANK_NONE, err);
7c673cae
FG
8991 } else {
8992 mds_rank_t hint = MDS_RANK_NONE;
8993 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8994 if (ret > 0)
8995 return;
11fdf7f2 8996 reply = MMDSOpenInoReply::create(m->get_tid(), ino, hint, ret);
7c673cae 8997 }
11fdf7f2 8998 mds->send_message_mds(reply, from);
7c673cae
FG
8999}
9000
11fdf7f2 9001void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m)
7c673cae
FG
9002{
9003 dout(10) << "handle_open_ino_reply " << *m << dendl;
9004
9005 inodeno_t ino = m->ino;
9006 mds_rank_t from = mds_rank_t(m->get_source().num());
9007 auto it = opening_inodes.find(ino);
9008 if (it != opening_inodes.end() && it->second.checking == from) {
9009 open_ino_info_t& info = it->second;
9010 info.checking = MDS_RANK_NONE;
9011 info.checked.insert(from);
9012
9013 CInode *in = get_inode(ino);
9014 if (in) {
9015 dout(10) << " found cached " << *in << dendl;
9016 open_ino_finish(ino, info, in->authority().first);
9017 } else if (!m->ancestors.empty()) {
9018 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9019 if (!info.want_replica) {
9020 open_ino_finish(ino, info, from);
7c673cae
FG
9021 return;
9022 }
9023
9024 info.ancestors = m->ancestors;
9025 info.auth_hint = from;
9026 info.checking = mds->get_nodeid();
9027 info.discover = true;
9028 _open_ino_traverse_dir(ino, info, 0);
9029 } else if (m->error) {
9030 dout(10) << " error " << m->error << " from mds." << from << dendl;
9031 do_open_ino(ino, info, m->error);
9032 } else {
9033 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9034 info.auth_hint = m->hint;
9035 info.checked.erase(m->hint);
9036 }
9037 do_open_ino_peer(ino, info);
9038 }
9039 }
7c673cae
FG
9040}
9041
9042void MDCache::kick_open_ino_peers(mds_rank_t who)
9043{
9044 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9045
9046 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9047 p != opening_inodes.end();
9048 ++p) {
9049 open_ino_info_t& info = p->second;
9050 if (info.checking == who) {
9051 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9052 info.checking = MDS_RANK_NONE;
9053 do_open_ino_peer(p->first, info);
9054 } else if (info.checking == MDS_RANK_NONE) {
9055 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9056 do_open_ino_peer(p->first, info);
9057 }
9058 }
9059}
9060
11fdf7f2 9061void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
7c673cae 9062 bool want_replica, bool want_xlocked)
9063{
9064 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
9065 << want_replica << dendl;
9066
11fdf7f2
TL
9067 auto it = opening_inodes.find(ino);
9068 if (it != opening_inodes.end()) {
9069 open_ino_info_t& info = it->second;
7c673cae
FG
9070 if (want_replica) {
9071 info.want_replica = true;
9072 if (want_xlocked && !info.want_xlocked) {
9073 if (!info.ancestors.empty()) {
9074 CInode *diri = get_inode(info.ancestors[0].dirino);
9075 if (diri) {
9076 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
9077 CDir *dir = diri->get_dirfrag(fg);
9078 if (dir && !dir->is_auth()) {
9079 filepath path(info.ancestors[0].dname, 0);
9080 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
9081 }
9082 }
9083 }
9084 info.want_xlocked = true;
9085 }
9086 }
9087 info.waiters.push_back(fin);
9088 } else {
9089 open_ino_info_t& info = opening_inodes[ino];
7c673cae
FG
9090 info.want_replica = want_replica;
9091 info.want_xlocked = want_xlocked;
9092 info.tid = ++open_ino_last_tid;
9093 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9094 info.waiters.push_back(fin);
11fdf7f2
TL
9095 if (mds->is_rejoin() &&
9096 open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
9097 info.fetch_backtrace = false;
9098 info.checking = mds->get_nodeid();
9099 _open_ino_traverse_dir(ino, info, 0);
9100 } else {
9101 do_open_ino(ino, info, 0);
9102 }
7c673cae
FG
9103 }
9104}
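// Sketch (assumption): open_ino() is asynchronous -- the completion context
// is eventually finished with the auth rank (>= 0) on success or a negative
// error such as -ESTALE, mirroring open_ino_finish() above.
//
//   open_ino(ino, mds->mdsmap->get_metadata_pool(),
//            new C_MDS_RetryRequest(this, mdr),
//            false /* want_replica */);
//   return;   // wait for the backtrace fetch / peer search to complete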
9105
9106/* ---------------------------- */
9107
9108/*
9109 * search for a given inode on MDS peers. optionally start with the given node.
9110
9111
9112 TODO
9113 - recover from mds node failure, recovery
9114 - traverse path
9115
9116 */
11fdf7f2 9117void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint)
7c673cae
FG
9118{
9119 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
b32b8144
FG
9120 CInode *in = get_inode(ino);
9121 if (in && in->state_test(CInode::STATE_PURGING)) {
9122 c->complete(-ESTALE);
9123 return;
9124 }
11fdf7f2 9125 ceph_assert(!in);
7c673cae
FG
9126
9127 ceph_tid_t tid = ++find_ino_peer_last_tid;
9128 find_ino_peer_info_t& fip = find_ino_peer[tid];
9129 fip.ino = ino;
9130 fip.tid = tid;
9131 fip.fin = c;
9132 fip.hint = hint;
7c673cae
FG
9133 _do_find_ino_peer(fip);
9134}
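// Sketch (assumption): a caller that holds only an inode number and has no
// cached CInode can query the other ranks and retry when the search ends;
// the context is completed with -ESTALE if no peer knows the ino.
//
//   if (!get_inode(ino)) {
//     find_ino_peers(ino, new C_MDS_RetryRequest(this, mdr));
//     return;
//   }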
9135
9136void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9137{
9138 set<mds_rank_t> all, active;
9139 mds->mdsmap->get_mds_set(all);
1adf2230 9140 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9141
9142 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9143 << " active " << active << " all " << all
9144 << " checked " << fip.checked
9145 << dendl;
9146
9147 mds_rank_t m = MDS_RANK_NONE;
9148 if (fip.hint >= 0) {
9149 m = fip.hint;
9150 fip.hint = MDS_RANK_NONE;
9151 } else {
9152 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9153 if (*p != mds->get_nodeid() &&
9154 fip.checked.count(*p) == 0) {
9155 m = *p;
9156 break;
9157 }
9158 }
9159 if (m == MDS_RANK_NONE) {
d2e6a577
FG
9160 all.erase(mds->get_nodeid());
9161 if (all != fip.checked) {
7c673cae
FG
9162 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9163 } else {
9164 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9165 fip.fin->complete(-ESTALE);
9166 find_ino_peer.erase(fip.tid);
9167 }
9168 } else {
9169 fip.checking = m;
11fdf7f2 9170 mds->send_message_mds(MMDSFindIno::create(fip.tid, fip.ino), m);
7c673cae
FG
9171 }
9172}
9173
11fdf7f2 9174void MDCache::handle_find_ino(const MMDSFindIno::const_ref &m)
7c673cae
FG
9175{
9176 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
9177 return;
9178 }
9179
9180 dout(10) << "handle_find_ino " << *m << dendl;
11fdf7f2 9181 auto r = MMDSFindInoReply::create(m->tid);
7c673cae
FG
9182 CInode *in = get_inode(m->ino);
9183 if (in) {
9184 in->make_path(r->path);
9185 dout(10) << " have " << r->path << " " << *in << dendl;
9186 }
11fdf7f2 9187 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
7c673cae
FG
9188}
9189
9190
11fdf7f2 9191void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref &m)
7c673cae
FG
9192{
9193 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
9194 if (p != find_ino_peer.end()) {
9195 dout(10) << "handle_find_ino_reply " << *m << dendl;
9196 find_ino_peer_info_t& fip = p->second;
9197
9198 // success?
9199 if (get_inode(fip.ino)) {
9200 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9201 mds->queue_waiter(fip.fin);
9202 find_ino_peer.erase(p);
7c673cae
FG
9203 return;
9204 }
9205
9206 mds_rank_t from = mds_rank_t(m->get_source().num());
9207 if (fip.checking == from)
9208 fip.checking = MDS_RANK_NONE;
9209 fip.checked.insert(from);
9210
9211 if (!m->path.empty()) {
9212 // we got a path!
9213 vector<CDentry*> trace;
11fdf7f2 9214 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 9215 MDRequestRef null_ref;
11fdf7f2 9216 int r = path_traverse(null_ref, cf, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
7c673cae
FG
9217 if (r > 0)
9218 return;
9219 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9220 << ", retrying" << dendl;
9221 fip.checked.clear();
9222 _do_find_ino_peer(fip);
9223 } else {
9224 // nope, continue.
9225 _do_find_ino_peer(fip);
9226 }
9227 } else {
9228 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9229 }
7c673cae
FG
9230}
9231
9232void MDCache::kick_find_ino_peers(mds_rank_t who)
9233{
9234 // find_ino_peers requests we should move on from
9235 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9236 p != find_ino_peer.end();
9237 ++p) {
9238 find_ino_peer_info_t& fip = p->second;
9239 if (fip.checking == who) {
9240 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9241 fip.checking = MDS_RANK_NONE;
9242 _do_find_ino_peer(fip);
9243 } else if (fip.checking == MDS_RANK_NONE) {
9244 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9245 _do_find_ino_peer(fip);
9246 }
9247 }
9248}
9249
9250/* ---------------------------- */
9251
9252int MDCache::get_num_client_requests()
9253{
9254 int count = 0;
9255 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9256 p != active_requests.end();
9257 ++p) {
9258 MDRequestRef& mdr = p->second;
9259 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9260 count++;
9261 }
9262 return count;
9263}
9264
11fdf7f2 9265MDRequestRef MDCache::request_start(const MClientRequest::const_ref& req)
7c673cae
FG
9266{
9267 // did we win a forward race against a slave?
9268 if (active_requests.count(req->get_reqid())) {
9269 MDRequestRef& mdr = active_requests[req->get_reqid()];
11fdf7f2 9270 ceph_assert(mdr);
7c673cae
FG
9271 if (mdr->is_slave()) {
9272 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9273 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9274 } else {
9275 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
7c673cae
FG
9276 }
9277 return MDRequestRef();
9278 }
9279
9280 // register new client request
9281 MDRequestImpl::Params params;
9282 params.reqid = req->get_reqid();
9283 params.attempt = req->get_num_fwd();
9284 params.client_req = req;
9285 params.initiated = req->get_recv_stamp();
9286 params.throttled = req->get_throttle_stamp();
9287 params.all_read = req->get_recv_complete_stamp();
9288 params.dispatched = req->get_dispatch_stamp();
9289
9290 MDRequestRef mdr =
11fdf7f2 9291 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae
FG
9292 active_requests[params.reqid] = mdr;
9293 mdr->set_op_stamp(req->get_stamp());
9294 dout(7) << "request_start " << *mdr << dendl;
9295 return mdr;
9296}
9297
11fdf7f2 9298MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const Message::const_ref &m)
7c673cae
FG
9299{
9300 int by = m->get_source().num();
9301 MDRequestImpl::Params params;
9302 params.reqid = ri;
9303 params.attempt = attempt;
9304 params.triggering_slave_req = m;
9305 params.slave_to = by;
9306 params.initiated = m->get_recv_stamp();
9307 params.throttled = m->get_throttle_stamp();
9308 params.all_read = m->get_recv_complete_stamp();
9309 params.dispatched = m->get_dispatch_stamp();
9310 MDRequestRef mdr =
11fdf7f2
TL
9311 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9312 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9313 active_requests[mdr->reqid] = mdr;
9314 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9315 return mdr;
9316}
9317
9318MDRequestRef MDCache::request_start_internal(int op)
9319{
91327a77 9320 utime_t now = ceph_clock_now();
7c673cae
FG
9321 MDRequestImpl::Params params;
9322 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9323 params.reqid.tid = mds->issue_tid();
91327a77
AA
9324 params.initiated = now;
9325 params.throttled = now;
9326 params.all_read = now;
9327 params.dispatched = now;
7c673cae
FG
9328 params.internal_op = op;
9329 MDRequestRef mdr =
11fdf7f2 9330 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae 9331
11fdf7f2 9332 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9333 active_requests[mdr->reqid] = mdr;
9334 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9335 return mdr;
9336}
9337
9338MDRequestRef MDCache::request_get(metareqid_t rid)
9339{
9340 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
11fdf7f2 9341 ceph_assert(p != active_requests.end());
7c673cae
FG
9342 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9343 return p->second;
9344}
9345
9346void MDCache::request_finish(MDRequestRef& mdr)
9347{
9348 dout(7) << "request_finish " << *mdr << dendl;
9349 mdr->mark_event("finishing request");
9350
9351 // slave finisher?
9352 if (mdr->has_more() && mdr->more()->slave_commit) {
9353 Context *fin = mdr->more()->slave_commit;
9354 mdr->more()->slave_commit = 0;
9355 int ret;
9356 if (mdr->aborted) {
9357 mdr->aborted = false;
9358 ret = -1;
9359 mdr->more()->slave_rolling_back = true;
9360 } else {
9361 ret = 0;
9362 mdr->committing = true;
9363 }
9364 fin->complete(ret); // this must re-call request_finish.
9365 return;
9366 }
9367
d2e6a577
FG
9368 switch(mdr->internal_op) {
9369 case CEPH_MDS_OP_FRAGMENTDIR:
9370 logger->inc(l_mdss_ireq_fragmentdir);
9371 break;
9372 case CEPH_MDS_OP_EXPORTDIR:
9373 logger->inc(l_mdss_ireq_exportdir);
9374 break;
9375 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9376 logger->inc(l_mdss_ireq_enqueue_scrub);
9377 break;
9378 case CEPH_MDS_OP_FLUSH:
9379 logger->inc(l_mdss_ireq_flush);
9380 break;
9381 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9382 logger->inc(l_mdss_ireq_fragstats);
9383 break;
9384 case CEPH_MDS_OP_REPAIR_INODESTATS:
9385 logger->inc(l_mdss_ireq_inodestats);
9386 break;
9387 }
9388
7c673cae
FG
9389 request_cleanup(mdr);
9390}
9391
9392
9393void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9394{
9395 mdr->mark_event("forwarding request");
9396 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9397 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9398 << *mdr->client_request << dendl;
91327a77 9399 mds->forward_message_mds(mdr->release_client_request(), who);
7c673cae
FG
9400 if (mds->logger) mds->logger->inc(l_mds_forward);
9401 } else if (mdr->internal_op >= 0) {
9402 dout(10) << "request_forward on internal op; cancelling" << dendl;
9403 mdr->internal_op_finish->complete(-EXDEV);
9404 } else {
9405 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9406 << " was from mds" << dendl;
9407 }
9408 request_cleanup(mdr);
9409}
9410
9411
9412void MDCache::dispatch_request(MDRequestRef& mdr)
9413{
9414 if (mdr->client_request) {
9415 mds->server->dispatch_client_request(mdr);
9416 } else if (mdr->slave_request) {
9417 mds->server->dispatch_slave_request(mdr);
9418 } else {
9419 switch (mdr->internal_op) {
9420 case CEPH_MDS_OP_FRAGMENTDIR:
9421 dispatch_fragment_dir(mdr);
9422 break;
9423 case CEPH_MDS_OP_EXPORTDIR:
9424 migrator->dispatch_export_dir(mdr, 0);
9425 break;
9426 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9427 enqueue_scrub_work(mdr);
9428 break;
9429 case CEPH_MDS_OP_FLUSH:
9430 flush_dentry_work(mdr);
9431 break;
9432 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9433 repair_dirfrag_stats_work(mdr);
9434 break;
9435 case CEPH_MDS_OP_REPAIR_INODESTATS:
9436 repair_inode_stats_work(mdr);
9437 break;
11fdf7f2
TL
9438 case CEPH_MDS_OP_UPGRADE_SNAPREALM:
9439 upgrade_inode_snaprealm_work(mdr);
9440 break;
7c673cae
FG
9441 default:
9442 ceph_abort();
9443 }
9444 }
9445}
9446
9447
9448void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9449{
9450 if (!mdr->has_more())
9451 return;
9452
9453 // clean up slaves
9454 // (will implicitly drop remote dn pins)
9455 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9456 p != mdr->more()->slaves.end();
9457 ++p) {
11fdf7f2 9458 auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt,
7c673cae 9459 MMDSSlaveRequest::OP_FINISH);
9460
9461 if (mdr->killed && !mdr->committing) {
9462 r->mark_abort();
9463 } else if (mdr->more()->srcdn_auth_mds == *p &&
9464 mdr->more()->inode_import.length() > 0) {
9465 // information about rename imported caps
9466 r->inode_export.claim(mdr->more()->inode_import);
9467 }
9468
9469 mds->send_message_mds(r, *p);
9470 }
9471
9472 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9473 * implicitly. Note that we don't call the finishers -- there shouldn't
9474 * be any on a remote lock and the request finish wakes up all
9475 * the waiters anyway! */
7c673cae 9476
11fdf7f2
TL
9477 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9478 SimpleLock *lock = it->lock;
9479 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9480 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9481 << " on " << lock->get_parent() << dendl;
9482 lock->put_xlock();
9483 mdr->locks.erase(it++);
9484 } else if (it->is_remote_wrlock()) {
9485 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9486 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9487 if (it->is_wrlock()) {
9488 it->clear_remote_wrlock();
9489 ++it;
9490 } else {
9491 mdr->locks.erase(it++);
9492 }
9493 } else {
9494 ++it;
9495 }
7c673cae
FG
9496 }
9497
9498 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9499 * leaving them in can cause double-notifies as
9500 * this function can get called more than once */
9501}
9502
9503void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9504{
9505 request_drop_foreign_locks(mdr);
9506 mds->locker->drop_non_rdlocks(mdr.get());
9507}
9508
9509void MDCache::request_drop_locks(MDRequestRef& mdr)
9510{
9511 request_drop_foreign_locks(mdr);
9512 mds->locker->drop_locks(mdr.get());
9513}
9514
9515void MDCache::request_cleanup(MDRequestRef& mdr)
9516{
9517 dout(15) << "request_cleanup " << *mdr << dendl;
9518
9519 if (mdr->has_more()) {
9520 if (mdr->more()->is_ambiguous_auth)
9521 mdr->clear_ambiguous_auth();
9522 if (!mdr->more()->waiting_for_finish.empty())
9523 mds->queue_waiters(mdr->more()->waiting_for_finish);
9524 }
9525
9526 request_drop_locks(mdr);
9527
9528 // drop (local) auth pins
9529 mdr->drop_local_auth_pins();
9530
9531 // drop stickydirs
11fdf7f2 9532 mdr->put_stickydirs();
7c673cae
FG
9533
9534 mds->locker->kick_cap_releases(mdr);
9535
9536 // drop cache pins
9537 mdr->drop_pins();
9538
9539 // remove from session
9540 mdr->item_session_request.remove_myself();
9541
9542 // remove from map
9543 active_requests.erase(mdr->reqid);
9544
9545 if (mds->logger)
9546 log_stat();
9547
9548 mdr->mark_event("cleaned up request");
9549}
9550
9551void MDCache::request_kill(MDRequestRef& mdr)
9552{
9553 // rolling back slave requests is tricky. just let the request proceed.
94b18763 9554 if (mdr->has_more() &&
7c673cae 9555 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
94b18763 9556 if (!mdr->done_locking) {
11fdf7f2 9557 ceph_assert(mdr->more()->witnessed.empty());
94b18763
FG
9558 mdr->aborted = true;
9559 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9560 } else {
9561 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9562 }
7c673cae 9563
11fdf7f2
TL
9564 ceph_assert(mdr->used_prealloc_ino == 0);
9565 ceph_assert(mdr->prealloc_inos.empty());
7c673cae
FG
9566
9567 mdr->session = NULL;
9568 mdr->item_session_request.remove_myself();
9569 return;
9570 }
9571
9572 mdr->killed = true;
9573 mdr->mark_event("killing request");
9574
9575 if (mdr->committing) {
9576 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9577 } else {
9578 dout(10) << "request_kill " << *mdr << dendl;
9579 request_cleanup(mdr);
9580 }
9581}
9582
9583// -------------------------------------------------------------------------------
9584// SNAPREALMS
9585
11fdf7f2 9586void MDCache::create_global_snaprealm()
7c673cae 9587{
11fdf7f2
TL
9588 CInode *in = new CInode(this); // dummy inode
9589 create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9590 add_inode(in);
9591 global_snaprealm = in->snaprealm;
7c673cae
FG
9592}
9593
11fdf7f2 9594void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
7c673cae
FG
9595{
9596 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9597
9598 vector<inodeno_t> split_inos;
9599 vector<inodeno_t> split_realms;
9600
11fdf7f2
TL
9601 if (notify_clients) {
9602 ceph_assert(in->snaprealm->have_past_parents_open());
9603 if (snapop == CEPH_SNAP_OP_SPLIT) {
9604 // notify clients of update|split
9605 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9606 !p.end(); ++p)
9607 split_inos.push_back((*p)->ino());
7c673cae 9608
11fdf7f2
TL
9609 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9610 p != in->snaprealm->open_children.end();
9611 ++p)
9612 split_realms.push_back((*p)->inode->ino());
9613 }
9614 }
7c673cae
FG
9615
9616 set<SnapRealm*> past_children;
11fdf7f2 9617 map<client_t, MClientSnap::ref> updates;
7c673cae
FG
9618 list<SnapRealm*> q;
9619 q.push_back(in->snaprealm);
9620 while (!q.empty()) {
9621 SnapRealm *realm = q.front();
9622 q.pop_front();
9623
9624 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9625 realm->invalidate_cached_snaps();
9626
11fdf7f2
TL
9627 if (notify_clients) {
9628 for (const auto& p : realm->client_caps) {
9629 const auto& client = p.first;
9630 const auto& caps = p.second;
9631 ceph_assert(!caps->empty());
9632
9633 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9634 if (em.second) {
9635 auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
9636 update->head.split = in->ino();
9637 update->split_inos = split_inos;
9638 update->split_realms = split_realms;
9639 update->bl = in->snaprealm->get_snap_trace();
9640 em.first->second = std::move(update);
9641 }
7c673cae
FG
9642 }
9643 }
9644
9645 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9646 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9647 p != realm->open_past_children.end();
9648 ++p)
9649 past_children.insert(*p);
9650 }
9651
9652 // notify for active children, too.
9653 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9654 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9655 p != realm->open_children.end();
9656 ++p)
9657 q.push_back(*p);
9658 }
9659
11fdf7f2 9660 if (notify_clients)
7c673cae
FG
9661 send_snaps(updates);
9662
9663 // notify past children and their descendants if we update/delete old snapshots
9664 for (set<SnapRealm*>::iterator p = past_children.begin();
9665 p != past_children.end();
9666 ++p)
9667 q.push_back(*p);
9668
9669 while (!q.empty()) {
9670 SnapRealm *realm = q.front();
9671 q.pop_front();
9672
9673 realm->invalidate_cached_snaps();
9674
9675 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9676 p != realm->open_children.end();
9677 ++p) {
9678 if (past_children.count(*p) == 0)
9679 q.push_back(*p);
9680 }
9681
9682 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9683 p != realm->open_past_children.end();
9684 ++p) {
9685 if (past_children.count(*p) == 0) {
9686 q.push_back(*p);
9687 past_children.insert(*p);
9688 }
9689 }
9690 }
9691
9692 if (snapop == CEPH_SNAP_OP_DESTROY) {
9693 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9694 for (set<SnapRealm*>::iterator p = past_children.begin();
9695 p != past_children.end();
9696 ++p)
9697 maybe_eval_stray((*p)->inode, true);
9698 }
9699}
9700
11fdf7f2 9701void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
7c673cae 9702{
11fdf7f2
TL
9703 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9704 ceph_assert(in->is_auth());
7c673cae 9705
11fdf7f2
TL
9706 set<mds_rank_t> mds_set;
9707 if (stid > 0) {
9708 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9709 mds_set.erase(mds->get_nodeid());
9710 } else {
9711 in->list_replicas(mds_set);
9712 }
7c673cae 9713
11fdf7f2
TL
9714 if (!mds_set.empty()) {
9715 bufferlist snap_blob;
9716 in->encode_snap(snap_blob);
7c673cae 9717
11fdf7f2
TL
9718 for (auto p : mds_set) {
9719 auto m = MMDSSnapUpdate::create(in->ino(), stid, snap_op);
9720 m->snap_blob = snap_blob;
9721 mds->send_message_mds(m, p);
9722 }
9723 }
7c673cae 9724
11fdf7f2
TL
9725 if (stid > 0)
9726 notify_global_snaprealm_update(snap_op);
9727}
7c673cae 9728
11fdf7f2
TL
9729void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref &m)
9730{
9731 mds_rank_t from = mds_rank_t(m->get_source().num());
9732 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 9733
11fdf7f2
TL
9734 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9735 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9736 return;
9737 }
7c673cae 9738
11fdf7f2
TL
9739 // null rejoin_done means open_snaprealms() has already been called
9740 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9741 (mds->is_rejoin() && !rejoin_done);
9742
9743 if (m->get_tid() > 0) {
9744 mds->snapclient->notify_commit(m->get_tid());
9745 if (notify_clients)
9746 notify_global_snaprealm_update(m->get_snap_op());
9747 }
9748
9749 CInode *in = get_inode(m->get_ino());
9750 if (in) {
9751 ceph_assert(!in->is_auth());
9752 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9753 (mds->is_rejoin() && !in->is_rejoining())) {
9754 auto p = m->snap_blob.cbegin();
9755 in->decode_snap(p);
9756
9757 if (!notify_clients) {
9758 if (!rejoin_pending_snaprealms.count(in)) {
9759 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9760 rejoin_pending_snaprealms.insert(in);
9761 }
9762 }
9763 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9764 }
9765 }
7c673cae
FG
9766}
9767
11fdf7f2
TL
9768void MDCache::notify_global_snaprealm_update(int snap_op)
9769{
9770 if (snap_op != CEPH_SNAP_OP_DESTROY)
9771 snap_op = CEPH_SNAP_OP_UPDATE;
9772 set<Session*> sessions;
9773 mds->sessionmap.get_client_session_set(sessions);
9774 for (auto &session : sessions) {
9775 if (!session->is_open() && !session->is_stale())
9776 continue;
9777 auto update = MClientSnap::create(snap_op);
9778 update->head.split = global_snaprealm->inode->ino();
9779 update->bl = global_snaprealm->get_snap_trace();
9780 mds->send_message_client_counted(update, session);
9781 }
9782}
7c673cae
FG
9783
9784// -------------------------------------------------------------------------------
9785// STRAYS
9786
9787struct C_MDC_RetryScanStray : public MDCacheContext {
9788 dirfrag_t next;
9789 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9790 void finish(int r) override {
9791 mdcache->scan_stray_dir(next);
9792 }
9793};
9794
9795void MDCache::scan_stray_dir(dirfrag_t next)
9796{
9797 dout(10) << "scan_stray_dir " << next << dendl;
9798
9799 list<CDir*> ls;
9800 for (int i = 0; i < NUM_STRAY; ++i) {
9801 if (strays[i]->ino() < next.ino)
9802 continue;
9803 strays[i]->get_dirfrags(ls);
9804 }
9805
9806 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9807 CDir *dir = *p;
9808 if (dir->dirfrag() < next)
9809 continue;
9810 if (!dir->is_complete()) {
9811 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9812 return;
9813 }
94b18763
FG
9814 for (auto &p : dir->items) {
9815 CDentry *dn = p.second;
7c673cae
FG
9816 dn->state_set(CDentry::STATE_STRAY);
9817 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9818 if (dnl->is_primary()) {
9819 CInode *in = dnl->get_inode();
9820 if (in->inode.nlink == 0)
9821 in->state_set(CInode::STATE_ORPHAN);
9822 maybe_eval_stray(in);
9823 }
9824 }
9825 }
9826}
9827
7c673cae
FG
9828void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9829{
9830 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9831 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
11fdf7f2
TL
9832 if (mds->logger)
9833 mds->logger->inc(l_mds_openino_backtrace_fetch);
7c673cae
FG
9834}
9835
9836
9837
9838
9839
9840// ========================================================================================
9841// DISCOVER
9842/*
9843
9844 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9845 to the parent metadata object in the cache (pinning it).
9846
9847 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9848
9849*/
9850
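// A minimal sketch of the tid-based dup-reply guard described above, using
// hypothetical names (pending, on_reply); the real bookkeeping is the
// 'discovers' map, which handle_discover_reply() erases by tid:
//
//   std::map<ceph_tid_t, discover_info_t> pending;   // keyed by discover tid
//   void on_reply(ceph_tid_t tid) {
//     auto it = pending.find(tid);
//     if (it == pending.end())
//       return;            // dup or stale reply: tid already erased, ignore it
//     pending.erase(it);   // first reply wins; later duplicates hit the branch above
//   }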
9851void MDCache::_send_discover(discover_info_t& d)
9852{
11fdf7f2 9853 auto dis = MDiscover::create(d.ino, d.frag, d.snap, d.want_path, d.want_base_dir, d.want_xlocked);
7c673cae
FG
9854 dis->set_tid(d.tid);
9855 mds->send_message_mds(dis, d.mds);
9856}
9857
9858void MDCache::discover_base_ino(inodeno_t want_ino,
11fdf7f2 9859 MDSContext *onfinish,
7c673cae
FG
9860 mds_rank_t from)
9861{
9862 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9863 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9864 discover_info_t& d = _create_discover(from);
9865 d.ino = want_ino;
9866 _send_discover(d);
9867 }
9868 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9869}
9870
9871
9872void MDCache::discover_dir_frag(CInode *base,
9873 frag_t approx_fg,
11fdf7f2 9874 MDSContext *onfinish,
7c673cae
FG
9875 mds_rank_t from)
9876{
9877 if (from < 0)
9878 from = base->authority().first;
9879
9880 dirfrag_t df(base->ino(), approx_fg);
9881 dout(7) << "discover_dir_frag " << df
9882 << " from mds." << from << dendl;
9883
9884 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9885 discover_info_t& d = _create_discover(from);
9886 d.pin_base(base);
9887 d.ino = base->ino();
9888 d.frag = approx_fg;
9889 d.want_base_dir = true;
9890 _send_discover(d);
9891 }
9892
9893 if (onfinish)
9894 base->add_dir_waiter(approx_fg, onfinish);
9895}
9896
9897struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9898 CInode *base;
9899 snapid_t snapid;
9900 filepath path;
9901 mds_rank_t from;
9902 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9903 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9904 void finish(int r) override {
9905 mdcache->discover_path(base, snapid, path, 0, from);
9906 }
9907};
9908
9909void MDCache::discover_path(CInode *base,
9910 snapid_t snap,
9911 filepath want_path,
11fdf7f2 9912 MDSContext *onfinish,
7c673cae
FG
9913 bool want_xlocked,
9914 mds_rank_t from)
9915{
9916 if (from < 0)
9917 from = base->authority().first;
9918
9919 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9920 << (want_xlocked ? " want_xlocked":"")
9921 << dendl;
9922
9923 if (base->is_ambiguous_auth()) {
9924 dout(10) << " waiting for single auth on " << *base << dendl;
9925 if (!onfinish)
9926 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9927 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9928 return;
9929 } else if (from == mds->get_nodeid()) {
11fdf7f2 9930 MDSContext::vec finished;
7c673cae
FG
9931 base->take_waiting(CInode::WAIT_DIR, finished);
9932 mds->queue_waiters(finished);
9933 return;
9934 }
9935
9936 frag_t fg = base->pick_dirfrag(want_path[0]);
9937 if ((want_xlocked && want_path.depth() == 1) ||
9938 !base->is_waiting_for_dir(fg) || !onfinish) {
9939 discover_info_t& d = _create_discover(from);
9940 d.ino = base->ino();
9941 d.pin_base(base);
9942 d.frag = fg;
9943 d.snap = snap;
9944 d.want_path = want_path;
9945 d.want_base_dir = true;
9946 d.want_xlocked = want_xlocked;
9947 _send_discover(d);
9948 }
9949
9950 // register + wait
9951 if (onfinish)
9952 base->add_dir_waiter(fg, onfinish);
9953}
9954
9955struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9956 CDir *base;
9957 snapid_t snapid;
9958 filepath path;
9959 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9960 MDCacheContext(c), base(b), snapid(s), path(p) {}
9961 void finish(int r) override {
9962 mdcache->discover_path(base, snapid, path, 0);
9963 }
9964};
9965
9966void MDCache::discover_path(CDir *base,
9967 snapid_t snap,
9968 filepath want_path,
11fdf7f2 9969 MDSContext *onfinish,
7c673cae
FG
9970 bool want_xlocked)
9971{
9972 mds_rank_t from = base->authority().first;
9973
9974 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9975 << (want_xlocked ? " want_xlocked":"")
9976 << dendl;
9977
9978 if (base->is_ambiguous_auth()) {
9979 dout(7) << " waiting for single auth on " << *base << dendl;
9980 if (!onfinish)
9981 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9982 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9983 return;
9984 } else if (from == mds->get_nodeid()) {
11fdf7f2 9985 MDSContext::vec finished;
7c673cae
FG
9986 base->take_sub_waiting(finished);
9987 mds->queue_waiters(finished);
9988 return;
9989 }
9990
9991 if ((want_xlocked && want_path.depth() == 1) ||
9992 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9993 discover_info_t& d = _create_discover(from);
9994 d.ino = base->ino();
31f18b77 9995 d.pin_base(base->inode);
7c673cae
FG
9996 d.frag = base->get_frag();
9997 d.snap = snap;
9998 d.want_path = want_path;
9999 d.want_base_dir = false;
10000 d.want_xlocked = want_xlocked;
10001 _send_discover(d);
10002 }
10003
10004 // register + wait
10005 if (onfinish)
10006 base->add_dentry_waiter(want_path[0], snap, onfinish);
10007}
10008
10009void MDCache::kick_discovers(mds_rank_t who)
10010{
10011 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10012 p != discovers.end();
10013 ++p) {
10014 if (p->second.mds != who)
10015 continue;
10016 _send_discover(p->second);
10017 }
10018}
10019
10020
11fdf7f2 10021void MDCache::handle_discover(const MDiscover::const_ref &dis)
7c673cae
FG
10022{
10023 mds_rank_t whoami = mds->get_nodeid();
10024 mds_rank_t from = mds_rank_t(dis->get_source().num());
10025
11fdf7f2 10026 ceph_assert(from != whoami);
7c673cae
FG
10027
10028 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10029 if (mds->get_state() < MDSMap::STATE_REJOIN &&
d2e6a577 10030 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
10031 return;
10032 }
10033
10034 // proceed if the requester is in the REJOIN stage; the request is from parallel_fetch().
10035 // delay processing request from survivor because we may not yet choose lock states.
10036 if (!mds->mdsmap->is_rejoin(from)) {
10037 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
10038 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10039 return;
10040 }
10041 }
10042
10043
10044 CInode *cur = 0;
11fdf7f2 10045 auto reply = MDiscoverReply::create(*dis);
7c673cae
FG
10046
10047 snapid_t snapid = dis->get_snapid();
10048
10049 // get started.
10050 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10051 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10052 // wants root
10053 dout(7) << "handle_discover from mds." << from
10054 << " wants base + " << dis->get_want().get_path()
10055 << " snap " << snapid
10056 << dendl;
10057
10058 cur = get_inode(dis->get_base_ino());
11fdf7f2 10059 ceph_assert(cur);
7c673cae
FG
10060
10061 // add root
10062 reply->starts_with = MDiscoverReply::INODE;
10063 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
10064 dout(10) << "added base " << *cur << dendl;
10065 }
10066 else {
10067 // there's a base inode
10068 cur = get_inode(dis->get_base_ino(), snapid);
10069 if (!cur && snapid != CEPH_NOSNAP) {
10070 cur = get_inode(dis->get_base_ino());
10071 if (cur && !cur->is_multiversion())
10072 cur = NULL; // nope!
10073 }
10074
10075 if (!cur) {
10076 dout(7) << "handle_discover mds." << from
10077 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10078 << dendl;
10079 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10080 reply->set_error_dentry(dis->get_dentry(0));
10081 reply->set_flag_error_dir();
10082 } else if (dis->wants_base_dir()) {
10083 dout(7) << "handle_discover mds." << from
10084 << " wants basedir+" << dis->get_want().get_path()
10085 << " has " << *cur
10086 << dendl;
10087 } else {
10088 dout(7) << "handle_discover mds." << from
10089 << " wants " << dis->get_want().get_path()
10090 << " has " << *cur
10091 << dendl;
10092 }
10093 }
10094
11fdf7f2 10095 ceph_assert(reply);
7c673cae
FG
10096
10097 // add content
10098 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10099 for (unsigned i = 0;
10100 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10101 i++) {
10102
10103 // -- figure out the dir
10104
10105 // is *cur even a dir at all?
10106 if (!cur->is_dir()) {
10107 dout(7) << *cur << " not a dir" << dendl;
10108 reply->set_flag_error_dir();
10109 break;
10110 }
10111
10112 // pick frag
10113 frag_t fg;
10114 if (dis->get_want().depth()) {
10115 // dentry specifies
10116 fg = cur->pick_dirfrag(dis->get_dentry(i));
10117 } else {
10118 // requester explicitly specified the frag
11fdf7f2 10119 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
7c673cae
FG
10120 fg = dis->get_base_dir_frag();
10121 if (!cur->dirfragtree.is_leaf(fg))
10122 fg = cur->dirfragtree[fg.value()];
10123 }
10124 CDir *curdir = cur->get_dirfrag(fg);
10125
10126 if ((!curdir && !cur->is_auth()) ||
10127 (curdir && !curdir->is_auth())) {
10128
10129 /* before:
10130 * ONLY set flag if empty!!
10131 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10132 * resulting in duplicate discovers in flight,
10133 * which can wreak havoc when discovering rename srcdn (which may move)
10134 */
10135
10136 if (reply->is_empty()) {
10137 // only hint if empty.
10138 // someday this could be better, but right now the waiter logic isn't smart enough.
10139
10140 // hint
10141 if (curdir) {
10142 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10143 reply->set_dir_auth_hint(curdir->authority().first);
10144 } else {
10145 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10146 << *cur << dendl;
10147 reply->set_dir_auth_hint(cur->authority().first);
10148 }
10149
10150 // note error dentry, if any
10151 // NOTE: important, as it allows requester to issue an equivalent discover
10152 // to whomever we hint at.
10153 if (dis->get_want().depth() > i)
10154 reply->set_error_dentry(dis->get_dentry(i));
10155 }
10156
10157 break;
10158 }
10159
10160 if (!curdir) { // open dir?
10161 if (cur->is_frozen()) {
10162 if (!reply->is_empty()) {
10163 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10164 break;
10165 }
10166 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10167 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10168 return;
10169 }
10170 curdir = cur->get_or_open_dirfrag(this, fg);
10171 } else if (curdir->is_frozen_tree() ||
10172 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
31f18b77
FG
10173 if (!reply->is_empty()) {
10174 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10175 break;
10176 }
7c673cae
FG
10177 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10178 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10179 reply->set_flag_error_dir();
10180 break;
10181 }
7c673cae
FG
10182 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10183 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10184 return;
10185 }
10186
10187 // add dir
10188 if (curdir->get_version() == 0) {
10189 // fetch newly opened dir
10190 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10191 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10192 // make sure the base frag is correct, though, in case there was a refragment since the
10193 // original request was sent.
10194 reply->set_base_dir_frag(curdir->get_frag());
10195 } else {
11fdf7f2 10196 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
7c673cae
FG
10197 if (!reply->trace.length())
10198 reply->starts_with = MDiscoverReply::DIR;
10199 replicate_dir(curdir, from, reply->trace);
10200 dout(7) << "handle_discover added dir " << *curdir << dendl;
10201 }
10202
10203 // lookup
10204 CDentry *dn = 0;
10205 if (curdir->get_version() == 0) {
10206 // fetch newly opened dir
11fdf7f2 10207 ceph_assert(!curdir->has_bloom());
7c673cae
FG
10208 } else if (dis->get_want().depth() > 0) {
10209 // lookup dentry
10210 dn = curdir->lookup(dis->get_dentry(i), snapid);
10211 } else
10212 break; // done!
10213
10214 // incomplete dir?
10215 if (!dn) {
31f18b77 10216 if (!curdir->is_complete() &&
11fdf7f2
TL
10217 !(snapid == CEPH_NOSNAP &&
10218 curdir->has_bloom() &&
10219 !curdir->is_in_bloom(dis->get_dentry(i)))) {
7c673cae
FG
10220 // readdir
10221 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10222 if (reply->is_empty()) {
10223 // fetch and wait
10224 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10225 dis->wants_base_dir() && curdir->get_version() == 0);
7c673cae
FG
10226 return;
10227 } else {
10228 // initiate fetch, but send what we have so far
10229 curdir->fetch(0);
10230 break;
10231 }
10232 }
10233
11fdf7f2
TL
10234 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10235 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10236 << " dne, non-empty reply, stopping" << dendl;
10237 break;
10238 }
10239
7c673cae
FG
10240 // send null dentry
10241 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10242 << *curdir << dendl;
11fdf7f2
TL
10243 if (snapid == CEPH_NOSNAP)
10244 dn = curdir->add_null_dentry(dis->get_dentry(i));
10245 else
10246 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
7c673cae 10247 }
11fdf7f2 10248 ceph_assert(dn);
7c673cae 10249
31f18b77
FG
10250 // don't add replica to purging dentry/inode
10251 if (dn->state_test(CDentry::STATE_PURGING)) {
10252 if (reply->is_empty())
10253 reply->set_flag_error_dn(dis->get_dentry(i));
10254 break;
10255 }
10256
7c673cae
FG
10257 CDentry::linkage_t *dnl = dn->get_linkage();
10258
10259 // xlocked dentry?
10260 // ...always block on non-tail items (they are unrelated)
10261 // ...allow xlocked tail discovery _only_ if explicitly requested
10262 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
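    // Worked example (values purely illustrative): for want_path "a/b/c"
    // (depth 3), only i == 2 ("c") is the tail item, so an xlocked "a" or "b"
    // always blocks, while an xlocked "c" may be discovered when the
    // requester set wants_xlocked().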
10263 if (dn->lock.is_xlocked()) {
10264 // is this the last (tail) item in the discover traversal?
10265 if (tailitem && dis->wants_xlocked()) {
10266 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
10267 } else if (reply->is_empty()) {
10268 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10269 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10270 return;
10271 } else {
10272 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10273 break;
10274 }
10275 }
10276
10277 // frozen inode?
10278 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10279 if (tailitem && dis->wants_xlocked()) {
10280 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10281 } else if (reply->is_empty()) {
10282 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10283 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10284 return;
10285 } else {
10286 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10287 break;
10288 }
10289 }
10290
10291 // add dentry
10292 if (!reply->trace.length())
10293 reply->starts_with = MDiscoverReply::DENTRY;
10294 replicate_dentry(dn, from, reply->trace);
10295 dout(7) << "handle_discover added dentry " << *dn << dendl;
10296
10297 if (!dnl->is_primary()) break; // stop on null or remote link.
10298
10299 // add inode
10300 CInode *next = dnl->get_inode();
11fdf7f2 10301 ceph_assert(next->is_auth());
7c673cae
FG
10302
10303 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10304 dout(7) << "handle_discover added inode " << *next << dendl;
10305
10306 // descend, keep going.
10307 cur = next;
10308 continue;
10309 }
10310
10311 // how did we do?
11fdf7f2 10312 ceph_assert(!reply->is_empty());
7c673cae
FG
10313 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10314 mds->send_message(reply, dis->get_connection());
7c673cae
FG
10315}
10316
11fdf7f2 10317void MDCache::handle_discover_reply(const MDiscoverReply::const_ref &m)
7c673cae
FG
10318{
10319 /*
10320 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10321 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
7c673cae
FG
10322 return;
10323 }
10324 */
10325 dout(7) << "discover_reply " << *m << dendl;
10326 if (m->is_flag_error_dir())
10327 dout(7) << " flag error, dir" << dendl;
10328 if (m->is_flag_error_dn())
10329 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10330
11fdf7f2 10331 MDSContext::vec finished, error;
7c673cae
FG
10332 mds_rank_t from = mds_rank_t(m->get_source().num());
10333
10334 // starting point
10335 CInode *cur = get_inode(m->get_base_ino());
11fdf7f2 10336 auto p = m->trace.cbegin();
7c673cae
FG
10337
10338 int next = m->starts_with;
10339
10340 // decrement discover counters
10341 if (m->get_tid()) {
10342 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10343 if (p != discovers.end()) {
10344 dout(10) << " found tid " << m->get_tid() << dendl;
10345 discovers.erase(p);
10346 } else {
10347 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10348 }
10349 }
10350
10351 // discover may start with an inode
10352 if (!p.end() && next == MDiscoverReply::INODE) {
10353 cur = add_replica_inode(p, NULL, finished);
10354 dout(7) << "discover_reply got base inode " << *cur << dendl;
11fdf7f2 10355 ceph_assert(cur->is_base());
7c673cae
FG
10356
10357 next = MDiscoverReply::DIR;
10358
10359 // take waiters?
10360 if (cur->is_base() &&
10361 waiting_for_base_ino[from].count(cur->ino())) {
10362 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10363 waiting_for_base_ino[from].erase(cur->ino());
10364 }
10365 }
11fdf7f2 10366 ceph_assert(cur);
7c673cae
FG
10367
10368 // loop over discover results.
10369 // indexes follow each ([[dir] dentry] inode)
10370 // can start, end with any type.
10371 while (!p.end()) {
10372 // dir
10373 frag_t fg;
10374 CDir *curdir = 0;
10375 if (next == MDiscoverReply::DIR) {
10376 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10377 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
11fdf7f2 10378 ceph_assert(m->get_wanted_base_dir());
7c673cae
FG
10379 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10380 }
10381 } else {
10382 // note: this can only happen on our first pass through this loop.
10383 if (p.end() && m->is_flag_error_dn()) {
10384 fg = cur->pick_dirfrag(m->get_error_dentry());
10385 curdir = cur->get_dirfrag(fg);
10386 } else
10387 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10388 }
10389
10390 if (p.end())
10391 break;
10392
10393 // dentry
10394 CDentry *dn = add_replica_dentry(p, curdir, finished);
10395
10396 if (p.end())
10397 break;
10398
10399 // inode
10400 cur = add_replica_inode(p, dn, finished);
10401
10402 next = MDiscoverReply::DIR;
10403 }
10404
10405 // dir error?
10406 // or dir_auth hint?
10407 if (m->is_flag_error_dir() && !cur->is_dir()) {
10408 // not a dir.
10409 cur->take_waiting(CInode::WAIT_DIR, error);
10410 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10411 mds_rank_t who = m->get_dir_auth_hint();
10412 if (who == mds->get_nodeid()) who = -1;
10413 if (who >= 0)
10414 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10415
7c673cae
FG
10416
10417 if (m->get_wanted_base_dir()) {
31f18b77
FG
10418 frag_t fg = m->get_base_dir_frag();
10419 CDir *dir = cur->get_dirfrag(fg);
10420
7c673cae
FG
10421 if (cur->is_waiting_for_dir(fg)) {
10422 if (cur->is_auth())
10423 cur->take_waiting(CInode::WAIT_DIR, finished);
10424 else if (dir || !cur->dirfragtree.is_leaf(fg))
10425 cur->take_dir_waiting(fg, finished);
10426 else
10427 discover_dir_frag(cur, fg, 0, who);
10428 } else
10429 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10430 }
10431
10432 // try again?
10433 if (m->get_error_dentry().length()) {
31f18b77
FG
10434 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10435 CDir *dir = cur->get_dirfrag(fg);
7c673cae
FG
10436 // wanted a dentry
10437 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10438 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10439 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10440 m->get_wanted_snapid(), finished);
10441 } else {
10442 filepath relpath(m->get_error_dentry(), 0);
10443 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10444 }
10445 } else
10446 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10447 << m->get_error_dentry() << dendl;
10448 }
31f18b77
FG
10449 } else if (m->is_flag_error_dn()) {
10450 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10451 CDir *dir = cur->get_dirfrag(fg);
10452 if (dir) {
10453 if (dir->is_auth()) {
10454 dir->take_sub_waiting(finished);
10455 } else {
10456 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10457 m->get_wanted_snapid(), error);
10458 }
10459 }
7c673cae
FG
10460 }
10461
10462 // waiters
10463 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10464 mds->queue_waiters(finished);
7c673cae
FG
10465}
10466
10467
10468
10469// ----------------------------
10470// REPLICAS
10471
b32b8144
FG
10472
10473void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10474{
10475 dirfrag_t df = dir->dirfrag();
11fdf7f2 10476 encode(df, bl);
b32b8144
FG
10477 dir->encode_replica(to, bl);
10478}
10479
10480void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10481{
11fdf7f2
TL
10482 encode(dn->get_name(), bl);
10483 encode(dn->last, bl);
b32b8144
FG
10484 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10485}
10486
10487void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10488 uint64_t features)
10489{
11fdf7f2
TL
10490 encode(in->inode.ino, bl); // bleh, minor asymmetry here
10491 encode(in->last, bl);
b32b8144
FG
10492 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10493}
10494
11fdf7f2
TL
10495CDir *MDCache::add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
10496 MDSContext::vec& finished)
7c673cae
FG
10497{
10498 dirfrag_t df;
11fdf7f2 10499 decode(df, p);
7c673cae 10500
11fdf7f2 10501 ceph_assert(diri->ino() == df.ino);
7c673cae
FG
10502
10503 // add it (_replica_)
10504 CDir *dir = diri->get_dirfrag(df.frag);
10505
10506 if (dir) {
10507 // had replica. update w/ new nonce.
10508 dir->decode_replica(p);
10509 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10510 } else {
10511 // force frag to leaf in the diri tree
10512 if (!diri->dirfragtree.is_leaf(df.frag)) {
10513 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10514 << diri->dirfragtree << dendl;
10515 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10516 }
10517
10518 // add replica.
10519 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10520 dir->decode_replica(p);
10521
10522 // is this a dir_auth delegation boundary?
10523 if (from != diri->authority().first ||
10524 diri->is_ambiguous_auth() ||
10525 diri->is_base())
10526 adjust_subtree_auth(dir, from);
10527
10528 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10529
10530 // get waiters
10531 diri->take_dir_waiting(df.frag, finished);
10532 }
10533
10534 return dir;
10535}
10536
11fdf7f2 10537CDentry *MDCache::add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
7c673cae
FG
10538{
10539 string name;
10540 snapid_t last;
11fdf7f2
TL
10541 decode(name, p);
10542 decode(last, p);
7c673cae
FG
10543
10544 CDentry *dn = dir->lookup(name, last);
10545
10546 // have it?
10547 if (dn) {
10548 dn->decode_replica(p, false);
10549 dout(7) << "add_replica_dentry had " << *dn << dendl;
10550 } else {
10551 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10552 dn->decode_replica(p, true);
10553 dout(7) << "add_replica_dentry added " << *dn << dendl;
10554 }
10555
10556 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10557
10558 return dn;
10559}
10560
11fdf7f2 10561CInode *MDCache::add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
7c673cae
FG
10562{
10563 inodeno_t ino;
10564 snapid_t last;
11fdf7f2
TL
10565 decode(ino, p);
10566 decode(last, p);
7c673cae
FG
10567 CInode *in = get_inode(ino, last);
10568 if (!in) {
10569 in = new CInode(this, false, 1, last);
10570 in->decode_replica(p, true);
10571 add_inode(in);
10572 if (in->ino() == MDS_INO_ROOT)
10573 in->inode_auth.first = 0;
10574 else if (in->is_mdsdir())
10575 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10576 dout(10) << "add_replica_inode added " << *in << dendl;
10577 if (dn) {
11fdf7f2 10578 ceph_assert(dn->get_linkage()->is_null());
7c673cae
FG
10579 dn->dir->link_primary_inode(dn, in);
10580 }
10581 } else {
10582 in->decode_replica(p, false);
10583 dout(10) << "add_replica_inode had " << *in << dendl;
10584 }
10585
10586 if (dn) {
10587 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10588 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10589 }
10590
10591 return in;
10592}
10593
10594
10595void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10596{
10597 uint64_t features = mds->mdsmap->get_up_features();
10598 replicate_inode(get_myin(), who, bl, features);
10599 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10600 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10601 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10602 replicate_dir(straydn->get_dir(), who, bl);
10603 replicate_dentry(straydn, who, bl);
10604}
10605
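// Sketch of the pairing invariant (no new behavior): replicate_stray() above
// encodes, in order, the mdsdir inode, the mdsdir dirfrag, the stray
// directory's dentry, the stray directory's inode, the stray dirfrag and the
// stray dentry; add_replica_stray() below decodes exactly that sequence:
//
//   replicate_stray(straydn, who, bl);         // sender side
//   CDentry *dn = add_replica_stray(bl, from); // receiver side, mirrored order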
11fdf7f2 10606CDentry *MDCache::add_replica_stray(const bufferlist &bl, mds_rank_t from)
7c673cae 10607{
11fdf7f2
TL
10608 MDSContext::vec finished;
10609 auto p = bl.cbegin();
7c673cae
FG
10610
10611 CInode *mdsin = add_replica_inode(p, NULL, finished);
10612 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10613 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10614 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10615 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10616 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10617 if (!finished.empty())
10618 mds->queue_waiters(finished);
10619
10620 return straydn;
10621}
10622
10623
10624int MDCache::send_dir_updates(CDir *dir, bool bcast)
10625{
10626 // this is an FYI, re: replication
10627
10628 set<mds_rank_t> who;
10629 if (bcast) {
10630 mds->get_mds_map()->get_active_mds_set(who);
10631 } else {
181888fb
FG
10632 for (const auto &p : dir->get_replicas()) {
10633 who.insert(p.first);
10634 }
7c673cae
FG
10635 }
10636
10637 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10638
10639 filepath path;
10640 dir->inode->make_path(path);
10641
10642 mds_rank_t whoami = mds->get_nodeid();
10643 for (set<mds_rank_t>::iterator it = who.begin();
10644 it != who.end();
10645 ++it) {
10646 if (*it == whoami) continue;
10647 //if (*it == except) continue;
10648 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10649
94b18763
FG
10650 std::set<int32_t> s;
10651 for (const auto &r : dir->dir_rep_by) {
10652 s.insert(r);
10653 }
11fdf7f2 10654 mds->send_message_mds(MDirUpdate::create(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
7c673cae
FG
10655 }
10656
10657 return 0;
10658}
10659
11fdf7f2 10660void MDCache::handle_dir_update(const MDirUpdate::const_ref &m)
7c673cae 10661{
224ce89b
WB
10662 dirfrag_t df = m->get_dirfrag();
10663 CDir *dir = get_dirfrag(df);
7c673cae 10664 if (!dir) {
224ce89b 10665 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10666
10667 // discover it?
10668 if (m->should_discover()) {
10669 // only try once!
10670 // this is key to avoid a fragtree update race, among other things.
224ce89b 10671 m->inc_tried_discover();
7c673cae
FG
10672 vector<CDentry*> trace;
10673 CInode *in;
10674 filepath path = m->get_path();
10675 dout(5) << "trying discover on dir_update for " << path << dendl;
11fdf7f2 10676 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 10677 MDRequestRef null_ref;
11fdf7f2 10678 int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
7c673cae
FG
10679 if (r > 0)
10680 return;
224ce89b
WB
10681 if (r == 0 &&
10682 in->ino() == df.ino &&
10683 in->get_approx_dirfrag(df.frag) == NULL) {
10684 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10685 return;
10686 }
7c673cae
FG
10687 }
10688
7c673cae
FG
10689 return;
10690 }
10691
224ce89b
WB
10692 if (!m->has_tried_discover()) {
10693 // Update if it already exists. Otherwise it got updated by the discover reply.
10694 dout(5) << "dir_update on " << *dir << dendl;
10695 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10696 dir->dir_rep_by.clear();
10697 for (const auto &e : m->get_dir_rep_by()) {
10698 dir->dir_rep_by.insert(e);
10699 }
224ce89b 10700 }
7c673cae
FG
10701}
10702
10703
10704
10705
10706
10707// LINK
10708
10709void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10710{
10711 dout(7) << "send_dentry_link " << *dn << dendl;
10712
10713 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10714 for (const auto &p : dn->get_replicas()) {
7c673cae 10715 // don't tell (rename) witnesses; they already know
181888fb 10716 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10717 continue;
181888fb
FG
10718 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10719 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10720 rejoin_gather.count(p.first)))
7c673cae
FG
10721 continue;
10722 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 10723 auto m = MDentryLink::create(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
7c673cae
FG
10724 if (dnl->is_primary()) {
10725 dout(10) << " primary " << *dnl->get_inode() << dendl;
181888fb 10726 replicate_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10727 mds->mdsmap->get_up_features());
10728 } else if (dnl->is_remote()) {
10729 inodeno_t ino = dnl->get_remote_ino();
10730 __u8 d_type = dnl->get_remote_d_type();
10731 dout(10) << " remote " << ino << " " << d_type << dendl;
11fdf7f2
TL
10732 encode(ino, m->bl);
10733 encode(d_type, m->bl);
7c673cae
FG
10734 } else
10735 ceph_abort(); // aie, bad caller!
181888fb 10736 mds->send_message_mds(m, p.first);
7c673cae
FG
10737 }
10738}
10739
11fdf7f2 10740void MDCache::handle_dentry_link(const MDentryLink::const_ref &m)
7c673cae 10741{
7c673cae
FG
10742 CDentry *dn = NULL;
10743 CDir *dir = get_dirfrag(m->get_dirfrag());
10744 if (!dir) {
10745 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10746 } else {
10747 dn = dir->lookup(m->get_dn());
10748 if (!dn) {
10749 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10750 } else {
10751 dout(7) << "handle_dentry_link on " << *dn << dendl;
10752 CDentry::linkage_t *dnl = dn->get_linkage();
10753
11fdf7f2
TL
10754 ceph_assert(!dn->is_auth());
10755 ceph_assert(dnl->is_null());
7c673cae
FG
10756 }
10757 }
10758
11fdf7f2
TL
10759 auto p = m->bl.cbegin();
10760 MDSContext::vec finished;
7c673cae
FG
10761 if (dn) {
10762 if (m->get_is_primary()) {
10763 // primary link.
10764 add_replica_inode(p, dn, finished);
10765 } else {
10766 // remote link, easy enough.
10767 inodeno_t ino;
10768 __u8 d_type;
11fdf7f2
TL
10769 decode(ino, p);
10770 decode(d_type, p);
7c673cae
FG
10771 dir->link_remote_inode(dn, ino, d_type);
10772 }
10773 } else {
10774 ceph_abort();
10775 }
10776
10777 if (!finished.empty())
10778 mds->queue_waiters(finished);
10779
7c673cae
FG
10780 return;
10781}
10782
10783
10784// UNLINK
10785
10786void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10787{
10788 dout(10) << "send_dentry_unlink " << *dn << dendl;
10789 // share unlink news with replicas
10790 set<mds_rank_t> replicas;
10791 dn->list_replicas(replicas);
11fdf7f2
TL
10792 bufferlist snapbl;
10793 if (straydn) {
7c673cae 10794 straydn->list_replicas(replicas);
11fdf7f2
TL
10795 CInode *strayin = straydn->get_linkage()->get_inode();
10796 strayin->encode_snap_blob(snapbl);
10797 }
7c673cae
FG
10798 for (set<mds_rank_t>::iterator it = replicas.begin();
10799 it != replicas.end();
10800 ++it) {
10801 // don't tell (rmdir) witnesses; they already know
10802 if (mdr.get() && mdr->more()->witnessed.count(*it))
10803 continue;
10804
10805 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10806 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10807 rejoin_gather.count(*it)))
10808 continue;
10809
11fdf7f2
TL
10810 auto unlink = MDentryUnlink::create(dn->get_dir()->dirfrag(), dn->get_name());
10811 if (straydn) {
7c673cae 10812 replicate_stray(straydn, *it, unlink->straybl);
11fdf7f2
TL
10813 unlink->snapbl = snapbl;
10814 }
7c673cae
FG
10815 mds->send_message_mds(unlink, *it);
10816 }
10817}
10818
11fdf7f2 10819void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref &m)
7c673cae
FG
10820{
10821 // straydn
10822 CDentry *straydn = NULL;
10823 if (m->straybl.length())
10824 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10825
10826 CDir *dir = get_dirfrag(m->get_dirfrag());
10827 if (!dir) {
10828 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10829 } else {
10830 CDentry *dn = dir->lookup(m->get_dn());
10831 if (!dn) {
10832 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10833 } else {
10834 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10835 CDentry::linkage_t *dnl = dn->get_linkage();
10836
10837 // open inode?
10838 if (dnl->is_primary()) {
10839 CInode *in = dnl->get_inode();
10840 dn->dir->unlink_inode(dn);
11fdf7f2 10841 ceph_assert(straydn);
7c673cae
FG
10842 straydn->dir->link_primary_inode(straydn, in);
10843
10844 // in->first is lazily updated on replica; drag it forward so
10845 // that we always keep it in sync with the dnq
11fdf7f2 10846 ceph_assert(straydn->first >= in->first);
7c673cae
FG
10847 in->first = straydn->first;
10848
10849 // update subtree map?
10850 if (in->is_dir())
10851 adjust_subtree_after_rename(in, dir, false);
10852
11fdf7f2
TL
10853 if (m->snapbl.length()) {
10854 bool hadrealm = (in->snaprealm ? true : false);
10855 in->decode_snap_blob(m->snapbl);
10856 ceph_assert(in->snaprealm);
10857 ceph_assert(in->snaprealm->have_past_parents_open());
10858 if (!hadrealm)
10859 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
10860 }
10861
7c673cae
FG
10862 // send caps to auth (if we're not already)
10863 if (in->is_any_caps() &&
10864 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10865 migrator->export_caps(in);
10866
7c673cae
FG
10867 straydn = NULL;
10868 } else {
11fdf7f2
TL
10869 ceph_assert(!straydn);
10870 ceph_assert(dnl->is_remote());
7c673cae
FG
10871 dn->dir->unlink_inode(dn);
10872 }
11fdf7f2 10873 ceph_assert(dnl->is_null());
7c673cae
FG
10874 }
10875 }
10876
10877 // race with trim_dentry()
10878 if (straydn) {
11fdf7f2
TL
10879 ceph_assert(straydn->get_num_ref() == 0);
10880 ceph_assert(straydn->get_linkage()->is_null());
10881 expiremap ex;
10882 trim_dentry(straydn, ex);
10883 send_expire_messages(ex);
7c673cae 10884 }
7c673cae
FG
10885}
10886
10887
10888
10889
10890
10891
10892// ===================================================================
10893
10894
10895
10896// ===================================================================
10897// FRAGMENT
10898
10899
10900/**
10901 * adjust_dir_fragments -- adjust fragmentation for a directory
10902 *
10903 * @param diri directory inode
10904 * @param basefrag base fragment
10905 * @param bits bit adjustment. positive for split, negative for merge.
10906 */
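// Worked example of the bit math (fragments here are illustrative): splitting
// a base fragment that covers prefix "00" (2 bits) by bits = 2 produces the
// four children "0000", "0001", "0010" and "0011" (4 bits each); invoking the
// adjustment again over those children with a non-positive bits value merges
// them back into the single "00" fragment.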
10907void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10908 list<CDir*>& resultfrags,
11fdf7f2 10909 MDSContext::vec& waiters,
7c673cae
FG
10910 bool replay)
10911{
10912 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10913 << " on " << *diri << dendl;
10914
10915 list<CDir*> srcfrags;
10916 diri->get_dirfrags_under(basefrag, srcfrags);
10917
10918 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10919}
10920
10921CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10922{
10923 CDir *dir = diri->get_dirfrag(fg);
10924 if (dir)
10925 return dir;
10926
10927 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10928
10929 list<CDir*> src, result;
11fdf7f2 10930 MDSContext::vec waiters;
7c673cae
FG
10931
10932 // split a parent?
10933 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10934 while (1) {
10935 CDir *pdir = diri->get_dirfrag(parent);
10936 if (pdir) {
10937 int split = fg.bits() - parent.bits();
10938 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10939 src.push_back(pdir);
10940 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10941 dir = diri->get_dirfrag(fg);
10942 if (dir) {
10943 dout(10) << "force_dir_fragment result " << *dir << dendl;
10944 break;
10945 }
10946 }
10947 if (parent == frag_t())
10948 break;
10949 frag_t last = parent;
10950 parent = parent.parent();
10951 dout(10) << " " << last << " parent is " << parent << dendl;
10952 }
10953
10954 if (!dir) {
10955 // hoover up things under fg?
10956 diri->get_dirfrags_under(fg, src);
10957 if (src.empty()) {
10958 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10959 } else {
10960 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10961 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10962 dir = result.front();
10963 dout(10) << "force_dir_fragment result " << *dir << dendl;
10964 }
10965 }
10966 if (!replay)
10967 mds->queue_waiters(waiters);
10968 return dir;
10969}
10970
10971void MDCache::adjust_dir_fragments(CInode *diri,
10972 list<CDir*>& srcfrags,
10973 frag_t basefrag, int bits,
10974 list<CDir*>& resultfrags,
11fdf7f2 10975 MDSContext::vec& waiters,
7c673cae
FG
10976 bool replay)
10977{
10978 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10979 << " srcfrags " << srcfrags
10980 << " on " << *diri << dendl;
10981
10982 // adjust fragtree
10983 // yuck. we may have discovered the inode while it was being fragmented.
10984 if (!diri->dirfragtree.is_leaf(basefrag))
10985 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10986
10987 if (bits > 0)
10988 diri->dirfragtree.split(basefrag, bits);
10989 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10990
10991 if (srcfrags.empty())
10992 return;
10993
10994 // split
10995 CDir *parent_dir = diri->get_parent_dir();
10996 CDir *parent_subtree = 0;
10997 if (parent_dir)
10998 parent_subtree = get_subtree_root(parent_dir);
10999
11000 if (bits > 0) {
11001 // SPLIT
11fdf7f2 11002 ceph_assert(srcfrags.size() == 1);
7c673cae
FG
11003 CDir *dir = srcfrags.front();
11004
11005 dir->split(bits, resultfrags, waiters, replay);
11006
11007 // did i change the subtree map?
11008 if (dir->is_subtree_root()) {
11009 // new frags are now separate subtrees
11010 for (list<CDir*>::iterator p = resultfrags.begin();
11011 p != resultfrags.end();
11012 ++p)
11013 subtrees[*p].clear(); // new frag is now its own subtree
11014
11015 // was i a bound?
11016 if (parent_subtree) {
11fdf7f2 11017 ceph_assert(subtrees[parent_subtree].count(dir));
7c673cae
FG
11018 subtrees[parent_subtree].erase(dir);
11019 for (list<CDir*>::iterator p = resultfrags.begin();
11020 p != resultfrags.end();
11021 ++p) {
11fdf7f2 11022 ceph_assert((*p)->is_subtree_root());
7c673cae
FG
11023 subtrees[parent_subtree].insert(*p);
11024 }
11025 }
11026
11027 // adjust my bounds.
11028 set<CDir*> bounds;
11029 bounds.swap(subtrees[dir]);
11030 subtrees.erase(dir);
11031 for (set<CDir*>::iterator p = bounds.begin();
11032 p != bounds.end();
11033 ++p) {
11034 CDir *frag = get_subtree_root((*p)->get_parent_dir());
11035 subtrees[frag].insert(*p);
11036 }
11037
11038 show_subtrees(10);
7c673cae
FG
11039 }
11040
11041 diri->close_dirfrag(dir->get_frag());
11042
11043 } else {
11044 // MERGE
11045
11046 // are my constituent bits subtrees? if so, i will be too.
11047 // (it's all or none, actually.)
11fdf7f2 11048 bool any_subtree = false, any_non_subtree = false;
31f18b77 11049 for (CDir *dir : srcfrags) {
11fdf7f2 11050 if (dir->is_subtree_root())
31f18b77 11051 any_subtree = true;
11fdf7f2
TL
11052 else
11053 any_non_subtree = true;
31f18b77 11054 }
11fdf7f2
TL
11055 ceph_assert(!any_subtree || !any_non_subtree);
11056
31f18b77
FG
11057 set<CDir*> new_bounds;
11058 if (any_subtree) {
11059 for (CDir *dir : srcfrags) {
11060 // this simplifies the code that find subtrees underneath the dirfrag
11061 if (!dir->is_subtree_root()) {
11062 dir->state_set(CDir::STATE_AUXSUBTREE);
11063 adjust_subtree_auth(dir, mds->get_nodeid());
11064 }
11065 }
11066
11067 for (CDir *dir : srcfrags) {
11fdf7f2 11068 ceph_assert(dir->is_subtree_root());
7c673cae 11069 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
11070 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
11071 set<CDir*>::iterator r = q->second.begin();
11072 while (r != subtrees[dir].end()) {
11073 new_bounds.insert(*r);
11074 subtrees[dir].erase(r++);
11075 }
11076 subtrees.erase(q);
31f18b77 11077
7c673cae
FG
11078 // remove myself as my parent's bound
11079 if (parent_subtree)
11080 subtrees[parent_subtree].erase(dir);
11081 }
11082 }
11083
11084 // merge
11085 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
11086 f->merge(srcfrags, waiters, replay);
7c673cae 11087
31f18b77 11088 if (any_subtree) {
11fdf7f2 11089 ceph_assert(f->is_subtree_root());
7c673cae
FG
11090 subtrees[f].swap(new_bounds);
11091 if (parent_subtree)
11092 subtrees[parent_subtree].insert(f);
11093
11094 show_subtrees(10);
11095 }
11096
11097 resultfrags.push_back(f);
11098 }
11099}
11100
11101
11102class C_MDC_FragmentFrozen : public MDSInternalContext {
11103 MDCache *mdcache;
11104 MDRequestRef mdr;
11105public:
11106 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11107 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11108 void finish(int r) override {
11109 mdcache->fragment_frozen(mdr, r);
11110 }
11111};
11112
11113bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
11114{
11115 if (is_readonly()) {
11116 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11117 return false;
11118 }
11119 if (mds->is_cluster_degraded()) {
11120 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11121 return false;
11122 }
11123 if (diri->get_parent_dir() &&
11124 diri->get_parent_dir()->get_inode()->is_stray()) {
11125 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11126 return false;
11127 }
11128 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
11129 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
11130 return false;
11131 }
11132
11133 if (diri->scrub_is_in_progress()) {
11134 dout(7) << "can_fragment: scrub in progress" << dendl;
11135 return false;
11136 }
11137
11138 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11139 CDir *dir = *p;
11140 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11141 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11142 return false;
11143 }
11144 if (!dir->is_auth()) {
11145 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11146 return false;
11147 }
11148 if (dir->is_bad()) {
11149 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11150 return false;
11151 }
11152 if (dir->is_frozen() ||
11153 dir->is_freezing()) {
11154 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11155 return false;
11156 }
11157 }
11158
11159 return true;
11160}
11161
11162void MDCache::split_dir(CDir *dir, int bits)
11163{
11164 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11fdf7f2 11165 ceph_assert(dir->is_auth());
7c673cae
FG
11166 CInode *diri = dir->inode;
11167
11168 list<CDir*> dirs;
11169 dirs.push_back(dir);
11170
11171 if (!can_fragment(diri, dirs)) {
11172 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11173 return;
11174 }
11175
31f18b77
FG
11176 if (dir->frag.bits() + bits > 24) {
11177 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11178 return;
11179 }
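  // Example of the guard above (numbers illustrative): a dirfrag already at
  // 23 bits asked to split by 2 more would reach 25 > 24, so the split is
  // dropped rather than exceeding the maximum fragment depth enforced here.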
11180
7c673cae
FG
11181 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11182 mdr->more()->fragment_base = dir->dirfrag();
11183
11fdf7f2 11184 ceph_assert(fragments.count(dir->dirfrag()) == 0);
7c673cae
FG
11185 fragment_info_t& info = fragments[dir->dirfrag()];
11186 info.mdr = mdr;
11187 info.dirs.push_back(dir);
11188 info.bits = bits;
11189 info.last_cum_auth_pins_change = ceph_clock_now();
11190
11191 fragment_freeze_dirs(dirs);
11192 // initial mark+complete pass
11193 fragment_mark_and_complete(mdr);
11194}
11195
11196void MDCache::merge_dir(CInode *diri, frag_t frag)
11197{
11198 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11199
11200 list<CDir*> dirs;
11201 if (!diri->get_dirfrags_under(frag, dirs)) {
11202 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11203 return;
11204 }
11205
11206 if (diri->dirfragtree.is_leaf(frag)) {
11207 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11208 return;
11209 }
11210
11211 if (!can_fragment(diri, dirs))
11212 return;
11213
11214 CDir *first = dirs.front();
11215 int bits = first->get_frag().bits() - frag.bits();
11216 dout(10) << " we are merginb by " << bits << " bits" << dendl;
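  // Worked example (values illustrative): if the existing leaves under "00"
  // (2 bits) are 4-bit frags, then bits = 4 - 2 = 2 and info.bits is stored
  // as -2 below, the negative sign marking this operation as a merge.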
11217
11218 dirfrag_t basedirfrag(diri->ino(), frag);
11219 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11220 mdr->more()->fragment_base = basedirfrag;
11221
11fdf7f2 11222 ceph_assert(fragments.count(basedirfrag) == 0);
7c673cae
FG
11223 fragment_info_t& info = fragments[basedirfrag];
11224 info.mdr = mdr;
11225 info.dirs = dirs;
11226 info.bits = -bits;
11227 info.last_cum_auth_pins_change = ceph_clock_now();
11228
11229 fragment_freeze_dirs(dirs);
11230 // initial mark+complete pass
11231 fragment_mark_and_complete(mdr);
11232}
11233
11234void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
11235{
11fdf7f2
TL
11236 bool any_subtree = false, any_non_subtree = false;
11237 for (CDir* dir : dirs) {
7c673cae
FG
11238 dir->auth_pin(dir); // until we mark and complete them
11239 dir->state_set(CDir::STATE_FRAGMENTING);
11240 dir->freeze_dir();
11fdf7f2
TL
11241 ceph_assert(dir->is_freezing_dir());
11242
11243 if (dir->is_subtree_root())
11244 any_subtree = true;
11245 else
11246 any_non_subtree = true;
11247 }
11248
11249 if (any_subtree && any_non_subtree) {
11250 // either all dirfrags are subtree roots or all are not.
11251 for (CDir *dir : dirs) {
11252 if (dir->is_subtree_root()) {
11253 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11254 } else {
11255 dir->state_set(CDir::STATE_AUXSUBTREE);
11256 adjust_subtree_auth(dir, mds->get_nodeid());
11257 }
11258 }
7c673cae
FG
11259 }
11260}
11261
11262class C_MDC_FragmentMarking : public MDCacheContext {
11263 MDRequestRef mdr;
11264public:
11265 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11266 void finish(int r) override {
11267 mdcache->fragment_mark_and_complete(mdr);
11268 }
11269};
11270
11271void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11272{
11273 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11274 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11275 if (it == fragments.end() || it->second.mdr != mdr) {
11276 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11277 request_finish(mdr);
11278 return;
11279 }
11280
11281 fragment_info_t& info = it->second;
11282 CInode *diri = info.dirs.front()->get_inode();
11283 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11284
11285 MDSGatherBuilder gather(g_ceph_context);
11286
11287 for (list<CDir*>::iterator p = info.dirs.begin();
11288 p != info.dirs.end();
11289 ++p) {
11290 CDir *dir = *p;
11291
11292 bool ready = true;
11293 if (!dir->is_complete()) {
11294 dout(15) << " fetching incomplete " << *dir << dendl;
11295 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11296 ready = false;
11297 } else if (dir->get_frag() == frag_t()) {
11298 // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
11299 // the operation. To avoid CDir::fetch() complaining about a missing object,
11300 // we commit the new dirfrag first.
11301 if (dir->state_test(CDir::STATE_CREATING)) {
11302 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11303 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11304 ready = false;
11305 } else if (dir->is_new()) {
11306 dout(15) << " committing new " << *dir << dendl;
11fdf7f2 11307 ceph_assert(dir->is_dirty());
7c673cae
FG
11308 dir->commit(0, gather.new_sub(), true);
11309 ready = false;
11310 }
11311 }
11312 if (!ready)
11313 continue;
11314
11315 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11316 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11317 for (auto &p : dir->items) {
11318 CDentry *dn = p.second;
7c673cae 11319 dn->get(CDentry::PIN_FRAGMENTING);
11fdf7f2 11320 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11321 dn->state_set(CDentry::STATE_FRAGMENTING);
11322 }
11323 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11324 dir->auth_unpin(dir);
11325 } else {
11326 dout(15) << " already marked " << *dir << dendl;
11327 }
11328 }
11329 if (gather.has_subs()) {
11330 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11331 gather.activate();
11332 return;
11333 }
11334
11335 for (list<CDir*>::iterator p = info.dirs.begin();
11336 p != info.dirs.end();
11337 ++p) {
11338 CDir *dir = *p;
11339 if (!dir->is_frozen_dir()) {
11fdf7f2 11340 ceph_assert(dir->is_freezing_dir());
7c673cae
FG
11341 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11342 }
11343 }
11344 if (gather.has_subs()) {
11345 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11346 gather.activate();
11347 // flush log so that request auth_pins are retired
11348 mds->mdlog->flush();
11349 return;
11350 }
11351
11352 fragment_frozen(mdr, 0);
11353}
11354
11355void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11356{
11357 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11358 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11359 CDir *dir = *p;
11360 dout(10) << " frag " << *dir << dendl;
11361
11fdf7f2 11362 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
7c673cae
FG
11363 dir->state_clear(CDir::STATE_FRAGMENTING);
11364
11365 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11366 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11367
94b18763
FG
11368 for (auto &p : dir->items) {
11369 CDentry *dn = p.second;
11fdf7f2 11370 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11371 dn->state_clear(CDentry::STATE_FRAGMENTING);
11372 dn->put(CDentry::PIN_FRAGMENTING);
11373 }
11374 } else {
11375 dir->auth_unpin(dir);
11376 }
11377
11378 dir->unfreeze_dir();
11379 }
11380}
11381
11382bool MDCache::fragment_are_all_frozen(CDir *dir)
11383{
11fdf7f2 11384 ceph_assert(dir->is_frozen_dir());
7c673cae
FG
11385 map<dirfrag_t,fragment_info_t>::iterator p;
11386 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11387 p != fragments.end() && p->first.ino == dir->ino();
11388 ++p) {
11389 if (p->first.frag.contains(dir->get_frag()))
11390 return p->second.all_frozen;
11391 }
11392 ceph_abort();
11393 return false;
11394}
11395
11396void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11397{
11398 map<dirfrag_t,fragment_info_t>::iterator p;
11399 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11400 p != fragments.end() && p->first.ino == dir->ino();
11401 ++p) {
11402 if (p->first.frag.contains(dir->get_frag())) {
11403 p->second.num_remote_waiters++;
11404 return;
11405 }
11406 }
11407 ceph_abort();
11408}
11409
11410void MDCache::find_stale_fragment_freeze()
11411{
11412 dout(10) << "find_stale_fragment_freeze" << dendl;
11413 // see comment in Migrator::find_stale_export_freeze()
11414 utime_t now = ceph_clock_now();
11415 utime_t cutoff = now;
11fdf7f2 11416 cutoff -= g_conf()->mds_freeze_tree_timeout;
7c673cae
FG
11417
11418 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11419 p != fragments.end(); ) {
11420 dirfrag_t df = p->first;
11421 fragment_info_t& info = p->second;
11422 ++p;
11423 if (info.all_frozen)
11424 continue;
11425 CDir *dir;
11426 int total_auth_pins = 0;
11427 for (list<CDir*>::iterator q = info.dirs.begin();
11428 q != info.dirs.end();
11429 ++q) {
11430 dir = *q;
11431 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11432 total_auth_pins = -1;
11433 break;
11434 }
11435 if (dir->is_frozen_dir())
11436 continue;
11437 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11438 }
11439 if (total_auth_pins < 0)
11440 continue;
11441 if (info.last_cum_auth_pins != total_auth_pins) {
11442 info.last_cum_auth_pins = total_auth_pins;
11443 info.last_cum_auth_pins_change = now;
11444 continue;
11445 }
11446 if (info.last_cum_auth_pins_change >= cutoff)
11447 continue;
11448 dir = info.dirs.front();
11449 if (info.num_remote_waiters > 0 ||
11450 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11451 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11452 list<CDir*> dirs;
11453 info.dirs.swap(dirs);
11454 fragments.erase(df);
11455 fragment_unmark_unfreeze_dirs(dirs);
11456 }
11457 }
11458}
11459
11460class C_MDC_FragmentPrep : public MDCacheLogContext {
11461 MDRequestRef mdr;
11462public:
11463 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11464 void finish(int r) override {
11465 mdcache->_fragment_logged(mdr);
11466 }
11467};
11468
11469class C_MDC_FragmentStore : public MDCacheContext {
11470 MDRequestRef mdr;
11471public:
11472 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11473 void finish(int r) override {
11474 mdcache->_fragment_stored(mdr);
11475 }
11476};
11477
11478class C_MDC_FragmentCommit : public MDCacheLogContext {
11479 dirfrag_t basedirfrag;
a8e16298 11480 MDRequestRef mdr;
7c673cae 11481public:
a8e16298
TL
11482 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11483 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
7c673cae 11484 void finish(int r) override {
a8e16298 11485 mdcache->_fragment_committed(basedirfrag, mdr);
7c673cae
FG
11486 }
11487};
11488
a8e16298 11489class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
7c673cae 11490 dirfrag_t basedirfrag;
a8e16298
TL
11491 int bits;
11492 MDRequestRef mdr;
7c673cae 11493public:
a8e16298
TL
11494 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11495 const MDRequestRef& r) :
11496 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
7c673cae 11497 void finish(int r) override {
11fdf7f2 11498 ceph_assert(r == 0 || r == -ENOENT);
a8e16298 11499 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
7c673cae 11500 }
91327a77 11501 void print(ostream& out) const override {
a8e16298 11502 out << "fragment_purge_old(" << basedirfrag << ")";
91327a77 11503 }
7c673cae
FG
11504};
11505
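// Overview of the flow below: once every affected dirfrag is frozen,
// fragment_frozen() hands off to dispatch_fragment_dir(), which wrlocks the
// inode's dirfragtreelock/nestlock/filelock, journals EFragment::OP_PREPARE
// and refragments the dirfrags in memory; _fragment_logged() then writes the
// resulting frags to the metadata pool, _fragment_stored() notifies replicas
// and journals OP_COMMIT, _fragment_committed() purges the old dirfrag
// objects, and _fragment_old_purged() journals OP_FINISH. The sign of 'bits'
// selects the direction: for example, splitting base frag 0* by bits=2
// yields the leaves 000*, 001*, 010* and 011*, while a negative bits value
// merges leaves back into their base frag.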
11506void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11507{
11508 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11509 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11510 if (it == fragments.end() || it->second.mdr != mdr) {
11511 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11512 request_finish(mdr);
11513 return;
11514 }
11515
11fdf7f2 11516 ceph_assert(r == 0);
7c673cae
FG
11517 fragment_info_t& info = it->second;
11518 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11519 << " on " << info.dirs.front()->get_inode() << dendl;
11520
11521 info.all_frozen = true;
11522 dispatch_fragment_dir(mdr);
11523}
11524
11525void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11526{
11527 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11528 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11529 if (it == fragments.end() || it->second.mdr != mdr) {
11530 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11531 request_finish(mdr);
11532 return;
11533 }
11534
11535 fragment_info_t& info = it->second;
11536 CInode *diri = info.dirs.front()->get_inode();
11537
11538 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11539 << " on " << *diri << dendl;
11540 if (!mdr->aborted) {
11fdf7f2
TL
11541 MutationImpl::LockOpVec lov;
11542 lov.add_wrlock(&diri->dirfragtreelock);
7c673cae 11543 // prevent a racing gather on any other scatterlocks too
11fdf7f2
TL
11544 lov.add_wrlock(&diri->nestlock);
11545 lov.add_wrlock(&diri->filelock);
11546 if (!mds->locker->acquire_locks(mdr, lov, NULL, true))
7c673cae
FG
11547 if (!mdr->aborted)
11548 return;
11549 }
11550
11551 if (mdr->aborted) {
11552 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11553 << info.dirs.front()->dirfrag() << dendl;
11554 if (info.bits > 0)
11555 mds->balancer->queue_split(info.dirs.front(), false);
11556 else
11557 mds->balancer->queue_merge(info.dirs.front());
11558 fragment_unmark_unfreeze_dirs(info.dirs);
11559 fragments.erase(it);
11560 request_finish(mdr);
11561 return;
11562 }
11563
11564 mdr->ls = mds->mdlog->get_current_segment();
11565 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11566 mds->mdlog->start_entry(le);
11567
11568 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11569 CDir *dir = *p;
11570 dirfrag_rollback rollback;
11571 rollback.fnode = dir->fnode;
11572 le->add_orig_frag(dir->get_frag(), &rollback);
11573 }
11574
11575 // refragment
11fdf7f2 11576 MDSContext::vec waiters;
7c673cae
FG
11577 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11578 info.resultfrags, waiters, false);
11fdf7f2 11579 if (g_conf()->mds_debug_frag)
7c673cae
FG
11580 diri->verify_dirfrags();
11581 mds->queue_waiters(waiters);
11582
11fdf7f2
TL
11583 for (const auto& fg : le->orig_frags)
11584 ceph_assert(!diri->dirfragtree.is_leaf(fg));
7c673cae
FG
11585
11586 le->metablob.add_dir_context(*info.resultfrags.begin());
11587 for (list<CDir*>::iterator p = info.resultfrags.begin();
11588 p != info.resultfrags.end();
11589 ++p) {
11590 if (diri->is_auth()) {
11591 le->metablob.add_fragmented_dir(*p, false, false);
11592 } else {
11593 (*p)->state_set(CDir::STATE_DIRTYDFT);
11594 le->metablob.add_fragmented_dir(*p, false, true);
11595 }
11596 }
11597
11598 // dft lock
11599 if (diri->is_auth()) {
11600 // journal dirfragtree
94b18763
FG
11601 auto &pi = diri->project_inode();
11602 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11603 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11604 } else {
11605 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11606 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11607 mdr->add_updated_lock(&diri->dirfragtreelock);
11608 }
11609
11610 /*
11611 // filelock
11612 mds->locker->mark_updated_scatterlock(&diri->filelock);
11613 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11614 mut->add_updated_lock(&diri->filelock);
11615
11616 // dirlock
11617 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11618 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11619 mut->add_updated_lock(&diri->nestlock);
11620 */
11621
11622 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11623 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11624 mdr, __func__);
11625 mds->mdlog->flush();
11626}
11627
11628void MDCache::_fragment_logged(MDRequestRef& mdr)
11629{
11630 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298 11631 auto& info = fragments.at(basedirfrag);
7c673cae
FG
11632 CInode *diri = info.resultfrags.front()->get_inode();
11633
11634 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11635 << " on " << *diri << dendl;
a8e16298 11636 mdr->mark_event("prepare logged");
7c673cae
FG
11637
11638 if (diri->is_auth())
11639 diri->pop_and_dirty_projected_inode(mdr->ls);
11640
11641 mdr->apply(); // mark scatterlock
11642
11643 // store resulting frags
11644 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11645
11646 for (list<CDir*>::iterator p = info.resultfrags.begin();
11647 p != info.resultfrags.end();
11648 ++p) {
11649 CDir *dir = *p;
11650 dout(10) << " storing result frag " << *dir << dendl;
11651
11652 // freeze and store them too
11653 dir->auth_pin(this);
11654 dir->state_set(CDir::STATE_FRAGMENTING);
11655 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11656 }
11657
11658 gather.activate();
11659}
11660
11661void MDCache::_fragment_stored(MDRequestRef& mdr)
11662{
11663 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298
TL
11664 fragment_info_t &info = fragments.at(basedirfrag);
11665 CDir *first = info.resultfrags.front();
11666 CInode *diri = first->get_inode();
7c673cae
FG
11667
11668 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11669 << " on " << *diri << dendl;
a8e16298 11670 mdr->mark_event("new frags stored");
7c673cae
FG
11671
11672 // tell peers
a8e16298
TL
11673 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11674 diri->authority().first : CDIR_AUTH_UNKNOWN;
181888fb
FG
11675 for (const auto &p : first->get_replicas()) {
11676 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11677 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11678 rejoin_gather.count(p.first)))
7c673cae
FG
11679 continue;
11680
11fdf7f2 11681 auto notify = MMDSFragmentNotify::create(basedirfrag, info.bits, mdr->reqid.tid);
a8e16298
TL
11682 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11683 diri_auth != p.first) { // not auth mds of diri
11684 /*
11685 * In the normal case, the mds does not trim a dir inode whose child dirfrags
11686 * are likely being fragmented (see trim_inode()). But when fragmenting
11687 * subtree roots, the following race can happen:
11688 *
11689 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
11690 * mds.c and drops wrlock on dirfragtreelock.
11691 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
11692 * SYNC and sends a lock message to mds.c
11693 * - mds.c receives the lock message and changes dirfragtreelock state
11694 * to SYNC
11695 * - mds.c trims the dirfrag and dir inode from its cache
11696 * - mds.c receives the fragment_notify message
11697 *
11698 * So we need to ensure replicas have received the notify, then unlock
11699 * the dirfragtreelock.
11700 */
11701 notify->mark_ack_wanted();
11702 info.notify_ack_waiting.insert(p.first);
11703 }
7c673cae
FG
11704
11705 // freshly replicate new dirs to peers
11706 for (list<CDir*>::iterator q = info.resultfrags.begin();
11707 q != info.resultfrags.end();
11708 ++q)
181888fb 11709 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11710
181888fb 11711 mds->send_message_mds(notify, p.first);
7c673cae
FG
11712 }
11713
11714 // journal commit
11715 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
a8e16298 11716 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
7c673cae 11717
7c673cae
FG
11718
11719 // unfreeze resulting frags
11720 for (list<CDir*>::iterator p = info.resultfrags.begin();
11721 p != info.resultfrags.end();
11722 ++p) {
11723 CDir *dir = *p;
11724 dout(10) << " result frag " << *dir << dendl;
11725
94b18763
FG
11726 for (auto &p : dir->items) {
11727 CDentry *dn = p.second;
11fdf7f2 11728 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11729 dn->state_clear(CDentry::STATE_FRAGMENTING);
11730 dn->put(CDentry::PIN_FRAGMENTING);
11731 }
11732
11733 // unfreeze
11734 dir->unfreeze_dir();
11735 }
11736
a8e16298
TL
11737 if (info.notify_ack_waiting.empty()) {
11738 fragment_drop_locks(info);
11739 } else {
11740 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
11741 }
7c673cae
FG
11742}
11743
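// OP_COMMIT is in the journal, so the pre-fragmentation dirfrag objects are
// now stale and can be removed from the metadata pool. The base (frag_t())
// object is only truncated and has its omap cleared, since it also carries
// the inode backtrace.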
a8e16298 11744void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
7c673cae
FG
11745{
11746 dout(10) << "fragment_committed " << basedirfrag << dendl;
a8e16298
TL
11747 if (mdr)
11748 mdr->mark_event("commit logged");
11749
11750 ufragment &uf = uncommitted_fragments.at(basedirfrag);
7c673cae
FG
11751
11752 // remove old frags
11753 C_GatherBuilder gather(
11754 g_ceph_context,
11755 new C_OnFinisher(
a8e16298 11756 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
7c673cae
FG
11757 mds->finisher));
11758
11759 SnapContext nullsnapc;
11760 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11fdf7f2
TL
11761 for (const auto& fg : uf.old_frags) {
11762 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
7c673cae 11763 ObjectOperation op;
11fdf7f2 11764 if (fg == frag_t()) {
7c673cae
FG
11765 // backtrace object
11766 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11767 op.truncate(0);
11768 op.omap_clear();
11769 } else {
11770 dout(10) << " removing orphan dirfrag " << oid << dendl;
11771 op.remove();
11772 }
11773 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11774 ceph::real_clock::now(),
11775 0, gather.new_sub());
11776 }
11777
11fdf7f2 11778 ceph_assert(gather.has_subs());
7c673cae
FG
11779 gather.activate();
11780}
11781
a8e16298 11782void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
7c673cae 11783{
a8e16298
TL
11784 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
11785 if (mdr)
11786 mdr->mark_event("old frags purged");
11787
11788 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
11789 mds->mdlog->start_submit_entry(le);
11790
11791 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11792
11793 if (mds->logger) {
11794 if (bits > 0) {
11795 mds->logger->inc(l_mds_dir_split);
11796 } else {
11797 mds->logger->inc(l_mds_dir_merge);
11798 }
11799 }
11800
11801 if (mdr) {
11802 auto it = fragments.find(basedirfrag);
11803 ceph_assert(it != fragments.end());
11804 it->second.finishing = true;
11805 if (it->second.notify_ack_waiting.empty())
11806 fragment_maybe_finish(it);
11807 else
11808 mdr->mark_event("waiting for notify acks");
11809 }
11810}
11811
11812void MDCache::fragment_drop_locks(fragment_info_t& info)
11813{
11814 mds->locker->drop_locks(info.mdr.get());
11815 request_finish(info.mdr);
11816 //info.mdr.reset();
11817}
11818
11819void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
11820{
11821 if (!it->second.finishing)
11822 return;
7c673cae
FG
11823
11824 // unmark & auth_unpin
a8e16298 11825 for (const auto &dir : it->second.resultfrags) {
7c673cae
FG
11826 dir->state_clear(CDir::STATE_FRAGMENTING);
11827 dir->auth_unpin(this);
11828
11829 // In case the resulting fragments are beyond the split size,
11830 // we might need to split them again right away (they could
11831 // have been taking inserts between unfreezing and getting
11832 // here)
11833 mds->balancer->maybe_fragment(dir, false);
11834 }
11835
a8e16298
TL
11836 fragments.erase(it);
11837}
11838
11839
11fdf7f2 11840void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &ack)
a8e16298
TL
11841{
11842 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
11843 mds_rank_t from = mds_rank_t(ack->get_source().num());
11844
11845 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
a8e16298 11846 return;
7c673cae
FG
11847 }
11848
a8e16298
TL
11849 auto it = fragments.find(ack->get_base_dirfrag());
11850 if (it == fragments.end() ||
11851 it->second.get_tid() != ack->get_tid()) {
11852 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
a8e16298
TL
11853 return;
11854 }
7c673cae 11855
a8e16298
TL
11856 if (it->second.notify_ack_waiting.erase(from) &&
11857 it->second.notify_ack_waiting.empty()) {
11858 fragment_drop_locks(it->second);
11859 fragment_maybe_finish(it);
11860 }
7c673cae
FG
11861}
11862
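// Replica side of fragmentation: apply the same refragmentation locally,
// decode the freshly replicated result dirfrags from notify->basebl, requeue
// any dir waiters, and send an ack when the auth requested one (see the
// subtree-root race described in _fragment_stored()).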
11fdf7f2 11863void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref &notify)
7c673cae
FG
11864{
11865 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
a8e16298 11866 mds_rank_t from = mds_rank_t(notify->get_source().num());
7c673cae
FG
11867
11868 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
11869 return;
11870 }
11871
11872 CInode *diri = get_inode(notify->get_ino());
11873 if (diri) {
11874 frag_t base = notify->get_basefrag();
11875 int bits = notify->get_bits();
11876
11877/*
11878 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11879 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11880 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11881 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
7c673cae
FG
11882 return;
11883 }
11884*/
11885
11886 // refragment
11fdf7f2 11887 MDSContext::vec waiters;
7c673cae
FG
11888 list<CDir*> resultfrags;
11889 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11fdf7f2 11890 if (g_conf()->mds_debug_frag)
7c673cae
FG
11891 diri->verify_dirfrags();
11892
11893 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11894 diri->take_dir_waiting((*p)->get_frag(), waiters);
11895
11896 // add new replica dirs values
11fdf7f2 11897 auto p = notify->basebl.cbegin();
7c673cae 11898 while (!p.end())
a8e16298 11899 add_replica_dir(p, diri, from, waiters);
7c673cae
FG
11900
11901 mds->queue_waiters(waiters);
11902 } else {
11903 ceph_abort();
11904 }
11905
a8e16298 11906 if (notify->is_ack_wanted()) {
11fdf7f2
TL
11907 auto ack = MMDSFragmentNotifyAck::create(notify->get_base_dirfrag(),
11908 notify->get_bits(), notify->get_tid());
a8e16298
TL
11909 mds->send_message_mds(ack, from);
11910 }
7c673cae
FG
11911}
11912
11fdf7f2 11913void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
7c673cae
FG
11914 LogSegment *ls, bufferlist *rollback)
11915{
11916 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11fdf7f2 11917 ceph_assert(!uncommitted_fragments.count(basedirfrag));
7c673cae
FG
11918 ufragment& uf = uncommitted_fragments[basedirfrag];
11919 uf.old_frags = old_frags;
11920 uf.bits = bits;
11921 uf.ls = ls;
11922 ls->uncommitted_fragments.insert(basedirfrag);
11923 if (rollback)
11924 uf.rollback.swap(*rollback);
11925}
11926
11927void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11928{
11929 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11930 << " op " << EFragment::op_name(op) << dendl;
11931 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11932 if (it != uncommitted_fragments.end()) {
11933 ufragment& uf = it->second;
11934 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11935 uf.committed = true;
11936 } else {
11937 uf.ls->uncommitted_fragments.erase(basedirfrag);
11938 mds->queue_waiters(uf.waiters);
11939 uncommitted_fragments.erase(it);
11940 }
11941 }
11942}
11943
11fdf7f2 11944void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
7c673cae
FG
11945{
11946 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11947 << " old_frags (" << old_frags << ")" << dendl;
11948 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11949 if (it != uncommitted_fragments.end()) {
11950 ufragment& uf = it->second;
11951 if (!uf.old_frags.empty()) {
11fdf7f2 11952 uf.old_frags = std::move(old_frags);
7c673cae
FG
11953 uf.committed = true;
11954 } else {
11955 uf.ls->uncommitted_fragments.erase(basedirfrag);
11956 uncommitted_fragments.erase(it);
11957 }
11958 }
11959}
11960
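// Resolve whatever is left in uncommitted_fragments (typically after journal
// replay): entries whose commit was already journaled are simply
// re-committed so their stale objects get purged; the rest are rolled back
// by recreating the original frags from the saved dirfrag_rollback fnodes
// and journaling EFragment::OP_ROLLBACK.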
11961void MDCache::rollback_uncommitted_fragments()
11962{
11963 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11964 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11965 p != uncommitted_fragments.end();
11966 ++p) {
11967 ufragment &uf = p->second;
11968 CInode *diri = get_inode(p->first.ino);
11fdf7f2 11969 ceph_assert(diri);
7c673cae
FG
11970
11971 if (uf.committed) {
a8e16298 11972 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
11973 continue;
11974 }
11975
11976 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11977
11978 LogSegment *ls = mds->mdlog->get_current_segment();
11979 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11980 mds->mdlog->start_entry(le);
11981 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11982
11fdf7f2 11983 frag_vec_t old_frags;
7c673cae
FG
11984 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11985
11986 list<CDir*> resultfrags;
11987 if (uf.old_frags.empty()) {
11988 // created by old format EFragment
11fdf7f2 11989 MDSContext::vec waiters;
7c673cae
FG
11990 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11991 } else {
11fdf7f2
TL
11992 auto bp = uf.rollback.cbegin();
11993 for (const auto& fg : uf.old_frags) {
11994 CDir *dir = force_dir_fragment(diri, fg);
7c673cae
FG
11995 resultfrags.push_back(dir);
11996
11997 dirfrag_rollback rollback;
11fdf7f2 11998 decode(rollback, bp);
7c673cae
FG
11999
12000 dir->set_version(rollback.fnode.version);
12001 dir->fnode = rollback.fnode;
12002
12003 dir->_mark_dirty(ls);
12004
12005 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
12006 dout(10) << " dirty nestinfo on " << *dir << dendl;
12007 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
12008 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
12009 }
12010 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
12011 dout(10) << " dirty fragstat on " << *dir << dendl;
12012 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
12013 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
12014 }
12015
12016 le->add_orig_frag(dir->get_frag());
12017 le->metablob.add_dir_context(dir);
12018 if (diri_auth) {
12019 le->metablob.add_fragmented_dir(dir, true, false);
12020 } else {
12021 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12022 dir->state_set(CDir::STATE_DIRTYDFT);
12023 le->metablob.add_fragmented_dir(dir, true, true);
12024 }
12025 }
12026 }
12027
12028 if (diri_auth) {
94b18763
FG
12029 auto &pi = diri->project_inode();
12030 pi.inode.version = diri->pre_dirty();
7c673cae
FG
12031 diri->pop_and_dirty_projected_inode(ls); // hacky
12032 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12033 } else {
12034 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12035 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12036 }
12037
11fdf7f2 12038 if (g_conf()->mds_debug_frag)
7c673cae
FG
12039 diri->verify_dirfrags();
12040
11fdf7f2
TL
12041 for (const auto& leaf : old_frags) {
12042 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12043 }
7c673cae 12044
7c673cae
FG
12045 mds->mdlog->submit_entry(le);
12046
12047 uf.old_frags.swap(old_frags);
a8e16298 12048 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
12049 }
12050}
12051
12052void MDCache::force_readonly()
12053{
12054 if (is_readonly())
12055 return;
12056
12057 dout(1) << "force file system read-only" << dendl;
12058 mds->clog->warn() << "force file system read-only";
12059
12060 set_readonly();
12061
12062 mds->server->force_clients_readonly();
12063
12064 // revoke write caps
81eedcae 12065 int count = 0;
94b18763 12066 for (auto &p : inode_map) {
b32b8144 12067 CInode *in = p.second;
7c673cae
FG
12068 if (in->is_head())
12069 mds->locker->eval(in, CEPH_CAP_LOCKS);
81eedcae
TL
12070 if (!(++count % 1000))
12071 mds->heartbeat_reset();
7c673cae
FG
12072 }
12073
12074 mds->mdlog->flush();
12075}
12076
12077
12078// ==============================================================
12079// debug crap
12080
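// Print the subtree map at debug level dbl (bumped by 15 when export
// thrashing is enabled). To keep logs readable, the dump is skipped once the
// map exceeds SUBTREES_COUNT_THRESHOLD subtrees or SUBTREES_DEPTH_THRESHOLD
// nesting depth, unless force_print is set or the mds subsystem gathers at
// level 25.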
81eedcae 12081void MDCache::show_subtrees(int dbl, bool force_print)
7c673cae 12082{
11fdf7f2 12083 if (g_conf()->mds_thrash_exports)
7c673cae
FG
12084 dbl += 15;
12085
12086 //dout(10) << "show_subtrees" << dendl;
12087
11fdf7f2 12088 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
7c673cae
FG
12089 return; // I won't print anything.
12090
12091 if (subtrees.empty()) {
11fdf7f2
TL
12092 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12093 << dendl;
7c673cae
FG
12094 return;
12095 }
12096
81eedcae
TL
12097 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12098 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12099 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12100 "printing subtrees" << dendl;
12101 return;
12102 }
12103
7c673cae
FG
12104 // root frags
12105 list<CDir*> basefrags;
12106 for (set<CInode*>::iterator p = base_inodes.begin();
12107 p != base_inodes.end();
12108 ++p)
12109 (*p)->get_dirfrags(basefrags);
12110 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12111 dout(15) << "show_subtrees" << dendl;
12112
12113 // queue stuff
12114 list<pair<CDir*,int> > q;
12115 string indent;
12116 set<CDir*> seen;
12117
12118 // calc max depth
12119 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
12120 q.push_back(pair<CDir*,int>(*p, 0));
12121
12122 set<CDir*> subtrees_seen;
12123
81eedcae 12124 unsigned int depth = 0;
7c673cae
FG
12125 while (!q.empty()) {
12126 CDir *dir = q.front().first;
81eedcae 12127 unsigned int d = q.front().second;
7c673cae
FG
12128 q.pop_front();
12129
12130 if (subtrees.count(dir) == 0) continue;
12131
12132 subtrees_seen.insert(dir);
12133
12134 if (d > depth) depth = d;
12135
12136 // sanity check
12137 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12138 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11fdf7f2 12139 ceph_assert(seen.count(dir) == 0);
7c673cae
FG
12140 seen.insert(dir);
12141
12142 // nested items?
12143 if (!subtrees[dir].empty()) {
12144 for (set<CDir*>::iterator p = subtrees[dir].begin();
12145 p != subtrees[dir].end();
12146 ++p) {
12147 //dout(25) << " saw sub " << **p << dendl;
12148 q.push_front(pair<CDir*,int>(*p, d+1));
12149 }
12150 }
12151 }
12152
81eedcae
TL
12153 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12154 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12155 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12156 "subtrees" << dendl;
12157 return;
12158 }
7c673cae
FG
12159
12160 // print tree
12161 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
12162 q.push_back(pair<CDir*,int>(*p, 0));
12163
12164 while (!q.empty()) {
12165 CDir *dir = q.front().first;
12166 int d = q.front().second;
12167 q.pop_front();
12168
12169 if (subtrees.count(dir) == 0) continue;
12170
12171 // adjust indenter
12172 while ((unsigned)d < indent.size())
12173 indent.resize(d);
12174
12175 // pad
12176 string pad = "______________________________________";
12177 pad.resize(depth*2+1-indent.size());
12178 if (!subtrees[dir].empty())
12179 pad[0] = '.'; // parent
12180
12181
12182 string auth;
12183 if (dir->is_auth())
12184 auth = "auth ";
12185 else
12186 auth = " rep ";
12187
12188 char s[10];
12189 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12190 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12191 else
12192 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12193
12194 // print
11fdf7f2
TL
12195 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12196 << " " << auth << *dir << dendl;
7c673cae
FG
12197
12198 if (dir->ino() == MDS_INO_ROOT)
11fdf7f2 12199 ceph_assert(dir->inode == root);
7c673cae 12200 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11fdf7f2 12201 ceph_assert(dir->inode == myin);
7c673cae 12202 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11fdf7f2 12203 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
7c673cae
FG
12204
12205 // nested items?
12206 if (!subtrees[dir].empty()) {
12207 // more at my level?
12208 if (!q.empty() && q.front().second == d)
12209 indent += "| ";
12210 else
12211 indent += " ";
12212
12213 for (set<CDir*>::iterator p = subtrees[dir].begin();
12214 p != subtrees[dir].end();
12215 ++p)
12216 q.push_front(pair<CDir*,int>(*p, d+2));
12217 }
12218 }
12219
12220 // verify there isn't stray crap in subtree map
12221 int lost = 0;
12222 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12223 p != subtrees.end();
12224 ++p) {
12225 if (subtrees_seen.count(p->first)) continue;
12226 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12227 lost++;
12228 }
11fdf7f2 12229 ceph_assert(lost == 0);
7c673cae
FG
12230}
12231
7c673cae
FG
12232void MDCache::show_cache()
12233{
12234 dout(7) << "show_cache" << dendl;
b32b8144
FG
12235
12236 auto show_func = [this](CInode *in) {
7c673cae 12237 // unlinked?
b32b8144
FG
12238 if (!in->parent)
12239 dout(7) << " unlinked " << *in << dendl;
12240
7c673cae
FG
12241 // dirfrags?
12242 list<CDir*> dfs;
b32b8144 12243 in->get_dirfrags(dfs);
7c673cae
FG
12244 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12245 CDir *dir = *p;
12246 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 12247
94b18763
FG
12248 for (auto &p : dir->items) {
12249 CDentry *dn = p.second;
7c673cae
FG
12250 dout(7) << " dentry " << *dn << dendl;
12251 CDentry::linkage_t *dnl = dn->get_linkage();
12252 if (dnl->is_primary() && dnl->get_inode())
12253 dout(7) << " inode " << *dnl->get_inode() << dendl;
12254 }
12255 }
b32b8144
FG
12256 };
12257
94b18763 12258 for (auto &p : inode_map)
b32b8144 12259 show_func(p.second);
94b18763 12260 for (auto &p : snap_inode_map)
b32b8144 12261 show_func(p.second);
7c673cae
FG
12262}
12263
f64942e4 12264void MDCache::cache_status(Formatter *f)
181888fb
FG
12265{
12266 f->open_object_section("cache");
12267
12268 f->open_object_section("pool");
12269 mempool::get_pool(mempool::mds_co::id).dump(f);
12270 f->close_section();
12271
12272 f->close_section();
181888fb
FG
12273}
12274
11fdf7f2 12275void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
7c673cae 12276{
11fdf7f2
TL
12277 ceph_assert(in);
12278 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12279 return;
12280 }
12281 list<CDir*> ls;
12282 in->get_dirfrags(ls);
12283 for (const auto &subdir : ls) {
12284 for (const auto &p : subdir->items) {
12285 CDentry *dn = p.second;
12286 CInode *in = dn->get_linkage()->get_inode();
12287 if (in) {
12288 dump_tree(in, cur_depth + 1, max_depth, f);
12289 }
12290 }
12291 }
12292 f->open_object_section("inode");
12293 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12294 f->close_section();
7c673cae
FG
12295}
12296
11fdf7f2 12297int MDCache::dump_cache(std::string_view file_name)
7c673cae 12298{
11fdf7f2 12299 return dump_cache(file_name, NULL);
7c673cae
FG
12300}
12301
11fdf7f2 12302int MDCache::dump_cache(Formatter *f)
7c673cae 12303{
11fdf7f2 12304 return dump_cache(std::string_view(""), f);
7c673cae
FG
12305}
12306
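/*
 * This path is typically what the MDS admin-socket "dump cache" style
 * commands end up calling (exact command names vary by release). Note the
 * guard below: when cache usage exceeds mds_dump_cache_threshold_formatter
 * (Formatter output) or mds_dump_cache_threshold_file (file output), the
 * dump is refused instead of risking a stalled or OOM-killed MDS.
 */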
12307/**
12308 * Dump the metadata cache, either to a Formatter, if
12309 * provided, else to a plain text file.
12310 */
11fdf7f2 12311int MDCache::dump_cache(std::string_view fn, Formatter *f)
7c673cae
FG
12312{
12313 int r = 0;
f64942e4
AA
12314
12315 // dumping large caches may cause mds to hang or worse get killed.
12316 // so, disallow the dump if the cache size exceeds the configured
12317 // threshold, which is 1G for formatter and unlimited for file (note
12318 // that this can be jacked up by the admin... and is nothing but foot
12319 // shooting, but the option itself is for devs and hence dangerous to
12320 // tune). TODO: remove this when fixed.
12321 uint64_t threshold = f ?
11fdf7f2
TL
12322 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12323 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
f64942e4
AA
12324
12325 if (threshold && cache_size() > threshold) {
12326 if (f) {
12327 std::stringstream ss;
12328 ss << "cache usage exceeds dump threshold";
12329 f->open_object_section("result");
12330 f->dump_string("error", ss.str());
12331 f->close_section();
12332 } else {
12333 derr << "cache usage exceeds dump threshold" << dendl;
12334 r = -EINVAL;
12335 }
12336 return r;
12337 }
12338
12339 r = 0;
7c673cae
FG
12340 int fd = -1;
12341
12342 if (f) {
12343 f->open_array_section("inodes");
12344 } else {
94b18763
FG
12345 char path[PATH_MAX] = "";
12346 if (fn.length()) {
12347 snprintf(path, sizeof path, "%s", fn.data());
12348 } else {
12349 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
12350 }
12351
94b18763 12352 dout(1) << "dump_cache to " << path << dendl;
7c673cae 12353
91327a77 12354 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
7c673cae 12355 if (fd < 0) {
94b18763 12356 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 12357 return errno;
7c673cae
FG
12358 }
12359 }
12360
11fdf7f2 12361 auto dump_func = [fd, f](CInode *in) {
b32b8144 12362 int r;
7c673cae
FG
12363 if (f) {
12364 f->open_object_section("inode");
11fdf7f2
TL
12365 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12366 f->close_section();
12367 return 1;
12368 }
12369 ostringstream ss;
12370 ss << *in << std::endl;
12371 std::string s = ss.str();
12372 r = safe_write(fd, s.c_str(), s.length());
12373 if (r < 0)
12374 return r;
7c673cae
FG
12375 list<CDir*> dfs;
12376 in->get_dirfrags(dfs);
11fdf7f2
TL
12377 for (auto &dir : dfs) {
12378 ostringstream tt;
12379 tt << " " << *dir << std::endl;
12380 std::string t = tt.str();
12381 r = safe_write(fd, t.c_str(), t.length());
12382 if (r < 0)
12383 return r;
94b18763
FG
12384 for (auto &p : dir->items) {
12385 CDentry *dn = p.second;
11fdf7f2
TL
12386 ostringstream uu;
12387 uu << " " << *dn << std::endl;
12388 std::string u = uu.str();
12389 r = safe_write(fd, u.c_str(), u.length());
12390 if (r < 0)
12391 return r;
7c673cae
FG
12392 }
12393 dir->check_rstats();
7c673cae 12394 }
b32b8144
FG
12395 return 1;
12396 };
12397
94b18763 12398 for (auto &p : inode_map) {
b32b8144
FG
12399 r = dump_func(p.second);
12400 if (r < 0)
12401 goto out;
12402 }
94b18763 12403 for (auto &p : snap_inode_map) {
b32b8144
FG
12404 r = dump_func(p.second);
12405 if (r < 0)
12406 goto out;
7c673cae 12407 }
b32b8144 12408 r = 0;
7c673cae
FG
12409
12410 out:
12411 if (f) {
12412 f->close_section(); // inodes
12413 } else {
12414 ::close(fd);
12415 }
31f18b77 12416 return r;
7c673cae
FG
12417}
12418
12419
12420
12421C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12422 : MDSInternalContext(c->mds), cache(c), mdr(r)
12423{}
12424
12425void C_MDS_RetryRequest::finish(int r)
12426{
12427 mdr->retry++;
12428 cache->dispatch_request(mdr);
12429}
12430
12431
12432class C_MDS_EnqueueScrub : public Context
12433{
11fdf7f2 12434 std::string tag;
7c673cae
FG
12435 Formatter *formatter;
12436 Context *on_finish;
12437public:
12438 ScrubHeaderRef header;
11fdf7f2
TL
12439 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12440 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
7c673cae
FG
12441
12442 Context *take_finisher() {
12443 Context *fin = on_finish;
12444 on_finish = NULL;
12445 return fin;
12446 }
12447
12448 void finish(int r) override {
11fdf7f2
TL
12449 if (r == 0) {
12450 // since recursive scrub is asynchronous, dump minimal output
12451 // to not upset cli tools.
12452 if (header && header->get_recursive()) {
12453 formatter->open_object_section("results");
12454 formatter->dump_int("return_code", 0);
12455 formatter->dump_string("scrub_tag", tag);
12456 formatter->dump_string("mode", "asynchronous");
12457 formatter->close_section(); // results
12458 }
12459 } else { // we failed the lookup or something; dump ourselves
7c673cae
FG
12460 formatter->open_object_section("results");
12461 formatter->dump_int("return_code", r);
12462 formatter->close_section(); // results
11fdf7f2 12463 r = 0; // already dumped in formatter
7c673cae
FG
12464 }
12465 if (on_finish)
12466 on_finish->complete(r);
12467 }
12468};
12469
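// Entry point for scrub requests: wrap the operation in an internal
// CEPH_MDS_OP_ENQUEUE_SCRUB MDRequest. "~mdsdir" is mapped to this rank's
// mdsdir inode, an empty tag is replaced with a freshly generated uuid (and
// flagged as internal), and the Formatter/Context pair is handed to
// C_MDS_EnqueueScrub so results are dumped when the request completes.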
12470void MDCache::enqueue_scrub(
11fdf7f2
TL
12471 std::string_view path,
12472 std::string_view tag,
7c673cae
FG
12473 bool force, bool recursive, bool repair,
12474 Formatter *f, Context *fin)
12475{
11fdf7f2 12476 dout(10) << __func__ << " " << path << dendl;
7c673cae 12477 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
11fdf7f2
TL
12478 if (path == "~mdsdir") {
12479 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12480 mdr->set_filepath(fp);
12481 } else {
12482 filepath fp(path);
12483 mdr->set_filepath(path);
12484 }
12485
12486 bool is_internal = false;
12487 std::string tag_str(tag);
12488 if (tag_str.empty()) {
12489 uuid_d uuid_gen;
12490 uuid_gen.generate_random();
12491 tag_str = uuid_gen.to_string();
12492 is_internal = true;
12493 }
7c673cae 12494
11fdf7f2 12495 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
7c673cae 12496 cs->header = std::make_shared<ScrubHeader>(
11fdf7f2 12497 tag_str, is_internal, force, recursive, repair, f);
7c673cae
FG
12498
12499 mdr->internal_op_finish = cs;
12500 enqueue_scrub_work(mdr);
12501}
12502
12503void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12504{
11fdf7f2
TL
12505 MutationImpl::LockOpVec lov;
12506 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
7c673cae
FG
12507 if (NULL == in)
12508 return;
12509
12510 // TODO: Remove this restriction
11fdf7f2 12511 ceph_assert(in->is_auth());
7c673cae 12512
11fdf7f2 12513 bool locked = mds->locker->acquire_locks(mdr, lov);
7c673cae
FG
12514 if (!locked)
12515 return;
12516
12517 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
11fdf7f2 12518 ScrubHeaderRef header = cs->header;
7c673cae
FG
12519
12520 // Cannot scrub same dentry twice at same time
11fdf7f2 12521 if (in->scrub_is_in_progress()) {
7c673cae
FG
12522 mds->server->respond_to_request(mdr, -EBUSY);
12523 return;
12524 } else {
12525 in->scrub_info();
12526 }
12527
12528 header->set_origin(in);
12529
11fdf7f2
TL
12530 Context *fin;
12531 if (header->get_recursive()) {
12532 header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
12533 fin = new MDSInternalContextWrapper(mds,
12534 new FunctionContext([this, header](int r) {
12535 recursive_scrub_finish(header);
12536 header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
12537 })
12538 );
12539 } else {
b32b8144
FG
12540 fin = cs->take_finisher();
12541 }
12542
12543 // If the scrub did some repair, then flush the journal at the end of
12544 // the scrub. Otherwise in the case of e.g. rewriting a backtrace
12545 // the on disk state will still look damaged.
28e407b8
AA
12546 auto scrub_finish = new FunctionContext([this, header, fin](int r){
12547 if (!header->get_repaired()) {
12548 if (fin)
12549 fin->complete(r);
12550 return;
12551 }
12552
12553 auto flush_finish = new FunctionContext([this, fin](int r){
12554 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12555 mds->mdlog->trim_all();
12556
12557 if (fin) {
12558 MDSGatherBuilder gather(g_ceph_context);
12559 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12560 for (auto logseg : expiring_segments)
12561 logseg->wait_for_expiry(gather.new_sub());
11fdf7f2 12562 ceph_assert(gather.has_subs());
28e407b8
AA
12563 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12564 gather.activate();
b32b8144 12565 }
28e407b8
AA
12566 });
12567
12568 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12569 mds->mdlog->start_new_segment();
12570 mds->mdlog->flush();
12571 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12572 });
12573
7c673cae 12574 if (!header->get_recursive()) {
7c673cae 12575 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12576 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12577 } else {
12578 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12579 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12580 }
7c673cae
FG
12581
12582 mds->server->respond_to_request(mdr, 0);
12583 return;
12584}
12585
11fdf7f2
TL
12586void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12587{
12588 if (header->get_origin()->is_base() &&
12589 header->get_force() && header->get_repair()) {
12590 // notify snapserver that base directory is recursively scrubbed.
12591 // After both root and mdsdir are recursively scrubbed, snapserver
12592 // knows that all old format snaprealms are converted to the new
12593 // format.
12594 if (mds->mdsmap->get_num_in_mds() == 1 &&
12595 mds->mdsmap->get_num_failed_mds() == 0 &&
12596 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12597 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12598 }
12599 }
12600}
12601
12602struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
7c673cae 12603 MDRequestRef mdr;
11fdf7f2 12604 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
7c673cae
FG
12605 MDCacheLogContext(c), mdr(m) {}
12606 void finish(int r) override {
12607 mdr->apply();
12608 get_mds()->server->respond_to_request(mdr, r);
12609 }
12610};
12611
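// Recompute a single dirfrag's fragstat/rstat from its dentries. The work
// function below wrlocks the inode's nestlock/filelock, fetches the dirfrag
// if it is not complete, rebuilds frag_info/nest_info from the projected
// linkages, and only journals an EUpdate("repair_dirfrag") when the
// recomputed sums differ from the projected fnode.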
12612void MDCache::repair_dirfrag_stats(CDir *dir)
12613{
12614 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12615 mdr->pin(dir);
12616 mdr->internal_op_private = dir;
12617 mdr->internal_op_finish = new C_MDSInternalNoop;
12618 repair_dirfrag_stats_work(mdr);
12619}
12620
12621void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12622{
12623 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12624 dout(10) << __func__ << " " << *dir << dendl;
12625
12626 if (!dir->is_auth()) {
12627 mds->server->respond_to_request(mdr, -ESTALE);
12628 return;
12629 }
12630
12631 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12632 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12633
7c673cae
FG
12634 mds->locker->drop_locks(mdr.get());
12635 mdr->drop_local_auth_pins();
224ce89b
WB
12636 if (!mdr->remote_auth_pins.empty())
12637 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12638 return;
12639 }
12640
12641 mdr->auth_pin(dir);
12642
11fdf7f2 12643 MutationImpl::LockOpVec lov;
7c673cae 12644 CInode *diri = dir->inode;
11fdf7f2
TL
12645 lov.add_rdlock(&diri->dirfragtreelock);
12646 lov.add_wrlock(&diri->nestlock);
12647 lov.add_wrlock(&diri->filelock);
12648 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12649 return;
12650
12651 if (!dir->is_complete()) {
12652 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12653 return;
12654 }
12655
12656 frag_info_t frag_info;
12657 nest_info_t nest_info;
94b18763 12658 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12659 CDentry *dn = it->second;
12660 if (dn->last != CEPH_NOSNAP)
12661 continue;
12662 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12663 if (dnl->is_primary()) {
12664 CInode *in = dnl->get_inode();
12665 nest_info.add(in->get_projected_inode()->accounted_rstat);
12666 if (in->is_dir())
12667 frag_info.nsubdirs++;
12668 else
12669 frag_info.nfiles++;
12670 } else if (dnl->is_remote())
12671 frag_info.nfiles++;
12672 }
12673
12674 fnode_t *pf = dir->get_projected_fnode();
12675 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12676 bool good_rstat = nest_info.same_sums(pf->rstat);
12677 if (good_fragstat && good_rstat) {
12678 dout(10) << __func__ << " no corruption found" << dendl;
12679 mds->server->respond_to_request(mdr, 0);
12680 return;
12681 }
12682
12683 pf = dir->project_fnode();
12684 pf->version = dir->pre_dirty();
12685 mdr->add_projected_fnode(dir);
12686
12687 mdr->ls = mds->mdlog->get_current_segment();
12688 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12689 mds->mdlog->start_entry(le);
12690
12691 if (!good_fragstat) {
12692 if (pf->fragstat.mtime > frag_info.mtime)
12693 frag_info.mtime = pf->fragstat.mtime;
12694 if (pf->fragstat.change_attr > frag_info.change_attr)
12695 frag_info.change_attr = pf->fragstat.change_attr;
12696 pf->fragstat = frag_info;
12697 mds->locker->mark_updated_scatterlock(&diri->filelock);
12698 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12699 mdr->add_updated_lock(&diri->filelock);
12700 }
12701
12702 if (!good_rstat) {
12703 if (pf->rstat.rctime > nest_info.rctime)
12704 nest_info.rctime = pf->rstat.rctime;
12705 pf->rstat = nest_info;
12706 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12707 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12708 mdr->add_updated_lock(&diri->nestlock);
12709 }
12710
12711 le->metablob.add_dir_context(dir);
12712 le->metablob.add_dir(dir, true);
12713
11fdf7f2 12714 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
7c673cae
FG
12715}
12716
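// Repair an inode's dirstat/rstat by forcing a scatter-gather: every dirfrag
// is opened/fetched, filelock and nestlock are marked dirty under wrlocks,
// then rdlocked again so the accounted values propagate back to the inode.
// The recomputed sums are finally compared against inode.dirstat/rstat and a
// remaining mismatch is only logged, not asserted.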
12717void MDCache::repair_inode_stats(CInode *diri)
12718{
12719 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12720 mdr->pin(diri);
12721 mdr->internal_op_private = diri;
12722 mdr->internal_op_finish = new C_MDSInternalNoop;
12723 repair_inode_stats_work(mdr);
12724}
12725
12726void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12727{
12728 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12729 dout(10) << __func__ << " " << *diri << dendl;
12730
12731 if (!diri->is_auth()) {
12732 mds->server->respond_to_request(mdr, -ESTALE);
12733 return;
12734 }
12735 if (!diri->is_dir()) {
12736 mds->server->respond_to_request(mdr, -ENOTDIR);
12737 return;
12738 }
12739
11fdf7f2 12740 MutationImpl::LockOpVec lov;
7c673cae
FG
12741
12742 if (mdr->ls) // already marked filelock/nestlock dirty ?
12743 goto do_rdlocks;
12744
11fdf7f2
TL
12745 lov.add_rdlock(&diri->dirfragtreelock);
12746 lov.add_wrlock(&diri->nestlock);
12747 lov.add_wrlock(&diri->filelock);
12748 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12749 return;
12750
12751 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12752 // the scatter-gather process, which will fix any fragstat/rstat errors.
11fdf7f2
TL
12753 {
12754 frag_vec_t leaves;
12755 diri->dirfragtree.get_leaves(leaves);
12756 for (const auto& leaf : leaves) {
12757 CDir *dir = diri->get_dirfrag(leaf);
12758 if (!dir) {
12759 ceph_assert(mdr->is_auth_pinned(diri));
12760 dir = diri->get_or_open_dirfrag(this, leaf);
12761 }
12762 if (dir->get_version() == 0) {
12763 ceph_assert(dir->is_auth());
12764 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12765 return;
12766 }
7c673cae
FG
12767 }
12768 }
12769
12770 diri->state_set(CInode::STATE_REPAIRSTATS);
12771 mdr->ls = mds->mdlog->get_current_segment();
12772 mds->locker->mark_updated_scatterlock(&diri->filelock);
12773 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12774 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12775 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12776
12777 mds->locker->drop_locks(mdr.get());
12778
12779do_rdlocks:
12780 // force the scatter-gather process
11fdf7f2
TL
12781 lov.clear();
12782 lov.add_rdlock(&diri->dirfragtreelock);
12783 lov.add_rdlock(&diri->nestlock);
12784 lov.add_rdlock(&diri->filelock);
12785 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12786 return;
12787
12788 diri->state_clear(CInode::STATE_REPAIRSTATS);
12789
12790 frag_info_t dir_info;
12791 nest_info_t nest_info;
11fdf7f2
TL
12792 nest_info.rsubdirs = 1; // it gets one to account for self
12793 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
12794 nest_info.rsnaps = srnode->snaps.size();
7c673cae 12795
11fdf7f2
TL
12796 {
12797 frag_vec_t leaves;
12798 diri->dirfragtree.get_leaves(leaves);
12799 for (const auto& leaf : leaves) {
12800 CDir *dir = diri->get_dirfrag(leaf);
12801 ceph_assert(dir);
12802 ceph_assert(dir->get_version() > 0);
12803 dir_info.add(dir->fnode.accounted_fragstat);
12804 nest_info.add(dir->fnode.accounted_rstat);
12805 }
7c673cae
FG
12806 }
12807
12808 if (!dir_info.same_sums(diri->inode.dirstat) ||
12809 !nest_info.same_sums(diri->inode.rstat)) {
12810 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12811 << *diri << dendl;
12812 }
12813
12814 mds->server->respond_to_request(mdr, 0);
12815}
12816
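// Convert an inode's snaprealm from the older on-disk format to the current
// one: xlock the snaplock, project the inode (project_inode(false, true)
// upgrades the snaprealm format) and journal the change as an
// EUpdate("upgrade_snaprealm").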
11fdf7f2
TL
12817void MDCache::upgrade_inode_snaprealm(CInode *in)
12818{
12819 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
12820 mdr->pin(in);
12821 mdr->internal_op_private = in;
12822 mdr->internal_op_finish = new C_MDSInternalNoop;
12823 upgrade_inode_snaprealm_work(mdr);
12824}
12825
12826void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
12827{
12828 CInode *in = static_cast<CInode*>(mdr->internal_op_private);
12829 dout(10) << __func__ << " " << *in << dendl;
12830
12831 if (!in->is_auth()) {
12832 mds->server->respond_to_request(mdr, -ESTALE);
12833 return;
12834 }
12835
12836 MutationImpl::LockOpVec lov;
12837 mds->locker->include_snap_rdlocks(in, lov);
12838 lov.erase_rdlock(&in->snaplock);
12839 lov.add_xlock(&in->snaplock);
12840
12841 if (!mds->locker->acquire_locks(mdr, lov))
12842 return;
12843
12844 // project_snaprealm() upgrades snaprealm format
12845 auto &pi = in->project_inode(false, true);
12846 mdr->add_projected_inode(in);
12847 pi.inode.version = in->pre_dirty();
12848
12849 mdr->ls = mds->mdlog->get_current_segment();
12850 EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
12851 mds->mdlog->start_entry(le);
12852
12853 if (in->is_base()) {
12854 le->metablob.add_root(true, in);
12855 } else {
12856 CDentry *pdn = in->get_projected_parent_dn();
12857 le->metablob.add_dir_context(pdn->get_dir());
12858 le->metablob.add_primary_dentry(pdn, in, true);
12859 }
12860
12861 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
12862}
12863
12864void MDCache::flush_dentry(std::string_view path, Context *fin)
7c673cae
FG
12865{
12866 if (is_readonly()) {
12867 dout(10) << __func__ << ": read-only FS" << dendl;
12868 fin->complete(-EROFS);
12869 return;
12870 }
12871 dout(10) << "flush_dentry " << path << dendl;
12872 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 12873 filepath fp(path);
7c673cae
FG
12874 mdr->set_filepath(fp);
12875 mdr->internal_op_finish = fin;
12876 flush_dentry_work(mdr);
12877}
12878
11fdf7f2 12879class C_FinishIOMDR : public MDSContext {
7c673cae
FG
12880protected:
12881 MDSRank *mds;
12882 MDRequestRef mdr;
12883 MDSRank *get_mds() override { return mds; }
12884public:
12885 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12886 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12887};
12888
12889void MDCache::flush_dentry_work(MDRequestRef& mdr)
12890{
11fdf7f2
TL
12891 MutationImpl::LockOpVec lov;
12892 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
7c673cae
FG
12893 if (NULL == in)
12894 return;
12895
12896 // TODO: Is this necessary? Fix it if so
11fdf7f2
TL
12897 ceph_assert(in->is_auth());
12898 bool locked = mds->locker->acquire_locks(mdr, lov);
7c673cae
FG
12899 if (!locked)
12900 return;
12901 in->flush(new C_FinishIOMDR(mds, mdr));
12902}
12903
12904
12905/**
12906 * Initialize performance counters with global perfcounter
12907 * collection.
12908 */
12909void MDCache::register_perfcounters()
12910{
91327a77
AA
12911 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
12912
12913 // Stray/purge statistics
12914 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
12915 PerfCountersBuilder::PRIO_INTERESTING);
12916 pcb.add_u64(l_mdc_num_recovering_enqueued,
12917 "num_recovering_enqueued", "Files waiting for recovery", "recy",
12918 PerfCountersBuilder::PRIO_INTERESTING);
12919 pcb.add_u64_counter(l_mdc_recovery_completed,
12920 "recovery_completed", "File recoveries completed", "recd",
12921 PerfCountersBuilder::PRIO_INTERESTING);
12922
12923 // useful recovery queue statistics
12924 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
12925 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
12926 "Files currently being recovered");
12927 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
12928 "Files waiting for recovery with elevated priority");
12929 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
12930 "File recoveries started");
12931
12932 // along with other stray dentries stats
12933 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
12934 "Stray dentries delayed");
12935 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
12936 "Stray dentries enqueuing for purge");
12937 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
12938 "Stray dentries created");
7c673cae 12939 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
91327a77
AA
12940 "Stray dentries enqueued for purge");
12941 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
12942 "Stray dentries reintegrated");
12943 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
12944 "Stray dentries migrated");
7c673cae 12945
91327a77 12946 // low prio internal request stats
d2e6a577 12947 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
91327a77 12948 "Internal Request type enqueue scrub");
d2e6a577 12949 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
91327a77 12950 "Internal Request type export dir");
d2e6a577 12951 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
91327a77 12952 "Internal Request type flush");
d2e6a577 12953 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
91327a77 12954 "Internal Request type fragmentdir");
d2e6a577 12955 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
91327a77 12956 "Internal Request type frag stats");
d2e6a577 12957 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
91327a77 12958 "Internal Request type inode stats");
d2e6a577 12959
7c673cae
FG
12960 logger.reset(pcb.create_perf_counters());
12961 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12962 recovery_queue.set_logger(logger.get());
12963 stray_manager.set_logger(logger.get());
12964}
12965
7c673cae
FG
12966/**
12967 * Call this when putting references to an inode/dentry or
12968 * when attempting to trim it.
12969 *
12970 * If this inode is no longer linked by anyone, and this MDS
12971 * rank holds the primary dentry, and that dentry is in a stray
12972 * directory, then give up the dentry to the StrayManager, never
12973 * to be seen again by MDCache.
12974 *
12975 * @param delay if true, then purgeable inodes are stashed til
12976 * the next trim(), rather than being purged right
12977 * away.
12978 */
12979void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
12980 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12981 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12982 return;
224ce89b 12983
7c673cae
FG
12984 CDentry *dn = in->get_projected_parent_dn();
12985
12986 if (dn->state_test(CDentry::STATE_PURGING)) {
12987 /* We have already entered the purging process, no need
12988 * to re-evaluate me ! */
12989 return;
12990 }
12991
11fdf7f2
TL
12992 if (dn->get_dir()->get_inode()->is_stray()) {
12993 if (delay)
12994 stray_manager.queue_delayed(dn);
12995 else
12996 stray_manager.eval_stray(dn);
7c673cae
FG
12997 }
12998}
12999
31f18b77
FG
13000void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13001 dout(10) << __func__ << " " << *diri << dendl;
11fdf7f2 13002 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
31f18b77
FG
13003 list<CDir*> ls;
13004 diri->get_dirfrags(ls);
94b18763 13005 for (auto &p : ls) {
31f18b77
FG
13006 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13007 p->try_remove_dentries_for_stray();
13008 }
13009 if (!diri->snaprealm) {
13010 if (diri->is_auth())
13011 diri->clear_dirty_rstat();
13012 diri->clear_scatter_dirty();
13013 }
13014}
13015
11fdf7f2
TL
13016bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13017 CInode *in = get_inode(number);
13018 if (!in) {
13019 return false;
13020 }
13021 f->open_object_section("inode");
13022 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13023 f->close_section();
13024 return true;
13025}
eafe8130
TL
13026
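// Inodes whose export_pin refers to a rank not allowed by the current
// max_mds are parked on export_pin_delayed_queue; each time a new MDSMap
// arrives, entries whose pin now fits below max_mds are released and
// re-evaluated via maybe_export_pin().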
13027void MDCache::handle_mdsmap(const MDSMap &mdsmap) {
13028 // process export_pin_delayed_queue whenever a new MDSMap received
13029 auto &q = export_pin_delayed_queue;
13030 for (auto it = q.begin(); it != q.end(); ) {
13031 auto *in = *it;
13032 mds_rank_t export_pin = in->get_export_pin(false);
13033 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13034 << " max_mds=" << mdsmap.get_max_mds() << dendl;
13035 if (export_pin >= mdsmap.get_max_mds()) {
13036 it++;
13037 continue;
13038 }
13039
13040 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13041 it = q.erase(it);
13042 in->maybe_export_pin();
13043 }
13044}
13045