ceph/src/mds/MDCache.cc (blame view, ceph quincy 17.2.1)
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
f67539c2 16#include <ostream>
7c673cae 17#include <string>
11fdf7f2 18#include <string_view>
7c673cae
FG
19#include <map>
20
21#include "MDCache.h"
22#include "MDSRank.h"
23#include "Server.h"
24#include "Locker.h"
25#include "MDLog.h"
26#include "MDBalancer.h"
27#include "Migrator.h"
28#include "ScrubStack.h"
29
30#include "SnapClient.h"
31
32#include "MDSMap.h"
33
34#include "CInode.h"
35#include "CDir.h"
36
37#include "Mutation.h"
38
39#include "include/ceph_fs.h"
40#include "include/filepath.h"
181888fb 41#include "include/util.h"
7c673cae 42
11fdf7f2
TL
43#include "messages/MClientCaps.h"
44
7c673cae
FG
45#include "msg/Message.h"
46#include "msg/Messenger.h"
47
181888fb 48#include "common/MemoryModel.h"
7c673cae 49#include "common/errno.h"
7c673cae 50#include "common/perf_counters.h"
181888fb
FG
51#include "common/safe_io.h"
52
7c673cae
FG
53#include "osdc/Journaler.h"
54#include "osdc/Filer.h"
55
56#include "events/ESubtreeMap.h"
57#include "events/EUpdate.h"
f67539c2 58#include "events/EPeerUpdate.h"
7c673cae
FG
59#include "events/EImportFinish.h"
60#include "events/EFragment.h"
61#include "events/ECommitted.h"
9f95a23c 62#include "events/EPurged.h"
7c673cae
FG
63#include "events/ESessions.h"
64
7c673cae
FG
65#include "InoTable.h"
66
67#include "common/Timer.h"
68
69#include "perfglue/heap_profiler.h"
70
7c673cae
FG
71
72#include "common/config.h"
11fdf7f2 73#include "include/ceph_assert.h"
7c673cae
FG
74
75#define dout_context g_ceph_context
76#define dout_subsys ceph_subsys_mds
77#undef dout_prefix
78#define dout_prefix _prefix(_dout, mds)
20effc67
TL
79
80using namespace std;
81
7c673cae
FG
82static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
83 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
84}
85
86set<int> SimpleLock::empty_gather_set;
87
88
89/**
90 * All non-I/O contexts that require a reference
91 * to an MDCache instance descend from this.
92 */
11fdf7f2 93class MDCacheContext : public virtual MDSContext {
7c673cae
FG
94protected:
95 MDCache *mdcache;
96 MDSRank *get_mds() override
97 {
11fdf7f2 98 ceph_assert(mdcache != NULL);
7c673cae
FG
99 return mdcache->mds;
100 }
101public:
102 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
103};
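// Deriving from MDCacheContext (or the IO/log variants defined just below) is
// the usual way callbacks in this file keep a safe path back to the owning
// MDSRank. For a concrete in-tree example of the MDCacheLogContext flavour,
// see C_MDC_CreateSystemFile further down in this file.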
104
105
106/**
107 * Only for contexts called back from an I/O completion
108 *
109 * Note: duplication of members wrt MDCacheContext, because
110 * it's the lesser of two evils compared with introducing
111 * yet another piece of (multiple) inheritance.
112 */
113class MDCacheIOContext : public virtual MDSIOContextBase {
114protected:
115 MDCache *mdcache;
116 MDSRank *get_mds() override
117 {
11fdf7f2 118 ceph_assert(mdcache != NULL);
7c673cae
FG
119 return mdcache->mds;
120 }
121public:
91327a77
AA
122 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
123 MDSIOContextBase(track), mdcache(mdc_) {}
7c673cae
FG
124};
125
126class MDCacheLogContext : public virtual MDSLogContextBase {
127protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
11fdf7f2 131 ceph_assert(mdcache != NULL);
7c673cae
FG
132 return mdcache->mds;
133 }
134public:
135 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
136};
137
138MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
139 mds(m),
9f95a23c 140 open_file_table(m),
7c673cae 141 filer(m->objecter, m->finisher),
a8e16298 142 stray_manager(m, purge_queue_),
9f95a23c
TL
143 recovery_queue(m),
144 trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
7c673cae
FG
145{
146 migrator.reset(new Migrator(mds, this));
7c673cae 147
11fdf7f2
TL
148 max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
149 (g_conf()->mds_dir_max_commit_size << 20) :
150 (0.9 *(g_conf()->osd_max_write_size << 20));
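  // Worked example (values are illustrative, not authoritative defaults): with
  // mds_dir_max_commit_size unset (0) and osd_max_write_size = 90, this yields
  // 0.9 * (90 << 20) bytes, roughly 81 MiB per dirfrag commit; setting
  // mds_dir_max_commit_size = 10 would cap commits at 10 << 20 = 10 MiB.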
7c673cae 151
11fdf7f2
TL
152 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
153 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
154 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 155
f6b5b4d7
TL
156 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
157 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
158 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
159
20effc67
TL
160 symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
161
11fdf7f2 162 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
7c673cae 163
31f18b77
FG
164 bottom_lru.lru_set_midpoint(0);
165
11fdf7f2 166 decayrate.set_halflife(g_conf()->mds_decay_halflife);
7c673cae 167
b3b6e05e 168 upkeeper = std::thread(&MDCache::upkeep_main, this);
7c673cae
FG
169}
170
171MDCache::~MDCache()
172{
173 if (logger) {
174 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
175 }
eafe8130
TL
176 if (upkeeper.joinable())
177 upkeeper.join();
7c673cae
FG
178}
179
92f5a8d4 180void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
91327a77 181{
f6b5b4d7 182 dout(20) << "config changes: " << changed << dendl;
91327a77 183 if (changed.count("mds_cache_memory_limit"))
11fdf7f2 184 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
91327a77 185 if (changed.count("mds_cache_reservation"))
11fdf7f2 186 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
f67539c2
TL
187
188 bool ephemeral_pin_config_changed = false;
f6b5b4d7
TL
189 if (changed.count("mds_export_ephemeral_distributed")) {
190 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
191 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
192 /* copy to vector to avoid removals during iteration */
f67539c2 193 ephemeral_pin_config_changed = true;
f6b5b4d7
TL
194 }
195 if (changed.count("mds_export_ephemeral_random")) {
196 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
197 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
198 /* copy to vector to avoid removals during iteration */
f67539c2
TL
199 ephemeral_pin_config_changed = true;
200 }
201 if (ephemeral_pin_config_changed) {
f6b5b4d7 202 std::vector<CInode*> migrate;
f67539c2 203 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
f6b5b4d7 204 for (auto& in : migrate) {
f67539c2 205 in->maybe_export_pin(true);
f6b5b4d7 206 }
f6b5b4d7
TL
207 }
208 if (changed.count("mds_export_ephemeral_random_max")) {
209 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
210 }
91327a77 211 if (changed.count("mds_health_cache_threshold"))
11fdf7f2 212 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 213 if (changed.count("mds_cache_mid"))
11fdf7f2 214 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
a8e16298 215 if (changed.count("mds_cache_trim_decay_rate")) {
11fdf7f2 216 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
a8e16298 217 }
20effc67
TL
218 if (changed.count("mds_symlink_recovery")) {
219 symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
220 dout(10) << "Storing symlink targets on file object's head " << symlink_recovery << dendl;
221 }
7c673cae 222
92f5a8d4
TL
223 migrator->handle_conf_change(changed, mdsmap);
224 mds->balancer->handle_conf_change(changed, mdsmap);
91327a77 225}
7c673cae
FG
226
227void MDCache::log_stat()
228{
7c673cae
FG
229 mds->logger->set(l_mds_inodes, lru.lru_get_size());
230 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
231 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
232 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
233 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
234 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
235 mds->logger->set(l_mds_caps, Capability::count());
eafe8130 236 if (root) {
f67539c2
TL
237 mds->logger->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
238 mds->logger->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
239 mds->logger->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
eafe8130 240 }
7c673cae
FG
241}
242
243
244//
245
246bool MDCache::shutdown()
247{
eafe8130
TL
248 {
249 std::scoped_lock lock(upkeep_mutex);
250 upkeep_trim_shutdown = true;
251 upkeep_cvar.notify_one();
252 }
7c673cae
FG
253 if (lru.lru_get_size() > 0) {
254 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
255 //show_cache();
256 show_subtrees();
257 //dump();
258 }
259 return true;
260}
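// The block above is one half of a standard condition-variable shutdown
// handshake with the upkeep thread started in the constructor. A minimal
// sketch of the waiting side (illustrative only; the actual MDCache::upkeep_main
// is not shown in this excerpt and does more):
//
//   std::unique_lock lock(upkeep_mutex);
//   while (!upkeep_trim_shutdown) {
//     /* periodic cache trimming work */
//     upkeep_cvar.wait_for(lock, trim_interval);  // woken early by shutdown()
//   }
//
// Setting the flag and notifying while holding the mutex ensures the upkeep
// thread cannot miss the wakeup between testing the flag and blocking.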
261
262
263// ====================================================================
264// some inode functions
265
a4b75251 266void MDCache::add_inode(CInode *in)
7c673cae 267{
a4b75251 268 // add to inode map
b32b8144
FG
269 if (in->last == CEPH_NOSNAP) {
270 auto &p = inode_map[in->ino()];
11fdf7f2 271 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
272 p = in;
273 } else {
274 auto &p = snap_inode_map[in->vino()];
11fdf7f2 275 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
276 p = in;
277 }
7c673cae
FG
278
279 if (in->ino() < MDS_INO_SYSTEM_BASE) {
b3b6e05e 280 if (in->ino() == CEPH_INO_ROOT)
7c673cae
FG
281 root = in;
282 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
283 myin = in;
284 else if (in->is_stray()) {
285 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
286 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
287 }
288 }
289 if (in->is_base())
290 base_inodes.insert(in);
291 }
7c673cae
FG
292}
293
294void MDCache::remove_inode(CInode *o)
295{
296 dout(14) << "remove_inode " << *o << dendl;
297
298 if (o->get_parent_dn()) {
299 // FIXME: multiple parents?
300 CDentry *dn = o->get_parent_dn();
11fdf7f2 301 ceph_assert(!dn->is_dirty());
7c673cae
FG
302 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
303 }
304
305 if (o->is_dirty())
306 o->mark_clean();
307 if (o->is_dirty_parent())
308 o->clear_dirty_parent();
309
310 o->clear_scatter_dirty();
311
f91f0fd5
TL
312 o->clear_clientwriteable();
313
7c673cae
FG
314 o->item_open_file.remove_myself();
315
31f18b77
FG
316 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
317 export_pin_queue.erase(o);
7c673cae 318
eafe8130
TL
319 if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
320 export_pin_delayed_queue.erase(o);
321
f67539c2 322 o->clear_ephemeral_pin(true, true);
f6b5b4d7 323
7c673cae 324 // remove from inode map
11fdf7f2 325 if (o->last == CEPH_NOSNAP) {
b32b8144 326 inode_map.erase(o->ino());
11fdf7f2
TL
327 } else {
328 o->item_caps.remove_myself();
b32b8144 329 snap_inode_map.erase(o->vino());
11fdf7f2 330 }
7c673cae
FG
331
332 if (o->ino() < MDS_INO_SYSTEM_BASE) {
333 if (o == root) root = 0;
334 if (o == myin) myin = 0;
335 if (o->is_stray()) {
336 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
337 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
338 }
339 }
340 if (o->is_base())
341 base_inodes.erase(o);
11fdf7f2 342 }
7c673cae
FG
343
344 // delete it
11fdf7f2 345 ceph_assert(o->get_num_ref() == 0);
7c673cae
FG
346 delete o;
347}
348
349file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
350{
351 file_layout_t result = file_layout_t::get_default();
352 result.pool_id = mdsmap.get_first_data_pool();
353 return result;
354}
355
356file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
357{
358 file_layout_t result = file_layout_t::get_default();
359 result.pool_id = mdsmap.get_metadata_pool();
11fdf7f2
TL
360 if (g_conf()->mds_log_segment_size > 0) {
361 result.object_size = g_conf()->mds_log_segment_size;
362 result.stripe_unit = g_conf()->mds_log_segment_size;
7c673cae
FG
363 }
364 return result;
365}
366
367void MDCache::init_layouts()
368{
369 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
370 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
371}
372
f67539c2 373void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
7c673cae 374{
f67539c2
TL
375 auto _inode = in->_get_inode();
376 _inode->ino = ino;
377 _inode->version = 1;
378 _inode->xattr_version = 1;
379 _inode->mode = 0500 | mode;
380 _inode->size = 0;
381 _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
382 _inode->nlink = 1;
383 _inode->truncate_size = -1ull;
384 _inode->change_attr = 0;
385 _inode->export_pin = MDS_RANK_NONE;
7c673cae 386
92f5a8d4 387 // FIPS zeroization audit 20191117: this memset is not security related.
f67539c2
TL
388 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
389 if (_inode->is_dir()) {
390 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
391 _inode->rstat.rsubdirs = 1; /* itself */
392 _inode->rstat.rctime = in->get_inode()->ctime;
7c673cae 393 } else {
f67539c2
TL
394 _inode->layout = default_file_layout;
395 ++_inode->rstat.rfiles;
7c673cae 396 }
f67539c2 397 _inode->accounted_rstat = _inode->rstat;
7c673cae
FG
398
399 if (in->is_base()) {
400 if (in->is_root())
401 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
402 else
403 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
404 in->open_snaprealm(); // empty snaprealm
11fdf7f2 405 ceph_assert(!in->snaprealm->parent); // created its own
7c673cae
FG
406 in->snaprealm->srnode.seq = 1;
407 }
408}
409
410CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
411{
412 dout(0) << "creating system inode with ino:" << ino << dendl;
413 CInode *in = new CInode(this);
414 create_unlinked_system_inode(in, ino, mode);
415 add_inode(in);
416 return in;
417}
418
419CInode *MDCache::create_root_inode()
420{
b3b6e05e 421 CInode *in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
f67539c2
TL
422 auto _inode = in->_get_inode();
423 _inode->uid = g_conf()->mds_root_ino_uid;
424 _inode->gid = g_conf()->mds_root_ino_gid;
425 _inode->layout = default_file_layout;
426 _inode->layout.pool_id = mds->mdsmap->get_first_data_pool();
427 return in;
7c673cae
FG
428}
429
430void MDCache::create_empty_hierarchy(MDSGather *gather)
431{
432 // create root dir
433 CInode *root = create_root_inode();
434
435 // force empty root dir
436 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
437 adjust_subtree_auth(rootdir, mds->get_nodeid());
438 rootdir->dir_rep = CDir::REP_ALL; //NONE;
439
f67539c2
TL
440 ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
441 ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
442 ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
f64942e4
AA
 443 /* Do not update the rootdir fragment's rstat information; the rstat upkeep magic
 444 * assumes version 0 is stale/invalid.
445 */
7c673cae
FG
446
447 rootdir->mark_complete();
f67539c2
TL
448 rootdir->_get_fnode()->version = rootdir->pre_dirty();
449 rootdir->mark_dirty(mds->mdlog->get_current_segment());
7c673cae
FG
450 rootdir->commit(0, gather->new_sub());
451
f67539c2 452 root->store(gather->new_sub());
b3b6e05e
TL
453 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
454 root->store_backtrace(gather->new_sub());
7c673cae
FG
455}
456
457void MDCache::create_mydir_hierarchy(MDSGather *gather)
458{
459 // create mds dir
460 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
461
462 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
f67539c2
TL
463 auto mydir_fnode = mydir->_get_fnode();
464
7c673cae
FG
465 adjust_subtree_auth(mydir, mds->get_nodeid());
466
467 LogSegment *ls = mds->mdlog->get_current_segment();
468
469 // stray dir
470 for (int i = 0; i < NUM_STRAY; ++i) {
471 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
472 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
f67539c2
TL
473 CachedStackStringStream css;
474 *css << "stray" << i;
475 CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
7c673cae
FG
476 sdn->_mark_dirty(mds->mdlog->get_current_segment());
477
f67539c2 478 stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;
7c673cae 479
f67539c2
TL
480 mydir_fnode->rstat.add(stray->get_inode()->rstat);
481 mydir_fnode->fragstat.nsubdirs++;
7c673cae
FG
482 // save them
483 straydir->mark_complete();
f67539c2
TL
484 straydir->_get_fnode()->version = straydir->pre_dirty();
485 straydir->mark_dirty(ls);
7c673cae 486 straydir->commit(0, gather->new_sub());
28e407b8 487 stray->mark_dirty_parent(ls, true);
7c673cae
FG
488 stray->store_backtrace(gather->new_sub());
489 }
490
f67539c2
TL
491 mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
492 mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;
7c673cae 493
f67539c2
TL
494 auto inode = myin->_get_inode();
495 inode->dirstat = mydir->get_fnode()->fragstat;
496 inode->rstat = mydir->get_fnode()->rstat;
497 ++inode->rstat.rsubdirs;
498 inode->accounted_rstat = inode->rstat;
7c673cae
FG
499
500 mydir->mark_complete();
f67539c2
TL
501 mydir_fnode->version = mydir->pre_dirty();
502 mydir->mark_dirty(ls);
7c673cae
FG
503 mydir->commit(0, gather->new_sub());
504
505 myin->store(gather->new_sub());
506}
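// Resulting layout (sketch): the per-rank mdsdir inode MDS_INO_MDSDIR(rank)
// gets a single dirfrag containing NUM_STRAY stray directories named
// "stray0" ... "stray<NUM_STRAY-1>", each committed and backtraced via the
// gather context, with mydir's fragstat/rstat updated to account for them.
// populate_mydir() below re-opens (or re-creates) the same structure on
// later startups.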
507
508struct C_MDC_CreateSystemFile : public MDCacheLogContext {
509 MutationRef mut;
510 CDentry *dn;
511 version_t dpv;
11fdf7f2
TL
512 MDSContext *fin;
513 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
7c673cae
FG
514 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
515 void finish(int r) override {
516 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
517 }
518};
519
11fdf7f2 520void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
7c673cae
FG
521{
522 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
523 CDentry *dn = dir->add_null_dentry(name);
524
525 dn->push_projected_linkage(in);
526 version_t dpv = dn->pre_dirty();
527
528 CDir *mdir = 0;
f67539c2
TL
529 auto inode = in->_get_inode();
530 if (in->is_dir()) {
531 inode->rstat.rsubdirs = 1;
7c673cae
FG
532
533 mdir = in->get_or_open_dirfrag(this, frag_t());
534 mdir->mark_complete();
f67539c2
TL
535 mdir->_get_fnode()->version = mdir->pre_dirty();
536 } else {
537 inode->rstat.rfiles = 1;
538 }
539
540 inode->version = dn->pre_dirty();
7c673cae
FG
541
542 SnapRealm *realm = dir->get_inode()->find_snaprealm();
543 dn->first = in->first = realm->get_newest_seq() + 1;
544
545 MutationRef mut(new MutationImpl());
546
547 // force some locks. hacky.
548 mds->locker->wrlock_force(&dir->inode->filelock, mut);
549 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
550
551 mut->ls = mds->mdlog->get_current_segment();
552 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
553 mds->mdlog->start_entry(le);
554
555 if (!in->is_mdsdir()) {
556 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
557 le->metablob.add_primary_dentry(dn, in, true);
558 } else {
559 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
560 journal_dirty_inode(mut.get(), &le->metablob, in);
561 dn->push_projected_linkage(in->ino(), in->d_type());
562 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
563 le->metablob.add_root(true, in);
564 }
565 if (mdir)
566 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
567
568 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
569 mds->mdlog->flush();
570}
571
11fdf7f2 572void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
7c673cae
FG
573{
574 dout(10) << "_create_system_file_finish " << *dn << dendl;
575
576 dn->pop_projected_linkage();
577 dn->mark_dirty(dpv, mut->ls);
578
579 CInode *in = dn->get_linkage()->get_inode();
f67539c2 580 in->mark_dirty(mut->ls);
7c673cae 581
f67539c2 582 if (in->is_dir()) {
7c673cae 583 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 584 ceph_assert(dir);
f67539c2 585 dir->mark_dirty(mut->ls);
7c673cae
FG
586 dir->mark_new(mut->ls);
587 }
588
589 mut->apply();
590 mds->locker->drop_locks(mut.get());
591 mut->cleanup();
592
593 fin->complete(0);
594
595 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
596 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
597}
598
599
600
601struct C_MDS_RetryOpenRoot : public MDSInternalContext {
602 MDCache *cache;
603 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
604 void finish(int r) override {
605 if (r < 0) {
606 // If we can't open root, something disastrous has happened: mark
607 // this rank damaged for operator intervention. Note that
608 // it is not okay to call suicide() here because we are in
609 // a Finisher callback.
610 cache->mds->damaged();
611 ceph_abort(); // damaged should never return
612 } else {
613 cache->open_root();
614 }
615 }
616};
617
11fdf7f2 618void MDCache::open_root_inode(MDSContext *c)
7c673cae
FG
619{
620 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
621 CInode *in;
b3b6e05e 622 in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
7c673cae
FG
623 in->fetch(c);
624 } else {
b3b6e05e 625 discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
7c673cae
FG
626 }
627}
628
11fdf7f2 629void MDCache::open_mydir_inode(MDSContext *c)
7c673cae 630{
7c673cae 631 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
11fdf7f2 632 in->fetch(c);
7c673cae
FG
633}
634
11fdf7f2 635void MDCache::open_mydir_frag(MDSContext *c)
28e407b8
AA
636{
637 open_mydir_inode(
638 new MDSInternalContextWrapper(mds,
9f95a23c 639 new LambdaContext([this, c](int r) {
28e407b8
AA
640 if (r < 0) {
641 c->complete(r);
642 return;
643 }
644 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 645 ceph_assert(mydir);
28e407b8
AA
646 adjust_subtree_auth(mydir, mds->get_nodeid());
647 mydir->fetch(c);
648 })
649 )
650 );
651}
652
7c673cae
FG
653void MDCache::open_root()
654{
655 dout(10) << "open_root" << dendl;
656
657 if (!root) {
658 open_root_inode(new C_MDS_RetryOpenRoot(this));
659 return;
660 }
661 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
11fdf7f2 662 ceph_assert(root->is_auth());
7c673cae 663 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
11fdf7f2 664 ceph_assert(rootdir);
7c673cae
FG
665 if (!rootdir->is_subtree_root())
666 adjust_subtree_auth(rootdir, mds->get_nodeid());
667 if (!rootdir->is_complete()) {
668 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
669 return;
670 }
671 } else {
11fdf7f2 672 ceph_assert(!root->is_auth());
7c673cae
FG
673 CDir *rootdir = root->get_dirfrag(frag_t());
674 if (!rootdir) {
224ce89b 675 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
7c673cae
FG
676 return;
677 }
678 }
679
680 if (!myin) {
681 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
682 in->fetch(new C_MDS_RetryOpenRoot(this));
683 return;
684 }
685 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 686 ceph_assert(mydir);
7c673cae
FG
687 adjust_subtree_auth(mydir, mds->get_nodeid());
688
689 populate_mydir();
690}
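// Note on the control flow above: open_root() is written as a re-entrant
// state machine. Whenever a needed object (root inode, root dirfrag, mdsdir
// inode) is not yet in cache, it issues a fetch/discover with
// C_MDS_RetryOpenRoot as the completion and returns; the callback simply
// calls open_root() again, so the function makes incremental progress until
// it finally reaches populate_mydir().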
691
f67539c2
TL
692void MDCache::advance_stray() {
693 // check whether the directory has been fragmented
694 if (stray_fragmenting_index >= 0) {
695 auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
696 bool any_fragmenting = false;
697 for (const auto& dir : dfs) {
698 if (dir->state_test(CDir::STATE_FRAGMENTING) ||
699 mds->balancer->is_fragment_pending(dir->dirfrag())) {
700 any_fragmenting = true;
701 break;
702 }
703 }
704 if (!any_fragmenting)
705 stray_fragmenting_index = -1;
706 }
707
708 for (int i = 1; i < NUM_STRAY; i++){
709 stray_index = (stray_index + i) % NUM_STRAY;
710 if (stray_index != stray_fragmenting_index)
711 break;
712 }
713
714 if (stray_fragmenting_index == -1 && is_open()) {
 715 // Fragment a later stray dir in advance. We don't choose a past
 716 // stray dir because in-flight requests may still be using it.
717 stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
718 auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
719 bool any_fragmenting = false;
720 for (const auto& dir : dfs) {
721 if (dir->should_split()) {
722 mds->balancer->queue_split(dir, true);
723 any_fragmenting = true;
724 } else if (dir->should_merge()) {
725 mds->balancer->queue_merge(dir);
726 any_fragmenting = true;
727 }
728 }
729 if (!any_fragmenting)
730 stray_fragmenting_index = -1;
731 }
732
733 dout(10) << "advance_stray to index " << stray_index
734 << " fragmenting index " << stray_fragmenting_index << dendl;
735}
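// Walk-through of the rotation above (descriptive only, no added behaviour):
// if stray_index is 4, the loop advances it to 5, skipping further only if
// that slot is the one currently being fragmented. Then, if nothing is being
// fragmented and the cache is open, the stray dir three slots ahead,
// (stray_index + 3) % NUM_STRAY, is checked and queued for a split or merge
// so it is ready by the time the rotation reaches it.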
736
7c673cae
FG
737void MDCache::populate_mydir()
738{
11fdf7f2 739 ceph_assert(myin);
7c673cae 740 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 741 ceph_assert(mydir);
7c673cae
FG
742
743 dout(10) << "populate_mydir " << *mydir << dendl;
744
745 if (!mydir->is_complete()) {
746 mydir->fetch(new C_MDS_RetryOpenRoot(this));
747 return;
748 }
749
750 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
 751 // A missing dirfrag: we will recreate it, and we must mark it dirty
 752 // before dirtying any of the strays we create within it.
753 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
754 "recreating it now";
755 LogSegment *ls = mds->mdlog->get_current_segment();
756 mydir->state_clear(CDir::STATE_BADFRAG);
757 mydir->mark_complete();
f67539c2
TL
758 mydir->_get_fnode()->version = mydir->pre_dirty();
759 mydir->mark_dirty(ls);
7c673cae
FG
760 }
761
762 // open or create stray
763 uint64_t num_strays = 0;
764 for (int i = 0; i < NUM_STRAY; ++i) {
f67539c2
TL
765 CachedStackStringStream css;
766 *css << "stray" << i;
767 CDentry *straydn = mydir->lookup(css->str());
7c673cae
FG
768
769 // allow for older fs's with stray instead of stray0
770 if (straydn == NULL && i == 0)
771 straydn = mydir->lookup("stray");
772
773 if (!straydn || !straydn->get_linkage()->get_inode()) {
f67539c2 774 _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
7c673cae
FG
775 new C_MDS_RetryOpenRoot(this));
776 return;
777 }
11fdf7f2
TL
778 ceph_assert(straydn);
779 ceph_assert(strays[i]);
7c673cae
FG
780 // we make multiple passes through this method; make sure we only pin each stray once.
781 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
782 strays[i]->get(CInode::PIN_STRAY);
783 strays[i]->state_set(CInode::STATE_STRAYPINNED);
784 strays[i]->get_stickydirs();
785 }
786 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
787
788 // open all frags
11fdf7f2
TL
789 frag_vec_t leaves;
790 strays[i]->dirfragtree.get_leaves(leaves);
791 for (const auto& leaf : leaves) {
792 CDir *dir = strays[i]->get_dirfrag(leaf);
7c673cae 793 if (!dir) {
11fdf7f2 794 dir = strays[i]->get_or_open_dirfrag(this, leaf);
7c673cae
FG
795 }
796
797 // DamageTable applies special handling to strays: it will
798 // have damaged() us out if one is damaged.
11fdf7f2 799 ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
7c673cae
FG
800
801 if (dir->get_version() == 0) {
802 dir->fetch(new C_MDS_RetryOpenRoot(this));
803 return;
804 }
805
806 if (dir->get_frag_size() > 0)
807 num_strays += dir->get_frag_size();
808 }
809 }
810
7c673cae
FG
811 // okay!
812 dout(10) << "populate_mydir done" << dendl;
11fdf7f2 813 ceph_assert(!open);
7c673cae
FG
814 open = true;
815 mds->queue_waiters(waiting_for_open);
816
11fdf7f2
TL
817 stray_manager.set_num_strays(num_strays);
818 stray_manager.activate();
819
7c673cae
FG
820 scan_stray_dir();
821}
822
11fdf7f2 823void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
7c673cae
FG
824{
825 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
826}
827
828CDir *MDCache::get_stray_dir(CInode *in)
829{
830 string straydname;
831 in->name_stray_dentry(straydname);
832
833 CInode *strayi = get_stray();
11fdf7f2 834 ceph_assert(strayi);
7c673cae
FG
835 frag_t fg = strayi->pick_dirfrag(straydname);
836 CDir *straydir = strayi->get_dirfrag(fg);
11fdf7f2 837 ceph_assert(straydir);
7c673cae
FG
838 return straydir;
839}
840
11fdf7f2 841MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
7c673cae
FG
842{
843 // inode?
844 if (info.ino)
845 return get_inode(info.ino, info.snapid);
846
847 // dir or dentry.
848 CDir *dir = get_dirfrag(info.dirfrag);
849 if (!dir) return 0;
850
851 if (info.dname.length())
852 return dir->lookup(info.dname, info.snapid);
853 else
854 return dir;
855}
856
857
f6b5b4d7
TL
858// ====================================================================
859// consistent hash ring
860
861/*
862 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
863*/
f67539c2 864mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
f6b5b4d7
TL
865{
866 const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
867 uint64_t hash = rjhash64(ino);
f67539c2
TL
868 if (fg)
869 hash = rjhash64(hash + rjhash64(fg.value()));
870
f6b5b4d7
TL
871 int64_t b = -1, j = 0;
872 while (j < max_mds) {
873 b = j;
874 hash = hash*2862933555777941757ULL + 1;
875 j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
876 }
877 // verify bounds before returning
878 auto result = mds_rank_t(b);
879 ceph_assert(result >= 0 && result < max_mds);
880 return result;
881}
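// Jump consistent hash, in brief: the loop repeatedly "jumps" forward to the
// next bucket index at which the key would have been reassigned as buckets
// were added one by one; the last jump that lands below max_mds is the
// answer. The useful property (per the Lamping/Veach paper cited above) is
// stability: growing max_mds from n to n+1 remaps only ~1/(n+1) of the keys,
// and every remapped key moves to the new bucket n. A standalone sketch of
// the same loop for an arbitrary 64-bit key (illustrative, not part of
// MDCache):
//
//   int32_t jump_hash(uint64_t key, int32_t num_buckets) {
//     int64_t b = -1, j = 0;
//     while (j < num_buckets) {
//       b = j;
//       key = key * 2862933555777941757ULL + 1;
//       j = (b + 1) * (double(1LL << 31) / double((key >> 33) + 1));
//     }
//     return int32_t(b);
//   }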
7c673cae
FG
882
883
884// ====================================================================
885// subtree management
886
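// The 'subtrees' map used throughout this section associates each subtree
// root dirfrag with the set of dirfrags that bound it, i.e. the roots of the
// subtrees nested immediately beneath it. Illustrative shape only:
//
//   subtrees = {
//     rootdir        -> { /home dirfrag },   // /home delegated elsewhere
//     /home dirfrag  -> { }                  // no nested subtrees
//   }
//
// adjust_subtree_auth() and friends below maintain this invariant while
// ownership (dir_auth) of directories moves between MDS ranks.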
7c673cae
FG
887/*
888 * adjust the dir_auth of a subtree.
 889 * merge with parent and/or child subtrees, if it is appropriate.
890 * merge can ONLY happen if both parent and child have unambiguous auth.
891 */
28e407b8 892void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
7c673cae
FG
893{
894 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
895 << " on " << *dir << dendl;
896
7c673cae
FG
897 show_subtrees();
898
899 CDir *root;
900 if (dir->inode->is_base()) {
901 root = dir; // bootstrap hack.
902 if (subtrees.count(root) == 0) {
903 subtrees[root];
904 root->get(CDir::PIN_SUBTREE);
905 }
906 } else {
907 root = get_subtree_root(dir); // subtree root
908 }
11fdf7f2
TL
909 ceph_assert(root);
910 ceph_assert(subtrees.count(root));
7c673cae
FG
911 dout(7) << " current root is " << *root << dendl;
912
913 if (root == dir) {
914 // i am already a subtree.
915 dir->set_dir_auth(auth);
916 } else {
917 // i am a new subtree.
918 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 919 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
920 subtrees[dir]; // create empty subtree bounds list for me.
921 dir->get(CDir::PIN_SUBTREE);
922
923 // set dir_auth
924 dir->set_dir_auth(auth);
925
926 // move items nested beneath me, under me.
927 set<CDir*>::iterator p = subtrees[root].begin();
928 while (p != subtrees[root].end()) {
929 set<CDir*>::iterator next = p;
930 ++next;
931 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
932 // move under me
933 dout(10) << " claiming child bound " << **p << dendl;
934 subtrees[dir].insert(*p);
935 subtrees[root].erase(p);
936 }
937 p = next;
938 }
939
940 // i am a bound of the parent subtree.
941 subtrees[root].insert(dir);
942
943 // i am now the subtree root.
944 root = dir;
945
946 // adjust recursive pop counters
28e407b8 947 if (adjust_pop && dir->is_auth()) {
7c673cae
FG
948 CDir *p = dir->get_parent_dir();
949 while (p) {
11fdf7f2 950 p->pop_auth_subtree.sub(dir->pop_auth_subtree);
7c673cae
FG
951 if (p->is_subtree_root()) break;
952 p = p->inode->get_parent_dir();
953 }
954 }
7c673cae
FG
955 }
956
957 show_subtrees();
958}
959
960
961void MDCache::try_subtree_merge(CDir *dir)
962{
963 dout(7) << "try_subtree_merge " << *dir << dendl;
b32b8144
FG
964 // record my old bounds
965 auto oldbounds = subtrees.at(dir);
7c673cae 966
224ce89b 967 set<CInode*> to_eval;
7c673cae 968 // try merge at my root
224ce89b 969 try_subtree_merge_at(dir, &to_eval);
7c673cae
FG
970
971 // try merge at my old bounds
224ce89b
WB
972 for (auto bound : oldbounds)
973 try_subtree_merge_at(bound, &to_eval);
974
975 if (!(mds->is_any_replay() || mds->is_resolve())) {
976 for(auto in : to_eval)
977 eval_subtree_root(in);
978 }
7c673cae
FG
979}
980
28e407b8 981void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
7c673cae
FG
982{
983 dout(10) << "try_subtree_merge_at " << *dir << dendl;
b32b8144
FG
984
985 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
986 dir->state_test(CDir::STATE_EXPORTBOUND) ||
987 dir->state_test(CDir::STATE_AUXSUBTREE))
988 return;
989
990 auto it = subtrees.find(dir);
11fdf7f2 991 ceph_assert(it != subtrees.end());
7c673cae 992
7c673cae
FG
993 // merge with parent?
994 CDir *parent = dir;
995 if (!dir->inode->is_base())
996 parent = get_subtree_root(dir->get_parent_dir());
997
b32b8144
FG
998 if (parent != dir && // we have a parent,
999 parent->dir_auth == dir->dir_auth) { // auth matches,
7c673cae
FG
1000 // merge with parent.
1001 dout(10) << " subtree merge at " << *dir << dendl;
1002 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
1003
1004 // move our bounds under the parent
b32b8144 1005 subtrees[parent].insert(it->second.begin(), it->second.end());
7c673cae
FG
1006
1007 // we are no longer a subtree or bound
1008 dir->put(CDir::PIN_SUBTREE);
b32b8144 1009 subtrees.erase(it);
7c673cae
FG
1010 subtrees[parent].erase(dir);
1011
1012 // adjust popularity?
28e407b8 1013 if (adjust_pop && dir->is_auth()) {
28e407b8 1014 CDir *cur = dir;
7c673cae
FG
1015 CDir *p = dir->get_parent_dir();
1016 while (p) {
11fdf7f2 1017 p->pop_auth_subtree.add(dir->pop_auth_subtree);
28e407b8 1018 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
7c673cae 1019 if (p->is_subtree_root()) break;
28e407b8 1020 cur = p;
7c673cae
FG
1021 p = p->inode->get_parent_dir();
1022 }
1023 }
1024
224ce89b
WB
1025 if (to_eval && dir->get_inode()->is_auth())
1026 to_eval->insert(dir->get_inode());
7c673cae 1027
181888fb
FG
1028 show_subtrees(15);
1029 }
7c673cae
FG
1030}
1031
7c673cae
FG
1032void MDCache::eval_subtree_root(CInode *diri)
1033{
1034 // evaluate subtree inode filelock?
1035 // (we should scatter the filelock on subtree bounds)
11fdf7f2 1036 ceph_assert(diri->is_auth());
224ce89b 1037 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
7c673cae
FG
1038}
1039
1040
11fdf7f2 1041void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
7c673cae
FG
1042{
1043 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1044 << " on " << *dir
1045 << " bounds " << bounds
1046 << dendl;
1047
1048 show_subtrees();
1049
1050 CDir *root;
b3b6e05e 1051 if (dir->ino() == CEPH_INO_ROOT) {
7c673cae
FG
1052 root = dir; // bootstrap hack.
1053 if (subtrees.count(root) == 0) {
1054 subtrees[root];
1055 root->get(CDir::PIN_SUBTREE);
1056 }
1057 } else {
1058 root = get_subtree_root(dir); // subtree root
1059 }
11fdf7f2
TL
1060 ceph_assert(root);
1061 ceph_assert(subtrees.count(root));
7c673cae
FG
1062 dout(7) << " current root is " << *root << dendl;
1063
1064 mds_authority_t oldauth = dir->authority();
1065
1066 if (root == dir) {
1067 // i am already a subtree.
1068 dir->set_dir_auth(auth);
1069 } else {
1070 // i am a new subtree.
1071 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 1072 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
1073 subtrees[dir]; // create empty subtree bounds list for me.
1074 dir->get(CDir::PIN_SUBTREE);
1075
1076 // set dir_auth
1077 dir->set_dir_auth(auth);
1078
1079 // move items nested beneath me, under me.
1080 set<CDir*>::iterator p = subtrees[root].begin();
1081 while (p != subtrees[root].end()) {
1082 set<CDir*>::iterator next = p;
1083 ++next;
1084 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1085 // move under me
1086 dout(10) << " claiming child bound " << **p << dendl;
1087 subtrees[dir].insert(*p);
1088 subtrees[root].erase(p);
1089 }
1090 p = next;
1091 }
1092
1093 // i am a bound of the parent subtree.
1094 subtrees[root].insert(dir);
1095
1096 // i am now the subtree root.
1097 root = dir;
1098 }
1099
224ce89b
WB
1100 set<CInode*> to_eval;
1101
7c673cae
FG
1102 // verify/adjust bounds.
1103 // - these may be new, or
1104 // - beneath existing ambiguous bounds (which will be collapsed),
1105 // - but NOT beneath unambiguous bounds.
11fdf7f2 1106 for (const auto& bound : bounds) {
7c673cae
FG
1107 // new bound?
1108 if (subtrees[dir].count(bound) == 0) {
1109 if (get_subtree_root(bound) == dir) {
1110 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1111 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1112 }
1113 else {
1114 dout(10) << " want bound " << *bound << dendl;
1115 CDir *t = get_subtree_root(bound->get_parent_dir());
1116 if (subtrees[t].count(bound) == 0) {
11fdf7f2 1117 ceph_assert(t != dir);
7c673cae
FG
1118 dout(10) << " new bound " << *bound << dendl;
1119 adjust_subtree_auth(bound, t->authority());
1120 }
1121 // make sure it's nested beneath ambiguous subtree(s)
1122 while (1) {
1123 while (subtrees[dir].count(t) == 0)
1124 t = get_subtree_root(t->get_parent_dir());
1125 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1126 adjust_subtree_auth(t, auth);
224ce89b 1127 try_subtree_merge_at(t, &to_eval);
7c673cae
FG
1128 t = get_subtree_root(bound->get_parent_dir());
1129 if (t == dir) break;
1130 }
1131 }
1132 }
1133 else {
1134 dout(10) << " already have bound " << *bound << dendl;
1135 }
1136 }
1137 // merge stray bounds?
1138 while (!subtrees[dir].empty()) {
1139 set<CDir*> copy = subtrees[dir];
1140 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1141 if (bounds.count(*p) == 0) {
1142 CDir *stray = *p;
1143 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1144 adjust_subtree_auth(stray, auth);
224ce89b 1145 try_subtree_merge_at(stray, &to_eval);
7c673cae
FG
1146 }
1147 }
1148 // swallowing subtree may add new subtree bounds
1149 if (copy == subtrees[dir])
1150 break;
1151 }
1152
1153 // bound should now match.
1154 verify_subtree_bounds(dir, bounds);
1155
1156 show_subtrees();
224ce89b
WB
1157
1158 if (!(mds->is_any_replay() || mds->is_resolve())) {
1159 for(auto in : to_eval)
1160 eval_subtree_root(in);
1161 }
7c673cae
FG
1162}
1163
1164
1165/*
1166 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1167 * fragmentation as necessary to get an equivalent bounding set. That is, only
1168 * split if one of our frags spans the provided bounding set. Never merge.
1169 */
11fdf7f2 1170void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
7c673cae
FG
1171{
1172 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1173
1174 // sort by ino
1175 map<inodeno_t, fragset_t> byino;
11fdf7f2 1176 for (auto& frag : dfs) {
9f95a23c 1177 byino[frag.ino].insert_raw(frag.frag);
11fdf7f2 1178 }
7c673cae
FG
1179 dout(10) << " by ino: " << byino << dendl;
1180
1181 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
9f95a23c 1182 p->second.simplify();
7c673cae
FG
1183 CInode *diri = get_inode(p->first);
1184 if (!diri)
1185 continue;
1186 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1187
1188 fragtree_t tmpdft;
1189 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1190 tmpdft.force_to_leaf(g_ceph_context, *q);
1191
11fdf7f2
TL
1192 for (const auto& fg : p->second) {
1193 frag_vec_t leaves;
1194 diri->dirfragtree.get_leaves_under(fg, leaves);
1195 if (leaves.empty()) {
7c673cae 1196 frag_t approx_fg = diri->dirfragtree[fg.value()];
11fdf7f2
TL
1197 frag_vec_t approx_leaves;
1198 tmpdft.get_leaves_under(approx_fg, approx_leaves);
1199 for (const auto& leaf : approx_leaves) {
1200 if (p->second.get().count(leaf) == 0) {
7c673cae 1201 // not bound, so the resolve message is from auth MDS of the dirfrag
11fdf7f2 1202 force_dir_fragment(diri, leaf);
7c673cae
FG
1203 }
1204 }
7c673cae 1205 }
f67539c2
TL
1206
1207 auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
1208 for (const auto& sib : sibs)
1209 bounds.insert(sib);
7c673cae
FG
1210 }
1211 }
1212}
1213
11fdf7f2 1214void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
7c673cae
FG
1215{
1216 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1217 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1218
1219 set<CDir*> bounds;
1220 get_force_dirfrag_bound_set(bound_dfs, bounds);
1221 adjust_bounded_subtree_auth(dir, bounds, auth);
1222}
1223
11fdf7f2 1224void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
7c673cae
FG
1225{
1226 dout(10) << "map_dirfrag_set " << dfs << dendl;
1227
1228 // group by inode
1229 map<inodeno_t, fragset_t> ino_fragset;
11fdf7f2 1230 for (const auto &df : dfs) {
9f95a23c 1231 ino_fragset[df.ino].insert_raw(df.frag);
11fdf7f2 1232 }
7c673cae
FG
1233 // get frags
1234 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1235 p != ino_fragset.end();
1236 ++p) {
9f95a23c 1237 p->second.simplify();
7c673cae
FG
1238 CInode *in = get_inode(p->first);
1239 if (!in)
1240 continue;
1241
11fdf7f2
TL
1242 frag_vec_t fgs;
1243 for (const auto& fg : p->second) {
1244 in->dirfragtree.get_leaves_under(fg, fgs);
1245 }
7c673cae 1246
11fdf7f2 1247 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
7c673cae
FG
1248 << " on " << *in << dendl;
1249
11fdf7f2
TL
1250 for (const auto& fg : fgs) {
1251 CDir *dir = in->get_dirfrag(fg);
7c673cae
FG
1252 if (dir)
1253 result.insert(dir);
1254 }
1255 }
1256}
1257
1258
1259
1260CDir *MDCache::get_subtree_root(CDir *dir)
1261{
1262 // find the underlying dir that delegates (or is about to delegate) auth
1263 while (true) {
1264 if (dir->is_subtree_root())
1265 return dir;
1266 dir = dir->get_inode()->get_parent_dir();
1267 if (!dir)
1268 return 0; // none
1269 }
1270}
1271
1272CDir *MDCache::get_projected_subtree_root(CDir *dir)
1273{
1274 // find the underlying dir that delegates (or is about to delegate) auth
1275 while (true) {
1276 if (dir->is_subtree_root())
1277 return dir;
1278 dir = dir->get_inode()->get_projected_parent_dir();
1279 if (!dir)
1280 return 0; // none
1281 }
1282}
1283
1284void MDCache::remove_subtree(CDir *dir)
1285{
1286 dout(10) << "remove_subtree " << *dir << dendl;
f6b5b4d7
TL
1287 auto it = subtrees.find(dir);
1288 ceph_assert(it != subtrees.end());
1289 subtrees.erase(it);
7c673cae
FG
1290 dir->put(CDir::PIN_SUBTREE);
1291 if (dir->get_parent_dir()) {
1292 CDir *p = get_subtree_root(dir->get_parent_dir());
f6b5b4d7
TL
1293 auto it = subtrees.find(p);
1294 ceph_assert(it != subtrees.end());
1295 auto count = it->second.erase(dir);
1296 ceph_assert(count == 1);
7c673cae
FG
1297 }
1298}
1299
1300void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1301{
11fdf7f2 1302 ceph_assert(subtrees.count(dir));
7c673cae
FG
1303 bounds = subtrees[dir];
1304}
1305
1306void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1307{
1308 if (subtrees.count(dir)) {
1309 // just copy them, dir is a subtree.
1310 get_subtree_bounds(dir, bounds);
1311 } else {
1312 // find them
1313 CDir *root = get_subtree_root(dir);
1314 for (set<CDir*>::iterator p = subtrees[root].begin();
1315 p != subtrees[root].end();
1316 ++p) {
1317 CDir *t = *p;
1318 while (t != root) {
1319 t = t->get_parent_dir();
11fdf7f2 1320 ceph_assert(t);
7c673cae
FG
1321 if (t == dir) {
1322 bounds.insert(*p);
1323 continue;
1324 }
1325 }
1326 }
1327 }
1328}
1329
1330void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1331{
1332 // for debugging only.
11fdf7f2 1333 ceph_assert(subtrees.count(dir));
7c673cae
FG
1334 if (bounds != subtrees[dir]) {
1335 dout(0) << "verify_subtree_bounds failed" << dendl;
1336 set<CDir*> b = bounds;
1337 for (auto &cd : subtrees[dir]) {
1338 if (bounds.count(cd)) {
1339 b.erase(cd);
1340 continue;
1341 }
1342 dout(0) << " missing bound " << *cd << dendl;
1343 }
1344 for (const auto &cd : b)
1345 dout(0) << " extra bound " << *cd << dendl;
1346 }
11fdf7f2 1347 ceph_assert(bounds == subtrees[dir]);
7c673cae
FG
1348}
1349
1350void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1351{
1352 // for debugging only.
11fdf7f2 1353 ceph_assert(subtrees.count(dir));
7c673cae
FG
1354
1355 // make sure that any bounds i do have are properly noted as such.
1356 int failed = 0;
1357 for (const auto &fg : bounds) {
1358 CDir *bd = get_dirfrag(fg);
1359 if (!bd) continue;
1360 if (subtrees[dir].count(bd) == 0) {
1361 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1362 failed++;
1363 }
1364 }
11fdf7f2 1365 ceph_assert(failed == 0);
7c673cae
FG
1366}
1367
1368void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1369{
1370 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1371 << " to " << *newdir << dendl;
1372 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1373}
1374
224ce89b 1375void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
7c673cae
FG
1376{
1377 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1378
7c673cae
FG
1379 CDir *newdir = diri->get_parent_dir();
1380
1381 if (pop) {
1382 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
11fdf7f2
TL
1383 ceph_assert(p != projected_subtree_renames.end());
1384 ceph_assert(!p->second.empty());
1385 ceph_assert(p->second.front().first == olddir);
1386 ceph_assert(p->second.front().second == newdir);
7c673cae
FG
1387 p->second.pop_front();
1388 if (p->second.empty())
1389 projected_subtree_renames.erase(p);
1390 }
1391
11fdf7f2
TL
1392 // adjust total auth pin of freezing subtree
1393 if (olddir != newdir) {
9f95a23c
TL
1394 auto&& dfls = diri->get_nested_dirfrags();
1395 for (const auto& dir : dfls)
11fdf7f2 1396 olddir->adjust_freeze_after_rename(dir);
11fdf7f2
TL
1397 }
1398
7c673cae 1399 // adjust subtree
9f95a23c
TL
1400 // N.B. make sure subtree dirfrags are at the front of the list
1401 auto dfls = diri->get_subtree_dirfrags();
7c673cae 1402 diri->get_nested_dirfrags(dfls);
9f95a23c 1403 for (const auto& dir : dfls) {
7c673cae
FG
1404 dout(10) << "dirfrag " << *dir << dendl;
1405 CDir *oldparent = get_subtree_root(olddir);
1406 dout(10) << " old parent " << *oldparent << dendl;
1407 CDir *newparent = get_subtree_root(newdir);
1408 dout(10) << " new parent " << *newparent << dendl;
1409
9f95a23c
TL
1410 auto& oldbounds = subtrees[oldparent];
1411 auto& newbounds = subtrees[newparent];
1412
28e407b8 1413 if (olddir != newdir)
11fdf7f2 1414 mds->balancer->adjust_pop_for_rename(olddir, dir, false);
28e407b8 1415
7c673cae
FG
1416 if (oldparent == newparent) {
1417 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
28e407b8 1418 } else if (dir->is_subtree_root()) {
7c673cae
FG
1419 // children are fine. change parent.
1420 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
9f95a23c
TL
1421 {
1422 auto n = oldbounds.erase(dir);
1423 ceph_assert(n == 1);
1424 }
1425 newbounds.insert(dir);
224ce89b 1426 // caller is responsible for 'eval diri'
28e407b8 1427 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1428 } else {
1429 // mid-subtree.
1430
1431 // see if any old bounds move to the new parent.
9f95a23c
TL
1432 std::vector<CDir*> tomove;
1433 for (const auto& bound : oldbounds) {
7c673cae
FG
1434 CDir *broot = get_subtree_root(bound->get_parent_dir());
1435 if (broot != oldparent) {
11fdf7f2 1436 ceph_assert(broot == newparent);
7c673cae
FG
1437 tomove.push_back(bound);
1438 }
1439 }
9f95a23c 1440 for (const auto& bound : tomove) {
7c673cae 1441 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
9f95a23c
TL
1442 oldbounds.erase(bound);
1443 newbounds.insert(bound);
7c673cae
FG
1444 }
1445
1446 // did auth change?
1447 if (oldparent->authority() != newparent->authority()) {
28e407b8 1448 adjust_subtree_auth(dir, oldparent->authority(), false);
224ce89b 1449 // caller is responsible for 'eval diri'
28e407b8 1450 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1451 }
1452 }
28e407b8
AA
1453
1454 if (olddir != newdir)
11fdf7f2 1455 mds->balancer->adjust_pop_for_rename(newdir, dir, true);
7c673cae
FG
1456 }
1457
1458 show_subtrees();
1459}
1460
7c673cae
FG
1461// ===================================
1462// journal and snap/cow helpers
1463
1464
1465/*
1466 * find first inode in cache that follows given snapid. otherwise, return current.
1467 */
1468CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1469{
1470 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
11fdf7f2 1471 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae 1472
b32b8144
FG
1473 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1474 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1475 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1476 in = p->second;
7c673cae 1477 }
b32b8144 1478
7c673cae
FG
1479 return in;
1480}
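// How the lookup above works: snap_inode_map is keyed by vinodeno_t, which
// orders entries by (ino, last snapid covered), so upper_bound(vinodeno_t(ino,
// follows)) lands on the oldest snapped version of this inode whose range
// extends past 'follows'. Illustrative example (values made up): with snapped
// inodes covering [2,5] and [6,8] plus the CEPH_NOSNAP head, follows = 3
// returns the [2,5] inode, follows = 7 returns [6,8], and follows = 9 falls
// through to the head inode passed in.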
1481
1482
1483/*
1484 * note: i'm currently cheating wrt dirty and inode.version on cow
1485 * items. instead of doing a full dir predirty, i just take the
1486 * original item's version, and set the dirty flag (via
1487 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1488 * means a special case in the dir commit clean sweep assertions.
1489 * bah.
1490 */
1491CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1492{
11fdf7f2 1493 ceph_assert(last >= in->first);
7c673cae 1494
b32b8144 1495 CInode *oldin = new CInode(this, true, in->first, last);
f67539c2
TL
1496 auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode());
1497 _inode->trim_client_ranges(last);
1498 oldin->reset_inode(std::move(_inode));
1499 auto _xattrs = in->get_previous_projected_xattrs();
1500 oldin->reset_xattrs(std::move(_xattrs));
1501
11fdf7f2 1502 oldin->symlink = in->symlink;
7c673cae
FG
1503
1504 if (in->first < in->oldest_snap)
1505 in->oldest_snap = in->first;
1506
1507 in->first = last+1;
1508
1509 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1510 add_inode(oldin);
1511
1512 if (in->last != CEPH_NOSNAP) {
1513 CInode *head_in = get_inode(in->ino());
11fdf7f2 1514 ceph_assert(head_in);
494da23a
TL
1515 auto ret = head_in->split_need_snapflush(oldin, in);
1516 if (ret.first) {
7c673cae 1517 oldin->client_snap_caps = in->client_snap_caps;
eafe8130
TL
1518 if (!oldin->client_snap_caps.empty()) {
1519 for (int i = 0; i < num_cinode_locks; i++) {
1520 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1521 ceph_assert(lock);
494da23a
TL
1522 if (lock->get_state() != LOCK_SNAP_SYNC) {
1523 ceph_assert(lock->is_stable());
1524 lock->set_state(LOCK_SNAP_SYNC); // gathering
1525 oldin->auth_pin(lock);
1526 }
7c673cae
FG
1527 lock->get_wrlock(true);
1528 }
1529 }
1530 }
494da23a
TL
1531 if (!ret.second) {
1532 auto client_snap_caps = std::move(in->client_snap_caps);
1533 in->client_snap_caps.clear();
1534 in->item_open_file.remove_myself();
1535 in->item_caps.remove_myself();
eafe8130
TL
1536
1537 if (!client_snap_caps.empty()) {
1538 MDSContext::vec finished;
1539 for (int i = 0; i < num_cinode_locks; i++) {
1540 SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
1541 ceph_assert(lock);
1542 ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
494da23a 1543 lock->put_wrlock();
eafe8130
TL
1544 if (!lock->get_num_wrlocks()) {
1545 lock->set_state(LOCK_SYNC);
1546 lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
1547 in->auth_unpin(lock);
1548 }
494da23a 1549 }
eafe8130 1550 mds->queue_waiters(finished);
494da23a
TL
1551 }
1552 }
7c673cae
FG
1553 return oldin;
1554 }
1555
b32b8144
FG
1556 if (!in->client_caps.empty()) {
1557 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1558 // clone caps?
94b18763 1559 for (auto &p : in->client_caps) {
b32b8144 1560 client_t client = p.first;
11fdf7f2
TL
1561 Capability *cap = &p.second;
1562 int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
b32b8144
FG
1563 if ((issued & CEPH_CAP_ANY_WR) &&
1564 cap->client_follows < last) {
eafe8130
TL
1565 dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
1566 oldin->client_snap_caps.insert(client);
b32b8144
FG
1567 cap->client_follows = last;
1568
1569 // we need snapflushes for any intervening snaps
1570 dout(10) << " snaps " << snaps << dendl;
1571 for (auto q = snaps.lower_bound(oldin->first);
1572 q != snaps.end() && *q <= last;
1573 ++q) {
1574 in->add_need_snapflush(oldin, *q, client);
1575 }
1576 } else {
1577 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
7c673cae 1578 }
7c673cae 1579 }
eafe8130
TL
1580
1581 if (!oldin->client_snap_caps.empty()) {
1582 for (int i = 0; i < num_cinode_locks; i++) {
1583 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1584 ceph_assert(lock);
1585 if (lock->get_state() != LOCK_SNAP_SYNC) {
1586 ceph_assert(lock->is_stable());
1587 lock->set_state(LOCK_SNAP_SYNC); // gathering
1588 oldin->auth_pin(lock);
1589 }
1590 lock->get_wrlock(true);
1591 }
1592 }
7c673cae 1593 }
7c673cae
FG
1594 return oldin;
1595}
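// Summary of cow_inode() above (descriptive comment only): the old contents
// are cloned into a fresh CInode covering [in->first, last], built from the
// previous projected inode/xattrs, and the live inode's range is shifted
// forward to start at last+1. For the head inode, any client holding writable
// caps with client_follows < last is recorded in the clone's client_snap_caps,
// a snapflush is expected for each intervening snapid, and the corresponding
// locks are moved to LOCK_SNAP_SYNC to gather them.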
1596
1597void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1598 CDentry *dn, snapid_t follows,
1599 CInode **pcow_inode, CDentry::linkage_t *dnl)
1600{
1601 if (!dn) {
1602 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1603 return;
1604 }
1605 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
11fdf7f2 1606 ceph_assert(dn->is_auth());
7c673cae
FG
1607
1608 // nothing to cow on a null dentry, fix caller
1609 if (!dnl)
1610 dnl = dn->get_projected_linkage();
11fdf7f2 1611 ceph_assert(!dnl->is_null());
7c673cae 1612
11fdf7f2
TL
1613 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1614 bool cow_head = false;
1615 if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
1616 ceph_assert(in->is_frozen_inode());
1617 cow_head = true;
1618 }
1619 if (in && (in->is_multiversion() || cow_head)) {
7c673cae 1620 // multiversion inode.
7c673cae
FG
1621 SnapRealm *realm = NULL;
1622
1623 if (in->get_projected_parent_dn() != dn) {
11fdf7f2 1624 ceph_assert(follows == CEPH_NOSNAP);
7c673cae 1625 realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1626 snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
1627 ceph_assert(dir_follows >= realm->get_newest_seq());
7c673cae
FG
1628
1629 if (dir_follows+1 > dn->first) {
1630 snapid_t oldfirst = dn->first;
1631 dn->first = dir_follows+1;
1632 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
f67539c2
TL
1633 CDir *dir = dn->dir;
1634 CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows);
7c673cae 1635 dout(10) << " olddn " << *olddn << dendl;
f67539c2
TL
1636 ceph_assert(dir->is_projected());
1637 olddn->set_projected_version(dir->get_projected_version());
7c673cae
FG
1638 metablob->add_remote_dentry(olddn, true);
1639 mut->add_cow_dentry(olddn);
1640 // FIXME: adjust link count here? hmm.
1641
1642 if (dir_follows+1 > in->first)
11fdf7f2 1643 in->cow_old_inode(dir_follows, cow_head);
7c673cae
FG
1644 }
1645 }
1646
11fdf7f2 1647 follows = dir_follows;
7c673cae
FG
1648 if (in->snaprealm) {
1649 realm = in->snaprealm;
11fdf7f2
TL
1650 ceph_assert(follows >= realm->get_newest_seq());
1651 }
7c673cae
FG
1652 } else {
1653 realm = in->find_snaprealm();
11fdf7f2
TL
1654 if (follows == CEPH_NOSNAP) {
1655 follows = get_global_snaprealm()->get_newest_seq();
1656 ceph_assert(follows >= realm->get_newest_seq());
1657 }
7c673cae
FG
1658 }
1659
1660 // already cloned?
1661 if (follows < in->first) {
1662 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1663 return;
1664 }
1665
1666 if (!realm->has_snaps_in_range(in->first, follows)) {
1667 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1668 in->first = follows + 1;
1669 return;
1670 }
1671
11fdf7f2 1672 in->cow_old_inode(follows, cow_head);
7c673cae
FG
1673
1674 } else {
1675 SnapRealm *realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1676 if (follows == CEPH_NOSNAP) {
1677 follows = get_global_snaprealm()->get_newest_seq();
1678 ceph_assert(follows >= realm->get_newest_seq());
1679 }
7c673cae
FG
1680
1681 // already cloned?
1682 if (follows < dn->first) {
1683 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1684 return;
1685 }
1686
1687 // update dn.first before adding old dentry to cdir's map
1688 snapid_t oldfirst = dn->first;
1689 dn->first = follows+1;
1690
7c673cae
FG
1691 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1692 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1693 if (in)
1694 in->first = follows+1;
1695 return;
1696 }
1697
1698 dout(10) << " dn " << *dn << dendl;
f67539c2
TL
1699 CDir *dir = dn->get_dir();
1700 ceph_assert(dir->is_projected());
1701
7c673cae
FG
1702 if (in) {
1703 CInode *oldin = cow_inode(in, follows);
f67539c2 1704 ceph_assert(in->is_projected());
7c673cae
FG
1705 mut->add_cow_inode(oldin);
1706 if (pcow_inode)
1707 *pcow_inode = oldin;
f67539c2 1708 CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows);
7c673cae
FG
1709 dout(10) << " olddn " << *olddn << dendl;
1710 bool need_snapflush = !oldin->client_snap_caps.empty();
11fdf7f2 1711 if (need_snapflush) {
7c673cae 1712 mut->ls->open_files.push_back(&oldin->item_open_file);
11fdf7f2
TL
1713 mds->locker->mark_need_snapflush_inode(oldin);
1714 }
f67539c2 1715 olddn->set_projected_version(dir->get_projected_version());
7c673cae
FG
1716 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1717 mut->add_cow_dentry(olddn);
1718 } else {
11fdf7f2 1719 ceph_assert(dnl->is_remote());
f67539c2 1720 CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows);
7c673cae 1721 dout(10) << " olddn " << *olddn << dendl;
f67539c2
TL
1722
1723 olddn->set_projected_version(dir->get_projected_version());
7c673cae
FG
1724 metablob->add_remote_dentry(olddn, true);
1725 mut->add_cow_dentry(olddn);
1726 }
1727 }
1728}
1729
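// Illustrative sketch only (toy types; ToyInterval/cow_split are hypothetical
// names, not MDCache API): the interval bookkeeping done by cow_inode() /
// journal_cow_dentry() above. The live entry keeps [follows+1, head]; a cloned
// "old" entry covering [oldfirst, follows] is produced only when the snaprealm
// actually has a snapshot in that range.
#include <cstdint>
#include <optional>
#include <set>

namespace toy_cow_example {

using snapid = uint64_t;

struct ToyInterval { snapid first, last; };   // stands in for dn->first / dn->last

std::optional<ToyInterval>
cow_split(ToyInterval& live, snapid follows, const std::set<snapid>& snaps)
{
  if (follows < live.first)
    return std::nullopt;                  // already cloned past this snapshot
  snapid oldfirst = live.first;
  live.first = follows + 1;               // live entry now begins after the snap
  auto it = snaps.lower_bound(oldfirst);
  if (it == snaps.end() || *it > follows)
    return std::nullopt;                  // no snapshot inside [oldfirst, follows]
  return ToyInterval{oldfirst, follows};  // what the real code journals as "olddn"
}

} // namespace toy_cow_example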
7c673cae
FG
1730void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1731{
1732 if (in->is_base()) {
11fdf7f2 1733 metablob->add_root(true, in);
7c673cae
FG
1734 } else {
1735 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1736 follows = in->first - 1;
1737 CDentry *dn = in->get_projected_parent_dn();
1738 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1739 journal_cow_dentry(mut, metablob, dn, follows);
1740 if (in->get_projected_inode()->is_backtrace_updated()) {
1741 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1742 in->get_previous_projected_inode()->layout.pool_id;
1743 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1744 } else {
1745 metablob->add_primary_dentry(dn, in, true);
1746 }
1747 }
1748}
1749
1750
1751
1752// nested ---------------------------------------------------------------
1753
f67539c2
TL
1754void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
1755 CInode *cur, CDir *parent, snapid_t first,
7c673cae
FG
1756 int linkunlink, SnapRealm *prealm)
1757{
1758 CDentry *parentdn = cur->get_projected_parent_dn();
7c673cae
FG
1759
1760 if (cur->first > first)
1761 first = cur->first;
1762
1763 dout(10) << "project_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1764 << " " << *cur << dendl;
1765 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1766 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1767
1768 /*
1769 * FIXME. this incompletely propagates rstats to _old_ parents
1770 * (i.e. shortly after a directory rename). but we need full
1771 * blown hard link backpointers to make this work properly...
1772 */
1773 snapid_t floor = parentdn->first;
1774 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1775
1776 if (!prealm)
1777 prealm = parent->inode->find_snaprealm();
1778 const set<snapid_t> snaps = prealm->get_snaps();
1779
1780 if (cur->last != CEPH_NOSNAP) {
11fdf7f2
TL
1781 ceph_assert(cur->dirty_old_rstats.empty());
1782 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
7c673cae
FG
1783 if (q == snaps.end() || *q > cur->last)
1784 return;
1785 }
1786
1787 if (cur->last >= floor) {
1788 bool update = true;
1789 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
f67539c2 1790 // rename src inode is not projected in the peer rename prep case. so we should
7c673cae 1791 // avoid updating the inode.
11fdf7f2
TL
1792 ceph_assert(linkunlink < 0);
1793 ceph_assert(cur->is_frozen_inode());
7c673cae
FG
1794 update = false;
1795 }
f67539c2
TL
1796 // hacky
1797 const CInode::mempool_inode *pi;
1798 if (update && mut->is_projected(cur)) {
1799 pi = cur->_get_projected_inode();
1800 } else {
1801 pi = cur->get_projected_inode().get();
1802 if (update) {
1803 // new inode
1804 ceph_assert(pi->rstat == pi->accounted_rstat);
1805 update = false;
1806 }
1807 }
1808 _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent,
7c673cae
FG
1809 linkunlink, update);
1810 }
1811
11fdf7f2 1812 if (g_conf()->mds_snap_rstat) {
94b18763 1813 for (const auto &p : cur->dirty_old_rstats) {
f67539c2 1814 const auto &old = cur->get_old_inodes()->at(p);
94b18763
FG
1815 snapid_t ofirst = std::max(old.first, floor);
1816 auto it = snaps.lower_bound(ofirst);
1817 if (it == snaps.end() || *it > p)
7c673cae 1818 continue;
94b18763 1819 if (p >= floor)
f67539c2 1820 _project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false);
7c673cae
FG
1821 }
1822 }
1823 cur->dirty_old_rstats.clear();
1824}
1825
1826
f67539c2 1827void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
1828 CDir *parent, int linkunlink, bool update_inode)
1829{
1830 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
f67539c2
TL
1831 dout(20) << " inode rstat " << inode->rstat << dendl;
1832 dout(20) << " inode accounted_rstat " << inode->accounted_rstat << dendl;
7c673cae
FG
1833 nest_info_t delta;
1834 if (linkunlink == 0) {
f67539c2
TL
1835 delta.add(inode->rstat);
1836 delta.sub(inode->accounted_rstat);
7c673cae 1837 } else if (linkunlink < 0) {
f67539c2 1838 delta.sub(inode->accounted_rstat);
7c673cae 1839 } else {
f67539c2 1840 delta.add(inode->rstat);
7c673cae
FG
1841 }
1842 dout(20) << " delta " << delta << dendl;
1843
7c673cae
FG
1844
1845 while (last >= ofirst) {
1846 /*
1847 * pick fnode version to update. at each iteration, we want to
1848 * pick a segment ending in 'last' to update. split as necessary
1849 * to make that work. then, adjust first up so that we only
1850 * update one segment at a time. then loop to cover the whole
1851 * [ofirst,last] interval.
1852 */
1853 nest_info_t *prstat;
1854 snapid_t first;
f67539c2 1855 auto pf = parent->_get_projected_fnode();
7c673cae 1856 if (last == CEPH_NOSNAP) {
11fdf7f2
TL
1857 if (g_conf()->mds_snap_rstat)
1858 first = std::max(ofirst, parent->first);
7c673cae
FG
1859 else
1860 first = parent->first;
1861 prstat = &pf->rstat;
1862 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1863
1864 if (first > parent->first &&
1865 !(pf->rstat == pf->accounted_rstat)) {
1866 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1867 << parent->first << "," << (first-1) << "] "
1868 << " " << *prstat << "/" << pf->accounted_rstat
1869 << dendl;
1870 parent->dirty_old_rstat[first-1].first = parent->first;
1871 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1872 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1873 }
1874 parent->first = first;
11fdf7f2 1875 } else if (!g_conf()->mds_snap_rstat) {
7c673cae
FG
1876 // drop snapshots' rstats
1877 break;
1878 } else if (last >= parent->first) {
1879 first = parent->first;
1880 parent->dirty_old_rstat[last].first = first;
1881 parent->dirty_old_rstat[last].rstat = pf->rstat;
1882 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1883 prstat = &parent->dirty_old_rstat[last].rstat;
1884 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1885 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1886 } else {
1887 // be careful, dirty_old_rstat is a _sparse_ map.
1888 // sorry, this is ugly.
1889 first = ofirst;
1890
1891 // find any intersection with last
94b18763
FG
1892 auto it = parent->dirty_old_rstat.lower_bound(last);
1893 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1894 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1895 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1896 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1897 first = parent->dirty_old_rstat.rbegin()->first+1;
1898 }
1899 } else {
94b18763
FG
1900 // *it last is >= last
1901 if (it->second.first <= last) {
1902 // *it intersects [first,last]
1903 if (it->second.first < first) {
1904 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1905 parent->dirty_old_rstat[first-1] = it->second;
1906 it->second.first = first;
7c673cae 1907 }
94b18763
FG
1908 if (it->second.first > first)
1909 first = it->second.first;
1910 if (last < it->first) {
1911 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1912 parent->dirty_old_rstat[last] = it->second;
1913 it->second.first = last+1;
7c673cae
FG
1914 }
1915 } else {
94b18763
FG
1916 // *it is to the _right_ of [first,last]
1917 it = parent->dirty_old_rstat.lower_bound(first);
1918 // new *it last is >= first
1919 if (it->second.first <= last && // new *it isn't also to the right, and
1920 it->first >= first) { // it intersects our first bit,
1921 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1922 first = it->first+1;
7c673cae
FG
1923 }
1924 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1925 }
1926 }
1927 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1928 parent->dirty_old_rstat[last].first = first;
1929 prstat = &parent->dirty_old_rstat[last].rstat;
1930 }
1931
1932 // apply
1933 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
11fdf7f2 1934 ceph_assert(last >= first);
7c673cae 1935 prstat->add(delta);
7c673cae
FG
1936 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1937
1938 last = first-1;
1939 }
f67539c2
TL
1940
1941 if (update_inode) {
1942 auto _inode = const_cast<CInode::mempool_inode*>(inode);
1943 _inode->accounted_rstat = _inode->rstat;
1944 }
7c673cae
FG
1945}
1946
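// Illustrative sketch only (plain longs; rstat_delta is a hypothetical name):
// how _project_rstat_inode_to_frag() above chooses the delta it applies to
// each fnode segment. A plain update pushes rstat - accounted_rstat, an unlink
// removes only what had been accounted, and a link adds the full new rstat.
inline long rstat_delta(long rstat, long accounted_rstat, int linkunlink)
{
  if (linkunlink == 0)
    return rstat - accounted_rstat;   // delta.add(rstat); delta.sub(accounted_rstat)
  if (linkunlink < 0)
    return -accounted_rstat;          // unlink: subtract the accounted value
  return rstat;                       // link: add the new rstat
}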
f67539c2
TL
1947void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat,
1948 const nest_info_t& accounted_rstat,
7c673cae
FG
1949 snapid_t ofirst, snapid_t last,
1950 CInode *pin, bool cow_head)
1951{
1952 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1953 dout(20) << " frag rstat " << rstat << dendl;
1954 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1955 nest_info_t delta = rstat;
1956 delta.sub(accounted_rstat);
1957 dout(20) << " delta " << delta << dendl;
1958
f67539c2 1959 CInode::old_inode_map_ptr _old_inodes;
7c673cae 1960 while (last >= ofirst) {
94b18763 1961 CInode::mempool_inode *pi;
7c673cae
FG
1962 snapid_t first;
1963 if (last == pin->last) {
f67539c2 1964 pi = pin->_get_projected_inode();
11fdf7f2 1965 first = std::max(ofirst, pin->first);
7c673cae 1966 if (first > pin->first) {
f67539c2 1967 auto& old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1968 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1969 }
1970 } else {
f67539c2
TL
1971 if (!_old_inodes) {
1972 _old_inodes = CInode::allocate_old_inode_map();
1973 if (pin->is_any_old_inodes())
1974 *_old_inodes = *pin->get_old_inodes();
1975 }
7c673cae
FG
1976 if (last >= pin->first) {
1977 first = pin->first;
1978 pin->cow_old_inode(last, cow_head);
1979 } else {
1980 // our life is easier here because old_inodes is not sparse
1981 // (although it may not begin at snapid 1)
f67539c2
TL
1982 auto it = _old_inodes->lower_bound(last);
1983 if (it == _old_inodes->end()) {
7c673cae
FG
1984 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1985 break;
1986 }
94b18763 1987 first = it->second.first;
7c673cae 1988 if (first > last) {
94b18763 1989 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1990 //assert(p == pin->old_inodes.begin());
1991 break;
1992 }
94b18763
FG
1993 if (it->first > last) {
1994 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1995 << (last+1) << "," << it->first << "]" << dendl;
f67539c2 1996 (*_old_inodes)[last] = it->second;
94b18763
FG
1997 it->second.first = last+1;
1998 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1999 }
2000 }
2001 if (first < ofirst) {
2002 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
2003 << first << "," << ofirst-1 << "]" << dendl;
f67539c2 2004 (*_old_inodes)[ofirst-1] = (*_old_inodes)[last];
7c673cae 2005 pin->dirty_old_rstats.insert(ofirst-1);
f67539c2 2006 (*_old_inodes)[last].first = first = ofirst;
7c673cae 2007 }
f67539c2 2008 pi = &(*_old_inodes)[last].inode;
7c673cae
FG
2009 pin->dirty_old_rstats.insert(last);
2010 }
2011 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
2012 pi->rstat.add(delta);
2013 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
2014
2015 last = first-1;
2016 }
f67539c2
TL
2017 if (_old_inodes)
2018 pin->reset_old_inodes(std::move(_old_inodes));
7c673cae
FG
2019}
2020
a8e16298 2021void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
7c673cae 2022{
11fdf7f2
TL
2023 if (!(mds->is_active() || mds->is_stopping()))
2024 return;
2025
7c673cae
FG
2026 if (!in->is_auth() || in->is_frozen())
2027 return;
2028
f67539c2
TL
2029 const auto& pi = in->get_projected_inode();
2030 if (!pi->quota.is_enable() && !quota_change)
7c673cae
FG
2031 return;
2032
11fdf7f2
TL
2033 // create snaprealm for quota inode (quota was set before mimic)
2034 if (!in->get_projected_srnode())
2035 mds->server->create_quota_realm(in);
7c673cae 2036
11fdf7f2
TL
2037 for (auto &p : in->client_caps) {
2038 Capability *cap = &p.second;
2039 if (cap->is_noquota())
2040 continue;
28e407b8 2041
11fdf7f2 2042 if (exclude_ct >= 0 && exclude_ct != p.first)
28e407b8
AA
2043 goto update;
2044
f67539c2
TL
2045 if (cap->last_rbytes == pi->rstat.rbytes &&
2046 cap->last_rsize == pi->rstat.rsize())
7c673cae
FG
2047 continue;
2048
f67539c2
TL
2049 if (pi->quota.max_files > 0) {
2050 if (pi->rstat.rsize() >= pi->quota.max_files)
7c673cae
FG
2051 goto update;
2052
f67539c2
TL
2053 if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
2054 abs(cap->last_rsize - pi->rstat.rsize()))
7c673cae
FG
2055 goto update;
2056 }
2057
f67539c2
TL
2058 if (pi->quota.max_bytes > 0) {
2059 if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
7c673cae
FG
2060 goto update;
2061
f67539c2
TL
2062 if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
2063 abs(cap->last_rbytes - pi->rstat.rbytes))
7c673cae
FG
2064 goto update;
2065 }
2066
2067 continue;
2068
2069update:
f67539c2
TL
2070 cap->last_rsize = pi->rstat.rsize();
2071 cap->last_rbytes = pi->rstat.rbytes;
7c673cae 2072
9f95a23c 2073 auto msg = make_message<MClientQuota>();
7c673cae 2074 msg->ino = in->ino();
f67539c2
TL
2075 msg->rstat = pi->rstat;
2076 msg->quota = pi->quota;
11fdf7f2 2077 mds->send_message_client_counted(msg, cap->get_session());
7c673cae 2078 }
181888fb 2079 for (const auto &it : in->get_replicas()) {
9f95a23c 2080 auto msg = make_message<MGatherCaps>();
7c673cae 2081 msg->ino = in->ino();
181888fb 2082 mds->send_message_mds(msg, it.first);
7c673cae
FG
2083 }
2084}
2085
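// Illustrative sketch only (should_notify_bytes is a hypothetical name): the
// byte-quota part of the re-notification heuristic in
// broadcast_quota_to_client() above. A client is told again when usage gets
// within 1/8 of the limit, or when usage has moved by more than 1/16 of the
// previously remaining headroom since the last notification.
#include <cstdint>
#include <cstdlib>

inline bool should_notify_bytes(int64_t last_rbytes, int64_t cur_rbytes,
                                int64_t max_bytes)
{
  if (max_bytes <= 0)
    return false;                                  // no byte quota set
  if (cur_rbytes > max_bytes - (max_bytes >> 3))
    return true;                                   // within 1/8 of (or over) the limit
  return (std::abs(last_rbytes - max_bytes) >> 4) <
         std::abs(last_rbytes - cur_rbytes);       // moved enough since last update
}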
2086/*
2087 * NOTE: we _have_ to delay the scatter if we are called during a
2088 * rejoin, because we can't twiddle locks between when the
2089 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2090 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2091 * (no requests), and a survivor acks immediately. _except_ that
2092 * during rejoin_(weak|strong) processing, we may complete a lock
2093 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2094 * scatterlock state in that case or the lock states will get out of
2095 * sync between the auth and replica.
2096 *
2097 * the simple solution is to never do the scatter here. instead, put
2098 * the scatterlock on a list if it isn't already wrlockable. this is
2099 * probably the best plan anyway, since we avoid too many
2100 * scatters/locks under normal usage.
2101 */
2102/*
2103 * some notes on dirlock/nestlock scatterlock semantics:
2104 *
2105 * the fragstat (dirlock) will never be updated without
2106 * dirlock+nestlock wrlock held by the caller.
2107 *
2108 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2109 * data is pushed up the tree. this could be changed with some
2110 * restructuring here, but in its current form we ensure that the
2111 * fragstat+rstat _always_ reflect an accurate summation over the dir
2112 * frag, which is nice. and, we only need to track frags that need to
2113 * be nudged (and not inodes with pending rstat changes that need to
2114 * be pushed into the frag). a consequence of this is that the
2115 * accounted_rstat on scatterlock sync may not match our current
2116 * rstat. this is normal and expected.
2117 */
2118void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2119 CInode *in, CDir *parent,
2120 int flags, int linkunlink,
2121 snapid_t cfollows)
2122{
2123 bool primary_dn = flags & PREDIRTY_PRIMARY;
2124 bool do_parent_mtime = flags & PREDIRTY_DIR;
2125 bool shallow = flags & PREDIRTY_SHALLOW;
2126
11fdf7f2 2127 ceph_assert(mds->mdlog->entry_is_open());
7c673cae
FG
2128
2129 // make sure stamp is set
2130 if (mut->get_mds_stamp() == utime_t())
2131 mut->set_mds_stamp(ceph_clock_now());
2132
2133 if (in->is_base())
2134 return;
2135
2136 dout(10) << "predirty_journal_parents"
2137 << (do_parent_mtime ? " do_parent_mtime":"")
2138 << " linkunlink=" << linkunlink
2139 << (primary_dn ? " primary_dn":" remote_dn")
2140 << (shallow ? " SHALLOW":"")
2141 << " follows " << cfollows
2142 << " " << *in << dendl;
2143
2144 if (!parent) {
11fdf7f2 2145 ceph_assert(primary_dn);
7c673cae
FG
2146 parent = in->get_projected_parent_dn()->get_dir();
2147 }
2148
2149 if (flags == 0 && linkunlink == 0) {
2150 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2151 blob->add_dir_context(parent);
2152 return;
2153 }
2154
2155 // build list of inodes to wrlock, dirty, and update
2156 list<CInode*> lsi;
2157 CInode *cur = in;
2158 CDentry *parentdn = NULL;
2159 bool first = true;
2160 while (parent) {
2161 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
11fdf7f2 2162 ceph_assert(parent->is_auth());
7c673cae
FG
2163
2164 // opportunistically adjust parent dirfrag
2165 CInode *pin = parent->get_inode();
2166
2167 // inode -> dirfrag
2168 mut->auth_pin(parent);
7c673cae 2169
f67539c2 2170 auto pf = parent->project_fnode(mut);
7c673cae
FG
2171 pf->version = parent->pre_dirty();
2172
2173 if (do_parent_mtime || linkunlink) {
11fdf7f2
TL
2174 ceph_assert(mut->is_wrlocked(&pin->filelock));
2175 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2176 ceph_assert(cfollows == CEPH_NOSNAP);
7c673cae
FG
2177
2178 // update stale fragstat/rstat?
2179 parent->resync_accounted_fragstat();
2180 parent->resync_accounted_rstat();
2181
2182 if (do_parent_mtime) {
2183 pf->fragstat.mtime = mut->get_op_stamp();
2184 pf->fragstat.change_attr++;
2185 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2186 if (pf->fragstat.mtime > pf->rstat.rctime) {
2187 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2188 pf->rstat.rctime = pf->fragstat.mtime;
2189 } else {
2190 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2191 }
2192 }
2193 if (linkunlink) {
2194 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2195 if (in->is_dir()) {
2196 pf->fragstat.nsubdirs += linkunlink;
2197 //pf->rstat.rsubdirs += linkunlink;
2198 } else {
2199 pf->fragstat.nfiles += linkunlink;
2200 //pf->rstat.rfiles += linkunlink;
2201 }
2202 }
2203 }
2204
2205 // rstat
2206 if (!primary_dn) {
2207 // don't update parent this pass
2208 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2209 pin->versionlock.can_wrlock())) {
2210 dout(20) << " unwritable parent nestlock " << pin->nestlock
2211 << ", marking dirty rstat on " << *cur << dendl;
2212 cur->mark_dirty_rstat();
2213 } else {
2214 // if we don't hold a wrlock reference on this nestlock, take one,
2215 // because we are about to write into the dirfrag fnode and that needs
2216 // to commit before the lock can cycle.
2217 if (linkunlink) {
f67539c2 2218 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer());
7c673cae
FG
2219 }
2220
11fdf7f2 2221 if (!mut->is_wrlocked(&pin->nestlock)) {
7c673cae
FG
2222 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2223 mds->locker->wrlock_force(&pin->nestlock, mut);
2224 }
2225
2226 // now we can project the inode rstat diff into the dirfrag
2227 SnapRealm *prealm = pin->find_snaprealm();
2228
2229 snapid_t follows = cfollows;
2230 if (follows == CEPH_NOSNAP)
2231 follows = prealm->get_newest_seq();
2232
2233 snapid_t first = follows+1;
2234
2235 // first, if the frag is stale, bring it back in sync.
2236 parent->resync_accounted_rstat();
2237
2238 // now push inode rstats into frag
f67539c2 2239 project_rstat_inode_to_frag(mut, cur, parent, first, linkunlink, prealm);
7c673cae
FG
2240 cur->clear_dirty_rstat();
2241 }
2242
2243 bool stop = false;
2244 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2245 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2246 stop = true;
2247 }
2248
2249 // delay propagating until later?
2250 if (!stop && !first &&
11fdf7f2 2251 g_conf()->mds_dirstat_min_interval > 0) {
7c673cae 2252 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
11fdf7f2 2253 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
7c673cae 2254 dout(10) << "predirty_journal_parents last prop " << since_last_prop
11fdf7f2 2255 << " < " << g_conf()->mds_dirstat_min_interval
7c673cae
FG
2256 << ", stopping" << dendl;
2257 stop = true;
2258 } else {
2259 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2260 }
2261 }
2262
2263 // can cast only because i'm passing nowait=true in the sole user
7c673cae 2264 if (!stop &&
11fdf7f2 2265 !mut->is_wrlocked(&pin->nestlock) &&
7c673cae 2266 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
9f95a23c 2267 !mds->locker->wrlock_try(&pin->nestlock, mut)
7c673cae
FG
2268 )) { // ** do not initiate.. see above comment **
2269 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2270 << " on " << *pin << dendl;
2271 stop = true;
2272 }
2273 if (stop) {
2274 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2275 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2276 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2277 mut->add_updated_lock(&pin->nestlock);
2278 if (do_parent_mtime || linkunlink) {
2279 mds->locker->mark_updated_scatterlock(&pin->filelock);
2280 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2281 mut->add_updated_lock(&pin->filelock);
2282 }
2283 break;
2284 }
11fdf7f2 2285 if (!mut->is_wrlocked(&pin->versionlock))
7c673cae
FG
2286 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2287
f67539c2 2288 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer());
7c673cae
FG
2289
2290 pin->last_dirstat_prop = mut->get_mds_stamp();
2291
2292 // dirfrag -> diri
2293 mut->auth_pin(pin);
7c673cae
FG
2294 lsi.push_front(pin);
2295
2296 pin->pre_cow_old_inode(); // avoid cow mayhem!
2297
f67539c2
TL
2298 auto pi = pin->project_inode(mut);
2299 pi.inode->version = pin->pre_dirty();
7c673cae
FG
2300
2301 // dirstat
2302 if (do_parent_mtime || linkunlink) {
2303 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2304 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2305 bool touched_mtime = false, touched_chattr = false;
f67539c2 2306 pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2307 pf->accounted_fragstat = pf->fragstat;
2308 if (touched_mtime)
f67539c2 2309 pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime;
7c673cae 2310 if (touched_chattr)
f67539c2
TL
2311 pi.inode->change_attr = pi.inode->dirstat.change_attr;
2312 dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl;
7c673cae
FG
2313
2314 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
f67539c2 2315 if (pi.inode->dirstat.size() < 0)
11fdf7f2 2316 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
f67539c2 2317 if (pi.inode->dirstat.size() != pf->fragstat.size()) {
7c673cae 2318 mds->clog->error() << "unmatched fragstat size on single dirfrag "
f67539c2 2319 << parent->dirfrag() << ", inode has " << pi.inode->dirstat
7c673cae
FG
2320 << ", dirfrag has " << pf->fragstat;
2321
2322 // trust the dirfrag for now
f67539c2 2323 pi.inode->dirstat = pf->fragstat;
7c673cae 2324
11fdf7f2 2325 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
7c673cae
FG
2326 }
2327 }
2328 }
2329
7c673cae
FG
2330 // rstat
2331 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2332
2333 // first, if the frag is stale, bring it back in sync.
2334 parent->resync_accounted_rstat();
2335
11fdf7f2 2336 if (g_conf()->mds_snap_rstat) {
94b18763
FG
2337 for (auto &p : parent->dirty_old_rstat) {
2338 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2339 p.first, pin, true);
2340 }
7c673cae
FG
2341 }
2342 parent->dirty_old_rstat.clear();
2343 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2344
2345 pf->accounted_rstat = pf->rstat;
2346
2347 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
f67539c2 2348 if (pi.inode->rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2349 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
f67539c2 2350 << parent->dirfrag() << ", inode has " << pi.inode->rstat
7c673cae
FG
2351 << ", dirfrag has " << pf->rstat;
2352
2353 // trust the dirfrag for now
f67539c2 2354 pi.inode->rstat = pf->rstat;
7c673cae 2355
11fdf7f2 2356 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
7c673cae
FG
2357 }
2358 }
2359
2360 parent->check_rstats();
2361 broadcast_quota_to_client(pin);
f67539c2
TL
2362 if (pin->is_base())
2363 break;
7c673cae
FG
2364 // next parent!
2365 cur = pin;
f67539c2
TL
2366 parentdn = pin->get_projected_parent_dn();
2367 ceph_assert(parentdn);
7c673cae
FG
2368 parent = parentdn->get_dir();
2369 linkunlink = 0;
2370 do_parent_mtime = false;
2371 primary_dn = true;
2372 first = false;
2373 }
2374
2375 // now, stick it in the blob
11fdf7f2
TL
2376 ceph_assert(parent);
2377 ceph_assert(parent->is_auth());
7c673cae
FG
2378 blob->add_dir_context(parent);
2379 blob->add_dir(parent, true);
9f95a23c
TL
2380 for (const auto& in : lsi) {
2381 journal_dirty_inode(mut.get(), blob, in);
7c673cae
FG
2382 }
2383
2384}
2385
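// Illustrative sketch only (ToyLevel/propagate_up are hypothetical, not
// MDCache structures): the shape of the upward walk in
// predirty_journal_parents() above. At each ancestor, the child's unaccounted
// delta is folded into the dirfrag (project_rstat_inode_to_frag), the frag's
// unaccounted total is folded into the parent inode
// (project_rstat_frag_to_inode), and the walk stops early when the parent is
// not auth, cannot be locked, or was propagated to very recently.
#include <vector>

namespace toy_predirty_example {

struct ToyLevel {
  long frag_rstat = 0;        // like fnode_t::rstat
  long frag_accounted = 0;    // like fnode_t::accounted_rstat
  long inode_rstat = 0;       // like the parent CInode's rstat
  bool can_update = true;     // auth + wrlockable nestlock in the real code
};

// chain[0] is the changed inode's parent dirfrag/inode, chain[1] its parent, ...
inline void propagate_up(std::vector<ToyLevel>& chain, long delta)
{
  for (auto& lvl : chain) {
    if (!lvl.can_update)
      break;                                   // real code: mark scatterlock dirty, stop
    lvl.frag_rstat += delta;                   // inode -> dirfrag
    long unaccounted = lvl.frag_rstat - lvl.frag_accounted;
    lvl.inode_rstat += unaccounted;            // dirfrag -> inode
    lvl.frag_accounted = lvl.frag_rstat;
    delta = unaccounted;                       // carried up to the next ancestor
  }
}

} // namespace toy_predirty_example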
2386
2387
2388
2389
2390// ===================================
f67539c2 2391// peer requests
7c673cae
FG
2392
2393
2394/*
f67539c2
TL
2395 * some handlers for leader requests with peers. we need to make
2396 * sure the leader journals the commit before we forget we led them and
2397 * remove them from the uncommitted_leaders map (used during recovery
2398 * to commit|abort peers).
7c673cae 2399 */
f67539c2 2400struct C_MDC_CommittedLeader : public MDCacheLogContext {
7c673cae 2401 metareqid_t reqid;
f67539c2 2402 C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
7c673cae 2403 void finish(int r) override {
f67539c2 2404 mdcache->_logged_leader_commit(reqid);
7c673cae
FG
2405 }
2406};
2407
f67539c2 2408void MDCache::log_leader_commit(metareqid_t reqid)
7c673cae 2409{
f67539c2
TL
2410 dout(10) << "log_leader_commit " << reqid << dendl;
2411 uncommitted_leaders[reqid].committing = true;
7c673cae 2412 mds->mdlog->start_submit_entry(new ECommitted(reqid),
f67539c2 2413 new C_MDC_CommittedLeader(this, reqid));
7c673cae
FG
2414}
2415
f67539c2 2416void MDCache::_logged_leader_commit(metareqid_t reqid)
7c673cae 2417{
f67539c2
TL
2418 dout(10) << "_logged_leader_commit " << reqid << dendl;
2419 ceph_assert(uncommitted_leaders.count(reqid));
2420 uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2421 mds->queue_waiters(uncommitted_leaders[reqid].waiters);
2422 uncommitted_leaders.erase(reqid);
7c673cae
FG
2423}
2424
2425// while active...
2426
f67539c2 2427void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from)
7c673cae 2428{
f67539c2
TL
2429 dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl;
2430 ceph_assert(uncommitted_leaders.count(r));
2431 uncommitted_leaders[r].peers.erase(from);
2432 if (!uncommitted_leaders[r].recovering && uncommitted_leaders[r].peers.empty())
2433 log_leader_commit(r);
7c673cae
FG
2434}
2435
f67539c2 2436void MDCache::logged_leader_update(metareqid_t reqid)
7c673cae 2437{
f67539c2
TL
2438 dout(10) << "logged_leader_update " << reqid << dendl;
2439 ceph_assert(uncommitted_leaders.count(reqid));
2440 uncommitted_leaders[reqid].safe = true;
2441 auto p = pending_leaders.find(reqid);
2442 if (p != pending_leaders.end()) {
2443 pending_leaders.erase(p);
2444 if (pending_leaders.empty())
7c673cae
FG
2445 process_delayed_resolve();
2446 }
2447}
2448
2449/*
f67539c2
TL
2450 * Leader may crash after receiving all peers' commit acks, but before journalling
2451 * the final commit. Peers may crash after journalling the peer commit, but before
2452 * sending the commit ack to the leader. Commit leader updates with no uncommitted peers when
7c673cae
FG
2453 * resolve finishes.
2454 */
f67539c2 2455void MDCache::finish_committed_leaders()
7c673cae 2456{
f67539c2
TL
2457 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
2458 p != uncommitted_leaders.end();
7c673cae
FG
2459 ++p) {
2460 p->second.recovering = false;
f67539c2
TL
2461 if (!p->second.committing && p->second.peers.empty()) {
2462 dout(10) << "finish_committed_leaders " << p->first << dendl;
2463 log_leader_commit(p->first);
7c673cae
FG
2464 }
2465 }
2466}
2467
2468/*
f67539c2 2469 * at end of resolve... we must journal a commit|abort for all peer
7c673cae
FG
2470 * updates, before moving on.
2471 *
f67539c2
TL
2472 * this is so that the leader can safely journal ECommitted on ops it
2473 * led when it reaches up:active (all other recovering nodes must
7c673cae
FG
2474 * complete resolve before that happens).
2475 */
f67539c2 2476struct C_MDC_PeerCommit : public MDCacheLogContext {
7c673cae
FG
2477 mds_rank_t from;
2478 metareqid_t reqid;
f67539c2 2479 C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
7c673cae 2480 void finish(int r) override {
f67539c2 2481 mdcache->_logged_peer_commit(from, reqid);
7c673cae
FG
2482 }
2483};
2484
f67539c2 2485void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid)
7c673cae 2486{
f67539c2 2487 dout(10) << "_logged_peer_commit from mds." << from << " " << reqid << dendl;
7c673cae
FG
2488
2489 // send a message
f67539c2 2490 auto req = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED);
7c673cae
FG
2491 mds->send_message_mds(req, from);
2492}
2493
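// Illustrative sketch only (ToyLeaderEntry/peer_committed are hypothetical
// names, not the real uleader bookkeeping): the leader-side rule implemented
// by committed_leader_peer() / log_leader_commit() above. ECommitted is only
// journaled once every peer has acked and no commit is already in flight.
#include <cstdint>
#include <map>
#include <set>

namespace toy_leader_example {

struct ToyLeaderEntry {
  std::set<int> peers;        // ranks whose OP_COMMITTED ack is still pending
  bool committing = false;
  bool recovering = false;
};

struct ToyLeaderTable {
  std::map<uint64_t, ToyLeaderEntry> uncommitted;   // keyed by request id

  // a peer 'from' reported that it committed request 'reqid'
  void peer_committed(uint64_t reqid, int from) {
    auto it = uncommitted.find(reqid);
    if (it == uncommitted.end())
      return;
    it->second.peers.erase(from);
    if (!it->second.recovering && !it->second.committing &&
        it->second.peers.empty()) {
      it->second.committing = true;   // real code: journal ECommitted here,
                                      // then erase the entry once it is safe
    }
  }
};

} // namespace toy_leader_example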
2494
2495
2496
2497
2498
2499// ====================================================================
2500// import map, recovery
2501
2502void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2503 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2504{
2505 if (subtrees.count(oldparent)) {
2506 vector<dirfrag_t>& v = subtrees[oldparent];
2507 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2508 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2509 if (*it == df) {
2510 v.erase(it);
2511 break;
2512 }
2513 }
2514 if (subtrees.count(newparent)) {
2515 vector<dirfrag_t>& v = subtrees[newparent];
2516 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2517 v.push_back(df);
2518 }
2519}
2520
2521ESubtreeMap *MDCache::create_subtree_map()
2522{
2523 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2524 << num_subtrees_fullauth() << " fullauth"
2525 << dendl;
2526
2527 show_subtrees();
2528
2529 ESubtreeMap *le = new ESubtreeMap();
2530 mds->mdlog->_start_entry(le);
2531
2532 map<dirfrag_t, CDir*> dirs_to_add;
2533
2534 if (myin) {
2535 CDir* mydir = myin->get_dirfrag(frag_t());
2536 dirs_to_add[mydir->dirfrag()] = mydir;
2537 }
2538
2539 // include all auth subtrees, and their bounds.
2540 // and a spanning tree to tie it to the root.
f6b5b4d7 2541 for (auto& [dir, bounds] : subtrees) {
7c673cae
FG
2542 // journal subtree as "ours" if we are
2543 // me, -2
2544 // me, me
2545 // me, !me (may be importing and ambiguous!)
2546
2547 // so not
2548 // !me, *
2549 if (dir->get_dir_auth().first != mds->get_nodeid())
2550 continue;
2551
2552 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2553 my_ambiguous_imports.count(dir->dirfrag())) {
2554 dout(15) << " ambig subtree " << *dir << dendl;
2555 le->ambiguous_subtrees.insert(dir->dirfrag());
2556 } else {
f6b5b4d7 2557 dout(15) << " auth subtree " << *dir << dendl;
7c673cae
FG
2558 }
2559
2560 dirs_to_add[dir->dirfrag()] = dir;
2561 le->subtrees[dir->dirfrag()].clear();
2562
7c673cae 2563 // bounds
f6b5b4d7
TL
2564 size_t nbounds = bounds.size();
2565 if (nbounds > 3) {
2566 dout(15) << " subtree has " << nbounds << " bounds" << dendl;
2567 }
2568 for (auto& bound : bounds) {
2569 if (nbounds <= 3) {
2570 dout(15) << " subtree bound " << *bound << dendl;
2571 }
7c673cae
FG
2572 dirs_to_add[bound->dirfrag()] = bound;
2573 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2574 }
2575 }
2576
2577 // apply projected renames
9f95a23c
TL
2578 for (const auto& [diri, renames] : projected_subtree_renames) {
2579 for (const auto& [olddir, newdir] : renames) {
f6b5b4d7 2580 dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
7c673cae 2581
9f95a23c
TL
2582 auto&& dfls = diri->get_dirfrags();
2583 for (const auto& dir : dfls) {
f6b5b4d7 2584 dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
7c673cae 2585 CDir *oldparent = get_projected_subtree_root(olddir);
f6b5b4d7 2586 dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
7c673cae 2587 CDir *newparent = get_projected_subtree_root(newdir);
f6b5b4d7 2588 dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
7c673cae
FG
2589
2590 if (oldparent == newparent) {
f6b5b4d7 2591 dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
7c673cae
FG
2592 << oldparent->dirfrag() << dendl;
2593 continue;
2594 }
2595
2596 if (dir->is_subtree_root()) {
2597 if (le->subtrees.count(newparent->dirfrag()) &&
2598 oldparent->get_dir_auth() != newparent->get_dir_auth())
2599 dirs_to_add[dir->dirfrag()] = dir;
2600 // children are fine. change parent.
2601 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2602 le->subtrees);
2603 } else {
2604 // mid-subtree.
2605
2606 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2607 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2608 // if oldparent is auth, subtree is mine; include it.
2609 if (le->subtrees.count(oldparent->dirfrag())) {
2610 dirs_to_add[dir->dirfrag()] = dir;
2611 le->subtrees[dir->dirfrag()].clear();
2612 }
2613 // if newparent is auth, subtree is a new bound
2614 if (le->subtrees.count(newparent->dirfrag())) {
2615 dirs_to_add[dir->dirfrag()] = dir;
2616 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2617 }
2618 newparent = dir;
2619 }
2620
2621 // see if any old bounds move to the new parent.
f6b5b4d7 2622 for (auto& bound : subtrees.at(oldparent)) {
7c673cae
FG
2623 if (dir->contains(bound->get_parent_dir()))
2624 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2625 le->subtrees);
2626 }
2627 }
2628 }
2629 }
2630 }
2631
2632 // simplify the journaled map. our in memory map may have more
2633 // subtrees than needed due to migrations that are just getting
2634 // started or just completing. but on replay, the "live" map will
2635 // be simple and we can do a straight comparison.
f6b5b4d7
TL
2636 for (auto& [frag, bfrags] : le->subtrees) {
2637 if (le->ambiguous_subtrees.count(frag))
7c673cae
FG
2638 continue;
2639 unsigned i = 0;
f6b5b4d7
TL
2640 while (i < bfrags.size()) {
2641 dirfrag_t b = bfrags[i];
7c673cae
FG
2642 if (le->subtrees.count(b) &&
2643 le->ambiguous_subtrees.count(b) == 0) {
f6b5b4d7
TL
2644 auto& bb = le->subtrees.at(b);
2645 dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
2646 for (auto& r : bb) {
2647 bfrags.push_back(r);
2648 }
7c673cae
FG
2649 dirs_to_add.erase(b);
2650 le->subtrees.erase(b);
f6b5b4d7 2651 bfrags.erase(bfrags.begin() + i);
7c673cae
FG
2652 } else {
2653 ++i;
2654 }
2655 }
2656 }
2657
94b18763 2658 for (auto &p : dirs_to_add) {
7c673cae
FG
2659 CDir *dir = p.second;
2660 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2661 le->metablob.add_dir(dir, false);
2662 }
2663
2664 dout(15) << " subtrees " << le->subtrees << dendl;
2665 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2666
2667 //le->metablob.print(cout);
2668 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2669 return le;
2670}
2671
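// Illustrative sketch only (plain ints instead of dirfrag_t; simplify_claims
// is a hypothetical name): the "simplify" pass used by create_subtree_map()
// above (and again in send_subtree_resolves()). If one claimed subtree shows
// up as a bound of another claim, the outer claim swallows it and inherits
// its bounds, so the journaled map stays minimal.
#include <map>
#include <vector>

namespace toy_subtree_example {

inline void simplify_claims(std::map<int, std::vector<int>>& claims)
{
  for (auto& [root, bounds] : claims) {
    for (size_t i = 0; i < bounds.size(); ) {
      auto it = claims.find(bounds[i]);
      if (it != claims.end() && it->first != root) {
        // swallow the nested claim: keep its bounds, drop its entry
        bounds.insert(bounds.end(), it->second.begin(), it->second.end());
        claims.erase(it);
        bounds.erase(bounds.begin() + i);
      } else {
        ++i;
      }
    }
  }
}

} // namespace toy_subtree_example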
2672void MDCache::dump_resolve_status(Formatter *f) const
2673{
2674 f->open_object_section("resolve_status");
2675 f->dump_stream("resolve_gather") << resolve_gather;
2676 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2677 f->close_section();
2678}
2679
11fdf7f2 2680void MDCache::resolve_start(MDSContext *resolve_done_)
7c673cae
FG
2681{
2682 dout(10) << "resolve_start" << dendl;
11fdf7f2 2683 ceph_assert(!resolve_done);
7c673cae
FG
2684 resolve_done.reset(resolve_done_);
2685
2686 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2687 // if we don't have the root dir, adjust it to UNKNOWN. during
2688 // resolve we want mds0 to explicitly claim the portion of it that
2689 // it owns, so that anything beyond its bounds get left as
2690 // unknown.
2691 CDir *rootdir = root->get_dirfrag(frag_t());
2692 if (rootdir)
2693 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2694 }
2695 resolve_gather = recovery_set;
11fdf7f2
TL
2696
2697 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
7c673cae
FG
2698}
2699
2700void MDCache::send_resolves()
2701{
f67539c2 2702 send_peer_resolves();
11fdf7f2
TL
2703
2704 if (!resolve_done) {
2705 // I'm survivor: refresh snap cache
2706 mds->snapclient->sync(
2707 new MDSInternalContextWrapper(mds,
9f95a23c 2708 new LambdaContext([this](int r) {
f67539c2 2709 maybe_finish_peer_resolve();
11fdf7f2
TL
2710 })
2711 )
2712 );
2713 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2714 return;
2715 }
7c673cae
FG
2716 if (!resolve_ack_gather.empty()) {
2717 dout(10) << "send_resolves still waiting for resolve ack from ("
2718 << resolve_ack_gather << ")" << dendl;
2719 return;
2720 }
11fdf7f2 2721 if (!resolve_need_rollback.empty()) {
7c673cae 2722 dout(10) << "send_resolves still waiting for rollback to commit on ("
11fdf7f2 2723 << resolve_need_rollback << ")" << dendl;
7c673cae
FG
2724 return;
2725 }
11fdf7f2 2726
7c673cae
FG
2727 send_subtree_resolves();
2728}
2729
f67539c2 2730void MDCache::send_peer_resolves()
7c673cae 2731{
f67539c2 2732 dout(10) << "send_peer_resolves" << dendl;
7c673cae 2733
9f95a23c 2734 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
7c673cae
FG
2735
2736 if (mds->is_resolve()) {
f67539c2
TL
2737 for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
2738 p != uncommitted_peers.end();
7c673cae 2739 ++p) {
f67539c2
TL
2740 mds_rank_t leader = p->second.leader;
2741 auto &m = resolves[leader];
e306af50 2742 if (!m) m = make_message<MMDSResolve>();
f67539c2 2743 m->add_peer_request(p->first, false);
7c673cae
FG
2744 }
2745 } else {
2746 set<mds_rank_t> resolve_set;
2747 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2748 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2749 p != active_requests.end();
2750 ++p) {
2751 MDRequestRef& mdr = p->second;
f67539c2 2752 if (!mdr->is_peer())
7c673cae 2753 continue;
f67539c2 2754 if (!mdr->peer_did_prepare() && !mdr->committing) {
7c673cae
FG
2755 continue;
2756 }
f67539c2
TL
2757 mds_rank_t leader = mdr->peer_to_mds;
2758 if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) {
7c673cae 2759 dout(10) << " including uncommitted " << *mdr << dendl;
f67539c2
TL
2760 if (!resolves.count(leader))
2761 resolves[leader] = make_message<MMDSResolve>();
7c673cae
FG
2762 if (!mdr->committing &&
2763 mdr->has_more() && mdr->more()->is_inode_exporter) {
2764 // re-send cap exports
2765 CInode *in = mdr->more()->rename_inode;
2766 map<client_t, Capability::Export> cap_map;
2767 in->export_client_caps(cap_map);
2768 bufferlist bl;
f67539c2 2769 MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
9f95a23c 2770 encode(inode_caps, bl);
f67539c2 2771 resolves[leader]->add_peer_request(p->first, bl);
7c673cae 2772 } else {
f67539c2 2773 resolves[leader]->add_peer_request(p->first, mdr->committing);
7c673cae
FG
2774 }
2775 }
2776 }
2777 }
2778
11fdf7f2 2779 for (auto &p : resolves) {
f67539c2 2780 dout(10) << "sending peer resolve to mds." << p.first << dendl;
11fdf7f2
TL
2781 mds->send_message_mds(p.second, p.first);
2782 resolve_ack_gather.insert(p.first);
7c673cae
FG
2783 }
2784}
2785
2786void MDCache::send_subtree_resolves()
2787{
2788 dout(10) << "send_subtree_resolves" << dendl;
2789
2790 if (migrator->is_exporting() || migrator->is_importing()) {
2791 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2792 migrator->show_importing();
2793 migrator->show_exporting();
2794 resolves_pending = true;
2795 return; // not now
2796 }
2797
9f95a23c 2798 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
7c673cae
FG
2799 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2800 p != recovery_set.end();
2801 ++p) {
2802 if (*p == mds->get_nodeid())
2803 continue;
2804 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
9f95a23c 2805 resolves[*p] = make_message<MMDSResolve>();
7c673cae
FG
2806 }
2807
2808 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2809 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2810
2811 // known
2812 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2813 p != subtrees.end();
2814 ++p) {
2815 CDir *dir = p->first;
2816
2817 // only our subtrees
2818 if (dir->authority().first != mds->get_nodeid())
2819 continue;
2820
2821 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2822 continue; // we'll add it below
2823
2824 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2825 // ambiguous (mid-import)
2826 set<CDir*> bounds;
2827 get_subtree_bounds(dir, bounds);
2828 vector<dirfrag_t> dfls;
2829 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2830 dfls.push_back((*q)->dirfrag());
2831
2832 my_ambig_imports[dir->dirfrag()] = dfls;
2833 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2834 } else {
2835 // not ambiguous.
11fdf7f2
TL
2836 for (auto &q : resolves) {
2837 resolves[q.first]->add_subtree(dir->dirfrag());
2838 }
7c673cae
FG
2839 // bounds too
2840 vector<dirfrag_t> dfls;
2841 for (set<CDir*>::iterator q = subtrees[dir].begin();
2842 q != subtrees[dir].end();
2843 ++q) {
2844 CDir *bound = *q;
2845 dfls.push_back(bound->dirfrag());
2846 }
2847
2848 my_subtrees[dir->dirfrag()] = dfls;
2849 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2850 }
2851 }
2852
2853 // ambiguous
2854 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2855 p != my_ambiguous_imports.end();
2856 ++p) {
2857 my_ambig_imports[p->first] = p->second;
2858 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2859 }
2860
2861 // simplify the claimed subtree.
2862 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2863 unsigned i = 0;
2864 while (i < p->second.size()) {
2865 dirfrag_t b = p->second[i];
2866 if (my_subtrees.count(b)) {
2867 vector<dirfrag_t>& bb = my_subtrees[b];
2868 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2869 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2870 p->second.push_back(*r);
2871 my_subtrees.erase(b);
2872 p->second.erase(p->second.begin() + i);
2873 } else {
2874 ++i;
2875 }
2876 }
2877 }
2878
2879 // send
11fdf7f2 2880 for (auto &p : resolves) {
9f95a23c 2881 const ref_t<MMDSResolve> &m = p.second;
11fdf7f2
TL
2882 if (mds->is_resolve()) {
2883 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2884 } else {
2885 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2886 }
7c673cae
FG
2887 m->subtrees = my_subtrees;
2888 m->ambiguous_imports = my_ambig_imports;
11fdf7f2
TL
2889 dout(10) << "sending subtree resolve to mds." << p.first << dendl;
2890 mds->send_message_mds(m, p.first);
7c673cae
FG
2891 }
2892 resolves_pending = false;
2893}
2894
f67539c2 2895void MDCache::maybe_finish_peer_resolve() {
11fdf7f2
TL
2896 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2897 // snap cache has synced, or I'm in resolve state
2898 if (mds->snapclient->is_synced() || resolve_done)
2899 send_subtree_resolves();
2900 process_delayed_resolve();
2901 }
2902}
2903
7c673cae
FG
2904void MDCache::handle_mds_failure(mds_rank_t who)
2905{
2906 dout(7) << "handle_mds_failure mds." << who << dendl;
2907
2908 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2909
2910 resolve_gather.insert(who);
2911 discard_delayed_resolve(who);
f67539c2 2912 ambiguous_peer_updates.erase(who);
7c673cae
FG
2913
2914 rejoin_gather.insert(who);
2915 rejoin_sent.erase(who); // i need to send another
31f18b77 2916 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2917 rejoin_ack_gather.erase(who); // i'll need/get another.
2918
2919 dout(10) << " resolve_gather " << resolve_gather << dendl;
2920 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2921 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2922 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2923 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2924
2925
2926 // tell the migrator too.
2927 migrator->handle_mds_failure_or_stop(who);
2928
224ce89b
WB
2929 // tell the balancer too.
2930 mds->balancer->handle_mds_failure(who);
2931
f67539c2 2932 // clean up any requests peer to/from this node
7c673cae
FG
2933 list<MDRequestRef> finish;
2934 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2935 p != active_requests.end();
2936 ++p) {
2937 MDRequestRef& mdr = p->second;
f67539c2
TL
2938 // peer to the failed node?
2939 if (mdr->peer_to_mds == who) {
2940 if (mdr->peer_did_prepare()) {
2941 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2942 if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
2943 remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);
2944
2945 if (!mdr->more()->waiting_on_peer.empty()) {
11fdf7f2 2946 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae 2947 // will rollback, no need to wait
f67539c2
TL
2948 mdr->reset_peer_request();
2949 mdr->more()->waiting_on_peer.clear();
7c673cae
FG
2950 }
2951 } else if (!mdr->committing) {
f67539c2
TL
2952 dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
2953 if (mdr->peer_request || mdr->peer_rolling_back())
7c673cae
FG
2954 mdr->aborted = true;
2955 else
2956 finish.push_back(mdr);
2957 }
2958 }
2959
f67539c2
TL
2960 if (mdr->is_peer() && mdr->peer_did_prepare()) {
2961 if (mdr->more()->waiting_on_peer.count(who)) {
11fdf7f2 2962 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
f67539c2 2963 dout(10) << " peer request " << *mdr << " no longer needs rename notify ack from mds."
7c673cae 2964 << who << dendl;
f67539c2
TL
2965 mdr->more()->waiting_on_peer.erase(who);
2966 if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
7c673cae
FG
2967 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2968 }
2969
2970 if (mdr->more()->srcdn_auth_mds == who &&
f67539c2 2971 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
7c673cae 2972 // rename srcdn's auth mds failed, resolve even I'm a survivor.
f67539c2
TL
2973 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2974 add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
7c673cae 2975 }
f67539c2
TL
2976 } else if (mdr->peer_request) {
2977 const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
2978 // FIXME: Peer rename request can arrive after we notice mds failure.
31f18b77 2979 // This can cause mds to crash (does not affect integrity of FS).
f67539c2
TL
2980 if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
2981 peer_req->srcdn_auth == who)
2982 peer_req->mark_interrupted();
7c673cae
FG
2983 }
2984
f67539c2
TL
2985 // failed node is peer?
2986 if (mdr->is_leader() && !mdr->committing) {
7c673cae 2987 if (mdr->more()->srcdn_auth_mds == who) {
f67539c2 2988 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
7c673cae 2989 << who << " to recover" << dendl;
11fdf7f2 2990 ceph_assert(mdr->more()->witnessed.count(who) == 0);
7c673cae
FG
2991 if (mdr->more()->is_ambiguous_auth)
2992 mdr->clear_ambiguous_auth();
2993 // rename srcdn's auth mds failed, all witnesses will rollback
2994 mdr->more()->witnessed.clear();
f67539c2 2995 pending_leaders.erase(p->first);
7c673cae
FG
2996 }
2997
2998 if (mdr->more()->witnessed.count(who)) {
2999 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
f67539c2
TL
3000 if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
3001 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
7c673cae 3002 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
f67539c2
TL
3003 // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
3004 // until either the request is committing or the peer also fails.
3005 ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
3006 pending_leaders.insert(p->first);
7c673cae 3007 } else {
f67539c2 3008 dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
7c673cae
FG
3009 << who << " to recover" << dendl;
3010 if (srcdn_auth >= 0)
11fdf7f2 3011 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
7c673cae
FG
3012
3013 // discard this peer's prepare (if any)
3014 mdr->more()->witnessed.erase(who);
3015 }
3016 }
3017
f67539c2
TL
3018 if (mdr->more()->waiting_on_peer.count(who)) {
3019 dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
7c673cae
FG
3020 << " to recover" << dendl;
3021 // retry request when peer recovers
f67539c2
TL
3022 mdr->more()->waiting_on_peer.erase(who);
3023 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
3024 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3025 }
3026
3027 if (mdr->locking && mdr->locking_target_mds == who)
3028 mdr->finish_locking(mdr->locking);
3029 }
3030 }
3031
f67539c2
TL
3032 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
3033 p != uncommitted_leaders.end();
7c673cae 3034 ++p) {
f67539c2
TL
3035 // The failed MDS may have already committed the peer update
3036 if (p->second.peers.count(who)) {
7c673cae 3037 p->second.recovering = true;
f67539c2 3038 p->second.peers.erase(who);
7c673cae
FG
3039 }
3040 }
3041
3042 while (!finish.empty()) {
f67539c2 3043 dout(10) << "cleaning up peer request " << *finish.front() << dendl;
7c673cae
FG
3044 request_finish(finish.front());
3045 finish.pop_front();
3046 }
3047
3048 kick_find_ino_peers(who);
3049 kick_open_ino_peers(who);
3050
3051 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3052 p != fragments.end(); ) {
3053 dirfrag_t df = p->first;
3054 fragment_info_t& info = p->second;
a8e16298
TL
3055
3056 if (info.is_fragmenting()) {
3057 if (info.notify_ack_waiting.erase(who) &&
3058 info.notify_ack_waiting.empty()) {
3059 fragment_drop_locks(info);
3060 fragment_maybe_finish(p++);
3061 } else {
3062 ++p;
3063 }
7c673cae 3064 continue;
a8e16298
TL
3065 }
3066
3067 ++p;
7c673cae 3068 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
9f95a23c 3069 std::vector<CDir*> dirs;
7c673cae
FG
3070 info.dirs.swap(dirs);
3071 fragments.erase(df);
3072 fragment_unmark_unfreeze_dirs(dirs);
3073 }
3074
3075 // MDCache::shutdown_export_strays() always exports strays to mds.0
3076 if (who == mds_rank_t(0))
f64942e4 3077 shutdown_exporting_strays.clear();
7c673cae
FG
3078
3079 show_subtrees();
3080}
3081
3082/*
3083 * handle_mds_recovery - called on another node's transition
3084 * from resolve -> active.
3085 */
3086void MDCache::handle_mds_recovery(mds_rank_t who)
3087{
3088 dout(7) << "handle_mds_recovery mds." << who << dendl;
3089
3090 // exclude all discover waiters. kick_discovers() will do the job
3091 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3092 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3093
11fdf7f2 3094 MDSContext::vec waiters;
7c673cae
FG
3095
3096 // wake up any waiters in their subtrees
3097 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3098 p != subtrees.end();
3099 ++p) {
3100 CDir *dir = p->first;
3101
3102 if (dir->authority().first != who ||
3103 dir->authority().second == mds->get_nodeid())
3104 continue;
11fdf7f2 3105 ceph_assert(!dir->is_auth());
7c673cae
FG
3106
3107 // wake any waiters
9f95a23c
TL
3108 std::queue<CDir*> q;
3109 q.push(dir);
7c673cae
FG
3110
3111 while (!q.empty()) {
3112 CDir *d = q.front();
9f95a23c 3113 q.pop();
7c673cae
FG
3114 d->take_waiting(d_mask, waiters);
3115
3116 // inode waiters too
94b18763
FG
3117 for (auto &p : d->items) {
3118 CDentry *dn = p.second;
7c673cae
FG
3119 CDentry::linkage_t *dnl = dn->get_linkage();
3120 if (dnl->is_primary()) {
3121 dnl->get_inode()->take_waiting(i_mask, waiters);
3122
3123 // recurse?
9f95a23c
TL
3124 auto&& ls = dnl->get_inode()->get_dirfrags();
3125 for (const auto& subdir : ls) {
7c673cae 3126 if (!subdir->is_subtree_root())
9f95a23c 3127 q.push(subdir);
7c673cae
FG
3128 }
3129 }
3130 }
3131 }
3132 }
3133
3134 kick_open_ino_peers(who);
3135 kick_find_ino_peers(who);
3136
3137 // queue them up.
3138 mds->queue_waiters(waiters);
3139}
3140
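// Illustrative sketch only (ToyDir/collect_waiters are hypothetical names):
// the breadth-first walk in handle_mds_recovery() above, which gathers
// waiters from every dirfrag nested under a recovered peer's subtree while
// refusing to cross into other subtree roots.
#include <queue>
#include <vector>

namespace toy_recovery_example {

struct ToyDir {
  std::vector<ToyDir*> children;   // nested dirfrags (via primary dentries)
  bool subtree_root = false;
  std::vector<int> waiters;        // stands in for the MDSContext waiters
};

inline void collect_waiters(ToyDir* root, std::vector<int>& out)
{
  std::queue<ToyDir*> q;
  q.push(root);
  while (!q.empty()) {
    ToyDir* d = q.front();
    q.pop();
    out.insert(out.end(), d->waiters.begin(), d->waiters.end());
    d->waiters.clear();
    for (ToyDir* child : d->children) {
      if (!child->subtree_root)    // stop at nested subtree boundaries
        q.push(child);
    }
  }
}

} // namespace toy_recovery_example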
3141void MDCache::set_recovery_set(set<mds_rank_t>& s)
3142{
3143 dout(7) << "set_recovery_set " << s << dendl;
3144 recovery_set = s;
3145}
3146
3147
3148/*
3149 * during resolve state, we share resolves to determine who
3150 * is authoritative for which trees. we expect to get an resolve
3151 * from _everyone_ in the recovery_set (the mds cluster at the time of
3152 * the first failure).
3153 *
3154 * This functions puts the passed message before returning
3155 */
9f95a23c 3156void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
7c673cae
FG
3157{
3158 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3159 mds_rank_t from = mds_rank_t(m->get_source().num());
3160
3161 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3162 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3163 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3164 return;
3165 }
3166 // wait until we reach the resolve stage!
7c673cae
FG
3167 return;
3168 }
3169
3170 discard_delayed_resolve(from);
3171
f67539c2
TL
3172 // ambiguous peer requests?
3173 if (!m->peer_requests.empty()) {
7c673cae 3174 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
f67539c2
TL
3175 for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) {
3176 if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) {
11fdf7f2 3177 ceph_assert(!p->second.committing);
f67539c2 3178 pending_leaders.insert(p->first);
7c673cae
FG
3179 }
3180 }
3181
f67539c2
TL
3182 if (!pending_leaders.empty()) {
3183 dout(10) << " still have pending updates, delay processing peer resolve" << dendl;
7c673cae
FG
3184 delayed_resolve[from] = m;
3185 return;
3186 }
3187 }
3188
9f95a23c 3189 auto ack = make_message<MMDSResolveAck>();
f67539c2
TL
3190 for (const auto &p : m->peer_requests) {
3191 if (uncommitted_leaders.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
7c673cae 3192 // COMMIT
11fdf7f2 3193 if (p.second.committing) {
f67539c2
TL
3194 // already committing, waiting for the OP_COMMITTED peer reply
3195 dout(10) << " already committing peer request " << p << " noop "<< dendl;
7c673cae 3196 } else {
f67539c2 3197 dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl;
11fdf7f2 3198 ack->add_commit(p.first);
7c673cae 3199 }
f67539c2 3200 uncommitted_leaders[p.first].peers.insert(from); // wait for peer OP_COMMITTED before we log ECommitted
7c673cae 3201
11fdf7f2 3202 if (p.second.inode_caps.length() > 0) {
f67539c2 3203 // peer wants to export caps (rename)
11fdf7f2 3204 ceph_assert(mds->is_resolve());
f67539c2 3205 MMDSResolve::peer_inode_cap inode_caps;
11fdf7f2 3206 auto q = p.second.inode_caps.cbegin();
9f95a23c
TL
3207 decode(inode_caps, q);
3208 inodeno_t ino = inode_caps.ino;
3209 map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
11fdf7f2 3210 ceph_assert(get_inode(ino));
7c673cae
FG
3211
3212 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3213 q != cap_exports.end();
3214 ++q) {
3215 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3216 im.cap_id = ++last_cap_id; // assign a new cap ID
3217 im.issue_seq = 1;
3218 im.mseq = q->second.mseq;
28e407b8
AA
3219
3220 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3221 if (session)
3222 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3223 }
3224
3225 // will process these caps in rejoin stage
f67539c2
TL
3226 rejoin_peer_exports[ino].first = from;
3227 rejoin_peer_exports[ino].second.swap(cap_exports);
7c673cae 3228
f67539c2 3229 // send information of imported caps back to peer
11fdf7f2 3230 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
7c673cae
FG
3231 }
3232 } else {
3233 // ABORT
f67539c2 3234 dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl;
11fdf7f2
TL
3235 ceph_assert(!p.second.committing);
3236 ack->add_abort(p.first);
7c673cae
FG
3237 }
3238 }
3239 mds->send_message(ack, m->get_connection());
7c673cae
FG
3240 return;
3241 }
3242
11fdf7f2 3243 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
7c673cae
FG
3244 dout(10) << "delay processing subtree resolve" << dendl;
3245 delayed_resolve[from] = m;
3246 return;
3247 }
3248
3249 bool survivor = false;
3250 // am i a surviving ambiguous importer?
3251 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3252 survivor = true;
3253 // check for any import success/failure (from this node)
3254 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3255 while (p != my_ambiguous_imports.end()) {
3256 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3257 ++next;
3258 CDir *dir = get_dirfrag(p->first);
11fdf7f2 3259 ceph_assert(dir);
7c673cae
FG
3260 dout(10) << "checking ambiguous import " << *dir << dendl;
3261 if (migrator->is_importing(dir->dirfrag()) &&
3262 migrator->get_import_peer(dir->dirfrag()) == from) {
11fdf7f2 3263 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
7c673cae
FG
3264
3265 // check if sender claims the subtree
3266 bool claimed_by_sender = false;
11fdf7f2 3267 for (const auto &q : m->subtrees) {
7c673cae 3268 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
11fdf7f2 3269 CDir *base = get_force_dirfrag(q.first, false);
7c673cae
FG
3270 if (!base || !base->contains(dir))
3271 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3272
3273 bool inside = true;
3274 set<CDir*> bounds;
11fdf7f2 3275 get_force_dirfrag_bound_set(q.second, bounds);
7c673cae
FG
3276 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3277 CDir *bound = *p;
3278 if (bound->contains(dir)) {
3279 inside = false; // nope, bound is dir or parent of dir, not inside.
3280 break;
3281 }
3282 }
3283 if (inside)
3284 claimed_by_sender = true;
3285 }
3286
3287 my_ambiguous_imports.erase(p); // no longer ambiguous.
3288 if (claimed_by_sender) {
3289 dout(7) << "ambiguous import failed on " << *dir << dendl;
3290 migrator->import_reverse(dir);
3291 } else {
3292 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3293 migrator->import_finish(dir, true);
3294 }
3295 }
3296 p = next;
3297 }
3298 }
3299
3300 // update my dir_auth values
3301 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3302 // migrations between other nodes)
11fdf7f2
TL
3303 for (const auto& p : m->subtrees) {
3304 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3305 CDir *dir = get_force_dirfrag(p.first, !survivor);
7c673cae
FG
3306 if (!dir)
3307 continue;
11fdf7f2 3308 adjust_bounded_subtree_auth(dir, p.second, from);
7c673cae
FG
3309 try_subtree_merge(dir);
3310 }
3311
3312 show_subtrees();
3313
3314 // note ambiguous imports too
11fdf7f2
TL
3315 for (const auto& p : m->ambiguous_imports) {
3316 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3317 other_ambiguous_imports[from][p.first] = p.second;
3318 }
3319
3320 // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
3321 // snaptable cache from snapserver. this way the snaptable cache gets synced among all mds
3322 for (const auto& p : m->table_clients) {
3323 dout(10) << " noting " << get_mdstable_name(p.type)
3324 << " pending_commits " << p.pending_commits << dendl;
3325 MDSTableClient *client = mds->get_table_client(p.type);
3326 for (const auto& q : p.pending_commits)
3327 client->notify_commit(q);
7c673cae
FG
3328 }
3329
3330 // did i get them all?
3331 resolve_gather.erase(from);
3332
3333 maybe_resolve_finish();
7c673cae
FG
3334}
3335
3336void MDCache::process_delayed_resolve()
3337{
3338 dout(10) << "process_delayed_resolve" << dendl;
9f95a23c 3339 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
7c673cae 3340 tmp.swap(delayed_resolve);
11fdf7f2
TL
3341 for (auto &p : tmp) {
3342 handle_resolve(p.second);
3343 }
7c673cae
FG
3344}
3345
3346void MDCache::discard_delayed_resolve(mds_rank_t who)
3347{
11fdf7f2 3348 delayed_resolve.erase(who);
7c673cae
FG
3349}
3350
3351void MDCache::maybe_resolve_finish()
3352{
11fdf7f2
TL
3353 ceph_assert(resolve_ack_gather.empty());
3354 ceph_assert(resolve_need_rollback.empty());
7c673cae
FG
3355
3356 if (!resolve_gather.empty()) {
3357 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3358 << resolve_gather << ")" << dendl;
3359 return;
3360 }
3361
3362 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3363 disambiguate_my_imports();
f67539c2 3364 finish_committed_leaders();
7c673cae
FG
3365
3366 if (resolve_done) {
11fdf7f2 3367 ceph_assert(mds->is_resolve());
7c673cae
FG
3368 trim_unlinked_inodes();
3369 recalc_auth_bits(false);
3370 resolve_done.release()->complete(0);
3371 } else {
11fdf7f2 3372 // I am survivor.
7c673cae
FG
3373 maybe_send_pending_rejoins();
3374 }
3375}
3376
9f95a23c 3377void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
7c673cae
FG
3378{
3379 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3380 mds_rank_t from = mds_rank_t(ack->get_source().num());
3381
3382 if (!resolve_ack_gather.count(from) ||
3383 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
7c673cae
FG
3384 return;
3385 }
3386
f67539c2 3387 if (ambiguous_peer_updates.count(from)) {
11fdf7f2
TL
3388 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3389 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
7c673cae
FG
3390 }
3391
11fdf7f2 3392 for (const auto &p : ack->commit) {
f67539c2 3393 dout(10) << " commit on peer " << p.first << dendl;
7c673cae 3394
f67539c2
TL
3395 if (ambiguous_peer_updates.count(from)) {
3396 remove_ambiguous_peer_update(p.first, from);
7c673cae
FG
3397 continue;
3398 }
3399
3400 if (mds->is_resolve()) {
3401 // replay
f67539c2 3402 MDPeerUpdate *su = get_uncommitted_peer(p.first, from);
11fdf7f2 3403 ceph_assert(su);
7c673cae
FG
3404
3405 // log commit
f67539c2
TL
3406 mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from,
3407 EPeerUpdate::OP_COMMIT, su->origop),
3408 new C_MDC_PeerCommit(this, from, p.first));
7c673cae
FG
3409 mds->mdlog->flush();
3410
f67539c2 3411 finish_uncommitted_peer(p.first);
7c673cae 3412 } else {
11fdf7f2 3413 MDRequestRef mdr = request_get(p.first);
f67539c2 3414 // information about leader imported caps
11fdf7f2
TL
3415 if (p.second.length() > 0)
3416 mdr->more()->inode_import.share(p.second);
7c673cae 3417
f67539c2 3418 ceph_assert(mdr->peer_request == 0); // shouldn't be doing anything!
7c673cae
FG
3419 request_finish(mdr);
3420 }
3421 }
3422
11fdf7f2 3423 for (const auto &metareq : ack->abort) {
f67539c2 3424 dout(10) << " abort on peer " << metareq << dendl;
7c673cae
FG
3425
3426 if (mds->is_resolve()) {
f67539c2 3427 MDPeerUpdate *su = get_uncommitted_peer(metareq, from);
11fdf7f2 3428 ceph_assert(su);
7c673cae
FG
3429
3430 // perform rollback (and journal a rollback entry)
3431 // note: this will hold up the resolve a bit, until the rollback entries are journaled.
3432 MDRequestRef null_ref;
3433 switch (su->origop) {
f67539c2 3434 case EPeerUpdate::LINK:
7c673cae
FG
3435 mds->server->do_link_rollback(su->rollback, from, null_ref);
3436 break;
f67539c2 3437 case EPeerUpdate::RENAME:
7c673cae
FG
3438 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3439 break;
f67539c2 3440 case EPeerUpdate::RMDIR:
7c673cae
FG
3441 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3442 break;
3443 default:
3444 ceph_abort();
3445 }
3446 } else {
11fdf7f2 3447 MDRequestRef mdr = request_get(metareq);
7c673cae 3448 mdr->aborted = true;
f67539c2
TL
3449 if (mdr->peer_request) {
3450 if (mdr->peer_did_prepare()) // journaling peer prepare ?
11fdf7f2 3451 add_rollback(metareq, from);
7c673cae
FG
3452 } else {
3453 request_finish(mdr);
3454 }
3455 }
3456 }
3457
f67539c2 3458 if (!ambiguous_peer_updates.count(from)) {
7c673cae 3459 resolve_ack_gather.erase(from);
f67539c2 3460 maybe_finish_peer_resolve();
7c673cae 3461 }
7c673cae
FG
3462}
3463
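// Summary of the peer-update bookkeeping implemented below (illustrative; the
// functions themselves are authoritative): add_uncommitted_peer() records an
// uncommitted peer-side update and bumps two CInode-keyed refcounts,
//   uncommitted_peer_rename_olddir[dir]  - peer updates that renamed out of dir
//   uncommitted_peer_unlink[inode]       - peer updates that unlinked inode
// finish_uncommitted_peer() drops them again; when a count reaches zero the
// orphaned non-auth subtree can be trimmed or the unlinked inode removed.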
f67539c2 3464void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su)
7c673cae 3465{
f67539c2 3466 auto const &ret = uncommitted_peers.emplace(std::piecewise_construct,
e306af50
TL
3467 std::forward_as_tuple(reqid),
3468 std::forward_as_tuple());
3469 ceph_assert(ret.second);
f67539c2
TL
3470 ls->uncommitted_peers.insert(reqid);
3471 upeer &u = ret.first->second;
3472 u.leader = leader;
e306af50
TL
3473 u.ls = ls;
3474 u.su = su;
3475 if (su == nullptr) {
3476 return;
3477 }
7c673cae 3478 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
f67539c2 3479 uncommitted_peer_rename_olddir[*p]++;
7c673cae 3480 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
f67539c2 3481 uncommitted_peer_unlink[*p]++;
7c673cae
FG
3482}
3483
f67539c2 3484void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
7c673cae 3485{
f67539c2
TL
3486 auto it = uncommitted_peers.find(reqid);
3487 if (it == uncommitted_peers.end()) {
e306af50
TL
3488 ceph_assert(!assert_exist);
3489 return;
3490 }
f67539c2
TL
3491 upeer &u = it->second;
3492 MDPeerUpdate* su = u.su;
e306af50
TL
3493
3494 if (!u.waiters.empty()) {
3495 mds->queue_waiters(u.waiters);
3496 }
f67539c2
TL
3497 u.ls->uncommitted_peers.erase(reqid);
3498 uncommitted_peers.erase(it);
7c673cae 3499
e306af50
TL
3500 if (su == nullptr) {
3501 return;
3502 }
7c673cae
FG
3503 // discard the non-auth subtree we renamed out of
3504 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3505 CInode *diri = *p;
f67539c2
TL
3506 map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);
3507 ceph_assert(it != uncommitted_peer_rename_olddir.end());
7c673cae
FG
3508 it->second--;
3509 if (it->second == 0) {
f67539c2 3510 uncommitted_peer_rename_olddir.erase(it);
9f95a23c
TL
3511 auto&& ls = diri->get_dirfrags();
3512 for (const auto& dir : ls) {
3513 CDir *root = get_subtree_root(dir);
7c673cae
FG
3514 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3515 try_trim_non_auth_subtree(root);
9f95a23c 3516 if (dir != root)
7c673cae
FG
3517 break;
3518 }
3519 }
3520 } else
11fdf7f2 3521 ceph_assert(it->second > 0);
7c673cae 3522 }
f67539c2 3523 // remove the inodes that were unlinked by peer updates
7c673cae
FG
3524 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3525 CInode *in = *p;
f67539c2
TL
3526 map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);
3527 ceph_assert(it != uncommitted_peer_unlink.end());
7c673cae
FG
3528 it->second--;
3529 if (it->second == 0) {
f67539c2 3530 uncommitted_peer_unlink.erase(it);
7c673cae
FG
3531 if (!in->get_projected_parent_dn())
3532 mds->mdcache->remove_inode_recursive(in);
3533 } else
11fdf7f2 3534 ceph_assert(it->second > 0);
7c673cae
FG
3535 }
3536 delete su;
3537}
3538
f67539c2 3539MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader)
7c673cae
FG
3540{
3541
f67539c2
TL
3542 MDPeerUpdate* su = nullptr;
3543 auto it = uncommitted_peers.find(reqid);
3544 if (it != uncommitted_peers.end() &&
3545 it->second.leader == leader) {
e306af50 3546 su = it->second.su;
7c673cae
FG
3547 }
3548 return su;
3549}
3550
e306af50 3551void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
f91f0fd5 3552 auto p = resolve_need_rollback.find(reqid);
11fdf7f2 3553 ceph_assert(p != resolve_need_rollback.end());
e306af50 3554 if (mds->is_resolve()) {
f67539c2 3555 finish_uncommitted_peer(reqid, false);
e306af50 3556 } else if (mdr) {
f67539c2 3557 finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled);
e306af50 3558 }
11fdf7f2 3559 resolve_need_rollback.erase(p);
f67539c2 3560 maybe_finish_peer_resolve();
7c673cae
FG
3561}
3562
3563void MDCache::disambiguate_other_imports()
3564{
3565 dout(10) << "disambiguate_other_imports" << dendl;
3566
3567 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3568 // other nodes' ambiguous imports
3569 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3570 p != other_ambiguous_imports.end();
3571 ++p) {
3572 mds_rank_t who = p->first;
3573 dout(10) << "ambiguous imports for mds." << who << dendl;
3574
3575 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3576 q != p->second.end();
3577 ++q) {
3578 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3579 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3580 CDir *dir = get_force_dirfrag(q->first, recovering);
3581 if (!dir) continue;
3582
3583 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3584 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3585 dout(10) << " mds." << who << " did import " << *dir << dendl;
3586 adjust_bounded_subtree_auth(dir, q->second, who);
3587 try_subtree_merge(dir);
3588 } else {
3589 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3590 }
3591 }
3592 }
3593 other_ambiguous_imports.clear();
3594}
3595
3596void MDCache::disambiguate_my_imports()
3597{
3598 dout(10) << "disambiguate_my_imports" << dendl;
3599
3600 if (!mds->is_resolve()) {
11fdf7f2 3601 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3602 return;
3603 }
3604
3605 disambiguate_other_imports();
3606
3607 // my ambiguous imports
3608 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3609 while (!my_ambiguous_imports.empty()) {
3610 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3611
3612 CDir *dir = get_dirfrag(q->first);
11fdf7f2 3613 ceph_assert(dir);
7c673cae
FG
3614
3615 if (dir->authority() != me_ambig) {
3616 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3617 cancel_ambiguous_import(dir);
3618
3619 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3620
3621 // subtree may have been swallowed by another node claiming dir
3622 // as their own.
3623 CDir *root = get_subtree_root(dir);
3624 if (root != dir)
3625 dout(10) << " subtree root is " << *root << dendl;
11fdf7f2 3626 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
7c673cae
FG
3627 try_trim_non_auth_subtree(root);
3628 } else {
3629 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3630 finish_ambiguous_import(q->first);
3631 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3632 }
3633 }
11fdf7f2 3634 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3635 mds->mdlog->flush();
3636
3637 // verify all my subtrees are unambiguous!
3638 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3639 p != subtrees.end();
3640 ++p) {
3641 CDir *dir = p->first;
3642 if (dir->is_ambiguous_dir_auth()) {
3643 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3644 }
11fdf7f2 3645 ceph_assert(!dir->is_ambiguous_dir_auth());
7c673cae
FG
3646 }
3647
3648 show_subtrees();
3649}
3650
3651
3652void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3653{
11fdf7f2 3654 ceph_assert(my_ambiguous_imports.count(base) == 0);
7c673cae
FG
3655 my_ambiguous_imports[base] = bounds;
3656}
3657
3658
3659void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3660{
3661 // make a list
3662 vector<dirfrag_t> binos;
3663 for (set<CDir*>::iterator p = bounds.begin();
3664 p != bounds.end();
3665 ++p)
3666 binos.push_back((*p)->dirfrag());
3667
3668 // note: this can get called twice if the exporter fails during recovery
3669 if (my_ambiguous_imports.count(base->dirfrag()))
3670 my_ambiguous_imports.erase(base->dirfrag());
3671
3672 add_ambiguous_import(base->dirfrag(), binos);
3673}
3674
3675void MDCache::cancel_ambiguous_import(CDir *dir)
3676{
3677 dirfrag_t df = dir->dirfrag();
11fdf7f2 3678 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3679 dout(10) << "cancel_ambiguous_import " << df
3680 << " bounds " << my_ambiguous_imports[df]
3681 << " " << *dir
3682 << dendl;
3683 my_ambiguous_imports.erase(df);
3684}
3685
3686void MDCache::finish_ambiguous_import(dirfrag_t df)
3687{
11fdf7f2 3688 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3689 vector<dirfrag_t> bounds;
3690 bounds.swap(my_ambiguous_imports[df]);
3691 my_ambiguous_imports.erase(df);
3692
3693 dout(10) << "finish_ambiguous_import " << df
3694 << " bounds " << bounds
3695 << dendl;
3696 CDir *dir = get_dirfrag(df);
11fdf7f2 3697 ceph_assert(dir);
7c673cae
FG
3698
3699 // adjust dir_auth, import maps
3700 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3701 try_subtree_merge(dir);
3702}
3703
3704void MDCache::remove_inode_recursive(CInode *in)
3705{
3706 dout(10) << "remove_inode_recursive " << *in << dendl;
9f95a23c
TL
3707 auto&& ls = in->get_dirfrags();
3708 for (const auto& subdir : ls) {
3709 dout(10) << " removing dirfrag " << *subdir << dendl;
94b18763
FG
3710 auto it = subdir->items.begin();
3711 while (it != subdir->items.end()) {
3712 CDentry *dn = it->second;
3713 ++it;
7c673cae
FG
3714 CDentry::linkage_t *dnl = dn->get_linkage();
3715 if (dnl->is_primary()) {
3716 CInode *tin = dnl->get_inode();
31f18b77 3717 subdir->unlink_inode(dn, false);
7c673cae
FG
3718 remove_inode_recursive(tin);
3719 }
3720 subdir->remove_dentry(dn);
3721 }
3722
3723 if (subdir->is_subtree_root())
3724 remove_subtree(subdir);
3725 in->close_dirfrag(subdir->dirfrag().frag);
3726 }
3727 remove_inode(in);
3728}
3729
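// Descriptive note for expire_recursive() below (illustrative summary; the code
// is authoritative): it walks a non-auth stray subtree and returns true if
// anything under it blocks expiry -- a nested subtree root, a primary child
// inode that may still be a hardlink target, or a dentry that is not
// LRU-expireable -- otherwise it trims each dentry it visits and returns false.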
11fdf7f2 3730bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
7c673cae 3731{
11fdf7f2 3732 ceph_assert(!in->is_auth());
7c673cae
FG
3733
3734 dout(10) << __func__ << ":" << *in << dendl;
3735
3736 // Recurse into any dirfrags beneath this inode
9f95a23c
TL
3737 auto&& ls = in->get_dirfrags();
3738 for (const auto& subdir : ls) {
7c673cae
FG
3739 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3740 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3741 return true;
3742 }
3743
1d09f67e
TL
3744 for (auto it = subdir->items.begin(); it != subdir->items.end();) {
3745 CDentry *dn = it->second;
3746 it++;
7c673cae
FG
3747 CDentry::linkage_t *dnl = dn->get_linkage();
3748 if (dnl->is_primary()) {
3749 CInode *tin = dnl->get_inode();
3750
3751 /* Remote strays with linkage (i.e. hardlinks) should not be
3752 * expired, because they may be the target of
3753 * a rename() as the owning MDS shuts down */
f67539c2 3754 if (!tin->is_stray() && tin->get_inode()->nlink) {
7c673cae
FG
3755 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3756 return true;
3757 }
3758
3759 const bool abort = expire_recursive(tin, expiremap);
3760 if (abort) {
3761 return true;
3762 }
3763 }
3764 if (dn->lru_is_expireable()) {
3765 trim_dentry(dn, expiremap);
3766 } else {
3767 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3768 return true;
3769 }
3770 }
3771 }
3772
3773 return false;
3774}
3775
3776void MDCache::trim_unlinked_inodes()
3777{
3778 dout(7) << "trim_unlinked_inodes" << dendl;
81eedcae
TL
3779 int count = 0;
3780 vector<CInode*> q;
94b18763 3781 for (auto &p : inode_map) {
b32b8144 3782 CInode *in = p.second;
7c673cae
FG
3783 if (in->get_parent_dn() == NULL && !in->is_base()) {
3784 dout(7) << " will trim from " << *in << dendl;
3785 q.push_back(in);
3786 }
81eedcae 3787
33c7a0ef 3788 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae
TL
3789 mds->heartbeat_reset();
3790 }
81eedcae
TL
3791 for (auto& in : q) {
3792 remove_inode_recursive(in);
3793
33c7a0ef 3794 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae 3795 mds->heartbeat_reset();
7c673cae 3796 }
7c673cae
FG
3797}
3798
3799/** recalc_auth_bits()
3800 * once subtree auth is disambiguated, we need to adjust all the
3801 * auth and dirty bits in our cache before moving on.
3802 */
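// Rough shape of the walk below (illustrative only):
//
//   for (auto& [root, bounds] : subtrees) {
//     bool auth = (root->authority().first == mds->get_nodeid());
//     // breadth-first over the subtree's dirfrags: set or clear the AUTH
//     // state bit on every dirfrag, dentry and primary inode; non-auth items
//     // are additionally marked REJOINING and scrubbed of dirty state,
//     // unless we are replaying.
//   }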
3803void MDCache::recalc_auth_bits(bool replay)
3804{
3805 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3806
3807 if (root) {
3808 root->inode_auth.first = mds->mdsmap->get_root();
3809 bool auth = mds->get_nodeid() == root->inode_auth.first;
3810 if (auth) {
3811 root->state_set(CInode::STATE_AUTH);
3812 } else {
3813 root->state_clear(CInode::STATE_AUTH);
3814 if (!replay)
3815 root->state_set(CInode::STATE_REJOINING);
3816 }
3817 }
3818
3819 set<CInode*> subtree_inodes;
3820 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3821 p != subtrees.end();
3822 ++p) {
3823 if (p->first->dir_auth.first == mds->get_nodeid())
3824 subtree_inodes.insert(p->first->inode);
3825 }
3826
3827 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3828 p != subtrees.end();
3829 ++p) {
3830 if (p->first->inode->is_mdsdir()) {
3831 CInode *in = p->first->inode;
3832 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3833 if (auth) {
3834 in->state_set(CInode::STATE_AUTH);
3835 } else {
3836 in->state_clear(CInode::STATE_AUTH);
3837 if (!replay)
3838 in->state_set(CInode::STATE_REJOINING);
3839 }
3840 }
3841
9f95a23c
TL
3842 std::queue<CDir*> dfq; // dirfrag queue
3843 dfq.push(p->first);
7c673cae
FG
3844
3845 bool auth = p->first->authority().first == mds->get_nodeid();
3846 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3847
3848 while (!dfq.empty()) {
3849 CDir *dir = dfq.front();
9f95a23c 3850 dfq.pop();
7c673cae
FG
3851
3852 // dir
3853 if (auth) {
3854 dir->state_set(CDir::STATE_AUTH);
3855 } else {
3856 dir->state_clear(CDir::STATE_AUTH);
3857 if (!replay) {
3858 // close empty non-auth dirfrag
3859 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3860 dir->inode->close_dirfrag(dir->get_frag());
3861 continue;
3862 }
3863 dir->state_set(CDir::STATE_REJOINING);
3864 dir->state_clear(CDir::STATE_COMPLETE);
3865 if (dir->is_dirty())
3866 dir->mark_clean();
3867 }
3868 }
3869
3870 // dentries in this dir
94b18763 3871 for (auto &p : dir->items) {
7c673cae 3872 // dn
94b18763 3873 CDentry *dn = p.second;
7c673cae
FG
3874 CDentry::linkage_t *dnl = dn->get_linkage();
3875 if (auth) {
3876 dn->state_set(CDentry::STATE_AUTH);
3877 } else {
3878 dn->state_clear(CDentry::STATE_AUTH);
3879 if (!replay) {
3880 dn->state_set(CDentry::STATE_REJOINING);
3881 if (dn->is_dirty())
3882 dn->mark_clean();
3883 }
3884 }
3885
3886 if (dnl->is_primary()) {
3887 // inode
3888 CInode *in = dnl->get_inode();
3889 if (auth) {
3890 in->state_set(CInode::STATE_AUTH);
3891 } else {
3892 in->state_clear(CInode::STATE_AUTH);
3893 if (!replay) {
3894 in->state_set(CInode::STATE_REJOINING);
3895 if (in->is_dirty())
3896 in->mark_clean();
3897 if (in->is_dirty_parent())
3898 in->clear_dirty_parent();
3899 // avoid touching scatterlocks for our subtree roots!
3900 if (subtree_inodes.count(in) == 0)
3901 in->clear_scatter_dirty();
3902 }
3903 }
3904 // recurse?
9f95a23c
TL
3905 if (in->is_dir()) {
3906 auto&& dfv = in->get_nested_dirfrags();
3907 for (const auto& dir : dfv) {
3908 dfq.push(dir);
3909 }
3910 }
7c673cae
FG
3911 }
3912 }
3913 }
3914 }
3915
3916 show_subtrees();
3917 show_cache();
3918}
3919
3920
3921
3922// ===========================================================================
3923// REJOIN
3924
3925/*
3926 * notes on scatterlock recovery:
3927 *
3928 * - recovering inode replica sends scatterlock data for any subtree
3929 * roots (the only ones that are possibly dirty).
3930 *
3931 * - surviving auth incorporates any provided scatterlock data. any
3932 * pending gathers are then finished, as with the other lock types.
3933 *
3934 * that takes care of surviving auth + (recovering replica)*.
3935 *
3936 * - surviving replica sends strong_inode, which includes current
3937 * scatterlock state, AND any dirty scatterlock data. this
3938 * provides the recovering auth with everything it might need.
3939 *
3940 * - recovering auth must pick initial scatterlock state based on
3941 * (weak|strong) rejoins.
3942 * - always assimilate scatterlock data (it can't hurt)
3943 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3944 * - include base inode in ack for all inodes that saw scatterlock content
3945 *
3946 * also, for scatter gather,
3947 *
3948 * - auth increments {frag,r}stat.version on completion of any gather.
3949 *
3950 * - auth incorporates changes in a gather _only_ if the version
3951 * matches.
3952 *
3953 * - replica discards changes any time the scatterlock syncs, and
3954 * after recovery.
3955 */
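// Worked example of the version rule above (hypothetical values, comment-only):
// suppose the auth bumps the fragstat version 7 -> 8 on completing a gather.
// Dirty scatterlock data later received tagged with version 8 is incorporated;
// data tagged 7 (gathered before the bump) is discarded, so a slow or re-sent
// gather can never roll the accounting backwards.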
3956
3957void MDCache::dump_rejoin_status(Formatter *f) const
3958{
3959 f->open_object_section("rejoin_status");
3960 f->dump_stream("rejoin_gather") << rejoin_gather;
3961 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3962 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3963 f->close_section();
3964}
3965
11fdf7f2 3966void MDCache::rejoin_start(MDSContext *rejoin_done_)
7c673cae
FG
3967{
3968 dout(10) << "rejoin_start" << dendl;
11fdf7f2 3969 ceph_assert(!rejoin_done);
7c673cae
FG
3970 rejoin_done.reset(rejoin_done_);
3971
3972 rejoin_gather = recovery_set;
3973 // need to finish opening cap inodes before sending cache rejoins
3974 rejoin_gather.insert(mds->get_nodeid());
3975 process_imported_caps();
3976}
3977
3978/*
3979 * rejoin phase!
3980 *
11fdf7f2 3981 * this initiates rejoin. it should be called before we get any
7c673cae
FG
3982 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3983 *
3984 * we start out by sending rejoins to everyone in the recovery set.
3985 *
3986 * if we are rejoining, send for all regions in our cache.
11fdf7f2 3987 * if we are active|stopping, send only to nodes that are rejoining.
7c673cae
FG
3988 */
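// Sketch of the fan-out rule stated above (mirrors the loop in the function
// below; summary only):
//
//   for (mds_rank_t rank : recovery_set) {
//     if (rank == mds->get_nodeid()) continue;      // nothing to myself
//     if (rejoin_sent.count(rank)) continue;        // already sent
//     if (mds->is_rejoin())
//       rejoins[rank] = OP_WEAK;                    // I am recovering: weak to everyone
//     else if (mds->mdsmap->is_rejoin(rank))
//       rejoins[rank] = OP_STRONG;                  // survivor: strong, only to rejoiners
//   }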
3989void MDCache::rejoin_send_rejoins()
3990{
3991 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3992
3993 if (rejoin_gather.count(mds->get_nodeid())) {
3994 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3995 rejoins_pending = true;
3996 return;
3997 }
3998 if (!resolve_gather.empty()) {
3999 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
4000 << resolve_gather << ")" << dendl;
4001 rejoins_pending = true;
4002 return;
4003 }
4004
11fdf7f2
TL
4005 ceph_assert(!migrator->is_importing());
4006 ceph_assert(!migrator->is_exporting());
7c673cae
FG
4007
4008 if (!mds->is_rejoin()) {
4009 disambiguate_other_imports();
4010 }
4011
9f95a23c 4012 map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
7c673cae
FG
4013
4014
4015 // if i am rejoining, send a rejoin to everyone.
4016 // otherwise, just send to others who are rejoining.
9f95a23c
TL
4017 for (const auto& rank : recovery_set) {
4018 if (rank == mds->get_nodeid()) continue; // nothing to myself!
4019 if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node!
7c673cae 4020 if (mds->is_rejoin())
9f95a23c
TL
4021 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
4022 else if (mds->mdsmap->is_rejoin(rank))
4023 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
7c673cae
FG
4024 }
4025
4026 if (mds->is_rejoin()) {
11fdf7f2
TL
4027 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4028 for (auto& p : cap_exports) {
4029 mds_rank_t target = p.second.first;
7c673cae
FG
4030 if (rejoins.count(target) == 0)
4031 continue;
11fdf7f2
TL
4032 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4033 Session *session = nullptr;
4034 auto it = client_exports.find(q->first);
4035 if (it != client_exports.end()) {
4036 session = it->second.first;
4037 if (session)
4038 it->second.second.insert(target);
4039 } else {
4040 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4041 auto& r = client_exports[q->first];
4042 r.first = session;
4043 if (session)
4044 r.second.insert(target);
4045 }
4046 if (session) {
4047 ++q;
4048 } else {
4049 // remove reconnect with no session
4050 p.second.second.erase(q++);
4051 }
4052 }
4053 rejoins[target]->cap_exports[p.first] = p.second.second;
7c673cae 4054 }
11fdf7f2
TL
4055 for (auto& p : client_exports) {
4056 Session *session = p.second.first;
4057 for (auto& q : p.second.second) {
4058 auto rejoin = rejoins[q];
4059 rejoin->client_map[p.first] = session->info.inst;
4060 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4061 }
7c673cae
FG
4062 }
4063 }
4064
4065
4066 // check all subtrees
4067 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4068 p != subtrees.end();
4069 ++p) {
4070 CDir *dir = p->first;
11fdf7f2 4071 ceph_assert(dir->is_subtree_root());
7c673cae
FG
4072 if (dir->is_ambiguous_dir_auth()) {
4073 // exporter is recovering, importer is survivor.
11fdf7f2
TL
4074 ceph_assert(rejoins.count(dir->authority().first));
4075 ceph_assert(!rejoins.count(dir->authority().second));
7c673cae
FG
4076 continue;
4077 }
4078
4079 // my subtree?
4080 if (dir->is_auth())
4081 continue; // skip my own regions!
4082
4083 mds_rank_t auth = dir->get_dir_auth().first;
11fdf7f2 4084 ceph_assert(auth >= 0);
7c673cae
FG
4085 if (rejoins.count(auth) == 0)
4086 continue; // don't care about this node's subtrees
4087
4088 rejoin_walk(dir, rejoins[auth]);
4089 }
4090
4091 // rejoin root inodes, too
11fdf7f2 4092 for (auto &p : rejoins) {
7c673cae
FG
4093 if (mds->is_rejoin()) {
4094 // weak
11fdf7f2
TL
4095 if (p.first == 0 && root) {
4096 p.second->add_weak_inode(root->vino());
7c673cae
FG
4097 if (root->is_dirty_scattered()) {
4098 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4099 p.second->add_scatterlock_state(root);
7c673cae
FG
4100 }
4101 }
11fdf7f2 4102 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
7c673cae 4103 if (in)
11fdf7f2 4104 p.second->add_weak_inode(in->vino());
7c673cae
FG
4105 }
4106 } else {
4107 // strong
11fdf7f2
TL
4108 if (p.first == 0 && root) {
4109 p.second->add_strong_inode(root->vino(),
7c673cae
FG
4110 root->get_replica_nonce(),
4111 root->get_caps_wanted(),
4112 root->filelock.get_state(),
4113 root->nestlock.get_state(),
4114 root->dirfragtreelock.get_state());
4115 root->state_set(CInode::STATE_REJOINING);
4116 if (root->is_dirty_scattered()) {
4117 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4118 p.second->add_scatterlock_state(root);
7c673cae
FG
4119 }
4120 }
4121
11fdf7f2
TL
4122 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4123 p.second->add_strong_inode(in->vino(),
7c673cae
FG
4124 in->get_replica_nonce(),
4125 in->get_caps_wanted(),
4126 in->filelock.get_state(),
4127 in->nestlock.get_state(),
4128 in->dirfragtreelock.get_state());
4129 in->state_set(CInode::STATE_REJOINING);
4130 }
4131 }
4132 }
4133
4134 if (!mds->is_rejoin()) {
4135 // i am survivor. send strong rejoin.
4136 // note request remote_auth_pins, xlocks
4137 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4138 p != active_requests.end();
4139 ++p) {
4140 MDRequestRef& mdr = p->second;
f67539c2 4141 if (mdr->is_peer())
7c673cae
FG
4142 continue;
4143 // auth pins
9f95a23c
TL
4144 for (const auto& q : mdr->object_states) {
4145 if (q.second.remote_auth_pinned == MDS_RANK_NONE)
4146 continue;
11fdf7f2 4147 if (!q.first->is_auth()) {
9f95a23c
TL
4148 mds_rank_t target = q.second.remote_auth_pinned;
4149 ceph_assert(target == q.first->authority().first);
4150 if (rejoins.count(target) == 0) continue;
4151 const auto& rejoin = rejoins[target];
7c673cae 4152
11fdf7f2 4153 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
7c673cae 4154 MDSCacheObjectInfo i;
11fdf7f2 4155 q.first->set_object_info(i);
7c673cae
FG
4156 if (i.ino)
4157 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4158 else
4159 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4160
4161 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
11fdf7f2 4162 mdr->more()->rename_inode == q.first)
7c673cae
FG
4163 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4164 mdr->reqid, mdr->attempt);
4165 }
4166 }
4167 // xlocks
11fdf7f2
TL
4168 for (const auto& q : mdr->locks) {
4169 auto lock = q.lock;
4170 auto obj = lock->get_parent();
4171 if (q.is_xlock() && !obj->is_auth()) {
4172 mds_rank_t who = obj->authority().first;
7c673cae 4173 if (rejoins.count(who) == 0) continue;
9f95a23c 4174 const auto& rejoin = rejoins[who];
7c673cae 4175
11fdf7f2 4176 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
7c673cae 4177 MDSCacheObjectInfo i;
11fdf7f2 4178 obj->set_object_info(i);
7c673cae 4179 if (i.ino)
11fdf7f2 4180 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
7c673cae
FG
4181 mdr->reqid, mdr->attempt);
4182 else
4183 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4184 mdr->reqid, mdr->attempt);
11fdf7f2
TL
4185 } else if (q.is_remote_wrlock()) {
4186 mds_rank_t who = q.wrlock_target;
4187 if (rejoins.count(who) == 0) continue;
9f95a23c 4188 const auto& rejoin = rejoins[who];
7c673cae 4189
11fdf7f2
TL
4190 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4191 MDSCacheObjectInfo i;
4192 obj->set_object_info(i);
4193 ceph_assert(i.ino);
4194 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4195 mdr->reqid, mdr->attempt);
4196 }
7c673cae
FG
4197 }
4198 }
4199 }
4200
4201 // send the messages
11fdf7f2
TL
4202 for (auto &p : rejoins) {
4203 ceph_assert(rejoin_sent.count(p.first) == 0);
4204 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4205 rejoin_sent.insert(p.first);
4206 rejoin_ack_gather.insert(p.first);
4207 mds->send_message_mds(p.second, p.first);
7c673cae
FG
4208 }
4209 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4210 rejoins_pending = false;
4211
4212 // nothing?
28e407b8 4213 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4214 dout(10) << "nothing to rejoin" << dendl;
4215 rejoin_gather_finish();
4216 }
4217}
4218
4219
4220/**
4221 * rejoin_walk - build rejoin declarations for a subtree
4222 *
4223 * @param dir subtree root
4224 * @param rejoin rejoin message
4225 *
4226 * from a rejoining node:
4227 * weak dirfrag
4228 * weak dentries (w/ connectivity)
4229 *
4230 * from a surviving node:
4231 * strong dirfrag
4232 * strong dentries (no connectivity!)
4233 * strong inodes
4234 */
9f95a23c 4235void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
7c673cae
FG
4236{
4237 dout(10) << "rejoin_walk " << *dir << dendl;
4238
9f95a23c 4239 std::vector<CDir*> nested; // finish this dir, then do nested items
7c673cae
FG
4240
4241 if (mds->is_rejoin()) {
4242 // WEAK
4243 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4244 for (auto &p : dir->items) {
4245 CDentry *dn = p.second;
11fdf7f2 4246 ceph_assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4247 CDentry::linkage_t *dnl = dn->get_linkage();
4248 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
11fdf7f2 4249 ceph_assert(dnl->is_primary());
7c673cae 4250 CInode *in = dnl->get_inode();
11fdf7f2 4251 ceph_assert(dnl->get_inode()->is_dir());
94b18763 4252 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
9f95a23c
TL
4253 {
4254 auto&& dirs = in->get_nested_dirfrags();
4255 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4256 }
7c673cae
FG
4257 if (in->is_dirty_scattered()) {
4258 dout(10) << " sending scatterlock state on " << *in << dendl;
4259 rejoin->add_scatterlock_state(in);
4260 }
4261 }
4262 } else {
4263 // STRONG
4264 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4265 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4266 dir->state_set(CDir::STATE_REJOINING);
4267
11fdf7f2 4268 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
94b18763 4269 CDentry *dn = it->second;
11fdf7f2
TL
4270 ++it;
4271 dn->state_set(CDentry::STATE_REJOINING);
7c673cae 4272 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2
TL
4273 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4274
4275 // trim snap dentries, because they may have been pruned by
4276 // their auth mds (snap deleted)
4277 if (dn->last != CEPH_NOSNAP) {
4278 if (in && !in->remote_parents.empty()) {
4279 // unlink any stale remote snap dentry.
4280 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4281 CDentry *remote_dn = *it2;
4282 ++it2;
4283 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4284 remote_dn->unlink_remote(remote_dn->get_linkage());
4285 }
4286 }
4287 if (dn->lru_is_expireable()) {
4288 if (!dnl->is_null())
4289 dir->unlink_inode(dn, false);
4290 if (in)
4291 remove_inode(in);
4292 dir->remove_dentry(dn);
4293 continue;
4294 } else {
4295 // Inventing a null/remote dentry shouldn't cause problems
4296 ceph_assert(!dnl->is_primary());
4297 }
4298 }
4299
7c673cae 4300 dout(15) << " add_strong_dentry " << *dn << dendl;
f67539c2
TL
4301 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
4302 dn->first, dn->last,
7c673cae
FG
4303 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4304 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4305 dnl->is_remote() ? dnl->get_remote_d_type():0,
4306 dn->get_replica_nonce(),
4307 dn->lock.get_state());
4308 dn->state_set(CDentry::STATE_REJOINING);
4309 if (dnl->is_primary()) {
4310 CInode *in = dnl->get_inode();
4311 dout(15) << " add_strong_inode " << *in << dendl;
4312 rejoin->add_strong_inode(in->vino(),
4313 in->get_replica_nonce(),
4314 in->get_caps_wanted(),
4315 in->filelock.get_state(),
4316 in->nestlock.get_state(),
4317 in->dirfragtreelock.get_state());
4318 in->state_set(CInode::STATE_REJOINING);
9f95a23c
TL
4319 {
4320 auto&& dirs = in->get_nested_dirfrags();
4321 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4322 }
7c673cae
FG
4323 if (in->is_dirty_scattered()) {
4324 dout(10) << " sending scatterlock state on " << *in << dendl;
4325 rejoin->add_scatterlock_state(in);
4326 }
4327 }
4328 }
4329 }
4330
4331 // recurse into nested dirs
9f95a23c
TL
4332 for (const auto& dir : nested) {
4333 rejoin_walk(dir, rejoin);
4334 }
7c673cae
FG
4335}
4336
4337
4338/*
4339 * i got a rejoin.
4340 * - reply with the lockstate
4341 *
4342 * if i am active|stopping,
4343 * - remove source from replica list for everything not referenced here.
7c673cae 4344 */
9f95a23c 4345void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
7c673cae
FG
4346{
4347 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4348 << " (" << m->get_payload().length() << " bytes)"
4349 << dendl;
4350
4351 switch (m->op) {
4352 case MMDSCacheRejoin::OP_WEAK:
4353 handle_cache_rejoin_weak(m);
4354 break;
4355 case MMDSCacheRejoin::OP_STRONG:
4356 handle_cache_rejoin_strong(m);
4357 break;
4358 case MMDSCacheRejoin::OP_ACK:
4359 handle_cache_rejoin_ack(m);
4360 break;
4361
4362 default:
4363 ceph_abort();
4364 }
7c673cae
FG
4365}
4366
4367
4368/*
4369 * handle_cache_rejoin_weak
4370 *
4371 * the sender
4372 * - is recovering from their journal.
4373 * - may have incorrect (out of date) inode contents
4374 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4375 *
4376 * if the sender didn't trim_non_auth(), they
4377 * - may have incorrect (out of date) dentry/inode linkage
4378 * - may have deleted/purged inodes
4379 * and i may have to go to disk to get accurate inode contents. yuck.
7c673cae 4380 */
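// The two paths below in brief (illustrative; the code is authoritative):
//   survivor (clientreplay/active/stopping): build an OP_ACK right away,
//     claim the cap imports, and send back strong dirfrags/dentries plus
//     inode lock state so the recovering sender can correct itself.
//   recovering (rejoin): only record cap_imports and replica nonces; the ack
//     is deferred until rejoin_gather_finish().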
9f95a23c 4381void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
7c673cae
FG
4382{
4383 mds_rank_t from = mds_rank_t(weak->get_source().num());
4384
4385 // possible response(s)
9f95a23c 4386 ref_t<MMDSCacheRejoin> ack; // if survivor
7c673cae
FG
4387 set<vinodeno_t> acked_inodes; // if survivor
4388 set<SimpleLock *> gather_locks; // if survivor
4389 bool survivor = false; // am i a survivor?
4390
4391 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4392 survivor = true;
4393 dout(10) << "i am a survivor, and will ack immediately" << dendl;
9f95a23c 4394 ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
7c673cae
FG
4395
4396 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4397
4398 // check cap exports
4399 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4400 CInode *in = get_inode(p->first);
11fdf7f2 4401 ceph_assert(!in || in->is_auth());
7c673cae
FG
4402 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4403 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4404 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4405 Capability::Import& im = imported_caps[p->first][q->first];
4406 if (cap) {
4407 im.cap_id = cap->get_cap_id();
4408 im.issue_seq = cap->get_last_seq();
4409 im.mseq = cap->get_mseq();
4410 } else {
4411 // all are zero
4412 }
4413 }
4414 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4415 }
4416
11fdf7f2 4417 encode(imported_caps, ack->imported_caps);
7c673cae 4418 } else {
11fdf7f2 4419 ceph_assert(mds->is_rejoin());
7c673cae
FG
4420
4421 // we may have already received a strong rejoin from the sender.
4422 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
11fdf7f2 4423 ceph_assert(gather_locks.empty());
7c673cae
FG
4424
4425 // check cap exports.
4426 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
11fdf7f2
TL
4427 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4428 weak->client_metadata_map.end());
7c673cae
FG
4429
4430 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4431 CInode *in = get_inode(p->first);
11fdf7f2 4432 ceph_assert(!in || in->is_auth());
7c673cae
FG
4433 // note
4434 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4435 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4436 cap_imports[p->first][q->first][from] = q->second;
4437 }
4438 }
4439 }
4440
4441 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4442 for (const auto &p : weak->inode_scatterlocks) {
4443 CInode *in = get_inode(p.first);
4444 ceph_assert(in);
4445 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4446 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4447 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4448 if (!survivor)
4449 rejoin_potential_updated_scatterlocks.insert(in);
4450 }
4451
4452 // recovering peer may send incorrect dirfrags here. we need to
4453 // infer which dirfrag they meant. the ack will include a
4454 // strong_dirfrag that will set them straight on the fragmentation.
4455
4456 // walk weak map
4457 set<CDir*> dirs_to_share;
11fdf7f2
TL
4458 for (const auto &p : weak->weak_dirfrags) {
4459 CInode *diri = get_inode(p.ino);
7c673cae 4460 if (!diri)
11fdf7f2
TL
4461 dout(0) << " missing dir ino " << p.ino << dendl;
4462 ceph_assert(diri);
7c673cae 4463
11fdf7f2
TL
4464 frag_vec_t leaves;
4465 if (diri->dirfragtree.is_leaf(p.frag)) {
4466 leaves.push_back(p.frag);
7c673cae 4467 } else {
11fdf7f2
TL
4468 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4469 if (leaves.empty())
4470 leaves.push_back(diri->dirfragtree[p.frag.value()]);
7c673cae 4471 }
11fdf7f2
TL
4472 for (const auto& leaf : leaves) {
4473 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4474 if (!dir) {
11fdf7f2 4475 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
7c673cae
FG
4476 continue;
4477 }
11fdf7f2 4478 ceph_assert(dir);
7c673cae 4479 if (dirs_to_share.count(dir)) {
11fdf7f2 4480 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4481 } else {
4482 dirs_to_share.insert(dir);
4483 unsigned nonce = dir->add_replica(from);
11fdf7f2 4484 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4485 if (ack) {
4486 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4487 ack->add_dirfrag_base(dir);
4488 }
4489 }
4490 }
4491 }
4492
11fdf7f2
TL
4493 for (const auto &p : weak->weak) {
4494 CInode *diri = get_inode(p.first);
7c673cae 4495 if (!diri)
11fdf7f2
TL
4496 dout(0) << " missing dir ino " << p.first << dendl;
4497 ceph_assert(diri);
7c673cae
FG
4498
4499 // weak dentries
4500 CDir *dir = 0;
11fdf7f2 4501 for (const auto &q : p.second) {
7c673cae
FG
4502 // locate proper dirfrag.
4503 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
11fdf7f2 4504 frag_t fg = diri->pick_dirfrag(q.first.name);
7c673cae
FG
4505 if (!dir || dir->get_frag() != fg) {
4506 dir = diri->get_dirfrag(fg);
4507 if (!dir)
4508 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
11fdf7f2
TL
4509 ceph_assert(dir);
4510 ceph_assert(dirs_to_share.count(dir));
7c673cae
FG
4511 }
4512
4513 // and dentry
11fdf7f2
TL
4514 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4515 ceph_assert(dn);
7c673cae 4516 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 4517 ceph_assert(dnl->is_primary());
7c673cae
FG
4518
4519 if (survivor && dn->is_replica(from))
4520 dentry_remove_replica(dn, from, gather_locks);
4521 unsigned dnonce = dn->add_replica(from);
4522 dout(10) << " have " << *dn << dendl;
4523 if (ack)
f67539c2
TL
4524 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
4525 dn->first, dn->last,
7c673cae
FG
4526 dnl->get_inode()->ino(), inodeno_t(0), 0,
4527 dnonce, dn->lock.get_replica_state());
4528
4529 // inode
4530 CInode *in = dnl->get_inode();
11fdf7f2 4531 ceph_assert(in);
7c673cae
FG
4532
4533 if (survivor && in->is_replica(from))
4534 inode_remove_replica(in, from, true, gather_locks);
4535 unsigned inonce = in->add_replica(from);
4536 dout(10) << " have " << *in << dendl;
4537
4538 // scatter the dirlock, just in case?
4539 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4540 in->filelock.set_state(LOCK_MIX);
4541
4542 if (ack) {
4543 acked_inodes.insert(in->vino());
4544 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4545 bufferlist bl;
4546 in->_encode_locks_state_for_rejoin(bl, from);
4547 ack->add_inode_locks(in, inonce, bl);
4548 }
4549 }
4550 }
4551
4552 // weak base inodes? (root, stray, etc.)
4553 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4554 p != weak->weak_inodes.end();
4555 ++p) {
4556 CInode *in = get_inode(*p);
11fdf7f2 4557 ceph_assert(in); // hmm fixme wrt stray?
7c673cae
FG
4558 if (survivor && in->is_replica(from))
4559 inode_remove_replica(in, from, true, gather_locks);
4560 unsigned inonce = in->add_replica(from);
4561 dout(10) << " have base " << *in << dendl;
4562
4563 if (ack) {
4564 acked_inodes.insert(in->vino());
4565 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4566 bufferlist bl;
4567 in->_encode_locks_state_for_rejoin(bl, from);
4568 ack->add_inode_locks(in, inonce, bl);
4569 }
4570 }
4571
11fdf7f2 4572 ceph_assert(rejoin_gather.count(from));
7c673cae
FG
4573 rejoin_gather.erase(from);
4574 if (survivor) {
4575 // survivor. do everything now.
11fdf7f2
TL
4576 for (const auto &p : weak->inode_scatterlocks) {
4577 CInode *in = get_inode(p.first);
4578 ceph_assert(in);
7c673cae
FG
4579 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4580 acked_inodes.insert(in->vino());
4581 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4582 }
4583
4584 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4585 mds->send_message(ack, weak->get_connection());
4586
4587 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4588 if (!(*p)->is_stable())
4589 mds->locker->eval_gather(*p);
4590 }
4591 } else {
4592 // done?
28e407b8 4593 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4594 rejoin_gather_finish();
4595 } else {
4596 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4597 }
4598 }
4599}
4600
7c673cae
FG
4601/*
4602 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4603 *
4604 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4605 * ack, the replica dne, and we can remove it from our replica maps.
4606 */
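// Scour rule in one line (illustrative sketch, inode case only): for every
// auth object currently replicated to 'from' but absent from the ack being
// built, drop 'from' from its replica map, e.g.
//
//   if (in->is_auth() && in->is_replica(from) && !acked_inodes.count(in->vino()))
//     inode_remove_replica(in, from, false, gather_locks);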
9f95a23c 4607void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
7c673cae
FG
4608 set<vinodeno_t>& acked_inodes,
4609 set<SimpleLock *>& gather_locks)
4610{
4611 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4612
b32b8144 4613 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4614 // inode?
4615 if (in->is_auth() &&
4616 in->is_replica(from) &&
b32b8144 4617 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4618 inode_remove_replica(in, from, false, gather_locks);
4619 dout(10) << " rem " << *in << dendl;
4620 }
4621
b32b8144
FG
4622 if (!in->is_dir())
4623 return;
7c673cae 4624
9f95a23c
TL
4625 const auto&& dfs = in->get_dirfrags();
4626 for (const auto& dir : dfs) {
181888fb
FG
4627 if (!dir->is_auth())
4628 continue;
7c673cae 4629
181888fb 4630 if (dir->is_replica(from) &&
7c673cae
FG
4631 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4632 dir->remove_replica(from);
4633 dout(10) << " rem " << *dir << dendl;
4634 }
4635
4636 // dentries
94b18763
FG
4637 for (auto &p : dir->items) {
4638 CDentry *dn = p.second;
7c673cae 4639
11fdf7f2
TL
4640 if (dn->is_replica(from)) {
4641 if (ack) {
4642 const auto it = ack->strong_dentries.find(dir->dirfrag());
4643 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4644 continue;
4645 }
4646 }
7c673cae
FG
4647 dentry_remove_replica(dn, from, gather_locks);
4648 dout(10) << " rem " << *dn << dendl;
4649 }
4650 }
4651 }
b32b8144
FG
4652 };
4653
94b18763 4654 for (auto &p : inode_map)
b32b8144 4655 scour_func(p.second);
94b18763 4656 for (auto &p : snap_inode_map)
b32b8144 4657 scour_func(p.second);
7c673cae
FG
4658}
4659
4660
4661CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4662{
f67539c2
TL
4663 CInode *in = new CInode(this, true, 2, last);
4664 in->_get_inode()->ino = ino;
7c673cae
FG
4665 in->state_set(CInode::STATE_REJOINUNDEF);
4666 add_inode(in);
4667 rejoin_undef_inodes.insert(in);
4668 dout(10) << " invented " << *in << dendl;
4669 return in;
4670}
4671
4672CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4673{
4674 CInode *in = get_inode(df.ino);
4675 if (!in)
4676 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4677 if (!in->is_dir()) {
11fdf7f2 4678 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
f67539c2
TL
4679 in->_get_inode()->mode = S_IFDIR;
4680 in->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
4681 }
4682 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4683 dir->state_set(CDir::STATE_REJOINUNDEF);
4684 rejoin_undef_dirfrags.insert(dir);
4685 dout(10) << " invented " << *dir << dendl;
4686 return dir;
4687}
4688
9f95a23c 4689void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
7c673cae
FG
4690{
4691 mds_rank_t from = mds_rank_t(strong->get_source().num());
4692
4693 // only a recovering node will get a strong rejoin.
a8e16298
TL
4694 if (!mds->is_rejoin()) {
4695 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4696 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4697 return;
4698 }
11fdf7f2 4699 ceph_abort_msg("got unexpected rejoin message during recovery");
a8e16298 4700 }
7c673cae
FG
4701
4702 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4703 for (const auto &p : strong->inode_scatterlocks) {
4704 CInode *in = get_inode(p.first);
4705 ceph_assert(in);
4706 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4707 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4708 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4709 rejoin_potential_updated_scatterlocks.insert(in);
4710 }
4711
4712 rejoin_unlinked_inodes[from].clear();
4713
4714 // surviving peer may send incorrect dirfrag here (maybe they didn't
4715 // get the fragment notify, or maybe we rolled back?). we need to
4716 // infer the right frag and get them with the program. somehow.
4717 // we don't normally send ACK.. so we'll need to bundle this with
4718 // MISSING or something.
4719
4720 // strong dirfrags/dentries.
4721 // also process auth_pins, xlocks.
11fdf7f2
TL
4722 for (const auto &p : strong->strong_dirfrags) {
4723 auto& dirfrag = p.first;
4724 CInode *diri = get_inode(dirfrag.ino);
7c673cae 4725 if (!diri)
11fdf7f2
TL
4726 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4727 CDir *dir = diri->get_dirfrag(dirfrag.frag);
7c673cae
FG
4728 bool refragged = false;
4729 if (dir) {
4730 dout(10) << " have " << *dir << dendl;
4731 } else {
4732 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4733 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
11fdf7f2
TL
4734 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4735 dir = rejoin_invent_dirfrag(dirfrag);
7c673cae
FG
4736 }
4737 if (dir) {
11fdf7f2
TL
4738 dir->add_replica(from, p.second.nonce);
4739 dir->dir_rep = p.second.dir_rep;
7c673cae 4740 } else {
11fdf7f2
TL
4741 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4742 frag_vec_t leaves;
4743 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4744 if (leaves.empty())
4745 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4746 dout(10) << " maps to frag(s) " << leaves << dendl;
4747 for (const auto& leaf : leaves) {
4748 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4749 if (!dir)
11fdf7f2 4750 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
7c673cae
FG
4751 else
4752 dout(10) << " have(approx) " << *dir << dendl;
11fdf7f2
TL
4753 dir->add_replica(from, p.second.nonce);
4754 dir->dir_rep = p.second.dir_rep;
7c673cae
FG
4755 }
4756 refragged = true;
4757 }
4758
11fdf7f2
TL
4759 const auto it = strong->strong_dentries.find(dirfrag);
4760 if (it != strong->strong_dentries.end()) {
9f95a23c 4761 const auto& dmap = it->second;
11fdf7f2
TL
4762 for (const auto &q : dmap) {
4763 const string_snap_t& ss = q.first;
4764 const MMDSCacheRejoin::dn_strong& d = q.second;
4765 CDentry *dn;
4766 if (!refragged)
4767 dn = dir->lookup(ss.name, ss.snapid);
4768 else {
4769 frag_t fg = diri->pick_dirfrag(ss.name);
4770 dir = diri->get_dirfrag(fg);
4771 ceph_assert(dir);
4772 dn = dir->lookup(ss.name, ss.snapid);
4773 }
4774 if (!dn) {
4775 if (d.is_remote()) {
f67539c2 4776 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
11fdf7f2
TL
4777 } else if (d.is_null()) {
4778 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4779 } else {
4780 CInode *in = get_inode(d.ino, ss.snapid);
4781 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
f67539c2 4782 dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
11fdf7f2
TL
4783 }
4784 dout(10) << " invented " << *dn << dendl;
4785 }
4786 CDentry::linkage_t *dnl = dn->get_linkage();
4787
4788 // dn auth_pin?
4789 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4790 if (pinned_it != strong->authpinned_dentries.end()) {
f67539c2
TL
4791 const auto peer_reqid_it = pinned_it->second.find(ss);
4792 if (peer_reqid_it != pinned_it->second.end()) {
4793 for (const auto &r : peer_reqid_it->second) {
11fdf7f2
TL
4794 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4795
f67539c2 4796 // get/create peer mdrequest
11fdf7f2
TL
4797 MDRequestRef mdr;
4798 if (have_request(r.reqid))
4799 mdr = request_get(r.reqid);
4800 else
f67539c2 4801 mdr = request_start_peer(r.reqid, r.attempt, strong);
11fdf7f2
TL
4802 mdr->auth_pin(dn);
4803 }
4804 }
7c673cae 4805 }
7c673cae 4806
11fdf7f2
TL
4807 // dn xlock?
4808 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4809 if (xlocked_it != strong->xlocked_dentries.end()) {
4810 const auto ss_req_it = xlocked_it->second.find(ss);
4811 if (ss_req_it != xlocked_it->second.end()) {
f67539c2 4812 const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second;
11fdf7f2
TL
4813 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4814 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4815 ceph_assert(mdr->is_auth_pinned(dn));
4816 if (!mdr->is_xlocked(&dn->versionlock)) {
4817 ceph_assert(dn->versionlock.can_xlock_local());
4818 dn->versionlock.get_xlock(mdr, mdr->get_client());
9f95a23c 4819 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
11fdf7f2
TL
4820 }
4821 if (dn->lock.is_stable())
4822 dn->auth_pin(&dn->lock);
4823 dn->lock.set_state(LOCK_XLOCK);
4824 dn->lock.get_xlock(mdr, mdr->get_client());
9f95a23c 4825 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
11fdf7f2
TL
4826 }
4827 }
7c673cae 4828
11fdf7f2
TL
4829 dn->add_replica(from, d.nonce);
4830 dout(10) << " have " << *dn << dendl;
4831
4832 if (dnl->is_primary()) {
4833 if (d.is_primary()) {
4834 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4835 // the survivor missed MDentryUnlink+MDentryLink messages ?
4836 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4837 CInode *in = get_inode(d.ino, ss.snapid);
4838 ceph_assert(in);
4839 ceph_assert(in->get_parent_dn());
4840 rejoin_unlinked_inodes[from].insert(in);
4841 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4842 }
4843 } else {
4844 // the survivor missed MDentryLink message ?
4845 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4846 dout(7) << " sender doesn't have primary dentry" << dendl;
4847 }
4848 } else {
4849 if (d.is_primary()) {
4850 // the survivor missed MDentryUnlink message ?
4851 CInode *in = get_inode(d.ino, ss.snapid);
4852 ceph_assert(in);
4853 ceph_assert(in->get_parent_dn());
7c673cae 4854 rejoin_unlinked_inodes[from].insert(in);
11fdf7f2 4855 dout(7) << " sender has primary dentry but we don't" << dendl;
7c673cae 4856 }
11fdf7f2 4857 }
7c673cae
FG
4858 }
4859 }
4860 }
4861
11fdf7f2
TL
4862 for (const auto &p : strong->strong_inodes) {
4863 CInode *in = get_inode(p.first);
4864 ceph_assert(in);
4865 in->add_replica(from, p.second.nonce);
7c673cae
FG
4866 dout(10) << " have " << *in << dendl;
4867
11fdf7f2 4868 const MMDSCacheRejoin::inode_strong& is = p.second;
7c673cae
FG
4869
4870 // caps_wanted
4871 if (is.caps_wanted) {
11fdf7f2 4872 in->set_mds_caps_wanted(from, is.caps_wanted);
7c673cae
FG
4873 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4874 << " on " << *in << dendl;
4875 }
4876
4877 // scatterlocks?
4878 // infer state from replica state:
4879 // * go to MIX if they might have wrlocks
4880 // * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4881 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4882 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4883 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4884
4885 // auth pin?
11fdf7f2
TL
4886 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4887 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4888 for (const auto& r : authpinned_inodes_it->second) {
4889 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
7c673cae 4890
f67539c2 4891 // get/create peer mdrequest
7c673cae 4892 MDRequestRef mdr;
11fdf7f2
TL
4893 if (have_request(r.reqid))
4894 mdr = request_get(r.reqid);
7c673cae 4895 else
f67539c2 4896 mdr = request_start_peer(r.reqid, r.attempt, strong);
7c673cae 4897 if (strong->frozen_authpin_inodes.count(in->vino())) {
11fdf7f2 4898 ceph_assert(!in->get_num_auth_pins());
7c673cae
FG
4899 mdr->freeze_auth_pin(in);
4900 } else {
11fdf7f2 4901 ceph_assert(!in->is_frozen_auth_pin());
7c673cae
FG
4902 }
4903 mdr->auth_pin(in);
4904 }
4905 }
4906 // xlock(s)?
11fdf7f2
TL
4907 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4908 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4909 for (const auto &q : xlocked_inodes_it->second) {
4910 SimpleLock *lock = in->get_lock(q.first);
4911 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4912 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4913 ceph_assert(mdr->is_auth_pinned(in));
4914 if (!mdr->is_xlocked(&in->versionlock)) {
4915 ceph_assert(in->versionlock.can_xlock_local());
7c673cae 4916 in->versionlock.get_xlock(mdr, mdr->get_client());
9f95a23c 4917 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4918 }
4919 if (lock->is_stable())
4920 in->auth_pin(lock);
4921 lock->set_state(LOCK_XLOCK);
4922 if (lock == &in->filelock)
4923 in->loner_cap = -1;
4924 lock->get_xlock(mdr, mdr->get_client());
9f95a23c 4925 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4926 }
4927 }
4928 }
4929 // wrlock(s)?
11fdf7f2
TL
4930 for (const auto &p : strong->wrlocked_inodes) {
4931 CInode *in = get_inode(p.first);
4932 for (const auto &q : p.second) {
4933 SimpleLock *lock = in->get_lock(q.first);
4934 for (const auto &r : q.second) {
4935 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4936 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
7c673cae 4937 if (in->is_auth())
11fdf7f2 4938 ceph_assert(mdr->is_auth_pinned(in));
7c673cae
FG
4939 lock->set_state(LOCK_MIX);
4940 if (lock == &in->filelock)
4941 in->loner_cap = -1;
4942 lock->get_wrlock(true);
9f95a23c 4943 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
7c673cae
FG
4944 }
4945 }
4946 }
4947
4948 // done?
11fdf7f2 4949 ceph_assert(rejoin_gather.count(from));
7c673cae 4950 rejoin_gather.erase(from);
28e407b8 4951 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4952 rejoin_gather_finish();
4953 } else {
4954 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4955 }
4956}
4957
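/**
 * handle_cache_rejoin_ack -- summary of the function below: process an OP_ACK
 * rejoin reply from an auth MDS, refragment our replica dirfrags to match the
 * auth, repair any dentry linkage we missed, decode full inode bases and lock
 * states, send CEPH_CAP_OP_EXPORT to clients whose caps moved to the sender,
 * and, once the last ack is in, move on to open_snaprealms() (survivors just
 * queue the rejoin waiters).
 */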
9f95a23c 4958void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
7c673cae
FG
4959{
4960 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4961 mds_rank_t from = mds_rank_t(ack->get_source().num());
4962
11fdf7f2 4963 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
b32b8144
FG
4964 bool survivor = !mds->is_rejoin();
4965
7c673cae
FG
4966 // for sending cache expire message
4967 set<CInode*> isolated_inodes;
4968 set<CInode*> refragged_inodes;
11fdf7f2 4969 list<pair<CInode*,int> > updated_realms;
7c673cae
FG
4970
4971 // dirs
11fdf7f2 4972 for (const auto &p : ack->strong_dirfrags) {
7c673cae
FG
4973 // we may have had incorrect dir fragmentation; refragment based
4974 // on what the auth tells us.
11fdf7f2 4975 CDir *dir = get_dirfrag(p.first);
7c673cae 4976 if (!dir) {
11fdf7f2 4977 dir = get_force_dirfrag(p.first, false);
7c673cae
FG
4978 if (dir)
4979 refragged_inodes.insert(dir->get_inode());
4980 }
4981 if (!dir) {
11fdf7f2 4982 CInode *diri = get_inode(p.first.ino);
7c673cae
FG
4983 if (!diri) {
4984 // barebones inode; the full inode loop below will clean up.
4985 diri = new CInode(this, false);
f67539c2
TL
4986 auto _inode = diri->_get_inode();
4987 _inode->ino = p.first.ino;
4988 _inode->mode = S_IFDIR;
4989 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4990
7c673cae 4991 add_inode(diri);
11fdf7f2 4992 if (MDS_INO_MDSDIR(from) == p.first.ino) {
7c673cae
FG
4993 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4994 dout(10) << " add inode " << *diri << dendl;
4995 } else {
4996 diri->inode_auth = CDIR_AUTH_DEFAULT;
4997 isolated_inodes.insert(diri);
11fdf7f2 4998 dout(10) << " unconnected dirfrag " << p.first << dendl;
7c673cae
FG
4999 }
5000 }
5001 // barebones dirfrag; the full dirfrag loop below will clean up.
11fdf7f2
TL
5002 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
5003 if (MDS_INO_MDSDIR(from) == p.first.ino ||
7c673cae
FG
5004 (dir->authority() != CDIR_AUTH_UNDEF &&
5005 dir->authority().first != from))
5006 adjust_subtree_auth(dir, from);
5007 dout(10) << " add dirfrag " << *dir << dendl;
5008 }
5009
11fdf7f2 5010 dir->set_replica_nonce(p.second.nonce);
7c673cae
FG
5011 dir->state_clear(CDir::STATE_REJOINING);
5012 dout(10) << " got " << *dir << dendl;
5013
5014 // dentries
11fdf7f2
TL
5015 auto it = ack->strong_dentries.find(p.first);
5016 if (it != ack->strong_dentries.end()) {
5017 for (const auto &q : it->second) {
5018 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
5019 if(!dn)
5020 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
5021
5022 CDentry::linkage_t *dnl = dn->get_linkage();
5023
5024 ceph_assert(dn->last == q.first.snapid);
5025 if (dn->first != q.second.first) {
5026 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5027 dn->first = q.second.first;
5028 }
7c673cae 5029
11fdf7f2
TL
5030 // may have bad linkage if we missed dentry link/unlink messages
5031 if (dnl->is_primary()) {
5032 CInode *in = dnl->get_inode();
5033 if (!q.second.is_primary() ||
5034 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5035 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5036 dir->unlink_inode(dn);
5037 }
5038 } else if (dnl->is_remote()) {
5039 if (!q.second.is_remote() ||
5040 q.second.remote_ino != dnl->get_remote_ino() ||
5041 q.second.remote_d_type != dnl->get_remote_d_type()) {
5042 dout(10) << " had bad linkage for " << *dn << dendl;
5043 dir->unlink_inode(dn);
5044 }
5045 } else {
5046 if (!q.second.is_null())
5047 dout(10) << " had bad linkage for " << *dn << dendl;
5048 }
7c673cae 5049
f67539c2
TL
5050 // hmm, did we have the proper linkage here?
5051 if (dnl->is_null() && !q.second.is_null()) {
11fdf7f2
TL
5052 if (q.second.is_remote()) {
5053 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5054 } else {
5055 CInode *in = get_inode(q.second.ino, q.first.snapid);
5056 if (!in) {
5057 // barebones inode; assume it's a dir, the full inode loop below will clean up.
5058 in = new CInode(this, false, q.second.first, q.first.snapid);
f67539c2
TL
5059 auto _inode = in->_get_inode();
5060 _inode->ino = q.second.ino;
5061 _inode->mode = S_IFDIR;
5062 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
11fdf7f2
TL
5063 add_inode(in);
5064 dout(10) << " add inode " << *in << dendl;
5065 } else if (in->get_parent_dn()) {
5066 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5067 << ", unlinking " << *in << dendl;
5068 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5069 }
5070 dn->dir->link_primary_inode(dn, in);
5071 isolated_inodes.erase(in);
7c673cae 5072 }
f67539c2 5073 }
7c673cae 5074
11fdf7f2
TL
5075 dn->set_replica_nonce(q.second.nonce);
5076 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5077 dn->state_clear(CDentry::STATE_REJOINING);
5078 dout(10) << " got " << *dn << dendl;
5079 }
7c673cae
FG
5080 }
5081 }
5082
9f95a23c
TL
5083 for (const auto& in : refragged_inodes) {
5084 auto&& ls = in->get_nested_dirfrags();
5085 for (const auto& dir : ls) {
5086 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
7c673cae 5087 continue;
9f95a23c
TL
5088 ceph_assert(dir->get_num_any() == 0);
5089 in->close_dirfrag(dir->get_frag());
7c673cae
FG
5090 }
5091 }
5092
5093 // full dirfrags
11fdf7f2
TL
5094 for (const auto &p : ack->dirfrag_bases) {
5095 CDir *dir = get_dirfrag(p.first);
5096 ceph_assert(dir);
5097 auto q = p.second.cbegin();
7c673cae
FG
5098 dir->_decode_base(q);
5099 dout(10) << " got dir replica " << *dir << dendl;
5100 }
5101
5102 // full inodes
11fdf7f2 5103 auto p = ack->inode_base.cbegin();
7c673cae
FG
5104 while (!p.end()) {
5105 inodeno_t ino;
5106 snapid_t last;
5107 bufferlist basebl;
11fdf7f2
TL
5108 decode(ino, p);
5109 decode(last, p);
5110 decode(basebl, p);
7c673cae 5111 CInode *in = get_inode(ino, last);
11fdf7f2
TL
5112 ceph_assert(in);
5113 auto q = basebl.cbegin();
5114 snapid_t sseq = 0;
5115 if (in->snaprealm)
5116 sseq = in->snaprealm->srnode.seq;
7c673cae 5117 in->_decode_base(q);
11fdf7f2
TL
5118 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5119 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5120 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5121 }
7c673cae
FG
5122 dout(10) << " got inode base " << *in << dendl;
5123 }
5124
5125 // inodes
11fdf7f2 5126 p = ack->inode_locks.cbegin();
7c673cae
FG
5127 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5128 while (!p.end()) {
5129 inodeno_t ino;
5130 snapid_t last;
5131 __u32 nonce;
5132 bufferlist lockbl;
11fdf7f2
TL
5133 decode(ino, p);
5134 decode(last, p);
5135 decode(nonce, p);
5136 decode(lockbl, p);
7c673cae
FG
5137
5138 CInode *in = get_inode(ino, last);
11fdf7f2 5139 ceph_assert(in);
7c673cae 5140 in->set_replica_nonce(nonce);
11fdf7f2 5141 auto q = lockbl.cbegin();
b32b8144 5142 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5143 in->state_clear(CInode::STATE_REJOINING);
5144 dout(10) << " got inode locks " << *in << dendl;
5145 }
5146
5147 // FIXME: This can happen if the entire subtree, together with the inode the
5148 // subtree root belongs to, was trimmed between sending cache rejoin and receiving rejoin ack.
11fdf7f2 5149 ceph_assert(isolated_inodes.empty());
7c673cae
FG
5150
5151 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
11fdf7f2
TL
5152 auto bp = ack->imported_caps.cbegin();
5153 decode(peer_imported, bp);
7c673cae
FG
5154
5155 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5156 p != peer_imported.end();
5157 ++p) {
28e407b8 5158 auto& ex = cap_exports.at(p->first);
11fdf7f2 5159 ceph_assert(ex.first == from);
7c673cae
FG
5160 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5161 q != p->second.end();
5162 ++q) {
28e407b8 5163 auto r = ex.second.find(q->first);
11fdf7f2 5164 ceph_assert(r != ex.second.end());
7c673cae
FG
5165
5166 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5167 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5168 if (!session) {
5169 dout(10) << " no session for client." << p->first << dendl;
5170 ex.second.erase(r);
5171 continue;
5172 }
7c673cae
FG
5173
5174 // mark client caps stale.
9f95a23c 5175 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5176 r->second.capinfo.cap_id, 0,
7c673cae
FG
5177 mds->get_osd_epoch_barrier());
5178 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5179 (q->second.cap_id > 0 ? from : -1), 0);
5180 mds->send_message_client_counted(m, session);
5181
28e407b8 5182 ex.second.erase(r);
7c673cae 5183 }
11fdf7f2
TL
5184 ceph_assert(ex.second.empty());
5185 }
5186
5187 for (auto p : updated_realms) {
5188 CInode *in = p.first;
5189 bool notify_clients;
5190 if (mds->is_rejoin()) {
5191 if (!rejoin_pending_snaprealms.count(in)) {
5192 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5193 rejoin_pending_snaprealms.insert(in);
5194 }
5195 notify_clients = false;
5196 } else {
5197 // notify clients if I'm survivor
5198 notify_clients = true;
5199 }
5200 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
7c673cae
FG
5201 }
5202
5203 // done?
11fdf7f2 5204 ceph_assert(rejoin_ack_gather.count(from));
7c673cae 5205 rejoin_ack_gather.erase(from);
b32b8144 5206 if (!survivor) {
7c673cae
FG
5207 if (rejoin_gather.empty()) {
5208 // eval unstable scatter locks after all wrlocks are rejoined.
5209 while (!rejoin_eval_locks.empty()) {
5210 SimpleLock *lock = rejoin_eval_locks.front();
5211 rejoin_eval_locks.pop_front();
5212 if (!lock->is_stable())
5213 mds->locker->eval_gather(lock);
5214 }
5215 }
5216
5217 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5218 rejoin_ack_gather.empty()) {
5219 // finally, kickstart past snap parent opens
11fdf7f2 5220 open_snaprealms();
7c673cae
FG
5221 } else {
5222 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5223 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5224 }
5225 } else {
5226 // survivor.
5227 mds->queue_waiters(rejoin_waiters);
5228 }
5229}
5230
5231/**
5232 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5233 *
5234 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5235 * messages that clean these guys up...
5236 */
5237void MDCache::rejoin_trim_undef_inodes()
5238{
5239 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5240
5241 while (!rejoin_undef_inodes.empty()) {
5242 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5243 CInode *in = *p;
5244 rejoin_undef_inodes.erase(p);
5245
5246 in->clear_replica_map();
5247
5248 // close out dirfrags
5249 if (in->is_dir()) {
9f95a23c
TL
5250 const auto&& dfls = in->get_dirfrags();
5251 for (const auto& dir : dfls) {
7c673cae
FG
5252 dir->clear_replica_map();
5253
94b18763
FG
5254 for (auto &p : dir->items) {
5255 CDentry *dn = p.second;
7c673cae
FG
5256 dn->clear_replica_map();
5257
5258 dout(10) << " trimming " << *dn << dendl;
5259 dir->remove_dentry(dn);
5260 }
5261
5262 dout(10) << " trimming " << *dir << dendl;
5263 in->close_dirfrag(dir->dirfrag().frag);
5264 }
5265 }
5266
5267 CDentry *dn = in->get_parent_dn();
5268 if (dn) {
5269 dn->clear_replica_map();
5270 dout(10) << " trimming " << *dn << dendl;
5271 dn->dir->remove_dentry(dn);
5272 } else {
5273 dout(10) << " trimming " << *in << dendl;
5274 remove_inode(in);
5275 }
5276 }
5277
11fdf7f2 5278 ceph_assert(rejoin_undef_inodes.empty());
7c673cae
FG
5279}
5280
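/**
 * rejoin_gather_finish -- run once every rejoin we were waiting for has
 * arrived.  Fetching still-undefined inodes/dirfrags and processing imported
 * caps may each defer and re-enter this function; after that we choose lock
 * states, identify files needing size recovery and send the rejoin acks.
 */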
5281void MDCache::rejoin_gather_finish()
5282{
5283 dout(10) << "rejoin_gather_finish" << dendl;
11fdf7f2
TL
5284 ceph_assert(mds->is_rejoin());
5285 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5286
5287 if (open_undef_inodes_dirfrags())
5288 return;
5289
5290 if (process_imported_caps())
5291 return;
5292
5293 choose_lock_states_and_reconnect_caps();
5294
5295 identify_files_to_recover();
5296 rejoin_send_acks();
5297
5298 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5299 rejoin_ack_gather.erase(mds->get_nodeid());
5300
5301 // did we already get our acks too?
5302 if (rejoin_ack_gather.empty()) {
11fdf7f2
TL
5303 // finally, open snaprealms
5304 open_snaprealms();
7c673cae
FG
5305 }
5306}
5307
5308class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5309 inodeno_t ino;
5310public:
5311 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5312 void finish(int r) override {
5313 mdcache->rejoin_open_ino_finish(ino, r);
5314 }
5315};
5316
5317void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5318{
5319 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5320
5321 if (ret < 0) {
5322 cap_imports_missing.insert(ino);
5323 } else if (ret == mds->get_nodeid()) {
11fdf7f2 5324 ceph_assert(get_inode(ino));
7c673cae
FG
5325 } else {
5326 auto p = cap_imports.find(ino);
11fdf7f2 5327 ceph_assert(p != cap_imports.end());
7c673cae 5328 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
11fdf7f2
TL
5329 ceph_assert(q->second.count(MDS_RANK_NONE));
5330 ceph_assert(q->second.size() == 1);
7c673cae
FG
5331 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5332 }
5333 cap_imports.erase(p);
5334 }
5335
11fdf7f2 5336 ceph_assert(cap_imports_num_opening > 0);
7c673cae
FG
5337 cap_imports_num_opening--;
5338
5339 if (cap_imports_num_opening == 0) {
522d829b 5340 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
7c673cae
FG
5341 rejoin_gather_finish();
5342 else if (rejoin_gather.count(mds->get_nodeid()))
5343 process_imported_caps();
5344 }
5345}
5346
5347class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5348public:
28e407b8
AA
5349 map<client_t,pair<Session*,uint64_t> > session_map;
5350 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae 5351 void finish(int r) override {
11fdf7f2 5352 ceph_assert(r == 0);
28e407b8 5353 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5354 }
5355};
5356
28e407b8 5357void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5358{
5359 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5360 mds->server->finish_force_open_sessions(session_map);
5361 rejoin_session_map.swap(session_map);
522d829b 5362 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
7c673cae
FG
5363 rejoin_gather_finish();
5364}
5365
11fdf7f2
TL
5366void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5367{
5368 auto p = cap_imports.find(ino);
5369 if (p != cap_imports.end()) {
5370 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5371 if (ret < 0) {
5372 cap_imports_missing.insert(ino);
5373 } else if (ret != mds->get_nodeid()) {
5374 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5375 ceph_assert(q->second.count(MDS_RANK_NONE));
5376 ceph_assert(q->second.size() == 1);
5377 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5378 }
5379 cap_imports.erase(p);
5380 }
5381 }
5382}
5383
7c673cae
FG
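/**
 * process_imported_caps -- turn reconnected client caps into real Capability
 * objects.  Returns true if it had to defer (openfiletable prefetch, open_ino
 * of still-missing inodes, or force-opening client sessions via an ESessions
 * journal entry); the relevant completion callback drives it again.
 */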
5384bool MDCache::process_imported_caps()
5385{
5386 dout(10) << "process_imported_caps" << dendl;
5387
11fdf7f2
TL
5388 if (!open_file_table.is_prefetched() &&
5389 open_file_table.prefetch_inodes()) {
5390 open_file_table.wait_for_prefetch(
5391 new MDSInternalContextWrapper(mds,
9f95a23c 5392 new LambdaContext([this](int r) {
11fdf7f2
TL
5393 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5394 process_imported_caps();
5395 })
5396 )
5397 );
5398 return true;
5399 }
5400
f91f0fd5
TL
5401 for (auto& p : cap_imports) {
5402 CInode *in = get_inode(p.first);
7c673cae 5403 if (in) {
11fdf7f2 5404 ceph_assert(in->is_auth());
f91f0fd5 5405 cap_imports_missing.erase(p.first);
7c673cae
FG
5406 continue;
5407 }
f91f0fd5 5408 if (cap_imports_missing.count(p.first) > 0)
7c673cae
FG
5409 continue;
5410
f91f0fd5
TL
5411 uint64_t parent_ino = 0;
5412 std::string_view d_name;
5413 for (auto& q : p.second) {
5414 for (auto& r : q.second) {
5415 auto &icr = r.second;
5416 if (icr.capinfo.pathbase &&
5417 icr.path.length() > 0 &&
5418 icr.path.find('/') == string::npos) {
5419 parent_ino = icr.capinfo.pathbase;
5420 d_name = icr.path;
5421 break;
5422 }
5423 }
5424 if (parent_ino)
5425 break;
5426 }
5427
5428 dout(10) << " opening missing ino " << p.first << dendl;
7c673cae 5429 cap_imports_num_opening++;
f91f0fd5
TL
5430 auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
5431 if (parent_ino) {
5432 vector<inode_backpointer_t> ancestors;
5433 ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
5434 open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
5435 } else {
5436 open_ino(p.first, (int64_t)-1, fin, false);
5437 }
33c7a0ef 5438 if (!(cap_imports_num_opening % mds->heartbeat_reset_grace()))
28e407b8 5439 mds->heartbeat_reset();
7c673cae
FG
5440 }
5441
5442 if (cap_imports_num_opening > 0)
5443 return true;
5444
5445 // called by rejoin_gather_finish() ?
5446 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5447 if (!rejoin_client_map.empty() &&
5448 rejoin_session_map.empty()) {
5449 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5450 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
11fdf7f2 5451 rejoin_client_metadata_map,
28e407b8 5452 finish->session_map);
11fdf7f2
TL
5453 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5454 std::move(rejoin_client_metadata_map));
5455 mds->mdlog->start_submit_entry(le, finish);
28e407b8
AA
5456 mds->mdlog->flush();
5457 rejoin_client_map.clear();
11fdf7f2 5458 rejoin_client_metadata_map.clear();
28e407b8 5459 return true;
7c673cae 5460 }
7c673cae 5461
f67539c2
TL
5462 // process caps that were exported by peer rename
5463 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
5464 p != rejoin_peer_exports.end();
7c673cae
FG
5465 ++p) {
5466 CInode *in = get_inode(p->first);
11fdf7f2 5467 ceph_assert(in);
7c673cae
FG
5468 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5469 q != p->second.second.end();
5470 ++q) {
28e407b8
AA
5471 auto r = rejoin_session_map.find(q->first);
5472 if (r == rejoin_session_map.end())
5473 continue;
7c673cae 5474
28e407b8 5475 Session *session = r->second.first;
7c673cae 5476 Capability *cap = in->get_client_cap(q->first);
11fdf7f2 5477 if (!cap) {
7c673cae 5478 cap = in->add_client_cap(q->first, session);
11fdf7f2
TL
5479 // add empty item to reconnected_caps
5480 (void)reconnected_caps[p->first][q->first];
5481 }
7c673cae
FG
5482 cap->merge(q->second, true);
5483
5484 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
11fdf7f2
TL
5485 ceph_assert(cap->get_last_seq() == im.issue_seq);
5486 ceph_assert(cap->get_mseq() == im.mseq);
7c673cae
FG
5487 cap->set_cap_id(im.cap_id);
5488 // send cap import because we assigned a new cap ID
5489 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5490 p->second.first, CEPH_CAP_FLAG_AUTH);
5491 }
5492 }
f67539c2 5493 rejoin_peer_exports.clear();
7c673cae
FG
5494 rejoin_imported_caps.clear();
5495
5496 // process cap imports
5497 // ino -> client -> frommds -> capex
5498 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5499 CInode *in = get_inode(p->first);
5500 if (!in) {
5501 dout(10) << " still missing ino " << p->first
5502 << ", will try again after replayed client requests" << dendl;
5503 ++p;
5504 continue;
5505 }
11fdf7f2 5506 ceph_assert(in->is_auth());
7c673cae 5507 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5508 Session *session;
5509 {
5510 auto r = rejoin_session_map.find(q->first);
5511 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5512 }
5513
7c673cae 5514 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5515 if (!session) {
5516 if (r->first >= 0)
5517 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5518 continue;
5519 }
5520
7c673cae
FG
5521 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5522 add_reconnected_cap(q->first, in->ino(), r->second);
5523 if (r->first >= 0) {
5524 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5525 cap->inc_mseq();
5526 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5527
5528 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5529 im.cap_id = cap->get_cap_id();
5530 im.issue_seq = cap->get_last_seq();
5531 im.mseq = cap->get_mseq();
5532 }
5533 }
5534 }
5535 cap_imports.erase(p++); // remove and move on
5536 }
5537 } else {
5538 trim_non_auth();
5539
11fdf7f2 5540 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5541 rejoin_gather.erase(mds->get_nodeid());
11fdf7f2 5542 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5543 maybe_send_pending_rejoins();
7c673cae
FG
5544 }
5545 return false;
5546}
5547
7c673cae
FG
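/**
 * rebuild_need_snapflush -- a reconnecting client reported dirty state that
 * "follows" an old snapid.  Walk the snapshotted CInodes between snap_follows
 * and the head, record which snapids still need a snapflush from this client,
 * and wrlock the per-inode locks so the flush can be applied later.
 */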
5548void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5549 client_t client, snapid_t snap_follows)
5550{
5551 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5552
11fdf7f2
TL
5553 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5554 return;
5555
7c673cae
FG
5556 const set<snapid_t>& snaps = realm->get_snaps();
5557 snapid_t follows = snap_follows;
5558
5559 while (true) {
5560 CInode *in = pick_inode_snap(head_in, follows);
5561 if (in == head_in)
5562 break;
11fdf7f2
TL
5563
5564 bool need_snapflush = false;
5565 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5566 p != snaps.end() && *p <= in->last;
5567 ++p) {
5568 head_in->add_need_snapflush(in, *p, client);
5569 need_snapflush = true;
5570 }
5571 follows = in->last;
5572 if (!need_snapflush)
5573 continue;
5574
7c673cae
FG
5575 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5576
eafe8130
TL
5577 if (in->client_snap_caps.empty()) {
5578 for (int i = 0; i < num_cinode_locks; i++) {
5579 int lockid = cinode_lock_info[i].lock;
5580 SimpleLock *lock = in->get_lock(lockid);
5581 ceph_assert(lock);
5582 in->auth_pin(lock);
5583 lock->set_state(LOCK_SNAP_SYNC);
5584 lock->get_wrlock(true);
5585 }
7c673cae 5586 }
eafe8130 5587 in->client_snap_caps.insert(client);
11fdf7f2 5588 mds->locker->mark_need_snapflush_inode(in);
7c673cae
FG
5589 }
5590}
5591
5592/*
5593 * choose lock states based on reconnected caps
5594 */
5595void MDCache::choose_lock_states_and_reconnect_caps()
5596{
5597 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5598
81eedcae 5599 int count = 0;
11fdf7f2
TL
5600 for (auto p : inode_map) {
5601 CInode *in = p.second;
7c673cae
FG
5602 if (in->last != CEPH_NOSNAP)
5603 continue;
5604
f67539c2 5605 if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat())
7c673cae
FG
5606 in->mark_dirty_rstat();
5607
7c673cae 5608 int dirty_caps = 0;
11fdf7f2
TL
5609 auto q = reconnected_caps.find(in->ino());
5610 if (q != reconnected_caps.end()) {
5611 for (const auto &it : q->second)
7c673cae
FG
5612 dirty_caps |= it.second.dirty_caps;
5613 }
5614 in->choose_lock_states(dirty_caps);
5615 dout(15) << " chose lock states on " << *in << dendl;
5616
11fdf7f2
TL
5617 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5618 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5619 rejoin_pending_snaprealms.insert(in);
7c673cae 5620 }
81eedcae 5621
33c7a0ef 5622 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae 5623 mds->heartbeat_reset();
11fdf7f2 5624 }
7c673cae
FG
5625}
5626
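/**
 * prepare_realm_split -- queue (or reuse) a CEPH_SNAP_OP_SPLIT MClientSnap
 * for this client, carrying the realm's snap trace and open children, and
 * append 'ino' to split_inos so the client moves that inode under the realm
 * rooted at realm->inode.
 */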
5627void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
9f95a23c 5628 map<client_t,ref_t<MClientSnap>>& splits)
7c673cae 5629{
9f95a23c 5630 ref_t<MClientSnap> snap;
11fdf7f2
TL
5631 auto it = splits.find(client);
5632 if (it != splits.end()) {
5633 snap = it->second;
5634 snap->head.op = CEPH_SNAP_OP_SPLIT;
5635 } else {
9f95a23c 5636 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2 5637 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae 5638 snap->head.split = realm->inode->ino();
11fdf7f2 5639 snap->bl = realm->get_snap_trace();
7c673cae 5640
11fdf7f2
TL
5641 for (const auto& child : realm->open_children)
5642 snap->split_realms.push_back(child->inode->ino());
5643 }
7c673cae
FG
5644 snap->split_inos.push_back(ino);
5645}
5646
11fdf7f2 5647void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
9f95a23c 5648 map<client_t,ref_t<MClientSnap>>& splits)
11fdf7f2
TL
5649{
5650 ceph_assert(parent_realm);
5651
5652 vector<inodeno_t> split_inos;
5653 vector<inodeno_t> split_realms;
5654
f67539c2 5655 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p)
11fdf7f2
TL
5656 split_inos.push_back((*p)->ino());
5657 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5658 p != realm->open_children.end();
5659 ++p)
5660 split_realms.push_back((*p)->inode->ino());
5661
5662 for (const auto& p : realm->client_caps) {
5663 ceph_assert(!p.second->empty());
5664 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5665 if (em.second) {
9f95a23c 5666 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2
TL
5667 update->head.split = parent_realm->inode->ino();
5668 update->split_inos = split_inos;
5669 update->split_realms = split_realms;
5670 update->bl = parent_realm->get_snap_trace();
5671 em.first->second = std::move(update);
5672 }
5673 }
5674}
5675
9f95a23c 5676void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
7c673cae
FG
5677{
5678 dout(10) << "send_snaps" << dendl;
5679
11fdf7f2
TL
5680 for (auto &p : splits) {
5681 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
7c673cae 5682 if (session) {
11fdf7f2
TL
5683 dout(10) << " client." << p.first
5684 << " split " << p.second->head.split
5685 << " inos " << p.second->split_inos
7c673cae 5686 << dendl;
11fdf7f2 5687 mds->send_message_client_counted(p.second, session);
7c673cae 5688 } else {
11fdf7f2 5689 dout(10) << " no session for client." << p.first << dendl;
7c673cae
FG
5690 }
5691 }
5692 splits.clear();
5693}
5694
5695
5696/*
5697 * remove any items from logsegment open_file lists that don't have
5698 * any caps
5699 */
5700void MDCache::clean_open_file_lists()
5701{
5702 dout(10) << "clean_open_file_lists" << dendl;
5703
5704 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5705 p != mds->mdlog->segments.end();
5706 ++p) {
5707 LogSegment *ls = p->second;
5708
5709 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5710 while (!q.end()) {
5711 CInode *in = *q;
5712 ++q;
5713 if (in->last == CEPH_NOSNAP) {
11fdf7f2
TL
5714 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5715 in->item_open_file.remove_myself();
5716 } else {
7c673cae
FG
5717 if (in->client_snap_caps.empty()) {
5718 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5719 in->item_open_file.remove_myself();
5720 }
5721 }
5722 }
5723 }
5724}
5725
11fdf7f2
TL
5726void MDCache::dump_openfiles(Formatter *f)
5727{
5728 f->open_array_section("openfiles");
5729 for (auto p = mds->mdlog->segments.begin();
5730 p != mds->mdlog->segments.end();
5731 ++p) {
5732 LogSegment *ls = p->second;
5733
5734 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5735 while (!q.end()) {
5736 CInode *in = *q;
5737 ++q;
5738 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5739 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5740 continue;
5741 f->open_object_section("file");
5742 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5743 f->close_section();
5744 }
5745 }
5746 f->close_section();
5747}
7c673cae
FG
5748
5749Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5750{
5751 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5752 << " on " << *in << dendl;
5753 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5754 if (!session) {
5755 dout(10) << " no session for client." << client << dendl;
5756 return NULL;
5757 }
5758
5759 Capability *cap = in->reconnect_cap(client, icr, session);
5760
5761 if (frommds >= 0) {
5762 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5763 cap->inc_mseq();
5764 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5765 }
5766
5767 return cap;
5768}
5769
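/**
 * export_remaining_imported_caps -- any reconnected caps whose inodes never
 * turned up locally are handed back: clients that still have sessions get a
 * stale CEPH_CAP_OP_EXPORT (no peer), cap_reconnect waiters are kicked, and
 * the missing inodes are reported in a single cluster-log warning.
 */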
5770void MDCache::export_remaining_imported_caps()
5771{
5772 dout(10) << "export_remaining_imported_caps" << dendl;
5773
f67539c2 5774 CachedStackStringStream css;
7c673cae 5775
81eedcae 5776 int count = 0;
7c673cae 5777 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
f67539c2 5778 *css << " ino " << p->first << "\n";
7c673cae
FG
5779 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5780 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5781 if (session) {
5782 // mark client caps stale.
9f95a23c
TL
5783 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5784 0, 0, 0,
5785 mds->get_osd_epoch_barrier());
7c673cae
FG
5786 stale->set_cap_peer(0, 0, 0, -1, 0);
5787 mds->send_message_client_counted(stale, q->first);
5788 }
5789 }
5790
33c7a0ef 5791 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae 5792 mds->heartbeat_reset();
7c673cae
FG
5793 }
5794
11fdf7f2 5795 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
7c673cae
FG
5796 p != cap_reconnect_waiters.end();
5797 ++p)
5798 mds->queue_waiters(p->second);
5799
5800 cap_imports.clear();
5801 cap_reconnect_waiters.clear();
5802
f67539c2
TL
5803 if (css->strv().length()) {
5804 mds->clog->warn() << "failed to reconnect caps for missing inodes:"
5805 << css->strv();
7c673cae
FG
5806 }
5807}
5808
a8e16298 5809Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
7c673cae
FG
5810{
5811 client_t client = session->info.get_client();
a8e16298 5812 Capability *cap = nullptr;
7c673cae
FG
5813 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5814 if (rc) {
a8e16298 5815 cap = in->reconnect_cap(client, *rc, session);
7c673cae
FG
5816 dout(10) << "try_reconnect_cap client." << client
5817 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5818 << " issue " << ccap_string(rc->capinfo.issued)
5819 << " on " << *in << dendl;
5820 remove_replay_cap_reconnect(in->ino(), client);
5821
5822 if (in->is_replicated()) {
5823 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5824 } else {
5825 int dirty_caps = 0;
5826 auto p = reconnected_caps.find(in->ino());
5827 if (p != reconnected_caps.end()) {
5828 auto q = p->second.find(client);
5829 if (q != p->second.end())
5830 dirty_caps = q->second.dirty_caps;
5831 }
5832 in->choose_lock_states(dirty_caps);
5833 dout(15) << " chose lock states on " << *in << dendl;
5834 }
5835
11fdf7f2 5836 map<inodeno_t, MDSContext::vec >::iterator it =
7c673cae
FG
5837 cap_reconnect_waiters.find(in->ino());
5838 if (it != cap_reconnect_waiters.end()) {
5839 mds->queue_waiters(it->second);
5840 cap_reconnect_waiters.erase(it);
5841 }
5842 }
a8e16298 5843 return cap;
7c673cae
FG
5844}
5845
5846
5847
5848// -------
5849// cap imports and delayed snap parent opens
5850
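/**
 * do_cap_import -- tell a client about a cap that now lives on this MDS:
 * refresh the Capability's issue state, then send a CEPH_CAP_OP_IMPORT
 * MClientCaps carrying the snaprealm trace and the exporting peer's identity.
 */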
5851void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5852 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5853 int peer, int p_flags)
5854{
7c673cae 5855 SnapRealm *realm = in->find_snaprealm();
f67539c2
TL
5856 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5857 if (cap->get_last_seq() == 0) // reconnected cap
5858 cap->inc_last_seq();
5859 cap->set_last_issue();
5860 cap->set_last_issue_stamp(ceph_clock_now());
5861 cap->clear_new();
5862 auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
5863 in->ino(), realm->inode->ino(), cap->get_cap_id(),
5864 cap->get_last_seq(), cap->pending(), cap->wanted(),
5865 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
5866 in->encode_cap_message(reap, cap);
5867 reap->snapbl = realm->get_snap_trace();
5868 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5869 mds->send_message_client_counted(reap, session);
7c673cae
FG
5870}
5871
5872void MDCache::do_delayed_cap_imports()
5873{
5874 dout(10) << "do_delayed_cap_imports" << dendl;
5875
11fdf7f2 5876 ceph_assert(delayed_imported_caps.empty());
7c673cae
FG
5877}
5878
11fdf7f2
TL
5879struct C_MDC_OpenSnapRealms : public MDCacheContext {
5880 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
7c673cae 5881 void finish(int r) override {
11fdf7f2 5882 mdcache->open_snaprealms();
7c673cae
FG
5883 }
5884};
5885
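/**
 * open_snaprealms -- final stage of rejoin: for each inode queued in
 * rejoin_pending_snaprealms, finish client snaprealm reconnects, rebuild any
 * pending snapflushes, queue realm split messages for caps sitting in the
 * wrong realm, then send the snap updates and complete rejoin_done.
 */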
11fdf7f2 5886void MDCache::open_snaprealms()
7c673cae 5887{
11fdf7f2 5888 dout(10) << "open_snaprealms" << dendl;
7c673cae 5889
11fdf7f2
TL
5890 auto it = rejoin_pending_snaprealms.begin();
5891 while (it != rejoin_pending_snaprealms.end()) {
5892 CInode *in = *it;
5893 SnapRealm *realm = in->snaprealm;
5894 ceph_assert(realm);
f67539c2
TL
5895
5896 map<client_t,ref_t<MClientSnap>> splits;
5897 // finish off client snaprealm reconnects?
5898 auto q = reconnected_snaprealms.find(in->ino());
5899 if (q != reconnected_snaprealms.end()) {
5900 for (const auto& r : q->second)
5901 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5902 reconnected_snaprealms.erase(q);
5903 }
5904
5905 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
5906 CInode *child = *p;
5907 auto q = reconnected_caps.find(child->ino());
5908 ceph_assert(q != reconnected_caps.end());
5909 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5910 Capability *cap = child->get_client_cap(r->first);
5911 if (!cap)
5912 continue;
5913 if (r->second.snap_follows > 0) {
5914 if (r->second.snap_follows < child->first - 1) {
5915 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5916 } else if (r->second.snapflush) {
5917 // When processing a cap flush message that is re-sent, it's possible
5918 // that the sender has already released all WR caps. So we should
5919 // force MDCache::cow_inode() to set up CInode::client_need_snapflush.
5920 cap->mark_needsnapflush();
7c673cae
FG
5921 }
5922 }
f67539c2
TL
5923 // make sure client's cap is in the correct snaprealm.
5924 if (r->second.realm_ino != in->ino()) {
5925 prepare_realm_split(realm, r->first, child->ino(), splits);
5926 }
7c673cae 5927 }
7c673cae 5928 }
7c673cae 5929
f67539c2
TL
5930 rejoin_pending_snaprealms.erase(it++);
5931 in->put(CInode::PIN_OPENINGSNAPPARENTS);
11fdf7f2 5932
f67539c2 5933 send_snaps(splits);
11fdf7f2
TL
5934 }
5935
5936 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5937
5938 if (!reconnected_snaprealms.empty()) {
5939 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5940 for (auto& p : reconnected_snaprealms) {
f67539c2
TL
5941 CachedStackStringStream css;
5942 *css << " " << p.first << " {";
11fdf7f2
TL
5943 bool first = true;
5944 for (auto& q : p.second) {
5945 if (!first)
f67539c2
TL
5946 *css << ", ";
5947 *css << "client." << q.first << "/" << q.second;
7c673cae 5948 }
f67539c2
TL
5949 *css << "}";
5950 dout(5) << css->strv() << dendl;
7c673cae 5951 }
7c673cae 5952 }
11fdf7f2
TL
5953 ceph_assert(rejoin_waiters.empty());
5954 ceph_assert(rejoin_pending_snaprealms.empty());
5955 dout(10) << "open_snaprealms - all open" << dendl;
5956 do_delayed_cap_imports();
5957
5958 ceph_assert(rejoin_done);
5959 rejoin_done.release()->complete(0);
5960 reconnected_caps.clear();
7c673cae
FG
5961}
5962
5963bool MDCache::open_undef_inodes_dirfrags()
5964{
5965 dout(10) << "open_undef_inodes_dirfrags "
5966 << rejoin_undef_inodes.size() << " inodes "
5967 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5968
5969 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5970
5971 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5972 p != rejoin_undef_inodes.end();
5973 ++p) {
5974 CInode *in = *p;
11fdf7f2 5975 ceph_assert(!in->is_base());
f67539c2 5976 ceph_assert(in->get_parent_dir());
7c673cae
FG
5977 fetch_queue.insert(in->get_parent_dir());
5978 }
5979
5980 if (fetch_queue.empty())
5981 return false;
5982
28e407b8
AA
5983 MDSGatherBuilder gather(g_ceph_context,
5984 new MDSInternalContextWrapper(mds,
9f95a23c 5985 new LambdaContext([this](int r) {
522d829b 5986 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
28e407b8
AA
5987 rejoin_gather_finish();
5988 })
5989 )
5990 );
5991
7c673cae
FG
5992 for (set<CDir*>::iterator p = fetch_queue.begin();
5993 p != fetch_queue.end();
5994 ++p) {
5995 CDir *dir = *p;
5996 CInode *diri = dir->get_inode();
5997 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5998 continue;
5999 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 6000 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
7c673cae
FG
6001 dir->fetch(gather.new_sub());
6002 }
11fdf7f2 6003 ceph_assert(gather.has_subs());
7c673cae
FG
6004 gather.activate();
6005 return true;
6006}
6007
6008void MDCache::opened_undef_inode(CInode *in) {
6009 dout(10) << "opened_undef_inode " << *in << dendl;
6010 rejoin_undef_inodes.erase(in);
6011 if (in->is_dir()) {
6012 // FIXME: re-hash dentries if necessary
f67539c2 6013 ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
9f95a23c 6014 if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
7c673cae 6015 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 6016 ceph_assert(dir);
7c673cae
FG
6017 rejoin_undef_dirfrags.erase(dir);
6018 in->force_dirfrags();
9f95a23c
TL
6019 auto&& ls = in->get_dirfrags();
6020 for (const auto& dir : ls) {
6021 rejoin_undef_dirfrags.insert(dir);
6022 }
7c673cae
FG
6023 }
6024 }
6025}
6026
11fdf7f2 6027void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
9f95a23c 6028 map<client_t,ref_t<MClientSnap>>& updates)
7c673cae
FG
6029{
6030 if (seq < realm->get_newest_seq()) {
6031 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
11fdf7f2 6032 << realm->get_newest_seq() << " on " << *realm << dendl;
9f95a23c 6033 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
11fdf7f2
TL
6034 snap->bl = realm->get_snap_trace();
6035 for (const auto& child : realm->open_children)
6036 snap->split_realms.push_back(child->inode->ino());
6037 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae
FG
6038 } else {
6039 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6040 << " on " << *realm << dendl;
6041 }
6042}
6043
6044
6045
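/**
 * rejoin_send_acks -- build an OP_ACK MMDSCacheRejoin for every rank in the
 * recovery set we have not yet acked: walk our auth subtrees adding strong
 * dirfrags, dentries and inode bases/locks for each replica (re-replicating
 * the path to unlinked inodes first), attach the imported-caps map, and send.
 */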
6046void MDCache::rejoin_send_acks()
6047{
6048 dout(7) << "rejoin_send_acks" << dendl;
6049
6050 // replicate stray
6051 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
6052 p != rejoin_unlinked_inodes.end();
6053 ++p) {
6054 for (set<CInode*>::iterator q = p->second.begin();
6055 q != p->second.end();
6056 ++q) {
6057 CInode *in = *q;
6058 dout(7) << " unlinked inode " << *in << dendl;
6059 // inode expired
6060 if (!in->is_replica(p->first))
6061 continue;
6062 while (1) {
6063 CDentry *dn = in->get_parent_dn();
6064 if (dn->is_replica(p->first))
6065 break;
6066 dn->add_replica(p->first);
6067 CDir *dir = dn->get_dir();
6068 if (dir->is_replica(p->first))
6069 break;
6070 dir->add_replica(p->first);
6071 in = dir->get_inode();
6072 if (in->is_replica(p->first))
6073 break;
224ce89b 6074 in->add_replica(p->first);
7c673cae
FG
6075 if (in->is_base())
6076 break;
6077 }
6078 }
6079 }
6080 rejoin_unlinked_inodes.clear();
6081
6082 // send acks to everyone in the recovery set
9f95a23c 6083 map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
7c673cae
FG
6084 for (set<mds_rank_t>::iterator p = recovery_set.begin();
6085 p != recovery_set.end();
31f18b77
FG
6086 ++p) {
6087 if (rejoin_ack_sent.count(*p))
6088 continue;
9f95a23c 6089 acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
31f18b77
FG
6090 }
6091
6092 rejoin_ack_sent = recovery_set;
7c673cae
FG
6093
6094 // walk subtrees
6095 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
6096 p != subtrees.end();
6097 ++p) {
6098 CDir *dir = p->first;
6099 if (!dir->is_auth())
6100 continue;
6101 dout(10) << "subtree " << *dir << dendl;
6102
6103 // auth items in this subtree
9f95a23c
TL
6104 std::queue<CDir*> dq;
6105 dq.push(dir);
7c673cae
FG
6106
6107 while (!dq.empty()) {
6108 CDir *dir = dq.front();
9f95a23c 6109 dq.pop();
7c673cae
FG
6110
6111 // dir
181888fb
FG
6112 for (auto &r : dir->get_replicas()) {
6113 auto it = acks.find(r.first);
31f18b77
FG
6114 if (it == acks.end())
6115 continue;
181888fb 6116 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 6117 it->second->add_dirfrag_base(dir);
7c673cae
FG
6118 }
6119
94b18763
FG
6120 for (auto &p : dir->items) {
6121 CDentry *dn = p.second;
7c673cae
FG
6122 CDentry::linkage_t *dnl = dn->get_linkage();
6123
6124 // inode
6125 CInode *in = NULL;
6126 if (dnl->is_primary())
6127 in = dnl->get_inode();
6128
6129 // dentry
181888fb
FG
6130 for (auto &r : dn->get_replicas()) {
6131 auto it = acks.find(r.first);
31f18b77
FG
6132 if (it == acks.end())
6133 continue;
f67539c2
TL
6134 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
6135 dn->first, dn->last,
7c673cae
FG
6136 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6137 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6138 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6139 ++r.second,
7c673cae
FG
6140 dn->lock.get_replica_state());
6141 // peer missed MDentryLink message ?
181888fb
FG
6142 if (in && !in->is_replica(r.first))
6143 in->add_replica(r.first);
7c673cae
FG
6144 }
6145
6146 if (!in)
6147 continue;
6148
181888fb
FG
6149 for (auto &r : in->get_replicas()) {
6150 auto it = acks.find(r.first);
31f18b77
FG
6151 if (it == acks.end())
6152 continue;
6153 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6154 bufferlist bl;
181888fb
FG
6155 in->_encode_locks_state_for_rejoin(bl, r.first);
6156 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6157 }
6158
6159 // subdirs in this subtree?
9f95a23c
TL
6160 {
6161 auto&& dirs = in->get_nested_dirfrags();
6162 for (const auto& dir : dirs) {
6163 dq.push(dir);
6164 }
6165 }
7c673cae
FG
6166 }
6167 }
6168 }
6169
6170 // base inodes too
6171 if (root && root->is_auth())
181888fb
FG
6172 for (auto &r : root->get_replicas()) {
6173 auto it = acks.find(r.first);
31f18b77
FG
6174 if (it == acks.end())
6175 continue;
6176 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6177 bufferlist bl;
181888fb
FG
6178 root->_encode_locks_state_for_rejoin(bl, r.first);
6179 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6180 }
6181 if (myin)
181888fb
FG
6182 for (auto &r : myin->get_replicas()) {
6183 auto it = acks.find(r.first);
31f18b77
FG
6184 if (it == acks.end())
6185 continue;
6186 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6187 bufferlist bl;
181888fb
FG
6188 myin->_encode_locks_state_for_rejoin(bl, r.first);
6189 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6190 }
6191
6192 // include inode base for any inodes whose scatterlocks may have updated
6193 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6194 p != rejoin_potential_updated_scatterlocks.end();
6195 ++p) {
6196 CInode *in = *p;
181888fb
FG
6197 for (const auto &r : in->get_replicas()) {
6198 auto it = acks.find(r.first);
31f18b77
FG
6199 if (it == acks.end())
6200 continue;
6201 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6202 }
7c673cae
FG
6203 }
6204
6205 // send acks
31f18b77 6206 for (auto p = acks.begin(); p != acks.end(); ++p) {
11fdf7f2 6207 encode(rejoin_imported_caps[p->first], p->second->imported_caps);
7c673cae
FG
6208 mds->send_message_mds(p->second, p->first);
6209 }
6210
6211 rejoin_imported_caps.clear();
6212}
6213
c07f9fc5
FG
6214class C_MDC_ReIssueCaps : public MDCacheContext {
6215 CInode *in;
6216public:
6217 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6218 MDCacheContext(mdc), in(i)
6219 {
6220 in->get(CInode::PIN_PTRWAITER);
6221 }
6222 void finish(int r) override {
6223 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6224 mdcache->mds->locker->issue_caps(in);
6225 in->put(CInode::PIN_PTRWAITER);
6226 }
6227};
7c673cae
FG
6228
6229void MDCache::reissue_all_caps()
6230{
6231 dout(10) << "reissue_all_caps" << dendl;
6232
81eedcae 6233 int count = 0;
94b18763 6234 for (auto &p : inode_map) {
81eedcae 6235 int n = 1;
b32b8144 6236 CInode *in = p.second;
7c673cae 6237 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6238 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6239 if (in->is_frozen_inode()) {
6240 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6241 continue;
6242 }
7c673cae 6243 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
81eedcae 6244 n += mds->locker->issue_caps(in);
7c673cae 6245 }
81eedcae 6246
33c7a0ef 6247 if ((count % mds->heartbeat_reset_grace()) + n >= mds->heartbeat_reset_grace())
81eedcae
TL
6248 mds->heartbeat_reset();
6249 count += n;
7c673cae
FG
6250 }
6251}
6252
6253
6254// ===============================================================================
6255
6256struct C_MDC_QueuedCow : public MDCacheContext {
6257 CInode *in;
6258 MutationRef mut;
6259 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6260 MDCacheContext(mdc), in(i), mut(m) {}
6261 void finish(int r) override {
6262 mdcache->_queued_file_recover_cow(in, mut);
6263 }
6264};
6265
6266
6267void MDCache::queue_file_recover(CInode *in)
6268{
6269 dout(10) << "queue_file_recover " << *in << dendl;
11fdf7f2 6270 ceph_assert(in->is_auth());
7c673cae
FG
6271
6272 // cow?
6273 /*
6274 SnapRealm *realm = in->find_snaprealm();
6275 set<snapid_t> s = realm->get_snaps();
6276 while (!s.empty() && *s.begin() < in->first)
6277 s.erase(s.begin());
6278 while (!s.empty() && *s.rbegin() > in->last)
6279 s.erase(*s.rbegin());
6280 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6281 if (s.size() > 1) {
f67539c2
TL
6282 auto pi = in->project_inode(mut);
6283 pi.inode.version = in->pre_dirty();
7c673cae
FG
6284
6285 auto mut(std::make_shared<MutationImpl>());
6286 mut->ls = mds->mdlog->get_current_segment();
6287 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6288 mds->mdlog->start_entry(le);
6289 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6290
6291 s.erase(*s.begin());
6292 while (!s.empty()) {
6293 snapid_t snapid = *s.begin();
6294 CInode *cow_inode = 0;
6295 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
11fdf7f2 6296 ceph_assert(cow_inode);
7c673cae
FG
6297 recovery_queue.enqueue(cow_inode);
6298 s.erase(*s.begin());
6299 }
6300
6301 in->parent->first = in->first;
6302 le->metablob.add_primary_dentry(in->parent, in, true);
6303 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6304 mds->mdlog->flush();
6305 }
6306 */
6307
6308 recovery_queue.enqueue(in);
6309}
6310
6311void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6312{
7c673cae
FG
6313 mut->apply();
6314 mds->locker->drop_locks(mut.get());
6315 mut->cleanup();
6316}
6317
6318
6319/*
6320 * called after recovery to recover file sizes for previously opened (for write)
6321 * files. that is, those where max_size > size.
6322 */
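// Inodes that have a writable client_range but no matching cap get their filelock
// moved to LOCK_PRE_SCAN and are queued on rejoin_recover_q; all other candidates
// go to rejoin_check_q for a later max_size check (see start_files_to_recover()).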
6323void MDCache::identify_files_to_recover()
6324{
6325 dout(10) << "identify_files_to_recover" << dendl;
81eedcae 6326 int count = 0;
94b18763 6327 for (auto &p : inode_map) {
b32b8144 6328 CInode *in = p.second;
7c673cae
FG
6329 if (!in->is_auth())
6330 continue;
6331
6332 if (in->last != CEPH_NOSNAP)
6333 continue;
6334
6335 // Only normal files need file size recovery
6336 if (!in->is_file()) {
6337 continue;
6338 }
6339
6340 bool recover = false;
f91f0fd5
TL
6341 const auto& client_ranges = in->get_projected_inode()->client_ranges;
6342 if (!client_ranges.empty()) {
6343 in->mark_clientwriteable();
6344 for (auto& p : client_ranges) {
6345 Capability *cap = in->get_client_cap(p.first);
6346 if (cap) {
6347 cap->mark_clientwriteable();
6348 } else {
6349 dout(10) << " client." << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
6350 recover = true;
6351 break;
6352 }
7c673cae
FG
6353 }
6354 }
6355
6356 if (recover) {
6357 if (in->filelock.is_stable()) {
6358 in->auth_pin(&in->filelock);
6359 } else {
11fdf7f2 6360 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
7c673cae
FG
6361 }
6362 in->filelock.set_state(LOCK_PRE_SCAN);
6363 rejoin_recover_q.push_back(in);
6364 } else {
6365 rejoin_check_q.push_back(in);
6366 }
81eedcae 6367
33c7a0ef 6368 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae 6369 mds->heartbeat_reset();
7c673cae
FG
6370 }
6371}
6372
6373void MDCache::start_files_to_recover()
6374{
b3b6e05e 6375 int count = 0;
7c673cae
FG
6376 for (CInode *in : rejoin_check_q) {
6377 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6378 mds->locker->issue_caps(in);
6379 mds->locker->check_inode_max_size(in);
33c7a0ef 6380 if (!(++count % mds->heartbeat_reset_grace()))
b3b6e05e 6381 mds->heartbeat_reset();
7c673cae
FG
6382 }
6383 rejoin_check_q.clear();
6384 for (CInode *in : rejoin_recover_q) {
6385 mds->locker->file_recover(&in->filelock);
33c7a0ef 6386 if (!(++count % mds->heartbeat_reset_grace()))
b3b6e05e 6387 mds->heartbeat_reset();
7c673cae
FG
6388 }
6389 if (!rejoin_recover_q.empty()) {
6390 rejoin_recover_q.clear();
6391 do_file_recover();
6392 }
6393}
6394
6395void MDCache::do_file_recover()
6396{
6397 recovery_queue.advance();
6398}
6399
6400// ===============================================================================
6401
6402
6403// ----------------------------
6404// truncate
6405
6406class C_MDC_RetryTruncate : public MDCacheContext {
6407 CInode *in;
6408 LogSegment *ls;
6409public:
6410 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6411 MDCacheContext(c), in(i), ls(l) {}
6412 void finish(int r) override {
6413 mdcache->_truncate_inode(in, ls);
6414 }
6415};
6416
6417void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6418{
f67539c2 6419 const auto& pi = in->get_projected_inode();
7c673cae
FG
6420 dout(10) << "truncate_inode "
6421 << pi->truncate_from << " -> " << pi->truncate_size
6422 << " on " << *in
6423 << dendl;
6424
6425 ls->truncating_inodes.insert(in);
6426 in->get(CInode::PIN_TRUNCATING);
6427 in->auth_pin(this);
6428
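  // If clients still need to flush snapped dirty data, defer the actual truncation
  // until the snap flush completes; C_MDC_RetryTruncate re-enters _truncate_inode().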
6429 if (!in->client_need_snapflush.empty() &&
6430 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6431 ceph_assert(in->filelock.is_xlocked());
7c673cae
FG
6432 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6433 mds->locker->issue_caps(in);
6434 return;
6435 }
6436
6437 _truncate_inode(in, ls);
6438}
6439
6440struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6441 CInode *in;
6442 LogSegment *ls;
6443 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
91327a77
AA
6444 MDCacheIOContext(c, false), in(i), ls(l) {
6445 }
7c673cae 6446 void finish(int r) override {
f67539c2 6447 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
7c673cae
FG
6448 mdcache->truncate_inode_finish(in, ls);
6449 }
91327a77
AA
6450 void print(ostream& out) const override {
6451 out << "file_truncate(" << in->ino() << ")";
6452 }
7c673cae
FG
6453};
6454
6455void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6456{
f67539c2 6457 const auto& pi = in->get_inode();
7c673cae
FG
6458 dout(10) << "_truncate_inode "
6459 << pi->truncate_from << " -> " << pi->truncate_size
6460 << " on " << *in << dendl;
6461
11fdf7f2
TL
6462 ceph_assert(pi->is_truncating());
6463 ceph_assert(pi->truncate_size < (1ULL << 63));
6464 ceph_assert(pi->truncate_from < (1ULL << 63));
6465 ceph_assert(pi->truncate_size < pi->truncate_from);
7c673cae
FG
6466
6467
6468 SnapRealm *realm = in->find_snaprealm();
6469 SnapContext nullsnap;
6470 const SnapContext *snapc;
6471 if (realm) {
6472 dout(10) << " realm " << *realm << dendl;
6473 snapc = &realm->get_snap_context();
6474 } else {
6475 dout(10) << " NO realm, using null context" << dendl;
6476 snapc = &nullsnap;
11fdf7f2 6477 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae
FG
6478 }
6479 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
f67539c2
TL
6480 auto layout = pi->layout;
6481 filer.truncate(in->ino(), &layout, *snapc,
7c673cae
FG
6482 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6483 pi->truncate_seq, ceph::real_time::min(), 0,
6484 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6485 mds->finisher));
6486}
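// Once the OSD-side truncation above completes, C_IO_MDC_TruncateFinish calls
// truncate_inode_finish(), which journals a "truncate finish" EUpdate; when that
// entry is logged, truncate_inode_logged() drops the pins and wakes WAIT_TRUNC waiters.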
6487
6488struct C_MDC_TruncateLogged : public MDCacheLogContext {
6489 CInode *in;
6490 MutationRef mut;
6491 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6492 MDCacheLogContext(m), in(i), mut(mu) {}
6493 void finish(int r) override {
6494 mdcache->truncate_inode_logged(in, mut);
6495 }
6496};
6497
6498void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6499{
6500 dout(10) << "truncate_inode_finish " << *in << dendl;
6501
6502 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6503 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6504 ls->truncating_inodes.erase(p);
6505
7c673cae
FG
6506 MutationRef mut(new MutationImpl());
6507 mut->ls = mds->mdlog->get_current_segment();
f67539c2
TL
6508
6509 // update
6510 auto pi = in->project_inode(mut);
6511 pi.inode->version = in->pre_dirty();
6512 pi.inode->truncate_from = 0;
6513 pi.inode->truncate_pending--;
7c673cae
FG
6514
6515 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6516 mds->mdlog->start_entry(le);
7c673cae 6517
f67539c2 6518 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7c673cae 6519 journal_dirty_inode(mut.get(), &le->metablob, in);
f67539c2 6520 le->metablob.add_truncate_finish(in->ino(), ls->seq);
7c673cae
FG
6521 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6522
6523 // flush immediately if there are readers/writers waiting
6524 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6525 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6526 mds->mdlog->flush();
6527}
6528
6529void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6530{
6531 dout(10) << "truncate_inode_logged " << *in << dendl;
6532 mut->apply();
6533 mds->locker->drop_locks(mut.get());
6534 mut->cleanup();
6535
6536 in->put(CInode::PIN_TRUNCATING);
6537 in->auth_unpin(this);
6538
11fdf7f2 6539 MDSContext::vec waiters;
7c673cae
FG
6540 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6541 mds->queue_waiters(waiters);
6542}
6543
6544
6545void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6546{
6547 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6548 << ls->seq << "/" << ls->offset << dendl;
6549 ls->truncating_inodes.insert(in);
6550 in->get(CInode::PIN_TRUNCATING);
6551}
6552
6553void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6554{
6555 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6556 << ls->seq << "/" << ls->offset << dendl;
6557 // if we have the logseg the truncate started in, it must be in our list.
6558 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6559 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6560 ls->truncating_inodes.erase(p);
6561 in->put(CInode::PIN_TRUNCATING);
6562}
6563
6564void MDCache::start_recovered_truncates()
6565{
6566 dout(10) << "start_recovered_truncates" << dendl;
6567 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6568 p != mds->mdlog->segments.end();
6569 ++p) {
6570 LogSegment *ls = p->second;
6571 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6572 q != ls->truncating_inodes.end();
6573 ++q) {
6574 CInode *in = *q;
6575 in->auth_pin(this);
6576
6577 if (!in->client_need_snapflush.empty() &&
6578 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6579 ceph_assert(in->filelock.is_stable());
7c673cae
FG
6580 in->filelock.set_state(LOCK_XLOCKDONE);
6581 in->auth_pin(&in->filelock);
6582 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6583 // start_files_to_recover will revoke caps
6584 continue;
6585 }
6586 _truncate_inode(in, ls);
6587 }
6588 }
6589}
6590
6591
9f95a23c
TL
6592class C_MDS_purge_completed_finish : public MDCacheLogContext {
6593 interval_set<inodeno_t> inos;
9f95a23c 6594 LogSegment *ls;
f67539c2 6595 version_t inotablev;
9f95a23c 6596public:
f67539c2
TL
6597 C_MDS_purge_completed_finish(MDCache *m, const interval_set<inodeno_t>& _inos,
6598 LogSegment *_ls, version_t iv)
6599 : MDCacheLogContext(m), inos(_inos), ls(_ls), inotablev(iv) {}
9f95a23c 6600 void finish(int r) override {
20effc67 6601 ceph_assert(r == 0);
9f95a23c 6602 if (inotablev) {
f67539c2 6603 get_mds()->inotable->apply_release_ids(inos);
20effc67 6604 ceph_assert(get_mds()->inotable->get_version() == inotablev);
9f95a23c 6605 }
f67539c2 6606 ls->purge_inodes_finish(inos);
9f95a23c
TL
6607 }
6608};
7c673cae 6609
9f95a23c
TL
6610void MDCache::start_purge_inodes(){
6611 dout(10) << "start_purge_inodes" << dendl;
6612 for (auto& p : mds->mdlog->segments){
6613 LogSegment *ls = p.second;
f67539c2
TL
6614 if (ls->purging_inodes.size()){
6615 purge_inodes(ls->purging_inodes, ls);
9f95a23c
TL
6616 }
6617 }
6618}
7c673cae 6619
9f95a23c
TL
6620void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6621{
f67539c2
TL
6622 dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl;
6623 // FIXME: handle non-default data pool and namespace
6624
9f95a23c 6625 auto cb = new LambdaContext([this, inos, ls](int r){
20effc67 6626 ceph_assert(r == 0 || r == -2);
9f95a23c
TL
6627 mds->inotable->project_release_ids(inos);
6628 version_t piv = mds->inotable->get_projected_version();
20effc67 6629 ceph_assert(piv != 0);
f67539c2
TL
6630 mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv),
6631 new C_MDS_purge_completed_finish(this, inos, ls, piv));
9f95a23c
TL
6632 mds->mdlog->flush();
6633 });
6634
9f95a23c 6635 C_GatherBuilder gather(g_ceph_context,
f67539c2 6636 new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher));
9f95a23c 6637 SnapContext nullsnapc;
f67539c2
TL
6638 for (const auto& [start, len] : inos) {
6639 for (auto i = start; i < start + len ; i += 1) {
6640 filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1,
6641 ceph::real_clock::now(), 0, gather.new_sub());
9f95a23c
TL
6642 }
6643 }
6644 gather.activate();
6645}
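// The gather above purges each inode's objects via filer.purge_range(); once all
// purges complete, an EPurged event is journaled and its completion releases the
// ino ranges from the InoTable and marks them finished in the log segment.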
7c673cae
FG
6646
6647// ================================================================================
6648// cache trimming
6649
11fdf7f2 6650std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
181888fb 6651{
7c673cae 6652 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6653 std::vector<CDentry *> unexpirables;
6654 uint64_t trimmed = 0;
6655
11fdf7f2 6656 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
a8e16298 6657
181888fb
FG
6658 dout(7) << "trim_lru trimming " << count
6659 << " items from LRU"
6660 << " size=" << lru.lru_get_size()
6661 << " mid=" << lru.lru_get_top()
6662 << " pintail=" << lru.lru_get_pintail()
6663 << " pinned=" << lru.lru_get_num_pinned()
6664 << dendl;
7c673cae 6665
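  // trim_counter tracks recently trimmed dentries; once trim_counter_start plus
  // what this pass trims reaches mds_cache_trim_threshold, the pass stops early
  // and reports itself as throttled.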
11fdf7f2 6666 const uint64_t trim_counter_start = trim_counter.get();
a8e16298
TL
6667 bool throttled = false;
6668 while (1) {
6669 throttled |= trim_counter_start+trimmed >= trim_threshold;
6670 if (throttled) break;
31f18b77
FG
6671 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6672 if (!dn)
6673 break;
6674 if (trim_dentry(dn, expiremap)) {
6675 unexpirables.push_back(dn);
181888fb
FG
6676 } else {
6677 trimmed++;
31f18b77
FG
6678 }
6679 }
6680
181888fb 6681 for (auto &dn : unexpirables) {
31f18b77 6682 bottom_lru.lru_insert_mid(dn);
181888fb 6683 }
31f18b77
FG
6684 unexpirables.clear();
6685
181888fb 6686 // trim dentries from the LRU until count is reached
b3b6e05e 6687 // if mds is in standby_replay, skip trimming dentries that still link to an inode (they are moved to the LRU bottom instead)
494da23a 6688 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
a8e16298
TL
6689 throttled |= trim_counter_start+trimmed >= trim_threshold;
6690 if (throttled) break;
7c673cae
FG
6691 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6692 if (!dn) {
6693 break;
6694 }
b3b6e05e 6695 if (is_standby_replay && dn->get_linkage()->inode) {
494da23a
TL
6696 // we move the inodes that need to be trimmed to the end of the lru queue.
6697 // refer to MDCache::standby_trim_segment
6698 lru.lru_insert_bot(dn);
6699 break;
181888fb
FG
6700 } else if (trim_dentry(dn, expiremap)) {
6701 unexpirables.push_back(dn);
6702 } else {
6703 trimmed++;
3efd9988 6704 if (count > 0) count--;
7c673cae
FG
6705 }
6706 }
11fdf7f2 6707 trim_counter.hit(trimmed);
181888fb
FG
6708
6709 for (auto &dn : unexpirables) {
31f18b77 6710 lru.lru_insert_mid(dn);
181888fb 6711 }
31f18b77 6712 unexpirables.clear();
7c673cae 6713
181888fb 6714 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
a8e16298 6715 return std::pair<bool, uint64_t>(throttled, trimmed);
181888fb
FG
6716}
6717
6718/*
6719 * note: only called while MDS is active or stopping... NOT during recovery.
6720 * however, we may expire a replica whose authority is recovering.
6721 *
6722 * @param count is number of dentries to try to expire
6723 */
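// Returns {throttled, trimmed}: whether the mds_cache_trim_threshold throttle cut
// this pass short, and how many items were actually expired. shutdown_pass()
// below calls trim(UINT64_MAX) to expire as much as the throttle allows.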
a8e16298 6724std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
181888fb
FG
6725{
6726 uint64_t used = cache_size();
91327a77 6727 uint64_t limit = cache_memory_limit;
11fdf7f2 6728 expiremap expiremap;
181888fb
FG
6729
6730 dout(7) << "trim bytes_used=" << bytes2str(used)
6731 << " limit=" << bytes2str(limit)
91327a77 6732 << " reservation=" << cache_reservation
181888fb
FG
6733 << "% count=" << count << dendl;
6734
6735 // process delayed eval_stray()
6736 stray_manager.advance_delayed();
6737
a8e16298
TL
6738 auto result = trim_lru(count, expiremap);
6739 auto& trimmed = result.second;
181888fb 6740
7c673cae 6741 // trim non-auth, non-bound subtrees
181888fb 6742 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6743 CDir *dir = p->first;
6744 ++p;
31f18b77
FG
6745 CInode *diri = dir->get_inode();
6746 if (dir->is_auth()) {
f6b5b4d7
TL
6747 if (diri->is_auth() && !diri->is_base()) {
6748 /* this situation should correspond to an export pin */
6749 if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
6750 /* pinned empty subtree, try to drop */
6751 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
6752 dout(20) << "trimming empty pinned subtree " << *dir << dendl;
6753 dir->state_clear(CDir::STATE_AUXSUBTREE);
6754 remove_subtree(dir);
6755 diri->close_dirfrag(dir->dirfrag().frag);
6756 }
6757 }
6758 } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
6759 if (dir->state_test(CDir::STATE_EXPORTING) ||
6760 !(mds->is_active() || mds->is_stopping()) ||
6761 dir->is_freezing() || dir->is_frozen())
6762 continue;
31f18b77 6763
f6b5b4d7 6764 migrator->export_empty_import(dir);
a8e16298 6765 ++trimmed;
31f18b77 6766 }
f6b5b4d7
TL
6767 } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
6768 // only subtree pin
f91f0fd5 6769 if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
f6b5b4d7 6770 continue;
f91f0fd5 6771 }
31f18b77 6772
f6b5b4d7
TL
6773 // don't trim subtree root if its auth MDS is recovering.
 6774 // This simplifies the cache rejoin code.
6775 if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
6776 continue;
6777 trim_dirfrag(dir, 0, expiremap);
6778 ++trimmed;
7c673cae
FG
6779 }
6780 }
6781
6782 // trim root?
181888fb 6783 if (mds->is_stopping() && root) {
9f95a23c
TL
6784 auto&& ls = root->get_dirfrags();
6785 for (const auto& dir : ls) {
a8e16298 6786 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6787 trim_dirfrag(dir, 0, expiremap);
a8e16298
TL
6788 ++trimmed;
6789 }
7c673cae 6790 }
a8e16298 6791 if (root->get_num_ref() == 0) {
7c673cae 6792 trim_inode(0, root, 0, expiremap);
a8e16298
TL
6793 ++trimmed;
6794 }
7c673cae
FG
6795 }
6796
6797 std::set<mds_rank_t> stopping;
6798 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6799 stopping.erase(mds->get_nodeid());
6800 for (auto rank : stopping) {
6801 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6802 if (!mdsdir_in)
6803 continue;
6804
11fdf7f2
TL
6805 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6806 if (em.second) {
9f95a23c 6807 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
7c673cae
FG
6808 }
6809
20effc67 6810 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds->get_nodeid() << dendl;
7c673cae
FG
6811
6812 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6813 if (!aborted) {
6814 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
9f95a23c 6815 auto&& ls = mdsdir_in->get_dirfrags();
7c673cae 6816 for (auto dir : ls) {
a8e16298 6817 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6818 trim_dirfrag(dir, dir, expiremap);
a8e16298
TL
6819 ++trimmed;
6820 }
7c673cae 6821 }
a8e16298 6822 if (mdsdir_in->get_num_ref() == 0) {
7c673cae 6823 trim_inode(NULL, mdsdir_in, NULL, expiremap);
a8e16298
TL
6824 ++trimmed;
6825 }
7c673cae
FG
6826 } else {
6827 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6828 }
6829 }
6830
6831 // Other rank's base inodes (when I'm stopping)
181888fb 6832 if (mds->is_stopping()) {
7c673cae 6833 for (set<CInode*>::iterator p = base_inodes.begin();
11fdf7f2
TL
6834 p != base_inodes.end();) {
6835 CInode *base_in = *p;
6836 ++p;
6837 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6838 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6839 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6840 if (base_in->get_num_ref() == 0) {
6841 trim_inode(NULL, base_in, NULL, expiremap);
a8e16298 6842 ++trimmed;
7c673cae
FG
6843 }
6844 }
6845 }
6846 }
6847
6848 // send any expire messages
6849 send_expire_messages(expiremap);
6850
a8e16298 6851 return result;
7c673cae
FG
6852}
6853
11fdf7f2 6854void MDCache::send_expire_messages(expiremap& expiremap)
7c673cae
FG
6855{
6856 // send expires
11fdf7f2 6857 for (const auto &p : expiremap) {
7c673cae 6858 if (mds->is_cluster_degraded() &&
11fdf7f2
TL
6859 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6860 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6861 rejoin_sent.count(p.first) == 0))) {
7c673cae
FG
6862 continue;
6863 }
11fdf7f2
TL
6864 dout(7) << "sending cache_expire to " << p.first << dendl;
6865 mds->send_message_mds(p.second, p.first);
7c673cae 6866 }
11fdf7f2 6867 expiremap.clear();
7c673cae
FG
6868}
6869
6870
11fdf7f2 6871bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
7c673cae
FG
6872{
6873 dout(12) << "trim_dentry " << *dn << dendl;
6874
6875 CDentry::linkage_t *dnl = dn->get_linkage();
6876
6877 CDir *dir = dn->get_dir();
11fdf7f2 6878 ceph_assert(dir);
7c673cae
FG
6879
6880 CDir *con = get_subtree_root(dir);
6881 if (con)
6882 dout(12) << " in container " << *con << dendl;
6883 else {
6884 dout(12) << " no container; under a not-yet-linked dir" << dendl;
11fdf7f2 6885 ceph_assert(dn->is_auth());
7c673cae
FG
6886 }
6887
6888 // If replica dentry is not readable, it's likely we will receive
6889 // MDentryLink/MDentryUnlink message soon (It's possible we first
6890 // receive a MDentryUnlink message, then MDentryLink message)
6891 // MDentryLink message only replicates an inode, so we should
 6892 // avoid trimming the inode's parent dentry. This is because
6893 // unconnected replicas are problematic for subtree migration.
6894 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6895 !dn->get_dir()->get_inode()->is_stray())
6896 return true;
6897
6898 // adjust the dir state
 6899 // NOTE: we can safely remove a clean, null dentry without affecting
6900 // directory completeness.
6901 // (check this _before_ we unlink the inode, below!)
6902 bool clear_complete = false;
6903 if (!(dnl->is_null() && dn->is_clean()))
6904 clear_complete = true;
6905
6906 // unlink the dentry
6907 if (dnl->is_remote()) {
6908 // just unlink.
31f18b77 6909 dir->unlink_inode(dn, false);
7c673cae
FG
6910 } else if (dnl->is_primary()) {
6911 // expire the inode, too.
6912 CInode *in = dnl->get_inode();
11fdf7f2 6913 ceph_assert(in);
7c673cae
FG
6914 if (trim_inode(dn, in, con, expiremap))
6915 return true; // purging stray instead of trimming
6916 } else {
11fdf7f2 6917 ceph_assert(dnl->is_null());
7c673cae
FG
6918 }
6919
6920 if (!dn->is_auth()) {
6921 // notify dentry authority.
6922 mds_authority_t auth = dn->authority();
6923
6924 for (int p=0; p<2; p++) {
6925 mds_rank_t a = auth.first;
6926 if (p) a = auth.second;
6927 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6928 if (mds->get_nodeid() == auth.second &&
6929 con->is_importing()) break; // don't send any expire while importing.
6930 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6931
6932 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
11fdf7f2
TL
6933 ceph_assert(a != mds->get_nodeid());
6934 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6935 if (em.second)
9f95a23c 6936 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
11fdf7f2 6937 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6938 }
6939 }
6940
6941 // remove dentry
6942 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6943 dir->add_to_bloom(dn);
6944 dir->remove_dentry(dn);
6945
6946 if (clear_complete)
6947 dir->state_clear(CDir::STATE_COMPLETE);
6948
7c673cae
FG
6949 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6950 return false;
6951}
6952
6953
11fdf7f2 6954void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
7c673cae
FG
6955{
6956 dout(15) << "trim_dirfrag " << *dir << dendl;
6957
6958 if (dir->is_subtree_root()) {
11fdf7f2 6959 ceph_assert(!dir->is_auth() ||
7c673cae
FG
6960 (!dir->is_replicated() && dir->inode->is_base()));
6961 remove_subtree(dir); // remove from subtree map
6962 }
11fdf7f2 6963 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
6964
6965 CInode *in = dir->get_inode();
6966
6967 if (!dir->is_auth()) {
6968 mds_authority_t auth = dir->authority();
6969
6970 // was this an auth delegation? (if so, slightly modified container)
6971 dirfrag_t condf;
6972 if (dir->is_subtree_root()) {
6973 dout(12) << " subtree root, container is " << *dir << dendl;
6974 con = dir;
6975 condf = dir->dirfrag();
6976 } else {
6977 condf = con->dirfrag();
6978 }
6979
6980 for (int p=0; p<2; p++) {
6981 mds_rank_t a = auth.first;
6982 if (p) a = auth.second;
6983 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6984 if (mds->get_nodeid() == auth.second &&
6985 con->is_importing()) break; // don't send any expire while importing.
6986 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6987
6988 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
11fdf7f2
TL
6989 ceph_assert(a != mds->get_nodeid());
6990 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6991 if (em.second)
9f95a23c 6992 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
11fdf7f2 6993 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
7c673cae
FG
6994 }
6995 }
6996
6997 in->close_dirfrag(dir->dirfrag().frag);
6998}
6999
7000/**
7001 * Try trimming an inode from the cache
7002 *
7003 * @return true if the inode is still in cache, else false if it was trimmed
7004 */
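// For replica (non-auth) inodes this also queues an MCacheExpire entry for the
// authority, mirroring trim_dentry() and trim_dirfrag() above.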
11fdf7f2 7005bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
7c673cae
FG
7006{
7007 dout(15) << "trim_inode " << *in << dendl;
11fdf7f2 7008 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7009
7010 if (in->is_dir()) {
7011 // If replica inode's dirfragtreelock is not readable, it's likely
7012 // some dirfrags of the inode are being fragmented and we will receive
7013 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7014 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
 7015 // This is because unconnected replicas are problematic for
7016 // subtree migration.
7017 //
9f95a23c 7018 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
7c673cae 7019 return true;
28e407b8 7020 }
7c673cae
FG
7021
7022 // DIR
9f95a23c
TL
7023 auto&& dfls = in->get_dirfrags();
7024 for (const auto& dir : dfls) {
11fdf7f2 7025 ceph_assert(!dir->is_subtree_root());
7c673cae
FG
7026 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
7027 }
7028 }
7029
7030 // INODE
7031 if (in->is_auth()) {
7032 // eval stray after closing dirfrags
7033 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
7034 maybe_eval_stray(in);
7035 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
7036 return true;
7037 }
7038 } else {
7039 mds_authority_t auth = in->authority();
7040
7041 dirfrag_t df;
7042 if (con)
7043 df = con->dirfrag();
7044 else
7045 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
7046
7047 for (int p=0; p<2; p++) {
7048 mds_rank_t a = auth.first;
7049 if (p) a = auth.second;
7050 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7051 if (con && mds->get_nodeid() == auth.second &&
7052 con->is_importing()) break; // don't send any expire while importing.
7053 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7054
7055 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
11fdf7f2
TL
7056 ceph_assert(a != mds->get_nodeid());
7057 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7058 if (em.second)
9f95a23c 7059 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
11fdf7f2 7060 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7c673cae
FG
7061 }
7062 }
7063
7064 /*
7065 if (in->is_auth()) {
7066 if (in->hack_accessed)
7067 mds->logger->inc("outt");
7068 else {
7069 mds->logger->inc("outut");
7070 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7071 }
7072 }
7073 */
7074
7075 // unlink
7076 if (dn)
31f18b77 7077 dn->get_dir()->unlink_inode(dn, false);
7c673cae
FG
7078 remove_inode(in);
7079 return false;
7080}
7081
7082
7083/**
7084 * trim_non_auth - remove any non-auth items from our cache
7085 *
7086 * this reduces the amount of non-auth metadata in our cache, reducing the
7087 * load incurred by the rejoin phase.
7088 *
7089 * the only non-auth items that remain are those that are needed to
7090 * attach our own subtrees to the root.
7091 *
7092 * when we are done, all dentries will be in the top bit of the lru.
7093 *
7094 * why we have to do this:
 7095 * we may not have accurate linkage for non-auth items, which means we may
 7096 * not know which subtree an item falls into, and cannot be sure to declare
 7097 * it to the correct authority.
7098 */
7099void MDCache::trim_non_auth()
7100{
7101 dout(7) << "trim_non_auth" << dendl;
7102
7103 // temporarily pin all subtree roots
7104 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7105 p != subtrees.end();
7106 ++p)
7107 p->first->get(CDir::PIN_SUBTREETEMP);
7108
31f18b77 7109 list<CDentry*> auth_list;
7c673cae
FG
7110
7111 // trim non-auth items from the lru
31f18b77
FG
7112 for (;;) {
7113 CDentry *dn = NULL;
7114 if (bottom_lru.lru_get_size() > 0)
7115 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7116 if (!dn && lru.lru_get_size() > 0)
7117 dn = static_cast<CDentry*>(lru.lru_expire());
7118 if (!dn)
7119 break;
7120
7c673cae
FG
7121 CDentry::linkage_t *dnl = dn->get_linkage();
7122
7123 if (dn->is_auth()) {
7124 // add back into lru (at the top)
31f18b77 7125 auth_list.push_back(dn);
7c673cae
FG
7126
7127 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7128 dn->unlink_remote(dnl);
7c673cae
FG
7129 } else {
7130 // non-auth. expire.
7131 CDir *dir = dn->get_dir();
11fdf7f2 7132 ceph_assert(dir);
7c673cae
FG
7133
7134 // unlink the dentry
7135 dout(10) << " removing " << *dn << dendl;
7136 if (dnl->is_remote()) {
31f18b77 7137 dir->unlink_inode(dn, false);
7c673cae
FG
7138 }
7139 else if (dnl->is_primary()) {
7140 CInode *in = dnl->get_inode();
7141 dout(10) << " removing " << *in << dendl;
9f95a23c
TL
7142 auto&& ls = in->get_dirfrags();
7143 for (const auto& subdir : ls) {
11fdf7f2 7144 ceph_assert(!subdir->is_subtree_root());
7c673cae
FG
7145 in->close_dirfrag(subdir->dirfrag().frag);
7146 }
31f18b77 7147 dir->unlink_inode(dn, false);
7c673cae
FG
7148 remove_inode(in);
7149 }
7150 else {
11fdf7f2 7151 ceph_assert(dnl->is_null());
7c673cae
FG
7152 }
7153
11fdf7f2 7154 ceph_assert(!dir->has_bloom());
7c673cae
FG
7155 dir->remove_dentry(dn);
7156 // adjust the dir state
7157 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7158 // close empty non-auth dirfrag
7159 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7160 dir->inode->close_dirfrag(dir->get_frag());
7161 }
7162 }
7163
9f95a23c 7164 for (const auto& dn : auth_list) {
31f18b77
FG
7165 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7166 bottom_lru.lru_insert_mid(dn);
7167 else
7168 lru.lru_insert_top(dn);
7169 }
7170
7c673cae
FG
7171 // move everything in the pintail to the top bit of the lru.
7172 lru.lru_touch_entire_pintail();
7173
7174 // unpin all subtrees
7175 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7176 p != subtrees.end();
7177 ++p)
7178 p->first->put(CDir::PIN_SUBTREETEMP);
7179
31f18b77
FG
7180 if (lru.lru_get_size() == 0 &&
7181 bottom_lru.lru_get_size() == 0) {
7c673cae 7182 // root, stray, etc.?
b32b8144 7183 auto p = inode_map.begin();
7c673cae 7184 while (p != inode_map.end()) {
7c673cae 7185 CInode *in = p->second;
b32b8144 7186 ++p;
7c673cae 7187 if (!in->is_auth()) {
9f95a23c
TL
7188 auto&& ls = in->get_dirfrags();
7189 for (const auto& dir : ls) {
7190 dout(10) << " removing " << *dir << dendl;
7191 ceph_assert(dir->get_num_ref() == 1); // SUBTREE
7192 remove_subtree(dir);
7193 in->close_dirfrag(dir->dirfrag().frag);
7c673cae
FG
7194 }
7195 dout(10) << " removing " << *in << dendl;
11fdf7f2
TL
7196 ceph_assert(!in->get_parent_dn());
7197 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7198 remove_inode(in);
7199 }
7c673cae
FG
7200 }
7201 }
7202
7203 show_subtrees();
7204}
7205
7206/**
7207 * Recursively trim the subtree rooted at directory to remove all
7208 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7209 * of those links. This is used to clear invalid data out of the cache.
7210 * Note that it doesn't clear the passed-in directory, since that's not
7211 * always safe.
7212 */
7213bool MDCache::trim_non_auth_subtree(CDir *dir)
7214{
7215 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7216
7217 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7218
94b18763
FG
7219 auto j = dir->begin();
7220 auto i = j;
7c673cae
FG
7221 while (j != dir->end()) {
7222 i = j++;
7223 CDentry *dn = i->second;
7224 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7225 CDentry::linkage_t *dnl = dn->get_linkage();
7226 if (dnl->is_primary()) { // check for subdirectories, etc
7227 CInode *in = dnl->get_inode();
7228 bool keep_inode = false;
7229 if (in->is_dir()) {
9f95a23c
TL
7230 auto&& subdirs = in->get_dirfrags();
7231 for (const auto& subdir : subdirs) {
7232 if (subdir->is_subtree_root()) {
7c673cae 7233 keep_inode = true;
9f95a23c 7234 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7c673cae 7235 } else {
9f95a23c 7236 if (trim_non_auth_subtree(subdir))
7c673cae
FG
7237 keep_inode = true;
7238 else {
9f95a23c 7239 in->close_dirfrag(subdir->get_frag());
7c673cae
FG
7240 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7241 }
7242 }
7243 }
7244
7245 }
7246 if (!keep_inode) { // remove it!
7247 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
31f18b77 7248 dir->unlink_inode(dn, false);
7c673cae 7249 remove_inode(in);
11fdf7f2 7250 ceph_assert(!dir->has_bloom());
7c673cae
FG
7251 dir->remove_dentry(dn);
7252 } else {
7253 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7254 dn->state_clear(CDentry::STATE_AUTH);
7255 in->state_clear(CInode::STATE_AUTH);
7256 }
f67539c2 7257 } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
7c673cae
FG
7258 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7259 } else { // just remove it
7260 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7261 if (dnl->is_remote())
31f18b77 7262 dir->unlink_inode(dn, false);
7c673cae
FG
7263 dir->remove_dentry(dn);
7264 }
7265 }
7266 dir->state_clear(CDir::STATE_AUTH);
7267 /**
7268 * We've now checked all our children and deleted those that need it.
7269 * Now return to caller, and tell them if *we're* a keeper.
7270 */
7271 return keep_dir || dir->get_num_any();
7272}
7273
7274/*
7275 * during replay, when we determine a subtree is no longer ours, we
7276 * try to trim it from our cache. because subtrees must be connected
7277 * to the root, the fact that we can trim this tree may mean that our
7278 * children or parents can also be trimmed.
7279 */
7280void MDCache::try_trim_non_auth_subtree(CDir *dir)
7281{
7282 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7283
7284 // can we now trim child subtrees?
7285 set<CDir*> bounds;
7286 get_subtree_bounds(dir, bounds);
7287 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7288 CDir *bd = *p;
7289 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7290 bd->get_num_any() == 0 && // and empty
7291 can_trim_non_auth_dirfrag(bd)) {
7292 CInode *bi = bd->get_inode();
7293 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7294 remove_subtree(bd);
7295 bd->mark_clean();
7296 bi->close_dirfrag(bd->get_frag());
7297 }
7298 }
7299
7300 if (trim_non_auth_subtree(dir)) {
7301 // keep
7302 try_subtree_merge(dir);
7303 } else {
7304 // can we trim this subtree (and possibly our ancestors) too?
7305 while (true) {
7306 CInode *diri = dir->get_inode();
7307 if (diri->is_base()) {
7308 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7309 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7310 remove_subtree(dir);
7311 dir->mark_clean();
7312 diri->close_dirfrag(dir->get_frag());
7313
7314 dout(10) << " removing " << *diri << dendl;
11fdf7f2
TL
7315 ceph_assert(!diri->get_parent_dn());
7316 ceph_assert(diri->get_num_ref() == 0);
7c673cae
FG
7317 remove_inode(diri);
7318 }
7319 break;
7320 }
7321
7322 CDir *psub = get_subtree_root(diri->get_parent_dir());
7323 dout(10) << " parent subtree is " << *psub << dendl;
7324 if (psub->get_dir_auth().first == mds->get_nodeid())
7325 break; // we are auth, keep.
7326
7327 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7328 remove_subtree(dir);
7329 dir->mark_clean();
7330 diri->close_dirfrag(dir->get_frag());
7331
7332 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7333 if (trim_non_auth_subtree(psub))
7334 break;
7335 dir = psub;
7336 }
7337 }
7338
7339 show_subtrees();
7340}
7341
7342void MDCache::standby_trim_segment(LogSegment *ls)
7343{
494da23a
TL
7344 auto try_trim_inode = [this](CInode *in) {
7345 if (in->get_num_ref() == 0 &&
7346 !in->item_open_file.is_on_list() &&
7347 in->parent != NULL &&
7348 in->parent->get_num_ref() == 0){
7349 touch_dentry_bottom(in->parent);
7350 }
7351 };
7352
7353 auto try_trim_dentry = [this](CDentry *dn) {
7354 if (dn->get_num_ref() > 0)
7355 return;
7356 auto in = dn->get_linkage()->inode;
7357 if(in && in->item_open_file.is_on_list())
7358 return;
7359 touch_dentry_bottom(dn);
7360 };
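  // Both helpers only demote unreferenced items to the bottom of the LRU via
  // touch_dentry_bottom(); the actual eviction happens later in trim_lru(),
  // which special-cases standby_replay (see above).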
7361
7c673cae
FG
7362 ls->new_dirfrags.clear_list();
7363 ls->open_files.clear_list();
7364
7365 while (!ls->dirty_dirfrags.empty()) {
7366 CDir *dir = ls->dirty_dirfrags.front();
7367 dir->mark_clean();
494da23a
TL
7368 if (dir->inode)
7369 try_trim_inode(dir->inode);
7c673cae
FG
7370 }
7371 while (!ls->dirty_inodes.empty()) {
7372 CInode *in = ls->dirty_inodes.front();
7373 in->mark_clean();
494da23a 7374 try_trim_inode(in);
7c673cae
FG
7375 }
7376 while (!ls->dirty_dentries.empty()) {
7377 CDentry *dn = ls->dirty_dentries.front();
7378 dn->mark_clean();
494da23a 7379 try_trim_dentry(dn);
7c673cae
FG
7380 }
7381 while (!ls->dirty_parent_inodes.empty()) {
7382 CInode *in = ls->dirty_parent_inodes.front();
7383 in->clear_dirty_parent();
494da23a 7384 try_trim_inode(in);
7c673cae
FG
7385 }
7386 while (!ls->dirty_dirfrag_dir.empty()) {
7387 CInode *in = ls->dirty_dirfrag_dir.front();
7388 in->filelock.remove_dirty();
494da23a 7389 try_trim_inode(in);
7c673cae
FG
7390 }
7391 while (!ls->dirty_dirfrag_nest.empty()) {
7392 CInode *in = ls->dirty_dirfrag_nest.front();
7393 in->nestlock.remove_dirty();
494da23a 7394 try_trim_inode(in);
7c673cae
FG
7395 }
7396 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7397 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7398 in->dirfragtreelock.remove_dirty();
494da23a 7399 try_trim_inode(in);
7c673cae 7400 }
eafe8130
TL
7401 while (!ls->truncating_inodes.empty()) {
7402 auto it = ls->truncating_inodes.begin();
7403 CInode *in = *it;
7404 ls->truncating_inodes.erase(it);
7405 in->put(CInode::PIN_TRUNCATING);
7406 try_trim_inode(in);
7407 }
7c673cae
FG
7408}
7409
9f95a23c 7410void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
7c673cae
FG
7411{
7412 mds_rank_t from = mds_rank_t(m->get_from());
7413
7414 dout(7) << "cache_expire from mds." << from << dendl;
7415
7416 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
7417 return;
7418 }
7419
7420 set<SimpleLock *> gather_locks;
7421 // loop over realms
11fdf7f2 7422 for (const auto &p : m->realms) {
7c673cae 7423 // check container?
11fdf7f2
TL
7424 if (p.first.ino > 0) {
7425 CInode *expired_inode = get_inode(p.first.ino);
7426 ceph_assert(expired_inode); // we had better have this.
7427 CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
7428 ceph_assert(parent_dir);
7c673cae
FG
7429
7430 int export_state = -1;
7431 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7432 export_state = migrator->get_export_state(parent_dir);
11fdf7f2 7433 ceph_assert(export_state >= 0);
7c673cae
FG
7434 }
7435
7436 if (!parent_dir->is_auth() ||
7437 (export_state != -1 &&
7438 ((export_state == Migrator::EXPORT_WARNING &&
7439 migrator->export_has_warned(parent_dir,from)) ||
7440 export_state == Migrator::EXPORT_EXPORTING ||
7441 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7442 (export_state == Migrator::EXPORT_NOTIFYING &&
7443 !migrator->export_has_notified(parent_dir,from))))) {
7444
7445 // not auth.
7446 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
11fdf7f2 7447 ceph_assert(parent_dir->is_frozen_tree_root());
7c673cae
FG
7448
7449 // make a message container
11fdf7f2
TL
7450
7451 auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
7452 if (em.second)
9f95a23c 7453 em.first->second = make_message<MCacheExpire>(from); /* new */
11fdf7f2 7454
7c673cae 7455 // merge these expires into it
11fdf7f2 7456 em.first->second->add_realm(p.first, p.second);
7c673cae
FG
7457 continue;
7458 }
11fdf7f2 7459 ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
7c673cae
FG
7460 (export_state == Migrator::EXPORT_WARNING &&
7461 !migrator->export_has_warned(parent_dir, from)));
7462
7463 dout(7) << "expires for " << *parent_dir << dendl;
7464 } else {
7465 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7466 }
7467
7468 // INODES
11fdf7f2
TL
7469 for (const auto &q : p.second.inodes) {
7470 CInode *in = get_inode(q.first);
7471 unsigned nonce = q.second;
7c673cae
FG
7472
7473 if (!in) {
11fdf7f2 7474 dout(0) << " inode expire on " << q.first << " from " << from
7c673cae 7475 << ", don't have it" << dendl;
11fdf7f2 7476 ceph_assert(in);
7c673cae 7477 }
11fdf7f2 7478 ceph_assert(in->is_auth());
7c673cae
FG
7479 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7480
7481 // check nonce
7482 if (nonce == in->get_replica_nonce(from)) {
7483 // remove from our cached_by
7484 dout(7) << " inode expire on " << *in << " from mds." << from
7485 << " cached_by was " << in->get_replicas() << dendl;
7486 inode_remove_replica(in, from, false, gather_locks);
7487 }
7488 else {
7489 // this is an old nonce, ignore expire.
7490 dout(7) << " inode expire on " << *in << " from mds." << from
7491 << " with old nonce " << nonce
7492 << " (current " << in->get_replica_nonce(from) << "), dropping"
7493 << dendl;
7494 }
7495 }
7496
7497 // DIRS
11fdf7f2
TL
7498 for (const auto &q : p.second.dirs) {
7499 CDir *dir = get_dirfrag(q.first);
7500 unsigned nonce = q.second;
7c673cae
FG
7501
7502 if (!dir) {
11fdf7f2 7503 CInode *diri = get_inode(q.first.ino);
7c673cae
FG
7504 if (diri) {
7505 if (mds->is_rejoin() &&
7506 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7507 !diri->is_replica(from)) {
9f95a23c 7508 auto&& ls = diri->get_nested_dirfrags();
11fdf7f2 7509 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae 7510 << " while rejoining, inode isn't replicated" << dendl;
9f95a23c
TL
7511 for (const auto& d : ls) {
7512 dir = d;
7c673cae
FG
7513 if (dir->is_replica(from)) {
7514 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7515 dir->remove_replica(from);
7516 }
7517 }
7518 continue;
7519 }
11fdf7f2 7520 CDir *other = diri->get_approx_dirfrag(q.first.frag);
7c673cae 7521 if (other) {
11fdf7f2 7522 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7523 << " have " << *other << ", mismatched frags, dropping" << dendl;
7524 continue;
7525 }
7526 }
11fdf7f2 7527 dout(0) << " dir expire on " << q.first << " from " << from
7c673cae 7528 << ", don't have it" << dendl;
11fdf7f2 7529 ceph_assert(dir);
7c673cae
FG
7530 }
7531 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7532
11fdf7f2 7533 ceph_assert(dir->is_auth());
7c673cae
FG
7534
7535 // check nonce
7536 if (nonce == dir->get_replica_nonce(from)) {
7537 // remove from our cached_by
7538 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7539 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7540 dir->remove_replica(from);
7541 }
7542 else {
7543 // this is an old nonce, ignore expire.
7544 dout(7) << " dir expire on " << *dir << " from mds." << from
7545 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7546 << "), dropping" << dendl;
7547 }
7548 }
7549
7550 // DENTRIES
11fdf7f2
TL
7551 for (const auto &pd : p.second.dentries) {
7552 dout(10) << " dn expires in dir " << pd.first << dendl;
7553 CInode *diri = get_inode(pd.first.ino);
7554 ceph_assert(diri);
7555 CDir *dir = diri->get_dirfrag(pd.first.frag);
7c673cae
FG
7556
7557 if (!dir) {
11fdf7f2 7558 dout(0) << " dn expires on " << pd.first << " from " << from
7c673cae
FG
7559 << ", must have refragmented" << dendl;
7560 } else {
11fdf7f2 7561 ceph_assert(dir->is_auth());
7c673cae
FG
7562 }
7563
11fdf7f2
TL
7564 for (const auto &p : pd.second) {
7565 unsigned nonce = p.second;
7c673cae
FG
7566 CDentry *dn;
7567
7568 if (dir) {
11fdf7f2 7569 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7570 } else {
7571 // which dirfrag for this dentry?
11fdf7f2
TL
7572 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
7573 ceph_assert(dir);
7574 ceph_assert(dir->is_auth());
7575 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7576 }
7577
7578 if (!dn) {
7579 if (dir)
11fdf7f2 7580 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
7c673cae 7581 else
11fdf7f2 7582 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
7c673cae 7583 }
11fdf7f2 7584 ceph_assert(dn);
7c673cae
FG
7585
7586 if (nonce == dn->get_replica_nonce(from)) {
7587 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7588 dentry_remove_replica(dn, from, gather_locks);
7589 }
7590 else {
7591 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7592 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7593 << "), dropping" << dendl;
7594 }
7595 }
7596 }
7597 }
7598
7c673cae
FG
7599 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7600 if (!(*p)->is_stable())
7601 mds->locker->eval_gather(*p);
7602 }
7603}
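// Expires stashed in delayed_expire above are later replayed by
// process_delayed_expire() or dropped by discard_delayed_expire().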
7604
7605void MDCache::process_delayed_expire(CDir *dir)
7606{
7607 dout(7) << "process_delayed_expire on " << *dir << dendl;
11fdf7f2
TL
7608 for (const auto &p : delayed_expire[dir]) {
7609 handle_cache_expire(p.second);
7610 }
7c673cae
FG
7611 delayed_expire.erase(dir);
7612}
7613
7614void MDCache::discard_delayed_expire(CDir *dir)
7615{
7616 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7c673cae
FG
7617 delayed_expire.erase(dir);
7618}
7619
7620void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7621 set<SimpleLock *>& gather_locks)
7622{
7623 in->remove_replica(from);
11fdf7f2 7624 in->set_mds_caps_wanted(from, 0);
7c673cae
FG
7625
7626 // note: this code calls _eval more often than it needs to!
7627 // fix lock
7628 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7629 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7630 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7631 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7632 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7633 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7634
 7635 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
 7636 // don't remove the recovering mds from the lock's gathering list, because
7637 // it may hold rejoined wrlocks.
7638 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7639 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7640 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7641}
7642
7643void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7644{
7645 dn->remove_replica(from);
7646
7647 // fix lock
7648 if (dn->lock.remove_replica(from))
7649 gather_locks.insert(&dn->lock);
7650
 7651 // Replicated strays might now be eligible for purge
11fdf7f2 7652 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7c673cae
FG
7653 if (dnl->is_primary()) {
7654 maybe_eval_stray(dnl->get_inode());
7655 }
7656}
7657
7658void MDCache::trim_client_leases()
7659{
7660 utime_t now = ceph_clock_now();
7661
7662 dout(10) << "trim_client_leases" << dendl;
7663
eafe8130
TL
7664 std::size_t pool = 0;
7665 for (const auto& list : client_leases) {
7666 pool += 1;
7667 if (list.empty())
7c673cae
FG
7668 continue;
7669
eafe8130
TL
7670 auto before = list.size();
7671 while (!list.empty()) {
7672 ClientLease *r = list.front();
7c673cae
FG
7673 if (r->ttl > now) break;
7674 CDentry *dn = static_cast<CDentry*>(r->parent);
7675 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7676 dn->remove_client_lease(r, mds->locker);
7677 }
eafe8130 7678 auto after = list.size();
7c673cae
FG
7679 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7680 << (before-after) << " leases, " << after << " left" << dendl;
7681 }
7682}
7683
7c673cae
FG
7684void MDCache::check_memory_usage()
7685{
7686 static MemoryModel mm(g_ceph_context);
7687 static MemoryModel::snap last;
7688 mm.sample(&last);
7689 static MemoryModel::snap baseline = last;
7690
7691 // check client caps
11fdf7f2 7692 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7693 double caps_per_inode = 0.0;
7c673cae 7694 if (CInode::count())
181888fb 7695 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae 7696
a8e16298 7697 dout(2) << "Memory usage: "
7c673cae
FG
7698 << " total " << last.get_total()
7699 << ", rss " << last.get_rss()
7700 << ", heap " << last.get_heap()
7701 << ", baseline " << baseline.get_heap()
7c673cae
FG
7702 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7703 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7704 << dendl;
7705
c07f9fc5 7706 mds->update_mlogger();
7c673cae
FG
7707 mds->mlogger->set(l_mdm_rss, last.get_rss());
7708 mds->mlogger->set(l_mdm_heap, last.get_heap());
7c673cae
FG
7709}
7710
7711
7712
7713// =========================================================================================
7714// shutdown
7715
7716class C_MDC_ShutdownCheck : public MDCacheContext {
7717public:
7718 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7719 void finish(int) override {
7720 mdcache->shutdown_check();
7721 }
7722};
7723
7724void MDCache::shutdown_check()
7725{
7726 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7727
7728 // cache
7729 char old_val[32] = { 0 };
7730 char *o = old_val;
11fdf7f2
TL
7731 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7732 g_conf().set_val("debug_mds", "10");
7733 g_conf().apply_changes(nullptr);
7c673cae 7734 show_cache();
11fdf7f2
TL
7735 g_conf().set_val("debug_mds", old_val);
7736 g_conf().apply_changes(nullptr);
7737 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae
FG
7738
7739 // this
31f18b77 7740 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7741 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7742
7743
7744 if (mds->objecter->is_active()) {
7745 dout(0) << "objecter still active" << dendl;
7746 mds->objecter->dump_active();
7747 }
7748}
7749
7750
7751void MDCache::shutdown_start()
7752{
a8e16298 7753 dout(5) << "shutdown_start" << dendl;
7c673cae 7754
11fdf7f2
TL
7755 if (g_conf()->mds_shutdown_check)
7756 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae 7757
11fdf7f2 7758 // g_conf()->debug_mds = 10;
7c673cae
FG
7759}
7760
7761
7762
7763bool MDCache::shutdown_pass()
7764{
7765 dout(7) << "shutdown_pass" << dendl;
7766
7767 if (mds->is_stopped()) {
7768 dout(7) << " already shut down" << dendl;
7769 show_cache();
7770 show_subtrees();
7771 return true;
7772 }
7773
7774 // empty stray dir
28e407b8 7775 bool strays_all_exported = shutdown_export_strays();
7c673cae
FG
7776
7777 // trim cache
181888fb 7778 trim(UINT64_MAX);
31f18b77 7779 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7780
28e407b8 7781 // Export all subtrees to another active MDS (usually rank 0) if we are not rank 0
7c673cae 7782 int num_auth_subtree = 0;
f6b5b4d7
TL
7783 if (!subtrees.empty() && mds->get_nodeid() != 0) {
7784 dout(7) << "looking for subtrees to export" << dendl;
9f95a23c 7785 std::vector<CDir*> ls;
f6b5b4d7
TL
7786 for (auto& [dir, bounds] : subtrees) {
7787 dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
7788 if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
7c673cae 7789 continue;
f6b5b4d7
TL
7790 num_auth_subtree++;
7791 if (dir->is_frozen() ||
7792 dir->is_freezing() ||
7793 dir->is_ambiguous_dir_auth() ||
7794 dir->state_test(CDir::STATE_EXPORTING) ||
7795 dir->get_inode()->is_ephemerally_pinned()) {
7796 continue;
7c673cae 7797 }
f6b5b4d7 7798 ls.push_back(dir);
7c673cae 7799 }
28e407b8
AA
7800
7801 migrator->clear_export_queue();
f67539c2
TL
7802 // stopping mds does not call MDBalancer::tick()
7803 mds->balancer->handle_export_pins();
9f95a23c 7804 for (const auto& dir : ls) {
7c673cae
FG
7805 mds_rank_t dest = dir->get_inode()->authority().first;
7806 if (dest > 0 && !mds->mdsmap->is_active(dest))
7807 dest = 0;
7808 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7809 migrator->export_dir_nicely(dir, dest);
7810 }
7811 }
7812
28e407b8
AA
7813 if (!strays_all_exported) {
7814 dout(7) << "waiting for strays to migrate" << dendl;
7815 return false;
7816 }
7817
7c673cae 7818 if (num_auth_subtree > 0) {
11fdf7f2 7819 ceph_assert(mds->get_nodeid() > 0);
7c673cae
FG
7820 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7821 show_subtrees();
7822 return false;
7823 }
7824
7825 // close out any sessions (and open files!) before we try to trim the log, etc.
7826 if (mds->sessionmap.have_unclosed_sessions()) {
7827 if (!mds->server->terminating_sessions)
7828 mds->server->terminate_sessions();
7829 return false;
7830 }
7831
28e407b8
AA
7832 // Fully trim the log so that all objects in cache are clean and may be
7833 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7834 // trim the log such that the cache eventually becomes clean.
f64942e4
AA
7835 if (mds->mdlog->get_num_segments() > 0) {
7836 auto ls = mds->mdlog->get_current_segment();
7837 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7838 // Current segment contains events other than subtreemap or
7839 // there are dirty dirfrags (see CDir::log_mark_dirty())
7840 mds->mdlog->start_new_segment();
7841 mds->mdlog->flush();
7842 }
7843 }
7844 mds->mdlog->trim_all();
28e407b8
AA
7845 if (mds->mdlog->get_num_segments() > 1) {
7846 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7847 return false;
7848 }
7849
 7850 // drop our references to our stray dir inodes
7851 for (int i = 0; i < NUM_STRAY; ++i) {
7852 if (strays[i] &&
7853 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7854 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7855 strays[i]->put(CInode::PIN_STRAY);
7856 strays[i]->put_stickydirs();
7857 }
7858 }
7859
7c673cae
FG
7860 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7861 if (mydir && !mydir->is_subtree_root())
7862 mydir = NULL;
7863
7864 // subtrees map not empty yet?
7865 if (subtrees.size() > (mydir ? 1 : 0)) {
7866 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7867 show_subtrees();
7868 migrator->show_importing();
7869 migrator->show_exporting();
7870 if (!migrator->is_importing() && !migrator->is_exporting())
7871 show_cache();
7872 return false;
7873 }
11fdf7f2
TL
7874 ceph_assert(!migrator->is_exporting());
7875 ceph_assert(!migrator->is_importing());
7c673cae 7876
f64942e4
AA
7877 // replicas may dirty scatter locks
7878 if (myin && myin->is_replicated()) {
7879 dout(7) << "still have replicated objects" << dendl;
7880 return false;
7881 }
7882
11fdf7f2
TL
7883 if ((myin && myin->get_num_auth_pins()) ||
7884 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
181888fb
FG
7885 dout(7) << "still have auth pinned objects" << dendl;
7886 return false;
7887 }
7888
7c673cae
FG
7889 // (only do this once!)
7890 if (!mds->mdlog->is_capped()) {
7891 dout(7) << "capping the log" << dendl;
7892 mds->mdlog->cap();
7c673cae
FG
7893 }
7894
f64942e4
AA
7895 if (!mds->mdlog->empty())
7896 mds->mdlog->trim(0);
7897
7c673cae
FG
7898 if (!mds->mdlog->empty()) {
7899 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7900 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7901 return false;
7902 }
7903
7904 if (!did_shutdown_log_cap) {
7905 // flush journal header
7906 dout(7) << "writing header for (now-empty) journal" << dendl;
11fdf7f2 7907 ceph_assert(mds->mdlog->empty());
7c673cae
FG
7908 mds->mdlog->write_head(0);
7909 // NOTE: filer active checker below will block us until this completes.
7910 did_shutdown_log_cap = true;
7911 return false;
7912 }
7913
7914 // filer active?
7915 if (mds->objecter->is_active()) {
7916 dout(7) << "objecter still active" << dendl;
7917 mds->objecter->dump_active();
7918 return false;
7919 }
7920
7921 // trim what we can from the cache
31f18b77
FG
7922 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7923 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7924 show_cache();
7925 //dump();
7926 return false;
7927 }
31f18b77
FG
7928
7929 // make mydir subtree go away
7930 if (mydir) {
7931 if (mydir->get_num_ref() > 1) { // subtree pin
7932 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7933 show_cache();
7934 return false;
7935 }
7936
7937 remove_subtree(mydir);
7938 myin->close_dirfrag(mydir->get_frag());
7939 }
11fdf7f2 7940 ceph_assert(subtrees.empty());
31f18b77 7941
1adf2230 7942 if (myin) {
31f18b77 7943 remove_inode(myin);
11fdf7f2 7944 ceph_assert(!myin);
1adf2230
AA
7945 }
7946
11fdf7f2
TL
7947 if (global_snaprealm) {
7948 remove_inode(global_snaprealm->inode);
7949 global_snaprealm = nullptr;
7950 }
7951
7c673cae 7952 // done!
a8e16298 7953 dout(5) << "shutdown done." << dendl;
7c673cae
FG
7954 return true;
7955}
7956
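// Incrementally migrate this rank's stray dentries to rank 0 during
// shutdown. The scan position is remembered in shutdown_export_next so the
// walk can resume where it left off, and at most MAX_EXPORTING strays are
// kept in flight at once; returns true once every stray has been exported.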
7957bool MDCache::shutdown_export_strays()
7958{
f64942e4
AA
7959 static const unsigned MAX_EXPORTING = 100;
7960
7c673cae
FG
7961 if (mds->get_nodeid() == 0)
7962 return true;
f64942e4
AA
7963
7964 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7965 return false;
7966
7967 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7968 << " '" << shutdown_export_next.second << "'" << dendl;
7c673cae
FG
7969
7970 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
f64942e4 7971 bool all_exported = false;
7c673cae 7972
f64942e4
AA
7973again:
7974 auto next = shutdown_export_next;
7c673cae 7975
7c673cae 7976 for (int i = 0; i < NUM_STRAY; ++i) {
f64942e4
AA
7977 CInode *strayi = strays[i];
7978 if (!strayi ||
7979 !strayi->state_test(CInode::STATE_STRAYPINNED))
7980 continue;
7981 if (strayi->ino() < next.first.ino)
7c673cae 7982 continue;
7c673cae 7983
f64942e4
AA
7984 deque<CDir*> dfls;
7985 strayi->get_dirfrags(dfls);
7c673cae 7986
f64942e4
AA
7987 while (!dfls.empty()) {
7988 CDir *dir = dfls.front();
7989 dfls.pop_front();
7990
7991 if (dir->dirfrag() < next.first)
7c673cae 7992 continue;
f64942e4
AA
7993 if (next.first < dir->dirfrag()) {
7994 next.first = dir->dirfrag();
7995 next.second.clear();
7996 }
7997
7998 if (!dir->is_complete()) {
11fdf7f2 7999 MDSContext *fin = nullptr;
f64942e4
AA
8000 if (shutdown_exporting_strays.empty()) {
8001 fin = new MDSInternalContextWrapper(mds,
9f95a23c 8002 new LambdaContext([this](int r) {
f64942e4
AA
8003 shutdown_export_strays();
8004 })
8005 );
8006 }
8007 dir->fetch(fin);
8008 goto done;
7c673cae
FG
8009 }
8010
f64942e4
AA
8011 CDir::dentry_key_map::iterator it;
8012 if (next.second.empty()) {
8013 it = dir->begin();
7c673cae 8014 } else {
f64942e4
AA
8015 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
8016 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
7c673cae 8017 }
f64942e4
AA
8018
8019 for (; it != dir->end(); ++it) {
8020 CDentry *dn = it->second;
8021 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8022 if (dnl->is_null())
8023 continue;
8024
8025 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
11fdf7f2 8026 next.second = it->first.name;
f64942e4
AA
8027 goto done;
8028 }
8029
8030 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
8031 if (!ret.second) {
8032 dout(10) << "already exporting/purging " << *dn << dendl;
8033 continue;
8034 }
8035
8036 // Don't try to migrate anything that is actually
8037 // being purged right now
8038 if (!dn->state_test(CDentry::STATE_PURGING))
8039 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
8040
8041 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
8042 ++it;
8043 if (it != dir->end()) {
11fdf7f2 8044 next.second = it->first.name;
f64942e4
AA
8045 } else {
8046 if (dfls.empty())
8047 next.first.ino.val++;
8048 else
8049 next.first = dfls.front()->dirfrag();
8050 next.second.clear();
8051 }
8052 goto done;
8053 }
8054 }
8055 }
8056 }
8057
8058 if (shutdown_exporting_strays.empty()) {
8059 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
8060 if (first_df < shutdown_export_next.first ||
8061 !shutdown_export_next.second.empty()) {
8062 shutdown_export_next.first = first_df;
8063 shutdown_export_next.second.clear();
8064 goto again;
7c673cae 8065 }
f64942e4 8066 all_exported = true;
7c673cae
FG
8067 }
8068
f64942e4
AA
8069done:
8070 shutdown_export_next = next;
8071 return all_exported;
7c673cae
FG
8072}
8073
8074// ========= messaging ==============
8075
9f95a23c 8076void MDCache::dispatch(const cref_t<Message> &m)
7c673cae
FG
8077{
8078 switch (m->get_type()) {
8079
8080 // RESOLVE
8081 case MSG_MDS_RESOLVE:
9f95a23c 8082 handle_resolve(ref_cast<MMDSResolve>(m));
7c673cae
FG
8083 break;
8084 case MSG_MDS_RESOLVEACK:
9f95a23c 8085 handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
7c673cae
FG
8086 break;
8087
8088 // REJOIN
8089 case MSG_MDS_CACHEREJOIN:
9f95a23c 8090 handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
7c673cae
FG
8091 break;
8092
8093 case MSG_MDS_DISCOVER:
9f95a23c 8094 handle_discover(ref_cast<MDiscover>(m));
7c673cae
FG
8095 break;
8096 case MSG_MDS_DISCOVERREPLY:
9f95a23c 8097 handle_discover_reply(ref_cast<MDiscoverReply>(m));
7c673cae
FG
8098 break;
8099
8100 case MSG_MDS_DIRUPDATE:
9f95a23c 8101 handle_dir_update(ref_cast<MDirUpdate>(m));
7c673cae
FG
8102 break;
8103
8104 case MSG_MDS_CACHEEXPIRE:
9f95a23c 8105 handle_cache_expire(ref_cast<MCacheExpire>(m));
7c673cae
FG
8106 break;
8107
8108 case MSG_MDS_DENTRYLINK:
9f95a23c 8109 handle_dentry_link(ref_cast<MDentryLink>(m));
7c673cae
FG
8110 break;
8111 case MSG_MDS_DENTRYUNLINK:
9f95a23c 8112 handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
7c673cae
FG
8113 break;
8114
8115 case MSG_MDS_FRAGMENTNOTIFY:
9f95a23c 8116 handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
7c673cae 8117 break;
a8e16298 8118 case MSG_MDS_FRAGMENTNOTIFYACK:
9f95a23c 8119 handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
a8e16298 8120 break;
7c673cae
FG
8121
8122 case MSG_MDS_FINDINO:
9f95a23c 8123 handle_find_ino(ref_cast<MMDSFindIno>(m));
7c673cae
FG
8124 break;
8125 case MSG_MDS_FINDINOREPLY:
9f95a23c 8126 handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
7c673cae
FG
8127 break;
8128
8129 case MSG_MDS_OPENINO:
9f95a23c 8130 handle_open_ino(ref_cast<MMDSOpenIno>(m));
7c673cae
FG
8131 break;
8132 case MSG_MDS_OPENINOREPLY:
9f95a23c 8133 handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
11fdf7f2
TL
8134 break;
8135
8136 case MSG_MDS_SNAPUPDATE:
9f95a23c 8137 handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
7c673cae
FG
8138 break;
8139
8140 default:
8141 derr << "cache unknown message " << m->get_type() << dendl;
11fdf7f2 8142 ceph_abort_msg("cache unknown message");
7c673cae
FG
8143 }
8144}
8145
9f95a23c
TL
8146int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
8147 const filepath& path, int flags,
8148 vector<CDentry*> *pdnvec, CInode **pin)
7c673cae 8149{
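  // Return values, as relied on by the callers below:
  //   0  - success; *pin (if requested) points at the final inode
  //   1  - blocked; a retry has been queued (waiter or lock wait)
  //   2  - the request was forwarded to another rank
  //  <0  - a -CEPHFS_* error (ENOENT, ENOTDIR, ESTALE, EIO, EINVAL)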
9f95a23c
TL
8150 bool discover = (flags & MDS_TRAVERSE_DISCOVER);
8151 bool forward = !discover;
8152 bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
8153 bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
8154 bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
8155 bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
8156 bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
8157 bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
8158 bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
7c673cae 8159
9f95a23c
TL
8160 if (forward)
8161 ceph_assert(mdr); // forward requires a request
7c673cae
FG
8162
8163 snapid_t snapid = CEPH_NOSNAP;
8164 if (mdr)
8165 mdr->snapid = snapid;
8166
b3b6e05e 8167 client_t client = mdr ? mdr->get_client() : -1;
7c673cae
FG
8168
8169 if (mds->logger) mds->logger->inc(l_mds_traverse);
8170
8171 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8172 CInode *cur = get_inode(path.get_ino());
9f95a23c
TL
8173 if (!cur) {
8174 if (MDS_INO_IS_MDSDIR(path.get_ino())) {
11fdf7f2 8175 open_foreign_mdsdir(path.get_ino(), cf.build());
9f95a23c 8176 return 1;
7c673cae 8177 }
9f95a23c
TL
8178 if (MDS_INO_IS_STRAY(path.get_ino())) {
8179 mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
8180 unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
8181 filepath path(strays[idx]->get_parent_dn()->get_name(),
8182 MDS_INO_MDSDIR(rank));
8183 MDRequestRef null_ref;
8184 return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
8185 }
f67539c2 8186 return -CEPHFS_ESTALE;
7c673cae
FG
8187 }
8188 if (cur->state_test(CInode::STATE_PURGING))
f67539c2 8189 return -CEPHFS_ESTALE;
7c673cae 8190
9f95a23c
TL
8191 if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
8192 mds->locker->find_and_attach_lock_cache(mdr, cur);
8193
8194 if (mdr && mdr->lock_cache) {
8195 if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
8196 mdr->dir_layout = mdr->lock_cache->get_dir_layout();
8197 } else if (rdlock_snap) {
8198 int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
8199 if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
8200 (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
8201 bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
8202 if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
8203 return 1;
8204 }
8205 }
8206
7c673cae
FG
8207 // start trace
8208 if (pdnvec)
8209 pdnvec->clear();
8210 if (pin)
8211 *pin = cur;
8212
9f95a23c
TL
8213 MutationImpl::LockOpVec lov;
8214
8215 for (unsigned depth = 0; depth < path.depth(); ) {
7c673cae
FG
8216 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8217 << "' snapid " << snapid << dendl;
8218
8219 if (!cur->is_dir()) {
8220 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
f67539c2 8221 return -CEPHFS_ENOTDIR;
7c673cae
FG
8222 }
8223
8224 // walk into snapdir?
8225 if (path[depth].length() == 0) {
8226 dout(10) << "traverse: snapdir" << dendl;
9f95a23c 8227 if (!mdr || depth > 0) // snapdir must be the first component
f67539c2 8228 return -CEPHFS_EINVAL;
7c673cae
FG
8229 snapid = CEPH_SNAPDIR;
8230 mdr->snapid = snapid;
8231 depth++;
8232 continue;
8233 }
8234 // walk thru snapdir?
8235 if (snapid == CEPH_SNAPDIR) {
8236 if (!mdr)
f67539c2 8237 return -CEPHFS_EINVAL;
7c673cae
FG
8238 SnapRealm *realm = cur->find_snaprealm();
8239 snapid = realm->resolve_snapname(path[depth], cur->ino());
8240 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
11fdf7f2 8241 if (!snapid) {
9f95a23c
TL
8242 if (pdnvec)
8243 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
f67539c2 8244 return -CEPHFS_ENOENT;
11fdf7f2 8245 }
7c673cae
FG
8246 mdr->snapid = snapid;
8247 depth++;
8248 continue;
8249 }
8250
8251 // open dir
8252 frag_t fg = cur->pick_dirfrag(path[depth]);
8253 CDir *curdir = cur->get_dirfrag(fg);
8254 if (!curdir) {
8255 if (cur->is_auth()) {
8256 // parent dir frozen_dir?
8257 if (cur->is_frozen()) {
8258 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
11fdf7f2 8259 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
7c673cae
FG
8260 return 1;
8261 }
8262 curdir = cur->get_or_open_dirfrag(this, fg);
8263 } else {
8264 // discover?
8265 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
11fdf7f2 8266 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
9f95a23c 8267 path_locked);
7c673cae
FG
8268 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8269 return 1;
8270 }
8271 }
11fdf7f2 8272 ceph_assert(curdir);
7c673cae
FG
8273
8274#ifdef MDS_VERIFY_FRAGSTAT
8275 if (curdir->is_complete())
8276 curdir->verify_fragstat();
8277#endif
8278
8279 // frozen?
8280 /*
8281 if (curdir->is_frozen()) {
8282 // doh!
8283 // FIXME: traverse is allowed?
8284 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8285 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8286 if (onfinish) delete onfinish;
8287 return 1;
8288 }
8289 */
8290
9f95a23c
TL
8291 if (want_auth && want_dentry && depth == path.depth() - 1) {
8292 if (curdir->is_ambiguous_auth()) {
8293 dout(10) << "waiting for single auth on " << *curdir << dendl;
8294 curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8295 return 1;
8296 }
8297 if (!curdir->is_auth()) {
8298 dout(10) << "fw to auth for " << *curdir << dendl;
8299 request_forward(mdr, curdir->authority().first);
8300 return 2;
8301 }
8302 }
8303
7c673cae
FG
8304 // Before doing dirfrag->dn lookup, compare with DamageTable's
8305 // record of which dentries were unreadable
8306 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8307 dout(4) << "traverse: stopped lookup at damaged dentry "
8308 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
f67539c2 8309 return -CEPHFS_EIO;
7c673cae
FG
8310 }
8311
8312 // dentry
8313 CDentry *dn = curdir->lookup(path[depth], snapid);
9f95a23c
TL
8314 if (dn) {
8315 if (dn->state_test(CDentry::STATE_PURGING))
f67539c2 8316 return -CEPHFS_ENOENT;
9f95a23c
TL
8317
8318 if (rdlock_path) {
8319 lov.clear();
8320 if (xlock_dentry && depth == path.depth() - 1) {
8321 if (depth > 0 || !mdr->lock_cache) {
8322 lov.add_wrlock(&cur->filelock);
8323 lov.add_wrlock(&cur->nestlock);
8324 if (rdlock_authlock)
8325 lov.add_rdlock(&cur->authlock);
8326 }
8327 lov.add_xlock(&dn->lock);
8328 } else {
8329 // force client to flush async dir operation if necessary
8330 if (cur->filelock.is_cached())
8331 lov.add_wrlock(&cur->filelock);
8332 lov.add_rdlock(&dn->lock);
8333 }
8334 if (!mds->locker->acquire_locks(mdr, lov)) {
8335 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8336 return 1;
8337 }
8338 } else if (!path_locked &&
8339 !dn->lock.can_read(client) &&
8340 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8341 dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
8342 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8343 if (mds->logger)
8344 mds->logger->inc(l_mds_traverse_lock);
8345 if (dn->is_auth() && dn->lock.is_unstable_and_locked())
8346 mds->mdlog->flush();
8347 return 1;
8348 }
7c673cae 8349
7c673cae
FG
8350 if (pdnvec)
8351 pdnvec->push_back(dn);
7c673cae 8352
9f95a23c 8353 CDentry::linkage_t *dnl = dn->get_projected_linkage();
f67539c2 8354 // can we conclude CEPHFS_ENOENT?
9f95a23c
TL
8355 if (dnl->is_null()) {
8356 dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
8357 if (depth == path.depth() - 1) {
8358 if (want_dentry)
8359 break;
8360 } else {
8361 if (pdnvec)
7c673cae
FG
8362 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8363 }
f67539c2 8364 return -CEPHFS_ENOENT;
7c673cae 8365 }
7c673cae 8366
7c673cae 8367 // do we have inode?
9f95a23c 8368 CInode *in = dnl->get_inode();
7c673cae 8369 if (!in) {
11fdf7f2 8370 ceph_assert(dnl->is_remote());
7c673cae
FG
8371 // do i have it?
8372 in = get_inode(dnl->get_remote_ino());
8373 if (in) {
8374 dout(7) << "linking in remote in " << *in << dendl;
8375 dn->link_remote(dnl, in);
8376 } else {
8377 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
11fdf7f2 8378 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7c673cae
FG
8379 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8380 dout(4) << "traverse: remote dentry points to damaged ino "
8381 << *dn << dendl;
f67539c2 8382 return -CEPHFS_EIO;
7c673cae 8383 }
11fdf7f2 8384 open_remote_dentry(dn, true, cf.build(),
9f95a23c 8385 (path_locked && depth == path.depth() - 1));
7c673cae
FG
8386 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8387 return 1;
9f95a23c 8388 }
7c673cae
FG
8389 }
8390
8391 cur = in;
7c673cae 8392
9f95a23c
TL
8393 if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
8394 lov.clear();
8395 lov.add_rdlock(&cur->snaplock);
8396 if (!mds->locker->acquire_locks(mdr, lov)) {
8397 dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
8398 return 1;
8399 }
8400 }
8401
7c673cae
FG
8402 // add to trace, continue.
8403 touch_inode(cur);
7c673cae
FG
8404 if (pin)
8405 *pin = cur;
8406 depth++;
8407 continue;
8408 }
9f95a23c
TL
8409
8410 ceph_assert(!dn);
7c673cae
FG
8411
8412 // MISS. dentry doesn't exist.
8413 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8414
8415 if (curdir->is_auth()) {
8416 // dentry is mine.
8417 if (curdir->is_complete() ||
8418 (snapid == CEPH_NOSNAP &&
8419 curdir->has_bloom() &&
11fdf7f2 8420 !curdir->is_in_bloom(path[depth]))) {
7c673cae
FG
8421 // file not found
8422 if (pdnvec) {
8423 // instantiate a null dn?
9f95a23c 8424 if (depth < path.depth() - 1) {
7c673cae 8425 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
7c673cae
FG
8426 } else if (snapid < CEPH_MAXSNAP) {
8427 dout(20) << " not adding null for snapid " << snapid << dendl;
9f95a23c
TL
8428 } else if (curdir->is_frozen()) {
8429 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8430 curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8431 return 1;
7c673cae
FG
8432 } else {
8433 // create a null dentry
8434 dn = curdir->add_null_dentry(path[depth]);
8435 dout(20) << " added null " << *dn << dendl;
9f95a23c
TL
8436
8437 if (rdlock_path) {
8438 lov.clear();
8439 if (xlock_dentry) {
8440 if (depth > 0 || !mdr->lock_cache) {
8441 lov.add_wrlock(&cur->filelock);
8442 lov.add_wrlock(&cur->nestlock);
8443 if (rdlock_authlock)
8444 lov.add_rdlock(&cur->authlock);
8445 }
8446 lov.add_xlock(&dn->lock);
8447 } else {
8448 // force client to flush async dir operation if necessary
8449 if (cur->filelock.is_cached())
8450 lov.add_wrlock(&cur->filelock);
8451 lov.add_rdlock(&dn->lock);
8452 }
8453 if (!mds->locker->acquire_locks(mdr, lov)) {
8454 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8455 return 1;
8456 }
8457 }
7c673cae 8458 }
9f95a23c 8459 if (dn) {
7c673cae 8460 pdnvec->push_back(dn);
9f95a23c
TL
8461 if (want_dentry)
8462 break;
8463 } else {
7c673cae 8464 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
9f95a23c 8465 }
7c673cae 8466 }
f67539c2 8467 return -CEPHFS_ENOENT;
7c673cae
FG
8468 } else {
8469
8470 // Check DamageTable for missing fragments before trying to fetch
8471 // this
8472 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8473 dout(4) << "traverse: damaged dirfrag " << *curdir
8474 << ", blocking fetch" << dendl;
f67539c2 8475 return -CEPHFS_EIO;
7c673cae
FG
8476 }
8477
8478 // directory isn't complete; reload
8479 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8480 touch_inode(cur);
11fdf7f2 8481 curdir->fetch(cf.build(), path[depth]);
7c673cae
FG
8482 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8483 return 1;
8484 }
8485 } else {
8486 // dirfrag/dentry is not mine.
8487 mds_authority_t dauth = curdir->authority();
8488
f91f0fd5 8489 if (forward &&
11fdf7f2 8490 mdr && mdr->client_request &&
9f95a23c 8491 (int)depth < mdr->client_request->get_num_fwd()){
7c673cae
FG
8492 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8493 << " < fwd " << mdr->client_request->get_num_fwd()
8494 << ", discovering instead of forwarding" << dendl;
8495 discover = true;
8496 }
8497
9f95a23c 8498 if (discover) {
7c673cae 8499 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
11fdf7f2 8500 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
9f95a23c 8501 path_locked);
7c673cae
FG
8502 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8503 return 1;
8504 }
8505 if (forward) {
8506 // forward
8507 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8508
8509 if (curdir->is_ambiguous_auth()) {
8510 // wait
8511 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
11fdf7f2 8512 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
7c673cae
FG
8513 return 1;
8514 }
8515
8516 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
11fdf7f2
TL
8517
8518 request_forward(mdr, dauth.first);
8519
7c673cae 8520 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
7c673cae 8521 return 2;
11fdf7f2 8522 }
7c673cae 8523 }
11fdf7f2 8524
7c673cae
FG
8525 ceph_abort(); // i shouldn't get here
8526 }
9f95a23c
TL
8527
8528 if (want_auth && !want_dentry) {
8529 if (cur->is_ambiguous_auth()) {
8530 dout(10) << "waiting for single auth on " << *cur << dendl;
8531 cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8532 return 1;
8533 }
8534 if (!cur->is_auth()) {
8535 dout(10) << "fw to auth for " << *cur << dendl;
8536 request_forward(mdr, cur->authority().first);
8537 return 2;
8538 }
8539 }
7c673cae
FG
8540
8541 // success.
8542 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8543 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8544 if (mdr)
11fdf7f2 8545 ceph_assert(mdr->snapid == snapid);
9f95a23c
TL
8546
8547 if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
8548 mdr->locking_state |= MutationImpl::SNAP_LOCKED;
8549 else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
8550 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
8551
8552 if (rdlock_path)
8553 mdr->locking_state |= MutationImpl::PATH_LOCKED;
8554
7c673cae
FG
8555 return 0;
8556}
8557
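// cache_traverse() is the lockless, in-memory-only counterpart of
// path_traverse(): it walks the path using only what is already in cache
// and returns NULL as soon as any dirfrag, dentry or inode is missing,
// without fetching, discovering or taking locks.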
8558CInode *MDCache::cache_traverse(const filepath& fp)
8559{
8560 dout(10) << "cache_traverse " << fp << dendl;
8561
8562 CInode *in;
f67539c2 8563 unsigned depth = 0;
20effc67
TL
8564 char mdsdir_name[16];
8565 sprintf(mdsdir_name, "~mds%d", mds->get_nodeid());
f67539c2
TL
8566
8567 if (fp.get_ino()) {
7c673cae 8568 in = get_inode(fp.get_ino());
20effc67 8569 } else if (fp.depth() > 0 && (fp[0] == "~mdsdir" || fp[0] == mdsdir_name)) {
f67539c2
TL
8570 in = myin;
8571 depth = 1;
8572 } else {
7c673cae 8573 in = root;
f67539c2 8574 }
7c673cae
FG
8575 if (!in)
8576 return NULL;
8577
f67539c2
TL
8578 for (; depth < fp.depth(); depth++) {
8579 std::string_view dname = fp[depth];
7c673cae 8580 frag_t fg = in->pick_dirfrag(dname);
f67539c2 8581 dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
7c673cae
FG
8582 CDir *curdir = in->get_dirfrag(fg);
8583 if (!curdir)
8584 return NULL;
8585 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8586 if (!dn)
8587 return NULL;
8588 in = dn->get_linkage()->get_inode();
8589 if (!in)
8590 return NULL;
8591 }
8592 dout(10) << " got " << *in << dendl;
8593 return in;
8594}
8595
8596
8597/**
8598 * open_remote_dirfrag -- open up a remote dirfrag
8599 *
8600 * @param diri base inode
8601 * @param approxfg approximate fragment.
8602 * @param fin completion callback
8603 */
11fdf7f2 8604void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
7c673cae
FG
8605{
8606 dout(10) << "open_remote_dir on " << *diri << dendl;
11fdf7f2
TL
8607 ceph_assert(diri->is_dir());
8608 ceph_assert(!diri->is_auth());
8609 ceph_assert(diri->get_dirfrag(approxfg) == 0);
7c673cae 8610
224ce89b 8611 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8612}
8613
8614
8615/**
8616 * get_dentry_inode - get or open inode
8617 *
8618 * @param dn the dentry
8619 * @param mdr current request
8620 *
8621 * will return inode for primary, or link up/open up remote link's inode as necessary.
8622 * If it's not available right now, puts mdr on wait list and returns null.
8623 */
8624CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8625{
8626 CDentry::linkage_t *dnl;
8627 if (projected)
8628 dnl = dn->get_projected_linkage();
8629 else
8630 dnl = dn->get_linkage();
8631
11fdf7f2 8632 ceph_assert(!dnl->is_null());
7c673cae
FG
8633
8634 if (dnl->is_primary())
8635 return dnl->inode;
8636
11fdf7f2 8637 ceph_assert(dnl->is_remote());
7c673cae
FG
8638 CInode *in = get_inode(dnl->get_remote_ino());
8639 if (in) {
8640 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8641 dn->link_remote(dnl, in);
8642 return in;
8643 } else {
8644 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8645 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8646 return 0;
8647 }
8648}
8649
8650struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8651 CDentry *dn;
8652 inodeno_t ino;
11fdf7f2 8653 MDSContext *onfinish;
7c673cae 8654 bool want_xlocked;
11fdf7f2 8655 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
31f18b77
FG
8656 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8657 dn->get(MDSCacheObject::PIN_PTRWAITER);
8658 }
7c673cae
FG
8659 void finish(int r) override {
8660 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8661 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8662 }
8663};
8664
11fdf7f2 8665void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
7c673cae
FG
8666{
8667 dout(10) << "open_remote_dentry " << *dn << dendl;
8668 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8669 inodeno_t ino = dnl->get_remote_ino();
b3b6e05e 8670 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1;
7c673cae
FG
8671 open_ino(ino, pool,
8672 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8673}
8674
11fdf7f2 8675void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae
FG
8676 bool want_xlocked, int r)
8677{
8678 if (r < 0) {
31f18b77
FG
8679 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8680 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8681 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8682 dn->state_set(CDentry::STATE_BADREMOTEINO);
8683
8684 std::string path;
8685 CDir *dir = dn->get_dir();
8686 if (dir) {
31f18b77 8687 dir->get_inode()->make_path_string(path);
94b18763 8688 path += "/";
11fdf7f2 8689 path += dn->get_name();
7c673cae
FG
8690 }
8691
31f18b77 8692 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8693 if (fatal) {
31f18b77
FG
8694 mds->damaged();
8695 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8696 }
31f18b77
FG
8697 } else {
8698 r = 0;
8699 }
7c673cae
FG
8700 }
8701 fin->complete(r < 0 ? r : 0);
8702}
8703
8704
8705void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8706{
8707 // empty trace if we're a base inode
8708 if (in->is_base())
8709 return;
8710
8711 CInode *parent = in->get_parent_inode();
11fdf7f2 8712 ceph_assert(parent);
7c673cae
FG
8713 make_trace(trace, parent);
8714
8715 CDentry *dn = in->get_parent_dn();
8716 dout(15) << "make_trace adding " << *dn << dendl;
8717 trace.push_back(dn);
8718}
8719
8720
8721// -------------------------------------------------------------------------------
8722// Open inode by inode number
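//
// Rough flow: look in the cache first; otherwise fetch the inode's
// backtrace object from its data (or the metadata) pool, walk the recorded
// ancestors to open the containing dirfrags, and, failing that, ask the
// other active MDS ranks via MMDSOpenIno. Per-ino progress is tracked in
// opening_inodes (open_ino_info_t).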
8723
8724class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8725 inodeno_t ino;
8726 public:
8727 bufferlist bl;
8728 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8729 MDCacheIOContext(c), ino(i) {}
8730 void finish(int r) override {
8731 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8732 }
91327a77
AA
8733 void print(ostream& out) const override {
8734 out << "openino_backtrace_fetch" << ino << ")";
8735 }
7c673cae
FG
8736};
8737
8738struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8739 inodeno_t ino;
9f95a23c 8740 cref_t<MMDSOpenIno> msg;
7c673cae
FG
8741 bool parent;
8742 public:
9f95a23c 8743 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
7c673cae
FG
8744 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8745 void finish(int r) override {
8746 if (r < 0 && !parent)
f67539c2 8747 r = -CEPHFS_EAGAIN;
7c673cae
FG
8748 if (msg) {
8749 mdcache->handle_open_ino(msg, r);
8750 return;
8751 }
11fdf7f2
TL
8752 auto& info = mdcache->opening_inodes.at(ino);
8753 mdcache->_open_ino_traverse_dir(ino, info, r);
7c673cae
FG
8754 }
8755};
8756
8757struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8758 inodeno_t ino;
8759 public:
8760 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8761 void finish(int r) override {
8762 mdcache->_open_ino_parent_opened(ino, r);
8763 }
8764};
8765
8766void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8767{
8768 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8769
11fdf7f2 8770 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8771
8772 CInode *in = get_inode(ino);
8773 if (in) {
8774 dout(10) << " found cached " << *in << dendl;
8775 open_ino_finish(ino, info, in->authority().first);
8776 return;
8777 }
8778
8779 inode_backtrace_t backtrace;
8780 if (err == 0) {
8781 try {
11fdf7f2 8782 decode(backtrace, bl);
7c673cae
FG
8783 } catch (const buffer::error &decode_exc) {
8784 derr << "corrupt backtrace on ino x0" << std::hex << ino
f67539c2
TL
8785 << std::dec << ": " << decode_exc.what() << dendl;
8786 open_ino_finish(ino, info, -CEPHFS_EIO);
7c673cae
FG
8787 return;
8788 }
8789 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8790 dout(10) << " old object in pool " << info.pool
8791 << ", retrying pool " << backtrace.pool << dendl;
8792 info.pool = backtrace.pool;
8793 C_IO_MDC_OpenInoBacktraceFetched *fin =
8794 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8795 fetch_backtrace(ino, info.pool, fin->bl,
8796 new C_OnFinisher(fin, mds->finisher));
8797 return;
8798 }
f67539c2 8799 } else if (err == -CEPHFS_ENOENT) {
b3b6e05e 8800 int64_t meta_pool = mds->get_metadata_pool();
7c673cae
FG
8801 if (info.pool != meta_pool) {
8802 dout(10) << " no object in pool " << info.pool
8803 << ", retrying pool " << meta_pool << dendl;
8804 info.pool = meta_pool;
8805 C_IO_MDC_OpenInoBacktraceFetched *fin =
8806 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8807 fetch_backtrace(ino, info.pool, fin->bl,
8808 new C_OnFinisher(fin, mds->finisher));
8809 return;
8810 }
8811 err = 0; // backtrace.ancestors.empty() is checked below
8812 }
8813
8814 if (err == 0) {
8815 if (backtrace.ancestors.empty()) {
8816 dout(10) << " got empty backtrace " << dendl;
f67539c2 8817 err = -CEPHFS_ESTALE;
7c673cae
FG
8818 } else if (!info.ancestors.empty()) {
8819 if (info.ancestors[0] == backtrace.ancestors[0]) {
8820 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
f67539c2 8821 err = -CEPHFS_EINVAL;
7c673cae
FG
8822 } else {
8823 info.last_err = 0;
8824 }
8825 }
8826 }
8827 if (err) {
8828 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8829 if (info.last_err)
8830 err = info.last_err;
8831 open_ino_finish(ino, info, err);
8832 return;
8833 }
8834
8835 dout(10) << " got backtrace " << backtrace << dendl;
8836 info.ancestors = backtrace.ancestors;
8837
8838 _open_ino_traverse_dir(ino, info, 0);
8839}
8840
8841void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8842{
8843 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8844
11fdf7f2 8845 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8846
8847 CInode *in = get_inode(ino);
8848 if (in) {
8849 dout(10) << " found cached " << *in << dendl;
8850 open_ino_finish(ino, info, in->authority().first);
8851 return;
8852 }
8853
8854 if (ret == mds->get_nodeid()) {
8855 _open_ino_traverse_dir(ino, info, 0);
8856 } else {
8857 if (ret >= 0) {
8858 mds_rank_t checked_rank = mds_rank_t(ret);
8859 info.check_peers = true;
8860 info.auth_hint = checked_rank;
8861 info.checked.erase(checked_rank);
8862 }
8863 do_open_ino(ino, info, ret);
8864 }
8865}
8866
8867void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8868{
8869 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8870
8871 CInode *in = get_inode(ino);
8872 if (in) {
8873 dout(10) << " found cached " << *in << dendl;
8874 open_ino_finish(ino, info, in->authority().first);
8875 return;
8876 }
8877
8878 if (ret) {
8879 do_open_ino(ino, info, ret);
8880 return;
8881 }
8882
8883 mds_rank_t hint = info.auth_hint;
8884 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8885 info.discover, info.want_xlocked, &hint);
8886 if (ret > 0)
8887 return;
8888 if (hint != mds->get_nodeid())
8889 info.auth_hint = hint;
8890 do_open_ino(ino, info, ret);
8891}
8892
9f95a23c 8893void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
7c673cae
FG
8894{
8895 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 8896 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
7c673cae 8897 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
11fdf7f2
TL
8898 if (mds->logger)
8899 mds->logger->inc(l_mds_openino_dir_fetch);
7c673cae
FG
8900}
8901
9f95a23c 8902int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
11fdf7f2 8903 const vector<inode_backpointer_t>& ancestors,
7c673cae
FG
8904 bool discover, bool want_xlocked, mds_rank_t *hint)
8905{
8906 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8907 int err = 0;
8908 for (unsigned i = 0; i < ancestors.size(); i++) {
11fdf7f2
TL
8909 const auto& ancestor = ancestors.at(i);
8910 CInode *diri = get_inode(ancestor.dirino);
7c673cae
FG
8911
8912 if (!diri) {
11fdf7f2
TL
8913 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8914 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
7c673cae
FG
8915 return 1;
8916 }
8917 continue;
8918 }
8919
8920 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8921 CDir *dir = diri->get_parent_dir();
8922 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8923 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8924 dir = dir->get_inode()->get_parent_dir();
8925 _open_ino_fetch_dir(ino, m, dir, i == 0);
8926 return 1;
8927 }
8928
8929 if (!diri->is_dir()) {
8930 dout(10) << " " << *diri << " is not dir" << dendl;
8931 if (i == 0)
f67539c2 8932 err = -CEPHFS_ENOTDIR;
7c673cae
FG
8933 break;
8934 }
8935
11fdf7f2 8936 const string& name = ancestor.dname;
7c673cae
FG
8937 frag_t fg = diri->pick_dirfrag(name);
8938 CDir *dir = diri->get_dirfrag(fg);
8939 if (!dir) {
8940 if (diri->is_auth()) {
8941 if (diri->is_frozen()) {
8942 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8943 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8944 return 1;
8945 }
8946 dir = diri->get_or_open_dirfrag(this, fg);
8947 } else if (discover) {
8948 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8949 return 1;
8950 }
8951 }
8952 if (dir) {
11fdf7f2 8953 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
7c673cae
FG
8954 CDentry *dn = dir->lookup(name);
8955 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8956 if (dir->is_auth()) {
8957 if (dnl && dnl->is_primary() &&
8958 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8959 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8960 _open_ino_fetch_dir(ino, m, dir, i == 0);
8961 return 1;
8962 }
8963
8964 if (!dnl && !dir->is_complete() &&
8965 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8966 dout(10) << " fetching incomplete " << *dir << dendl;
8967 _open_ino_fetch_dir(ino, m, dir, i == 0);
8968 return 1;
8969 }
8970
8971 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8972 if (i == 0)
f67539c2 8973 err = -CEPHFS_ENOENT;
7c673cae
FG
8974 } else if (discover) {
8975 if (!dnl) {
8976 filepath path(name, 0);
8977 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8978 (i == 0 && want_xlocked));
8979 return 1;
8980 }
8981 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8982 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8983 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8984 return 1;
8985 }
8986 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8987 if (i == 0)
f67539c2 8988 err = -CEPHFS_ENOENT;
7c673cae
FG
8989 }
8990 }
8991 if (hint && i == 0)
8992 *hint = dir ? dir->authority().first : diri->authority().first;
8993 break;
8994 }
8995 return err;
8996}
8997
8998void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8999{
9000 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9001
11fdf7f2 9002 MDSContext::vec waiters;
7c673cae
FG
9003 waiters.swap(info.waiters);
9004 opening_inodes.erase(ino);
9005 finish_contexts(g_ceph_context, waiters, ret);
9006}
9007
9008void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
9009{
f67539c2 9010 if (err < 0 && err != -CEPHFS_EAGAIN) {
7c673cae 9011 info.checked.clear();
7c673cae
FG
9012 info.checking = MDS_RANK_NONE;
9013 info.check_peers = true;
9014 info.fetch_backtrace = true;
9015 if (info.discover) {
9016 info.discover = false;
9017 info.ancestors.clear();
9018 }
f67539c2 9019 if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR)
7c673cae
FG
9020 info.last_err = err;
9021 }
9022
d2e6a577
FG
9023 if (info.check_peers || info.discover) {
9024 if (info.discover) {
9025 // got backtrace from peer, but failed to find inode. re-check peers
9026 info.discover = false;
9027 info.ancestors.clear();
9028 info.checked.clear();
9029 }
7c673cae
FG
9030 info.check_peers = false;
9031 info.checking = MDS_RANK_NONE;
9032 do_open_ino_peer(ino, info);
9033 } else if (info.fetch_backtrace) {
9034 info.check_peers = true;
9035 info.fetch_backtrace = false;
9036 info.checking = mds->get_nodeid();
9037 info.checked.clear();
7c673cae
FG
9038 C_IO_MDC_OpenInoBacktraceFetched *fin =
9039 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
9040 fetch_backtrace(ino, info.pool, fin->bl,
9041 new C_OnFinisher(fin, mds->finisher));
9042 } else {
11fdf7f2 9043 ceph_assert(!info.ancestors.empty());
7c673cae 9044 info.checking = mds->get_nodeid();
b3b6e05e 9045 open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(),
7c673cae
FG
9046 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
9047 }
9048}
9049
9050void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
9051{
9052 set<mds_rank_t> all, active;
9053 mds->mdsmap->get_mds_set(all);
7c673cae 9054 if (mds->get_state() == MDSMap::STATE_REJOIN)
1adf2230
AA
9055 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
9056 else
9057 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9058
9059 dout(10) << "do_open_ino_peer " << ino << " active " << active
9060 << " all " << all << " checked " << info.checked << dendl;
9061
11fdf7f2 9062 mds_rank_t whoami = mds->get_nodeid();
7c673cae 9063 mds_rank_t peer = MDS_RANK_NONE;
11fdf7f2 9064 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
7c673cae
FG
9065 if (active.count(info.auth_hint)) {
9066 peer = info.auth_hint;
9067 info.auth_hint = MDS_RANK_NONE;
9068 }
9069 } else {
9070 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
11fdf7f2 9071 if (*p != whoami && info.checked.count(*p) == 0) {
7c673cae
FG
9072 peer = *p;
9073 break;
9074 }
9075 }
9076 if (peer < 0) {
11fdf7f2 9077 all.erase(whoami);
d2e6a577 9078 if (all != info.checked) {
7c673cae
FG
9079 dout(10) << " waiting for more peers to be active" << dendl;
9080 } else {
9081 dout(10) << " all MDS peers have been checked " << dendl;
9082 do_open_ino(ino, info, 0);
9083 }
9084 } else {
9085 info.checking = peer;
9086 vector<inode_backpointer_t> *pa = NULL;
9087 // got backtrace from peer or backtrace just fetched
9088 if (info.discover || !info.fetch_backtrace)
9089 pa = &info.ancestors;
9f95a23c 9090 mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
11fdf7f2
TL
9091 if (mds->logger)
9092 mds->logger->inc(l_mds_openino_peer_discover);
7c673cae
FG
9093 }
9094}
9095
9f95a23c 9096void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
7c673cae
FG
9097{
9098 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9099 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
9100 return;
9101 }
9102
9103 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
9104
11fdf7f2 9105 auto from = mds_rank_t(m->get_source().num());
7c673cae 9106 inodeno_t ino = m->ino;
9f95a23c 9107 ref_t<MMDSOpenInoReply> reply;
7c673cae
FG
9108 CInode *in = get_inode(ino);
9109 if (in) {
9110 dout(10) << " have " << *in << dendl;
9f95a23c 9111 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
7c673cae
FG
9112 if (in->is_auth()) {
9113 touch_inode(in);
9114 while (1) {
9115 CDentry *pdn = in->get_parent_dn();
9116 if (!pdn)
9117 break;
9118 CInode *diri = pdn->get_dir()->get_inode();
94b18763 9119 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
f67539c2 9120 in->get_version()));
7c673cae
FG
9121 in = diri;
9122 }
9123 } else {
9124 reply->hint = in->authority().first;
9125 }
9126 } else if (err < 0) {
9f95a23c 9127 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
7c673cae
FG
9128 } else {
9129 mds_rank_t hint = MDS_RANK_NONE;
9130 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
9131 if (ret > 0)
9132 return;
9f95a23c 9133 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
7c673cae 9134 }
11fdf7f2 9135 mds->send_message_mds(reply, from);
7c673cae
FG
9136}
9137
9f95a23c 9138void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
7c673cae
FG
9139{
9140 dout(10) << "handle_open_ino_reply " << *m << dendl;
9141
9142 inodeno_t ino = m->ino;
9143 mds_rank_t from = mds_rank_t(m->get_source().num());
9144 auto it = opening_inodes.find(ino);
9145 if (it != opening_inodes.end() && it->second.checking == from) {
9146 open_ino_info_t& info = it->second;
9147 info.checking = MDS_RANK_NONE;
9148 info.checked.insert(from);
9149
9150 CInode *in = get_inode(ino);
9151 if (in) {
9152 dout(10) << " found cached " << *in << dendl;
9153 open_ino_finish(ino, info, in->authority().first);
9154 } else if (!m->ancestors.empty()) {
9155 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9156 if (!info.want_replica) {
9157 open_ino_finish(ino, info, from);
7c673cae
FG
9158 return;
9159 }
9160
9161 info.ancestors = m->ancestors;
9162 info.auth_hint = from;
9163 info.checking = mds->get_nodeid();
9164 info.discover = true;
9165 _open_ino_traverse_dir(ino, info, 0);
9166 } else if (m->error) {
9167 dout(10) << " error " << m->error << " from mds." << from << dendl;
9168 do_open_ino(ino, info, m->error);
9169 } else {
9170 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9171 info.auth_hint = m->hint;
9172 info.checked.erase(m->hint);
9173 }
9174 do_open_ino_peer(ino, info);
9175 }
9176 }
7c673cae
FG
9177}
9178
9179void MDCache::kick_open_ino_peers(mds_rank_t who)
9180{
9181 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9182
9183 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9184 p != opening_inodes.end();
9185 ++p) {
9186 open_ino_info_t& info = p->second;
9187 if (info.checking == who) {
9188 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9189 info.checking = MDS_RANK_NONE;
9190 do_open_ino_peer(p->first, info);
9191 } else if (info.checking == MDS_RANK_NONE) {
9192 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9193 do_open_ino_peer(p->first, info);
9194 }
9195 }
9196}
9197
11fdf7f2 9198void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
f91f0fd5
TL
9199 bool want_replica, bool want_xlocked,
9200 vector<inode_backpointer_t> *ancestors_hint,
9201 mds_rank_t auth_hint)
7c673cae
FG
9202{
9203 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
9204 << want_replica << dendl;
9205
11fdf7f2
TL
9206 auto it = opening_inodes.find(ino);
9207 if (it != opening_inodes.end()) {
9208 open_ino_info_t& info = it->second;
7c673cae
FG
9209 if (want_replica) {
9210 info.want_replica = true;
9211 if (want_xlocked && !info.want_xlocked) {
9212 if (!info.ancestors.empty()) {
9213 CInode *diri = get_inode(info.ancestors[0].dirino);
9214 if (diri) {
9215 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
9216 CDir *dir = diri->get_dirfrag(fg);
9217 if (dir && !dir->is_auth()) {
9218 filepath path(info.ancestors[0].dname, 0);
9219 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
9220 }
9221 }
9222 }
9223 info.want_xlocked = true;
9224 }
9225 }
9226 info.waiters.push_back(fin);
9227 } else {
9228 open_ino_info_t& info = opening_inodes[ino];
7c673cae
FG
9229 info.want_replica = want_replica;
9230 info.want_xlocked = want_xlocked;
9231 info.tid = ++open_ino_last_tid;
9232 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9233 info.waiters.push_back(fin);
f91f0fd5
TL
9234 if (auth_hint != MDS_RANK_NONE)
9235 info.auth_hint = auth_hint;
9236 if (ancestors_hint) {
9237 info.ancestors = std::move(*ancestors_hint);
11fdf7f2
TL
9238 info.fetch_backtrace = false;
9239 info.checking = mds->get_nodeid();
9240 _open_ino_traverse_dir(ino, info, 0);
9241 } else {
9242 do_open_ino(ino, info, 0);
9243 }
7c673cae
FG
9244 }
9245}
9246
9247/* ---------------------------- */
9248
9249/*
9250 * search for a given inode on MDS peers. optionally start with the given node.
9251
9252
9253 TODO
9254 - recover from mds node failure, recovery
9255 - traverse path
9256
9257 */
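// In practice: find_ino_peers() sends MMDSFindIno to one candidate rank at
// a time (the hint first, then the remaining active ranks). A peer that has
// the inode replies with its path, which handle_find_ino_reply() resolves
// locally via path_traverse(MDS_TRAVERSE_DISCOVER); otherwise the next peer
// is tried, and the waiter completes with -CEPHFS_ESTALE once all active
// ranks have been checked.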
9f95a23c
TL
9258void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
9259 mds_rank_t hint, bool path_locked)
7c673cae
FG
9260{
9261 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
b32b8144
FG
9262 CInode *in = get_inode(ino);
9263 if (in && in->state_test(CInode::STATE_PURGING)) {
f67539c2 9264 c->complete(-CEPHFS_ESTALE);
b32b8144
FG
9265 return;
9266 }
11fdf7f2 9267 ceph_assert(!in);
7c673cae
FG
9268
9269 ceph_tid_t tid = ++find_ino_peer_last_tid;
9270 find_ino_peer_info_t& fip = find_ino_peer[tid];
9271 fip.ino = ino;
9272 fip.tid = tid;
9273 fip.fin = c;
9f95a23c 9274 fip.path_locked = path_locked;
7c673cae 9275 fip.hint = hint;
7c673cae
FG
9276 _do_find_ino_peer(fip);
9277}
9278
9279void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9280{
9281 set<mds_rank_t> all, active;
9282 mds->mdsmap->get_mds_set(all);
1adf2230 9283 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9284
9285 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9286 << " active " << active << " all " << all
9287 << " checked " << fip.checked
9288 << dendl;
9289
9290 mds_rank_t m = MDS_RANK_NONE;
9291 if (fip.hint >= 0) {
9292 m = fip.hint;
9293 fip.hint = MDS_RANK_NONE;
9294 } else {
9295 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9296 if (*p != mds->get_nodeid() &&
9297 fip.checked.count(*p) == 0) {
9298 m = *p;
9299 break;
9300 }
9301 }
9302 if (m == MDS_RANK_NONE) {
d2e6a577
FG
9303 all.erase(mds->get_nodeid());
9304 if (all != fip.checked) {
7c673cae
FG
9305 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9306 } else {
9307 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
f67539c2 9308 fip.fin->complete(-CEPHFS_ESTALE);
7c673cae
FG
9309 find_ino_peer.erase(fip.tid);
9310 }
9311 } else {
9312 fip.checking = m;
9f95a23c 9313 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
7c673cae
FG
9314 }
9315}
9316
9f95a23c 9317void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
7c673cae
FG
9318{
9319 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
9320 return;
9321 }
9322
9323 dout(10) << "handle_find_ino " << *m << dendl;
9f95a23c 9324 auto r = make_message<MMDSFindInoReply>(m->tid);
7c673cae
FG
9325 CInode *in = get_inode(m->ino);
9326 if (in) {
9327 in->make_path(r->path);
9328 dout(10) << " have " << r->path << " " << *in << dendl;
33c7a0ef
TL
9329
9330 /*
9331 * If the CInode was just created via openc on the current auth MDS,
9332 * but the client sends a getattr request to another (replica) MDS,
9333 * the path built here will be just '#INODE-NUMBER', because the
9334 * CInode hasn't been linked yet. The replica MDS will keep
9335 * retrying until the auth MDS flushes the mdlog and
9336 * C_MDS_openc_finish and link_primary_inode are called, at most
9337 * 5 seconds later.
9338 */
9339 if (!in->get_parent_dn() && in->is_auth()) {
9340 mds->mdlog->flush();
9341 }
7c673cae 9342 }
11fdf7f2 9343 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
7c673cae
FG
9344}
9345
9346
9f95a23c 9347void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
7c673cae 9348{
9f95a23c 9349 auto p = find_ino_peer.find(m->tid);
7c673cae
FG
9350 if (p != find_ino_peer.end()) {
9351 dout(10) << "handle_find_ino_reply " << *m << dendl;
9352 find_ino_peer_info_t& fip = p->second;
9353
9354 // success?
9355 if (get_inode(fip.ino)) {
9356 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9357 mds->queue_waiter(fip.fin);
9358 find_ino_peer.erase(p);
7c673cae
FG
9359 return;
9360 }
9361
9362 mds_rank_t from = mds_rank_t(m->get_source().num());
9363 if (fip.checking == from)
9364 fip.checking = MDS_RANK_NONE;
9365 fip.checked.insert(from);
9366
9367 if (!m->path.empty()) {
9368 // we got a path!
9369 vector<CDentry*> trace;
11fdf7f2 9370 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 9371 MDRequestRef null_ref;
9f95a23c
TL
9372 int flags = MDS_TRAVERSE_DISCOVER;
9373 if (fip.path_locked)
9374 flags |= MDS_TRAVERSE_PATH_LOCKED;
9375 int r = path_traverse(null_ref, cf, m->path, flags, &trace);
7c673cae
FG
9376 if (r > 0)
9377 return;
9378 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9379 << ", retrying" << dendl;
9380 fip.checked.clear();
9381 _do_find_ino_peer(fip);
9382 } else {
9383 // nope, continue.
9384 _do_find_ino_peer(fip);
9385 }
9386 } else {
9387 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9388 }
7c673cae
FG
9389}
9390
9391void MDCache::kick_find_ino_peers(mds_rank_t who)
9392{
9393 // find_ino_peers requests we should move on from
9394 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9395 p != find_ino_peer.end();
9396 ++p) {
9397 find_ino_peer_info_t& fip = p->second;
9398 if (fip.checking == who) {
9399 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9400 fip.checking = MDS_RANK_NONE;
9401 _do_find_ino_peer(fip);
9402 } else if (fip.checking == MDS_RANK_NONE) {
9403 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9404 _do_find_ino_peer(fip);
9405 }
9406 }
9407}
9408
9409/* ---------------------------- */
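// MDRequest bookkeeping: every client, peer (cross-MDS) and internal
// request is tracked in active_requests, keyed by its metareqid_t, from
// request_start*() until request_finish()/request_cleanup().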
9410
9411int MDCache::get_num_client_requests()
9412{
9413 int count = 0;
9414 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9415 p != active_requests.end();
9416 ++p) {
9417 MDRequestRef& mdr = p->second;
f67539c2 9418 if (mdr->reqid.name.is_client() && !mdr->is_peer())
7c673cae
FG
9419 count++;
9420 }
9421 return count;
9422}
9423
9f95a23c 9424MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
7c673cae 9425{
f67539c2 9426 // did we win a forward race against a peer?
7c673cae
FG
9427 if (active_requests.count(req->get_reqid())) {
9428 MDRequestRef& mdr = active_requests[req->get_reqid()];
11fdf7f2 9429 ceph_assert(mdr);
f67539c2 9430 if (mdr->is_peer()) {
7c673cae
FG
9431 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9432 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9433 } else {
9434 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
7c673cae
FG
9435 }
9436 return MDRequestRef();
9437 }
9438
9439 // register new client request
9440 MDRequestImpl::Params params;
9441 params.reqid = req->get_reqid();
9442 params.attempt = req->get_num_fwd();
9443 params.client_req = req;
9444 params.initiated = req->get_recv_stamp();
9445 params.throttled = req->get_throttle_stamp();
9446 params.all_read = req->get_recv_complete_stamp();
9447 params.dispatched = req->get_dispatch_stamp();
9448
9449 MDRequestRef mdr =
11fdf7f2 9450 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae
FG
9451 active_requests[params.reqid] = mdr;
9452 mdr->set_op_stamp(req->get_stamp());
9453 dout(7) << "request_start " << *mdr << dendl;
9454 return mdr;
9455}
9456
f67539c2 9457MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
7c673cae
FG
9458{
9459 int by = m->get_source().num();
9460 MDRequestImpl::Params params;
9461 params.reqid = ri;
9462 params.attempt = attempt;
f67539c2
TL
9463 params.triggering_peer_req = m;
9464 params.peer_to = by;
7c673cae
FG
9465 params.initiated = m->get_recv_stamp();
9466 params.throttled = m->get_throttle_stamp();
9467 params.all_read = m->get_recv_complete_stamp();
9468 params.dispatched = m->get_dispatch_stamp();
9469 MDRequestRef mdr =
11fdf7f2
TL
9470 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9471 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae 9472 active_requests[mdr->reqid] = mdr;
f67539c2 9473 dout(7) << "request_start_peer " << *mdr << " by mds." << by << dendl;
7c673cae
FG
9474 return mdr;
9475}
9476
9477MDRequestRef MDCache::request_start_internal(int op)
9478{
91327a77 9479 utime_t now = ceph_clock_now();
7c673cae
FG
9480 MDRequestImpl::Params params;
9481 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9482 params.reqid.tid = mds->issue_tid();
91327a77
AA
9483 params.initiated = now;
9484 params.throttled = now;
9485 params.all_read = now;
9486 params.dispatched = now;
7c673cae
FG
9487 params.internal_op = op;
9488 MDRequestRef mdr =
11fdf7f2 9489 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae 9490
11fdf7f2 9491 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9492 active_requests[mdr->reqid] = mdr;
9493 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9494 return mdr;
9495}
9496
9497MDRequestRef MDCache::request_get(metareqid_t rid)
9498{
9499 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
11fdf7f2 9500 ceph_assert(p != active_requests.end());
7c673cae
FG
9501 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9502 return p->second;
9503}
9504
9505void MDCache::request_finish(MDRequestRef& mdr)
9506{
9507 dout(7) << "request_finish " << *mdr << dendl;
9508 mdr->mark_event("finishing request");
9509
f67539c2
TL
9510 // peer finisher?
9511 if (mdr->has_more() && mdr->more()->peer_commit) {
9512 Context *fin = mdr->more()->peer_commit;
9513 mdr->more()->peer_commit = 0;
7c673cae
FG
9514 int ret;
9515 if (mdr->aborted) {
9516 mdr->aborted = false;
9517 ret = -1;
f67539c2 9518 mdr->more()->peer_rolling_back = true;
7c673cae
FG
9519 } else {
9520 ret = 0;
9521 mdr->committing = true;
9522 }
9523 fin->complete(ret); // this must re-call request_finish.
9524 return;
9525 }
9526
d2e6a577
FG
9527 switch(mdr->internal_op) {
9528 case CEPH_MDS_OP_FRAGMENTDIR:
9529 logger->inc(l_mdss_ireq_fragmentdir);
9530 break;
9531 case CEPH_MDS_OP_EXPORTDIR:
9532 logger->inc(l_mdss_ireq_exportdir);
9533 break;
9534 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9535 logger->inc(l_mdss_ireq_enqueue_scrub);
9536 break;
9537 case CEPH_MDS_OP_FLUSH:
9538 logger->inc(l_mdss_ireq_flush);
9539 break;
9540 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9541 logger->inc(l_mdss_ireq_fragstats);
9542 break;
9543 case CEPH_MDS_OP_REPAIR_INODESTATS:
9544 logger->inc(l_mdss_ireq_inodestats);
9545 break;
9546 }
9547
7c673cae
FG
9548 request_cleanup(mdr);
9549}
9550
9551
9552void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9553{
f91f0fd5
TL
9554 CachedStackStringStream css;
9555 *css << "forwarding request to mds." << who;
9556 mdr->mark_event(css->strv());
7c673cae
FG
9557 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9558 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9559 << *mdr->client_request << dendl;
f91f0fd5
TL
9560 if (mdr->is_batch_head()) {
9561 mdr->release_batch_op()->forward(who);
9f95a23c
TL
9562 } else {
9563 mds->forward_message_mds(mdr->release_client_request(), who);
9564 }
7c673cae
FG
9565 if (mds->logger) mds->logger->inc(l_mds_forward);
9566 } else if (mdr->internal_op >= 0) {
9567 dout(10) << "request_forward on internal op; cancelling" << dendl;
f67539c2 9568 mdr->internal_op_finish->complete(-CEPHFS_EXDEV);
7c673cae
FG
9569 } else {
9570 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9571 << " was from mds" << dendl;
9572 }
9573 request_cleanup(mdr);
9574}
9575
9576
9577void MDCache::dispatch_request(MDRequestRef& mdr)
9578{
9579 if (mdr->client_request) {
9580 mds->server->dispatch_client_request(mdr);
f67539c2
TL
9581 } else if (mdr->peer_request) {
9582 mds->server->dispatch_peer_request(mdr);
7c673cae
FG
9583 } else {
9584 switch (mdr->internal_op) {
9585 case CEPH_MDS_OP_FRAGMENTDIR:
9586 dispatch_fragment_dir(mdr);
9587 break;
9588 case CEPH_MDS_OP_EXPORTDIR:
9589 migrator->dispatch_export_dir(mdr, 0);
9590 break;
9591 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9592 enqueue_scrub_work(mdr);
9593 break;
9594 case CEPH_MDS_OP_FLUSH:
9595 flush_dentry_work(mdr);
9596 break;
9597 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9598 repair_dirfrag_stats_work(mdr);
9599 break;
9600 case CEPH_MDS_OP_REPAIR_INODESTATS:
9601 repair_inode_stats_work(mdr);
9602 break;
f67539c2
TL
9603 case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
9604 rdlock_dirfrags_stats_work(mdr);
11fdf7f2 9605 break;
7c673cae
FG
9606 default:
9607 ceph_abort();
9608 }
9609 }
9610}
9611
9612
9613void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9614{
9615 if (!mdr->has_more())
9616 return;
9617
f67539c2 9618 // clean up peers
7c673cae 9619 // (will implicitly drop remote dn pins)
f67539c2
TL
9620 for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin();
9621 p != mdr->more()->peers.end();
7c673cae 9622 ++p) {
f67539c2
TL
9623 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
9624 MMDSPeerRequest::OP_FINISH);
7c673cae
FG
9625
9626 if (mdr->killed && !mdr->committing) {
9627 r->mark_abort();
9628 } else if (mdr->more()->srcdn_auth_mds == *p &&
9629 mdr->more()->inode_import.length() > 0) {
9630 // information about rename imported caps
f67539c2 9631 r->inode_export = std::move(mdr->more()->inode_import);
7c673cae
FG
9632 }
9633
9634 mds->send_message_mds(r, *p);
9635 }
9636
9637 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9638 * implicitly. Note that we don't call the finishers -- there shouldn't
9639 * be any on a remote lock and the request finish wakes up all
9640 * the waiters anyway! */
7c673cae 9641
11fdf7f2
TL
9642 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9643 SimpleLock *lock = it->lock;
9644 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9645 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9646 << " on " << lock->get_parent() << dendl;
9647 lock->put_xlock();
9648 mdr->locks.erase(it++);
9649 } else if (it->is_remote_wrlock()) {
9650 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9651 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9652 if (it->is_wrlock()) {
9653 it->clear_remote_wrlock();
9654 ++it;
9655 } else {
9656 mdr->locks.erase(it++);
9657 }
9658 } else {
9659 ++it;
9660 }
7c673cae
FG
9661 }
9662
f67539c2 9663 mdr->more()->peers.clear(); /* we no longer have requests out to them, and
7c673cae
FG
9664 * leaving them in can cause double-notifies as
9665 * this function can get called more than once */
9666}
9667
9668void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9669{
9670 request_drop_foreign_locks(mdr);
9671 mds->locker->drop_non_rdlocks(mdr.get());
9672}
9673
9674void MDCache::request_drop_locks(MDRequestRef& mdr)
9675{
9676 request_drop_foreign_locks(mdr);
9677 mds->locker->drop_locks(mdr.get());
9678}
9679
9680void MDCache::request_cleanup(MDRequestRef& mdr)
9681{
9682 dout(15) << "request_cleanup " << *mdr << dendl;
9683
9684 if (mdr->has_more()) {
9685 if (mdr->more()->is_ambiguous_auth)
9686 mdr->clear_ambiguous_auth();
9687 if (!mdr->more()->waiting_for_finish.empty())
9688 mds->queue_waiters(mdr->more()->waiting_for_finish);
9689 }
9690
9691 request_drop_locks(mdr);
9692
9693 // drop (local) auth pins
9694 mdr->drop_local_auth_pins();
9695
9696 // drop stickydirs
11fdf7f2 9697 mdr->put_stickydirs();
7c673cae
FG
9698
9699 mds->locker->kick_cap_releases(mdr);
9700
9701 // drop cache pins
9702 mdr->drop_pins();
9703
9704 // remove from session
9705 mdr->item_session_request.remove_myself();
9706
9707 // remove from map
9708 active_requests.erase(mdr->reqid);
9709
9710 if (mds->logger)
9711 log_stat();
9712
9713 mdr->mark_event("cleaned up request");
9714}
9715
9716void MDCache::request_kill(MDRequestRef& mdr)
9717{
f67539c2 9718 // rolling back peer requests is tricky. just let the request proceed.
94b18763 9719 if (mdr->has_more() &&
f67539c2 9720 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) {
9f95a23c 9721 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11fdf7f2 9722 ceph_assert(mdr->more()->witnessed.empty());
94b18763 9723 mdr->aborted = true;
f67539c2 9724 dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl;
94b18763 9725 } else {
f67539c2 9726 dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl;
94b18763 9727 }
7c673cae 9728
11fdf7f2
TL
9729 ceph_assert(mdr->used_prealloc_ino == 0);
9730 ceph_assert(mdr->prealloc_inos.empty());
7c673cae
FG
9731
9732 mdr->session = NULL;
9733 mdr->item_session_request.remove_myself();
9734 return;
9735 }
9736
9737 mdr->killed = true;
9738 mdr->mark_event("killing request");
9739
9740 if (mdr->committing) {
f67539c2
TL
9741 dout(10) << "request_kill " << *mdr << " -- already committing, remove it from session requests" << dendl;
9742 mdr->item_session_request.remove_myself();
7c673cae
FG
9743 } else {
9744 dout(10) << "request_kill " << *mdr << dendl;
9745 request_cleanup(mdr);
9746 }
9747}
9748
9749// -------------------------------------------------------------------------------
9750// SNAPREALMS
9751
11fdf7f2 9752void MDCache::create_global_snaprealm()
7c673cae 9753{
11fdf7f2 9754 CInode *in = new CInode(this); // dummy inode
b3b6e05e 9755 create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
11fdf7f2
TL
9756 add_inode(in);
9757 global_snaprealm = in->snaprealm;
7c673cae
FG
9758}
9759
11fdf7f2 9760void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
7c673cae
FG
9761{
9762 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9763
9764 vector<inodeno_t> split_inos;
9765 vector<inodeno_t> split_realms;
9766
11fdf7f2 9767 if (notify_clients) {
11fdf7f2
TL
9768 if (snapop == CEPH_SNAP_OP_SPLIT) {
9769 // notify clients of update|split
f67539c2 9770 for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p)
11fdf7f2 9771 split_inos.push_back((*p)->ino());
7c673cae 9772
f67539c2
TL
9773 for (auto& r : in->snaprealm->open_children)
9774 split_realms.push_back(r->inode->ino());
11fdf7f2
TL
9775 }
9776 }
7c673cae 9777
9f95a23c 9778 map<client_t, ref_t<MClientSnap>> updates;
7c673cae
FG
9779 list<SnapRealm*> q;
9780 q.push_back(in->snaprealm);
9781 while (!q.empty()) {
9782 SnapRealm *realm = q.front();
9783 q.pop_front();
9784
9785 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9786 realm->invalidate_cached_snaps();
9787
11fdf7f2
TL
9788 if (notify_clients) {
9789 for (const auto& p : realm->client_caps) {
9790 const auto& client = p.first;
9791 const auto& caps = p.second;
9792 ceph_assert(!caps->empty());
9793
9794 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9795 if (em.second) {
9f95a23c 9796 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2
TL
9797 update->head.split = in->ino();
9798 update->split_inos = split_inos;
9799 update->split_realms = split_realms;
9800 update->bl = in->snaprealm->get_snap_trace();
9801 em.first->second = std::move(update);
9802 }
7c673cae
FG
9803 }
9804 }
9805
7c673cae
FG
9806 // notify for active children, too.
9807 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
f67539c2
TL
9808 for (auto& r : realm->open_children)
9809 q.push_back(r);
7c673cae
FG
9810 }
9811
11fdf7f2 9812 if (notify_clients)
7c673cae 9813 send_snaps(updates);
7c673cae
FG
9814}
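// Summary of the walk above: starting at in->snaprealm, the realm tree is
// traversed breadth-first (the list 'q' is used as a FIFO queue); every
// visited realm has its cached snaps invalidated, and -- when notify_clients
// is set -- one MClientSnap per client holding caps in any visited realm is
// accumulated in 'updates' and flushed at the end via send_snaps().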
9815
11fdf7f2 9816void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
7c673cae 9817{
11fdf7f2
TL
9818 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9819 ceph_assert(in->is_auth());
7c673cae 9820
11fdf7f2
TL
9821 set<mds_rank_t> mds_set;
9822 if (stid > 0) {
9823 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9824 mds_set.erase(mds->get_nodeid());
9825 } else {
9826 in->list_replicas(mds_set);
9827 }
7c673cae 9828
11fdf7f2
TL
9829 if (!mds_set.empty()) {
9830 bufferlist snap_blob;
9831 in->encode_snap(snap_blob);
7c673cae 9832
11fdf7f2 9833 for (auto p : mds_set) {
9f95a23c 9834 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
11fdf7f2
TL
9835 m->snap_blob = snap_blob;
9836 mds->send_message_mds(m, p);
9837 }
9838 }
7c673cae 9839
11fdf7f2
TL
9840 if (stid > 0)
9841 notify_global_snaprealm_update(snap_op);
9842}
7c673cae 9843
9f95a23c 9844void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
11fdf7f2
TL
9845{
9846 mds_rank_t from = mds_rank_t(m->get_source().num());
9847 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 9848
11fdf7f2
TL
9849 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9850 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9851 return;
9852 }
7c673cae 9853
11fdf7f2
TL
9854 // null rejoin_done means open_snaprealms() has already been called
9855 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9856 (mds->is_rejoin() && !rejoin_done);
9857
9858 if (m->get_tid() > 0) {
9859 mds->snapclient->notify_commit(m->get_tid());
9860 if (notify_clients)
9861 notify_global_snaprealm_update(m->get_snap_op());
9862 }
9863
9864 CInode *in = get_inode(m->get_ino());
9865 if (in) {
9866 ceph_assert(!in->is_auth());
9867 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9868 (mds->is_rejoin() && !in->is_rejoining())) {
9869 auto p = m->snap_blob.cbegin();
9870 in->decode_snap(p);
9871
9872 if (!notify_clients) {
9873 if (!rejoin_pending_snaprealms.count(in)) {
9874 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9875 rejoin_pending_snaprealms.insert(in);
9876 }
9877 }
9878 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9879 }
9880 }
7c673cae
FG
9881}
9882
11fdf7f2
TL
9883void MDCache::notify_global_snaprealm_update(int snap_op)
9884{
9885 if (snap_op != CEPH_SNAP_OP_DESTROY)
9886 snap_op = CEPH_SNAP_OP_UPDATE;
9887 set<Session*> sessions;
9888 mds->sessionmap.get_client_session_set(sessions);
9889 for (auto &session : sessions) {
9890 if (!session->is_open() && !session->is_stale())
9891 continue;
9f95a23c 9892 auto update = make_message<MClientSnap>(snap_op);
11fdf7f2
TL
9893 update->head.split = global_snaprealm->inode->ino();
9894 update->bl = global_snaprealm->get_snap_trace();
9895 mds->send_message_client_counted(update, session);
9896 }
9897}
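// Note: every snap op other than DESTROY is collapsed to CEPH_SNAP_OP_UPDATE
// here, and the global snaprealm's snap trace is pushed to all open (or
// stale) client sessions.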
7c673cae
FG
9898
9899// -------------------------------------------------------------------------------
9900// STRAYS
9901
9902struct C_MDC_RetryScanStray : public MDCacheContext {
9903 dirfrag_t next;
9904 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9905 void finish(int r) override {
9906 mdcache->scan_stray_dir(next);
9907 }
9908};
9909
9910void MDCache::scan_stray_dir(dirfrag_t next)
9911{
9912 dout(10) << "scan_stray_dir " << next << dendl;
9913
f67539c2
TL
9914 if (next.ino)
9915 next.frag = strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()];
9916
7c673cae
FG
9917 for (int i = 0; i < NUM_STRAY; ++i) {
9918 if (strays[i]->ino() < next.ino)
9919 continue;
f67539c2
TL
9920
9921 std::vector<CDir*> ls;
7c673cae 9922 strays[i]->get_dirfrags(ls);
7c673cae 9923
f67539c2
TL
9924 for (const auto& dir : ls) {
9925 if (dir->get_frag() < next.frag)
9926 continue;
9927
9928 if (!dir->can_auth_pin()) {
9929 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag()));
9930 return;
9931 }
9932
9933 if (!dir->is_complete()) {
9934 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9935 return;
9936 }
9937
9938 for (auto &p : dir->items) {
9939 CDentry *dn = p.second;
9940 dn->state_set(CDentry::STATE_STRAY);
9941 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9942 if (dnl->is_primary()) {
9943 CInode *in = dnl->get_inode();
9944 if (in->get_inode()->nlink == 0)
9945 in->state_set(CInode::STATE_ORPHAN);
9946 maybe_eval_stray(in);
9947 }
7c673cae
FG
9948 }
9949 }
9950 }
9951}
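// scan_stray_dir() is written to be resumable: 'next' is a dirfrag cursor,
// and whenever a dirfrag is frozen or incomplete the scan parks a
// C_MDC_RetryScanStray waiter and returns, re-entering later at the same
// dirfrag rather than restarting from the beginning.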
9952
7c673cae
FG
9953void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9954{
9955 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9956 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
11fdf7f2
TL
9957 if (mds->logger)
9958 mds->logger->inc(l_mds_openino_backtrace_fetch);
7c673cae
FG
9959}
9960
9961
9962
9963
9964
9965// ========================================================================================
9966// DISCOVER
9967/*
9968
9969 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9970 to the parent metadata object in the cache (pinning it).
9971
9972 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9973
9974*/
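/*
 Rough life cycle of a discover, as a sketch pieced together from the code
 below (throttling and error handling elided):

   discover_dir_frag() / discover_path()
       -> _create_discover(from)   // allocates a tid, records a discover_info_t
       -> _send_discover(d)        // MDiscover to the authoritative mds
   handle_discover()               // on the auth: walk the path, encode replicas
   handle_discover_reply()         // decode replicas, erase the tid, wake waiters
   kick_discovers(who)             // re-send any still-outstanding discovers to 'who'
*/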
9975
9976void MDCache::_send_discover(discover_info_t& d)
9977{
9f95a23c
TL
9978 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
9979 d.want_base_dir, d.path_locked);
7c673cae
FG
9980 dis->set_tid(d.tid);
9981 mds->send_message_mds(dis, d.mds);
9982}
9983
9984void MDCache::discover_base_ino(inodeno_t want_ino,
11fdf7f2 9985 MDSContext *onfinish,
7c673cae
FG
9986 mds_rank_t from)
9987{
9988 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9989 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9990 discover_info_t& d = _create_discover(from);
9991 d.ino = want_ino;
9992 _send_discover(d);
9993 }
9994 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9995}
9996
9997
9998void MDCache::discover_dir_frag(CInode *base,
9999 frag_t approx_fg,
11fdf7f2 10000 MDSContext *onfinish,
7c673cae
FG
10001 mds_rank_t from)
10002{
10003 if (from < 0)
10004 from = base->authority().first;
10005
10006 dirfrag_t df(base->ino(), approx_fg);
10007 dout(7) << "discover_dir_frag " << df
10008 << " from mds." << from << dendl;
10009
10010 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
10011 discover_info_t& d = _create_discover(from);
10012 d.pin_base(base);
10013 d.ino = base->ino();
10014 d.frag = approx_fg;
10015 d.want_base_dir = true;
10016 _send_discover(d);
10017 }
10018
10019 if (onfinish)
10020 base->add_dir_waiter(approx_fg, onfinish);
10021}
10022
10023struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10024 CInode *base;
10025 snapid_t snapid;
10026 filepath path;
10027 mds_rank_t from;
10028 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10029 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10030 void finish(int r) override {
10031 mdcache->discover_path(base, snapid, path, 0, from);
10032 }
10033};
10034
10035void MDCache::discover_path(CInode *base,
10036 snapid_t snap,
10037 filepath want_path,
11fdf7f2 10038 MDSContext *onfinish,
9f95a23c 10039 bool path_locked,
7c673cae
FG
10040 mds_rank_t from)
10041{
10042 if (from < 0)
10043 from = base->authority().first;
10044
10045 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9f95a23c 10046 << (path_locked ? " path_locked":"")
7c673cae
FG
10047 << dendl;
10048
10049 if (base->is_ambiguous_auth()) {
10050 dout(10) << " waiting for single auth on " << *base << dendl;
10051 if (!onfinish)
10052 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
10053 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
10054 return;
10055 } else if (from == mds->get_nodeid()) {
11fdf7f2 10056 MDSContext::vec finished;
7c673cae
FG
10057 base->take_waiting(CInode::WAIT_DIR, finished);
10058 mds->queue_waiters(finished);
10059 return;
10060 }
10061
10062 frag_t fg = base->pick_dirfrag(want_path[0]);
9f95a23c 10063 if ((path_locked && want_path.depth() == 1) ||
7c673cae
FG
10064 !base->is_waiting_for_dir(fg) || !onfinish) {
10065 discover_info_t& d = _create_discover(from);
10066 d.ino = base->ino();
10067 d.pin_base(base);
10068 d.frag = fg;
10069 d.snap = snap;
10070 d.want_path = want_path;
10071 d.want_base_dir = true;
9f95a23c 10072 d.path_locked = path_locked;
7c673cae
FG
10073 _send_discover(d);
10074 }
10075
10076 // register + wait
10077 if (onfinish)
10078 base->add_dir_waiter(fg, onfinish);
10079}
10080
10081struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
10082 CDir *base;
10083 snapid_t snapid;
10084 filepath path;
10085 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
10086 MDCacheContext(c), base(b), snapid(s), path(p) {}
10087 void finish(int r) override {
10088 mdcache->discover_path(base, snapid, path, 0);
10089 }
10090};
10091
10092void MDCache::discover_path(CDir *base,
10093 snapid_t snap,
10094 filepath want_path,
11fdf7f2 10095 MDSContext *onfinish,
9f95a23c 10096 bool path_locked)
7c673cae
FG
10097{
10098 mds_rank_t from = base->authority().first;
10099
10100 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9f95a23c 10101 << (path_locked ? " path_locked":"")
7c673cae
FG
10102 << dendl;
10103
10104 if (base->is_ambiguous_auth()) {
10105 dout(7) << " waiting for single auth on " << *base << dendl;
10106 if (!onfinish)
10107 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
10108 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
10109 return;
10110 } else if (from == mds->get_nodeid()) {
11fdf7f2 10111 MDSContext::vec finished;
7c673cae
FG
10112 base->take_sub_waiting(finished);
10113 mds->queue_waiters(finished);
10114 return;
10115 }
10116
9f95a23c 10117 if ((path_locked && want_path.depth() == 1) ||
7c673cae
FG
10118 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
10119 discover_info_t& d = _create_discover(from);
10120 d.ino = base->ino();
31f18b77 10121 d.pin_base(base->inode);
7c673cae
FG
10122 d.frag = base->get_frag();
10123 d.snap = snap;
10124 d.want_path = want_path;
10125 d.want_base_dir = false;
9f95a23c 10126 d.path_locked = path_locked;
7c673cae
FG
10127 _send_discover(d);
10128 }
10129
10130 // register + wait
10131 if (onfinish)
10132 base->add_dentry_waiter(want_path[0], snap, onfinish);
10133}
10134
10135void MDCache::kick_discovers(mds_rank_t who)
10136{
10137 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10138 p != discovers.end();
10139 ++p) {
10140 if (p->second.mds != who)
10141 continue;
10142 _send_discover(p->second);
10143 }
10144}
10145
10146
9f95a23c 10147void MDCache::handle_discover(const cref_t<MDiscover> &dis)
7c673cae
FG
10148{
10149 mds_rank_t whoami = mds->get_nodeid();
10150 mds_rank_t from = mds_rank_t(dis->get_source().num());
10151
11fdf7f2 10152 ceph_assert(from != whoami);
7c673cae
FG
10153
10154 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10155 if (mds->get_state() < MDSMap::STATE_REJOIN &&
d2e6a577 10156 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
10157 return;
10158 }
10159
10160 // proceed if the requester is in the REJOIN stage, i.e. the request is from parallel_fetch().
10161 // delay processing requests from survivors because we may not yet have chosen lock states.
10162 if (!mds->mdsmap->is_rejoin(from)) {
10163 dout(0) << "handle_discover not yet active (or still rejoining), delaying" << dendl;
10164 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10165 return;
10166 }
10167 }
10168
10169
10170 CInode *cur = 0;
9f95a23c 10171 auto reply = make_message<MDiscoverReply>(*dis);
7c673cae
FG
10172
10173 snapid_t snapid = dis->get_snapid();
10174
10175 // get started.
10176 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10177 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10178 // wants root
10179 dout(7) << "handle_discover from mds." << from
10180 << " wants base + " << dis->get_want().get_path()
10181 << " snap " << snapid
10182 << dendl;
10183
10184 cur = get_inode(dis->get_base_ino());
11fdf7f2 10185 ceph_assert(cur);
7c673cae
FG
10186
10187 // add root
10188 reply->starts_with = MDiscoverReply::INODE;
9f95a23c 10189 encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
7c673cae
FG
10190 dout(10) << "added base " << *cur << dendl;
10191 }
10192 else {
10193 // there's a base inode
10194 cur = get_inode(dis->get_base_ino(), snapid);
10195 if (!cur && snapid != CEPH_NOSNAP) {
10196 cur = get_inode(dis->get_base_ino());
10197 if (cur && !cur->is_multiversion())
10198 cur = NULL; // nope!
10199 }
10200
10201 if (!cur) {
10202 dout(7) << "handle_discover mds." << from
10203 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10204 << dendl;
10205 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10206 reply->set_error_dentry(dis->get_dentry(0));
10207 reply->set_flag_error_dir();
10208 } else if (dis->wants_base_dir()) {
10209 dout(7) << "handle_discover mds." << from
10210 << " wants basedir+" << dis->get_want().get_path()
10211 << " has " << *cur
10212 << dendl;
10213 } else {
10214 dout(7) << "handle_discover mds." << from
10215 << " wants " << dis->get_want().get_path()
10216 << " has " << *cur
10217 << dendl;
10218 }
10219 }
10220
11fdf7f2 10221 ceph_assert(reply);
7c673cae
FG
10222
10223 // add content
10224 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10225 for (unsigned i = 0;
10226 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10227 i++) {
10228
10229 // -- figure out the dir
10230
10231 // is *cur even a dir at all?
10232 if (!cur->is_dir()) {
10233 dout(7) << *cur << " not a dir" << dendl;
10234 reply->set_flag_error_dir();
10235 break;
10236 }
10237
10238 // pick frag
10239 frag_t fg;
10240 if (dis->get_want().depth()) {
10241 // dentry specifies
10242 fg = cur->pick_dirfrag(dis->get_dentry(i));
10243 } else {
10244 // requester explicitly specified the frag
11fdf7f2 10245 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
7c673cae
FG
10246 fg = dis->get_base_dir_frag();
10247 if (!cur->dirfragtree.is_leaf(fg))
10248 fg = cur->dirfragtree[fg.value()];
10249 }
10250 CDir *curdir = cur->get_dirfrag(fg);
10251
10252 if ((!curdir && !cur->is_auth()) ||
10253 (curdir && !curdir->is_auth())) {
10254
10255 /* before:
10256 * ONLY set flag if empty!!
10257 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10258 * resulting in duplicate discovers in flight,
10259 * which can wreak havoc when discovering rename srcdn (which may move)
10260 */
10261
10262 if (reply->is_empty()) {
10263 // only hint if empty.
10264 // someday this could be better, but right now the waiter logic isn't smart enough.
10265
10266 // hint
10267 if (curdir) {
10268 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10269 reply->set_dir_auth_hint(curdir->authority().first);
10270 } else {
10271 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10272 << *cur << dendl;
10273 reply->set_dir_auth_hint(cur->authority().first);
10274 }
10275
10276 // note error dentry, if any
10277 // NOTE: important, as it allows requester to issue an equivalent discover
10278 // to whomever we hint at.
10279 if (dis->get_want().depth() > i)
10280 reply->set_error_dentry(dis->get_dentry(i));
10281 }
10282
10283 break;
10284 }
10285
10286 if (!curdir) { // open dir?
10287 if (cur->is_frozen()) {
10288 if (!reply->is_empty()) {
10289 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10290 break;
10291 }
10292 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10293 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10294 return;
10295 }
10296 curdir = cur->get_or_open_dirfrag(this, fg);
10297 } else if (curdir->is_frozen_tree() ||
10298 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
31f18b77
FG
10299 if (!reply->is_empty()) {
10300 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10301 break;
10302 }
7c673cae
FG
10303 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10304 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10305 reply->set_flag_error_dir();
10306 break;
10307 }
7c673cae
FG
10308 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10309 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10310 return;
10311 }
10312
10313 // add dir
10314 if (curdir->get_version() == 0) {
10315 // fetch newly opened dir
10316 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10317 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10318 // make sure the base frag is correct, though, in case there was a refragment since the
10319 // original request was sent.
10320 reply->set_base_dir_frag(curdir->get_frag());
10321 } else {
11fdf7f2 10322 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
7c673cae
FG
10323 if (!reply->trace.length())
10324 reply->starts_with = MDiscoverReply::DIR;
9f95a23c 10325 encode_replica_dir(curdir, from, reply->trace);
7c673cae
FG
10326 dout(7) << "handle_discover added dir " << *curdir << dendl;
10327 }
10328
10329 // lookup
10330 CDentry *dn = 0;
10331 if (curdir->get_version() == 0) {
10332 // fetch newly opened dir
11fdf7f2 10333 ceph_assert(!curdir->has_bloom());
7c673cae
FG
10334 } else if (dis->get_want().depth() > 0) {
10335 // lookup dentry
10336 dn = curdir->lookup(dis->get_dentry(i), snapid);
10337 } else
10338 break; // done!
10339
10340 // incomplete dir?
10341 if (!dn) {
31f18b77 10342 if (!curdir->is_complete() &&
11fdf7f2
TL
10343 !(snapid == CEPH_NOSNAP &&
10344 curdir->has_bloom() &&
10345 !curdir->is_in_bloom(dis->get_dentry(i)))) {
7c673cae
FG
10346 // readdir
10347 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10348 if (reply->is_empty()) {
10349 // fetch and wait
10350 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10351 dis->wants_base_dir() && curdir->get_version() == 0);
7c673cae
FG
10352 return;
10353 } else {
10354 // initiate fetch, but send what we have so far
10355 curdir->fetch(0);
10356 break;
10357 }
10358 }
10359
11fdf7f2
TL
10360 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10361 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10362 << " dne, non-empty reply, stopping" << dendl;
10363 break;
10364 }
10365
7c673cae
FG
10366 // send null dentry
10367 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10368 << *curdir << dendl;
11fdf7f2
TL
10369 if (snapid == CEPH_NOSNAP)
10370 dn = curdir->add_null_dentry(dis->get_dentry(i));
10371 else
10372 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
7c673cae 10373 }
11fdf7f2 10374 ceph_assert(dn);
7c673cae 10375
31f18b77
FG
10376 // don't add replica to purging dentry/inode
10377 if (dn->state_test(CDentry::STATE_PURGING)) {
10378 if (reply->is_empty())
10379 reply->set_flag_error_dn(dis->get_dentry(i));
10380 break;
10381 }
10382
7c673cae
FG
10383 CDentry::linkage_t *dnl = dn->get_linkage();
10384
10385 // xlocked dentry?
10386 // ...always block on non-tail items (they are unrelated)
10387 // ...allow xlocked tail discovery _only_ if explicitly requested
7c673cae
FG
10388 if (dn->lock.is_xlocked()) {
10389 // is this the last (tail) item in the discover traversal?
9f95a23c
TL
10390 if (dis->is_path_locked()) {
10391 dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
7c673cae
FG
10392 } else if (reply->is_empty()) {
10393 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10394 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10395 return;
10396 } else {
10397 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10398 break;
10399 }
10400 }
10401
10402 // frozen inode?
9f95a23c 10403 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
7c673cae 10404 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9f95a23c 10405 if (tailitem && dis->is_path_locked()) {
7c673cae
FG
10406 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10407 } else if (reply->is_empty()) {
10408 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10409 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10410 return;
10411 } else {
10412 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10413 break;
10414 }
10415 }
10416
10417 // add dentry
10418 if (!reply->trace.length())
10419 reply->starts_with = MDiscoverReply::DENTRY;
9f95a23c 10420 encode_replica_dentry(dn, from, reply->trace);
7c673cae
FG
10421 dout(7) << "handle_discover added dentry " << *dn << dendl;
10422
10423 if (!dnl->is_primary()) break; // stop on null or remote link.
10424
10425 // add inode
10426 CInode *next = dnl->get_inode();
11fdf7f2 10427 ceph_assert(next->is_auth());
7c673cae 10428
9f95a23c 10429 encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
7c673cae
FG
10430 dout(7) << "handle_discover added inode " << *next << dendl;
10431
10432 // descend, keep going.
10433 cur = next;
10434 continue;
10435 }
10436
10437 // how did we do?
11fdf7f2 10438 ceph_assert(!reply->is_empty());
7c673cae
FG
10439 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10440 mds->send_message(reply, dis->get_connection());
7c673cae
FG
10441}
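// In short: handle_discover() walks as far down the requested path as it can
// while everything stays auth, unfrozen and readable, roughly encoding a
// (dir, dentry, inode) triple per step into reply->trace. When it has to
// stop, it either waits (empty reply) or returns what it has, optionally
// tagging the reply with a dir_auth_hint or an error dentry so the requester
// can retry against the right mds.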
10442
9f95a23c 10443void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
7c673cae
FG
10444{
10445 /*
10446 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10447 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
7c673cae
FG
10448 return;
10449 }
10450 */
10451 dout(7) << "discover_reply " << *m << dendl;
10452 if (m->is_flag_error_dir())
10453 dout(7) << " flag error, dir" << dendl;
10454 if (m->is_flag_error_dn())
10455 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10456
11fdf7f2 10457 MDSContext::vec finished, error;
7c673cae
FG
10458 mds_rank_t from = mds_rank_t(m->get_source().num());
10459
10460 // starting point
10461 CInode *cur = get_inode(m->get_base_ino());
11fdf7f2 10462 auto p = m->trace.cbegin();
7c673cae
FG
10463
10464 int next = m->starts_with;
10465
10466 // decrement discover counters
10467 if (m->get_tid()) {
10468 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10469 if (p != discovers.end()) {
10470 dout(10) << " found tid " << m->get_tid() << dendl;
10471 discovers.erase(p);
10472 } else {
10473 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10474 }
10475 }
10476
10477 // discover may start with an inode
10478 if (!p.end() && next == MDiscoverReply::INODE) {
9f95a23c 10479 decode_replica_inode(cur, p, NULL, finished);
7c673cae 10480 dout(7) << "discover_reply got base inode " << *cur << dendl;
11fdf7f2 10481 ceph_assert(cur->is_base());
7c673cae
FG
10482
10483 next = MDiscoverReply::DIR;
10484
10485 // take waiters?
10486 if (cur->is_base() &&
10487 waiting_for_base_ino[from].count(cur->ino())) {
10488 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10489 waiting_for_base_ino[from].erase(cur->ino());
10490 }
10491 }
11fdf7f2 10492 ceph_assert(cur);
7c673cae
FG
10493
10494 // loop over discover results.
10495 // indexes follow each ([[dir] dentry] inode)
10496 // can start, end with any type.
10497 while (!p.end()) {
10498 // dir
10499 frag_t fg;
9f95a23c 10500 CDir *curdir = nullptr;
7c673cae 10501 if (next == MDiscoverReply::DIR) {
9f95a23c 10502 decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
7c673cae 10503 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
11fdf7f2 10504 ceph_assert(m->get_wanted_base_dir());
7c673cae
FG
10505 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10506 }
10507 } else {
10508 // note: this can only happen on our first pass around this loop.
10509 if (p.end() && m->is_flag_error_dn()) {
10510 fg = cur->pick_dirfrag(m->get_error_dentry());
10511 curdir = cur->get_dirfrag(fg);
10512 } else
10513 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10514 }
10515
10516 if (p.end())
10517 break;
10518
10519 // dentry
9f95a23c
TL
10520 CDentry *dn = nullptr;
10521 decode_replica_dentry(dn, p, curdir, finished);
7c673cae
FG
10522
10523 if (p.end())
10524 break;
10525
10526 // inode
9f95a23c 10527 decode_replica_inode(cur, p, dn, finished);
7c673cae
FG
10528
10529 next = MDiscoverReply::DIR;
10530 }
10531
10532 // dir error?
10533 // or dir_auth hint?
10534 if (m->is_flag_error_dir() && !cur->is_dir()) {
10535 // not a dir.
10536 cur->take_waiting(CInode::WAIT_DIR, error);
10537 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10538 mds_rank_t who = m->get_dir_auth_hint();
10539 if (who == mds->get_nodeid()) who = -1;
10540 if (who >= 0)
10541 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10542
7c673cae
FG
10543
10544 if (m->get_wanted_base_dir()) {
31f18b77
FG
10545 frag_t fg = m->get_base_dir_frag();
10546 CDir *dir = cur->get_dirfrag(fg);
10547
7c673cae
FG
10548 if (cur->is_waiting_for_dir(fg)) {
10549 if (cur->is_auth())
10550 cur->take_waiting(CInode::WAIT_DIR, finished);
10551 else if (dir || !cur->dirfragtree.is_leaf(fg))
10552 cur->take_dir_waiting(fg, finished);
10553 else
10554 discover_dir_frag(cur, fg, 0, who);
10555 } else
10556 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10557 }
10558
10559 // try again?
10560 if (m->get_error_dentry().length()) {
31f18b77
FG
10561 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10562 CDir *dir = cur->get_dirfrag(fg);
7c673cae
FG
10563 // wanted a dentry
10564 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10565 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10566 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10567 m->get_wanted_snapid(), finished);
10568 } else {
10569 filepath relpath(m->get_error_dentry(), 0);
9f95a23c 10570 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
7c673cae
FG
10571 }
10572 } else
10573 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10574 << m->get_error_dentry() << dendl;
10575 }
31f18b77
FG
10576 } else if (m->is_flag_error_dn()) {
10577 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10578 CDir *dir = cur->get_dirfrag(fg);
10579 if (dir) {
10580 if (dir->is_auth()) {
10581 dir->take_sub_waiting(finished);
10582 } else {
10583 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10584 m->get_wanted_snapid(), error);
10585 }
10586 }
7c673cae
FG
10587 }
10588
10589 // waiters
f67539c2 10590 finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT); // finish errors directly
7c673cae 10591 mds->queue_waiters(finished);
7c673cae
FG
10592}
10593
10594
10595
10596// ----------------------------
10597// REPLICAS
10598
b32b8144 10599
9f95a23c 10600void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
b32b8144 10601{
9f95a23c 10602 ENCODE_START(1, 1, bl);
b32b8144 10603 dirfrag_t df = dir->dirfrag();
11fdf7f2 10604 encode(df, bl);
9f95a23c
TL
10605 __u32 nonce = dir->add_replica(to);
10606 encode(nonce, bl);
10607 dir->_encode_base(bl);
10608 ENCODE_FINISH(bl);
b32b8144
FG
10609}
10610
9f95a23c 10611void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
b32b8144 10612{
f67539c2 10613 ENCODE_START(2, 1, bl);
11fdf7f2
TL
10614 encode(dn->get_name(), bl);
10615 encode(dn->last, bl);
9f95a23c
TL
10616
10617 __u32 nonce = dn->add_replica(to);
10618 encode(nonce, bl);
10619 encode(dn->first, bl);
10620 encode(dn->linkage.remote_ino, bl);
10621 encode(dn->linkage.remote_d_type, bl);
10622 dn->lock.encode_state_for_replica(bl);
10623 bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
10624 encode(need_recover, bl);
f67539c2 10625 encode(dn->alternate_name, bl);
9f95a23c 10626 ENCODE_FINISH(bl);
b32b8144
FG
10627}
10628
9f95a23c 10629void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
b32b8144
FG
10630 uint64_t features)
10631{
9f95a23c 10632 ceph_assert(in->is_auth());
f67539c2
TL
10633
10634 ENCODE_START(2, 1, bl);
10635 encode(in->ino(), bl); // bleh, minor asymmetry here
11fdf7f2 10636 encode(in->last, bl);
9f95a23c
TL
10637
10638 __u32 nonce = in->add_replica(to);
10639 encode(nonce, bl);
10640
10641 in->_encode_base(bl, features);
10642 in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
f6b5b4d7
TL
10643
10644 __u32 state = in->state;
10645 encode(state, bl);
10646
9f95a23c 10647 ENCODE_FINISH(bl);
b32b8144
FG
10648}
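// Compatibility note: the replica encodings above are versioned with
// ENCODE_START -- dentries gained 'alternate_name' in v2 and inodes gained
// the replicated state word in v2 -- and the matching decode_replica_*()
// functions below only read those fields when struct_v >= 2, so replicas
// should still decode cleanly across mixed-version MDS daemons.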
10649
9f95a23c 10650void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
11fdf7f2 10651 MDSContext::vec& finished)
7c673cae 10652{
9f95a23c 10653 DECODE_START(1, p);
7c673cae 10654 dirfrag_t df;
11fdf7f2 10655 decode(df, p);
7c673cae 10656
11fdf7f2 10657 ceph_assert(diri->ino() == df.ino);
7c673cae
FG
10658
10659 // add it (_replica_)
9f95a23c 10660 dir = diri->get_dirfrag(df.frag);
7c673cae
FG
10661
10662 if (dir) {
10663 // had replica. update w/ new nonce.
9f95a23c
TL
10664 __u32 nonce;
10665 decode(nonce, p);
10666 dir->set_replica_nonce(nonce);
10667 dir->_decode_base(p);
10668 dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
7c673cae
FG
10669 } else {
10670 // force frag to leaf in the diri tree
10671 if (!diri->dirfragtree.is_leaf(df.frag)) {
9f95a23c 10672 dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
7c673cae
FG
10673 << diri->dirfragtree << dendl;
10674 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10675 }
7c673cae
FG
10676 // add replica.
10677 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
9f95a23c
TL
10678 __u32 nonce;
10679 decode(nonce, p);
10680 dir->set_replica_nonce(nonce);
10681 dir->_decode_base(p);
7c673cae
FG
10682 // is this a dir_auth delegation boundary?
10683 if (from != diri->authority().first ||
10684 diri->is_ambiguous_auth() ||
10685 diri->is_base())
10686 adjust_subtree_auth(dir, from);
10687
9f95a23c 10688 dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
7c673cae
FG
10689 // get waiters
10690 diri->take_dir_waiting(df.frag, finished);
10691 }
9f95a23c 10692 DECODE_FINISH(p);
7c673cae
FG
10693}
10694
9f95a23c 10695void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
7c673cae 10696{
9f95a23c 10697 DECODE_START(1, p);
7c673cae
FG
10698 string name;
10699 snapid_t last;
11fdf7f2
TL
10700 decode(name, p);
10701 decode(last, p);
7c673cae 10702
9f95a23c 10703 dn = dir->lookup(name, last);
7c673cae
FG
10704
10705 // have it?
9f95a23c 10706 bool is_new = false;
7c673cae 10707 if (dn) {
9f95a23c
TL
10708 is_new = false;
10709 dout(7) << __func__ << " had " << *dn << dendl;
7c673cae 10710 } else {
9f95a23c 10711 is_new = true;
7c673cae 10712 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
9f95a23c 10713 dout(7) << __func__ << " added " << *dn << dendl;
7c673cae 10714 }
9f95a23c
TL
10715
10716 __u32 nonce;
10717 decode(nonce, p);
10718 dn->set_replica_nonce(nonce);
10719 decode(dn->first, p);
7c673cae 10720
9f95a23c
TL
10721 inodeno_t rino;
10722 unsigned char rdtype;
10723 decode(rino, p);
10724 decode(rdtype, p);
10725 dn->lock.decode_state(p, is_new);
7c673cae 10726
9f95a23c
TL
10727 bool need_recover;
10728 decode(need_recover, p);
10729
f67539c2
TL
10730 mempool::mds_co::string alternate_name;
10731 if (struct_v >= 2) {
10732 decode(alternate_name, p);
10733 }
10734
9f95a23c 10735 if (is_new) {
f67539c2 10736 dn->set_alternate_name(std::move(alternate_name));
9f95a23c
TL
10737 if (rino)
10738 dir->link_remote_inode(dn, rino, rdtype);
10739 if (need_recover)
10740 dn->lock.mark_need_recover();
f67539c2
TL
10741 } else {
10742 ceph_assert(dn->alternate_name == alternate_name);
9f95a23c
TL
10743 }
10744
10745 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10746 DECODE_FINISH(p);
7c673cae
FG
10747}
10748
9f95a23c 10749void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
7c673cae 10750{
f6b5b4d7 10751 DECODE_START(2, p);
7c673cae
FG
10752 inodeno_t ino;
10753 snapid_t last;
9f95a23c 10754 __u32 nonce;
11fdf7f2
TL
10755 decode(ino, p);
10756 decode(last, p);
9f95a23c
TL
10757 decode(nonce, p);
10758 in = get_inode(ino, last);
7c673cae 10759 if (!in) {
f67539c2 10760 in = new CInode(this, false, 2, last);
9f95a23c
TL
10761 in->set_replica_nonce(nonce);
10762 in->_decode_base(p);
10763 in->_decode_locks_state_for_replica(p, true);
7c673cae 10764 add_inode(in);
b3b6e05e 10765 if (in->ino() == CEPH_INO_ROOT)
7c673cae
FG
10766 in->inode_auth.first = 0;
10767 else if (in->is_mdsdir())
10768 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
9f95a23c 10769 dout(10) << __func__ << " added " << *in << dendl;
7c673cae 10770 if (dn) {
11fdf7f2 10771 ceph_assert(dn->get_linkage()->is_null());
7c673cae
FG
10772 dn->dir->link_primary_inode(dn, in);
10773 }
10774 } else {
9f95a23c
TL
10775 in->set_replica_nonce(nonce);
10776 in->_decode_base(p);
10777 in->_decode_locks_state_for_replica(p, false);
10778 dout(10) << __func__ << " had " << *in << dendl;
7c673cae
FG
10779 }
10780
10781 if (dn) {
10782 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
9f95a23c 10783 dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
7c673cae 10784 }
f6b5b4d7
TL
10785
10786 if (struct_v >= 2) {
10787 __u32 s;
10788 decode(s, p);
10789 s &= CInode::MASK_STATE_REPLICATED;
10790 if (s & CInode::STATE_RANDEPHEMERALPIN) {
10791 dout(10) << "replica inode is random ephemeral pinned" << dendl;
f67539c2 10792 in->set_ephemeral_pin(false, true);
f6b5b4d7
TL
10793 }
10794 }
10795
9f95a23c 10796 DECODE_FINISH(p);
7c673cae
FG
10797}
10798
10799
9f95a23c 10800void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
7c673cae 10801{
f67539c2 10802 ceph_assert(straydn->get_num_auth_pins());
33c7a0ef 10803 ENCODE_START(2, 1, bl);
7c673cae 10804 uint64_t features = mds->mdsmap->get_up_features();
9f95a23c
TL
10805 encode_replica_inode(get_myin(), who, bl, features);
10806 encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10807 encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10808 encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
10809 encode_replica_dir(straydn->get_dir(), who, bl);
10810 encode_replica_dentry(straydn, who, bl);
33c7a0ef
TL
10811 if (!straydn->get_projected_linkage()->is_null()) {
10812 encode_replica_inode(straydn->get_projected_linkage()->get_inode(), who, bl, features);
10813 }
9f95a23c 10814 ENCODE_FINISH(bl);
7c673cae
FG
10815}
10816
33c7a0ef 10817void MDCache::decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from)
7c673cae 10818{
11fdf7f2
TL
10819 MDSContext::vec finished;
10820 auto p = bl.cbegin();
7c673cae 10821
33c7a0ef 10822 DECODE_START(2, p);
9f95a23c
TL
10823 CInode *mdsin = nullptr;
10824 decode_replica_inode(mdsin, p, NULL, finished);
10825 CDir *mdsdir = nullptr;
10826 decode_replica_dir(mdsdir, p, mdsin, from, finished);
10827 CDentry *straydirdn = nullptr;
10828 decode_replica_dentry(straydirdn, p, mdsdir, finished);
10829 CInode *strayin = nullptr;
10830 decode_replica_inode(strayin, p, straydirdn, finished);
10831 CDir *straydir = nullptr;
10832 decode_replica_dir(straydir, p, strayin, from, finished);
10833
10834 decode_replica_dentry(straydn, p, straydir, finished);
33c7a0ef
TL
10835 if (struct_v >= 2 && in) {
10836 decode_replica_inode(*in, p, straydn, finished);
10837 }
7c673cae
FG
10838 if (!finished.empty())
10839 mds->queue_waiters(finished);
9f95a23c 10840 DECODE_FINISH(p);
7c673cae
FG
10841}
10842
10843
10844int MDCache::send_dir_updates(CDir *dir, bool bcast)
10845{
10846 // this is an FYI, re: replication
10847
10848 set<mds_rank_t> who;
10849 if (bcast) {
f67539c2
TL
10850 set<mds_rank_t> mds_set;
10851 mds->get_mds_map()->get_active_mds_set(mds_set);
10852
10853 set<mds_rank_t> replica_set;
10854 for (const auto &p : dir->get_replicas()) {
10855 replica_set.insert(p.first);
10856 }
10857
10858 std::set_difference(mds_set.begin(), mds_set.end(),
10859 replica_set.begin(), replica_set.end(),
10860 std::inserter(who, who.end()));
7c673cae 10861 } else {
181888fb
FG
10862 for (const auto &p : dir->get_replicas()) {
10863 who.insert(p.first);
10864 }
7c673cae
FG
10865 }
10866
10867 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10868
10869 filepath path;
10870 dir->inode->make_path(path);
10871
f67539c2
TL
10872 std::set<int32_t> dir_rep_set;
10873 for (const auto &r : dir->dir_rep_by) {
10874 dir_rep_set.insert(r);
10875 }
10876
7c673cae
FG
10877 mds_rank_t whoami = mds->get_nodeid();
10878 for (set<mds_rank_t>::iterator it = who.begin();
10879 it != who.end();
10880 ++it) {
10881 if (*it == whoami) continue;
10882 //if (*it == except) continue;
10883 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10884
f67539c2 10885 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, dir_rep_set, path, bcast), *it);
7c673cae
FG
10886 }
10887
10888 return 0;
10889}
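// Note on the bcast path above: with bcast set, the update goes to every
// active mds that does not already hold a replica of the dir (the
// set_difference); otherwise only the existing replicas are notified. The
// receiving side may then choose to discover the dir (see handle_dir_update).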
10890
9f95a23c 10891void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
7c673cae 10892{
224ce89b
WB
10893 dirfrag_t df = m->get_dirfrag();
10894 CDir *dir = get_dirfrag(df);
7c673cae 10895 if (!dir) {
224ce89b 10896 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10897
10898 // discover it?
10899 if (m->should_discover()) {
10900 // only try once!
10901 // this is key to avoid a fragtree update race, among other things.
224ce89b 10902 m->inc_tried_discover();
7c673cae
FG
10903 vector<CDentry*> trace;
10904 CInode *in;
10905 filepath path = m->get_path();
10906 dout(5) << "trying discover on dir_update for " << path << dendl;
11fdf7f2 10907 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 10908 MDRequestRef null_ref;
9f95a23c 10909 int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
7c673cae
FG
10910 if (r > 0)
10911 return;
224ce89b
WB
10912 if (r == 0 &&
10913 in->ino() == df.ino &&
10914 in->get_approx_dirfrag(df.frag) == NULL) {
10915 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10916 return;
10917 }
7c673cae
FG
10918 }
10919
7c673cae
FG
10920 return;
10921 }
10922
224ce89b
WB
10923 if (!m->has_tried_discover()) {
10924 // Update if it already exists. Otherwise it got updated by discover reply.
10925 dout(5) << "dir_update on " << *dir << dendl;
10926 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10927 dir->dir_rep_by.clear();
10928 for (const auto &e : m->get_dir_rep_by()) {
10929 dir->dir_rep_by.insert(e);
10930 }
224ce89b 10931 }
7c673cae
FG
10932}
10933
10934
10935
10936
10937
10938// LINK
10939
9f95a23c
TL
10940void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10941{
10942 ENCODE_START(1, 1, bl);
10943 inodeno_t ino = dnl->get_remote_ino();
10944 encode(ino, bl);
10945 __u8 d_type = dnl->get_remote_d_type();
10946 encode(d_type, bl);
10947 ENCODE_FINISH(bl);
10948}
10949
10950void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10951{
10952 DECODE_START(1, p);
10953 inodeno_t ino;
10954 __u8 d_type;
10955 decode(ino, p);
10956 decode(d_type, p);
10957 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
10958 dir->link_remote_inode(dn, ino, d_type);
10959 DECODE_FINISH(p);
10960}
10961
7c673cae
FG
10962void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10963{
9f95a23c 10964 dout(7) << __func__ << " " << *dn << dendl;
7c673cae
FG
10965
10966 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10967 for (const auto &p : dn->get_replicas()) {
7c673cae 10968 // don't tell (rename) witnesses; they already know
181888fb 10969 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10970 continue;
181888fb
FG
10971 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10972 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10973 rejoin_gather.count(p.first)))
7c673cae
FG
10974 continue;
10975 CDentry::linkage_t *dnl = dn->get_linkage();
9f95a23c 10976 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
7c673cae 10977 if (dnl->is_primary()) {
9f95a23c
TL
10978 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
10979 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10980 mds->mdsmap->get_up_features());
10981 } else if (dnl->is_remote()) {
9f95a23c 10982 encode_remote_dentry_link(dnl, m->bl);
7c673cae
FG
10983 } else
10984 ceph_abort(); // aie, bad caller!
181888fb 10985 mds->send_message_mds(m, p.first);
7c673cae
FG
10986 }
10987}
10988
9f95a23c 10989void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
7c673cae 10990{
7c673cae
FG
10991 CDentry *dn = NULL;
10992 CDir *dir = get_dirfrag(m->get_dirfrag());
10993 if (!dir) {
9f95a23c 10994 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
7c673cae
FG
10995 } else {
10996 dn = dir->lookup(m->get_dn());
10997 if (!dn) {
9f95a23c 10998 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
7c673cae 10999 } else {
9f95a23c 11000 dout(7) << __func__ << " on " << *dn << dendl;
7c673cae
FG
11001 CDentry::linkage_t *dnl = dn->get_linkage();
11002
11fdf7f2
TL
11003 ceph_assert(!dn->is_auth());
11004 ceph_assert(dnl->is_null());
7c673cae
FG
11005 }
11006 }
11007
11fdf7f2
TL
11008 auto p = m->bl.cbegin();
11009 MDSContext::vec finished;
7c673cae
FG
11010 if (dn) {
11011 if (m->get_is_primary()) {
11012 // primary link.
9f95a23c
TL
11013 CInode *in = nullptr;
11014 decode_replica_inode(in, p, dn, finished);
7c673cae
FG
11015 } else {
11016 // remote link, easy enough.
9f95a23c 11017 decode_remote_dentry_link(dir, dn, p);
7c673cae
FG
11018 }
11019 } else {
11020 ceph_abort();
11021 }
11022
11023 if (!finished.empty())
11024 mds->queue_waiters(finished);
11025
7c673cae
FG
11026 return;
11027}
11028
11029
11030// UNLINK
11031
11032void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11033{
9f95a23c 11034 dout(10) << __func__ << " " << *dn << dendl;
7c673cae
FG
11035 // share unlink news with replicas
11036 set<mds_rank_t> replicas;
11037 dn->list_replicas(replicas);
11fdf7f2
TL
11038 bufferlist snapbl;
11039 if (straydn) {
7c673cae 11040 straydn->list_replicas(replicas);
11fdf7f2
TL
11041 CInode *strayin = straydn->get_linkage()->get_inode();
11042 strayin->encode_snap_blob(snapbl);
11043 }
7c673cae
FG
11044 for (set<mds_rank_t>::iterator it = replicas.begin();
11045 it != replicas.end();
11046 ++it) {
11047 // don't tell (rmdir) witnesses; they already know
11048 if (mdr.get() && mdr->more()->witnessed.count(*it))
11049 continue;
11050
11051 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11052 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11053 rejoin_gather.count(*it)))
11054 continue;
11055
9f95a23c 11056 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11fdf7f2 11057 if (straydn) {
9f95a23c 11058 encode_replica_stray(straydn, *it, unlink->straybl);
11fdf7f2
TL
11059 unlink->snapbl = snapbl;
11060 }
7c673cae
FG
11061 mds->send_message_mds(unlink, *it);
11062 }
11063}
11064
9f95a23c 11065void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
7c673cae
FG
11066{
11067 // straydn
9f95a23c 11068 CDentry *straydn = nullptr;
33c7a0ef 11069 CInode *strayin = nullptr;
7c673cae 11070 if (m->straybl.length())
33c7a0ef 11071 decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num()));
7c673cae
FG
11072
11073 CDir *dir = get_dirfrag(m->get_dirfrag());
11074 if (!dir) {
9f95a23c 11075 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
7c673cae
FG
11076 } else {
11077 CDentry *dn = dir->lookup(m->get_dn());
11078 if (!dn) {
9f95a23c 11079 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
7c673cae 11080 } else {
9f95a23c 11081 dout(7) << __func__ << " on " << *dn << dendl;
7c673cae
FG
11082 CDentry::linkage_t *dnl = dn->get_linkage();
11083
11084 // open inode?
11085 if (dnl->is_primary()) {
11086 CInode *in = dnl->get_inode();
11087 dn->dir->unlink_inode(dn);
11fdf7f2 11088 ceph_assert(straydn);
7c673cae
FG
11089 straydn->dir->link_primary_inode(straydn, in);
11090
11091 // in->first is lazily updated on replica; drag it forward so
11092 // that we always keep it in sync with the dnq
11fdf7f2 11093 ceph_assert(straydn->first >= in->first);
7c673cae
FG
11094 in->first = straydn->first;
11095
11096 // update subtree map?
11097 if (in->is_dir())
11098 adjust_subtree_after_rename(in, dir, false);
11099
11fdf7f2
TL
11100 if (m->snapbl.length()) {
11101 bool hadrealm = (in->snaprealm ? true : false);
11102 in->decode_snap_blob(m->snapbl);
11103 ceph_assert(in->snaprealm);
11fdf7f2
TL
11104 if (!hadrealm)
11105 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
11106 }
11107
7c673cae
FG
11108 // send caps to auth (if we're not already)
11109 if (in->is_any_caps() &&
11110 !in->state_test(CInode::STATE_EXPORTINGCAPS))
11111 migrator->export_caps(in);
11112
7c673cae
FG
11113 straydn = NULL;
11114 } else {
11fdf7f2
TL
11115 ceph_assert(!straydn);
11116 ceph_assert(dnl->is_remote());
7c673cae
FG
11117 dn->dir->unlink_inode(dn);
11118 }
11fdf7f2 11119 ceph_assert(dnl->is_null());
7c673cae
FG
11120 }
11121 }
11122
11123 // race with trim_dentry()
11124 if (straydn) {
11fdf7f2
TL
11125 ceph_assert(straydn->get_num_ref() == 0);
11126 ceph_assert(straydn->get_linkage()->is_null());
11127 expiremap ex;
11128 trim_dentry(straydn, ex);
11129 send_expire_messages(ex);
7c673cae 11130 }
7c673cae
FG
11131}
11132
11133
11134
11135
11136
11137
11138// ===================================================================
11139
11140
11141
11142// ===================================================================
11143// FRAGMENT
11144
11145
11146/**
11147 * adjust_dir_fragments -- adjust fragmentation for a directory
11148 *
11149 * @param diri directory inode
11150 * @param basefrag base fragment
11151 * @param bits bit adjustment. positive for split, negative for merge.
11152 */
11153void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
9f95a23c 11154 std::vector<CDir*>* resultfrags,
11fdf7f2 11155 MDSContext::vec& waiters,
7c673cae
FG
11156 bool replay)
11157{
11158 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11159 << " on " << *diri << dendl;
11160
9f95a23c 11161 auto&& p = diri->get_dirfrags_under(basefrag);
7c673cae 11162
9f95a23c 11163 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
7c673cae
FG
11164}
11165
11166CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
11167{
11168 CDir *dir = diri->get_dirfrag(fg);
11169 if (dir)
11170 return dir;
11171
11172 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
11173
9f95a23c 11174 std::vector<CDir*> src, result;
11fdf7f2 11175 MDSContext::vec waiters;
7c673cae
FG
11176
11177 // split a parent?
11178 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
11179 while (1) {
11180 CDir *pdir = diri->get_dirfrag(parent);
11181 if (pdir) {
11182 int split = fg.bits() - parent.bits();
11183 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
11184 src.push_back(pdir);
9f95a23c 11185 adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
7c673cae
FG
11186 dir = diri->get_dirfrag(fg);
11187 if (dir) {
11188 dout(10) << "force_dir_fragment result " << *dir << dendl;
11189 break;
11190 }
11191 }
11192 if (parent == frag_t())
11193 break;
11194 frag_t last = parent;
11195 parent = parent.parent();
11196 dout(10) << " " << last << " parent is " << parent << dendl;
11197 }
11198
11199 if (!dir) {
11200 // hoover up things under fg?
9f95a23c
TL
11201 {
11202 auto&& p = diri->get_dirfrags_under(fg);
11203 src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
11204 }
7c673cae
FG
11205 if (src.empty()) {
11206 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
11207 } else {
11208 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
9f95a23c 11209 adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
7c673cae
FG
11210 dir = result.front();
11211 dout(10) << "force_dir_fragment result " << *dir << dendl;
11212 }
11213 }
11214 if (!replay)
11215 mds->queue_waiters(waiters);
11216 return dir;
11217}
11218
11219void MDCache::adjust_dir_fragments(CInode *diri,
9f95a23c 11220 const std::vector<CDir*>& srcfrags,
7c673cae 11221 frag_t basefrag, int bits,
9f95a23c 11222 std::vector<CDir*>* resultfrags,
11fdf7f2 11223 MDSContext::vec& waiters,
7c673cae
FG
11224 bool replay)
11225{
11226 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
11227 << " srcfrags " << srcfrags
11228 << " on " << *diri << dendl;
11229
11230 // adjust fragtree
11231 // yuck. we may have discovered the inode while it was being fragmented.
11232 if (!diri->dirfragtree.is_leaf(basefrag))
11233 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
11234
11235 if (bits > 0)
11236 diri->dirfragtree.split(basefrag, bits);
11237 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
11238
11239 if (srcfrags.empty())
11240 return;
11241
11242 // split
11243 CDir *parent_dir = diri->get_parent_dir();
11244 CDir *parent_subtree = 0;
11245 if (parent_dir)
11246 parent_subtree = get_subtree_root(parent_dir);
11247
9f95a23c 11248 ceph_assert(srcfrags.size() >= 1);
7c673cae
FG
11249 if (bits > 0) {
11250 // SPLIT
11fdf7f2 11251 ceph_assert(srcfrags.size() == 1);
7c673cae
FG
11252 CDir *dir = srcfrags.front();
11253
11254 dir->split(bits, resultfrags, waiters, replay);
11255
11256 // did i change the subtree map?
11257 if (dir->is_subtree_root()) {
11258 // new frags are now separate subtrees
9f95a23c
TL
11259 for (const auto& dir : *resultfrags) {
11260 subtrees[dir].clear(); // new frag is now its own subtree
11261 }
7c673cae
FG
11262
11263 // was i a bound?
11264 if (parent_subtree) {
11fdf7f2 11265 ceph_assert(subtrees[parent_subtree].count(dir));
7c673cae 11266 subtrees[parent_subtree].erase(dir);
9f95a23c
TL
11267 for (const auto& dir : *resultfrags) {
11268 ceph_assert(dir->is_subtree_root());
11269 subtrees[parent_subtree].insert(dir);
7c673cae
FG
11270 }
11271 }
11272
11273 // adjust my bounds.
11274 set<CDir*> bounds;
11275 bounds.swap(subtrees[dir]);
11276 subtrees.erase(dir);
11277 for (set<CDir*>::iterator p = bounds.begin();
11278 p != bounds.end();
11279 ++p) {
11280 CDir *frag = get_subtree_root((*p)->get_parent_dir());
11281 subtrees[frag].insert(*p);
11282 }
11283
11284 show_subtrees(10);
7c673cae
FG
11285 }
11286
11287 diri->close_dirfrag(dir->get_frag());
11288
11289 } else {
11290 // MERGE
11291
11292 // are my constituent bits subtrees? if so, i will be too.
11293 // (it's all or none, actually.)
11fdf7f2 11294 bool any_subtree = false, any_non_subtree = false;
9f95a23c 11295 for (const auto& dir : srcfrags) {
11fdf7f2 11296 if (dir->is_subtree_root())
31f18b77 11297 any_subtree = true;
11fdf7f2
TL
11298 else
11299 any_non_subtree = true;
31f18b77 11300 }
11fdf7f2
TL
11301 ceph_assert(!any_subtree || !any_non_subtree);
11302
31f18b77
FG
11303 set<CDir*> new_bounds;
11304 if (any_subtree) {
9f95a23c 11305 for (const auto& dir : srcfrags) {
31f18b77
FG
11306 // this simplifies the code that finds subtrees underneath the dirfrag
11307 if (!dir->is_subtree_root()) {
11308 dir->state_set(CDir::STATE_AUXSUBTREE);
11309 adjust_subtree_auth(dir, mds->get_nodeid());
11310 }
11311 }
11312
9f95a23c 11313 for (const auto& dir : srcfrags) {
11fdf7f2 11314 ceph_assert(dir->is_subtree_root());
7c673cae 11315 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
11316 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
11317 set<CDir*>::iterator r = q->second.begin();
11318 while (r != subtrees[dir].end()) {
11319 new_bounds.insert(*r);
11320 subtrees[dir].erase(r++);
11321 }
11322 subtrees.erase(q);
31f18b77 11323
7c673cae
FG
11324 // remove myself as my parent's bound
11325 if (parent_subtree)
11326 subtrees[parent_subtree].erase(dir);
11327 }
11328 }
11329
11330 // merge
11331 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
11332 f->merge(srcfrags, waiters, replay);
7c673cae 11333
31f18b77 11334 if (any_subtree) {
11fdf7f2 11335 ceph_assert(f->is_subtree_root());
7c673cae
FG
11336 subtrees[f].swap(new_bounds);
11337 if (parent_subtree)
11338 subtrees[parent_subtree].insert(f);
11339
11340 show_subtrees(10);
11341 }
11342
9f95a23c 11343 resultfrags->push_back(f);
7c673cae
FG
11344 }
11345}
11346
11347
11348class C_MDC_FragmentFrozen : public MDSInternalContext {
11349 MDCache *mdcache;
11350 MDRequestRef mdr;
11351public:
11352 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11353 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11354 void finish(int r) override {
11355 mdcache->fragment_frozen(mdr, r);
11356 }
11357};
11358
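/**
 * can_fragment -- common sanity checks before starting a split or merge.
 * Refuses while the fs is read-only or the cluster is degraded, for stray
 * directories, the mdsdir and .ceph, and for any dirfrag that is being
 * scrubbed, already fragmenting, non-auth, bad, or freezing/frozen.
 */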
9f95a23c 11359bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
7c673cae
FG
11360{
11361 if (is_readonly()) {
11362 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11363 return false;
11364 }
11365 if (mds->is_cluster_degraded()) {
11366 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11367 return false;
11368 }
11369 if (diri->get_parent_dir() &&
11370 diri->get_parent_dir()->get_inode()->is_stray()) {
11371 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11372 return false;
11373 }
b3b6e05e 11374 if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
f67539c2 11375 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
7c673cae
FG
11376 return false;
11377 }
11378
9f95a23c 11379 for (const auto& dir : dirs) {
f67539c2
TL
11380 if (dir->scrub_is_in_progress()) {
11381 dout(7) << "can_fragment: scrub in progress " << *dir << dendl;
11382 return false;
11383 }
11384
7c673cae
FG
11385 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11386 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11387 return false;
11388 }
11389 if (!dir->is_auth()) {
11390 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11391 return false;
11392 }
11393 if (dir->is_bad()) {
11394 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11395 return false;
11396 }
11397 if (dir->is_frozen() ||
11398 dir->is_freezing()) {
11399 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11400 return false;
11401 }
11402 }
11403
11404 return true;
11405}
11406
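// split_dir -- start an asynchronous split of a single dirfrag by 'bits'.
// Registers a fragment_info_t keyed by the base dirfrag, freezes the dir,
// then kicks off the mark+complete pass. Splits that would exceed 24 frag
// bits are dropped.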
11407void MDCache::split_dir(CDir *dir, int bits)
11408{
11409 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11fdf7f2 11410 ceph_assert(dir->is_auth());
7c673cae
FG
11411 CInode *diri = dir->inode;
11412
9f95a23c 11413 std::vector<CDir*> dirs;
7c673cae
FG
11414 dirs.push_back(dir);
11415
11416 if (!can_fragment(diri, dirs)) {
11417 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11418 return;
11419 }
11420
31f18b77
FG
11421 if (dir->frag.bits() + bits > 24) {
11422 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11423 return;
11424 }
11425
7c673cae
FG
11426 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11427 mdr->more()->fragment_base = dir->dirfrag();
11428
11fdf7f2 11429 ceph_assert(fragments.count(dir->dirfrag()) == 0);
7c673cae
FG
11430 fragment_info_t& info = fragments[dir->dirfrag()];
11431 info.mdr = mdr;
11432 info.dirs.push_back(dir);
11433 info.bits = bits;
11434 info.last_cum_auth_pins_change = ceph_clock_now();
11435
11436 fragment_freeze_dirs(dirs);
11437 // initial mark+complete pass
11438 fragment_mark_and_complete(mdr);
11439}
11440
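// merge_dir -- start an asynchronous merge of all dirfrags under 'frag'
// back into a single dirfrag. Requires every constituent frag to be in
// cache and to pass can_fragment(); otherwise the request is dropped.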
11441void MDCache::merge_dir(CInode *diri, frag_t frag)
11442{
11443 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11444
9f95a23c
TL
11445 auto&& [all, dirs] = diri->get_dirfrags_under(frag);
11446 if (!all) {
7c673cae
FG
11447 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11448 return;
11449 }
11450
11451 if (diri->dirfragtree.is_leaf(frag)) {
11452 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11453 return;
11454 }
11455
11456 if (!can_fragment(diri, dirs))
11457 return;
11458
11459 CDir *first = dirs.front();
11460 int bits = first->get_frag().bits() - frag.bits();
1911f103 11461 dout(10) << " we are merging by " << bits << " bits" << dendl;
7c673cae
FG
11462
11463 dirfrag_t basedirfrag(diri->ino(), frag);
11464 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11465 mdr->more()->fragment_base = basedirfrag;
11466
11fdf7f2 11467 ceph_assert(fragments.count(basedirfrag) == 0);
7c673cae
FG
11468 fragment_info_t& info = fragments[basedirfrag];
11469 info.mdr = mdr;
11470 info.dirs = dirs;
11471 info.bits = -bits;
11472 info.last_cum_auth_pins_change = ceph_clock_now();
11473
11474 fragment_freeze_dirs(dirs);
11475 // initial mark+complete pass
11476 fragment_mark_and_complete(mdr);
11477}
11478
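// fragment_freeze_dirs -- auth_pin, mark FRAGMENTING and start freezing each
// dirfrag. If the set mixes subtree roots and non-roots, the non-roots are
// promoted to auxiliary subtrees so all frags can be treated uniformly.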
9f95a23c 11479void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
7c673cae 11480{
11fdf7f2 11481 bool any_subtree = false, any_non_subtree = false;
9f95a23c 11482 for (const auto& dir : dirs) {
7c673cae
FG
11483 dir->auth_pin(dir); // until we mark and complete them
11484 dir->state_set(CDir::STATE_FRAGMENTING);
11485 dir->freeze_dir();
11fdf7f2
TL
11486 ceph_assert(dir->is_freezing_dir());
11487
11488 if (dir->is_subtree_root())
11489 any_subtree = true;
11490 else
11491 any_non_subtree = true;
11492 }
11493
11494 if (any_subtree && any_non_subtree) {
11495 // either all dirfrags are subtree roots or all are not.
9f95a23c 11496 for (const auto& dir : dirs) {
11fdf7f2
TL
11497 if (dir->is_subtree_root()) {
11498 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11499 } else {
11500 dir->state_set(CDir::STATE_AUXSUBTREE);
11501 adjust_subtree_auth(dir, mds->get_nodeid());
11502 }
11503 }
7c673cae
FG
11504 }
11505}
11506
11507class C_MDC_FragmentMarking : public MDCacheContext {
11508 MDRequestRef mdr;
11509public:
11510 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11511 void finish(int r) override {
11512 mdcache->fragment_mark_and_complete(mdr);
11513 }
11514};
11515
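// fragment_mark_and_complete -- fetch or commit each dirfrag until it is
// complete, then pin every dentry with PIN_FRAGMENTING. Once all dirs are
// marked and frozen, control passes to fragment_frozen().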
11516void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11517{
11518 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11519 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11520 if (it == fragments.end() || it->second.mdr != mdr) {
11521 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11522 request_finish(mdr);
11523 return;
11524 }
11525
11526 fragment_info_t& info = it->second;
11527 CInode *diri = info.dirs.front()->get_inode();
11528 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11529
11530 MDSGatherBuilder gather(g_ceph_context);
11531
9f95a23c 11532 for (const auto& dir : info.dirs) {
7c673cae
FG
11533 bool ready = true;
11534 if (!dir->is_complete()) {
11535 dout(15) << " fetching incomplete " << *dir << dendl;
11536 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11537 ready = false;
11538 } else if (dir->get_frag() == frag_t()) {
11539 // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
11540 // the operation. To avoid CDir::fetch() complaining about a missing object,
11541 // we commit the new dirfrag first.
11542 if (dir->state_test(CDir::STATE_CREATING)) {
11543 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11544 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11545 ready = false;
11546 } else if (dir->is_new()) {
11547 dout(15) << " committing new " << *dir << dendl;
11fdf7f2 11548 ceph_assert(dir->is_dirty());
7c673cae
FG
11549 dir->commit(0, gather.new_sub(), true);
11550 ready = false;
11551 }
11552 }
11553 if (!ready)
11554 continue;
11555
11556 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11557 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11558 for (auto &p : dir->items) {
11559 CDentry *dn = p.second;
7c673cae 11560 dn->get(CDentry::PIN_FRAGMENTING);
11fdf7f2 11561 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11562 dn->state_set(CDentry::STATE_FRAGMENTING);
11563 }
11564 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11565 dir->auth_unpin(dir);
11566 } else {
11567 dout(15) << " already marked " << *dir << dendl;
11568 }
11569 }
11570 if (gather.has_subs()) {
11571 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11572 gather.activate();
11573 return;
11574 }
11575
9f95a23c 11576 for (const auto& dir : info.dirs) {
7c673cae 11577 if (!dir->is_frozen_dir()) {
11fdf7f2 11578 ceph_assert(dir->is_freezing_dir());
7c673cae
FG
11579 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11580 }
11581 }
11582 if (gather.has_subs()) {
11583 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11584 gather.activate();
11585 // flush log so that request auth_pins are retired
11586 mds->mdlog->flush();
11587 return;
11588 }
11589
11590 fragment_frozen(mdr, 0);
11591}
11592
9f95a23c 11593void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
7c673cae
FG
11594{
11595 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
9f95a23c 11596 for (const auto& dir : dirs) {
7c673cae
FG
11597 dout(10) << " frag " << *dir << dendl;
11598
11fdf7f2 11599 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
7c673cae
FG
11600 dir->state_clear(CDir::STATE_FRAGMENTING);
11601
11602 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11603 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11604
94b18763
FG
11605 for (auto &p : dir->items) {
11606 CDentry *dn = p.second;
11fdf7f2 11607 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11608 dn->state_clear(CDentry::STATE_FRAGMENTING);
11609 dn->put(CDentry::PIN_FRAGMENTING);
11610 }
11611 } else {
11612 dir->auth_unpin(dir);
11613 }
11614
11615 dir->unfreeze_dir();
11616 }
11617}
11618
11619bool MDCache::fragment_are_all_frozen(CDir *dir)
11620{
11fdf7f2 11621 ceph_assert(dir->is_frozen_dir());
7c673cae
FG
11622 map<dirfrag_t,fragment_info_t>::iterator p;
11623 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11624 p != fragments.end() && p->first.ino == dir->ino();
11625 ++p) {
11626 if (p->first.frag.contains(dir->get_frag()))
11627 return p->second.all_frozen;
11628 }
11629 ceph_abort();
11630 return false;
11631}
11632
11633void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11634{
11635 map<dirfrag_t,fragment_info_t>::iterator p;
11636 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11637 p != fragments.end() && p->first.ino == dir->ino();
11638 ++p) {
11639 if (p->first.frag.contains(dir->get_frag())) {
11640 p->second.num_remote_waiters++;
11641 return;
11642 }
11643 }
11644 ceph_abort();
11645}
11646
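// find_stale_fragment_freeze -- cancel fragment operations whose freeze has
// stalled (no auth-pin progress for mds_freeze_tree_timeout) while remote
// waiters or a freezing parent are being blocked by it.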
11647void MDCache::find_stale_fragment_freeze()
11648{
11649 dout(10) << "find_stale_fragment_freeze" << dendl;
11650 // see comment in Migrator::find_stale_export_freeze()
11651 utime_t now = ceph_clock_now();
11652 utime_t cutoff = now;
11fdf7f2 11653 cutoff -= g_conf()->mds_freeze_tree_timeout;
7c673cae
FG
11654
11655 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11656 p != fragments.end(); ) {
11657 dirfrag_t df = p->first;
11658 fragment_info_t& info = p->second;
11659 ++p;
11660 if (info.all_frozen)
11661 continue;
11662 CDir *dir;
11663 int total_auth_pins = 0;
9f95a23c
TL
11664 for (const auto& d : info.dirs) {
11665 dir = d;
7c673cae
FG
11666 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11667 total_auth_pins = -1;
11668 break;
11669 }
11670 if (dir->is_frozen_dir())
11671 continue;
11672 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11673 }
11674 if (total_auth_pins < 0)
11675 continue;
11676 if (info.last_cum_auth_pins != total_auth_pins) {
11677 info.last_cum_auth_pins = total_auth_pins;
11678 info.last_cum_auth_pins_change = now;
11679 continue;
11680 }
11681 if (info.last_cum_auth_pins_change >= cutoff)
11682 continue;
11683 dir = info.dirs.front();
11684 if (info.num_remote_waiters > 0 ||
11685 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11686 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
9f95a23c 11687 std::vector<CDir*> dirs;
7c673cae
FG
11688 info.dirs.swap(dirs);
11689 fragments.erase(df);
11690 fragment_unmark_unfreeze_dirs(dirs);
11691 }
11692 }
11693}
11694
11695class C_MDC_FragmentPrep : public MDCacheLogContext {
11696 MDRequestRef mdr;
11697public:
11698 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11699 void finish(int r) override {
11700 mdcache->_fragment_logged(mdr);
11701 }
11702};
11703
11704class C_MDC_FragmentStore : public MDCacheContext {
11705 MDRequestRef mdr;
11706public:
11707 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11708 void finish(int r) override {
11709 mdcache->_fragment_stored(mdr);
11710 }
11711};
11712
11713class C_MDC_FragmentCommit : public MDCacheLogContext {
11714 dirfrag_t basedirfrag;
a8e16298 11715 MDRequestRef mdr;
7c673cae 11716public:
a8e16298
TL
11717 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11718 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
7c673cae 11719 void finish(int r) override {
a8e16298 11720 mdcache->_fragment_committed(basedirfrag, mdr);
7c673cae
FG
11721 }
11722};
11723
a8e16298 11724class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
7c673cae 11725 dirfrag_t basedirfrag;
a8e16298
TL
11726 int bits;
11727 MDRequestRef mdr;
7c673cae 11728public:
a8e16298
TL
11729 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11730 const MDRequestRef& r) :
11731 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
7c673cae 11732 void finish(int r) override {
f67539c2 11733 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
a8e16298 11734 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
7c673cae 11735 }
91327a77 11736 void print(ostream& out) const override {
a8e16298 11737 out << "fragment_purge_old(" << basedirfrag << ")";
91327a77 11738 }
7c673cae
FG
11739};
11740
11741void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11742{
11743 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11744 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11745 if (it == fragments.end() || it->second.mdr != mdr) {
11746 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11747 request_finish(mdr);
11748 return;
11749 }
11750
11fdf7f2 11751 ceph_assert(r == 0);
7c673cae
FG
11752 fragment_info_t& info = it->second;
11753 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11754 << " on " << info.dirs.front()->get_inode() << dendl;
11755
11756 info.all_frozen = true;
11757 dispatch_fragment_dir(mdr);
11758}
11759
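// dispatch_fragment_dir -- once all dirfrags are frozen, take the
// dirfragtreelock (plus nestlock/filelock scatter-gathers), journal an
// EFragment PREPARE entry, and perform the in-memory refragmentation.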
11760void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11761{
11762 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11763 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11764 if (it == fragments.end() || it->second.mdr != mdr) {
11765 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11766 request_finish(mdr);
11767 return;
11768 }
11769
11770 fragment_info_t& info = it->second;
11771 CInode *diri = info.dirs.front()->get_inode();
11772
11773 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11774 << " on " << *diri << dendl;
9f95a23c 11775
f67539c2 11776 if (mdr->more()->peer_error)
9f95a23c
TL
11777 mdr->aborted = true;
11778
7c673cae 11779 if (!mdr->aborted) {
11fdf7f2
TL
11780 MutationImpl::LockOpVec lov;
11781 lov.add_wrlock(&diri->dirfragtreelock);
7c673cae 11782 // prevent a racing gather on any other scatterlocks too
9f95a23c
TL
11783 lov.lock_scatter_gather(&diri->nestlock);
11784 lov.lock_scatter_gather(&diri->filelock);
11785 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
7c673cae
FG
11786 if (!mdr->aborted)
11787 return;
9f95a23c 11788 }
7c673cae
FG
11789 }
11790
11791 if (mdr->aborted) {
11792 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11793 << info.dirs.front()->dirfrag() << dendl;
11794 if (info.bits > 0)
11795 mds->balancer->queue_split(info.dirs.front(), false);
11796 else
11797 mds->balancer->queue_merge(info.dirs.front());
11798 fragment_unmark_unfreeze_dirs(info.dirs);
11799 fragments.erase(it);
11800 request_finish(mdr);
11801 return;
11802 }
11803
11804 mdr->ls = mds->mdlog->get_current_segment();
11805 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11806 mds->mdlog->start_entry(le);
11807
9f95a23c 11808 for (const auto& dir : info.dirs) {
7c673cae
FG
11809 dirfrag_rollback rollback;
11810 rollback.fnode = dir->fnode;
11811 le->add_orig_frag(dir->get_frag(), &rollback);
11812 }
11813
11814 // refragment
11fdf7f2 11815 MDSContext::vec waiters;
7c673cae 11816 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
9f95a23c 11817 &info.resultfrags, waiters, false);
11fdf7f2 11818 if (g_conf()->mds_debug_frag)
7c673cae
FG
11819 diri->verify_dirfrags();
11820 mds->queue_waiters(waiters);
11821
11fdf7f2
TL
11822 for (const auto& fg : le->orig_frags)
11823 ceph_assert(!diri->dirfragtree.is_leaf(fg));
7c673cae 11824
9f95a23c
TL
11825 le->metablob.add_dir_context(info.resultfrags.front());
11826 for (const auto& dir : info.resultfrags) {
7c673cae 11827 if (diri->is_auth()) {
9f95a23c 11828 le->metablob.add_fragmented_dir(dir, false, false);
7c673cae 11829 } else {
9f95a23c
TL
11830 dir->state_set(CDir::STATE_DIRTYDFT);
11831 le->metablob.add_fragmented_dir(dir, false, true);
7c673cae
FG
11832 }
11833 }
11834
11835 // dft lock
11836 if (diri->is_auth()) {
11837 // journal dirfragtree
f67539c2
TL
11838 auto pi = diri->project_inode(mdr);
11839 pi.inode->version = diri->pre_dirty();
11840 predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
7c673cae
FG
11841 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11842 } else {
11843 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11844 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11845 mdr->add_updated_lock(&diri->dirfragtreelock);
11846 }
11847
11848 /*
11849 // filelock
11850 mds->locker->mark_updated_scatterlock(&diri->filelock);
11851 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11852 mut->add_updated_lock(&diri->filelock);
11853
11854 // dirlock
11855 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11856 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11857 mut->add_updated_lock(&diri->nestlock);
11858 */
11859
11860 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11861 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11862 mdr, __func__);
11863 mds->mdlog->flush();
11864}
11865
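// _fragment_logged -- the EFragment PREPARE entry is journaled; apply the
// mutation and store the resulting dirfrags to the metadata pool.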
11866void MDCache::_fragment_logged(MDRequestRef& mdr)
11867{
11868 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298 11869 auto& info = fragments.at(basedirfrag);
7c673cae
FG
11870 CInode *diri = info.resultfrags.front()->get_inode();
11871
11872 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11873 << " on " << *diri << dendl;
a8e16298 11874 mdr->mark_event("prepare logged");
7c673cae 11875
7c673cae
FG
11876 mdr->apply(); // mark scatterlock
11877
11878 // store resulting frags
11879 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11880
9f95a23c 11881 for (const auto& dir : info.resultfrags) {
7c673cae
FG
11882 dout(10) << " storing result frag " << *dir << dendl;
11883
f67539c2 11884 dir->mark_dirty(mdr->ls);
f91f0fd5
TL
11885 dir->mark_new(mdr->ls);
11886
7c673cae
FG
11887 // freeze and store them too
11888 dir->auth_pin(this);
11889 dir->state_set(CDir::STATE_FRAGMENTING);
11890 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11891 }
11892
11893 gather.activate();
11894}
11895
11896void MDCache::_fragment_stored(MDRequestRef& mdr)
11897{
11898 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298
TL
11899 fragment_info_t &info = fragments.at(basedirfrag);
11900 CDir *first = info.resultfrags.front();
11901 CInode *diri = first->get_inode();
7c673cae
FG
11902
11903 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11904 << " on " << *diri << dendl;
a8e16298 11905 mdr->mark_event("new frags stored");
7c673cae
FG
11906
11907 // tell peers
a8e16298
TL
11908 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11909 diri->authority().first : CDIR_AUTH_UNKNOWN;
181888fb
FG
11910 for (const auto &p : first->get_replicas()) {
11911 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11912 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11913 rejoin_gather.count(p.first)))
7c673cae
FG
11914 continue;
11915
9f95a23c 11916 auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
a8e16298
TL
11917 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11918 diri_auth != p.first) { // not auth mds of diri
11919 /*
11920 * In the normal case, an mds does not trim a dir inode whose child dirfrags
11921 * are likely being fragmented (see trim_inode()). But when fragmenting
11922 * subtree roots, the following race can happen:
11923 *
11924 * - mds.a (auth mds of the dirfrag) sends the fragment_notify message to
11925 * mds.c and drops its wrlock on the dirfragtreelock.
11926 * - mds.b (auth mds of the dir inode) changes the dirfragtreelock state to
11927 * SYNC and sends a lock message to mds.c.
11928 * - mds.c receives the lock message and changes its dirfragtreelock state
11929 * to SYNC.
11930 * - mds.c trims the dirfrag and dir inode from its cache.
11931 * - mds.c receives the fragment_notify message.
11932 *
11933 * So we need to ensure replicas have received the notify before we unlock
11934 * the dirfragtreelock.
11935 */
11936 notify->mark_ack_wanted();
11937 info.notify_ack_waiting.insert(p.first);
11938 }
7c673cae
FG
11939
11940 // freshly replicate new dirs to peers
9f95a23c
TL
11941 for (const auto& dir : info.resultfrags) {
11942 encode_replica_dir(dir, p.first, notify->basebl);
11943 }
7c673cae 11944
181888fb 11945 mds->send_message_mds(notify, p.first);
7c673cae
FG
11946 }
11947
11948 // journal commit
11949 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
a8e16298 11950 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
7c673cae 11951
7c673cae
FG
11952
11953 // unfreeze resulting frags
9f95a23c 11954 for (const auto& dir : info.resultfrags) {
7c673cae
FG
11955 dout(10) << " result frag " << *dir << dendl;
11956
94b18763
FG
11957 for (auto &p : dir->items) {
11958 CDentry *dn = p.second;
11fdf7f2 11959 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11960 dn->state_clear(CDentry::STATE_FRAGMENTING);
11961 dn->put(CDentry::PIN_FRAGMENTING);
11962 }
11963
11964 // unfreeze
11965 dir->unfreeze_dir();
11966 }
11967
a8e16298
TL
11968 if (info.notify_ack_waiting.empty()) {
11969 fragment_drop_locks(info);
11970 } else {
11971 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
11972 }
7c673cae
FG
11973}
11974
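// _fragment_committed -- the COMMIT entry is journaled; purge the now-orphan
// pre-fragmentation dirfrag objects from the metadata pool.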
a8e16298 11975void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
7c673cae
FG
11976{
11977 dout(10) << "fragment_committed " << basedirfrag << dendl;
a8e16298
TL
11978 if (mdr)
11979 mdr->mark_event("commit logged");
11980
11981 ufragment &uf = uncommitted_fragments.at(basedirfrag);
7c673cae
FG
11982
11983 // remove old frags
11984 C_GatherBuilder gather(
11985 g_ceph_context,
11986 new C_OnFinisher(
a8e16298 11987 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
7c673cae
FG
11988 mds->finisher));
11989
11990 SnapContext nullsnapc;
b3b6e05e 11991 object_locator_t oloc(mds->get_metadata_pool());
11fdf7f2
TL
11992 for (const auto& fg : uf.old_frags) {
11993 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
7c673cae 11994 ObjectOperation op;
11fdf7f2 11995 if (fg == frag_t()) {
7c673cae
FG
11996 // backtrace object
11997 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11998 op.truncate(0);
11999 op.omap_clear();
12000 } else {
12001 dout(10) << " removing orphan dirfrag " << oid << dendl;
12002 op.remove();
12003 }
12004 mds->objecter->mutate(oid, oloc, op, nullsnapc,
12005 ceph::real_clock::now(),
12006 0, gather.new_sub());
12007 }
12008
11fdf7f2 12009 ceph_assert(gather.has_subs());
7c673cae
FG
12010 gather.activate();
12011}
12012
a8e16298 12013void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
7c673cae 12014{
a8e16298
TL
12015 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12016 if (mdr)
12017 mdr->mark_event("old frags purged");
12018
12019 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12020 mds->mdlog->start_submit_entry(le);
12021
12022 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12023
12024 if (mds->logger) {
12025 if (bits > 0) {
12026 mds->logger->inc(l_mds_dir_split);
12027 } else {
12028 mds->logger->inc(l_mds_dir_merge);
12029 }
12030 }
12031
12032 if (mdr) {
12033 auto it = fragments.find(basedirfrag);
12034 ceph_assert(it != fragments.end());
12035 it->second.finishing = true;
12036 if (it->second.notify_ack_waiting.empty())
12037 fragment_maybe_finish(it);
12038 else
12039 mdr->mark_event("waiting for notify acks");
12040 }
12041}
12042
12043void MDCache::fragment_drop_locks(fragment_info_t& info)
12044{
12045 mds->locker->drop_locks(info.mdr.get());
12046 request_finish(info.mdr);
12047 //info.mdr.reset();
12048}
12049
12050void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12051{
12052 if (!it->second.finishing)
12053 return;
7c673cae
FG
12054
12055 // unmark & auth_unpin
a8e16298 12056 for (const auto &dir : it->second.resultfrags) {
7c673cae
FG
12057 dir->state_clear(CDir::STATE_FRAGMENTING);
12058 dir->auth_unpin(this);
12059
12060 // In case the resulting fragments are beyond the split size,
12061 // we might need to split them again right away (they could
12062 // have been taking inserts between unfreezing and getting
12063 // here)
12064 mds->balancer->maybe_fragment(dir, false);
12065 }
12066
a8e16298
TL
12067 fragments.erase(it);
12068}
12069
12070
9f95a23c 12071void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
a8e16298
TL
12072{
12073 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12074 mds_rank_t from = mds_rank_t(ack->get_source().num());
12075
12076 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
a8e16298 12077 return;
7c673cae
FG
12078 }
12079
a8e16298
TL
12080 auto it = fragments.find(ack->get_base_dirfrag());
12081 if (it == fragments.end() ||
12082 it->second.get_tid() != ack->get_tid()) {
12083 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
a8e16298
TL
12084 return;
12085 }
7c673cae 12086
a8e16298
TL
12087 if (it->second.notify_ack_waiting.erase(from) &&
12088 it->second.notify_ack_waiting.empty()) {
12089 fragment_drop_locks(it->second);
12090 fragment_maybe_finish(it);
12091 }
7c673cae
FG
12092}
12093
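// handle_fragment_notify -- a replica learns about a fragmentation performed
// by the auth mds: apply the same refragmentation locally, decode the new
// replica dirfrags, and ack if the sender asked for one.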
9f95a23c 12094void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
7c673cae
FG
12095{
12096 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
a8e16298 12097 mds_rank_t from = mds_rank_t(notify->get_source().num());
7c673cae
FG
12098
12099 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
12100 return;
12101 }
12102
12103 CInode *diri = get_inode(notify->get_ino());
12104 if (diri) {
12105 frag_t base = notify->get_basefrag();
12106 int bits = notify->get_bits();
12107
12108/*
12109 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12110 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12111 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12112 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
7c673cae
FG
12113 return;
12114 }
12115*/
12116
12117 // refragment
11fdf7f2 12118 MDSContext::vec waiters;
9f95a23c
TL
12119 std::vector<CDir*> resultfrags;
12120 adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
11fdf7f2 12121 if (g_conf()->mds_debug_frag)
7c673cae
FG
12122 diri->verify_dirfrags();
12123
9f95a23c
TL
12124 for (const auto& dir : resultfrags) {
12125 diri->take_dir_waiting(dir->get_frag(), waiters);
12126 }
7c673cae
FG
12127
12128 // add new replica dirs values
11fdf7f2 12129 auto p = notify->basebl.cbegin();
9f95a23c
TL
12130 while (!p.end()) {
12131 CDir *tmp_dir = nullptr;
12132 decode_replica_dir(tmp_dir, p, diri, from, waiters);
12133 }
7c673cae
FG
12134
12135 mds->queue_waiters(waiters);
12136 } else {
12137 ceph_abort();
12138 }
12139
a8e16298 12140 if (notify->is_ack_wanted()) {
9f95a23c 12141 auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
11fdf7f2 12142 notify->get_bits(), notify->get_tid());
a8e16298
TL
12143 mds->send_message_mds(ack, from);
12144 }
7c673cae
FG
12145}
12146
11fdf7f2 12147void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
7c673cae
FG
12148 LogSegment *ls, bufferlist *rollback)
12149{
12150 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11fdf7f2 12151 ceph_assert(!uncommitted_fragments.count(basedirfrag));
7c673cae
FG
12152 ufragment& uf = uncommitted_fragments[basedirfrag];
12153 uf.old_frags = old_frags;
12154 uf.bits = bits;
12155 uf.ls = ls;
12156 ls->uncommitted_fragments.insert(basedirfrag);
12157 if (rollback)
12158 uf.rollback.swap(*rollback);
12159}
12160
12161void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12162{
12163 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12164 << " op " << EFragment::op_name(op) << dendl;
12165 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12166 if (it != uncommitted_fragments.end()) {
12167 ufragment& uf = it->second;
12168 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12169 uf.committed = true;
12170 } else {
12171 uf.ls->uncommitted_fragments.erase(basedirfrag);
12172 mds->queue_waiters(uf.waiters);
12173 uncommitted_fragments.erase(it);
12174 }
12175 }
12176}
12177
11fdf7f2 12178void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
7c673cae
FG
12179{
12180 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12181 << " old_frags (" << old_frags << ")" << dendl;
12182 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12183 if (it != uncommitted_fragments.end()) {
12184 ufragment& uf = it->second;
12185 if (!uf.old_frags.empty()) {
11fdf7f2 12186 uf.old_frags = std::move(old_frags);
7c673cae
FG
12187 uf.committed = true;
12188 } else {
12189 uf.ls->uncommitted_fragments.erase(basedirfrag);
12190 uncommitted_fragments.erase(it);
12191 }
12192 }
12193}
12194
f91f0fd5 12195void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
e306af50 12196{
f91f0fd5
TL
12197 MDSGatherBuilder gather(g_ceph_context, finisher);
12198 for (auto& p : uncommitted_fragments) {
12199 p.second.waiters.push_back(gather.new_sub());
12200 }
12201 gather.activate();
e306af50
TL
12202}
12203
f67539c2
TL
12204struct C_MDC_FragmentRollback : public MDCacheLogContext {
12205 MutationRef mut;
12206 C_MDC_FragmentRollback(MDCache *c, MutationRef& m) :
12207 MDCacheLogContext(c), mut(m) {}
12208 void finish(int r) override {
12209 mut->apply();
12210 get_mds()->locker->drop_locks(mut.get());
12211 mut->cleanup();
12212 }
12213};
12214
7c673cae
FG
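// rollback_uncommitted_fragments -- re-commit fragment operations that
// already reached COMMIT and roll the rest back to their original dirfrags
// using the journaled rollback fnodes, then purge the unused frag objects.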
12215void MDCache::rollback_uncommitted_fragments()
12216{
12217 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
12218 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
12219 p != uncommitted_fragments.end();
12220 ++p) {
12221 ufragment &uf = p->second;
12222 CInode *diri = get_inode(p->first.ino);
11fdf7f2 12223 ceph_assert(diri);
7c673cae
FG
12224
12225 if (uf.committed) {
a8e16298 12226 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
12227 continue;
12228 }
12229
12230 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
12231
f67539c2
TL
12232 MutationRef mut(new MutationImpl());
12233 mut->ls = mds->mdlog->get_current_segment();
7c673cae
FG
12234 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
12235 mds->mdlog->start_entry(le);
12236 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
12237
11fdf7f2 12238 frag_vec_t old_frags;
7c673cae
FG
12239 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
12240
9f95a23c 12241 std::vector<CDir*> resultfrags;
7c673cae
FG
12242 if (uf.old_frags.empty()) {
12243 // created by old format EFragment
11fdf7f2 12244 MDSContext::vec waiters;
9f95a23c 12245 adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
7c673cae 12246 } else {
11fdf7f2
TL
12247 auto bp = uf.rollback.cbegin();
12248 for (const auto& fg : uf.old_frags) {
12249 CDir *dir = force_dir_fragment(diri, fg);
7c673cae
FG
12250 resultfrags.push_back(dir);
12251
12252 dirfrag_rollback rollback;
11fdf7f2 12253 decode(rollback, bp);
7c673cae 12254
7c673cae
FG
12255 dir->fnode = rollback.fnode;
12256
f67539c2 12257 dir->mark_dirty(mut->ls);
7c673cae 12258
f67539c2 12259 if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
7c673cae 12260 dout(10) << " dirty nestinfo on " << *dir << dendl;
f67539c2
TL
12261 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12262 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12263 mut->add_updated_lock(&diri->nestlock);
7c673cae 12264 }
f67539c2 12265 if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
7c673cae 12266 dout(10) << " dirty fragstat on " << *dir << dendl;
f67539c2
TL
12267 mds->locker->mark_updated_scatterlock(&diri->filelock);
12268 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12269 mut->add_updated_lock(&diri->filelock);
7c673cae
FG
12270 }
12271
12272 le->add_orig_frag(dir->get_frag());
12273 le->metablob.add_dir_context(dir);
12274 if (diri_auth) {
12275 le->metablob.add_fragmented_dir(dir, true, false);
12276 } else {
12277 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12278 dir->state_set(CDir::STATE_DIRTYDFT);
12279 le->metablob.add_fragmented_dir(dir, true, true);
12280 }
12281 }
12282 }
12283
12284 if (diri_auth) {
f67539c2
TL
12285 auto pi = diri->project_inode(mut);
12286 pi.inode->version = diri->pre_dirty();
12287 predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
7c673cae
FG
12288 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12289 } else {
12290 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
f67539c2
TL
12291 mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12292 mut->add_updated_lock(&diri->dirfragtreelock);
7c673cae
FG
12293 }
12294
11fdf7f2 12295 if (g_conf()->mds_debug_frag)
7c673cae
FG
12296 diri->verify_dirfrags();
12297
11fdf7f2
TL
12298 for (const auto& leaf : old_frags) {
12299 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12300 }
7c673cae 12301
f67539c2 12302 mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut));
7c673cae
FG
12303
12304 uf.old_frags.swap(old_frags);
a8e16298 12305 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
12306 }
12307}
12308
12309void MDCache::force_readonly()
12310{
12311 if (is_readonly())
12312 return;
12313
12314 dout(1) << "force file system read-only" << dendl;
12315 mds->clog->warn() << "force file system read-only";
12316
12317 set_readonly();
12318
12319 mds->server->force_clients_readonly();
12320
12321 // revoke write caps
81eedcae 12322 int count = 0;
94b18763 12323 for (auto &p : inode_map) {
b32b8144 12324 CInode *in = p.second;
7c673cae
FG
12325 if (in->is_head())
12326 mds->locker->eval(in, CEPH_CAP_LOCKS);
33c7a0ef 12327 if (!(++count % mds->heartbeat_reset_grace()))
81eedcae 12328 mds->heartbeat_reset();
7c673cae
FG
12329 }
12330
12331 mds->mdlog->flush();
12332}
12333
12334
12335// ==============================================================
12336// debug crap
12337
81eedcae 12338void MDCache::show_subtrees(int dbl, bool force_print)
7c673cae 12339{
11fdf7f2 12340 if (g_conf()->mds_thrash_exports)
7c673cae
FG
12341 dbl += 15;
12342
12343 //dout(10) << "show_subtrees" << dendl;
12344
11fdf7f2 12345 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
7c673cae
FG
12346 return; // i won't print anything.
12347
12348 if (subtrees.empty()) {
11fdf7f2
TL
12349 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12350 << dendl;
7c673cae
FG
12351 return;
12352 }
12353
81eedcae
TL
12354 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12355 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12356 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12357 "printing subtrees" << dendl;
12358 return;
12359 }
12360
7c673cae 12361 // root frags
9f95a23c 12362 std::vector<CDir*> basefrags;
7c673cae
FG
12363 for (set<CInode*>::iterator p = base_inodes.begin();
12364 p != base_inodes.end();
12365 ++p)
12366 (*p)->get_dirfrags(basefrags);
12367 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12368 dout(15) << "show_subtrees" << dendl;
12369
12370 // queue stuff
12371 list<pair<CDir*,int> > q;
12372 string indent;
12373 set<CDir*> seen;
12374
12375 // calc max depth
9f95a23c
TL
12376 for (const auto& dir : basefrags) {
12377 q.emplace_back(dir, 0);
12378 }
7c673cae
FG
12379
12380 set<CDir*> subtrees_seen;
12381
81eedcae 12382 unsigned int depth = 0;
7c673cae
FG
12383 while (!q.empty()) {
12384 CDir *dir = q.front().first;
81eedcae 12385 unsigned int d = q.front().second;
7c673cae
FG
12386 q.pop_front();
12387
12388 if (subtrees.count(dir) == 0) continue;
12389
12390 subtrees_seen.insert(dir);
12391
12392 if (d > depth) depth = d;
12393
12394 // sanity check
12395 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12396 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11fdf7f2 12397 ceph_assert(seen.count(dir) == 0);
7c673cae
FG
12398 seen.insert(dir);
12399
12400 // nested items?
12401 if (!subtrees[dir].empty()) {
12402 for (set<CDir*>::iterator p = subtrees[dir].begin();
12403 p != subtrees[dir].end();
12404 ++p) {
12405 //dout(25) << " saw sub " << **p << dendl;
12406 q.push_front(pair<CDir*,int>(*p, d+1));
12407 }
12408 }
12409 }
12410
81eedcae
TL
12411 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12412 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12413 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12414 "subtrees" << dendl;
12415 return;
12416 }
7c673cae
FG
12417
12418 // print tree
9f95a23c
TL
12419 for (const auto& dir : basefrags) {
12420 q.emplace_back(dir, 0);
12421 }
7c673cae
FG
12422
12423 while (!q.empty()) {
12424 CDir *dir = q.front().first;
12425 int d = q.front().second;
12426 q.pop_front();
12427
12428 if (subtrees.count(dir) == 0) continue;
12429
12430 // adjust indenter
12431 while ((unsigned)d < indent.size())
12432 indent.resize(d);
12433
12434 // pad
12435 string pad = "______________________________________";
12436 pad.resize(depth*2+1-indent.size());
12437 if (!subtrees[dir].empty())
12438 pad[0] = '.'; // parent
12439
12440
12441 string auth;
12442 if (dir->is_auth())
12443 auth = "auth ";
12444 else
12445 auth = " rep ";
12446
12447 char s[10];
12448 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12449 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12450 else
12451 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12452
12453 // print
11fdf7f2
TL
12454 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12455 << " " << auth << *dir << dendl;
7c673cae 12456
b3b6e05e 12457 if (dir->ino() == CEPH_INO_ROOT)
11fdf7f2 12458 ceph_assert(dir->inode == root);
7c673cae 12459 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11fdf7f2 12460 ceph_assert(dir->inode == myin);
7c673cae 12461 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11fdf7f2 12462 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
7c673cae
FG
12463
12464 // nested items?
12465 if (!subtrees[dir].empty()) {
12466 // more at my level?
12467 if (!q.empty() && q.front().second == d)
12468 indent += "| ";
12469 else
12470 indent += " ";
12471
12472 for (set<CDir*>::iterator p = subtrees[dir].begin();
12473 p != subtrees[dir].end();
12474 ++p)
12475 q.push_front(pair<CDir*,int>(*p, d+2));
12476 }
12477 }
12478
12479 // verify there isn't stray crap in subtree map
12480 int lost = 0;
12481 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12482 p != subtrees.end();
12483 ++p) {
12484 if (subtrees_seen.count(p->first)) continue;
12485 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12486 lost++;
12487 }
11fdf7f2 12488 ceph_assert(lost == 0);
7c673cae
FG
12489}
12490
7c673cae
FG
12491void MDCache::show_cache()
12492{
f67539c2
TL
12493 if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
12494 return;
7c673cae 12495 dout(7) << "show_cache" << dendl;
b32b8144
FG
12496
12497 auto show_func = [this](CInode *in) {
7c673cae 12498 // unlinked?
b32b8144
FG
12499 if (!in->parent)
12500 dout(7) << " unlinked " << *in << dendl;
12501
7c673cae 12502 // dirfrags?
9f95a23c
TL
12503 auto&& dfs = in->get_dirfrags();
12504 for (const auto& dir : dfs) {
7c673cae 12505 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 12506
94b18763
FG
12507 for (auto &p : dir->items) {
12508 CDentry *dn = p.second;
7c673cae
FG
12509 dout(7) << " dentry " << *dn << dendl;
12510 CDentry::linkage_t *dnl = dn->get_linkage();
12511 if (dnl->is_primary() && dnl->get_inode())
12512 dout(7) << " inode " << *dnl->get_inode() << dendl;
12513 }
12514 }
b32b8144
FG
12515 };
12516
94b18763 12517 for (auto &p : inode_map)
b32b8144 12518 show_func(p.second);
94b18763 12519 for (auto &p : snap_inode_map)
b32b8144 12520 show_func(p.second);
7c673cae
FG
12521}
12522
f64942e4 12523void MDCache::cache_status(Formatter *f)
181888fb
FG
12524{
12525 f->open_object_section("cache");
12526
12527 f->open_object_section("pool");
12528 mempool::get_pool(mempool::mds_co::id).dump(f);
12529 f->close_section();
12530
12531 f->close_section();
181888fb
FG
12532}
12533
11fdf7f2 12534void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
7c673cae 12535{
11fdf7f2
TL
12536 ceph_assert(in);
12537 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12538 return;
12539 }
9f95a23c 12540 auto&& ls = in->get_dirfrags();
11fdf7f2
TL
12541 for (const auto &subdir : ls) {
12542 for (const auto &p : subdir->items) {
12543 CDentry *dn = p.second;
12544 CInode *in = dn->get_linkage()->get_inode();
12545 if (in) {
12546 dump_tree(in, cur_depth + 1, max_depth, f);
12547 }
12548 }
12549 }
12550 f->open_object_section("inode");
12551 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12552 f->close_section();
7c673cae
FG
12553}
12554
20effc67 12555int MDCache::dump_cache(std::string_view file_name, double timeout)
7c673cae 12556{
20effc67 12557 return dump_cache(file_name, NULL, timeout);
7c673cae
FG
12558}
12559
20effc67 12560int MDCache::dump_cache(Formatter *f, double timeout)
7c673cae 12561{
20effc67 12562 return dump_cache(std::string_view(""), f, timeout);
7c673cae
FG
12563}
12564
12565/**
12566 * Dump the metadata cache, either to a Formatter, if
12567 * provided, else to a plain text file.
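 * A non-zero 'timeout' (seconds) aborts the dump with -ETIMEDOUT once
 * exceeded; oversized caches are refused up front according to the
 * mds_dump_cache_threshold_* options.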
12568 */
20effc67 12569int MDCache::dump_cache(std::string_view fn, Formatter *f, double timeout)
7c673cae
FG
12570{
12571 int r = 0;
f64942e4
AA
12572
12573 // dumping large caches may cause the mds to hang or, worse, get killed.
12574 // so, disallow the dump if the cache size exceeds the configured
12575 // threshold, which is 1G for formatter and unlimited for file (note
12576 // that this can be jacked up by the admin... and is nothing but foot
12577 // shooting, but the option itself is for devs and hence dangerous to
12578 // tune). TODO: remove this when fixed.
12579 uint64_t threshold = f ?
11fdf7f2
TL
12580 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12581 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
f64942e4
AA
12582
12583 if (threshold && cache_size() > threshold) {
12584 if (f) {
f67539c2
TL
12585 CachedStackStringStream css;
12586 *css << "cache usage exceeds dump threshold";
f64942e4 12587 f->open_object_section("result");
f67539c2 12588 f->dump_string("error", css->strv());
f64942e4
AA
12589 f->close_section();
12590 } else {
12591 derr << "cache usage exceeds dump threshold" << dendl;
f67539c2 12592 r = -CEPHFS_EINVAL;
f64942e4
AA
12593 }
12594 return r;
12595 }
12596
12597 r = 0;
7c673cae
FG
12598 int fd = -1;
12599
12600 if (f) {
12601 f->open_array_section("inodes");
12602 } else {
94b18763
FG
12603 char path[PATH_MAX] = "";
12604 if (fn.length()) {
12605 snprintf(path, sizeof path, "%s", fn.data());
12606 } else {
12607 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
12608 }
12609
94b18763 12610 dout(1) << "dump_cache to " << path << dendl;
7c673cae 12611
91327a77 12612 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
7c673cae 12613 if (fd < 0) {
94b18763 12614 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 12615 return errno;
7c673cae
FG
12616 }
12617 }
12618
11fdf7f2 12619 auto dump_func = [fd, f](CInode *in) {
b32b8144 12620 int r;
7c673cae
FG
12621 if (f) {
12622 f->open_object_section("inode");
11fdf7f2
TL
12623 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12624 f->close_section();
12625 return 1;
12626 }
f67539c2
TL
12627 CachedStackStringStream css;
12628 *css << *in << std::endl;
12629 auto sv = css->strv();
12630 r = safe_write(fd, sv.data(), sv.size());
11fdf7f2
TL
12631 if (r < 0)
12632 return r;
9f95a23c 12633 auto&& dfs = in->get_dirfrags();
11fdf7f2 12634 for (auto &dir : dfs) {
f67539c2
TL
12635 CachedStackStringStream css2;
12636 *css2 << " " << *dir << std::endl;
12637 auto sv = css2->strv();
12638 r = safe_write(fd, sv.data(), sv.size());
11fdf7f2
TL
12639 if (r < 0)
12640 return r;
94b18763
FG
12641 for (auto &p : dir->items) {
12642 CDentry *dn = p.second;
f67539c2
TL
12643 CachedStackStringStream css3;
12644 *css3 << " " << *dn << std::endl;
12645 auto sv = css3->strv();
12646 r = safe_write(fd, sv.data(), sv.size());
11fdf7f2
TL
12647 if (r < 0)
12648 return r;
7c673cae
FG
12649 }
12650 dir->check_rstats();
7c673cae 12651 }
b32b8144
FG
12652 return 1;
12653 };
12654
20effc67
TL
12655 auto start = mono_clock::now();
12656 int64_t count = 0;
94b18763 12657 for (auto &p : inode_map) {
b32b8144
FG
12658 r = dump_func(p.second);
12659 if (r < 0)
12660 goto out;
20effc67
TL
12661 if (!(++count % 1000) &&
12662 timeout > 0 &&
12663 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12664 r = -ETIMEDOUT;
12665 goto out;
12666 }
b32b8144 12667 }
94b18763 12668 for (auto &p : snap_inode_map) {
b32b8144
FG
12669 r = dump_func(p.second);
12670 if (r < 0)
12671 goto out;
20effc67
TL
12672 if (!(++count % 1000) &&
12673 timeout > 0 &&
12674 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12675 r = -ETIMEDOUT;
12676 goto out;
12677 }
12678
7c673cae 12679 }
b32b8144 12680 r = 0;
7c673cae
FG
12681
12682 out:
12683 if (f) {
20effc67
TL
12684 if (r == -ETIMEDOUT)
12685 {
12686 f->close_section();
12687 f->open_object_section("result");
12688 f->dump_string("error", "the operation timed out");
12689 }
7c673cae
FG
12690 f->close_section(); // inodes
12691 } else {
20effc67
TL
12692 if (r == -ETIMEDOUT)
12693 {
12694 CachedStackStringStream css;
12695 *css << "error: the operation timed out" << std::endl;
12696 auto sv = css->strv();
12697 r = safe_write(fd, sv.data(), sv.size());
12698 }
7c673cae
FG
12699 ::close(fd);
12700 }
31f18b77 12701 return r;
7c673cae
FG
12702}
12703
7c673cae
FG
12704void C_MDS_RetryRequest::finish(int r)
12705{
12706 mdr->retry++;
12707 cache->dispatch_request(mdr);
12708}
12709
f67539c2
TL
12710MDSContext *CF_MDS_RetryRequestFactory::build()
12711{
12712 if (drop_locks) {
12713 mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
12714 mdr->drop_local_auth_pins();
12715 }
12716 return new C_MDS_RetryRequest(mdcache, mdr);
12717}
7c673cae
FG
12718
12719class C_MDS_EnqueueScrub : public Context
12720{
11fdf7f2 12721 std::string tag;
7c673cae
FG
12722 Formatter *formatter;
12723 Context *on_finish;
12724public:
12725 ScrubHeaderRef header;
11fdf7f2
TL
12726 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12727 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
7c673cae 12728
7c673cae 12729 void finish(int r) override {
f67539c2
TL
12730 formatter->open_object_section("results");
12731 formatter->dump_int("return_code", r);
11fdf7f2 12732 if (r == 0) {
f67539c2
TL
12733 formatter->dump_string("scrub_tag", tag);
12734 formatter->dump_string("mode", "asynchronous");
7c673cae 12735 }
f67539c2
TL
12736 formatter->close_section();
12737
12738 r = 0;
7c673cae
FG
12739 if (on_finish)
12740 on_finish->complete(r);
12741 }
12742};
12743
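// enqueue_scrub -- resolve 'path' (a leading "~mdsdir" or "~mds<rank>" maps
// to that rank's mdsdir), generate a scrub tag if none was given, and start
// an internal ENQUEUE_SCRUB request; the ScrubStack performs the actual work.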
12744void MDCache::enqueue_scrub(
11fdf7f2
TL
12745 std::string_view path,
12746 std::string_view tag,
7c673cae
FG
12747 bool force, bool recursive, bool repair,
12748 Formatter *f, Context *fin)
12749{
11fdf7f2 12750 dout(10) << __func__ << " " << path << dendl;
f67539c2
TL
12751
12752 filepath fp;
12753 if (path.compare(0, 4, "~mds") == 0) {
12754 mds_rank_t rank;
12755 if (path == "~mdsdir") {
12756 rank = mds->get_nodeid();
12757 } else {
12758 std::string err;
12759 rank = strict_strtoll(path.substr(4), 10, &err);
12760 if (!err.empty())
12761 rank = MDS_RANK_NONE;
12762 }
12763 if (rank >= 0 && rank < MAX_MDS)
12764 fp.set_path("", MDS_INO_MDSDIR(rank));
11fdf7f2 12765 }
f67539c2
TL
12766 if (fp.get_ino() == inodeno_t(0))
12767 fp.set_path(path);
12768
12769 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12770 mdr->set_filepath(fp);
11fdf7f2
TL
12771
12772 bool is_internal = false;
12773 std::string tag_str(tag);
12774 if (tag_str.empty()) {
12775 uuid_d uuid_gen;
12776 uuid_gen.generate_random();
12777 tag_str = uuid_gen.to_string();
12778 is_internal = true;
12779 }
7c673cae 12780
11fdf7f2 12781 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
f67539c2 12782 cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, recursive, repair);
7c673cae
FG
12783
12784 mdr->internal_op_finish = cs;
12785 enqueue_scrub_work(mdr);
12786}
12787
12788void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12789{
f67539c2
TL
12790 CInode *in;
12791 CF_MDS_RetryRequestFactory cf(this, mdr, true);
12792 int r = path_traverse(mdr, cf, mdr->get_filepath(),
12793 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH,
12794 nullptr, &in);
12795 if (r > 0)
7c673cae 12796 return;
f67539c2
TL
12797 if (r < 0) {
12798 mds->server->respond_to_request(mdr, r);
12799 return;
12800 }
7c673cae
FG
12801
12802 // Cannot scrub same dentry twice at same time
11fdf7f2 12803 if (in->scrub_is_in_progress()) {
f67539c2 12804 mds->server->respond_to_request(mdr, -CEPHFS_EBUSY);
7c673cae
FG
12805 return;
12806 } else {
12807 in->scrub_info();
12808 }
12809
f67539c2
TL
12810 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12811 ScrubHeaderRef& header = cs->header;
7c673cae 12812
f67539c2 12813 r = mds->scrubstack->enqueue(in, header, !header->get_recursive());
7c673cae 12814
f67539c2 12815 mds->server->respond_to_request(mdr, r);
11fdf7f2
TL
12816}
12817
12818struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
7c673cae 12819 MDRequestRef mdr;
11fdf7f2 12820 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
7c673cae
FG
12821 MDCacheLogContext(c), mdr(m) {}
12822 void finish(int r) override {
12823 mdr->apply();
12824 get_mds()->server->respond_to_request(mdr, r);
12825 }
12826};
12827
f67539c2
TL
12828struct C_MDC_ScrubRepaired : public MDCacheContext {
12829 ScrubHeaderRef header;
12830public:
12831 C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h)
12832 : MDCacheContext(m), header(h) {
12833 header->inc_num_pending();
12834 }
12835 void finish(int r) override {
12836 header->dec_num_pending();
12837 }
12838};
12839
7c673cae
FG
12840void MDCache::repair_dirfrag_stats(CDir *dir)
12841{
12842 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12843 mdr->pin(dir);
12844 mdr->internal_op_private = dir;
f67539c2
TL
12845 if (dir->scrub_is_in_progress())
12846 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header());
12847 else
12848 mdr->internal_op_finish = new C_MDSInternalNoop;
7c673cae
FG
12849 repair_dirfrag_stats_work(mdr);
12850}
12851
12852void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12853{
12854 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12855 dout(10) << __func__ << " " << *dir << dendl;
12856
12857 if (!dir->is_auth()) {
f67539c2 12858 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
12859 return;
12860 }
12861
12862 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12863 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12864
7c673cae
FG
12865 mds->locker->drop_locks(mdr.get());
12866 mdr->drop_local_auth_pins();
9f95a23c 12867 if (mdr->is_any_remote_auth_pin())
224ce89b 12868 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12869 return;
12870 }
12871
12872 mdr->auth_pin(dir);
12873
11fdf7f2 12874 MutationImpl::LockOpVec lov;
7c673cae 12875 CInode *diri = dir->inode;
11fdf7f2
TL
12876 lov.add_rdlock(&diri->dirfragtreelock);
12877 lov.add_wrlock(&diri->nestlock);
12878 lov.add_wrlock(&diri->filelock);
12879 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12880 return;
12881
12882 if (!dir->is_complete()) {
12883 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12884 return;
12885 }
12886
12887 frag_info_t frag_info;
12888 nest_info_t nest_info;
94b18763 12889 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12890 CDentry *dn = it->second;
12891 if (dn->last != CEPH_NOSNAP)
12892 continue;
12893 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12894 if (dnl->is_primary()) {
12895 CInode *in = dnl->get_inode();
12896 nest_info.add(in->get_projected_inode()->accounted_rstat);
12897 if (in->is_dir())
12898 frag_info.nsubdirs++;
12899 else
12900 frag_info.nfiles++;
12901 } else if (dnl->is_remote())
12902 frag_info.nfiles++;
12903 }
12904
f67539c2 12905 auto pf = dir->get_projected_fnode();
7c673cae
FG
12906 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12907 bool good_rstat = nest_info.same_sums(pf->rstat);
12908 if (good_fragstat && good_rstat) {
12909 dout(10) << __func__ << " no corruption found" << dendl;
12910 mds->server->respond_to_request(mdr, 0);
12911 return;
12912 }
12913
f67539c2
TL
12914 auto _pf = dir->project_fnode(mdr);
12915 _pf->version = dir->pre_dirty();
12916 pf = _pf;
7c673cae
FG
12917
12918 mdr->ls = mds->mdlog->get_current_segment();
12919 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12920 mds->mdlog->start_entry(le);
12921
12922 if (!good_fragstat) {
12923 if (pf->fragstat.mtime > frag_info.mtime)
12924 frag_info.mtime = pf->fragstat.mtime;
12925 if (pf->fragstat.change_attr > frag_info.change_attr)
12926 frag_info.change_attr = pf->fragstat.change_attr;
f67539c2 12927 _pf->fragstat = frag_info;
7c673cae
FG
12928 mds->locker->mark_updated_scatterlock(&diri->filelock);
12929 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12930 mdr->add_updated_lock(&diri->filelock);
12931 }
12932
12933 if (!good_rstat) {
12934 if (pf->rstat.rctime > nest_info.rctime)
12935 nest_info.rctime = pf->rstat.rctime;
f67539c2 12936 _pf->rstat = nest_info;
7c673cae
FG
12937 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12938 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12939 mdr->add_updated_lock(&diri->nestlock);
12940 }
12941
12942 le->metablob.add_dir_context(dir);
12943 le->metablob.add_dir(dir, true);
12944
11fdf7f2 12945 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
7c673cae
FG
12946}
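// Worked example of the check above (numbers hypothetical): a dirfrag holding
// three primary file dentries, one primary subdir dentry and one remote link
// should carry fragstat {nfiles=4, nsubdirs=1} and an rstat equal to the sum of
// the primary children's accounted_rstat. If the stored (projected) fnode
// disagrees, the recomputed values are journalled via EUpdate("repair_dirfrag")
// and the corresponding scatterlock (filelock and/or nestlock) is marked dirty
// so the fix propagates to the parent inode.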
12947
12948void MDCache::repair_inode_stats(CInode *diri)
12949{
12950 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
f67539c2 12951 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
7c673cae 12952 mdr->internal_op_private = diri;
f67539c2
TL
12953 if (diri->scrub_is_in_progress())
12954 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header());
12955 else
12956 mdr->internal_op_finish = new C_MDSInternalNoop;
7c673cae
FG
12957 repair_inode_stats_work(mdr);
12958}
12959
12960void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12961{
12962 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12963 dout(10) << __func__ << " " << *diri << dendl;
12964
12965 if (!diri->is_auth()) {
f67539c2 12966 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
12967 return;
12968 }
12969 if (!diri->is_dir()) {
f67539c2 12970 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
12971 return;
12972 }
12973
11fdf7f2 12974 MutationImpl::LockOpVec lov;
7c673cae
FG
12975
12976 if (mdr->ls) // already marked filelock/nestlock dirty ?
12977 goto do_rdlocks;
12978
11fdf7f2
TL
12979 lov.add_rdlock(&diri->dirfragtreelock);
12980 lov.add_wrlock(&diri->nestlock);
12981 lov.add_wrlock(&diri->filelock);
12982 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12983 return;
12984
12985 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12986 // the scatter-gather process, which will fix any fragstat/rstat errors.
11fdf7f2
TL
12987 {
12988 frag_vec_t leaves;
12989 diri->dirfragtree.get_leaves(leaves);
12990 for (const auto& leaf : leaves) {
12991 CDir *dir = diri->get_dirfrag(leaf);
12992 if (!dir) {
12993 ceph_assert(mdr->is_auth_pinned(diri));
12994 dir = diri->get_or_open_dirfrag(this, leaf);
12995 }
12996 if (dir->get_version() == 0) {
12997 ceph_assert(dir->is_auth());
12998 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12999 return;
13000 }
7c673cae
FG
13001 }
13002 }
13003
13004 diri->state_set(CInode::STATE_REPAIRSTATS);
13005 mdr->ls = mds->mdlog->get_current_segment();
13006 mds->locker->mark_updated_scatterlock(&diri->filelock);
13007 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
13008 mds->locker->mark_updated_scatterlock(&diri->nestlock);
13009 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
13010
13011 mds->locker->drop_locks(mdr.get());
13012
13013do_rdlocks:
13014 // force the scatter-gather process
11fdf7f2
TL
13015 lov.clear();
13016 lov.add_rdlock(&diri->dirfragtreelock);
13017 lov.add_rdlock(&diri->nestlock);
13018 lov.add_rdlock(&diri->filelock);
13019 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
13020 return;
13021
13022 diri->state_clear(CInode::STATE_REPAIRSTATS);
13023
13024 frag_info_t dir_info;
13025 nest_info_t nest_info;
11fdf7f2
TL
13026 nest_info.rsubdirs = 1; // one to account for the directory itself
13027 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
13028 nest_info.rsnaps = srnode->snaps.size();
7c673cae 13029
11fdf7f2
TL
13030 {
13031 frag_vec_t leaves;
13032 diri->dirfragtree.get_leaves(leaves);
13033 for (const auto& leaf : leaves) {
13034 CDir *dir = diri->get_dirfrag(leaf);
13035 ceph_assert(dir);
13036 ceph_assert(dir->get_version() > 0);
f67539c2
TL
13037 dir_info.add(dir->get_fnode()->accounted_fragstat);
13038 nest_info.add(dir->get_fnode()->accounted_rstat);
11fdf7f2 13039 }
7c673cae
FG
13040 }
13041
f67539c2
TL
13042 if (!dir_info.same_sums(diri->get_inode()->dirstat) ||
13043 !nest_info.same_sums(diri->get_inode()->rstat)) {
7c673cae
FG
13044 dout(10) << __func__ << " failed to fix fragstat/rstat on "
13045 << *diri << dendl;
13046 }
13047
13048 mds->server->respond_to_request(mdr, 0);
13049}
13050
f67539c2 13051void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin)
11fdf7f2 13052{
f67539c2
TL
13053 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS);
13054 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
13055 mdr->internal_op_private = diri;
13056 mdr->internal_op_finish = fin;
13057 return rdlock_dirfrags_stats_work(mdr);
11fdf7f2
TL
13058}
13059
f67539c2 13060void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr)
11fdf7f2 13061{
f67539c2
TL
13062 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
13063 dout(10) << __func__ << " " << *diri << dendl;
13064 if (!diri->is_auth()) {
13065 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
13066 return;
13067 }
13068 if (!diri->is_dir()) {
13069 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
11fdf7f2
TL
13070 return;
13071 }
13072
13073 MutationImpl::LockOpVec lov;
f67539c2
TL
13074 lov.add_rdlock(&diri->dirfragtreelock);
13075 lov.add_rdlock(&diri->nestlock);
13076 lov.add_rdlock(&diri->filelock);
11fdf7f2
TL
13077 if (!mds->locker->acquire_locks(mdr, lov))
13078 return;
f67539c2 13079 dout(10) << __func__ << " start dirfrags : " << *diri << dendl;
11fdf7f2 13080
f67539c2
TL
13081 mds->server->respond_to_request(mdr, 0);
13082 return;
11fdf7f2
TL
13083}
13084
13085void MDCache::flush_dentry(std::string_view path, Context *fin)
7c673cae
FG
13086{
13087 if (is_readonly()) {
13088 dout(10) << __func__ << ": read-only FS" << dendl;
f67539c2 13089 fin->complete(-CEPHFS_EROFS);
7c673cae
FG
13090 return;
13091 }
13092 dout(10) << "flush_dentry " << path << dendl;
13093 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 13094 filepath fp(path);
7c673cae
FG
13095 mdr->set_filepath(fp);
13096 mdr->internal_op_finish = fin;
13097 flush_dentry_work(mdr);
13098}
13099
11fdf7f2 13100class C_FinishIOMDR : public MDSContext {
7c673cae
FG
13101protected:
13102 MDSRank *mds;
13103 MDRequestRef mdr;
13104 MDSRank *get_mds() override { return mds; }
13105public:
13106 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13107 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13108};
13109
13110void MDCache::flush_dentry_work(MDRequestRef& mdr)
13111{
11fdf7f2 13112 MutationImpl::LockOpVec lov;
9f95a23c
TL
13113 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13114 if (!in)
7c673cae
FG
13115 return;
13116
11fdf7f2 13117 ceph_assert(in->is_auth());
7c673cae
FG
13118 in->flush(new C_FinishIOMDR(mds, mdr));
13119}
13120
13121
13122/**
13123 * Initialize performance counters and register them with the
13124 * global perfcounter collection.
13125 */
13126void MDCache::register_perfcounters()
13127{
91327a77
AA
13128 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
13129
13130 // Stray/purge statistics
13131 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
13132 PerfCountersBuilder::PRIO_INTERESTING);
13133 pcb.add_u64(l_mdc_num_recovering_enqueued,
13134 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13135 PerfCountersBuilder::PRIO_INTERESTING);
13136 pcb.add_u64_counter(l_mdc_recovery_completed,
13137 "recovery_completed", "File recoveries completed", "recd",
13138 PerfCountersBuilder::PRIO_INTERESTING);
13139
13140 // useful recovery queue statistics
13141 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
13142 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
13143 "Files currently being recovered");
13144 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
13145 "Files waiting for recovery with elevated priority");
13146 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
13147 "File recoveries started");
13148
13149 // along with other stray dentry statistics
13150 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
13151 "Stray dentries delayed");
13152 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
13153 "Stray dentries enqueuing for purge");
13154 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
13155 "Stray dentries created");
7c673cae 13156 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
91327a77
AA
13157 "Stray dentries enqueued for purge");
13158 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
13159 "Stray dentries reintegrated");
13160 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
13161 "Stray dentries migrated");
7c673cae 13162
91327a77 13163 // low prio internal request stats
d2e6a577 13164 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
91327a77 13165 "Internal Request type enqueue scrub");
d2e6a577 13166 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
91327a77 13167 "Internal Request type export dir");
d2e6a577 13168 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
91327a77 13169 "Internal Request type flush");
d2e6a577 13170 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
91327a77 13171 "Internal Request type fragmentdir");
d2e6a577 13172 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
91327a77 13173 "Internal Request type frag stats");
d2e6a577 13174 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
91327a77 13175 "Internal Request type inode stats");
d2e6a577 13176
7c673cae
FG
13177 logger.reset(pcb.create_perf_counters());
13178 g_ceph_context->get_perfcounters_collection()->add(logger.get());
13179 recovery_queue.set_logger(logger.get());
13180 stray_manager.set_logger(logger.get());
13181}
13182
7c673cae
FG
13183/**
13184 * Call this when putting references to an inode/dentry or
13185 * when attempting to trim it.
13186 *
13187 * If this inode is no longer linked by anyone, and this MDS
13188 * rank holds the primary dentry, and that dentry is in a stray
13189 * directory, then give up the dentry to the StrayManager, never
13190 * to be seen again by MDCache.
13191 *
13192 * @param delay if true, then purgeable inodes are stashed until
13193 * the next trim(), rather than being purged right
13194 * away.
13195 */
13196void MDCache::maybe_eval_stray(CInode *in, bool delay) {
f67539c2 13197 if (in->get_inode()->nlink > 0 || in->is_base() || is_readonly() ||
224ce89b 13198 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 13199 return;
224ce89b 13200
7c673cae
FG
13201 CDentry *dn = in->get_projected_parent_dn();
13202
13203 if (dn->state_test(CDentry::STATE_PURGING)) {
13204 /* We have already entered the purging process, no need
13205 * to re-evaluate me ! */
13206 return;
13207 }
13208
11fdf7f2
TL
13209 if (dn->get_dir()->get_inode()->is_stray()) {
13210 if (delay)
13211 stray_manager.queue_delayed(dn);
13212 else
13213 stray_manager.eval_stray(dn);
7c673cae
FG
13214 }
13215}
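// Usage note (illustrative, derived from the doc comment above): reference-drop
// and trim paths typically call maybe_eval_stray(in, true) so that purgeable
// inodes are merely queued on the StrayManager and handled at the next trim(),
// while callers that want the stray evaluated immediately pass delay=false.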
13216
31f18b77
FG
13217void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13218 dout(10) << __func__ << " " << *diri << dendl;
11fdf7f2 13219 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
9f95a23c 13220 auto&& ls = diri->get_dirfrags();
94b18763 13221 for (auto &p : ls) {
31f18b77
FG
13222 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13223 p->try_remove_dentries_for_stray();
13224 }
13225 if (!diri->snaprealm) {
13226 if (diri->is_auth())
13227 diri->clear_dirty_rstat();
13228 diri->clear_scatter_dirty();
13229 }
13230}
13231
11fdf7f2
TL
13232bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13233 CInode *in = get_inode(number);
13234 if (!in) {
13235 return false;
13236 }
13237 f->open_object_section("inode");
13238 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13239 f->close_section();
13240 return true;
13241}
eafe8130 13242
f6b5b4d7 13243void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
f67539c2
TL
13244 const mds_rank_t max_mds = mdsmap.get_max_mds();
13245
eafe8130
TL
13246 // process export_pin_delayed_queue whenever a new MDSMap received
13247 auto &q = export_pin_delayed_queue;
13248 for (auto it = q.begin(); it != q.end(); ) {
13249 auto *in = *it;
13250 mds_rank_t export_pin = in->get_export_pin(false);
13251 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
f67539c2 13252 << " max_mds=" << max_mds << dendl;
eafe8130
TL
13253 if (export_pin >= mdsmap.get_max_mds()) {
13254 it++;
13255 continue;
13256 }
13257
13258 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13259 it = q.erase(it);
f6b5b4d7 13260 in->queue_export_pin(export_pin);
eafe8130 13261 }
eafe8130 13262
f6b5b4d7
TL
13263 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13264 dout(10) << "Checking ephemerally pinned directories for redistribution due to max_mds change." << dendl;
13265 /* copy to vector to avoid removals during iteration */
13266 std::vector<CInode*> migrate;
f67539c2 13267 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
f6b5b4d7 13268 for (auto& in : migrate) {
f67539c2 13269 in->maybe_export_pin();
f6b5b4d7
TL
13270 }
13271 }
f67539c2
TL
13272
13273 if (max_mds <= 1) {
13274 export_ephemeral_dist_frag_bits = 0;
13275 } else {
13276 double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
13277 want *= max_mds;
13278 unsigned n = 0;
13279 while ((1U << n) < (unsigned)want)
13280 ++n;
13281 export_ephemeral_dist_frag_bits = n;
13282 }
f6b5b4d7 13283}
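// Worked example of the fragment-bit calculation above (values hypothetical):
// with max_mds = 4 and mds_export_ephemeral_distributed_factor = 2.0,
// want = 2.0 * 4 = 8 and the smallest n with (1 << n) >= 8 is n = 3, so
// export_ephemeral_dist_frag_bits = 3, i.e. distributed ephemeral pins spread
// a directory across 2^3 = 8 fragments. With max_mds <= 1 the bits are forced
// to 0 and no such fragmentation happens.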
b3b6e05e
TL
13284
13285void MDCache::upkeep_main(void)
13286{
13287 std::unique_lock lock(upkeep_mutex);
13288 while (!upkeep_trim_shutdown.load()) {
13289 auto now = clock::now();
13290 auto since = now-upkeep_last_trim;
13291 auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
13292 if (since >= trim_interval*.90) {
13293 lock.unlock(); /* mds_lock -> upkeep_mutex */
13294 std::scoped_lock mds_lock(mds->mds_lock);
13295 lock.lock();
13296 if (upkeep_trim_shutdown.load())
13297 return;
13298 check_memory_usage();
13299 if (mds->is_cache_trimmable()) {
13300 dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
13301 bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
13302 if (active_with_clients) {
13303 trim_client_leases();
13304 }
a4b75251
TL
13305 if (is_open()) {
13306 trim();
13307 }
b3b6e05e
TL
13308 if (active_with_clients) {
13309 auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
13310 if (cache_toofull()) {
13311 recall_flags = recall_flags|Server::RecallFlags::TRIM;
13312 }
13313 mds->server->recall_client_state(nullptr, recall_flags);
13314 }
13315 upkeep_last_trim = now = clock::now();
13316 } else {
13317 dout(10) << "cache not ready for trimming" << dendl;
13318 }
13319 } else {
13320 trim_interval -= since;
13321 }
13322 since = now-upkeep_last_release;
13323 auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
13324 if (since >= release_interval*.90) {
13325 /* XXX not necessary once MDCache uses PriorityCache */
13326 dout(10) << "releasing free memory" << dendl;
13327 ceph_heap_release_free_memory();
13328 upkeep_last_release = clock::now();
13329 } else {
13330 release_interval -= since;
13331 }
13332 auto interval = std::min(release_interval, trim_interval);
13333 dout(20) << "upkeep thread waiting interval " << interval << dendl;
13334 upkeep_cvar.wait_for(lock, interval);
13335 }
13336}
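// A minimal, self-contained sketch of the wait-interval logic used by
// upkeep_main() above (illustrative names only, std::chrono only): a task is
// considered due once ~90% of its configured interval has elapsed; when it is
// not due, only the remainder of its interval is waited for, and the thread
// sleeps until the earlier of the two deadlines.
#include <algorithm>
#include <chrono>

using upkeep_dur = std::chrono::steady_clock::duration;

inline upkeep_dur next_upkeep_wait(upkeep_dur since_trim, upkeep_dur trim_interval,
                                   upkeep_dur since_release, upkeep_dur release_interval)
{
  // After a task runs, its full interval is waited out again; otherwise only
  // the time remaining until the next run.
  upkeep_dur trim_wait = since_trim >= trim_interval * 0.9
    ? trim_interval : trim_interval - since_trim;
  upkeep_dur release_wait = since_release >= release_interval * 0.9
    ? release_interval : release_interval - since_release;
  return std::min(trim_wait, release_wait);
}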