git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDCache.cc (import ceph 15.2.14)
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
16#include <fstream>
17#include <iostream>
18#include <sstream>
19#include <string>
11fdf7f2 20#include <string_view>
7c673cae
FG
21#include <map>
22
23#include "MDCache.h"
24#include "MDSRank.h"
25#include "Server.h"
26#include "Locker.h"
27#include "MDLog.h"
28#include "MDBalancer.h"
29#include "Migrator.h"
30#include "ScrubStack.h"
31
32#include "SnapClient.h"
33
34#include "MDSMap.h"
35
36#include "CInode.h"
37#include "CDir.h"
38
39#include "Mutation.h"
40
41#include "include/ceph_fs.h"
42#include "include/filepath.h"
181888fb 43#include "include/util.h"
7c673cae 44
11fdf7f2
TL
45#include "messages/MClientCaps.h"
46
7c673cae
FG
47#include "msg/Message.h"
48#include "msg/Messenger.h"
49
181888fb 50#include "common/MemoryModel.h"
7c673cae 51#include "common/errno.h"
7c673cae 52#include "common/perf_counters.h"
181888fb
FG
53#include "common/safe_io.h"
54
7c673cae
FG
55#include "osdc/Journaler.h"
56#include "osdc/Filer.h"
57
58#include "events/ESubtreeMap.h"
59#include "events/EUpdate.h"
60#include "events/ESlaveUpdate.h"
61#include "events/EImportFinish.h"
62#include "events/EFragment.h"
63#include "events/ECommitted.h"
9f95a23c 64#include "events/EPurged.h"
7c673cae
FG
65#include "events/ESessions.h"
66
7c673cae
FG
67#include "InoTable.h"
68
69#include "common/Timer.h"
70
71#include "perfglue/heap_profiler.h"
72
7c673cae
FG
73
74#include "common/config.h"
11fdf7f2 75#include "include/ceph_assert.h"
7c673cae
FG
76
77#define dout_context g_ceph_context
78#define dout_subsys ceph_subsys_mds
79#undef dout_prefix
80#define dout_prefix _prefix(_dout, mds)
81static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
82 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
83}
84
85set<int> SimpleLock::empty_gather_set;
86
87
88/**
89 * All non-I/O contexts that require a reference
90 * to an MDCache instance descend from this.
91 */
11fdf7f2 92class MDCacheContext : public virtual MDSContext {
7c673cae
FG
93protected:
94 MDCache *mdcache;
95 MDSRank *get_mds() override
96 {
11fdf7f2 97 ceph_assert(mdcache != NULL);
7c673cae
FG
98 return mdcache->mds;
99 }
100public:
101 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
102};
103
104
105/**
106 * Only for contexts called back from an I/O completion
107 *
108 * Note: duplication of members wrt MDCacheContext, because
109 * it's the lesser of two evils compared with introducing
110 * yet another piece of (multiple) inheritance.
111 */
112class MDCacheIOContext : public virtual MDSIOContextBase {
113protected:
114 MDCache *mdcache;
115 MDSRank *get_mds() override
116 {
11fdf7f2 117 ceph_assert(mdcache != NULL);
7c673cae
FG
118 return mdcache->mds;
119 }
120public:
91327a77
AA
121 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
122 MDSIOContextBase(track), mdcache(mdc_) {}
7c673cae
FG
123};
124
125class MDCacheLogContext : public virtual MDSLogContextBase {
126protected:
127 MDCache *mdcache;
128 MDSRank *get_mds() override
129 {
11fdf7f2 130 ceph_assert(mdcache != NULL);
7c673cae
FG
131 return mdcache->mds;
132 }
133public:
134 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
135};
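// Editorial sketch (not part of the original file): how these helper bases are
// typically used. A callback that needs the cache derives from MDCacheContext
// (or from MDCacheIOContext / MDCacheLogContext for I/O and journal
// completions) and reaches the owning rank via get_mds(). The class name below
// is hypothetical; it simply mirrors the C_MDS_Retry* / C_MDC_* contexts that
// appear later in this file.
struct C_MDC_ExampleRetry : public MDCacheContext {
  explicit C_MDC_ExampleRetry(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    if (r >= 0)
      mdcache->open_root();  // e.g. re-drive a cache operation once a fetch completes
  }
};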
136
137MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
138 mds(m),
9f95a23c 139 open_file_table(m),
7c673cae 140 filer(m->objecter, m->finisher),
a8e16298 141 stray_manager(m, purge_queue_),
9f95a23c
TL
142 recovery_queue(m),
143 trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
7c673cae
FG
144{
145 migrator.reset(new Migrator(mds, this));
7c673cae 146
11fdf7f2
TL
147 max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
148 (g_conf()->mds_dir_max_commit_size << 20) :
149 (0.9 *(g_conf()->osd_max_write_size << 20));
7c673cae 150
11fdf7f2
TL
151 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
152 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
153 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 154
f6b5b4d7
TL
155 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
156 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
157 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
158
11fdf7f2 159 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
7c673cae 160
31f18b77
FG
161 bottom_lru.lru_set_midpoint(0);
162
11fdf7f2 163 decayrate.set_halflife(g_conf()->mds_decay_halflife);
7c673cae 164
7f7e6c64 165 upkeeper = std::thread(&MDCache::upkeep_main, this);
7c673cae
FG
166}
167
168MDCache::~MDCache()
169{
170 if (logger) {
171 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
172 }
eafe8130
TL
173 if (upkeeper.joinable())
174 upkeeper.join();
7c673cae
FG
175}
176
92f5a8d4 177void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
91327a77 178{
f6b5b4d7 179 dout(20) << "config changes: " << changed << dendl;
91327a77 180 if (changed.count("mds_cache_memory_limit"))
11fdf7f2 181 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
91327a77 182 if (changed.count("mds_cache_reservation"))
11fdf7f2 183 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
f6b5b4d7
TL
184 if (changed.count("mds_export_ephemeral_distributed")) {
185 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
186 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
187 /* copy to vector to avoid removals during iteration */
188 std::vector<CInode*> migrate;
189 migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
190 for (auto& in : migrate) {
191 in->maybe_ephemeral_dist();
192 }
193 mds->balancer->handle_export_pins();
194 }
195 if (changed.count("mds_export_ephemeral_random")) {
196 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
197 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
198 /* copy to vector to avoid removals during iteration */
199 std::vector<CInode*> migrate;
200 migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
201 for (auto& in : migrate) {
202 in->maybe_ephemeral_rand();
203 }
204 mds->balancer->handle_export_pins();
205 }
206 if (changed.count("mds_export_ephemeral_random_max")) {
207 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
208 }
91327a77 209 if (changed.count("mds_health_cache_threshold"))
11fdf7f2 210 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 211 if (changed.count("mds_cache_mid"))
11fdf7f2 212 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
a8e16298 213 if (changed.count("mds_cache_trim_decay_rate")) {
11fdf7f2 214 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
a8e16298 215 }
7c673cae 216
92f5a8d4
TL
217 migrator->handle_conf_change(changed, mdsmap);
218 mds->balancer->handle_conf_change(changed, mdsmap);
91327a77 219}
7c673cae
FG
220
221void MDCache::log_stat()
222{
7c673cae
FG
223 mds->logger->set(l_mds_inodes, lru.lru_get_size());
224 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
225 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
226 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
227 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
228 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
229 mds->logger->set(l_mds_caps, Capability::count());
eafe8130
TL
230 if (root) {
231 mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles);
232 mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes);
233 mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps);
234 }
7c673cae
FG
235}
236
237
238//
239
240bool MDCache::shutdown()
241{
eafe8130
TL
242 {
243 std::scoped_lock lock(upkeep_mutex);
244 upkeep_trim_shutdown = true;
245 upkeep_cvar.notify_one();
246 }
7c673cae
FG
247 if (lru.lru_get_size() > 0) {
248 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
249 //show_cache();
250 show_subtrees();
251 //dump();
252 }
253 return true;
254}
255
256
257// ====================================================================
258// some inode functions
259
260void MDCache::add_inode(CInode *in)
261{
262 // add to lru, inode map
b32b8144
FG
263 if (in->last == CEPH_NOSNAP) {
264 auto &p = inode_map[in->ino()];
11fdf7f2 265 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
266 p = in;
267 } else {
268 auto &p = snap_inode_map[in->vino()];
11fdf7f2 269 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
270 p = in;
271 }
7c673cae
FG
272
273 if (in->ino() < MDS_INO_SYSTEM_BASE) {
ec96510d 274 if (in->ino() == CEPH_INO_ROOT)
7c673cae
FG
275 root = in;
276 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
277 myin = in;
278 else if (in->is_stray()) {
279 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
280 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
281 }
282 }
283 if (in->is_base())
284 base_inodes.insert(in);
285 }
286
f6b5b4d7 287 in->maybe_ephemeral_dist(false);
7c673cae
FG
288}
289
290void MDCache::remove_inode(CInode *o)
291{
292 dout(14) << "remove_inode " << *o << dendl;
293
294 if (o->get_parent_dn()) {
295 // FIXME: multiple parents?
296 CDentry *dn = o->get_parent_dn();
11fdf7f2 297 ceph_assert(!dn->is_dirty());
7c673cae
FG
298 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
299 }
300
301 if (o->is_dirty())
302 o->mark_clean();
303 if (o->is_dirty_parent())
304 o->clear_dirty_parent();
305
306 o->clear_scatter_dirty();
307
f91f0fd5
TL
308 o->clear_clientwriteable();
309
7c673cae
FG
310 o->item_open_file.remove_myself();
311
31f18b77
FG
312 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
313 export_pin_queue.erase(o);
7c673cae 314
eafe8130
TL
315 if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
316 export_pin_delayed_queue.erase(o);
317
f6b5b4d7
TL
318 o->set_ephemeral_dist(false);
319 o->set_ephemeral_rand(false);
320
7c673cae 321 // remove from inode map
11fdf7f2 322 if (o->last == CEPH_NOSNAP) {
b32b8144 323 inode_map.erase(o->ino());
11fdf7f2
TL
324 } else {
325 o->item_caps.remove_myself();
b32b8144 326 snap_inode_map.erase(o->vino());
11fdf7f2 327 }
7c673cae
FG
328
329 if (o->ino() < MDS_INO_SYSTEM_BASE) {
330 if (o == root) root = 0;
331 if (o == myin) myin = 0;
332 if (o->is_stray()) {
333 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
334 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
335 }
336 }
337 if (o->is_base())
338 base_inodes.erase(o);
11fdf7f2 339 }
7c673cae
FG
340
341 // delete it
11fdf7f2 342 ceph_assert(o->get_num_ref() == 0);
7c673cae
FG
343 delete o;
344}
345
346file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
347{
348 file_layout_t result = file_layout_t::get_default();
349 result.pool_id = mdsmap.get_first_data_pool();
350 return result;
351}
352
353file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
354{
355 file_layout_t result = file_layout_t::get_default();
356 result.pool_id = mdsmap.get_metadata_pool();
11fdf7f2
TL
357 if (g_conf()->mds_log_segment_size > 0) {
358 result.object_size = g_conf()->mds_log_segment_size;
359 result.stripe_unit = g_conf()->mds_log_segment_size;
7c673cae
FG
360 }
361 return result;
362}
363
364void MDCache::init_layouts()
365{
366 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
367 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
368}
369
370void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
371 int mode) const
372{
373 in->inode.ino = ino;
374 in->inode.version = 1;
375 in->inode.xattr_version = 1;
376 in->inode.mode = 0500 | mode;
377 in->inode.size = 0;
378 in->inode.ctime =
379 in->inode.mtime =
380 in->inode.btime = ceph_clock_now();
381 in->inode.nlink = 1;
382 in->inode.truncate_size = -1ull;
383 in->inode.change_attr = 0;
384 in->inode.export_pin = MDS_RANK_NONE;
385
92f5a8d4 386 // FIPS zeroization audit 20191117: this memset is not security related.
7c673cae
FG
387 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
388 if (in->inode.is_dir()) {
11fdf7f2 389 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
f64942e4
AA
390 in->inode.rstat.rsubdirs = 1; /* itself */
391 in->inode.rstat.rctime = in->inode.ctime;
7c673cae
FG
392 } else {
393 in->inode.layout = default_file_layout;
394 ++in->inode.rstat.rfiles;
395 }
396 in->inode.accounted_rstat = in->inode.rstat;
397
398 if (in->is_base()) {
399 if (in->is_root())
400 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
401 else
402 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
403 in->open_snaprealm(); // empty snaprealm
11fdf7f2 404 ceph_assert(!in->snaprealm->parent); // created its own
7c673cae
FG
405 in->snaprealm->srnode.seq = 1;
406 }
407}
408
409CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
410{
411 dout(0) << "creating system inode with ino:" << ino << dendl;
412 CInode *in = new CInode(this);
413 create_unlinked_system_inode(in, ino, mode);
414 add_inode(in);
415 return in;
416}
417
418CInode *MDCache::create_root_inode()
419{
ec96510d 420 CInode *i = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
11fdf7f2
TL
421 i->inode.uid = g_conf()->mds_root_ino_uid;
422 i->inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
423 i->inode.layout = default_file_layout;
424 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
425 return i;
426}
427
428void MDCache::create_empty_hierarchy(MDSGather *gather)
429{
430 // create root dir
431 CInode *root = create_root_inode();
432
433 // force empty root dir
434 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
435 adjust_subtree_auth(rootdir, mds->get_nodeid());
436 rootdir->dir_rep = CDir::REP_ALL; //NONE;
437
11fdf7f2
TL
438 ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
439 ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
440 ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
f64942e4
AA
441 /* Do not update rootdir rstat information of the fragment; rstat upkeep magic
442 * assumes version 0 is stale/invalid.
443 */
7c673cae
FG
444
445 rootdir->mark_complete();
446 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
447 rootdir->commit(0, gather->new_sub());
448
28e407b8
AA
449 root->mark_clean();
450 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
451 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
452 root->flush(gather->new_sub());
7c673cae
FG
453}
454
455void MDCache::create_mydir_hierarchy(MDSGather *gather)
456{
457 // create mds dir
458 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
459
460 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
461 adjust_subtree_auth(mydir, mds->get_nodeid());
462
463 LogSegment *ls = mds->mdlog->get_current_segment();
464
465 // stray dir
466 for (int i = 0; i < NUM_STRAY; ++i) {
467 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
468 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
469 stringstream name;
470 name << "stray" << i;
471 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
472 sdn->_mark_dirty(mds->mdlog->get_current_segment());
473
474 stray->inode.dirstat = straydir->fnode.fragstat;
475
476 mydir->fnode.rstat.add(stray->inode.rstat);
477 mydir->fnode.fragstat.nsubdirs++;
478 // save them
479 straydir->mark_complete();
480 straydir->mark_dirty(straydir->pre_dirty(), ls);
481 straydir->commit(0, gather->new_sub());
28e407b8 482 stray->mark_dirty_parent(ls, true);
7c673cae
FG
483 stray->store_backtrace(gather->new_sub());
484 }
485
486 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
487 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
488
489 myin->inode.dirstat = mydir->fnode.fragstat;
490 myin->inode.rstat = mydir->fnode.rstat;
491 ++myin->inode.rstat.rsubdirs;
492 myin->inode.accounted_rstat = myin->inode.rstat;
493
494 mydir->mark_complete();
495 mydir->mark_dirty(mydir->pre_dirty(), ls);
496 mydir->commit(0, gather->new_sub());
497
498 myin->store(gather->new_sub());
499}
500
501struct C_MDC_CreateSystemFile : public MDCacheLogContext {
502 MutationRef mut;
503 CDentry *dn;
504 version_t dpv;
11fdf7f2
TL
505 MDSContext *fin;
506 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
7c673cae
FG
507 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
508 void finish(int r) override {
509 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
510 }
511};
512
11fdf7f2 513void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
7c673cae
FG
514{
515 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
516 CDentry *dn = dir->add_null_dentry(name);
517
518 dn->push_projected_linkage(in);
519 version_t dpv = dn->pre_dirty();
520
521 CDir *mdir = 0;
522 if (in->inode.is_dir()) {
523 in->inode.rstat.rsubdirs = 1;
524
525 mdir = in->get_or_open_dirfrag(this, frag_t());
526 mdir->mark_complete();
527 mdir->pre_dirty();
528 } else
529 in->inode.rstat.rfiles = 1;
530 in->inode.version = dn->pre_dirty();
531
532 SnapRealm *realm = dir->get_inode()->find_snaprealm();
533 dn->first = in->first = realm->get_newest_seq() + 1;
534
535 MutationRef mut(new MutationImpl());
536
537 // force some locks. hacky.
538 mds->locker->wrlock_force(&dir->inode->filelock, mut);
539 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
540
541 mut->ls = mds->mdlog->get_current_segment();
542 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
543 mds->mdlog->start_entry(le);
544
545 if (!in->is_mdsdir()) {
546 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
547 le->metablob.add_primary_dentry(dn, in, true);
548 } else {
549 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
550 journal_dirty_inode(mut.get(), &le->metablob, in);
551 dn->push_projected_linkage(in->ino(), in->d_type());
552 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
553 le->metablob.add_root(true, in);
554 }
555 if (mdir)
556 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
557
558 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
559 mds->mdlog->flush();
560}
561
11fdf7f2 562void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
7c673cae
FG
563{
564 dout(10) << "_create_system_file_finish " << *dn << dendl;
565
566 dn->pop_projected_linkage();
567 dn->mark_dirty(dpv, mut->ls);
568
569 CInode *in = dn->get_linkage()->get_inode();
570 in->inode.version--;
571 in->mark_dirty(in->inode.version + 1, mut->ls);
572
573 if (in->inode.is_dir()) {
574 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 575 ceph_assert(dir);
7c673cae
FG
576 dir->mark_dirty(1, mut->ls);
577 dir->mark_new(mut->ls);
578 }
579
580 mut->apply();
581 mds->locker->drop_locks(mut.get());
582 mut->cleanup();
583
584 fin->complete(0);
585
586 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
587 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
588}
589
590
591
592struct C_MDS_RetryOpenRoot : public MDSInternalContext {
593 MDCache *cache;
594 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
595 void finish(int r) override {
596 if (r < 0) {
597 // If we can't open root, something disastrous has happened: mark
598 // this rank damaged for operator intervention. Note that
599 // it is not okay to call suicide() here because we are in
600 // a Finisher callback.
601 cache->mds->damaged();
602 ceph_abort(); // damaged should never return
603 } else {
604 cache->open_root();
605 }
606 }
607};
608
11fdf7f2 609void MDCache::open_root_inode(MDSContext *c)
7c673cae
FG
610{
611 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
612 CInode *in;
ec96510d 613 in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
7c673cae
FG
614 in->fetch(c);
615 } else {
ec96510d 616 discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
7c673cae
FG
617 }
618}
619
11fdf7f2 620void MDCache::open_mydir_inode(MDSContext *c)
7c673cae 621{
7c673cae 622 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
11fdf7f2 623 in->fetch(c);
7c673cae
FG
624}
625
11fdf7f2 626void MDCache::open_mydir_frag(MDSContext *c)
28e407b8
AA
627{
628 open_mydir_inode(
629 new MDSInternalContextWrapper(mds,
9f95a23c 630 new LambdaContext([this, c](int r) {
28e407b8
AA
631 if (r < 0) {
632 c->complete(r);
633 return;
634 }
635 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 636 ceph_assert(mydir);
28e407b8
AA
637 adjust_subtree_auth(mydir, mds->get_nodeid());
638 mydir->fetch(c);
639 })
640 )
641 );
642}
643
7c673cae
FG
644void MDCache::open_root()
645{
646 dout(10) << "open_root" << dendl;
647
648 if (!root) {
649 open_root_inode(new C_MDS_RetryOpenRoot(this));
650 return;
651 }
652 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
11fdf7f2 653 ceph_assert(root->is_auth());
7c673cae 654 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
11fdf7f2 655 ceph_assert(rootdir);
7c673cae
FG
656 if (!rootdir->is_subtree_root())
657 adjust_subtree_auth(rootdir, mds->get_nodeid());
658 if (!rootdir->is_complete()) {
659 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
660 return;
661 }
662 } else {
11fdf7f2 663 ceph_assert(!root->is_auth());
7c673cae
FG
664 CDir *rootdir = root->get_dirfrag(frag_t());
665 if (!rootdir) {
224ce89b 666 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
7c673cae
FG
667 return;
668 }
669 }
670
671 if (!myin) {
672 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
673 in->fetch(new C_MDS_RetryOpenRoot(this));
674 return;
675 }
676 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 677 ceph_assert(mydir);
7c673cae
FG
678 adjust_subtree_auth(mydir, mds->get_nodeid());
679
680 populate_mydir();
681}
682
683void MDCache::populate_mydir()
684{
11fdf7f2 685 ceph_assert(myin);
7c673cae 686 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 687 ceph_assert(mydir);
7c673cae
FG
688
689 dout(10) << "populate_mydir " << *mydir << dendl;
690
691 if (!mydir->is_complete()) {
692 mydir->fetch(new C_MDS_RetryOpenRoot(this));
693 return;
694 }
695
696 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
697 // A missing dirfrag: we will recreate it, and we must mark it dirty
698 // before dirtying any of the strays we create within it.
699 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
700 "recreating it now";
701 LogSegment *ls = mds->mdlog->get_current_segment();
702 mydir->state_clear(CDir::STATE_BADFRAG);
703 mydir->mark_complete();
704 mydir->mark_dirty(mydir->pre_dirty(), ls);
705 }
706
707 // open or create stray
708 uint64_t num_strays = 0;
709 for (int i = 0; i < NUM_STRAY; ++i) {
710 stringstream name;
711 name << "stray" << i;
712 CDentry *straydn = mydir->lookup(name.str());
713
714 // allow for older fs's with stray instead of stray0
715 if (straydn == NULL && i == 0)
716 straydn = mydir->lookup("stray");
717
718 if (!straydn || !straydn->get_linkage()->get_inode()) {
719 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
720 new C_MDS_RetryOpenRoot(this));
721 return;
722 }
11fdf7f2
TL
723 ceph_assert(straydn);
724 ceph_assert(strays[i]);
7c673cae
FG
725 // we make multiple passes through this method; make sure we only pin each stray once.
726 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
727 strays[i]->get(CInode::PIN_STRAY);
728 strays[i]->state_set(CInode::STATE_STRAYPINNED);
729 strays[i]->get_stickydirs();
730 }
731 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
732
733 // open all frags
11fdf7f2
TL
734 frag_vec_t leaves;
735 strays[i]->dirfragtree.get_leaves(leaves);
736 for (const auto& leaf : leaves) {
737 CDir *dir = strays[i]->get_dirfrag(leaf);
7c673cae 738 if (!dir) {
11fdf7f2 739 dir = strays[i]->get_or_open_dirfrag(this, leaf);
7c673cae
FG
740 }
741
742 // DamageTable applies special handling to strays: it will
743 // have damaged() us out if one is damaged.
11fdf7f2 744 ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
7c673cae
FG
745
746 if (dir->get_version() == 0) {
747 dir->fetch(new C_MDS_RetryOpenRoot(this));
748 return;
749 }
750
751 if (dir->get_frag_size() > 0)
752 num_strays += dir->get_frag_size();
753 }
754 }
755
7c673cae
FG
756 // okay!
757 dout(10) << "populate_mydir done" << dendl;
11fdf7f2 758 ceph_assert(!open);
7c673cae
FG
759 open = true;
760 mds->queue_waiters(waiting_for_open);
761
11fdf7f2
TL
762 stray_manager.set_num_strays(num_strays);
763 stray_manager.activate();
764
7c673cae
FG
765 scan_stray_dir();
766}
767
11fdf7f2 768void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
7c673cae
FG
769{
770 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
771}
772
773CDir *MDCache::get_stray_dir(CInode *in)
774{
775 string straydname;
776 in->name_stray_dentry(straydname);
777
778 CInode *strayi = get_stray();
11fdf7f2 779 ceph_assert(strayi);
7c673cae
FG
780 frag_t fg = strayi->pick_dirfrag(straydname);
781 CDir *straydir = strayi->get_dirfrag(fg);
11fdf7f2 782 ceph_assert(straydir);
7c673cae
FG
783 return straydir;
784}
785
786CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
787{
788 CDir *straydir = get_stray_dir(in);
789 string straydname;
790 in->name_stray_dentry(straydname);
791 CDentry *straydn = straydir->lookup(straydname);
792 if (!straydn) {
793 straydn = straydir->add_null_dentry(straydname);
794 straydn->mark_new();
795 } else {
11fdf7f2 796 ceph_assert(straydn->get_projected_linkage()->is_null());
7c673cae
FG
797 }
798
799 straydn->state_set(CDentry::STATE_STRAY);
800 return straydn;
801}
802
803
804
11fdf7f2 805MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
7c673cae
FG
806{
807 // inode?
808 if (info.ino)
809 return get_inode(info.ino, info.snapid);
810
811 // dir or dentry.
812 CDir *dir = get_dirfrag(info.dirfrag);
813 if (!dir) return 0;
814
815 if (info.dname.length())
816 return dir->lookup(info.dname, info.snapid);
817 else
818 return dir;
819}
820
821
f6b5b4d7
TL
822// ====================================================================
823// consistent hash ring
824
825/*
826 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
827*/
828mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino)
829{
830 const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
831 uint64_t hash = rjhash64(ino);
832 int64_t b = -1, j = 0;
833 while (j < max_mds) {
834 b = j;
835 hash = hash*2862933555777941757ULL + 1;
836 j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
837 }
838 // verify bounds before returning
839 auto result = mds_rank_t(b);
840 ceph_assert(result >= 0 && result < max_mds);
841 return result;
842}
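// Editorial sketch (not part of the original file): the loop above is Lamping
// & Veach's Jump Consistent Hash. A minimal standalone version follows so the
// algorithm can be read in isolation; the function name jump_consistent_hash
// and the num_buckets parameter are illustrative, not MDCache API (it needs
// only <cstdint>, which is available here transitively).
static inline int32_t jump_consistent_hash(uint64_t key, int32_t num_buckets)
{
  int64_t b = -1, j = 0;
  while (j < num_buckets) {
    b = j;
    key = key * 2862933555777941757ULL + 1;                      // 64-bit LCG step
    j = (b + 1) * (double(1LL << 31) / double((key >> 33) + 1));
  }
  return static_cast<int32_t>(b);  // for num_buckets >= 1: 0 <= result < num_buckets
}
// When the bucket count grows from n to n+1 only ~1/(n+1) of keys move, which
// is why it suits spreading inodes across a changing set of active MDS ranks.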
7c673cae
FG
843
844
845// ====================================================================
846// subtree management
847
7c673cae
FG
848/*
849 * adjust the dir_auth of a subtree.
850 * merge with parent and/or child subtrees, if it is appropriate.
851 * merge can ONLY happen if both parent and child have unambiguous auth.
852 */
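// Editorial note (illustrative, not from the original file): the bookkeeping
// adjusted below is, conceptually, a map from each subtree root dirfrag to the
// set of dirfrags bounding it, roughly
//   std::map<CDir*, std::set<CDir*>> subtrees;   // subtree root -> bounds
// (the real member is declared in MDCache.h). Making `dir` a new subtree root
// therefore means: give it an empty bound set, steal from its old root any
// bounds that now fall beneath `dir`, and record `dir` itself as a bound of
// that old root -- the steps performed in the function body below.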
28e407b8 853void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
7c673cae
FG
854{
855 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
856 << " on " << *dir << dendl;
857
7c673cae
FG
858 show_subtrees();
859
860 CDir *root;
861 if (dir->inode->is_base()) {
862 root = dir; // bootstrap hack.
863 if (subtrees.count(root) == 0) {
864 subtrees[root];
865 root->get(CDir::PIN_SUBTREE);
866 }
867 } else {
868 root = get_subtree_root(dir); // subtree root
869 }
11fdf7f2
TL
870 ceph_assert(root);
871 ceph_assert(subtrees.count(root));
7c673cae
FG
872 dout(7) << " current root is " << *root << dendl;
873
874 if (root == dir) {
875 // i am already a subtree.
876 dir->set_dir_auth(auth);
877 } else {
878 // i am a new subtree.
879 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 880 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
881 subtrees[dir]; // create empty subtree bounds list for me.
882 dir->get(CDir::PIN_SUBTREE);
883
884 // set dir_auth
885 dir->set_dir_auth(auth);
886
887 // move items nested beneath me, under me.
888 set<CDir*>::iterator p = subtrees[root].begin();
889 while (p != subtrees[root].end()) {
890 set<CDir*>::iterator next = p;
891 ++next;
892 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
893 // move under me
894 dout(10) << " claiming child bound " << **p << dendl;
895 subtrees[dir].insert(*p);
896 subtrees[root].erase(p);
897 }
898 p = next;
899 }
900
901 // i am a bound of the parent subtree.
902 subtrees[root].insert(dir);
903
904 // i am now the subtree root.
905 root = dir;
906
907 // adjust recursive pop counters
28e407b8 908 if (adjust_pop && dir->is_auth()) {
7c673cae
FG
909 CDir *p = dir->get_parent_dir();
910 while (p) {
11fdf7f2 911 p->pop_auth_subtree.sub(dir->pop_auth_subtree);
7c673cae
FG
912 if (p->is_subtree_root()) break;
913 p = p->inode->get_parent_dir();
914 }
915 }
7c673cae
FG
916 }
917
f6b5b4d7
TL
918 if (dir->is_auth()) {
919 /* do this now that we are auth for the CDir */
920 dir->inode->maybe_pin();
921 }
922
7c673cae
FG
923 show_subtrees();
924}
925
926
927void MDCache::try_subtree_merge(CDir *dir)
928{
929 dout(7) << "try_subtree_merge " << *dir << dendl;
b32b8144
FG
930 // record my old bounds
931 auto oldbounds = subtrees.at(dir);
7c673cae 932
224ce89b 933 set<CInode*> to_eval;
7c673cae 934 // try merge at my root
224ce89b 935 try_subtree_merge_at(dir, &to_eval);
7c673cae
FG
936
937 // try merge at my old bounds
224ce89b
WB
938 for (auto bound : oldbounds)
939 try_subtree_merge_at(bound, &to_eval);
940
941 if (!(mds->is_any_replay() || mds->is_resolve())) {
942 for(auto in : to_eval)
943 eval_subtree_root(in);
944 }
7c673cae
FG
945}
946
947class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
948 CInode *in;
949 MutationRef mut;
950public:
951 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
952 void finish(int r) override {
953 mdcache->subtree_merge_writebehind_finish(in, mut);
954 }
955};
956
28e407b8 957void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
7c673cae
FG
958{
959 dout(10) << "try_subtree_merge_at " << *dir << dendl;
b32b8144
FG
960
961 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
962 dir->state_test(CDir::STATE_EXPORTBOUND) ||
963 dir->state_test(CDir::STATE_AUXSUBTREE))
964 return;
965
966 auto it = subtrees.find(dir);
11fdf7f2 967 ceph_assert(it != subtrees.end());
7c673cae 968
7c673cae
FG
969 // merge with parent?
970 CDir *parent = dir;
971 if (!dir->inode->is_base())
972 parent = get_subtree_root(dir->get_parent_dir());
973
b32b8144
FG
974 if (parent != dir && // we have a parent,
975 parent->dir_auth == dir->dir_auth) { // auth matches,
7c673cae
FG
976 // merge with parent.
977 dout(10) << " subtree merge at " << *dir << dendl;
978 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
979
980 // move our bounds under the parent
b32b8144 981 subtrees[parent].insert(it->second.begin(), it->second.end());
7c673cae
FG
982
983 // we are no longer a subtree or bound
984 dir->put(CDir::PIN_SUBTREE);
b32b8144 985 subtrees.erase(it);
7c673cae
FG
986 subtrees[parent].erase(dir);
987
988 // adjust popularity?
28e407b8 989 if (adjust_pop && dir->is_auth()) {
28e407b8 990 CDir *cur = dir;
7c673cae
FG
991 CDir *p = dir->get_parent_dir();
992 while (p) {
11fdf7f2 993 p->pop_auth_subtree.add(dir->pop_auth_subtree);
28e407b8 994 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
7c673cae 995 if (p->is_subtree_root()) break;
28e407b8 996 cur = p;
7c673cae
FG
997 p = p->inode->get_parent_dir();
998 }
999 }
1000
224ce89b
WB
1001 if (to_eval && dir->get_inode()->is_auth())
1002 to_eval->insert(dir->get_inode());
7c673cae 1003
181888fb
FG
1004 show_subtrees(15);
1005 }
7c673cae
FG
1006}
1007
1008void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
1009{
1010 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
1011 in->pop_and_dirty_projected_inode(mut->ls);
1012
1013 mut->apply();
1014 mds->locker->drop_locks(mut.get());
1015 mut->cleanup();
1016
1017 in->auth_unpin(this);
1018}
1019
1020void MDCache::eval_subtree_root(CInode *diri)
1021{
1022 // evaluate subtree inode filelock?
1023 // (we should scatter the filelock on subtree bounds)
11fdf7f2 1024 ceph_assert(diri->is_auth());
224ce89b 1025 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
7c673cae
FG
1026}
1027
1028
11fdf7f2 1029void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
7c673cae
FG
1030{
1031 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1032 << " on " << *dir
1033 << " bounds " << bounds
1034 << dendl;
1035
1036 show_subtrees();
1037
1038 CDir *root;
ec96510d 1039 if (dir->ino() == CEPH_INO_ROOT) {
7c673cae
FG
1040 root = dir; // bootstrap hack.
1041 if (subtrees.count(root) == 0) {
1042 subtrees[root];
1043 root->get(CDir::PIN_SUBTREE);
1044 }
1045 } else {
1046 root = get_subtree_root(dir); // subtree root
1047 }
11fdf7f2
TL
1048 ceph_assert(root);
1049 ceph_assert(subtrees.count(root));
7c673cae
FG
1050 dout(7) << " current root is " << *root << dendl;
1051
1052 mds_authority_t oldauth = dir->authority();
1053
1054 if (root == dir) {
1055 // i am already a subtree.
1056 dir->set_dir_auth(auth);
1057 } else {
1058 // i am a new subtree.
1059 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 1060 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
1061 subtrees[dir]; // create empty subtree bounds list for me.
1062 dir->get(CDir::PIN_SUBTREE);
1063
1064 // set dir_auth
1065 dir->set_dir_auth(auth);
1066
1067 // move items nested beneath me, under me.
1068 set<CDir*>::iterator p = subtrees[root].begin();
1069 while (p != subtrees[root].end()) {
1070 set<CDir*>::iterator next = p;
1071 ++next;
1072 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1073 // move under me
1074 dout(10) << " claiming child bound " << **p << dendl;
1075 subtrees[dir].insert(*p);
1076 subtrees[root].erase(p);
1077 }
1078 p = next;
1079 }
1080
1081 // i am a bound of the parent subtree.
1082 subtrees[root].insert(dir);
1083
1084 // i am now the subtree root.
1085 root = dir;
1086 }
1087
224ce89b
WB
1088 set<CInode*> to_eval;
1089
7c673cae
FG
1090 // verify/adjust bounds.
1091 // - these may be new, or
1092 // - beneath existing ambiguous bounds (which will be collapsed),
1093 // - but NOT beneath unambiguous bounds.
11fdf7f2 1094 for (const auto& bound : bounds) {
7c673cae
FG
1095 // new bound?
1096 if (subtrees[dir].count(bound) == 0) {
1097 if (get_subtree_root(bound) == dir) {
1098 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1099 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1100 }
1101 else {
1102 dout(10) << " want bound " << *bound << dendl;
1103 CDir *t = get_subtree_root(bound->get_parent_dir());
1104 if (subtrees[t].count(bound) == 0) {
11fdf7f2 1105 ceph_assert(t != dir);
7c673cae
FG
1106 dout(10) << " new bound " << *bound << dendl;
1107 adjust_subtree_auth(bound, t->authority());
1108 }
1109 // make sure it's nested beneath ambiguous subtree(s)
1110 while (1) {
1111 while (subtrees[dir].count(t) == 0)
1112 t = get_subtree_root(t->get_parent_dir());
1113 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1114 adjust_subtree_auth(t, auth);
224ce89b 1115 try_subtree_merge_at(t, &to_eval);
7c673cae
FG
1116 t = get_subtree_root(bound->get_parent_dir());
1117 if (t == dir) break;
1118 }
1119 }
1120 }
1121 else {
1122 dout(10) << " already have bound " << *bound << dendl;
1123 }
1124 }
1125 // merge stray bounds?
1126 while (!subtrees[dir].empty()) {
1127 set<CDir*> copy = subtrees[dir];
1128 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1129 if (bounds.count(*p) == 0) {
1130 CDir *stray = *p;
1131 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1132 adjust_subtree_auth(stray, auth);
224ce89b 1133 try_subtree_merge_at(stray, &to_eval);
7c673cae
FG
1134 }
1135 }
1136 // swallowing subtree may add new subtree bounds
1137 if (copy == subtrees[dir])
1138 break;
1139 }
1140
1141 // bound should now match.
1142 verify_subtree_bounds(dir, bounds);
1143
1144 show_subtrees();
224ce89b
WB
1145
1146 if (!(mds->is_any_replay() || mds->is_resolve())) {
1147 for(auto in : to_eval)
1148 eval_subtree_root(in);
1149 }
7c673cae
FG
1150}
1151
1152
1153/*
1154 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1155 * fragmentation as necessary to get an equivalent bounding set. That is, only
1156 * split if one of our frags spans the provided bounding set. Never merge.
1157 */
11fdf7f2 1158void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
7c673cae
FG
1159{
1160 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1161
1162 // sort by ino
1163 map<inodeno_t, fragset_t> byino;
11fdf7f2 1164 for (auto& frag : dfs) {
9f95a23c 1165 byino[frag.ino].insert_raw(frag.frag);
11fdf7f2 1166 }
7c673cae
FG
1167 dout(10) << " by ino: " << byino << dendl;
1168
1169 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
9f95a23c 1170 p->second.simplify();
7c673cae
FG
1171 CInode *diri = get_inode(p->first);
1172 if (!diri)
1173 continue;
1174 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1175
1176 fragtree_t tmpdft;
1177 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1178 tmpdft.force_to_leaf(g_ceph_context, *q);
1179
11fdf7f2
TL
1180 for (const auto& fg : p->second) {
1181 frag_vec_t leaves;
1182 diri->dirfragtree.get_leaves_under(fg, leaves);
1183 if (leaves.empty()) {
7c673cae
FG
1184 bool all = true;
1185 frag_t approx_fg = diri->dirfragtree[fg.value()];
11fdf7f2
TL
1186 frag_vec_t approx_leaves;
1187 tmpdft.get_leaves_under(approx_fg, approx_leaves);
1188 for (const auto& leaf : approx_leaves) {
1189 if (p->second.get().count(leaf) == 0) {
7c673cae 1190 // not bound, so the resolve message is from auth MDS of the dirfrag
11fdf7f2 1191 force_dir_fragment(diri, leaf);
7c673cae
FG
1192 all = false;
1193 }
1194 }
1195 if (all)
11fdf7f2 1196 leaves.push_back(approx_fg);
7c673cae 1197 else
11fdf7f2 1198 diri->dirfragtree.get_leaves_under(fg, leaves);
7c673cae 1199 }
11fdf7f2
TL
1200 dout(10) << " frag " << fg << " contains " << leaves << dendl;
1201 for (const auto& leaf : leaves) {
1202 CDir *dir = diri->get_dirfrag(leaf);
7c673cae
FG
1203 if (dir)
1204 bounds.insert(dir);
1205 }
1206 }
1207 }
1208}
1209
11fdf7f2 1210void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
7c673cae
FG
1211{
1212 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1213 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1214
1215 set<CDir*> bounds;
1216 get_force_dirfrag_bound_set(bound_dfs, bounds);
1217 adjust_bounded_subtree_auth(dir, bounds, auth);
1218}
1219
11fdf7f2 1220void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
7c673cae
FG
1221{
1222 dout(10) << "map_dirfrag_set " << dfs << dendl;
1223
1224 // group by inode
1225 map<inodeno_t, fragset_t> ino_fragset;
11fdf7f2 1226 for (const auto &df : dfs) {
9f95a23c 1227 ino_fragset[df.ino].insert_raw(df.frag);
11fdf7f2 1228 }
7c673cae
FG
1229 // get frags
1230 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1231 p != ino_fragset.end();
1232 ++p) {
9f95a23c 1233 p->second.simplify();
7c673cae
FG
1234 CInode *in = get_inode(p->first);
1235 if (!in)
1236 continue;
1237
11fdf7f2
TL
1238 frag_vec_t fgs;
1239 for (const auto& fg : p->second) {
1240 in->dirfragtree.get_leaves_under(fg, fgs);
1241 }
7c673cae 1242
11fdf7f2 1243 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
7c673cae
FG
1244 << " on " << *in << dendl;
1245
11fdf7f2
TL
1246 for (const auto& fg : fgs) {
1247 CDir *dir = in->get_dirfrag(fg);
7c673cae
FG
1248 if (dir)
1249 result.insert(dir);
1250 }
1251 }
1252}
1253
1254
1255
1256CDir *MDCache::get_subtree_root(CDir *dir)
1257{
1258 // find the underlying dir that delegates (or is about to delegate) auth
1259 while (true) {
1260 if (dir->is_subtree_root())
1261 return dir;
1262 dir = dir->get_inode()->get_parent_dir();
1263 if (!dir)
1264 return 0; // none
1265 }
1266}
1267
1268CDir *MDCache::get_projected_subtree_root(CDir *dir)
1269{
1270 // find the underlying dir that delegates (or is about to delegate) auth
1271 while (true) {
1272 if (dir->is_subtree_root())
1273 return dir;
1274 dir = dir->get_inode()->get_projected_parent_dir();
1275 if (!dir)
1276 return 0; // none
1277 }
1278}
1279
1280void MDCache::remove_subtree(CDir *dir)
1281{
1282 dout(10) << "remove_subtree " << *dir << dendl;
f6b5b4d7
TL
1283 auto it = subtrees.find(dir);
1284 ceph_assert(it != subtrees.end());
1285 subtrees.erase(it);
7c673cae
FG
1286 dir->put(CDir::PIN_SUBTREE);
1287 if (dir->get_parent_dir()) {
1288 CDir *p = get_subtree_root(dir->get_parent_dir());
f6b5b4d7
TL
1289 auto it = subtrees.find(p);
1290 ceph_assert(it != subtrees.end());
1291 auto count = it->second.erase(dir);
1292 ceph_assert(count == 1);
7c673cae
FG
1293 }
1294}
1295
1296void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1297{
11fdf7f2 1298 ceph_assert(subtrees.count(dir));
7c673cae
FG
1299 bounds = subtrees[dir];
1300}
1301
1302void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1303{
1304 if (subtrees.count(dir)) {
1305 // just copy them, dir is a subtree.
1306 get_subtree_bounds(dir, bounds);
1307 } else {
1308 // find them
1309 CDir *root = get_subtree_root(dir);
1310 for (set<CDir*>::iterator p = subtrees[root].begin();
1311 p != subtrees[root].end();
1312 ++p) {
1313 CDir *t = *p;
1314 while (t != root) {
1315 t = t->get_parent_dir();
11fdf7f2 1316 ceph_assert(t);
7c673cae
FG
1317 if (t == dir) {
1318 bounds.insert(*p);
1319 continue;
1320 }
1321 }
1322 }
1323 }
1324}
1325
1326void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1327{
1328 // for debugging only.
11fdf7f2 1329 ceph_assert(subtrees.count(dir));
7c673cae
FG
1330 if (bounds != subtrees[dir]) {
1331 dout(0) << "verify_subtree_bounds failed" << dendl;
1332 set<CDir*> b = bounds;
1333 for (auto &cd : subtrees[dir]) {
1334 if (bounds.count(cd)) {
1335 b.erase(cd);
1336 continue;
1337 }
1338 dout(0) << " missing bound " << *cd << dendl;
1339 }
1340 for (const auto &cd : b)
1341 dout(0) << " extra bound " << *cd << dendl;
1342 }
11fdf7f2 1343 ceph_assert(bounds == subtrees[dir]);
7c673cae
FG
1344}
1345
1346void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1347{
1348 // for debugging only.
11fdf7f2 1349 ceph_assert(subtrees.count(dir));
7c673cae
FG
1350
1351 // make sure that any bounds i do have are properly noted as such.
1352 int failed = 0;
1353 for (const auto &fg : bounds) {
1354 CDir *bd = get_dirfrag(fg);
1355 if (!bd) continue;
1356 if (subtrees[dir].count(bd) == 0) {
1357 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1358 failed++;
1359 }
1360 }
11fdf7f2 1361 ceph_assert(failed == 0);
7c673cae
FG
1362}
1363
1364void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1365{
1366 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1367 << " to " << *newdir << dendl;
1368 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1369}
1370
224ce89b 1371void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
7c673cae
FG
1372{
1373 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1374
7c673cae
FG
1375 CDir *newdir = diri->get_parent_dir();
1376
1377 if (pop) {
1378 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
11fdf7f2
TL
1379 ceph_assert(p != projected_subtree_renames.end());
1380 ceph_assert(!p->second.empty());
1381 ceph_assert(p->second.front().first == olddir);
1382 ceph_assert(p->second.front().second == newdir);
7c673cae
FG
1383 p->second.pop_front();
1384 if (p->second.empty())
1385 projected_subtree_renames.erase(p);
1386 }
1387
11fdf7f2
TL
1388 // adjust total auth pin of freezing subtree
1389 if (olddir != newdir) {
9f95a23c
TL
1390 auto&& dfls = diri->get_nested_dirfrags();
1391 for (const auto& dir : dfls)
11fdf7f2 1392 olddir->adjust_freeze_after_rename(dir);
11fdf7f2
TL
1393 }
1394
7c673cae 1395 // adjust subtree
9f95a23c
TL
1396 // N.B. make sure subtree dirfrags are at the front of the list
1397 auto dfls = diri->get_subtree_dirfrags();
7c673cae 1398 diri->get_nested_dirfrags(dfls);
9f95a23c 1399 for (const auto& dir : dfls) {
7c673cae
FG
1400 dout(10) << "dirfrag " << *dir << dendl;
1401 CDir *oldparent = get_subtree_root(olddir);
1402 dout(10) << " old parent " << *oldparent << dendl;
1403 CDir *newparent = get_subtree_root(newdir);
1404 dout(10) << " new parent " << *newparent << dendl;
1405
9f95a23c
TL
1406 auto& oldbounds = subtrees[oldparent];
1407 auto& newbounds = subtrees[newparent];
1408
28e407b8 1409 if (olddir != newdir)
11fdf7f2 1410 mds->balancer->adjust_pop_for_rename(olddir, dir, false);
28e407b8 1411
7c673cae
FG
1412 if (oldparent == newparent) {
1413 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
28e407b8 1414 } else if (dir->is_subtree_root()) {
7c673cae
FG
1415 // children are fine. change parent.
1416 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
9f95a23c
TL
1417 {
1418 auto n = oldbounds.erase(dir);
1419 ceph_assert(n == 1);
1420 }
1421 newbounds.insert(dir);
224ce89b 1422 // caller is responsible for 'eval diri'
28e407b8 1423 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1424 } else {
1425 // mid-subtree.
1426
1427 // see if any old bounds move to the new parent.
9f95a23c
TL
1428 std::vector<CDir*> tomove;
1429 for (const auto& bound : oldbounds) {
7c673cae
FG
1430 CDir *broot = get_subtree_root(bound->get_parent_dir());
1431 if (broot != oldparent) {
11fdf7f2 1432 ceph_assert(broot == newparent);
7c673cae
FG
1433 tomove.push_back(bound);
1434 }
1435 }
9f95a23c 1436 for (const auto& bound : tomove) {
7c673cae 1437 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
9f95a23c
TL
1438 oldbounds.erase(bound);
1439 newbounds.insert(bound);
7c673cae
FG
1440 }
1441
1442 // did auth change?
1443 if (oldparent->authority() != newparent->authority()) {
28e407b8 1444 adjust_subtree_auth(dir, oldparent->authority(), false);
224ce89b 1445 // caller is responsible for 'eval diri'
28e407b8 1446 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1447 }
1448 }
28e407b8
AA
1449
1450 if (olddir != newdir)
11fdf7f2 1451 mds->balancer->adjust_pop_for_rename(newdir, dir, true);
7c673cae
FG
1452 }
1453
1454 show_subtrees();
1455}
1456
7c673cae
FG
1457// ===================================
1458// journal and snap/cow helpers
1459
1460
1461/*
1462 * find first inode in cache that follows given snapid. otherwise, return current.
1463 */
1464CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1465{
1466 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
11fdf7f2 1467 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae 1468
b32b8144
FG
1469 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1470 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1471 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1472 in = p->second;
7c673cae 1473 }
b32b8144 1474
7c673cae
FG
1475 return in;
1476}
1477
1478
1479/*
1480 * note: i'm currently cheating wrt dirty and inode.version on cow
1481 * items. instead of doing a full dir predirty, i just take the
1482 * original item's version, and set the dirty flag (via
1483 * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1484 * means a special case in the dir commit clean sweep assertions.
1485 * bah.
1486 */
1487CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1488{
11fdf7f2 1489 ceph_assert(last >= in->first);
7c673cae 1490
b32b8144 1491 CInode *oldin = new CInode(this, true, in->first, last);
7c673cae 1492 oldin->inode = *in->get_previous_projected_inode();
7c673cae 1493 oldin->xattrs = *in->get_previous_projected_xattrs();
11fdf7f2 1494 oldin->symlink = in->symlink;
7c673cae
FG
1495 oldin->inode.trim_client_ranges(last);
1496
1497 if (in->first < in->oldest_snap)
1498 in->oldest_snap = in->first;
1499
1500 in->first = last+1;
1501
1502 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1503 add_inode(oldin);
1504
1505 if (in->last != CEPH_NOSNAP) {
1506 CInode *head_in = get_inode(in->ino());
11fdf7f2 1507 ceph_assert(head_in);
494da23a
TL
1508 auto ret = head_in->split_need_snapflush(oldin, in);
1509 if (ret.first) {
7c673cae 1510 oldin->client_snap_caps = in->client_snap_caps;
eafe8130
TL
1511 if (!oldin->client_snap_caps.empty()) {
1512 for (int i = 0; i < num_cinode_locks; i++) {
1513 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1514 ceph_assert(lock);
494da23a
TL
1515 if (lock->get_state() != LOCK_SNAP_SYNC) {
1516 ceph_assert(lock->is_stable());
1517 lock->set_state(LOCK_SNAP_SYNC); // gathering
1518 oldin->auth_pin(lock);
1519 }
7c673cae
FG
1520 lock->get_wrlock(true);
1521 }
1522 }
1523 }
494da23a
TL
1524 if (!ret.second) {
1525 auto client_snap_caps = std::move(in->client_snap_caps);
1526 in->client_snap_caps.clear();
1527 in->item_open_file.remove_myself();
1528 in->item_caps.remove_myself();
eafe8130
TL
1529
1530 if (!client_snap_caps.empty()) {
1531 MDSContext::vec finished;
1532 for (int i = 0; i < num_cinode_locks; i++) {
1533 SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
1534 ceph_assert(lock);
1535 ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
494da23a 1536 lock->put_wrlock();
eafe8130
TL
1537 if (!lock->get_num_wrlocks()) {
1538 lock->set_state(LOCK_SYNC);
1539 lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
1540 in->auth_unpin(lock);
1541 }
494da23a 1542 }
eafe8130 1543 mds->queue_waiters(finished);
494da23a
TL
1544 }
1545 }
7c673cae
FG
1546 return oldin;
1547 }
1548
b32b8144
FG
1549 if (!in->client_caps.empty()) {
1550 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1551 // clone caps?
94b18763 1552 for (auto &p : in->client_caps) {
b32b8144 1553 client_t client = p.first;
11fdf7f2
TL
1554 Capability *cap = &p.second;
1555 int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
b32b8144
FG
1556 if ((issued & CEPH_CAP_ANY_WR) &&
1557 cap->client_follows < last) {
eafe8130
TL
1558 dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
1559 oldin->client_snap_caps.insert(client);
b32b8144
FG
1560 cap->client_follows = last;
1561
1562 // we need snapflushes for any intervening snaps
1563 dout(10) << " snaps " << snaps << dendl;
1564 for (auto q = snaps.lower_bound(oldin->first);
1565 q != snaps.end() && *q <= last;
1566 ++q) {
1567 in->add_need_snapflush(oldin, *q, client);
1568 }
1569 } else {
1570 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
7c673cae 1571 }
7c673cae 1572 }
eafe8130
TL
1573
1574 if (!oldin->client_snap_caps.empty()) {
1575 for (int i = 0; i < num_cinode_locks; i++) {
1576 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1577 ceph_assert(lock);
1578 if (lock->get_state() != LOCK_SNAP_SYNC) {
1579 ceph_assert(lock->is_stable());
1580 lock->set_state(LOCK_SNAP_SYNC); // gathering
1581 oldin->auth_pin(lock);
1582 }
1583 lock->get_wrlock(true);
1584 }
1585 }
7c673cae 1586 }
7c673cae
FG
1587 return oldin;
1588}
1589
1590void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1591 CDentry *dn, snapid_t follows,
1592 CInode **pcow_inode, CDentry::linkage_t *dnl)
1593{
1594 if (!dn) {
1595 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1596 return;
1597 }
1598 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
11fdf7f2 1599 ceph_assert(dn->is_auth());
7c673cae
FG
1600
1601 // nothing to cow on a null dentry, fix caller
1602 if (!dnl)
1603 dnl = dn->get_projected_linkage();
11fdf7f2 1604 ceph_assert(!dnl->is_null());
7c673cae 1605
11fdf7f2
TL
1606 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1607 bool cow_head = false;
1608 if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
1609 ceph_assert(in->is_frozen_inode());
1610 cow_head = true;
1611 }
1612 if (in && (in->is_multiversion() || cow_head)) {
7c673cae 1613 // multiversion inode.
7c673cae
FG
1614 SnapRealm *realm = NULL;
1615
1616 if (in->get_projected_parent_dn() != dn) {
11fdf7f2 1617 ceph_assert(follows == CEPH_NOSNAP);
7c673cae 1618 realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1619 snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
1620 ceph_assert(dir_follows >= realm->get_newest_seq());
7c673cae
FG
1621
1622 if (dir_follows+1 > dn->first) {
1623 snapid_t oldfirst = dn->first;
1624 dn->first = dir_follows+1;
1625 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
94b18763 1626 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
7c673cae
FG
1627 oldfirst, dir_follows);
1628 olddn->pre_dirty();
1629 dout(10) << " olddn " << *olddn << dendl;
1630 metablob->add_remote_dentry(olddn, true);
1631 mut->add_cow_dentry(olddn);
1632 // FIXME: adjust link count here? hmm.
1633
1634 if (dir_follows+1 > in->first)
11fdf7f2 1635 in->cow_old_inode(dir_follows, cow_head);
7c673cae
FG
1636 }
1637 }
1638
11fdf7f2 1639 follows = dir_follows;
7c673cae
FG
1640 if (in->snaprealm) {
1641 realm = in->snaprealm;
11fdf7f2
TL
1642 ceph_assert(follows >= realm->get_newest_seq());
1643 }
7c673cae
FG
1644 } else {
1645 realm = in->find_snaprealm();
11fdf7f2
TL
1646 if (follows == CEPH_NOSNAP) {
1647 follows = get_global_snaprealm()->get_newest_seq();
1648 ceph_assert(follows >= realm->get_newest_seq());
1649 }
7c673cae
FG
1650 }
1651
1652 // already cloned?
1653 if (follows < in->first) {
1654 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1655 return;
1656 }
1657
1658 if (!realm->has_snaps_in_range(in->first, follows)) {
1659 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1660 in->first = follows + 1;
1661 return;
1662 }
1663
11fdf7f2 1664 in->cow_old_inode(follows, cow_head);
7c673cae
FG
1665
1666 } else {
1667 SnapRealm *realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1668 if (follows == CEPH_NOSNAP) {
1669 follows = get_global_snaprealm()->get_newest_seq();
1670 ceph_assert(follows >= realm->get_newest_seq());
1671 }
7c673cae
FG
1672
1673 // already cloned?
1674 if (follows < dn->first) {
1675 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1676 return;
1677 }
1678
1679 // update dn.first before adding old dentry to cdir's map
1680 snapid_t oldfirst = dn->first;
1681 dn->first = follows+1;
1682
7c673cae
FG
1683 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1684 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1685 if (in)
1686 in->first = follows+1;
1687 return;
1688 }
1689
1690 dout(10) << " dn " << *dn << dendl;
1691 if (in) {
1692 CInode *oldin = cow_inode(in, follows);
1693 mut->add_cow_inode(oldin);
1694 if (pcow_inode)
1695 *pcow_inode = oldin;
11fdf7f2 1696 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
7c673cae
FG
1697 oldin->inode.version = olddn->pre_dirty();
1698 dout(10) << " olddn " << *olddn << dendl;
1699 bool need_snapflush = !oldin->client_snap_caps.empty();
11fdf7f2 1700 if (need_snapflush) {
7c673cae 1701 mut->ls->open_files.push_back(&oldin->item_open_file);
11fdf7f2
TL
1702 mds->locker->mark_need_snapflush_inode(oldin);
1703 }
7c673cae
FG
1704 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1705 mut->add_cow_dentry(olddn);
1706 } else {
11fdf7f2 1707 ceph_assert(dnl->is_remote());
94b18763 1708 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
7c673cae
FG
1709 oldfirst, follows);
1710 olddn->pre_dirty();
1711 dout(10) << " olddn " << *olddn << dendl;
1712 metablob->add_remote_dentry(olddn, true);
1713 mut->add_cow_dentry(olddn);
1714 }
1715 }
1716}
1717
1718
1719void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1720 CInode *in, snapid_t follows,
1721 CInode **pcow_inode)
1722{
1723 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1724 CDentry *dn = in->get_projected_parent_dn();
1725 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1726}
1727
1728void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1729{
1730 if (in->is_base()) {
11fdf7f2 1731 metablob->add_root(true, in);
7c673cae
FG
1732 } else {
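    // for a non-head (snapshotted) inode, everything from in->first onward is
    // this inode itself, so clamp follows to just below its range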
1733 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1734 follows = in->first - 1;
1735 CDentry *dn = in->get_projected_parent_dn();
1736 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1737 journal_cow_dentry(mut, metablob, dn, follows);
1738 if (in->get_projected_inode()->is_backtrace_updated()) {
1739 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1740 in->get_previous_projected_inode()->layout.pool_id;
1741 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1742 } else {
1743 metablob->add_primary_dentry(dn, in, true);
1744 }
1745 }
1746}
1747
1748
1749
1750// nested ---------------------------------------------------------------
1751
1752void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1753 int linkunlink, SnapRealm *prealm)
1754{
1755 CDentry *parentdn = cur->get_projected_parent_dn();
94b18763 1756 CInode::mempool_inode *curi = cur->get_projected_inode();
7c673cae
FG
1757
1758 if (cur->first > first)
1759 first = cur->first;
1760
 1761 dout(10) << "project_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1762 << " " << *cur << dendl;
1763 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1764 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1765
1766 /*
1767 * FIXME. this incompletely propagates rstats to _old_ parents
1768 * (i.e. shortly after a directory rename). but we need full
1769 * blown hard link backpointers to make this work properly...
1770 */
1771 snapid_t floor = parentdn->first;
1772 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1773
1774 if (!prealm)
1775 prealm = parent->inode->find_snaprealm();
1776 const set<snapid_t> snaps = prealm->get_snaps();
1777
1778 if (cur->last != CEPH_NOSNAP) {
11fdf7f2
TL
1779 ceph_assert(cur->dirty_old_rstats.empty());
1780 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
7c673cae
FG
1781 if (q == snaps.end() || *q > cur->last)
1782 return;
1783 }
1784
1785 if (cur->last >= floor) {
1786 bool update = true;
1787 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1788 // rename src inode is not projected in the slave rename prep case. so we should
 1789 // avoid updating the inode.
11fdf7f2
TL
1790 ceph_assert(linkunlink < 0);
1791 ceph_assert(cur->is_frozen_inode());
7c673cae
FG
1792 update = false;
1793 }
11fdf7f2 1794 _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
7c673cae
FG
1795 linkunlink, update);
1796 }
1797
11fdf7f2 1798 if (g_conf()->mds_snap_rstat) {
94b18763
FG
1799 for (const auto &p : cur->dirty_old_rstats) {
1800 auto &old = cur->old_inodes[p];
1801 snapid_t ofirst = std::max(old.first, floor);
1802 auto it = snaps.lower_bound(ofirst);
1803 if (it == snaps.end() || *it > p)
7c673cae 1804 continue;
94b18763
FG
1805 if (p >= floor)
1806 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
7c673cae
FG
1807 }
1808 }
1809 cur->dirty_old_rstats.clear();
1810}
1811
1812
94b18763 1813void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
1814 CDir *parent, int linkunlink, bool update_inode)
1815{
1816 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1817 dout(20) << " inode rstat " << inode.rstat << dendl;
1818 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
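  // linkunlink == 0: propagate the change since last accounting (rstat - accounted_rstat)
  // linkunlink < 0:  the inode is being unlinked; remove only its accounted contribution
  // linkunlink > 0:  the inode is being linked in; add its full rstat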
1819 nest_info_t delta;
1820 if (linkunlink == 0) {
1821 delta.add(inode.rstat);
1822 delta.sub(inode.accounted_rstat);
1823 } else if (linkunlink < 0) {
1824 delta.sub(inode.accounted_rstat);
1825 } else {
1826 delta.add(inode.rstat);
1827 }
1828 dout(20) << " delta " << delta << dendl;
1829
1830 if (update_inode)
1831 inode.accounted_rstat = inode.rstat;
1832
1833 while (last >= ofirst) {
1834 /*
1835 * pick fnode version to update. at each iteration, we want to
1836 * pick a segment ending in 'last' to update. split as necessary
1837 * to make that work. then, adjust first up so that we only
1838 * update one segment at a time. then loop to cover the whole
1839 * [ofirst,last] interval.
1840 */
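    /*
     * worked example (mds_snap_rstat on): projecting [4,head] onto a dirfrag
     * whose head segment starts at first==2 and is not fully accounted: the
     * old head rstat is cowed into dirty_old_rstat[3] (covering [2,3]),
     * parent->first becomes 4, delta is added to the head rstat for [4,head],
     * and the loop ends since the next last (3) drops below ofirst.
     */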
1841 nest_info_t *prstat;
1842 snapid_t first;
1843 fnode_t *pf = parent->get_projected_fnode();
1844 if (last == CEPH_NOSNAP) {
11fdf7f2
TL
1845 if (g_conf()->mds_snap_rstat)
1846 first = std::max(ofirst, parent->first);
7c673cae
FG
1847 else
1848 first = parent->first;
1849 prstat = &pf->rstat;
1850 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1851
1852 if (first > parent->first &&
1853 !(pf->rstat == pf->accounted_rstat)) {
1854 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1855 << parent->first << "," << (first-1) << "] "
1856 << " " << *prstat << "/" << pf->accounted_rstat
1857 << dendl;
1858 parent->dirty_old_rstat[first-1].first = parent->first;
1859 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1860 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1861 }
1862 parent->first = first;
11fdf7f2 1863 } else if (!g_conf()->mds_snap_rstat) {
7c673cae
FG
1864 // drop snapshots' rstats
1865 break;
1866 } else if (last >= parent->first) {
1867 first = parent->first;
1868 parent->dirty_old_rstat[last].first = first;
1869 parent->dirty_old_rstat[last].rstat = pf->rstat;
1870 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1871 prstat = &parent->dirty_old_rstat[last].rstat;
1872 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1873 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1874 } else {
1875 // be careful, dirty_old_rstat is a _sparse_ map.
1876 // sorry, this is ugly.
1877 first = ofirst;
1878
1879 // find any intersection with last
94b18763
FG
1880 auto it = parent->dirty_old_rstat.lower_bound(last);
1881 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1882 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1883 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1884 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1885 first = parent->dirty_old_rstat.rbegin()->first+1;
1886 }
1887 } else {
94b18763
FG
1888 // *it last is >= last
1889 if (it->second.first <= last) {
1890 // *it intersects [first,last]
1891 if (it->second.first < first) {
1892 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1893 parent->dirty_old_rstat[first-1] = it->second;
1894 it->second.first = first;
7c673cae 1895 }
94b18763
FG
1896 if (it->second.first > first)
1897 first = it->second.first;
1898 if (last < it->first) {
1899 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1900 parent->dirty_old_rstat[last] = it->second;
1901 it->second.first = last+1;
7c673cae
FG
1902 }
1903 } else {
94b18763
FG
1904 // *it is to the _right_ of [first,last]
1905 it = parent->dirty_old_rstat.lower_bound(first);
1906 // new *it last is >= first
1907 if (it->second.first <= last && // new *it isn't also to the right, and
1908 it->first >= first) { // it intersects our first bit,
1909 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1910 first = it->first+1;
7c673cae
FG
1911 }
1912 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1913 }
1914 }
1915 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1916 parent->dirty_old_rstat[last].first = first;
1917 prstat = &parent->dirty_old_rstat[last].rstat;
1918 }
1919
1920 // apply
1921 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
11fdf7f2 1922 ceph_assert(last >= first);
7c673cae
FG
1923 prstat->add(delta);
1924 if (update_inode)
1925 inode.accounted_rstat = inode.rstat;
1926 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1927
1928 last = first-1;
1929 }
1930}
1931
1932void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1933 snapid_t ofirst, snapid_t last,
1934 CInode *pin, bool cow_head)
1935{
1936 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1937 dout(20) << " frag rstat " << rstat << dendl;
1938 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1939 nest_info_t delta = rstat;
1940 delta.sub(accounted_rstat);
1941 dout(20) << " delta " << delta << dendl;
1942
1943 while (last >= ofirst) {
94b18763 1944 CInode::mempool_inode *pi;
7c673cae
FG
1945 snapid_t first;
1946 if (last == pin->last) {
1947 pi = pin->get_projected_inode();
11fdf7f2 1948 first = std::max(ofirst, pin->first);
7c673cae 1949 if (first > pin->first) {
94b18763 1950 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1951 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1952 }
1953 } else {
1954 if (last >= pin->first) {
1955 first = pin->first;
1956 pin->cow_old_inode(last, cow_head);
1957 } else {
1958 // our life is easier here because old_inodes is not sparse
1959 // (although it may not begin at snapid 1)
94b18763
FG
1960 auto it = pin->old_inodes.lower_bound(last);
1961 if (it == pin->old_inodes.end()) {
7c673cae
FG
1962 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1963 break;
1964 }
94b18763 1965 first = it->second.first;
7c673cae 1966 if (first > last) {
94b18763 1967 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1968 //assert(p == pin->old_inodes.begin());
1969 break;
1970 }
94b18763
FG
1971 if (it->first > last) {
1972 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1973 << (last+1) << "," << it->first << "]" << dendl;
1974 pin->old_inodes[last] = it->second;
1975 it->second.first = last+1;
1976 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1977 }
1978 }
1979 if (first < ofirst) {
1980 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1981 << first << "," << ofirst-1 << "]" << dendl;
1982 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1983 pin->dirty_old_rstats.insert(ofirst-1);
1984 pin->old_inodes[last].first = first = ofirst;
1985 }
1986 pi = &pin->old_inodes[last].inode;
1987 pin->dirty_old_rstats.insert(last);
1988 }
1989 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1990 pi->rstat.add(delta);
1991 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1992
1993 last = first-1;
1994 }
1995}
1996
a8e16298 1997void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
7c673cae 1998{
11fdf7f2
TL
1999 if (!(mds->is_active() || mds->is_stopping()))
2000 return;
2001
7c673cae
FG
2002 if (!in->is_auth() || in->is_frozen())
2003 return;
2004
94b18763 2005 auto i = in->get_projected_inode();
a8e16298
TL
2006
2007 if (!i->quota.is_enable() &&
2008 !quota_change)
7c673cae
FG
2009 return;
2010
11fdf7f2
TL
 2011 // create snaprealm for quota inode (quota was set before mimic)
2012 if (!in->get_projected_srnode())
2013 mds->server->create_quota_realm(in);
7c673cae 2014
11fdf7f2
TL
2015 for (auto &p : in->client_caps) {
2016 Capability *cap = &p.second;
2017 if (cap->is_noquota())
2018 continue;
28e407b8 2019
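    // if an exclude client was given, every *other* client gets an
    // unconditional update; the excluded client (presumably the one that
    // triggered the change) still goes through the normal thresholds below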
11fdf7f2 2020 if (exclude_ct >= 0 && exclude_ct != p.first)
28e407b8
AA
2021 goto update;
2022
7c673cae
FG
2023 if (cap->last_rbytes == i->rstat.rbytes &&
2024 cap->last_rsize == i->rstat.rsize())
2025 continue;
2026
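    // otherwise only refresh a client when its usage has crossed the limit,
    // or has drifted by more than ~1/16 of the gap between the last value we
    // sent and the limit.  e.g. with max_files=1000 and last_rsize=200 the
    // gap is 800 and (800 >> 4) == 50, so an MClientQuota is sent once rsize
    // moves by more than 50 entries.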
2027 if (i->quota.max_files > 0) {
2028 if (i->rstat.rsize() >= i->quota.max_files)
2029 goto update;
2030
2031 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2032 abs(cap->last_rsize - i->rstat.rsize()))
2033 goto update;
2034 }
2035
2036 if (i->quota.max_bytes > 0) {
2037 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2038 goto update;
2039
2040 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2041 abs(cap->last_rbytes - i->rstat.rbytes))
2042 goto update;
2043 }
2044
2045 continue;
2046
2047update:
2048 cap->last_rsize = i->rstat.rsize();
2049 cap->last_rbytes = i->rstat.rbytes;
2050
9f95a23c 2051 auto msg = make_message<MClientQuota>();
7c673cae
FG
2052 msg->ino = in->ino();
2053 msg->rstat = i->rstat;
2054 msg->quota = i->quota;
11fdf7f2 2055 mds->send_message_client_counted(msg, cap->get_session());
7c673cae 2056 }
181888fb 2057 for (const auto &it : in->get_replicas()) {
9f95a23c 2058 auto msg = make_message<MGatherCaps>();
7c673cae 2059 msg->ino = in->ino();
181888fb 2060 mds->send_message_mds(msg, it.first);
7c673cae
FG
2061 }
2062}
2063
2064/*
2065 * NOTE: we _have_ to delay the scatter if we are called during a
2066 * rejoin, because we can't twiddle locks between when the
2067 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 2068 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2069 * (no requests), and a survivor acks immediately. _except_ that
2070 * during rejoin_(weak|strong) processing, we may complete a lock
2071 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2072 * scatterlock state in that case or the lock states will get out of
2073 * sync between the auth and replica.
2074 *
2075 * the simple solution is to never do the scatter here. instead, put
2076 * the scatterlock on a list if it isn't already wrlockable. this is
2077 * probably the best plan anyway, since we avoid too many
2078 * scatters/locks under normal usage.
2079 */
2080/*
2081 * some notes on dirlock/nestlock scatterlock semantics:
2082 *
2083 * the fragstat (dirlock) will never be updated without
2084 * dirlock+nestlock wrlock held by the caller.
2085 *
2086 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2087 * data is pushed up the tree. this could be changed with some
2088 * restructuring here, but in its current form we ensure that the
 2089 * fragstat+rstat _always_ reflect an accurate summation over the dir
2090 * frag, which is nice. and, we only need to track frags that need to
2091 * be nudged (and not inodes with pending rstat changes that need to
2092 * be pushed into the frag). a consequence of this is that the
2093 * accounted_rstat on scatterlock sync may not match our current
2094 * rstat. this is normal and expected.
2095 */
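/*
 * illustrative caller sketch (hypothetical, not quoted from a real caller):
 * an op that journals creation of a new primary dentry would typically call
 *
 *   predirty_journal_parents(mut, &le->metablob, newi, dn->get_dir(),
 *                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
 *
 * i.e. treat the dentry as primary (PREDIRTY_PRIMARY), bump the parent
 * dirfrag's fragstat/mtime (PREDIRTY_DIR), and account one new link (+1);
 * an unlink would pass -1, and a pure attribute update would pass 0.
 */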
2096void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2097 CInode *in, CDir *parent,
2098 int flags, int linkunlink,
2099 snapid_t cfollows)
2100{
2101 bool primary_dn = flags & PREDIRTY_PRIMARY;
2102 bool do_parent_mtime = flags & PREDIRTY_DIR;
2103 bool shallow = flags & PREDIRTY_SHALLOW;
2104
11fdf7f2 2105 ceph_assert(mds->mdlog->entry_is_open());
7c673cae
FG
2106
2107 // make sure stamp is set
2108 if (mut->get_mds_stamp() == utime_t())
2109 mut->set_mds_stamp(ceph_clock_now());
2110
2111 if (in->is_base())
2112 return;
2113
2114 dout(10) << "predirty_journal_parents"
2115 << (do_parent_mtime ? " do_parent_mtime":"")
2116 << " linkunlink=" << linkunlink
2117 << (primary_dn ? " primary_dn":" remote_dn")
2118 << (shallow ? " SHALLOW":"")
2119 << " follows " << cfollows
2120 << " " << *in << dendl;
2121
2122 if (!parent) {
11fdf7f2 2123 ceph_assert(primary_dn);
7c673cae
FG
2124 parent = in->get_projected_parent_dn()->get_dir();
2125 }
2126
2127 if (flags == 0 && linkunlink == 0) {
2128 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2129 blob->add_dir_context(parent);
2130 return;
2131 }
2132
2133 // build list of inodes to wrlock, dirty, and update
2134 list<CInode*> lsi;
2135 CInode *cur = in;
2136 CDentry *parentdn = NULL;
2137 bool first = true;
2138 while (parent) {
2139 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
11fdf7f2 2140 ceph_assert(parent->is_auth());
7c673cae
FG
2141
2142 // opportunistically adjust parent dirfrag
2143 CInode *pin = parent->get_inode();
2144
2145 // inode -> dirfrag
2146 mut->auth_pin(parent);
2147 mut->add_projected_fnode(parent);
2148
2149 fnode_t *pf = parent->project_fnode();
2150 pf->version = parent->pre_dirty();
2151
2152 if (do_parent_mtime || linkunlink) {
11fdf7f2
TL
2153 ceph_assert(mut->is_wrlocked(&pin->filelock));
2154 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2155 ceph_assert(cfollows == CEPH_NOSNAP);
7c673cae
FG
2156
2157 // update stale fragstat/rstat?
2158 parent->resync_accounted_fragstat();
2159 parent->resync_accounted_rstat();
2160
2161 if (do_parent_mtime) {
2162 pf->fragstat.mtime = mut->get_op_stamp();
2163 pf->fragstat.change_attr++;
2164 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2165 if (pf->fragstat.mtime > pf->rstat.rctime) {
2166 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2167 pf->rstat.rctime = pf->fragstat.mtime;
2168 } else {
2169 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2170 }
2171 }
2172 if (linkunlink) {
2173 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2174 if (in->is_dir()) {
2175 pf->fragstat.nsubdirs += linkunlink;
2176 //pf->rstat.rsubdirs += linkunlink;
2177 } else {
2178 pf->fragstat.nfiles += linkunlink;
2179 //pf->rstat.rfiles += linkunlink;
2180 }
2181 }
2182 }
2183
2184 // rstat
2185 if (!primary_dn) {
2186 // don't update parent this pass
2187 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2188 pin->versionlock.can_wrlock())) {
2189 dout(20) << " unwritable parent nestlock " << pin->nestlock
2190 << ", marking dirty rstat on " << *cur << dendl;
2191 cur->mark_dirty_rstat();
2192 } else {
2193 // if we don't hold a wrlock reference on this nestlock, take one,
2194 // because we are about to write into the dirfrag fnode and that needs
2195 // to commit before the lock can cycle.
2196 if (linkunlink) {
11fdf7f2 2197 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
7c673cae
FG
2198 }
2199
11fdf7f2 2200 if (!mut->is_wrlocked(&pin->nestlock)) {
7c673cae
FG
2201 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2202 mds->locker->wrlock_force(&pin->nestlock, mut);
2203 }
2204
2205 // now we can project the inode rstat diff the dirfrag
2206 SnapRealm *prealm = pin->find_snaprealm();
2207
2208 snapid_t follows = cfollows;
2209 if (follows == CEPH_NOSNAP)
2210 follows = prealm->get_newest_seq();
2211
2212 snapid_t first = follows+1;
2213
2214 // first, if the frag is stale, bring it back in sync.
2215 parent->resync_accounted_rstat();
2216
2217 // now push inode rstats into frag
2218 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2219 cur->clear_dirty_rstat();
2220 }
2221
2222 bool stop = false;
2223 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2224 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2225 stop = true;
2226 }
2227
2228 // delay propagating until later?
2229 if (!stop && !first &&
11fdf7f2 2230 g_conf()->mds_dirstat_min_interval > 0) {
7c673cae 2231 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
11fdf7f2 2232 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
7c673cae 2233 dout(10) << "predirty_journal_parents last prop " << since_last_prop
11fdf7f2 2234 << " < " << g_conf()->mds_dirstat_min_interval
7c673cae
FG
2235 << ", stopping" << dendl;
2236 stop = true;
2237 } else {
2238 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2239 }
2240 }
2241
2242 // can cast only because i'm passing nowait=true in the sole user
7c673cae 2243 if (!stop &&
11fdf7f2 2244 !mut->is_wrlocked(&pin->nestlock) &&
7c673cae 2245 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
9f95a23c 2246 !mds->locker->wrlock_try(&pin->nestlock, mut)
7c673cae
FG
2247 )) { // ** do not initiate.. see above comment **
2248 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2249 << " on " << *pin << dendl;
2250 stop = true;
2251 }
2252 if (stop) {
2253 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2254 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2255 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2256 mut->add_updated_lock(&pin->nestlock);
2257 if (do_parent_mtime || linkunlink) {
2258 mds->locker->mark_updated_scatterlock(&pin->filelock);
2259 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2260 mut->add_updated_lock(&pin->filelock);
2261 }
2262 break;
2263 }
11fdf7f2 2264 if (!mut->is_wrlocked(&pin->versionlock))
7c673cae
FG
2265 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2266
11fdf7f2 2267 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
7c673cae
FG
2268
2269 pin->last_dirstat_prop = mut->get_mds_stamp();
2270
2271 // dirfrag -> diri
2272 mut->auth_pin(pin);
2273 mut->add_projected_inode(pin);
2274 lsi.push_front(pin);
2275
2276 pin->pre_cow_old_inode(); // avoid cow mayhem!
2277
94b18763
FG
2278 auto &pi = pin->project_inode();
2279 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2280
2281 // dirstat
2282 if (do_parent_mtime || linkunlink) {
2283 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2284 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2285 bool touched_mtime = false, touched_chattr = false;
94b18763 2286 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2287 pf->accounted_fragstat = pf->fragstat;
2288 if (touched_mtime)
94b18763 2289 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2290 if (touched_chattr)
94b18763
FG
2291 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2292 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2293
2294 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2295 if (pi.inode.dirstat.size() < 0)
11fdf7f2 2296 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
94b18763 2297 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2298 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2299 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2300 << ", dirfrag has " << pf->fragstat;
2301
2302 // trust the dirfrag for now
94b18763 2303 pi.inode.dirstat = pf->fragstat;
7c673cae 2304
11fdf7f2 2305 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
7c673cae
FG
2306 }
2307 }
2308 }
2309
2310 /*
2311 * the rule here is to follow the _oldest_ parent with dirty rstat
2312 * data. if we don't propagate all data, we add ourselves to the
2313 * nudge list. that way all rstat data will (eventually) get
2314 * pushed up the tree.
2315 *
2316 * actually, no. for now, silently drop rstats for old parents. we need
2317 * hard link backpointers to do the above properly.
2318 */
2319
2320 // stop?
2321 if (pin->is_base())
2322 break;
2323 parentdn = pin->get_projected_parent_dn();
11fdf7f2 2324 ceph_assert(parentdn);
7c673cae
FG
2325
2326 // rstat
2327 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2328
2329 // first, if the frag is stale, bring it back in sync.
2330 parent->resync_accounted_rstat();
2331
11fdf7f2 2332 if (g_conf()->mds_snap_rstat) {
94b18763
FG
2333 for (auto &p : parent->dirty_old_rstat) {
2334 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2335 p.first, pin, true);
2336 }
7c673cae
FG
2337 }
2338 parent->dirty_old_rstat.clear();
2339 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2340
2341 pf->accounted_rstat = pf->rstat;
2342
2343 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2344 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2345 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2346 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2347 << ", dirfrag has " << pf->rstat;
2348
2349 // trust the dirfrag for now
94b18763 2350 pi.inode.rstat = pf->rstat;
7c673cae 2351
11fdf7f2 2352 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
7c673cae
FG
2353 }
2354 }
2355
2356 parent->check_rstats();
2357 broadcast_quota_to_client(pin);
2358 // next parent!
2359 cur = pin;
2360 parent = parentdn->get_dir();
2361 linkunlink = 0;
2362 do_parent_mtime = false;
2363 primary_dn = true;
2364 first = false;
2365 }
2366
2367 // now, stick it in the blob
11fdf7f2
TL
2368 ceph_assert(parent);
2369 ceph_assert(parent->is_auth());
7c673cae
FG
2370 blob->add_dir_context(parent);
2371 blob->add_dir(parent, true);
9f95a23c
TL
2372 for (const auto& in : lsi) {
2373 journal_dirty_inode(mut.get(), blob, in);
7c673cae
FG
2374 }
2375
2376}
2377
2378
2379
2380
2381
2382// ===================================
2383// slave requests
2384
2385
2386/*
2387 * some handlers for master requests with slaves. we need to make
2388 * sure slaves journal commits before we forget we mastered them and
2389 * remove them from the uncommitted_masters map (used during recovery
2390 * to commit|abort slaves).
2391 */
2392struct C_MDC_CommittedMaster : public MDCacheLogContext {
2393 metareqid_t reqid;
2394 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2395 void finish(int r) override {
2396 mdcache->_logged_master_commit(reqid);
2397 }
2398};
2399
2400void MDCache::log_master_commit(metareqid_t reqid)
2401{
2402 dout(10) << "log_master_commit " << reqid << dendl;
2403 uncommitted_masters[reqid].committing = true;
2404 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2405 new C_MDC_CommittedMaster(this, reqid));
2406}
2407
2408void MDCache::_logged_master_commit(metareqid_t reqid)
2409{
2410 dout(10) << "_logged_master_commit " << reqid << dendl;
11fdf7f2 2411 ceph_assert(uncommitted_masters.count(reqid));
7c673cae
FG
2412 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2413 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2414 uncommitted_masters.erase(reqid);
2415}
2416
2417// while active...
2418
2419void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2420{
2421 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
11fdf7f2 2422 ceph_assert(uncommitted_masters.count(r));
7c673cae
FG
2423 uncommitted_masters[r].slaves.erase(from);
2424 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2425 log_master_commit(r);
2426}
2427
2428void MDCache::logged_master_update(metareqid_t reqid)
2429{
2430 dout(10) << "logged_master_update " << reqid << dendl;
11fdf7f2 2431 ceph_assert(uncommitted_masters.count(reqid));
7c673cae 2432 uncommitted_masters[reqid].safe = true;
11fdf7f2
TL
2433 auto p = pending_masters.find(reqid);
2434 if (p != pending_masters.end()) {
2435 pending_masters.erase(p);
7c673cae
FG
2436 if (pending_masters.empty())
2437 process_delayed_resolve();
2438 }
2439}
2440
2441/*
2442 * Master may crash after receiving all slaves' commit acks, but before journalling
2443 * the final commit. Slaves may crash after journalling the slave commit, but before
2444 * sending commit ack to the master. Commit masters with no uncommitted slave when
2445 * resolve finishes.
2446 */
2447void MDCache::finish_committed_masters()
2448{
2449 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2450 p != uncommitted_masters.end();
2451 ++p) {
2452 p->second.recovering = false;
2453 if (!p->second.committing && p->second.slaves.empty()) {
2454 dout(10) << "finish_committed_masters " << p->first << dendl;
2455 log_master_commit(p->first);
2456 }
2457 }
2458}
2459
2460/*
2461 * at end of resolve... we must journal a commit|abort for all slave
2462 * updates, before moving on.
2463 *
2464 * this is so that the master can safely journal ECommitted on ops it
2465 * masters when it reaches up:active (all other recovering nodes must
2466 * complete resolve before that happens).
2467 */
2468struct C_MDC_SlaveCommit : public MDCacheLogContext {
2469 mds_rank_t from;
2470 metareqid_t reqid;
2471 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2472 void finish(int r) override {
2473 mdcache->_logged_slave_commit(from, reqid);
2474 }
2475};
2476
2477void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2478{
2479 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2480
2481 // send a message
9f95a23c 2482 auto req = make_message<MMDSSlaveRequest>(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
7c673cae
FG
2483 mds->send_message_mds(req, from);
2484}
2485
2486
2487
2488
2489
2490
2491// ====================================================================
2492// import map, recovery
2493
2494void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2495 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2496{
2497 if (subtrees.count(oldparent)) {
2498 vector<dirfrag_t>& v = subtrees[oldparent];
2499 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2500 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2501 if (*it == df) {
2502 v.erase(it);
2503 break;
2504 }
2505 }
2506 if (subtrees.count(newparent)) {
2507 vector<dirfrag_t>& v = subtrees[newparent];
2508 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2509 v.push_back(df);
2510 }
2511}
2512
2513ESubtreeMap *MDCache::create_subtree_map()
2514{
2515 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2516 << num_subtrees_fullauth() << " fullauth"
2517 << dendl;
2518
2519 show_subtrees();
2520
2521 ESubtreeMap *le = new ESubtreeMap();
2522 mds->mdlog->_start_entry(le);
2523
2524 map<dirfrag_t, CDir*> dirs_to_add;
2525
2526 if (myin) {
2527 CDir* mydir = myin->get_dirfrag(frag_t());
2528 dirs_to_add[mydir->dirfrag()] = mydir;
2529 }
2530
2531 // include all auth subtrees, and their bounds.
2532 // and a spanning tree to tie it to the root.
f6b5b4d7 2533 for (auto& [dir, bounds] : subtrees) {
7c673cae
FG
2534 // journal subtree as "ours" if we are
2535 // me, -2
2536 // me, me
2537 // me, !me (may be importing and ambiguous!)
2538
2539 // so not
2540 // !me, *
2541 if (dir->get_dir_auth().first != mds->get_nodeid())
2542 continue;
2543
2544 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2545 my_ambiguous_imports.count(dir->dirfrag())) {
2546 dout(15) << " ambig subtree " << *dir << dendl;
2547 le->ambiguous_subtrees.insert(dir->dirfrag());
2548 } else {
f6b5b4d7 2549 dout(15) << " auth subtree " << *dir << dendl;
7c673cae
FG
2550 }
2551
2552 dirs_to_add[dir->dirfrag()] = dir;
2553 le->subtrees[dir->dirfrag()].clear();
2554
7c673cae 2555 // bounds
f6b5b4d7
TL
2556 size_t nbounds = bounds.size();
2557 if (nbounds > 3) {
2558 dout(15) << " subtree has " << nbounds << " bounds" << dendl;
2559 }
2560 for (auto& bound : bounds) {
2561 if (nbounds <= 3) {
2562 dout(15) << " subtree bound " << *bound << dendl;
2563 }
7c673cae
FG
2564 dirs_to_add[bound->dirfrag()] = bound;
2565 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2566 }
2567 }
2568
2569 // apply projected renames
9f95a23c
TL
2570 for (const auto& [diri, renames] : projected_subtree_renames) {
2571 for (const auto& [olddir, newdir] : renames) {
f6b5b4d7 2572 dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
7c673cae 2573
9f95a23c
TL
2574 auto&& dfls = diri->get_dirfrags();
2575 for (const auto& dir : dfls) {
f6b5b4d7 2576 dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
7c673cae 2577 CDir *oldparent = get_projected_subtree_root(olddir);
f6b5b4d7 2578 dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
7c673cae 2579 CDir *newparent = get_projected_subtree_root(newdir);
f6b5b4d7 2580 dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
7c673cae
FG
2581
2582 if (oldparent == newparent) {
f6b5b4d7 2583 dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
7c673cae
FG
2584 << oldparent->dirfrag() << dendl;
2585 continue;
2586 }
2587
2588 if (dir->is_subtree_root()) {
2589 if (le->subtrees.count(newparent->dirfrag()) &&
2590 oldparent->get_dir_auth() != newparent->get_dir_auth())
2591 dirs_to_add[dir->dirfrag()] = dir;
2592 // children are fine. change parent.
2593 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2594 le->subtrees);
2595 } else {
2596 // mid-subtree.
2597
2598 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2599 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2600 // if oldparent is auth, subtree is mine; include it.
2601 if (le->subtrees.count(oldparent->dirfrag())) {
2602 dirs_to_add[dir->dirfrag()] = dir;
2603 le->subtrees[dir->dirfrag()].clear();
2604 }
2605 // if newparent is auth, subtree is a new bound
2606 if (le->subtrees.count(newparent->dirfrag())) {
2607 dirs_to_add[dir->dirfrag()] = dir;
2608 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2609 }
2610 newparent = dir;
2611 }
2612
2613 // see if any old bounds move to the new parent.
f6b5b4d7 2614 for (auto& bound : subtrees.at(oldparent)) {
7c673cae
FG
2615 if (dir->contains(bound->get_parent_dir()))
2616 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2617 le->subtrees);
2618 }
2619 }
2620 }
2621 }
2622 }
2623
2624 // simplify the journaled map. our in memory map may have more
2625 // subtrees than needed due to migrations that are just getting
2626 // started or just completing. but on replay, the "live" map will
2627 // be simple and we can do a straight comparison.
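  // e.g. if subtree A lists B as a bound and B is itself one of our
  // unambiguous journaled subtrees, B is swallowed into A: A inherits B's
  // bounds and B's own entry (and dir) is dropped from the event.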
f6b5b4d7
TL
2628 for (auto& [frag, bfrags] : le->subtrees) {
2629 if (le->ambiguous_subtrees.count(frag))
7c673cae
FG
2630 continue;
2631 unsigned i = 0;
f6b5b4d7
TL
2632 while (i < bfrags.size()) {
2633 dirfrag_t b = bfrags[i];
7c673cae
FG
2634 if (le->subtrees.count(b) &&
2635 le->ambiguous_subtrees.count(b) == 0) {
f6b5b4d7
TL
2636 auto& bb = le->subtrees.at(b);
2637 dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
2638 for (auto& r : bb) {
2639 bfrags.push_back(r);
2640 }
7c673cae
FG
2641 dirs_to_add.erase(b);
2642 le->subtrees.erase(b);
f6b5b4d7 2643 bfrags.erase(bfrags.begin() + i);
7c673cae
FG
2644 } else {
2645 ++i;
2646 }
2647 }
2648 }
2649
94b18763 2650 for (auto &p : dirs_to_add) {
7c673cae
FG
2651 CDir *dir = p.second;
2652 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2653 le->metablob.add_dir(dir, false);
2654 }
2655
2656 dout(15) << " subtrees " << le->subtrees << dendl;
2657 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2658
2659 //le->metablob.print(cout);
2660 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2661 return le;
2662}
2663
2664void MDCache::dump_resolve_status(Formatter *f) const
2665{
2666 f->open_object_section("resolve_status");
2667 f->dump_stream("resolve_gather") << resolve_gather;
 2668 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2669 f->close_section();
2670}
2671
11fdf7f2 2672void MDCache::resolve_start(MDSContext *resolve_done_)
7c673cae
FG
2673{
2674 dout(10) << "resolve_start" << dendl;
11fdf7f2 2675 ceph_assert(!resolve_done);
7c673cae
FG
2676 resolve_done.reset(resolve_done_);
2677
2678 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2679 // if we don't have the root dir, adjust it to UNKNOWN. during
 2680 // resolve we want mds0 to explicitly claim the portion of it that
 2681 // it owns, so that anything beyond its bounds gets left as
2682 // unknown.
2683 CDir *rootdir = root->get_dirfrag(frag_t());
2684 if (rootdir)
2685 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2686 }
2687 resolve_gather = recovery_set;
11fdf7f2
TL
2688
2689 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
7c673cae
FG
2690}
2691
2692void MDCache::send_resolves()
2693{
2694 send_slave_resolves();
11fdf7f2
TL
2695
2696 if (!resolve_done) {
2697 // I'm survivor: refresh snap cache
2698 mds->snapclient->sync(
2699 new MDSInternalContextWrapper(mds,
9f95a23c 2700 new LambdaContext([this](int r) {
11fdf7f2
TL
2701 maybe_finish_slave_resolve();
2702 })
2703 )
2704 );
2705 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2706 return;
2707 }
7c673cae
FG
2708 if (!resolve_ack_gather.empty()) {
2709 dout(10) << "send_resolves still waiting for resolve ack from ("
2710 << resolve_ack_gather << ")" << dendl;
2711 return;
2712 }
11fdf7f2 2713 if (!resolve_need_rollback.empty()) {
7c673cae 2714 dout(10) << "send_resolves still waiting for rollback to commit on ("
11fdf7f2 2715 << resolve_need_rollback << ")" << dendl;
7c673cae
FG
2716 return;
2717 }
11fdf7f2 2718
7c673cae
FG
2719 send_subtree_resolves();
2720}
2721
2722void MDCache::send_slave_resolves()
2723{
2724 dout(10) << "send_slave_resolves" << dendl;
2725
9f95a23c 2726 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
7c673cae
FG
2727
2728 if (mds->is_resolve()) {
e306af50
TL
2729 for (map<metareqid_t, uslave>::iterator p = uncommitted_slaves.begin();
2730 p != uncommitted_slaves.end();
7c673cae 2731 ++p) {
e306af50
TL
2732 mds_rank_t master = p->second.master;
2733 auto &m = resolves[master];
2734 if (!m) m = make_message<MMDSResolve>();
2735 m->add_slave_request(p->first, false);
7c673cae
FG
2736 }
2737 } else {
2738 set<mds_rank_t> resolve_set;
2739 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2740 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2741 p != active_requests.end();
2742 ++p) {
2743 MDRequestRef& mdr = p->second;
2744 if (!mdr->is_slave())
2745 continue;
2746 if (!mdr->slave_did_prepare() && !mdr->committing) {
2747 continue;
2748 }
2749 mds_rank_t master = mdr->slave_to_mds;
2750 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2751 dout(10) << " including uncommitted " << *mdr << dendl;
2752 if (!resolves.count(master))
9f95a23c 2753 resolves[master] = make_message<MMDSResolve>();
7c673cae
FG
2754 if (!mdr->committing &&
2755 mdr->has_more() && mdr->more()->is_inode_exporter) {
2756 // re-send cap exports
2757 CInode *in = mdr->more()->rename_inode;
2758 map<client_t, Capability::Export> cap_map;
2759 in->export_client_caps(cap_map);
2760 bufferlist bl;
9f95a23c
TL
2761 MMDSResolve::slave_inode_cap inode_caps(in->ino(), cap_map);
2762 encode(inode_caps, bl);
7c673cae
FG
2763 resolves[master]->add_slave_request(p->first, bl);
2764 } else {
2765 resolves[master]->add_slave_request(p->first, mdr->committing);
2766 }
2767 }
2768 }
2769 }
2770
11fdf7f2
TL
2771 for (auto &p : resolves) {
2772 dout(10) << "sending slave resolve to mds." << p.first << dendl;
2773 mds->send_message_mds(p.second, p.first);
2774 resolve_ack_gather.insert(p.first);
7c673cae
FG
2775 }
2776}
2777
2778void MDCache::send_subtree_resolves()
2779{
2780 dout(10) << "send_subtree_resolves" << dendl;
2781
2782 if (migrator->is_exporting() || migrator->is_importing()) {
2783 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2784 migrator->show_importing();
2785 migrator->show_exporting();
2786 resolves_pending = true;
2787 return; // not now
2788 }
2789
9f95a23c 2790 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
7c673cae
FG
2791 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2792 p != recovery_set.end();
2793 ++p) {
2794 if (*p == mds->get_nodeid())
2795 continue;
2796 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
9f95a23c 2797 resolves[*p] = make_message<MMDSResolve>();
7c673cae
FG
2798 }
2799
2800 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2801 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2802
2803 // known
2804 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2805 p != subtrees.end();
2806 ++p) {
2807 CDir *dir = p->first;
2808
2809 // only our subtrees
2810 if (dir->authority().first != mds->get_nodeid())
2811 continue;
2812
2813 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2814 continue; // we'll add it below
2815
2816 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2817 // ambiguous (mid-import)
2818 set<CDir*> bounds;
2819 get_subtree_bounds(dir, bounds);
2820 vector<dirfrag_t> dfls;
2821 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2822 dfls.push_back((*q)->dirfrag());
2823
2824 my_ambig_imports[dir->dirfrag()] = dfls;
2825 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2826 } else {
2827 // not ambiguous.
11fdf7f2
TL
2828 for (auto &q : resolves) {
2829 resolves[q.first]->add_subtree(dir->dirfrag());
2830 }
7c673cae
FG
2831 // bounds too
2832 vector<dirfrag_t> dfls;
2833 for (set<CDir*>::iterator q = subtrees[dir].begin();
2834 q != subtrees[dir].end();
2835 ++q) {
2836 CDir *bound = *q;
2837 dfls.push_back(bound->dirfrag());
2838 }
2839
2840 my_subtrees[dir->dirfrag()] = dfls;
2841 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2842 }
2843 }
2844
2845 // ambiguous
2846 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2847 p != my_ambiguous_imports.end();
2848 ++p) {
2849 my_ambig_imports[p->first] = p->second;
2850 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2851 }
2852
2853 // simplify the claimed subtree.
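  // (same swallowing rule as in create_subtree_map(): a claimed subtree that
  // appears as a bound of another claimed subtree is folded into the outer one)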
2854 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2855 unsigned i = 0;
2856 while (i < p->second.size()) {
2857 dirfrag_t b = p->second[i];
2858 if (my_subtrees.count(b)) {
2859 vector<dirfrag_t>& bb = my_subtrees[b];
2860 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2861 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2862 p->second.push_back(*r);
2863 my_subtrees.erase(b);
2864 p->second.erase(p->second.begin() + i);
2865 } else {
2866 ++i;
2867 }
2868 }
2869 }
2870
2871 // send
11fdf7f2 2872 for (auto &p : resolves) {
9f95a23c 2873 const ref_t<MMDSResolve> &m = p.second;
11fdf7f2
TL
2874 if (mds->is_resolve()) {
2875 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2876 } else {
2877 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2878 }
7c673cae
FG
2879 m->subtrees = my_subtrees;
2880 m->ambiguous_imports = my_ambig_imports;
11fdf7f2
TL
 2881 dout(10) << "sending subtree resolve to mds." << p.first << dendl;
2882 mds->send_message_mds(m, p.first);
7c673cae
FG
2883 }
2884 resolves_pending = false;
2885}
2886
11fdf7f2
TL
2887void MDCache::maybe_finish_slave_resolve() {
2888 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
 2889 // snap cache has been synced, or I'm in resolve state
2890 if (mds->snapclient->is_synced() || resolve_done)
2891 send_subtree_resolves();
2892 process_delayed_resolve();
2893 }
2894}
2895
7c673cae
FG
2896void MDCache::handle_mds_failure(mds_rank_t who)
2897{
2898 dout(7) << "handle_mds_failure mds." << who << dendl;
2899
2900 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2901
2902 resolve_gather.insert(who);
2903 discard_delayed_resolve(who);
2904 ambiguous_slave_updates.erase(who);
2905
2906 rejoin_gather.insert(who);
2907 rejoin_sent.erase(who); // i need to send another
31f18b77 2908 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2909 rejoin_ack_gather.erase(who); // i'll need/get another.
2910
2911 dout(10) << " resolve_gather " << resolve_gather << dendl;
2912 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2913 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2914 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2915 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2916
2917
2918 // tell the migrator too.
2919 migrator->handle_mds_failure_or_stop(who);
2920
224ce89b
WB
2921 // tell the balancer too.
2922 mds->balancer->handle_mds_failure(who);
2923
7c673cae
FG
2924 // clean up any requests slave to/from this node
2925 list<MDRequestRef> finish;
2926 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2927 p != active_requests.end();
2928 ++p) {
2929 MDRequestRef& mdr = p->second;
2930 // slave to the failed node?
2931 if (mdr->slave_to_mds == who) {
2932 if (mdr->slave_did_prepare()) {
2933 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2934 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2935 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2936
2937 if (!mdr->more()->waiting_on_slave.empty()) {
11fdf7f2 2938 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae 2939 // will rollback, no need to wait
91327a77 2940 mdr->reset_slave_request();
7c673cae
FG
2941 mdr->more()->waiting_on_slave.clear();
2942 }
2943 } else if (!mdr->committing) {
2944 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2945 if (mdr->slave_request || mdr->slave_rolling_back())
2946 mdr->aborted = true;
2947 else
2948 finish.push_back(mdr);
2949 }
2950 }
2951
2952 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2953 if (mdr->more()->waiting_on_slave.count(who)) {
11fdf7f2 2954 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae
FG
 2955 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2956 << who << dendl;
2957 mdr->more()->waiting_on_slave.erase(who);
2958 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2959 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2960 }
2961
2962 if (mdr->more()->srcdn_auth_mds == who &&
2963 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2964 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2965 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2966 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2967 }
31f18b77 2968 } else if (mdr->slave_request) {
9f95a23c 2969 const cref_t<MMDSSlaveRequest> &slave_req = mdr->slave_request;
31f18b77
FG
2970 // FIXME: Slave rename request can arrive after we notice mds failure.
2971 // This can cause mds to crash (does not affect integrity of FS).
2972 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2973 slave_req->srcdn_auth == who)
2974 slave_req->mark_interrupted();
7c673cae
FG
2975 }
2976
2977 // failed node is slave?
2978 if (mdr->is_master() && !mdr->committing) {
2979 if (mdr->more()->srcdn_auth_mds == who) {
2980 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2981 << who << " to recover" << dendl;
11fdf7f2 2982 ceph_assert(mdr->more()->witnessed.count(who) == 0);
7c673cae
FG
2983 if (mdr->more()->is_ambiguous_auth)
2984 mdr->clear_ambiguous_auth();
2985 // rename srcdn's auth mds failed, all witnesses will rollback
2986 mdr->more()->witnessed.clear();
2987 pending_masters.erase(p->first);
2988 }
2989
2990 if (mdr->more()->witnessed.count(who)) {
2991 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2992 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2993 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2994 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2995 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2996 // until either the request is committing or the slave also fails.
11fdf7f2 2997 ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
7c673cae
FG
2998 pending_masters.insert(p->first);
2999 } else {
3000 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
 3001 << who << dendl;
3002 if (srcdn_auth >= 0)
11fdf7f2 3003 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
7c673cae
FG
3004
3005 // discard this peer's prepare (if any)
3006 mdr->more()->witnessed.erase(who);
3007 }
3008 }
3009
3010 if (mdr->more()->waiting_on_slave.count(who)) {
3011 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
3012 << " to recover" << dendl;
3013 // retry request when peer recovers
3014 mdr->more()->waiting_on_slave.erase(who);
3015 if (mdr->more()->waiting_on_slave.empty())
3016 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3017 }
3018
3019 if (mdr->locking && mdr->locking_target_mds == who)
3020 mdr->finish_locking(mdr->locking);
3021 }
3022 }
3023
3024 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3025 p != uncommitted_masters.end();
3026 ++p) {
3027 // The failed MDS may have already committed the slave update
3028 if (p->second.slaves.count(who)) {
3029 p->second.recovering = true;
3030 p->second.slaves.erase(who);
3031 }
3032 }
3033
3034 while (!finish.empty()) {
3035 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3036 request_finish(finish.front());
3037 finish.pop_front();
3038 }
3039
3040 kick_find_ino_peers(who);
3041 kick_open_ino_peers(who);
3042
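  // fragment ops that were only waiting for a notify ack from the failed mds
  // may be finished now; fragment ops that had not started fragmenting yet
  // are cancelled and their dirs unfrozen below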
3043 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3044 p != fragments.end(); ) {
3045 dirfrag_t df = p->first;
3046 fragment_info_t& info = p->second;
a8e16298
TL
3047
3048 if (info.is_fragmenting()) {
3049 if (info.notify_ack_waiting.erase(who) &&
3050 info.notify_ack_waiting.empty()) {
3051 fragment_drop_locks(info);
3052 fragment_maybe_finish(p++);
3053 } else {
3054 ++p;
3055 }
7c673cae 3056 continue;
a8e16298
TL
3057 }
3058
3059 ++p;
7c673cae 3060 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
9f95a23c 3061 std::vector<CDir*> dirs;
7c673cae
FG
3062 info.dirs.swap(dirs);
3063 fragments.erase(df);
3064 fragment_unmark_unfreeze_dirs(dirs);
3065 }
3066
3067 // MDCache::shutdown_export_strays() always exports strays to mds.0
3068 if (who == mds_rank_t(0))
f64942e4 3069 shutdown_exporting_strays.clear();
7c673cae
FG
3070
3071 show_subtrees();
3072}
3073
3074/*
3075 * handle_mds_recovery - called on another node's transition
3076 * from resolve -> active.
3077 */
3078void MDCache::handle_mds_recovery(mds_rank_t who)
3079{
3080 dout(7) << "handle_mds_recovery mds." << who << dendl;
3081
3082 // exclude all discover waiters. kick_discovers() will do the job
3083 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3084 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3085
11fdf7f2 3086 MDSContext::vec waiters;
7c673cae
FG
3087
3088 // wake up any waiters in their subtrees
3089 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3090 p != subtrees.end();
3091 ++p) {
3092 CDir *dir = p->first;
3093
3094 if (dir->authority().first != who ||
3095 dir->authority().second == mds->get_nodeid())
3096 continue;
11fdf7f2 3097 ceph_assert(!dir->is_auth());
7c673cae
FG
3098
3099 // wake any waiters
9f95a23c
TL
3100 std::queue<CDir*> q;
3101 q.push(dir);
7c673cae
FG
3102
3103 while (!q.empty()) {
3104 CDir *d = q.front();
9f95a23c 3105 q.pop();
7c673cae
FG
3106 d->take_waiting(d_mask, waiters);
3107
3108 // inode waiters too
94b18763
FG
3109 for (auto &p : d->items) {
3110 CDentry *dn = p.second;
7c673cae
FG
3111 CDentry::linkage_t *dnl = dn->get_linkage();
3112 if (dnl->is_primary()) {
3113 dnl->get_inode()->take_waiting(i_mask, waiters);
3114
3115 // recurse?
9f95a23c
TL
3116 auto&& ls = dnl->get_inode()->get_dirfrags();
3117 for (const auto& subdir : ls) {
7c673cae 3118 if (!subdir->is_subtree_root())
9f95a23c 3119 q.push(subdir);
7c673cae
FG
3120 }
3121 }
3122 }
3123 }
3124 }
3125
3126 kick_open_ino_peers(who);
3127 kick_find_ino_peers(who);
3128
3129 // queue them up.
3130 mds->queue_waiters(waiters);
3131}
3132
3133void MDCache::set_recovery_set(set<mds_rank_t>& s)
3134{
3135 dout(7) << "set_recovery_set " << s << dendl;
3136 recovery_set = s;
3137}
3138
3139
3140/*
3141 * during resolve state, we share resolves to determine who
 3142 * is authoritative for which trees. we expect to get a resolve
3143 * from _everyone_ in the recovery_set (the mds cluster at the time of
3144 * the first failure).
3145 *
 3146 * This function puts the passed message before returning
3147 */
9f95a23c 3148void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
7c673cae
FG
3149{
3150 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3151 mds_rank_t from = mds_rank_t(m->get_source().num());
3152
3153 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3154 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3155 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3156 return;
3157 }
3158 // wait until we reach the resolve stage!
7c673cae
FG
3159 return;
3160 }
3161
3162 discard_delayed_resolve(from);
3163
3164 // ambiguous slave requests?
3165 if (!m->slave_requests.empty()) {
3166 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3167 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3168 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
11fdf7f2 3169 ceph_assert(!p->second.committing);
7c673cae
FG
3170 pending_masters.insert(p->first);
3171 }
3172 }
3173
3174 if (!pending_masters.empty()) {
3175 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3176 delayed_resolve[from] = m;
3177 return;
3178 }
3179 }
3180
9f95a23c 3181 auto ack = make_message<MMDSResolveAck>();
11fdf7f2
TL
3182 for (const auto &p : m->slave_requests) {
3183 if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
7c673cae 3184 // COMMIT
11fdf7f2 3185 if (p.second.committing) {
7c673cae 3186 // already committing, waiting for the OP_COMMITTED slave reply
11fdf7f2 3187 dout(10) << " already committing slave request " << p << " noop "<< dendl;
7c673cae 3188 } else {
11fdf7f2
TL
3189 dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
3190 ack->add_commit(p.first);
7c673cae 3191 }
11fdf7f2 3192 uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
7c673cae 3193
11fdf7f2 3194 if (p.second.inode_caps.length() > 0) {
7c673cae 3195 // slave wants to export caps (rename)
11fdf7f2 3196 ceph_assert(mds->is_resolve());
9f95a23c 3197 MMDSResolve::slave_inode_cap inode_caps;
11fdf7f2 3198 auto q = p.second.inode_caps.cbegin();
9f95a23c
TL
3199 decode(inode_caps, q);
3200 inodeno_t ino = inode_caps.ino;
3201 map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
11fdf7f2 3202 ceph_assert(get_inode(ino));
7c673cae
FG
3203
3204 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3205 q != cap_exports.end();
3206 ++q) {
3207 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3208 im.cap_id = ++last_cap_id; // assign a new cap ID
3209 im.issue_seq = 1;
3210 im.mseq = q->second.mseq;
28e407b8
AA
3211
3212 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3213 if (session)
3214 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3215 }
3216
3217 // will process these caps in rejoin stage
3218 rejoin_slave_exports[ino].first = from;
3219 rejoin_slave_exports[ino].second.swap(cap_exports);
3220
3221 // send information of imported caps back to slave
11fdf7f2 3222 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
7c673cae
FG
3223 }
3224 } else {
3225 // ABORT
11fdf7f2
TL
3226 dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
3227 ceph_assert(!p.second.committing);
3228 ack->add_abort(p.first);
7c673cae
FG
3229 }
3230 }
3231 mds->send_message(ack, m->get_connection());
7c673cae
FG
3232 return;
3233 }
3234
11fdf7f2 3235 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
7c673cae
FG
3236 dout(10) << "delay processing subtree resolve" << dendl;
3237 delayed_resolve[from] = m;
3238 return;
3239 }
3240
3241 bool survivor = false;
3242 // am i a surviving ambiguous importer?
3243 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3244 survivor = true;
3245 // check for any import success/failure (from this node)
3246 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3247 while (p != my_ambiguous_imports.end()) {
3248 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3249 ++next;
3250 CDir *dir = get_dirfrag(p->first);
11fdf7f2 3251 ceph_assert(dir);
7c673cae
FG
3252 dout(10) << "checking ambiguous import " << *dir << dendl;
3253 if (migrator->is_importing(dir->dirfrag()) &&
3254 migrator->get_import_peer(dir->dirfrag()) == from) {
11fdf7f2 3255 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
7c673cae
FG
3256
3257 // check if sender claims the subtree
3258 bool claimed_by_sender = false;
11fdf7f2 3259 for (const auto &q : m->subtrees) {
7c673cae 3260 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
11fdf7f2 3261 CDir *base = get_force_dirfrag(q.first, false);
7c673cae
FG
3262 if (!base || !base->contains(dir))
 3263	    continue;  // base is neither dir nor an ancestor of dir, so it clearly doesn't claim dir.
3264
3265 bool inside = true;
3266 set<CDir*> bounds;
11fdf7f2 3267 get_force_dirfrag_bound_set(q.second, bounds);
7c673cae
FG
3268 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3269 CDir *bound = *p;
3270 if (bound->contains(dir)) {
3271 inside = false; // nope, bound is dir or parent of dir, not inside.
3272 break;
3273 }
3274 }
3275 if (inside)
3276 claimed_by_sender = true;
3277 }
3278
3279 my_ambiguous_imports.erase(p); // no longer ambiguous.
3280 if (claimed_by_sender) {
3281 dout(7) << "ambiguous import failed on " << *dir << dendl;
3282 migrator->import_reverse(dir);
3283 } else {
3284 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3285 migrator->import_finish(dir, true);
3286 }
3287 }
3288 p = next;
3289 }
3290 }
3291
3292 // update my dir_auth values
3294 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3294 // migrations between other nodes)
11fdf7f2
TL
3295 for (const auto& p : m->subtrees) {
3296 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3297 CDir *dir = get_force_dirfrag(p.first, !survivor);
7c673cae
FG
3298 if (!dir)
3299 continue;
11fdf7f2 3300 adjust_bounded_subtree_auth(dir, p.second, from);
7c673cae
FG
3301 try_subtree_merge(dir);
3302 }
3303
3304 show_subtrees();
3305
3306 // note ambiguous imports too
11fdf7f2
TL
3307 for (const auto& p : m->ambiguous_imports) {
3308 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3309 other_ambiguous_imports[from][p.first] = p.second;
3310 }
3311
 3312 // learn other mds' pending snaptable commits. later, when resolve finishes, we will reload
 3313 // the snaptable cache from the snapserver. this way, the snaptable cache gets synced among all mds
3314 for (const auto& p : m->table_clients) {
3315 dout(10) << " noting " << get_mdstable_name(p.type)
3316 << " pending_commits " << p.pending_commits << dendl;
3317 MDSTableClient *client = mds->get_table_client(p.type);
3318 for (const auto& q : p.pending_commits)
3319 client->notify_commit(q);
7c673cae
FG
3320 }
3321
3322 // did i get them all?
3323 resolve_gather.erase(from);
3324
3325 maybe_resolve_finish();
7c673cae
FG
3326}
3327
3328void MDCache::process_delayed_resolve()
3329{
3330 dout(10) << "process_delayed_resolve" << dendl;
9f95a23c 3331 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
7c673cae 3332 tmp.swap(delayed_resolve);
11fdf7f2
TL
3333 for (auto &p : tmp) {
3334 handle_resolve(p.second);
3335 }
7c673cae
FG
3336}
3337
3338void MDCache::discard_delayed_resolve(mds_rank_t who)
3339{
11fdf7f2 3340 delayed_resolve.erase(who);
7c673cae
FG
3341}
3342
3343void MDCache::maybe_resolve_finish()
3344{
11fdf7f2
TL
3345 ceph_assert(resolve_ack_gather.empty());
3346 ceph_assert(resolve_need_rollback.empty());
7c673cae
FG
3347
3348 if (!resolve_gather.empty()) {
3349 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3350 << resolve_gather << ")" << dendl;
3351 return;
3352 }
3353
3354 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3355 disambiguate_my_imports();
3356 finish_committed_masters();
3357
3358 if (resolve_done) {
11fdf7f2 3359 ceph_assert(mds->is_resolve());
7c673cae
FG
3360 trim_unlinked_inodes();
3361 recalc_auth_bits(false);
3362 resolve_done.release()->complete(0);
3363 } else {
11fdf7f2 3364 // I am survivor.
7c673cae
FG
3365 maybe_send_pending_rejoins();
3366 }
3367}
3368
9f95a23c 3369void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
7c673cae
FG
3370{
3371 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3372 mds_rank_t from = mds_rank_t(ack->get_source().num());
3373
3374 if (!resolve_ack_gather.count(from) ||
3375 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
7c673cae
FG
3376 return;
3377 }
3378
3379 if (ambiguous_slave_updates.count(from)) {
11fdf7f2
TL
3380 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3381 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
7c673cae
FG
3382 }
3383
11fdf7f2
TL
3384 for (const auto &p : ack->commit) {
3385 dout(10) << " commit on slave " << p.first << dendl;
7c673cae
FG
3386
3387 if (ambiguous_slave_updates.count(from)) {
11fdf7f2 3388 remove_ambiguous_slave_update(p.first, from);
7c673cae
FG
3389 continue;
3390 }
3391
3392 if (mds->is_resolve()) {
3393 // replay
e306af50 3394 MDSlaveUpdate *su = get_uncommitted_slave(p.first, from);
11fdf7f2 3395 ceph_assert(su);
7c673cae
FG
3396
3397 // log commit
11fdf7f2 3398 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
7c673cae 3399 ESlaveUpdate::OP_COMMIT, su->origop),
11fdf7f2 3400 new C_MDC_SlaveCommit(this, from, p.first));
7c673cae
FG
3401 mds->mdlog->flush();
3402
e306af50 3403 finish_uncommitted_slave(p.first);
7c673cae 3404 } else {
11fdf7f2 3405 MDRequestRef mdr = request_get(p.first);
7c673cae 3406 // information about master imported caps
11fdf7f2
TL
3407 if (p.second.length() > 0)
3408 mdr->more()->inode_import.share(p.second);
7c673cae 3409
11fdf7f2 3410 ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
7c673cae
FG
3411 request_finish(mdr);
3412 }
3413 }
3414
11fdf7f2
TL
3415 for (const auto &metareq : ack->abort) {
3416 dout(10) << " abort on slave " << metareq << dendl;
7c673cae
FG
3417
3418 if (mds->is_resolve()) {
e306af50 3419 MDSlaveUpdate *su = get_uncommitted_slave(metareq, from);
11fdf7f2 3420 ceph_assert(su);
7c673cae
FG
3421
3422 // perform rollback (and journal a rollback entry)
3423 // note: this will hold up the resolve a bit, until the rollback entries journal.
3424 MDRequestRef null_ref;
3425 switch (su->origop) {
3426 case ESlaveUpdate::LINK:
3427 mds->server->do_link_rollback(su->rollback, from, null_ref);
3428 break;
3429 case ESlaveUpdate::RENAME:
3430 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3431 break;
3432 case ESlaveUpdate::RMDIR:
3433 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3434 break;
3435 default:
3436 ceph_abort();
3437 }
3438 } else {
11fdf7f2 3439 MDRequestRef mdr = request_get(metareq);
7c673cae
FG
3440 mdr->aborted = true;
3441 if (mdr->slave_request) {
3442 if (mdr->slave_did_prepare()) // journaling slave prepare ?
11fdf7f2 3443 add_rollback(metareq, from);
7c673cae
FG
3444 } else {
3445 request_finish(mdr);
3446 }
3447 }
3448 }
3449
11fdf7f2 3450 if (!ambiguous_slave_updates.count(from)) {
7c673cae 3451 resolve_ack_gather.erase(from);
11fdf7f2 3452 maybe_finish_slave_resolve();
7c673cae 3453 }
7c673cae
FG
3454}
3455
e306af50 3456void MDCache::add_uncommitted_slave(metareqid_t reqid, LogSegment *ls, mds_rank_t master, MDSlaveUpdate *su)
7c673cae 3457{
e306af50
TL
3458 auto const &ret = uncommitted_slaves.emplace(std::piecewise_construct,
3459 std::forward_as_tuple(reqid),
3460 std::forward_as_tuple());
3461 ceph_assert(ret.second);
3462 ls->uncommitted_slaves.insert(reqid);
3463 uslave &u = ret.first->second;
3464 u.master = master;
3465 u.ls = ls;
3466 u.su = su;
3467 if (su == nullptr) {
3468 return;
3469 }
7c673cae
FG
3470 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3471 uncommitted_slave_rename_olddir[*p]++;
3472 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3473 uncommitted_slave_unlink[*p]++;
3474}
3475
e306af50 3476void MDCache::finish_uncommitted_slave(metareqid_t reqid, bool assert_exist)
7c673cae 3477{
e306af50
TL
3478 auto it = uncommitted_slaves.find(reqid);
3479 if (it == uncommitted_slaves.end()) {
3480 ceph_assert(!assert_exist);
3481 return;
3482 }
3483 uslave &u = it->second;
3484 MDSlaveUpdate* su = u.su;
3485
3486 if (!u.waiters.empty()) {
3487 mds->queue_waiters(u.waiters);
3488 }
3489 u.ls->uncommitted_slaves.erase(reqid);
3490 uncommitted_slaves.erase(it);
7c673cae 3491
e306af50
TL
3492 if (su == nullptr) {
3493 return;
3494 }
7c673cae
FG
3495 // discard the non-auth subtree we renamed out of
3496 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3497 CInode *diri = *p;
3498 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
11fdf7f2 3499 ceph_assert(it != uncommitted_slave_rename_olddir.end());
7c673cae
FG
3500 it->second--;
3501 if (it->second == 0) {
3502 uncommitted_slave_rename_olddir.erase(it);
9f95a23c
TL
3503 auto&& ls = diri->get_dirfrags();
3504 for (const auto& dir : ls) {
3505 CDir *root = get_subtree_root(dir);
7c673cae
FG
3506 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3507 try_trim_non_auth_subtree(root);
9f95a23c 3508 if (dir != root)
7c673cae
FG
3509 break;
3510 }
3511 }
3512 } else
11fdf7f2 3513 ceph_assert(it->second > 0);
7c673cae
FG
3514 }
 3515 // remove the inodes that were unlinked by the slave update
3516 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3517 CInode *in = *p;
3518 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
11fdf7f2 3519 ceph_assert(it != uncommitted_slave_unlink.end());
7c673cae
FG
3520 it->second--;
3521 if (it->second == 0) {
3522 uncommitted_slave_unlink.erase(it);
3523 if (!in->get_projected_parent_dn())
3524 mds->mdcache->remove_inode_recursive(in);
3525 } else
11fdf7f2 3526 ceph_assert(it->second > 0);
7c673cae
FG
3527 }
3528 delete su;
3529}
3530
e306af50 3531MDSlaveUpdate* MDCache::get_uncommitted_slave(metareqid_t reqid, mds_rank_t master)
7c673cae
FG
3532{
3533
e306af50
TL
3534 MDSlaveUpdate* su = nullptr;
3535 auto it = uncommitted_slaves.find(reqid);
3536 if (it != uncommitted_slaves.end() &&
3537 it->second.master == master) {
3538 su = it->second.su;
7c673cae
FG
3539 }
3540 return su;
3541}
3542
e306af50 3543void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
f91f0fd5 3544 auto p = resolve_need_rollback.find(reqid);
11fdf7f2 3545 ceph_assert(p != resolve_need_rollback.end());
e306af50
TL
3546 if (mds->is_resolve()) {
3547 finish_uncommitted_slave(reqid, false);
3548 } else if (mdr) {
3549 finish_uncommitted_slave(mdr->reqid, mdr->more()->slave_update_journaled);
3550 }
11fdf7f2
TL
3551 resolve_need_rollback.erase(p);
3552 maybe_finish_slave_resolve();
7c673cae
FG
3553}
3554
3555void MDCache::disambiguate_other_imports()
3556{
3557 dout(10) << "disambiguate_other_imports" << dendl;
3558
3559 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3560 // other nodes' ambiguous imports
3561 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3562 p != other_ambiguous_imports.end();
3563 ++p) {
3564 mds_rank_t who = p->first;
3565 dout(10) << "ambiguous imports for mds." << who << dendl;
3566
3567 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3568 q != p->second.end();
3569 ++q) {
3570 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3571 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3572 CDir *dir = get_force_dirfrag(q->first, recovering);
3573 if (!dir) continue;
3574
3575 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3576 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3577 dout(10) << " mds." << who << " did import " << *dir << dendl;
3578 adjust_bounded_subtree_auth(dir, q->second, who);
3579 try_subtree_merge(dir);
3580 } else {
3581 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3582 }
3583 }
3584 }
3585 other_ambiguous_imports.clear();
3586}
3587
3588void MDCache::disambiguate_my_imports()
3589{
3590 dout(10) << "disambiguate_my_imports" << dendl;
3591
3592 if (!mds->is_resolve()) {
11fdf7f2 3593 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3594 return;
3595 }
3596
3597 disambiguate_other_imports();
3598
3599 // my ambiguous imports
3600 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3601 while (!my_ambiguous_imports.empty()) {
3602 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3603
3604 CDir *dir = get_dirfrag(q->first);
11fdf7f2 3605 ceph_assert(dir);
7c673cae
FG
3606
3607 if (dir->authority() != me_ambig) {
3608 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3609 cancel_ambiguous_import(dir);
3610
3611 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3612
3613 // subtree may have been swallowed by another node claiming dir
3614 // as their own.
3615 CDir *root = get_subtree_root(dir);
3616 if (root != dir)
3617 dout(10) << " subtree root is " << *root << dendl;
11fdf7f2 3618 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
7c673cae
FG
3619 try_trim_non_auth_subtree(root);
3620 } else {
3621 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3622 finish_ambiguous_import(q->first);
3623 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3624 }
3625 }
11fdf7f2 3626 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3627 mds->mdlog->flush();
3628
3629 // verify all my subtrees are unambiguous!
3630 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3631 p != subtrees.end();
3632 ++p) {
3633 CDir *dir = p->first;
3634 if (dir->is_ambiguous_dir_auth()) {
3635 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3636 }
11fdf7f2 3637 ceph_assert(!dir->is_ambiguous_dir_auth());
7c673cae
FG
3638 }
3639
3640 show_subtrees();
3641}
3642
3643
3644void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3645{
11fdf7f2 3646 ceph_assert(my_ambiguous_imports.count(base) == 0);
7c673cae
FG
3647 my_ambiguous_imports[base] = bounds;
3648}
3649
3650
3651void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3652{
3653 // make a list
3654 vector<dirfrag_t> binos;
3655 for (set<CDir*>::iterator p = bounds.begin();
3656 p != bounds.end();
3657 ++p)
3658 binos.push_back((*p)->dirfrag());
3659
3660 // note: this can get called twice if the exporter fails during recovery
3661 if (my_ambiguous_imports.count(base->dirfrag()))
3662 my_ambiguous_imports.erase(base->dirfrag());
3663
3664 add_ambiguous_import(base->dirfrag(), binos);
3665}
3666
3667void MDCache::cancel_ambiguous_import(CDir *dir)
3668{
3669 dirfrag_t df = dir->dirfrag();
11fdf7f2 3670 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3671 dout(10) << "cancel_ambiguous_import " << df
3672 << " bounds " << my_ambiguous_imports[df]
3673 << " " << *dir
3674 << dendl;
3675 my_ambiguous_imports.erase(df);
3676}
3677
3678void MDCache::finish_ambiguous_import(dirfrag_t df)
3679{
11fdf7f2 3680 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3681 vector<dirfrag_t> bounds;
3682 bounds.swap(my_ambiguous_imports[df]);
3683 my_ambiguous_imports.erase(df);
3684
3685 dout(10) << "finish_ambiguous_import " << df
3686 << " bounds " << bounds
3687 << dendl;
3688 CDir *dir = get_dirfrag(df);
11fdf7f2 3689 ceph_assert(dir);
7c673cae
FG
3690
3691 // adjust dir_auth, import maps
3692 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3693 try_subtree_merge(dir);
3694}
3695
3696void MDCache::remove_inode_recursive(CInode *in)
3697{
3698 dout(10) << "remove_inode_recursive " << *in << dendl;
9f95a23c
TL
3699 auto&& ls = in->get_dirfrags();
3700 for (const auto& subdir : ls) {
3701 dout(10) << " removing dirfrag " << *subdir << dendl;
94b18763
FG
3702 auto it = subdir->items.begin();
3703 while (it != subdir->items.end()) {
3704 CDentry *dn = it->second;
3705 ++it;
7c673cae
FG
3706 CDentry::linkage_t *dnl = dn->get_linkage();
3707 if (dnl->is_primary()) {
3708 CInode *tin = dnl->get_inode();
31f18b77 3709 subdir->unlink_inode(dn, false);
7c673cae
FG
3710 remove_inode_recursive(tin);
3711 }
3712 subdir->remove_dentry(dn);
3713 }
3714
3715 if (subdir->is_subtree_root())
3716 remove_subtree(subdir);
3717 in->close_dirfrag(subdir->dirfrag().frag);
3718 }
3719 remove_inode(in);
3720}
3721
11fdf7f2 3722bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
7c673cae 3723{
11fdf7f2 3724 ceph_assert(!in->is_auth());
7c673cae
FG
3725
3726 dout(10) << __func__ << ":" << *in << dendl;
3727
3728 // Recurse into any dirfrags beneath this inode
9f95a23c
TL
3729 auto&& ls = in->get_dirfrags();
3730 for (const auto& subdir : ls) {
7c673cae
FG
3731 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3732 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3733 return true;
3734 }
3735
3736 for (auto &it : subdir->items) {
3737 CDentry *dn = it.second;
3738 CDentry::linkage_t *dnl = dn->get_linkage();
3739 if (dnl->is_primary()) {
3740 CInode *tin = dnl->get_inode();
3741
3742 /* Remote strays with linkage (i.e. hardlinks) should not be
3743 * expired, because they may be the target of
3744 * a rename() as the owning MDS shuts down */
3745 if (!tin->is_stray() && tin->inode.nlink) {
3746 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3747 return true;
3748 }
3749
3750 const bool abort = expire_recursive(tin, expiremap);
3751 if (abort) {
3752 return true;
3753 }
3754 }
3755 if (dn->lru_is_expireable()) {
3756 trim_dentry(dn, expiremap);
3757 } else {
3758 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3759 return true;
3760 }
3761 }
3762 }
3763
3764 return false;
3765}
3766
3767void MDCache::trim_unlinked_inodes()
3768{
3769 dout(7) << "trim_unlinked_inodes" << dendl;
81eedcae
TL
3770 int count = 0;
3771 vector<CInode*> q;
94b18763 3772 for (auto &p : inode_map) {
b32b8144 3773 CInode *in = p.second;
7c673cae
FG
3774 if (in->get_parent_dn() == NULL && !in->is_base()) {
3775 dout(7) << " will trim from " << *in << dendl;
3776 q.push_back(in);
3777 }
81eedcae
TL
3778
3779 if (!(++count % 1000))
3780 mds->heartbeat_reset();
3781 }
81eedcae
TL
3782 for (auto& in : q) {
3783 remove_inode_recursive(in);
3784
3785 if (!(++count % 1000))
3786 mds->heartbeat_reset();
7c673cae 3787 }
7c673cae
FG
3788}
3789
3790/** recalc_auth_bits()
3791 * once subtree auth is disambiguated, we need to adjust all the
3792 * auth and dirty bits in our cache before moving on.
3793 */
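// The per-object marking applied below, in miniature (a sketch only; the
// real code uses the per-class CInode/CDir/CDentry state flags directly):
//
//   template <typename Obj>
//   void mark_auth_bits(Obj *o, bool auth, bool replay) {
//     if (auth) {
//       o->state_set(Obj::STATE_AUTH);
//     } else {
//       o->state_clear(Obj::STATE_AUTH);
//       if (!replay)
//         o->state_set(Obj::STATE_REJOINING);  // plus clearing dirty state
//     }
//   }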
3794void MDCache::recalc_auth_bits(bool replay)
3795{
3796 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3797
3798 if (root) {
3799 root->inode_auth.first = mds->mdsmap->get_root();
3800 bool auth = mds->get_nodeid() == root->inode_auth.first;
3801 if (auth) {
3802 root->state_set(CInode::STATE_AUTH);
3803 } else {
3804 root->state_clear(CInode::STATE_AUTH);
3805 if (!replay)
3806 root->state_set(CInode::STATE_REJOINING);
3807 }
3808 }
3809
3810 set<CInode*> subtree_inodes;
3811 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3812 p != subtrees.end();
3813 ++p) {
3814 if (p->first->dir_auth.first == mds->get_nodeid())
3815 subtree_inodes.insert(p->first->inode);
3816 }
3817
3818 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3819 p != subtrees.end();
3820 ++p) {
3821 if (p->first->inode->is_mdsdir()) {
3822 CInode *in = p->first->inode;
3823 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3824 if (auth) {
3825 in->state_set(CInode::STATE_AUTH);
3826 } else {
3827 in->state_clear(CInode::STATE_AUTH);
3828 if (!replay)
3829 in->state_set(CInode::STATE_REJOINING);
3830 }
3831 }
3832
9f95a23c
TL
3833 std::queue<CDir*> dfq; // dirfrag queue
3834 dfq.push(p->first);
7c673cae
FG
3835
3836 bool auth = p->first->authority().first == mds->get_nodeid();
3837 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3838
3839 while (!dfq.empty()) {
3840 CDir *dir = dfq.front();
9f95a23c 3841 dfq.pop();
7c673cae
FG
3842
3843 // dir
3844 if (auth) {
3845 dir->state_set(CDir::STATE_AUTH);
3846 } else {
3847 dir->state_clear(CDir::STATE_AUTH);
3848 if (!replay) {
3849 // close empty non-auth dirfrag
3850 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3851 dir->inode->close_dirfrag(dir->get_frag());
3852 continue;
3853 }
3854 dir->state_set(CDir::STATE_REJOINING);
3855 dir->state_clear(CDir::STATE_COMPLETE);
3856 if (dir->is_dirty())
3857 dir->mark_clean();
3858 }
3859 }
3860
3861 // dentries in this dir
94b18763 3862 for (auto &p : dir->items) {
7c673cae 3863 // dn
94b18763 3864 CDentry *dn = p.second;
7c673cae
FG
3865 CDentry::linkage_t *dnl = dn->get_linkage();
3866 if (auth) {
3867 dn->state_set(CDentry::STATE_AUTH);
3868 } else {
3869 dn->state_clear(CDentry::STATE_AUTH);
3870 if (!replay) {
3871 dn->state_set(CDentry::STATE_REJOINING);
3872 if (dn->is_dirty())
3873 dn->mark_clean();
3874 }
3875 }
3876
3877 if (dnl->is_primary()) {
3878 // inode
3879 CInode *in = dnl->get_inode();
3880 if (auth) {
3881 in->state_set(CInode::STATE_AUTH);
3882 } else {
3883 in->state_clear(CInode::STATE_AUTH);
3884 if (!replay) {
3885 in->state_set(CInode::STATE_REJOINING);
3886 if (in->is_dirty())
3887 in->mark_clean();
3888 if (in->is_dirty_parent())
3889 in->clear_dirty_parent();
3890 // avoid touching scatterlocks for our subtree roots!
3891 if (subtree_inodes.count(in) == 0)
3892 in->clear_scatter_dirty();
3893 }
3894 }
3895 // recurse?
9f95a23c
TL
3896 if (in->is_dir()) {
3897 auto&& dfv = in->get_nested_dirfrags();
3898 for (const auto& dir : dfv) {
3899 dfq.push(dir);
3900 }
3901 }
7c673cae
FG
3902 }
3903 }
3904 }
3905 }
3906
3907 show_subtrees();
3908 show_cache();
3909}
3910
3911
3912
3913// ===========================================================================
3914// REJOIN
3915
3916/*
3917 * notes on scatterlock recovery:
3918 *
3919 * - recovering inode replica sends scatterlock data for any subtree
3920 * roots (the only ones that are possibly dirty).
3921 *
3922 * - surviving auth incorporates any provided scatterlock data. any
3923 * pending gathers are then finished, as with the other lock types.
3924 *
3925 * that takes care of surviving auth + (recovering replica)*.
3926 *
3927 * - surviving replica sends strong_inode, which includes current
3928 * scatterlock state, AND any dirty scatterlock data. this
3929 * provides the recovering auth with everything it might need.
3930 *
3931 * - recovering auth must pick initial scatterlock state based on
3932 * (weak|strong) rejoins.
3933 * - always assimilate scatterlock data (it can't hurt)
3934 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3935 * - include base inode in ack for all inodes that saw scatterlock content
3936 *
3937 * also, for scatter gather,
3938 *
3939 * - auth increments {frag,r}stat.version on completion of any gather.
3940 *
3941 * - auth incorporates changes in a gather _only_ if the version
3942 * matches.
3943 *
3944 * - replica discards changes any time the scatterlock syncs, and
3945 * after recovery.
3946 */
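// Condensed, the "pick initial scatterlock state" rule above (a sketch
// with a hypothetical predicate; the real decision is made per lock via
// infer_state_from_strong_rejoin() during strong rejoin handling):
//
//   in->decode_lock_state(lock_type, replica_scatterlock_data);  // always assimilate
//   if (any_surviving_replica_reports_scatter)
//     lock->set_state(LOCK_MIX);   // scatter
//   else
//     lock->set_state(LOCK_SYNC);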
3947
3948void MDCache::dump_rejoin_status(Formatter *f) const
3949{
3950 f->open_object_section("rejoin_status");
3951 f->dump_stream("rejoin_gather") << rejoin_gather;
3952 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3953 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3954 f->close_section();
3955}
3956
11fdf7f2 3957void MDCache::rejoin_start(MDSContext *rejoin_done_)
7c673cae
FG
3958{
3959 dout(10) << "rejoin_start" << dendl;
11fdf7f2 3960 ceph_assert(!rejoin_done);
7c673cae
FG
3961 rejoin_done.reset(rejoin_done_);
3962
3963 rejoin_gather = recovery_set;
 3964 // need to finish opening cap inodes before sending cache rejoins
3965 rejoin_gather.insert(mds->get_nodeid());
3966 process_imported_caps();
3967}
3968
3969/*
3970 * rejoin phase!
3971 *
11fdf7f2 3972 * this initiates rejoin. it should be called before we get any
7c673cae
FG
3973 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3974 *
3975 * we start out by sending rejoins to everyone in the recovery set.
3976 *
3977 * if we are rejoin, send for all regions in our cache.
11fdf7f2 3978 * if we are active|stopping, send only to nodes that are rejoining.
7c673cae
FG
3979 */
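// The per-peer choice described above, condensed (a sketch; the actual
// loop below also skips ourselves and peers we already sent to):
//
//   for (mds_rank_t r : recovery_set) {
//     if (mds->is_rejoin())
//       rejoins[r] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
//     else if (mds->mdsmap->is_rejoin(r))
//       rejoins[r] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
//   }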
3980void MDCache::rejoin_send_rejoins()
3981{
3982 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3983
3984 if (rejoin_gather.count(mds->get_nodeid())) {
3985 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3986 rejoins_pending = true;
3987 return;
3988 }
3989 if (!resolve_gather.empty()) {
3990 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3991 << resolve_gather << ")" << dendl;
3992 rejoins_pending = true;
3993 return;
3994 }
3995
11fdf7f2
TL
3996 ceph_assert(!migrator->is_importing());
3997 ceph_assert(!migrator->is_exporting());
7c673cae
FG
3998
3999 if (!mds->is_rejoin()) {
4000 disambiguate_other_imports();
4001 }
4002
9f95a23c 4003 map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
7c673cae
FG
4004
4005
4006 // if i am rejoining, send a rejoin to everyone.
4007 // otherwise, just send to others who are rejoining.
9f95a23c
TL
4008 for (const auto& rank : recovery_set) {
4009 if (rank == mds->get_nodeid()) continue; // nothing to myself!
4010 if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node!
7c673cae 4011 if (mds->is_rejoin())
9f95a23c
TL
4012 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
4013 else if (mds->mdsmap->is_rejoin(rank))
4014 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
7c673cae
FG
4015 }
4016
4017 if (mds->is_rejoin()) {
11fdf7f2
TL
4018 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4019 for (auto& p : cap_exports) {
4020 mds_rank_t target = p.second.first;
7c673cae
FG
4021 if (rejoins.count(target) == 0)
4022 continue;
11fdf7f2
TL
4023 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4024 Session *session = nullptr;
4025 auto it = client_exports.find(q->first);
4026 if (it != client_exports.end()) {
4027 session = it->second.first;
4028 if (session)
4029 it->second.second.insert(target);
4030 } else {
4031 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4032 auto& r = client_exports[q->first];
4033 r.first = session;
4034 if (session)
4035 r.second.insert(target);
4036 }
4037 if (session) {
4038 ++q;
4039 } else {
4040 // remove reconnect with no session
4041 p.second.second.erase(q++);
4042 }
4043 }
4044 rejoins[target]->cap_exports[p.first] = p.second.second;
7c673cae 4045 }
11fdf7f2
TL
4046 for (auto& p : client_exports) {
4047 Session *session = p.second.first;
4048 for (auto& q : p.second.second) {
4049 auto rejoin = rejoins[q];
4050 rejoin->client_map[p.first] = session->info.inst;
4051 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4052 }
7c673cae
FG
4053 }
4054 }
4055
4056
4057 // check all subtrees
4058 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4059 p != subtrees.end();
4060 ++p) {
4061 CDir *dir = p->first;
11fdf7f2 4062 ceph_assert(dir->is_subtree_root());
7c673cae
FG
4063 if (dir->is_ambiguous_dir_auth()) {
4064 // exporter is recovering, importer is survivor.
11fdf7f2
TL
4065 ceph_assert(rejoins.count(dir->authority().first));
4066 ceph_assert(!rejoins.count(dir->authority().second));
7c673cae
FG
4067 continue;
4068 }
4069
4070 // my subtree?
4071 if (dir->is_auth())
4072 continue; // skip my own regions!
4073
4074 mds_rank_t auth = dir->get_dir_auth().first;
11fdf7f2 4075 ceph_assert(auth >= 0);
7c673cae
FG
4076 if (rejoins.count(auth) == 0)
4077 continue; // don't care about this node's subtrees
4078
4079 rejoin_walk(dir, rejoins[auth]);
4080 }
4081
4082 // rejoin root inodes, too
11fdf7f2 4083 for (auto &p : rejoins) {
7c673cae
FG
4084 if (mds->is_rejoin()) {
4085 // weak
11fdf7f2
TL
4086 if (p.first == 0 && root) {
4087 p.second->add_weak_inode(root->vino());
7c673cae
FG
4088 if (root->is_dirty_scattered()) {
4089 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4090 p.second->add_scatterlock_state(root);
7c673cae
FG
4091 }
4092 }
11fdf7f2 4093 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
7c673cae 4094 if (in)
11fdf7f2 4095 p.second->add_weak_inode(in->vino());
7c673cae
FG
4096 }
4097 } else {
4098 // strong
11fdf7f2
TL
4099 if (p.first == 0 && root) {
4100 p.second->add_strong_inode(root->vino(),
7c673cae
FG
4101 root->get_replica_nonce(),
4102 root->get_caps_wanted(),
4103 root->filelock.get_state(),
4104 root->nestlock.get_state(),
4105 root->dirfragtreelock.get_state());
4106 root->state_set(CInode::STATE_REJOINING);
4107 if (root->is_dirty_scattered()) {
4108 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4109 p.second->add_scatterlock_state(root);
7c673cae
FG
4110 }
4111 }
4112
11fdf7f2
TL
4113 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4114 p.second->add_strong_inode(in->vino(),
7c673cae
FG
4115 in->get_replica_nonce(),
4116 in->get_caps_wanted(),
4117 in->filelock.get_state(),
4118 in->nestlock.get_state(),
4119 in->dirfragtreelock.get_state());
4120 in->state_set(CInode::STATE_REJOINING);
4121 }
4122 }
4123 }
4124
4125 if (!mds->is_rejoin()) {
4126 // i am survivor. send strong rejoin.
4127 // note request remote_auth_pins, xlocks
4128 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4129 p != active_requests.end();
4130 ++p) {
4131 MDRequestRef& mdr = p->second;
4132 if (mdr->is_slave())
4133 continue;
4134 // auth pins
9f95a23c
TL
4135 for (const auto& q : mdr->object_states) {
4136 if (q.second.remote_auth_pinned == MDS_RANK_NONE)
4137 continue;
11fdf7f2 4138 if (!q.first->is_auth()) {
9f95a23c
TL
4139 mds_rank_t target = q.second.remote_auth_pinned;
4140 ceph_assert(target == q.first->authority().first);
4141 if (rejoins.count(target) == 0) continue;
4142 const auto& rejoin = rejoins[target];
7c673cae 4143
11fdf7f2 4144 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
7c673cae 4145 MDSCacheObjectInfo i;
11fdf7f2 4146 q.first->set_object_info(i);
7c673cae
FG
4147 if (i.ino)
4148 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4149 else
4150 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4151
4152 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
11fdf7f2 4153 mdr->more()->rename_inode == q.first)
7c673cae
FG
4154 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4155 mdr->reqid, mdr->attempt);
4156 }
4157 }
4158 // xlocks
11fdf7f2
TL
4159 for (const auto& q : mdr->locks) {
4160 auto lock = q.lock;
4161 auto obj = lock->get_parent();
4162 if (q.is_xlock() && !obj->is_auth()) {
4163 mds_rank_t who = obj->authority().first;
7c673cae 4164 if (rejoins.count(who) == 0) continue;
9f95a23c 4165 const auto& rejoin = rejoins[who];
7c673cae 4166
11fdf7f2 4167 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
7c673cae 4168 MDSCacheObjectInfo i;
11fdf7f2 4169 obj->set_object_info(i);
7c673cae 4170 if (i.ino)
11fdf7f2 4171 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
7c673cae
FG
4172 mdr->reqid, mdr->attempt);
4173 else
4174 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4175 mdr->reqid, mdr->attempt);
11fdf7f2
TL
4176 } else if (q.is_remote_wrlock()) {
4177 mds_rank_t who = q.wrlock_target;
4178 if (rejoins.count(who) == 0) continue;
9f95a23c 4179 const auto& rejoin = rejoins[who];
7c673cae 4180
11fdf7f2
TL
4181 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4182 MDSCacheObjectInfo i;
4183 obj->set_object_info(i);
4184 ceph_assert(i.ino);
4185 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4186 mdr->reqid, mdr->attempt);
4187 }
7c673cae
FG
4188 }
4189 }
4190 }
4191
4192 // send the messages
11fdf7f2
TL
4193 for (auto &p : rejoins) {
4194 ceph_assert(rejoin_sent.count(p.first) == 0);
4195 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4196 rejoin_sent.insert(p.first);
4197 rejoin_ack_gather.insert(p.first);
4198 mds->send_message_mds(p.second, p.first);
7c673cae
FG
4199 }
4200 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4201 rejoins_pending = false;
4202
4203 // nothing?
28e407b8 4204 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4205 dout(10) << "nothing to rejoin" << dendl;
4206 rejoin_gather_finish();
4207 }
4208}
4209
4210
4211/**
4212 * rejoin_walk - build rejoin declarations for a subtree
4213 *
4214 * @param dir subtree root
4215 * @param rejoin rejoin message
4216 *
4217 * from a rejoining node:
4218 * weak dirfrag
4219 * weak dentries (w/ connectivity)
4220 *
4221 * from a surviving node:
4222 * strong dirfrag
4223 * strong dentries (no connectivity!)
4224 * strong inodes
4225 */
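// Rough shape of what each variant declares (a sketch; the two branches
// of the function below are the authoritative list):
//
//   if (mds->is_rejoin()) {                       // weak: connectivity only
//     rejoin->add_weak_dirfrag(df);
//     rejoin->add_weak_primary_dentry(...);
//   } else {                                      // strong: replica + lock state
//     rejoin->add_strong_dirfrag(df, nonce, dir_rep);
//     rejoin->add_strong_dentry(...);
//     rejoin->add_strong_inode(...);
//   }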
9f95a23c 4226void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
7c673cae
FG
4227{
4228 dout(10) << "rejoin_walk " << *dir << dendl;
4229
9f95a23c 4230 std::vector<CDir*> nested; // finish this dir, then do nested items
7c673cae
FG
4231
4232 if (mds->is_rejoin()) {
4233 // WEAK
4234 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4235 for (auto &p : dir->items) {
4236 CDentry *dn = p.second;
11fdf7f2 4237 ceph_assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4238 CDentry::linkage_t *dnl = dn->get_linkage();
4239 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
11fdf7f2 4240 ceph_assert(dnl->is_primary());
7c673cae 4241 CInode *in = dnl->get_inode();
11fdf7f2 4242 ceph_assert(dnl->get_inode()->is_dir());
94b18763 4243 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
9f95a23c
TL
4244 {
4245 auto&& dirs = in->get_nested_dirfrags();
4246 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4247 }
7c673cae
FG
4248 if (in->is_dirty_scattered()) {
4249 dout(10) << " sending scatterlock state on " << *in << dendl;
4250 rejoin->add_scatterlock_state(in);
4251 }
4252 }
4253 } else {
4254 // STRONG
4255 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4256 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4257 dir->state_set(CDir::STATE_REJOINING);
4258
11fdf7f2 4259 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
94b18763 4260 CDentry *dn = it->second;
11fdf7f2
TL
4261 ++it;
4262 dn->state_set(CDentry::STATE_REJOINING);
7c673cae 4263 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2
TL
4264 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4265
4266	// trim snap dentries, because they may have been pruned by
4267 // their auth mds (snap deleted)
4268 if (dn->last != CEPH_NOSNAP) {
4269 if (in && !in->remote_parents.empty()) {
4270 // unlink any stale remote snap dentry.
4271 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4272 CDentry *remote_dn = *it2;
4273 ++it2;
4274 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4275 remote_dn->unlink_remote(remote_dn->get_linkage());
4276 }
4277 }
4278 if (dn->lru_is_expireable()) {
4279 if (!dnl->is_null())
4280 dir->unlink_inode(dn, false);
4281 if (in)
4282 remove_inode(in);
4283 dir->remove_dentry(dn);
4284 continue;
4285 } else {
 4287	  // Inventing a null/remote dentry shouldn't cause problems
4287 ceph_assert(!dnl->is_primary());
4288 }
4289 }
4290
7c673cae 4291 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4292 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4293 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4294 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4295 dnl->is_remote() ? dnl->get_remote_d_type():0,
4296 dn->get_replica_nonce(),
4297 dn->lock.get_state());
4298 dn->state_set(CDentry::STATE_REJOINING);
4299 if (dnl->is_primary()) {
4300 CInode *in = dnl->get_inode();
4301 dout(15) << " add_strong_inode " << *in << dendl;
4302 rejoin->add_strong_inode(in->vino(),
4303 in->get_replica_nonce(),
4304 in->get_caps_wanted(),
4305 in->filelock.get_state(),
4306 in->nestlock.get_state(),
4307 in->dirfragtreelock.get_state());
4308 in->state_set(CInode::STATE_REJOINING);
9f95a23c
TL
4309 {
4310 auto&& dirs = in->get_nested_dirfrags();
4311 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4312 }
7c673cae
FG
4313 if (in->is_dirty_scattered()) {
4314 dout(10) << " sending scatterlock state on " << *in << dendl;
4315 rejoin->add_scatterlock_state(in);
4316 }
4317 }
4318 }
4319 }
4320
4321 // recurse into nested dirs
9f95a23c
TL
4322 for (const auto& dir : nested) {
4323 rejoin_walk(dir, rejoin);
4324 }
7c673cae
FG
4325}
4326
4327
4328/*
4329 * i got a rejoin.
4330 * - reply with the lockstate
4331 *
4332 * if i am active|stopping,
4333 * - remove source from replica list for everything not referenced here.
7c673cae 4334 */
9f95a23c 4335void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
7c673cae
FG
4336{
4337 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4338 << " (" << m->get_payload().length() << " bytes)"
4339 << dendl;
4340
4341 switch (m->op) {
4342 case MMDSCacheRejoin::OP_WEAK:
4343 handle_cache_rejoin_weak(m);
4344 break;
4345 case MMDSCacheRejoin::OP_STRONG:
4346 handle_cache_rejoin_strong(m);
4347 break;
4348 case MMDSCacheRejoin::OP_ACK:
4349 handle_cache_rejoin_ack(m);
4350 break;
4351
4352 default:
4353 ceph_abort();
4354 }
7c673cae
FG
4355}
4356
4357
4358/*
4359 * handle_cache_rejoin_weak
4360 *
4361 * the sender
4362 * - is recovering from their journal.
4363 * - may have incorrect (out of date) inode contents
4364 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4365 *
4366 * if the sender didn't trim_non_auth(), they
4367 * - may have incorrect (out of date) dentry/inode linkage
4368 * - may have deleted/purged inodes
4369 * and i may have to go to disk to get accurate inode contents. yuck.
7c673cae 4370 */
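// Control flow of the handler below, condensed (a sketch):
//
//   if (survivor) {
//     // build an OP_ACK right away: import claimed caps, confirm
//     // dirfrags/dentries/inodes with fresh nonces, scour unmentioned
//     // replicas, and send the ack back to the rejoining peer.
//   } else {
//     // we are rejoining too: just record cap imports and replicas;
//     // acks are sent later, from rejoin_gather_finish().
//   }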
9f95a23c 4371void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
7c673cae
FG
4372{
4373 mds_rank_t from = mds_rank_t(weak->get_source().num());
4374
4375 // possible response(s)
9f95a23c 4376 ref_t<MMDSCacheRejoin> ack; // if survivor
7c673cae
FG
4377 set<vinodeno_t> acked_inodes; // if survivor
4378 set<SimpleLock *> gather_locks; // if survivor
4379 bool survivor = false; // am i a survivor?
4380
4381 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4382 survivor = true;
 4383    dout(10) << "i am a survivor, and will ack immediately" << dendl;
9f95a23c 4384 ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
7c673cae
FG
4385
4386 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4387
4388 // check cap exports
4389 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4390 CInode *in = get_inode(p->first);
11fdf7f2 4391 ceph_assert(!in || in->is_auth());
7c673cae
FG
4392 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4393 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4394 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4395 Capability::Import& im = imported_caps[p->first][q->first];
4396 if (cap) {
4397 im.cap_id = cap->get_cap_id();
4398 im.issue_seq = cap->get_last_seq();
4399 im.mseq = cap->get_mseq();
4400 } else {
4401 // all are zero
4402 }
4403 }
4404 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4405 }
4406
11fdf7f2 4407 encode(imported_caps, ack->imported_caps);
7c673cae 4408 } else {
11fdf7f2 4409 ceph_assert(mds->is_rejoin());
7c673cae
FG
4410
4411 // we may have already received a strong rejoin from the sender.
4412 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
11fdf7f2 4413 ceph_assert(gather_locks.empty());
7c673cae
FG
4414
4415 // check cap exports.
4416 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
11fdf7f2
TL
4417 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4418 weak->client_metadata_map.end());
7c673cae
FG
4419
4420 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4421 CInode *in = get_inode(p->first);
11fdf7f2 4422 ceph_assert(!in || in->is_auth());
7c673cae
FG
4423 // note
4424 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4425 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4426 cap_imports[p->first][q->first][from] = q->second;
4427 }
4428 }
4429 }
4430
4431 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4432 for (const auto &p : weak->inode_scatterlocks) {
4433 CInode *in = get_inode(p.first);
4434 ceph_assert(in);
4435 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4436 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4437 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4438 if (!survivor)
4439 rejoin_potential_updated_scatterlocks.insert(in);
4440 }
4441
4442 // recovering peer may send incorrect dirfrags here. we need to
4443 // infer which dirfrag they meant. the ack will include a
4444 // strong_dirfrag that will set them straight on the fragmentation.
4445
4446 // walk weak map
4447 set<CDir*> dirs_to_share;
11fdf7f2
TL
4448 for (const auto &p : weak->weak_dirfrags) {
4449 CInode *diri = get_inode(p.ino);
7c673cae 4450 if (!diri)
11fdf7f2
TL
4451 dout(0) << " missing dir ino " << p.ino << dendl;
4452 ceph_assert(diri);
7c673cae 4453
11fdf7f2
TL
4454 frag_vec_t leaves;
4455 if (diri->dirfragtree.is_leaf(p.frag)) {
4456 leaves.push_back(p.frag);
7c673cae 4457 } else {
11fdf7f2
TL
4458 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4459 if (leaves.empty())
4460 leaves.push_back(diri->dirfragtree[p.frag.value()]);
7c673cae 4461 }
11fdf7f2
TL
4462 for (const auto& leaf : leaves) {
4463 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4464 if (!dir) {
11fdf7f2 4465 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
7c673cae
FG
4466 continue;
4467 }
11fdf7f2 4468 ceph_assert(dir);
7c673cae 4469 if (dirs_to_share.count(dir)) {
11fdf7f2 4470 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4471 } else {
4472 dirs_to_share.insert(dir);
4473 unsigned nonce = dir->add_replica(from);
11fdf7f2 4474 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4475 if (ack) {
4476 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4477 ack->add_dirfrag_base(dir);
4478 }
4479 }
4480 }
4481 }
4482
11fdf7f2
TL
4483 for (const auto &p : weak->weak) {
4484 CInode *diri = get_inode(p.first);
7c673cae 4485 if (!diri)
11fdf7f2
TL
4486 dout(0) << " missing dir ino " << p.first << dendl;
4487 ceph_assert(diri);
7c673cae
FG
4488
4489 // weak dentries
4490 CDir *dir = 0;
11fdf7f2 4491 for (const auto &q : p.second) {
7c673cae
FG
4492 // locate proper dirfrag.
4493 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
11fdf7f2 4494 frag_t fg = diri->pick_dirfrag(q.first.name);
7c673cae
FG
4495 if (!dir || dir->get_frag() != fg) {
4496 dir = diri->get_dirfrag(fg);
4497 if (!dir)
4498 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
11fdf7f2
TL
4499 ceph_assert(dir);
4500 ceph_assert(dirs_to_share.count(dir));
7c673cae
FG
4501 }
4502
4503 // and dentry
11fdf7f2
TL
4504 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4505 ceph_assert(dn);
7c673cae 4506 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 4507 ceph_assert(dnl->is_primary());
7c673cae
FG
4508
4509 if (survivor && dn->is_replica(from))
4510 dentry_remove_replica(dn, from, gather_locks);
4511 unsigned dnonce = dn->add_replica(from);
4512 dout(10) << " have " << *dn << dendl;
4513 if (ack)
94b18763 4514 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4515 dnl->get_inode()->ino(), inodeno_t(0), 0,
4516 dnonce, dn->lock.get_replica_state());
4517
4518 // inode
4519 CInode *in = dnl->get_inode();
11fdf7f2 4520 ceph_assert(in);
7c673cae
FG
4521
4522 if (survivor && in->is_replica(from))
4523 inode_remove_replica(in, from, true, gather_locks);
4524 unsigned inonce = in->add_replica(from);
4525 dout(10) << " have " << *in << dendl;
4526
4527 // scatter the dirlock, just in case?
4528 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4529 in->filelock.set_state(LOCK_MIX);
4530
4531 if (ack) {
4532 acked_inodes.insert(in->vino());
4533 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4534 bufferlist bl;
4535 in->_encode_locks_state_for_rejoin(bl, from);
4536 ack->add_inode_locks(in, inonce, bl);
4537 }
4538 }
4539 }
4540
4541 // weak base inodes? (root, stray, etc.)
4542 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4543 p != weak->weak_inodes.end();
4544 ++p) {
4545 CInode *in = get_inode(*p);
11fdf7f2 4546 ceph_assert(in); // hmm fixme wrt stray?
7c673cae
FG
4547 if (survivor && in->is_replica(from))
4548 inode_remove_replica(in, from, true, gather_locks);
4549 unsigned inonce = in->add_replica(from);
4550 dout(10) << " have base " << *in << dendl;
4551
4552 if (ack) {
4553 acked_inodes.insert(in->vino());
4554 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4555 bufferlist bl;
4556 in->_encode_locks_state_for_rejoin(bl, from);
4557 ack->add_inode_locks(in, inonce, bl);
4558 }
4559 }
4560
11fdf7f2 4561 ceph_assert(rejoin_gather.count(from));
7c673cae
FG
4562 rejoin_gather.erase(from);
4563 if (survivor) {
4564 // survivor. do everything now.
11fdf7f2
TL
4565 for (const auto &p : weak->inode_scatterlocks) {
4566 CInode *in = get_inode(p.first);
4567 ceph_assert(in);
7c673cae
FG
4568 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4569 acked_inodes.insert(in->vino());
4570 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4571 }
4572
4573 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4574 mds->send_message(ack, weak->get_connection());
4575
4576 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4577 if (!(*p)->is_stable())
4578 mds->locker->eval_gather(*p);
4579 }
4580 } else {
4581 // done?
28e407b8 4582 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4583 rejoin_gather_finish();
4584 } else {
4585 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4586 }
4587 }
4588}
4589
7c673cae
FG
4590/*
4591 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4592 *
4593 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4594 * ack, the replica dne, and we can remove it from our replica maps.
4595 */
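// The scour rule above, as one predicate (a sketch with a hypothetical
// mentioned_in() helper; the lambda below walks inodes, dirfrags and
// dentries explicitly):
//
//   if (obj->is_replica(from) && (!ack || !mentioned_in(ack, obj)))
//     obj->remove_replica(from);   // sender no longer caches it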
9f95a23c 4596void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
7c673cae
FG
4597 set<vinodeno_t>& acked_inodes,
4598 set<SimpleLock *>& gather_locks)
4599{
4600 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4601
b32b8144 4602 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4603 // inode?
4604 if (in->is_auth() &&
4605 in->is_replica(from) &&
b32b8144 4606 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4607 inode_remove_replica(in, from, false, gather_locks);
4608 dout(10) << " rem " << *in << dendl;
4609 }
4610
b32b8144
FG
4611 if (!in->is_dir())
4612 return;
7c673cae 4613
9f95a23c
TL
4614 const auto&& dfs = in->get_dirfrags();
4615 for (const auto& dir : dfs) {
181888fb
FG
4616 if (!dir->is_auth())
4617 continue;
7c673cae 4618
181888fb 4619 if (dir->is_replica(from) &&
7c673cae
FG
4620 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4621 dir->remove_replica(from);
4622 dout(10) << " rem " << *dir << dendl;
4623 }
4624
4625 // dentries
94b18763
FG
4626 for (auto &p : dir->items) {
4627 CDentry *dn = p.second;
7c673cae 4628
11fdf7f2
TL
4629 if (dn->is_replica(from)) {
4630 if (ack) {
4631 const auto it = ack->strong_dentries.find(dir->dirfrag());
4632 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4633 continue;
4634 }
4635 }
7c673cae
FG
4636 dentry_remove_replica(dn, from, gather_locks);
4637 dout(10) << " rem " << *dn << dendl;
4638 }
4639 }
4640 }
b32b8144
FG
4641 };
4642
94b18763 4643 for (auto &p : inode_map)
b32b8144 4644 scour_func(p.second);
94b18763 4645 for (auto &p : snap_inode_map)
b32b8144 4646 scour_func(p.second);
7c673cae
FG
4647}
4648
4649
4650CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4651{
4652 CInode *in = new CInode(this, true, 1, last);
4653 in->inode.ino = ino;
4654 in->state_set(CInode::STATE_REJOINUNDEF);
4655 add_inode(in);
4656 rejoin_undef_inodes.insert(in);
4657 dout(10) << " invented " << *in << dendl;
4658 return in;
4659}
4660
4661CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4662{
4663 CInode *in = get_inode(df.ino);
4664 if (!in)
4665 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4666 if (!in->is_dir()) {
11fdf7f2 4667 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
7c673cae 4668 in->inode.mode = S_IFDIR;
11fdf7f2 4669 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
4670 }
4671 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4672 dir->state_set(CDir::STATE_REJOINUNDEF);
4673 rejoin_undef_dirfrags.insert(dir);
4674 dout(10) << " invented " << *dir << dendl;
4675 return dir;
4676}
4677
9f95a23c 4678void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
7c673cae
FG
4679{
4680 mds_rank_t from = mds_rank_t(strong->get_source().num());
4681
4682 // only a recovering node will get a strong rejoin.
a8e16298
TL
4683 if (!mds->is_rejoin()) {
4684 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4685 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4686 return;
4687 }
11fdf7f2 4688 ceph_abort_msg("got unexpected rejoin message during recovery");
a8e16298 4689 }
7c673cae
FG
4690
4691 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4692 for (const auto &p : strong->inode_scatterlocks) {
4693 CInode *in = get_inode(p.first);
4694 ceph_assert(in);
4695 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4696 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4697 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4698 rejoin_potential_updated_scatterlocks.insert(in);
4699 }
4700
4701 rejoin_unlinked_inodes[from].clear();
4702
4703 // surviving peer may send incorrect dirfrag here (maybe they didn't
4704 // get the fragment notify, or maybe we rolled back?). we need to
4705 // infer the right frag and get them with the program. somehow.
4706 // we don't normally send ACK.. so we'll need to bundle this with
4707 // MISSING or something.
4708
4709 // strong dirfrags/dentries.
4710 // also process auth_pins, xlocks.
11fdf7f2
TL
4711 for (const auto &p : strong->strong_dirfrags) {
4712 auto& dirfrag = p.first;
4713 CInode *diri = get_inode(dirfrag.ino);
7c673cae 4714 if (!diri)
11fdf7f2
TL
4715 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4716 CDir *dir = diri->get_dirfrag(dirfrag.frag);
7c673cae
FG
4717 bool refragged = false;
4718 if (dir) {
4719 dout(10) << " have " << *dir << dendl;
4720 } else {
4721 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4722 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
11fdf7f2
TL
4723 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4724 dir = rejoin_invent_dirfrag(dirfrag);
7c673cae
FG
4725 }
4726 if (dir) {
11fdf7f2
TL
4727 dir->add_replica(from, p.second.nonce);
4728 dir->dir_rep = p.second.dir_rep;
7c673cae 4729 } else {
11fdf7f2
TL
4730 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4731 frag_vec_t leaves;
4732 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4733 if (leaves.empty())
4734 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4735 dout(10) << " maps to frag(s) " << leaves << dendl;
4736 for (const auto& leaf : leaves) {
4737 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4738 if (!dir)
11fdf7f2 4739 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
7c673cae
FG
4740 else
4741 dout(10) << " have(approx) " << *dir << dendl;
11fdf7f2
TL
4742 dir->add_replica(from, p.second.nonce);
4743 dir->dir_rep = p.second.dir_rep;
7c673cae
FG
4744 }
4745 refragged = true;
4746 }
4747
11fdf7f2
TL
4748 const auto it = strong->strong_dentries.find(dirfrag);
4749 if (it != strong->strong_dentries.end()) {
9f95a23c 4750 const auto& dmap = it->second;
11fdf7f2
TL
4751 for (const auto &q : dmap) {
4752 const string_snap_t& ss = q.first;
4753 const MMDSCacheRejoin::dn_strong& d = q.second;
4754 CDentry *dn;
4755 if (!refragged)
4756 dn = dir->lookup(ss.name, ss.snapid);
4757 else {
4758 frag_t fg = diri->pick_dirfrag(ss.name);
4759 dir = diri->get_dirfrag(fg);
4760 ceph_assert(dir);
4761 dn = dir->lookup(ss.name, ss.snapid);
4762 }
4763 if (!dn) {
4764 if (d.is_remote()) {
4765 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4766 } else if (d.is_null()) {
4767 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4768 } else {
4769 CInode *in = get_inode(d.ino, ss.snapid);
4770 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4771 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4772 }
4773 dout(10) << " invented " << *dn << dendl;
4774 }
4775 CDentry::linkage_t *dnl = dn->get_linkage();
4776
4777 // dn auth_pin?
4778 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4779 if (pinned_it != strong->authpinned_dentries.end()) {
4780 const auto slave_reqid_it = pinned_it->second.find(ss);
4781 if (slave_reqid_it != pinned_it->second.end()) {
4782 for (const auto &r : slave_reqid_it->second) {
4783 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4784
4785 // get/create slave mdrequest
4786 MDRequestRef mdr;
4787 if (have_request(r.reqid))
4788 mdr = request_get(r.reqid);
4789 else
4790 mdr = request_start_slave(r.reqid, r.attempt, strong);
4791 mdr->auth_pin(dn);
4792 }
4793 }
7c673cae 4794 }
7c673cae 4795
11fdf7f2
TL
4796 // dn xlock?
4797 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4798 if (xlocked_it != strong->xlocked_dentries.end()) {
4799 const auto ss_req_it = xlocked_it->second.find(ss);
4800 if (ss_req_it != xlocked_it->second.end()) {
4801 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4802 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4803 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4804 ceph_assert(mdr->is_auth_pinned(dn));
4805 if (!mdr->is_xlocked(&dn->versionlock)) {
4806 ceph_assert(dn->versionlock.can_xlock_local());
4807 dn->versionlock.get_xlock(mdr, mdr->get_client());
9f95a23c 4808 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
11fdf7f2
TL
4809 }
4810 if (dn->lock.is_stable())
4811 dn->auth_pin(&dn->lock);
4812 dn->lock.set_state(LOCK_XLOCK);
4813 dn->lock.get_xlock(mdr, mdr->get_client());
9f95a23c 4814 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
11fdf7f2
TL
4815 }
4816 }
7c673cae 4817
11fdf7f2
TL
4818 dn->add_replica(from, d.nonce);
4819 dout(10) << " have " << *dn << dendl;
4820
4821 if (dnl->is_primary()) {
4822 if (d.is_primary()) {
4823 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4824 // the survivor missed MDentryUnlink+MDentryLink messages ?
4825 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4826 CInode *in = get_inode(d.ino, ss.snapid);
4827 ceph_assert(in);
4828 ceph_assert(in->get_parent_dn());
4829 rejoin_unlinked_inodes[from].insert(in);
4830 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4831 }
4832 } else {
4833 // the survivor missed MDentryLink message ?
4834 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4835 dout(7) << " sender doesn't have primary dentry" << dendl;
4836 }
4837 } else {
4838 if (d.is_primary()) {
4839 // the survivor missed MDentryUnlink message ?
4840 CInode *in = get_inode(d.ino, ss.snapid);
4841 ceph_assert(in);
4842 ceph_assert(in->get_parent_dn());
7c673cae 4843 rejoin_unlinked_inodes[from].insert(in);
11fdf7f2 4844 dout(7) << " sender has primary dentry but we don't" << dendl;
7c673cae 4845 }
11fdf7f2 4846 }
7c673cae
FG
4847 }
4848 }
4849 }
4850
11fdf7f2
TL
4851 for (const auto &p : strong->strong_inodes) {
4852 CInode *in = get_inode(p.first);
4853 ceph_assert(in);
4854 in->add_replica(from, p.second.nonce);
7c673cae
FG
4855 dout(10) << " have " << *in << dendl;
4856
11fdf7f2 4857 const MMDSCacheRejoin::inode_strong& is = p.second;
7c673cae
FG
4858
4859 // caps_wanted
4860 if (is.caps_wanted) {
11fdf7f2 4861 in->set_mds_caps_wanted(from, is.caps_wanted);
7c673cae
FG
4862 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4863 << " on " << *in << dendl;
4864 }
4865
4866 // scatterlocks?
4867 // infer state from replica state:
4868 // * go to MIX if they might have wrlocks
4869 // * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
4870 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4871 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4872 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4873
4874 // auth pin?
11fdf7f2
TL
4875 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4876 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4877 for (const auto& r : authpinned_inodes_it->second) {
4878 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
7c673cae
FG
4879
4880 // get/create slave mdrequest
4881 MDRequestRef mdr;
11fdf7f2
TL
4882 if (have_request(r.reqid))
4883 mdr = request_get(r.reqid);
7c673cae 4884 else
11fdf7f2 4885 mdr = request_start_slave(r.reqid, r.attempt, strong);
7c673cae 4886 if (strong->frozen_authpin_inodes.count(in->vino())) {
11fdf7f2 4887 ceph_assert(!in->get_num_auth_pins());
7c673cae
FG
4888 mdr->freeze_auth_pin(in);
4889 } else {
11fdf7f2 4890 ceph_assert(!in->is_frozen_auth_pin());
7c673cae
FG
4891 }
4892 mdr->auth_pin(in);
4893 }
4894 }
4895 // xlock(s)?
11fdf7f2
TL
4896 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4897 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4898 for (const auto &q : xlocked_inodes_it->second) {
4899 SimpleLock *lock = in->get_lock(q.first);
4900 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4901 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4902 ceph_assert(mdr->is_auth_pinned(in));
4903 if (!mdr->is_xlocked(&in->versionlock)) {
4904 ceph_assert(in->versionlock.can_xlock_local());
7c673cae 4905 in->versionlock.get_xlock(mdr, mdr->get_client());
9f95a23c 4906 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4907 }
4908 if (lock->is_stable())
4909 in->auth_pin(lock);
4910 lock->set_state(LOCK_XLOCK);
4911 if (lock == &in->filelock)
4912 in->loner_cap = -1;
4913 lock->get_xlock(mdr, mdr->get_client());
9f95a23c 4914 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4915 }
4916 }
4917 }
4918 // wrlock(s)?
11fdf7f2
TL
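  // replay wrlocks held by the sender's in-flight slave requests: put each
  // scatterlock into MIX and re-take the wrlock under the recreated slave mdrequest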
4919 for (const auto &p : strong->wrlocked_inodes) {
4920 CInode *in = get_inode(p.first);
4921 for (const auto &q : p.second) {
4922 SimpleLock *lock = in->get_lock(q.first);
4923 for (const auto &r : q.second) {
4924 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4925 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
7c673cae 4926 if (in->is_auth())
11fdf7f2 4927 ceph_assert(mdr->is_auth_pinned(in));
7c673cae
FG
4928 lock->set_state(LOCK_MIX);
4929 if (lock == &in->filelock)
4930 in->loner_cap = -1;
4931 lock->get_wrlock(true);
9f95a23c 4932 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
7c673cae
FG
4933 }
4934 }
4935 }
4936
4937 // done?
11fdf7f2 4938 ceph_assert(rejoin_gather.count(from));
7c673cae 4939 rejoin_gather.erase(from);
28e407b8 4940 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4941 rejoin_gather_finish();
4942 } else {
4943 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4944 }
4945}
4946
9f95a23c 4947void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
7c673cae
FG
4948{
4949 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4950 mds_rank_t from = mds_rank_t(ack->get_source().num());
4951
11fdf7f2 4952 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
b32b8144
FG
4953 bool survivor = !mds->is_rejoin();
4954
7c673cae
FG
4955 // for sending cache expire message
4956 set<CInode*> isolated_inodes;
4957 set<CInode*> refragged_inodes;
11fdf7f2 4958 list<pair<CInode*,int> > updated_realms;
7c673cae
FG
4959
4960 // dirs
11fdf7f2 4961 for (const auto &p : ack->strong_dirfrags) {
7c673cae
FG
4962 // we may have had incorrect dir fragmentation; refragment based
4963 // on what the auth tells us.
11fdf7f2 4964 CDir *dir = get_dirfrag(p.first);
7c673cae 4965 if (!dir) {
11fdf7f2 4966 dir = get_force_dirfrag(p.first, false);
7c673cae
FG
4967 if (dir)
4968 refragged_inodes.insert(dir->get_inode());
4969 }
4970 if (!dir) {
11fdf7f2 4971 CInode *diri = get_inode(p.first.ino);
7c673cae
FG
4972 if (!diri) {
4973 // barebones inode; the full inode loop below will clean up.
4974 diri = new CInode(this, false);
11fdf7f2 4975 diri->inode.ino = p.first.ino;
7c673cae 4976 diri->inode.mode = S_IFDIR;
11fdf7f2 4977 diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 4978 add_inode(diri);
11fdf7f2 4979 if (MDS_INO_MDSDIR(from) == p.first.ino) {
7c673cae
FG
4980 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4981 dout(10) << " add inode " << *diri << dendl;
4982 } else {
4983 diri->inode_auth = CDIR_AUTH_DEFAULT;
4984 isolated_inodes.insert(diri);
11fdf7f2 4985 dout(10) << " unconnected dirfrag " << p.first << dendl;
7c673cae
FG
4986 }
4987 }
4988 // barebones dirfrag; the full dirfrag loop below will clean up.
11fdf7f2
TL
4989 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4990 if (MDS_INO_MDSDIR(from) == p.first.ino ||
7c673cae
FG
4991 (dir->authority() != CDIR_AUTH_UNDEF &&
4992 dir->authority().first != from))
4993 adjust_subtree_auth(dir, from);
4994 dout(10) << " add dirfrag " << *dir << dendl;
4995 }
4996
11fdf7f2 4997 dir->set_replica_nonce(p.second.nonce);
7c673cae
FG
4998 dir->state_clear(CDir::STATE_REJOINING);
4999 dout(10) << " got " << *dir << dendl;
5000
5001 // dentries
11fdf7f2
TL
5002 auto it = ack->strong_dentries.find(p.first);
5003 if (it != ack->strong_dentries.end()) {
5004 for (const auto &q : it->second) {
5005 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
5006 if (!dn)
5007 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
5008
5009 CDentry::linkage_t *dnl = dn->get_linkage();
5010
5011 ceph_assert(dn->last == q.first.snapid);
5012 if (dn->first != q.second.first) {
5013 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5014 dn->first = q.second.first;
5015 }
7c673cae 5016
11fdf7f2
TL
5017 // may have bad linkage if we missed dentry link/unlink messages
5018 if (dnl->is_primary()) {
5019 CInode *in = dnl->get_inode();
5020 if (!q.second.is_primary() ||
5021 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5022 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5023 dir->unlink_inode(dn);
5024 }
5025 } else if (dnl->is_remote()) {
5026 if (!q.second.is_remote() ||
5027 q.second.remote_ino != dnl->get_remote_ino() ||
5028 q.second.remote_d_type != dnl->get_remote_d_type()) {
5029 dout(10) << " had bad linkage for " << *dn << dendl;
5030 dir->unlink_inode(dn);
5031 }
5032 } else {
5033 if (!q.second.is_null())
5034 dout(10) << " had bad linkage for " << *dn << dendl;
5035 }
7c673cae 5036
11fdf7f2
TL
5037 // hmm, did we have the proper linkage here?
5038 if (dnl->is_null() && !q.second.is_null()) {
5039 if (q.second.is_remote()) {
5040 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5041 } else {
5042 CInode *in = get_inode(q.second.ino, q.first.snapid);
5043 if (!in) {
5044 // barebones inode; assume it's a dir, the full inode loop below will clean up.
5045 in = new CInode(this, false, q.second.first, q.first.snapid);
5046 in->inode.ino = q.second.ino;
5047 in->inode.mode = S_IFDIR;
5048 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5049 add_inode(in);
5050 dout(10) << " add inode " << *in << dendl;
5051 } else if (in->get_parent_dn()) {
5052 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5053 << ", unlinking " << *in << dendl;
5054 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5055 }
5056 dn->dir->link_primary_inode(dn, in);
5057 isolated_inodes.erase(in);
7c673cae 5058 }
11fdf7f2 5059 }
7c673cae 5060
11fdf7f2
TL
5061 dn->set_replica_nonce(q.second.nonce);
5062 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5063 dn->state_clear(CDentry::STATE_REJOINING);
5064 dout(10) << " got " << *dn << dendl;
5065 }
7c673cae
FG
5066 }
5067 }
5068
9f95a23c
TL
5069 for (const auto& in : refragged_inodes) {
5070 auto&& ls = in->get_nested_dirfrags();
5071 for (const auto& dir : ls) {
5072 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
7c673cae 5073 continue;
9f95a23c
TL
5074 ceph_assert(dir->get_num_any() == 0);
5075 in->close_dirfrag(dir->get_frag());
7c673cae
FG
5076 }
5077 }
5078
5079 // full dirfrags
11fdf7f2
TL
5080 for (const auto &p : ack->dirfrag_bases) {
5081 CDir *dir = get_dirfrag(p.first);
5082 ceph_assert(dir);
5083 auto q = p.second.cbegin();
7c673cae
FG
5084 dir->_decode_base(q);
5085 dout(10) << " got dir replica " << *dir << dendl;
5086 }
5087
5088 // full inodes
11fdf7f2 5089 auto p = ack->inode_base.cbegin();
7c673cae
FG
5090 while (!p.end()) {
5091 inodeno_t ino;
5092 snapid_t last;
5093 bufferlist basebl;
11fdf7f2
TL
5094 decode(ino, p);
5095 decode(last, p);
5096 decode(basebl, p);
7c673cae 5097 CInode *in = get_inode(ino, last);
11fdf7f2
TL
5098 ceph_assert(in);
5099 auto q = basebl.cbegin();
5100 snapid_t sseq = 0;
5101 if (in->snaprealm)
5102 sseq = in->snaprealm->srnode.seq;
7c673cae 5103 in->_decode_base(q);
11fdf7f2
TL
5104 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5105 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5106 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5107 }
7c673cae
FG
5108 dout(10) << " got inode base " << *in << dendl;
5109 }
5110
5111 // inodes
11fdf7f2 5112 p = ack->inode_locks.cbegin();
7c673cae
FG
5113 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5114 while (!p.end()) {
5115 inodeno_t ino;
5116 snapid_t last;
5117 __u32 nonce;
5118 bufferlist lockbl;
11fdf7f2
TL
5119 decode(ino, p);
5120 decode(last, p);
5121 decode(nonce, p);
5122 decode(lockbl, p);
7c673cae
FG
5123
5124 CInode *in = get_inode(ino, last);
11fdf7f2 5125 ceph_assert(in);
7c673cae 5126 in->set_replica_nonce(nonce);
11fdf7f2 5127 auto q = lockbl.cbegin();
b32b8144 5128 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5129 in->state_clear(CInode::STATE_REJOINING);
5130 dout(10) << " got inode locks " << *in << dendl;
5131 }
5132
5133 // FIXME: This can happen if an entire subtree, together with the inode the subtree root
5134 // belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
11fdf7f2 5135 ceph_assert(isolated_inodes.empty());
7c673cae
FG
5136
5137 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
11fdf7f2
TL
5138 auto bp = ack->imported_caps.cbegin();
5139 decode(peer_imported, bp);
7c673cae
FG
5140
5141 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5142 p != peer_imported.end();
5143 ++p) {
28e407b8 5144 auto& ex = cap_exports.at(p->first);
11fdf7f2 5145 ceph_assert(ex.first == from);
7c673cae
FG
5146 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5147 q != p->second.end();
5148 ++q) {
28e407b8 5149 auto r = ex.second.find(q->first);
11fdf7f2 5150 ceph_assert(r != ex.second.end());
7c673cae
FG
5151
5152 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5153 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5154 if (!session) {
5155 dout(10) << " no session for client." << q->first << dendl;
5156 ex.second.erase(r);
5157 continue;
5158 }
7c673cae
FG
5159
5160 // mark client caps stale.
9f95a23c 5161 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5162 r->second.capinfo.cap_id, 0,
7c673cae
FG
5163 mds->get_osd_epoch_barrier());
5164 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5165 (q->second.cap_id > 0 ? from : -1), 0);
5166 mds->send_message_client_counted(m, session);
5167
28e407b8 5168 ex.second.erase(r);
7c673cae 5169 }
11fdf7f2
TL
5170 ceph_assert(ex.second.empty());
5171 }
5172
5173 for (auto p : updated_realms) {
5174 CInode *in = p.first;
5175 bool notify_clients;
5176 if (mds->is_rejoin()) {
5177 if (!rejoin_pending_snaprealms.count(in)) {
5178 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5179 rejoin_pending_snaprealms.insert(in);
5180 }
5181 notify_clients = false;
5182 } else {
5183 // notify clients if I'm survivor
5184 notify_clients = true;
5185 }
5186 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
7c673cae
FG
5187 }
5188
5189 // done?
11fdf7f2 5190 ceph_assert(rejoin_ack_gather.count(from));
7c673cae 5191 rejoin_ack_gather.erase(from);
b32b8144 5192 if (!survivor) {
7c673cae
FG
5193 if (rejoin_gather.empty()) {
5194 // eval unstable scatter locks after all wrlocks are rejoined.
5195 while (!rejoin_eval_locks.empty()) {
5196 SimpleLock *lock = rejoin_eval_locks.front();
5197 rejoin_eval_locks.pop_front();
5198 if (!lock->is_stable())
5199 mds->locker->eval_gather(lock);
5200 }
5201 }
5202
5203 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5204 rejoin_ack_gather.empty()) {
5205 // finally, kickstart past snap parent opens
11fdf7f2 5206 open_snaprealms();
7c673cae
FG
5207 } else {
5208 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5209 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5210 }
5211 } else {
5212 // survivor.
5213 mds->queue_waiters(rejoin_waiters);
5214 }
5215}
5216
5217/**
5218 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5219 *
5220 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5221 * messages that clean these guys up...
5222 */
5223void MDCache::rejoin_trim_undef_inodes()
5224{
5225 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5226
5227 while (!rejoin_undef_inodes.empty()) {
5228 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5229 CInode *in = *p;
5230 rejoin_undef_inodes.erase(p);
5231
5232 in->clear_replica_map();
5233
5234 // close out dirfrags
5235 if (in->is_dir()) {
9f95a23c
TL
5236 const auto&& dfls = in->get_dirfrags();
5237 for (const auto& dir : dfls) {
7c673cae
FG
5238 dir->clear_replica_map();
5239
94b18763
FG
5240 for (auto &p : dir->items) {
5241 CDentry *dn = p.second;
7c673cae
FG
5242 dn->clear_replica_map();
5243
5244 dout(10) << " trimming " << *dn << dendl;
5245 dir->remove_dentry(dn);
5246 }
5247
5248 dout(10) << " trimming " << *dir << dendl;
5249 in->close_dirfrag(dir->dirfrag().frag);
5250 }
5251 }
5252
5253 CDentry *dn = in->get_parent_dn();
5254 if (dn) {
5255 dn->clear_replica_map();
5256 dout(10) << " trimming " << *dn << dendl;
5257 dn->dir->remove_dentry(dn);
5258 } else {
5259 dout(10) << " trimming " << *in << dendl;
5260 remove_inode(in);
5261 }
5262 }
5263
11fdf7f2 5264 ceph_assert(rejoin_undef_inodes.empty());
7c673cae
FG
5265}
5266
5267void MDCache::rejoin_gather_finish()
5268{
5269 dout(10) << "rejoin_gather_finish" << dendl;
11fdf7f2
TL
5270 ceph_assert(mds->is_rejoin());
5271 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5272
5273 if (open_undef_inodes_dirfrags())
5274 return;
5275
5276 if (process_imported_caps())
5277 return;
5278
5279 choose_lock_states_and_reconnect_caps();
5280
5281 identify_files_to_recover();
5282 rejoin_send_acks();
5283
5284 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5285 rejoin_ack_gather.erase(mds->get_nodeid());
5286
5287 // did we already get our acks too?
5288 if (rejoin_ack_gather.empty()) {
11fdf7f2
TL
5289 // finally, open snaprealms
5290 open_snaprealms();
7c673cae
FG
5291 }
5292}
5293
5294class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5295 inodeno_t ino;
5296public:
5297 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5298 void finish(int r) override {
5299 mdcache->rejoin_open_ino_finish(ino, r);
5300 }
5301};
5302
5303void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5304{
5305 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5306
5307 if (ret < 0) {
5308 cap_imports_missing.insert(ino);
5309 } else if (ret == mds->get_nodeid()) {
11fdf7f2 5310 ceph_assert(get_inode(ino));
7c673cae
FG
5311 } else {
5312 auto p = cap_imports.find(ino);
11fdf7f2 5313 ceph_assert(p != cap_imports.end());
7c673cae 5314 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
11fdf7f2
TL
5315 ceph_assert(q->second.count(MDS_RANK_NONE));
5316 ceph_assert(q->second.size() == 1);
7c673cae
FG
5317 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5318 }
5319 cap_imports.erase(p);
5320 }
5321
11fdf7f2 5322 ceph_assert(cap_imports_num_opening > 0);
7c673cae
FG
5323 cap_imports_num_opening--;
5324
5325 if (cap_imports_num_opening == 0) {
5326 if (rejoin_gather.empty())
5327 rejoin_gather_finish();
5328 else if (rejoin_gather.count(mds->get_nodeid()))
5329 process_imported_caps();
5330 }
5331}
5332
5333class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5334public:
28e407b8
AA
5335 map<client_t,pair<Session*,uint64_t> > session_map;
5336 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae 5337 void finish(int r) override {
11fdf7f2 5338 ceph_assert(r == 0);
28e407b8 5339 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5340 }
5341};
5342
28e407b8 5343void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5344{
5345 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5346 mds->server->finish_force_open_sessions(session_map);
5347 rejoin_session_map.swap(session_map);
7c673cae
FG
5348 if (rejoin_gather.empty())
5349 rejoin_gather_finish();
5350}
5351
11fdf7f2
TL
5352void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5353{
5354 auto p = cap_imports.find(ino);
5355 if (p != cap_imports.end()) {
5356 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5357 if (ret < 0) {
5358 cap_imports_missing.insert(ino);
5359 } else if (ret != mds->get_nodeid()) {
5360 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5361 ceph_assert(q->second.count(MDS_RANK_NONE));
5362 ceph_assert(q->second.size() == 1);
5363 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5364 }
5365 cap_imports.erase(p);
5366 }
5367 }
5368}
5369
7c673cae
FG
5370bool MDCache::process_imported_caps()
5371{
5372 dout(10) << "process_imported_caps" << dendl;
5373
11fdf7f2
TL
5374 if (!open_file_table.is_prefetched() &&
5375 open_file_table.prefetch_inodes()) {
5376 open_file_table.wait_for_prefetch(
5377 new MDSInternalContextWrapper(mds,
9f95a23c 5378 new LambdaContext([this](int r) {
11fdf7f2
TL
5379 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5380 process_imported_caps();
5381 })
5382 )
5383 );
5384 return true;
5385 }
5386
f91f0fd5
TL
5387 for (auto& p : cap_imports) {
5388 CInode *in = get_inode(p.first);
7c673cae 5389 if (in) {
11fdf7f2 5390 ceph_assert(in->is_auth());
f91f0fd5 5391 cap_imports_missing.erase(p.first);
7c673cae
FG
5392 continue;
5393 }
f91f0fd5 5394 if (cap_imports_missing.count(p.first) > 0)
7c673cae
FG
5395 continue;
5396
f91f0fd5
TL
5397 uint64_t parent_ino = 0;
5398 std::string_view d_name;
5399 for (auto& q : p.second) {
5400 for (auto& r : q.second) {
5401 auto &icr = r.second;
5402 if (icr.capinfo.pathbase &&
5403 icr.path.length() > 0 &&
5404 icr.path.find('/') == string::npos) {
5405 parent_ino = icr.capinfo.pathbase;
5406 d_name = icr.path;
5407 break;
5408 }
5409 }
5410 if (parent_ino)
5411 break;
5412 }
5413
5414 dout(10) << " opening missing ino " << p.first << dendl;
7c673cae 5415 cap_imports_num_opening++;
f91f0fd5
TL
5416 auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
5417 if (parent_ino) {
5418 vector<inode_backpointer_t> ancestors;
5419 ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
5420 open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
5421 } else {
5422 open_ino(p.first, (int64_t)-1, fin, false);
5423 }
28e407b8
AA
5424 if (!(cap_imports_num_opening % 1000))
5425 mds->heartbeat_reset();
7c673cae
FG
5426 }
5427
5428 if (cap_imports_num_opening > 0)
5429 return true;
5430
5431 // called by rejoin_gather_finish() ?
5432 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5433 if (!rejoin_client_map.empty() &&
5434 rejoin_session_map.empty()) {
5435 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5436 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
11fdf7f2 5437 rejoin_client_metadata_map,
28e407b8 5438 finish->session_map);
11fdf7f2
TL
5439 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5440 std::move(rejoin_client_metadata_map));
5441 mds->mdlog->start_submit_entry(le, finish);
28e407b8
AA
5442 mds->mdlog->flush();
5443 rejoin_client_map.clear();
11fdf7f2 5444 rejoin_client_metadata_map.clear();
28e407b8 5445 return true;
7c673cae 5446 }
7c673cae
FG
5447
5448 // process caps that were exported by slave rename
5449 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5450 p != rejoin_slave_exports.end();
5451 ++p) {
5452 CInode *in = get_inode(p->first);
11fdf7f2 5453 ceph_assert(in);
7c673cae
FG
5454 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5455 q != p->second.second.end();
5456 ++q) {
28e407b8
AA
5457 auto r = rejoin_session_map.find(q->first);
5458 if (r == rejoin_session_map.end())
5459 continue;
7c673cae 5460
28e407b8 5461 Session *session = r->second.first;
7c673cae 5462 Capability *cap = in->get_client_cap(q->first);
11fdf7f2 5463 if (!cap) {
7c673cae 5464 cap = in->add_client_cap(q->first, session);
11fdf7f2
TL
5465 // add empty item to reconnected_caps
5466 (void)reconnected_caps[p->first][q->first];
5467 }
7c673cae
FG
5468 cap->merge(q->second, true);
5469
5470 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
11fdf7f2
TL
5471 ceph_assert(cap->get_last_seq() == im.issue_seq);
5472 ceph_assert(cap->get_mseq() == im.mseq);
7c673cae
FG
5473 cap->set_cap_id(im.cap_id);
5474 // send cap import because we assigned a new cap ID
5475 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5476 p->second.first, CEPH_CAP_FLAG_AUTH);
5477 }
5478 }
5479 rejoin_slave_exports.clear();
5480 rejoin_imported_caps.clear();
5481
5482 // process cap imports
5483 // ino -> client -> frommds -> capex
5484 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5485 CInode *in = get_inode(p->first);
5486 if (!in) {
5487 dout(10) << " still missing ino " << p->first
5488 << ", will try again after replayed client requests" << dendl;
5489 ++p;
5490 continue;
5491 }
11fdf7f2 5492 ceph_assert(in->is_auth());
7c673cae 5493 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5494 Session *session;
5495 {
5496 auto r = rejoin_session_map.find(q->first);
5497 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5498 }
5499
7c673cae 5500 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5501 if (!session) {
5502 if (r->first >= 0)
5503 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5504 continue;
5505 }
5506
7c673cae
FG
5507 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5508 add_reconnected_cap(q->first, in->ino(), r->second);
5509 if (r->first >= 0) {
5510 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5511 cap->inc_mseq();
5512 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5513
5514 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5515 im.cap_id = cap->get_cap_id();
5516 im.issue_seq = cap->get_last_seq();
5517 im.mseq = cap->get_mseq();
5518 }
5519 }
5520 }
5521 cap_imports.erase(p++); // remove and move on
5522 }
5523 } else {
5524 trim_non_auth();
5525
11fdf7f2 5526 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5527 rejoin_gather.erase(mds->get_nodeid());
11fdf7f2 5528 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5529 maybe_send_pending_rejoins();
7c673cae
FG
5530 }
5531 return false;
5532}
5533
7c673cae
FG
5534void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5535 client_t client, snapid_t snap_follows)
5536{
5537 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5538
11fdf7f2
TL
5539 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5540 return;
5541
7c673cae
FG
5542 const set<snapid_t>& snaps = realm->get_snaps();
5543 snapid_t follows = snap_follows;
5544
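  // walk the older snapshotted inodes (oldest first): for every snapid this realm
  // holds in (follows, in->last], register a pending snapflush from this client on the head inode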
5545 while (true) {
5546 CInode *in = pick_inode_snap(head_in, follows);
5547 if (in == head_in)
5548 break;
11fdf7f2
TL
5549
5550 bool need_snapflush = false;
5551 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5552 p != snaps.end() && *p <= in->last;
5553 ++p) {
5554 head_in->add_need_snapflush(in, *p, client);
5555 need_snapflush = true;
5556 }
5557 follows = in->last;
5558 if (!need_snapflush)
5559 continue;
5560
7c673cae
FG
5561 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5562
eafe8130
TL
5563 if (in->client_snap_caps.empty()) {
5564 for (int i = 0; i < num_cinode_locks; i++) {
5565 int lockid = cinode_lock_info[i].lock;
5566 SimpleLock *lock = in->get_lock(lockid);
5567 ceph_assert(lock);
5568 in->auth_pin(lock);
5569 lock->set_state(LOCK_SNAP_SYNC);
5570 lock->get_wrlock(true);
5571 }
7c673cae 5572 }
eafe8130 5573 in->client_snap_caps.insert(client);
11fdf7f2 5574 mds->locker->mark_need_snapflush_inode(in);
7c673cae
FG
5575 }
5576}
5577
5578/*
5579 * choose lock states based on reconnected caps
5580 */
5581void MDCache::choose_lock_states_and_reconnect_caps()
5582{
5583 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5584
81eedcae 5585 int count = 0;
11fdf7f2
TL
5586 for (auto p : inode_map) {
5587 CInode *in = p.second;
7c673cae
FG
5588 if (in->last != CEPH_NOSNAP)
5589 continue;
5590
5591 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5592 in->mark_dirty_rstat();
5593
7c673cae 5594 int dirty_caps = 0;
11fdf7f2
TL
5595 auto q = reconnected_caps.find(in->ino());
5596 if (q != reconnected_caps.end()) {
5597 for (const auto &it : q->second)
7c673cae
FG
5598 dirty_caps |= it.second.dirty_caps;
5599 }
5600 in->choose_lock_states(dirty_caps);
5601 dout(15) << " chose lock states on " << *in << dendl;
5602
11fdf7f2
TL
5603 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5604 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5605 rejoin_pending_snaprealms.insert(in);
7c673cae 5606 }
81eedcae
TL
5607
5608 if (!(++count % 1000))
5609 mds->heartbeat_reset();
11fdf7f2 5610 }
7c673cae
FG
5611}
5612
5613void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
9f95a23c 5614 map<client_t,ref_t<MClientSnap>>& splits)
7c673cae 5615{
9f95a23c 5616 ref_t<MClientSnap> snap;
11fdf7f2
TL
5617 auto it = splits.find(client);
5618 if (it != splits.end()) {
5619 snap = it->second;
5620 snap->head.op = CEPH_SNAP_OP_SPLIT;
5621 } else {
9f95a23c 5622 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2 5623 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae 5624 snap->head.split = realm->inode->ino();
11fdf7f2 5625 snap->bl = realm->get_snap_trace();
7c673cae 5626
11fdf7f2
TL
5627 for (const auto& child : realm->open_children)
5628 snap->split_realms.push_back(child->inode->ino());
5629 }
7c673cae
FG
5630 snap->split_inos.push_back(ino);
5631}
5632
11fdf7f2 5633void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
9f95a23c 5634 map<client_t,ref_t<MClientSnap>>& splits)
11fdf7f2
TL
5635{
5636 ceph_assert(parent_realm);
5637
5638 vector<inodeno_t> split_inos;
5639 vector<inodeno_t> split_realms;
5640
5641 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5642 !p.end();
5643 ++p)
5644 split_inos.push_back((*p)->ino());
5645 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5646 p != realm->open_children.end();
5647 ++p)
5648 split_realms.push_back((*p)->inode->ino());
5649
5650 for (const auto& p : realm->client_caps) {
5651 ceph_assert(!p.second->empty());
5652 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5653 if (em.second) {
9f95a23c 5654 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2
TL
5655 update->head.split = parent_realm->inode->ino();
5656 update->split_inos = split_inos;
5657 update->split_realms = split_realms;
5658 update->bl = parent_realm->get_snap_trace();
5659 em.first->second = std::move(update);
5660 }
5661 }
5662}
5663
9f95a23c 5664void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
7c673cae
FG
5665{
5666 dout(10) << "send_snaps" << dendl;
5667
11fdf7f2
TL
5668 for (auto &p : splits) {
5669 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
7c673cae 5670 if (session) {
11fdf7f2
TL
5671 dout(10) << " client." << p.first
5672 << " split " << p.second->head.split
5673 << " inos " << p.second->split_inos
7c673cae 5674 << dendl;
11fdf7f2 5675 mds->send_message_client_counted(p.second, session);
7c673cae 5676 } else {
11fdf7f2 5677 dout(10) << " no session for client." << p.first << dendl;
7c673cae
FG
5678 }
5679 }
5680 splits.clear();
5681}
5682
5683
5684/*
5685 * remove any items from logsegment open_file lists that don't have
5686 * any caps
5687 */
5688void MDCache::clean_open_file_lists()
5689{
5690 dout(10) << "clean_open_file_lists" << dendl;
5691
5692 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5693 p != mds->mdlog->segments.end();
5694 ++p) {
5695 LogSegment *ls = p->second;
5696
5697 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5698 while (!q.end()) {
5699 CInode *in = *q;
5700 ++q;
5701 if (in->last == CEPH_NOSNAP) {
11fdf7f2
TL
5702 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5703 in->item_open_file.remove_myself();
5704 } else {
7c673cae
FG
5705 if (in->client_snap_caps.empty()) {
5706 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5707 in->item_open_file.remove_myself();
5708 }
5709 }
5710 }
5711 }
5712}
5713
11fdf7f2
TL
5714void MDCache::dump_openfiles(Formatter *f)
5715{
5716 f->open_array_section("openfiles");
5717 for (auto p = mds->mdlog->segments.begin();
5718 p != mds->mdlog->segments.end();
5719 ++p) {
5720 LogSegment *ls = p->second;
5721
5722 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5723 while (!q.end()) {
5724 CInode *in = *q;
5725 ++q;
5726 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5727 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5728 continue;
5729 f->open_object_section("file");
5730 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5731 f->close_section();
5732 }
5733 }
5734 f->close_section();
5735}
7c673cae
FG
5736
5737Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5738{
5739 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5740 << " on " << *in << dendl;
5741 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5742 if (!session) {
5743 dout(10) << " no session for client." << client << dendl;
5744 return NULL;
5745 }
5746
5747 Capability *cap = in->reconnect_cap(client, icr, session);
5748
5749 if (frommds >= 0) {
5750 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5751 cap->inc_mseq();
5752 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5753 }
5754
5755 return cap;
5756}
5757
5758void MDCache::export_remaining_imported_caps()
5759{
5760 dout(10) << "export_remaining_imported_caps" << dendl;
5761
5762 stringstream warn_str;
5763
81eedcae 5764 int count = 0;
7c673cae
FG
5765 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5766 warn_str << " ino " << p->first << "\n";
5767 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5768 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5769 if (session) {
5770 // mark client caps stale.
9f95a23c
TL
5771 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5772 0, 0, 0,
5773 mds->get_osd_epoch_barrier());
7c673cae
FG
5774 stale->set_cap_peer(0, 0, 0, -1, 0);
5775 mds->send_message_client_counted(stale, q->first);
5776 }
5777 }
5778
81eedcae
TL
5779 if (!(++count % 1000))
5780 mds->heartbeat_reset();
7c673cae
FG
5781 }
5782
11fdf7f2 5783 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
7c673cae
FG
5784 p != cap_reconnect_waiters.end();
5785 ++p)
5786 mds->queue_waiters(p->second);
5787
5788 cap_imports.clear();
5789 cap_reconnect_waiters.clear();
5790
5791 if (warn_str.peek() != EOF) {
5792 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5793 mds->clog->warn(warn_str);
5794 }
5795}
5796
a8e16298 5797Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
7c673cae
FG
5798{
5799 client_t client = session->info.get_client();
a8e16298 5800 Capability *cap = nullptr;
7c673cae
FG
5801 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5802 if (rc) {
a8e16298 5803 cap = in->reconnect_cap(client, *rc, session);
7c673cae
FG
5804 dout(10) << "try_reconnect_cap client." << client
5805 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5806 << " issue " << ccap_string(rc->capinfo.issued)
5807 << " on " << *in << dendl;
5808 remove_replay_cap_reconnect(in->ino(), client);
5809
5810 if (in->is_replicated()) {
5811 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5812 } else {
5813 int dirty_caps = 0;
5814 auto p = reconnected_caps.find(in->ino());
5815 if (p != reconnected_caps.end()) {
5816 auto q = p->second.find(client);
5817 if (q != p->second.end())
5818 dirty_caps = q->second.dirty_caps;
5819 }
5820 in->choose_lock_states(dirty_caps);
5821 dout(15) << " chose lock states on " << *in << dendl;
5822 }
5823
11fdf7f2 5824 map<inodeno_t, MDSContext::vec >::iterator it =
7c673cae
FG
5825 cap_reconnect_waiters.find(in->ino());
5826 if (it != cap_reconnect_waiters.end()) {
5827 mds->queue_waiters(it->second);
5828 cap_reconnect_waiters.erase(it);
5829 }
5830 }
a8e16298 5831 return cap;
7c673cae
FG
5832}
5833
5834
5835
5836// -------
5837// cap imports and delayed snap parent opens
5838
5839void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5840 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5841 int peer, int p_flags)
5842{
7c673cae
FG
5843 SnapRealm *realm = in->find_snaprealm();
5844 if (realm->have_past_parents_open()) {
5845 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5846 if (cap->get_last_seq() == 0) // reconnected cap
5847 cap->inc_last_seq();
5848 cap->set_last_issue();
5849 cap->set_last_issue_stamp(ceph_clock_now());
5850 cap->clear_new();
9f95a23c
TL
5851 auto reap = make_message<MClientCaps>(
5852 CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(),
5853 cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(),
5854 mds->get_osd_epoch_barrier());
7c673cae 5855 in->encode_cap_message(reap, cap);
11fdf7f2 5856 reap->snapbl = realm->get_snap_trace();
7c673cae
FG
5857 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5858 mds->send_message_client_counted(reap, session);
5859 } else {
11fdf7f2 5860 ceph_abort();
7c673cae
FG
5861 }
5862}
5863
5864void MDCache::do_delayed_cap_imports()
5865{
5866 dout(10) << "do_delayed_cap_imports" << dendl;
5867
11fdf7f2 5868 ceph_assert(delayed_imported_caps.empty());
7c673cae
FG
5869}
5870
11fdf7f2
TL
5871struct C_MDC_OpenSnapRealms : public MDCacheContext {
5872 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
7c673cae 5873 void finish(int r) override {
11fdf7f2 5874 mdcache->open_snaprealms();
7c673cae
FG
5875 }
5876};
5877
11fdf7f2 5878void MDCache::open_snaprealms()
7c673cae 5879{
11fdf7f2 5880 dout(10) << "open_snaprealms" << dendl;
7c673cae 5881
7c673cae
FG
5882 MDSGatherBuilder gather(g_ceph_context);
5883
11fdf7f2
TL
5884 auto it = rejoin_pending_snaprealms.begin();
5885 while (it != rejoin_pending_snaprealms.end()) {
5886 CInode *in = *it;
5887 SnapRealm *realm = in->snaprealm;
5888 ceph_assert(realm);
5889 if (realm->have_past_parents_open() ||
5890 realm->open_parents(gather.new_sub())) {
7c673cae
FG
5891 dout(10) << " past parents now open on " << *in << dendl;
5892
9f95a23c 5893 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2
TL
5894 // finish off client snaprealm reconnects?
5895 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5896 if (q != reconnected_snaprealms.end()) {
5897 for (const auto& r : q->second)
5898 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5899 reconnected_snaprealms.erase(q);
5900 }
5901
5902 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5903 !p.end(); ++p) {
5904 CInode *child = *p;
7c673cae 5905 auto q = reconnected_caps.find(child->ino());
11fdf7f2 5906 ceph_assert(q != reconnected_caps.end());
7c673cae 5907 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
11fdf7f2
TL
5908 Capability *cap = child->get_client_cap(r->first);
5909 if (!cap)
5910 continue;
5911 if (r->second.snap_follows > 0) {
5912 if (r->second.snap_follows < child->first - 1) {
5913 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5914 } else if (r->second.snapflush) {
5915 // When processing a cap flush message that is re-sent, it's possible
5916 // that the sender has already released all WR caps. So we should
5917 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5918 cap->mark_needsnapflush();
5919 }
7c673cae
FG
5920 }
5921 // make sure client's cap is in the correct snaprealm.
5922 if (r->second.realm_ino != in->ino()) {
11fdf7f2 5923 prepare_realm_split(realm, r->first, child->ino(), splits);
7c673cae
FG
5924 }
5925 }
5926 }
5927
11fdf7f2 5928 rejoin_pending_snaprealms.erase(it++);
7c673cae
FG
5929 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5930
11fdf7f2 5931 send_snaps(splits);
7c673cae
FG
5932 } else {
5933 dout(10) << " opening past parents on " << *in << dendl;
11fdf7f2 5934 ++it;
7c673cae
FG
5935 }
5936 }
5937
7c673cae 5938 if (gather.has_subs()) {
11fdf7f2
TL
5939 if (gather.num_subs_remaining() == 0) {
5940 // cleanup gather
5941 gather.set_finisher(new C_MDSInternalNoop);
5942 gather.activate();
5943 } else {
5944 // for multimds, must succeed the first time
5945 ceph_assert(recovery_set.empty());
5946
5947 dout(10) << "open_snaprealms - waiting for "
5948 << gather.num_subs_remaining() << dendl;
5949 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5950 gather.activate();
5951 return;
5952 }
5953 }
5954
5955 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5956
5957 if (!reconnected_snaprealms.empty()) {
5958 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5959 for (auto& p : reconnected_snaprealms) {
7c673cae 5960 stringstream warn_str;
11fdf7f2
TL
5961 warn_str << " " << p.first << " {";
5962 bool first = true;
5963 for (auto& q : p.second) {
5964 if (!first)
5965 warn_str << ", ";
5966 warn_str << "client." << q.first << "/" << q.second; first = false;
7c673cae 5967 }
11fdf7f2
TL
5968 warn_str << "}";
5969 dout(5) << warn_str.str() << dendl;
7c673cae 5970 }
7c673cae 5971 }
11fdf7f2
TL
5972 ceph_assert(rejoin_waiters.empty());
5973 ceph_assert(rejoin_pending_snaprealms.empty());
5974 dout(10) << "open_snaprealms - all open" << dendl;
5975 do_delayed_cap_imports();
5976
5977 ceph_assert(rejoin_done);
5978 rejoin_done.release()->complete(0);
5979 reconnected_caps.clear();
7c673cae
FG
5980}
5981
5982bool MDCache::open_undef_inodes_dirfrags()
5983{
5984 dout(10) << "open_undef_inodes_dirfrags "
5985 << rejoin_undef_inodes.size() << " inodes "
5986 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5987
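  // queue every undef dirfrag, plus the parent dirfrag of each undef inode, for fetch;
  // the gather's finisher re-runs rejoin_gather_finish() once the fetches complete
  // (if no rejoins are still outstanding)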
5988 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5989
5990 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5991 p != rejoin_undef_inodes.end();
5992 ++p) {
5993 CInode *in = *p;
11fdf7f2 5994 ceph_assert(!in->is_base());
7c673cae
FG
5995 fetch_queue.insert(in->get_parent_dir());
5996 }
5997
5998 if (fetch_queue.empty())
5999 return false;
6000
28e407b8
AA
6001 MDSGatherBuilder gather(g_ceph_context,
6002 new MDSInternalContextWrapper(mds,
9f95a23c 6003 new LambdaContext([this](int r) {
28e407b8
AA
6004 if (rejoin_gather.empty())
6005 rejoin_gather_finish();
6006 })
6007 )
6008 );
6009
7c673cae
FG
6010 for (set<CDir*>::iterator p = fetch_queue.begin();
6011 p != fetch_queue.end();
6012 ++p) {
6013 CDir *dir = *p;
6014 CInode *diri = dir->get_inode();
6015 if (diri->state_test(CInode::STATE_REJOINUNDEF))
6016 continue;
6017 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 6018 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
7c673cae
FG
6019 dir->fetch(gather.new_sub());
6020 }
11fdf7f2 6021 ceph_assert(gather.has_subs());
7c673cae
FG
6022 gather.activate();
6023 return true;
6024}
6025
6026void MDCache::opened_undef_inode(CInode *in) {
6027 dout(10) << "opened_undef_inode " << *in << dendl;
6028 rejoin_undef_inodes.erase(in);
6029 if (in->is_dir()) {
6030 // FIXME: re-hash dentries if necessary
11fdf7f2 6031 ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
9f95a23c 6032 if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
7c673cae 6033 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 6034 ceph_assert(dir);
7c673cae
FG
6035 rejoin_undef_dirfrags.erase(dir);
6036 in->force_dirfrags();
9f95a23c
TL
6037 auto&& ls = in->get_dirfrags();
6038 for (const auto& dir : ls) {
6039 rejoin_undef_dirfrags.insert(dir);
6040 }
7c673cae
FG
6041 }
6042 }
6043}
6044
11fdf7f2 6045void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
9f95a23c 6046 map<client_t,ref_t<MClientSnap>>& updates)
7c673cae
FG
6047{
6048 if (seq < realm->get_newest_seq()) {
6049 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
11fdf7f2 6050 << realm->get_newest_seq() << " on " << *realm << dendl;
9f95a23c 6051 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
11fdf7f2
TL
6052 snap->bl = realm->get_snap_trace();
6053 for (const auto& child : realm->open_children)
6054 snap->split_realms.push_back(child->inode->ino());
6055 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae
FG
6056 } else {
6057 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6058 << " on " << *realm << dendl;
6059 }
6060}
6061
6062
6063
6064void MDCache::rejoin_send_acks()
6065{
6066 dout(7) << "rejoin_send_acks" << dendl;
6067
6068 // replicate stray
6069 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
6070 p != rejoin_unlinked_inodes.end();
6071 ++p) {
6072 for (set<CInode*>::iterator q = p->second.begin();
6073 q != p->second.end();
6074 ++q) {
6075 CInode *in = *q;
6076 dout(7) << " unlinked inode " << *in << dendl;
6077 // inode expired
6078 if (!in->is_replica(p->first))
6079 continue;
6080 while (1) {
6081 CDentry *dn = in->get_parent_dn();
6082 if (dn->is_replica(p->first))
6083 break;
6084 dn->add_replica(p->first);
6085 CDir *dir = dn->get_dir();
6086 if (dir->is_replica(p->first))
6087 break;
6088 dir->add_replica(p->first);
6089 in = dir->get_inode();
6090 if (in->is_replica(p->first))
6091 break;
224ce89b 6092 in->add_replica(p->first);
7c673cae
FG
6093 if (in->is_base())
6094 break;
6095 }
6096 }
6097 }
6098 rejoin_unlinked_inodes.clear();
6099
6100 // send acks to everyone in the recovery set
9f95a23c 6101 map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
7c673cae
FG
6102 for (set<mds_rank_t>::iterator p = recovery_set.begin();
6103 p != recovery_set.end();
31f18b77
FG
6104 ++p) {
6105 if (rejoin_ack_sent.count(*p))
6106 continue;
9f95a23c 6107 acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
31f18b77
FG
6108 }
6109
6110 rejoin_ack_sent = recovery_set;
7c673cae
FG
6111
6112 // walk subtrees
6113 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
6114 p != subtrees.end();
6115 ++p) {
6116 CDir *dir = p->first;
6117 if (!dir->is_auth())
6118 continue;
6119 dout(10) << "subtree " << *dir << dendl;
6120
6121 // auth items in this subtree
9f95a23c
TL
6122 std::queue<CDir*> dq;
6123 dq.push(dir);
7c673cae
FG
6124
6125 while (!dq.empty()) {
6126 CDir *dir = dq.front();
9f95a23c 6127 dq.pop();
7c673cae
FG
6128
6129 // dir
181888fb
FG
6130 for (auto &r : dir->get_replicas()) {
6131 auto it = acks.find(r.first);
31f18b77
FG
6132 if (it == acks.end())
6133 continue;
181888fb 6134 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 6135 it->second->add_dirfrag_base(dir);
7c673cae
FG
6136 }
6137
94b18763
FG
6138 for (auto &p : dir->items) {
6139 CDentry *dn = p.second;
7c673cae
FG
6140 CDentry::linkage_t *dnl = dn->get_linkage();
6141
6142 // inode
6143 CInode *in = NULL;
6144 if (dnl->is_primary())
6145 in = dnl->get_inode();
6146
6147 // dentry
181888fb
FG
6148 for (auto &r : dn->get_replicas()) {
6149 auto it = acks.find(r.first);
31f18b77
FG
6150 if (it == acks.end())
6151 continue;
94b18763 6152 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6153 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6154 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6155 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6156 ++r.second,
7c673cae
FG
6157 dn->lock.get_replica_state());
6158 // peer missed MDentryLink message ?
181888fb
FG
6159 if (in && !in->is_replica(r.first))
6160 in->add_replica(r.first);
7c673cae
FG
6161 }
6162
6163 if (!in)
6164 continue;
6165
181888fb
FG
6166 for (auto &r : in->get_replicas()) {
6167 auto it = acks.find(r.first);
31f18b77
FG
6168 if (it == acks.end())
6169 continue;
6170 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6171 bufferlist bl;
181888fb
FG
6172 in->_encode_locks_state_for_rejoin(bl, r.first);
6173 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6174 }
6175
6176 // subdirs in this subtree?
9f95a23c
TL
6177 {
6178 auto&& dirs = in->get_nested_dirfrags();
6179 for (const auto& dir : dirs) {
6180 dq.push(dir);
6181 }
6182 }
7c673cae
FG
6183 }
6184 }
6185 }
6186
6187 // base inodes too
6188 if (root && root->is_auth())
181888fb
FG
6189 for (auto &r : root->get_replicas()) {
6190 auto it = acks.find(r.first);
31f18b77
FG
6191 if (it == acks.end())
6192 continue;
6193 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6194 bufferlist bl;
181888fb
FG
6195 root->_encode_locks_state_for_rejoin(bl, r.first);
6196 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6197 }
6198 if (myin)
181888fb
FG
6199 for (auto &r : myin->get_replicas()) {
6200 auto it = acks.find(r.first);
31f18b77
FG
6201 if (it == acks.end())
6202 continue;
6203 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6204 bufferlist bl;
181888fb
FG
6205 myin->_encode_locks_state_for_rejoin(bl, r.first);
6206 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6207 }
6208
6209 // include inode base for any inodes whose scatterlocks may have updated
6210 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6211 p != rejoin_potential_updated_scatterlocks.end();
6212 ++p) {
6213 CInode *in = *p;
181888fb
FG
6214 for (const auto &r : in->get_replicas()) {
6215 auto it = acks.find(r.first);
31f18b77
FG
6216 if (it == acks.end())
6217 continue;
6218 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6219 }
7c673cae
FG
6220 }
6221
6222 // send acks
31f18b77 6223 for (auto p = acks.begin(); p != acks.end(); ++p) {
11fdf7f2 6224 encode(rejoin_imported_caps[p->first], p->second->imported_caps);
7c673cae
FG
6225 mds->send_message_mds(p->second, p->first);
6226 }
6227
6228 rejoin_imported_caps.clear();
6229}
6230
c07f9fc5
FG
6231class C_MDC_ReIssueCaps : public MDCacheContext {
6232 CInode *in;
6233public:
6234 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6235 MDCacheContext(mdc), in(i)
6236 {
6237 in->get(CInode::PIN_PTRWAITER);
6238 }
6239 void finish(int r) override {
6240 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6241 mdcache->mds->locker->issue_caps(in);
6242 in->put(CInode::PIN_PTRWAITER);
6243 }
6244};
7c673cae
FG
6245
6246void MDCache::reissue_all_caps()
6247{
6248 dout(10) << "reissue_all_caps" << dendl;
6249
81eedcae 6250 int count = 0;
94b18763 6251 for (auto &p : inode_map) {
81eedcae 6252 int n = 1;
b32b8144 6253 CInode *in = p.second;
7c673cae 6254 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6255 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6256 if (in->is_frozen_inode()) {
6257 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6258 continue;
6259 }
7c673cae 6260 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
81eedcae 6261 n += mds->locker->issue_caps(in);
7c673cae 6262 }
81eedcae
TL
6263
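  // roughly every 1000 issued caps, reset the internal heartbeat so a long reissue pass is not mistaken for a stall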
6264 if ((count % 1000) + n >= 1000)
6265 mds->heartbeat_reset();
6266 count += n;
7c673cae
FG
6267 }
6268}
6269
6270
6271// ===============================================================================
6272
6273struct C_MDC_QueuedCow : public MDCacheContext {
6274 CInode *in;
6275 MutationRef mut;
6276 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6277 MDCacheContext(mdc), in(i), mut(m) {}
6278 void finish(int r) override {
6279 mdcache->_queued_file_recover_cow(in, mut);
6280 }
6281};
6282
6283
6284void MDCache::queue_file_recover(CInode *in)
6285{
6286 dout(10) << "queue_file_recover " << *in << dendl;
11fdf7f2 6287 ceph_assert(in->is_auth());
7c673cae
FG
6288
6289 // cow?
6290 /*
6291 SnapRealm *realm = in->find_snaprealm();
6292 set<snapid_t> s = realm->get_snaps();
6293 while (!s.empty() && *s.begin() < in->first)
6294 s.erase(s.begin());
6295 while (!s.empty() && *s.rbegin() > in->last)
6296 s.erase(*s.rbegin());
6297 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6298 if (s.size() > 1) {
94b18763 6299 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6300 pi->version = in->pre_dirty();
6301
6302 auto mut(std::make_shared<MutationImpl>());
6303 mut->ls = mds->mdlog->get_current_segment();
6304 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6305 mds->mdlog->start_entry(le);
6306 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6307
6308 s.erase(*s.begin());
6309 while (!s.empty()) {
6310 snapid_t snapid = *s.begin();
6311 CInode *cow_inode = 0;
6312 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
11fdf7f2 6313 ceph_assert(cow_inode);
7c673cae
FG
6314 recovery_queue.enqueue(cow_inode);
6315 s.erase(*s.begin());
6316 }
6317
6318 in->parent->first = in->first;
6319 le->metablob.add_primary_dentry(in->parent, in, true);
6320 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6321 mds->mdlog->flush();
6322 }
6323 */
6324
6325 recovery_queue.enqueue(in);
6326}
6327
6328void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6329{
6330 in->pop_and_dirty_projected_inode(mut->ls);
6331 mut->apply();
6332 mds->locker->drop_locks(mut.get());
6333 mut->cleanup();
6334}
6335
6336
6337/*
6338 * called after recovery to recover file sizes for previously opened (for write)
6339 * files. that is, those where max_size > size.
6340 */
6341void MDCache::identify_files_to_recover()
6342{
6343 dout(10) << "identify_files_to_recover" << dendl;
81eedcae 6344 int count = 0;
94b18763 6345 for (auto &p : inode_map) {
b32b8144 6346 CInode *in = p.second;
7c673cae
FG
6347 if (!in->is_auth())
6348 continue;
6349
6350 if (in->last != CEPH_NOSNAP)
6351 continue;
6352
6353 // Only normal files need file size recovery
6354 if (!in->is_file()) {
6355 continue;
6356 }
6357
6358 bool recover = false;
f91f0fd5
TL
6359 const auto& client_ranges = in->get_projected_inode()->client_ranges;
6360 if (!client_ranges.empty()) {
6361 in->mark_clientwriteable();
6362 for (auto& p : client_ranges) {
6363 Capability *cap = in->get_client_cap(p.first);
6364 if (cap) {
6365 cap->mark_clientwriteable();
6366 } else {
6367 dout(10) << " client." << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
6368 recover = true;
6369 break;
6370 }
7c673cae
FG
6371 }
6372 }
6373
6374 if (recover) {
6375 if (in->filelock.is_stable()) {
6376 in->auth_pin(&in->filelock);
6377 } else {
11fdf7f2 6378 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
7c673cae
FG
6379 }
6380 in->filelock.set_state(LOCK_PRE_SCAN);
6381 rejoin_recover_q.push_back(in);
6382 } else {
6383 rejoin_check_q.push_back(in);
6384 }
81eedcae
TL
6385
6386 if (!(++count % 1000))
6387 mds->heartbeat_reset();
7c673cae
FG
6388 }
6389}
6390
6391void MDCache::start_files_to_recover()
6392{
6393 for (CInode *in : rejoin_check_q) {
6394 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6395 mds->locker->issue_caps(in);
6396 mds->locker->check_inode_max_size(in);
6397 }
6398 rejoin_check_q.clear();
6399 for (CInode *in : rejoin_recover_q) {
6400 mds->locker->file_recover(&in->filelock);
6401 }
6402 if (!rejoin_recover_q.empty()) {
6403 rejoin_recover_q.clear();
6404 do_file_recover();
6405 }
6406}
6407
6408void MDCache::do_file_recover()
6409{
6410 recovery_queue.advance();
6411}
6412
6413// ===============================================================================
6414
6415
6416// ----------------------------
6417// truncate
6418
6419class C_MDC_RetryTruncate : public MDCacheContext {
6420 CInode *in;
6421 LogSegment *ls;
6422public:
6423 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6424 MDCacheContext(c), in(i), ls(l) {}
6425 void finish(int r) override {
6426 mdcache->_truncate_inode(in, ls);
6427 }
6428};
6429
6430void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6431{
94b18763 6432 auto pi = in->get_projected_inode();
7c673cae
FG
6433 dout(10) << "truncate_inode "
6434 << pi->truncate_from << " -> " << pi->truncate_size
6435 << " on " << *in
6436 << dendl;
6437
6438 ls->truncating_inodes.insert(in);
6439 in->get(CInode::PIN_TRUNCATING);
6440 in->auth_pin(this);
6441
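  // if clients still owe snapflushes and hold buffered file data, wait for the snap
  // data to be written back (filelock xlock_snap_sync) before issuing the truncate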
6442 if (!in->client_need_snapflush.empty() &&
6443 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6444 ceph_assert(in->filelock.is_xlocked());
7c673cae
FG
6445 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6446 mds->locker->issue_caps(in);
6447 return;
6448 }
6449
6450 _truncate_inode(in, ls);
6451}
6452
6453struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6454 CInode *in;
6455 LogSegment *ls;
6456 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
91327a77
AA
6457 MDCacheIOContext(c, false), in(i), ls(l) {
6458 }
7c673cae 6459 void finish(int r) override {
11fdf7f2 6460 ceph_assert(r == 0 || r == -ENOENT);
7c673cae
FG
6461 mdcache->truncate_inode_finish(in, ls);
6462 }
91327a77
AA
6463 void print(ostream& out) const override {
6464 out << "file_truncate(" << in->ino() << ")";
6465 }
7c673cae
FG
6466};
6467
6468void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6469{
94b18763 6470 auto pi = &in->inode;
7c673cae
FG
6471 dout(10) << "_truncate_inode "
6472 << pi->truncate_from << " -> " << pi->truncate_size
6473 << " on " << *in << dendl;
6474
11fdf7f2
TL
6475 ceph_assert(pi->is_truncating());
6476 ceph_assert(pi->truncate_size < (1ULL << 63));
6477 ceph_assert(pi->truncate_from < (1ULL << 63));
6478 ceph_assert(pi->truncate_size < pi->truncate_from);
7c673cae
FG
6479
6480
6481 SnapRealm *realm = in->find_snaprealm();
6482 SnapContext nullsnap;
6483 const SnapContext *snapc;
6484 if (realm) {
6485 dout(10) << " realm " << *realm << dendl;
6486 snapc = &realm->get_snap_context();
6487 } else {
6488 dout(10) << " NO realm, using null context" << dendl;
6489 snapc = &nullsnap;
11fdf7f2 6490 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae
FG
6491 }
6492 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6493 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6494 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6495 pi->truncate_seq, ceph::real_time::min(), 0,
6496 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6497 mds->finisher));
6498}
6499
6500struct C_MDC_TruncateLogged : public MDCacheLogContext {
6501 CInode *in;
6502 MutationRef mut;
6503 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6504 MDCacheLogContext(m), in(i), mut(mu) {}
6505 void finish(int r) override {
6506 mdcache->truncate_inode_logged(in, mut);
6507 }
6508};
6509
6510void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6511{
6512 dout(10) << "truncate_inode_finish " << *in << dendl;
6513
6514 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6515 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6516 ls->truncating_inodes.erase(p);
6517
6518 // update
94b18763
FG
6519 auto &pi = in->project_inode();
6520 pi.inode.version = in->pre_dirty();
6521 pi.inode.truncate_from = 0;
6522 pi.inode.truncate_pending--;
7c673cae
FG
6523
6524 MutationRef mut(new MutationImpl());
6525 mut->ls = mds->mdlog->get_current_segment();
6526 mut->add_projected_inode(in);
6527
6528 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6529 mds->mdlog->start_entry(le);
6530 CDentry *dn = in->get_projected_parent_dn();
6531 le->metablob.add_dir_context(dn->get_dir());
6532 le->metablob.add_primary_dentry(dn, in, true);
6533 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6534
6535 journal_dirty_inode(mut.get(), &le->metablob, in);
6536 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6537
6538 // flush immediately if there are readers/writers waiting
6539 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6540 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6541 mds->mdlog->flush();
6542}
6543
6544void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6545{
6546 dout(10) << "truncate_inode_logged " << *in << dendl;
6547 mut->apply();
6548 mds->locker->drop_locks(mut.get());
6549 mut->cleanup();
6550
6551 in->put(CInode::PIN_TRUNCATING);
6552 in->auth_unpin(this);
6553
11fdf7f2 6554 MDSContext::vec waiters;
7c673cae
FG
6555 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6556 mds->queue_waiters(waiters);
6557}
6558
6559
6560void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6561{
6562 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6563 << ls->seq << "/" << ls->offset << dendl;
6564 ls->truncating_inodes.insert(in);
6565 in->get(CInode::PIN_TRUNCATING);
6566}
6567
6568void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6569{
6570 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6571 << ls->seq << "/" << ls->offset << dendl;
6572 // if we have the logseg the truncate started in, it must be in our list.
6573 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6574 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6575 ls->truncating_inodes.erase(p);
6576 in->put(CInode::PIN_TRUNCATING);
6577}
6578
6579void MDCache::start_recovered_truncates()
6580{
6581 dout(10) << "start_recovered_truncates" << dendl;
6582 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6583 p != mds->mdlog->segments.end();
6584 ++p) {
6585 LogSegment *ls = p->second;
6586 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6587 q != ls->truncating_inodes.end();
6588 ++q) {
6589 CInode *in = *q;
6590 in->auth_pin(this);
6591
6592 if (!in->client_need_snapflush.empty() &&
6593 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6594 ceph_assert(in->filelock.is_stable());
7c673cae
FG
6595 in->filelock.set_state(LOCK_XLOCKDONE);
6596 in->auth_pin(&in->filelock);
6597 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6598 // start_files_to_recover will revoke caps
6599 continue;
6600 }
6601 _truncate_inode(in, ls);
6602 }
6603 }
6604}
6605
6606
9f95a23c
TL
6607class C_MDS_purge_completed_finish : public MDCacheLogContext {
6608 interval_set<inodeno_t> inos;
6609 version_t inotablev;
6610 LogSegment *ls;
6611public:
6612 C_MDS_purge_completed_finish(MDCache *m,
6613 interval_set<inodeno_t> i,
6614 version_t iv,
6615 LogSegment *_ls)
6616 : MDCacheLogContext(m),
6617 inos(std::move(i)),
6618 inotablev(iv),
6619 ls(_ls) {}
6620 void finish(int r) override {
6621 assert(r == 0);
6622 if (inotablev) {
6623 ls->purge_inodes_finish(inos);
6624 mdcache->mds->inotable->apply_release_ids(inos);
6625 assert(mdcache->mds->inotable->get_version() == inotablev);
6626 }
6627 }
6628};
7c673cae 6629
9f95a23c
TL
6630void MDCache::start_purge_inodes(){
6631 dout(10) << "start_purge_inodes" << dendl;
6632 for (auto& p : mds->mdlog->segments){
6633 LogSegment *ls = p.second;
6634 if (ls->purge_inodes.size()){
6635 purge_inodes(ls->purge_inodes, ls);
6636 }
6637 }
6638}
7c673cae 6639
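// purge_inodes() releases an interval_set of inode numbers: it issues a
// Filer purge_range for each inode (the object count is derived from the
// default file layout's stripe period), and once all purges complete it
// projects the release in the InoTable and journals an EPurged entry;
// C_MDS_purge_completed_finish then applies the release and clears the
// inos from the log segment.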
9f95a23c
TL
6640void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6641{
6642 auto cb = new LambdaContext([this, inos, ls](int r){
6643 assert(r == 0 || r == -2);
6644 mds->inotable->project_release_ids(inos);
6645 version_t piv = mds->inotable->get_projected_version();
6646 assert(piv != 0);
6647 mds->mdlog->start_submit_entry(new EPurged(inos, piv, ls->seq),
6648 new C_MDS_purge_completed_finish(this, inos, piv, ls));
6649 mds->mdlog->flush();
6650 });
6651
6652 dout(10) << __func__ << " start purge data : " << inos << dendl;
6653 C_GatherBuilder gather(g_ceph_context,
6654 new C_OnFinisher( new MDSIOContextWrapper(mds, cb), mds->finisher));
6655 SnapContext nullsnapc;
6656 uint64_t num = Striper::get_num_objects(default_file_layout, default_file_layout.get_period());
6657 for (auto p = inos.begin();
6658 p != inos.end();
6659 ++p){
6660 dout(10) << __func__
6661 << " prealloc_inos : " << inos.size()
6662 << " start : " << p.get_start().val
6663 << " length : " << p.get_len() << " "
6664 << " seq : " << ls->seq << dendl;
6665
6666 for (_inodeno_t i = 0; i < p.get_len(); i++){
6667 dout(20) << __func__ << " : " << p.get_start() + i << dendl;
6668 filer.purge_range(p.get_start() + i,
6669 &default_file_layout,
6670 nullsnapc,
6671 0, num,
6672 ceph::real_clock::now(),
6673 0, gather.new_sub());
6674 }
6675 }
6676 gather.activate();
6677}
7c673cae
FG
6678
6679// ================================================================================
6680// cache trimming
6681
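// Expire dentries from the LRU. The bottom_lru is drained first; the
// main lru is then drained while the cache is over its memory target,
// 'count' has not been reached, or we are in standby replay. The whole
// pass is rate-limited by mds_cache_trim_threshold (tracked in
// trim_counter). Returns {throttled, number of dentries trimmed}.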
11fdf7f2 6682std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
181888fb 6683{
7c673cae 6684 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6685 std::vector<CDentry *> unexpirables;
6686 uint64_t trimmed = 0;
6687
11fdf7f2 6688 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
a8e16298 6689
181888fb
FG
6690 dout(7) << "trim_lru trimming " << count
6691 << " items from LRU"
6692 << " size=" << lru.lru_get_size()
6693 << " mid=" << lru.lru_get_top()
6694 << " pintail=" << lru.lru_get_pintail()
6695 << " pinned=" << lru.lru_get_num_pinned()
6696 << dendl;
7c673cae 6697
11fdf7f2 6698 const uint64_t trim_counter_start = trim_counter.get();
a8e16298
TL
6699 bool throttled = false;
6700 while (1) {
6701 throttled |= trim_counter_start+trimmed >= trim_threshold;
6702 if (throttled) break;
31f18b77
FG
6703 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6704 if (!dn)
6705 break;
6706 if (trim_dentry(dn, expiremap)) {
6707 unexpirables.push_back(dn);
181888fb
FG
6708 } else {
6709 trimmed++;
31f18b77
FG
6710 }
6711 }
6712
181888fb 6713 for (auto &dn : unexpirables) {
31f18b77 6714 bottom_lru.lru_insert_mid(dn);
181888fb 6715 }
31f18b77
FG
6716 unexpirables.clear();
6717
181888fb 6718 // trim dentries from the LRU until count is reached
7f7e6c64 6719 // if the mds is in standby_replay, skip trimming the inodes here (push them to the bottom of the lru instead)
494da23a 6720 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
a8e16298
TL
6721 throttled |= trim_counter_start+trimmed >= trim_threshold;
6722 if (throttled) break;
7c673cae
FG
6723 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6724 if (!dn) {
6725 break;
6726 }
7f7e6c64 6727 if (is_standby_replay && dn->get_linkage()->inode) {
494da23a
TL
6728 // we move the inodes that need to be trimmed to the end of the lru queue.
6729 // refer to MDCache::standby_trim_segment
6730 lru.lru_insert_bot(dn);
6731 break;
181888fb
FG
6732 } else if (trim_dentry(dn, expiremap)) {
6733 unexpirables.push_back(dn);
6734 } else {
6735 trimmed++;
3efd9988 6736 if (count > 0) count--;
7c673cae
FG
6737 }
6738 }
11fdf7f2 6739 trim_counter.hit(trimmed);
181888fb
FG
6740
6741 for (auto &dn : unexpirables) {
31f18b77 6742 lru.lru_insert_mid(dn);
181888fb 6743 }
31f18b77 6744 unexpirables.clear();
7c673cae 6745
181888fb 6746 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
a8e16298 6747 return std::pair<bool, uint64_t>(throttled, trimmed);
181888fb
FG
6748}
6749
6750/*
6751 * note: only called while MDS is active or stopping... NOT during recovery.
6752 * however, we may expire a replica whose authority is recovering.
6753 *
6754 * @param count is number of dentries to try to expire
6755 */
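// Besides the LRU pass (trim_lru), this also drops expirable subtrees:
// empty export-pinned subtrees, non-auth subtrees with no remaining
// references, and, while stopping, the root and other ranks' mdsdir and
// base inodes. Any MCacheExpire messages accumulated along the way are
// sent at the end via send_expire_messages().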
a8e16298 6756std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
181888fb
FG
6757{
6758 uint64_t used = cache_size();
91327a77 6759 uint64_t limit = cache_memory_limit;
11fdf7f2 6760 expiremap expiremap;
181888fb
FG
6761
6762 dout(7) << "trim bytes_used=" << bytes2str(used)
6763 << " limit=" << bytes2str(limit)
91327a77 6764 << " reservation=" << cache_reservation
181888fb
FG
6765 << "% count=" << count << dendl;
6766
6767 // process delayed eval_stray()
6768 stray_manager.advance_delayed();
6769
a8e16298
TL
6770 auto result = trim_lru(count, expiremap);
6771 auto& trimmed = result.second;
181888fb 6772
7c673cae 6773 // trim non-auth, non-bound subtrees
181888fb 6774 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6775 CDir *dir = p->first;
6776 ++p;
31f18b77
FG
6777 CInode *diri = dir->get_inode();
6778 if (dir->is_auth()) {
f6b5b4d7
TL
6779 if (diri->is_auth() && !diri->is_base()) {
6780 /* this situation should correspond to an export pin */
6781 if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
6782 /* pinned empty subtree, try to drop */
6783 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
6784 dout(20) << "trimming empty pinned subtree " << *dir << dendl;
6785 dir->state_clear(CDir::STATE_AUXSUBTREE);
6786 remove_subtree(dir);
6787 diri->close_dirfrag(dir->dirfrag().frag);
6788 }
6789 }
6790 } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
6791 if (dir->state_test(CDir::STATE_EXPORTING) ||
6792 !(mds->is_active() || mds->is_stopping()) ||
6793 dir->is_freezing() || dir->is_frozen())
6794 continue;
31f18b77 6795
f6b5b4d7 6796 migrator->export_empty_import(dir);
a8e16298 6797 ++trimmed;
31f18b77 6798 }
f6b5b4d7
TL
6799 } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
6800 // only subtree pin
f91f0fd5 6801 if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
f6b5b4d7 6802 continue;
f91f0fd5 6803 }
31f18b77 6804
f6b5b4d7
TL
6805 // don't trim subtree root if its auth MDS is recovering.
 6806 // This simplifies the cache rejoin code.
6807 if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
6808 continue;
6809 trim_dirfrag(dir, 0, expiremap);
6810 ++trimmed;
7c673cae
FG
6811 }
6812 }
6813
6814 // trim root?
181888fb 6815 if (mds->is_stopping() && root) {
9f95a23c
TL
6816 auto&& ls = root->get_dirfrags();
6817 for (const auto& dir : ls) {
a8e16298 6818 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6819 trim_dirfrag(dir, 0, expiremap);
a8e16298
TL
6820 ++trimmed;
6821 }
7c673cae 6822 }
a8e16298 6823 if (root->get_num_ref() == 0) {
7c673cae 6824 trim_inode(0, root, 0, expiremap);
a8e16298
TL
6825 ++trimmed;
6826 }
7c673cae
FG
6827 }
6828
6829 std::set<mds_rank_t> stopping;
6830 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6831 stopping.erase(mds->get_nodeid());
6832 for (auto rank : stopping) {
6833 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6834 if (!mdsdir_in)
6835 continue;
6836
11fdf7f2
TL
6837 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6838 if (em.second) {
9f95a23c 6839 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
7c673cae
FG
6840 }
6841
6842 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6843
6844 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6845 if (!aborted) {
6846 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
9f95a23c 6847 auto&& ls = mdsdir_in->get_dirfrags();
7c673cae 6848 for (auto dir : ls) {
a8e16298 6849 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6850 trim_dirfrag(dir, dir, expiremap);
a8e16298
TL
6851 ++trimmed;
6852 }
7c673cae 6853 }
a8e16298 6854 if (mdsdir_in->get_num_ref() == 0) {
7c673cae 6855 trim_inode(NULL, mdsdir_in, NULL, expiremap);
a8e16298
TL
6856 ++trimmed;
6857 }
7c673cae
FG
6858 } else {
6859 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6860 }
6861 }
6862
6863 // Other rank's base inodes (when I'm stopping)
181888fb 6864 if (mds->is_stopping()) {
7c673cae 6865 for (set<CInode*>::iterator p = base_inodes.begin();
11fdf7f2
TL
6866 p != base_inodes.end();) {
6867 CInode *base_in = *p;
6868 ++p;
6869 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6870 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6871 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6872 if (base_in->get_num_ref() == 0) {
6873 trim_inode(NULL, base_in, NULL, expiremap);
a8e16298 6874 ++trimmed;
7c673cae
FG
6875 }
6876 }
6877 }
6878 }
6879
6880 // send any expire messages
6881 send_expire_messages(expiremap);
6882
a8e16298 6883 return result;
7c673cae
FG
6884}
6885
11fdf7f2 6886void MDCache::send_expire_messages(expiremap& expiremap)
7c673cae
FG
6887{
6888 // send expires
11fdf7f2 6889 for (const auto &p : expiremap) {
7c673cae 6890 if (mds->is_cluster_degraded() &&
11fdf7f2
TL
6891 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6892 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6893 rejoin_sent.count(p.first) == 0))) {
7c673cae
FG
6894 continue;
6895 }
11fdf7f2
TL
6896 dout(7) << "sending cache_expire to " << p.first << dendl;
6897 mds->send_message_mds(p.second, p.first);
7c673cae 6898 }
11fdf7f2 6899 expiremap.clear();
7c673cae
FG
6900}
6901
6902
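// Try to expire a single dentry (and, for a primary link, its inode).
// Returns true if the dentry must be kept (unreadable replica dentry,
// or a stray whose inode went off to be purged instead of trimmed),
// false if it was removed. For non-auth dentries an expire record is
// queued in 'expiremap' for each auth rank.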
11fdf7f2 6903bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
7c673cae
FG
6904{
6905 dout(12) << "trim_dentry " << *dn << dendl;
6906
6907 CDentry::linkage_t *dnl = dn->get_linkage();
6908
6909 CDir *dir = dn->get_dir();
11fdf7f2 6910 ceph_assert(dir);
7c673cae
FG
6911
6912 CDir *con = get_subtree_root(dir);
6913 if (con)
6914 dout(12) << " in container " << *con << dendl;
6915 else {
6916 dout(12) << " no container; under a not-yet-linked dir" << dendl;
11fdf7f2 6917 ceph_assert(dn->is_auth());
7c673cae
FG
6918 }
6919
 6920 // If a replica dentry is not readable, it's likely we will receive an
 6921 // MDentryLink/MDentryUnlink message soon (it's possible we first
 6922 // receive an MDentryUnlink message, then an MDentryLink message).
 6923 // An MDentryLink message only replicates the inode, so we should
 6924 // avoid trimming the inode's parent dentry; this is because
 6925 // unconnected replicas are problematic for subtree migration.
6926 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6927 !dn->get_dir()->get_inode()->is_stray())
6928 return true;
6929
6930 // adjust the dir state
 6931 // NOTE: we can safely remove a clean, null dentry without affecting
6932 // directory completeness.
6933 // (check this _before_ we unlink the inode, below!)
6934 bool clear_complete = false;
6935 if (!(dnl->is_null() && dn->is_clean()))
6936 clear_complete = true;
6937
6938 // unlink the dentry
6939 if (dnl->is_remote()) {
6940 // just unlink.
31f18b77 6941 dir->unlink_inode(dn, false);
7c673cae
FG
6942 } else if (dnl->is_primary()) {
6943 // expire the inode, too.
6944 CInode *in = dnl->get_inode();
11fdf7f2 6945 ceph_assert(in);
7c673cae
FG
6946 if (trim_inode(dn, in, con, expiremap))
6947 return true; // purging stray instead of trimming
6948 } else {
11fdf7f2 6949 ceph_assert(dnl->is_null());
7c673cae
FG
6950 }
6951
6952 if (!dn->is_auth()) {
6953 // notify dentry authority.
6954 mds_authority_t auth = dn->authority();
6955
6956 for (int p=0; p<2; p++) {
6957 mds_rank_t a = auth.first;
6958 if (p) a = auth.second;
6959 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6960 if (mds->get_nodeid() == auth.second &&
6961 con->is_importing()) break; // don't send any expire while importing.
6962 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6963
6964 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
11fdf7f2
TL
6965 ceph_assert(a != mds->get_nodeid());
6966 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6967 if (em.second)
9f95a23c 6968 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
11fdf7f2 6969 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6970 }
6971 }
6972
6973 // remove dentry
6974 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6975 dir->add_to_bloom(dn);
6976 dir->remove_dentry(dn);
6977
6978 if (clear_complete)
6979 dir->state_clear(CDir::STATE_COMPLETE);
6980
7c673cae
FG
6981 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6982 return false;
6983}
6984
6985
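// Expire a dirfrag that holds no more references: detach it from the
// subtree map if it is a subtree root, queue an expire record to the
// auth rank(s) when we are not auth (using the dirfrag itself as the
// container when it was a delegation point), then close the frag on
// its inode.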
11fdf7f2 6986void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
7c673cae
FG
6987{
6988 dout(15) << "trim_dirfrag " << *dir << dendl;
6989
6990 if (dir->is_subtree_root()) {
11fdf7f2 6991 ceph_assert(!dir->is_auth() ||
7c673cae
FG
6992 (!dir->is_replicated() && dir->inode->is_base()));
6993 remove_subtree(dir); // remove from subtree map
6994 }
11fdf7f2 6995 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
6996
6997 CInode *in = dir->get_inode();
6998
6999 if (!dir->is_auth()) {
7000 mds_authority_t auth = dir->authority();
7001
7002 // was this an auth delegation? (if so, slightly modified container)
7003 dirfrag_t condf;
7004 if (dir->is_subtree_root()) {
7005 dout(12) << " subtree root, container is " << *dir << dendl;
7006 con = dir;
7007 condf = dir->dirfrag();
7008 } else {
7009 condf = con->dirfrag();
7010 }
7011
7012 for (int p=0; p<2; p++) {
7013 mds_rank_t a = auth.first;
7014 if (p) a = auth.second;
7015 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7016 if (mds->get_nodeid() == auth.second &&
7017 con->is_importing()) break; // don't send any expire while importing.
7018 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7019
7020 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
11fdf7f2
TL
7021 ceph_assert(a != mds->get_nodeid());
7022 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7023 if (em.second)
9f95a23c 7024 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
11fdf7f2 7025 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
7c673cae
FG
7026 }
7027 }
7028
7029 in->close_dirfrag(dir->dirfrag().frag);
7030}
7031
7032/**
7033 * Try trimming an inode from the cache
7034 *
7035 * @return true if the inode is still in cache, else false if it was trimmed
7036 */
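// An inode is kept (return true) in two cases: a replica directory
// inode whose dirfragtreelock cannot be rdlocked (a fragment change is
// in flight), or an auth inode whose dentry turns out to be a stray
// that is purging or regained references after maybe_eval_stray().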
11fdf7f2 7037bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
7c673cae
FG
7038{
7039 dout(15) << "trim_inode " << *in << dendl;
11fdf7f2 7040 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7041
7042 if (in->is_dir()) {
7043 // If replica inode's dirfragtreelock is not readable, it's likely
7044 // some dirfrags of the inode are being fragmented and we will receive
7045 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7046 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
 7047 // This is because unconnected replicas are problematic for
7048 // subtree migration.
7049 //
9f95a23c 7050 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
7c673cae 7051 return true;
28e407b8 7052 }
7c673cae
FG
7053
7054 // DIR
9f95a23c
TL
7055 auto&& dfls = in->get_dirfrags();
7056 for (const auto& dir : dfls) {
11fdf7f2 7057 ceph_assert(!dir->is_subtree_root());
7c673cae
FG
 7058 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use the dirfrag itself
7059 }
7060 }
7061
7062 // INODE
7063 if (in->is_auth()) {
7064 // eval stray after closing dirfrags
7065 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
7066 maybe_eval_stray(in);
7067 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
7068 return true;
7069 }
7070 } else {
7071 mds_authority_t auth = in->authority();
7072
7073 dirfrag_t df;
7074 if (con)
7075 df = con->dirfrag();
7076 else
7077 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
7078
7079 for (int p=0; p<2; p++) {
7080 mds_rank_t a = auth.first;
7081 if (p) a = auth.second;
7082 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7083 if (con && mds->get_nodeid() == auth.second &&
7084 con->is_importing()) break; // don't send any expire while importing.
7085 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7086
7087 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
11fdf7f2
TL
7088 ceph_assert(a != mds->get_nodeid());
7089 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7090 if (em.second)
9f95a23c 7091 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
11fdf7f2 7092 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7c673cae
FG
7093 }
7094 }
7095
7096 /*
7097 if (in->is_auth()) {
7098 if (in->hack_accessed)
7099 mds->logger->inc("outt");
7100 else {
7101 mds->logger->inc("outut");
7102 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7103 }
7104 }
7105 */
7106
7107 // unlink
7108 if (dn)
31f18b77 7109 dn->get_dir()->unlink_inode(dn, false);
7c673cae
FG
7110 remove_inode(in);
7111 return false;
7112}
7113
7114
7115/**
7116 * trim_non_auth - remove any non-auth items from our cache
7117 *
7118 * this reduces the amount of non-auth metadata in our cache, reducing the
7119 * load incurred by the rejoin phase.
7120 *
7121 * the only non-auth items that remain are those that are needed to
7122 * attach our own subtrees to the root.
7123 *
7124 * when we are done, all dentries will be in the top bit of the lru.
7125 *
7126 * why we have to do this:
 7127 * we may not have accurate linkage for non-auth items, which means we will
 7128 * not know which subtree an item falls into, and cannot be sure to declare
 7129 * it to the correct authority.
7130 */
7131void MDCache::trim_non_auth()
7132{
7133 dout(7) << "trim_non_auth" << dendl;
7134
7135 // temporarily pin all subtree roots
7136 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7137 p != subtrees.end();
7138 ++p)
7139 p->first->get(CDir::PIN_SUBTREETEMP);
7140
31f18b77 7141 list<CDentry*> auth_list;
7c673cae
FG
7142
7143 // trim non-auth items from the lru
31f18b77
FG
7144 for (;;) {
7145 CDentry *dn = NULL;
7146 if (bottom_lru.lru_get_size() > 0)
7147 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7148 if (!dn && lru.lru_get_size() > 0)
7149 dn = static_cast<CDentry*>(lru.lru_expire());
7150 if (!dn)
7151 break;
7152
7c673cae
FG
7153 CDentry::linkage_t *dnl = dn->get_linkage();
7154
7155 if (dn->is_auth()) {
7156 // add back into lru (at the top)
31f18b77 7157 auth_list.push_back(dn);
7c673cae
FG
7158
7159 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7160 dn->unlink_remote(dnl);
7c673cae
FG
7161 } else {
7162 // non-auth. expire.
7163 CDir *dir = dn->get_dir();
11fdf7f2 7164 ceph_assert(dir);
7c673cae
FG
7165
7166 // unlink the dentry
7167 dout(10) << " removing " << *dn << dendl;
7168 if (dnl->is_remote()) {
31f18b77 7169 dir->unlink_inode(dn, false);
7c673cae
FG
7170 }
7171 else if (dnl->is_primary()) {
7172 CInode *in = dnl->get_inode();
7173 dout(10) << " removing " << *in << dendl;
9f95a23c
TL
7174 auto&& ls = in->get_dirfrags();
7175 for (const auto& subdir : ls) {
11fdf7f2 7176 ceph_assert(!subdir->is_subtree_root());
7c673cae
FG
7177 in->close_dirfrag(subdir->dirfrag().frag);
7178 }
31f18b77 7179 dir->unlink_inode(dn, false);
7c673cae
FG
7180 remove_inode(in);
7181 }
7182 else {
11fdf7f2 7183 ceph_assert(dnl->is_null());
7c673cae
FG
7184 }
7185
11fdf7f2 7186 ceph_assert(!dir->has_bloom());
7c673cae
FG
7187 dir->remove_dentry(dn);
7188 // adjust the dir state
7189 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7190 // close empty non-auth dirfrag
7191 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7192 dir->inode->close_dirfrag(dir->get_frag());
7193 }
7194 }
7195
9f95a23c 7196 for (const auto& dn : auth_list) {
31f18b77
FG
7197 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7198 bottom_lru.lru_insert_mid(dn);
7199 else
7200 lru.lru_insert_top(dn);
7201 }
7202
7c673cae
FG
7203 // move everything in the pintail to the top bit of the lru.
7204 lru.lru_touch_entire_pintail();
7205
7206 // unpin all subtrees
7207 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7208 p != subtrees.end();
7209 ++p)
7210 p->first->put(CDir::PIN_SUBTREETEMP);
7211
31f18b77
FG
7212 if (lru.lru_get_size() == 0 &&
7213 bottom_lru.lru_get_size() == 0) {
7c673cae 7214 // root, stray, etc.?
b32b8144 7215 auto p = inode_map.begin();
7c673cae 7216 while (p != inode_map.end()) {
7c673cae 7217 CInode *in = p->second;
b32b8144 7218 ++p;
7c673cae 7219 if (!in->is_auth()) {
9f95a23c
TL
7220 auto&& ls = in->get_dirfrags();
7221 for (const auto& dir : ls) {
7222 dout(10) << " removing " << *dir << dendl;
7223 ceph_assert(dir->get_num_ref() == 1); // SUBTREE
7224 remove_subtree(dir);
7225 in->close_dirfrag(dir->dirfrag().frag);
7c673cae
FG
7226 }
7227 dout(10) << " removing " << *in << dendl;
11fdf7f2
TL
7228 ceph_assert(!in->get_parent_dn());
7229 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7230 remove_inode(in);
7231 }
7c673cae
FG
7232 }
7233 }
7234
7235 show_subtrees();
7236}
7237
7238/**
7239 * Recursively trim the subtree rooted at directory to remove all
7240 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7241 * of those links. This is used to clear invalid data out of the cache.
7242 * Note that it doesn't clear the passed-in directory, since that's not
7243 * always safe.
7244 */
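// The return value is true when 'dir' must be kept, either because a
// descendant dirfrag is a subtree root (a bound to another MDS) or
// because entries remain that cannot be dropped (e.g. null dentries
// preserved for slave rollback); callers close the dirfrag when this
// returns false.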
7245bool MDCache::trim_non_auth_subtree(CDir *dir)
7246{
7247 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7248
7249 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7250
94b18763
FG
7251 auto j = dir->begin();
7252 auto i = j;
7c673cae
FG
7253 while (j != dir->end()) {
7254 i = j++;
7255 CDentry *dn = i->second;
7256 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7257 CDentry::linkage_t *dnl = dn->get_linkage();
7258 if (dnl->is_primary()) { // check for subdirectories, etc
7259 CInode *in = dnl->get_inode();
7260 bool keep_inode = false;
7261 if (in->is_dir()) {
9f95a23c
TL
7262 auto&& subdirs = in->get_dirfrags();
7263 for (const auto& subdir : subdirs) {
7264 if (subdir->is_subtree_root()) {
7c673cae 7265 keep_inode = true;
9f95a23c 7266 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7c673cae 7267 } else {
9f95a23c 7268 if (trim_non_auth_subtree(subdir))
7c673cae
FG
7269 keep_inode = true;
7270 else {
9f95a23c 7271 in->close_dirfrag(subdir->get_frag());
7c673cae
FG
7272 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7273 }
7274 }
7275 }
7276
7277 }
7278 if (!keep_inode) { // remove it!
7279 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
31f18b77 7280 dir->unlink_inode(dn, false);
7c673cae 7281 remove_inode(in);
11fdf7f2 7282 ceph_assert(!dir->has_bloom());
7c673cae
FG
7283 dir->remove_dentry(dn);
7284 } else {
7285 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7286 dn->state_clear(CDentry::STATE_AUTH);
7287 in->state_clear(CInode::STATE_AUTH);
7288 }
7289 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7290 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7291 } else { // just remove it
7292 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7293 if (dnl->is_remote())
31f18b77 7294 dir->unlink_inode(dn, false);
7c673cae
FG
7295 dir->remove_dentry(dn);
7296 }
7297 }
7298 dir->state_clear(CDir::STATE_AUTH);
7299 /**
7300 * We've now checked all our children and deleted those that need it.
7301 * Now return to caller, and tell them if *we're* a keeper.
7302 */
7303 return keep_dir || dir->get_num_any();
7304}
7305
7306/*
7307 * during replay, when we determine a subtree is no longer ours, we
7308 * try to trim it from our cache. because subtrees must be connected
7309 * to the root, the fact that we can trim this tree may mean that our
7310 * children or parents can also be trimmed.
7311 */
7312void MDCache::try_trim_non_auth_subtree(CDir *dir)
7313{
7314 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7315
7316 // can we now trim child subtrees?
7317 set<CDir*> bounds;
7318 get_subtree_bounds(dir, bounds);
7319 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7320 CDir *bd = *p;
7321 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7322 bd->get_num_any() == 0 && // and empty
7323 can_trim_non_auth_dirfrag(bd)) {
7324 CInode *bi = bd->get_inode();
7325 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7326 remove_subtree(bd);
7327 bd->mark_clean();
7328 bi->close_dirfrag(bd->get_frag());
7329 }
7330 }
7331
7332 if (trim_non_auth_subtree(dir)) {
7333 // keep
7334 try_subtree_merge(dir);
7335 } else {
7336 // can we trim this subtree (and possibly our ancestors) too?
7337 while (true) {
7338 CInode *diri = dir->get_inode();
7339 if (diri->is_base()) {
7340 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7341 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7342 remove_subtree(dir);
7343 dir->mark_clean();
7344 diri->close_dirfrag(dir->get_frag());
7345
7346 dout(10) << " removing " << *diri << dendl;
11fdf7f2
TL
7347 ceph_assert(!diri->get_parent_dn());
7348 ceph_assert(diri->get_num_ref() == 0);
7c673cae
FG
7349 remove_inode(diri);
7350 }
7351 break;
7352 }
7353
7354 CDir *psub = get_subtree_root(diri->get_parent_dir());
7355 dout(10) << " parent subtree is " << *psub << dendl;
7356 if (psub->get_dir_auth().first == mds->get_nodeid())
7357 break; // we are auth, keep.
7358
7359 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7360 remove_subtree(dir);
7361 dir->mark_clean();
7362 diri->close_dirfrag(dir->get_frag());
7363
7364 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7365 if (trim_non_auth_subtree(psub))
7366 break;
7367 dir = psub;
7368 }
7369 }
7370
7371 show_subtrees();
7372}
7373
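// In standby replay we never journal anything ourselves, so "trimming"
// a replayed segment only means marking its dirty items clean again
// and, once an item holds no other references, moving its dentry to
// the bottom of the LRU (touch_dentry_bottom) so trim_lru can expire
// it on a later pass.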
7374void MDCache::standby_trim_segment(LogSegment *ls)
7375{
494da23a
TL
7376 auto try_trim_inode = [this](CInode *in) {
7377 if (in->get_num_ref() == 0 &&
7378 !in->item_open_file.is_on_list() &&
7379 in->parent != NULL &&
7380 in->parent->get_num_ref() == 0){
7381 touch_dentry_bottom(in->parent);
7382 }
7383 };
7384
7385 auto try_trim_dentry = [this](CDentry *dn) {
7386 if (dn->get_num_ref() > 0)
7387 return;
7388 auto in = dn->get_linkage()->inode;
7389 if(in && in->item_open_file.is_on_list())
7390 return;
7391 touch_dentry_bottom(dn);
7392 };
7393
7c673cae
FG
7394 ls->new_dirfrags.clear_list();
7395 ls->open_files.clear_list();
7396
7397 while (!ls->dirty_dirfrags.empty()) {
7398 CDir *dir = ls->dirty_dirfrags.front();
7399 dir->mark_clean();
494da23a
TL
7400 if (dir->inode)
7401 try_trim_inode(dir->inode);
7c673cae
FG
7402 }
7403 while (!ls->dirty_inodes.empty()) {
7404 CInode *in = ls->dirty_inodes.front();
7405 in->mark_clean();
494da23a 7406 try_trim_inode(in);
7c673cae
FG
7407 }
7408 while (!ls->dirty_dentries.empty()) {
7409 CDentry *dn = ls->dirty_dentries.front();
7410 dn->mark_clean();
494da23a 7411 try_trim_dentry(dn);
7c673cae
FG
7412 }
7413 while (!ls->dirty_parent_inodes.empty()) {
7414 CInode *in = ls->dirty_parent_inodes.front();
7415 in->clear_dirty_parent();
494da23a 7416 try_trim_inode(in);
7c673cae
FG
7417 }
7418 while (!ls->dirty_dirfrag_dir.empty()) {
7419 CInode *in = ls->dirty_dirfrag_dir.front();
7420 in->filelock.remove_dirty();
494da23a 7421 try_trim_inode(in);
7c673cae
FG
7422 }
7423 while (!ls->dirty_dirfrag_nest.empty()) {
7424 CInode *in = ls->dirty_dirfrag_nest.front();
7425 in->nestlock.remove_dirty();
494da23a 7426 try_trim_inode(in);
7c673cae
FG
7427 }
7428 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7429 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7430 in->dirfragtreelock.remove_dirty();
494da23a 7431 try_trim_inode(in);
7c673cae 7432 }
eafe8130
TL
7433 while (!ls->truncating_inodes.empty()) {
7434 auto it = ls->truncating_inodes.begin();
7435 CInode *in = *it;
7436 ls->truncating_inodes.erase(it);
7437 in->put(CInode::PIN_TRUNCATING);
7438 try_trim_inode(in);
7439 }
7c673cae
FG
7440}
7441
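// Handle an MCacheExpire from a replica. Each inode/dir/dentry expire
// is honoured only if its nonce matches our current replica nonce for
// that peer; a stale nonce means the object was re-replicated after the
// peer queued the expire, so it is dropped. Expires against a dirfrag
// we no longer own, or are in the middle of exporting to the sender,
// are parked in delayed_expire and replayed by process_delayed_expire().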
9f95a23c 7442void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
7c673cae
FG
7443{
7444 mds_rank_t from = mds_rank_t(m->get_from());
7445
7446 dout(7) << "cache_expire from mds." << from << dendl;
7447
7448 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
7449 return;
7450 }
7451
7452 set<SimpleLock *> gather_locks;
7453 // loop over realms
11fdf7f2 7454 for (const auto &p : m->realms) {
7c673cae 7455 // check container?
11fdf7f2
TL
7456 if (p.first.ino > 0) {
7457 CInode *expired_inode = get_inode(p.first.ino);
7458 ceph_assert(expired_inode); // we had better have this.
7459 CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
7460 ceph_assert(parent_dir);
7c673cae
FG
7461
7462 int export_state = -1;
7463 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7464 export_state = migrator->get_export_state(parent_dir);
11fdf7f2 7465 ceph_assert(export_state >= 0);
7c673cae
FG
7466 }
7467
7468 if (!parent_dir->is_auth() ||
7469 (export_state != -1 &&
7470 ((export_state == Migrator::EXPORT_WARNING &&
7471 migrator->export_has_warned(parent_dir,from)) ||
7472 export_state == Migrator::EXPORT_EXPORTING ||
7473 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7474 (export_state == Migrator::EXPORT_NOTIFYING &&
7475 !migrator->export_has_notified(parent_dir,from))))) {
7476
7477 // not auth.
7478 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
11fdf7f2 7479 ceph_assert(parent_dir->is_frozen_tree_root());
7c673cae
FG
7480
7481 // make a message container
11fdf7f2
TL
7482
7483 auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
7484 if (em.second)
9f95a23c 7485 em.first->second = make_message<MCacheExpire>(from); /* new */
11fdf7f2 7486
7c673cae 7487 // merge these expires into it
11fdf7f2 7488 em.first->second->add_realm(p.first, p.second);
7c673cae
FG
7489 continue;
7490 }
11fdf7f2 7491 ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
7c673cae
FG
7492 (export_state == Migrator::EXPORT_WARNING &&
7493 !migrator->export_has_warned(parent_dir, from)));
7494
7495 dout(7) << "expires for " << *parent_dir << dendl;
7496 } else {
7497 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7498 }
7499
7500 // INODES
11fdf7f2
TL
7501 for (const auto &q : p.second.inodes) {
7502 CInode *in = get_inode(q.first);
7503 unsigned nonce = q.second;
7c673cae
FG
7504
7505 if (!in) {
11fdf7f2 7506 dout(0) << " inode expire on " << q.first << " from " << from
7c673cae 7507 << ", don't have it" << dendl;
11fdf7f2 7508 ceph_assert(in);
7c673cae 7509 }
11fdf7f2 7510 ceph_assert(in->is_auth());
7c673cae
FG
7511 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7512
7513 // check nonce
7514 if (nonce == in->get_replica_nonce(from)) {
7515 // remove from our cached_by
7516 dout(7) << " inode expire on " << *in << " from mds." << from
7517 << " cached_by was " << in->get_replicas() << dendl;
7518 inode_remove_replica(in, from, false, gather_locks);
7519 }
7520 else {
7521 // this is an old nonce, ignore expire.
7522 dout(7) << " inode expire on " << *in << " from mds." << from
7523 << " with old nonce " << nonce
7524 << " (current " << in->get_replica_nonce(from) << "), dropping"
7525 << dendl;
7526 }
7527 }
7528
7529 // DIRS
11fdf7f2
TL
7530 for (const auto &q : p.second.dirs) {
7531 CDir *dir = get_dirfrag(q.first);
7532 unsigned nonce = q.second;
7c673cae
FG
7533
7534 if (!dir) {
11fdf7f2 7535 CInode *diri = get_inode(q.first.ino);
7c673cae
FG
7536 if (diri) {
7537 if (mds->is_rejoin() &&
7538 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7539 !diri->is_replica(from)) {
9f95a23c 7540 auto&& ls = diri->get_nested_dirfrags();
11fdf7f2 7541 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae 7542 << " while rejoining, inode isn't replicated" << dendl;
9f95a23c
TL
7543 for (const auto& d : ls) {
7544 dir = d;
7c673cae
FG
7545 if (dir->is_replica(from)) {
7546 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7547 dir->remove_replica(from);
7548 }
7549 }
7550 continue;
7551 }
11fdf7f2 7552 CDir *other = diri->get_approx_dirfrag(q.first.frag);
7c673cae 7553 if (other) {
11fdf7f2 7554 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7555 << " have " << *other << ", mismatched frags, dropping" << dendl;
7556 continue;
7557 }
7558 }
11fdf7f2 7559 dout(0) << " dir expire on " << q.first << " from " << from
7c673cae 7560 << ", don't have it" << dendl;
11fdf7f2 7561 ceph_assert(dir);
7c673cae
FG
7562 }
7563 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7564
11fdf7f2 7565 ceph_assert(dir->is_auth());
7c673cae
FG
7566
7567 // check nonce
7568 if (nonce == dir->get_replica_nonce(from)) {
7569 // remove from our cached_by
7570 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7571 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7572 dir->remove_replica(from);
7573 }
7574 else {
7575 // this is an old nonce, ignore expire.
7576 dout(7) << " dir expire on " << *dir << " from mds." << from
7577 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7578 << "), dropping" << dendl;
7579 }
7580 }
7581
7582 // DENTRIES
11fdf7f2
TL
7583 for (const auto &pd : p.second.dentries) {
7584 dout(10) << " dn expires in dir " << pd.first << dendl;
7585 CInode *diri = get_inode(pd.first.ino);
7586 ceph_assert(diri);
7587 CDir *dir = diri->get_dirfrag(pd.first.frag);
7c673cae
FG
7588
7589 if (!dir) {
11fdf7f2 7590 dout(0) << " dn expires on " << pd.first << " from " << from
7c673cae
FG
7591 << ", must have refragmented" << dendl;
7592 } else {
11fdf7f2 7593 ceph_assert(dir->is_auth());
7c673cae
FG
7594 }
7595
11fdf7f2
TL
7596 for (const auto &p : pd.second) {
7597 unsigned nonce = p.second;
7c673cae
FG
7598 CDentry *dn;
7599
7600 if (dir) {
11fdf7f2 7601 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7602 } else {
7603 // which dirfrag for this dentry?
11fdf7f2
TL
7604 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
7605 ceph_assert(dir);
7606 ceph_assert(dir->is_auth());
7607 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7608 }
7609
7610 if (!dn) {
7611 if (dir)
11fdf7f2 7612 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
7c673cae 7613 else
11fdf7f2 7614 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
7c673cae 7615 }
11fdf7f2 7616 ceph_assert(dn);
7c673cae
FG
7617
7618 if (nonce == dn->get_replica_nonce(from)) {
7619 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7620 dentry_remove_replica(dn, from, gather_locks);
7621 }
7622 else {
7623 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7624 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7625 << "), dropping" << dendl;
7626 }
7627 }
7628 }
7629 }
7630
7c673cae
FG
7631 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7632 if (!(*p)->is_stable())
7633 mds->locker->eval_gather(*p);
7634 }
7635}
7636
7637void MDCache::process_delayed_expire(CDir *dir)
7638{
7639 dout(7) << "process_delayed_expire on " << *dir << dendl;
11fdf7f2
TL
7640 for (const auto &p : delayed_expire[dir]) {
7641 handle_cache_expire(p.second);
7642 }
7c673cae
FG
7643 delayed_expire.erase(dir);
7644}
7645
7646void MDCache::discard_delayed_expire(CDir *dir)
7647{
7648 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7c673cae
FG
7649 delayed_expire.erase(dir);
7650}
7651
7652void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7653 set<SimpleLock *>& gather_locks)
7654{
7655 in->remove_replica(from);
11fdf7f2 7656 in->set_mds_caps_wanted(from, 0);
7c673cae
FG
7657
7658 // note: this code calls _eval more often than it needs to!
7659 // fix lock
7660 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7661 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7662 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7663 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7664 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7665 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7666
 7667 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
 7668 // don't remove the recovering mds from the lock's gathering list,
 7669 // because it may hold rejoined wrlocks.
7670 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7671 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7672 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7673}
7674
7675void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7676{
7677 dn->remove_replica(from);
7678
7679 // fix lock
7680 if (dn->lock.remove_replica(from))
7681 gather_locks.insert(&dn->lock);
7682
 7683 // Replicated strays might now be eligible for purge
11fdf7f2 7684 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7c673cae
FG
7685 if (dnl->is_primary()) {
7686 maybe_eval_stray(dnl->get_inode());
7687 }
7688}
7689
7690void MDCache::trim_client_leases()
7691{
7692 utime_t now = ceph_clock_now();
7693
7694 dout(10) << "trim_client_leases" << dendl;
7695
eafe8130
TL
7696 std::size_t pool = 0;
7697 for (const auto& list : client_leases) {
7698 pool += 1;
7699 if (list.empty())
7c673cae
FG
7700 continue;
7701
eafe8130
TL
7702 auto before = list.size();
7703 while (!list.empty()) {
7704 ClientLease *r = list.front();
7c673cae
FG
7705 if (r->ttl > now) break;
7706 CDentry *dn = static_cast<CDentry*>(r->parent);
7707 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7708 dn->remove_client_lease(r, mds->locker);
7709 }
eafe8130 7710 auto after = list.size();
7c673cae
FG
7711 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7712 << (before-after) << " leases, " << after << " left" << dendl;
7713 }
7714}
7715
7c673cae
FG
7716void MDCache::check_memory_usage()
7717{
7718 static MemoryModel mm(g_ceph_context);
7719 static MemoryModel::snap last;
7720 mm.sample(&last);
7721 static MemoryModel::snap baseline = last;
7722
7723 // check client caps
11fdf7f2 7724 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7725 double caps_per_inode = 0.0;
7c673cae 7726 if (CInode::count())
181888fb 7727 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae 7728
a8e16298 7729 dout(2) << "Memory usage: "
7c673cae
FG
7730 << " total " << last.get_total()
7731 << ", rss " << last.get_rss()
7732 << ", heap " << last.get_heap()
7733 << ", baseline " << baseline.get_heap()
7c673cae
FG
7734 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7735 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7736 << dendl;
7737
c07f9fc5 7738 mds->update_mlogger();
7c673cae
FG
7739 mds->mlogger->set(l_mdm_rss, last.get_rss());
7740 mds->mlogger->set(l_mdm_heap, last.get_heap());
7c673cae
FG
7741}
7742
7743
7744
7745// =========================================================================================
7746// shutdown
7747
7748class C_MDC_ShutdownCheck : public MDCacheContext {
7749public:
7750 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7751 void finish(int) override {
7752 mdcache->shutdown_check();
7753 }
7754};
7755
7756void MDCache::shutdown_check()
7757{
7758 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7759
7760 // cache
7761 char old_val[32] = { 0 };
7762 char *o = old_val;
11fdf7f2
TL
7763 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7764 g_conf().set_val("debug_mds", "10");
7765 g_conf().apply_changes(nullptr);
7c673cae 7766 show_cache();
11fdf7f2
TL
7767 g_conf().set_val("debug_mds", old_val);
7768 g_conf().apply_changes(nullptr);
7769 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae
FG
7770
7771 // this
31f18b77 7772 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7773 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7774
7775
7776 if (mds->objecter->is_active()) {
7777 dout(0) << "objecter still active" << dendl;
7778 mds->objecter->dump_active();
7779 }
7780}
7781
7782
7783void MDCache::shutdown_start()
7784{
a8e16298 7785 dout(5) << "shutdown_start" << dendl;
7c673cae 7786
11fdf7f2
TL
7787 if (g_conf()->mds_shutdown_check)
7788 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae 7789
11fdf7f2 7790 // g_conf()->debug_mds = 10;
7c673cae
FG
7791}
7792
7793
7794
7795bool MDCache::shutdown_pass()
7796{
7797 dout(7) << "shutdown_pass" << dendl;
7798
7799 if (mds->is_stopped()) {
7800 dout(7) << " already shut down" << dendl;
7801 show_cache();
7802 show_subtrees();
7803 return true;
7804 }
7805
7806 // empty stray dir
28e407b8 7807 bool strays_all_exported = shutdown_export_strays();
7c673cae
FG
7808
7809 // trim cache
181888fb 7810 trim(UINT64_MAX);
31f18b77 7811 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7812
f6b5b4d7
TL
7813
7814 {
7815 dout(10) << "Migrating any ephemerally pinned inodes" << dendl;
7816 /* copy to vector to avoid removals during iteration */
7817 std::vector<CInode*> migrate;
7818 migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
7819 for (auto& in : migrate) {
7820 in->maybe_ephemeral_rand();
7821 }
7822 migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
7823 for (auto& in : migrate) {
7824 in->maybe_ephemeral_dist();
7825 }
7826 mds->balancer->handle_export_pins();
7827 }
7828
28e407b8 7829 // Export all subtrees to another active MDS (usually rank 0) if we are not rank 0
7c673cae 7830 int num_auth_subtree = 0;
f6b5b4d7
TL
7831 if (!subtrees.empty() && mds->get_nodeid() != 0) {
7832 dout(7) << "looking for subtrees to export" << dendl;
9f95a23c 7833 std::vector<CDir*> ls;
f6b5b4d7
TL
7834 for (auto& [dir, bounds] : subtrees) {
7835 dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
7836 if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
7c673cae 7837 continue;
f6b5b4d7
TL
7838 num_auth_subtree++;
7839 if (dir->is_frozen() ||
7840 dir->is_freezing() ||
7841 dir->is_ambiguous_dir_auth() ||
7842 dir->state_test(CDir::STATE_EXPORTING) ||
7843 dir->get_inode()->is_ephemerally_pinned()) {
7844 continue;
7c673cae 7845 }
f6b5b4d7 7846 ls.push_back(dir);
7c673cae 7847 }
28e407b8
AA
7848
7849 migrator->clear_export_queue();
f6b5b4d7 7850
9f95a23c 7851 for (const auto& dir : ls) {
7c673cae
FG
7852 mds_rank_t dest = dir->get_inode()->authority().first;
7853 if (dest > 0 && !mds->mdsmap->is_active(dest))
7854 dest = 0;
7855 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7856 migrator->export_dir_nicely(dir, dest);
7857 }
7858 }
7859
28e407b8
AA
7860 if (!strays_all_exported) {
7861 dout(7) << "waiting for strays to migrate" << dendl;
7862 return false;
7863 }
7864
7c673cae 7865 if (num_auth_subtree > 0) {
11fdf7f2 7866 ceph_assert(mds->get_nodeid() > 0);
7c673cae
FG
7867 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7868 show_subtrees();
7869 return false;
7870 }
7871
7872 // close out any sessions (and open files!) before we try to trim the log, etc.
7873 if (mds->sessionmap.have_unclosed_sessions()) {
7874 if (!mds->server->terminating_sessions)
7875 mds->server->terminate_sessions();
7876 return false;
7877 }
7878
28e407b8
AA
7879 // Fully trim the log so that all objects in cache are clean and may be
 7880 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
 7881 // trim the log in a way that guarantees the cache eventually becomes clean.
f64942e4
AA
7882 if (mds->mdlog->get_num_segments() > 0) {
7883 auto ls = mds->mdlog->get_current_segment();
7884 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7885 // Current segment contains events other than subtreemap or
7886 // there are dirty dirfrags (see CDir::log_mark_dirty())
7887 mds->mdlog->start_new_segment();
7888 mds->mdlog->flush();
7889 }
7890 }
7891 mds->mdlog->trim_all();
28e407b8
AA
7892 if (mds->mdlog->get_num_segments() > 1) {
7893 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7894 return false;
7895 }
7896
7897 // drop our reference to our stray dir inode
7898 for (int i = 0; i < NUM_STRAY; ++i) {
7899 if (strays[i] &&
7900 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7901 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7902 strays[i]->put(CInode::PIN_STRAY);
7903 strays[i]->put_stickydirs();
7904 }
7905 }
7906
7c673cae
FG
7907 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7908 if (mydir && !mydir->is_subtree_root())
7909 mydir = NULL;
7910
7911 // subtrees map not empty yet?
7912 if (subtrees.size() > (mydir ? 1 : 0)) {
7913 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7914 show_subtrees();
7915 migrator->show_importing();
7916 migrator->show_exporting();
7917 if (!migrator->is_importing() && !migrator->is_exporting())
7918 show_cache();
7919 return false;
7920 }
11fdf7f2
TL
7921 ceph_assert(!migrator->is_exporting());
7922 ceph_assert(!migrator->is_importing());
7c673cae 7923
f64942e4
AA
7924 // replicas may dirty scatter locks
7925 if (myin && myin->is_replicated()) {
7926 dout(7) << "still have replicated objects" << dendl;
7927 return false;
7928 }
7929
11fdf7f2
TL
7930 if ((myin && myin->get_num_auth_pins()) ||
7931 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
181888fb
FG
7932 dout(7) << "still have auth pinned objects" << dendl;
7933 return false;
7934 }
7935
7c673cae
FG
7936 // (only do this once!)
7937 if (!mds->mdlog->is_capped()) {
7938 dout(7) << "capping the log" << dendl;
7939 mds->mdlog->cap();
7c673cae
FG
7940 }
7941
f64942e4
AA
7942 if (!mds->mdlog->empty())
7943 mds->mdlog->trim(0);
7944
7c673cae
FG
7945 if (!mds->mdlog->empty()) {
7946 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7947 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7948 return false;
7949 }
7950
7951 if (!did_shutdown_log_cap) {
7952 // flush journal header
7953 dout(7) << "writing header for (now-empty) journal" << dendl;
11fdf7f2 7954 ceph_assert(mds->mdlog->empty());
7c673cae
FG
7955 mds->mdlog->write_head(0);
7956 // NOTE: filer active checker below will block us until this completes.
7957 did_shutdown_log_cap = true;
7958 return false;
7959 }
7960
7961 // filer active?
7962 if (mds->objecter->is_active()) {
7963 dout(7) << "objecter still active" << dendl;
7964 mds->objecter->dump_active();
7965 return false;
7966 }
7967
7968 // trim what we can from the cache
31f18b77
FG
7969 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7970 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7971 show_cache();
7972 //dump();
7973 return false;
7974 }
31f18b77
FG
7975
7976 // make mydir subtree go away
7977 if (mydir) {
7978 if (mydir->get_num_ref() > 1) { // subtree pin
7979 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7980 show_cache();
7981 return false;
7982 }
7983
7984 remove_subtree(mydir);
7985 myin->close_dirfrag(mydir->get_frag());
7986 }
11fdf7f2 7987 ceph_assert(subtrees.empty());
31f18b77 7988
1adf2230 7989 if (myin) {
31f18b77 7990 remove_inode(myin);
11fdf7f2 7991 ceph_assert(!myin);
1adf2230
AA
7992 }
7993
11fdf7f2
TL
7994 if (global_snaprealm) {
7995 remove_inode(global_snaprealm->inode);
7996 global_snaprealm = nullptr;
7997 }
7998
7c673cae 7999 // done!
a8e16298 8000 dout(5) << "shutdown done." << dendl;
7c673cae
FG
8001 return true;
8002}
8003
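// During shutdown, ranks other than 0 migrate their stray dentries to
// rank 0 in batches of at most MAX_EXPORTING. shutdown_export_next is
// a (dirfrag, dentry name) cursor so the scan over the stray dirs can
// resume where it left off, and shutdown_exporting_strays tracks inodes
// whose export/purge is still in flight. Returns true only once every
// stray has been handed off or purged.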
8004bool MDCache::shutdown_export_strays()
8005{
f64942e4
AA
8006 static const unsigned MAX_EXPORTING = 100;
8007
7c673cae
FG
8008 if (mds->get_nodeid() == 0)
8009 return true;
f64942e4
AA
8010
8011 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
8012 return false;
8013
8014 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
8015 << " '" << shutdown_export_next.second << "'" << dendl;
7c673cae
FG
8016
8017 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
f64942e4 8018 bool all_exported = false;
7c673cae 8019
f64942e4
AA
8020again:
8021 auto next = shutdown_export_next;
7c673cae 8022
7c673cae 8023 for (int i = 0; i < NUM_STRAY; ++i) {
f64942e4
AA
8024 CInode *strayi = strays[i];
8025 if (!strayi ||
8026 !strayi->state_test(CInode::STATE_STRAYPINNED))
8027 continue;
8028 if (strayi->ino() < next.first.ino)
7c673cae 8029 continue;
7c673cae 8030
f64942e4
AA
8031 deque<CDir*> dfls;
8032 strayi->get_dirfrags(dfls);
7c673cae 8033
f64942e4
AA
8034 while (!dfls.empty()) {
8035 CDir *dir = dfls.front();
8036 dfls.pop_front();
8037
8038 if (dir->dirfrag() < next.first)
7c673cae 8039 continue;
f64942e4
AA
8040 if (next.first < dir->dirfrag()) {
8041 next.first = dir->dirfrag();
8042 next.second.clear();
8043 }
8044
8045 if (!dir->is_complete()) {
11fdf7f2 8046 MDSContext *fin = nullptr;
f64942e4
AA
8047 if (shutdown_exporting_strays.empty()) {
8048 fin = new MDSInternalContextWrapper(mds,
9f95a23c 8049 new LambdaContext([this](int r) {
f64942e4
AA
8050 shutdown_export_strays();
8051 })
8052 );
8053 }
8054 dir->fetch(fin);
8055 goto done;
7c673cae
FG
8056 }
8057
f64942e4
AA
8058 CDir::dentry_key_map::iterator it;
8059 if (next.second.empty()) {
8060 it = dir->begin();
7c673cae 8061 } else {
f64942e4
AA
8062 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
8063 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
7c673cae 8064 }
f64942e4
AA
8065
8066 for (; it != dir->end(); ++it) {
8067 CDentry *dn = it->second;
8068 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8069 if (dnl->is_null())
8070 continue;
8071
8072 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
11fdf7f2 8073 next.second = it->first.name;
f64942e4
AA
8074 goto done;
8075 }
8076
8077 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
8078 if (!ret.second) {
8079 dout(10) << "already exporting/purging " << *dn << dendl;
8080 continue;
8081 }
8082
8083 // Don't try to migrate anything that is actually
8084 // being purged right now
8085 if (!dn->state_test(CDentry::STATE_PURGING))
8086 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
8087
8088 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
8089 ++it;
8090 if (it != dir->end()) {
11fdf7f2 8091 next.second = it->first.name;
f64942e4
AA
8092 } else {
8093 if (dfls.empty())
8094 next.first.ino.val++;
8095 else
8096 next.first = dfls.front()->dirfrag();
8097 next.second.clear();
8098 }
8099 goto done;
8100 }
8101 }
8102 }
8103 }
8104
8105 if (shutdown_exporting_strays.empty()) {
8106 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
8107 if (first_df < shutdown_export_next.first ||
8108 !shutdown_export_next.second.empty()) {
8109 shutdown_export_next.first = first_df;
8110 shutdown_export_next.second.clear();
8111 goto again;
7c673cae 8112 }
f64942e4 8113 all_exported = true;
7c673cae
FG
8114 }
8115
f64942e4
AA
8116done:
8117 shutdown_export_next = next;
8118 return all_exported;
7c673cae
FG
8119}
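/* Editor's note: the function above exports strays in bounded batches,
 * remembering a (dirfrag, dentry-name) cursor in shutdown_export_next so
 * the next call resumes where this one stopped.  Below is a minimal,
 * hypothetical sketch of that resumable-cursor idiom over a plain sorted
 * container; the names (BatchWalker, walk_some, ...) are illustrative and
 * are not part of MDCache. */
#include <cstddef>
#include <iterator>
#include <map>
#include <string>

namespace example {

struct BatchWalker {
  std::map<std::string, int> items;   // stand-in for the stray dentries
  std::string resume_key;             // stand-in for shutdown_export_next
  std::size_t max_batch = 100;        // cf. MAX_EXPORTING above

  // Visit at most max_batch items per call; return true once everything
  // has been visited.
  template <typename Fn>
  bool walk_some(Fn&& visit) {
    std::size_t done = 0;
    auto it = resume_key.empty() ? items.begin()
                                 : items.lower_bound(resume_key);
    for (; it != items.end(); ++it) {
      visit(it->first, it->second);
      if (++done >= max_batch) {
        auto next = std::next(it);
        if (next == items.end())
          break;                      // batch boundary was also the end
        resume_key = next->first;     // resume from here next call
        return false;
      }
    }
    resume_key.clear();
    return true;                      // walked everything
  }
};

} // namespace example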
8120
8121// ========= messaging ==============
8122
9f95a23c 8123void MDCache::dispatch(const cref_t<Message> &m)
7c673cae
FG
8124{
8125 switch (m->get_type()) {
8126
8127 // RESOLVE
8128 case MSG_MDS_RESOLVE:
9f95a23c 8129 handle_resolve(ref_cast<MMDSResolve>(m));
7c673cae
FG
8130 break;
8131 case MSG_MDS_RESOLVEACK:
9f95a23c 8132 handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
7c673cae
FG
8133 break;
8134
8135 // REJOIN
8136 case MSG_MDS_CACHEREJOIN:
9f95a23c 8137 handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
7c673cae
FG
8138 break;
8139
8140 case MSG_MDS_DISCOVER:
9f95a23c 8141 handle_discover(ref_cast<MDiscover>(m));
7c673cae
FG
8142 break;
8143 case MSG_MDS_DISCOVERREPLY:
9f95a23c 8144 handle_discover_reply(ref_cast<MDiscoverReply>(m));
7c673cae
FG
8145 break;
8146
8147 case MSG_MDS_DIRUPDATE:
9f95a23c 8148 handle_dir_update(ref_cast<MDirUpdate>(m));
7c673cae
FG
8149 break;
8150
8151 case MSG_MDS_CACHEEXPIRE:
9f95a23c 8152 handle_cache_expire(ref_cast<MCacheExpire>(m));
7c673cae
FG
8153 break;
8154
8155 case MSG_MDS_DENTRYLINK:
9f95a23c 8156 handle_dentry_link(ref_cast<MDentryLink>(m));
7c673cae
FG
8157 break;
8158 case MSG_MDS_DENTRYUNLINK:
9f95a23c 8159 handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
7c673cae
FG
8160 break;
8161
8162 case MSG_MDS_FRAGMENTNOTIFY:
9f95a23c 8163 handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
7c673cae 8164 break;
a8e16298 8165 case MSG_MDS_FRAGMENTNOTIFYACK:
9f95a23c 8166 handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
a8e16298 8167 break;
7c673cae
FG
8168
8169 case MSG_MDS_FINDINO:
9f95a23c 8170 handle_find_ino(ref_cast<MMDSFindIno>(m));
7c673cae
FG
8171 break;
8172 case MSG_MDS_FINDINOREPLY:
9f95a23c 8173 handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
7c673cae
FG
8174 break;
8175
8176 case MSG_MDS_OPENINO:
9f95a23c 8177 handle_open_ino(ref_cast<MMDSOpenIno>(m));
7c673cae
FG
8178 break;
8179 case MSG_MDS_OPENINOREPLY:
9f95a23c 8180 handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
11fdf7f2
TL
8181 break;
8182
8183 case MSG_MDS_SNAPUPDATE:
9f95a23c 8184 handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
7c673cae
FG
8185 break;
8186
8187 default:
8188 derr << "cache unknown message " << m->get_type() << dendl;
11fdf7f2 8189 ceph_abort_msg("cache unknown message");
7c673cae
FG
8190 }
8191}
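/* Editor's note: dispatch() above fans incoming messages out purely on
 * m->get_type().  The following is a simplified, hypothetical sketch of
 * that type-tag dispatch shape; none of these types exist in Ceph. */
#include <cstdlib>
#include <iostream>
#include <memory>

namespace example {

enum MsgType { MSG_RESOLVE = 1, MSG_DISCOVER = 2, MSG_CACHE_EXPIRE = 3 };

struct Msg { MsgType type; };

struct Dispatcher {
  void handle_resolve(const Msg&)      { std::cout << "resolve\n"; }
  void handle_discover(const Msg&)     { std::cout << "discover\n"; }
  void handle_cache_expire(const Msg&) { std::cout << "expire\n"; }

  void dispatch(const std::shared_ptr<const Msg>& m) {
    switch (m->type) {
    case MSG_RESOLVE:      handle_resolve(*m); break;
    case MSG_DISCOVER:     handle_discover(*m); break;
    case MSG_CACHE_EXPIRE: handle_cache_expire(*m); break;
    default:
      std::cerr << "unknown message " << m->type << "\n";
      std::abort();   // the real dispatch() also aborts on unknown types
    }
  }
};

} // namespace example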
8192
9f95a23c
TL
8193int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
8194 const filepath& path, int flags,
8195 vector<CDentry*> *pdnvec, CInode **pin)
7c673cae 8196{
9f95a23c
TL
8197 bool discover = (flags & MDS_TRAVERSE_DISCOVER);
8198 bool forward = !discover;
8199 bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
8200 bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
8201 bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
8202 bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
8203 bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
8204 bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
8205 bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
7c673cae 8206
9f95a23c
TL
8207 if (forward)
8208 ceph_assert(mdr); // forward requires a request
7c673cae
FG
8209
8210 snapid_t snapid = CEPH_NOSNAP;
8211 if (mdr)
8212 mdr->snapid = snapid;
8213
8214 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
8215
8216 if (mds->logger) mds->logger->inc(l_mds_traverse);
8217
8218 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8219 CInode *cur = get_inode(path.get_ino());
9f95a23c
TL
8220 if (!cur) {
8221 if (MDS_INO_IS_MDSDIR(path.get_ino())) {
11fdf7f2 8222 open_foreign_mdsdir(path.get_ino(), cf.build());
9f95a23c 8223 return 1;
7c673cae 8224 }
9f95a23c
TL
8225 if (MDS_INO_IS_STRAY(path.get_ino())) {
8226 mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
8227 unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
8228 filepath path(strays[idx]->get_parent_dn()->get_name(),
8229 MDS_INO_MDSDIR(rank));
8230 MDRequestRef null_ref;
8231 return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
8232 }
8233 return -ESTALE;
7c673cae
FG
8234 }
8235 if (cur->state_test(CInode::STATE_PURGING))
8236 return -ESTALE;
8237
8238 // make sure snaprealm are open...
11fdf7f2
TL
8239 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8240 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8241 return 1;
8242 }
8243
9f95a23c
TL
8244 if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
8245 mds->locker->find_and_attach_lock_cache(mdr, cur);
8246
8247 if (mdr && mdr->lock_cache) {
8248 if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
8249 mdr->dir_layout = mdr->lock_cache->get_dir_layout();
8250 } else if (rdlock_snap) {
8251 int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
8252 if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
8253 (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
8254 bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
8255 if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
8256 return 1;
8257 }
8258 }
8259
7c673cae
FG
8260 // start trace
8261 if (pdnvec)
8262 pdnvec->clear();
8263 if (pin)
8264 *pin = cur;
8265
9f95a23c
TL
8266 MutationImpl::LockOpVec lov;
8267
8268 for (unsigned depth = 0; depth < path.depth(); ) {
7c673cae
FG
8269 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8270 << "' snapid " << snapid << dendl;
8271
8272 if (!cur->is_dir()) {
8273 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8274 return -ENOTDIR;
8275 }
8276
8277 // walk into snapdir?
8278 if (path[depth].length() == 0) {
8279 dout(10) << "traverse: snapdir" << dendl;
9f95a23c 8280 if (!mdr || depth > 0) // snapdir must be the first component
7c673cae
FG
8281 return -EINVAL;
8282 snapid = CEPH_SNAPDIR;
8283 mdr->snapid = snapid;
8284 depth++;
8285 continue;
8286 }
8287 // walk thru snapdir?
8288 if (snapid == CEPH_SNAPDIR) {
8289 if (!mdr)
8290 return -EINVAL;
8291 SnapRealm *realm = cur->find_snaprealm();
8292 snapid = realm->resolve_snapname(path[depth], cur->ino());
8293 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
11fdf7f2 8294 if (!snapid) {
9f95a23c
TL
8295 if (pdnvec)
 8296 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
7c673cae 8297 return -ENOENT;
11fdf7f2 8298 }
7c673cae
FG
8299 mdr->snapid = snapid;
8300 depth++;
8301 continue;
8302 }
8303
8304 // open dir
8305 frag_t fg = cur->pick_dirfrag(path[depth]);
8306 CDir *curdir = cur->get_dirfrag(fg);
8307 if (!curdir) {
8308 if (cur->is_auth()) {
8309 // parent dir frozen_dir?
8310 if (cur->is_frozen()) {
8311 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
11fdf7f2 8312 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
7c673cae
FG
8313 return 1;
8314 }
8315 curdir = cur->get_or_open_dirfrag(this, fg);
8316 } else {
8317 // discover?
8318 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
11fdf7f2 8319 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
9f95a23c 8320 path_locked);
7c673cae
FG
8321 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8322 return 1;
8323 }
8324 }
11fdf7f2 8325 ceph_assert(curdir);
7c673cae
FG
8326
8327#ifdef MDS_VERIFY_FRAGSTAT
8328 if (curdir->is_complete())
8329 curdir->verify_fragstat();
8330#endif
8331
8332 // frozen?
8333 /*
8334 if (curdir->is_frozen()) {
8335 // doh!
8336 // FIXME: traverse is allowed?
8337 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8338 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8339 if (onfinish) delete onfinish;
8340 return 1;
8341 }
8342 */
8343
9f95a23c
TL
8344 if (want_auth && want_dentry && depth == path.depth() - 1) {
8345 if (curdir->is_ambiguous_auth()) {
8346 dout(10) << "waiting for single auth on " << *curdir << dendl;
8347 curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8348 return 1;
8349 }
8350 if (!curdir->is_auth()) {
8351 dout(10) << "fw to auth for " << *curdir << dendl;
8352 request_forward(mdr, curdir->authority().first);
8353 return 2;
8354 }
8355 }
8356
7c673cae
FG
8357 // Before doing dirfrag->dn lookup, compare with DamageTable's
8358 // record of which dentries were unreadable
8359 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8360 dout(4) << "traverse: stopped lookup at damaged dentry "
8361 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8362 return -EIO;
8363 }
8364
8365 // dentry
8366 CDentry *dn = curdir->lookup(path[depth], snapid);
9f95a23c
TL
8367 if (dn) {
8368 if (dn->state_test(CDentry::STATE_PURGING))
8369 return -ENOENT;
8370
8371 if (rdlock_path) {
8372 lov.clear();
8373 if (xlock_dentry && depth == path.depth() - 1) {
8374 if (depth > 0 || !mdr->lock_cache) {
8375 lov.add_wrlock(&cur->filelock);
8376 lov.add_wrlock(&cur->nestlock);
8377 if (rdlock_authlock)
8378 lov.add_rdlock(&cur->authlock);
8379 }
8380 lov.add_xlock(&dn->lock);
8381 } else {
8382 // force client to flush async dir operation if necessary
8383 if (cur->filelock.is_cached())
8384 lov.add_wrlock(&cur->filelock);
8385 lov.add_rdlock(&dn->lock);
8386 }
8387 if (!mds->locker->acquire_locks(mdr, lov)) {
8388 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8389 return 1;
8390 }
8391 } else if (!path_locked &&
8392 !dn->lock.can_read(client) &&
8393 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8394 dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
8395 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8396 if (mds->logger)
8397 mds->logger->inc(l_mds_traverse_lock);
8398 if (dn->is_auth() && dn->lock.is_unstable_and_locked())
8399 mds->mdlog->flush();
8400 return 1;
8401 }
7c673cae 8402
7c673cae
FG
8403 if (pdnvec)
8404 pdnvec->push_back(dn);
7c673cae 8405
9f95a23c
TL
8406 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8407 // can we conclude ENOENT?
8408 if (dnl->is_null()) {
8409 dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
8410 if (depth == path.depth() - 1) {
8411 if (want_dentry)
8412 break;
8413 } else {
8414 if (pdnvec)
7c673cae
FG
 8415 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
8416 }
9f95a23c 8417 return -ENOENT;
7c673cae 8418 }
7c673cae 8419
7c673cae 8420 // do we have inode?
9f95a23c 8421 CInode *in = dnl->get_inode();
7c673cae 8422 if (!in) {
11fdf7f2 8423 ceph_assert(dnl->is_remote());
7c673cae
FG
8424 // do i have it?
8425 in = get_inode(dnl->get_remote_ino());
8426 if (in) {
8427 dout(7) << "linking in remote in " << *in << dendl;
8428 dn->link_remote(dnl, in);
8429 } else {
8430 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
11fdf7f2 8431 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7c673cae
FG
8432 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8433 dout(4) << "traverse: remote dentry points to damaged ino "
8434 << *dn << dendl;
8435 return -EIO;
8436 }
11fdf7f2 8437 open_remote_dentry(dn, true, cf.build(),
9f95a23c 8438 (path_locked && depth == path.depth() - 1));
7c673cae
FG
8439 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8440 return 1;
9f95a23c 8441 }
7c673cae
FG
8442 }
8443
8444 cur = in;
8445 // make sure snaprealm are open...
11fdf7f2
TL
8446 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8447 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8448 return 1;
8449 }
8450
9f95a23c
TL
8451 if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
8452 lov.clear();
8453 lov.add_rdlock(&cur->snaplock);
8454 if (!mds->locker->acquire_locks(mdr, lov)) {
8455 dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
8456 return 1;
8457 }
8458 }
8459
7c673cae
FG
8460 // add to trace, continue.
8461 touch_inode(cur);
7c673cae
FG
8462 if (pin)
8463 *pin = cur;
8464 depth++;
8465 continue;
8466 }
9f95a23c
TL
8467
8468 ceph_assert(!dn);
7c673cae
FG
8469
8470 // MISS. dentry doesn't exist.
8471 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8472
8473 if (curdir->is_auth()) {
8474 // dentry is mine.
8475 if (curdir->is_complete() ||
8476 (snapid == CEPH_NOSNAP &&
8477 curdir->has_bloom() &&
11fdf7f2 8478 !curdir->is_in_bloom(path[depth]))) {
7c673cae
FG
8479 // file not found
8480 if (pdnvec) {
8481 // instantiate a null dn?
9f95a23c 8482 if (depth < path.depth() - 1) {
7c673cae 8483 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
7c673cae
FG
8484 } else if (snapid < CEPH_MAXSNAP) {
8485 dout(20) << " not adding null for snapid " << snapid << dendl;
9f95a23c
TL
8486 } else if (curdir->is_frozen()) {
8487 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8488 curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8489 return 1;
7c673cae
FG
8490 } else {
8491 // create a null dentry
8492 dn = curdir->add_null_dentry(path[depth]);
8493 dout(20) << " added null " << *dn << dendl;
9f95a23c
TL
8494
8495 if (rdlock_path) {
8496 lov.clear();
8497 if (xlock_dentry) {
8498 if (depth > 0 || !mdr->lock_cache) {
8499 lov.add_wrlock(&cur->filelock);
8500 lov.add_wrlock(&cur->nestlock);
8501 if (rdlock_authlock)
8502 lov.add_rdlock(&cur->authlock);
8503 }
8504 lov.add_xlock(&dn->lock);
8505 } else {
8506 // force client to flush async dir operation if necessary
8507 if (cur->filelock.is_cached())
8508 lov.add_wrlock(&cur->filelock);
8509 lov.add_rdlock(&dn->lock);
8510 }
8511 if (!mds->locker->acquire_locks(mdr, lov)) {
8512 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8513 return 1;
8514 }
8515 }
7c673cae 8516 }
9f95a23c 8517 if (dn) {
7c673cae 8518 pdnvec->push_back(dn);
9f95a23c
TL
8519 if (want_dentry)
8520 break;
8521 } else {
7c673cae 8522 pdnvec->clear(); // do not confuse the likes of rdlock_path_pin_ref();
9f95a23c 8523 }
7c673cae
FG
8524 }
8525 return -ENOENT;
8526 } else {
8527
8528 // Check DamageTable for missing fragments before trying to fetch
8529 // this
8530 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8531 dout(4) << "traverse: damaged dirfrag " << *curdir
8532 << ", blocking fetch" << dendl;
8533 return -EIO;
8534 }
8535
8536 // directory isn't complete; reload
8537 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8538 touch_inode(cur);
11fdf7f2 8539 curdir->fetch(cf.build(), path[depth]);
7c673cae
FG
8540 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8541 return 1;
8542 }
8543 } else {
8544 // dirfrag/dentry is not mine.
8545 mds_authority_t dauth = curdir->authority();
8546
f91f0fd5 8547 if (forward &&
11fdf7f2 8548 mdr && mdr->client_request &&
9f95a23c 8549 (int)depth < mdr->client_request->get_num_fwd()){
7c673cae
FG
8550 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8551 << " < fwd " << mdr->client_request->get_num_fwd()
8552 << ", discovering instead of forwarding" << dendl;
8553 discover = true;
8554 }
8555
9f95a23c 8556 if (discover) {
7c673cae 8557 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
11fdf7f2 8558 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
9f95a23c 8559 path_locked);
7c673cae
FG
8560 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8561 return 1;
8562 }
8563 if (forward) {
8564 // forward
8565 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8566
8567 if (curdir->is_ambiguous_auth()) {
8568 // wait
8569 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
11fdf7f2 8570 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
7c673cae
FG
8571 return 1;
8572 }
8573
8574 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
11fdf7f2
TL
8575
8576 request_forward(mdr, dauth.first);
8577
7c673cae 8578 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
7c673cae 8579 return 2;
11fdf7f2 8580 }
7c673cae 8581 }
11fdf7f2 8582
7c673cae
FG
8583 ceph_abort(); // i shouldn't get here
8584 }
9f95a23c
TL
8585
8586 if (want_auth && !want_dentry) {
8587 if (cur->is_ambiguous_auth()) {
8588 dout(10) << "waiting for single auth on " << *cur << dendl;
8589 cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8590 return 1;
8591 }
8592 if (!cur->is_auth()) {
8593 dout(10) << "fw to auth for " << *cur << dendl;
8594 request_forward(mdr, cur->authority().first);
8595 return 2;
8596 }
8597 }
7c673cae
FG
8598
8599 // success.
8600 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8601 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8602 if (mdr)
11fdf7f2 8603 ceph_assert(mdr->snapid == snapid);
9f95a23c
TL
8604
8605 if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
8606 mdr->locking_state |= MutationImpl::SNAP_LOCKED;
8607 else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
8608 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
8609
8610 if (rdlock_path)
8611 mdr->locking_state |= MutationImpl::PATH_LOCKED;
8612
7c673cae
FG
8613 return 0;
8614}
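/* Editor's note: path_traverse() above follows a return-code convention:
 * 0 means the trace completed, a positive value (1 or 2) means a retry
 * context was queued or the request was forwarded, and a negative value
 * is an errno-style failure.  Below is a hedged, hypothetical sketch of
 * how a caller reacts to that convention; the names are placeholders,
 * not the real Server code. */
#include <functional>
#include <vector>

namespace example {

// Stand-in for path_traverse(); assume it queues a retry callback
// internally whenever it returns a positive value.
using TraverseFn = std::function<int(std::vector<int>* trace)>;

enum class TraverseOutcome { Done, Parked, Failed };

TraverseOutcome run_traverse(const TraverseFn& traverse, int* error_out) {
  std::vector<int> trace;
  int r = traverse(&trace);
  if (r > 0) {
    // 1: a waiter/retry was registered; 2: forwarded to another rank.
    // Either way this caller stops and lets the retry drive the request.
    return TraverseOutcome::Parked;
  }
  if (r < 0) {
    *error_out = r;            // e.g. -ENOENT, -ENOTDIR, -EIO, -ESTALE
    return TraverseOutcome::Failed;
  }
  // r == 0: `trace` would hold the dentries (here just ints).
  return TraverseOutcome::Done;
}

} // namespace example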
8615
8616CInode *MDCache::cache_traverse(const filepath& fp)
8617{
8618 dout(10) << "cache_traverse " << fp << dendl;
8619
8620 CInode *in;
8621 if (fp.get_ino())
8622 in = get_inode(fp.get_ino());
8623 else
8624 in = root;
8625 if (!in)
8626 return NULL;
8627
8628 for (unsigned i = 0; i < fp.depth(); i++) {
11fdf7f2 8629 std::string_view dname = fp[i];
7c673cae
FG
8630 frag_t fg = in->pick_dirfrag(dname);
8631 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8632 CDir *curdir = in->get_dirfrag(fg);
8633 if (!curdir)
8634 return NULL;
8635 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8636 if (!dn)
8637 return NULL;
8638 in = dn->get_linkage()->get_inode();
8639 if (!in)
8640 return NULL;
8641 }
8642 dout(10) << " got " << *in << dendl;
8643 return in;
8644}
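/* Editor's note: cache_traverse() above is the purely in-memory walk: it
 * never fetches, discovers or waits, and returns NULL as soon as any
 * component is missing.  Below is an illustrative sketch of that
 * "look up only what is already cached" walk over a toy tree; the types
 * are hypothetical, not Ceph's CInode/CDir. */
#include <map>
#include <sstream>
#include <string>

namespace example {

struct Node {
  std::map<std::string, Node> children;   // name -> child
};

// Walk "a/b/c" under `root`; nullptr if any component is not cached.
const Node* cache_traverse(const Node& root, const std::string& path) {
  const Node* cur = &root;
  std::istringstream ss(path);
  std::string name;
  while (std::getline(ss, name, '/')) {
    if (name.empty())
      continue;                            // tolerate a leading '/'
    auto it = cur->children.find(name);
    if (it == cur->children.end())
      return nullptr;                      // not in cache: give up
    cur = &it->second;
  }
  return cur;
}

} // namespace example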
8645
8646
8647/**
 8648 * open_remote_dirfrag -- open up a remote dirfrag
8649 *
8650 * @param diri base inode
8651 * @param approxfg approximate fragment.
8652 * @param fin completion callback
8653 */
11fdf7f2 8654void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
7c673cae
FG
8655{
8656 dout(10) << "open_remote_dir on " << *diri << dendl;
11fdf7f2
TL
8657 ceph_assert(diri->is_dir());
8658 ceph_assert(!diri->is_auth());
8659 ceph_assert(diri->get_dirfrag(approxfg) == 0);
7c673cae 8660
224ce89b 8661 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8662}
8663
8664
8665/**
8666 * get_dentry_inode - get or open inode
8667 *
8668 * @param dn the dentry
8669 * @param mdr current request
8670 *
8671 * will return inode for primary, or link up/open up remote link's inode as necessary.
8672 * If it's not available right now, puts mdr on wait list and returns null.
8673 */
8674CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8675{
8676 CDentry::linkage_t *dnl;
8677 if (projected)
8678 dnl = dn->get_projected_linkage();
8679 else
8680 dnl = dn->get_linkage();
8681
11fdf7f2 8682 ceph_assert(!dnl->is_null());
7c673cae
FG
8683
8684 if (dnl->is_primary())
8685 return dnl->inode;
8686
11fdf7f2 8687 ceph_assert(dnl->is_remote());
7c673cae
FG
8688 CInode *in = get_inode(dnl->get_remote_ino());
8689 if (in) {
8690 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8691 dn->link_remote(dnl, in);
8692 return in;
8693 } else {
8694 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8695 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8696 return 0;
8697 }
8698}
8699
8700struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8701 CDentry *dn;
8702 inodeno_t ino;
11fdf7f2 8703 MDSContext *onfinish;
7c673cae 8704 bool want_xlocked;
11fdf7f2 8705 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
31f18b77
FG
8706 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8707 dn->get(MDSCacheObject::PIN_PTRWAITER);
8708 }
7c673cae
FG
8709 void finish(int r) override {
8710 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8711 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8712 }
8713};
8714
11fdf7f2 8715void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
7c673cae
FG
8716{
8717 dout(10) << "open_remote_dentry " << *dn << dendl;
8718 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8719 inodeno_t ino = dnl->get_remote_ino();
8720 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8721 open_ino(ino, pool,
8722 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8723}
8724
11fdf7f2 8725void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae
FG
8726 bool want_xlocked, int r)
8727{
8728 if (r < 0) {
31f18b77
FG
8729 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8730 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8731 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8732 dn->state_set(CDentry::STATE_BADREMOTEINO);
8733
8734 std::string path;
8735 CDir *dir = dn->get_dir();
8736 if (dir) {
31f18b77 8737 dir->get_inode()->make_path_string(path);
94b18763 8738 path += "/";
11fdf7f2 8739 path += dn->get_name();
7c673cae
FG
8740 }
8741
31f18b77 8742 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8743 if (fatal) {
31f18b77
FG
8744 mds->damaged();
8745 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8746 }
31f18b77
FG
8747 } else {
8748 r = 0;
8749 }
7c673cae
FG
8750 }
8751 fin->complete(r < 0 ? r : 0);
8752}
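/* Editor's note: C_MDC_OpenRemoteDentry above pins the dentry in its
 * constructor and drops the pin in finish(), so the object it will touch
 * cannot be trimmed while the asynchronous open is in flight.  A small,
 * hypothetical sketch of that pin-in-constructor / unpin-in-completion
 * pattern using a plain reference count. */
#include <cassert>

namespace example {

struct Pinnable {
  int pins = 0;
  void get() { ++pins; }
  void put() { assert(pins > 0); --pins; }
};

class OpenRemoteCallback {
  Pinnable* target;
public:
  explicit OpenRemoteCallback(Pinnable* t) : target(t) {
    target->get();          // keep `target` alive/untrimmable
  }
  void finish(int r) {
    // ... act on r and *target here ...
    (void)r;
    target->put();          // release the pin exactly once
  }
};

} // namespace example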
8753
8754
8755void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8756{
8757 // empty trace if we're a base inode
8758 if (in->is_base())
8759 return;
8760
8761 CInode *parent = in->get_parent_inode();
11fdf7f2 8762 ceph_assert(parent);
7c673cae
FG
8763 make_trace(trace, parent);
8764
8765 CDentry *dn = in->get_parent_dn();
8766 dout(15) << "make_trace adding " << *dn << dendl;
8767 trace.push_back(dn);
8768}
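/* Editor's note: make_trace() above builds the trace root-first by
 * recursing to the parent before appending the current entry.  A tiny,
 * illustrative sketch of that recursion over a toy parent-pointer chain
 * (not CInode/CDentry). */
#include <vector>

namespace example {

struct TNode {
  TNode* parent = nullptr;
};

void make_trace(std::vector<TNode*>& trace, TNode* n) {
  if (!n->parent)
    return;                      // base node: empty trace
  make_trace(trace, n->parent);  // ancestors first
  trace.push_back(n);            // then this node: result is root -> leaf
}

} // namespace example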
8769
8770
8771// -------------------------------------------------------------------------------
8772// Open inode by inode number
8773
8774class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8775 inodeno_t ino;
8776 public:
8777 bufferlist bl;
8778 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8779 MDCacheIOContext(c), ino(i) {}
8780 void finish(int r) override {
8781 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8782 }
91327a77
AA
8783 void print(ostream& out) const override {
 8784 out << "openino_backtrace_fetch(" << ino << ")";
8785 }
7c673cae
FG
8786};
8787
8788struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8789 inodeno_t ino;
9f95a23c 8790 cref_t<MMDSOpenIno> msg;
7c673cae
FG
8791 bool parent;
8792 public:
9f95a23c 8793 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
7c673cae
FG
8794 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8795 void finish(int r) override {
8796 if (r < 0 && !parent)
8797 r = -EAGAIN;
8798 if (msg) {
8799 mdcache->handle_open_ino(msg, r);
8800 return;
8801 }
11fdf7f2
TL
8802 auto& info = mdcache->opening_inodes.at(ino);
8803 mdcache->_open_ino_traverse_dir(ino, info, r);
7c673cae
FG
8804 }
8805};
8806
8807struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8808 inodeno_t ino;
8809 public:
8810 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8811 void finish(int r) override {
8812 mdcache->_open_ino_parent_opened(ino, r);
8813 }
8814};
8815
8816void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8817{
8818 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8819
11fdf7f2 8820 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8821
8822 CInode *in = get_inode(ino);
8823 if (in) {
8824 dout(10) << " found cached " << *in << dendl;
8825 open_ino_finish(ino, info, in->authority().first);
8826 return;
8827 }
8828
8829 inode_backtrace_t backtrace;
8830 if (err == 0) {
8831 try {
11fdf7f2 8832 decode(backtrace, bl);
7c673cae
FG
8833 } catch (const buffer::error &decode_exc) {
 8834 derr << "corrupt backtrace on ino 0x" << std::hex << ino
8835 << std::dec << ": " << decode_exc << dendl;
8836 open_ino_finish(ino, info, -EIO);
8837 return;
8838 }
8839 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8840 dout(10) << " old object in pool " << info.pool
8841 << ", retrying pool " << backtrace.pool << dendl;
8842 info.pool = backtrace.pool;
8843 C_IO_MDC_OpenInoBacktraceFetched *fin =
8844 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8845 fetch_backtrace(ino, info.pool, fin->bl,
8846 new C_OnFinisher(fin, mds->finisher));
8847 return;
8848 }
8849 } else if (err == -ENOENT) {
8850 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8851 if (info.pool != meta_pool) {
8852 dout(10) << " no object in pool " << info.pool
8853 << ", retrying pool " << meta_pool << dendl;
8854 info.pool = meta_pool;
8855 C_IO_MDC_OpenInoBacktraceFetched *fin =
8856 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8857 fetch_backtrace(ino, info.pool, fin->bl,
8858 new C_OnFinisher(fin, mds->finisher));
8859 return;
8860 }
8861 err = 0; // backtrace.ancestors.empty() is checked below
8862 }
8863
8864 if (err == 0) {
8865 if (backtrace.ancestors.empty()) {
8866 dout(10) << " got empty backtrace " << dendl;
92f5a8d4 8867 err = -ESTALE;
7c673cae
FG
8868 } else if (!info.ancestors.empty()) {
8869 if (info.ancestors[0] == backtrace.ancestors[0]) {
8870 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8871 err = -EINVAL;
8872 } else {
8873 info.last_err = 0;
8874 }
8875 }
8876 }
8877 if (err) {
8878 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8879 if (info.last_err)
8880 err = info.last_err;
8881 open_ino_finish(ino, info, err);
8882 return;
8883 }
8884
8885 dout(10) << " got backtrace " << backtrace << dendl;
8886 info.ancestors = backtrace.ancestors;
8887
8888 _open_ino_traverse_dir(ino, info, 0);
8889}
8890
8891void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8892{
8893 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8894
11fdf7f2 8895 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8896
8897 CInode *in = get_inode(ino);
8898 if (in) {
8899 dout(10) << " found cached " << *in << dendl;
8900 open_ino_finish(ino, info, in->authority().first);
8901 return;
8902 }
8903
8904 if (ret == mds->get_nodeid()) {
8905 _open_ino_traverse_dir(ino, info, 0);
8906 } else {
8907 if (ret >= 0) {
8908 mds_rank_t checked_rank = mds_rank_t(ret);
8909 info.check_peers = true;
8910 info.auth_hint = checked_rank;
8911 info.checked.erase(checked_rank);
8912 }
8913 do_open_ino(ino, info, ret);
8914 }
8915}
8916
8917void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8918{
8919 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8920
8921 CInode *in = get_inode(ino);
8922 if (in) {
8923 dout(10) << " found cached " << *in << dendl;
8924 open_ino_finish(ino, info, in->authority().first);
8925 return;
8926 }
8927
8928 if (ret) {
8929 do_open_ino(ino, info, ret);
8930 return;
8931 }
8932
8933 mds_rank_t hint = info.auth_hint;
8934 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8935 info.discover, info.want_xlocked, &hint);
8936 if (ret > 0)
8937 return;
8938 if (hint != mds->get_nodeid())
8939 info.auth_hint = hint;
8940 do_open_ino(ino, info, ret);
8941}
8942
9f95a23c 8943void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
7c673cae
FG
8944{
8945 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 8946 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
7c673cae 8947 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
11fdf7f2
TL
8948 if (mds->logger)
8949 mds->logger->inc(l_mds_openino_dir_fetch);
7c673cae
FG
8950}
8951
9f95a23c 8952int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
11fdf7f2 8953 const vector<inode_backpointer_t>& ancestors,
7c673cae
FG
8954 bool discover, bool want_xlocked, mds_rank_t *hint)
8955{
8956 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8957 int err = 0;
8958 for (unsigned i = 0; i < ancestors.size(); i++) {
11fdf7f2
TL
8959 const auto& ancestor = ancestors.at(i);
8960 CInode *diri = get_inode(ancestor.dirino);
7c673cae
FG
8961
8962 if (!diri) {
11fdf7f2
TL
8963 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8964 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
7c673cae
FG
8965 return 1;
8966 }
8967 continue;
8968 }
8969
8970 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8971 CDir *dir = diri->get_parent_dir();
8972 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8973 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8974 dir = dir->get_inode()->get_parent_dir();
8975 _open_ino_fetch_dir(ino, m, dir, i == 0);
8976 return 1;
8977 }
8978
8979 if (!diri->is_dir()) {
8980 dout(10) << " " << *diri << " is not dir" << dendl;
8981 if (i == 0)
8982 err = -ENOTDIR;
8983 break;
8984 }
8985
11fdf7f2 8986 const string& name = ancestor.dname;
7c673cae
FG
8987 frag_t fg = diri->pick_dirfrag(name);
8988 CDir *dir = diri->get_dirfrag(fg);
8989 if (!dir) {
8990 if (diri->is_auth()) {
8991 if (diri->is_frozen()) {
8992 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8993 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8994 return 1;
8995 }
8996 dir = diri->get_or_open_dirfrag(this, fg);
8997 } else if (discover) {
8998 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8999 return 1;
9000 }
9001 }
9002 if (dir) {
11fdf7f2 9003 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
7c673cae
FG
9004 CDentry *dn = dir->lookup(name);
9005 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
9006 if (dir->is_auth()) {
9007 if (dnl && dnl->is_primary() &&
9008 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
9009 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
9010 _open_ino_fetch_dir(ino, m, dir, i == 0);
9011 return 1;
9012 }
9013
9014 if (!dnl && !dir->is_complete() &&
9015 (!dir->has_bloom() || dir->is_in_bloom(name))) {
9016 dout(10) << " fetching incomplete " << *dir << dendl;
9017 _open_ino_fetch_dir(ino, m, dir, i == 0);
9018 return 1;
9019 }
9020
9021 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
9022 if (i == 0)
9023 err = -ENOENT;
9024 } else if (discover) {
9025 if (!dnl) {
9026 filepath path(name, 0);
9027 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
9028 (i == 0 && want_xlocked));
9029 return 1;
9030 }
9031 if (dnl->is_null() && !dn->lock.can_read(-1)) {
9032 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
9033 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
9034 return 1;
9035 }
9036 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
9037 if (i == 0)
9038 err = -ENOENT;
9039 }
9040 }
9041 if (hint && i == 0)
9042 *hint = dir ? dir->authority().first : diri->authority().first;
9043 break;
9044 }
9045 return err;
9046}
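/* Editor's note: open_ino_traverse_dir() above walks the backtrace
 * ancestors nearest-first; a missing ancestor lets the loop continue
 * upward, async work returns 1, and only a miss on the closest ancestor
 * (i == 0) becomes a hard error.  A hypothetical sketch of that loop
 * shape with toy types. */
#include <cerrno>
#include <cstddef>
#include <functional>
#include <vector>

namespace example {

struct Ancestor { unsigned long long dirino = 0; };

// visit() returns: 2 = ancestor not cached (keep climbing), 1 = async
// fetch/discover started (stop and retry later), 0 = handled, -1 = miss.
int walk_ancestors(const std::vector<Ancestor>& ancestors,
                   const std::function<int(const Ancestor&)>& visit) {
  int err = 0;
  for (std::size_t i = 0; i < ancestors.size(); ++i) {
    int r = visit(ancestors[i]);
    if (r == 2)
      continue;                // ancestor not cached: try the next one up
    if (r == 1)
      return 1;                // fetch/discover in flight
    if (r < 0 && i == 0)
      err = -ENOENT;           // only the closest ancestor sets the error
    break;                     // processed one cached ancestor; stop
  }
  return err;
}

} // namespace example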
9047
9048void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
9049{
9050 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9051
11fdf7f2 9052 MDSContext::vec waiters;
7c673cae
FG
9053 waiters.swap(info.waiters);
9054 opening_inodes.erase(ino);
9055 finish_contexts(g_ceph_context, waiters, ret);
9056}
9057
9058void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
9059{
9060 if (err < 0 && err != -EAGAIN) {
9061 info.checked.clear();
7c673cae
FG
9062 info.checking = MDS_RANK_NONE;
9063 info.check_peers = true;
9064 info.fetch_backtrace = true;
9065 if (info.discover) {
9066 info.discover = false;
9067 info.ancestors.clear();
9068 }
9069 if (err != -ENOENT && err != -ENOTDIR)
9070 info.last_err = err;
9071 }
9072
d2e6a577
FG
9073 if (info.check_peers || info.discover) {
9074 if (info.discover) {
9075 // got backtrace from peer, but failed to find inode. re-check peers
9076 info.discover = false;
9077 info.ancestors.clear();
9078 info.checked.clear();
9079 }
7c673cae
FG
9080 info.check_peers = false;
9081 info.checking = MDS_RANK_NONE;
9082 do_open_ino_peer(ino, info);
9083 } else if (info.fetch_backtrace) {
9084 info.check_peers = true;
9085 info.fetch_backtrace = false;
9086 info.checking = mds->get_nodeid();
9087 info.checked.clear();
7c673cae
FG
9088 C_IO_MDC_OpenInoBacktraceFetched *fin =
9089 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
9090 fetch_backtrace(ino, info.pool, fin->bl,
9091 new C_OnFinisher(fin, mds->finisher));
9092 } else {
11fdf7f2 9093 ceph_assert(!info.ancestors.empty());
7c673cae
FG
9094 info.checking = mds->get_nodeid();
9095 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
9096 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
9097 }
9098}
9099
9100void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
9101{
9102 set<mds_rank_t> all, active;
9103 mds->mdsmap->get_mds_set(all);
7c673cae 9104 if (mds->get_state() == MDSMap::STATE_REJOIN)
1adf2230
AA
9105 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
9106 else
9107 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9108
9109 dout(10) << "do_open_ino_peer " << ino << " active " << active
9110 << " all " << all << " checked " << info.checked << dendl;
9111
11fdf7f2 9112 mds_rank_t whoami = mds->get_nodeid();
7c673cae 9113 mds_rank_t peer = MDS_RANK_NONE;
11fdf7f2 9114 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
7c673cae
FG
9115 if (active.count(info.auth_hint)) {
9116 peer = info.auth_hint;
9117 info.auth_hint = MDS_RANK_NONE;
9118 }
9119 } else {
9120 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
11fdf7f2 9121 if (*p != whoami && info.checked.count(*p) == 0) {
7c673cae
FG
9122 peer = *p;
9123 break;
9124 }
9125 }
9126 if (peer < 0) {
11fdf7f2 9127 all.erase(whoami);
d2e6a577 9128 if (all != info.checked) {
7c673cae
FG
9129 dout(10) << " waiting for more peers to be active" << dendl;
9130 } else {
9131 dout(10) << " all MDS peers have been checked " << dendl;
9132 do_open_ino(ino, info, 0);
9133 }
9134 } else {
9135 info.checking = peer;
9136 vector<inode_backpointer_t> *pa = NULL;
9137 // got backtrace from peer or backtrace just fetched
9138 if (info.discover || !info.fetch_backtrace)
9139 pa = &info.ancestors;
9f95a23c 9140 mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
11fdf7f2
TL
9141 if (mds->logger)
9142 mds->logger->inc(l_mds_openino_peer_discover);
7c673cae
FG
9143 }
9144}
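/* Editor's note: do_open_ino_peer() above polls peers one at a time:
 * prefer the auth hint if it is active, otherwise pick the first active
 * rank not yet in `checked`, and report "nobody left" once every rank
 * has been tried.  A hypothetical sketch of that selection policy. */
#include <set>

namespace example {

using rank_t = int;
constexpr rank_t RANK_NONE = -1;

rank_t pick_peer(rank_t self, rank_t& hint,
                 const std::set<rank_t>& active,
                 const std::set<rank_t>& checked) {
  if (hint != RANK_NONE && hint != self) {
    if (active.count(hint)) {
      rank_t p = hint;
      hint = RANK_NONE;        // consume the hint
      return p;
    }
  } else {
    for (rank_t p : active)
      if (p != self && !checked.count(p))
        return p;
  }
  return RANK_NONE;            // caller decides: wait for peers or give up
}

} // namespace example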
9145
9f95a23c 9146void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
7c673cae
FG
9147{
9148 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9149 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
9150 return;
9151 }
9152
9153 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
9154
11fdf7f2 9155 auto from = mds_rank_t(m->get_source().num());
7c673cae 9156 inodeno_t ino = m->ino;
9f95a23c 9157 ref_t<MMDSOpenInoReply> reply;
7c673cae
FG
9158 CInode *in = get_inode(ino);
9159 if (in) {
9160 dout(10) << " have " << *in << dendl;
9f95a23c 9161 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
7c673cae
FG
9162 if (in->is_auth()) {
9163 touch_inode(in);
9164 while (1) {
9165 CDentry *pdn = in->get_parent_dn();
9166 if (!pdn)
9167 break;
9168 CInode *diri = pdn->get_dir()->get_inode();
94b18763 9169 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
7c673cae
FG
9170 in->inode.version));
9171 in = diri;
9172 }
9173 } else {
9174 reply->hint = in->authority().first;
9175 }
9176 } else if (err < 0) {
9f95a23c 9177 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
7c673cae
FG
9178 } else {
9179 mds_rank_t hint = MDS_RANK_NONE;
9180 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
9181 if (ret > 0)
9182 return;
9f95a23c 9183 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
7c673cae 9184 }
11fdf7f2 9185 mds->send_message_mds(reply, from);
7c673cae
FG
9186}
9187
9f95a23c 9188void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
7c673cae
FG
9189{
9190 dout(10) << "handle_open_ino_reply " << *m << dendl;
9191
9192 inodeno_t ino = m->ino;
9193 mds_rank_t from = mds_rank_t(m->get_source().num());
9194 auto it = opening_inodes.find(ino);
9195 if (it != opening_inodes.end() && it->second.checking == from) {
9196 open_ino_info_t& info = it->second;
9197 info.checking = MDS_RANK_NONE;
9198 info.checked.insert(from);
9199
9200 CInode *in = get_inode(ino);
9201 if (in) {
9202 dout(10) << " found cached " << *in << dendl;
9203 open_ino_finish(ino, info, in->authority().first);
9204 } else if (!m->ancestors.empty()) {
9205 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9206 if (!info.want_replica) {
9207 open_ino_finish(ino, info, from);
7c673cae
FG
9208 return;
9209 }
9210
9211 info.ancestors = m->ancestors;
9212 info.auth_hint = from;
9213 info.checking = mds->get_nodeid();
9214 info.discover = true;
9215 _open_ino_traverse_dir(ino, info, 0);
9216 } else if (m->error) {
9217 dout(10) << " error " << m->error << " from mds." << from << dendl;
9218 do_open_ino(ino, info, m->error);
9219 } else {
9220 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9221 info.auth_hint = m->hint;
9222 info.checked.erase(m->hint);
9223 }
9224 do_open_ino_peer(ino, info);
9225 }
9226 }
7c673cae
FG
9227}
9228
9229void MDCache::kick_open_ino_peers(mds_rank_t who)
9230{
9231 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9232
9233 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9234 p != opening_inodes.end();
9235 ++p) {
9236 open_ino_info_t& info = p->second;
9237 if (info.checking == who) {
9238 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9239 info.checking = MDS_RANK_NONE;
9240 do_open_ino_peer(p->first, info);
9241 } else if (info.checking == MDS_RANK_NONE) {
9242 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9243 do_open_ino_peer(p->first, info);
9244 }
9245 }
9246}
9247
11fdf7f2 9248void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
f91f0fd5
TL
9249 bool want_replica, bool want_xlocked,
9250 vector<inode_backpointer_t> *ancestors_hint,
9251 mds_rank_t auth_hint)
7c673cae
FG
9252{
9253 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
9254 << want_replica << dendl;
9255
11fdf7f2
TL
9256 auto it = opening_inodes.find(ino);
9257 if (it != opening_inodes.end()) {
9258 open_ino_info_t& info = it->second;
7c673cae
FG
9259 if (want_replica) {
9260 info.want_replica = true;
9261 if (want_xlocked && !info.want_xlocked) {
9262 if (!info.ancestors.empty()) {
9263 CInode *diri = get_inode(info.ancestors[0].dirino);
9264 if (diri) {
9265 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
9266 CDir *dir = diri->get_dirfrag(fg);
9267 if (dir && !dir->is_auth()) {
9268 filepath path(info.ancestors[0].dname, 0);
9269 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
9270 }
9271 }
9272 }
9273 info.want_xlocked = true;
9274 }
9275 }
9276 info.waiters.push_back(fin);
9277 } else {
9278 open_ino_info_t& info = opening_inodes[ino];
7c673cae
FG
9279 info.want_replica = want_replica;
9280 info.want_xlocked = want_xlocked;
9281 info.tid = ++open_ino_last_tid;
9282 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9283 info.waiters.push_back(fin);
f91f0fd5
TL
9284 if (auth_hint != MDS_RANK_NONE)
9285 info.auth_hint = auth_hint;
9286 if (ancestors_hint) {
9287 info.ancestors = std::move(*ancestors_hint);
11fdf7f2
TL
9288 info.fetch_backtrace = false;
9289 info.checking = mds->get_nodeid();
9290 _open_ino_traverse_dir(ino, info, 0);
9291 } else {
9292 do_open_ino(ino, info, 0);
9293 }
7c673cae
FG
9294 }
9295}
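/* Editor's note: open_ino() above is callback driven and coalesces
 * concurrent callers for the same ino onto one open_ino_info_t's waiter
 * list, which open_ino_finish() later completes in one shot.  Below is a
 * simplified, hypothetical sketch of that waiter coalescing; the types
 * here are toys, the real signature is the one declared above. */
#include <functional>
#include <map>
#include <vector>

namespace example {

using ino_t = unsigned long long;
using Completion = std::function<void(int result)>;

class OpenInoTable {
  struct Entry { std::vector<Completion> waiters; };
  std::map<ino_t, Entry> opening;

public:
  // Returns true if this call actually started new work.
  bool open_ino(ino_t ino, Completion fin) {
    auto [it, started] = opening.try_emplace(ino);
    it->second.waiters.push_back(std::move(fin));
    return started;
  }

  // Analogue of open_ino_finish(): complete and drop every waiter.
  void finish(ino_t ino, int result) {
    auto it = opening.find(ino);
    if (it == opening.end())
      return;
    auto waiters = std::move(it->second.waiters);
    opening.erase(it);
    for (auto& w : waiters)
      w(result);
  }
};

} // namespace example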
9296
9297/* ---------------------------- */
9298
9299/*
9300 * search for a given inode on MDS peers. optionally start with the given node.
9301
9302
9303 TODO
 9304 - recover from mds node failure / recovery
9305 - traverse path
9306
9307 */
9f95a23c
TL
9308void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
9309 mds_rank_t hint, bool path_locked)
7c673cae
FG
9310{
9311 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
b32b8144
FG
9312 CInode *in = get_inode(ino);
9313 if (in && in->state_test(CInode::STATE_PURGING)) {
9314 c->complete(-ESTALE);
9315 return;
9316 }
11fdf7f2 9317 ceph_assert(!in);
7c673cae
FG
9318
9319 ceph_tid_t tid = ++find_ino_peer_last_tid;
9320 find_ino_peer_info_t& fip = find_ino_peer[tid];
9321 fip.ino = ino;
9322 fip.tid = tid;
9323 fip.fin = c;
9f95a23c 9324 fip.path_locked = path_locked;
7c673cae 9325 fip.hint = hint;
7c673cae
FG
9326 _do_find_ino_peer(fip);
9327}
9328
9329void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9330{
9331 set<mds_rank_t> all, active;
9332 mds->mdsmap->get_mds_set(all);
1adf2230 9333 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9334
9335 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9336 << " active " << active << " all " << all
9337 << " checked " << fip.checked
9338 << dendl;
9339
9340 mds_rank_t m = MDS_RANK_NONE;
9341 if (fip.hint >= 0) {
9342 m = fip.hint;
9343 fip.hint = MDS_RANK_NONE;
9344 } else {
9345 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9346 if (*p != mds->get_nodeid() &&
9347 fip.checked.count(*p) == 0) {
9348 m = *p;
9349 break;
9350 }
9351 }
9352 if (m == MDS_RANK_NONE) {
d2e6a577
FG
9353 all.erase(mds->get_nodeid());
9354 if (all != fip.checked) {
7c673cae
FG
9355 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9356 } else {
9357 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9358 fip.fin->complete(-ESTALE);
9359 find_ino_peer.erase(fip.tid);
9360 }
9361 } else {
9362 fip.checking = m;
9f95a23c 9363 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
7c673cae
FG
9364 }
9365}
9366
9f95a23c 9367void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
7c673cae
FG
9368{
9369 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
9370 return;
9371 }
9372
9373 dout(10) << "handle_find_ino " << *m << dendl;
9f95a23c 9374 auto r = make_message<MMDSFindInoReply>(m->tid);
7c673cae
FG
9375 CInode *in = get_inode(m->ino);
9376 if (in) {
9377 in->make_path(r->path);
9378 dout(10) << " have " << r->path << " " << *in << dendl;
9379 }
11fdf7f2 9380 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
7c673cae
FG
9381}
9382
9383
9f95a23c 9384void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
7c673cae 9385{
9f95a23c 9386 auto p = find_ino_peer.find(m->tid);
7c673cae
FG
9387 if (p != find_ino_peer.end()) {
9388 dout(10) << "handle_find_ino_reply " << *m << dendl;
9389 find_ino_peer_info_t& fip = p->second;
9390
9391 // success?
9392 if (get_inode(fip.ino)) {
9393 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9394 mds->queue_waiter(fip.fin);
9395 find_ino_peer.erase(p);
7c673cae
FG
9396 return;
9397 }
9398
9399 mds_rank_t from = mds_rank_t(m->get_source().num());
9400 if (fip.checking == from)
9401 fip.checking = MDS_RANK_NONE;
9402 fip.checked.insert(from);
9403
9404 if (!m->path.empty()) {
9405 // we got a path!
9406 vector<CDentry*> trace;
11fdf7f2 9407 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 9408 MDRequestRef null_ref;
9f95a23c
TL
9409 int flags = MDS_TRAVERSE_DISCOVER;
9410 if (fip.path_locked)
9411 flags |= MDS_TRAVERSE_PATH_LOCKED;
9412 int r = path_traverse(null_ref, cf, m->path, flags, &trace);
7c673cae
FG
9413 if (r > 0)
9414 return;
9415 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9416 << ", retrying" << dendl;
9417 fip.checked.clear();
9418 _do_find_ino_peer(fip);
9419 } else {
9420 // nope, continue.
9421 _do_find_ino_peer(fip);
9422 }
9423 } else {
9424 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9425 }
7c673cae
FG
9426}
9427
9428void MDCache::kick_find_ino_peers(mds_rank_t who)
9429{
9430 // find_ino_peers requests we should move on from
9431 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9432 p != find_ino_peer.end();
9433 ++p) {
9434 find_ino_peer_info_t& fip = p->second;
9435 if (fip.checking == who) {
9436 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9437 fip.checking = MDS_RANK_NONE;
9438 _do_find_ino_peer(fip);
9439 } else if (fip.checking == MDS_RANK_NONE) {
9440 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9441 _do_find_ino_peer(fip);
9442 }
9443 }
9444}
9445
9446/* ---------------------------- */
9447
9448int MDCache::get_num_client_requests()
9449{
9450 int count = 0;
9451 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9452 p != active_requests.end();
9453 ++p) {
9454 MDRequestRef& mdr = p->second;
9455 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9456 count++;
9457 }
9458 return count;
9459}
9460
9f95a23c 9461MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
7c673cae
FG
9462{
9463 // did we win a forward race against a slave?
9464 if (active_requests.count(req->get_reqid())) {
9465 MDRequestRef& mdr = active_requests[req->get_reqid()];
11fdf7f2 9466 ceph_assert(mdr);
7c673cae
FG
9467 if (mdr->is_slave()) {
9468 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9469 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9470 } else {
9471 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
7c673cae
FG
9472 }
9473 return MDRequestRef();
9474 }
9475
9476 // register new client request
9477 MDRequestImpl::Params params;
9478 params.reqid = req->get_reqid();
9479 params.attempt = req->get_num_fwd();
9480 params.client_req = req;
9481 params.initiated = req->get_recv_stamp();
9482 params.throttled = req->get_throttle_stamp();
9483 params.all_read = req->get_recv_complete_stamp();
9484 params.dispatched = req->get_dispatch_stamp();
9485
9486 MDRequestRef mdr =
11fdf7f2 9487 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae
FG
9488 active_requests[params.reqid] = mdr;
9489 mdr->set_op_stamp(req->get_stamp());
9490 dout(7) << "request_start " << *mdr << dendl;
9491 return mdr;
9492}
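/* Editor's note: request_start() above first checks active_requests for
 * the reqid, so a retransmitted or forwarded client request is not
 * registered twice; only a genuinely new reqid gets an MDRequest.  A toy,
 * hypothetical sketch of that dedup-by-reqid registry. */
#include <cstdint>
#include <memory>
#include <unordered_map>

namespace example {

struct Request {
  std::uint64_t reqid;
};
using RequestRef = std::shared_ptr<Request>;

class RequestTable {
  std::unordered_map<std::uint64_t, RequestRef> active;

public:
  // Returns nullptr if the reqid is already being processed,
  // mirroring the early `return MDRequestRef()` above.
  RequestRef start(std::uint64_t reqid) {
    auto it = active.find(reqid);
    if (it != active.end())
      return nullptr;                       // duplicate: drop or queue it
    auto mdr = std::make_shared<Request>(Request{reqid});
    active.emplace(reqid, mdr);
    return mdr;
  }

  void finish(std::uint64_t reqid) { active.erase(reqid); }
};

} // namespace example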
9493
9f95a23c 9494MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
7c673cae
FG
9495{
9496 int by = m->get_source().num();
9497 MDRequestImpl::Params params;
9498 params.reqid = ri;
9499 params.attempt = attempt;
9500 params.triggering_slave_req = m;
9501 params.slave_to = by;
9502 params.initiated = m->get_recv_stamp();
9503 params.throttled = m->get_throttle_stamp();
9504 params.all_read = m->get_recv_complete_stamp();
9505 params.dispatched = m->get_dispatch_stamp();
9506 MDRequestRef mdr =
11fdf7f2
TL
9507 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9508 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9509 active_requests[mdr->reqid] = mdr;
9510 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9511 return mdr;
9512}
9513
9514MDRequestRef MDCache::request_start_internal(int op)
9515{
91327a77 9516 utime_t now = ceph_clock_now();
7c673cae
FG
9517 MDRequestImpl::Params params;
9518 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9519 params.reqid.tid = mds->issue_tid();
91327a77
AA
9520 params.initiated = now;
9521 params.throttled = now;
9522 params.all_read = now;
9523 params.dispatched = now;
7c673cae
FG
9524 params.internal_op = op;
9525 MDRequestRef mdr =
11fdf7f2 9526 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae 9527
11fdf7f2 9528 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9529 active_requests[mdr->reqid] = mdr;
9530 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9531 return mdr;
9532}
9533
9534MDRequestRef MDCache::request_get(metareqid_t rid)
9535{
9536 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
11fdf7f2 9537 ceph_assert(p != active_requests.end());
7c673cae
FG
9538 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9539 return p->second;
9540}
9541
9542void MDCache::request_finish(MDRequestRef& mdr)
9543{
9544 dout(7) << "request_finish " << *mdr << dendl;
9545 mdr->mark_event("finishing request");
9546
9547 // slave finisher?
9548 if (mdr->has_more() && mdr->more()->slave_commit) {
9549 Context *fin = mdr->more()->slave_commit;
9550 mdr->more()->slave_commit = 0;
9551 int ret;
9552 if (mdr->aborted) {
9553 mdr->aborted = false;
9554 ret = -1;
9555 mdr->more()->slave_rolling_back = true;
9556 } else {
9557 ret = 0;
9558 mdr->committing = true;
9559 }
9560 fin->complete(ret); // this must re-call request_finish.
9561 return;
9562 }
9563
d2e6a577
FG
9564 switch(mdr->internal_op) {
9565 case CEPH_MDS_OP_FRAGMENTDIR:
9566 logger->inc(l_mdss_ireq_fragmentdir);
9567 break;
9568 case CEPH_MDS_OP_EXPORTDIR:
9569 logger->inc(l_mdss_ireq_exportdir);
9570 break;
9571 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9572 logger->inc(l_mdss_ireq_enqueue_scrub);
9573 break;
9574 case CEPH_MDS_OP_FLUSH:
9575 logger->inc(l_mdss_ireq_flush);
9576 break;
9577 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9578 logger->inc(l_mdss_ireq_fragstats);
9579 break;
9580 case CEPH_MDS_OP_REPAIR_INODESTATS:
9581 logger->inc(l_mdss_ireq_inodestats);
9582 break;
9583 }
9584
7c673cae
FG
9585 request_cleanup(mdr);
9586}
9587
9588
9589void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9590{
f91f0fd5
TL
9591 CachedStackStringStream css;
9592 *css << "forwarding request to mds." << who;
9593 mdr->mark_event(css->strv());
7c673cae
FG
9594 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9595 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9596 << *mdr->client_request << dendl;
f91f0fd5
TL
9597 if (mdr->is_batch_head()) {
9598 mdr->release_batch_op()->forward(who);
9f95a23c
TL
9599 } else {
9600 mds->forward_message_mds(mdr->release_client_request(), who);
9601 }
7c673cae
FG
9602 if (mds->logger) mds->logger->inc(l_mds_forward);
9603 } else if (mdr->internal_op >= 0) {
9604 dout(10) << "request_forward on internal op; cancelling" << dendl;
9605 mdr->internal_op_finish->complete(-EXDEV);
9606 } else {
9607 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9608 << " was from mds" << dendl;
9609 }
9610 request_cleanup(mdr);
9611}
9612
9613
9614void MDCache::dispatch_request(MDRequestRef& mdr)
9615{
9616 if (mdr->client_request) {
9617 mds->server->dispatch_client_request(mdr);
9618 } else if (mdr->slave_request) {
9619 mds->server->dispatch_slave_request(mdr);
9620 } else {
9621 switch (mdr->internal_op) {
9622 case CEPH_MDS_OP_FRAGMENTDIR:
9623 dispatch_fragment_dir(mdr);
9624 break;
9625 case CEPH_MDS_OP_EXPORTDIR:
9626 migrator->dispatch_export_dir(mdr, 0);
9627 break;
9628 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9629 enqueue_scrub_work(mdr);
9630 break;
9631 case CEPH_MDS_OP_FLUSH:
9632 flush_dentry_work(mdr);
9633 break;
9634 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9635 repair_dirfrag_stats_work(mdr);
9636 break;
9637 case CEPH_MDS_OP_REPAIR_INODESTATS:
9638 repair_inode_stats_work(mdr);
9639 break;
11fdf7f2
TL
9640 case CEPH_MDS_OP_UPGRADE_SNAPREALM:
9641 upgrade_inode_snaprealm_work(mdr);
9642 break;
7c673cae
FG
9643 default:
9644 ceph_abort();
9645 }
9646 }
9647}
9648
9649
9650void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9651{
9652 if (!mdr->has_more())
9653 return;
9654
9655 // clean up slaves
9656 // (will implicitly drop remote dn pins)
9657 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9658 p != mdr->more()->slaves.end();
9659 ++p) {
9f95a23c
TL
9660 auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt,
9661 MMDSSlaveRequest::OP_FINISH);
7c673cae
FG
9662
9663 if (mdr->killed && !mdr->committing) {
9664 r->mark_abort();
9665 } else if (mdr->more()->srcdn_auth_mds == *p &&
9666 mdr->more()->inode_import.length() > 0) {
9667 // information about rename imported caps
9668 r->inode_export.claim(mdr->more()->inode_import);
9669 }
9670
9671 mds->send_message_mds(r, *p);
9672 }
9673
9674 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9675 * implicitly. Note that we don't call the finishers -- there shouldn't
9676 * be any on a remote lock and the request finish wakes up all
9677 * the waiters anyway! */
7c673cae 9678
11fdf7f2
TL
9679 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9680 SimpleLock *lock = it->lock;
9681 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9682 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9683 << " on " << lock->get_parent() << dendl;
9684 lock->put_xlock();
9685 mdr->locks.erase(it++);
9686 } else if (it->is_remote_wrlock()) {
9687 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9688 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9689 if (it->is_wrlock()) {
9690 it->clear_remote_wrlock();
9691 ++it;
9692 } else {
9693 mdr->locks.erase(it++);
9694 }
9695 } else {
9696 ++it;
9697 }
7c673cae
FG
9698 }
9699
9700 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9701 * leaving them in can cause double-notifies as
9702 * this function can get called more than once */
9703}
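/* Editor's note: the loop above removes some elements from mdr->locks
 * while walking it, using the classic `container.erase(it++)` idiom so
 * the iterator is advanced before the element it points at is destroyed.
 * A minimal, illustrative sketch of that idiom on a std::map. */
#include <map>
#include <string>

namespace example {

// Drop every entry whose value is negative, keep the rest.
void drop_negative(std::map<std::string, int>& m) {
  for (auto it = m.begin(); it != m.end(); ) {
    if (it->second < 0) {
      m.erase(it++);     // post-increment: erase the old position safely
    } else {
      ++it;
    }
  }
}

} // namespace example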
9704
9705void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9706{
9707 request_drop_foreign_locks(mdr);
9708 mds->locker->drop_non_rdlocks(mdr.get());
9709}
9710
9711void MDCache::request_drop_locks(MDRequestRef& mdr)
9712{
9713 request_drop_foreign_locks(mdr);
9714 mds->locker->drop_locks(mdr.get());
9715}
9716
9717void MDCache::request_cleanup(MDRequestRef& mdr)
9718{
9719 dout(15) << "request_cleanup " << *mdr << dendl;
9720
9721 if (mdr->has_more()) {
9722 if (mdr->more()->is_ambiguous_auth)
9723 mdr->clear_ambiguous_auth();
9724 if (!mdr->more()->waiting_for_finish.empty())
9725 mds->queue_waiters(mdr->more()->waiting_for_finish);
9726 }
9727
9728 request_drop_locks(mdr);
9729
9730 // drop (local) auth pins
9731 mdr->drop_local_auth_pins();
9732
9733 // drop stickydirs
11fdf7f2 9734 mdr->put_stickydirs();
7c673cae
FG
9735
9736 mds->locker->kick_cap_releases(mdr);
9737
9738 // drop cache pins
9739 mdr->drop_pins();
9740
9741 // remove from session
9742 mdr->item_session_request.remove_myself();
9743
9744 // remove from map
9745 active_requests.erase(mdr->reqid);
9746
9747 if (mds->logger)
9748 log_stat();
9749
9750 mdr->mark_event("cleaned up request");
9751}
9752
9753void MDCache::request_kill(MDRequestRef& mdr)
9754{
9755 // rolling back slave requests is tricky; just let the request proceed.
94b18763 9756 if (mdr->has_more() &&
7c673cae 9757 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9f95a23c 9758 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11fdf7f2 9759 ceph_assert(mdr->more()->witnessed.empty());
94b18763
FG
9760 mdr->aborted = true;
9761 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9762 } else {
9763 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9764 }
7c673cae 9765
11fdf7f2
TL
9766 ceph_assert(mdr->used_prealloc_ino == 0);
9767 ceph_assert(mdr->prealloc_inos.empty());
7c673cae
FG
9768
9769 mdr->session = NULL;
9770 mdr->item_session_request.remove_myself();
9771 return;
9772 }
9773
9774 mdr->killed = true;
9775 mdr->mark_event("killing request");
9776
9777 if (mdr->committing) {
9778 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9779 } else {
9780 dout(10) << "request_kill " << *mdr << dendl;
9781 request_cleanup(mdr);
9782 }
9783}
9784
9785// -------------------------------------------------------------------------------
9786// SNAPREALMS
9787
11fdf7f2 9788void MDCache::create_global_snaprealm()
7c673cae 9789{
11fdf7f2 9790 CInode *in = new CInode(this); // dummy inode
ec96510d 9791 create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
11fdf7f2
TL
9792 add_inode(in);
9793 global_snaprealm = in->snaprealm;
7c673cae
FG
9794}
9795
11fdf7f2 9796void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
7c673cae
FG
9797{
9798 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9799
9800 vector<inodeno_t> split_inos;
9801 vector<inodeno_t> split_realms;
9802
11fdf7f2
TL
9803 if (notify_clients) {
9804 ceph_assert(in->snaprealm->have_past_parents_open());
9805 if (snapop == CEPH_SNAP_OP_SPLIT) {
9806 // notify clients of update|split
9807 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9808 !p.end(); ++p)
9809 split_inos.push_back((*p)->ino());
7c673cae 9810
11fdf7f2
TL
9811 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9812 p != in->snaprealm->open_children.end();
9813 ++p)
9814 split_realms.push_back((*p)->inode->ino());
9815 }
9816 }
7c673cae
FG
9817
9818 set<SnapRealm*> past_children;
9f95a23c 9819 map<client_t, ref_t<MClientSnap>> updates;
7c673cae
FG
9820 list<SnapRealm*> q;
9821 q.push_back(in->snaprealm);
9822 while (!q.empty()) {
9823 SnapRealm *realm = q.front();
9824 q.pop_front();
9825
9826 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9827 realm->invalidate_cached_snaps();
9828
11fdf7f2
TL
9829 if (notify_clients) {
9830 for (const auto& p : realm->client_caps) {
9831 const auto& client = p.first;
9832 const auto& caps = p.second;
9833 ceph_assert(!caps->empty());
9834
9835 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9836 if (em.second) {
9f95a23c 9837 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
11fdf7f2
TL
9838 update->head.split = in->ino();
9839 update->split_inos = split_inos;
9840 update->split_realms = split_realms;
9841 update->bl = in->snaprealm->get_snap_trace();
9842 em.first->second = std::move(update);
9843 }
7c673cae
FG
9844 }
9845 }
9846
9847 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9848 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9849 p != realm->open_past_children.end();
9850 ++p)
9851 past_children.insert(*p);
9852 }
9853
9854 // notify for active children, too.
9855 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9856 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9857 p != realm->open_children.end();
9858 ++p)
9859 q.push_back(*p);
9860 }
9861
11fdf7f2 9862 if (notify_clients)
7c673cae
FG
9863 send_snaps(updates);
9864
9865 // notify past children and their descendants if we update/delete old snapshots
9866 for (set<SnapRealm*>::iterator p = past_children.begin();
9867 p != past_children.end();
9868 ++p)
9869 q.push_back(*p);
9870
9871 while (!q.empty()) {
9872 SnapRealm *realm = q.front();
9873 q.pop_front();
9874
9875 realm->invalidate_cached_snaps();
9876
9877 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9878 p != realm->open_children.end();
9879 ++p) {
9880 if (past_children.count(*p) == 0)
9881 q.push_back(*p);
9882 }
9883
9884 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9885 p != realm->open_past_children.end();
9886 ++p) {
9887 if (past_children.count(*p) == 0) {
9888 q.push_back(*p);
9889 past_children.insert(*p);
9890 }
9891 }
9892 }
9893
9894 if (snapop == CEPH_SNAP_OP_DESTROY) {
9895 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9896 for (set<SnapRealm*>::iterator p = past_children.begin();
9897 p != past_children.end();
9898 ++p)
9899 maybe_eval_stray((*p)->inode, true);
9900 }
9901}
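// do_realm_invalidate_and_update_notify() builds at most one MClientSnap per
// client: updates.emplace() reserves a slot, and the message body is only
// filled in when emplace reports a real insertion (em.second), so later realms
// touching the same client reuse the first entry. A standalone sketch of that
// insert-if-absent pattern (illustrative names, plain std types, no Ceph
// messages):
static void sketch_note_client(std::map<int, std::string>& updates, int client)
{
  auto em = updates.emplace(client, std::string());
  if (em.second) {
    // first realm that touches this client in this pass: build its payload once
    em.first->second = "snap update for client " + std::to_string(client);
  }
  // otherwise the client already has a pending update; nothing more to do
}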
9902
11fdf7f2 9903void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
7c673cae 9904{
11fdf7f2
TL
9905 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9906 ceph_assert(in->is_auth());
7c673cae 9907
11fdf7f2
TL
9908 set<mds_rank_t> mds_set;
9909 if (stid > 0) {
9910 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9911 mds_set.erase(mds->get_nodeid());
9912 } else {
9913 in->list_replicas(mds_set);
9914 }
7c673cae 9915
11fdf7f2
TL
9916 if (!mds_set.empty()) {
9917 bufferlist snap_blob;
9918 in->encode_snap(snap_blob);
7c673cae 9919
11fdf7f2 9920 for (auto p : mds_set) {
9f95a23c 9921 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
11fdf7f2
TL
9922 m->snap_blob = snap_blob;
9923 mds->send_message_mds(m, p);
9924 }
9925 }
7c673cae 9926
11fdf7f2
TL
9927 if (stid > 0)
9928 notify_global_snaprealm_update(snap_op);
9929}
7c673cae 9930
9f95a23c 9931void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
11fdf7f2
TL
9932{
9933 mds_rank_t from = mds_rank_t(m->get_source().num());
9934 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 9935
11fdf7f2
TL
9936 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9937 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9938 return;
9939 }
7c673cae 9940
11fdf7f2
TL
9941 // null rejoin_done means open_snaprealms() has already been called
9942 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9943 (mds->is_rejoin() && !rejoin_done);
9944
9945 if (m->get_tid() > 0) {
9946 mds->snapclient->notify_commit(m->get_tid());
9947 if (notify_clients)
9948 notify_global_snaprealm_update(m->get_snap_op());
9949 }
9950
9951 CInode *in = get_inode(m->get_ino());
9952 if (in) {
9953 ceph_assert(!in->is_auth());
9954 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9955 (mds->is_rejoin() && !in->is_rejoining())) {
9956 auto p = m->snap_blob.cbegin();
9957 in->decode_snap(p);
9958
9959 if (!notify_clients) {
9960 if (!rejoin_pending_snaprealms.count(in)) {
9961 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9962 rejoin_pending_snaprealms.insert(in);
9963 }
9964 }
9965 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9966 }
9967 }
7c673cae
FG
9968}
9969
11fdf7f2
TL
9970void MDCache::notify_global_snaprealm_update(int snap_op)
9971{
9972 if (snap_op != CEPH_SNAP_OP_DESTROY)
9973 snap_op = CEPH_SNAP_OP_UPDATE;
9974 set<Session*> sessions;
9975 mds->sessionmap.get_client_session_set(sessions);
9976 for (auto &session : sessions) {
9977 if (!session->is_open() && !session->is_stale())
9978 continue;
9f95a23c 9979 auto update = make_message<MClientSnap>(snap_op);
11fdf7f2
TL
9980 update->head.split = global_snaprealm->inode->ino();
9981 update->bl = global_snaprealm->get_snap_trace();
9982 mds->send_message_client_counted(update, session);
9983 }
9984}
7c673cae
FG
9985
9986// -------------------------------------------------------------------------------
9987// STRAYS
9988
9989struct C_MDC_RetryScanStray : public MDCacheContext {
9990 dirfrag_t next;
9991 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9992 void finish(int r) override {
9993 mdcache->scan_stray_dir(next);
9994 }
9995};
9996
9997void MDCache::scan_stray_dir(dirfrag_t next)
9998{
9999 dout(10) << "scan_stray_dir " << next << dendl;
10000
9f95a23c 10001 std::vector<CDir*> ls;
7c673cae
FG
10002 for (int i = 0; i < NUM_STRAY; ++i) {
10003 if (strays[i]->ino() < next.ino)
10004 continue;
10005 strays[i]->get_dirfrags(ls);
10006 }
10007
9f95a23c 10008 for (const auto& dir : ls) {
7c673cae
FG
10009 if (dir->dirfrag() < next)
10010 continue;
10011 if (!dir->is_complete()) {
10012 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
10013 return;
10014 }
94b18763
FG
10015 for (auto &p : dir->items) {
10016 CDentry *dn = p.second;
7c673cae
FG
10017 dn->state_set(CDentry::STATE_STRAY);
10018 CDentry::linkage_t *dnl = dn->get_projected_linkage();
10019 if (dnl->is_primary()) {
10020 CInode *in = dnl->get_inode();
10021 if (in->inode.nlink == 0)
10022 in->state_set(CInode::STATE_ORPHAN);
10023 maybe_eval_stray(in);
10024 }
10025 }
10026 }
10027}
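// scan_stray_dir() is resumable: when it hits an incomplete dirfrag it queues
// a fetch whose completion context (C_MDC_RetryScanStray) carries the dirfrag
// it stopped at, and the retry re-enters the scan skipping everything before
// that cursor. A simplified, self-contained sketch of that shape (hypothetical
// names and plain std types; the real fetch/context machinery is elided):
struct SketchStrayDir { int id; bool complete; };
struct SketchScanState {
  bool need_retry = false;   // "fetch queued, come back later"
  int retry_from = 0;        // cursor to resume from on the next pass
};
static void sketch_scan_strays(const std::vector<SketchStrayDir>& dirs,
                               int next, SketchScanState& out)
{
  for (const auto& d : dirs) {
    if (d.id < next)
      continue;              // already handled on an earlier pass
    if (!d.complete) {
      out.need_retry = true; // real code: d.fetch(new C_MDC_RetryScanStray(...))
      out.retry_from = d.id;
      return;                // resume from here once the fetch completes
    }
    // ... process the (complete) dirfrag's dentries here ...
  }
}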
10028
7c673cae
FG
10029void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
10030{
10031 object_t oid = CInode::get_object_name(ino, frag_t(), "");
10032 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
11fdf7f2
TL
10033 if (mds->logger)
10034 mds->logger->inc(l_mds_openino_backtrace_fetch);
7c673cae
FG
10035}
10036
10037
10038
10039
10040
10041// ========================================================================================
10042// DISCOVER
10043/*
10044
10045 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10046 to the parent metadata object in the cache (pinning it).
10047
10048 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10049
10050*/
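// Each outgoing discover is assigned a tid and recorded in the 'discovers'
// map; handle_discover_reply() looks the tid up and erases it on the first
// reply, so a duplicate or stale reply simply misses the map and is ignored.
// A standalone sketch of that dedup-by-tid bookkeeping (illustrative names,
// plain std types rather than discover_info_t):
struct SketchPendingDiscover { int target_mds; };
static bool sketch_accept_reply(std::map<uint64_t, SketchPendingDiscover>& pending,
                                uint64_t tid)
{
  auto it = pending.find(tid);
  if (it == pending.end())
    return false;        // dup (or stale) reply: nobody is waiting on it
  pending.erase(it);     // first reply wins; later copies will be dropped
  return true;
}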
10051
10052void MDCache::_send_discover(discover_info_t& d)
10053{
9f95a23c
TL
10054 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
10055 d.want_base_dir, d.path_locked);
7c673cae
FG
10056 dis->set_tid(d.tid);
10057 mds->send_message_mds(dis, d.mds);
10058}
10059
10060void MDCache::discover_base_ino(inodeno_t want_ino,
11fdf7f2 10061 MDSContext *onfinish,
7c673cae
FG
10062 mds_rank_t from)
10063{
10064 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
10065 if (waiting_for_base_ino[from].count(want_ino) == 0) {
10066 discover_info_t& d = _create_discover(from);
10067 d.ino = want_ino;
10068 _send_discover(d);
10069 }
10070 waiting_for_base_ino[from][want_ino].push_back(onfinish);
10071}
10072
10073
10074void MDCache::discover_dir_frag(CInode *base,
10075 frag_t approx_fg,
11fdf7f2 10076 MDSContext *onfinish,
7c673cae
FG
10077 mds_rank_t from)
10078{
10079 if (from < 0)
10080 from = base->authority().first;
10081
10082 dirfrag_t df(base->ino(), approx_fg);
10083 dout(7) << "discover_dir_frag " << df
10084 << " from mds." << from << dendl;
10085
10086 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
10087 discover_info_t& d = _create_discover(from);
10088 d.pin_base(base);
10089 d.ino = base->ino();
10090 d.frag = approx_fg;
10091 d.want_base_dir = true;
10092 _send_discover(d);
10093 }
10094
10095 if (onfinish)
10096 base->add_dir_waiter(approx_fg, onfinish);
10097}
10098
10099struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10100 CInode *base;
10101 snapid_t snapid;
10102 filepath path;
10103 mds_rank_t from;
10104 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10105 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10106 void finish(int r) override {
10107 mdcache->discover_path(base, snapid, path, 0, from);
10108 }
10109};
10110
10111void MDCache::discover_path(CInode *base,
10112 snapid_t snap,
10113 filepath want_path,
11fdf7f2 10114 MDSContext *onfinish,
9f95a23c 10115 bool path_locked,
7c673cae
FG
10116 mds_rank_t from)
10117{
10118 if (from < 0)
10119 from = base->authority().first;
10120
10121 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9f95a23c 10122 << (path_locked ? " path_locked":"")
7c673cae
FG
10123 << dendl;
10124
10125 if (base->is_ambiguous_auth()) {
10126 dout(10) << " waiting for single auth on " << *base << dendl;
10127 if (!onfinish)
10128 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
10129 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
10130 return;
10131 } else if (from == mds->get_nodeid()) {
11fdf7f2 10132 MDSContext::vec finished;
7c673cae
FG
10133 base->take_waiting(CInode::WAIT_DIR, finished);
10134 mds->queue_waiters(finished);
10135 return;
10136 }
10137
10138 frag_t fg = base->pick_dirfrag(want_path[0]);
9f95a23c 10139 if ((path_locked && want_path.depth() == 1) ||
7c673cae
FG
10140 !base->is_waiting_for_dir(fg) || !onfinish) {
10141 discover_info_t& d = _create_discover(from);
10142 d.ino = base->ino();
10143 d.pin_base(base);
10144 d.frag = fg;
10145 d.snap = snap;
10146 d.want_path = want_path;
10147 d.want_base_dir = true;
9f95a23c 10148 d.path_locked = path_locked;
7c673cae
FG
10149 _send_discover(d);
10150 }
10151
10152 // register + wait
10153 if (onfinish)
10154 base->add_dir_waiter(fg, onfinish);
10155}
10156
10157struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
10158 CDir *base;
10159 snapid_t snapid;
10160 filepath path;
10161 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
10162 MDCacheContext(c), base(b), snapid(s), path(p) {}
10163 void finish(int r) override {
10164 mdcache->discover_path(base, snapid, path, 0);
10165 }
10166};
10167
10168void MDCache::discover_path(CDir *base,
10169 snapid_t snap,
10170 filepath want_path,
11fdf7f2 10171 MDSContext *onfinish,
9f95a23c 10172 bool path_locked)
7c673cae
FG
10173{
10174 mds_rank_t from = base->authority().first;
10175
10176 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9f95a23c 10177 << (path_locked ? " path_locked":"")
7c673cae
FG
10178 << dendl;
10179
10180 if (base->is_ambiguous_auth()) {
10181 dout(7) << " waiting for single auth on " << *base << dendl;
10182 if (!onfinish)
10183 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
10184 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
10185 return;
10186 } else if (from == mds->get_nodeid()) {
11fdf7f2 10187 MDSContext::vec finished;
7c673cae
FG
10188 base->take_sub_waiting(finished);
10189 mds->queue_waiters(finished);
10190 return;
10191 }
10192
9f95a23c 10193 if ((path_locked && want_path.depth() == 1) ||
7c673cae
FG
10194 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
10195 discover_info_t& d = _create_discover(from);
10196 d.ino = base->ino();
31f18b77 10197 d.pin_base(base->inode);
7c673cae
FG
10198 d.frag = base->get_frag();
10199 d.snap = snap;
10200 d.want_path = want_path;
10201 d.want_base_dir = false;
9f95a23c 10202 d.path_locked = path_locked;
7c673cae
FG
10203 _send_discover(d);
10204 }
10205
10206 // register + wait
10207 if (onfinish)
10208 base->add_dentry_waiter(want_path[0], snap, onfinish);
10209}
10210
10211void MDCache::kick_discovers(mds_rank_t who)
10212{
10213 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10214 p != discovers.end();
10215 ++p) {
10216 if (p->second.mds != who)
10217 continue;
10218 _send_discover(p->second);
10219 }
10220}
10221
10222
9f95a23c 10223void MDCache::handle_discover(const cref_t<MDiscover> &dis)
7c673cae
FG
10224{
10225 mds_rank_t whoami = mds->get_nodeid();
10226 mds_rank_t from = mds_rank_t(dis->get_source().num());
10227
11fdf7f2 10228 ceph_assert(from != whoami);
7c673cae
FG
10229
10230 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10231 if (mds->get_state() < MDSMap::STATE_REJOIN &&
d2e6a577 10232 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
10233 return;
10234 }
10235
10236 // proceed if the requester is in the REJOIN stage, i.e. the request is from parallel_fetch().
10237 // delay processing requests from survivors because we may not have chosen lock states yet.
10238 if (!mds->mdsmap->is_rejoin(from)) {
10239 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
10240 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10241 return;
10242 }
10243 }
10244
10245
10246 CInode *cur = 0;
9f95a23c 10247 auto reply = make_message<MDiscoverReply>(*dis);
7c673cae
FG
10248
10249 snapid_t snapid = dis->get_snapid();
10250
10251 // get started.
10252 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10253 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10254 // wants root
10255 dout(7) << "handle_discover from mds." << from
10256 << " wants base + " << dis->get_want().get_path()
10257 << " snap " << snapid
10258 << dendl;
10259
10260 cur = get_inode(dis->get_base_ino());
11fdf7f2 10261 ceph_assert(cur);
7c673cae
FG
10262
10263 // add root
10264 reply->starts_with = MDiscoverReply::INODE;
9f95a23c 10265 encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
7c673cae
FG
10266 dout(10) << "added base " << *cur << dendl;
10267 }
10268 else {
10269 // there's a base inode
10270 cur = get_inode(dis->get_base_ino(), snapid);
10271 if (!cur && snapid != CEPH_NOSNAP) {
10272 cur = get_inode(dis->get_base_ino());
10273 if (cur && !cur->is_multiversion())
10274 cur = NULL; // nope!
10275 }
10276
10277 if (!cur) {
10278 dout(7) << "handle_discover mds." << from
10279 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10280 << dendl;
10281 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10282 reply->set_error_dentry(dis->get_dentry(0));
10283 reply->set_flag_error_dir();
10284 } else if (dis->wants_base_dir()) {
10285 dout(7) << "handle_discover mds." << from
10286 << " wants basedir+" << dis->get_want().get_path()
10287 << " has " << *cur
10288 << dendl;
10289 } else {
10290 dout(7) << "handle_discover mds." << from
10291 << " wants " << dis->get_want().get_path()
10292 << " has " << *cur
10293 << dendl;
10294 }
10295 }
10296
11fdf7f2 10297 ceph_assert(reply);
7c673cae
FG
10298
10299 // add content
10300 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10301 for (unsigned i = 0;
10302 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10303 i++) {
10304
10305 // -- figure out the dir
10306
10307 // is *cur even a dir at all?
10308 if (!cur->is_dir()) {
10309 dout(7) << *cur << " not a dir" << dendl;
10310 reply->set_flag_error_dir();
10311 break;
10312 }
10313
10314 // pick frag
10315 frag_t fg;
10316 if (dis->get_want().depth()) {
10317 // dentry specifies
10318 fg = cur->pick_dirfrag(dis->get_dentry(i));
10319 } else {
10320 // requester explicitly specified the frag
11fdf7f2 10321 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
7c673cae
FG
10322 fg = dis->get_base_dir_frag();
10323 if (!cur->dirfragtree.is_leaf(fg))
10324 fg = cur->dirfragtree[fg.value()];
10325 }
10326 CDir *curdir = cur->get_dirfrag(fg);
10327
10328 if ((!curdir && !cur->is_auth()) ||
10329 (curdir && !curdir->is_auth())) {
10330
10331 /* before:
10332 * ONLY set flag if empty!!
10333 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10334 * resulting in duplicate discovers in flight,
10335 * which can wreak havoc when discovering rename srcdn (which may move)
10336 */
10337
10338 if (reply->is_empty()) {
10339 // only hint if empty.
10340 // someday this could be better, but right now the waiter logic isn't smart enough.
10341
10342 // hint
10343 if (curdir) {
10344 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10345 reply->set_dir_auth_hint(curdir->authority().first);
10346 } else {
10347 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10348 << *cur << dendl;
10349 reply->set_dir_auth_hint(cur->authority().first);
10350 }
10351
10352 // note error dentry, if any
10353 // NOTE: important, as it allows requester to issue an equivalent discover
10354 // to whomever we hint at.
10355 if (dis->get_want().depth() > i)
10356 reply->set_error_dentry(dis->get_dentry(i));
10357 }
10358
10359 break;
10360 }
10361
10362 if (!curdir) { // open dir?
10363 if (cur->is_frozen()) {
10364 if (!reply->is_empty()) {
10365 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10366 break;
10367 }
10368 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10369 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10370 return;
10371 }
10372 curdir = cur->get_or_open_dirfrag(this, fg);
10373 } else if (curdir->is_frozen_tree() ||
10374 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
31f18b77
FG
10375 if (!reply->is_empty()) {
10376 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10377 break;
10378 }
7c673cae
FG
10379 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10380 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10381 reply->set_flag_error_dir();
10382 break;
10383 }
7c673cae
FG
10384 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10385 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10386 return;
10387 }
10388
10389 // add dir
10390 if (curdir->get_version() == 0) {
10391 // fetch newly opened dir
10392 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10393 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10394 // make sure the base frag is correct, though, in case there was a refragment since the
10395 // original request was sent.
10396 reply->set_base_dir_frag(curdir->get_frag());
10397 } else {
11fdf7f2 10398 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
7c673cae
FG
10399 if (!reply->trace.length())
10400 reply->starts_with = MDiscoverReply::DIR;
9f95a23c 10401 encode_replica_dir(curdir, from, reply->trace);
7c673cae
FG
10402 dout(7) << "handle_discover added dir " << *curdir << dendl;
10403 }
10404
10405 // lookup
10406 CDentry *dn = 0;
10407 if (curdir->get_version() == 0) {
10408 // fetch newly opened dir
11fdf7f2 10409 ceph_assert(!curdir->has_bloom());
7c673cae
FG
10410 } else if (dis->get_want().depth() > 0) {
10411 // lookup dentry
10412 dn = curdir->lookup(dis->get_dentry(i), snapid);
10413 } else
10414 break; // done!
10415
10416 // incomplete dir?
10417 if (!dn) {
31f18b77 10418 if (!curdir->is_complete() &&
11fdf7f2
TL
10419 !(snapid == CEPH_NOSNAP &&
10420 curdir->has_bloom() &&
10421 !curdir->is_in_bloom(dis->get_dentry(i)))) {
7c673cae
FG
10422 // readdir
10423 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10424 if (reply->is_empty()) {
10425 // fetch and wait
10426 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10427 dis->wants_base_dir() && curdir->get_version() == 0);
7c673cae
FG
10428 return;
10429 } else {
10430 // initiate fetch, but send what we have so far
10431 curdir->fetch(0);
10432 break;
10433 }
10434 }
10435
11fdf7f2
TL
10436 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10437 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10438 << " dne, non-empty reply, stopping" << dendl;
10439 break;
10440 }
10441
7c673cae
FG
10442 // send null dentry
10443 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10444 << *curdir << dendl;
11fdf7f2
TL
10445 if (snapid == CEPH_NOSNAP)
10446 dn = curdir->add_null_dentry(dis->get_dentry(i));
10447 else
10448 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
7c673cae 10449 }
11fdf7f2 10450 ceph_assert(dn);
7c673cae 10451
31f18b77
FG
10452 // don't add replica to purging dentry/inode
10453 if (dn->state_test(CDentry::STATE_PURGING)) {
10454 if (reply->is_empty())
10455 reply->set_flag_error_dn(dis->get_dentry(i));
10456 break;
10457 }
10458
7c673cae
FG
10459 CDentry::linkage_t *dnl = dn->get_linkage();
10460
10461 // xlocked dentry?
10462 // ...always block on non-tail items (they are unrelated)
10463 // ...allow xlocked tail discovery _only_ if explicitly requested
7c673cae
FG
10464 if (dn->lock.is_xlocked()) {
10465 // is this the last (tail) item in the discover traversal?
9f95a23c
TL
10466 if (dis->is_path_locked()) {
10467 dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
7c673cae
FG
10468 } else if (reply->is_empty()) {
10469 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10470 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10471 return;
10472 } else {
10473 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10474 break;
10475 }
10476 }
10477
10478 // frozen inode?
9f95a23c 10479 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
7c673cae 10480 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
9f95a23c 10481 if (tailitem && dis->is_path_locked()) {
7c673cae
FG
10482 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10483 } else if (reply->is_empty()) {
10484 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10485 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10486 return;
10487 } else {
10488 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10489 break;
10490 }
10491 }
10492
10493 // add dentry
10494 if (!reply->trace.length())
10495 reply->starts_with = MDiscoverReply::DENTRY;
9f95a23c 10496 encode_replica_dentry(dn, from, reply->trace);
7c673cae
FG
10497 dout(7) << "handle_discover added dentry " << *dn << dendl;
10498
10499 if (!dnl->is_primary()) break; // stop on null or remote link.
10500
10501 // add inode
10502 CInode *next = dnl->get_inode();
11fdf7f2 10503 ceph_assert(next->is_auth());
7c673cae 10504
9f95a23c 10505 encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
7c673cae
FG
10506 dout(7) << "handle_discover added inode " << *next << dendl;
10507
10508 // descend, keep going.
10509 cur = next;
10510 continue;
10511 }
10512
10513 // how did we do?
11fdf7f2 10514 ceph_assert(!reply->is_empty());
7c673cae
FG
10515 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10516 mds->send_message(reply, dis->get_connection());
7c673cae
FG
10517}
10518
9f95a23c 10519void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
7c673cae
FG
10520{
10521 /*
10522 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10523 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
7c673cae
FG
10524 return;
10525 }
10526 */
10527 dout(7) << "discover_reply " << *m << dendl;
10528 if (m->is_flag_error_dir())
10529 dout(7) << " flag error, dir" << dendl;
10530 if (m->is_flag_error_dn())
10531 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10532
11fdf7f2 10533 MDSContext::vec finished, error;
7c673cae
FG
10534 mds_rank_t from = mds_rank_t(m->get_source().num());
10535
10536 // starting point
10537 CInode *cur = get_inode(m->get_base_ino());
11fdf7f2 10538 auto p = m->trace.cbegin();
7c673cae
FG
10539
10540 int next = m->starts_with;
10541
10542 // decrement discover counters
10543 if (m->get_tid()) {
10544 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10545 if (p != discovers.end()) {
10546 dout(10) << " found tid " << m->get_tid() << dendl;
10547 discovers.erase(p);
10548 } else {
10549 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10550 }
10551 }
10552
10553 // discover may start with an inode
10554 if (!p.end() && next == MDiscoverReply::INODE) {
9f95a23c 10555 decode_replica_inode(cur, p, NULL, finished);
7c673cae 10556 dout(7) << "discover_reply got base inode " << *cur << dendl;
11fdf7f2 10557 ceph_assert(cur->is_base());
7c673cae
FG
10558
10559 next = MDiscoverReply::DIR;
10560
10561 // take waiters?
10562 if (cur->is_base() &&
10563 waiting_for_base_ino[from].count(cur->ino())) {
10564 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10565 waiting_for_base_ino[from].erase(cur->ino());
10566 }
10567 }
11fdf7f2 10568 ceph_assert(cur);
7c673cae
FG
10569
10570 // loop over discover results.
10571 // indexes follow each ([[dir] dentry] inode)
10572 // can start, end with any type.
10573 while (!p.end()) {
10574 // dir
10575 frag_t fg;
9f95a23c 10576 CDir *curdir = nullptr;
7c673cae 10577 if (next == MDiscoverReply::DIR) {
9f95a23c 10578 decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
7c673cae 10579 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
11fdf7f2 10580 ceph_assert(m->get_wanted_base_dir());
7c673cae
FG
10581 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10582 }
10583 } else {
10584 // note: this can only happen on our first pass around this loop.
10585 if (p.end() && m->is_flag_error_dn()) {
10586 fg = cur->pick_dirfrag(m->get_error_dentry());
10587 curdir = cur->get_dirfrag(fg);
10588 } else
10589 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10590 }
10591
10592 if (p.end())
10593 break;
10594
10595 // dentry
9f95a23c
TL
10596 CDentry *dn = nullptr;
10597 decode_replica_dentry(dn, p, curdir, finished);
7c673cae
FG
10598
10599 if (p.end())
10600 break;
10601
10602 // inode
9f95a23c 10603 decode_replica_inode(cur, p, dn, finished);
7c673cae
FG
10604
10605 next = MDiscoverReply::DIR;
10606 }
10607
10608 // dir error?
10609 // or dir_auth hint?
10610 if (m->is_flag_error_dir() && !cur->is_dir()) {
10611 // not a dir.
10612 cur->take_waiting(CInode::WAIT_DIR, error);
10613 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10614 mds_rank_t who = m->get_dir_auth_hint();
10615 if (who == mds->get_nodeid()) who = -1;
10616 if (who >= 0)
10617 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10618
7c673cae
FG
10619
10620 if (m->get_wanted_base_dir()) {
31f18b77
FG
10621 frag_t fg = m->get_base_dir_frag();
10622 CDir *dir = cur->get_dirfrag(fg);
10623
7c673cae
FG
10624 if (cur->is_waiting_for_dir(fg)) {
10625 if (cur->is_auth())
10626 cur->take_waiting(CInode::WAIT_DIR, finished);
10627 else if (dir || !cur->dirfragtree.is_leaf(fg))
10628 cur->take_dir_waiting(fg, finished);
10629 else
10630 discover_dir_frag(cur, fg, 0, who);
10631 } else
10632 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10633 }
10634
10635 // try again?
10636 if (m->get_error_dentry().length()) {
31f18b77
FG
10637 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10638 CDir *dir = cur->get_dirfrag(fg);
7c673cae
FG
10639 // wanted a dentry
10640 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10641 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10642 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10643 m->get_wanted_snapid(), finished);
10644 } else {
10645 filepath relpath(m->get_error_dentry(), 0);
9f95a23c 10646 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
7c673cae
FG
10647 }
10648 } else
10649 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10650 << m->get_error_dentry() << dendl;
10651 }
31f18b77
FG
10652 } else if (m->is_flag_error_dn()) {
10653 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10654 CDir *dir = cur->get_dirfrag(fg);
10655 if (dir) {
10656 if (dir->is_auth()) {
10657 dir->take_sub_waiting(finished);
10658 } else {
10659 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10660 m->get_wanted_snapid(), error);
10661 }
10662 }
7c673cae
FG
10663 }
10664
10665 // waiters
10666 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10667 mds->queue_waiters(finished);
7c673cae
FG
10668}
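// The reply's 'trace' is a flat stream of repeating ([dir] dentry inode)
// groups; 'starts_with' names the type of the very first item (the base-inode
// case is handled before the loop above), and decoding just pulls items in
// that order until the iterator runs out. A self-contained sketch of the same
// walking order (hypothetical names; strings stand in for the encoded items):
struct SketchTrace {
  std::vector<std::string> items;
  size_t pos = 0;
  bool end() const { return pos >= items.size(); }
  const std::string& next() { return items[pos++]; }
};
static void sketch_walk_trace(SketchTrace& p, bool starts_with_dir)
{
  bool expect_dir = starts_with_dir;
  while (!p.end()) {
    if (expect_dir)
      p.next();          // dir (otherwise the dir is looked up locally)
    if (p.end()) break;
    p.next();            // dentry
    if (p.end()) break;
    p.next();            // inode
    expect_dir = true;   // every later group begins with a dir
  }
}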
10669
10670
10671
10672// ----------------------------
10673// REPLICAS
10674
b32b8144 10675
9f95a23c 10676void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
b32b8144 10677{
9f95a23c 10678 ENCODE_START(1, 1, bl);
b32b8144 10679 dirfrag_t df = dir->dirfrag();
11fdf7f2 10680 encode(df, bl);
9f95a23c
TL
10681 __u32 nonce = dir->add_replica(to);
10682 encode(nonce, bl);
10683 dir->_encode_base(bl);
10684 ENCODE_FINISH(bl);
b32b8144
FG
10685}
10686
9f95a23c 10687void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
b32b8144 10688{
9f95a23c 10689 ENCODE_START(1, 1, bl);
11fdf7f2
TL
10690 encode(dn->get_name(), bl);
10691 encode(dn->last, bl);
9f95a23c
TL
10692
10693 __u32 nonce = dn->add_replica(to);
10694 encode(nonce, bl);
10695 encode(dn->first, bl);
10696 encode(dn->linkage.remote_ino, bl);
10697 encode(dn->linkage.remote_d_type, bl);
10698 dn->lock.encode_state_for_replica(bl);
10699 bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
10700 encode(need_recover, bl);
10701 ENCODE_FINISH(bl);
b32b8144
FG
10702}
10703
9f95a23c 10704void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
b32b8144
FG
10705 uint64_t features)
10706{
f6b5b4d7 10707 ENCODE_START(2, 1, bl);
9f95a23c 10708 ceph_assert(in->is_auth());
11fdf7f2
TL
10709 encode(in->inode.ino, bl); // bleh, minor asymmetry here
10710 encode(in->last, bl);
9f95a23c
TL
10711
10712 __u32 nonce = in->add_replica(to);
10713 encode(nonce, bl);
10714
10715 in->_encode_base(bl, features);
10716 in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
f6b5b4d7
TL
10717
10718 __u32 state = in->state;
10719 encode(state, bl);
10720
9f95a23c 10721 ENCODE_FINISH(bl);
b32b8144
FG
10722}
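// encode_replica_inode() brackets its payload with ENCODE_START(2, 1, bl) /
// ENCODE_FINISH(bl), and decode_replica_inode() only reads the extra state
// word when struct_v >= 2; that is the usual way to grow a wire format while
// staying decodable against peers that wrote the older version. A simplified
// standalone analogue with a hand-rolled version byte (plain std types; this
// is not Ceph's ENCODE/DECODE machinery):
struct SketchBlob {
  std::vector<unsigned char> bytes;
  size_t pos = 0;
};
static void sketch_encode_v2(SketchBlob& b, unsigned char base,
                             unsigned char extra_state)
{
  b.bytes.push_back(2);            // struct_v: the writer's current version
  b.bytes.push_back(base);         // field present since v1
  b.bytes.push_back(extra_state);  // field added in v2
}
static void sketch_decode(SketchBlob& b, unsigned char& base,
                          unsigned char& extra_state)
{
  unsigned char struct_v = b.bytes[b.pos++];
  base = b.bytes[b.pos++];
  extra_state = 0;
  if (struct_v >= 2)               // older writers simply omit this field
    extra_state = b.bytes[b.pos++];
}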
10723
9f95a23c 10724void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
11fdf7f2 10725 MDSContext::vec& finished)
7c673cae 10726{
9f95a23c 10727 DECODE_START(1, p);
7c673cae 10728 dirfrag_t df;
11fdf7f2 10729 decode(df, p);
7c673cae 10730
11fdf7f2 10731 ceph_assert(diri->ino() == df.ino);
7c673cae
FG
10732
10733 // add it (_replica_)
9f95a23c 10734 dir = diri->get_dirfrag(df.frag);
7c673cae
FG
10735
10736 if (dir) {
10737 // had replica. update w/ new nonce.
9f95a23c
TL
10738 __u32 nonce;
10739 decode(nonce, p);
10740 dir->set_replica_nonce(nonce);
10741 dir->_decode_base(p);
10742 dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
7c673cae
FG
10743 } else {
10744 // force frag to leaf in the diri tree
10745 if (!diri->dirfragtree.is_leaf(df.frag)) {
9f95a23c 10746 dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
7c673cae
FG
10747 << diri->dirfragtree << dendl;
10748 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10749 }
7c673cae
FG
10750 // add replica.
10751 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
9f95a23c
TL
10752 __u32 nonce;
10753 decode(nonce, p);
10754 dir->set_replica_nonce(nonce);
10755 dir->_decode_base(p);
7c673cae
FG
10756 // is this a dir_auth delegation boundary?
10757 if (from != diri->authority().first ||
10758 diri->is_ambiguous_auth() ||
10759 diri->is_base())
10760 adjust_subtree_auth(dir, from);
10761
9f95a23c 10762 dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
7c673cae
FG
10763 // get waiters
10764 diri->take_dir_waiting(df.frag, finished);
10765 }
9f95a23c 10766 DECODE_FINISH(p);
7c673cae
FG
10767}
10768
9f95a23c 10769void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
7c673cae 10770{
9f95a23c 10771 DECODE_START(1, p);
7c673cae
FG
10772 string name;
10773 snapid_t last;
11fdf7f2
TL
10774 decode(name, p);
10775 decode(last, p);
7c673cae 10776
9f95a23c 10777 dn = dir->lookup(name, last);
7c673cae
FG
10778
10779 // have it?
9f95a23c 10780 bool is_new = false;
7c673cae 10781 if (dn) {
9f95a23c
TL
10782 is_new = false;
10783 dout(7) << __func__ << " had " << *dn << dendl;
7c673cae 10784 } else {
9f95a23c 10785 is_new = true;
7c673cae 10786 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
9f95a23c 10787 dout(7) << __func__ << " added " << *dn << dendl;
7c673cae 10788 }
9f95a23c
TL
10789
10790 __u32 nonce;
10791 decode(nonce, p);
10792 dn->set_replica_nonce(nonce);
10793 decode(dn->first, p);
7c673cae 10794
9f95a23c
TL
10795 inodeno_t rino;
10796 unsigned char rdtype;
10797 decode(rino, p);
10798 decode(rdtype, p);
10799 dn->lock.decode_state(p, is_new);
7c673cae 10800
9f95a23c
TL
10801 bool need_recover;
10802 decode(need_recover, p);
10803
10804 if (is_new) {
10805 if (rino)
10806 dir->link_remote_inode(dn, rino, rdtype);
10807 if (need_recover)
10808 dn->lock.mark_need_recover();
10809 }
10810
10811 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10812 DECODE_FINISH(p);
7c673cae
FG
10813}
10814
9f95a23c 10815void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
7c673cae 10816{
f6b5b4d7 10817 DECODE_START(2, p);
7c673cae
FG
10818 inodeno_t ino;
10819 snapid_t last;
9f95a23c 10820 __u32 nonce;
11fdf7f2
TL
10821 decode(ino, p);
10822 decode(last, p);
9f95a23c
TL
10823 decode(nonce, p);
10824 in = get_inode(ino, last);
7c673cae
FG
10825 if (!in) {
10826 in = new CInode(this, false, 1, last);
9f95a23c
TL
10827 in->set_replica_nonce(nonce);
10828 in->_decode_base(p);
10829 in->_decode_locks_state_for_replica(p, true);
7c673cae 10830 add_inode(in);
ec96510d 10831 if (in->ino() == CEPH_INO_ROOT)
7c673cae
FG
10832 in->inode_auth.first = 0;
10833 else if (in->is_mdsdir())
10834 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
9f95a23c 10835 dout(10) << __func__ << " added " << *in << dendl;
7c673cae 10836 if (dn) {
11fdf7f2 10837 ceph_assert(dn->get_linkage()->is_null());
7c673cae
FG
10838 dn->dir->link_primary_inode(dn, in);
10839 }
10840 } else {
9f95a23c
TL
10841 in->set_replica_nonce(nonce);
10842 in->_decode_base(p);
10843 in->_decode_locks_state_for_replica(p, false);
10844 dout(10) << __func__ << " had " << *in << dendl;
7c673cae
FG
10845 }
10846
10847 if (dn) {
10848 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
9f95a23c 10849 dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
7c673cae 10850 }
f6b5b4d7
TL
10851
10852 if (struct_v >= 2) {
10853 __u32 s;
10854 decode(s, p);
10855 s &= CInode::MASK_STATE_REPLICATED;
10856 if (s & CInode::STATE_RANDEPHEMERALPIN) {
10857 dout(10) << "replica inode is random ephemeral pinned" << dendl;
10858 in->set_ephemeral_rand(true);
10859 }
10860 }
10861
9f95a23c 10862 DECODE_FINISH(p);
7c673cae
FG
10863}
10864
10865
9f95a23c 10866void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
7c673cae 10867{
9f95a23c 10868 ENCODE_START(1, 1, bl);
7c673cae 10869 uint64_t features = mds->mdsmap->get_up_features();
9f95a23c
TL
10870 encode_replica_inode(get_myin(), who, bl, features);
10871 encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10872 encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10873 encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
10874 encode_replica_dir(straydn->get_dir(), who, bl);
10875 encode_replica_dentry(straydn, who, bl);
10876 ENCODE_FINISH(bl);
7c673cae
FG
10877}
10878
9f95a23c 10879void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
7c673cae 10880{
11fdf7f2
TL
10881 MDSContext::vec finished;
10882 auto p = bl.cbegin();
7c673cae 10883
9f95a23c
TL
10884 DECODE_START(1, p);
10885 CInode *mdsin = nullptr;
10886 decode_replica_inode(mdsin, p, NULL, finished);
10887 CDir *mdsdir = nullptr;
10888 decode_replica_dir(mdsdir, p, mdsin, from, finished);
10889 CDentry *straydirdn = nullptr;
10890 decode_replica_dentry(straydirdn, p, mdsdir, finished);
10891 CInode *strayin = nullptr;
10892 decode_replica_inode(strayin, p, straydirdn, finished);
10893 CDir *straydir = nullptr;
10894 decode_replica_dir(straydir, p, strayin, from, finished);
10895
10896 decode_replica_dentry(straydn, p, straydir, finished);
7c673cae
FG
10897 if (!finished.empty())
10898 mds->queue_waiters(finished);
9f95a23c 10899 DECODE_FINISH(p);
7c673cae
FG
10900}
10901
10902
10903int MDCache::send_dir_updates(CDir *dir, bool bcast)
10904{
10905 // this is an FYI, re: replication
10906
10907 set<mds_rank_t> who;
10908 if (bcast) {
10909 mds->get_mds_map()->get_active_mds_set(who);
10910 } else {
181888fb
FG
10911 for (const auto &p : dir->get_replicas()) {
10912 who.insert(p.first);
10913 }
7c673cae
FG
10914 }
10915
10916 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10917
10918 filepath path;
10919 dir->inode->make_path(path);
10920
10921 mds_rank_t whoami = mds->get_nodeid();
10922 for (set<mds_rank_t>::iterator it = who.begin();
10923 it != who.end();
10924 ++it) {
10925 if (*it == whoami) continue;
10926 //if (*it == except) continue;
10927 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10928
94b18763
FG
10929 std::set<int32_t> s;
10930 for (const auto &r : dir->dir_rep_by) {
10931 s.insert(r);
10932 }
9f95a23c 10933 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
7c673cae
FG
10934 }
10935
10936 return 0;
10937}
10938
9f95a23c 10939void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
7c673cae 10940{
224ce89b
WB
10941 dirfrag_t df = m->get_dirfrag();
10942 CDir *dir = get_dirfrag(df);
7c673cae 10943 if (!dir) {
224ce89b 10944 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10945
10946 // discover it?
10947 if (m->should_discover()) {
10948 // only try once!
10949 // this is key to avoid a fragtree update race, among other things.
224ce89b 10950 m->inc_tried_discover();
7c673cae
FG
10951 vector<CDentry*> trace;
10952 CInode *in;
10953 filepath path = m->get_path();
10954 dout(5) << "trying discover on dir_update for " << path << dendl;
11fdf7f2 10955 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 10956 MDRequestRef null_ref;
9f95a23c 10957 int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
7c673cae
FG
10958 if (r > 0)
10959 return;
224ce89b
WB
10960 if (r == 0 &&
10961 in->ino() == df.ino &&
10962 in->get_approx_dirfrag(df.frag) == NULL) {
10963 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10964 return;
10965 }
7c673cae
FG
10966 }
10967
7c673cae
FG
10968 return;
10969 }
10970
224ce89b
WB
10971 if (!m->has_tried_discover()) {
10972 // Update if it already exists. Otherwise it got updated by the discover reply.
10973 dout(5) << "dir_update on " << *dir << dendl;
10974 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10975 dir->dir_rep_by.clear();
10976 for (const auto &e : m->get_dir_rep_by()) {
10977 dir->dir_rep_by.insert(e);
10978 }
224ce89b 10979 }
7c673cae
FG
10980}
10981
10982
10983
10984
10985
10986// LINK
10987
9f95a23c
TL
10988void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10989{
10990 ENCODE_START(1, 1, bl);
10991 inodeno_t ino = dnl->get_remote_ino();
10992 encode(ino, bl);
10993 __u8 d_type = dnl->get_remote_d_type();
10994 encode(d_type, bl);
10995 ENCODE_FINISH(bl);
10996}
10997
10998void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10999{
11000 DECODE_START(1, p);
11001 inodeno_t ino;
11002 __u8 d_type;
11003 decode(ino, p);
11004 decode(d_type, p);
11005 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
11006 dir->link_remote_inode(dn, ino, d_type);
11007 DECODE_FINISH(p);
11008}
11009
7c673cae
FG
11010void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
11011{
9f95a23c 11012 dout(7) << __func__ << " " << *dn << dendl;
7c673cae
FG
11013
11014 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 11015 for (const auto &p : dn->get_replicas()) {
7c673cae 11016 // don't tell (rename) witnesses; they already know
181888fb 11017 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 11018 continue;
181888fb
FG
11019 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11020 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11021 rejoin_gather.count(p.first)))
7c673cae
FG
11022 continue;
11023 CDentry::linkage_t *dnl = dn->get_linkage();
9f95a23c 11024 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
7c673cae 11025 if (dnl->is_primary()) {
9f95a23c
TL
11026 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
11027 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
11028 mds->mdsmap->get_up_features());
11029 } else if (dnl->is_remote()) {
9f95a23c 11030 encode_remote_dentry_link(dnl, m->bl);
7c673cae
FG
11031 } else
11032 ceph_abort(); // aie, bad caller!
181888fb 11033 mds->send_message_mds(m, p.first);
7c673cae
FG
11034 }
11035}
11036
9f95a23c 11037void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
7c673cae 11038{
7c673cae
FG
11039 CDentry *dn = NULL;
11040 CDir *dir = get_dirfrag(m->get_dirfrag());
11041 if (!dir) {
9f95a23c 11042 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
7c673cae
FG
11043 } else {
11044 dn = dir->lookup(m->get_dn());
11045 if (!dn) {
9f95a23c 11046 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
7c673cae 11047 } else {
9f95a23c 11048 dout(7) << __func__ << " on " << *dn << dendl;
7c673cae
FG
11049 CDentry::linkage_t *dnl = dn->get_linkage();
11050
11fdf7f2
TL
11051 ceph_assert(!dn->is_auth());
11052 ceph_assert(dnl->is_null());
7c673cae
FG
11053 }
11054 }
11055
11fdf7f2
TL
11056 auto p = m->bl.cbegin();
11057 MDSContext::vec finished;
7c673cae
FG
11058 if (dn) {
11059 if (m->get_is_primary()) {
11060 // primary link.
9f95a23c
TL
11061 CInode *in = nullptr;
11062 decode_replica_inode(in, p, dn, finished);
7c673cae
FG
11063 } else {
11064 // remote link, easy enough.
9f95a23c 11065 decode_remote_dentry_link(dir, dn, p);
7c673cae
FG
11066 }
11067 } else {
11068 ceph_abort();
11069 }
11070
11071 if (!finished.empty())
11072 mds->queue_waiters(finished);
11073
7c673cae
FG
11074 return;
11075}
11076
11077
11078// UNLINK
11079
11080void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11081{
9f95a23c 11082 dout(10) << __func__ << " " << *dn << dendl;
7c673cae
FG
11083 // share unlink news with replicas
11084 set<mds_rank_t> replicas;
11085 dn->list_replicas(replicas);
11fdf7f2
TL
11086 bufferlist snapbl;
11087 if (straydn) {
7c673cae 11088 straydn->list_replicas(replicas);
11fdf7f2
TL
11089 CInode *strayin = straydn->get_linkage()->get_inode();
11090 strayin->encode_snap_blob(snapbl);
11091 }
7c673cae
FG
11092 for (set<mds_rank_t>::iterator it = replicas.begin();
11093 it != replicas.end();
11094 ++it) {
11095 // don't tell (rmdir) witnesses; they already know
11096 if (mdr.get() && mdr->more()->witnessed.count(*it))
11097 continue;
11098
11099 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11100 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11101 rejoin_gather.count(*it)))
11102 continue;
11103
9f95a23c 11104 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11fdf7f2 11105 if (straydn) {
9f95a23c 11106 encode_replica_stray(straydn, *it, unlink->straybl);
11fdf7f2
TL
11107 unlink->snapbl = snapbl;
11108 }
7c673cae
FG
11109 mds->send_message_mds(unlink, *it);
11110 }
11111}
11112
9f95a23c 11113void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
7c673cae
FG
11114{
11115 // straydn
9f95a23c 11116 CDentry *straydn = nullptr;
7c673cae 11117 if (m->straybl.length())
9f95a23c 11118 decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));
7c673cae
FG
11119
11120 CDir *dir = get_dirfrag(m->get_dirfrag());
11121 if (!dir) {
9f95a23c 11122 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
7c673cae
FG
11123 } else {
11124 CDentry *dn = dir->lookup(m->get_dn());
11125 if (!dn) {
9f95a23c 11126 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
7c673cae 11127 } else {
9f95a23c 11128 dout(7) << __func__ << " on " << *dn << dendl;
7c673cae
FG
11129 CDentry::linkage_t *dnl = dn->get_linkage();
11130
11131 // open inode?
11132 if (dnl->is_primary()) {
11133 CInode *in = dnl->get_inode();
11134 dn->dir->unlink_inode(dn);
11fdf7f2 11135 ceph_assert(straydn);
7c673cae
FG
11136 straydn->dir->link_primary_inode(straydn, in);
11137
11138 // in->first is lazily updated on replica; drag it forward so
11139 // that we always keep it in sync with the dnq
11fdf7f2 11140 ceph_assert(straydn->first >= in->first);
7c673cae
FG
11141 in->first = straydn->first;
11142
11143 // update subtree map?
11144 if (in->is_dir())
11145 adjust_subtree_after_rename(in, dir, false);
11146
11fdf7f2
TL
11147 if (m->snapbl.length()) {
11148 bool hadrealm = (in->snaprealm ? true : false);
11149 in->decode_snap_blob(m->snapbl);
11150 ceph_assert(in->snaprealm);
11151 ceph_assert(in->snaprealm->have_past_parents_open());
11152 if (!hadrealm)
11153 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
11154 }
11155
7c673cae
FG
11156 // send caps to auth (if we're not already)
11157 if (in->is_any_caps() &&
11158 !in->state_test(CInode::STATE_EXPORTINGCAPS))
11159 migrator->export_caps(in);
11160
7c673cae
FG
11161 straydn = NULL;
11162 } else {
11fdf7f2
TL
11163 ceph_assert(!straydn);
11164 ceph_assert(dnl->is_remote());
7c673cae
FG
11165 dn->dir->unlink_inode(dn);
11166 }
11fdf7f2 11167 ceph_assert(dnl->is_null());
7c673cae
FG
11168 }
11169 }
11170
11171 // race with trim_dentry()
11172 if (straydn) {
11fdf7f2
TL
11173 ceph_assert(straydn->get_num_ref() == 0);
11174 ceph_assert(straydn->get_linkage()->is_null());
11175 expiremap ex;
11176 trim_dentry(straydn, ex);
11177 send_expire_messages(ex);
7c673cae 11178 }
7c673cae
FG
11179}
11180
11181
11182
11183
11184
11185
11186// ===================================================================
11187
11188
11189
11190// ===================================================================
11191// FRAGMENT
11192
11193
11194/**
11195 * adjust_dir_fragments -- adjust fragmentation for a directory
11196 *
11197 * @param diri directory inode
11198 * @param basefrag base fragment
11199 * @param bits bit adjustment. positive for split, zero or negative for merge.
11200 */
11201void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
9f95a23c 11202 std::vector<CDir*>* resultfrags,
11fdf7f2 11203 MDSContext::vec& waiters,
7c673cae
FG
11204 bool replay)
11205{
11206 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11207 << " on " << *diri << dendl;
11208
9f95a23c 11209 auto&& p = diri->get_dirfrags_under(basefrag);
7c673cae 11210
9f95a23c 11211 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
7c673cae
FG
11212}
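// As the doc comment above says, a positive 'bits' splits 'basefrag' and a
// zero/negative value merges the source frags back into it. With Ceph's
// power-of-two dirfrag scheme a split by 'bits' is expected to yield 2^bits
// result fragments, while a merge collapses everything to one. A tiny
// arithmetic sketch of that expectation (plain integers, not frag_t):
static unsigned sketch_expected_result_frags(int bits)
{
  if (bits > 0)
    return 1u << bits;   // split: e.g. bits == 3 -> 8 child fragments
  return 1;              // merge (or no-op): everything collapses to basefrag
}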
11213
11214CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
11215{
11216 CDir *dir = diri->get_dirfrag(fg);
11217 if (dir)
11218 return dir;
11219
11220 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
11221
9f95a23c 11222 std::vector<CDir*> src, result;
11fdf7f2 11223 MDSContext::vec waiters;
7c673cae
FG
11224
11225 // split a parent?
11226 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
11227 while (1) {
11228 CDir *pdir = diri->get_dirfrag(parent);
11229 if (pdir) {
11230 int split = fg.bits() - parent.bits();
11231 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
11232 src.push_back(pdir);
9f95a23c 11233 adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
7c673cae
FG
11234 dir = diri->get_dirfrag(fg);
11235 if (dir) {
11236 dout(10) << "force_dir_fragment result " << *dir << dendl;
11237 break;
11238 }
11239 }
11240 if (parent == frag_t())
11241 break;
11242 frag_t last = parent;
11243 parent = parent.parent();
11244 dout(10) << " " << last << " parent is " << parent << dendl;
11245 }
11246
11247 if (!dir) {
11248 // hoover up things under fg?
9f95a23c
TL
11249 {
11250 auto&& p = diri->get_dirfrags_under(fg);
11251 src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
11252 }
7c673cae
FG
11253 if (src.empty()) {
11254 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
11255 } else {
11256 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
9f95a23c 11257 adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
7c673cae
FG
11258 dir = result.front();
11259 dout(10) << "force_dir_fragment result " << *dir << dendl;
11260 }
11261 }
11262 if (!replay)
11263 mds->queue_waiters(waiters);
11264 return dir;
11265}
11266
11267void MDCache::adjust_dir_fragments(CInode *diri,
9f95a23c 11268 const std::vector<CDir*>& srcfrags,
7c673cae 11269 frag_t basefrag, int bits,
9f95a23c 11270 std::vector<CDir*>* resultfrags,
11fdf7f2 11271 MDSContext::vec& waiters,
7c673cae
FG
11272 bool replay)
11273{
11274 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
11275 << " srcfrags " << srcfrags
11276 << " on " << *diri << dendl;
11277
11278 // adjust fragtree
11279 // yuck. we may have discovered the inode while it was being fragmented.
11280 if (!diri->dirfragtree.is_leaf(basefrag))
11281 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
11282
11283 if (bits > 0)
11284 diri->dirfragtree.split(basefrag, bits);
11285 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
11286
11287 if (srcfrags.empty())
11288 return;
11289
11290 // split
11291 CDir *parent_dir = diri->get_parent_dir();
11292 CDir *parent_subtree = 0;
11293 if (parent_dir)
11294 parent_subtree = get_subtree_root(parent_dir);
11295
9f95a23c 11296 ceph_assert(srcfrags.size() >= 1);
7c673cae
FG
11297 if (bits > 0) {
11298 // SPLIT
11fdf7f2 11299 ceph_assert(srcfrags.size() == 1);
7c673cae
FG
11300 CDir *dir = srcfrags.front();
11301
11302 dir->split(bits, resultfrags, waiters, replay);
11303
11304 // did i change the subtree map?
11305 if (dir->is_subtree_root()) {
11306 // new frags are now separate subtrees
9f95a23c
TL
11307 for (const auto& dir : *resultfrags) {
11308 subtrees[dir].clear(); // new frag is now its own subtree
11309 }
7c673cae
FG
11310
11311 // was i a bound?
11312 if (parent_subtree) {
11fdf7f2 11313 ceph_assert(subtrees[parent_subtree].count(dir));
7c673cae 11314 subtrees[parent_subtree].erase(dir);
9f95a23c
TL
11315 for (const auto& dir : *resultfrags) {
11316 ceph_assert(dir->is_subtree_root());
11317 subtrees[parent_subtree].insert(dir);
7c673cae
FG
11318 }
11319 }
11320
11321 // adjust my bounds.
11322 set<CDir*> bounds;
11323 bounds.swap(subtrees[dir]);
11324 subtrees.erase(dir);
11325 for (set<CDir*>::iterator p = bounds.begin();
11326 p != bounds.end();
11327 ++p) {
11328 CDir *frag = get_subtree_root((*p)->get_parent_dir());
11329 subtrees[frag].insert(*p);
11330 }
11331
11332 show_subtrees(10);
7c673cae
FG
11333 }
11334
11335 diri->close_dirfrag(dir->get_frag());
11336
11337 } else {
11338 // MERGE
11339
11340 // are my constituent bits subtrees? if so, i will be too.
11341 // (it's all or none, actually.)
11fdf7f2 11342 bool any_subtree = false, any_non_subtree = false;
9f95a23c 11343 for (const auto& dir : srcfrags) {
11fdf7f2 11344 if (dir->is_subtree_root())
31f18b77 11345 any_subtree = true;
11fdf7f2
TL
11346 else
11347 any_non_subtree = true;
31f18b77 11348 }
11fdf7f2
TL
11349 ceph_assert(!any_subtree || !any_non_subtree);
11350
31f18b77
FG
11351 set<CDir*> new_bounds;
11352 if (any_subtree) {
9f95a23c 11353 for (const auto& dir : srcfrags) {
31f18b77
FG
11354 // this simplifies the code that finds subtrees underneath the dirfrag
11355 if (!dir->is_subtree_root()) {
11356 dir->state_set(CDir::STATE_AUXSUBTREE);
11357 adjust_subtree_auth(dir, mds->get_nodeid());
11358 }
11359 }
11360
9f95a23c 11361 for (const auto& dir : srcfrags) {
11fdf7f2 11362 ceph_assert(dir->is_subtree_root());
7c673cae 11363 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
11364 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
11365 set<CDir*>::iterator r = q->second.begin();
11366 while (r != subtrees[dir].end()) {
11367 new_bounds.insert(*r);
11368 subtrees[dir].erase(r++);
11369 }
11370 subtrees.erase(q);
31f18b77 11371
7c673cae
FG
11372 // remove myself as my parent's bound
11373 if (parent_subtree)
11374 subtrees[parent_subtree].erase(dir);
11375 }
11376 }
11377
11378 // merge
11379 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
11380 f->merge(srcfrags, waiters, replay);
7c673cae 11381
31f18b77 11382 if (any_subtree) {
11fdf7f2 11383 ceph_assert(f->is_subtree_root());
7c673cae
FG
11384 subtrees[f].swap(new_bounds);
11385 if (parent_subtree)
11386 subtrees[parent_subtree].insert(f);
11387
11388 show_subtrees(10);
11389 }
11390
9f95a23c 11391 resultfrags->push_back(f);
7c673cae
FG
11392 }
11393}
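// A worked example of the bits convention used by adjust_dir_fragments():
// with bits = 2 the single source dirfrag is split into 2^2 = 4 resulting
// dirfrags, one per two-bit prefix (00, 01, 10, 11) of the dentry hash space;
// calling this path with bits <= 0 merges those leaves back into the single
// base frag. If the source frag was a subtree root, each resulting frag
// becomes its own subtree bounded under the parent subtree; on a merge, any
// subtree bounds carried by the source frags are re-attached to the merged frag.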
11394
11395
11396class C_MDC_FragmentFrozen : public MDSInternalContext {
11397 MDCache *mdcache;
11398 MDRequestRef mdr;
11399public:
11400 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11401 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11402 void finish(int r) override {
11403 mdcache->fragment_frozen(mdr, r);
11404 }
11405};
11406
9f95a23c 11407bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
7c673cae
FG
11408{
11409 if (is_readonly()) {
11410 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11411 return false;
11412 }
11413 if (mds->is_cluster_degraded()) {
11414 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11415 return false;
11416 }
11417 if (diri->get_parent_dir() &&
11418 diri->get_parent_dir()->get_inode()->is_stray()) {
11419 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11420 return false;
11421 }
ec96510d 11422 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == CEPH_INO_CEPH) {
7c673cae
FG
11423 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
11424 return false;
11425 }
11426
11427 if (diri->scrub_is_in_progress()) {
11428 dout(7) << "can_fragment: scrub in progress" << dendl;
11429 return false;
11430 }
11431
9f95a23c 11432 for (const auto& dir : dirs) {
7c673cae
FG
11433 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11434 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11435 return false;
11436 }
11437 if (!dir->is_auth()) {
11438 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11439 return false;
11440 }
11441 if (dir->is_bad()) {
11442 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11443 return false;
11444 }
11445 if (dir->is_frozen() ||
11446 dir->is_freezing()) {
11447 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11448 return false;
11449 }
11450 }
11451
11452 return true;
11453}
11454
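// split_dir: entry point (used, e.g., by the balancer) for splitting a single
// auth dirfrag by `bits`. It drops the request if can_fragment() says no, or
// if the resulting leaf frags would use more than 24 bits (e.g. a dirfrag
// already at 23 bits can be split by at most one more bit). Otherwise the
// operation is driven as an internal CEPH_MDS_OP_FRAGMENTDIR request tracked
// in fragments[], starting with fragment_freeze_dirs() and the
// mark+complete pass below.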
11455void MDCache::split_dir(CDir *dir, int bits)
11456{
11457 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11fdf7f2 11458 ceph_assert(dir->is_auth());
7c673cae
FG
11459 CInode *diri = dir->inode;
11460
9f95a23c 11461 std::vector<CDir*> dirs;
7c673cae
FG
11462 dirs.push_back(dir);
11463
11464 if (!can_fragment(diri, dirs)) {
11465 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11466 return;
11467 }
11468
31f18b77
FG
11469 if (dir->frag.bits() + bits > 24) {
11470 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11471 return;
11472 }
11473
7c673cae
FG
11474 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11475 mdr->more()->fragment_base = dir->dirfrag();
11476
11fdf7f2 11477 ceph_assert(fragments.count(dir->dirfrag()) == 0);
7c673cae
FG
11478 fragment_info_t& info = fragments[dir->dirfrag()];
11479 info.mdr = mdr;
11480 info.dirs.push_back(dir);
11481 info.bits = bits;
11482 info.last_cum_auth_pins_change = ceph_clock_now();
11483
11484 fragment_freeze_dirs(dirs);
11485 // initial mark+complete pass
11486 fragment_mark_and_complete(mdr);
11487}
11488
11489void MDCache::merge_dir(CInode *diri, frag_t frag)
11490{
11491 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11492
9f95a23c
TL
11493 auto&& [all, dirs] = diri->get_dirfrags_under(frag);
11494 if (!all) {
7c673cae
FG
11495 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11496 return;
11497 }
11498
11499 if (diri->dirfragtree.is_leaf(frag)) {
11500 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11501 return;
11502 }
11503
11504 if (!can_fragment(diri, dirs))
11505 return;
11506
11507 CDir *first = dirs.front();
11508 int bits = first->get_frag().bits() - frag.bits();
1911f103 11509 dout(10) << " we are merging by " << bits << " bits" << dendl;
7c673cae
FG
11510
11511 dirfrag_t basedirfrag(diri->ino(), frag);
11512 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11513 mdr->more()->fragment_base = basedirfrag;
11514
11fdf7f2 11515 ceph_assert(fragments.count(basedirfrag) == 0);
7c673cae
FG
11516 fragment_info_t& info = fragments[basedirfrag];
11517 info.mdr = mdr;
11518 info.dirs = dirs;
11519 info.bits = -bits;
11520 info.last_cum_auth_pins_change = ceph_clock_now();
11521
11522 fragment_freeze_dirs(dirs);
11523 // initial mark+complete pass
11524 fragment_mark_and_complete(mdr);
11525}
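// merge_dir records a negative bit count (info.bits = -bits, where bits is
// how many levels the existing leaves sit below the target frag), so the rest
// of the fragment pipeline can distinguish split from merge purely by the
// sign of info.bits.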
11526
9f95a23c 11527void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
7c673cae 11528{
11fdf7f2 11529 bool any_subtree = false, any_non_subtree = false;
9f95a23c 11530 for (const auto& dir : dirs) {
7c673cae
FG
11531 dir->auth_pin(dir); // until we mark and complete them
11532 dir->state_set(CDir::STATE_FRAGMENTING);
11533 dir->freeze_dir();
11fdf7f2
TL
11534 ceph_assert(dir->is_freezing_dir());
11535
11536 if (dir->is_subtree_root())
11537 any_subtree = true;
11538 else
11539 any_non_subtree = true;
11540 }
11541
11542 if (any_subtree && any_non_subtree) {
11543 // either all dirfrags are subtree roots or all are not.
9f95a23c 11544 for (const auto& dir : dirs) {
11fdf7f2
TL
11545 if (dir->is_subtree_root()) {
11546 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11547 } else {
11548 dir->state_set(CDir::STATE_AUXSUBTREE);
11549 adjust_subtree_auth(dir, mds->get_nodeid());
11550 }
11551 }
7c673cae
FG
11552 }
11553}
11554
11555class C_MDC_FragmentMarking : public MDCacheContext {
11556 MDRequestRef mdr;
11557public:
11558 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11559 void finish(int r) override {
11560 mdcache->fragment_mark_and_complete(mdr);
11561 }
11562};
11563
11564void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11565{
11566 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11567 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11568 if (it == fragments.end() || it->second.mdr != mdr) {
11569 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11570 request_finish(mdr);
11571 return;
11572 }
11573
11574 fragment_info_t& info = it->second;
11575 CInode *diri = info.dirs.front()->get_inode();
11576 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11577
11578 MDSGatherBuilder gather(g_ceph_context);
11579
9f95a23c 11580 for (const auto& dir : info.dirs) {
7c673cae
FG
11581 bool ready = true;
11582 if (!dir->is_complete()) {
11583 dout(15) << " fetching incomplete " << *dir << dendl;
11584 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11585 ready = false;
11586 } else if (dir->get_frag() == frag_t()) {
11587	      // The COMPLETE flag gets lost if we fragment a new dirfrag, then roll back
11588	      // the operation. To avoid CDir::fetch() complaining about a missing object,
11589	      // we commit the new dirfrag first.
11590 if (dir->state_test(CDir::STATE_CREATING)) {
11591 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11592 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11593 ready = false;
11594 } else if (dir->is_new()) {
11595 dout(15) << " committing new " << *dir << dendl;
11fdf7f2 11596 ceph_assert(dir->is_dirty());
7c673cae
FG
11597 dir->commit(0, gather.new_sub(), true);
11598 ready = false;
11599 }
11600 }
11601 if (!ready)
11602 continue;
11603
11604 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11605 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11606 for (auto &p : dir->items) {
11607 CDentry *dn = p.second;
7c673cae 11608 dn->get(CDentry::PIN_FRAGMENTING);
11fdf7f2 11609 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11610 dn->state_set(CDentry::STATE_FRAGMENTING);
11611 }
11612 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11613 dir->auth_unpin(dir);
11614 } else {
11615 dout(15) << " already marked " << *dir << dendl;
11616 }
11617 }
11618 if (gather.has_subs()) {
11619 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11620 gather.activate();
11621 return;
11622 }
11623
9f95a23c 11624 for (const auto& dir : info.dirs) {
7c673cae 11625 if (!dir->is_frozen_dir()) {
11fdf7f2 11626 ceph_assert(dir->is_freezing_dir());
7c673cae
FG
11627 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11628 }
11629 }
11630 if (gather.has_subs()) {
11631 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11632 gather.activate();
11633 // flush log so that request auth_pins are retired
11634 mds->mdlog->flush();
11635 return;
11636 }
11637
11638 fragment_frozen(mdr, 0);
11639}
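// fragment_mark_and_complete() is re-entered (via C_MDC_FragmentMarking)
// until every dirfrag involved has been fetched and has all of its dentries
// pinned with CDentry::PIN_FRAGMENTING, so they cannot be trimmed while the
// operation is in flight; once that is done it waits (via
// C_MDC_FragmentFrozen) for the freezes started in fragment_freeze_dirs() to
// finish and then calls fragment_frozen().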
11640
9f95a23c 11641void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
7c673cae
FG
11642{
11643 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
9f95a23c 11644 for (const auto& dir : dirs) {
7c673cae
FG
11645 dout(10) << " frag " << *dir << dendl;
11646
11fdf7f2 11647 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
7c673cae
FG
11648 dir->state_clear(CDir::STATE_FRAGMENTING);
11649
11650 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11651 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11652
94b18763
FG
11653 for (auto &p : dir->items) {
11654 CDentry *dn = p.second;
11fdf7f2 11655 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11656 dn->state_clear(CDentry::STATE_FRAGMENTING);
11657 dn->put(CDentry::PIN_FRAGMENTING);
11658 }
11659 } else {
11660 dir->auth_unpin(dir);
11661 }
11662
11663 dir->unfreeze_dir();
11664 }
11665}
11666
11667bool MDCache::fragment_are_all_frozen(CDir *dir)
11668{
11fdf7f2 11669 ceph_assert(dir->is_frozen_dir());
7c673cae
FG
11670 map<dirfrag_t,fragment_info_t>::iterator p;
11671 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11672 p != fragments.end() && p->first.ino == dir->ino();
11673 ++p) {
11674 if (p->first.frag.contains(dir->get_frag()))
11675 return p->second.all_frozen;
11676 }
11677 ceph_abort();
11678 return false;
11679}
11680
11681void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11682{
11683 map<dirfrag_t,fragment_info_t>::iterator p;
11684 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11685 p != fragments.end() && p->first.ino == dir->ino();
11686 ++p) {
11687 if (p->first.frag.contains(dir->get_frag())) {
11688 p->second.num_remote_waiters++;
11689 return;
11690 }
11691 }
11692 ceph_abort();
11693}
11694
11695void MDCache::find_stale_fragment_freeze()
11696{
11697 dout(10) << "find_stale_fragment_freeze" << dendl;
11698 // see comment in Migrator::find_stale_export_freeze()
11699 utime_t now = ceph_clock_now();
11700 utime_t cutoff = now;
11fdf7f2 11701 cutoff -= g_conf()->mds_freeze_tree_timeout;
7c673cae
FG
11702
11703 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11704 p != fragments.end(); ) {
11705 dirfrag_t df = p->first;
11706 fragment_info_t& info = p->second;
11707 ++p;
11708 if (info.all_frozen)
11709 continue;
11710 CDir *dir;
11711 int total_auth_pins = 0;
9f95a23c
TL
11712 for (const auto& d : info.dirs) {
11713 dir = d;
7c673cae
FG
11714 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11715 total_auth_pins = -1;
11716 break;
11717 }
11718 if (dir->is_frozen_dir())
11719 continue;
11720 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11721 }
11722 if (total_auth_pins < 0)
11723 continue;
11724 if (info.last_cum_auth_pins != total_auth_pins) {
11725 info.last_cum_auth_pins = total_auth_pins;
11726 info.last_cum_auth_pins_change = now;
11727 continue;
11728 }
11729 if (info.last_cum_auth_pins_change >= cutoff)
11730 continue;
11731 dir = info.dirs.front();
11732 if (info.num_remote_waiters > 0 ||
11733 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11734 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
9f95a23c 11735 std::vector<CDir*> dirs;
7c673cae
FG
11736 info.dirs.swap(dirs);
11737 fragments.erase(df);
11738 fragment_unmark_unfreeze_dirs(dirs);
11739 }
11740 }
11741}
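// As with Migrator::find_stale_export_freeze(), a fragment operation whose
// cumulative auth-pin count has not changed for longer than
// mds_freeze_tree_timeout is considered stuck; if remote waiters have piled
// up (or the parent directory is itself freezing), the pending operation is
// cancelled and its dirfrags are unmarked and unfrozen.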
11742
11743class C_MDC_FragmentPrep : public MDCacheLogContext {
11744 MDRequestRef mdr;
11745public:
11746 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11747 void finish(int r) override {
11748 mdcache->_fragment_logged(mdr);
11749 }
11750};
11751
11752class C_MDC_FragmentStore : public MDCacheContext {
11753 MDRequestRef mdr;
11754public:
11755 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11756 void finish(int r) override {
11757 mdcache->_fragment_stored(mdr);
11758 }
11759};
11760
11761class C_MDC_FragmentCommit : public MDCacheLogContext {
11762 dirfrag_t basedirfrag;
a8e16298 11763 MDRequestRef mdr;
7c673cae 11764public:
a8e16298
TL
11765 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11766 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
7c673cae 11767 void finish(int r) override {
a8e16298 11768 mdcache->_fragment_committed(basedirfrag, mdr);
7c673cae
FG
11769 }
11770};
11771
a8e16298 11772class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
7c673cae 11773 dirfrag_t basedirfrag;
a8e16298
TL
11774 int bits;
11775 MDRequestRef mdr;
7c673cae 11776public:
a8e16298
TL
11777 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11778 const MDRequestRef& r) :
11779 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
7c673cae 11780 void finish(int r) override {
11fdf7f2 11781 ceph_assert(r == 0 || r == -ENOENT);
a8e16298 11782 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
7c673cae 11783 }
91327a77 11784 void print(ostream& out) const override {
a8e16298 11785 out << "fragment_purge_old(" << basedirfrag << ")";
91327a77 11786 }
7c673cae
FG
11787};
11788
11789void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11790{
11791 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11792 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11793 if (it == fragments.end() || it->second.mdr != mdr) {
11794 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11795 request_finish(mdr);
11796 return;
11797 }
11798
11fdf7f2 11799 ceph_assert(r == 0);
7c673cae
FG
11800 fragment_info_t& info = it->second;
11801 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11802 << " on " << info.dirs.front()->get_inode() << dendl;
11803
11804 info.all_frozen = true;
11805 dispatch_fragment_dir(mdr);
11806}
11807
11808void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11809{
11810 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11811 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11812 if (it == fragments.end() || it->second.mdr != mdr) {
11813 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11814 request_finish(mdr);
11815 return;
11816 }
11817
11818 fragment_info_t& info = it->second;
11819 CInode *diri = info.dirs.front()->get_inode();
11820
11821 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11822 << " on " << *diri << dendl;
9f95a23c
TL
11823
11824 if (mdr->more()->slave_error)
11825 mdr->aborted = true;
11826
7c673cae 11827 if (!mdr->aborted) {
11fdf7f2
TL
11828 MutationImpl::LockOpVec lov;
11829 lov.add_wrlock(&diri->dirfragtreelock);
7c673cae 11830 // prevent a racing gather on any other scatterlocks too
9f95a23c
TL
11831 lov.lock_scatter_gather(&diri->nestlock);
11832 lov.lock_scatter_gather(&diri->filelock);
11833 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
7c673cae
FG
11834 if (!mdr->aborted)
11835 return;
9f95a23c 11836 }
7c673cae
FG
11837 }
11838
11839 if (mdr->aborted) {
11840 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11841 << info.dirs.front()->dirfrag() << dendl;
11842 if (info.bits > 0)
11843 mds->balancer->queue_split(info.dirs.front(), false);
11844 else
11845 mds->balancer->queue_merge(info.dirs.front());
11846 fragment_unmark_unfreeze_dirs(info.dirs);
11847 fragments.erase(it);
11848 request_finish(mdr);
11849 return;
11850 }
11851
11852 mdr->ls = mds->mdlog->get_current_segment();
11853 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11854 mds->mdlog->start_entry(le);
11855
9f95a23c 11856 for (const auto& dir : info.dirs) {
7c673cae
FG
11857 dirfrag_rollback rollback;
11858 rollback.fnode = dir->fnode;
11859 le->add_orig_frag(dir->get_frag(), &rollback);
11860 }
11861
11862 // refragment
11fdf7f2 11863 MDSContext::vec waiters;
7c673cae 11864 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
9f95a23c 11865 &info.resultfrags, waiters, false);
11fdf7f2 11866 if (g_conf()->mds_debug_frag)
7c673cae
FG
11867 diri->verify_dirfrags();
11868 mds->queue_waiters(waiters);
11869
11fdf7f2
TL
11870 for (const auto& fg : le->orig_frags)
11871 ceph_assert(!diri->dirfragtree.is_leaf(fg));
7c673cae 11872
9f95a23c
TL
11873 le->metablob.add_dir_context(info.resultfrags.front());
11874 for (const auto& dir : info.resultfrags) {
7c673cae 11875 if (diri->is_auth()) {
9f95a23c 11876 le->metablob.add_fragmented_dir(dir, false, false);
7c673cae 11877 } else {
9f95a23c
TL
11878 dir->state_set(CDir::STATE_DIRTYDFT);
11879 le->metablob.add_fragmented_dir(dir, false, true);
7c673cae
FG
11880 }
11881 }
11882
11883 // dft lock
11884 if (diri->is_auth()) {
11885 // journal dirfragtree
94b18763
FG
11886 auto &pi = diri->project_inode();
11887 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11888 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11889 } else {
11890 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11891 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11892 mdr->add_updated_lock(&diri->dirfragtreelock);
11893 }
11894
11895 /*
11896 // filelock
11897 mds->locker->mark_updated_scatterlock(&diri->filelock);
11898 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11899 mut->add_updated_lock(&diri->filelock);
11900
11901 // dirlock
11902 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11903 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11904 mut->add_updated_lock(&diri->nestlock);
11905 */
11906
11907 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11908 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11909 mdr, __func__);
11910 mds->mdlog->flush();
11911}
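// dispatch_fragment_dir() journals an EFragment OP_PREPARE event carrying a
// dirfrag_rollback (the pre-fragment fnode) for every original frag, applies
// the refragmentation in memory via adjust_dir_fragments(), and registers it
// with add_uncommitted_fragment() so that an operation that never reaches
// OP_COMMIT can later be undone (see rollback_uncommitted_fragments() below).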
11912
11913void MDCache::_fragment_logged(MDRequestRef& mdr)
11914{
11915 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298 11916 auto& info = fragments.at(basedirfrag);
7c673cae
FG
11917 CInode *diri = info.resultfrags.front()->get_inode();
11918
11919 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11920 << " on " << *diri << dendl;
a8e16298 11921 mdr->mark_event("prepare logged");
7c673cae
FG
11922
11923 if (diri->is_auth())
11924 diri->pop_and_dirty_projected_inode(mdr->ls);
11925
11926 mdr->apply(); // mark scatterlock
11927
11928 // store resulting frags
11929 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11930
9f95a23c 11931 for (const auto& dir : info.resultfrags) {
7c673cae
FG
11932 dout(10) << " storing result frag " << *dir << dendl;
11933
adb31ebb 11934 dir->mark_dirty(dir->pre_dirty(), mdr->ls);
f91f0fd5
TL
11935 dir->mark_new(mdr->ls);
11936
7c673cae
FG
11937 // freeze and store them too
11938 dir->auth_pin(this);
11939 dir->state_set(CDir::STATE_FRAGMENTING);
11940 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11941 }
11942
11943 gather.activate();
11944}
11945
11946void MDCache::_fragment_stored(MDRequestRef& mdr)
11947{
11948 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298
TL
11949 fragment_info_t &info = fragments.at(basedirfrag);
11950 CDir *first = info.resultfrags.front();
11951 CInode *diri = first->get_inode();
7c673cae
FG
11952
11953 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11954 << " on " << *diri << dendl;
a8e16298 11955 mdr->mark_event("new frags stored");
7c673cae
FG
11956
11957 // tell peers
a8e16298
TL
11958 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11959 diri->authority().first : CDIR_AUTH_UNKNOWN;
181888fb
FG
11960 for (const auto &p : first->get_replicas()) {
11961 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11962 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11963 rejoin_gather.count(p.first)))
7c673cae
FG
11964 continue;
11965
9f95a23c 11966 auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
a8e16298
TL
11967 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11968 diri_auth != p.first) { // not auth mds of diri
11969 /*
11970	       * In the normal case, the mds does not trim a dir inode whose child dirfrags
11971	       * are likely being fragmented (see trim_inode()). But when fragmenting
11972	       * subtree roots, the following race can happen:
11973	       *
11974	       * - mds.a (auth mds of the dirfrag) sends a fragment_notify message to
11975	       *   mds.c and drops its wrlock on dirfragtreelock.
11976	       * - mds.b (auth mds of the dir inode) changes the dirfragtreelock state to
11977	       *   SYNC and sends a lock message to mds.c
11978	       * - mds.c receives the lock message and changes its dirfragtreelock state
11979	       *   to SYNC
11980	       * - mds.c trims the dirfrag and dir inode from its cache
11981 * - mds.c receives the fragment_notify message
11982 *
11983 * So we need to ensure replicas have received the notify, then unlock
11984 * the dirfragtreelock.
11985 */
11986 notify->mark_ack_wanted();
11987 info.notify_ack_waiting.insert(p.first);
11988 }
7c673cae
FG
11989
11990 // freshly replicate new dirs to peers
9f95a23c
TL
11991 for (const auto& dir : info.resultfrags) {
11992 encode_replica_dir(dir, p.first, notify->basebl);
11993 }
7c673cae 11994
181888fb 11995 mds->send_message_mds(notify, p.first);
7c673cae
FG
11996 }
11997
11998 // journal commit
11999 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
a8e16298 12000 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
7c673cae 12001
7c673cae
FG
12002
12003 // unfreeze resulting frags
9f95a23c 12004 for (const auto& dir : info.resultfrags) {
7c673cae
FG
12005 dout(10) << " result frag " << *dir << dendl;
12006
94b18763
FG
12007 for (auto &p : dir->items) {
12008 CDentry *dn = p.second;
11fdf7f2 12009 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
12010 dn->state_clear(CDentry::STATE_FRAGMENTING);
12011 dn->put(CDentry::PIN_FRAGMENTING);
12012 }
12013
12014 // unfreeze
12015 dir->unfreeze_dir();
12016 }
12017
a8e16298
TL
12018 if (info.notify_ack_waiting.empty()) {
12019 fragment_drop_locks(info);
12020 } else {
12021 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
12022 }
7c673cae
FG
12023}
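// _fragment_stored() replicates the new dirfrags to peers inside the
// MMDSFragmentNotify message. When the fragmented dirfrag is a subtree root
// and the peer is not the auth of the dir inode, the notify is marked
// ack-wanted and the remaining locks (in particular the dirfragtreelock) are
// only dropped once the MMDSFragmentNotifyAck arrives, which closes the
// trimming race described in the comment above.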
12024
a8e16298 12025void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
7c673cae
FG
12026{
12027 dout(10) << "fragment_committed " << basedirfrag << dendl;
a8e16298
TL
12028 if (mdr)
12029 mdr->mark_event("commit logged");
12030
12031 ufragment &uf = uncommitted_fragments.at(basedirfrag);
7c673cae
FG
12032
12033 // remove old frags
12034 C_GatherBuilder gather(
12035 g_ceph_context,
12036 new C_OnFinisher(
a8e16298 12037 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
7c673cae
FG
12038 mds->finisher));
12039
12040 SnapContext nullsnapc;
12041 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11fdf7f2
TL
12042 for (const auto& fg : uf.old_frags) {
12043 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
7c673cae 12044 ObjectOperation op;
11fdf7f2 12045 if (fg == frag_t()) {
7c673cae
FG
12046 // backtrace object
12047 dout(10) << " truncate orphan dirfrag " << oid << dendl;
12048 op.truncate(0);
12049 op.omap_clear();
12050 } else {
12051 dout(10) << " removing orphan dirfrag " << oid << dendl;
12052 op.remove();
12053 }
12054 mds->objecter->mutate(oid, oloc, op, nullsnapc,
12055 ceph::real_clock::now(),
12056 0, gather.new_sub());
12057 }
12058
11fdf7f2 12059 ceph_assert(gather.has_subs());
7c673cae
FG
12060 gather.activate();
12061}
12062
a8e16298 12063void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
7c673cae 12064{
a8e16298
TL
12065 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12066 if (mdr)
12067 mdr->mark_event("old frags purged");
12068
12069 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12070 mds->mdlog->start_submit_entry(le);
12071
12072 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12073
12074 if (mds->logger) {
12075 if (bits > 0) {
12076 mds->logger->inc(l_mds_dir_split);
12077 } else {
12078 mds->logger->inc(l_mds_dir_merge);
12079 }
12080 }
12081
12082 if (mdr) {
12083 auto it = fragments.find(basedirfrag);
12084 ceph_assert(it != fragments.end());
12085 it->second.finishing = true;
12086 if (it->second.notify_ack_waiting.empty())
12087 fragment_maybe_finish(it);
12088 else
12089	      mdr->mark_event("waiting for notify acks");
12090 }
12091}
12092
12093void MDCache::fragment_drop_locks(fragment_info_t& info)
12094{
12095 mds->locker->drop_locks(info.mdr.get());
12096 request_finish(info.mdr);
12097 //info.mdr.reset();
12098}
12099
12100void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12101{
12102 if (!it->second.finishing)
12103 return;
7c673cae
FG
12104
12105 // unmark & auth_unpin
a8e16298 12106 for (const auto &dir : it->second.resultfrags) {
7c673cae
FG
12107 dir->state_clear(CDir::STATE_FRAGMENTING);
12108 dir->auth_unpin(this);
12109
12110 // In case the resulting fragments are beyond the split size,
12111 // we might need to split them again right away (they could
12112 // have been taking inserts between unfreezing and getting
12113 // here)
12114 mds->balancer->maybe_fragment(dir, false);
12115 }
12116
a8e16298
TL
12117 fragments.erase(it);
12118}
12119
12120
9f95a23c 12121void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
a8e16298
TL
12122{
12123 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12124 mds_rank_t from = mds_rank_t(ack->get_source().num());
12125
12126 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
a8e16298 12127 return;
7c673cae
FG
12128 }
12129
a8e16298
TL
12130 auto it = fragments.find(ack->get_base_dirfrag());
12131 if (it == fragments.end() ||
12132 it->second.get_tid() != ack->get_tid()) {
12133 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
a8e16298
TL
12134 return;
12135 }
7c673cae 12136
a8e16298
TL
12137 if (it->second.notify_ack_waiting.erase(from) &&
12138 it->second.notify_ack_waiting.empty()) {
12139 fragment_drop_locks(it->second);
12140 fragment_maybe_finish(it);
12141 }
7c673cae
FG
12142}
12143
9f95a23c 12144void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
7c673cae
FG
12145{
12146 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
a8e16298 12147 mds_rank_t from = mds_rank_t(notify->get_source().num());
7c673cae
FG
12148
12149 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
12150 return;
12151 }
12152
12153 CInode *diri = get_inode(notify->get_ino());
12154 if (diri) {
12155 frag_t base = notify->get_basefrag();
12156 int bits = notify->get_bits();
12157
12158/*
12159 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12160 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12161 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12162 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
7c673cae
FG
12163 return;
12164 }
12165*/
12166
12167 // refragment
11fdf7f2 12168 MDSContext::vec waiters;
9f95a23c
TL
12169 std::vector<CDir*> resultfrags;
12170 adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
11fdf7f2 12171 if (g_conf()->mds_debug_frag)
7c673cae
FG
12172 diri->verify_dirfrags();
12173
9f95a23c
TL
12174 for (const auto& dir : resultfrags) {
12175 diri->take_dir_waiting(dir->get_frag(), waiters);
12176 }
7c673cae
FG
12177
12178 // add new replica dirs values
11fdf7f2 12179 auto p = notify->basebl.cbegin();
9f95a23c
TL
12180 while (!p.end()) {
12181 CDir *tmp_dir = nullptr;
12182 decode_replica_dir(tmp_dir, p, diri, from, waiters);
12183 }
7c673cae
FG
12184
12185 mds->queue_waiters(waiters);
12186 } else {
12187 ceph_abort();
12188 }
12189
a8e16298 12190 if (notify->is_ack_wanted()) {
9f95a23c 12191 auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
11fdf7f2 12192 notify->get_bits(), notify->get_tid());
a8e16298
TL
12193 mds->send_message_mds(ack, from);
12194 }
7c673cae
FG
12195}
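// On the replica side, handle_fragment_notify() applies the same
// refragmentation locally through adjust_dir_fragments(), decodes the freshly
// replicated dirfrags from notify->basebl, requeues any waiters, and replies
// with MMDSFragmentNotifyAck only when the sender asked for one.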
12196
11fdf7f2 12197void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
7c673cae
FG
12198 LogSegment *ls, bufferlist *rollback)
12199{
12200 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11fdf7f2 12201 ceph_assert(!uncommitted_fragments.count(basedirfrag));
7c673cae
FG
12202 ufragment& uf = uncommitted_fragments[basedirfrag];
12203 uf.old_frags = old_frags;
12204 uf.bits = bits;
12205 uf.ls = ls;
12206 ls->uncommitted_fragments.insert(basedirfrag);
12207 if (rollback)
12208 uf.rollback.swap(*rollback);
12209}
12210
12211void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12212{
12213 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12214 << " op " << EFragment::op_name(op) << dendl;
12215 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12216 if (it != uncommitted_fragments.end()) {
12217 ufragment& uf = it->second;
12218 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12219 uf.committed = true;
12220 } else {
12221 uf.ls->uncommitted_fragments.erase(basedirfrag);
12222 mds->queue_waiters(uf.waiters);
12223 uncommitted_fragments.erase(it);
12224 }
12225 }
12226}
12227
11fdf7f2 12228void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
7c673cae
FG
12229{
12230 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12231 << " old_frags (" << old_frags << ")" << dendl;
12232 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12233 if (it != uncommitted_fragments.end()) {
12234 ufragment& uf = it->second;
12235 if (!uf.old_frags.empty()) {
11fdf7f2 12236 uf.old_frags = std::move(old_frags);
7c673cae
FG
12237 uf.committed = true;
12238 } else {
12239 uf.ls->uncommitted_fragments.erase(basedirfrag);
12240 uncommitted_fragments.erase(it);
12241 }
12242 }
12243}
12244
f91f0fd5 12245void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
e306af50 12246{
f91f0fd5
TL
12247 MDSGatherBuilder gather(g_ceph_context, finisher);
12248 for (auto& p : uncommitted_fragments) {
12249 p.second.waiters.push_back(gather.new_sub());
12250 }
12251 gather.activate();
e306af50
TL
12252}
12253
7c673cae
FG
12254void MDCache::rollback_uncommitted_fragments()
12255{
12256 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
12257 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
12258 p != uncommitted_fragments.end();
12259 ++p) {
12260 ufragment &uf = p->second;
12261 CInode *diri = get_inode(p->first.ino);
11fdf7f2 12262 ceph_assert(diri);
7c673cae
FG
12263
12264 if (uf.committed) {
a8e16298 12265 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
12266 continue;
12267 }
12268
12269 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
12270
12271 LogSegment *ls = mds->mdlog->get_current_segment();
12272 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
12273 mds->mdlog->start_entry(le);
12274 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
12275
11fdf7f2 12276 frag_vec_t old_frags;
7c673cae
FG
12277 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
12278
9f95a23c 12279 std::vector<CDir*> resultfrags;
7c673cae
FG
12280 if (uf.old_frags.empty()) {
12281 // created by old format EFragment
11fdf7f2 12282 MDSContext::vec waiters;
9f95a23c 12283 adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
7c673cae 12284 } else {
11fdf7f2
TL
12285 auto bp = uf.rollback.cbegin();
12286 for (const auto& fg : uf.old_frags) {
12287 CDir *dir = force_dir_fragment(diri, fg);
7c673cae
FG
12288 resultfrags.push_back(dir);
12289
12290 dirfrag_rollback rollback;
11fdf7f2 12291 decode(rollback, bp);
7c673cae
FG
12292
12293 dir->set_version(rollback.fnode.version);
12294 dir->fnode = rollback.fnode;
12295
12296 dir->_mark_dirty(ls);
12297
12298 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
12299 dout(10) << " dirty nestinfo on " << *dir << dendl;
12300 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
12301 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
12302 }
12303 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
12304 dout(10) << " dirty fragstat on " << *dir << dendl;
12305 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
12306 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
12307 }
12308
12309 le->add_orig_frag(dir->get_frag());
12310 le->metablob.add_dir_context(dir);
12311 if (diri_auth) {
12312 le->metablob.add_fragmented_dir(dir, true, false);
12313 } else {
12314 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12315 dir->state_set(CDir::STATE_DIRTYDFT);
12316 le->metablob.add_fragmented_dir(dir, true, true);
12317 }
12318 }
12319 }
12320
12321 if (diri_auth) {
94b18763
FG
12322 auto &pi = diri->project_inode();
12323 pi.inode.version = diri->pre_dirty();
7c673cae
FG
12324 diri->pop_and_dirty_projected_inode(ls); // hacky
12325 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12326 } else {
12327 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12328 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12329 }
12330
11fdf7f2 12331 if (g_conf()->mds_debug_frag)
7c673cae
FG
12332 diri->verify_dirfrags();
12333
11fdf7f2
TL
12334 for (const auto& leaf : old_frags) {
12335 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12336 }
7c673cae 12337
7c673cae
FG
12338 mds->mdlog->submit_entry(le);
12339
12340 uf.old_frags.swap(old_frags);
a8e16298 12341 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
12342 }
12343}
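// When rolling back: fragments whose commit was already journalled are simply
// re-driven through _fragment_committed() to purge the now-orphaned old
// objects, while genuinely uncommitted ones are restored to the original
// frags recorded at prepare time (re-applying the saved fnode state) and an
// EFragment OP_ROLLBACK event is journalled.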
12344
12345void MDCache::force_readonly()
12346{
12347 if (is_readonly())
12348 return;
12349
12350 dout(1) << "force file system read-only" << dendl;
12351 mds->clog->warn() << "force file system read-only";
12352
12353 set_readonly();
12354
12355 mds->server->force_clients_readonly();
12356
12357 // revoke write caps
81eedcae 12358 int count = 0;
94b18763 12359 for (auto &p : inode_map) {
b32b8144 12360 CInode *in = p.second;
7c673cae
FG
12361 if (in->is_head())
12362 mds->locker->eval(in, CEPH_CAP_LOCKS);
81eedcae
TL
12363 if (!(++count % 1000))
12364 mds->heartbeat_reset();
7c673cae
FG
12365 }
12366
12367 mds->mdlog->flush();
12368}
12369
12370
12371// ==============================================================
12372// debug crap
12373
81eedcae 12374void MDCache::show_subtrees(int dbl, bool force_print)
7c673cae 12375{
11fdf7f2 12376 if (g_conf()->mds_thrash_exports)
7c673cae
FG
12377 dbl += 15;
12378
12379 //dout(10) << "show_subtrees" << dendl;
12380
11fdf7f2 12381 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
7c673cae
FG
12382 return; // i won't print anything.
12383
12384 if (subtrees.empty()) {
11fdf7f2
TL
12385 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12386 << dendl;
7c673cae
FG
12387 return;
12388 }
12389
81eedcae
TL
12390 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12391 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12392 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12393 "printing subtrees" << dendl;
12394 return;
12395 }
12396
7c673cae 12397 // root frags
9f95a23c 12398 std::vector<CDir*> basefrags;
7c673cae
FG
12399 for (set<CInode*>::iterator p = base_inodes.begin();
12400 p != base_inodes.end();
12401 ++p)
12402 (*p)->get_dirfrags(basefrags);
12403 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12404 dout(15) << "show_subtrees" << dendl;
12405
12406 // queue stuff
12407 list<pair<CDir*,int> > q;
12408 string indent;
12409 set<CDir*> seen;
12410
12411 // calc max depth
9f95a23c
TL
12412 for (const auto& dir : basefrags) {
12413 q.emplace_back(dir, 0);
12414 }
7c673cae
FG
12415
12416 set<CDir*> subtrees_seen;
12417
81eedcae 12418 unsigned int depth = 0;
7c673cae
FG
12419 while (!q.empty()) {
12420 CDir *dir = q.front().first;
81eedcae 12421 unsigned int d = q.front().second;
7c673cae
FG
12422 q.pop_front();
12423
12424 if (subtrees.count(dir) == 0) continue;
12425
12426 subtrees_seen.insert(dir);
12427
12428 if (d > depth) depth = d;
12429
12430 // sanity check
12431 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12432 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11fdf7f2 12433 ceph_assert(seen.count(dir) == 0);
7c673cae
FG
12434 seen.insert(dir);
12435
12436 // nested items?
12437 if (!subtrees[dir].empty()) {
12438 for (set<CDir*>::iterator p = subtrees[dir].begin();
12439 p != subtrees[dir].end();
12440 ++p) {
12441 //dout(25) << " saw sub " << **p << dendl;
12442 q.push_front(pair<CDir*,int>(*p, d+1));
12443 }
12444 }
12445 }
12446
81eedcae
TL
12447 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12448 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12449 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12450 "subtrees" << dendl;
12451 return;
12452 }
7c673cae
FG
12453
12454 // print tree
9f95a23c
TL
12455 for (const auto& dir : basefrags) {
12456 q.emplace_back(dir, 0);
12457 }
7c673cae
FG
12458
12459 while (!q.empty()) {
12460 CDir *dir = q.front().first;
12461 int d = q.front().second;
12462 q.pop_front();
12463
12464 if (subtrees.count(dir) == 0) continue;
12465
12466 // adjust indenter
12467 while ((unsigned)d < indent.size())
12468 indent.resize(d);
12469
12470 // pad
12471 string pad = "______________________________________";
12472 pad.resize(depth*2+1-indent.size());
12473 if (!subtrees[dir].empty())
12474 pad[0] = '.'; // parent
12475
12476
12477 string auth;
12478 if (dir->is_auth())
12479 auth = "auth ";
12480 else
12481 auth = " rep ";
12482
12483 char s[10];
12484 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12485 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12486 else
12487 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12488
12489 // print
11fdf7f2
TL
12490 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12491 << " " << auth << *dir << dendl;
7c673cae 12492
ec96510d 12493 if (dir->ino() == CEPH_INO_ROOT)
11fdf7f2 12494 ceph_assert(dir->inode == root);
7c673cae 12495 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11fdf7f2 12496 ceph_assert(dir->inode == myin);
7c673cae 12497 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11fdf7f2 12498 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
7c673cae
FG
12499
12500 // nested items?
12501 if (!subtrees[dir].empty()) {
12502 // more at my level?
12503 if (!q.empty() && q.front().second == d)
12504 indent += "| ";
12505 else
12506 indent += " ";
12507
12508 for (set<CDir*>::iterator p = subtrees[dir].begin();
12509 p != subtrees[dir].end();
12510 ++p)
12511 q.push_front(pair<CDir*,int>(*p, d+2));
12512 }
12513 }
12514
12515 // verify there isn't stray crap in subtree map
12516 int lost = 0;
12517 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12518 p != subtrees.end();
12519 ++p) {
12520 if (subtrees_seen.count(p->first)) continue;
12521 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12522 lost++;
12523 }
11fdf7f2 12524 ceph_assert(lost == 0);
7c673cae
FG
12525}
12526
7c673cae
FG
12527void MDCache::show_cache()
12528{
12529 dout(7) << "show_cache" << dendl;
b32b8144
FG
12530
12531 auto show_func = [this](CInode *in) {
7c673cae 12532 // unlinked?
b32b8144
FG
12533 if (!in->parent)
12534 dout(7) << " unlinked " << *in << dendl;
12535
7c673cae 12536 // dirfrags?
9f95a23c
TL
12537 auto&& dfs = in->get_dirfrags();
12538 for (const auto& dir : dfs) {
7c673cae 12539 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 12540
94b18763
FG
12541 for (auto &p : dir->items) {
12542 CDentry *dn = p.second;
7c673cae
FG
12543 dout(7) << " dentry " << *dn << dendl;
12544 CDentry::linkage_t *dnl = dn->get_linkage();
12545 if (dnl->is_primary() && dnl->get_inode())
12546 dout(7) << " inode " << *dnl->get_inode() << dendl;
12547 }
12548 }
b32b8144
FG
12549 };
12550
94b18763 12551 for (auto &p : inode_map)
b32b8144 12552 show_func(p.second);
94b18763 12553 for (auto &p : snap_inode_map)
b32b8144 12554 show_func(p.second);
7c673cae
FG
12555}
12556
f64942e4 12557void MDCache::cache_status(Formatter *f)
181888fb
FG
12558{
12559 f->open_object_section("cache");
12560
12561 f->open_object_section("pool");
12562 mempool::get_pool(mempool::mds_co::id).dump(f);
12563 f->close_section();
12564
12565 f->close_section();
181888fb
FG
12566}
12567
11fdf7f2 12568void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
7c673cae 12569{
11fdf7f2
TL
12570 ceph_assert(in);
12571 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12572 return;
12573 }
9f95a23c 12574 auto&& ls = in->get_dirfrags();
11fdf7f2
TL
12575 for (const auto &subdir : ls) {
12576 for (const auto &p : subdir->items) {
12577 CDentry *dn = p.second;
12578 CInode *in = dn->get_linkage()->get_inode();
12579 if (in) {
12580 dump_tree(in, cur_depth + 1, max_depth, f);
12581 }
12582 }
12583 }
12584 f->open_object_section("inode");
12585 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12586 f->close_section();
7c673cae
FG
12587}
12588
11fdf7f2 12589int MDCache::dump_cache(std::string_view file_name)
7c673cae 12590{
11fdf7f2 12591 return dump_cache(file_name, NULL);
7c673cae
FG
12592}
12593
11fdf7f2 12594int MDCache::dump_cache(Formatter *f)
7c673cae 12595{
11fdf7f2 12596 return dump_cache(std::string_view(""), f);
7c673cae
FG
12597}
12598
12599/**
12600 * Dump the metadata cache, either to a Formatter, if
12601 * provided, else to a plain text file.
12602 */
11fdf7f2 12603int MDCache::dump_cache(std::string_view fn, Formatter *f)
7c673cae
FG
12604{
12605 int r = 0;
f64942e4
AA
12606
12607	  // Dumping a large cache may cause the mds to hang or, worse, get killed.
12608	  // So disallow the dump if the cache size exceeds the configured
12609	  // threshold, which is 1G for the formatter and unlimited for a file (note
12610	  // that this can be jacked up by the admin... and is nothing but foot
12611	  // shooting, but the option itself is for devs and hence dangerous to
12612	  // tune). TODO: remove this when fixed.
12613 uint64_t threshold = f ?
11fdf7f2
TL
12614 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12615 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
f64942e4
AA
12616
12617 if (threshold && cache_size() > threshold) {
12618 if (f) {
12619 std::stringstream ss;
12620 ss << "cache usage exceeds dump threshold";
12621 f->open_object_section("result");
12622 f->dump_string("error", ss.str());
12623 f->close_section();
12624 } else {
12625 derr << "cache usage exceeds dump threshold" << dendl;
12626 r = -EINVAL;
12627 }
12628 return r;
12629 }
12630
12631 r = 0;
7c673cae
FG
12632 int fd = -1;
12633
12634 if (f) {
12635 f->open_array_section("inodes");
12636 } else {
94b18763
FG
12637 char path[PATH_MAX] = "";
12638 if (fn.length()) {
12639 snprintf(path, sizeof path, "%s", fn.data());
12640 } else {
12641 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
12642 }
12643
94b18763 12644 dout(1) << "dump_cache to " << path << dendl;
7c673cae 12645
91327a77 12646 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
7c673cae 12647 if (fd < 0) {
94b18763 12648 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 12649 return errno;
7c673cae
FG
12650 }
12651 }
12652
11fdf7f2 12653 auto dump_func = [fd, f](CInode *in) {
b32b8144 12654 int r;
7c673cae
FG
12655 if (f) {
12656 f->open_object_section("inode");
11fdf7f2
TL
12657 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12658 f->close_section();
12659 return 1;
12660 }
12661 ostringstream ss;
12662 ss << *in << std::endl;
12663 std::string s = ss.str();
12664 r = safe_write(fd, s.c_str(), s.length());
12665 if (r < 0)
12666 return r;
9f95a23c 12667 auto&& dfs = in->get_dirfrags();
11fdf7f2
TL
12668 for (auto &dir : dfs) {
12669 ostringstream tt;
12670 tt << " " << *dir << std::endl;
12671 std::string t = tt.str();
12672 r = safe_write(fd, t.c_str(), t.length());
12673 if (r < 0)
12674 return r;
94b18763
FG
12675 for (auto &p : dir->items) {
12676 CDentry *dn = p.second;
11fdf7f2
TL
12677 ostringstream uu;
12678 uu << " " << *dn << std::endl;
12679 std::string u = uu.str();
12680 r = safe_write(fd, u.c_str(), u.length());
12681 if (r < 0)
12682 return r;
7c673cae
FG
12683 }
12684 dir->check_rstats();
7c673cae 12685 }
b32b8144
FG
12686 return 1;
12687 };
12688
94b18763 12689 for (auto &p : inode_map) {
b32b8144
FG
12690 r = dump_func(p.second);
12691 if (r < 0)
12692 goto out;
12693 }
94b18763 12694 for (auto &p : snap_inode_map) {
b32b8144
FG
12695 r = dump_func(p.second);
12696 if (r < 0)
12697 goto out;
7c673cae 12698 }
b32b8144 12699 r = 0;
7c673cae
FG
12700
12701 out:
12702 if (f) {
12703 f->close_section(); // inodes
12704 } else {
12705 ::close(fd);
12706 }
31f18b77 12707 return r;
7c673cae
FG
12708}
12709
12710
12711
12712C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12713 : MDSInternalContext(c->mds), cache(c), mdr(r)
12714{}
12715
12716void C_MDS_RetryRequest::finish(int r)
12717{
12718 mdr->retry++;
12719 cache->dispatch_request(mdr);
12720}
12721
12722
12723class C_MDS_EnqueueScrub : public Context
12724{
11fdf7f2 12725 std::string tag;
7c673cae
FG
12726 Formatter *formatter;
12727 Context *on_finish;
12728public:
12729 ScrubHeaderRef header;
11fdf7f2
TL
12730 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12731 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
7c673cae
FG
12732
12733 Context *take_finisher() {
12734 Context *fin = on_finish;
12735 on_finish = NULL;
12736 return fin;
12737 }
12738
12739 void finish(int r) override {
11fdf7f2
TL
12740 if (r == 0) {
12741 // since recursive scrub is asynchronous, dump minimal output
12742 // to not upset cli tools.
12743 if (header && header->get_recursive()) {
12744 formatter->open_object_section("results");
12745 formatter->dump_int("return_code", 0);
12746 formatter->dump_string("scrub_tag", tag);
12747 formatter->dump_string("mode", "asynchronous");
12748 formatter->close_section(); // results
12749 }
12750 } else { // we failed the lookup or something; dump ourselves
7c673cae
FG
12751 formatter->open_object_section("results");
12752 formatter->dump_int("return_code", r);
12753 formatter->close_section(); // results
11fdf7f2 12754 r = 0; // already dumped in formatter
7c673cae
FG
12755 }
12756 if (on_finish)
12757 on_finish->complete(r);
12758 }
12759};
12760
12761void MDCache::enqueue_scrub(
11fdf7f2
TL
12762 std::string_view path,
12763 std::string_view tag,
7c673cae
FG
12764 bool force, bool recursive, bool repair,
12765 Formatter *f, Context *fin)
12766{
11fdf7f2 12767 dout(10) << __func__ << " " << path << dendl;
7c673cae 12768 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
11fdf7f2
TL
12769 if (path == "~mdsdir") {
12770 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12771 mdr->set_filepath(fp);
12772 } else {
12773 filepath fp(path);
12774 mdr->set_filepath(path);
12775 }
12776
12777 bool is_internal = false;
12778 std::string tag_str(tag);
12779 if (tag_str.empty()) {
12780 uuid_d uuid_gen;
12781 uuid_gen.generate_random();
12782 tag_str = uuid_gen.to_string();
12783 is_internal = true;
12784 }
7c673cae 12785
11fdf7f2 12786 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
7c673cae 12787 cs->header = std::make_shared<ScrubHeader>(
11fdf7f2 12788 tag_str, is_internal, force, recursive, repair, f);
7c673cae
FG
12789
12790 mdr->internal_op_finish = cs;
12791 enqueue_scrub_work(mdr);
12792}
12793
12794void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12795{
9f95a23c 12796 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
7c673cae
FG
12797 if (NULL == in)
12798 return;
12799
12800 // TODO: Remove this restriction
11fdf7f2 12801 ceph_assert(in->is_auth());
7c673cae 12802
7c673cae 12803 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
11fdf7f2 12804 ScrubHeaderRef header = cs->header;
7c673cae
FG
12805
12806	  // Cannot scrub the same dentry twice at the same time
11fdf7f2 12807 if (in->scrub_is_in_progress()) {
7c673cae
FG
12808 mds->server->respond_to_request(mdr, -EBUSY);
12809 return;
12810 } else {
12811 in->scrub_info();
12812 }
12813
12814 header->set_origin(in);
12815
11fdf7f2
TL
12816 Context *fin;
12817 if (header->get_recursive()) {
12818 header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
12819 fin = new MDSInternalContextWrapper(mds,
9f95a23c 12820 new LambdaContext([this, header](int r) {
11fdf7f2
TL
12821 recursive_scrub_finish(header);
12822 header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
12823 })
12824 );
12825 } else {
b32b8144
FG
12826 fin = cs->take_finisher();
12827 }
12828
12829	  // If the scrub did some repair, then flush the journal at the end of
12830	  // the scrub. Otherwise, in the case of e.g. rewriting a backtrace,
12831	  // the on-disk state will still look damaged.
9f95a23c 12832 auto scrub_finish = new LambdaContext([this, header, fin](int r){
28e407b8
AA
12833 if (!header->get_repaired()) {
12834 if (fin)
12835 fin->complete(r);
12836 return;
12837 }
12838
9f95a23c 12839 auto flush_finish = new LambdaContext([this, fin](int r){
28e407b8
AA
12840 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12841 mds->mdlog->trim_all();
12842
12843 if (fin) {
12844 MDSGatherBuilder gather(g_ceph_context);
12845 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12846 for (auto logseg : expiring_segments)
12847 logseg->wait_for_expiry(gather.new_sub());
11fdf7f2 12848 ceph_assert(gather.has_subs());
28e407b8
AA
12849 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12850 gather.activate();
b32b8144 12851 }
28e407b8
AA
12852 });
12853
12854 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12855 mds->mdlog->start_new_segment();
12856 mds->mdlog->flush();
12857 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12858 });
12859
7c673cae 12860 if (!header->get_recursive()) {
7c673cae 12861 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12862 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12863 } else {
12864 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12865 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12866 }
7c673cae
FG
12867
12868 mds->server->respond_to_request(mdr, 0);
12869 return;
12870}
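// enqueue_scrub_work() resolves the path to an auth inode, attaches the
// ScrubHeader, and pushes the inode onto the ScrubStack (top for a
// single-inode scrub, bottom for a recursive one). If the scrub performed
// repairs, the scrub_finish wrapper flushes and trims the journal so the
// repaired state reaches disk before the caller's completion runs.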
12871
11fdf7f2
TL
12872void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12873{
12874 if (header->get_origin()->is_base() &&
12875 header->get_force() && header->get_repair()) {
12876 // notify snapserver that base directory is recursively scrubbed.
12877 // After both root and mdsdir are recursively scrubbed, snapserver
12878 // knows that all old format snaprealms are converted to the new
12879 // format.
12880 if (mds->mdsmap->get_num_in_mds() == 1 &&
12881 mds->mdsmap->get_num_failed_mds() == 0 &&
12882 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12883 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12884 }
12885 }
12886}
12887
12888struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
7c673cae 12889 MDRequestRef mdr;
11fdf7f2 12890 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
7c673cae
FG
12891 MDCacheLogContext(c), mdr(m) {}
12892 void finish(int r) override {
12893 mdr->apply();
12894 get_mds()->server->respond_to_request(mdr, r);
12895 }
12896};
12897
12898void MDCache::repair_dirfrag_stats(CDir *dir)
12899{
12900 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12901 mdr->pin(dir);
12902 mdr->internal_op_private = dir;
12903 mdr->internal_op_finish = new C_MDSInternalNoop;
12904 repair_dirfrag_stats_work(mdr);
12905}
12906
12907void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12908{
12909 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12910 dout(10) << __func__ << " " << *dir << dendl;
12911
12912 if (!dir->is_auth()) {
12913 mds->server->respond_to_request(mdr, -ESTALE);
12914 return;
12915 }
12916
12917 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12918 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12919
7c673cae
FG
12920 mds->locker->drop_locks(mdr.get());
12921 mdr->drop_local_auth_pins();
9f95a23c 12922 if (mdr->is_any_remote_auth_pin())
224ce89b 12923 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12924 return;
12925 }
12926
12927 mdr->auth_pin(dir);
12928
11fdf7f2 12929 MutationImpl::LockOpVec lov;
7c673cae 12930 CInode *diri = dir->inode;
11fdf7f2
TL
12931 lov.add_rdlock(&diri->dirfragtreelock);
12932 lov.add_wrlock(&diri->nestlock);
12933 lov.add_wrlock(&diri->filelock);
12934 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12935 return;
12936
12937 if (!dir->is_complete()) {
12938 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12939 return;
12940 }
12941
12942 frag_info_t frag_info;
12943 nest_info_t nest_info;
94b18763 12944 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12945 CDentry *dn = it->second;
12946 if (dn->last != CEPH_NOSNAP)
12947 continue;
12948 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12949 if (dnl->is_primary()) {
12950 CInode *in = dnl->get_inode();
12951 nest_info.add(in->get_projected_inode()->accounted_rstat);
12952 if (in->is_dir())
12953 frag_info.nsubdirs++;
12954 else
12955 frag_info.nfiles++;
12956 } else if (dnl->is_remote())
12957 frag_info.nfiles++;
12958 }
12959
12960 fnode_t *pf = dir->get_projected_fnode();
12961 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12962 bool good_rstat = nest_info.same_sums(pf->rstat);
12963 if (good_fragstat && good_rstat) {
12964 dout(10) << __func__ << " no corruption found" << dendl;
12965 mds->server->respond_to_request(mdr, 0);
12966 return;
12967 }
12968
12969 pf = dir->project_fnode();
12970 pf->version = dir->pre_dirty();
12971 mdr->add_projected_fnode(dir);
12972
12973 mdr->ls = mds->mdlog->get_current_segment();
12974 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12975 mds->mdlog->start_entry(le);
12976
12977 if (!good_fragstat) {
12978 if (pf->fragstat.mtime > frag_info.mtime)
12979 frag_info.mtime = pf->fragstat.mtime;
12980 if (pf->fragstat.change_attr > frag_info.change_attr)
12981 frag_info.change_attr = pf->fragstat.change_attr;
12982 pf->fragstat = frag_info;
12983 mds->locker->mark_updated_scatterlock(&diri->filelock);
12984 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12985 mdr->add_updated_lock(&diri->filelock);
12986 }
12987
12988 if (!good_rstat) {
12989 if (pf->rstat.rctime > nest_info.rctime)
12990 nest_info.rctime = pf->rstat.rctime;
12991 pf->rstat = nest_info;
12992 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12993 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12994 mdr->add_updated_lock(&diri->nestlock);
12995 }
12996
12997 le->metablob.add_dir_context(dir);
12998 le->metablob.add_dir(dir, true);
12999
11fdf7f2 13000 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
7c673cae
FG
13001}
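// repair_dirfrag_stats_work() recomputes fragstat/rstat for a single dirfrag
// by walking its dentries, compares the result against the projected fnode,
// and, if they differ, journals an EUpdate ("repair_dirfrag") that overwrites
// the stored stats and marks the corresponding scatterlocks dirty.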
13002
13003void MDCache::repair_inode_stats(CInode *diri)
13004{
13005 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
13006 mdr->pin(diri);
13007 mdr->internal_op_private = diri;
13008 mdr->internal_op_finish = new C_MDSInternalNoop;
13009 repair_inode_stats_work(mdr);
13010}
13011
13012void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
13013{
13014 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
13015 dout(10) << __func__ << " " << *diri << dendl;
13016
13017 if (!diri->is_auth()) {
13018 mds->server->respond_to_request(mdr, -ESTALE);
13019 return;
13020 }
13021 if (!diri->is_dir()) {
13022 mds->server->respond_to_request(mdr, -ENOTDIR);
13023 return;
13024 }
13025
11fdf7f2 13026 MutationImpl::LockOpVec lov;
7c673cae
FG
13027
13028 if (mdr->ls) // already marked filelock/nestlock dirty ?
13029 goto do_rdlocks;
13030
11fdf7f2
TL
13031 lov.add_rdlock(&diri->dirfragtreelock);
13032 lov.add_wrlock(&diri->nestlock);
13033 lov.add_wrlock(&diri->filelock);
13034 if (!mds->locker->acquire_locks(mdr, lov))
13035 return;
13036
13037 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
13038 // the scatter-gather process, which will fix any fragstat/rstat errors.
11fdf7f2
TL
13039 {
13040 frag_vec_t leaves;
13041 diri->dirfragtree.get_leaves(leaves);
13042 for (const auto& leaf : leaves) {
13043 CDir *dir = diri->get_dirfrag(leaf);
13044 if (!dir) {
13045 ceph_assert(mdr->is_auth_pinned(diri));
13046 dir = diri->get_or_open_dirfrag(this, leaf);
13047 }
13048 if (dir->get_version() == 0) {
13049 ceph_assert(dir->is_auth());
13050 dir->fetch(new C_MDS_RetryRequest(this, mdr));
13051 return;
13052 }
7c673cae
FG
13053 }
13054 }
13055
13056 diri->state_set(CInode::STATE_REPAIRSTATS);
13057 mdr->ls = mds->mdlog->get_current_segment();
13058 mds->locker->mark_updated_scatterlock(&diri->filelock);
13059 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
13060 mds->locker->mark_updated_scatterlock(&diri->nestlock);
13061 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
13062
13063 mds->locker->drop_locks(mdr.get());
13064
13065do_rdlocks:
13066 // force the scatter-gather process
11fdf7f2
TL
13067 lov.clear();
13068 lov.add_rdlock(&diri->dirfragtreelock);
13069 lov.add_rdlock(&diri->nestlock);
13070 lov.add_rdlock(&diri->filelock);
13071 if (!mds->locker->acquire_locks(mdr, lov))
13072 return;
13073
13074 diri->state_clear(CInode::STATE_REPAIRSTATS);
13075
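  // Verify the repair: with the rdlocks held the scatter-gather has completed,
  // so re-sum each leaf dirfrag's accounted stats and compare against the
  // inode; a remaining mismatch is only logged here.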
13076 frag_info_t dir_info;
13077 nest_info_t nest_info;
11fdf7f2
TL
13078 nest_info.rsubdirs = 1; // it gets one to account for self
13079 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
13080 nest_info.rsnaps = srnode->snaps.size();
7c673cae 13081
11fdf7f2
TL
13082 {
13083 frag_vec_t leaves;
13084 diri->dirfragtree.get_leaves(leaves);
13085 for (const auto& leaf : leaves) {
13086 CDir *dir = diri->get_dirfrag(leaf);
13087 ceph_assert(dir);
13088 ceph_assert(dir->get_version() > 0);
13089 dir_info.add(dir->fnode.accounted_fragstat);
13090 nest_info.add(dir->fnode.accounted_rstat);
13091 }
7c673cae
FG
13092 }
13093
13094 if (!dir_info.same_sums(diri->inode.dirstat) ||
13095 !nest_info.same_sums(diri->inode.rstat)) {
13096 dout(10) << __func__ << " failed to fix fragstat/rstat on "
13097 << *diri << dendl;
13098 }
13099
13100 mds->server->respond_to_request(mdr, 0);
13101}
13102
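// upgrade_inode_snaprealm: internal op that rewrites an inode's snaprealm in
// the current encoding. It xlocks the snaplock, projects the inode together
// with its snaprealm (project_snaprealm() performs the format upgrade) and
// journals the result like a normal update.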
11fdf7f2
TL
13103void MDCache::upgrade_inode_snaprealm(CInode *in)
13104{
13105 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
13106 mdr->pin(in);
13107 mdr->internal_op_private = in;
13108 mdr->internal_op_finish = new C_MDSInternalNoop;
13109 upgrade_inode_snaprealm_work(mdr);
13110}
13111
13112void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
13113{
13114 CInode *in = static_cast<CInode*>(mdr->internal_op_private);
13115 dout(10) << __func__ << " " << *in << dendl;
13116
13117 if (!in->is_auth()) {
13118 mds->server->respond_to_request(mdr, -ESTALE);
13119 return;
13120 }
13121
13122 MutationImpl::LockOpVec lov;
11fdf7f2 13123 lov.add_xlock(&in->snaplock);
11fdf7f2
TL
13124 if (!mds->locker->acquire_locks(mdr, lov))
13125 return;
13126
13127 // project_snaprealm() upgrades snaprealm format
13128 auto &pi = in->project_inode(false, true);
13129 mdr->add_projected_inode(in);
13130 pi.inode.version = in->pre_dirty();
13131
13132 mdr->ls = mds->mdlog->get_current_segment();
13133 EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
13134 mds->mdlog->start_entry(le);
13135
13136 if (in->is_base()) {
13137 le->metablob.add_root(true, in);
13138 } else {
13139 CDentry *pdn = in->get_projected_parent_dn();
13140 le->metablob.add_dir_context(pdn->get_dir());
13141 le->metablob.add_primary_dentry(pdn, in, true);
13142 }
13143
13144 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
13145}
13146
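// flush_dentry: start an internal CEPH_MDS_OP_FLUSH request for the given
// path and flush that inode's dirty metadata; fails with -EROFS while the
// filesystem is read-only.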
13147void MDCache::flush_dentry(std::string_view path, Context *fin)
7c673cae
FG
13148{
13149 if (is_readonly()) {
13150 dout(10) << __func__ << ": read-only FS" << dendl;
13151 fin->complete(-EROFS);
13152 return;
13153 }
13154 dout(10) << "flush_dentry " << path << dendl;
13155 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 13156 filepath fp(path);
7c673cae
FG
13157 mdr->set_filepath(fp);
13158 mdr->internal_op_finish = fin;
13159 flush_dentry_work(mdr);
13160}
13161
11fdf7f2 13162class C_FinishIOMDR : public MDSContext {
7c673cae
FG
13163protected:
13164 MDSRank *mds;
13165 MDRequestRef mdr;
13166 MDSRank *get_mds() override { return mds; }
13167public:
13168 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13169 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13170};
13171
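// flush_dentry_work: resolve and rdlock the path, then flush the inode;
// C_FinishIOMDR replies to the internal request once the flush completes.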
13172void MDCache::flush_dentry_work(MDRequestRef& mdr)
13173{
11fdf7f2 13174 MutationImpl::LockOpVec lov;
9f95a23c
TL
13175 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13176 if (!in)
13177 return;
13178
11fdf7f2 13179 ceph_assert(in->is_auth());
7c673cae
FG
13180 in->flush(new C_FinishIOMDR(mds, mdr));
13181}
13182
13183
13184/**
13185 * Initialize performance counters with global perfcounter
13186 * collection.
13187 */
13188void MDCache::register_perfcounters()
13189{
91327a77
AA
13190 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
13191
13192 // Stray/purge statistics
13193 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
13194 PerfCountersBuilder::PRIO_INTERESTING);
13195 pcb.add_u64(l_mdc_num_recovering_enqueued,
13196 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13197 PerfCountersBuilder::PRIO_INTERESTING);
13198 pcb.add_u64_counter(l_mdc_recovery_completed,
13199 "recovery_completed", "File recoveries completed", "recd",
13200 PerfCountersBuilder::PRIO_INTERESTING);
13201
13202 // useful recovery queue statistics
13203 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
13204 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
13205 "Files currently being recovered");
13206 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
13207 "Files waiting for recovery with elevated priority");
13208 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
13209 "File recoveries started");
13210
13211 // along with other stray dentry stats
13212 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
13213 "Stray dentries delayed");
13214 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
13215 "Stray dentries enqueuing for purge");
13216 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
13217 "Stray dentries created");
7c673cae 13218 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
13219 "Stray dentries enqueued for purge");
13220 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
13221 "Stray dentries reintegrated");
13222 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
13223 "Stray dentries migrated");
7c673cae 13224
91327a77 13225 // low prio internal request stats
d2e6a577 13226 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
91327a77 13227 "Internal Request type enqueue scrub");
d2e6a577 13228 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
91327a77 13229 "Internal Request type export dir");
d2e6a577 13230 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
91327a77 13231 "Internal Request type flush");
d2e6a577 13232 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
91327a77 13233 "Internal Request type fragmentdir");
d2e6a577 13234 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
91327a77 13235 "Internal Request type frag stats");
d2e6a577 13236 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
91327a77 13237 "Internal Request type inode stats");
d2e6a577 13238
7c673cae
FG
13239 logger.reset(pcb.create_perf_counters());
13240 g_ceph_context->get_perfcounters_collection()->add(logger.get());
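  // Hand the same PerfCounters instance to the recovery queue and the stray
  // manager so their code paths can update the counters registered above.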
13241 recovery_queue.set_logger(logger.get());
13242 stray_manager.set_logger(logger.get());
13243}
13244
7c673cae
FG
13245/**
13246 * Call this when putting references to an inode/dentry or
13247 * when attempting to trim it.
13248 *
13249 * If this inode is no longer linked by anyone, and this MDS
13250 * rank holds the primary dentry, and that dentry is in a stray
13251 * directory, then give up the dentry to the StrayManager, never
13252 * to be seen again by MDCache.
13253 *
13254 * @param delay if true, then purgeable inodes are stashed until
13255 * the next trim(), rather than being purged right
13256 * away.
13257 */
13258void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
13259 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
13260 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 13261 return;
224ce89b 13262
7c673cae
FG
13263 CDentry *dn = in->get_projected_parent_dn();
13264
13265 if (dn->state_test(CDentry::STATE_PURGING)) {
13266 /* We have already entered the purging process; no need
13267 * to re-evaluate it. */
13268 return;
13269 }
13270
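  // Only dentries that already live in a stray directory are handed to the
  // StrayManager; with delay=true they are batched until the next trim().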
11fdf7f2
TL
13271 if (dn->get_dir()->get_inode()->is_stray()) {
13272 if (delay)
13273 stray_manager.queue_delayed(dn);
13274 else
13275 stray_manager.eval_stray(dn);
7c673cae
FG
13276 }
13277}
13278
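// clear_dirty_bits_for_stray: for a stray directory inode, drop removable
// dentries from its auth, non-frozen dirfrags and clear dirty rstat/scatter
// state (unless the inode still has a snaprealm).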
31f18b77
FG
13279void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13280 dout(10) << __func__ << " " << *diri << dendl;
11fdf7f2 13281 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
9f95a23c 13282 auto&& ls = diri->get_dirfrags();
94b18763 13283 for (auto &p : ls) {
31f18b77
FG
13284 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13285 p->try_remove_dentries_for_stray();
13286 }
13287 if (!diri->snaprealm) {
13288 if (diri->is_auth())
13289 diri->clear_dirty_rstat();
13290 diri->clear_scatter_dirty();
13291 }
13292}
13293
11fdf7f2
TL
13294bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13295 CInode *in = get_inode(number);
13296 if (!in) {
13297 return false;
13298 }
13299 f->open_object_section("inode");
13300 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13301 f->close_section();
13302 return true;
13303}
eafe8130 13304
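// handle_mdsmap: re-check export pins that were deferred because their target
// rank was >= max_mds, and re-evaluate ephemeral pins when max_mds changes.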
f6b5b4d7 13305void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
eafe8130
TL
13306 // process export_pin_delayed_queue whenever a new MDSMap is received
13307 auto &q = export_pin_delayed_queue;
13308 for (auto it = q.begin(); it != q.end(); ) {
13309 auto *in = *it;
13310 mds_rank_t export_pin = in->get_export_pin(false);
f6b5b4d7
TL
13311 if (in->is_ephemerally_pinned()) {
13312 dout(10) << "ephemeral export pin to " << export_pin << " for " << *in << dendl;
13313 }
eafe8130
TL
13314 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13315 << " max_mds=" << mdsmap.get_max_mds() << dendl;
13316 if (export_pin >= mdsmap.get_max_mds()) {
13317 it++;
13318 continue;
13319 }
13320
13321 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13322 it = q.erase(it);
f6b5b4d7 13323 in->queue_export_pin(export_pin);
eafe8130 13324 }
eafe8130 13325
f6b5b4d7
TL
13326 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13327 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
13328 /* copy to vector to avoid removals during iteration */
13329 std::vector<CInode*> migrate;
13330 migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
13331 for (auto& in : migrate) {
13332 in->maybe_ephemeral_rand();
13333 }
13334 migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
13335 for (auto& in : migrate) {
13336 in->maybe_ephemeral_dist();
13337 }
13338 }
13339}
7f7e6c64
TL
13340
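// upkeep_main: body of the cache upkeep thread. Roughly every
// mds_cache_trim_interval it takes mds_lock, checks memory usage, trims the
// cache (and client leases when active with clients), and recalls client
// state when the cache is too full; roughly every
// mds_cache_release_free_interval it returns freed heap memory to the OS.
// It then sleeps on upkeep_cvar for the shorter of the two remaining
// intervals (the *0.90 factor appears intended to tolerate slightly-early
// wakeups).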
13341void MDCache::upkeep_main(void)
13342{
13343 std::unique_lock lock(upkeep_mutex);
13344 while (!upkeep_trim_shutdown.load()) {
13345 auto now = clock::now();
13346 auto since = now-upkeep_last_trim;
13347 auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
13348 if (since >= trim_interval*.90) {
13349 lock.unlock(); /* mds_lock -> upkeep_mutex */
13350 std::scoped_lock mds_lock(mds->mds_lock);
13351 lock.lock();
13352 if (upkeep_trim_shutdown.load())
13353 return;
13354 check_memory_usage();
13355 if (mds->is_cache_trimmable()) {
13356 dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
13357 bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
13358 if (active_with_clients) {
13359 trim_client_leases();
13360 }
13361 trim();
13362 if (active_with_clients) {
13363 auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
13364 if (cache_toofull()) {
13365 recall_flags = recall_flags|Server::RecallFlags::TRIM;
13366 }
13367 mds->server->recall_client_state(nullptr, recall_flags);
13368 }
13369 upkeep_last_trim = now = clock::now();
13370 } else {
13371 dout(10) << "cache not ready for trimming" << dendl;
13372 }
13373 } else {
13374 trim_interval -= since;
13375 }
13376 since = now-upkeep_last_release;
13377 auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
13378 if (since >= release_interval*.90) {
13379 /* XXX not necessary once MDCache uses PriorityCache */
13380 dout(10) << "releasing free memory" << dendl;
13381 ceph_heap_release_free_memory();
13382 upkeep_last_release = clock::now();
13383 } else {
13384 release_interval -= since;
13385 }
13386 auto interval = std::min(release_interval, trim_interval);
13387 dout(20) << "upkeep thread waiting interval " << interval << dendl;
13388 upkeep_cvar.wait_for(lock, interval);
13389 }
13390}