1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <errno.h>
16#include <fstream>
17#include <iostream>
18#include <sstream>
19#include <string>
11fdf7f2 20#include <string_view>
7c673cae
FG
21#include <map>
22
23#include "MDCache.h"
24#include "MDSRank.h"
25#include "Server.h"
26#include "Locker.h"
27#include "MDLog.h"
28#include "MDBalancer.h"
29#include "Migrator.h"
30#include "ScrubStack.h"
31
32#include "SnapClient.h"
33
34#include "MDSMap.h"
35
36#include "CInode.h"
37#include "CDir.h"
38
39#include "Mutation.h"
40
41#include "include/ceph_fs.h"
42#include "include/filepath.h"
181888fb 43#include "include/util.h"
7c673cae 44
11fdf7f2
TL
45#include "messages/MClientCaps.h"
46
7c673cae
FG
47#include "msg/Message.h"
48#include "msg/Messenger.h"
49
181888fb 50#include "common/MemoryModel.h"
7c673cae 51#include "common/errno.h"
7c673cae 52#include "common/perf_counters.h"
181888fb
FG
53#include "common/safe_io.h"
54
7c673cae
FG
55#include "osdc/Journaler.h"
56#include "osdc/Filer.h"
57
58#include "events/ESubtreeMap.h"
59#include "events/EUpdate.h"
60#include "events/ESlaveUpdate.h"
61#include "events/EImportFinish.h"
62#include "events/EFragment.h"
63#include "events/ECommitted.h"
64#include "events/ESessions.h"
65
7c673cae
FG
66#include "InoTable.h"
67
68#include "common/Timer.h"
69
70#include "perfglue/heap_profiler.h"
71
7c673cae
FG
72
73#include "common/config.h"
11fdf7f2 74#include "include/ceph_assert.h"
7c673cae
FG
75
76#define dout_context g_ceph_context
77#define dout_subsys ceph_subsys_mds
78#undef dout_prefix
79#define dout_prefix _prefix(_dout, mds)
80static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
81 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
82}
83
84set<int> SimpleLock::empty_gather_set;
85
86
87/**
88 * All non-I/O contexts that require a reference
89 * to an MDCache instance descend from this.
90 */
11fdf7f2 91class MDCacheContext : public virtual MDSContext {
7c673cae
FG
92protected:
93 MDCache *mdcache;
94 MDSRank *get_mds() override
95 {
11fdf7f2 96 ceph_assert(mdcache != NULL);
7c673cae
FG
97 return mdcache->mds;
98 }
99public:
100 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
101};
102
103
104/**
105 * Only for contexts called back from an I/O completion
106 *
107 * Note: duplication of members wrt MDCacheContext, because
108 * it's the lesser of two evils compared with introducing
109 * yet another piece of (multiple) inheritance.
110 */
111class MDCacheIOContext : public virtual MDSIOContextBase {
112protected:
113 MDCache *mdcache;
114 MDSRank *get_mds() override
115 {
11fdf7f2 116 ceph_assert(mdcache != NULL);
7c673cae
FG
117 return mdcache->mds;
118 }
119public:
91327a77
AA
120 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
121 MDSIOContextBase(track), mdcache(mdc_) {}
7c673cae
FG
122};
123
124class MDCacheLogContext : public virtual MDSLogContextBase {
125protected:
126 MDCache *mdcache;
127 MDSRank *get_mds() override
128 {
11fdf7f2 129 ceph_assert(mdcache != NULL);
7c673cae
FG
130 return mdcache->mds;
131 }
132public:
133 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
134};
135
136MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
137 mds(m),
138 filer(m->objecter, m->finisher),
139 exceeded_size_limit(false),
140 recovery_queue(m),
a8e16298 141 stray_manager(m, purge_queue_),
11fdf7f2
TL
142 trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")),
143 open_file_table(m)
7c673cae
FG
144{
145 migrator.reset(new Migrator(mds, this));
146 root = NULL;
147 myin = NULL;
148 readonly = false;
149
150 stray_index = 0;
151 for (int i = 0; i < NUM_STRAY; ++i) {
152 strays[i] = NULL;
153 }
154
b32b8144 155 num_shadow_inodes = 0;
7c673cae
FG
156 num_inodes_with_caps = 0;
157
11fdf7f2
TL
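 // Cap on per-commit dir size: use mds_dir_max_commit_size (MiB) when set,
 // otherwise fall back to 90% of osd_max_write_size.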
158 max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
159 (g_conf()->mds_dir_max_commit_size << 20) :
160 (0.9 *(g_conf()->osd_max_write_size << 20));
7c673cae
FG
161
162 discover_last_tid = 0;
163 open_ino_last_tid = 0;
164 find_ino_peer_last_tid = 0;
165
166 last_cap_id = 0;
167
168 client_lease_durations[0] = 5.0;
169 client_lease_durations[1] = 30.0;
170 client_lease_durations[2] = 300.0;
171
172 resolves_pending = false;
173 rejoins_pending = false;
174 cap_imports_num_opening = 0;
175
176 opening_root = open = false;
91327a77 177
11fdf7f2
TL
178 cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
179 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
180 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
181 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 182
11fdf7f2 183 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
7c673cae 184
31f18b77
FG
185 bottom_lru.lru_set_midpoint(0);
186
11fdf7f2 187 decayrate.set_halflife(g_conf()->mds_decay_halflife);
7c673cae
FG
188
189 did_shutdown_log_cap = false;
11fdf7f2
TL
190
191 global_snaprealm = NULL;
7c673cae
FG
192}
193
194MDCache::~MDCache()
195{
196 if (logger) {
197 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
198 }
199}
200
11fdf7f2 201void MDCache::handle_conf_change(const ConfigProxy& conf,
91327a77
AA
202 const std::set <std::string> &changed,
203 const MDSMap &mdsmap)
204{
205 if (changed.count("mds_cache_size"))
11fdf7f2 206 cache_inode_limit = g_conf().get_val<int64_t>("mds_cache_size");
91327a77 207 if (changed.count("mds_cache_memory_limit"))
11fdf7f2 208 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
91327a77 209 if (changed.count("mds_cache_reservation"))
11fdf7f2 210 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
91327a77 211 if (changed.count("mds_health_cache_threshold"))
11fdf7f2 212 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
91327a77 213 if (changed.count("mds_cache_mid"))
11fdf7f2 214 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
a8e16298 215 if (changed.count("mds_cache_trim_decay_rate")) {
11fdf7f2 216 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
a8e16298 217 }
7c673cae 218
91327a77
AA
219 migrator->handle_conf_change(conf, changed, mdsmap);
220 mds->balancer->handle_conf_change(conf, changed, mdsmap);
221}
7c673cae
FG
222
223void MDCache::log_stat()
224{
91327a77 225 mds->logger->set(l_mds_inode_max, cache_inode_limit ? : INT_MAX);
7c673cae
FG
226 mds->logger->set(l_mds_inodes, lru.lru_get_size());
227 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
228 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
229 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
230 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
231 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
232 mds->logger->set(l_mds_caps, Capability::count());
233}
234
235
236//
237
238bool MDCache::shutdown()
239{
240 if (lru.lru_get_size() > 0) {
241 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
242 //show_cache();
243 show_subtrees();
244 //dump();
245 }
246 return true;
247}
248
249
250// ====================================================================
251// some inode functions
252
253void MDCache::add_inode(CInode *in)
254{
255 // add to lru, inode map
b32b8144
FG
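 // Head inodes (last == CEPH_NOSNAP) are keyed by ino in inode_map;
 // snapshotted inodes are keyed by vinodeno (ino + last) in snap_inode_map.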
256 if (in->last == CEPH_NOSNAP) {
257 auto &p = inode_map[in->ino()];
11fdf7f2 258 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
259 p = in;
260 } else {
261 auto &p = snap_inode_map[in->vino()];
11fdf7f2 262 ceph_assert(!p); // should be no dup inos!
b32b8144
FG
263 p = in;
264 }
7c673cae
FG
265
266 if (in->ino() < MDS_INO_SYSTEM_BASE) {
267 if (in->ino() == MDS_INO_ROOT)
268 root = in;
269 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
270 myin = in;
271 else if (in->is_stray()) {
272 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
273 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
274 }
275 }
276 if (in->is_base())
277 base_inodes.insert(in);
278 }
279
181888fb 280 if (cache_toofull()) {
7c673cae
FG
281 exceeded_size_limit = true;
282 }
283}
284
285void MDCache::remove_inode(CInode *o)
286{
287 dout(14) << "remove_inode " << *o << dendl;
288
289 if (o->get_parent_dn()) {
290 // FIXME: multiple parents?
291 CDentry *dn = o->get_parent_dn();
11fdf7f2 292 ceph_assert(!dn->is_dirty());
7c673cae
FG
293 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
294 }
295
296 if (o->is_dirty())
297 o->mark_clean();
298 if (o->is_dirty_parent())
299 o->clear_dirty_parent();
300
301 o->clear_scatter_dirty();
302
303 o->item_open_file.remove_myself();
304
31f18b77
FG
305 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
306 export_pin_queue.erase(o);
7c673cae
FG
307
308 // remove from inode map
11fdf7f2 309 if (o->last == CEPH_NOSNAP) {
b32b8144 310 inode_map.erase(o->ino());
11fdf7f2
TL
311 } else {
312 o->item_caps.remove_myself();
b32b8144 313 snap_inode_map.erase(o->vino());
11fdf7f2 314 }
7c673cae
FG
315
316 if (o->ino() < MDS_INO_SYSTEM_BASE) {
317 if (o == root) root = 0;
318 if (o == myin) myin = 0;
319 if (o->is_stray()) {
320 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
321 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
322 }
323 }
324 if (o->is_base())
325 base_inodes.erase(o);
11fdf7f2 326 }
7c673cae
FG
327
328 // delete it
11fdf7f2 329 ceph_assert(o->get_num_ref() == 0);
7c673cae
FG
330 delete o;
331}
332
333file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
334{
335 file_layout_t result = file_layout_t::get_default();
336 result.pool_id = mdsmap.get_first_data_pool();
337 return result;
338}
339
340file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
341{
342 file_layout_t result = file_layout_t::get_default();
343 result.pool_id = mdsmap.get_metadata_pool();
11fdf7f2
TL
344 if (g_conf()->mds_log_segment_size > 0) {
345 result.object_size = g_conf()->mds_log_segment_size;
346 result.stripe_unit = g_conf()->mds_log_segment_size;
7c673cae
FG
347 }
348 return result;
349}
350
351void MDCache::init_layouts()
352{
353 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
354 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
355}
356
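// Initialize a bare in-memory system inode (root, mdsdir, stray dirs):
// version 1, mode 0500|mode, nlink 1, and a default dir or file layout
// depending on the inode type.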
357void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
358 int mode) const
359{
360 in->inode.ino = ino;
361 in->inode.version = 1;
362 in->inode.xattr_version = 1;
363 in->inode.mode = 0500 | mode;
364 in->inode.size = 0;
365 in->inode.ctime =
366 in->inode.mtime =
367 in->inode.btime = ceph_clock_now();
368 in->inode.nlink = 1;
369 in->inode.truncate_size = -1ull;
370 in->inode.change_attr = 0;
371 in->inode.export_pin = MDS_RANK_NONE;
372
373 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
374 if (in->inode.is_dir()) {
11fdf7f2 375 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
f64942e4
AA
376 in->inode.rstat.rsubdirs = 1; /* itself */
377 in->inode.rstat.rctime = in->inode.ctime;
7c673cae
FG
378 } else {
379 in->inode.layout = default_file_layout;
380 ++in->inode.rstat.rfiles;
381 }
382 in->inode.accounted_rstat = in->inode.rstat;
383
384 if (in->is_base()) {
385 if (in->is_root())
386 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
387 else
388 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
389 in->open_snaprealm(); // empty snaprealm
11fdf7f2 390 ceph_assert(!in->snaprealm->parent); // created its own
7c673cae
FG
391 in->snaprealm->srnode.seq = 1;
392 }
393}
394
395CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
396{
397 dout(0) << "creating system inode with ino:" << ino << dendl;
398 CInode *in = new CInode(this);
399 create_unlinked_system_inode(in, ino, mode);
400 add_inode(in);
401 return in;
402}
403
404CInode *MDCache::create_root_inode()
405{
406 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
11fdf7f2
TL
407 i->inode.uid = g_conf()->mds_root_ino_uid;
408 i->inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
409 i->inode.layout = default_file_layout;
410 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
411 return i;
412}
413
414void MDCache::create_empty_hierarchy(MDSGather *gather)
415{
416 // create root dir
417 CInode *root = create_root_inode();
418
419 // force empty root dir
420 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
421 adjust_subtree_auth(rootdir, mds->get_nodeid());
422 rootdir->dir_rep = CDir::REP_ALL; //NONE;
423
11fdf7f2
TL
424 ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
425 ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
426 ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
f64942e4
AA
427 /* Do not update the rootdir fragment's rstat information; the rstat upkeep
428 * magic assumes version 0 is stale/invalid.
429 */
7c673cae
FG
430
431 rootdir->mark_complete();
432 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
433 rootdir->commit(0, gather->new_sub());
434
28e407b8
AA
435 root->mark_clean();
436 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
437 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
438 root->flush(gather->new_sub());
7c673cae
FG
439}
440
441void MDCache::create_mydir_hierarchy(MDSGather *gather)
442{
443 // create mds dir
444 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
445
446 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
447 adjust_subtree_auth(mydir, mds->get_nodeid());
448
449 LogSegment *ls = mds->mdlog->get_current_segment();
450
451 // stray dir
452 for (int i = 0; i < NUM_STRAY; ++i) {
453 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
454 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
455 stringstream name;
456 name << "stray" << i;
457 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
458 sdn->_mark_dirty(mds->mdlog->get_current_segment());
459
460 stray->inode.dirstat = straydir->fnode.fragstat;
461
462 mydir->fnode.rstat.add(stray->inode.rstat);
463 mydir->fnode.fragstat.nsubdirs++;
464 // save them
465 straydir->mark_complete();
466 straydir->mark_dirty(straydir->pre_dirty(), ls);
467 straydir->commit(0, gather->new_sub());
28e407b8 468 stray->mark_dirty_parent(ls, true);
7c673cae
FG
469 stray->store_backtrace(gather->new_sub());
470 }
471
472 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
473 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
474
475 myin->inode.dirstat = mydir->fnode.fragstat;
476 myin->inode.rstat = mydir->fnode.rstat;
477 ++myin->inode.rstat.rsubdirs;
478 myin->inode.accounted_rstat = myin->inode.rstat;
479
480 mydir->mark_complete();
481 mydir->mark_dirty(mydir->pre_dirty(), ls);
482 mydir->commit(0, gather->new_sub());
483
484 myin->store(gather->new_sub());
485}
486
487struct C_MDC_CreateSystemFile : public MDCacheLogContext {
488 MutationRef mut;
489 CDentry *dn;
490 version_t dpv;
11fdf7f2
TL
491 MDSContext *fin;
492 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
7c673cae
FG
493 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
494 void finish(int r) override {
495 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
496 }
497};
498
11fdf7f2 499void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
7c673cae
FG
500{
501 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
502 CDentry *dn = dir->add_null_dentry(name);
503
504 dn->push_projected_linkage(in);
505 version_t dpv = dn->pre_dirty();
506
507 CDir *mdir = 0;
508 if (in->inode.is_dir()) {
509 in->inode.rstat.rsubdirs = 1;
510
511 mdir = in->get_or_open_dirfrag(this, frag_t());
512 mdir->mark_complete();
513 mdir->pre_dirty();
514 } else
515 in->inode.rstat.rfiles = 1;
516 in->inode.version = dn->pre_dirty();
517
518 SnapRealm *realm = dir->get_inode()->find_snaprealm();
519 dn->first = in->first = realm->get_newest_seq() + 1;
520
521 MutationRef mut(new MutationImpl());
522
523 // force some locks. hacky.
524 mds->locker->wrlock_force(&dir->inode->filelock, mut);
525 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
526
527 mut->ls = mds->mdlog->get_current_segment();
528 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
529 mds->mdlog->start_entry(le);
530
531 if (!in->is_mdsdir()) {
532 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
533 le->metablob.add_primary_dentry(dn, in, true);
534 } else {
535 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
536 journal_dirty_inode(mut.get(), &le->metablob, in);
537 dn->push_projected_linkage(in->ino(), in->d_type());
538 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
539 le->metablob.add_root(true, in);
540 }
541 if (mdir)
542 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
543
544 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
545 mds->mdlog->flush();
546}
547
11fdf7f2 548void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
7c673cae
FG
549{
550 dout(10) << "_create_system_file_finish " << *dn << dendl;
551
552 dn->pop_projected_linkage();
553 dn->mark_dirty(dpv, mut->ls);
554
555 CInode *in = dn->get_linkage()->get_inode();
556 in->inode.version--;
557 in->mark_dirty(in->inode.version + 1, mut->ls);
558
559 if (in->inode.is_dir()) {
560 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 561 ceph_assert(dir);
7c673cae
FG
562 dir->mark_dirty(1, mut->ls);
563 dir->mark_new(mut->ls);
564 }
565
566 mut->apply();
567 mds->locker->drop_locks(mut.get());
568 mut->cleanup();
569
570 fin->complete(0);
571
572 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
573 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
574}
575
576
577
578struct C_MDS_RetryOpenRoot : public MDSInternalContext {
579 MDCache *cache;
580 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
581 void finish(int r) override {
582 if (r < 0) {
583 // If we can't open root, something disastrous has happened: mark
584 // this rank damaged for operator intervention. Note that
585 // it is not okay to call suicide() here because we are in
586 // a Finisher callback.
587 cache->mds->damaged();
588 ceph_abort(); // damaged should never return
589 } else {
590 cache->open_root();
591 }
592 }
593};
594
11fdf7f2 595void MDCache::open_root_inode(MDSContext *c)
7c673cae
FG
596{
597 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
598 CInode *in;
599 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
600 in->fetch(c);
601 } else {
602 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
603 }
604}
605
11fdf7f2 606void MDCache::open_mydir_inode(MDSContext *c)
7c673cae 607{
7c673cae 608 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
11fdf7f2 609 in->fetch(c);
7c673cae
FG
610}
611
11fdf7f2 612void MDCache::open_mydir_frag(MDSContext *c)
28e407b8
AA
613{
614 open_mydir_inode(
615 new MDSInternalContextWrapper(mds,
616 new FunctionContext([this, c](int r) {
617 if (r < 0) {
618 c->complete(r);
619 return;
620 }
621 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 622 ceph_assert(mydir);
28e407b8
AA
623 adjust_subtree_auth(mydir, mds->get_nodeid());
624 mydir->fetch(c);
625 })
626 )
627 );
628}
629
7c673cae
FG
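// open_root(): if this rank is auth for the root inode, open/fetch the root
// dirfrag locally; otherwise discover/replicate it from the auth rank. Then
// make sure our own mdsdir exists as a subtree root before populating the
// stray directories via populate_mydir().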
630void MDCache::open_root()
631{
632 dout(10) << "open_root" << dendl;
633
634 if (!root) {
635 open_root_inode(new C_MDS_RetryOpenRoot(this));
636 return;
637 }
638 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
11fdf7f2 639 ceph_assert(root->is_auth());
7c673cae 640 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
11fdf7f2 641 ceph_assert(rootdir);
7c673cae
FG
642 if (!rootdir->is_subtree_root())
643 adjust_subtree_auth(rootdir, mds->get_nodeid());
644 if (!rootdir->is_complete()) {
645 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
646 return;
647 }
648 } else {
11fdf7f2 649 ceph_assert(!root->is_auth());
7c673cae
FG
650 CDir *rootdir = root->get_dirfrag(frag_t());
651 if (!rootdir) {
224ce89b 652 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
7c673cae
FG
653 return;
654 }
655 }
656
657 if (!myin) {
658 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
659 in->fetch(new C_MDS_RetryOpenRoot(this));
660 return;
661 }
662 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 663 ceph_assert(mydir);
7c673cae
FG
664 adjust_subtree_auth(mydir, mds->get_nodeid());
665
666 populate_mydir();
667}
668
669void MDCache::populate_mydir()
670{
11fdf7f2 671 ceph_assert(myin);
7c673cae 672 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
11fdf7f2 673 ceph_assert(mydir);
7c673cae
FG
674
675 dout(10) << "populate_mydir " << *mydir << dendl;
676
677 if (!mydir->is_complete()) {
678 mydir->fetch(new C_MDS_RetryOpenRoot(this));
679 return;
680 }
681
682 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
683 // A missing dirfrag; we will recreate it. It must be marked dirty
684 // before we dirty any of the strays we create within it.
685 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
686 "recreating it now";
687 LogSegment *ls = mds->mdlog->get_current_segment();
688 mydir->state_clear(CDir::STATE_BADFRAG);
689 mydir->mark_complete();
690 mydir->mark_dirty(mydir->pre_dirty(), ls);
691 }
692
693 // open or create stray
694 uint64_t num_strays = 0;
695 for (int i = 0; i < NUM_STRAY; ++i) {
696 stringstream name;
697 name << "stray" << i;
698 CDentry *straydn = mydir->lookup(name.str());
699
700 // allow for older fs's with stray instead of stray0
701 if (straydn == NULL && i == 0)
702 straydn = mydir->lookup("stray");
703
704 if (!straydn || !straydn->get_linkage()->get_inode()) {
705 _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
706 new C_MDS_RetryOpenRoot(this));
707 return;
708 }
11fdf7f2
TL
709 ceph_assert(straydn);
710 ceph_assert(strays[i]);
7c673cae
FG
711 // we make multiple passes through this method; make sure we only pin each stray once.
712 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
713 strays[i]->get(CInode::PIN_STRAY);
714 strays[i]->state_set(CInode::STATE_STRAYPINNED);
715 strays[i]->get_stickydirs();
716 }
717 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
718
719 // open all frags
11fdf7f2
TL
720 frag_vec_t leaves;
721 strays[i]->dirfragtree.get_leaves(leaves);
722 for (const auto& leaf : leaves) {
723 CDir *dir = strays[i]->get_dirfrag(leaf);
7c673cae 724 if (!dir) {
11fdf7f2 725 dir = strays[i]->get_or_open_dirfrag(this, leaf);
7c673cae
FG
726 }
727
728 // DamageTable applies special handling to strays: it will
729 // have damaged() us out if one is damaged.
11fdf7f2 730 ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
7c673cae
FG
731
732 if (dir->get_version() == 0) {
733 dir->fetch(new C_MDS_RetryOpenRoot(this));
734 return;
735 }
736
737 if (dir->get_frag_size() > 0)
738 num_strays += dir->get_frag_size();
739 }
740 }
741
7c673cae
FG
742 // okay!
743 dout(10) << "populate_mydir done" << dendl;
11fdf7f2 744 ceph_assert(!open);
7c673cae
FG
745 open = true;
746 mds->queue_waiters(waiting_for_open);
747
11fdf7f2
TL
748 stray_manager.set_num_strays(num_strays);
749 stray_manager.activate();
750
7c673cae
FG
751 scan_stray_dir();
752}
753
11fdf7f2 754void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
7c673cae
FG
755{
756 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
757}
758
759CDir *MDCache::get_stray_dir(CInode *in)
760{
761 string straydname;
762 in->name_stray_dentry(straydname);
763
764 CInode *strayi = get_stray();
11fdf7f2 765 ceph_assert(strayi);
7c673cae
FG
766 frag_t fg = strayi->pick_dirfrag(straydname);
767 CDir *straydir = strayi->get_dirfrag(fg);
11fdf7f2 768 ceph_assert(straydir);
7c673cae
FG
769 return straydir;
770}
771
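// The stray dentry name is derived from the inode (name_stray_dentry());
// it lives in one of this rank's stray directories (get_stray()), in the
// dirfrag selected by hashing that name (pick_dirfrag()).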
772CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
773{
774 CDir *straydir = get_stray_dir(in);
775 string straydname;
776 in->name_stray_dentry(straydname);
777 CDentry *straydn = straydir->lookup(straydname);
778 if (!straydn) {
779 straydn = straydir->add_null_dentry(straydname);
780 straydn->mark_new();
781 } else {
11fdf7f2 782 ceph_assert(straydn->get_projected_linkage()->is_null());
7c673cae
FG
783 }
784
785 straydn->state_set(CDentry::STATE_STRAY);
786 return straydn;
787}
788
789
790
11fdf7f2 791MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
7c673cae
FG
792{
793 // inode?
794 if (info.ino)
795 return get_inode(info.ino, info.snapid);
796
797 // dir or dentry.
798 CDir *dir = get_dirfrag(info.dirfrag);
799 if (!dir) return 0;
800
801 if (info.dname.length())
802 return dir->lookup(info.dname, info.snapid);
803 else
804 return dir;
805}
806
807
808
809
810// ====================================================================
811// subtree management
812
7c673cae
FG
813/*
814 * adjust the dir_auth of a subtree.
815 * merge with parent and/or child subtrees, if it is appropriate.
816 * merge can ONLY happen if both parent and child have unambiguous auth.
817 */
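/*
 * Illustrative example: if /a is currently a subtree rooted here and we call
 * adjust_subtree_auth() on /a/b with a different auth, /a/b becomes a new
 * subtree root and is recorded as a bound of /a's subtree; any existing
 * bounds that now fall beneath /a/b are re-parented under it.
 */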
28e407b8 818void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
7c673cae
FG
819{
820 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
821 << " on " << *dir << dendl;
822
7c673cae
FG
823 show_subtrees();
824
825 CDir *root;
826 if (dir->inode->is_base()) {
827 root = dir; // bootstrap hack.
828 if (subtrees.count(root) == 0) {
829 subtrees[root];
830 root->get(CDir::PIN_SUBTREE);
831 }
832 } else {
833 root = get_subtree_root(dir); // subtree root
834 }
11fdf7f2
TL
835 ceph_assert(root);
836 ceph_assert(subtrees.count(root));
7c673cae
FG
837 dout(7) << " current root is " << *root << dendl;
838
839 if (root == dir) {
840 // i am already a subtree.
841 dir->set_dir_auth(auth);
842 } else {
843 // i am a new subtree.
844 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 845 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
846 subtrees[dir]; // create empty subtree bounds list for me.
847 dir->get(CDir::PIN_SUBTREE);
848
849 // set dir_auth
850 dir->set_dir_auth(auth);
851
852 // move items nested beneath me, under me.
853 set<CDir*>::iterator p = subtrees[root].begin();
854 while (p != subtrees[root].end()) {
855 set<CDir*>::iterator next = p;
856 ++next;
857 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
858 // move under me
859 dout(10) << " claiming child bound " << **p << dendl;
860 subtrees[dir].insert(*p);
861 subtrees[root].erase(p);
862 }
863 p = next;
864 }
865
866 // i am a bound of the parent subtree.
867 subtrees[root].insert(dir);
868
869 // i am now the subtree root.
870 root = dir;
871
872 // adjust recursive pop counters
28e407b8 873 if (adjust_pop && dir->is_auth()) {
7c673cae
FG
874 CDir *p = dir->get_parent_dir();
875 while (p) {
11fdf7f2 876 p->pop_auth_subtree.sub(dir->pop_auth_subtree);
7c673cae
FG
877 if (p->is_subtree_root()) break;
878 p = p->inode->get_parent_dir();
879 }
880 }
7c673cae
FG
881 }
882
883 show_subtrees();
884}
885
886
887void MDCache::try_subtree_merge(CDir *dir)
888{
889 dout(7) << "try_subtree_merge " << *dir << dendl;
b32b8144
FG
890 // record my old bounds
891 auto oldbounds = subtrees.at(dir);
7c673cae 892
224ce89b 893 set<CInode*> to_eval;
7c673cae 894 // try merge at my root
224ce89b 895 try_subtree_merge_at(dir, &to_eval);
7c673cae
FG
896
897 // try merge at my old bounds
224ce89b
WB
898 for (auto bound : oldbounds)
899 try_subtree_merge_at(bound, &to_eval);
900
901 if (!(mds->is_any_replay() || mds->is_resolve())) {
902 for(auto in : to_eval)
903 eval_subtree_root(in);
904 }
7c673cae
FG
905}
906
907class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
908 CInode *in;
909 MutationRef mut;
910public:
911 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
912 void finish(int r) override {
913 mdcache->subtree_merge_writebehind_finish(in, mut);
914 }
915};
916
28e407b8 917void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
7c673cae
FG
918{
919 dout(10) << "try_subtree_merge_at " << *dir << dendl;
b32b8144
FG
920
921 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
922 dir->state_test(CDir::STATE_EXPORTBOUND) ||
923 dir->state_test(CDir::STATE_AUXSUBTREE))
924 return;
925
926 auto it = subtrees.find(dir);
11fdf7f2 927 ceph_assert(it != subtrees.end());
7c673cae 928
7c673cae
FG
929 // merge with parent?
930 CDir *parent = dir;
931 if (!dir->inode->is_base())
932 parent = get_subtree_root(dir->get_parent_dir());
933
b32b8144
FG
934 if (parent != dir && // we have a parent,
935 parent->dir_auth == dir->dir_auth) { // auth matches,
7c673cae
FG
936 // merge with parent.
937 dout(10) << " subtree merge at " << *dir << dendl;
938 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
939
940 // move our bounds under the parent
b32b8144 941 subtrees[parent].insert(it->second.begin(), it->second.end());
7c673cae
FG
942
943 // we are no longer a subtree or bound
944 dir->put(CDir::PIN_SUBTREE);
b32b8144 945 subtrees.erase(it);
7c673cae
FG
946 subtrees[parent].erase(dir);
947
948 // adjust popularity?
28e407b8 949 if (adjust_pop && dir->is_auth()) {
28e407b8 950 CDir *cur = dir;
7c673cae
FG
951 CDir *p = dir->get_parent_dir();
952 while (p) {
11fdf7f2 953 p->pop_auth_subtree.add(dir->pop_auth_subtree);
28e407b8 954 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
7c673cae 955 if (p->is_subtree_root()) break;
28e407b8 956 cur = p;
7c673cae
FG
957 p = p->inode->get_parent_dir();
958 }
959 }
960
224ce89b
WB
961 if (to_eval && dir->get_inode()->is_auth())
962 to_eval->insert(dir->get_inode());
7c673cae 963
181888fb
FG
964 show_subtrees(15);
965 }
7c673cae
FG
966}
967
968void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
969{
970 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
971 in->pop_and_dirty_projected_inode(mut->ls);
972
973 mut->apply();
974 mds->locker->drop_locks(mut.get());
975 mut->cleanup();
976
977 in->auth_unpin(this);
978}
979
980void MDCache::eval_subtree_root(CInode *diri)
981{
982 // evaluate subtree inode filelock?
983 // (we should scatter the filelock on subtree bounds)
11fdf7f2 984 ceph_assert(diri->is_auth());
224ce89b 985 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
7c673cae
FG
986}
987
988
11fdf7f2 989void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
7c673cae
FG
990{
991 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
992 << " on " << *dir
993 << " bounds " << bounds
994 << dendl;
995
996 show_subtrees();
997
998 CDir *root;
999 if (dir->ino() == MDS_INO_ROOT) {
1000 root = dir; // bootstrap hack.
1001 if (subtrees.count(root) == 0) {
1002 subtrees[root];
1003 root->get(CDir::PIN_SUBTREE);
1004 }
1005 } else {
1006 root = get_subtree_root(dir); // subtree root
1007 }
11fdf7f2
TL
1008 ceph_assert(root);
1009 ceph_assert(subtrees.count(root));
7c673cae
FG
1010 dout(7) << " current root is " << *root << dendl;
1011
1012 mds_authority_t oldauth = dir->authority();
1013
1014 if (root == dir) {
1015 // i am already a subtree.
1016 dir->set_dir_auth(auth);
1017 } else {
1018 // i am a new subtree.
1019 dout(10) << " new subtree at " << *dir << dendl;
11fdf7f2 1020 ceph_assert(subtrees.count(dir) == 0);
7c673cae
FG
1021 subtrees[dir]; // create empty subtree bounds list for me.
1022 dir->get(CDir::PIN_SUBTREE);
1023
1024 // set dir_auth
1025 dir->set_dir_auth(auth);
1026
1027 // move items nested beneath me, under me.
1028 set<CDir*>::iterator p = subtrees[root].begin();
1029 while (p != subtrees[root].end()) {
1030 set<CDir*>::iterator next = p;
1031 ++next;
1032 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1033 // move under me
1034 dout(10) << " claiming child bound " << **p << dendl;
1035 subtrees[dir].insert(*p);
1036 subtrees[root].erase(p);
1037 }
1038 p = next;
1039 }
1040
1041 // i am a bound of the parent subtree.
1042 subtrees[root].insert(dir);
1043
1044 // i am now the subtree root.
1045 root = dir;
1046 }
1047
224ce89b
WB
1048 set<CInode*> to_eval;
1049
7c673cae
FG
1050 // verify/adjust bounds.
1051 // - these may be new, or
1052 // - beneath existing ambiguous bounds (which will be collapsed),
1053 // - but NOT beneath unambiguous bounds.
11fdf7f2 1054 for (const auto& bound : bounds) {
7c673cae
FG
1055 // new bound?
1056 if (subtrees[dir].count(bound) == 0) {
1057 if (get_subtree_root(bound) == dir) {
1058 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1059 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1060 }
1061 else {
1062 dout(10) << " want bound " << *bound << dendl;
1063 CDir *t = get_subtree_root(bound->get_parent_dir());
1064 if (subtrees[t].count(bound) == 0) {
11fdf7f2 1065 ceph_assert(t != dir);
7c673cae
FG
1066 dout(10) << " new bound " << *bound << dendl;
1067 adjust_subtree_auth(bound, t->authority());
1068 }
1069 // make sure it's nested beneath ambiguous subtree(s)
1070 while (1) {
1071 while (subtrees[dir].count(t) == 0)
1072 t = get_subtree_root(t->get_parent_dir());
1073 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1074 adjust_subtree_auth(t, auth);
224ce89b 1075 try_subtree_merge_at(t, &to_eval);
7c673cae
FG
1076 t = get_subtree_root(bound->get_parent_dir());
1077 if (t == dir) break;
1078 }
1079 }
1080 }
1081 else {
1082 dout(10) << " already have bound " << *bound << dendl;
1083 }
1084 }
1085 // merge stray bounds?
1086 while (!subtrees[dir].empty()) {
1087 set<CDir*> copy = subtrees[dir];
1088 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1089 if (bounds.count(*p) == 0) {
1090 CDir *stray = *p;
1091 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1092 adjust_subtree_auth(stray, auth);
224ce89b 1093 try_subtree_merge_at(stray, &to_eval);
7c673cae
FG
1094 }
1095 }
1096 // swallowing subtree may add new subtree bounds
1097 if (copy == subtrees[dir])
1098 break;
1099 }
1100
1101 // bound should now match.
1102 verify_subtree_bounds(dir, bounds);
1103
1104 show_subtrees();
224ce89b
WB
1105
1106 if (!(mds->is_any_replay() || mds->is_resolve())) {
1107 for(auto in : to_eval)
1108 eval_subtree_root(in);
1109 }
7c673cae
FG
1110}
1111
1112
1113/*
1114 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1115 * fragmentation as necessary to get an equivalent bounding set. That is, only
1116 * split if one of our frags spans the provided bounding set. Never merge.
1117 */
11fdf7f2 1118void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
7c673cae
FG
1119{
1120 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1121
1122 // sort by ino
1123 map<inodeno_t, fragset_t> byino;
11fdf7f2
TL
1124 for (auto& frag : dfs) {
1125 byino[frag.ino].insert(frag.frag);
1126 }
7c673cae
FG
1127 dout(10) << " by ino: " << byino << dendl;
1128
1129 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1130 CInode *diri = get_inode(p->first);
1131 if (!diri)
1132 continue;
1133 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1134
1135 fragtree_t tmpdft;
1136 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1137 tmpdft.force_to_leaf(g_ceph_context, *q);
1138
11fdf7f2
TL
1139 for (const auto& fg : p->second) {
1140 frag_vec_t leaves;
1141 diri->dirfragtree.get_leaves_under(fg, leaves);
1142 if (leaves.empty()) {
7c673cae
FG
1143 bool all = true;
1144 frag_t approx_fg = diri->dirfragtree[fg.value()];
11fdf7f2
TL
1145 frag_vec_t approx_leaves;
1146 tmpdft.get_leaves_under(approx_fg, approx_leaves);
1147 for (const auto& leaf : approx_leaves) {
1148 if (p->second.get().count(leaf) == 0) {
7c673cae 1149 // not bound, so the resolve message is from auth MDS of the dirfrag
11fdf7f2 1150 force_dir_fragment(diri, leaf);
7c673cae
FG
1151 all = false;
1152 }
1153 }
1154 if (all)
11fdf7f2 1155 leaves.push_back(approx_fg);
7c673cae 1156 else
11fdf7f2 1157 diri->dirfragtree.get_leaves_under(fg, leaves);
7c673cae 1158 }
11fdf7f2
TL
1159 dout(10) << " frag " << fg << " contains " << leaves << dendl;
1160 for (const auto& leaf : leaves) {
1161 CDir *dir = diri->get_dirfrag(leaf);
7c673cae
FG
1162 if (dir)
1163 bounds.insert(dir);
1164 }
1165 }
1166 }
1167}
1168
11fdf7f2 1169void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
7c673cae
FG
1170{
1171 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1172 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1173
1174 set<CDir*> bounds;
1175 get_force_dirfrag_bound_set(bound_dfs, bounds);
1176 adjust_bounded_subtree_auth(dir, bounds, auth);
1177}
1178
11fdf7f2 1179void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
7c673cae
FG
1180{
1181 dout(10) << "map_dirfrag_set " << dfs << dendl;
1182
1183 // group by inode
1184 map<inodeno_t, fragset_t> ino_fragset;
11fdf7f2
TL
1185 for (const auto &df : dfs) {
1186 ino_fragset[df.ino].insert(df.frag);
1187 }
7c673cae
FG
1188
1189 // get frags
1190 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1191 p != ino_fragset.end();
1192 ++p) {
1193 CInode *in = get_inode(p->first);
1194 if (!in)
1195 continue;
1196
11fdf7f2
TL
1197 frag_vec_t fgs;
1198 for (const auto& fg : p->second) {
1199 in->dirfragtree.get_leaves_under(fg, fgs);
1200 }
7c673cae 1201
11fdf7f2 1202 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
7c673cae
FG
1203 << " on " << *in << dendl;
1204
11fdf7f2
TL
1205 for (const auto& fg : fgs) {
1206 CDir *dir = in->get_dirfrag(fg);
7c673cae
FG
1207 if (dir)
1208 result.insert(dir);
1209 }
1210 }
1211}
1212
1213
1214
1215CDir *MDCache::get_subtree_root(CDir *dir)
1216{
1217 // find the underlying dir that delegates (or is about to delegate) auth
1218 while (true) {
1219 if (dir->is_subtree_root())
1220 return dir;
1221 dir = dir->get_inode()->get_parent_dir();
1222 if (!dir)
1223 return 0; // none
1224 }
1225}
1226
1227CDir *MDCache::get_projected_subtree_root(CDir *dir)
1228{
1229 // find the underlying dir that delegates (or is about to delegate) auth
1230 while (true) {
1231 if (dir->is_subtree_root())
1232 return dir;
1233 dir = dir->get_inode()->get_projected_parent_dir();
1234 if (!dir)
1235 return 0; // none
1236 }
1237}
1238
1239void MDCache::remove_subtree(CDir *dir)
1240{
1241 dout(10) << "remove_subtree " << *dir << dendl;
11fdf7f2
TL
1242 ceph_assert(subtrees.count(dir));
1243 ceph_assert(subtrees[dir].empty());
7c673cae
FG
1244 subtrees.erase(dir);
1245 dir->put(CDir::PIN_SUBTREE);
1246 if (dir->get_parent_dir()) {
1247 CDir *p = get_subtree_root(dir->get_parent_dir());
11fdf7f2 1248 ceph_assert(subtrees[p].count(dir));
7c673cae
FG
1249 subtrees[p].erase(dir);
1250 }
1251}
1252
1253void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1254{
11fdf7f2 1255 ceph_assert(subtrees.count(dir));
7c673cae
FG
1256 bounds = subtrees[dir];
1257}
1258
1259void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1260{
1261 if (subtrees.count(dir)) {
1262 // just copy them, dir is a subtree.
1263 get_subtree_bounds(dir, bounds);
1264 } else {
1265 // find them
1266 CDir *root = get_subtree_root(dir);
1267 for (set<CDir*>::iterator p = subtrees[root].begin();
1268 p != subtrees[root].end();
1269 ++p) {
1270 CDir *t = *p;
1271 while (t != root) {
1272 t = t->get_parent_dir();
11fdf7f2 1273 ceph_assert(t);
7c673cae
FG
1274 if (t == dir) {
1275 bounds.insert(*p);
1276 continue;
1277 }
1278 }
1279 }
1280 }
1281}
1282
1283void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1284{
1285 // for debugging only.
11fdf7f2 1286 ceph_assert(subtrees.count(dir));
7c673cae
FG
1287 if (bounds != subtrees[dir]) {
1288 dout(0) << "verify_subtree_bounds failed" << dendl;
1289 set<CDir*> b = bounds;
1290 for (auto &cd : subtrees[dir]) {
1291 if (bounds.count(cd)) {
1292 b.erase(cd);
1293 continue;
1294 }
1295 dout(0) << " missing bound " << *cd << dendl;
1296 }
1297 for (const auto &cd : b)
1298 dout(0) << " extra bound " << *cd << dendl;
1299 }
11fdf7f2 1300 ceph_assert(bounds == subtrees[dir]);
7c673cae
FG
1301}
1302
1303void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1304{
1305 // for debugging only.
11fdf7f2 1306 ceph_assert(subtrees.count(dir));
7c673cae
FG
1307
1308 // make sure that any bounds i do have are properly noted as such.
1309 int failed = 0;
1310 for (const auto &fg : bounds) {
1311 CDir *bd = get_dirfrag(fg);
1312 if (!bd) continue;
1313 if (subtrees[dir].count(bd) == 0) {
1314 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1315 failed++;
1316 }
1317 }
11fdf7f2 1318 ceph_assert(failed == 0);
7c673cae
FG
1319}
1320
1321void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1322{
1323 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1324 << " to " << *newdir << dendl;
1325 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1326}
1327
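// After a rename moves diri under a new parent, walk diri's dirfrags and
// re-home any subtree roots and bounds from the old parent subtree to the
// new one, preserving the old authority where the parent's auth differs.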
224ce89b 1328void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
7c673cae
FG
1329{
1330 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1331
7c673cae
FG
1332 CDir *newdir = diri->get_parent_dir();
1333
1334 if (pop) {
1335 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
11fdf7f2
TL
1336 ceph_assert(p != projected_subtree_renames.end());
1337 ceph_assert(!p->second.empty());
1338 ceph_assert(p->second.front().first == olddir);
1339 ceph_assert(p->second.front().second == newdir);
7c673cae
FG
1340 p->second.pop_front();
1341 if (p->second.empty())
1342 projected_subtree_renames.erase(p);
1343 }
1344
11fdf7f2
TL
1345 vector<CDir*> dfls;
1346
1347 // adjust total auth pin of freezing subtree
1348 if (olddir != newdir) {
1349 diri->get_nested_dirfrags(dfls);
1350 for (auto dir : dfls)
1351 olddir->adjust_freeze_after_rename(dir);
1352 dfls.clear();
1353 }
1354
7c673cae 1355 // adjust subtree
7c673cae
FG
1356 // make sure subtree dirfrags are at the front of the list
1357 diri->get_subtree_dirfrags(dfls);
1358 diri->get_nested_dirfrags(dfls);
11fdf7f2 1359 for (auto dir : dfls) {
7c673cae
FG
1360 dout(10) << "dirfrag " << *dir << dendl;
1361 CDir *oldparent = get_subtree_root(olddir);
1362 dout(10) << " old parent " << *oldparent << dendl;
1363 CDir *newparent = get_subtree_root(newdir);
1364 dout(10) << " new parent " << *newparent << dendl;
1365
28e407b8 1366 if (olddir != newdir)
11fdf7f2 1367 mds->balancer->adjust_pop_for_rename(olddir, dir, false);
28e407b8 1368
7c673cae
FG
1369 if (oldparent == newparent) {
1370 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
28e407b8 1371 } else if (dir->is_subtree_root()) {
7c673cae
FG
1372 // children are fine. change parent.
1373 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
11fdf7f2 1374 ceph_assert(subtrees[oldparent].count(dir));
7c673cae 1375 subtrees[oldparent].erase(dir);
11fdf7f2 1376 ceph_assert(subtrees.count(newparent));
7c673cae 1377 subtrees[newparent].insert(dir);
224ce89b 1378 // caller is responsible for 'eval diri'
28e407b8 1379 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1380 } else {
1381 // mid-subtree.
1382
1383 // see if any old bounds move to the new parent.
1384 list<CDir*> tomove;
1385 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
1386 p != subtrees[oldparent].end();
1387 ++p) {
1388 CDir *bound = *p;
1389 CDir *broot = get_subtree_root(bound->get_parent_dir());
1390 if (broot != oldparent) {
11fdf7f2 1391 ceph_assert(broot == newparent);
7c673cae
FG
1392 tomove.push_back(bound);
1393 }
1394 }
1395 for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
1396 CDir *bound = *p;
1397 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1398 subtrees[oldparent].erase(bound);
1399 subtrees[newparent].insert(bound);
1400 }
1401
1402 // did auth change?
1403 if (oldparent->authority() != newparent->authority()) {
28e407b8 1404 adjust_subtree_auth(dir, oldparent->authority(), false);
224ce89b 1405 // caller is responsible for 'eval diri'
28e407b8 1406 try_subtree_merge_at(dir, NULL, false);
7c673cae
FG
1407 }
1408 }
28e407b8
AA
1409
1410 if (olddir != newdir)
11fdf7f2 1411 mds->balancer->adjust_pop_for_rename(newdir, dir, true);
7c673cae
FG
1412 }
1413
1414 show_subtrees();
1415}
1416
7c673cae
FG
1417// ===================================
1418// journal and snap/cow helpers
1419
1420
1421/*
1422 * find first inode in cache that follows given snapid. otherwise, return current.
1423 */
1424CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1425{
1426 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
11fdf7f2 1427 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae 1428
b32b8144
FG
1429 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1430 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1431 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1432 in = p->second;
7c673cae 1433 }
b32b8144 1434
7c673cae
FG
1435 return in;
1436}
1437
1438
1439/*
1440 * note: i'm currently cheating wrt dirty and inode.version on cow
1441 * items. instead of doing a full dir predirty, i just take the
1442 * original item's version, and set the dirty flag (via
1443 * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
1444 * means a special case in the dir commit clean sweep assertions.
1445 * bah.
1446 */
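// cow_inode(): clone 'in' into an old CInode covering [in->first, last]
// (using the previous projected inode/xattrs), then advance in->first to
// last+1. For head inodes, clients holding writable caps are flagged so
// that the intervening snaps receive snapflushes.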
1447CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1448{
11fdf7f2 1449 ceph_assert(last >= in->first);
7c673cae 1450
b32b8144 1451 CInode *oldin = new CInode(this, true, in->first, last);
7c673cae 1452 oldin->inode = *in->get_previous_projected_inode();
7c673cae 1453 oldin->xattrs = *in->get_previous_projected_xattrs();
11fdf7f2 1454 oldin->symlink = in->symlink;
7c673cae
FG
1455 oldin->inode.trim_client_ranges(last);
1456
1457 if (in->first < in->oldest_snap)
1458 in->oldest_snap = in->first;
1459
1460 in->first = last+1;
1461
1462 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1463 add_inode(oldin);
1464
1465 if (in->last != CEPH_NOSNAP) {
1466 CInode *head_in = get_inode(in->ino());
11fdf7f2 1467 ceph_assert(head_in);
7c673cae
FG
1468 if (head_in->split_need_snapflush(oldin, in)) {
1469 oldin->client_snap_caps = in->client_snap_caps;
94b18763
FG
1470 for (const auto &p : in->client_snap_caps) {
1471 SimpleLock *lock = oldin->get_lock(p.first);
11fdf7f2 1472 ceph_assert(lock);
94b18763 1473 for (const auto &q : p.second) {
7c673cae
FG
1474 oldin->auth_pin(lock);
1475 lock->set_state(LOCK_SNAP_SYNC); // gathering
1476 lock->get_wrlock(true);
94b18763 1477 (void)q; /* unused */
7c673cae
FG
1478 }
1479 }
1480 }
1481 return oldin;
1482 }
1483
b32b8144
FG
1484 if (!in->client_caps.empty()) {
1485 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1486 // clone caps?
94b18763 1487 for (auto &p : in->client_caps) {
b32b8144 1488 client_t client = p.first;
11fdf7f2
TL
1489 Capability *cap = &p.second;
1490 int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
b32b8144
FG
1491 if ((issued & CEPH_CAP_ANY_WR) &&
1492 cap->client_follows < last) {
1493 // note in oldin
1494 for (int i = 0; i < num_cinode_locks; i++) {
1495 if (issued & cinode_lock_info[i].wr_caps) {
1496 int lockid = cinode_lock_info[i].lock;
1497 SimpleLock *lock = oldin->get_lock(lockid);
11fdf7f2 1498 ceph_assert(lock);
b32b8144
FG
1499 oldin->client_snap_caps[lockid].insert(client);
1500 oldin->auth_pin(lock);
1501 lock->set_state(LOCK_SNAP_SYNC); // gathering
1502 lock->get_wrlock(true);
1503 dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
1504 << " wrlock lock " << *lock << " on " << *oldin << dendl;
1505 }
7c673cae 1506 }
b32b8144
FG
1507 cap->client_follows = last;
1508
1509 // we need snapflushes for any intervening snaps
1510 dout(10) << " snaps " << snaps << dendl;
1511 for (auto q = snaps.lower_bound(oldin->first);
1512 q != snaps.end() && *q <= last;
1513 ++q) {
1514 in->add_need_snapflush(oldin, *q, client);
1515 }
1516 } else {
1517 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
7c673cae 1518 }
7c673cae
FG
1519 }
1520 }
7c673cae
FG
1521 return oldin;
1522}
1523
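// journal_cow_dentry(): for multiversion (or frozen ambiguous-auth) inodes
// the copy-on-write happens in place via cow_old_inode(); otherwise the
// dentry is split at 'follows' and an old dentry covering [oldfirst, follows]
// (primary via cow_inode(), or remote) is journalled in the metablob.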
1524void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1525 CDentry *dn, snapid_t follows,
1526 CInode **pcow_inode, CDentry::linkage_t *dnl)
1527{
1528 if (!dn) {
1529 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1530 return;
1531 }
1532 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
11fdf7f2 1533 ceph_assert(dn->is_auth());
7c673cae
FG
1534
1535 // nothing to cow on a null dentry, fix caller
1536 if (!dnl)
1537 dnl = dn->get_projected_linkage();
11fdf7f2 1538 ceph_assert(!dnl->is_null());
7c673cae 1539
11fdf7f2
TL
1540 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1541 bool cow_head = false;
1542 if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
1543 ceph_assert(in->is_frozen_inode());
1544 cow_head = true;
1545 }
1546 if (in && (in->is_multiversion() || cow_head)) {
7c673cae 1547 // multiversion inode.
7c673cae
FG
1548 SnapRealm *realm = NULL;
1549
1550 if (in->get_projected_parent_dn() != dn) {
11fdf7f2 1551 ceph_assert(follows == CEPH_NOSNAP);
7c673cae 1552 realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1553 snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
1554 ceph_assert(dir_follows >= realm->get_newest_seq());
7c673cae
FG
1555
1556 if (dir_follows+1 > dn->first) {
1557 snapid_t oldfirst = dn->first;
1558 dn->first = dir_follows+1;
1559 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
94b18763 1560 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
7c673cae
FG
1561 oldfirst, dir_follows);
1562 olddn->pre_dirty();
1563 dout(10) << " olddn " << *olddn << dendl;
1564 metablob->add_remote_dentry(olddn, true);
1565 mut->add_cow_dentry(olddn);
1566 // FIXME: adjust link count here? hmm.
1567
1568 if (dir_follows+1 > in->first)
11fdf7f2 1569 in->cow_old_inode(dir_follows, cow_head);
7c673cae
FG
1570 }
1571 }
1572
11fdf7f2 1573 follows = dir_follows;
7c673cae
FG
1574 if (in->snaprealm) {
1575 realm = in->snaprealm;
11fdf7f2
TL
1576 ceph_assert(follows >= realm->get_newest_seq());
1577 }
7c673cae
FG
1578 } else {
1579 realm = in->find_snaprealm();
11fdf7f2
TL
1580 if (follows == CEPH_NOSNAP) {
1581 follows = get_global_snaprealm()->get_newest_seq();
1582 ceph_assert(follows >= realm->get_newest_seq());
1583 }
7c673cae
FG
1584 }
1585
1586 // already cloned?
1587 if (follows < in->first) {
1588 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1589 return;
1590 }
1591
1592 if (!realm->has_snaps_in_range(in->first, follows)) {
1593 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1594 in->first = follows + 1;
1595 return;
1596 }
1597
11fdf7f2 1598 in->cow_old_inode(follows, cow_head);
7c673cae
FG
1599
1600 } else {
1601 SnapRealm *realm = dn->dir->inode->find_snaprealm();
11fdf7f2
TL
1602 if (follows == CEPH_NOSNAP) {
1603 follows = get_global_snaprealm()->get_newest_seq();
1604 ceph_assert(follows >= realm->get_newest_seq());
1605 }
7c673cae
FG
1606
1607 // already cloned?
1608 if (follows < dn->first) {
1609 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1610 return;
1611 }
1612
1613 // update dn.first before adding old dentry to cdir's map
1614 snapid_t oldfirst = dn->first;
1615 dn->first = follows+1;
1616
7c673cae
FG
1617 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1618 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1619 if (in)
1620 in->first = follows+1;
1621 return;
1622 }
1623
1624 dout(10) << " dn " << *dn << dendl;
1625 if (in) {
1626 CInode *oldin = cow_inode(in, follows);
1627 mut->add_cow_inode(oldin);
1628 if (pcow_inode)
1629 *pcow_inode = oldin;
11fdf7f2 1630 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
7c673cae
FG
1631 oldin->inode.version = olddn->pre_dirty();
1632 dout(10) << " olddn " << *olddn << dendl;
1633 bool need_snapflush = !oldin->client_snap_caps.empty();
11fdf7f2 1634 if (need_snapflush) {
7c673cae 1635 mut->ls->open_files.push_back(&oldin->item_open_file);
11fdf7f2
TL
1636 mds->locker->mark_need_snapflush_inode(oldin);
1637 }
7c673cae
FG
1638 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1639 mut->add_cow_dentry(olddn);
1640 } else {
11fdf7f2 1641 ceph_assert(dnl->is_remote());
94b18763 1642 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
7c673cae
FG
1643 oldfirst, follows);
1644 olddn->pre_dirty();
1645 dout(10) << " olddn " << *olddn << dendl;
1646 metablob->add_remote_dentry(olddn, true);
1647 mut->add_cow_dentry(olddn);
1648 }
1649 }
1650}
1651
1652
1653void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1654 CInode *in, snapid_t follows,
1655 CInode **pcow_inode)
1656{
1657 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1658 CDentry *dn = in->get_projected_parent_dn();
1659 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1660}
1661
1662void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1663{
1664 if (in->is_base()) {
11fdf7f2 1665 metablob->add_root(true, in);
7c673cae
FG
1666 } else {
1667 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1668 follows = in->first - 1;
1669 CDentry *dn = in->get_projected_parent_dn();
1670 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1671 journal_cow_dentry(mut, metablob, dn, follows);
1672 if (in->get_projected_inode()->is_backtrace_updated()) {
1673 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1674 in->get_previous_projected_inode()->layout.pool_id;
1675 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1676 } else {
1677 metablob->add_primary_dentry(dn, in, true);
1678 }
1679 }
1680}
1681
1682
1683
1684// nested ---------------------------------------------------------------
1685
1686void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1687 int linkunlink, SnapRealm *prealm)
1688{
1689 CDentry *parentdn = cur->get_projected_parent_dn();
94b18763 1690 CInode::mempool_inode *curi = cur->get_projected_inode();
7c673cae
FG
1691
1692 if (cur->first > first)
1693 first = cur->first;
1694
1695 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1696 << " " << *cur << dendl;
1697 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1698 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1699
1700 /*
1701 * FIXME. this incompletely propagates rstats to _old_ parents
1702 * (i.e. shortly after a directory rename). but we need full
1703 * blown hard link backpointers to make this work properly...
1704 */
1705 snapid_t floor = parentdn->first;
1706 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1707
1708 if (!prealm)
1709 prealm = parent->inode->find_snaprealm();
1710 const set<snapid_t> snaps = prealm->get_snaps();
1711
1712 if (cur->last != CEPH_NOSNAP) {
11fdf7f2
TL
1713 ceph_assert(cur->dirty_old_rstats.empty());
1714 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
7c673cae
FG
1715 if (q == snaps.end() || *q > cur->last)
1716 return;
1717 }
1718
1719 if (cur->last >= floor) {
1720 bool update = true;
1721 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1722 // rename src inode is not projected in the slave rename prep case. so we should
1723 // avoid updating the inode.
11fdf7f2
TL
1724 ceph_assert(linkunlink < 0);
1725 ceph_assert(cur->is_frozen_inode());
7c673cae
FG
1726 update = false;
1727 }
11fdf7f2 1728 _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
7c673cae
FG
1729 linkunlink, update);
1730 }
1731
11fdf7f2 1732 if (g_conf()->mds_snap_rstat) {
94b18763
FG
1733 for (const auto &p : cur->dirty_old_rstats) {
1734 auto &old = cur->old_inodes[p];
1735 snapid_t ofirst = std::max(old.first, floor);
1736 auto it = snaps.lower_bound(ofirst);
1737 if (it == snaps.end() || *it > p)
7c673cae 1738 continue;
94b18763
FG
1739 if (p >= floor)
1740 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
7c673cae
FG
1741 }
1742 }
1743 cur->dirty_old_rstats.clear();
1744}
1745
1746
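// apply a single inode's rstat delta to the parent dirfrag over the snapid
// interval [ofirst,last].  linkunlink chooses the delta: 0 means rstat minus
// accounted_rstat, <0 subtracts only the accounted_rstat, >0 adds only the
// rstat.  the loop below walks backwards from 'last', splitting
// dirty_old_rstat segments so each pass updates exactly one segment.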
94b18763 1747void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
1748 CDir *parent, int linkunlink, bool update_inode)
1749{
1750 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1751 dout(20) << " inode rstat " << inode.rstat << dendl;
1752 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1753 nest_info_t delta;
1754 if (linkunlink == 0) {
1755 delta.add(inode.rstat);
1756 delta.sub(inode.accounted_rstat);
1757 } else if (linkunlink < 0) {
1758 delta.sub(inode.accounted_rstat);
1759 } else {
1760 delta.add(inode.rstat);
1761 }
1762 dout(20) << " delta " << delta << dendl;
1763
1764 if (update_inode)
1765 inode.accounted_rstat = inode.rstat;
1766
1767 while (last >= ofirst) {
1768 /*
1769 * pick fnode version to update. at each iteration, we want to
1770 * pick a segment ending in 'last' to update. split as necessary
1771 * to make that work. then, adjust first up so that we only
1772 * update one segment at a time. then loop to cover the whole
1773 * [ofirst,last] interval.
1774 */
1775 nest_info_t *prstat;
1776 snapid_t first;
1777 fnode_t *pf = parent->get_projected_fnode();
1778 if (last == CEPH_NOSNAP) {
11fdf7f2
TL
1779 if (g_conf()->mds_snap_rstat)
1780 first = std::max(ofirst, parent->first);
7c673cae
FG
1781 else
1782 first = parent->first;
1783 prstat = &pf->rstat;
1784 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1785
1786 if (first > parent->first &&
1787 !(pf->rstat == pf->accounted_rstat)) {
1788 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1789 << parent->first << "," << (first-1) << "] "
1790 << " " << *prstat << "/" << pf->accounted_rstat
1791 << dendl;
1792 parent->dirty_old_rstat[first-1].first = parent->first;
1793 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1794 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1795 }
1796 parent->first = first;
11fdf7f2 1797 } else if (!g_conf()->mds_snap_rstat) {
7c673cae
FG
1798 // drop snapshots' rstats
1799 break;
1800 } else if (last >= parent->first) {
1801 first = parent->first;
1802 parent->dirty_old_rstat[last].first = first;
1803 parent->dirty_old_rstat[last].rstat = pf->rstat;
1804 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1805 prstat = &parent->dirty_old_rstat[last].rstat;
1806 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1807 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1808 } else {
1809 // be careful, dirty_old_rstat is a _sparse_ map.
1810 // sorry, this is ugly.
1811 first = ofirst;
1812
1813 // find any intersection with last
94b18763
FG
1814 auto it = parent->dirty_old_rstat.lower_bound(last);
1815 if (it == parent->dirty_old_rstat.end()) {
7c673cae
FG
1816 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1817 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1818 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1819 first = parent->dirty_old_rstat.rbegin()->first+1;
1820 }
1821 } else {
94b18763
FG
1822 // *it last is >= last
1823 if (it->second.first <= last) {
1824 // *it intersects [first,last]
1825 if (it->second.first < first) {
1826 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1827 parent->dirty_old_rstat[first-1] = it->second;
1828 it->second.first = first;
7c673cae 1829 }
94b18763
FG
1830 if (it->second.first > first)
1831 first = it->second.first;
1832 if (last < it->first) {
1833 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1834 parent->dirty_old_rstat[last] = it->second;
1835 it->second.first = last+1;
7c673cae
FG
1836 }
1837 } else {
94b18763
FG
1838 // *it is to the _right_ of [first,last]
1839 it = parent->dirty_old_rstat.lower_bound(first);
1840 // new *it last is >= first
1841 if (it->second.first <= last && // new *it isn't also to the right, and
1842 it->first >= first) { // it intersects our first bit,
1843 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1844 first = it->first+1;
7c673cae
FG
1845 }
1846 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1847 }
1848 }
1849 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1850 parent->dirty_old_rstat[last].first = first;
1851 prstat = &parent->dirty_old_rstat[last].rstat;
1852 }
1853
1854 // apply
1855 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
11fdf7f2 1856 ceph_assert(last >= first);
7c673cae
FG
1857 prstat->add(delta);
1858 if (update_inode)
1859 inode.accounted_rstat = inode.rstat;
1860 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1861
1862 last = first-1;
1863 }
1864}
1865
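// the reverse direction: push a dirfrag's (rstat - accounted_rstat) delta up
// into its inode, cow'ing old_inodes as needed so every snapid segment in
// [ofirst,last] receives the delta exactly once.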
1866void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1867 snapid_t ofirst, snapid_t last,
1868 CInode *pin, bool cow_head)
1869{
1870 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1871 dout(20) << " frag rstat " << rstat << dendl;
1872 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1873 nest_info_t delta = rstat;
1874 delta.sub(accounted_rstat);
1875 dout(20) << " delta " << delta << dendl;
1876
1877 while (last >= ofirst) {
94b18763 1878 CInode::mempool_inode *pi;
7c673cae
FG
1879 snapid_t first;
1880 if (last == pin->last) {
1881 pi = pin->get_projected_inode();
11fdf7f2 1882 first = std::max(ofirst, pin->first);
7c673cae 1883 if (first > pin->first) {
94b18763 1884 auto &old = pin->cow_old_inode(first-1, cow_head);
7c673cae
FG
1885 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1886 }
1887 } else {
1888 if (last >= pin->first) {
1889 first = pin->first;
1890 pin->cow_old_inode(last, cow_head);
1891 } else {
1892 // our life is easier here because old_inodes is not sparse
1893 // (although it may not begin at snapid 1)
94b18763
FG
1894 auto it = pin->old_inodes.lower_bound(last);
1895 if (it == pin->old_inodes.end()) {
7c673cae
FG
1896 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1897 break;
1898 }
94b18763 1899 first = it->second.first;
7c673cae 1900 if (first > last) {
94b18763 1901 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
7c673cae
FG
1902 //assert(p == pin->old_inodes.begin());
1903 break;
1904 }
94b18763
FG
1905 if (it->first > last) {
1906 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1907 << (last+1) << "," << it->first << "]" << dendl;
1908 pin->old_inodes[last] = it->second;
1909 it->second.first = last+1;
1910 pin->dirty_old_rstats.insert(it->first);
7c673cae
FG
1911 }
1912 }
1913 if (first < ofirst) {
1914 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1915 << first << "," << ofirst-1 << "]" << dendl;
1916 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1917 pin->dirty_old_rstats.insert(ofirst-1);
1918 pin->old_inodes[last].first = first = ofirst;
1919 }
1920 pi = &pin->old_inodes[last].inode;
1921 pin->dirty_old_rstats.insert(last);
1922 }
1923 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1924 pi->rstat.add(delta);
1925 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1926
1927 last = first-1;
1928 }
1929}
1930
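// resend quota and rstat info to clients holding caps on this inode.  to avoid
// spamming clients on every rstat change, a cap is only refreshed when usage
// has moved by more than roughly 1/16 of its last-known distance to the limit,
// when rsize has reached max_files, or when rbytes is within 1/8 of max_bytes.
// an MGatherCaps is also sent to each replica of the inode.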
a8e16298 1931void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
7c673cae 1932{
11fdf7f2
TL
1933 if (!(mds->is_active() || mds->is_stopping()))
1934 return;
1935
7c673cae
FG
1936 if (!in->is_auth() || in->is_frozen())
1937 return;
1938
94b18763 1939 auto i = in->get_projected_inode();
a8e16298
TL
1940
1941 if (!i->quota.is_enable() &&
1942 !quota_change)
7c673cae
FG
1943 return;
1944
11fdf7f2
TL
1945 // create snaprealm for quota inode (quota was set before mimic)
1946 if (!in->get_projected_srnode())
1947 mds->server->create_quota_realm(in);
7c673cae 1948
11fdf7f2
TL
1949 for (auto &p : in->client_caps) {
1950 Capability *cap = &p.second;
1951 if (cap->is_noquota())
1952 continue;
28e407b8 1953
11fdf7f2 1954 if (exclude_ct >= 0 && exclude_ct != p.first)
28e407b8
AA
1955 goto update;
1956
7c673cae
FG
1957 if (cap->last_rbytes == i->rstat.rbytes &&
1958 cap->last_rsize == i->rstat.rsize())
1959 continue;
1960
1961 if (i->quota.max_files > 0) {
1962 if (i->rstat.rsize() >= i->quota.max_files)
1963 goto update;
1964
1965 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
1966 abs(cap->last_rsize - i->rstat.rsize()))
1967 goto update;
1968 }
1969
1970 if (i->quota.max_bytes > 0) {
1971 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
1972 goto update;
1973
1974 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
1975 abs(cap->last_rbytes - i->rstat.rbytes))
1976 goto update;
1977 }
1978
1979 continue;
1980
1981update:
1982 cap->last_rsize = i->rstat.rsize();
1983 cap->last_rbytes = i->rstat.rbytes;
1984
11fdf7f2 1985 auto msg = MClientQuota::create();
7c673cae
FG
1986 msg->ino = in->ino();
1987 msg->rstat = i->rstat;
1988 msg->quota = i->quota;
11fdf7f2 1989 mds->send_message_client_counted(msg, cap->get_session());
7c673cae 1990 }
181888fb 1991 for (const auto &it : in->get_replicas()) {
11fdf7f2 1992 auto msg = MGatherCaps::create();
7c673cae 1993 msg->ino = in->ino();
181888fb 1994 mds->send_message_mds(msg, it.first);
7c673cae
FG
1995 }
1996}
1997
1998/*
1999 * NOTE: we _have_ to delay the scatter if we are called during a
2000 * rejoin, because we can't twiddle locks between when the
2001 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2002 * (no requests), and a survivor acks immediately. _except_ that
2003 * (no requests), and a survivor acks immediately. _except_ that
2004 * during rejoin_(weak|strong) processing, we may complete a lock
2005 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2006 * scatterlock state in that case or the lock states will get out of
2007 * sync between the auth and replica.
2008 *
2009 * the simple solution is to never do the scatter here. instead, put
2010 * the scatterlock on a list if it isn't already wrlockable. this is
2011 * probably the best plan anyway, since we avoid too many
2012 * scatters/locks under normal usage.
2013 */
2014/*
2015 * some notes on dirlock/nestlock scatterlock semantics:
2016 *
2017 * the fragstat (dirlock) will never be updated without
2018 * dirlock+nestlock wrlock held by the caller.
2019 *
2020 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2021 * data is pushed up the tree. this could be changed with some
2022 * restructuring here, but in its current form we ensure that the
2023 * fragstat+rstat _always_ reflect an accurate summation over the dir
2024 * frag, which is nice. and, we only need to track frags that need to
2025 * be nudged (and not inodes with pending rstat changes that need to
2026 * be pushed into the frag). a consequence of this is that the
2027 * accounted_rstat on scatterlock sync may not match our current
2028 * rstat. this is normal and expected.
2029 */
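// predirty_journal_parents projects dirstat/rstat changes for 'in' up through
// its parent dirfrags and inodes and stages them in the metablob.
// PREDIRTY_PRIMARY means 'in' is linked by a primary dentry, so rstats
// propagate upward; PREDIRTY_DIR means the parent dirfrag's fragstat
// (mtime/size) is updated too; PREDIRTY_SHALLOW hints that only the immediate
// parent needs the update.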
2030void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2031 CInode *in, CDir *parent,
2032 int flags, int linkunlink,
2033 snapid_t cfollows)
2034{
2035 bool primary_dn = flags & PREDIRTY_PRIMARY;
2036 bool do_parent_mtime = flags & PREDIRTY_DIR;
2037 bool shallow = flags & PREDIRTY_SHALLOW;
2038
11fdf7f2 2039 ceph_assert(mds->mdlog->entry_is_open());
7c673cae
FG
2040
2041 // make sure stamp is set
2042 if (mut->get_mds_stamp() == utime_t())
2043 mut->set_mds_stamp(ceph_clock_now());
2044
2045 if (in->is_base())
2046 return;
2047
2048 dout(10) << "predirty_journal_parents"
2049 << (do_parent_mtime ? " do_parent_mtime":"")
2050 << " linkunlink=" << linkunlink
2051 << (primary_dn ? " primary_dn":" remote_dn")
2052 << (shallow ? " SHALLOW":"")
2053 << " follows " << cfollows
2054 << " " << *in << dendl;
2055
2056 if (!parent) {
11fdf7f2 2057 ceph_assert(primary_dn);
7c673cae
FG
2058 parent = in->get_projected_parent_dn()->get_dir();
2059 }
2060
2061 if (flags == 0 && linkunlink == 0) {
2062 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2063 blob->add_dir_context(parent);
2064 return;
2065 }
2066
2067 // build list of inodes to wrlock, dirty, and update
2068 list<CInode*> lsi;
2069 CInode *cur = in;
2070 CDentry *parentdn = NULL;
2071 bool first = true;
2072 while (parent) {
2073 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
11fdf7f2 2074 ceph_assert(parent->is_auth());
7c673cae
FG
2075
2076 // opportunistically adjust parent dirfrag
2077 CInode *pin = parent->get_inode();
2078
2079 // inode -> dirfrag
2080 mut->auth_pin(parent);
2081 mut->add_projected_fnode(parent);
2082
2083 fnode_t *pf = parent->project_fnode();
2084 pf->version = parent->pre_dirty();
2085
2086 if (do_parent_mtime || linkunlink) {
11fdf7f2
TL
2087 ceph_assert(mut->is_wrlocked(&pin->filelock));
2088 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2089 ceph_assert(cfollows == CEPH_NOSNAP);
7c673cae
FG
2090
2091 // update stale fragstat/rstat?
2092 parent->resync_accounted_fragstat();
2093 parent->resync_accounted_rstat();
2094
2095 if (do_parent_mtime) {
2096 pf->fragstat.mtime = mut->get_op_stamp();
2097 pf->fragstat.change_attr++;
2098 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2099 if (pf->fragstat.mtime > pf->rstat.rctime) {
2100 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2101 pf->rstat.rctime = pf->fragstat.mtime;
2102 } else {
2103 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2104 }
2105 }
2106 if (linkunlink) {
2107 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2108 if (in->is_dir()) {
2109 pf->fragstat.nsubdirs += linkunlink;
2110 //pf->rstat.rsubdirs += linkunlink;
2111 } else {
2112 pf->fragstat.nfiles += linkunlink;
2113 //pf->rstat.rfiles += linkunlink;
2114 }
2115 }
2116 }
2117
2118 // rstat
2119 if (!primary_dn) {
2120 // don't update parent this pass
2121 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2122 pin->versionlock.can_wrlock())) {
2123 dout(20) << " unwritable parent nestlock " << pin->nestlock
2124 << ", marking dirty rstat on " << *cur << dendl;
2125 cur->mark_dirty_rstat();
2126 } else {
2127 // if we don't hold a wrlock reference on this nestlock, take one,
2128 // because we are about to write into the dirfrag fnode and that needs
2129 // to commit before the lock can cycle.
2130 if (linkunlink) {
11fdf7f2 2131 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
7c673cae
FG
2132 }
2133
11fdf7f2 2134 if (!mut->is_wrlocked(&pin->nestlock)) {
7c673cae
FG
2135 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2136 mds->locker->wrlock_force(&pin->nestlock, mut);
2137 }
2138
2139 // now we can project the inode rstat diff into the dirfrag
2140 SnapRealm *prealm = pin->find_snaprealm();
2141
2142 snapid_t follows = cfollows;
2143 if (follows == CEPH_NOSNAP)
2144 follows = prealm->get_newest_seq();
2145
2146 snapid_t first = follows+1;
2147
2148 // first, if the frag is stale, bring it back in sync.
2149 parent->resync_accounted_rstat();
2150
2151 // now push inode rstats into frag
2152 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2153 cur->clear_dirty_rstat();
2154 }
2155
2156 bool stop = false;
2157 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2158 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2159 stop = true;
2160 }
2161
2162 // delay propagating until later?
2163 if (!stop && !first &&
11fdf7f2 2164 g_conf()->mds_dirstat_min_interval > 0) {
7c673cae 2165 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
11fdf7f2 2166 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
7c673cae 2167 dout(10) << "predirty_journal_parents last prop " << since_last_prop
11fdf7f2 2168 << " < " << g_conf()->mds_dirstat_min_interval
7c673cae
FG
2169 << ", stopping" << dendl;
2170 stop = true;
2171 } else {
2172 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2173 }
2174 }
2175
2176 // can cast only because i'm passing nowait=true in the sole user
2177 MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
2178 if (!stop &&
11fdf7f2 2179 !mut->is_wrlocked(&pin->nestlock) &&
7c673cae
FG
2180 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2181 //true
2182 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
2183 )) { // ** do not initiate.. see above comment **
2184 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2185 << " on " << *pin << dendl;
2186 stop = true;
2187 }
2188 if (stop) {
2189 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2190 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2191 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2192 mut->add_updated_lock(&pin->nestlock);
2193 if (do_parent_mtime || linkunlink) {
2194 mds->locker->mark_updated_scatterlock(&pin->filelock);
2195 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2196 mut->add_updated_lock(&pin->filelock);
2197 }
2198 break;
2199 }
11fdf7f2 2200 if (!mut->is_wrlocked(&pin->versionlock))
7c673cae
FG
2201 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2202
11fdf7f2 2203 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
7c673cae
FG
2204
2205 pin->last_dirstat_prop = mut->get_mds_stamp();
2206
2207 // dirfrag -> diri
2208 mut->auth_pin(pin);
2209 mut->add_projected_inode(pin);
2210 lsi.push_front(pin);
2211
2212 pin->pre_cow_old_inode(); // avoid cow mayhem!
2213
94b18763
FG
2214 auto &pi = pin->project_inode();
2215 pi.inode.version = pin->pre_dirty();
7c673cae
FG
2216
2217 // dirstat
2218 if (do_parent_mtime || linkunlink) {
2219 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2220 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2221 bool touched_mtime = false, touched_chattr = false;
94b18763 2222 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
7c673cae
FG
2223 pf->accounted_fragstat = pf->fragstat;
2224 if (touched_mtime)
94b18763 2225 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
7c673cae 2226 if (touched_chattr)
94b18763
FG
2227 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2228 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
7c673cae
FG
2229
2230 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2231 if (pi.inode.dirstat.size() < 0)
11fdf7f2 2232 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
94b18763 2233 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
7c673cae 2234 mds->clog->error() << "unmatched fragstat size on single dirfrag "
94b18763 2235 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
7c673cae
FG
2236 << ", dirfrag has " << pf->fragstat;
2237
2238 // trust the dirfrag for now
94b18763 2239 pi.inode.dirstat = pf->fragstat;
7c673cae 2240
11fdf7f2 2241 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
7c673cae
FG
2242 }
2243 }
2244 }
2245
2246 /*
2247 * the rule here is to follow the _oldest_ parent with dirty rstat
2248 * data. if we don't propagate all data, we add ourselves to the
2249 * nudge list. that way all rstat data will (eventually) get
2250 * pushed up the tree.
2251 *
2252 * actually, no. for now, silently drop rstats for old parents. we need
2253 * hard link backpointers to do the above properly.
2254 */
2255
2256 // stop?
2257 if (pin->is_base())
2258 break;
2259 parentdn = pin->get_projected_parent_dn();
11fdf7f2 2260 ceph_assert(parentdn);
7c673cae
FG
2261
2262 // rstat
2263 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2264
2265 // first, if the frag is stale, bring it back in sync.
2266 parent->resync_accounted_rstat();
2267
11fdf7f2 2268 if (g_conf()->mds_snap_rstat) {
94b18763
FG
2269 for (auto &p : parent->dirty_old_rstat) {
2270 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2271 p.first, pin, true);
2272 }
7c673cae
FG
2273 }
2274 parent->dirty_old_rstat.clear();
2275 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2276
2277 pf->accounted_rstat = pf->rstat;
2278
2279 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
94b18763 2280 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
7c673cae 2281 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
94b18763 2282 << parent->dirfrag() << ", inode has " << pi.inode.rstat
7c673cae
FG
2283 << ", dirfrag has " << pf->rstat;
2284
2285 // trust the dirfrag for now
94b18763 2286 pi.inode.rstat = pf->rstat;
7c673cae 2287
11fdf7f2 2288 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
7c673cae
FG
2289 }
2290 }
2291
2292 parent->check_rstats();
2293 broadcast_quota_to_client(pin);
2294 // next parent!
2295 cur = pin;
2296 parent = parentdn->get_dir();
2297 linkunlink = 0;
2298 do_parent_mtime = false;
2299 primary_dn = true;
2300 first = false;
2301 }
2302
2303 // now, stick it in the blob
11fdf7f2
TL
2304 ceph_assert(parent);
2305 ceph_assert(parent->is_auth());
7c673cae
FG
2306 blob->add_dir_context(parent);
2307 blob->add_dir(parent, true);
2308 for (list<CInode*>::iterator p = lsi.begin();
2309 p != lsi.end();
2310 ++p) {
2311 CInode *cur = *p;
2312 journal_dirty_inode(mut.get(), blob, cur);
2313 }
2314
2315}
2316
2317
2318
2319
2320
2321// ===================================
2322// slave requests
2323
2324
2325/*
2326 * some handlers for master requests with slaves. we need to make
2327 * sure slaves journal commits before we forget we mastered them and
2328 * remove them from the uncommitted_masters map (used during recovery
2329 * to commit|abort slaves).
2330 */
2331struct C_MDC_CommittedMaster : public MDCacheLogContext {
2332 metareqid_t reqid;
2333 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2334 void finish(int r) override {
2335 mdcache->_logged_master_commit(reqid);
2336 }
2337};
2338
2339void MDCache::log_master_commit(metareqid_t reqid)
2340{
2341 dout(10) << "log_master_commit " << reqid << dendl;
2342 uncommitted_masters[reqid].committing = true;
2343 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2344 new C_MDC_CommittedMaster(this, reqid));
2345}
2346
2347void MDCache::_logged_master_commit(metareqid_t reqid)
2348{
2349 dout(10) << "_logged_master_commit " << reqid << dendl;
11fdf7f2 2350 ceph_assert(uncommitted_masters.count(reqid));
7c673cae
FG
2351 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2352 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2353 uncommitted_masters.erase(reqid);
2354}
2355
2356// while active...
2357
2358void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2359{
2360 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
11fdf7f2 2361 ceph_assert(uncommitted_masters.count(r));
7c673cae
FG
2362 uncommitted_masters[r].slaves.erase(from);
2363 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2364 log_master_commit(r);
2365}
2366
2367void MDCache::logged_master_update(metareqid_t reqid)
2368{
2369 dout(10) << "logged_master_update " << reqid << dendl;
11fdf7f2 2370 ceph_assert(uncommitted_masters.count(reqid));
7c673cae 2371 uncommitted_masters[reqid].safe = true;
11fdf7f2
TL
2372 auto p = pending_masters.find(reqid);
2373 if (p != pending_masters.end()) {
2374 pending_masters.erase(p);
7c673cae
FG
2375 if (pending_masters.empty())
2376 process_delayed_resolve();
2377 }
2378}
2379
2380/*
2381 * Master may crash after receiving all slaves' commit acks, but before journalling
2382 * the final commit. Slaves may crash after journalling the slave commit, but before
2383 * sending commit ack to the master. Commit masters with no uncommitted slave when
2384 * resolve finishes.
2385 */
2386void MDCache::finish_committed_masters()
2387{
2388 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2389 p != uncommitted_masters.end();
2390 ++p) {
2391 p->second.recovering = false;
2392 if (!p->second.committing && p->second.slaves.empty()) {
2393 dout(10) << "finish_committed_masters " << p->first << dendl;
2394 log_master_commit(p->first);
2395 }
2396 }
2397}
2398
2399/*
2400 * at end of resolve... we must journal a commit|abort for all slave
2401 * updates, before moving on.
2402 *
2403 * this is so that the master can safely journal ECommitted on ops it
2404 * masters when it reaches up:active (all other recovering nodes must
2405 * complete resolve before that happens).
2406 */
2407struct C_MDC_SlaveCommit : public MDCacheLogContext {
2408 mds_rank_t from;
2409 metareqid_t reqid;
2410 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2411 void finish(int r) override {
2412 mdcache->_logged_slave_commit(from, reqid);
2413 }
2414};
2415
2416void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2417{
2418 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2419
2420 // send a message
11fdf7f2 2421 auto req = MMDSSlaveRequest::create(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
7c673cae
FG
2422 mds->send_message_mds(req, from);
2423}
2424
2425
2426
2427
2428
2429
2430// ====================================================================
2431// import map, recovery
2432
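// helper for create_subtree_map(): move bound 'df' from oldparent's bound list
// to newparent's in the journaled subtree map (whichever of the two is present).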
2433void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2434 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2435{
2436 if (subtrees.count(oldparent)) {
2437 vector<dirfrag_t>& v = subtrees[oldparent];
2438 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2439 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2440 if (*it == df) {
2441 v.erase(it);
2442 break;
2443 }
2444 }
2445 if (subtrees.count(newparent)) {
2446 vector<dirfrag_t>& v = subtrees[newparent];
2447 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2448 v.push_back(df);
2449 }
2450}
2451
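// build an ESubtreeMap journal event covering every subtree we are auth for,
// plus its bounds and a spanning dir context back to the root.  mid-migration
// subtrees are flagged ambiguous, projected subtree renames are applied, and
// nested subtrees are simplified so replay can compare against the live map
// directly.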
2452ESubtreeMap *MDCache::create_subtree_map()
2453{
2454 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2455 << num_subtrees_fullauth() << " fullauth"
2456 << dendl;
2457
2458 show_subtrees();
2459
2460 ESubtreeMap *le = new ESubtreeMap();
2461 mds->mdlog->_start_entry(le);
2462
2463 map<dirfrag_t, CDir*> dirs_to_add;
2464
2465 if (myin) {
2466 CDir* mydir = myin->get_dirfrag(frag_t());
2467 dirs_to_add[mydir->dirfrag()] = mydir;
2468 }
2469
2470 // include all auth subtrees, and their bounds.
2471 // and a spanning tree to tie it to the root.
2472 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2473 p != subtrees.end();
2474 ++p) {
2475 CDir *dir = p->first;
2476
2477 // journal subtree as "ours" if we are
2478 // me, -2
2479 // me, me
2480 // me, !me (may be importing and ambiguous!)
2481
2482 // so not
2483 // !me, *
2484 if (dir->get_dir_auth().first != mds->get_nodeid())
2485 continue;
2486
2487 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2488 my_ambiguous_imports.count(dir->dirfrag())) {
2489 dout(15) << " ambig subtree " << *dir << dendl;
2490 le->ambiguous_subtrees.insert(dir->dirfrag());
2491 } else {
2492 dout(15) << " subtree " << *dir << dendl;
2493 }
2494
2495 dirs_to_add[dir->dirfrag()] = dir;
2496 le->subtrees[dir->dirfrag()].clear();
2497
2498
2499 // bounds
2500 for (set<CDir*>::iterator q = p->second.begin();
2501 q != p->second.end();
2502 ++q) {
2503 CDir *bound = *q;
2504 dout(15) << " subtree bound " << *bound << dendl;
2505 dirs_to_add[bound->dirfrag()] = bound;
2506 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2507 }
2508 }
2509
2510 // apply projected renames
2511 for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
2512 p != projected_subtree_renames.end();
2513 ++p) {
2514 for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2515 CInode *diri = p->first;
2516 CDir *olddir = q->first;
2517 CDir *newdir = q->second;
2518 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2519
2520 list<CDir*> dfls;
2521 diri->get_dirfrags(dfls);
2522 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
2523 CDir *dir = *p;
2524 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2525 CDir *oldparent = get_projected_subtree_root(olddir);
2526 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2527 CDir *newparent = get_projected_subtree_root(newdir);
2528 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2529
2530 if (oldparent == newparent) {
2531 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2532 << oldparent->dirfrag() << dendl;
2533 continue;
2534 }
2535
2536 if (dir->is_subtree_root()) {
2537 if (le->subtrees.count(newparent->dirfrag()) &&
2538 oldparent->get_dir_auth() != newparent->get_dir_auth())
2539 dirs_to_add[dir->dirfrag()] = dir;
2540 // children are fine. change parent.
2541 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2542 le->subtrees);
2543 } else {
2544 // mid-subtree.
2545
2546 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2547 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2548 // if oldparent is auth, subtree is mine; include it.
2549 if (le->subtrees.count(oldparent->dirfrag())) {
2550 dirs_to_add[dir->dirfrag()] = dir;
2551 le->subtrees[dir->dirfrag()].clear();
2552 }
2553 // if newparent is auth, subtree is a new bound
2554 if (le->subtrees.count(newparent->dirfrag())) {
2555 dirs_to_add[dir->dirfrag()] = dir;
2556 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2557 }
2558 newparent = dir;
2559 }
2560
2561 // see if any old bounds move to the new parent.
2562 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2563 p != subtrees[oldparent].end();
2564 ++p) {
2565 CDir *bound = *p;
2566 if (dir->contains(bound->get_parent_dir()))
2567 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2568 le->subtrees);
2569 }
2570 }
2571 }
2572 }
2573 }
2574
2575 // simplify the journaled map. our in memory map may have more
2576 // subtrees than needed due to migrations that are just getting
2577 // started or just completing. but on replay, the "live" map will
2578 // be simple and we can do a straight comparison.
2579 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2580 if (le->ambiguous_subtrees.count(p->first))
2581 continue;
2582 unsigned i = 0;
2583 while (i < p->second.size()) {
2584 dirfrag_t b = p->second[i];
2585 if (le->subtrees.count(b) &&
2586 le->ambiguous_subtrees.count(b) == 0) {
2587 vector<dirfrag_t>& bb = le->subtrees[b];
2588 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2589 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2590 p->second.push_back(*r);
2591 dirs_to_add.erase(b);
2592 le->subtrees.erase(b);
2593 p->second.erase(p->second.begin() + i);
2594 } else {
2595 ++i;
2596 }
2597 }
2598 }
2599
94b18763 2600 for (auto &p : dirs_to_add) {
7c673cae
FG
2601 CDir *dir = p.second;
2602 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2603 le->metablob.add_dir(dir, false);
2604 }
2605
2606 dout(15) << " subtrees " << le->subtrees << dendl;
2607 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2608
2609 //le->metablob.print(cout);
2610 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2611 return le;
2612}
2613
2614void MDCache::dump_resolve_status(Formatter *f) const
2615{
2616 f->open_object_section("resolve_status");
2617 f->dump_stream("resolve_gather") << resolve_gather;
2618 f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
2619 f->close_section();
2620}
2621
11fdf7f2 2622void MDCache::resolve_start(MDSContext *resolve_done_)
7c673cae
FG
2623{
2624 dout(10) << "resolve_start" << dendl;
11fdf7f2 2625 ceph_assert(!resolve_done);
7c673cae
FG
2626 resolve_done.reset(resolve_done_);
2627
2628 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2629 // if we don't have the root dir, adjust it to UNKNOWN. during
2630 // resolve we want mds0 to explicitly claim the portion of it that
2631 // it owns, so that anything beyond its bounds get left as
2632 // unknown.
2633 CDir *rootdir = root->get_dirfrag(frag_t());
2634 if (rootdir)
2635 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2636 }
2637 resolve_gather = recovery_set;
11fdf7f2
TL
2638
2639 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
7c673cae
FG
2640}
2641
2642void MDCache::send_resolves()
2643{
2644 send_slave_resolves();
11fdf7f2
TL
2645
2646 if (!resolve_done) {
2647 // I'm survivor: refresh snap cache
2648 mds->snapclient->sync(
2649 new MDSInternalContextWrapper(mds,
2650 new FunctionContext([this](int r) {
2651 maybe_finish_slave_resolve();
2652 })
2653 )
2654 );
2655 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2656 return;
2657 }
7c673cae
FG
2658 if (!resolve_ack_gather.empty()) {
2659 dout(10) << "send_resolves still waiting for resolve ack from ("
2660 << resolve_ack_gather << ")" << dendl;
2661 return;
2662 }
11fdf7f2 2663 if (!resolve_need_rollback.empty()) {
7c673cae 2664 dout(10) << "send_resolves still waiting for rollback to commit on ("
11fdf7f2 2665 << resolve_need_rollback << ")" << dendl;
7c673cae
FG
2666 return;
2667 }
11fdf7f2 2668
7c673cae
FG
2669 send_subtree_resolves();
2670}
2671
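// tell each master about our uncommitted slave updates.  while in resolve we
// walk uncommitted_slave_updates; otherwise (as a survivor) we walk
// active_requests for slave requests whose prepare was journaled or which are
// committing, re-sending the cap export blob for rename inode exporters.
// every target rank is added to resolve_ack_gather.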
2672void MDCache::send_slave_resolves()
2673{
2674 dout(10) << "send_slave_resolves" << dendl;
2675
11fdf7f2 2676 map<mds_rank_t, MMDSResolve::ref> resolves;
7c673cae
FG
2677
2678 if (mds->is_resolve()) {
2679 for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
2680 p != uncommitted_slave_updates.end();
2681 ++p) {
11fdf7f2 2682 resolves[p->first] = MMDSResolve::create();
7c673cae
FG
2683 for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
2684 q != p->second.end();
2685 ++q) {
2686 dout(10) << " including uncommitted " << q->first << dendl;
2687 resolves[p->first]->add_slave_request(q->first, false);
2688 }
2689 }
2690 } else {
2691 set<mds_rank_t> resolve_set;
2692 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2693 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2694 p != active_requests.end();
2695 ++p) {
2696 MDRequestRef& mdr = p->second;
2697 if (!mdr->is_slave())
2698 continue;
2699 if (!mdr->slave_did_prepare() && !mdr->committing) {
2700 continue;
2701 }
2702 mds_rank_t master = mdr->slave_to_mds;
2703 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2704 dout(10) << " including uncommitted " << *mdr << dendl;
2705 if (!resolves.count(master))
11fdf7f2 2706 resolves[master] = MMDSResolve::create();
7c673cae
FG
2707 if (!mdr->committing &&
2708 mdr->has_more() && mdr->more()->is_inode_exporter) {
2709 // re-send cap exports
2710 CInode *in = mdr->more()->rename_inode;
2711 map<client_t, Capability::Export> cap_map;
2712 in->export_client_caps(cap_map);
2713 bufferlist bl;
11fdf7f2
TL
2714 encode(in->ino(), bl);
2715 encode(cap_map, bl);
7c673cae
FG
2716 resolves[master]->add_slave_request(p->first, bl);
2717 } else {
2718 resolves[master]->add_slave_request(p->first, mdr->committing);
2719 }
2720 }
2721 }
2722 }
2723
11fdf7f2
TL
2724 for (auto &p : resolves) {
2725 dout(10) << "sending slave resolve to mds." << p.first << dendl;
2726 mds->send_message_mds(p.second, p.first);
2727 resolve_ack_gather.insert(p.first);
7c673cae
FG
2728 }
2729}
2730
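// advertise the subtrees we claim (and our ambiguous imports) to each peer
// that is resolving, or to everyone if we are resolving ourselves.  deferred
// while imports/exports are still in flight; claimed bounds are simplified the
// same way as in create_subtree_map(), and the message also carries our
// pending snaptable commits.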
2731void MDCache::send_subtree_resolves()
2732{
2733 dout(10) << "send_subtree_resolves" << dendl;
2734
2735 if (migrator->is_exporting() || migrator->is_importing()) {
2736 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2737 migrator->show_importing();
2738 migrator->show_exporting();
2739 resolves_pending = true;
2740 return; // not now
2741 }
2742
11fdf7f2 2743 map<mds_rank_t, MMDSResolve::ref> resolves;
7c673cae
FG
2744 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2745 p != recovery_set.end();
2746 ++p) {
2747 if (*p == mds->get_nodeid())
2748 continue;
2749 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
11fdf7f2 2750 resolves[*p] = MMDSResolve::create();
7c673cae
FG
2751 }
2752
2753 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2754 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2755
2756 // known
2757 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2758 p != subtrees.end();
2759 ++p) {
2760 CDir *dir = p->first;
2761
2762 // only our subtrees
2763 if (dir->authority().first != mds->get_nodeid())
2764 continue;
2765
2766 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2767 continue; // we'll add it below
2768
2769 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2770 // ambiguous (mid-import)
2771 set<CDir*> bounds;
2772 get_subtree_bounds(dir, bounds);
2773 vector<dirfrag_t> dfls;
2774 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2775 dfls.push_back((*q)->dirfrag());
2776
2777 my_ambig_imports[dir->dirfrag()] = dfls;
2778 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2779 } else {
2780 // not ambiguous.
11fdf7f2
TL
2781 for (auto &q : resolves) {
2782 resolves[q.first]->add_subtree(dir->dirfrag());
2783 }
7c673cae
FG
2784 // bounds too
2785 vector<dirfrag_t> dfls;
2786 for (set<CDir*>::iterator q = subtrees[dir].begin();
2787 q != subtrees[dir].end();
2788 ++q) {
2789 CDir *bound = *q;
2790 dfls.push_back(bound->dirfrag());
2791 }
2792
2793 my_subtrees[dir->dirfrag()] = dfls;
2794 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2795 }
2796 }
2797
2798 // ambiguous
2799 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2800 p != my_ambiguous_imports.end();
2801 ++p) {
2802 my_ambig_imports[p->first] = p->second;
2803 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2804 }
2805
2806 // simplify the claimed subtree.
2807 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2808 unsigned i = 0;
2809 while (i < p->second.size()) {
2810 dirfrag_t b = p->second[i];
2811 if (my_subtrees.count(b)) {
2812 vector<dirfrag_t>& bb = my_subtrees[b];
2813 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2814 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2815 p->second.push_back(*r);
2816 my_subtrees.erase(b);
2817 p->second.erase(p->second.begin() + i);
2818 } else {
2819 ++i;
2820 }
2821 }
2822 }
2823
2824 // send
11fdf7f2
TL
2825 for (auto &p : resolves) {
2826 const MMDSResolve::ref &m = p.second;
2827 if (mds->is_resolve()) {
2828 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2829 } else {
2830 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2831 }
7c673cae
FG
2832 m->subtrees = my_subtrees;
2833 m->ambiguous_imports = my_ambig_imports;
11fdf7f2
TL
2834 dout(10) << "sending subtree resolve to mds." << p.first << dendl;
2835 mds->send_message_mds(m, p.first);
7c673cae
FG
2836 }
2837 resolves_pending = false;
2838}
2839
11fdf7f2
TL
2840void MDCache::maybe_finish_slave_resolve() {
2841 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2842 // snap cache is synced, or I'm in resolve state
2843 if (mds->snapclient->is_synced() || resolve_done)
2844 send_subtree_resolves();
2845 process_delayed_resolve();
2846 }
2847}
2848
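// a peer failed: fold it back into the resolve/rejoin gather sets, clean up
// slave and master requests that involved it (aborting unprepared slave
// requests, noting ambiguous slave updates for rename witnesses), tidy up
// in-flight fragment operations, and kick open_ino/find_ino waiters.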
7c673cae
FG
2849void MDCache::handle_mds_failure(mds_rank_t who)
2850{
2851 dout(7) << "handle_mds_failure mds." << who << dendl;
2852
2853 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2854
2855 resolve_gather.insert(who);
2856 discard_delayed_resolve(who);
2857 ambiguous_slave_updates.erase(who);
2858
2859 rejoin_gather.insert(who);
2860 rejoin_sent.erase(who); // i need to send another
31f18b77 2861 rejoin_ack_sent.erase(who); // i need to send another
7c673cae
FG
2862 rejoin_ack_gather.erase(who); // i'll need/get another.
2863
2864 dout(10) << " resolve_gather " << resolve_gather << dendl;
2865 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2866 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2867 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2868 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2869
2870
2871 // tell the migrator too.
2872 migrator->handle_mds_failure_or_stop(who);
2873
224ce89b
WB
2874 // tell the balancer too.
2875 mds->balancer->handle_mds_failure(who);
2876
7c673cae
FG
2877 // clean up any requests slave to/from this node
2878 list<MDRequestRef> finish;
2879 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2880 p != active_requests.end();
2881 ++p) {
2882 MDRequestRef& mdr = p->second;
2883 // slave to the failed node?
2884 if (mdr->slave_to_mds == who) {
2885 if (mdr->slave_did_prepare()) {
2886 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2887 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2888 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2889
2890 if (!mdr->more()->waiting_on_slave.empty()) {
11fdf7f2 2891 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae 2892 // will rollback, no need to wait
91327a77 2893 mdr->reset_slave_request();
7c673cae
FG
2894 mdr->more()->waiting_on_slave.clear();
2895 }
2896 } else if (!mdr->committing) {
2897 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2898 if (mdr->slave_request || mdr->slave_rolling_back())
2899 mdr->aborted = true;
2900 else
2901 finish.push_back(mdr);
2902 }
2903 }
2904
2905 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2906 if (mdr->more()->waiting_on_slave.count(who)) {
11fdf7f2 2907 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
7c673cae
FG
2908 dout(10) << " slave request " << *mdr << " no longer needs rename notify ack from mds."
2909 << who << dendl;
2910 mdr->more()->waiting_on_slave.erase(who);
2911 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2912 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2913 }
2914
2915 if (mdr->more()->srcdn_auth_mds == who &&
2916 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2917 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2918 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2919 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2920 }
31f18b77 2921 } else if (mdr->slave_request) {
11fdf7f2 2922 const MMDSSlaveRequest::const_ref &slave_req = mdr->slave_request;
31f18b77
FG
2923 // FIXME: Slave rename request can arrive after we notice mds failure.
2924 // This can cause mds to crash (does not affect integrity of FS).
2925 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2926 slave_req->srcdn_auth == who)
2927 slave_req->mark_interrupted();
7c673cae
FG
2928 }
2929
2930 // failed node is slave?
2931 if (mdr->is_master() && !mdr->committing) {
2932 if (mdr->more()->srcdn_auth_mds == who) {
2933 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2934 << who << " to recover" << dendl;
11fdf7f2 2935 ceph_assert(mdr->more()->witnessed.count(who) == 0);
7c673cae
FG
2936 if (mdr->more()->is_ambiguous_auth)
2937 mdr->clear_ambiguous_auth();
2938 // rename srcdn's auth mds failed, all witnesses will rollback
2939 mdr->more()->witnessed.clear();
2940 pending_masters.erase(p->first);
2941 }
2942
2943 if (mdr->more()->witnessed.count(who)) {
2944 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2945 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2946 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2947 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2948 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2949 // until either the request is committing or the slave also fails.
11fdf7f2 2950 ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
7c673cae
FG
2951 pending_masters.insert(p->first);
2952 } else {
2953 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2954 << who << dendl;
2955 if (srcdn_auth >= 0)
11fdf7f2 2956 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
7c673cae
FG
2957
2958 // discard this peer's prepare (if any)
2959 mdr->more()->witnessed.erase(who);
2960 }
2961 }
2962
2963 if (mdr->more()->waiting_on_slave.count(who)) {
2964 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2965 << " to recover" << dendl;
2966 // retry request when peer recovers
2967 mdr->more()->waiting_on_slave.erase(who);
2968 if (mdr->more()->waiting_on_slave.empty())
2969 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
2970 }
2971
2972 if (mdr->locking && mdr->locking_target_mds == who)
2973 mdr->finish_locking(mdr->locking);
2974 }
2975 }
2976
2977 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2978 p != uncommitted_masters.end();
2979 ++p) {
2980 // The failed MDS may have already committed the slave update
2981 if (p->second.slaves.count(who)) {
2982 p->second.recovering = true;
2983 p->second.slaves.erase(who);
2984 }
2985 }
2986
2987 while (!finish.empty()) {
2988 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
2989 request_finish(finish.front());
2990 finish.pop_front();
2991 }
2992
2993 kick_find_ino_peers(who);
2994 kick_open_ino_peers(who);
2995
2996 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
2997 p != fragments.end(); ) {
2998 dirfrag_t df = p->first;
2999 fragment_info_t& info = p->second;
a8e16298
TL
3000
3001 if (info.is_fragmenting()) {
3002 if (info.notify_ack_waiting.erase(who) &&
3003 info.notify_ack_waiting.empty()) {
3004 fragment_drop_locks(info);
3005 fragment_maybe_finish(p++);
3006 } else {
3007 ++p;
3008 }
7c673cae 3009 continue;
a8e16298
TL
3010 }
3011
3012 ++p;
7c673cae
FG
3013 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3014 list<CDir*> dirs;
3015 info.dirs.swap(dirs);
3016 fragments.erase(df);
3017 fragment_unmark_unfreeze_dirs(dirs);
3018 }
3019
3020 // MDCache::shutdown_export_strays() always exports strays to mds.0
3021 if (who == mds_rank_t(0))
f64942e4 3022 shutdown_exporting_strays.clear();
7c673cae
FG
3023
3024 show_subtrees();
3025}
3026
3027/*
3028 * handle_mds_recovery - called on another node's transition
3029 * from resolve -> active.
3030 */
3031void MDCache::handle_mds_recovery(mds_rank_t who)
3032{
3033 dout(7) << "handle_mds_recovery mds." << who << dendl;
3034
3035 // exclude all discover waiters. kick_discovers() will do the job
3036 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3037 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3038
11fdf7f2 3039 MDSContext::vec waiters;
7c673cae
FG
3040
3041 // wake up any waiters in their subtrees
3042 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3043 p != subtrees.end();
3044 ++p) {
3045 CDir *dir = p->first;
3046
3047 if (dir->authority().first != who ||
3048 dir->authority().second == mds->get_nodeid())
3049 continue;
11fdf7f2 3050 ceph_assert(!dir->is_auth());
7c673cae
FG
3051
3052 // wake any waiters
3053 list<CDir*> q;
3054 q.push_back(dir);
3055
3056 while (!q.empty()) {
3057 CDir *d = q.front();
3058 q.pop_front();
3059 d->take_waiting(d_mask, waiters);
3060
3061 // inode waiters too
94b18763
FG
3062 for (auto &p : d->items) {
3063 CDentry *dn = p.second;
7c673cae
FG
3064 CDentry::linkage_t *dnl = dn->get_linkage();
3065 if (dnl->is_primary()) {
3066 dnl->get_inode()->take_waiting(i_mask, waiters);
3067
3068 // recurse?
3069 list<CDir*> ls;
3070 dnl->get_inode()->get_dirfrags(ls);
3071 for (list<CDir*>::iterator p = ls.begin();
3072 p != ls.end();
3073 ++p) {
3074 CDir *subdir = *p;
3075 if (!subdir->is_subtree_root())
3076 q.push_back(subdir);
3077 }
3078 }
3079 }
3080 }
3081 }
3082
3083 kick_open_ino_peers(who);
3084 kick_find_ino_peers(who);
3085
3086 // queue them up.
3087 mds->queue_waiters(waiters);
3088}
3089
3090void MDCache::set_recovery_set(set<mds_rank_t>& s)
3091{
3092 dout(7) << "set_recovery_set " << s << dendl;
3093 recovery_set = s;
3094}
3095
3096
3097/*
3098 * during resolve state, we share resolves to determine who
3099 * is authoritative for which trees. we expect to get a resolve
3100 * from _everyone_ in the recovery_set (the mds cluster at the time of
3101 * the first failure).
3102 *
3103 * This function puts the passed message before returning
3104 */
11fdf7f2 3105void MDCache::handle_resolve(const MMDSResolve::const_ref &m)
7c673cae
FG
3106{
3107 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3108 mds_rank_t from = mds_rank_t(m->get_source().num());
3109
3110 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3111 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3112 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3113 return;
3114 }
3115 // wait until we reach the resolve stage!
7c673cae
FG
3116 return;
3117 }
3118
3119 discard_delayed_resolve(from);
3120
3121 // ambiguous slave requests?
3122 if (!m->slave_requests.empty()) {
3123 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3124 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3125 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
11fdf7f2 3126 ceph_assert(!p->second.committing);
7c673cae
FG
3127 pending_masters.insert(p->first);
3128 }
3129 }
3130
3131 if (!pending_masters.empty()) {
3132 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3133 delayed_resolve[from] = m;
3134 return;
3135 }
3136 }
3137
11fdf7f2
TL
3138 auto ack = MMDSResolveAck::create();
3139 for (const auto &p : m->slave_requests) {
3140 if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
7c673cae 3141 // COMMIT
11fdf7f2 3142 if (p.second.committing) {
7c673cae 3143 // already committing, waiting for the OP_COMMITTED slave reply
11fdf7f2 3144 dout(10) << " already committing slave request " << p << " noop "<< dendl;
7c673cae 3145 } else {
11fdf7f2
TL
3146 dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
3147 ack->add_commit(p.first);
7c673cae 3148 }
11fdf7f2 3149 uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
7c673cae 3150
11fdf7f2 3151 if (p.second.inode_caps.length() > 0) {
7c673cae 3152 // slave wants to export caps (rename)
11fdf7f2 3153 ceph_assert(mds->is_resolve());
7c673cae
FG
3154
3155 inodeno_t ino;
3156 map<client_t,Capability::Export> cap_exports;
11fdf7f2
TL
3157 auto q = p.second.inode_caps.cbegin();
3158 decode(ino, q);
3159 decode(cap_exports, q);
7c673cae 3160
11fdf7f2 3161 ceph_assert(get_inode(ino));
7c673cae
FG
3162
3163 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3164 q != cap_exports.end();
3165 ++q) {
3166 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3167 im.cap_id = ++last_cap_id; // assign a new cap ID
3168 im.issue_seq = 1;
3169 im.mseq = q->second.mseq;
28e407b8
AA
3170
3171 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3172 if (session)
3173 rejoin_client_map.emplace(q->first, session->info.inst);
7c673cae
FG
3174 }
3175
3176 // will process these caps in rejoin stage
3177 rejoin_slave_exports[ino].first = from;
3178 rejoin_slave_exports[ino].second.swap(cap_exports);
3179
3180 // send information of imported caps back to slave
11fdf7f2 3181 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
7c673cae
FG
3182 }
3183 } else {
3184 // ABORT
11fdf7f2
TL
3185 dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
3186 ceph_assert(!p.second.committing);
3187 ack->add_abort(p.first);
7c673cae
FG
3188 }
3189 }
3190 mds->send_message(ack, m->get_connection());
7c673cae
FG
3191 return;
3192 }
3193
11fdf7f2 3194 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
7c673cae
FG
3195 dout(10) << "delay processing subtree resolve" << dendl;
3196 delayed_resolve[from] = m;
3197 return;
3198 }
3199
3200 bool survivor = false;
3201 // am i a surviving ambiguous importer?
3202 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3203 survivor = true;
3204 // check for any import success/failure (from this node)
3205 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3206 while (p != my_ambiguous_imports.end()) {
3207 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3208 ++next;
3209 CDir *dir = get_dirfrag(p->first);
11fdf7f2 3210 ceph_assert(dir);
7c673cae
FG
3211 dout(10) << "checking ambiguous import " << *dir << dendl;
3212 if (migrator->is_importing(dir->dirfrag()) &&
3213 migrator->get_import_peer(dir->dirfrag()) == from) {
11fdf7f2 3214 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
7c673cae
FG
3215
3216 // check if sender claims the subtree
3217 bool claimed_by_sender = false;
11fdf7f2 3218 for (const auto &q : m->subtrees) {
7c673cae 3219 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
11fdf7f2 3220 CDir *base = get_force_dirfrag(q.first, false);
7c673cae
FG
3221 if (!base || !base->contains(dir))
3222 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3223
3224 bool inside = true;
3225 set<CDir*> bounds;
11fdf7f2 3226 get_force_dirfrag_bound_set(q.second, bounds);
7c673cae
FG
3227 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3228 CDir *bound = *p;
3229 if (bound->contains(dir)) {
3230 inside = false; // nope, bound is dir or parent of dir, not inside.
3231 break;
3232 }
3233 }
3234 if (inside)
3235 claimed_by_sender = true;
3236 }
3237
3238 my_ambiguous_imports.erase(p); // no longer ambiguous.
3239 if (claimed_by_sender) {
3240 dout(7) << "ambiguous import failed on " << *dir << dendl;
3241 migrator->import_reverse(dir);
3242 } else {
3243 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3244 migrator->import_finish(dir, true);
3245 }
3246 }
3247 p = next;
3248 }
3249 }
3250
3251 // update my dir_auth values
3252 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3253 // migrations between other nodes)
11fdf7f2
TL
3254 for (const auto& p : m->subtrees) {
3255 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3256 CDir *dir = get_force_dirfrag(p.first, !survivor);
7c673cae
FG
3257 if (!dir)
3258 continue;
11fdf7f2 3259 adjust_bounded_subtree_auth(dir, p.second, from);
7c673cae
FG
3260 try_subtree_merge(dir);
3261 }
3262
3263 show_subtrees();
3264
3265 // note ambiguous imports too
11fdf7f2
TL
3266 for (const auto& p : m->ambiguous_imports) {
3267 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3268 other_ambiguous_imports[from][p.first] = p.second;
3269 }
3270
3271 // learn other mds' pending snaptable commits. later, when resolve finishes, we will reload
3272 // the snaptable cache from the snapserver. this way, the snaptable cache gets synced among all mds
3273 for (const auto& p : m->table_clients) {
3274 dout(10) << " noting " << get_mdstable_name(p.type)
3275 << " pending_commits " << p.pending_commits << dendl;
3276 MDSTableClient *client = mds->get_table_client(p.type);
3277 for (const auto& q : p.pending_commits)
3278 client->notify_commit(q);
7c673cae
FG
3279 }
3280
3281 // did i get them all?
3282 resolve_gather.erase(from);
3283
3284 maybe_resolve_finish();
7c673cae
FG
3285}
3286
3287void MDCache::process_delayed_resolve()
3288{
3289 dout(10) << "process_delayed_resolve" << dendl;
11fdf7f2 3290 map<mds_rank_t, MMDSResolve::const_ref> tmp;
7c673cae 3291 tmp.swap(delayed_resolve);
11fdf7f2
TL
3292 for (auto &p : tmp) {
3293 handle_resolve(p.second);
3294 }
7c673cae
FG
3295}
3296
3297void MDCache::discard_delayed_resolve(mds_rank_t who)
3298{
11fdf7f2 3299 delayed_resolve.erase(who);
7c673cae
FG
3300}
3301
3302void MDCache::maybe_resolve_finish()
3303{
11fdf7f2
TL
3304 ceph_assert(resolve_ack_gather.empty());
3305 ceph_assert(resolve_need_rollback.empty());
7c673cae
FG
3306
3307 if (!resolve_gather.empty()) {
3308 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3309 << resolve_gather << ")" << dendl;
3310 return;
3311 }
3312
3313 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3314 disambiguate_my_imports();
3315 finish_committed_masters();
3316
3317 if (resolve_done) {
11fdf7f2 3318 ceph_assert(mds->is_resolve());
7c673cae
FG
3319 trim_unlinked_inodes();
3320 recalc_auth_bits(false);
3321 resolve_done.release()->complete(0);
3322 } else {
11fdf7f2 3323 // I am a survivor.
7c673cae
FG
3324 maybe_send_pending_rejoins();
3325 }
3326}
3327
11fdf7f2 3328void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref &ack)
7c673cae
FG
3329{
3330 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3331 mds_rank_t from = mds_rank_t(ack->get_source().num());
3332
3333 if (!resolve_ack_gather.count(from) ||
3334 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
7c673cae
FG
3335 return;
3336 }
3337
3338 if (ambiguous_slave_updates.count(from)) {
11fdf7f2
TL
3339 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3340 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
7c673cae
FG
3341 }
3342
11fdf7f2
TL
3343 for (const auto &p : ack->commit) {
3344 dout(10) << " commit on slave " << p.first << dendl;
7c673cae
FG
3345
3346 if (ambiguous_slave_updates.count(from)) {
11fdf7f2 3347 remove_ambiguous_slave_update(p.first, from);
7c673cae
FG
3348 continue;
3349 }
3350
3351 if (mds->is_resolve()) {
3352 // replay
11fdf7f2
TL
3353 MDSlaveUpdate *su = get_uncommitted_slave_update(p.first, from);
3354 ceph_assert(su);
7c673cae
FG
3355
3356 // log commit
11fdf7f2 3357 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
7c673cae 3358 ESlaveUpdate::OP_COMMIT, su->origop),
11fdf7f2 3359 new C_MDC_SlaveCommit(this, from, p.first));
7c673cae
FG
3360 mds->mdlog->flush();
3361
11fdf7f2 3362 finish_uncommitted_slave_update(p.first, from);
7c673cae 3363 } else {
11fdf7f2 3364 MDRequestRef mdr = request_get(p.first);
7c673cae 3365 // information about master imported caps
11fdf7f2
TL
3366 if (p.second.length() > 0)
3367 mdr->more()->inode_import.share(p.second);
7c673cae 3368
11fdf7f2 3369 ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
7c673cae
FG
3370 request_finish(mdr);
3371 }
3372 }
3373
11fdf7f2
TL
3374 for (const auto &metareq : ack->abort) {
3375 dout(10) << " abort on slave " << metareq << dendl;
7c673cae
FG
3376
3377 if (mds->is_resolve()) {
11fdf7f2
TL
3378 MDSlaveUpdate *su = get_uncommitted_slave_update(metareq, from);
3379 ceph_assert(su);
7c673cae
FG
3380
3381 // perform rollback (and journal a rollback entry)
3382 // note: this will hold up the resolve a bit, until the rollback entries are journaled.
3383 MDRequestRef null_ref;
3384 switch (su->origop) {
3385 case ESlaveUpdate::LINK:
3386 mds->server->do_link_rollback(su->rollback, from, null_ref);
3387 break;
3388 case ESlaveUpdate::RENAME:
3389 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3390 break;
3391 case ESlaveUpdate::RMDIR:
3392 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3393 break;
3394 default:
3395 ceph_abort();
3396 }
3397 } else {
11fdf7f2 3398 MDRequestRef mdr = request_get(metareq);
7c673cae
FG
3399 mdr->aborted = true;
3400 if (mdr->slave_request) {
3401 if (mdr->slave_did_prepare()) // journaling slave prepare ?
11fdf7f2 3402 add_rollback(metareq, from);
7c673cae
FG
3403 } else {
3404 request_finish(mdr);
3405 }
3406 }
3407 }
3408
11fdf7f2 3409 if (!ambiguous_slave_updates.count(from)) {
7c673cae 3410 resolve_ack_gather.erase(from);
11fdf7f2 3411 maybe_finish_slave_resolve();
7c673cae 3412 }
7c673cae
FG
3413}
3414
3415void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3416{
11fdf7f2 3417 ceph_assert(uncommitted_slave_updates[master].count(reqid) == 0);
7c673cae
FG
3418 uncommitted_slave_updates[master][reqid] = su;
3419 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3420 uncommitted_slave_rename_olddir[*p]++;
3421 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3422 uncommitted_slave_unlink[*p]++;
3423}
3424
3425void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3426{
11fdf7f2 3427 ceph_assert(uncommitted_slave_updates[master].count(reqid));
7c673cae
FG
3428 MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
3429
3430 uncommitted_slave_updates[master].erase(reqid);
3431 if (uncommitted_slave_updates[master].empty())
3432 uncommitted_slave_updates.erase(master);
3433 // discard the non-auth subtree we renamed out of
3434 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3435 CInode *diri = *p;
3436 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
11fdf7f2 3437 ceph_assert(it != uncommitted_slave_rename_olddir.end());
7c673cae
FG
3438 it->second--;
3439 if (it->second == 0) {
3440 uncommitted_slave_rename_olddir.erase(it);
3441 list<CDir*> ls;
3442 diri->get_dirfrags(ls);
3443 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
3444 CDir *root = get_subtree_root(*q);
3445 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3446 try_trim_non_auth_subtree(root);
3447 if (*q != root)
3448 break;
3449 }
3450 }
3451 } else
11fdf7f2 3452 ceph_assert(it->second > 0);
7c673cae
FG
3453 }
3454 // remove the inodes that were unlinked by the slave update
3455 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3456 CInode *in = *p;
3457 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
11fdf7f2 3458 ceph_assert(it != uncommitted_slave_unlink.end());
7c673cae
FG
3459 it->second--;
3460 if (it->second == 0) {
3461 uncommitted_slave_unlink.erase(it);
3462 if (!in->get_projected_parent_dn())
3463 mds->mdcache->remove_inode_recursive(in);
3464 } else
11fdf7f2 3465 ceph_assert(it->second > 0);
7c673cae
FG
3466 }
3467 delete su;
3468}
3469
3470MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3471{
3472
3473 MDSlaveUpdate* su = NULL;
3474 if (uncommitted_slave_updates.count(master) &&
3475 uncommitted_slave_updates[master].count(reqid)) {
3476 su = uncommitted_slave_updates[master][reqid];
11fdf7f2 3477 ceph_assert(su);
7c673cae
FG
3478 }
3479 return su;
3480}
3481
3482void MDCache::finish_rollback(metareqid_t reqid) {
11fdf7f2
TL
3483 auto p = resolve_need_rollback.find(reqid);
3484 ceph_assert(p != resolve_need_rollback.end());
7c673cae 3485 if (mds->is_resolve())
11fdf7f2
TL
3486 finish_uncommitted_slave_update(reqid, p->second);
3487 resolve_need_rollback.erase(p);
3488 maybe_finish_slave_resolve();
7c673cae
FG
3489}
3490
3491void MDCache::disambiguate_other_imports()
3492{
3493 dout(10) << "disambiguate_other_imports" << dendl;
3494
3495 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3496 // other nodes' ambiguous imports
3497 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3498 p != other_ambiguous_imports.end();
3499 ++p) {
3500 mds_rank_t who = p->first;
3501 dout(10) << "ambiguous imports for mds." << who << dendl;
3502
3503 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3504 q != p->second.end();
3505 ++q) {
3506 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3507 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3508 CDir *dir = get_force_dirfrag(q->first, recovering);
3509 if (!dir) continue;
3510
3511 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3512 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3513 dout(10) << " mds." << who << " did import " << *dir << dendl;
3514 adjust_bounded_subtree_auth(dir, q->second, who);
3515 try_subtree_merge(dir);
3516 } else {
3517 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3518 }
3519 }
3520 }
3521 other_ambiguous_imports.clear();
3522}
3523
3524void MDCache::disambiguate_my_imports()
3525{
3526 dout(10) << "disambiguate_my_imports" << dendl;
3527
3528 if (!mds->is_resolve()) {
11fdf7f2 3529 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3530 return;
3531 }
3532
3533 disambiguate_other_imports();
3534
3535 // my ambiguous imports
3536 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3537 while (!my_ambiguous_imports.empty()) {
3538 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3539
3540 CDir *dir = get_dirfrag(q->first);
11fdf7f2 3541 ceph_assert(dir);
7c673cae
FG
3542
3543 if (dir->authority() != me_ambig) {
3544 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3545 cancel_ambiguous_import(dir);
3546
3547 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3548
3549 // subtree may have been swallowed by another node claiming dir
3550 // as their own.
3551 CDir *root = get_subtree_root(dir);
3552 if (root != dir)
3553 dout(10) << " subtree root is " << *root << dendl;
11fdf7f2 3554 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
7c673cae
FG
3555 try_trim_non_auth_subtree(root);
3556 } else {
3557 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3558 finish_ambiguous_import(q->first);
3559 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3560 }
3561 }
11fdf7f2 3562 ceph_assert(my_ambiguous_imports.empty());
7c673cae
FG
3563 mds->mdlog->flush();
3564
3565 // verify all my subtrees are unambiguous!
3566 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3567 p != subtrees.end();
3568 ++p) {
3569 CDir *dir = p->first;
3570 if (dir->is_ambiguous_dir_auth()) {
3571 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3572 }
11fdf7f2 3573 ceph_assert(!dir->is_ambiguous_dir_auth());
7c673cae
FG
3574 }
3575
3576 show_subtrees();
3577}
3578
3579
3580void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3581{
11fdf7f2 3582 ceph_assert(my_ambiguous_imports.count(base) == 0);
7c673cae
FG
3583 my_ambiguous_imports[base] = bounds;
3584}
3585
3586
3587void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3588{
3589 // make a list
3590 vector<dirfrag_t> binos;
3591 for (set<CDir*>::iterator p = bounds.begin();
3592 p != bounds.end();
3593 ++p)
3594 binos.push_back((*p)->dirfrag());
3595
3596 // note: this can get called twice if the exporter fails during recovery
3597 if (my_ambiguous_imports.count(base->dirfrag()))
3598 my_ambiguous_imports.erase(base->dirfrag());
3599
3600 add_ambiguous_import(base->dirfrag(), binos);
3601}
3602
3603void MDCache::cancel_ambiguous_import(CDir *dir)
3604{
3605 dirfrag_t df = dir->dirfrag();
11fdf7f2 3606 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3607 dout(10) << "cancel_ambiguous_import " << df
3608 << " bounds " << my_ambiguous_imports[df]
3609 << " " << *dir
3610 << dendl;
3611 my_ambiguous_imports.erase(df);
3612}
3613
3614void MDCache::finish_ambiguous_import(dirfrag_t df)
3615{
11fdf7f2 3616 ceph_assert(my_ambiguous_imports.count(df));
7c673cae
FG
3617 vector<dirfrag_t> bounds;
3618 bounds.swap(my_ambiguous_imports[df]);
3619 my_ambiguous_imports.erase(df);
3620
3621 dout(10) << "finish_ambiguous_import " << df
3622 << " bounds " << bounds
3623 << dendl;
3624 CDir *dir = get_dirfrag(df);
11fdf7f2 3625 ceph_assert(dir);
7c673cae
FG
3626
3627 // adjust dir_auth, import maps
3628 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3629 try_subtree_merge(dir);
3630}
3631
3632void MDCache::remove_inode_recursive(CInode *in)
3633{
3634 dout(10) << "remove_inode_recursive " << *in << dendl;
3635 list<CDir*> ls;
3636 in->get_dirfrags(ls);
3637 list<CDir*>::iterator p = ls.begin();
3638 while (p != ls.end()) {
3639 CDir *subdir = *p++;
3640
3641 dout(10) << " removing dirfrag " << subdir << dendl;
94b18763
FG
3642 auto it = subdir->items.begin();
3643 while (it != subdir->items.end()) {
3644 CDentry *dn = it->second;
3645 ++it;
7c673cae
FG
3646 CDentry::linkage_t *dnl = dn->get_linkage();
3647 if (dnl->is_primary()) {
3648 CInode *tin = dnl->get_inode();
31f18b77 3649 subdir->unlink_inode(dn, false);
7c673cae
FG
3650 remove_inode_recursive(tin);
3651 }
3652 subdir->remove_dentry(dn);
3653 }
3654
3655 if (subdir->is_subtree_root())
3656 remove_subtree(subdir);
3657 in->close_dirfrag(subdir->dirfrag().frag);
3658 }
3659 remove_inode(in);
3660}
3661
11fdf7f2 3662bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
7c673cae 3663{
11fdf7f2 3664 ceph_assert(!in->is_auth());
7c673cae
FG
3665
3666 dout(10) << __func__ << ":" << *in << dendl;
3667
3668 // Recurse into any dirfrags beneath this inode
3669 list<CDir*> ls;
3670 in->get_dirfrags(ls);
3671 for (auto subdir : ls) {
3672 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3673 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3674 return true;
3675 }
3676
3677 for (auto &it : subdir->items) {
3678 CDentry *dn = it.second;
3679 CDentry::linkage_t *dnl = dn->get_linkage();
3680 if (dnl->is_primary()) {
3681 CInode *tin = dnl->get_inode();
3682
3683 /* Remote strays with linkage (i.e. hardlinks) should not be
3684 * expired, because they may be the target of
3685 * a rename() as the owning MDS shuts down */
3686 if (!tin->is_stray() && tin->inode.nlink) {
3687 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3688 return true;
3689 }
3690
3691 const bool abort = expire_recursive(tin, expiremap);
3692 if (abort) {
3693 return true;
3694 }
3695 }
3696 if (dn->lru_is_expireable()) {
3697 trim_dentry(dn, expiremap);
3698 } else {
3699 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3700 return true;
3701 }
3702 }
3703 }
3704
3705 return false;
3706}
3707
3708void MDCache::trim_unlinked_inodes()
3709{
3710 dout(7) << "trim_unlinked_inodes" << dendl;
81eedcae
TL
3711 int count = 0;
3712 vector<CInode*> q;
94b18763 3713 for (auto &p : inode_map) {
b32b8144 3714 CInode *in = p.second;
7c673cae
FG
3715 if (in->get_parent_dn() == NULL && !in->is_base()) {
3716 dout(7) << " will trim from " << *in << dendl;
3717 q.push_back(in);
3718 }
81eedcae
TL
3719
3720 if (!(++count % 1000))
3721 mds->heartbeat_reset();
3722 }
3723
3724 for (auto& in : q) {
3725 remove_inode_recursive(in);
3726
3727 if (!(++count % 1000))
3728 mds->heartbeat_reset();
7c673cae 3729 }
7c673cae
FG
3730}
3731
3732/** recalc_auth_bits()
3733 * once subtree auth is disambiguated, we need to adjust all the
3734 * auth and dirty bits in our cache before moving on.
3735 */
3736void MDCache::recalc_auth_bits(bool replay)
3737{
3738 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3739
3740 if (root) {
3741 root->inode_auth.first = mds->mdsmap->get_root();
3742 bool auth = mds->get_nodeid() == root->inode_auth.first;
3743 if (auth) {
3744 root->state_set(CInode::STATE_AUTH);
3745 } else {
3746 root->state_clear(CInode::STATE_AUTH);
3747 if (!replay)
3748 root->state_set(CInode::STATE_REJOINING);
3749 }
3750 }
3751
3752 set<CInode*> subtree_inodes;
3753 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3754 p != subtrees.end();
3755 ++p) {
3756 if (p->first->dir_auth.first == mds->get_nodeid())
3757 subtree_inodes.insert(p->first->inode);
3758 }
3759
3760 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3761 p != subtrees.end();
3762 ++p) {
3763 if (p->first->inode->is_mdsdir()) {
3764 CInode *in = p->first->inode;
3765 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3766 if (auth) {
3767 in->state_set(CInode::STATE_AUTH);
3768 } else {
3769 in->state_clear(CInode::STATE_AUTH);
3770 if (!replay)
3771 in->state_set(CInode::STATE_REJOINING);
3772 }
3773 }
3774
3775 list<CDir*> dfq; // dirfrag queue
3776 dfq.push_back(p->first);
3777
3778 bool auth = p->first->authority().first == mds->get_nodeid();
3779 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3780
3781 while (!dfq.empty()) {
3782 CDir *dir = dfq.front();
3783 dfq.pop_front();
3784
3785 // dir
3786 if (auth) {
3787 dir->state_set(CDir::STATE_AUTH);
3788 } else {
3789 dir->state_clear(CDir::STATE_AUTH);
3790 if (!replay) {
3791 // close empty non-auth dirfrag
3792 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3793 dir->inode->close_dirfrag(dir->get_frag());
3794 continue;
3795 }
3796 dir->state_set(CDir::STATE_REJOINING);
3797 dir->state_clear(CDir::STATE_COMPLETE);
3798 if (dir->is_dirty())
3799 dir->mark_clean();
3800 }
3801 }
3802
3803 // dentries in this dir
94b18763 3804 for (auto &p : dir->items) {
7c673cae 3805 // dn
94b18763 3806 CDentry *dn = p.second;
7c673cae
FG
3807 CDentry::linkage_t *dnl = dn->get_linkage();
3808 if (auth) {
3809 dn->state_set(CDentry::STATE_AUTH);
3810 } else {
3811 dn->state_clear(CDentry::STATE_AUTH);
3812 if (!replay) {
3813 dn->state_set(CDentry::STATE_REJOINING);
3814 if (dn->is_dirty())
3815 dn->mark_clean();
3816 }
3817 }
3818
3819 if (dnl->is_primary()) {
3820 // inode
3821 CInode *in = dnl->get_inode();
3822 if (auth) {
3823 in->state_set(CInode::STATE_AUTH);
3824 } else {
3825 in->state_clear(CInode::STATE_AUTH);
3826 if (!replay) {
3827 in->state_set(CInode::STATE_REJOINING);
3828 if (in->is_dirty())
3829 in->mark_clean();
3830 if (in->is_dirty_parent())
3831 in->clear_dirty_parent();
3832 // avoid touching scatterlocks for our subtree roots!
3833 if (subtree_inodes.count(in) == 0)
3834 in->clear_scatter_dirty();
3835 }
3836 }
3837 // recurse?
3838 if (in->is_dir())
3839 in->get_nested_dirfrags(dfq);
3840 }
3841 }
3842 }
3843 }
3844
3845 show_subtrees();
3846 show_cache();
3847}
3848
3849
3850
3851// ===========================================================================
3852// REJOIN
3853
3854/*
3855 * notes on scatterlock recovery:
3856 *
3857 * - recovering inode replica sends scatterlock data for any subtree
3858 * roots (the only ones that are possibly dirty).
3859 *
3860 * - surviving auth incorporates any provided scatterlock data. any
3861 * pending gathers are then finished, as with the other lock types.
3862 *
3863 * that takes care of surviving auth + (recovering replica)*.
3864 *
3865 * - surviving replica sends strong_inode, which includes current
3866 * scatterlock state, AND any dirty scatterlock data. this
3867 * provides the recovering auth with everything it might need.
3868 *
3869 * - recovering auth must pick initial scatterlock state based on
3870 * (weak|strong) rejoins.
3871 * - always assimilate scatterlock data (it can't hurt)
3872 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3873 * - include base inode in ack for all inodes that saw scatterlock content
3874 *
3875 * also, for scatter gather,
3876 *
3877 * - auth increments {frag,r}stat.version on completion of any gather.
3878 *
3879 * - auth incorporates changes in a gather _only_ if the version
3880 * matches.
3881 *
3882 * - replica discards changes any time the scatterlock syncs, and
3883 * after recovery.
3884 */
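// A minimal illustrative sketch of the recovery policy described above; these
// helpers are not part of MDCache and their names are invented. The recovering
// auth keeps a scatterlock scattered only if some surviving replica reported a
// scatter (MIX) state, and a completed gather is folded in only when its
// version matches what the auth expects.
static int pick_initial_scatter_state_sketch(const vector<int>& surviving_replica_states)
{
  for (int state : surviving_replica_states) {
    if (state == LOCK_MIX)   // a surviving replica is still scattered
      return LOCK_MIX;
  }
  return LOCK_SYNC;          // otherwise fall back to the stable SYNC state
}

static bool incorporate_gather_sketch(version_t auth_version, version_t gathered_version)
{
  // the auth bumps {frag,r}stat.version when a gather completes, so a stale
  // gather (version mismatch) is discarded rather than applied.
  return gathered_version == auth_version;
}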
3885
3886void MDCache::dump_rejoin_status(Formatter *f) const
3887{
3888 f->open_object_section("rejoin_status");
3889 f->dump_stream("rejoin_gather") << rejoin_gather;
3890 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3891 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3892 f->close_section();
3893}
3894
11fdf7f2 3895void MDCache::rejoin_start(MDSContext *rejoin_done_)
7c673cae
FG
3896{
3897 dout(10) << "rejoin_start" << dendl;
11fdf7f2 3898 ceph_assert(!rejoin_done);
7c673cae
FG
3899 rejoin_done.reset(rejoin_done_);
3900
3901 rejoin_gather = recovery_set;
3902 // need finish opening cap inodes before sending cache rejoins
3903 rejoin_gather.insert(mds->get_nodeid());
3904 process_imported_caps();
3905}
3906
3907/*
3908 * rejoin phase!
3909 *
11fdf7f2 3910 * this initiates rejoin. it should be called before we get any
7c673cae
FG
3911 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3912 *
3913 * we start out by sending rejoins to everyone in the recovery set.
3914 *
3915 * if we are rejoining, send for all regions in our cache.
11fdf7f2 3916 * if we are active|stopping, send only to nodes that are rejoining.
7c673cae
FG
3917 */
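// Hypothetical helper (not part of MDCache) capturing the per-peer decision
// that the loop in rejoin_send_rejoins() makes, per the note above: a
// rejoining MDS sends a WEAK rejoin to every peer in the recovery set, while a
// survivor sends a STRONG rejoin only to peers that are themselves rejoining.
static int choose_rejoin_op_sketch(bool i_am_rejoining, bool peer_is_rejoining)
{
  if (i_am_rejoining)
    return MMDSCacheRejoin::OP_WEAK;
  if (peer_is_rejoining)
    return MMDSCacheRejoin::OP_STRONG;
  return -1;  // no rejoin message needed for this peer
}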
3918void MDCache::rejoin_send_rejoins()
3919{
3920 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3921
3922 if (rejoin_gather.count(mds->get_nodeid())) {
3923 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3924 rejoins_pending = true;
3925 return;
3926 }
3927 if (!resolve_gather.empty()) {
3928 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3929 << resolve_gather << ")" << dendl;
3930 rejoins_pending = true;
3931 return;
3932 }
3933
11fdf7f2
TL
3934 ceph_assert(!migrator->is_importing());
3935 ceph_assert(!migrator->is_exporting());
7c673cae
FG
3936
3937 if (!mds->is_rejoin()) {
3938 disambiguate_other_imports();
3939 }
3940
11fdf7f2 3941 map<mds_rank_t, MMDSCacheRejoin::ref> rejoins;
7c673cae
FG
3942
3943
3944 // if i am rejoining, send a rejoin to everyone.
3945 // otherwise, just send to others who are rejoining.
3946 for (set<mds_rank_t>::iterator p = recovery_set.begin();
3947 p != recovery_set.end();
3948 ++p) {
3949 if (*p == mds->get_nodeid()) continue; // nothing to myself!
3950 if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node!
3951 if (mds->is_rejoin())
11fdf7f2 3952 rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK);
7c673cae 3953 else if (mds->mdsmap->is_rejoin(*p))
11fdf7f2 3954 rejoins[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG);
7c673cae
FG
3955 }
3956
3957 if (mds->is_rejoin()) {
11fdf7f2
TL
3958 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
3959 for (auto& p : cap_exports) {
3960 mds_rank_t target = p.second.first;
7c673cae
FG
3961 if (rejoins.count(target) == 0)
3962 continue;
11fdf7f2
TL
3963 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
3964 Session *session = nullptr;
3965 auto it = client_exports.find(q->first);
3966 if (it != client_exports.end()) {
3967 session = it->second.first;
3968 if (session)
3969 it->second.second.insert(target);
3970 } else {
3971 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3972 auto& r = client_exports[q->first];
3973 r.first = session;
3974 if (session)
3975 r.second.insert(target);
3976 }
3977 if (session) {
3978 ++q;
3979 } else {
3980 // remove reconnect with no session
3981 p.second.second.erase(q++);
3982 }
3983 }
3984 rejoins[target]->cap_exports[p.first] = p.second.second;
7c673cae 3985 }
11fdf7f2
TL
3986 for (auto& p : client_exports) {
3987 Session *session = p.second.first;
3988 for (auto& q : p.second.second) {
3989 auto rejoin = rejoins[q];
3990 rejoin->client_map[p.first] = session->info.inst;
3991 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
3992 }
7c673cae
FG
3993 }
3994 }
3995
3996
3997 // check all subtrees
3998 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
3999 p != subtrees.end();
4000 ++p) {
4001 CDir *dir = p->first;
11fdf7f2 4002 ceph_assert(dir->is_subtree_root());
7c673cae
FG
4003 if (dir->is_ambiguous_dir_auth()) {
4004 // exporter is recovering, importer is survivor.
11fdf7f2
TL
4005 ceph_assert(rejoins.count(dir->authority().first));
4006 ceph_assert(!rejoins.count(dir->authority().second));
7c673cae
FG
4007 continue;
4008 }
4009
4010 // my subtree?
4011 if (dir->is_auth())
4012 continue; // skip my own regions!
4013
4014 mds_rank_t auth = dir->get_dir_auth().first;
11fdf7f2 4015 ceph_assert(auth >= 0);
7c673cae
FG
4016 if (rejoins.count(auth) == 0)
4017 continue; // don't care about this node's subtrees
4018
4019 rejoin_walk(dir, rejoins[auth]);
4020 }
4021
4022 // rejoin root inodes, too
11fdf7f2 4023 for (auto &p : rejoins) {
7c673cae
FG
4024 if (mds->is_rejoin()) {
4025 // weak
11fdf7f2
TL
4026 if (p.first == 0 && root) {
4027 p.second->add_weak_inode(root->vino());
7c673cae
FG
4028 if (root->is_dirty_scattered()) {
4029 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4030 p.second->add_scatterlock_state(root);
7c673cae
FG
4031 }
4032 }
11fdf7f2 4033 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
7c673cae 4034 if (in)
11fdf7f2 4035 p.second->add_weak_inode(in->vino());
7c673cae
FG
4036 }
4037 } else {
4038 // strong
11fdf7f2
TL
4039 if (p.first == 0 && root) {
4040 p.second->add_strong_inode(root->vino(),
7c673cae
FG
4041 root->get_replica_nonce(),
4042 root->get_caps_wanted(),
4043 root->filelock.get_state(),
4044 root->nestlock.get_state(),
4045 root->dirfragtreelock.get_state());
4046 root->state_set(CInode::STATE_REJOINING);
4047 if (root->is_dirty_scattered()) {
4048 dout(10) << " sending scatterlock state on root " << *root << dendl;
11fdf7f2 4049 p.second->add_scatterlock_state(root);
7c673cae
FG
4050 }
4051 }
4052
11fdf7f2
TL
4053 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4054 p.second->add_strong_inode(in->vino(),
7c673cae
FG
4055 in->get_replica_nonce(),
4056 in->get_caps_wanted(),
4057 in->filelock.get_state(),
4058 in->nestlock.get_state(),
4059 in->dirfragtreelock.get_state());
4060 in->state_set(CInode::STATE_REJOINING);
4061 }
4062 }
4063 }
4064
4065 if (!mds->is_rejoin()) {
4066 // i am survivor. send strong rejoin.
4067 // note request remote_auth_pins, xlocks
4068 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4069 p != active_requests.end();
4070 ++p) {
4071 MDRequestRef& mdr = p->second;
4072 if (mdr->is_slave())
4073 continue;
4074 // auth pins
11fdf7f2
TL
4075 for (const auto& q : mdr->remote_auth_pins) {
4076 if (!q.first->is_auth()) {
4077 ceph_assert(q.second == q.first->authority().first);
4078 if (rejoins.count(q.second) == 0) continue;
4079 const MMDSCacheRejoin::ref &rejoin = rejoins[q.second];
7c673cae 4080
11fdf7f2 4081 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
7c673cae 4082 MDSCacheObjectInfo i;
11fdf7f2 4083 q.first->set_object_info(i);
7c673cae
FG
4084 if (i.ino)
4085 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4086 else
4087 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4088
4089 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
11fdf7f2 4090 mdr->more()->rename_inode == q.first)
7c673cae
FG
4091 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4092 mdr->reqid, mdr->attempt);
4093 }
4094 }
4095 // xlocks
11fdf7f2
TL
4096 for (const auto& q : mdr->locks) {
4097 auto lock = q.lock;
4098 auto obj = lock->get_parent();
4099 if (q.is_xlock() && !obj->is_auth()) {
4100 mds_rank_t who = obj->authority().first;
7c673cae 4101 if (rejoins.count(who) == 0) continue;
11fdf7f2 4102 const MMDSCacheRejoin::ref &rejoin = rejoins[who];
7c673cae 4103
11fdf7f2 4104 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
7c673cae 4105 MDSCacheObjectInfo i;
11fdf7f2 4106 obj->set_object_info(i);
7c673cae 4107 if (i.ino)
11fdf7f2 4108 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
7c673cae
FG
4109 mdr->reqid, mdr->attempt);
4110 else
4111 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4112 mdr->reqid, mdr->attempt);
11fdf7f2
TL
4113 } else if (q.is_remote_wrlock()) {
4114 mds_rank_t who = q.wrlock_target;
4115 if (rejoins.count(who) == 0) continue;
4116 const MMDSCacheRejoin::ref &rejoin = rejoins[who];
7c673cae 4117
11fdf7f2
TL
4118 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4119 MDSCacheObjectInfo i;
4120 obj->set_object_info(i);
4121 ceph_assert(i.ino);
4122 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4123 mdr->reqid, mdr->attempt);
4124 }
7c673cae
FG
4125 }
4126 }
4127 }
4128
4129 // send the messages
11fdf7f2
TL
4130 for (auto &p : rejoins) {
4131 ceph_assert(rejoin_sent.count(p.first) == 0);
4132 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4133 rejoin_sent.insert(p.first);
4134 rejoin_ack_gather.insert(p.first);
4135 mds->send_message_mds(p.second, p.first);
7c673cae
FG
4136 }
4137 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4138 rejoins_pending = false;
4139
4140 // nothing?
28e407b8 4141 if (mds->is_rejoin() && rejoin_gather.empty()) {
7c673cae
FG
4142 dout(10) << "nothing to rejoin" << dendl;
4143 rejoin_gather_finish();
4144 }
4145}
4146
4147
4148/**
4149 * rejoin_walk - build rejoin declarations for a subtree
4150 *
4151 * @param dir subtree root
4152 * @param rejoin rejoin message
4153 *
4154 * from a rejoining node:
4155 * weak dirfrag
4156 * weak dentries (w/ connectivity)
4157 *
4158 * from a surviving node:
4159 * strong dirfrag
4160 * strong dentries (no connectivity!)
4161 * strong inodes
4162 */
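// Rough sketch of the two declaration "shapes" listed above, using invented
// struct names purely for illustration; rejoin_walk() itself fills an
// MMDSCacheRejoin message rather than structs like these.
struct weak_decl_sketch {                    // sent by a rejoining node
  dirfrag_t df;                              // weak dirfrag
  vector<string> primary_dentries;           // weak dentries (w/ connectivity)
};
struct strong_decl_sketch {                  // sent by a surviving node
  dirfrag_t df;                              // strong dirfrag (nonce, dir_rep)
  vector<string> dentries;                   // strong dentries (no connectivity)
  vector<vinodeno_t> inodes;                 // strong inodes incl. lock states
};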
11fdf7f2 4163void MDCache::rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin)
7c673cae
FG
4164{
4165 dout(10) << "rejoin_walk " << *dir << dendl;
4166
4167 list<CDir*> nested; // finish this dir, then do nested items
4168
4169 if (mds->is_rejoin()) {
4170 // WEAK
4171 rejoin->add_weak_dirfrag(dir->dirfrag());
94b18763
FG
4172 for (auto &p : dir->items) {
4173 CDentry *dn = p.second;
11fdf7f2 4174 ceph_assert(dn->last == CEPH_NOSNAP);
7c673cae
FG
4175 CDentry::linkage_t *dnl = dn->get_linkage();
4176 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
11fdf7f2 4177 ceph_assert(dnl->is_primary());
7c673cae 4178 CInode *in = dnl->get_inode();
11fdf7f2 4179 ceph_assert(dnl->get_inode()->is_dir());
94b18763 4180 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
7c673cae
FG
4181 in->get_nested_dirfrags(nested);
4182 if (in->is_dirty_scattered()) {
4183 dout(10) << " sending scatterlock state on " << *in << dendl;
4184 rejoin->add_scatterlock_state(in);
4185 }
4186 }
4187 } else {
4188 // STRONG
4189 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4190 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4191 dir->state_set(CDir::STATE_REJOINING);
4192
11fdf7f2 4193 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
94b18763 4194 CDentry *dn = it->second;
11fdf7f2
TL
4195 ++it;
4196 dn->state_set(CDentry::STATE_REJOINING);
7c673cae 4197 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2
TL
4198 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4199
4200 // trim snap dentries. because they may have been pruned by
4201 // their auth mds (snap deleted)
4202 if (dn->last != CEPH_NOSNAP) {
4203 if (in && !in->remote_parents.empty()) {
4204 // unlink any stale remote snap dentry.
4205 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4206 CDentry *remote_dn = *it2;
4207 ++it2;
4208 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4209 remote_dn->unlink_remote(remote_dn->get_linkage());
4210 }
4211 }
4212 if (dn->lru_is_expireable()) {
4213 if (!dnl->is_null())
4214 dir->unlink_inode(dn, false);
4215 if (in)
4216 remove_inode(in);
4217 dir->remove_dentry(dn);
4218 continue;
4219 } else {
4220 // Inventing a null/remote dentry shouldn't cause problems
4221 ceph_assert(!dnl->is_primary());
4222 }
4223 }
4224
7c673cae 4225 dout(15) << " add_strong_dentry " << *dn << dendl;
94b18763 4226 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4227 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4228 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4229 dnl->is_remote() ? dnl->get_remote_d_type():0,
4230 dn->get_replica_nonce(),
4231 dn->lock.get_state());
4232 dn->state_set(CDentry::STATE_REJOINING);
4233 if (dnl->is_primary()) {
4234 CInode *in = dnl->get_inode();
4235 dout(15) << " add_strong_inode " << *in << dendl;
4236 rejoin->add_strong_inode(in->vino(),
4237 in->get_replica_nonce(),
4238 in->get_caps_wanted(),
4239 in->filelock.get_state(),
4240 in->nestlock.get_state(),
4241 in->dirfragtreelock.get_state());
4242 in->state_set(CInode::STATE_REJOINING);
4243 in->get_nested_dirfrags(nested);
4244 if (in->is_dirty_scattered()) {
4245 dout(10) << " sending scatterlock state on " << *in << dendl;
4246 rejoin->add_scatterlock_state(in);
4247 }
4248 }
4249 }
4250 }
4251
4252 // recurse into nested dirs
4253 for (list<CDir*>::iterator p = nested.begin();
4254 p != nested.end();
4255 ++p)
4256 rejoin_walk(*p, rejoin);
4257}
4258
4259
4260/*
4261 * i got a rejoin.
4262 * - reply with the lockstate
4263 *
4264 * if i am active|stopping,
4265 * - remove source from replica list for everything not referenced here.
7c673cae 4266 */
11fdf7f2 4267void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m)
7c673cae
FG
4268{
4269 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4270 << " (" << m->get_payload().length() << " bytes)"
4271 << dendl;
4272
4273 switch (m->op) {
4274 case MMDSCacheRejoin::OP_WEAK:
4275 handle_cache_rejoin_weak(m);
4276 break;
4277 case MMDSCacheRejoin::OP_STRONG:
4278 handle_cache_rejoin_strong(m);
4279 break;
4280 case MMDSCacheRejoin::OP_ACK:
4281 handle_cache_rejoin_ack(m);
4282 break;
4283
4284 default:
4285 ceph_abort();
4286 }
7c673cae
FG
4287}
4288
4289
4290/*
4291 * handle_cache_rejoin_weak
4292 *
4293 * the sender
4294 * - is recovering from their journal.
4295 * - may have incorrect (out of date) inode contents
4296 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4297 *
4298 * if the sender didn't trim_non_auth(), they
4299 * - may have incorrect (out of date) dentry/inode linkage
4300 * - may have deleted/purged inodes
4301 * and i may have to go to disk to get accurate inode contents. yuck.
7c673cae 4302 */
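// Hedged sketch (invented helper name) of the role check that shapes the
// handler below: only a clientreplay/active/stopping recipient is a survivor
// that can claim the reported cap exports and ack immediately; a recovering
// recipient just records the claims in cap_imports and processes them later
// during the rejoin stage.
static bool weak_rejoin_acks_immediately_sketch(MDSRank *mds)
{
  return mds->is_clientreplay() || mds->is_active() || mds->is_stopping();
}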
11fdf7f2 4303void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &weak)
7c673cae
FG
4304{
4305 mds_rank_t from = mds_rank_t(weak->get_source().num());
4306
4307 // possible response(s)
11fdf7f2 4308 MMDSCacheRejoin::ref ack; // if survivor
7c673cae
FG
4309 set<vinodeno_t> acked_inodes; // if survivor
4310 set<SimpleLock *> gather_locks; // if survivor
4311 bool survivor = false; // am i a survivor?
4312
4313 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4314 survivor = true;
4315 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
11fdf7f2 4316 ack = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
7c673cae
FG
4317
4318 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4319
4320 // check cap exports
4321 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4322 CInode *in = get_inode(p->first);
11fdf7f2 4323 ceph_assert(!in || in->is_auth());
7c673cae
FG
4324 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4325 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4326 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4327 Capability::Import& im = imported_caps[p->first][q->first];
4328 if (cap) {
4329 im.cap_id = cap->get_cap_id();
4330 im.issue_seq = cap->get_last_seq();
4331 im.mseq = cap->get_mseq();
4332 } else {
4333 // all are zero
4334 }
4335 }
4336 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4337 }
4338
11fdf7f2 4339 encode(imported_caps, ack->imported_caps);
7c673cae 4340 } else {
11fdf7f2 4341 ceph_assert(mds->is_rejoin());
7c673cae
FG
4342
4343 // we may have already received a strong rejoin from the sender.
4344 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
11fdf7f2 4345 ceph_assert(gather_locks.empty());
7c673cae
FG
4346
4347 // check cap exports.
4348 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
11fdf7f2
TL
4349 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4350 weak->client_metadata_map.end());
7c673cae
FG
4351
4352 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4353 CInode *in = get_inode(p->first);
11fdf7f2 4354 ceph_assert(!in || in->is_auth());
7c673cae
FG
4355 // note
4356 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4357 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4358 cap_imports[p->first][q->first][from] = q->second;
4359 }
4360 }
4361 }
4362
4363 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4364 for (const auto &p : weak->inode_scatterlocks) {
4365 CInode *in = get_inode(p.first);
4366 ceph_assert(in);
4367 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4368 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4369 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4370 if (!survivor)
4371 rejoin_potential_updated_scatterlocks.insert(in);
4372 }
4373
4374 // recovering peer may send incorrect dirfrags here. we need to
4375 // infer which dirfrag they meant. the ack will include a
4376 // strong_dirfrag that will set them straight on the fragmentation.
4377
4378 // walk weak map
4379 set<CDir*> dirs_to_share;
11fdf7f2
TL
4380 for (const auto &p : weak->weak_dirfrags) {
4381 CInode *diri = get_inode(p.ino);
7c673cae 4382 if (!diri)
11fdf7f2
TL
4383 dout(0) << " missing dir ino " << p.ino << dendl;
4384 ceph_assert(diri);
7c673cae 4385
11fdf7f2
TL
4386 frag_vec_t leaves;
4387 if (diri->dirfragtree.is_leaf(p.frag)) {
4388 leaves.push_back(p.frag);
7c673cae 4389 } else {
11fdf7f2
TL
4390 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4391 if (leaves.empty())
4392 leaves.push_back(diri->dirfragtree[p.frag.value()]);
7c673cae 4393 }
11fdf7f2
TL
4394 for (const auto& leaf : leaves) {
4395 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4396 if (!dir) {
11fdf7f2 4397 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
7c673cae
FG
4398 continue;
4399 }
11fdf7f2 4400 ceph_assert(dir);
7c673cae 4401 if (dirs_to_share.count(dir)) {
11fdf7f2 4402 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4403 } else {
4404 dirs_to_share.insert(dir);
4405 unsigned nonce = dir->add_replica(from);
11fdf7f2 4406 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
7c673cae
FG
4407 if (ack) {
4408 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4409 ack->add_dirfrag_base(dir);
4410 }
4411 }
4412 }
4413 }
4414
11fdf7f2
TL
4415 for (const auto &p : weak->weak) {
4416 CInode *diri = get_inode(p.first);
7c673cae 4417 if (!diri)
11fdf7f2
TL
4418 dout(0) << " missing dir ino " << p.first << dendl;
4419 ceph_assert(diri);
7c673cae
FG
4420
4421 // weak dentries
4422 CDir *dir = 0;
11fdf7f2 4423 for (const auto &q : p.second) {
7c673cae
FG
4424 // locate proper dirfrag.
4425 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
11fdf7f2 4426 frag_t fg = diri->pick_dirfrag(q.first.name);
7c673cae
FG
4427 if (!dir || dir->get_frag() != fg) {
4428 dir = diri->get_dirfrag(fg);
4429 if (!dir)
4430 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
11fdf7f2
TL
4431 ceph_assert(dir);
4432 ceph_assert(dirs_to_share.count(dir));
7c673cae
FG
4433 }
4434
4435 // and dentry
11fdf7f2
TL
4436 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4437 ceph_assert(dn);
7c673cae 4438 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 4439 ceph_assert(dnl->is_primary());
7c673cae
FG
4440
4441 if (survivor && dn->is_replica(from))
4442 dentry_remove_replica(dn, from, gather_locks);
4443 unsigned dnonce = dn->add_replica(from);
4444 dout(10) << " have " << *dn << dendl;
4445 if (ack)
94b18763 4446 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
4447 dnl->get_inode()->ino(), inodeno_t(0), 0,
4448 dnonce, dn->lock.get_replica_state());
4449
4450 // inode
4451 CInode *in = dnl->get_inode();
11fdf7f2 4452 ceph_assert(in);
7c673cae
FG
4453
4454 if (survivor && in->is_replica(from))
4455 inode_remove_replica(in, from, true, gather_locks);
4456 unsigned inonce = in->add_replica(from);
4457 dout(10) << " have " << *in << dendl;
4458
4459 // scatter the dirlock, just in case?
4460 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4461 in->filelock.set_state(LOCK_MIX);
4462
4463 if (ack) {
4464 acked_inodes.insert(in->vino());
4465 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4466 bufferlist bl;
4467 in->_encode_locks_state_for_rejoin(bl, from);
4468 ack->add_inode_locks(in, inonce, bl);
4469 }
4470 }
4471 }
4472
4473 // weak base inodes? (root, stray, etc.)
4474 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4475 p != weak->weak_inodes.end();
4476 ++p) {
4477 CInode *in = get_inode(*p);
11fdf7f2 4478 ceph_assert(in); // hmm fixme wrt stray?
7c673cae
FG
4479 if (survivor && in->is_replica(from))
4480 inode_remove_replica(in, from, true, gather_locks);
4481 unsigned inonce = in->add_replica(from);
4482 dout(10) << " have base " << *in << dendl;
4483
4484 if (ack) {
4485 acked_inodes.insert(in->vino());
4486 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4487 bufferlist bl;
4488 in->_encode_locks_state_for_rejoin(bl, from);
4489 ack->add_inode_locks(in, inonce, bl);
4490 }
4491 }
4492
11fdf7f2 4493 ceph_assert(rejoin_gather.count(from));
7c673cae
FG
4494 rejoin_gather.erase(from);
4495 if (survivor) {
4496 // survivor. do everything now.
11fdf7f2
TL
4497 for (const auto &p : weak->inode_scatterlocks) {
4498 CInode *in = get_inode(p.first);
4499 ceph_assert(in);
7c673cae
FG
4500 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4501 acked_inodes.insert(in->vino());
4502 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4503 }
4504
4505 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4506 mds->send_message(ack, weak->get_connection());
4507
4508 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4509 if (!(*p)->is_stable())
4510 mds->locker->eval_gather(*p);
4511 }
4512 } else {
4513 // done?
28e407b8 4514 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4515 rejoin_gather_finish();
4516 } else {
4517 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4518 }
4519 }
4520}
4521
7c673cae
FG
4522/*
4523 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4524 *
4525 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4526 * ack, the replica dne, and we can remove it from our replica maps.
4527 */
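// Generic sketch of the rule stated above (template and name invented for
// illustration, and the no-ack case is ignored; the real pass below walks
// inodes, dirfrags and dentries explicitly): an object replicated to 'from'
// stays replicated only if the ack we are sending mentions it, otherwise
// 'from' is dropped from its replica map.
template <typename Obj, typename Key>
static bool keep_replica_sketch(Obj *o, mds_rank_t from,
                                const set<Key> &acked, const Key &key)
{
  return o->is_replica(from) && acked.count(key) > 0;
}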
11fdf7f2 4528void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
7c673cae
FG
4529 set<vinodeno_t>& acked_inodes,
4530 set<SimpleLock *>& gather_locks)
4531{
4532 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4533
b32b8144 4534 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
7c673cae
FG
4535 // inode?
4536 if (in->is_auth() &&
4537 in->is_replica(from) &&
b32b8144 4538 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
7c673cae
FG
4539 inode_remove_replica(in, from, false, gather_locks);
4540 dout(10) << " rem " << *in << dendl;
4541 }
4542
b32b8144
FG
4543 if (!in->is_dir())
4544 return;
7c673cae
FG
4545
4546 list<CDir*> dfs;
4547 in->get_dirfrags(dfs);
4548 for (list<CDir*>::iterator p = dfs.begin();
4549 p != dfs.end();
4550 ++p) {
4551 CDir *dir = *p;
181888fb
FG
4552 if (!dir->is_auth())
4553 continue;
7c673cae 4554
181888fb 4555 if (dir->is_replica(from) &&
7c673cae
FG
4556 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4557 dir->remove_replica(from);
4558 dout(10) << " rem " << *dir << dendl;
4559 }
4560
4561 // dentries
94b18763
FG
4562 for (auto &p : dir->items) {
4563 CDentry *dn = p.second;
7c673cae 4564
11fdf7f2
TL
4565 if (dn->is_replica(from)) {
4566 if (ack) {
4567 const auto it = ack->strong_dentries.find(dir->dirfrag());
4568 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4569 continue;
4570 }
4571 }
7c673cae
FG
4572 dentry_remove_replica(dn, from, gather_locks);
4573 dout(10) << " rem " << *dn << dendl;
4574 }
4575 }
4576 }
b32b8144
FG
4577 };
4578
94b18763 4579 for (auto &p : inode_map)
b32b8144 4580 scour_func(p.second);
94b18763 4581 for (auto &p : snap_inode_map)
b32b8144 4582 scour_func(p.second);
7c673cae
FG
4583}
4584
4585
4586CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4587{
4588 CInode *in = new CInode(this, true, 1, last);
4589 in->inode.ino = ino;
4590 in->state_set(CInode::STATE_REJOINUNDEF);
4591 add_inode(in);
4592 rejoin_undef_inodes.insert(in);
4593 dout(10) << " invented " << *in << dendl;
4594 return in;
4595}
4596
4597CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4598{
4599 CInode *in = get_inode(df.ino);
4600 if (!in)
4601 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4602 if (!in->is_dir()) {
11fdf7f2 4603 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
7c673cae 4604 in->inode.mode = S_IFDIR;
11fdf7f2 4605 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
4606 }
4607 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4608 dir->state_set(CDir::STATE_REJOINUNDEF);
4609 rejoin_undef_dirfrags.insert(dir);
4610 dout(10) << " invented " << *dir << dendl;
4611 return dir;
4612}
4613
11fdf7f2 4614void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &strong)
7c673cae
FG
4615{
4616 mds_rank_t from = mds_rank_t(strong->get_source().num());
4617
4618 // only a recovering node will get a strong rejoin.
a8e16298
TL
4619 if (!mds->is_rejoin()) {
4620 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4621 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4622 return;
4623 }
11fdf7f2 4624 ceph_abort_msg("got unexpected rejoin message during recovery");
a8e16298 4625 }
7c673cae
FG
4626
4627 // assimilate any potentially dirty scatterlock state
11fdf7f2
TL
4628 for (const auto &p : strong->inode_scatterlocks) {
4629 CInode *in = get_inode(p.first);
4630 ceph_assert(in);
4631 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4632 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4633 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
7c673cae
FG
4634 rejoin_potential_updated_scatterlocks.insert(in);
4635 }
4636
4637 rejoin_unlinked_inodes[from].clear();
4638
4639 // surviving peer may send incorrect dirfrag here (maybe they didn't
4640 // get the fragment notify, or maybe we rolled back?). we need to
4641 // infer the right frag and get them with the program. somehow.
4642 // we don't normally send ACK.. so we'll need to bundle this with
4643 // MISSING or something.
4644
4645 // strong dirfrags/dentries.
4646 // also process auth_pins, xlocks.
11fdf7f2
TL
4647 for (const auto &p : strong->strong_dirfrags) {
4648 auto& dirfrag = p.first;
4649 CInode *diri = get_inode(dirfrag.ino);
7c673cae 4650 if (!diri)
11fdf7f2
TL
4651 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4652 CDir *dir = diri->get_dirfrag(dirfrag.frag);
7c673cae
FG
4653 bool refragged = false;
4654 if (dir) {
4655 dout(10) << " have " << *dir << dendl;
4656 } else {
4657 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4658 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
11fdf7f2
TL
4659 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4660 dir = rejoin_invent_dirfrag(dirfrag);
7c673cae
FG
4661 }
4662 if (dir) {
11fdf7f2
TL
4663 dir->add_replica(from, p.second.nonce);
4664 dir->dir_rep = p.second.dir_rep;
7c673cae 4665 } else {
11fdf7f2
TL
4666 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4667 frag_vec_t leaves;
4668 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4669 if (leaves.empty())
4670 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4671 dout(10) << " maps to frag(s) " << leaves << dendl;
4672 for (const auto& leaf : leaves) {
4673 CDir *dir = diri->get_dirfrag(leaf);
7c673cae 4674 if (!dir)
11fdf7f2 4675 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
7c673cae
FG
4676 else
4677 dout(10) << " have(approx) " << *dir << dendl;
11fdf7f2
TL
4678 dir->add_replica(from, p.second.nonce);
4679 dir->dir_rep = p.second.dir_rep;
7c673cae
FG
4680 }
4681 refragged = true;
4682 }
4683
11fdf7f2
TL
4684 const auto it = strong->strong_dentries.find(dirfrag);
4685 if (it != strong->strong_dentries.end()) {
4686 const map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = it->second;
4687 for (const auto &q : dmap) {
4688 const string_snap_t& ss = q.first;
4689 const MMDSCacheRejoin::dn_strong& d = q.second;
4690 CDentry *dn;
4691 if (!refragged)
4692 dn = dir->lookup(ss.name, ss.snapid);
4693 else {
4694 frag_t fg = diri->pick_dirfrag(ss.name);
4695 dir = diri->get_dirfrag(fg);
4696 ceph_assert(dir);
4697 dn = dir->lookup(ss.name, ss.snapid);
4698 }
4699 if (!dn) {
4700 if (d.is_remote()) {
4701 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4702 } else if (d.is_null()) {
4703 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4704 } else {
4705 CInode *in = get_inode(d.ino, ss.snapid);
4706 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4707 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4708 }
4709 dout(10) << " invented " << *dn << dendl;
4710 }
4711 CDentry::linkage_t *dnl = dn->get_linkage();
4712
4713 // dn auth_pin?
4714 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4715 if (pinned_it != strong->authpinned_dentries.end()) {
4716 const auto slave_reqid_it = pinned_it->second.find(ss);
4717 if (slave_reqid_it != pinned_it->second.end()) {
4718 for (const auto &r : slave_reqid_it->second) {
4719 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4720
4721 // get/create slave mdrequest
4722 MDRequestRef mdr;
4723 if (have_request(r.reqid))
4724 mdr = request_get(r.reqid);
4725 else
4726 mdr = request_start_slave(r.reqid, r.attempt, strong);
4727 mdr->auth_pin(dn);
4728 }
4729 }
7c673cae 4730 }
7c673cae 4731
11fdf7f2
TL
4732 // dn xlock?
4733 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4734 if (xlocked_it != strong->xlocked_dentries.end()) {
4735 const auto ss_req_it = xlocked_it->second.find(ss);
4736 if (ss_req_it != xlocked_it->second.end()) {
4737 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4738 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4739 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4740 ceph_assert(mdr->is_auth_pinned(dn));
4741 if (!mdr->is_xlocked(&dn->versionlock)) {
4742 ceph_assert(dn->versionlock.can_xlock_local());
4743 dn->versionlock.get_xlock(mdr, mdr->get_client());
4744 mdr->locks.emplace(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4745 }
4746 if (dn->lock.is_stable())
4747 dn->auth_pin(&dn->lock);
4748 dn->lock.set_state(LOCK_XLOCK);
4749 dn->lock.get_xlock(mdr, mdr->get_client());
4750 mdr->locks.emplace(&dn->lock, MutationImpl::LockOp::XLOCK);
4751 }
4752 }
7c673cae 4753
11fdf7f2
TL
4754 dn->add_replica(from, d.nonce);
4755 dout(10) << " have " << *dn << dendl;
4756
4757 if (dnl->is_primary()) {
4758 if (d.is_primary()) {
4759 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4760 // the survivor missed MDentryUnlink+MDentryLink messages ?
4761 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4762 CInode *in = get_inode(d.ino, ss.snapid);
4763 ceph_assert(in);
4764 ceph_assert(in->get_parent_dn());
4765 rejoin_unlinked_inodes[from].insert(in);
4766 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4767 }
4768 } else {
4769 // the survivor missed MDentryLink message ?
4770 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4771 dout(7) << " sender doesn't have primay dentry" << dendl;
4772 }
4773 } else {
4774 if (d.is_primary()) {
4775 // the survivor missed MDentryUnlink message ?
4776 CInode *in = get_inode(d.ino, ss.snapid);
4777 ceph_assert(in);
4778 ceph_assert(in->get_parent_dn());
7c673cae 4779 rejoin_unlinked_inodes[from].insert(in);
11fdf7f2 4780 dout(7) << " sender has primary dentry but we don't" << dendl;
7c673cae 4781 }
11fdf7f2 4782 }
7c673cae
FG
4783 }
4784 }
4785 }
4786
11fdf7f2
TL
4787 for (const auto &p : strong->strong_inodes) {
4788 CInode *in = get_inode(p.first);
4789 ceph_assert(in);
4790 in->add_replica(from, p.second.nonce);
7c673cae
FG
4791 dout(10) << " have " << *in << dendl;
4792
11fdf7f2 4793 const MMDSCacheRejoin::inode_strong& is = p.second;
7c673cae
FG
4794
4795 // caps_wanted
4796 if (is.caps_wanted) {
11fdf7f2 4797 in->set_mds_caps_wanted(from, is.caps_wanted);
7c673cae
FG
4798 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4799 << " on " << *in << dendl;
4800 }
4801
4802 // scatterlocks?
4803 // infer state from replica state:
4804 // * go to MIX if they might have wrlocks
4805 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4806 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4807 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4808 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4809
4810 // auth pin?
11fdf7f2
TL
4811 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4812 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4813 for (const auto& r : authpinned_inodes_it->second) {
4814 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
7c673cae
FG
4815
4816 // get/create slave mdrequest
4817 MDRequestRef mdr;
11fdf7f2
TL
4818 if (have_request(r.reqid))
4819 mdr = request_get(r.reqid);
7c673cae 4820 else
11fdf7f2 4821 mdr = request_start_slave(r.reqid, r.attempt, strong);
7c673cae 4822 if (strong->frozen_authpin_inodes.count(in->vino())) {
11fdf7f2 4823 ceph_assert(!in->get_num_auth_pins());
7c673cae
FG
4824 mdr->freeze_auth_pin(in);
4825 } else {
11fdf7f2 4826 ceph_assert(!in->is_frozen_auth_pin());
7c673cae
FG
4827 }
4828 mdr->auth_pin(in);
4829 }
4830 }
4831 // xlock(s)?
11fdf7f2
TL
4832 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4833 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4834 for (const auto &q : xlocked_inodes_it->second) {
4835 SimpleLock *lock = in->get_lock(q.first);
4836 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4837 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4838 ceph_assert(mdr->is_auth_pinned(in));
4839 if (!mdr->is_xlocked(&in->versionlock)) {
4840 ceph_assert(in->versionlock.can_xlock_local());
7c673cae 4841 in->versionlock.get_xlock(mdr, mdr->get_client());
11fdf7f2 4842 mdr->locks.emplace(&in->versionlock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4843 }
4844 if (lock->is_stable())
4845 in->auth_pin(lock);
4846 lock->set_state(LOCK_XLOCK);
4847 if (lock == &in->filelock)
4848 in->loner_cap = -1;
4849 lock->get_xlock(mdr, mdr->get_client());
11fdf7f2 4850 mdr->locks.emplace(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
4851 }
4852 }
4853 }
4854 // wrlock(s)?
11fdf7f2
TL
4855 for (const auto &p : strong->wrlocked_inodes) {
4856 CInode *in = get_inode(p.first);
4857 for (const auto &q : p.second) {
4858 SimpleLock *lock = in->get_lock(q.first);
4859 for (const auto &r : q.second) {
4860 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4861 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
7c673cae 4862 if (in->is_auth())
11fdf7f2 4863 ceph_assert(mdr->is_auth_pinned(in));
7c673cae
FG
4864 lock->set_state(LOCK_MIX);
4865 if (lock == &in->filelock)
4866 in->loner_cap = -1;
4867 lock->get_wrlock(true);
11fdf7f2 4868 mdr->locks.emplace(lock, MutationImpl::LockOp::WRLOCK);
7c673cae
FG
4869 }
4870 }
4871 }
4872
4873 // done?
11fdf7f2 4874 ceph_assert(rejoin_gather.count(from));
7c673cae 4875 rejoin_gather.erase(from);
28e407b8 4876 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
7c673cae
FG
4877 rejoin_gather_finish();
4878 } else {
4879 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4880 }
4881}
4882
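// Handle a peer's rejoin ACK: refragment dirfrags to match the auth's view,
// repair stale dentry linkage, decode full dirfrag/inode bases and replica
// lock states, then notify clients about caps that were exported to the
// acking rank.  A recovering rank proceeds to open_snaprealms() once all acks
// (and its own gather) are done; a survivor just queues the rejoin waiters.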
11fdf7f2 4883void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &ack)
7c673cae
FG
4884{
4885 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4886 mds_rank_t from = mds_rank_t(ack->get_source().num());
4887
11fdf7f2 4888 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
b32b8144
FG
4889 bool survivor = !mds->is_rejoin();
4890
7c673cae
FG
4891 // for sending cache expire message
4892 set<CInode*> isolated_inodes;
4893 set<CInode*> refragged_inodes;
11fdf7f2 4894 list<pair<CInode*,int> > updated_realms;
7c673cae
FG
4895
4896 // dirs
11fdf7f2 4897 for (const auto &p : ack->strong_dirfrags) {
7c673cae
FG
4898 // we may have had incorrect dir fragmentation; refragment based
4899 // on what the auth tells us.
11fdf7f2 4900 CDir *dir = get_dirfrag(p.first);
7c673cae 4901 if (!dir) {
11fdf7f2 4902 dir = get_force_dirfrag(p.first, false);
7c673cae
FG
4903 if (dir)
4904 refragged_inodes.insert(dir->get_inode());
4905 }
4906 if (!dir) {
11fdf7f2 4907 CInode *diri = get_inode(p.first.ino);
7c673cae
FG
4908 if (!diri) {
4909 // barebones inode; the full inode loop below will clean up.
4910 diri = new CInode(this, false);
11fdf7f2 4911 diri->inode.ino = p.first.ino;
7c673cae 4912 diri->inode.mode = S_IFDIR;
11fdf7f2 4913 diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 4914 add_inode(diri);
11fdf7f2 4915 if (MDS_INO_MDSDIR(from) == p.first.ino) {
7c673cae
FG
4916 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4917 dout(10) << " add inode " << *diri << dendl;
4918 } else {
4919 diri->inode_auth = CDIR_AUTH_DEFAULT;
4920 isolated_inodes.insert(diri);
11fdf7f2 4921 dout(10) << " unconnected dirfrag " << p.first << dendl;
7c673cae
FG
4922 }
4923 }
4924 // barebones dirfrag; the full dirfrag loop below will clean up.
11fdf7f2
TL
4925 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4926 if (MDS_INO_MDSDIR(from) == p.first.ino ||
7c673cae
FG
4927 (dir->authority() != CDIR_AUTH_UNDEF &&
4928 dir->authority().first != from))
4929 adjust_subtree_auth(dir, from);
4930 dout(10) << " add dirfrag " << *dir << dendl;
4931 }
4932
11fdf7f2 4933 dir->set_replica_nonce(p.second.nonce);
7c673cae
FG
4934 dir->state_clear(CDir::STATE_REJOINING);
4935 dout(10) << " got " << *dir << dendl;
4936
4937 // dentries
11fdf7f2
TL
4938 auto it = ack->strong_dentries.find(p.first);
4939 if (it != ack->strong_dentries.end()) {
4940 for (const auto &q : it->second) {
4941 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4942 if(!dn)
4943 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
4944
4945 CDentry::linkage_t *dnl = dn->get_linkage();
4946
4947 ceph_assert(dn->last == q.first.snapid);
4948 if (dn->first != q.second.first) {
4949 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
4950 dn->first = q.second.first;
4951 }
7c673cae 4952
11fdf7f2
TL
4953 // may have bad linkage if we missed dentry link/unlink messages
4954 if (dnl->is_primary()) {
4955 CInode *in = dnl->get_inode();
4956 if (!q.second.is_primary() ||
4957 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
4958 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4959 dir->unlink_inode(dn);
4960 }
4961 } else if (dnl->is_remote()) {
4962 if (!q.second.is_remote() ||
4963 q.second.remote_ino != dnl->get_remote_ino() ||
4964 q.second.remote_d_type != dnl->get_remote_d_type()) {
4965 dout(10) << " had bad linkage for " << *dn << dendl;
4966 dir->unlink_inode(dn);
4967 }
4968 } else {
4969 if (!q.second.is_null())
4970 dout(10) << " had bad linkage for " << *dn << dendl;
4971 }
7c673cae 4972
11fdf7f2
TL
4973 // hmm, did we have the proper linkage here?
4974 if (dnl->is_null() && !q.second.is_null()) {
4975 if (q.second.is_remote()) {
4976 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
4977 } else {
4978 CInode *in = get_inode(q.second.ino, q.first.snapid);
4979 if (!in) {
4980 // barebones inode; assume it's a dir, the full inode loop below will clean up.
4981 in = new CInode(this, false, q.second.first, q.first.snapid);
4982 in->inode.ino = q.second.ino;
4983 in->inode.mode = S_IFDIR;
4984 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4985 add_inode(in);
4986 dout(10) << " add inode " << *in << dendl;
4987 } else if (in->get_parent_dn()) {
4988 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
4989 << ", unlinking " << *in << dendl;
4990 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
4991 }
4992 dn->dir->link_primary_inode(dn, in);
4993 isolated_inodes.erase(in);
7c673cae 4994 }
11fdf7f2 4995 }
7c673cae 4996
11fdf7f2
TL
4997 dn->set_replica_nonce(q.second.nonce);
4998 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
4999 dn->state_clear(CDentry::STATE_REJOINING);
5000 dout(10) << " got " << *dn << dendl;
5001 }
7c673cae
FG
5002 }
5003 }
5004
5005 for (set<CInode*>::iterator p = refragged_inodes.begin();
5006 p != refragged_inodes.end();
5007 ++p) {
5008 list<CDir*> ls;
5009 (*p)->get_nested_dirfrags(ls);
5010 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
5011 if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
5012 continue;
11fdf7f2 5013 ceph_assert((*q)->get_num_any() == 0);
7c673cae
FG
5014 (*p)->close_dirfrag((*q)->get_frag());
5015 }
5016 }
5017
5018 // full dirfrags
11fdf7f2
TL
5019 for (const auto &p : ack->dirfrag_bases) {
5020 CDir *dir = get_dirfrag(p.first);
5021 ceph_assert(dir);
5022 auto q = p.second.cbegin();
7c673cae
FG
5023 dir->_decode_base(q);
5024 dout(10) << " got dir replica " << *dir << dendl;
5025 }
5026
5027 // full inodes
11fdf7f2 5028 auto p = ack->inode_base.cbegin();
7c673cae
FG
5029 while (!p.end()) {
5030 inodeno_t ino;
5031 snapid_t last;
5032 bufferlist basebl;
11fdf7f2
TL
5033 decode(ino, p);
5034 decode(last, p);
5035 decode(basebl, p);
7c673cae 5036 CInode *in = get_inode(ino, last);
11fdf7f2
TL
5037 ceph_assert(in);
5038 auto q = basebl.cbegin();
5039 snapid_t sseq = 0;
5040 if (in->snaprealm)
5041 sseq = in->snaprealm->srnode.seq;
7c673cae 5042 in->_decode_base(q);
11fdf7f2
TL
5043 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5044 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5045 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5046 }
7c673cae
FG
5047 dout(10) << " got inode base " << *in << dendl;
5048 }
5049
5050 // inodes
11fdf7f2 5051 p = ack->inode_locks.cbegin();
7c673cae
FG
5052 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5053 while (!p.end()) {
5054 inodeno_t ino;
5055 snapid_t last;
5056 __u32 nonce;
5057 bufferlist lockbl;
11fdf7f2
TL
5058 decode(ino, p);
5059 decode(last, p);
5060 decode(nonce, p);
5061 decode(lockbl, p);
7c673cae
FG
5062
5063 CInode *in = get_inode(ino, last);
11fdf7f2 5064 ceph_assert(in);
7c673cae 5065 in->set_replica_nonce(nonce);
11fdf7f2 5066 auto q = lockbl.cbegin();
b32b8144 5067 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
7c673cae
FG
5068 in->state_clear(CInode::STATE_REJOINING);
5069 dout(10) << " got inode locks " << *in << dendl;
5070 }
5071
5072 // FIXME: This can happen if the entire subtree, together with the inode the
5073 // subtree root belongs to, was trimmed between sending the cache rejoin and receiving the rejoin ack.
11fdf7f2 5074 ceph_assert(isolated_inodes.empty());
7c673cae
FG
5075
5076 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
11fdf7f2
TL
5077 auto bp = ack->imported_caps.cbegin();
5078 decode(peer_imported, bp);
7c673cae
FG
5079
5080 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5081 p != peer_imported.end();
5082 ++p) {
28e407b8 5083 auto& ex = cap_exports.at(p->first);
11fdf7f2 5084 ceph_assert(ex.first == from);
7c673cae
FG
5085 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5086 q != p->second.end();
5087 ++q) {
28e407b8 5088 auto r = ex.second.find(q->first);
11fdf7f2 5089 ceph_assert(r != ex.second.end());
7c673cae
FG
5090
5091 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5092 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
28e407b8
AA
5093 if (!session) {
5094 dout(10) << " no session for client." << q->first << dendl;
5095 ex.second.erase(r);
5096 continue;
5097 }
7c673cae
FG
5098
5099 // mark client caps stale.
11fdf7f2 5100 auto m = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0,
28e407b8 5101 r->second.capinfo.cap_id, 0,
7c673cae
FG
5102 mds->get_osd_epoch_barrier());
5103 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5104 (q->second.cap_id > 0 ? from : -1), 0);
5105 mds->send_message_client_counted(m, session);
5106
28e407b8 5107 ex.second.erase(r);
7c673cae 5108 }
11fdf7f2
TL
5109 ceph_assert(ex.second.empty());
5110 }
5111
5112 for (auto p : updated_realms) {
5113 CInode *in = p.first;
5114 bool notify_clients;
5115 if (mds->is_rejoin()) {
5116 if (!rejoin_pending_snaprealms.count(in)) {
5117 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5118 rejoin_pending_snaprealms.insert(in);
5119 }
5120 notify_clients = false;
5121 } else {
5122 // notify clients if I'm survivor
5123 notify_clients = true;
5124 }
5125 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
7c673cae
FG
5126 }
5127
5128 // done?
11fdf7f2 5129 ceph_assert(rejoin_ack_gather.count(from));
7c673cae 5130 rejoin_ack_gather.erase(from);
b32b8144 5131 if (!survivor) {
7c673cae
FG
5132 if (rejoin_gather.empty()) {
5133 // eval unstable scatter locks after all wrlocks are rejoined.
5134 while (!rejoin_eval_locks.empty()) {
5135 SimpleLock *lock = rejoin_eval_locks.front();
5136 rejoin_eval_locks.pop_front();
5137 if (!lock->is_stable())
5138 mds->locker->eval_gather(lock);
5139 }
5140 }
5141
5142 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5143 rejoin_ack_gather.empty()) {
5144 // finally, kickstart past snap parent opens
11fdf7f2 5145 open_snaprealms();
7c673cae
FG
5146 } else {
5147 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5148 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5149 }
5150 } else {
5151 // survivor.
5152 mds->queue_waiters(rejoin_waiters);
5153 }
5154}
5155
5156/**
5157 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5158 *
5159 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5160 * messages that clean these inodes up...
5161 */
5162void MDCache::rejoin_trim_undef_inodes()
5163{
5164 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5165
5166 while (!rejoin_undef_inodes.empty()) {
5167 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5168 CInode *in = *p;
5169 rejoin_undef_inodes.erase(p);
5170
5171 in->clear_replica_map();
5172
5173 // close out dirfrags
5174 if (in->is_dir()) {
5175 list<CDir*> dfls;
5176 in->get_dirfrags(dfls);
5177 for (list<CDir*>::iterator p = dfls.begin();
5178 p != dfls.end();
5179 ++p) {
5180 CDir *dir = *p;
5181 dir->clear_replica_map();
5182
94b18763
FG
5183 for (auto &p : dir->items) {
5184 CDentry *dn = p.second;
7c673cae
FG
5185 dn->clear_replica_map();
5186
5187 dout(10) << " trimming " << *dn << dendl;
5188 dir->remove_dentry(dn);
5189 }
5190
5191 dout(10) << " trimming " << *dir << dendl;
5192 in->close_dirfrag(dir->dirfrag().frag);
5193 }
5194 }
5195
5196 CDentry *dn = in->get_parent_dn();
5197 if (dn) {
5198 dn->clear_replica_map();
5199 dout(10) << " trimming " << *dn << dendl;
5200 dn->dir->remove_dentry(dn);
5201 } else {
5202 dout(10) << " trimming " << *in << dendl;
5203 remove_inode(in);
5204 }
5205 }
5206
11fdf7f2 5207 ceph_assert(rejoin_undef_inodes.empty());
7c673cae
FG
5208}
5209
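// Called once rejoin messages from all recovering peers have been gathered.
// open_undef_inodes_dirfrags() and process_imported_caps() may need to block
// on I/O; each returns true in that case and this path is re-entered when the
// I/O completes.  Otherwise choose lock states from the reconnected caps,
// identify files needing size recovery, and send our rejoin acks.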
5210void MDCache::rejoin_gather_finish()
5211{
5212 dout(10) << "rejoin_gather_finish" << dendl;
11fdf7f2
TL
5213 ceph_assert(mds->is_rejoin());
5214 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae
FG
5215
5216 if (open_undef_inodes_dirfrags())
5217 return;
5218
5219 if (process_imported_caps())
5220 return;
5221
5222 choose_lock_states_and_reconnect_caps();
5223
5224 identify_files_to_recover();
5225 rejoin_send_acks();
5226
5227 // signal completion of fetches, rejoin_gather_finish, etc.
7c673cae
FG
5228 rejoin_ack_gather.erase(mds->get_nodeid());
5229
5230 // did we already get our acks too?
5231 if (rejoin_ack_gather.empty()) {
11fdf7f2
TL
5232 // finally, open snaprealms
5233 open_snaprealms();
7c673cae
FG
5234 }
5235}
5236
5237class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5238 inodeno_t ino;
5239public:
5240 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5241 void finish(int r) override {
5242 mdcache->rejoin_open_ino_finish(ino, r);
5243 }
5244};
5245
5246void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5247{
5248 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5249
5250 if (ret < 0) {
5251 cap_imports_missing.insert(ino);
5252 } else if (ret == mds->get_nodeid()) {
11fdf7f2 5253 ceph_assert(get_inode(ino));
7c673cae
FG
5254 } else {
5255 auto p = cap_imports.find(ino);
11fdf7f2 5256 ceph_assert(p != cap_imports.end());
7c673cae 5257 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
11fdf7f2
TL
5258 ceph_assert(q->second.count(MDS_RANK_NONE));
5259 ceph_assert(q->second.size() == 1);
7c673cae
FG
5260 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5261 }
5262 cap_imports.erase(p);
5263 }
5264
11fdf7f2 5265 ceph_assert(cap_imports_num_opening > 0);
7c673cae
FG
5266 cap_imports_num_opening--;
5267
5268 if (cap_imports_num_opening == 0) {
5269 if (rejoin_gather.empty())
5270 rejoin_gather_finish();
5271 else if (rejoin_gather.count(mds->get_nodeid()))
5272 process_imported_caps();
5273 }
5274}
5275
5276class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5277public:
28e407b8
AA
5278 map<client_t,pair<Session*,uint64_t> > session_map;
5279 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
7c673cae 5280 void finish(int r) override {
11fdf7f2 5281 ceph_assert(r == 0);
28e407b8 5282 mdcache->rejoin_open_sessions_finish(session_map);
7c673cae
FG
5283 }
5284};
5285
28e407b8 5286void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
7c673cae
FG
5287{
5288 dout(10) << "rejoin_open_sessions_finish" << dendl;
28e407b8
AA
5289 mds->server->finish_force_open_sessions(session_map);
5290 rejoin_session_map.swap(session_map);
7c673cae
FG
5291 if (rejoin_gather.empty())
5292 rejoin_gather_finish();
5293}
5294
11fdf7f2
TL
5295void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5296{
5297 auto p = cap_imports.find(ino);
5298 if (p != cap_imports.end()) {
5299 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5300 if (ret < 0) {
5301 cap_imports_missing.insert(ino);
5302 } else if (ret != mds->get_nodeid()) {
5303 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5304 ceph_assert(q->second.count(MDS_RANK_NONE));
5305 ceph_assert(q->second.size() == 1);
5306 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5307 }
5308 cap_imports.erase(p);
5309 }
5310 }
5311}
5312
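// Reconnect client caps gathered during rejoin: prefetch the open file table,
// open any inodes missing from cache, force-open sessions for reconnecting
// clients (journaled via ESessions), merge caps exported to us by slave
// renames, and finally reconnect the remaining cap imports.  Returns true
// whenever it had to block and will be re-invoked later.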
7c673cae
FG
5313bool MDCache::process_imported_caps()
5314{
5315 dout(10) << "process_imported_caps" << dendl;
5316
11fdf7f2
TL
5317 if (!open_file_table.is_prefetched() &&
5318 open_file_table.prefetch_inodes()) {
5319 open_file_table.wait_for_prefetch(
5320 new MDSInternalContextWrapper(mds,
5321 new FunctionContext([this](int r) {
5322 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5323 process_imported_caps();
5324 })
5325 )
5326 );
5327 return true;
5328 }
5329
7c673cae
FG
5330 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5331 CInode *in = get_inode(p->first);
5332 if (in) {
11fdf7f2 5333 ceph_assert(in->is_auth());
7c673cae
FG
5334 cap_imports_missing.erase(p->first);
5335 continue;
5336 }
5337 if (cap_imports_missing.count(p->first) > 0)
5338 continue;
5339
5340 cap_imports_num_opening++;
5341 dout(10) << " opening missing ino " << p->first << dendl;
5342 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
28e407b8
AA
5343 if (!(cap_imports_num_opening % 1000))
5344 mds->heartbeat_reset();
7c673cae
FG
5345 }
5346
5347 if (cap_imports_num_opening > 0)
5348 return true;
5349
5350 // called by rejoin_gather_finish() ?
5351 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
28e407b8
AA
5352 if (!rejoin_client_map.empty() &&
5353 rejoin_session_map.empty()) {
5354 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5355 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
11fdf7f2 5356 rejoin_client_metadata_map,
28e407b8 5357 finish->session_map);
11fdf7f2
TL
5358 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5359 std::move(rejoin_client_metadata_map));
5360 mds->mdlog->start_submit_entry(le, finish);
28e407b8
AA
5361 mds->mdlog->flush();
5362 rejoin_client_map.clear();
11fdf7f2 5363 rejoin_client_metadata_map.clear();
28e407b8 5364 return true;
7c673cae 5365 }
7c673cae
FG
5366
5367 // process caps that were exported by slave rename
5368 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5369 p != rejoin_slave_exports.end();
5370 ++p) {
5371 CInode *in = get_inode(p->first);
11fdf7f2 5372 ceph_assert(in);
7c673cae
FG
5373 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5374 q != p->second.second.end();
5375 ++q) {
28e407b8
AA
5376 auto r = rejoin_session_map.find(q->first);
5377 if (r == rejoin_session_map.end())
5378 continue;
7c673cae 5379
28e407b8 5380 Session *session = r->second.first;
7c673cae 5381 Capability *cap = in->get_client_cap(q->first);
11fdf7f2 5382 if (!cap) {
7c673cae 5383 cap = in->add_client_cap(q->first, session);
11fdf7f2
TL
5384 // add empty item to reconnected_caps
5385 (void)reconnected_caps[p->first][q->first];
5386 }
7c673cae
FG
5387 cap->merge(q->second, true);
5388
5389 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
11fdf7f2
TL
5390 ceph_assert(cap->get_last_seq() == im.issue_seq);
5391 ceph_assert(cap->get_mseq() == im.mseq);
7c673cae
FG
5392 cap->set_cap_id(im.cap_id);
5393 // send cap import because we assigned a new cap ID
5394 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5395 p->second.first, CEPH_CAP_FLAG_AUTH);
5396 }
5397 }
5398 rejoin_slave_exports.clear();
5399 rejoin_imported_caps.clear();
5400
5401 // process cap imports
5402 // ino -> client -> frommds -> capex
5403 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5404 CInode *in = get_inode(p->first);
5405 if (!in) {
5406 dout(10) << " still missing ino " << p->first
5407 << ", will try again after replayed client requests" << dendl;
5408 ++p;
5409 continue;
5410 }
11fdf7f2 5411 ceph_assert(in->is_auth());
7c673cae 5412 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
28e407b8
AA
5413 Session *session;
5414 {
5415 auto r = rejoin_session_map.find(q->first);
5416 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5417 }
5418
7c673cae 5419 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
28e407b8
AA
5420 if (!session) {
5421 if (r->first >= 0)
5422 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5423 continue;
5424 }
5425
7c673cae
FG
5426 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5427 add_reconnected_cap(q->first, in->ino(), r->second);
5428 if (r->first >= 0) {
5429 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5430 cap->inc_mseq();
5431 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5432
5433 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5434 im.cap_id = cap->get_cap_id();
5435 im.issue_seq = cap->get_last_seq();
5436 im.mseq = cap->get_mseq();
5437 }
5438 }
5439 }
5440 cap_imports.erase(p++); // remove and move on
5441 }
5442 } else {
5443 trim_non_auth();
5444
11fdf7f2 5445 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
7c673cae 5446 rejoin_gather.erase(mds->get_nodeid());
11fdf7f2 5447 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
7c673cae 5448 maybe_send_pending_rejoins();
7c673cae
FG
5449 }
5450 return false;
5451}
5452
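// Re-register pending snapflushes for snapshots taken between snap_follows
// and the head inode's first snapid: each intervening snapped inode is added
// to the head's client_need_snapflush set and the relevant locks are put into
// LOCK_SNAP_SYNC so the client's snap cap flush can be gathered.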
7c673cae
FG
5453void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5454 client_t client, snapid_t snap_follows)
5455{
5456 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5457
11fdf7f2
TL
5458 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5459 return;
5460
7c673cae
FG
5461 const set<snapid_t>& snaps = realm->get_snaps();
5462 snapid_t follows = snap_follows;
5463
5464 while (true) {
5465 CInode *in = pick_inode_snap(head_in, follows);
5466 if (in == head_in)
5467 break;
11fdf7f2
TL
5468
5469 bool need_snapflush = false;
5470 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5471 p != snaps.end() && *p <= in->last;
5472 ++p) {
5473 head_in->add_need_snapflush(in, *p, client);
5474 need_snapflush = true;
5475 }
5476 follows = in->last;
5477 if (!need_snapflush)
5478 continue;
5479
7c673cae
FG
5480 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5481
11fdf7f2 5482 /* TODO: we can check the reconnected/flushing caps to find
7c673cae
FG
5483 * which locks need gathering */
5484 for (int i = 0; i < num_cinode_locks; i++) {
5485 int lockid = cinode_lock_info[i].lock;
5486 SimpleLock *lock = in->get_lock(lockid);
11fdf7f2 5487 ceph_assert(lock);
7c673cae
FG
5488 in->client_snap_caps[lockid].insert(client);
5489 in->auth_pin(lock);
5490 lock->set_state(LOCK_SNAP_SYNC);
5491 lock->get_wrlock(true);
5492 }
11fdf7f2 5493 mds->locker->mark_need_snapflush_inode(in);
7c673cae
FG
5494 }
5495}
5496
5497/*
5498 * choose lock states based on reconnected caps
5499 */
5500void MDCache::choose_lock_states_and_reconnect_caps()
5501{
5502 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5503
81eedcae 5504 int count = 0;
11fdf7f2
TL
5505 for (auto p : inode_map) {
5506 CInode *in = p.second;
7c673cae
FG
5507 if (in->last != CEPH_NOSNAP)
5508 continue;
5509
5510 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5511 in->mark_dirty_rstat();
5512
7c673cae 5513 int dirty_caps = 0;
11fdf7f2
TL
5514 auto q = reconnected_caps.find(in->ino());
5515 if (q != reconnected_caps.end()) {
5516 for (const auto &it : q->second)
7c673cae
FG
5517 dirty_caps |= it.second.dirty_caps;
5518 }
5519 in->choose_lock_states(dirty_caps);
5520 dout(15) << " chose lock states on " << *in << dendl;
5521
11fdf7f2
TL
5522 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5523 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5524 rejoin_pending_snaprealms.insert(in);
7c673cae 5525 }
81eedcae
TL
5526
5527 if (!(++count % 1000))
5528 mds->heartbeat_reset();
11fdf7f2 5529 }
7c673cae
FG
5530}
5531
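// Build (or extend) the per-client MClientSnap SPLIT message announcing that
// the given inode is moving into the realm rooted at realm->inode.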
5532void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
11fdf7f2 5533 map<client_t,MClientSnap::ref>& splits)
7c673cae 5534{
11fdf7f2
TL
5535 MClientSnap::ref snap;
5536 auto it = splits.find(client);
5537 if (it != splits.end()) {
5538 snap = it->second;
5539 snap->head.op = CEPH_SNAP_OP_SPLIT;
5540 } else {
5541 snap = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
5542 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae 5543 snap->head.split = realm->inode->ino();
11fdf7f2 5544 snap->bl = realm->get_snap_trace();
7c673cae 5545
11fdf7f2
TL
5546 for (const auto& child : realm->open_children)
5547 snap->split_realms.push_back(child->inode->ino());
5548 }
7c673cae
FG
5549 snap->split_inos.push_back(ino);
5550}
5551
11fdf7f2
TL
5552void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5553 map<client_t,MClientSnap::ref>& splits)
5554{
5555 ceph_assert(parent_realm);
5556
5557 vector<inodeno_t> split_inos;
5558 vector<inodeno_t> split_realms;
5559
5560 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5561 !p.end();
5562 ++p)
5563 split_inos.push_back((*p)->ino());
5564 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5565 p != realm->open_children.end();
5566 ++p)
5567 split_realms.push_back((*p)->inode->ino());
5568
5569 for (const auto& p : realm->client_caps) {
5570 ceph_assert(!p.second->empty());
5571 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5572 if (em.second) {
5573 auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
5574 update->head.split = parent_realm->inode->ino();
5575 update->split_inos = split_inos;
5576 update->split_realms = split_realms;
5577 update->bl = parent_realm->get_snap_trace();
5578 em.first->second = std::move(update);
5579 }
5580 }
5581}
5582
5583void MDCache::send_snaps(map<client_t,MClientSnap::ref>& splits)
7c673cae
FG
5584{
5585 dout(10) << "send_snaps" << dendl;
5586
11fdf7f2
TL
5587 for (auto &p : splits) {
5588 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
7c673cae 5589 if (session) {
11fdf7f2
TL
5590 dout(10) << " client." << p.first
5591 << " split " << p.second->head.split
5592 << " inos " << p.second->split_inos
7c673cae 5593 << dendl;
11fdf7f2 5594 mds->send_message_client_counted(p.second, session);
7c673cae 5595 } else {
11fdf7f2 5596 dout(10) << " no session for client." << p.first << dendl;
7c673cae
FG
5597 }
5598 }
5599 splits.clear();
5600}
5601
5602
5603/*
5604 * remove any items from logsegment open_file lists that don't have
5605 * any caps
5606 */
5607void MDCache::clean_open_file_lists()
5608{
5609 dout(10) << "clean_open_file_lists" << dendl;
5610
5611 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5612 p != mds->mdlog->segments.end();
5613 ++p) {
5614 LogSegment *ls = p->second;
5615
5616 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5617 while (!q.end()) {
5618 CInode *in = *q;
5619 ++q;
5620 if (in->last == CEPH_NOSNAP) {
11fdf7f2
TL
5621 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5622 in->item_open_file.remove_myself();
5623 } else {
7c673cae
FG
5624 if (in->client_snap_caps.empty()) {
5625 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5626 in->item_open_file.remove_myself();
5627 }
5628 }
5629 }
5630 }
5631}
5632
11fdf7f2
TL
5633void MDCache::dump_openfiles(Formatter *f)
5634{
5635 f->open_array_section("openfiles");
5636 for (auto p = mds->mdlog->segments.begin();
5637 p != mds->mdlog->segments.end();
5638 ++p) {
5639 LogSegment *ls = p->second;
5640
5641 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5642 while (!q.end()) {
5643 CInode *in = *q;
5644 ++q;
5645 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5646 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5647 continue;
5648 f->open_object_section("file");
5649 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5650 f->close_section();
5651 }
5652 }
5653 f->close_section();
5654}
7c673cae
FG
5655
5656Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5657{
5658 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5659 << " on " << *in << dendl;
5660 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5661 if (!session) {
5662 dout(10) << " no session for client." << client << dendl;
5663 return NULL;
5664 }
5665
5666 Capability *cap = in->reconnect_cap(client, icr, session);
5667
5668 if (frommds >= 0) {
5669 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5670 cap->inc_mseq();
5671 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5672 }
5673
5674 return cap;
5675}
5676
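// Give up on cap imports whose inodes never materialised during reconnect:
// send the affected clients a bare CEPH_CAP_OP_EXPORT so they drop the caps,
// wake any reconnect waiters, and log a cluster warning listing the inodes.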
5677void MDCache::export_remaining_imported_caps()
5678{
5679 dout(10) << "export_remaining_imported_caps" << dendl;
5680
5681 stringstream warn_str;
5682
81eedcae 5683 int count = 0;
7c673cae
FG
5684 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5685 warn_str << " ino " << p->first << "\n";
5686 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5687 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5688 if (session) {
5689 // mark client caps stale.
11fdf7f2 5690 auto stale = MClientCaps::create(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
7c673cae
FG
5691 stale->set_cap_peer(0, 0, 0, -1, 0);
5692 mds->send_message_client_counted(stale, q->first);
5693 }
5694 }
5695
81eedcae
TL
5696 if (!(++count % 1000))
5697 mds->heartbeat_reset();
7c673cae
FG
5698 }
5699
11fdf7f2 5700 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
7c673cae
FG
5701 p != cap_reconnect_waiters.end();
5702 ++p)
5703 mds->queue_waiters(p->second);
5704
5705 cap_imports.clear();
5706 cap_reconnect_waiters.clear();
5707
5708 if (warn_str.peek() != EOF) {
5709 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5710 mds->clog->warn(warn_str);
5711 }
5712}
5713
a8e16298 5714Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
7c673cae
FG
5715{
5716 client_t client = session->info.get_client();
a8e16298 5717 Capability *cap = nullptr;
7c673cae
FG
5718 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5719 if (rc) {
a8e16298 5720 cap = in->reconnect_cap(client, *rc, session);
7c673cae
FG
5721 dout(10) << "try_reconnect_cap client." << client
5722 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5723 << " issue " << ccap_string(rc->capinfo.issued)
5724 << " on " << *in << dendl;
5725 remove_replay_cap_reconnect(in->ino(), client);
5726
5727 if (in->is_replicated()) {
5728 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5729 } else {
5730 int dirty_caps = 0;
5731 auto p = reconnected_caps.find(in->ino());
5732 if (p != reconnected_caps.end()) {
5733 auto q = p->second.find(client);
5734 if (q != p->second.end())
5735 dirty_caps = q->second.dirty_caps;
5736 }
5737 in->choose_lock_states(dirty_caps);
5738 dout(15) << " chose lock states on " << *in << dendl;
5739 }
5740
11fdf7f2 5741 map<inodeno_t, MDSContext::vec >::iterator it =
7c673cae
FG
5742 cap_reconnect_waiters.find(in->ino());
5743 if (it != cap_reconnect_waiters.end()) {
5744 mds->queue_waiters(it->second);
5745 cap_reconnect_waiters.erase(it);
5746 }
5747 }
a8e16298 5748 return cap;
7c673cae
FG
5749}
5750
5751
5752
5753// -------
5754// cap imports and delayed snap parent opens
5755
5756void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5757 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5758 int peer, int p_flags)
5759{
7c673cae
FG
5760 SnapRealm *realm = in->find_snaprealm();
5761 if (realm->have_past_parents_open()) {
5762 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5763 if (cap->get_last_seq() == 0) // reconnected cap
5764 cap->inc_last_seq();
5765 cap->set_last_issue();
5766 cap->set_last_issue_stamp(ceph_clock_now());
5767 cap->clear_new();
11fdf7f2 5768 auto reap = MClientCaps::create(CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(), cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
7c673cae 5769 in->encode_cap_message(reap, cap);
11fdf7f2 5770 reap->snapbl = realm->get_snap_trace();
7c673cae
FG
5771 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5772 mds->send_message_client_counted(reap, session);
5773 } else {
11fdf7f2 5774 ceph_abort();
7c673cae
FG
5775 }
5776}
5777
5778void MDCache::do_delayed_cap_imports()
5779{
5780 dout(10) << "do_delayed_cap_imports" << dendl;
5781
11fdf7f2 5782 ceph_assert(delayed_imported_caps.empty());
7c673cae
FG
5783}
5784
11fdf7f2
TL
5785struct C_MDC_OpenSnapRealms : public MDCacheContext {
5786 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
7c673cae 5787 void finish(int r) override {
11fdf7f2 5788 mdcache->open_snaprealms();
7c673cae
FG
5789 }
5790};
5791
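// For every inode queued in rejoin_pending_snaprealms, open its snaprealm's
// past parents (gathering on any that must be fetched), finish client
// snaprealm reconnects, and queue realm-split notifications for caps that
// ended up in the wrong realm.  When everything is open, rejoin completes and
// rejoin_done is fired.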
11fdf7f2 5792void MDCache::open_snaprealms()
7c673cae 5793{
11fdf7f2 5794 dout(10) << "open_snaprealms" << dendl;
7c673cae 5795
7c673cae
FG
5796 MDSGatherBuilder gather(g_ceph_context);
5797
11fdf7f2
TL
5798 auto it = rejoin_pending_snaprealms.begin();
5799 while (it != rejoin_pending_snaprealms.end()) {
5800 CInode *in = *it;
5801 SnapRealm *realm = in->snaprealm;
5802 ceph_assert(realm);
5803 if (realm->have_past_parents_open() ||
5804 realm->open_parents(gather.new_sub())) {
7c673cae
FG
5805 dout(10) << " past parents now open on " << *in << dendl;
5806
11fdf7f2
TL
5807 map<client_t,MClientSnap::ref> splits;
5808 // finish off client snaprealm reconnects?
5809 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5810 if (q != reconnected_snaprealms.end()) {
5811 for (const auto& r : q->second)
5812 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5813 reconnected_snaprealms.erase(q);
5814 }
5815
5816 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5817 !p.end(); ++p) {
5818 CInode *child = *p;
7c673cae 5819 auto q = reconnected_caps.find(child->ino());
11fdf7f2 5820 ceph_assert(q != reconnected_caps.end());
7c673cae 5821 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
11fdf7f2
TL
5822 Capability *cap = child->get_client_cap(r->first);
5823 if (!cap)
5824 continue;
5825 if (r->second.snap_follows > 0) {
5826 if (r->second.snap_follows < child->first - 1) {
5827 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5828 } else if (r->second.snapflush) {
5829 // When processing a cap flush message that is re-sent, it's possible
5830 // that the sender has already released all WR caps. So we should
5831 // force MDCache::cow_inode() to set up CInode::client_need_snapflush.
5832 cap->mark_needsnapflush();
5833 }
7c673cae
FG
5834 }
5835 // make sure client's cap is in the correct snaprealm.
5836 if (r->second.realm_ino != in->ino()) {
11fdf7f2 5837 prepare_realm_split(realm, r->first, child->ino(), splits);
7c673cae
FG
5838 }
5839 }
5840 }
5841
11fdf7f2 5842 rejoin_pending_snaprealms.erase(it++);
7c673cae
FG
5843 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5844
11fdf7f2 5845 send_snaps(splits);
7c673cae
FG
5846 } else {
5847 dout(10) << " opening past parents on " << *in << dendl;
11fdf7f2 5848 ++it;
7c673cae
FG
5849 }
5850 }
5851
7c673cae 5852 if (gather.has_subs()) {
11fdf7f2
TL
5853 if (gather.num_subs_remaining() == 0) {
5854 // cleanup gather
5855 gather.set_finisher(new C_MDSInternalNoop);
5856 gather.activate();
5857 } else {
5858 // for multimds, must succeed the first time
5859 ceph_assert(recovery_set.empty());
5860
5861 dout(10) << "open_snaprealms - waiting for "
5862 << gather.num_subs_remaining() << dendl;
5863 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5864 gather.activate();
5865 return;
5866 }
5867 }
5868
5869 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5870
5871 if (!reconnected_snaprealms.empty()) {
5872 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5873 for (auto& p : reconnected_snaprealms) {
7c673cae 5874 stringstream warn_str;
11fdf7f2
TL
5875 warn_str << " " << p.first << " {";
5876 bool first = true;
5877 for (auto& q : p.second) {
5878 if (!first)
5879 warn_str << ", ";
5880 warn_str << "client." << q.first << "/" << q.second;
first = false;
7c673cae 5881 }
11fdf7f2
TL
5882 warn_str << "}";
5883 dout(5) << warn_str.str() << dendl;
7c673cae 5884 }
7c673cae 5885 }
11fdf7f2
TL
5886 ceph_assert(rejoin_waiters.empty());
5887 ceph_assert(rejoin_pending_snaprealms.empty());
5888 dout(10) << "open_snaprealms - all open" << dendl;
5889 do_delayed_cap_imports();
5890
5891 ceph_assert(rejoin_done);
5892 rejoin_done.release()->complete(0);
5893 reconnected_caps.clear();
7c673cae
FG
5894}
5895
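// Fetch the dirfrags backing anything still flagged REJOINUNDEF.  Returns
// false if nothing is undefined; otherwise queues the fetches and returns
// true, re-running rejoin_gather_finish() from the gather finisher once
// rejoin_gather is empty.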
5896bool MDCache::open_undef_inodes_dirfrags()
5897{
5898 dout(10) << "open_undef_inodes_dirfrags "
5899 << rejoin_undef_inodes.size() << " inodes "
5900 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5901
5902 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5903
5904 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5905 p != rejoin_undef_inodes.end();
5906 ++p) {
5907 CInode *in = *p;
11fdf7f2 5908 ceph_assert(!in->is_base());
7c673cae
FG
5909 fetch_queue.insert(in->get_parent_dir());
5910 }
5911
5912 if (fetch_queue.empty())
5913 return false;
5914
28e407b8
AA
5915 MDSGatherBuilder gather(g_ceph_context,
5916 new MDSInternalContextWrapper(mds,
5917 new FunctionContext([this](int r) {
5918 if (rejoin_gather.empty())
5919 rejoin_gather_finish();
5920 })
5921 )
5922 );
5923
7c673cae
FG
5924 for (set<CDir*>::iterator p = fetch_queue.begin();
5925 p != fetch_queue.end();
5926 ++p) {
5927 CDir *dir = *p;
5928 CInode *diri = dir->get_inode();
5929 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5930 continue;
5931 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 5932 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
7c673cae
FG
5933 dir->fetch(gather.new_sub());
5934 }
11fdf7f2 5935 ceph_assert(gather.has_subs());
7c673cae
FG
5936 gather.activate();
5937 return true;
5938}
5939
5940void MDCache::opened_undef_inode(CInode *in) {
5941 dout(10) << "opened_undef_inode " << *in << dendl;
5942 rejoin_undef_inodes.erase(in);
5943 if (in->is_dir()) {
5944 // FIXME: re-hash dentries if necessary
11fdf7f2 5945 ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
7c673cae
FG
5946 if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5947 CDir *dir = in->get_dirfrag(frag_t());
11fdf7f2 5948 ceph_assert(dir);
7c673cae
FG
5949 rejoin_undef_dirfrags.erase(dir);
5950 in->force_dirfrags();
5951 list<CDir*> ls;
5952 in->get_dirfrags(ls);
5953 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
5954 rejoin_undef_dirfrags.insert(*p);
5955 }
5956 }
5957}
5958
11fdf7f2
TL
5959void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
5960 map<client_t,MClientSnap::ref>& updates)
7c673cae
FG
5961{
5962 if (seq < realm->get_newest_seq()) {
5963 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
11fdf7f2
TL
5964 << realm->get_newest_seq() << " on " << *realm << dendl;
5965 auto snap = MClientSnap::create(CEPH_SNAP_OP_UPDATE);
5966 snap->bl = realm->get_snap_trace();
5967 for (const auto& child : realm->open_children)
5968 snap->split_realms.push_back(child->inode->ino());
5969 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
7c673cae
FG
5970 } else {
5971 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
5972 << " on " << *realm << dendl;
5973 }
5974}
5975
5976
5977
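// Send OP_ACK rejoin messages to every rank in the recovery set not yet
// acked: first replicate any unlinked (stray) inodes the peers still know
// about, then walk our auth subtrees adding strong dirfrags, dentries, inode
// bases and lock states for each replica, plus the caps we imported from that
// rank.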
5978void MDCache::rejoin_send_acks()
5979{
5980 dout(7) << "rejoin_send_acks" << dendl;
5981
5982 // replicate stray
5983 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
5984 p != rejoin_unlinked_inodes.end();
5985 ++p) {
5986 for (set<CInode*>::iterator q = p->second.begin();
5987 q != p->second.end();
5988 ++q) {
5989 CInode *in = *q;
5990 dout(7) << " unlinked inode " << *in << dendl;
5991 // inode expired
5992 if (!in->is_replica(p->first))
5993 continue;
5994 while (1) {
5995 CDentry *dn = in->get_parent_dn();
5996 if (dn->is_replica(p->first))
5997 break;
5998 dn->add_replica(p->first);
5999 CDir *dir = dn->get_dir();
6000 if (dir->is_replica(p->first))
6001 break;
6002 dir->add_replica(p->first);
6003 in = dir->get_inode();
6004 if (in->is_replica(p->first))
6005 break;
224ce89b 6006 in->add_replica(p->first);
7c673cae
FG
6007 if (in->is_base())
6008 break;
6009 }
6010 }
6011 }
6012 rejoin_unlinked_inodes.clear();
6013
6014 // send acks to everyone in the recovery set
11fdf7f2 6015 map<mds_rank_t,MMDSCacheRejoin::ref> acks;
7c673cae
FG
6016 for (set<mds_rank_t>::iterator p = recovery_set.begin();
6017 p != recovery_set.end();
31f18b77
FG
6018 ++p) {
6019 if (rejoin_ack_sent.count(*p))
6020 continue;
11fdf7f2 6021 acks[*p] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK);
31f18b77
FG
6022 }
6023
6024 rejoin_ack_sent = recovery_set;
7c673cae
FG
6025
6026 // walk subtrees
6027 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
6028 p != subtrees.end();
6029 ++p) {
6030 CDir *dir = p->first;
6031 if (!dir->is_auth())
6032 continue;
6033 dout(10) << "subtree " << *dir << dendl;
6034
6035 // auth items in this subtree
6036 list<CDir*> dq;
6037 dq.push_back(dir);
6038
6039 while (!dq.empty()) {
6040 CDir *dir = dq.front();
6041 dq.pop_front();
6042
6043 // dir
181888fb
FG
6044 for (auto &r : dir->get_replicas()) {
6045 auto it = acks.find(r.first);
31f18b77
FG
6046 if (it == acks.end())
6047 continue;
181888fb 6048 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
31f18b77 6049 it->second->add_dirfrag_base(dir);
7c673cae
FG
6050 }
6051
94b18763
FG
6052 for (auto &p : dir->items) {
6053 CDentry *dn = p.second;
7c673cae
FG
6054 CDentry::linkage_t *dnl = dn->get_linkage();
6055
6056 // inode
6057 CInode *in = NULL;
6058 if (dnl->is_primary())
6059 in = dnl->get_inode();
6060
6061 // dentry
181888fb
FG
6062 for (auto &r : dn->get_replicas()) {
6063 auto it = acks.find(r.first);
31f18b77
FG
6064 if (it == acks.end())
6065 continue;
94b18763 6066 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
7c673cae
FG
6067 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6068 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6069 dnl->is_remote() ? dnl->get_remote_d_type():0,
181888fb 6070 ++r.second,
7c673cae
FG
6071 dn->lock.get_replica_state());
6072 // peer missed the MDentryLink message?
181888fb
FG
6073 if (in && !in->is_replica(r.first))
6074 in->add_replica(r.first);
7c673cae
FG
6075 }
6076
6077 if (!in)
6078 continue;
6079
181888fb
FG
6080 for (auto &r : in->get_replicas()) {
6081 auto it = acks.find(r.first);
31f18b77
FG
6082 if (it == acks.end())
6083 continue;
6084 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
7c673cae 6085 bufferlist bl;
181888fb
FG
6086 in->_encode_locks_state_for_rejoin(bl, r.first);
6087 it->second->add_inode_locks(in, ++r.second, bl);
7c673cae
FG
6088 }
6089
6090 // subdirs in this subtree?
6091 in->get_nested_dirfrags(dq);
6092 }
6093 }
6094 }
6095
6096 // base inodes too
6097 if (root && root->is_auth())
181888fb
FG
6098 for (auto &r : root->get_replicas()) {
6099 auto it = acks.find(r.first);
31f18b77
FG
6100 if (it == acks.end())
6101 continue;
6102 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
7c673cae 6103 bufferlist bl;
181888fb
FG
6104 root->_encode_locks_state_for_rejoin(bl, r.first);
6105 it->second->add_inode_locks(root, ++r.second, bl);
7c673cae
FG
6106 }
6107 if (myin)
181888fb
FG
6108 for (auto &r : myin->get_replicas()) {
6109 auto it = acks.find(r.first);
31f18b77
FG
6110 if (it == acks.end())
6111 continue;
6112 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
7c673cae 6113 bufferlist bl;
181888fb
FG
6114 myin->_encode_locks_state_for_rejoin(bl, r.first);
6115 it->second->add_inode_locks(myin, ++r.second, bl);
7c673cae
FG
6116 }
6117
6118 // include inode base for any inodes whose scatterlocks may have updated
6119 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6120 p != rejoin_potential_updated_scatterlocks.end();
6121 ++p) {
6122 CInode *in = *p;
181888fb
FG
6123 for (const auto &r : in->get_replicas()) {
6124 auto it = acks.find(r.first);
31f18b77
FG
6125 if (it == acks.end())
6126 continue;
6127 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6128 }
7c673cae
FG
6129 }
6130
6131 // send acks
31f18b77 6132 for (auto p = acks.begin(); p != acks.end(); ++p) {
11fdf7f2 6133 encode(rejoin_imported_caps[p->first], p->second->imported_caps);
7c673cae
FG
6134 mds->send_message_mds(p->second, p->first);
6135 }
6136
6137 rejoin_imported_caps.clear();
6138}
6139
c07f9fc5
FG
6140class C_MDC_ReIssueCaps : public MDCacheContext {
6141 CInode *in;
6142public:
6143 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6144 MDCacheContext(mdc), in(i)
6145 {
6146 in->get(CInode::PIN_PTRWAITER);
6147 }
6148 void finish(int r) override {
6149 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6150 mdcache->mds->locker->issue_caps(in);
6151 in->put(CInode::PIN_PTRWAITER);
6152 }
6153};
7c673cae
FG
6154
6155void MDCache::reissue_all_caps()
6156{
6157 dout(10) << "reissue_all_caps" << dendl;
6158
81eedcae 6159 int count = 0;
94b18763 6160 for (auto &p : inode_map) {
81eedcae 6161 int n = 1;
b32b8144 6162 CInode *in = p.second;
7c673cae 6163 if (in->is_head() && in->is_any_caps()) {
c07f9fc5
FG
6164 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6165 if (in->is_frozen_inode()) {
6166 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6167 continue;
6168 }
7c673cae 6169 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
81eedcae 6170 n += mds->locker->issue_caps(in);
7c673cae 6171 }
81eedcae
TL
6172
6173 if ((count % 1000) + n >= 1000)
6174 mds->heartbeat_reset();
6175 count += n;
7c673cae
FG
6176 }
6177}
6178
6179
6180// ===============================================================================
6181
6182struct C_MDC_QueuedCow : public MDCacheContext {
6183 CInode *in;
6184 MutationRef mut;
6185 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6186 MDCacheContext(mdc), in(i), mut(m) {}
6187 void finish(int r) override {
6188 mdcache->_queued_file_recover_cow(in, mut);
6189 }
6190};
6191
6192
6193void MDCache::queue_file_recover(CInode *in)
6194{
6195 dout(10) << "queue_file_recover " << *in << dendl;
11fdf7f2 6196 ceph_assert(in->is_auth());
7c673cae
FG
6197
6198 // cow?
6199 /*
6200 SnapRealm *realm = in->find_snaprealm();
6201 set<snapid_t> s = realm->get_snaps();
6202 while (!s.empty() && *s.begin() < in->first)
6203 s.erase(s.begin());
6204 while (!s.empty() && *s.rbegin() > in->last)
6205 s.erase(*s.rbegin());
6206 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6207 if (s.size() > 1) {
94b18763 6208 CInode::mempool_inode pi = in->project_inode();
7c673cae
FG
6209 pi->version = in->pre_dirty();
6210
6211 auto mut(std::make_shared<MutationImpl>());
6212 mut->ls = mds->mdlog->get_current_segment();
6213 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6214 mds->mdlog->start_entry(le);
6215 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6216
6217 s.erase(*s.begin());
6218 while (!s.empty()) {
6219 snapid_t snapid = *s.begin();
6220 CInode *cow_inode = 0;
6221 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
11fdf7f2 6222 ceph_assert(cow_inode);
7c673cae
FG
6223 recovery_queue.enqueue(cow_inode);
6224 s.erase(*s.begin());
6225 }
6226
6227 in->parent->first = in->first;
6228 le->metablob.add_primary_dentry(in->parent, in, true);
6229 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6230 mds->mdlog->flush();
6231 }
6232 */
6233
6234 recovery_queue.enqueue(in);
6235}
6236
6237void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6238{
6239 in->pop_and_dirty_projected_inode(mut->ls);
6240 mut->apply();
6241 mds->locker->drop_locks(mut.get());
6242 mut->cleanup();
6243}
6244
6245
6246/*
6247 * called after recovery to recover file sizes for previously opened (for write)
6248 * files. that is, those where max_size > size.
6249 */
6250void MDCache::identify_files_to_recover()
6251{
6252 dout(10) << "identify_files_to_recover" << dendl;
81eedcae 6253 int count = 0;
94b18763 6254 for (auto &p : inode_map) {
b32b8144 6255 CInode *in = p.second;
7c673cae
FG
6256 if (!in->is_auth())
6257 continue;
6258
6259 if (in->last != CEPH_NOSNAP)
6260 continue;
6261
6262 // Only normal files need file size recovery
6263 if (!in->is_file()) {
6264 continue;
6265 }
6266
6267 bool recover = false;
6268 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6269 p != in->inode.client_ranges.end();
6270 ++p) {
6271 Capability *cap = in->get_client_cap(p->first);
a8e16298
TL
6272 if (cap) {
6273 cap->mark_clientwriteable();
6274 } else {
7c673cae
FG
6275 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6276 recover = true;
6277 break;
6278 }
6279 }
6280
6281 if (recover) {
6282 if (in->filelock.is_stable()) {
6283 in->auth_pin(&in->filelock);
6284 } else {
11fdf7f2 6285 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
7c673cae
FG
6286 }
6287 in->filelock.set_state(LOCK_PRE_SCAN);
6288 rejoin_recover_q.push_back(in);
6289 } else {
6290 rejoin_check_q.push_back(in);
6291 }
81eedcae
TL
6292
6293 if (!(++count % 1000))
6294 mds->heartbeat_reset();
7c673cae
FG
6295 }
6296}
6297
6298void MDCache::start_files_to_recover()
6299{
6300 for (CInode *in : rejoin_check_q) {
6301 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6302 mds->locker->issue_caps(in);
6303 mds->locker->check_inode_max_size(in);
6304 }
6305 rejoin_check_q.clear();
6306 for (CInode *in : rejoin_recover_q) {
6307 mds->locker->file_recover(&in->filelock);
6308 }
6309 if (!rejoin_recover_q.empty()) {
6310 rejoin_recover_q.clear();
6311 do_file_recover();
6312 }
6313}
6314
6315void MDCache::do_file_recover()
6316{
6317 recovery_queue.advance();
6318}
6319
6320// ===============================================================================
6321
6322
6323// ----------------------------
6324// truncate
6325
6326class C_MDC_RetryTruncate : public MDCacheContext {
6327 CInode *in;
6328 LogSegment *ls;
6329public:
6330 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6331 MDCacheContext(c), in(i), ls(l) {}
6332 void finish(int r) override {
6333 mdcache->_truncate_inode(in, ls);
6334 }
6335};
6336
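// Start (or restart) a truncate recorded in the given log segment.  If
// clients still owe snapflushes and hold buffered-write caps, wait for the
// xlock snap sync before issuing the OSD truncate in _truncate_inode().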
6337void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6338{
94b18763 6339 auto pi = in->get_projected_inode();
7c673cae
FG
6340 dout(10) << "truncate_inode "
6341 << pi->truncate_from << " -> " << pi->truncate_size
6342 << " on " << *in
6343 << dendl;
6344
6345 ls->truncating_inodes.insert(in);
6346 in->get(CInode::PIN_TRUNCATING);
6347 in->auth_pin(this);
6348
6349 if (!in->client_need_snapflush.empty() &&
6350 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6351 ceph_assert(in->filelock.is_xlocked());
7c673cae
FG
6352 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6353 mds->locker->issue_caps(in);
6354 return;
6355 }
6356
6357 _truncate_inode(in, ls);
6358}
6359
6360struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6361 CInode *in;
6362 LogSegment *ls;
6363 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
91327a77
AA
6364 MDCacheIOContext(c, false), in(i), ls(l) {
6365 }
7c673cae 6366 void finish(int r) override {
11fdf7f2 6367 ceph_assert(r == 0 || r == -ENOENT);
7c673cae
FG
6368 mdcache->truncate_inode_finish(in, ls);
6369 }
91327a77
AA
6370 void print(ostream& out) const override {
6371 out << "file_truncate(" << in->ino() << ")";
6372 }
7c673cae
FG
6373};
6374
6375void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6376{
94b18763 6377 auto pi = &in->inode;
7c673cae
FG
6378 dout(10) << "_truncate_inode "
6379 << pi->truncate_from << " -> " << pi->truncate_size
6380 << " on " << *in << dendl;
6381
11fdf7f2
TL
6382 ceph_assert(pi->is_truncating());
6383 ceph_assert(pi->truncate_size < (1ULL << 63));
6384 ceph_assert(pi->truncate_from < (1ULL << 63));
6385 ceph_assert(pi->truncate_size < pi->truncate_from);
7c673cae
FG
6386
6387
6388 SnapRealm *realm = in->find_snaprealm();
6389 SnapContext nullsnap;
6390 const SnapContext *snapc;
6391 if (realm) {
6392 dout(10) << " realm " << *realm << dendl;
6393 snapc = &realm->get_snap_context();
6394 } else {
6395 dout(10) << " NO realm, using null context" << dendl;
6396 snapc = &nullsnap;
11fdf7f2 6397 ceph_assert(in->last == CEPH_NOSNAP);
7c673cae
FG
6398 }
6399 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6400 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6401 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6402 pi->truncate_seq, ceph::real_time::min(), 0,
6403 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6404 mds->finisher));
6405}
6406
6407struct C_MDC_TruncateLogged : public MDCacheLogContext {
6408 CInode *in;
6409 MutationRef mut;
6410 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6411 MDCacheLogContext(m), in(i), mut(mu) {}
6412 void finish(int r) override {
6413 mdcache->truncate_inode_logged(in, mut);
6414 }
6415};
6416
6417void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6418{
6419 dout(10) << "truncate_inode_finish " << *in << dendl;
6420
6421 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6422 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6423 ls->truncating_inodes.erase(p);
6424
6425 // update
94b18763
FG
6426 auto &pi = in->project_inode();
6427 pi.inode.version = in->pre_dirty();
6428 pi.inode.truncate_from = 0;
6429 pi.inode.truncate_pending--;
7c673cae
FG
6430
6431 MutationRef mut(new MutationImpl());
6432 mut->ls = mds->mdlog->get_current_segment();
6433 mut->add_projected_inode(in);
6434
6435 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6436 mds->mdlog->start_entry(le);
6437 CDentry *dn = in->get_projected_parent_dn();
6438 le->metablob.add_dir_context(dn->get_dir());
6439 le->metablob.add_primary_dentry(dn, in, true);
6440 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6441
6442 journal_dirty_inode(mut.get(), &le->metablob, in);
6443 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6444
6445 // flush immediately if there are readers/writers waiting
6446 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6447 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6448 mds->mdlog->flush();
6449}
6450
6451void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6452{
6453 dout(10) << "truncate_inode_logged " << *in << dendl;
6454 mut->apply();
6455 mds->locker->drop_locks(mut.get());
6456 mut->cleanup();
6457
6458 in->put(CInode::PIN_TRUNCATING);
6459 in->auth_unpin(this);
6460
11fdf7f2 6461 MDSContext::vec waiters;
7c673cae
FG
6462 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6463 mds->queue_waiters(waiters);
6464}
6465
6466
6467void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6468{
6469 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6470 << ls->seq << "/" << ls->offset << dendl;
6471 ls->truncating_inodes.insert(in);
6472 in->get(CInode::PIN_TRUNCATING);
6473}
6474
6475void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6476{
6477 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6478 << ls->seq << "/" << ls->offset << dendl;
6479 // if we have the logseg the truncate started in, it must be in our list.
6480 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
11fdf7f2 6481 ceph_assert(p != ls->truncating_inodes.end());
7c673cae
FG
6482 ls->truncating_inodes.erase(p);
6483 in->put(CInode::PIN_TRUNCATING);
6484}
6485
6486void MDCache::start_recovered_truncates()
6487{
6488 dout(10) << "start_recovered_truncates" << dendl;
6489 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6490 p != mds->mdlog->segments.end();
6491 ++p) {
6492 LogSegment *ls = p->second;
6493 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6494 q != ls->truncating_inodes.end();
6495 ++q) {
6496 CInode *in = *q;
6497 in->auth_pin(this);
6498
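 // If a client still needs to snapflush this inode and may hold buffered
 // data (CEPH_CAP_FILE_BUFFER issued), defer the truncate: park the filelock
 // in LOCK_XLOCKDONE and let C_MDC_RetryTruncate re-run _truncate_inode()
 // once the snapshotted data has been flushed.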
6499 if (!in->client_need_snapflush.empty() &&
6500 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2 6501 ceph_assert(in->filelock.is_stable());
7c673cae
FG
6502 in->filelock.set_state(LOCK_XLOCKDONE);
6503 in->auth_pin(&in->filelock);
6504 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6505 // start_files_to_recover will revoke caps
6506 continue;
6507 }
6508 _truncate_inode(in, ls);
6509 }
6510 }
6511}
6512
6513
6514
6515
6516
6517
6518// ================================================================================
6519// cache trimming
6520
11fdf7f2 6521std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
181888fb 6522{
7c673cae 6523 bool is_standby_replay = mds->is_standby_replay();
181888fb
FG
6524 std::vector<CDentry *> unexpirables;
6525 uint64_t trimmed = 0;
6526
11fdf7f2 6527 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
a8e16298 6528
181888fb
FG
6529 dout(7) << "trim_lru trimming " << count
6530 << " items from LRU"
6531 << " size=" << lru.lru_get_size()
6532 << " mid=" << lru.lru_get_top()
6533 << " pintail=" << lru.lru_get_pintail()
6534 << " pinned=" << lru.lru_get_num_pinned()
6535 << dendl;
7c673cae 6536
11fdf7f2 6537 const uint64_t trim_counter_start = trim_counter.get();
a8e16298
TL
6538 bool throttled = false;
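 // Throttle: trim_counter tracks how many dentries were trimmed recently;
 // once (previous count + dentries trimmed in this pass) reaches
 // mds_cache_trim_threshold, stop and report throttled=true to the caller.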
6539 while (1) {
6540 throttled |= trim_counter_start+trimmed >= trim_threshold;
6541 if (throttled) break;
31f18b77
FG
6542 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6543 if (!dn)
6544 break;
6545 if (trim_dentry(dn, expiremap)) {
6546 unexpirables.push_back(dn);
181888fb
FG
6547 } else {
6548 trimmed++;
31f18b77
FG
6549 }
6550 }
6551
181888fb 6552 for (auto &dn : unexpirables) {
31f18b77 6553 bottom_lru.lru_insert_mid(dn);
181888fb 6554 }
31f18b77
FG
6555 unexpirables.clear();
6556
181888fb 6557 // trim dentries from the LRU until count is reached
a8e16298
TL
6558 while (!throttled && (cache_toofull() || count > 0)) {
6559 throttled |= trim_counter_start+trimmed >= trim_threshold;
6560 if (throttled) break;
7c673cae
FG
6561 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6562 if (!dn) {
6563 break;
6564 }
7c673cae 6565 if ((is_standby_replay && dn->get_linkage()->inode &&
181888fb 6566 dn->get_linkage()->inode->item_open_file.is_on_list())) {
7c673cae 6567 unexpirables.push_back(dn);
181888fb
FG
6568 } else if (trim_dentry(dn, expiremap)) {
6569 unexpirables.push_back(dn);
6570 } else {
6571 trimmed++;
3efd9988 6572 if (count > 0) count--;
7c673cae
FG
6573 }
6574 }
11fdf7f2 6575 trim_counter.hit(trimmed);
181888fb
FG
6576
6577 for (auto &dn : unexpirables) {
31f18b77 6578 lru.lru_insert_mid(dn);
181888fb 6579 }
31f18b77 6580 unexpirables.clear();
7c673cae 6581
181888fb 6582 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
a8e16298 6583 return std::pair<bool, uint64_t>(throttled, trimmed);
181888fb
FG
6584}
6585
6586/*
6587 * note: only called while MDS is active or stopping... NOT during recovery.
6588 * however, we may expire a replica whose authority is recovering.
6589 *
6590 * @param count is number of dentries to try to expire
6591 */
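/*
 * Rough outline of trim() (descriptive note, not upstream text):
 *  1. trim dentries from the LRUs via trim_lru(), subject to the trim-counter
 *     throttle;
 *  2. trim non-auth, non-bound subtrees, exporting empty auth imports;
 *  3. when stopping, also trim the root and other ranks' mdsdir inodes;
 *  4. send the accumulated MCacheExpire messages.
 */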
a8e16298 6592std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
181888fb
FG
6593{
6594 uint64_t used = cache_size();
91327a77 6595 uint64_t limit = cache_memory_limit;
11fdf7f2 6596 expiremap expiremap;
181888fb
FG
6597
6598 dout(7) << "trim bytes_used=" << bytes2str(used)
6599 << " limit=" << bytes2str(limit)
91327a77 6600 << " reservation=" << cache_reservation
181888fb
FG
6601 << "% count=" << count << dendl;
6602
6603 // process delayed eval_stray()
6604 stray_manager.advance_delayed();
6605
a8e16298
TL
6606 auto result = trim_lru(count, expiremap);
6607 auto& trimmed = result.second;
181888fb 6608
7c673cae 6609 // trim non-auth, non-bound subtrees
181888fb 6610 for (auto p = subtrees.begin(); p != subtrees.end();) {
7c673cae
FG
6611 CDir *dir = p->first;
6612 ++p;
31f18b77
FG
6613 CInode *diri = dir->get_inode();
6614 if (dir->is_auth()) {
6615 if (!diri->is_auth() && !diri->is_base() &&
6616 dir->get_num_head_items() == 0) {
6617 if (dir->state_test(CDir::STATE_EXPORTING) ||
181888fb 6618 !(mds->is_active() || mds->is_stopping()) ||
31f18b77
FG
6619 dir->is_freezing() || dir->is_frozen())
6620 continue;
6621
6622 migrator->export_empty_import(dir);
a8e16298 6623 ++trimmed;
31f18b77
FG
6624 }
6625 } else {
6626 if (!diri->is_auth()) {
6627 if (dir->get_num_ref() > 1) // only subtree pin
6628 continue;
6629 list<CDir*> ls;
6630 diri->get_subtree_dirfrags(ls);
6631 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6632 continue;
6633
6634 // don't trim subtree root if its auth MDS is recovering.
6635 // This simplifies the cache rejoin code.
6636 if (dir->is_subtree_root() &&
6637 rejoin_ack_gather.count(dir->get_dir_auth().first))
6638 continue;
7c673cae 6639 trim_dirfrag(dir, 0, expiremap);
a8e16298 6640 ++trimmed;
31f18b77 6641 }
7c673cae
FG
6642 }
6643 }
6644
6645 // trim root?
181888fb 6646 if (mds->is_stopping() && root) {
7c673cae
FG
6647 list<CDir*> ls;
6648 root->get_dirfrags(ls);
6649 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6650 CDir *dir = *p;
a8e16298 6651 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6652 trim_dirfrag(dir, 0, expiremap);
a8e16298
TL
6653 ++trimmed;
6654 }
7c673cae 6655 }
a8e16298 6656 if (root->get_num_ref() == 0) {
7c673cae 6657 trim_inode(0, root, 0, expiremap);
a8e16298
TL
6658 ++trimmed;
6659 }
7c673cae
FG
6660 }
6661
6662 std::set<mds_rank_t> stopping;
6663 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6664 stopping.erase(mds->get_nodeid());
6665 for (auto rank : stopping) {
6666 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6667 if (!mdsdir_in)
6668 continue;
6669
11fdf7f2
TL
6670 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6671 if (em.second) {
6672 em.first->second = MCacheExpire::create(mds->get_nodeid());
7c673cae
FG
6673 }
6674
6675 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6676
6677 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6678 if (!aborted) {
6679 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6680 list<CDir*> ls;
6681 mdsdir_in->get_dirfrags(ls);
6682 for (auto dir : ls) {
a8e16298 6683 if (dir->get_num_ref() == 1) { // subtree pin
7c673cae 6684 trim_dirfrag(dir, dir, expiremap);
a8e16298
TL
6685 ++trimmed;
6686 }
7c673cae 6687 }
a8e16298 6688 if (mdsdir_in->get_num_ref() == 0) {
7c673cae 6689 trim_inode(NULL, mdsdir_in, NULL, expiremap);
a8e16298
TL
6690 ++trimmed;
6691 }
7c673cae
FG
6692 } else {
6693 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6694 }
6695 }
6696
6697 // Other ranks' base inodes (when I'm stopping)
181888fb 6698 if (mds->is_stopping()) {
7c673cae 6699 for (set<CInode*>::iterator p = base_inodes.begin();
11fdf7f2
TL
6700 p != base_inodes.end();) {
6701 CInode *base_in = *p;
6702 ++p;
6703 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6704 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6705 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6706 if (base_in->get_num_ref() == 0) {
6707 trim_inode(NULL, base_in, NULL, expiremap);
a8e16298 6708 ++trimmed;
7c673cae
FG
6709 }
6710 }
6711 }
6712 }
6713
6714 // send any expire messages
6715 send_expire_messages(expiremap);
6716
a8e16298 6717 return result;
7c673cae
FG
6718}
6719
11fdf7f2 6720void MDCache::send_expire_messages(expiremap& expiremap)
7c673cae
FG
6721{
6722 // send expires
11fdf7f2 6723 for (const auto &p : expiremap) {
7c673cae 6724 if (mds->is_cluster_degraded() &&
11fdf7f2
TL
6725 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6726 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6727 rejoin_sent.count(p.first) == 0))) {
7c673cae
FG
6728 continue;
6729 }
11fdf7f2
TL
6730 dout(7) << "sending cache_expire to " << p.first << dendl;
6731 mds->send_message_mds(p.second, p.first);
7c673cae 6732 }
11fdf7f2 6733 expiremap.clear();
7c673cae
FG
6734}
6735
6736
11fdf7f2 6737bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
7c673cae
FG
6738{
6739 dout(12) << "trim_dentry " << *dn << dendl;
6740
6741 CDentry::linkage_t *dnl = dn->get_linkage();
6742
6743 CDir *dir = dn->get_dir();
11fdf7f2 6744 ceph_assert(dir);
7c673cae
FG
6745
6746 CDir *con = get_subtree_root(dir);
6747 if (con)
6748 dout(12) << " in container " << *con << dendl;
6749 else {
6750 dout(12) << " no container; under a not-yet-linked dir" << dendl;
11fdf7f2 6751 ceph_assert(dn->is_auth());
7c673cae
FG
6752 }
6753
6754 // If a replica dentry is not readable, we will likely receive an
6755 // MDentryLink/MDentryUnlink message soon (it's possible we first
6756 // receive an MDentryUnlink message, then an MDentryLink message).
6757 // An MDentryLink message only replicates an inode, so we should
6758 // avoid trimming the inode's parent dentry, because unconnected
6759 // replicas are problematic for subtree migration.
6760 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6761 !dn->get_dir()->get_inode()->is_stray())
6762 return true;
6763
6764 // adjust the dir state
6765 // NOTE: we can safely remove a clean, null dentry without affecting
6766 // directory completeness.
6767 // (check this _before_ we unlink the inode, below!)
6768 bool clear_complete = false;
6769 if (!(dnl->is_null() && dn->is_clean()))
6770 clear_complete = true;
6771
6772 // unlink the dentry
6773 if (dnl->is_remote()) {
6774 // just unlink.
31f18b77 6775 dir->unlink_inode(dn, false);
7c673cae
FG
6776 } else if (dnl->is_primary()) {
6777 // expire the inode, too.
6778 CInode *in = dnl->get_inode();
11fdf7f2 6779 ceph_assert(in);
7c673cae
FG
6780 if (trim_inode(dn, in, con, expiremap))
6781 return true; // purging stray instead of trimming
6782 } else {
11fdf7f2 6783 ceph_assert(dnl->is_null());
7c673cae
FG
6784 }
6785
6786 if (!dn->is_auth()) {
6787 // notify dentry authority.
6788 mds_authority_t auth = dn->authority();
6789
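    // The authority may be a pair of ranks while the subtree is migrating;
    // the loop below sends an expire to each rank in the pair, skipping
    // ourselves, and sends nothing while the container is still importing.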
6790 for (int p=0; p<2; p++) {
6791 mds_rank_t a = auth.first;
6792 if (p) a = auth.second;
6793 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6794 if (mds->get_nodeid() == auth.second &&
6795 con->is_importing()) break; // don't send any expire while importing.
6796 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6797
6798 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
11fdf7f2
TL
6799 ceph_assert(a != mds->get_nodeid());
6800 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6801 if (em.second)
6802 em.first->second = MCacheExpire::create(mds->get_nodeid());
6803 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
7c673cae
FG
6804 }
6805 }
6806
6807 // remove dentry
6808 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6809 dir->add_to_bloom(dn);
6810 dir->remove_dentry(dn);
6811
6812 if (clear_complete)
6813 dir->state_clear(CDir::STATE_COMPLETE);
6814
7c673cae
FG
6815 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6816 return false;
6817}
6818
6819
11fdf7f2 6820void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
7c673cae
FG
6821{
6822 dout(15) << "trim_dirfrag " << *dir << dendl;
6823
6824 if (dir->is_subtree_root()) {
11fdf7f2 6825 ceph_assert(!dir->is_auth() ||
7c673cae
FG
6826 (!dir->is_replicated() && dir->inode->is_base()));
6827 remove_subtree(dir); // remove from subtree map
6828 }
11fdf7f2 6829 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
6830
6831 CInode *in = dir->get_inode();
6832
6833 if (!dir->is_auth()) {
6834 mds_authority_t auth = dir->authority();
6835
6836 // was this an auth delegation? (if so, slightly modified container)
6837 dirfrag_t condf;
6838 if (dir->is_subtree_root()) {
6839 dout(12) << " subtree root, container is " << *dir << dendl;
6840 con = dir;
6841 condf = dir->dirfrag();
6842 } else {
6843 condf = con->dirfrag();
6844 }
6845
6846 for (int p=0; p<2; p++) {
6847 mds_rank_t a = auth.first;
6848 if (p) a = auth.second;
6849 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6850 if (mds->get_nodeid() == auth.second &&
6851 con->is_importing()) break; // don't send any expire while importing.
6852 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6853
6854 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
11fdf7f2
TL
6855 ceph_assert(a != mds->get_nodeid());
6856 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6857 if (em.second)
6858 em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
6859 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
7c673cae
FG
6860 }
6861 }
6862
6863 in->close_dirfrag(dir->dirfrag().frag);
6864}
6865
6866/**
6867 * Try trimming an inode from the cache
6868 *
6869 * @return true if the inode is still in cache, else false if it was trimmed
6870 */
11fdf7f2 6871bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
7c673cae
FG
6872{
6873 dout(15) << "trim_inode " << *in << dendl;
11fdf7f2 6874 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
6875
6876 if (in->is_dir()) {
6877 // If replica inode's dirfragtreelock is not readable, it's likely
6878 // some dirfrags of the inode are being fragmented and we will receive
6879 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6880 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6881 // This is because unconnected replicas are problematic for
6882 // subtree migration.
6883 //
28e407b8 6884 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
7c673cae 6885 return true;
28e407b8 6886 }
7c673cae
FG
6887
6888 // DIR
6889 list<CDir*> dfls;
6890 in->get_dirfrags(dfls);
6891 for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
6892 CDir *dir = *p;
11fdf7f2 6893 ceph_assert(!dir->is_subtree_root());
7c673cae
FG
6894 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
6895 }
6896 }
6897
6898 // INODE
6899 if (in->is_auth()) {
6900 // eval stray after closing dirfrags
6901 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
6902 maybe_eval_stray(in);
6903 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
6904 return true;
6905 }
6906 } else {
6907 mds_authority_t auth = in->authority();
6908
6909 dirfrag_t df;
6910 if (con)
6911 df = con->dirfrag();
6912 else
6913 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
6914
6915 for (int p=0; p<2; p++) {
6916 mds_rank_t a = auth.first;
6917 if (p) a = auth.second;
6918 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6919 if (con && mds->get_nodeid() == auth.second &&
6920 con->is_importing()) break; // don't send any expire while importing.
6921 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6922
6923 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
11fdf7f2
TL
6924 ceph_assert(a != mds->get_nodeid());
6925 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6926 if (em.second)
6927 em.first->second = MCacheExpire::create(mds->get_nodeid()); /* new */
6928 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7c673cae
FG
6929 }
6930 }
6931
6932 /*
6933 if (in->is_auth()) {
6934 if (in->hack_accessed)
6935 mds->logger->inc("outt");
6936 else {
6937 mds->logger->inc("outut");
6938 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
6939 }
6940 }
6941 */
6942
6943 // unlink
6944 if (dn)
31f18b77 6945 dn->get_dir()->unlink_inode(dn, false);
7c673cae
FG
6946 remove_inode(in);
6947 return false;
6948}
6949
6950
6951/**
6952 * trim_non_auth - remove any non-auth items from our cache
6953 *
6954 * this reduces the amount of non-auth metadata in our cache, reducing the
6955 * load incurred by the rejoin phase.
6956 *
6957 * the only non-auth items that remain are those that are needed to
6958 * attach our own subtrees to the root.
6959 *
6960 * when we are done, all dentries will be in the top bit of the lru.
6961 *
6962 * why we have to do this:
6963 * we may not have accurate linkage for non-auth items, which means we may not
6964 * know which subtree an item falls into, and cannot be sure to declare it to the
6965 * correct authority.
6966 */
6967void MDCache::trim_non_auth()
6968{
6969 dout(7) << "trim_non_auth" << dendl;
6970
6971 // temporarily pin all subtree roots
6972 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
6973 p != subtrees.end();
6974 ++p)
6975 p->first->get(CDir::PIN_SUBTREETEMP);
6976
31f18b77 6977 list<CDentry*> auth_list;
7c673cae
FG
6978
6979 // trim non-auth items from the lru
31f18b77
FG
6980 for (;;) {
6981 CDentry *dn = NULL;
6982 if (bottom_lru.lru_get_size() > 0)
6983 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6984 if (!dn && lru.lru_get_size() > 0)
6985 dn = static_cast<CDentry*>(lru.lru_expire());
6986 if (!dn)
6987 break;
6988
7c673cae
FG
6989 CDentry::linkage_t *dnl = dn->get_linkage();
6990
6991 if (dn->is_auth()) {
6992 // add back into lru (at the top)
31f18b77 6993 auth_list.push_back(dn);
7c673cae
FG
6994
6995 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
6996 dn->unlink_remote(dnl);
7c673cae
FG
6997 } else {
6998 // non-auth. expire.
6999 CDir *dir = dn->get_dir();
11fdf7f2 7000 ceph_assert(dir);
7c673cae
FG
7001
7002 // unlink the dentry
7003 dout(10) << " removing " << *dn << dendl;
7004 if (dnl->is_remote()) {
31f18b77 7005 dir->unlink_inode(dn, false);
7c673cae
FG
7006 }
7007 else if (dnl->is_primary()) {
7008 CInode *in = dnl->get_inode();
7009 dout(10) << " removing " << *in << dendl;
7010 list<CDir*> ls;
7011 in->get_dirfrags(ls);
7012 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7013 CDir *subdir = *p;
11fdf7f2 7014 ceph_assert(!subdir->is_subtree_root());
7c673cae
FG
7015 in->close_dirfrag(subdir->dirfrag().frag);
7016 }
31f18b77 7017 dir->unlink_inode(dn, false);
7c673cae
FG
7018 remove_inode(in);
7019 }
7020 else {
11fdf7f2 7021 ceph_assert(dnl->is_null());
7c673cae
FG
7022 }
7023
11fdf7f2 7024 ceph_assert(!dir->has_bloom());
7c673cae
FG
7025 dir->remove_dentry(dn);
7026 // adjust the dir state
7027 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7028 // close empty non-auth dirfrag
7029 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7030 dir->inode->close_dirfrag(dir->get_frag());
7031 }
7032 }
7033
31f18b77
FG
7034 for (auto dn : auth_list) {
7035 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7036 bottom_lru.lru_insert_mid(dn);
7037 else
7038 lru.lru_insert_top(dn);
7039 }
7040
7c673cae
FG
7041 // move everything in the pintail to the top bit of the lru.
7042 lru.lru_touch_entire_pintail();
7043
7044 // unpin all subtrees
7045 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7046 p != subtrees.end();
7047 ++p)
7048 p->first->put(CDir::PIN_SUBTREETEMP);
7049
31f18b77
FG
7050 if (lru.lru_get_size() == 0 &&
7051 bottom_lru.lru_get_size() == 0) {
7c673cae 7052 // root, stray, etc.?
b32b8144 7053 auto p = inode_map.begin();
7c673cae 7054 while (p != inode_map.end()) {
7c673cae 7055 CInode *in = p->second;
b32b8144 7056 ++p;
7c673cae
FG
7057 if (!in->is_auth()) {
7058 list<CDir*> ls;
7059 in->get_dirfrags(ls);
7060 for (list<CDir*>::iterator p = ls.begin();
7061 p != ls.end();
7062 ++p) {
7063 dout(10) << " removing " << **p << dendl;
11fdf7f2 7064 ceph_assert((*p)->get_num_ref() == 1); // SUBTREE
7c673cae
FG
7065 remove_subtree((*p));
7066 in->close_dirfrag((*p)->dirfrag().frag);
7067 }
7068 dout(10) << " removing " << *in << dendl;
11fdf7f2
TL
7069 ceph_assert(!in->get_parent_dn());
7070 ceph_assert(in->get_num_ref() == 0);
7c673cae
FG
7071 remove_inode(in);
7072 }
7c673cae
FG
7073 }
7074 }
7075
7076 show_subtrees();
7077}
7078
7079/**
7080 * Recursively trim the subtree rooted at directory to remove all
7081 * CInodes/CDentries/CDirs that aren't links to remote MDSes, or ancestors
7082 * of those links. This is used to clear invalid data out of the cache.
7083 * Note that it doesn't clear the passed-in directory, since that's not
7084 * always safe.
7085 */
7086bool MDCache::trim_non_auth_subtree(CDir *dir)
7087{
7088 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7089
7090 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7091
94b18763
FG
7092 auto j = dir->begin();
7093 auto i = j;
7c673cae
FG
7094 while (j != dir->end()) {
7095 i = j++;
7096 CDentry *dn = i->second;
7097 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7098 CDentry::linkage_t *dnl = dn->get_linkage();
7099 if (dnl->is_primary()) { // check for subdirectories, etc
7100 CInode *in = dnl->get_inode();
7101 bool keep_inode = false;
7102 if (in->is_dir()) {
7103 list<CDir*> subdirs;
7104 in->get_dirfrags(subdirs);
7105 for (list<CDir*>::iterator subdir = subdirs.begin();
7106 subdir != subdirs.end();
7107 ++subdir) {
7108 if ((*subdir)->is_subtree_root()) {
7109 keep_inode = true;
7110 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
7111 } else {
7112 if (trim_non_auth_subtree(*subdir))
7113 keep_inode = true;
7114 else {
7115 in->close_dirfrag((*subdir)->get_frag());
7116 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7117 }
7118 }
7119 }
7120
7121 }
7122 if (!keep_inode) { // remove it!
7123 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
31f18b77 7124 dir->unlink_inode(dn, false);
7c673cae 7125 remove_inode(in);
11fdf7f2 7126 ceph_assert(!dir->has_bloom());
7c673cae
FG
7127 dir->remove_dentry(dn);
7128 } else {
7129 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7130 dn->state_clear(CDentry::STATE_AUTH);
7131 in->state_clear(CInode::STATE_AUTH);
7132 }
7133 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7134 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7135 } else { // just remove it
7136 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7137 if (dnl->is_remote())
31f18b77 7138 dir->unlink_inode(dn, false);
7c673cae
FG
7139 dir->remove_dentry(dn);
7140 }
7141 }
7142 dir->state_clear(CDir::STATE_AUTH);
7143 /**
7144 * We've now checked all our children and deleted those that need it.
7145 * Now return to caller, and tell them if *we're* a keeper.
7146 */
7147 return keep_dir || dir->get_num_any();
7148}
7149
7150/*
7151 * during replay, when we determine a subtree is no longer ours, we
7152 * try to trim it from our cache. because subtrees must be connected
7153 * to the root, the fact that we can trim this tree may mean that our
7154 * children or parents can also be trimmed.
7155 */
7156void MDCache::try_trim_non_auth_subtree(CDir *dir)
7157{
7158 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7159
7160 // can we now trim child subtrees?
7161 set<CDir*> bounds;
7162 get_subtree_bounds(dir, bounds);
7163 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7164 CDir *bd = *p;
7165 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7166 bd->get_num_any() == 0 && // and empty
7167 can_trim_non_auth_dirfrag(bd)) {
7168 CInode *bi = bd->get_inode();
7169 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7170 remove_subtree(bd);
7171 bd->mark_clean();
7172 bi->close_dirfrag(bd->get_frag());
7173 }
7174 }
7175
7176 if (trim_non_auth_subtree(dir)) {
7177 // keep
7178 try_subtree_merge(dir);
7179 } else {
7180 // can we trim this subtree (and possibly our ancestors) too?
7181 while (true) {
7182 CInode *diri = dir->get_inode();
7183 if (diri->is_base()) {
7184 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7185 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7186 remove_subtree(dir);
7187 dir->mark_clean();
7188 diri->close_dirfrag(dir->get_frag());
7189
7190 dout(10) << " removing " << *diri << dendl;
11fdf7f2
TL
7191 ceph_assert(!diri->get_parent_dn());
7192 ceph_assert(diri->get_num_ref() == 0);
7c673cae
FG
7193 remove_inode(diri);
7194 }
7195 break;
7196 }
7197
7198 CDir *psub = get_subtree_root(diri->get_parent_dir());
7199 dout(10) << " parent subtree is " << *psub << dendl;
7200 if (psub->get_dir_auth().first == mds->get_nodeid())
7201 break; // we are auth, keep.
7202
7203 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7204 remove_subtree(dir);
7205 dir->mark_clean();
7206 diri->close_dirfrag(dir->get_frag());
7207
7208 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7209 if (trim_non_auth_subtree(psub))
7210 break;
7211 dir = psub;
7212 }
7213 }
7214
7215 show_subtrees();
7216}
7217
7218void MDCache::standby_trim_segment(LogSegment *ls)
7219{
7220 ls->new_dirfrags.clear_list();
7221 ls->open_files.clear_list();
7222
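 // Each mark_clean()/clear_dirty_parent()/remove_dirty() call below also
 // removes the item from the corresponding per-segment dirty list, which is
 // what lets these front()-based loops terminate.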
7223 while (!ls->dirty_dirfrags.empty()) {
7224 CDir *dir = ls->dirty_dirfrags.front();
7225 dir->mark_clean();
7226 }
7227 while (!ls->dirty_inodes.empty()) {
7228 CInode *in = ls->dirty_inodes.front();
7229 in->mark_clean();
7230 }
7231 while (!ls->dirty_dentries.empty()) {
7232 CDentry *dn = ls->dirty_dentries.front();
7233 dn->mark_clean();
7234 }
7235 while (!ls->dirty_parent_inodes.empty()) {
7236 CInode *in = ls->dirty_parent_inodes.front();
7237 in->clear_dirty_parent();
7238 }
7239 while (!ls->dirty_dirfrag_dir.empty()) {
7240 CInode *in = ls->dirty_dirfrag_dir.front();
7241 in->filelock.remove_dirty();
7242 }
7243 while (!ls->dirty_dirfrag_nest.empty()) {
7244 CInode *in = ls->dirty_dirfrag_nest.front();
7245 in->nestlock.remove_dirty();
7246 }
7247 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7248 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7249 in->dirfragtreelock.remove_dirty();
7250 }
7251}
7252
11fdf7f2 7253void MDCache::handle_cache_expire(const MCacheExpire::const_ref &m)
7c673cae
FG
7254{
7255 mds_rank_t from = mds_rank_t(m->get_from());
7256
7257 dout(7) << "cache_expire from mds." << from << dendl;
7258
7259 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
7260 return;
7261 }
7262
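 // Each expire carries the replica nonce the sender knew about; a stale
 // nonce means the object was re-replicated to the sender after the expire
 // was queued, so such expires are ignored below rather than applied.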
7263 set<SimpleLock *> gather_locks;
7264 // loop over realms
11fdf7f2 7265 for (const auto &p : m->realms) {
7c673cae 7266 // check container?
11fdf7f2
TL
7267 if (p.first.ino > 0) {
7268 CInode *expired_inode = get_inode(p.first.ino);
7269 ceph_assert(expired_inode); // we had better have this.
7270 CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
7271 ceph_assert(parent_dir);
7c673cae
FG
7272
7273 int export_state = -1;
7274 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7275 export_state = migrator->get_export_state(parent_dir);
11fdf7f2 7276 ceph_assert(export_state >= 0);
7c673cae
FG
7277 }
7278
7279 if (!parent_dir->is_auth() ||
7280 (export_state != -1 &&
7281 ((export_state == Migrator::EXPORT_WARNING &&
7282 migrator->export_has_warned(parent_dir,from)) ||
7283 export_state == Migrator::EXPORT_EXPORTING ||
7284 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7285 (export_state == Migrator::EXPORT_NOTIFYING &&
7286 !migrator->export_has_notified(parent_dir,from))))) {
7287
7288 // not auth.
7289 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
11fdf7f2 7290 ceph_assert(parent_dir->is_frozen_tree_root());
7c673cae
FG
7291
7292 // make a message container
11fdf7f2
TL
7293
7294 auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
7295 if (em.second)
7296 em.first->second = MCacheExpire::create(from); /* new */
7297
7c673cae 7298 // merge these expires into it
11fdf7f2 7299 em.first->second->add_realm(p.first, p.second);
7c673cae
FG
7300 continue;
7301 }
11fdf7f2 7302 ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
7c673cae
FG
7303 (export_state == Migrator::EXPORT_WARNING &&
7304 !migrator->export_has_warned(parent_dir, from)));
7305
7306 dout(7) << "expires for " << *parent_dir << dendl;
7307 } else {
7308 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7309 }
7310
7311 // INODES
11fdf7f2
TL
7312 for (const auto &q : p.second.inodes) {
7313 CInode *in = get_inode(q.first);
7314 unsigned nonce = q.second;
7c673cae
FG
7315
7316 if (!in) {
11fdf7f2 7317 dout(0) << " inode expire on " << q.first << " from " << from
7c673cae 7318 << ", don't have it" << dendl;
11fdf7f2 7319 ceph_assert(in);
7c673cae 7320 }
11fdf7f2 7321 ceph_assert(in->is_auth());
7c673cae
FG
7322 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7323
7324 // check nonce
7325 if (nonce == in->get_replica_nonce(from)) {
7326 // remove from our cached_by
7327 dout(7) << " inode expire on " << *in << " from mds." << from
7328 << " cached_by was " << in->get_replicas() << dendl;
7329 inode_remove_replica(in, from, false, gather_locks);
7330 }
7331 else {
7332 // this is an old nonce, ignore expire.
7333 dout(7) << " inode expire on " << *in << " from mds." << from
7334 << " with old nonce " << nonce
7335 << " (current " << in->get_replica_nonce(from) << "), dropping"
7336 << dendl;
7337 }
7338 }
7339
7340 // DIRS
11fdf7f2
TL
7341 for (const auto &q : p.second.dirs) {
7342 CDir *dir = get_dirfrag(q.first);
7343 unsigned nonce = q.second;
7c673cae
FG
7344
7345 if (!dir) {
11fdf7f2 7346 CInode *diri = get_inode(q.first.ino);
7c673cae
FG
7347 if (diri) {
7348 if (mds->is_rejoin() &&
7349 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7350 !diri->is_replica(from)) {
7351 list<CDir*> ls;
7352 diri->get_nested_dirfrags(ls);
11fdf7f2 7353 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7354 << " while rejoining, inode isn't replicated" << dendl;
7355 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
7356 dir = *q;
7357 if (dir->is_replica(from)) {
7358 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7359 dir->remove_replica(from);
7360 }
7361 }
7362 continue;
7363 }
11fdf7f2 7364 CDir *other = diri->get_approx_dirfrag(q.first.frag);
7c673cae 7365 if (other) {
11fdf7f2 7366 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7c673cae
FG
7367 << " have " << *other << ", mismatched frags, dropping" << dendl;
7368 continue;
7369 }
7370 }
11fdf7f2 7371 dout(0) << " dir expire on " << q.first << " from " << from
7c673cae 7372 << ", don't have it" << dendl;
11fdf7f2 7373 ceph_assert(dir);
7c673cae
FG
7374 }
7375 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7376
11fdf7f2 7377 ceph_assert(dir->is_auth());
7c673cae
FG
7378
7379 // check nonce
7380 if (nonce == dir->get_replica_nonce(from)) {
7381 // remove from our cached_by
7382 dout(7) << " dir expire on " << *dir << " from mds." << from
181888fb 7383 << " replicas was " << dir->get_replicas() << dendl;
7c673cae
FG
7384 dir->remove_replica(from);
7385 }
7386 else {
7387 // this is an old nonce, ignore expire.
7388 dout(7) << " dir expire on " << *dir << " from mds." << from
7389 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7390 << "), dropping" << dendl;
7391 }
7392 }
7393
7394 // DENTRIES
11fdf7f2
TL
7395 for (const auto &pd : p.second.dentries) {
7396 dout(10) << " dn expires in dir " << pd.first << dendl;
7397 CInode *diri = get_inode(pd.first.ino);
7398 ceph_assert(diri);
7399 CDir *dir = diri->get_dirfrag(pd.first.frag);
7c673cae
FG
7400
7401 if (!dir) {
11fdf7f2 7402 dout(0) << " dn expires on " << pd.first << " from " << from
7c673cae
FG
7403 << ", must have refragmented" << dendl;
7404 } else {
11fdf7f2 7405 ceph_assert(dir->is_auth());
7c673cae
FG
7406 }
7407
11fdf7f2
TL
7408 for (const auto &p : pd.second) {
7409 unsigned nonce = p.second;
7c673cae
FG
7410 CDentry *dn;
7411
7412 if (dir) {
11fdf7f2 7413 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7414 } else {
7415 // which dirfrag for this dentry?
11fdf7f2
TL
7416 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
7417 ceph_assert(dir);
7418 ceph_assert(dir->is_auth());
7419 dn = dir->lookup(p.first.first, p.first.second);
7c673cae
FG
7420 }
7421
7422 if (!dn) {
7423 if (dir)
11fdf7f2 7424 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
7c673cae 7425 else
11fdf7f2 7426 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
7c673cae 7427 }
11fdf7f2 7428 ceph_assert(dn);
7c673cae
FG
7429
7430 if (nonce == dn->get_replica_nonce(from)) {
7431 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7432 dentry_remove_replica(dn, from, gather_locks);
7433 }
7434 else {
7435 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7436 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7437 << "), dropping" << dendl;
7438 }
7439 }
7440 }
7441 }
7442
7c673cae
FG
7443 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7444 if (!(*p)->is_stable())
7445 mds->locker->eval_gather(*p);
7446 }
7447}
7448
7449void MDCache::process_delayed_expire(CDir *dir)
7450{
7451 dout(7) << "process_delayed_expire on " << *dir << dendl;
11fdf7f2
TL
7452 for (const auto &p : delayed_expire[dir]) {
7453 handle_cache_expire(p.second);
7454 }
7c673cae
FG
7455 delayed_expire.erase(dir);
7456}
7457
7458void MDCache::discard_delayed_expire(CDir *dir)
7459{
7460 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7c673cae
FG
7461 delayed_expire.erase(dir);
7462}
7463
7464void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7465 set<SimpleLock *>& gather_locks)
7466{
7467 in->remove_replica(from);
11fdf7f2 7468 in->set_mds_caps_wanted(from, 0);
7c673cae
FG
7469
7470 // note: this code calls _eval more often than it needs to!
7471 // fix lock
7472 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7473 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7474 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7475 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7476 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7477 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7478
7479 // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
7480 // don't remove the recovering mds from the lock's gathering list, because
7481 // it may hold rejoined wrlocks.
7482 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7483 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7484 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7485}
7486
7487void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7488{
7489 dn->remove_replica(from);
7490
7491 // fix lock
7492 if (dn->lock.remove_replica(from))
7493 gather_locks.insert(&dn->lock);
7494
7495 // Replicated strays might now be eligible for purge
11fdf7f2 7496 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7c673cae
FG
7497 if (dnl->is_primary()) {
7498 maybe_eval_stray(dnl->get_inode());
7499 }
7500}
7501
7502void MDCache::trim_client_leases()
7503{
7504 utime_t now = ceph_clock_now();
7505
7506 dout(10) << "trim_client_leases" << dendl;
7507
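 // Leases in each pool list are assumed to be ordered by expiry (oldest at
 // the front), so each loop below can stop at the first lease whose ttl is
 // still in the future.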
7508 for (int pool=0; pool<client_lease_pools; pool++) {
7509 int before = client_leases[pool].size();
7510 if (client_leases[pool].empty())
7511 continue;
7512
7513 while (!client_leases[pool].empty()) {
7514 ClientLease *r = client_leases[pool].front();
7515 if (r->ttl > now) break;
7516 CDentry *dn = static_cast<CDentry*>(r->parent);
7517 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7518 dn->remove_client_lease(r, mds->locker);
7519 }
7520 int after = client_leases[pool].size();
7521 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7522 << (before-after) << " leases, " << after << " left" << dendl;
7523 }
7524}
7525
7526
7527void MDCache::check_memory_usage()
7528{
7529 static MemoryModel mm(g_ceph_context);
7530 static MemoryModel::snap last;
7531 mm.sample(&last);
7532 static MemoryModel::snap baseline = last;
7533
7534 // check client caps
11fdf7f2 7535 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
181888fb 7536 double caps_per_inode = 0.0;
7c673cae 7537 if (CInode::count())
181888fb 7538 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7c673cae 7539
a8e16298 7540 dout(2) << "Memory usage: "
7c673cae
FG
7541 << " total " << last.get_total()
7542 << ", rss " << last.get_rss()
7543 << ", heap " << last.get_heap()
7544 << ", baseline " << baseline.get_heap()
7c673cae
FG
7545 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7546 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7547 << dendl;
7548
c07f9fc5 7549 mds->update_mlogger();
7c673cae
FG
7550 mds->mlogger->set(l_mdm_rss, last.get_rss());
7551 mds->mlogger->set(l_mdm_heap, last.get_heap());
7552
181888fb 7553 if (cache_toofull()) {
a8e16298 7554 mds->server->recall_client_state(nullptr);
7c673cae
FG
7555 }
7556
7557 // If the cache size had exceeded its limit, but we're back in bounds
7558 // now, free any unused pool memory so that our memory usage isn't
7559 // permanently bloated.
181888fb 7560 if (exceeded_size_limit && !cache_toofull()) {
7c673cae
FG
7561 // Only do this once we are back in bounds: otherwise the releases would
7562 // slow down whatever process caused us to exceed bounds to begin with
7563 if (ceph_using_tcmalloc()) {
a8e16298 7564 dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
7c673cae
FG
7565 << dendl;
7566 ceph_heap_release_free_memory();
7567 }
7568 exceeded_size_limit = false;
7569 }
7570}
7571
7572
7573
7574// =========================================================================================
7575// shutdown
7576
7577class C_MDC_ShutdownCheck : public MDCacheContext {
7578public:
7579 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7580 void finish(int) override {
7581 mdcache->shutdown_check();
7582 }
7583};
7584
7585void MDCache::shutdown_check()
7586{
7587 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7588
7589 // cache
7590 char old_val[32] = { 0 };
7591 char *o = old_val;
11fdf7f2
TL
7592 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7593 g_conf().set_val("debug_mds", "10");
7594 g_conf().apply_changes(nullptr);
7c673cae 7595 show_cache();
11fdf7f2
TL
7596 g_conf().set_val("debug_mds", old_val);
7597 g_conf().apply_changes(nullptr);
7598 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae
FG
7599
7600 // this
31f18b77 7601 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7602 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7603
7604
7605 if (mds->objecter->is_active()) {
7606 dout(0) << "objecter still active" << dendl;
7607 mds->objecter->dump_active();
7608 }
7609}
7610
7611
7612void MDCache::shutdown_start()
7613{
a8e16298 7614 dout(5) << "shutdown_start" << dendl;
7c673cae 7615
11fdf7f2
TL
7616 if (g_conf()->mds_shutdown_check)
7617 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7c673cae 7618
11fdf7f2 7619 // g_conf()->debug_mds = 10;
7c673cae
FG
7620}
7621
7622
7623
7624bool MDCache::shutdown_pass()
7625{
7626 dout(7) << "shutdown_pass" << dendl;
7627
7628 if (mds->is_stopped()) {
7629 dout(7) << " already shut down" << dendl;
7630 show_cache();
7631 show_subtrees();
7632 return true;
7633 }
7634
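 // Shutdown proceeds as a series of gates, re-evaluated on each pass
 // (descriptive summary of the checks below): export strays and auth
 // subtrees away, close sessions, trim the cache and the journal down to a
 // single segment, drop stray/mydir pins, cap the journal and write its
 // header, and only report done once the objecter is idle and the cache is
 // empty.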
7635 // empty stray dir
28e407b8 7636 bool strays_all_exported = shutdown_export_strays();
7c673cae
FG
7637
7638 // trim cache
181888fb 7639 trim(UINT64_MAX);
31f18b77 7640 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae 7641
28e407b8 7642 // Export all auth subtrees to another active rank (usually rank 0) if we are not rank 0
7c673cae
FG
7643 int num_auth_subtree = 0;
7644 if (!subtrees.empty() &&
28e407b8 7645 mds->get_nodeid() != 0) {
7c673cae
FG
7646 dout(7) << "looking for subtrees to export to mds0" << dendl;
7647 list<CDir*> ls;
7648 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7649 it != subtrees.end();
7650 ++it) {
7651 CDir *dir = it->first;
7652 if (dir->get_inode()->is_mdsdir())
7653 continue;
7654 if (dir->is_auth()) {
7655 num_auth_subtree++;
7656 if (dir->is_frozen() ||
7657 dir->is_freezing() ||
7658 dir->is_ambiguous_dir_auth() ||
7659 dir->state_test(CDir::STATE_EXPORTING))
7660 continue;
7661 ls.push_back(dir);
7662 }
7663 }
28e407b8
AA
7664
7665 migrator->clear_export_queue();
7c673cae
FG
7666 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7667 CDir *dir = *p;
7668 mds_rank_t dest = dir->get_inode()->authority().first;
7669 if (dest > 0 && !mds->mdsmap->is_active(dest))
7670 dest = 0;
7671 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7672 migrator->export_dir_nicely(dir, dest);
7673 }
7674 }
7675
28e407b8
AA
7676 if (!strays_all_exported) {
7677 dout(7) << "waiting for strays to migrate" << dendl;
7678 return false;
7679 }
7680
7c673cae 7681 if (num_auth_subtree > 0) {
11fdf7f2 7682 ceph_assert(mds->get_nodeid() > 0);
7c673cae
FG
7683 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7684 show_subtrees();
7685 return false;
7686 }
7687
7688 // close out any sessions (and open files!) before we try to trim the log, etc.
7689 if (mds->sessionmap.have_unclosed_sessions()) {
7690 if (!mds->server->terminating_sessions)
7691 mds->server->terminate_sessions();
7692 return false;
7693 }
7694
28e407b8
AA
7695 // Fully trim the log so that all objects in cache are clean and may be
7696 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not trim
7697 // the log aggressively enough for the cache to become clean on its own.
f64942e4
AA
7698 if (mds->mdlog->get_num_segments() > 0) {
7699 auto ls = mds->mdlog->get_current_segment();
7700 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7701 // Current segment contains events other than subtreemap or
7702 // there are dirty dirfrags (see CDir::log_mark_dirty())
7703 mds->mdlog->start_new_segment();
7704 mds->mdlog->flush();
7705 }
7706 }
7707 mds->mdlog->trim_all();
28e407b8
AA
7708 if (mds->mdlog->get_num_segments() > 1) {
7709 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7710 return false;
7711 }
7712
7713 // drop our reference to our stray dir inode
7714 for (int i = 0; i < NUM_STRAY; ++i) {
7715 if (strays[i] &&
7716 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7717 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7718 strays[i]->put(CInode::PIN_STRAY);
7719 strays[i]->put_stickydirs();
7720 }
7721 }
7722
7c673cae
FG
7723 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7724 if (mydir && !mydir->is_subtree_root())
7725 mydir = NULL;
7726
7727 // subtrees map not empty yet?
7728 if (subtrees.size() > (mydir ? 1 : 0)) {
7729 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7730 show_subtrees();
7731 migrator->show_importing();
7732 migrator->show_exporting();
7733 if (!migrator->is_importing() && !migrator->is_exporting())
7734 show_cache();
7735 return false;
7736 }
11fdf7f2
TL
7737 ceph_assert(!migrator->is_exporting());
7738 ceph_assert(!migrator->is_importing());
7c673cae 7739
f64942e4
AA
7740 // replicas may dirty scatter locks
7741 if (myin && myin->is_replicated()) {
7742 dout(7) << "still have replicated objects" << dendl;
7743 return false;
7744 }
7745
11fdf7f2
TL
7746 if ((myin && myin->get_num_auth_pins()) ||
7747 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
181888fb
FG
7748 dout(7) << "still have auth pinned objects" << dendl;
7749 return false;
7750 }
7751
7c673cae
FG
7752 // (only do this once!)
7753 if (!mds->mdlog->is_capped()) {
7754 dout(7) << "capping the log" << dendl;
7755 mds->mdlog->cap();
7c673cae
FG
7756 }
7757
f64942e4
AA
7758 if (!mds->mdlog->empty())
7759 mds->mdlog->trim(0);
7760
7c673cae
FG
7761 if (!mds->mdlog->empty()) {
7762 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7763 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7764 return false;
7765 }
7766
7767 if (!did_shutdown_log_cap) {
7768 // flush journal header
7769 dout(7) << "writing header for (now-empty) journal" << dendl;
11fdf7f2 7770 ceph_assert(mds->mdlog->empty());
7c673cae
FG
7771 mds->mdlog->write_head(0);
7772 // NOTE: filer active checker below will block us until this completes.
7773 did_shutdown_log_cap = true;
7774 return false;
7775 }
7776
7777 // filer active?
7778 if (mds->objecter->is_active()) {
7779 dout(7) << "objecter still active" << dendl;
7780 mds->objecter->dump_active();
7781 return false;
7782 }
7783
7784 // trim what we can from the cache
31f18b77
FG
7785 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7786 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7c673cae
FG
7787 show_cache();
7788 //dump();
7789 return false;
7790 }
31f18b77
FG
7791
7792 // make mydir subtree go away
7793 if (mydir) {
7794 if (mydir->get_num_ref() > 1) { // subtree pin
7795 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7796 show_cache();
7797 return false;
7798 }
7799
7800 remove_subtree(mydir);
7801 myin->close_dirfrag(mydir->get_frag());
7802 }
11fdf7f2 7803 ceph_assert(subtrees.empty());
31f18b77 7804
1adf2230 7805 if (myin) {
31f18b77 7806 remove_inode(myin);
11fdf7f2 7807 ceph_assert(!myin);
1adf2230
AA
7808 }
7809
11fdf7f2
TL
7810 if (global_snaprealm) {
7811 remove_inode(global_snaprealm->inode);
7812 global_snaprealm = nullptr;
7813 }
7814
7c673cae 7815 // done!
a8e16298 7816 dout(5) << "shutdown done." << dendl;
7c673cae
FG
7817 return true;
7818}
7819
7820bool MDCache::shutdown_export_strays()
7821{
f64942e4
AA
7822 static const unsigned MAX_EXPORTING = 100;
7823
7c673cae
FG
7824 if (mds->get_nodeid() == 0)
7825 return true;
f64942e4
AA
7826
7827 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7828 return false;
7829
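 // Back-pressure: while roughly two thirds of MAX_EXPORTING stray exports
 // are still in flight, don't scan for more; shutdown_export_next records
 // the dirfrag/dentry position at which the next scan resumes.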
7830 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7831 << " '" << shutdown_export_next.second << "'" << dendl;
7c673cae
FG
7832
7833 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
f64942e4 7834 bool all_exported = false;
7c673cae 7835
f64942e4
AA
7836again:
7837 auto next = shutdown_export_next;
7c673cae 7838
7c673cae 7839 for (int i = 0; i < NUM_STRAY; ++i) {
f64942e4
AA
7840 CInode *strayi = strays[i];
7841 if (!strayi ||
7842 !strayi->state_test(CInode::STATE_STRAYPINNED))
7843 continue;
7844 if (strayi->ino() < next.first.ino)
7c673cae 7845 continue;
7c673cae 7846
f64942e4
AA
7847 deque<CDir*> dfls;
7848 strayi->get_dirfrags(dfls);
7c673cae 7849
f64942e4
AA
7850 while (!dfls.empty()) {
7851 CDir *dir = dfls.front();
7852 dfls.pop_front();
7853
7854 if (dir->dirfrag() < next.first)
7c673cae 7855 continue;
f64942e4
AA
7856 if (next.first < dir->dirfrag()) {
7857 next.first = dir->dirfrag();
7858 next.second.clear();
7859 }
7860
7861 if (!dir->is_complete()) {
11fdf7f2 7862 MDSContext *fin = nullptr;
f64942e4
AA
7863 if (shutdown_exporting_strays.empty()) {
7864 fin = new MDSInternalContextWrapper(mds,
7865 new FunctionContext([this](int r) {
7866 shutdown_export_strays();
7867 })
7868 );
7869 }
7870 dir->fetch(fin);
7871 goto done;
7c673cae
FG
7872 }
7873
f64942e4
AA
7874 CDir::dentry_key_map::iterator it;
7875 if (next.second.empty()) {
7876 it = dir->begin();
7c673cae 7877 } else {
f64942e4
AA
7878 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
7879 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
7c673cae 7880 }
f64942e4
AA
7881
7882 for (; it != dir->end(); ++it) {
7883 CDentry *dn = it->second;
7884 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7885 if (dnl->is_null())
7886 continue;
7887
7888 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
11fdf7f2 7889 next.second = it->first.name;
f64942e4
AA
7890 goto done;
7891 }
7892
7893 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
7894 if (!ret.second) {
7895 dout(10) << "already exporting/purging " << *dn << dendl;
7896 continue;
7897 }
7898
7899 // Don't try to migrate anything that is actually
7900 // being purged right now
7901 if (!dn->state_test(CDentry::STATE_PURGING))
7902 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
7903
7904 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
7905 ++it;
7906 if (it != dir->end()) {
11fdf7f2 7907 next.second = it->first.name;
f64942e4
AA
7908 } else {
7909 if (dfls.empty())
7910 next.first.ino.val++;
7911 else
7912 next.first = dfls.front()->dirfrag();
7913 next.second.clear();
7914 }
7915 goto done;
7916 }
7917 }
7918 }
7919 }
7920
7921 if (shutdown_exporting_strays.empty()) {
7922 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
7923 if (first_df < shutdown_export_next.first ||
7924 !shutdown_export_next.second.empty()) {
7925 shutdown_export_next.first = first_df;
7926 shutdown_export_next.second.clear();
7927 goto again;
7c673cae 7928 }
f64942e4 7929 all_exported = true;
7c673cae
FG
7930 }
7931
f64942e4
AA
7932done:
7933 shutdown_export_next = next;
7934 return all_exported;
7c673cae
FG
7935}
7936
7937// ========= messaging ==============
7938
11fdf7f2 7939void MDCache::dispatch(const Message::const_ref &m)
7c673cae
FG
7940{
7941 switch (m->get_type()) {
7942
7943 // RESOLVE
7944 case MSG_MDS_RESOLVE:
11fdf7f2 7945 handle_resolve(MMDSResolve::msgref_cast(m));
7c673cae
FG
7946 break;
7947 case MSG_MDS_RESOLVEACK:
11fdf7f2 7948 handle_resolve_ack(MMDSResolveAck::msgref_cast(m));
7c673cae
FG
7949 break;
7950
7951 // REJOIN
7952 case MSG_MDS_CACHEREJOIN:
11fdf7f2 7953 handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m));
7c673cae
FG
7954 break;
7955
7956 case MSG_MDS_DISCOVER:
11fdf7f2 7957 handle_discover(MDiscover::msgref_cast(m));
7c673cae
FG
7958 break;
7959 case MSG_MDS_DISCOVERREPLY:
11fdf7f2 7960 handle_discover_reply(MDiscoverReply::msgref_cast(m));
7c673cae
FG
7961 break;
7962
7963 case MSG_MDS_DIRUPDATE:
11fdf7f2 7964 handle_dir_update(MDirUpdate::msgref_cast(m));
7c673cae
FG
7965 break;
7966
7967 case MSG_MDS_CACHEEXPIRE:
11fdf7f2 7968 handle_cache_expire(MCacheExpire::msgref_cast(m));
7c673cae
FG
7969 break;
7970
7971 case MSG_MDS_DENTRYLINK:
11fdf7f2 7972 handle_dentry_link(MDentryLink::msgref_cast(m));
7c673cae
FG
7973 break;
7974 case MSG_MDS_DENTRYUNLINK:
11fdf7f2 7975 handle_dentry_unlink(MDentryUnlink::msgref_cast(m));
7c673cae
FG
7976 break;
7977
7978 case MSG_MDS_FRAGMENTNOTIFY:
11fdf7f2 7979 handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m));
7c673cae 7980 break;
a8e16298 7981 case MSG_MDS_FRAGMENTNOTIFYACK:
11fdf7f2 7982 handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m));
a8e16298 7983 break;
7c673cae
FG
7984
7985 case MSG_MDS_FINDINO:
11fdf7f2 7986 handle_find_ino(MMDSFindIno::msgref_cast(m));
7c673cae
FG
7987 break;
7988 case MSG_MDS_FINDINOREPLY:
11fdf7f2 7989 handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m));
7c673cae
FG
7990 break;
7991
7992 case MSG_MDS_OPENINO:
11fdf7f2 7993 handle_open_ino(MMDSOpenIno::msgref_cast(m));
7c673cae
FG
7994 break;
7995 case MSG_MDS_OPENINOREPLY:
11fdf7f2
TL
7996 handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m));
7997 break;
7998
7999 case MSG_MDS_SNAPUPDATE:
8000 handle_snap_update(MMDSSnapUpdate::msgref_cast(m));
7c673cae
FG
8001 break;
8002
8003 default:
8004 derr << "cache unknown message " << m->get_type() << dendl;
11fdf7f2 8005 ceph_abort_msg("cache unknown message");
7c673cae
FG
8006 }
8007}
8008
11fdf7f2
TL
8009int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, // who
8010 const filepath& path, // what
7c673cae 8011 vector<CDentry*> *pdnvec, // result
11fdf7f2 8012 CInode **pin,
7c673cae
FG
8013 int onfail)
8014{
8015 bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
8016 bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
8017 bool forward = (onfail == MDS_TRAVERSE_FORWARD);
8018
11fdf7f2 8019 ceph_assert(!forward || mdr); // forward requires a request
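 // The onfail modes, roughly: MDS_TRAVERSE_DISCOVER fetches missing replicas
 // from the authority, MDS_TRAVERSE_DISCOVERXLOCK additionally allows the
 // walk to end at a null dentry (null_okay), and MDS_TRAVERSE_FORWARD
 // forwards the request toward the authoritative MDS instead.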
7c673cae
FG
8020
8021 snapid_t snapid = CEPH_NOSNAP;
8022 if (mdr)
8023 mdr->snapid = snapid;
8024
8025 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
8026
8027 if (mds->logger) mds->logger->inc(l_mds_traverse);
8028
8029 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8030 CInode *cur = get_inode(path.get_ino());
8031 if (cur == NULL) {
8032 if (MDS_INO_IS_MDSDIR(path.get_ino()))
11fdf7f2 8033 open_foreign_mdsdir(path.get_ino(), cf.build());
7c673cae
FG
8034 else {
8035 //ceph_abort(); // hrm.. broken
8036 return -ESTALE;
8037 }
8038 return 1;
8039 }
8040 if (cur->state_test(CInode::STATE_PURGING))
8041 return -ESTALE;
8042
8043 // make sure snaprealms are open...
11fdf7f2
TL
8044 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8045 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8046 return 1;
8047 }
8048
8049 // start trace
8050 if (pdnvec)
8051 pdnvec->clear();
8052 if (pin)
8053 *pin = cur;
8054
8055 unsigned depth = 0;
8056 while (depth < path.depth()) {
8057 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8058 << "' snapid " << snapid << dendl;
8059
8060 if (!cur->is_dir()) {
8061 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8062 return -ENOTDIR;
8063 }
8064
8065 // walk into snapdir?
8066 if (path[depth].length() == 0) {
8067 dout(10) << "traverse: snapdir" << dendl;
8068 if (!mdr)
8069 return -EINVAL;
8070 snapid = CEPH_SNAPDIR;
8071 mdr->snapid = snapid;
8072 depth++;
8073 continue;
8074 }
8075 // walk thru snapdir?
8076 if (snapid == CEPH_SNAPDIR) {
8077 if (!mdr)
8078 return -EINVAL;
8079 SnapRealm *realm = cur->find_snaprealm();
8080 snapid = realm->resolve_snapname(path[depth], cur->ino());
8081 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
11fdf7f2
TL
8082 if (!snapid) {
8083 CInode *t = cur;
8084 while (t) {
8085 // if snaplock isn't readable, it's possible another mds is creating a
8086 // snapshot but the snap update message hasn't been received yet.
8087 if (!t->snaplock.can_read(client)) {
8088 dout(10) << " non-readable snaplock on " << *t << dendl;
8089 t->snaplock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8090 return 1;
8091 }
8092 CDentry *pdn = t->get_projected_parent_dn();
8093 t = pdn ? pdn->get_dir()->get_inode() : NULL;
8094 }
7c673cae 8095 return -ENOENT;
11fdf7f2 8096 }
7c673cae
FG
8097 mdr->snapid = snapid;
8098 depth++;
8099 continue;
8100 }
8101
8102 // open dir
8103 frag_t fg = cur->pick_dirfrag(path[depth]);
8104 CDir *curdir = cur->get_dirfrag(fg);
8105 if (!curdir) {
8106 if (cur->is_auth()) {
8107 // parent dir frozen_dir?
8108 if (cur->is_frozen()) {
8109 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
11fdf7f2 8110 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
7c673cae
FG
8111 return 1;
8112 }
8113 curdir = cur->get_or_open_dirfrag(this, fg);
8114 } else {
8115 // discover?
8116 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
11fdf7f2 8117 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
7c673cae
FG
8118 null_okay);
8119 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8120 return 1;
8121 }
8122 }
11fdf7f2 8123 ceph_assert(curdir);
7c673cae
FG
8124
8125#ifdef MDS_VERIFY_FRAGSTAT
8126 if (curdir->is_complete())
8127 curdir->verify_fragstat();
8128#endif
8129
8130 // frozen?
8131 /*
8132 if (curdir->is_frozen()) {
8133 // doh!
8134 // FIXME: traverse is allowed?
8135 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8136 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8137 if (onfinish) delete onfinish;
8138 return 1;
8139 }
8140 */
8141
8142 // Before doing dirfrag->dn lookup, compare with DamageTable's
8143 // record of which dentries were unreadable
8144 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8145 dout(4) << "traverse: stopped lookup at damaged dentry "
8146 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8147 return -EIO;
8148 }
8149
8150 // dentry
8151 CDentry *dn = curdir->lookup(path[depth], snapid);
8152 CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;
8153
8154 // null and last_bit and xlocked by me?
8155 if (dnl && dnl->is_null() && null_okay) {
8156 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
8157 if (pdnvec)
8158 pdnvec->push_back(dn);
8159 if (pin)
8160 *pin = 0;
8161 break; // done!
8162 }
8163
8164 if (dnl &&
8165 dn->lock.is_xlocked() &&
8166 dn->lock.get_xlock_by() != mdr &&
8167 !dn->lock.can_read(client) &&
8168 (dnl->is_null() || forward)) {
8169 dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
11fdf7f2 8170 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
7c673cae
FG
8171 if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
8172 mds->mdlog->flush();
8173 return 1;
8174 }
8175
8176 // can we conclude ENOENT?
8177 if (dnl && dnl->is_null()) {
8178 if (dn->lock.can_read(client) ||
8179 (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8180 dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
8181 if (pdnvec) {
8182 if (depth == path.depth() - 1)
8183 pdnvec->push_back(dn);
8184 else
8185 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8186 }
8187 return -ENOENT;
8188 } else {
8189 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
11fdf7f2 8190 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
7c673cae
FG
8191 return 1;
8192 }
8193 }
8194
8195 if (dnl && !dnl->is_null()) {
8196 CInode *in = dnl->get_inode();
8197
8198 // do we have inode?
8199 if (!in) {
11fdf7f2 8200 ceph_assert(dnl->is_remote());
7c673cae
FG
8201 // do i have it?
8202 in = get_inode(dnl->get_remote_ino());
8203 if (in) {
8204 dout(7) << "linking in remote in " << *in << dendl;
8205 dn->link_remote(dnl, in);
8206 } else {
8207 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
11fdf7f2 8208 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
7c673cae
FG
8209 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8210 dout(4) << "traverse: remote dentry points to damaged ino "
8211 << *dn << dendl;
8212 return -EIO;
8213 }
11fdf7f2 8214 open_remote_dentry(dn, true, cf.build(),
7c673cae
FG
8215 (null_okay && depth == path.depth() - 1));
8216 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8217 return 1;
8218 }
8219 }
8220
8221 cur = in;
8222 // make sure snaprealms are open...
11fdf7f2
TL
8223 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8224 !cur->snaprealm->open_parents(cf.build())) {
7c673cae
FG
8225 return 1;
8226 }
8227
8228 // add to trace, continue.
8229 touch_inode(cur);
8230 if (pdnvec)
8231 pdnvec->push_back(dn);
8232 if (pin)
8233 *pin = cur;
8234 depth++;
8235 continue;
8236 }
8237
8238
8239 // MISS. dentry doesn't exist.
8240 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8241
8242 if (curdir->is_auth()) {
8243 // dentry is mine.
8244 if (curdir->is_complete() ||
8245 (snapid == CEPH_NOSNAP &&
8246 curdir->has_bloom() &&
11fdf7f2 8247 !curdir->is_in_bloom(path[depth]))) {
7c673cae
FG
8248 // file not found
8249 if (pdnvec) {
8250 // instantiate a null dn?
8251 if (depth < path.depth()-1){
8252 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8253 dn = NULL;
8254 } else if (dn) {
8255 ceph_abort(); // should have fallen out in ->is_null() check above
8256 } else if (curdir->is_frozen()) {
8257 dout(20) << " not adding null to frozen dir " << dendl;
8258 } else if (snapid < CEPH_MAXSNAP) {
8259 dout(20) << " not adding null for snapid " << snapid << dendl;
8260 } else {
8261 // create a null dentry
8262 dn = curdir->add_null_dentry(path[depth]);
8263 dout(20) << " added null " << *dn << dendl;
8264 }
8265 if (dn)
8266 pdnvec->push_back(dn);
8267 else
8268 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8269 }
8270 return -ENOENT;
8271 } else {
8272
8273 // Check DamageTable for missing fragments before trying to fetch
8274 // this
8275 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8276 dout(4) << "traverse: damaged dirfrag " << *curdir
8277 << ", blocking fetch" << dendl;
8278 return -EIO;
8279 }
8280
8281 // directory isn't complete; reload
8282 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8283 touch_inode(cur);
11fdf7f2 8284 curdir->fetch(cf.build(), path[depth]);
7c673cae
FG
8285 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8286 return 1;
8287 }
8288 } else {
8289 // dirfrag/dentry is not mine.
8290 mds_authority_t dauth = curdir->authority();
8291
8292 if (forward &&
11fdf7f2 8293 mdr && mdr->client_request &&
7c673cae
FG
8294 (int)depth < mdr->client_request->get_num_fwd()) {
8295 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8296 << " < fwd " << mdr->client_request->get_num_fwd()
8297 << ", discovering instead of forwarding" << dendl;
8298 discover = true;
8299 }
8300
8301 if ((discover || null_okay)) {
8302 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
11fdf7f2 8303 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
7c673cae
FG
8304 null_okay);
8305 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8306 return 1;
8307 }
8308 if (forward) {
8309 // forward
8310 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8311
8312 if (curdir->is_ambiguous_auth()) {
8313 // wait
8314 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
11fdf7f2 8315 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
7c673cae
FG
8316 return 1;
8317 }
8318
8319 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
11fdf7f2
TL
8320
8321 request_forward(mdr, dauth.first);
8322
7c673cae 8323 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
7c673cae 8324 return 2;
11fdf7f2 8325 }
7c673cae 8326 }
11fdf7f2 8327
7c673cae
FG
8328 ceph_abort(); // i shouldn't get here
8329 }
8330
8331 // success.
8332 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8333 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8334 if (mdr)
11fdf7f2 8335 ceph_assert(mdr->snapid == snapid);
7c673cae
FG
8336 return 0;
8337}
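// A rough sketch of how a caller consumes these return values (illustrative
// only; 'refpath', 'trace', 'in' and 'handle_error' are placeholders for the
// caller's own variables and error path, and 'cf' is a retry-context factory
// like the ones used elsewhere in this file):
//
//   int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in,
//                                  MDS_TRAVERSE_FORWARD);
//   if (r > 0)
//     return;                 // 1: waiting on a fetch/lock/discover, cf retries
//                             // 2: the request was forwarded to the auth mds
//   if (r < 0) {
//     handle_error(mdr, r);   // e.g. -ENOENT, -ENOTDIR, -EIO, -ESTALE
//     return;
//   }
//   // r == 0: 'trace' holds the dentries walked and 'in' the final inode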
8338
8339CInode *MDCache::cache_traverse(const filepath& fp)
8340{
8341 dout(10) << "cache_traverse " << fp << dendl;
8342
8343 CInode *in;
8344 if (fp.get_ino())
8345 in = get_inode(fp.get_ino());
8346 else
8347 in = root;
8348 if (!in)
8349 return NULL;
8350
8351 for (unsigned i = 0; i < fp.depth(); i++) {
11fdf7f2 8352 std::string_view dname = fp[i];
7c673cae
FG
8353 frag_t fg = in->pick_dirfrag(dname);
8354 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8355 CDir *curdir = in->get_dirfrag(fg);
8356 if (!curdir)
8357 return NULL;
8358 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8359 if (!dn)
8360 return NULL;
8361 in = dn->get_linkage()->get_inode();
8362 if (!in)
8363 return NULL;
8364 }
8365 dout(10) << " got " << *in << dendl;
8366 return in;
8367}
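// cache_traverse() is the lock-free, I/O-free counterpart to path_traverse():
// it only succeeds if every dentry and inode along the path (looked up at
// CEPH_NOSNAP) is already in cache, and returns NULL on any miss. A rough
// usage sketch (illustrative only; the path is an arbitrary example, resolved
// relative to the root when the filepath's base ino is 0):
//
//   CInode *in = mdcache->cache_traverse(filepath("a/b/c", 0));
//   if (!in) {
//     // fall back to a full path_traverse(), which may block or forward
//   }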
8368
8369
8370/**
8371 * open_remote_dirfrag -- open up a remote dirfrag
8372 *
8373 * @param diri base inode
8374 * @param approxfg approximate fragment.
8375 * @param fin completion callback
8376 */
11fdf7f2 8377void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
7c673cae
FG
8378{
8379 dout(10) << "open_remote_dir on " << *diri << dendl;
11fdf7f2
TL
8380 ceph_assert(diri->is_dir());
8381 ceph_assert(!diri->is_auth());
8382 ceph_assert(diri->get_dirfrag(approxfg) == 0);
7c673cae 8383
224ce89b 8384 discover_dir_frag(diri, approxfg, fin);
7c673cae
FG
8385}
8386
8387
8388/**
8389 * get_dentry_inode - get or open inode
8390 *
8391 * @param dn the dentry
8392 * @param mdr current request
8393 *
8394 * Returns the inode for a primary link, or links up / opens the remote link's inode as necessary.
8395 * If it isn't immediately available, puts mdr on a wait list and returns null.
8396 */
8397CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8398{
8399 CDentry::linkage_t *dnl;
8400 if (projected)
8401 dnl = dn->get_projected_linkage();
8402 else
8403 dnl = dn->get_linkage();
8404
11fdf7f2 8405 ceph_assert(!dnl->is_null());
7c673cae
FG
8406
8407 if (dnl->is_primary())
8408 return dnl->inode;
8409
11fdf7f2 8410 ceph_assert(dnl->is_remote());
7c673cae
FG
8411 CInode *in = get_inode(dnl->get_remote_ino());
8412 if (in) {
8413 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8414 dn->link_remote(dnl, in);
8415 return in;
8416 } else {
8417 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8418 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8419 return 0;
8420 }
8421}
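// A rough sketch of the usual call pattern (illustrative only): the caller
// passes its request so that, if the remote inode still has to be opened,
// the request is re-queued via C_MDS_RetryRequest once open_ino() completes:
//
//   CInode *in = mdcache->get_dentry_inode(dn, mdr, true /* projected */);
//   if (!in)
//     return;   // mdr is now on a waiter list and will be retried later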
8422
8423struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8424 CDentry *dn;
8425 inodeno_t ino;
11fdf7f2 8426 MDSContext *onfinish;
7c673cae 8427 bool want_xlocked;
11fdf7f2 8428 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
31f18b77
FG
8429 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8430 dn->get(MDSCacheObject::PIN_PTRWAITER);
8431 }
7c673cae
FG
8432 void finish(int r) override {
8433 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
31f18b77 8434 dn->put(MDSCacheObject::PIN_PTRWAITER);
7c673cae
FG
8435 }
8436};
8437
11fdf7f2 8438void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
7c673cae
FG
8439{
8440 dout(10) << "open_remote_dentry " << *dn << dendl;
8441 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8442 inodeno_t ino = dnl->get_remote_ino();
8443 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8444 open_ino(ino, pool,
8445 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8446}
8447
11fdf7f2 8448void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae
FG
8449 bool want_xlocked, int r)
8450{
8451 if (r < 0) {
31f18b77
FG
8452 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8453 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
7c673cae
FG
8454 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8455 dn->state_set(CDentry::STATE_BADREMOTEINO);
8456
8457 std::string path;
8458 CDir *dir = dn->get_dir();
8459 if (dir) {
31f18b77 8460 dir->get_inode()->make_path_string(path);
94b18763 8461 path += "/";
11fdf7f2 8462 path += dn->get_name();
7c673cae
FG
8463 }
8464
31f18b77 8465 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
7c673cae 8466 if (fatal) {
31f18b77
FG
8467 mds->damaged();
8468 ceph_abort(); // unreachable, damaged() respawns us
7c673cae 8469 }
31f18b77
FG
8470 } else {
8471 r = 0;
8472 }
7c673cae
FG
8473 }
8474 fin->complete(r < 0 ? r : 0);
8475}
8476
8477
8478void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8479{
8480 // empty trace if we're a base inode
8481 if (in->is_base())
8482 return;
8483
8484 CInode *parent = in->get_parent_inode();
11fdf7f2 8485 ceph_assert(parent);
7c673cae
FG
8486 make_trace(trace, parent);
8487
8488 CDentry *dn = in->get_parent_dn();
8489 dout(15) << "make_trace adding " << *dn << dendl;
8490 trace.push_back(dn);
8491}
8492
8493
8494// -------------------------------------------------------------------------------
8495// Open inode by inode number
8496
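/*
 * Rough shape of the open-by-ino machinery below (descriptive summary only):
 *
 *   open_ino(ino, pool, fin)
 *     -> do_open_ino(): decide whether to fetch the on-disk backtrace (the
 *        "parent" xattr of the inode's first object), query peer MDSs with
 *        MMDSOpenIno, or walk ancestors that are already known.
 *     -> open_ino_traverse_dir(): walk the ancestor chain from the backtrace,
 *        fetching or discovering dirfrags along the way.
 *     -> open_ino_finish(): complete all waiters with the auth rank on
 *        success or a negative error.
 *
 * Per-ino state for in-flight opens lives in opening_inodes (open_ino_info_t).
 */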
8497class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8498 inodeno_t ino;
8499 public:
8500 bufferlist bl;
8501 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8502 MDCacheIOContext(c), ino(i) {}
8503 void finish(int r) override {
8504 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8505 }
91327a77
AA
8506 void print(ostream& out) const override {
8507 out << "openino_backtrace_fetch" << ino << ")";
8508 }
7c673cae
FG
8509};
8510
8511struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8512 inodeno_t ino;
11fdf7f2 8513 MMDSOpenIno::const_ref msg;
7c673cae
FG
8514 bool parent;
8515 public:
11fdf7f2 8516 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const MMDSOpenIno::const_ref &m, bool p) :
7c673cae
FG
8517 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8518 void finish(int r) override {
8519 if (r < 0 && !parent)
8520 r = -EAGAIN;
8521 if (msg) {
8522 mdcache->handle_open_ino(msg, r);
8523 return;
8524 }
11fdf7f2
TL
8525 auto& info = mdcache->opening_inodes.at(ino);
8526 mdcache->_open_ino_traverse_dir(ino, info, r);
7c673cae
FG
8527 }
8528};
8529
8530struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8531 inodeno_t ino;
8532 public:
8533 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8534 void finish(int r) override {
8535 mdcache->_open_ino_parent_opened(ino, r);
8536 }
8537};
8538
8539void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8540{
8541 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8542
11fdf7f2 8543 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8544
8545 CInode *in = get_inode(ino);
8546 if (in) {
8547 dout(10) << " found cached " << *in << dendl;
8548 open_ino_finish(ino, info, in->authority().first);
8549 return;
8550 }
8551
8552 inode_backtrace_t backtrace;
8553 if (err == 0) {
8554 try {
11fdf7f2 8555 decode(backtrace, bl);
7c673cae
FG
8556 } catch (const buffer::error &decode_exc) {
8557 derr << "corrupt backtrace on ino x0" << std::hex << ino
8558 << std::dec << ": " << decode_exc << dendl;
8559 open_ino_finish(ino, info, -EIO);
8560 return;
8561 }
8562 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8563 dout(10) << " old object in pool " << info.pool
8564 << ", retrying pool " << backtrace.pool << dendl;
8565 info.pool = backtrace.pool;
8566 C_IO_MDC_OpenInoBacktraceFetched *fin =
8567 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8568 fetch_backtrace(ino, info.pool, fin->bl,
8569 new C_OnFinisher(fin, mds->finisher));
8570 return;
8571 }
8572 } else if (err == -ENOENT) {
8573 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8574 if (info.pool != meta_pool) {
8575 dout(10) << " no object in pool " << info.pool
8576 << ", retrying pool " << meta_pool << dendl;
8577 info.pool = meta_pool;
8578 C_IO_MDC_OpenInoBacktraceFetched *fin =
8579 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8580 fetch_backtrace(ino, info.pool, fin->bl,
8581 new C_OnFinisher(fin, mds->finisher));
8582 return;
8583 }
8584 err = 0; // backtrace.ancestors.empty() is checked below
8585 }
8586
8587 if (err == 0) {
8588 if (backtrace.ancestors.empty()) {
8589 dout(10) << " got empty backtrace " << dendl;
8590 err = -EIO;
8591 } else if (!info.ancestors.empty()) {
8592 if (info.ancestors[0] == backtrace.ancestors[0]) {
8593 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8594 err = -EINVAL;
8595 } else {
8596 info.last_err = 0;
8597 }
8598 }
8599 }
8600 if (err) {
8601 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8602 if (info.last_err)
8603 err = info.last_err;
8604 open_ino_finish(ino, info, err);
8605 return;
8606 }
8607
8608 dout(10) << " got backtrace " << backtrace << dendl;
8609 info.ancestors = backtrace.ancestors;
8610
8611 _open_ino_traverse_dir(ino, info, 0);
8612}
8613
8614void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8615{
8616 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8617
11fdf7f2 8618 open_ino_info_t& info = opening_inodes.at(ino);
7c673cae
FG
8619
8620 CInode *in = get_inode(ino);
8621 if (in) {
8622 dout(10) << " found cached " << *in << dendl;
8623 open_ino_finish(ino, info, in->authority().first);
8624 return;
8625 }
8626
8627 if (ret == mds->get_nodeid()) {
8628 _open_ino_traverse_dir(ino, info, 0);
8629 } else {
8630 if (ret >= 0) {
8631 mds_rank_t checked_rank = mds_rank_t(ret);
8632 info.check_peers = true;
8633 info.auth_hint = checked_rank;
8634 info.checked.erase(checked_rank);
8635 }
8636 do_open_ino(ino, info, ret);
8637 }
8638}
8639
8640void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8641{
8642 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8643
8644 CInode *in = get_inode(ino);
8645 if (in) {
8646 dout(10) << " found cached " << *in << dendl;
8647 open_ino_finish(ino, info, in->authority().first);
8648 return;
8649 }
8650
8651 if (ret) {
8652 do_open_ino(ino, info, ret);
8653 return;
8654 }
8655
8656 mds_rank_t hint = info.auth_hint;
8657 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8658 info.discover, info.want_xlocked, &hint);
8659 if (ret > 0)
8660 return;
8661 if (hint != mds->get_nodeid())
8662 info.auth_hint = hint;
8663 do_open_ino(ino, info, ret);
8664}
8665
11fdf7f2 8666void MDCache::_open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent)
7c673cae
FG
8667{
8668 if (dir->state_test(CDir::STATE_REJOINUNDEF))
11fdf7f2 8669 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
7c673cae 8670 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
11fdf7f2
TL
8671 if (mds->logger)
8672 mds->logger->inc(l_mds_openino_dir_fetch);
7c673cae
FG
8673}
8674
11fdf7f2
TL
8675int MDCache::open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
8676 const vector<inode_backpointer_t>& ancestors,
7c673cae
FG
8677 bool discover, bool want_xlocked, mds_rank_t *hint)
8678{
8679 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8680 int err = 0;
8681 for (unsigned i = 0; i < ancestors.size(); i++) {
11fdf7f2
TL
8682 const auto& ancestor = ancestors.at(i);
8683 CInode *diri = get_inode(ancestor.dirino);
7c673cae
FG
8684
8685 if (!diri) {
11fdf7f2
TL
8686 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8687 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
7c673cae
FG
8688 return 1;
8689 }
8690 continue;
8691 }
8692
8693 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8694 CDir *dir = diri->get_parent_dir();
8695 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8696 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8697 dir = dir->get_inode()->get_parent_dir();
8698 _open_ino_fetch_dir(ino, m, dir, i == 0);
8699 return 1;
8700 }
8701
8702 if (!diri->is_dir()) {
8703 dout(10) << " " << *diri << " is not dir" << dendl;
8704 if (i == 0)
8705 err = -ENOTDIR;
8706 break;
8707 }
8708
11fdf7f2 8709 const string& name = ancestor.dname;
7c673cae
FG
8710 frag_t fg = diri->pick_dirfrag(name);
8711 CDir *dir = diri->get_dirfrag(fg);
8712 if (!dir) {
8713 if (diri->is_auth()) {
8714 if (diri->is_frozen()) {
8715 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8716 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8717 return 1;
8718 }
8719 dir = diri->get_or_open_dirfrag(this, fg);
8720 } else if (discover) {
8721 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8722 return 1;
8723 }
8724 }
8725 if (dir) {
11fdf7f2 8726 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
7c673cae
FG
8727 CDentry *dn = dir->lookup(name);
8728 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8729 if (dir->is_auth()) {
8730 if (dnl && dnl->is_primary() &&
8731 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8732 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8733 _open_ino_fetch_dir(ino, m, dir, i == 0);
8734 return 1;
8735 }
8736
8737 if (!dnl && !dir->is_complete() &&
8738 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8739 dout(10) << " fetching incomplete " << *dir << dendl;
8740 _open_ino_fetch_dir(ino, m, dir, i == 0);
8741 return 1;
8742 }
8743
8744 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8745 if (i == 0)
8746 err = -ENOENT;
8747 } else if (discover) {
8748 if (!dnl) {
8749 filepath path(name, 0);
8750 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8751 (i == 0 && want_xlocked));
8752 return 1;
8753 }
8754 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8755 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8756 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8757 return 1;
8758 }
8759 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8760 if (i == 0)
8761 err = -ENOENT;
8762 }
8763 }
8764 if (hint && i == 0)
8765 *hint = dir ? dir->authority().first : diri->authority().first;
8766 break;
8767 }
8768 return err;
8769}
8770
8771void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8772{
8773 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8774
11fdf7f2 8775 MDSContext::vec waiters;
7c673cae
FG
8776 waiters.swap(info.waiters);
8777 opening_inodes.erase(ino);
8778 finish_contexts(g_ceph_context, waiters, ret);
8779}
8780
8781void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
8782{
8783 if (err < 0 && err != -EAGAIN) {
8784 info.checked.clear();
7c673cae
FG
8785 info.checking = MDS_RANK_NONE;
8786 info.check_peers = true;
8787 info.fetch_backtrace = true;
8788 if (info.discover) {
8789 info.discover = false;
8790 info.ancestors.clear();
8791 }
8792 if (err != -ENOENT && err != -ENOTDIR)
8793 info.last_err = err;
8794 }
8795
d2e6a577
FG
8796 if (info.check_peers || info.discover) {
8797 if (info.discover) {
8798 // got backtrace from peer, but failed to find inode. re-check peers
8799 info.discover = false;
8800 info.ancestors.clear();
8801 info.checked.clear();
8802 }
7c673cae
FG
8803 info.check_peers = false;
8804 info.checking = MDS_RANK_NONE;
8805 do_open_ino_peer(ino, info);
8806 } else if (info.fetch_backtrace) {
8807 info.check_peers = true;
8808 info.fetch_backtrace = false;
8809 info.checking = mds->get_nodeid();
8810 info.checked.clear();
7c673cae
FG
8811 C_IO_MDC_OpenInoBacktraceFetched *fin =
8812 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8813 fetch_backtrace(ino, info.pool, fin->bl,
8814 new C_OnFinisher(fin, mds->finisher));
8815 } else {
11fdf7f2 8816 ceph_assert(!info.ancestors.empty());
7c673cae
FG
8817 info.checking = mds->get_nodeid();
8818 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
8819 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
8820 }
8821}
8822
8823void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
8824{
8825 set<mds_rank_t> all, active;
8826 mds->mdsmap->get_mds_set(all);
7c673cae 8827 if (mds->get_state() == MDSMap::STATE_REJOIN)
1adf2230
AA
8828 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
8829 else
8830 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
8831
8832 dout(10) << "do_open_ino_peer " << ino << " active " << active
8833 << " all " << all << " checked " << info.checked << dendl;
8834
11fdf7f2 8835 mds_rank_t whoami = mds->get_nodeid();
7c673cae 8836 mds_rank_t peer = MDS_RANK_NONE;
11fdf7f2 8837 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
7c673cae
FG
8838 if (active.count(info.auth_hint)) {
8839 peer = info.auth_hint;
8840 info.auth_hint = MDS_RANK_NONE;
8841 }
8842 } else {
8843 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
11fdf7f2 8844 if (*p != whoami && info.checked.count(*p) == 0) {
7c673cae
FG
8845 peer = *p;
8846 break;
8847 }
8848 }
8849 if (peer < 0) {
11fdf7f2 8850 all.erase(whoami);
d2e6a577 8851 if (all != info.checked) {
7c673cae
FG
8852 dout(10) << " waiting for more peers to be active" << dendl;
8853 } else {
8854 dout(10) << " all MDS peers have been checked " << dendl;
8855 do_open_ino(ino, info, 0);
8856 }
8857 } else {
8858 info.checking = peer;
8859 vector<inode_backpointer_t> *pa = NULL;
8860 // got backtrace from peer or backtrace just fetched
8861 if (info.discover || !info.fetch_backtrace)
8862 pa = &info.ancestors;
11fdf7f2
TL
8863 mds->send_message_mds(MMDSOpenIno::create(info.tid, ino, pa), peer);
8864 if (mds->logger)
8865 mds->logger->inc(l_mds_openino_peer_discover);
7c673cae
FG
8866 }
8867}
8868
11fdf7f2 8869void MDCache::handle_open_ino(const MMDSOpenIno::const_ref &m, int err)
7c673cae
FG
8870{
8871 if (mds->get_state() < MDSMap::STATE_REJOIN &&
8872 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
8873 return;
8874 }
8875
8876 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
8877
11fdf7f2 8878 auto from = mds_rank_t(m->get_source().num());
7c673cae 8879 inodeno_t ino = m->ino;
11fdf7f2 8880 MMDSOpenInoReply::ref reply;
7c673cae
FG
8881 CInode *in = get_inode(ino);
8882 if (in) {
8883 dout(10) << " have " << *in << dendl;
11fdf7f2 8884 reply = MMDSOpenInoReply::create(m->get_tid(), ino, mds_rank_t(0));
7c673cae
FG
8885 if (in->is_auth()) {
8886 touch_inode(in);
8887 while (1) {
8888 CDentry *pdn = in->get_parent_dn();
8889 if (!pdn)
8890 break;
8891 CInode *diri = pdn->get_dir()->get_inode();
94b18763 8892 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
7c673cae
FG
8893 in->inode.version));
8894 in = diri;
8895 }
8896 } else {
8897 reply->hint = in->authority().first;
8898 }
8899 } else if (err < 0) {
11fdf7f2 8900 reply = MMDSOpenInoReply::create(m->get_tid(), ino, MDS_RANK_NONE, err);
7c673cae
FG
8901 } else {
8902 mds_rank_t hint = MDS_RANK_NONE;
8903 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
8904 if (ret > 0)
8905 return;
11fdf7f2 8906 reply = MMDSOpenInoReply::create(m->get_tid(), ino, hint, ret);
7c673cae 8907 }
11fdf7f2 8908 mds->send_message_mds(reply, from);
7c673cae
FG
8909}
8910
11fdf7f2 8911void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m)
7c673cae
FG
8912{
8913 dout(10) << "handle_open_ino_reply " << *m << dendl;
8914
8915 inodeno_t ino = m->ino;
8916 mds_rank_t from = mds_rank_t(m->get_source().num());
8917 auto it = opening_inodes.find(ino);
8918 if (it != opening_inodes.end() && it->second.checking == from) {
8919 open_ino_info_t& info = it->second;
8920 info.checking = MDS_RANK_NONE;
8921 info.checked.insert(from);
8922
8923 CInode *in = get_inode(ino);
8924 if (in) {
8925 dout(10) << " found cached " << *in << dendl;
8926 open_ino_finish(ino, info, in->authority().first);
8927 } else if (!m->ancestors.empty()) {
8928 dout(10) << " found ino " << ino << " on mds." << from << dendl;
8929 if (!info.want_replica) {
8930 open_ino_finish(ino, info, from);
7c673cae
FG
8931 return;
8932 }
8933
8934 info.ancestors = m->ancestors;
8935 info.auth_hint = from;
8936 info.checking = mds->get_nodeid();
8937 info.discover = true;
8938 _open_ino_traverse_dir(ino, info, 0);
8939 } else if (m->error) {
8940 dout(10) << " error " << m->error << " from mds." << from << dendl;
8941 do_open_ino(ino, info, m->error);
8942 } else {
8943 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
8944 info.auth_hint = m->hint;
8945 info.checked.erase(m->hint);
8946 }
8947 do_open_ino_peer(ino, info);
8948 }
8949 }
7c673cae
FG
8950}
8951
8952void MDCache::kick_open_ino_peers(mds_rank_t who)
8953{
8954 dout(10) << "kick_open_ino_peers mds." << who << dendl;
8955
8956 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
8957 p != opening_inodes.end();
8958 ++p) {
8959 open_ino_info_t& info = p->second;
8960 if (info.checking == who) {
8961 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
8962 info.checking = MDS_RANK_NONE;
8963 do_open_ino_peer(p->first, info);
8964 } else if (info.checking == MDS_RANK_NONE) {
8965 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
8966 do_open_ino_peer(p->first, info);
8967 }
8968 }
8969}
8970
11fdf7f2 8971void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
7c673cae
FG
8972 bool want_replica, bool want_xlocked)
8973{
8974 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
8975 << want_replica << dendl;
8976
11fdf7f2
TL
8977 auto it = opening_inodes.find(ino);
8978 if (it != opening_inodes.end()) {
8979 open_ino_info_t& info = it->second;
7c673cae
FG
8980 if (want_replica) {
8981 info.want_replica = true;
8982 if (want_xlocked && !info.want_xlocked) {
8983 if (!info.ancestors.empty()) {
8984 CInode *diri = get_inode(info.ancestors[0].dirino);
8985 if (diri) {
8986 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
8987 CDir *dir = diri->get_dirfrag(fg);
8988 if (dir && !dir->is_auth()) {
8989 filepath path(info.ancestors[0].dname, 0);
8990 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
8991 }
8992 }
8993 }
8994 info.want_xlocked = true;
8995 }
8996 }
8997 info.waiters.push_back(fin);
8998 } else {
8999 open_ino_info_t& info = opening_inodes[ino];
7c673cae
FG
9000 info.want_replica = want_replica;
9001 info.want_xlocked = want_xlocked;
9002 info.tid = ++open_ino_last_tid;
9003 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9004 info.waiters.push_back(fin);
11fdf7f2
TL
9005 if (mds->is_rejoin() &&
9006 open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
9007 info.fetch_backtrace = false;
9008 info.checking = mds->get_nodeid();
9009 _open_ino_traverse_dir(ino, info, 0);
9010 } else {
9011 do_open_ino(ino, info, 0);
9012 }
7c673cae
FG
9013 }
9014}
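// A minimal usage sketch (illustrative only; C_WaitForIno is a hypothetical
// MDSContext subclass, not defined in this file). A pool hint of -1 falls
// back to the default data pool as coded above; the completion runs with the
// inode's auth rank (>= 0) on success, replicating the inode locally when
// want_replica is set, or with a negative error code on failure:
//
//   mdcache->open_ino(ino, -1 /* pool hint */,
//                     new C_WaitForIno(this, ino), true /* want_replica */);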
9015
9016/* ---------------------------- */
9017
9018/*
9019 * Search for a given inode on MDS peers, optionally starting with the given node.
9020
9021
9022 TODO
9023 - recover from mds node failure / recovery
9024 - traverse path
9025
9026 */
11fdf7f2 9027void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint)
7c673cae
FG
9028{
9029 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
b32b8144
FG
9030 CInode *in = get_inode(ino);
9031 if (in && in->state_test(CInode::STATE_PURGING)) {
9032 c->complete(-ESTALE);
9033 return;
9034 }
11fdf7f2 9035 ceph_assert(!in);
7c673cae
FG
9036
9037 ceph_tid_t tid = ++find_ino_peer_last_tid;
9038 find_ino_peer_info_t& fip = find_ino_peer[tid];
9039 fip.ino = ino;
9040 fip.tid = tid;
9041 fip.fin = c;
9042 fip.hint = hint;
7c673cae
FG
9043 _do_find_ino_peer(fip);
9044}
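// A rough usage sketch (illustrative only; C_RetryAfterFindIno is a
// hypothetical context, not defined in this file). find_ino_peers() only asks
// peer MDSs whether they have the inode; the completion fires with 0 once the
// inode has been pulled into the local cache via a returned path, or with
// -ESTALE once every active peer has been checked without success:
//
//   mdcache->find_ino_peers(ino, new C_RetryAfterFindIno(this, mdr), MDS_RANK_NONE);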
9045
9046void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9047{
9048 set<mds_rank_t> all, active;
9049 mds->mdsmap->get_mds_set(all);
1adf2230 9050 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
7c673cae
FG
9051
9052 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9053 << " active " << active << " all " << all
9054 << " checked " << fip.checked
9055 << dendl;
9056
9057 mds_rank_t m = MDS_RANK_NONE;
9058 if (fip.hint >= 0) {
9059 m = fip.hint;
9060 fip.hint = MDS_RANK_NONE;
9061 } else {
9062 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9063 if (*p != mds->get_nodeid() &&
9064 fip.checked.count(*p) == 0) {
9065 m = *p;
9066 break;
9067 }
9068 }
9069 if (m == MDS_RANK_NONE) {
d2e6a577
FG
9070 all.erase(mds->get_nodeid());
9071 if (all != fip.checked) {
7c673cae
FG
9072 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9073 } else {
9074 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9075 fip.fin->complete(-ESTALE);
9076 find_ino_peer.erase(fip.tid);
9077 }
9078 } else {
9079 fip.checking = m;
11fdf7f2 9080 mds->send_message_mds(MMDSFindIno::create(fip.tid, fip.ino), m);
7c673cae
FG
9081 }
9082}
9083
11fdf7f2 9084void MDCache::handle_find_ino(const MMDSFindIno::const_ref &m)
7c673cae
FG
9085{
9086 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
9087 return;
9088 }
9089
9090 dout(10) << "handle_find_ino " << *m << dendl;
11fdf7f2 9091 auto r = MMDSFindInoReply::create(m->tid);
7c673cae
FG
9092 CInode *in = get_inode(m->ino);
9093 if (in) {
9094 in->make_path(r->path);
9095 dout(10) << " have " << r->path << " " << *in << dendl;
9096 }
11fdf7f2 9097 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
7c673cae
FG
9098}
9099
9100
11fdf7f2 9101void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref &m)
7c673cae
FG
9102{
9103 map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
9104 if (p != find_ino_peer.end()) {
9105 dout(10) << "handle_find_ino_reply " << *m << dendl;
9106 find_ino_peer_info_t& fip = p->second;
9107
9108 // success?
9109 if (get_inode(fip.ino)) {
9110 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9111 mds->queue_waiter(fip.fin);
9112 find_ino_peer.erase(p);
7c673cae
FG
9113 return;
9114 }
9115
9116 mds_rank_t from = mds_rank_t(m->get_source().num());
9117 if (fip.checking == from)
9118 fip.checking = MDS_RANK_NONE;
9119 fip.checked.insert(from);
9120
9121 if (!m->path.empty()) {
9122 // we got a path!
9123 vector<CDentry*> trace;
11fdf7f2 9124 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 9125 MDRequestRef null_ref;
11fdf7f2 9126 int r = path_traverse(null_ref, cf, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
7c673cae
FG
9127 if (r > 0)
9128 return;
9129 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9130 << ", retrying" << dendl;
9131 fip.checked.clear();
9132 _do_find_ino_peer(fip);
9133 } else {
9134 // nope, continue.
9135 _do_find_ino_peer(fip);
9136 }
9137 } else {
9138 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9139 }
7c673cae
FG
9140}
9141
9142void MDCache::kick_find_ino_peers(mds_rank_t who)
9143{
9144 // find_ino_peers requests we should move on from
9145 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9146 p != find_ino_peer.end();
9147 ++p) {
9148 find_ino_peer_info_t& fip = p->second;
9149 if (fip.checking == who) {
9150 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9151 fip.checking = MDS_RANK_NONE;
9152 _do_find_ino_peer(fip);
9153 } else if (fip.checking == MDS_RANK_NONE) {
9154 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9155 _do_find_ino_peer(fip);
9156 }
9157 }
9158}
9159
9160/* ---------------------------- */
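/*
 * Request lifecycle, roughly (descriptive summary only):
 *
 *   request_start()           client request -> MDRequestRef in active_requests
 *   request_start_slave()     slave-side op triggered by another mds
 *   request_start_internal()  internal ops (fragmentdir, scrub, flush, ...)
 *   dispatch_request()        routes to Server or the internal-op handlers
 *   request_finish() / request_forward() / request_kill()
 *                             all funnel into request_cleanup(), which drops
 *                             locks, auth pins and cache pins and removes the
 *                             request from active_requests.
 */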
9161
9162int MDCache::get_num_client_requests()
9163{
9164 int count = 0;
9165 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9166 p != active_requests.end();
9167 ++p) {
9168 MDRequestRef& mdr = p->second;
9169 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9170 count++;
9171 }
9172 return count;
9173}
9174
11fdf7f2 9175MDRequestRef MDCache::request_start(const MClientRequest::const_ref& req)
7c673cae
FG
9176{
9177 // did we win a forward race against a slave?
9178 if (active_requests.count(req->get_reqid())) {
9179 MDRequestRef& mdr = active_requests[req->get_reqid()];
11fdf7f2 9180 ceph_assert(mdr);
7c673cae
FG
9181 if (mdr->is_slave()) {
9182 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9183 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9184 } else {
9185 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
7c673cae
FG
9186 }
9187 return MDRequestRef();
9188 }
9189
9190 // register new client request
9191 MDRequestImpl::Params params;
9192 params.reqid = req->get_reqid();
9193 params.attempt = req->get_num_fwd();
9194 params.client_req = req;
9195 params.initiated = req->get_recv_stamp();
9196 params.throttled = req->get_throttle_stamp();
9197 params.all_read = req->get_recv_complete_stamp();
9198 params.dispatched = req->get_dispatch_stamp();
9199
9200 MDRequestRef mdr =
11fdf7f2 9201 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae
FG
9202 active_requests[params.reqid] = mdr;
9203 mdr->set_op_stamp(req->get_stamp());
9204 dout(7) << "request_start " << *mdr << dendl;
9205 return mdr;
9206}
9207
11fdf7f2 9208MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const Message::const_ref &m)
7c673cae
FG
9209{
9210 int by = m->get_source().num();
9211 MDRequestImpl::Params params;
9212 params.reqid = ri;
9213 params.attempt = attempt;
9214 params.triggering_slave_req = m;
9215 params.slave_to = by;
9216 params.initiated = m->get_recv_stamp();
9217 params.throttled = m->get_throttle_stamp();
9218 params.all_read = m->get_recv_complete_stamp();
9219 params.dispatched = m->get_dispatch_stamp();
9220 MDRequestRef mdr =
11fdf7f2
TL
9221 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9222 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9223 active_requests[mdr->reqid] = mdr;
9224 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9225 return mdr;
9226}
9227
9228MDRequestRef MDCache::request_start_internal(int op)
9229{
91327a77 9230 utime_t now = ceph_clock_now();
7c673cae
FG
9231 MDRequestImpl::Params params;
9232 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9233 params.reqid.tid = mds->issue_tid();
91327a77
AA
9234 params.initiated = now;
9235 params.throttled = now;
9236 params.all_read = now;
9237 params.dispatched = now;
7c673cae
FG
9238 params.internal_op = op;
9239 MDRequestRef mdr =
11fdf7f2 9240 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
7c673cae 9241
11fdf7f2 9242 ceph_assert(active_requests.count(mdr->reqid) == 0);
7c673cae
FG
9243 active_requests[mdr->reqid] = mdr;
9244 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9245 return mdr;
9246}
9247
9248MDRequestRef MDCache::request_get(metareqid_t rid)
9249{
9250 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
11fdf7f2 9251 ceph_assert(p != active_requests.end());
7c673cae
FG
9252 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9253 return p->second;
9254}
9255
9256void MDCache::request_finish(MDRequestRef& mdr)
9257{
9258 dout(7) << "request_finish " << *mdr << dendl;
9259 mdr->mark_event("finishing request");
9260
9261 // slave finisher?
9262 if (mdr->has_more() && mdr->more()->slave_commit) {
9263 Context *fin = mdr->more()->slave_commit;
9264 mdr->more()->slave_commit = 0;
9265 int ret;
9266 if (mdr->aborted) {
9267 mdr->aborted = false;
9268 ret = -1;
9269 mdr->more()->slave_rolling_back = true;
9270 } else {
9271 ret = 0;
9272 mdr->committing = true;
9273 }
9274 fin->complete(ret); // this must re-call request_finish.
9275 return;
9276 }
9277
d2e6a577
FG
9278 switch(mdr->internal_op) {
9279 case CEPH_MDS_OP_FRAGMENTDIR:
9280 logger->inc(l_mdss_ireq_fragmentdir);
9281 break;
9282 case CEPH_MDS_OP_EXPORTDIR:
9283 logger->inc(l_mdss_ireq_exportdir);
9284 break;
9285 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9286 logger->inc(l_mdss_ireq_enqueue_scrub);
9287 break;
9288 case CEPH_MDS_OP_FLUSH:
9289 logger->inc(l_mdss_ireq_flush);
9290 break;
9291 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9292 logger->inc(l_mdss_ireq_fragstats);
9293 break;
9294 case CEPH_MDS_OP_REPAIR_INODESTATS:
9295 logger->inc(l_mdss_ireq_inodestats);
9296 break;
9297 }
9298
7c673cae
FG
9299 request_cleanup(mdr);
9300}
9301
9302
9303void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9304{
9305 mdr->mark_event("forwarding request");
9306 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9307 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9308 << *mdr->client_request << dendl;
91327a77 9309 mds->forward_message_mds(mdr->release_client_request(), who);
7c673cae
FG
9310 if (mds->logger) mds->logger->inc(l_mds_forward);
9311 } else if (mdr->internal_op >= 0) {
9312 dout(10) << "request_forward on internal op; cancelling" << dendl;
9313 mdr->internal_op_finish->complete(-EXDEV);
9314 } else {
9315 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9316 << " was from mds" << dendl;
9317 }
9318 request_cleanup(mdr);
9319}
9320
9321
9322void MDCache::dispatch_request(MDRequestRef& mdr)
9323{
9324 if (mdr->client_request) {
9325 mds->server->dispatch_client_request(mdr);
9326 } else if (mdr->slave_request) {
9327 mds->server->dispatch_slave_request(mdr);
9328 } else {
9329 switch (mdr->internal_op) {
9330 case CEPH_MDS_OP_FRAGMENTDIR:
9331 dispatch_fragment_dir(mdr);
9332 break;
9333 case CEPH_MDS_OP_EXPORTDIR:
9334 migrator->dispatch_export_dir(mdr, 0);
9335 break;
9336 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9337 enqueue_scrub_work(mdr);
9338 break;
9339 case CEPH_MDS_OP_FLUSH:
9340 flush_dentry_work(mdr);
9341 break;
9342 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9343 repair_dirfrag_stats_work(mdr);
9344 break;
9345 case CEPH_MDS_OP_REPAIR_INODESTATS:
9346 repair_inode_stats_work(mdr);
9347 break;
11fdf7f2
TL
9348 case CEPH_MDS_OP_UPGRADE_SNAPREALM:
9349 upgrade_inode_snaprealm_work(mdr);
9350 break;
7c673cae
FG
9351 default:
9352 ceph_abort();
9353 }
9354 }
9355}
9356
9357
9358void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9359{
9360 if (!mdr->has_more())
9361 return;
9362
9363 // clean up slaves
9364 // (will implicitly drop remote dn pins)
9365 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9366 p != mdr->more()->slaves.end();
9367 ++p) {
11fdf7f2 9368 auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt,
7c673cae
FG
9369 MMDSSlaveRequest::OP_FINISH);
9370
9371 if (mdr->killed && !mdr->committing) {
9372 r->mark_abort();
9373 } else if (mdr->more()->srcdn_auth_mds == *p &&
9374 mdr->more()->inode_import.length() > 0) {
9375 // information about rename imported caps
9376 r->inode_export.claim(mdr->more()->inode_import);
9377 }
9378
9379 mds->send_message_mds(r, *p);
9380 }
9381
9382 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9383 * implicitly. Note that we don't call the finishers -- there shouldn't
9384 * be any on a remote lock and the request finish wakes up all
9385 * the waiters anyway! */
7c673cae 9386
11fdf7f2
TL
9387 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9388 SimpleLock *lock = it->lock;
9389 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9390 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9391 << " on " << lock->get_parent() << dendl;
9392 lock->put_xlock();
9393 mdr->locks.erase(it++);
9394 } else if (it->is_remote_wrlock()) {
9395 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9396 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9397 if (it->is_wrlock()) {
9398 it->clear_remote_wrlock();
9399 ++it;
9400 } else {
9401 mdr->locks.erase(it++);
9402 }
9403 } else {
9404 ++it;
9405 }
7c673cae
FG
9406 }
9407
9408 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9409 * leaving them in can cause double-notifies as
9410 * this function can get called more than once */
9411}
9412
9413void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9414{
9415 request_drop_foreign_locks(mdr);
9416 mds->locker->drop_non_rdlocks(mdr.get());
9417}
9418
9419void MDCache::request_drop_locks(MDRequestRef& mdr)
9420{
9421 request_drop_foreign_locks(mdr);
9422 mds->locker->drop_locks(mdr.get());
9423}
9424
9425void MDCache::request_cleanup(MDRequestRef& mdr)
9426{
9427 dout(15) << "request_cleanup " << *mdr << dendl;
9428
9429 if (mdr->has_more()) {
9430 if (mdr->more()->is_ambiguous_auth)
9431 mdr->clear_ambiguous_auth();
9432 if (!mdr->more()->waiting_for_finish.empty())
9433 mds->queue_waiters(mdr->more()->waiting_for_finish);
9434 }
9435
9436 request_drop_locks(mdr);
9437
9438 // drop (local) auth pins
9439 mdr->drop_local_auth_pins();
9440
9441 // drop stickydirs
11fdf7f2 9442 mdr->put_stickydirs();
7c673cae
FG
9443
9444 mds->locker->kick_cap_releases(mdr);
9445
9446 // drop cache pins
9447 mdr->drop_pins();
9448
9449 // remove from session
9450 mdr->item_session_request.remove_myself();
9451
9452 // remove from map
9453 active_requests.erase(mdr->reqid);
9454
9455 if (mds->logger)
9456 log_stat();
9457
9458 mdr->mark_event("cleaned up request");
9459}
9460
9461void MDCache::request_kill(MDRequestRef& mdr)
9462{
9463 // rolling back slave requests is tricky. just let the request proceed.
94b18763 9464 if (mdr->has_more() &&
7c673cae 9465 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
94b18763 9466 if (!mdr->done_locking) {
11fdf7f2 9467 ceph_assert(mdr->more()->witnessed.empty());
94b18763
FG
9468 mdr->aborted = true;
9469 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9470 } else {
9471 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9472 }
7c673cae 9473
11fdf7f2
TL
9474 ceph_assert(mdr->used_prealloc_ino == 0);
9475 ceph_assert(mdr->prealloc_inos.empty());
7c673cae
FG
9476
9477 mdr->session = NULL;
9478 mdr->item_session_request.remove_myself();
9479 return;
9480 }
9481
9482 mdr->killed = true;
9483 mdr->mark_event("killing request");
9484
9485 if (mdr->committing) {
9486 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9487 } else {
9488 dout(10) << "request_kill " << *mdr << dendl;
9489 request_cleanup(mdr);
9490 }
9491}
9492
9493// -------------------------------------------------------------------------------
9494// SNAPREALMS
9495
11fdf7f2 9496void MDCache::create_global_snaprealm()
7c673cae 9497{
11fdf7f2
TL
9498 CInode *in = new CInode(this); // dummy inode
9499 create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9500 add_inode(in);
9501 global_snaprealm = in->snaprealm;
7c673cae
FG
9502}
9503
11fdf7f2 9504void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
7c673cae
FG
9505{
9506 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9507
9508 vector<inodeno_t> split_inos;
9509 vector<inodeno_t> split_realms;
9510
11fdf7f2
TL
9511 if (notify_clients) {
9512 ceph_assert(in->snaprealm->have_past_parents_open());
9513 if (snapop == CEPH_SNAP_OP_SPLIT) {
9514 // notify clients of update|split
9515 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9516 !p.end(); ++p)
9517 split_inos.push_back((*p)->ino());
7c673cae 9518
11fdf7f2
TL
9519 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9520 p != in->snaprealm->open_children.end();
9521 ++p)
9522 split_realms.push_back((*p)->inode->ino());
9523 }
9524 }
7c673cae
FG
9525
9526 set<SnapRealm*> past_children;
11fdf7f2 9527 map<client_t, MClientSnap::ref> updates;
7c673cae
FG
9528 list<SnapRealm*> q;
9529 q.push_back(in->snaprealm);
9530 while (!q.empty()) {
9531 SnapRealm *realm = q.front();
9532 q.pop_front();
9533
9534 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9535 realm->invalidate_cached_snaps();
9536
11fdf7f2
TL
9537 if (notify_clients) {
9538 for (const auto& p : realm->client_caps) {
9539 const auto& client = p.first;
9540 const auto& caps = p.second;
9541 ceph_assert(!caps->empty());
9542
9543 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9544 if (em.second) {
9545 auto update = MClientSnap::create(CEPH_SNAP_OP_SPLIT);
9546 update->head.split = in->ino();
9547 update->split_inos = split_inos;
9548 update->split_realms = split_realms;
9549 update->bl = in->snaprealm->get_snap_trace();
9550 em.first->second = std::move(update);
9551 }
7c673cae
FG
9552 }
9553 }
9554
9555 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9556 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9557 p != realm->open_past_children.end();
9558 ++p)
9559 past_children.insert(*p);
9560 }
9561
9562 // notify for active children, too.
9563 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9564 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9565 p != realm->open_children.end();
9566 ++p)
9567 q.push_back(*p);
9568 }
9569
11fdf7f2 9570 if (notify_clients)
7c673cae
FG
9571 send_snaps(updates);
9572
9573 // notify past children and their descendants if we update/delete old snapshots
9574 for (set<SnapRealm*>::iterator p = past_children.begin();
9575 p != past_children.end();
9576 ++p)
9577 q.push_back(*p);
9578
9579 while (!q.empty()) {
9580 SnapRealm *realm = q.front();
9581 q.pop_front();
9582
9583 realm->invalidate_cached_snaps();
9584
9585 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9586 p != realm->open_children.end();
9587 ++p) {
9588 if (past_children.count(*p) == 0)
9589 q.push_back(*p);
9590 }
9591
9592 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9593 p != realm->open_past_children.end();
9594 ++p) {
9595 if (past_children.count(*p) == 0) {
9596 q.push_back(*p);
9597 past_children.insert(*p);
9598 }
9599 }
9600 }
9601
9602 if (snapop == CEPH_SNAP_OP_DESTROY) {
9603 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9604 for (set<SnapRealm*>::iterator p = past_children.begin();
9605 p != past_children.end();
9606 ++p)
9607 maybe_eval_stray((*p)->inode, true);
9608 }
9609}
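/*
 * Summary of the walk above (descriptive only): starting at in->snaprealm,
 * every realm reachable through open_children gets its cached snap list
 * invalidated and, when notify_clients is set, one MClientSnap per client
 * holding caps in any of those realms is built and sent via send_snaps().
 * For UPDATE/DESTROY ops the invalidation also covers open_past_children and
 * their descendants, and DESTROY re-evaluates the (possibly stray) inodes of
 * those past-child realms.
 */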
9610
11fdf7f2 9611void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
7c673cae 9612{
11fdf7f2
TL
9613 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9614 ceph_assert(in->is_auth());
7c673cae 9615
11fdf7f2
TL
9616 set<mds_rank_t> mds_set;
9617 if (stid > 0) {
9618 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9619 mds_set.erase(mds->get_nodeid());
9620 } else {
9621 in->list_replicas(mds_set);
9622 }
7c673cae 9623
11fdf7f2
TL
9624 if (!mds_set.empty()) {
9625 bufferlist snap_blob;
9626 in->encode_snap(snap_blob);
7c673cae 9627
11fdf7f2
TL
9628 for (auto p : mds_set) {
9629 auto m = MMDSSnapUpdate::create(in->ino(), stid, snap_op);
9630 m->snap_blob = snap_blob;
9631 mds->send_message_mds(m, p);
9632 }
9633 }
7c673cae 9634
11fdf7f2
TL
9635 if (stid > 0)
9636 notify_global_snaprealm_update(snap_op);
9637}
7c673cae 9638
11fdf7f2
TL
9639void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref &m)
9640{
9641 mds_rank_t from = mds_rank_t(m->get_source().num());
9642 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 9643
11fdf7f2
TL
9644 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9645 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9646 return;
9647 }
7c673cae 9648
11fdf7f2
TL
9649 // null rejoin_done means open_snaprealms() has already been called
9650 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9651 (mds->is_rejoin() && !rejoin_done);
9652
9653 if (m->get_tid() > 0) {
9654 mds->snapclient->notify_commit(m->get_tid());
9655 if (notify_clients)
9656 notify_global_snaprealm_update(m->get_snap_op());
9657 }
9658
9659 CInode *in = get_inode(m->get_ino());
9660 if (in) {
9661 ceph_assert(!in->is_auth());
9662 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9663 (mds->is_rejoin() && !in->is_rejoining())) {
9664 auto p = m->snap_blob.cbegin();
9665 in->decode_snap(p);
9666
9667 if (!notify_clients) {
9668 if (!rejoin_pending_snaprealms.count(in)) {
9669 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9670 rejoin_pending_snaprealms.insert(in);
9671 }
9672 }
9673 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9674 }
9675 }
7c673cae
FG
9676}
9677
11fdf7f2
TL
9678void MDCache::notify_global_snaprealm_update(int snap_op)
9679{
9680 if (snap_op != CEPH_SNAP_OP_DESTROY)
9681 snap_op = CEPH_SNAP_OP_UPDATE;
9682 set<Session*> sessions;
9683 mds->sessionmap.get_client_session_set(sessions);
9684 for (auto &session : sessions) {
9685 if (!session->is_open() && !session->is_stale())
9686 continue;
9687 auto update = MClientSnap::create(snap_op);
9688 update->head.split = global_snaprealm->inode->ino();
9689 update->bl = global_snaprealm->get_snap_trace();
9690 mds->send_message_client_counted(update, session);
9691 }
9692}
7c673cae
FG
9693
9694// -------------------------------------------------------------------------------
9695// STRAYS
9696
9697struct C_MDC_RetryScanStray : public MDCacheContext {
9698 dirfrag_t next;
9699 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9700 void finish(int r) override {
9701 mdcache->scan_stray_dir(next);
9702 }
9703};
9704
9705void MDCache::scan_stray_dir(dirfrag_t next)
9706{
9707 dout(10) << "scan_stray_dir " << next << dendl;
9708
9709 list<CDir*> ls;
9710 for (int i = 0; i < NUM_STRAY; ++i) {
9711 if (strays[i]->ino() < next.ino)
9712 continue;
9713 strays[i]->get_dirfrags(ls);
9714 }
9715
9716 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9717 CDir *dir = *p;
9718 if (dir->dirfrag() < next)
9719 continue;
9720 if (!dir->is_complete()) {
9721 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9722 return;
9723 }
94b18763
FG
9724 for (auto &p : dir->items) {
9725 CDentry *dn = p.second;
7c673cae
FG
9726 dn->state_set(CDentry::STATE_STRAY);
9727 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9728 if (dnl->is_primary()) {
9729 CInode *in = dnl->get_inode();
9730 if (in->inode.nlink == 0)
9731 in->state_set(CInode::STATE_ORPHAN);
9732 maybe_eval_stray(in);
9733 }
9734 }
9735 }
9736}
9737
7c673cae
FG
9738void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9739{
9740 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9741 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
11fdf7f2
TL
9742 if (mds->logger)
9743 mds->logger->inc(l_mds_openino_backtrace_fetch);
7c673cae
FG
9744}
9745
9746
9747
9748
9749
9750// ========================================================================================
9751// DISCOVER
9752/*
9753
9754 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9755 to the parent metadata object in the cache (pinning it).
9756
9757 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9758
9759*/
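/*
 * In practice (descriptive summary only): discover_base_ino(),
 * discover_dir_frag() and discover_path() each allocate a discover_info_t
 * (kept in 'discovers', keyed by tid), pin the base object, and send an
 * MDiscover to the authoritative mds via _send_discover(). The auth builds an
 * MDiscoverReply in handle_discover(), and kick_discovers() resends any
 * outstanding discover aimed at the given rank.
 */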
9760
9761void MDCache::_send_discover(discover_info_t& d)
9762{
11fdf7f2 9763 auto dis = MDiscover::create(d.ino, d.frag, d.snap, d.want_path, d.want_base_dir, d.want_xlocked);
7c673cae
FG
9764 dis->set_tid(d.tid);
9765 mds->send_message_mds(dis, d.mds);
9766}
9767
9768void MDCache::discover_base_ino(inodeno_t want_ino,
11fdf7f2 9769 MDSContext *onfinish,
7c673cae
FG
9770 mds_rank_t from)
9771{
9772 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9773 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9774 discover_info_t& d = _create_discover(from);
9775 d.ino = want_ino;
9776 _send_discover(d);
9777 }
9778 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9779}
9780
9781
9782void MDCache::discover_dir_frag(CInode *base,
9783 frag_t approx_fg,
11fdf7f2 9784 MDSContext *onfinish,
7c673cae
FG
9785 mds_rank_t from)
9786{
9787 if (from < 0)
9788 from = base->authority().first;
9789
9790 dirfrag_t df(base->ino(), approx_fg);
9791 dout(7) << "discover_dir_frag " << df
9792 << " from mds." << from << dendl;
9793
9794 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9795 discover_info_t& d = _create_discover(from);
9796 d.pin_base(base);
9797 d.ino = base->ino();
9798 d.frag = approx_fg;
9799 d.want_base_dir = true;
9800 _send_discover(d);
9801 }
9802
9803 if (onfinish)
9804 base->add_dir_waiter(approx_fg, onfinish);
9805}
9806
9807struct C_MDC_RetryDiscoverPath : public MDCacheContext {
9808 CInode *base;
9809 snapid_t snapid;
9810 filepath path;
9811 mds_rank_t from;
9812 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
9813 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
9814 void finish(int r) override {
9815 mdcache->discover_path(base, snapid, path, 0, from);
9816 }
9817};
9818
9819void MDCache::discover_path(CInode *base,
9820 snapid_t snap,
9821 filepath want_path,
11fdf7f2 9822 MDSContext *onfinish,
7c673cae
FG
9823 bool want_xlocked,
9824 mds_rank_t from)
9825{
9826 if (from < 0)
9827 from = base->authority().first;
9828
9829 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
9830 << (want_xlocked ? " want_xlocked":"")
9831 << dendl;
9832
9833 if (base->is_ambiguous_auth()) {
9834 dout(10) << " waiting for single auth on " << *base << dendl;
9835 if (!onfinish)
9836 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
9837 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
9838 return;
9839 } else if (from == mds->get_nodeid()) {
11fdf7f2 9840 MDSContext::vec finished;
7c673cae
FG
9841 base->take_waiting(CInode::WAIT_DIR, finished);
9842 mds->queue_waiters(finished);
9843 return;
9844 }
9845
9846 frag_t fg = base->pick_dirfrag(want_path[0]);
9847 if ((want_xlocked && want_path.depth() == 1) ||
9848 !base->is_waiting_for_dir(fg) || !onfinish) {
9849 discover_info_t& d = _create_discover(from);
9850 d.ino = base->ino();
9851 d.pin_base(base);
9852 d.frag = fg;
9853 d.snap = snap;
9854 d.want_path = want_path;
9855 d.want_base_dir = true;
9856 d.want_xlocked = want_xlocked;
9857 _send_discover(d);
9858 }
9859
9860 // register + wait
9861 if (onfinish)
9862 base->add_dir_waiter(fg, onfinish);
9863}
9864
9865struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
9866 CDir *base;
9867 snapid_t snapid;
9868 filepath path;
9869 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
9870 MDCacheContext(c), base(b), snapid(s), path(p) {}
9871 void finish(int r) override {
9872 mdcache->discover_path(base, snapid, path, 0);
9873 }
9874};
9875
9876void MDCache::discover_path(CDir *base,
9877 snapid_t snap,
9878 filepath want_path,
11fdf7f2 9879 MDSContext *onfinish,
7c673cae
FG
9880 bool want_xlocked)
9881{
9882 mds_rank_t from = base->authority().first;
9883
9884 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
9885 << (want_xlocked ? " want_xlocked":"")
9886 << dendl;
9887
9888 if (base->is_ambiguous_auth()) {
9889 dout(7) << " waiting for single auth on " << *base << dendl;
9890 if (!onfinish)
9891 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
9892 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
9893 return;
9894 } else if (from == mds->get_nodeid()) {
11fdf7f2 9895 MDSContext::vec finished;
7c673cae
FG
9896 base->take_sub_waiting(finished);
9897 mds->queue_waiters(finished);
9898 return;
9899 }
9900
9901 if ((want_xlocked && want_path.depth() == 1) ||
9902 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
9903 discover_info_t& d = _create_discover(from);
9904 d.ino = base->ino();
31f18b77 9905 d.pin_base(base->inode);
7c673cae
FG
9906 d.frag = base->get_frag();
9907 d.snap = snap;
9908 d.want_path = want_path;
9909 d.want_base_dir = false;
9910 d.want_xlocked = want_xlocked;
9911 _send_discover(d);
9912 }
9913
9914 // register + wait
9915 if (onfinish)
9916 base->add_dentry_waiter(want_path[0], snap, onfinish);
9917}
9918
9919void MDCache::kick_discovers(mds_rank_t who)
9920{
9921 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
9922 p != discovers.end();
9923 ++p) {
9924 if (p->second.mds != who)
9925 continue;
9926 _send_discover(p->second);
9927 }
9928}
9929
9930
11fdf7f2 9931void MDCache::handle_discover(const MDiscover::const_ref &dis)
7c673cae
FG
9932{
9933 mds_rank_t whoami = mds->get_nodeid();
9934 mds_rank_t from = mds_rank_t(dis->get_source().num());
9935
11fdf7f2 9936 ceph_assert(from != whoami);
7c673cae
FG
9937
9938 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
9939 if (mds->get_state() < MDSMap::STATE_REJOIN &&
d2e6a577 9940 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
7c673cae
FG
9941 return;
9942 }
9943
9944    // proceed if the requester is in the REJOIN stage, i.e. the request came from parallel_fetch().
9945    // delay processing requests from survivors because we may not have chosen lock states yet.
9946 if (!mds->mdsmap->is_rejoin(from)) {
9947      dout(0) << "handle_discover not yet active (|still rejoining), delaying" << dendl;
9948 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
9949 return;
9950 }
9951 }
9952
9953
9954 CInode *cur = 0;
11fdf7f2 9955 auto reply = MDiscoverReply::create(*dis);
7c673cae
FG
9956
9957 snapid_t snapid = dis->get_snapid();
9958
9959 // get started.
9960 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
9961 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
9962 // wants root
9963 dout(7) << "handle_discover from mds." << from
9964 << " wants base + " << dis->get_want().get_path()
9965 << " snap " << snapid
9966 << dendl;
9967
9968 cur = get_inode(dis->get_base_ino());
11fdf7f2 9969 ceph_assert(cur);
7c673cae
FG
9970
9971 // add root
9972 reply->starts_with = MDiscoverReply::INODE;
9973 replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
9974 dout(10) << "added base " << *cur << dendl;
9975 }
9976 else {
9977 // there's a base inode
9978 cur = get_inode(dis->get_base_ino(), snapid);
9979 if (!cur && snapid != CEPH_NOSNAP) {
9980 cur = get_inode(dis->get_base_ino());
9981 if (cur && !cur->is_multiversion())
9982 cur = NULL; // nope!
9983 }
9984
9985 if (!cur) {
9986 dout(7) << "handle_discover mds." << from
9987 << " don't have base ino " << dis->get_base_ino() << "." << snapid
9988 << dendl;
9989 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
9990 reply->set_error_dentry(dis->get_dentry(0));
9991 reply->set_flag_error_dir();
9992 } else if (dis->wants_base_dir()) {
9993 dout(7) << "handle_discover mds." << from
9994 << " wants basedir+" << dis->get_want().get_path()
9995 << " has " << *cur
9996 << dendl;
9997 } else {
9998 dout(7) << "handle_discover mds." << from
9999 << " wants " << dis->get_want().get_path()
10000 << " has " << *cur
10001 << dendl;
10002 }
10003 }
10004
11fdf7f2 10005 ceph_assert(reply);
7c673cae
FG
10006
10007 // add content
10008 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10009 for (unsigned i = 0;
10010 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10011 i++) {
10012
10013 // -- figure out the dir
10014
10015 // is *cur even a dir at all?
10016 if (!cur->is_dir()) {
10017 dout(7) << *cur << " not a dir" << dendl;
10018 reply->set_flag_error_dir();
10019 break;
10020 }
10021
10022 // pick frag
10023 frag_t fg;
10024 if (dis->get_want().depth()) {
10025 // dentry specifies
10026 fg = cur->pick_dirfrag(dis->get_dentry(i));
10027 } else {
10028      // requester explicitly specified the frag
11fdf7f2 10029 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
7c673cae
FG
10030 fg = dis->get_base_dir_frag();
10031 if (!cur->dirfragtree.is_leaf(fg))
10032 fg = cur->dirfragtree[fg.value()];
10033 }
10034 CDir *curdir = cur->get_dirfrag(fg);
10035
10036 if ((!curdir && !cur->is_auth()) ||
10037 (curdir && !curdir->is_auth())) {
10038
10039 /* before:
10040 * ONLY set flag if empty!!
10041 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10042 * resulting in duplicate discovers in flight,
10043 * which can wreak havoc when discovering rename srcdn (which may move)
10044 */
10045
10046 if (reply->is_empty()) {
10047 // only hint if empty.
10048 // someday this could be better, but right now the waiter logic isn't smart enough.
10049
10050 // hint
10051 if (curdir) {
10052 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10053 reply->set_dir_auth_hint(curdir->authority().first);
10054 } else {
10055 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10056 << *cur << dendl;
10057 reply->set_dir_auth_hint(cur->authority().first);
10058 }
10059
10060 // note error dentry, if any
10061 // NOTE: important, as it allows requester to issue an equivalent discover
10062 // to whomever we hint at.
10063 if (dis->get_want().depth() > i)
10064 reply->set_error_dentry(dis->get_dentry(i));
10065 }
10066
10067 break;
10068 }
10069
10070 if (!curdir) { // open dir?
10071 if (cur->is_frozen()) {
10072 if (!reply->is_empty()) {
10073 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10074 break;
10075 }
10076 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10077 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10078 return;
10079 }
10080 curdir = cur->get_or_open_dirfrag(this, fg);
10081 } else if (curdir->is_frozen_tree() ||
10082 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
31f18b77
FG
10083 if (!reply->is_empty()) {
10084 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10085 break;
10086 }
7c673cae
FG
10087 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10088 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10089 reply->set_flag_error_dir();
10090 break;
10091 }
7c673cae
FG
10092 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10093 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10094 return;
10095 }
10096
10097 // add dir
10098 if (curdir->get_version() == 0) {
10099 // fetch newly opened dir
10100 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10101 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10102      // make sure the base frag is correct, though, in case there was a refragment since the
10103 // original request was sent.
10104 reply->set_base_dir_frag(curdir->get_frag());
10105 } else {
11fdf7f2 10106 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
7c673cae
FG
10107 if (!reply->trace.length())
10108 reply->starts_with = MDiscoverReply::DIR;
10109 replicate_dir(curdir, from, reply->trace);
10110 dout(7) << "handle_discover added dir " << *curdir << dendl;
10111 }
10112
10113 // lookup
10114 CDentry *dn = 0;
10115 if (curdir->get_version() == 0) {
10116 // fetch newly opened dir
11fdf7f2 10117 ceph_assert(!curdir->has_bloom());
7c673cae
FG
10118 } else if (dis->get_want().depth() > 0) {
10119 // lookup dentry
10120 dn = curdir->lookup(dis->get_dentry(i), snapid);
10121 } else
10122 break; // done!
10123
10124 // incomplete dir?
10125 if (!dn) {
31f18b77 10126 if (!curdir->is_complete() &&
11fdf7f2
TL
10127 !(snapid == CEPH_NOSNAP &&
10128 curdir->has_bloom() &&
10129 !curdir->is_in_bloom(dis->get_dentry(i)))) {
7c673cae
FG
10130 // readdir
10131 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10132 if (reply->is_empty()) {
10133 // fetch and wait
10134 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10135 dis->wants_base_dir() && curdir->get_version() == 0);
7c673cae
FG
10136 return;
10137 } else {
10138 // initiate fetch, but send what we have so far
10139 curdir->fetch(0);
10140 break;
10141 }
10142 }
10143
11fdf7f2
TL
10144 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10145 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10146 << " dne, non-empty reply, stopping" << dendl;
10147 break;
10148 }
10149
7c673cae
FG
10150 // send null dentry
10151 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10152 << *curdir << dendl;
11fdf7f2
TL
10153 if (snapid == CEPH_NOSNAP)
10154 dn = curdir->add_null_dentry(dis->get_dentry(i));
10155 else
10156 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
7c673cae 10157 }
11fdf7f2 10158 ceph_assert(dn);
7c673cae 10159
31f18b77
FG
10160 // don't add replica to purging dentry/inode
10161 if (dn->state_test(CDentry::STATE_PURGING)) {
10162 if (reply->is_empty())
10163 reply->set_flag_error_dn(dis->get_dentry(i));
10164 break;
10165 }
10166
7c673cae
FG
10167 CDentry::linkage_t *dnl = dn->get_linkage();
10168
10169 // xlocked dentry?
10170 // ...always block on non-tail items (they are unrelated)
10171    // ...allow xlocked tail discovery _only_ if explicitly requested
10172 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
10173 if (dn->lock.is_xlocked()) {
10174 // is this the last (tail) item in the discover traversal?
10175 if (tailitem && dis->wants_xlocked()) {
10176 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
10177 } else if (reply->is_empty()) {
10178 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10179 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10180 return;
10181 } else {
10182 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10183 break;
10184 }
10185 }
10186
10187 // frozen inode?
10188 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10189 if (tailitem && dis->wants_xlocked()) {
10190 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10191 } else if (reply->is_empty()) {
10192 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10193 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
7c673cae
FG
10194 return;
10195 } else {
10196 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10197 break;
10198 }
10199 }
10200
10201 // add dentry
10202 if (!reply->trace.length())
10203 reply->starts_with = MDiscoverReply::DENTRY;
10204 replicate_dentry(dn, from, reply->trace);
10205 dout(7) << "handle_discover added dentry " << *dn << dendl;
10206
10207 if (!dnl->is_primary()) break; // stop on null or remote link.
10208
10209 // add inode
10210 CInode *next = dnl->get_inode();
11fdf7f2 10211 ceph_assert(next->is_auth());
7c673cae
FG
10212
10213 replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10214 dout(7) << "handle_discover added inode " << *next << dendl;
10215
10216 // descend, keep going.
10217 cur = next;
10218 continue;
10219 }
10220
10221 // how did we do?
11fdf7f2 10222 ceph_assert(!reply->is_empty());
7c673cae
FG
10223 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10224 mds->send_message(reply, dis->get_connection());
7c673cae
FG
10225}
10226
11fdf7f2 10227void MDCache::handle_discover_reply(const MDiscoverReply::const_ref &m)
7c673cae
FG
10228{
10229 /*
10230 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10231 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
7c673cae
FG
10232 return;
10233 }
10234 */
10235 dout(7) << "discover_reply " << *m << dendl;
10236 if (m->is_flag_error_dir())
10237 dout(7) << " flag error, dir" << dendl;
10238 if (m->is_flag_error_dn())
10239 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10240
11fdf7f2 10241 MDSContext::vec finished, error;
7c673cae
FG
10242 mds_rank_t from = mds_rank_t(m->get_source().num());
10243
10244 // starting point
10245 CInode *cur = get_inode(m->get_base_ino());
11fdf7f2 10246 auto p = m->trace.cbegin();
7c673cae
FG
10247
10248 int next = m->starts_with;
10249
10250 // decrement discover counters
10251 if (m->get_tid()) {
10252 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10253 if (p != discovers.end()) {
10254 dout(10) << " found tid " << m->get_tid() << dendl;
10255 discovers.erase(p);
10256 } else {
10257 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10258 }
10259 }
10260
10261 // discover may start with an inode
10262 if (!p.end() && next == MDiscoverReply::INODE) {
10263 cur = add_replica_inode(p, NULL, finished);
10264 dout(7) << "discover_reply got base inode " << *cur << dendl;
11fdf7f2 10265 ceph_assert(cur->is_base());
7c673cae
FG
10266
10267 next = MDiscoverReply::DIR;
10268
10269 // take waiters?
10270 if (cur->is_base() &&
10271 waiting_for_base_ino[from].count(cur->ino())) {
10272 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10273 waiting_for_base_ino[from].erase(cur->ino());
10274 }
10275 }
11fdf7f2 10276 ceph_assert(cur);
7c673cae
FG
10277
10278 // loop over discover results.
10279  // items follow the pattern ([[dir] dentry] inode)
10280  // and the trace can start and end with any type.
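  // Editorial note (a hedged reading of how handle_discover() builds the
  // trace): the sequence may be cut short after any element, and
  // 'next'/'starts_with' only name the type of the upcoming element so the
  // decoder knows whether to expect a dir, a dentry or an inode first.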
10281 while (!p.end()) {
10282 // dir
10283 frag_t fg;
10284 CDir *curdir = 0;
10285 if (next == MDiscoverReply::DIR) {
10286 curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
10287 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
11fdf7f2 10288 ceph_assert(m->get_wanted_base_dir());
7c673cae
FG
10289 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10290 }
10291 } else {
10292       // note: this can only happen on our first pass around this loop.
10293 if (p.end() && m->is_flag_error_dn()) {
10294 fg = cur->pick_dirfrag(m->get_error_dentry());
10295 curdir = cur->get_dirfrag(fg);
10296 } else
10297 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10298 }
10299
10300 if (p.end())
10301 break;
10302
10303 // dentry
10304 CDentry *dn = add_replica_dentry(p, curdir, finished);
10305
10306 if (p.end())
10307 break;
10308
10309 // inode
10310 cur = add_replica_inode(p, dn, finished);
10311
10312 next = MDiscoverReply::DIR;
10313 }
10314
10315 // dir error?
10316 // or dir_auth hint?
10317 if (m->is_flag_error_dir() && !cur->is_dir()) {
10318 // not a dir.
10319 cur->take_waiting(CInode::WAIT_DIR, error);
10320 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10321 mds_rank_t who = m->get_dir_auth_hint();
10322 if (who == mds->get_nodeid()) who = -1;
10323 if (who >= 0)
10324 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10325
7c673cae
FG
10326
10327 if (m->get_wanted_base_dir()) {
31f18b77
FG
10328 frag_t fg = m->get_base_dir_frag();
10329 CDir *dir = cur->get_dirfrag(fg);
10330
7c673cae
FG
10331 if (cur->is_waiting_for_dir(fg)) {
10332 if (cur->is_auth())
10333 cur->take_waiting(CInode::WAIT_DIR, finished);
10334 else if (dir || !cur->dirfragtree.is_leaf(fg))
10335 cur->take_dir_waiting(fg, finished);
10336 else
10337 discover_dir_frag(cur, fg, 0, who);
10338 } else
10339 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10340 }
10341
10342 // try again?
10343 if (m->get_error_dentry().length()) {
31f18b77
FG
10344 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10345 CDir *dir = cur->get_dirfrag(fg);
7c673cae
FG
10346 // wanted a dentry
10347 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10348 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10349 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10350 m->get_wanted_snapid(), finished);
10351 } else {
10352 filepath relpath(m->get_error_dentry(), 0);
10353 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
10354 }
10355 } else
10356 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10357 << m->get_error_dentry() << dendl;
10358 }
31f18b77
FG
10359 } else if (m->is_flag_error_dn()) {
10360 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10361 CDir *dir = cur->get_dirfrag(fg);
10362 if (dir) {
10363 if (dir->is_auth()) {
10364 dir->take_sub_waiting(finished);
10365 } else {
10366 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10367 m->get_wanted_snapid(), error);
10368 }
10369 }
7c673cae
FG
10370 }
10371
10372 // waiters
10373 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10374 mds->queue_waiters(finished);
7c673cae
FG
10375}
10376
10377
10378
10379// ----------------------------
10380// REPLICAS
10381
b32b8144
FG
10382
10383void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10384{
10385 dirfrag_t df = dir->dirfrag();
11fdf7f2 10386 encode(df, bl);
b32b8144
FG
10387 dir->encode_replica(to, bl);
10388}
10389
10390void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10391{
11fdf7f2
TL
10392 encode(dn->get_name(), bl);
10393 encode(dn->last, bl);
b32b8144
FG
10394 dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10395}
10396
10397void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10398 uint64_t features)
10399{
11fdf7f2
TL
10400  encode(in->inode.ino, bl);  // bleh, minor asymmetry here
10401 encode(in->last, bl);
b32b8144
FG
10402 in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
10403}
10404
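// Editorial sketch: each replicate_*() encoder above has a matching
// add_replica_*() decoder below, and each pair must walk the bufferlist in
// the same order.  replicate_stray()/add_replica_stray() further down show
// the convention by (de)coding six objects in a fixed sequence:
//
//   encode (replicate_stray)          decode (add_replica_stray)
//   --------------------------------  ------------------------------------
//   replicate_inode(mdsdir inode)     mdsin      = add_replica_inode(...)
//   replicate_dir(mdsdir dirfrag)     mdsdir     = add_replica_dir(...)
//   replicate_dentry(straydir dn)     straydirdn = add_replica_dentry(...)
//   replicate_inode(straydir inode)   strayin    = add_replica_inode(...)
//   replicate_dir(straydir dirfrag)   straydir   = add_replica_dir(...)
//   replicate_dentry(straydn)         straydn    = add_replica_dentry(...)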
11fdf7f2
TL
10405CDir *MDCache::add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
10406 MDSContext::vec& finished)
7c673cae
FG
10407{
10408 dirfrag_t df;
11fdf7f2 10409 decode(df, p);
7c673cae 10410
11fdf7f2 10411 ceph_assert(diri->ino() == df.ino);
7c673cae
FG
10412
10413 // add it (_replica_)
10414 CDir *dir = diri->get_dirfrag(df.frag);
10415
10416 if (dir) {
10417 // had replica. update w/ new nonce.
10418 dir->decode_replica(p);
10419 dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
10420 } else {
10421 // force frag to leaf in the diri tree
10422 if (!diri->dirfragtree.is_leaf(df.frag)) {
10423 dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
10424 << diri->dirfragtree << dendl;
10425 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10426 }
10427
10428 // add replica.
10429 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10430 dir->decode_replica(p);
10431
10432 // is this a dir_auth delegation boundary?
10433 if (from != diri->authority().first ||
10434 diri->is_ambiguous_auth() ||
10435 diri->is_base())
10436 adjust_subtree_auth(dir, from);
10437
10438 dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
10439
10440 // get waiters
10441 diri->take_dir_waiting(df.frag, finished);
10442 }
10443
10444 return dir;
10445}
10446
11fdf7f2 10447CDentry *MDCache::add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
7c673cae
FG
10448{
10449 string name;
10450 snapid_t last;
11fdf7f2
TL
10451 decode(name, p);
10452 decode(last, p);
7c673cae
FG
10453
10454 CDentry *dn = dir->lookup(name, last);
10455
10456 // have it?
10457 if (dn) {
10458 dn->decode_replica(p, false);
10459 dout(7) << "add_replica_dentry had " << *dn << dendl;
10460 } else {
10461 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10462 dn->decode_replica(p, true);
10463 dout(7) << "add_replica_dentry added " << *dn << dendl;
10464 }
10465
10466 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10467
10468 return dn;
10469}
10470
11fdf7f2 10471CInode *MDCache::add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
7c673cae
FG
10472{
10473 inodeno_t ino;
10474 snapid_t last;
11fdf7f2
TL
10475 decode(ino, p);
10476 decode(last, p);
7c673cae
FG
10477 CInode *in = get_inode(ino, last);
10478 if (!in) {
10479 in = new CInode(this, false, 1, last);
10480 in->decode_replica(p, true);
10481 add_inode(in);
10482 if (in->ino() == MDS_INO_ROOT)
10483 in->inode_auth.first = 0;
10484 else if (in->is_mdsdir())
10485 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10486 dout(10) << "add_replica_inode added " << *in << dendl;
10487 if (dn) {
11fdf7f2 10488 ceph_assert(dn->get_linkage()->is_null());
7c673cae
FG
10489 dn->dir->link_primary_inode(dn, in);
10490 }
10491 } else {
10492 in->decode_replica(p, false);
10493 dout(10) << "add_replica_inode had " << *in << dendl;
10494 }
10495
10496 if (dn) {
10497 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10498 dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
10499 }
10500
10501 return in;
10502}
10503
10504
10505void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10506{
10507 uint64_t features = mds->mdsmap->get_up_features();
10508 replicate_inode(get_myin(), who, bl, features);
10509 replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10510 replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10511 replicate_inode(straydn->get_dir()->inode, who, bl, features);
10512 replicate_dir(straydn->get_dir(), who, bl);
10513 replicate_dentry(straydn, who, bl);
10514}
10515
11fdf7f2 10516CDentry *MDCache::add_replica_stray(const bufferlist &bl, mds_rank_t from)
7c673cae 10517{
11fdf7f2
TL
10518 MDSContext::vec finished;
10519 auto p = bl.cbegin();
7c673cae
FG
10520
10521 CInode *mdsin = add_replica_inode(p, NULL, finished);
10522 CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
10523 CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
10524 CInode *strayin = add_replica_inode(p, straydirdn, finished);
10525 CDir *straydir = add_replica_dir(p, strayin, from, finished);
10526 CDentry *straydn = add_replica_dentry(p, straydir, finished);
10527 if (!finished.empty())
10528 mds->queue_waiters(finished);
10529
10530 return straydn;
10531}
10532
10533
10534int MDCache::send_dir_updates(CDir *dir, bool bcast)
10535{
10536 // this is an FYI, re: replication
10537
10538 set<mds_rank_t> who;
10539 if (bcast) {
10540 mds->get_mds_map()->get_active_mds_set(who);
10541 } else {
181888fb
FG
10542 for (const auto &p : dir->get_replicas()) {
10543 who.insert(p.first);
10544 }
7c673cae
FG
10545 }
10546
10547 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10548
10549 filepath path;
10550 dir->inode->make_path(path);
10551
10552 mds_rank_t whoami = mds->get_nodeid();
10553 for (set<mds_rank_t>::iterator it = who.begin();
10554 it != who.end();
10555 ++it) {
10556 if (*it == whoami) continue;
10557 //if (*it == except) continue;
10558 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10559
94b18763
FG
10560 std::set<int32_t> s;
10561 for (const auto &r : dir->dir_rep_by) {
10562 s.insert(r);
10563 }
11fdf7f2 10564 mds->send_message_mds(MDirUpdate::create(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
7c673cae
FG
10565 }
10566
10567 return 0;
10568}
10569
11fdf7f2 10570void MDCache::handle_dir_update(const MDirUpdate::const_ref &m)
7c673cae 10571{
224ce89b
WB
10572 dirfrag_t df = m->get_dirfrag();
10573 CDir *dir = get_dirfrag(df);
7c673cae 10574 if (!dir) {
224ce89b 10575 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
7c673cae
FG
10576
10577 // discover it?
10578 if (m->should_discover()) {
10579 // only try once!
10580 // this is key to avoid a fragtree update race, among other things.
224ce89b 10581 m->inc_tried_discover();
7c673cae
FG
10582 vector<CDentry*> trace;
10583 CInode *in;
10584 filepath path = m->get_path();
10585 dout(5) << "trying discover on dir_update for " << path << dendl;
11fdf7f2 10586 CF_MDS_RetryMessageFactory cf(mds, m);
7c673cae 10587 MDRequestRef null_ref;
11fdf7f2 10588 int r = path_traverse(null_ref, cf, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
7c673cae
FG
10589 if (r > 0)
10590 return;
224ce89b
WB
10591 if (r == 0 &&
10592 in->ino() == df.ino &&
10593 in->get_approx_dirfrag(df.frag) == NULL) {
10594 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10595 return;
10596 }
7c673cae
FG
10597 }
10598
7c673cae
FG
10599 return;
10600 }
10601
224ce89b
WB
10602 if (!m->has_tried_discover()) {
10603    // Update if it already exists. Otherwise it got updated by the discover reply.
10604 dout(5) << "dir_update on " << *dir << dendl;
10605 dir->dir_rep = m->get_dir_rep();
94b18763
FG
10606 dir->dir_rep_by.clear();
10607 for (const auto &e : m->get_dir_rep_by()) {
10608 dir->dir_rep_by.insert(e);
10609 }
224ce89b 10610 }
7c673cae
FG
10611}
10612
10613
10614
10615
10616
10617// LINK
10618
10619void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10620{
10621 dout(7) << "send_dentry_link " << *dn << dendl;
10622
10623 CDir *subtree = get_subtree_root(dn->get_dir());
181888fb 10624 for (const auto &p : dn->get_replicas()) {
7c673cae 10625 // don't tell (rename) witnesses; they already know
181888fb 10626 if (mdr.get() && mdr->more()->witnessed.count(p.first))
7c673cae 10627 continue;
181888fb
FG
10628 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10629 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10630 rejoin_gather.count(p.first)))
7c673cae
FG
10631 continue;
10632 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 10633 auto m = MDentryLink::create(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
7c673cae
FG
10634 if (dnl->is_primary()) {
10635 dout(10) << " primary " << *dnl->get_inode() << dendl;
181888fb 10636 replicate_inode(dnl->get_inode(), p.first, m->bl,
7c673cae
FG
10637 mds->mdsmap->get_up_features());
10638 } else if (dnl->is_remote()) {
10639 inodeno_t ino = dnl->get_remote_ino();
10640 __u8 d_type = dnl->get_remote_d_type();
10641 dout(10) << " remote " << ino << " " << d_type << dendl;
11fdf7f2
TL
10642 encode(ino, m->bl);
10643 encode(d_type, m->bl);
7c673cae
FG
10644 } else
10645 ceph_abort(); // aie, bad caller!
181888fb 10646 mds->send_message_mds(m, p.first);
7c673cae
FG
10647 }
10648}
10649
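// Payload sketch (a reading of the code above and below, not a new format):
// for a primary link m->bl carries a full replica inode, which
// handle_dentry_link() decodes with add_replica_inode(); for a remote link
// it carries just
//
//   encode(ino, m->bl);
//   encode(d_type, m->bl);
//
// which the handler mirrors with two decode() calls before calling
// dir->link_remote_inode(dn, ino, d_type).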
11fdf7f2 10650void MDCache::handle_dentry_link(const MDentryLink::const_ref &m)
7c673cae 10651{
7c673cae
FG
10652 CDentry *dn = NULL;
10653 CDir *dir = get_dirfrag(m->get_dirfrag());
10654 if (!dir) {
10655 dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
10656 } else {
10657 dn = dir->lookup(m->get_dn());
10658 if (!dn) {
10659 dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10660 } else {
10661 dout(7) << "handle_dentry_link on " << *dn << dendl;
10662 CDentry::linkage_t *dnl = dn->get_linkage();
10663
11fdf7f2
TL
10664 ceph_assert(!dn->is_auth());
10665 ceph_assert(dnl->is_null());
7c673cae
FG
10666 }
10667 }
10668
11fdf7f2
TL
10669 auto p = m->bl.cbegin();
10670 MDSContext::vec finished;
7c673cae
FG
10671 if (dn) {
10672 if (m->get_is_primary()) {
10673 // primary link.
10674 add_replica_inode(p, dn, finished);
10675 } else {
10676 // remote link, easy enough.
10677 inodeno_t ino;
10678 __u8 d_type;
11fdf7f2
TL
10679 decode(ino, p);
10680 decode(d_type, p);
7c673cae
FG
10681 dir->link_remote_inode(dn, ino, d_type);
10682 }
10683 } else {
10684 ceph_abort();
10685 }
10686
10687 if (!finished.empty())
10688 mds->queue_waiters(finished);
10689
7c673cae
FG
10690 return;
10691}
10692
10693
10694// UNLINK
10695
10696void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
10697{
10698 dout(10) << "send_dentry_unlink " << *dn << dendl;
10699 // share unlink news with replicas
10700 set<mds_rank_t> replicas;
10701 dn->list_replicas(replicas);
11fdf7f2
TL
10702 bufferlist snapbl;
10703 if (straydn) {
7c673cae 10704 straydn->list_replicas(replicas);
11fdf7f2
TL
10705 CInode *strayin = straydn->get_linkage()->get_inode();
10706 strayin->encode_snap_blob(snapbl);
10707 }
7c673cae
FG
10708 for (set<mds_rank_t>::iterator it = replicas.begin();
10709 it != replicas.end();
10710 ++it) {
10711 // don't tell (rmdir) witnesses; they already know
10712 if (mdr.get() && mdr->more()->witnessed.count(*it))
10713 continue;
10714
10715 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
10716 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
10717 rejoin_gather.count(*it)))
10718 continue;
10719
11fdf7f2
TL
10720 auto unlink = MDentryUnlink::create(dn->get_dir()->dirfrag(), dn->get_name());
10721 if (straydn) {
7c673cae 10722 replicate_stray(straydn, *it, unlink->straybl);
11fdf7f2
TL
10723 unlink->snapbl = snapbl;
10724 }
7c673cae
FG
10725 mds->send_message_mds(unlink, *it);
10726 }
10727}
10728
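// Payload sketch (derived from the code above and below): MDentryUnlink
// carries
//   straybl -- the stray chain encoded by replicate_stray() and decoded on
//              the replica via add_replica_stray()
//   snapbl  -- the stray inode's snap blob (encode_snap_blob()), applied by
//              handle_dentry_unlink() via decode_snap_blob() when the
//              unlinked dentry was primary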
11fdf7f2 10729void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref &m)
7c673cae
FG
10730{
10731 // straydn
10732 CDentry *straydn = NULL;
10733 if (m->straybl.length())
10734 straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));
10735
10736 CDir *dir = get_dirfrag(m->get_dirfrag());
10737 if (!dir) {
10738 dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
10739 } else {
10740 CDentry *dn = dir->lookup(m->get_dn());
10741 if (!dn) {
10742 dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10743 } else {
10744 dout(7) << "handle_dentry_unlink on " << *dn << dendl;
10745 CDentry::linkage_t *dnl = dn->get_linkage();
10746
10747 // open inode?
10748 if (dnl->is_primary()) {
10749 CInode *in = dnl->get_inode();
10750 dn->dir->unlink_inode(dn);
11fdf7f2 10751 ceph_assert(straydn);
7c673cae
FG
10752 straydn->dir->link_primary_inode(straydn, in);
10753
10754 // in->first is lazily updated on replica; drag it forward so
10755 // that we always keep it in sync with the dnq
11fdf7f2 10756 ceph_assert(straydn->first >= in->first);
7c673cae
FG
10757 in->first = straydn->first;
10758
10759 // update subtree map?
10760 if (in->is_dir())
10761 adjust_subtree_after_rename(in, dir, false);
10762
11fdf7f2
TL
10763 if (m->snapbl.length()) {
10764 bool hadrealm = (in->snaprealm ? true : false);
10765 in->decode_snap_blob(m->snapbl);
10766 ceph_assert(in->snaprealm);
10767 ceph_assert(in->snaprealm->have_past_parents_open());
10768 if (!hadrealm)
10769 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
10770 }
10771
7c673cae
FG
10772 // send caps to auth (if we're not already)
10773 if (in->is_any_caps() &&
10774 !in->state_test(CInode::STATE_EXPORTINGCAPS))
10775 migrator->export_caps(in);
10776
7c673cae
FG
10777 straydn = NULL;
10778 } else {
11fdf7f2
TL
10779 ceph_assert(!straydn);
10780 ceph_assert(dnl->is_remote());
7c673cae
FG
10781 dn->dir->unlink_inode(dn);
10782 }
11fdf7f2 10783 ceph_assert(dnl->is_null());
7c673cae
FG
10784 }
10785 }
10786
10787 // race with trim_dentry()
10788 if (straydn) {
11fdf7f2
TL
10789 ceph_assert(straydn->get_num_ref() == 0);
10790 ceph_assert(straydn->get_linkage()->is_null());
10791 expiremap ex;
10792 trim_dentry(straydn, ex);
10793 send_expire_messages(ex);
7c673cae 10794 }
7c673cae
FG
10795}
10796
10797
10798
10799
10800
10801
10802// ===================================================================
10803
10804
10805
10806// ===================================================================
10807// FRAGMENT
10808
10809
10810/**
10811 * adjust_dir_fragments -- adjust fragmentation for a directory
10812 *
10813 * @param diri directory inode
10814 * @param basefrag base fragment
10815 * @param bits bit adjustment. positive for split, negative for merge.
10816 */
10817void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
10818 list<CDir*>& resultfrags,
11fdf7f2 10819 MDSContext::vec& waiters,
7c673cae
FG
10820 bool replay)
10821{
10822 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
10823 << " on " << *diri << dendl;
10824
10825 list<CDir*> srcfrags;
10826 diri->get_dirfrags_under(basefrag, srcfrags);
10827
10828 adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
10829}
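// Hedged example of the 'bits' convention documented above (argument lists
// abbreviated; resultfrags/waiters/replay omitted):
//
//   adjust_dir_fragments(diri, base, 2, ...);   // split: 1 dirfrag -> 4 dirfrags
//   adjust_dir_fragments(diri, base, -2, ...);  // merge: 4 dirfrags -> 1 dirfrag
//
// merge_dir() below derives the negative value the same way, as
// -(first->get_frag().bits() - frag.bits()).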
10830
10831CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
10832{
10833 CDir *dir = diri->get_dirfrag(fg);
10834 if (dir)
10835 return dir;
10836
10837 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
10838
10839 list<CDir*> src, result;
11fdf7f2 10840 MDSContext::vec waiters;
7c673cae
FG
10841
10842 // split a parent?
10843 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
10844 while (1) {
10845 CDir *pdir = diri->get_dirfrag(parent);
10846 if (pdir) {
10847 int split = fg.bits() - parent.bits();
10848 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
10849 src.push_back(pdir);
10850 adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
10851 dir = diri->get_dirfrag(fg);
10852 if (dir) {
10853 dout(10) << "force_dir_fragment result " << *dir << dendl;
10854 break;
10855 }
10856 }
10857 if (parent == frag_t())
10858 break;
10859 frag_t last = parent;
10860 parent = parent.parent();
10861 dout(10) << " " << last << " parent is " << parent << dendl;
10862 }
10863
10864 if (!dir) {
10865 // hoover up things under fg?
10866 diri->get_dirfrags_under(fg, src);
10867 if (src.empty()) {
10868 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
10869 } else {
10870 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
10871 adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
10872 dir = result.front();
10873 dout(10) << "force_dir_fragment result " << *dir << dendl;
10874 }
10875 }
10876 if (!replay)
10877 mds->queue_waiters(waiters);
10878 return dir;
10879}
10880
10881void MDCache::adjust_dir_fragments(CInode *diri,
10882 list<CDir*>& srcfrags,
10883 frag_t basefrag, int bits,
10884 list<CDir*>& resultfrags,
11fdf7f2 10885 MDSContext::vec& waiters,
7c673cae
FG
10886 bool replay)
10887{
10888 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
10889 << " srcfrags " << srcfrags
10890 << " on " << *diri << dendl;
10891
10892 // adjust fragtree
10893 // yuck. we may have discovered the inode while it was being fragmented.
10894 if (!diri->dirfragtree.is_leaf(basefrag))
10895 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
10896
10897 if (bits > 0)
10898 diri->dirfragtree.split(basefrag, bits);
10899 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
10900
10901 if (srcfrags.empty())
10902 return;
10903
10904 // split
10905 CDir *parent_dir = diri->get_parent_dir();
10906 CDir *parent_subtree = 0;
10907 if (parent_dir)
10908 parent_subtree = get_subtree_root(parent_dir);
10909
10910 if (bits > 0) {
10911 // SPLIT
11fdf7f2 10912 ceph_assert(srcfrags.size() == 1);
7c673cae
FG
10913 CDir *dir = srcfrags.front();
10914
10915 dir->split(bits, resultfrags, waiters, replay);
10916
10917 // did i change the subtree map?
10918 if (dir->is_subtree_root()) {
10919 // new frags are now separate subtrees
10920 for (list<CDir*>::iterator p = resultfrags.begin();
10921 p != resultfrags.end();
10922 ++p)
10923 subtrees[*p].clear(); // new frag is now its own subtree
10924
10925 // was i a bound?
10926 if (parent_subtree) {
11fdf7f2 10927 ceph_assert(subtrees[parent_subtree].count(dir));
7c673cae
FG
10928 subtrees[parent_subtree].erase(dir);
10929 for (list<CDir*>::iterator p = resultfrags.begin();
10930 p != resultfrags.end();
10931 ++p) {
11fdf7f2 10932 ceph_assert((*p)->is_subtree_root());
7c673cae
FG
10933 subtrees[parent_subtree].insert(*p);
10934 }
10935 }
10936
10937 // adjust my bounds.
10938 set<CDir*> bounds;
10939 bounds.swap(subtrees[dir]);
10940 subtrees.erase(dir);
10941 for (set<CDir*>::iterator p = bounds.begin();
10942 p != bounds.end();
10943 ++p) {
10944 CDir *frag = get_subtree_root((*p)->get_parent_dir());
10945 subtrees[frag].insert(*p);
10946 }
10947
10948 show_subtrees(10);
7c673cae
FG
10949 }
10950
10951 diri->close_dirfrag(dir->get_frag());
10952
10953 } else {
10954 // MERGE
10955
10956 // are my constituent bits subtrees? if so, i will be too.
10957 // (it's all or none, actually.)
11fdf7f2 10958 bool any_subtree = false, any_non_subtree = false;
31f18b77 10959 for (CDir *dir : srcfrags) {
11fdf7f2 10960 if (dir->is_subtree_root())
31f18b77 10961 any_subtree = true;
11fdf7f2
TL
10962 else
10963 any_non_subtree = true;
31f18b77 10964 }
11fdf7f2
TL
10965 ceph_assert(!any_subtree || !any_non_subtree);
10966
31f18b77
FG
10967 set<CDir*> new_bounds;
10968 if (any_subtree) {
10969 for (CDir *dir : srcfrags) {
10970 // this simplifies the code that finds subtrees underneath the dirfrag
10971 if (!dir->is_subtree_root()) {
10972 dir->state_set(CDir::STATE_AUXSUBTREE);
10973 adjust_subtree_auth(dir, mds->get_nodeid());
10974 }
10975 }
10976
10977 for (CDir *dir : srcfrags) {
11fdf7f2 10978 ceph_assert(dir->is_subtree_root());
7c673cae 10979 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
7c673cae
FG
10980 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
10981 set<CDir*>::iterator r = q->second.begin();
10982 while (r != subtrees[dir].end()) {
10983 new_bounds.insert(*r);
10984 subtrees[dir].erase(r++);
10985 }
10986 subtrees.erase(q);
31f18b77 10987
7c673cae
FG
10988 // remove myself as my parent's bound
10989 if (parent_subtree)
10990 subtrees[parent_subtree].erase(dir);
10991 }
10992 }
10993
10994 // merge
10995 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
10996 f->merge(srcfrags, waiters, replay);
7c673cae 10997
31f18b77 10998 if (any_subtree) {
11fdf7f2 10999 ceph_assert(f->is_subtree_root());
7c673cae
FG
11000 subtrees[f].swap(new_bounds);
11001 if (parent_subtree)
11002 subtrees[parent_subtree].insert(f);
11003
11004 show_subtrees(10);
11005 }
11006
11007 resultfrags.push_back(f);
11008 }
11009}
11010
11011
11012class C_MDC_FragmentFrozen : public MDSInternalContext {
11013 MDCache *mdcache;
11014 MDRequestRef mdr;
11015public:
11016 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11017 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11018 void finish(int r) override {
11019 mdcache->fragment_frozen(mdr, r);
11020 }
11021};
11022
11023bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
11024{
11025 if (is_readonly()) {
11026 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11027 return false;
11028 }
11029 if (mds->is_cluster_degraded()) {
11030 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11031 return false;
11032 }
11033 if (diri->get_parent_dir() &&
11034 diri->get_parent_dir()->get_inode()->is_stray()) {
11035 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11036 return false;
11037 }
11038 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
11039 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
11040 return false;
11041 }
11042
11043 if (diri->scrub_is_in_progress()) {
11044 dout(7) << "can_fragment: scrub in progress" << dendl;
11045 return false;
11046 }
11047
11048 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11049 CDir *dir = *p;
11050 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11051 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11052 return false;
11053 }
11054 if (!dir->is_auth()) {
11055 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11056 return false;
11057 }
11058 if (dir->is_bad()) {
11059 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11060 return false;
11061 }
11062 if (dir->is_frozen() ||
11063 dir->is_freezing()) {
11064 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11065 return false;
11066 }
11067 }
11068
11069 return true;
11070}
11071
11072void MDCache::split_dir(CDir *dir, int bits)
11073{
11074 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11fdf7f2 11075 ceph_assert(dir->is_auth());
7c673cae
FG
11076 CInode *diri = dir->inode;
11077
11078 list<CDir*> dirs;
11079 dirs.push_back(dir);
11080
11081 if (!can_fragment(diri, dirs)) {
11082 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11083 return;
11084 }
11085
31f18b77
FG
11086 if (dir->frag.bits() + bits > 24) {
11087 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11088 return;
11089 }
11090
7c673cae
FG
11091 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11092 mdr->more()->fragment_base = dir->dirfrag();
11093
11fdf7f2 11094 ceph_assert(fragments.count(dir->dirfrag()) == 0);
7c673cae
FG
11095 fragment_info_t& info = fragments[dir->dirfrag()];
11096 info.mdr = mdr;
11097 info.dirs.push_back(dir);
11098 info.bits = bits;
11099 info.last_cum_auth_pins_change = ceph_clock_now();
11100
11101 fragment_freeze_dirs(dirs);
11102 // initial mark+complete pass
11103 fragment_mark_and_complete(mdr);
11104}
11105
11106void MDCache::merge_dir(CInode *diri, frag_t frag)
11107{
11108 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11109
11110 list<CDir*> dirs;
11111 if (!diri->get_dirfrags_under(frag, dirs)) {
11112 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11113 return;
11114 }
11115
11116 if (diri->dirfragtree.is_leaf(frag)) {
11117 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11118 return;
11119 }
11120
11121 if (!can_fragment(diri, dirs))
11122 return;
11123
11124 CDir *first = dirs.front();
11125 int bits = first->get_frag().bits() - frag.bits();
11126  dout(10) << " we are merging by " << bits << " bits" << dendl;
11127
11128 dirfrag_t basedirfrag(diri->ino(), frag);
11129 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11130 mdr->more()->fragment_base = basedirfrag;
11131
11fdf7f2 11132 ceph_assert(fragments.count(basedirfrag) == 0);
7c673cae
FG
11133 fragment_info_t& info = fragments[basedirfrag];
11134 info.mdr = mdr;
11135 info.dirs = dirs;
11136 info.bits = -bits;
11137 info.last_cum_auth_pins_change = ceph_clock_now();
11138
11139 fragment_freeze_dirs(dirs);
11140 // initial mark+complete pass
11141 fragment_mark_and_complete(mdr);
11142}
11143
11144void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
11145{
11fdf7f2
TL
11146 bool any_subtree = false, any_non_subtree = false;
11147 for (CDir* dir : dirs) {
7c673cae
FG
11148 dir->auth_pin(dir); // until we mark and complete them
11149 dir->state_set(CDir::STATE_FRAGMENTING);
11150 dir->freeze_dir();
11fdf7f2
TL
11151 ceph_assert(dir->is_freezing_dir());
11152
11153 if (dir->is_subtree_root())
11154 any_subtree = true;
11155 else
11156 any_non_subtree = true;
11157 }
11158
11159 if (any_subtree && any_non_subtree) {
11160    // the dirfrags must be either all subtree roots or all non-roots; promote the rest to (aux) subtree roots.
11161 for (CDir *dir : dirs) {
11162 if (dir->is_subtree_root()) {
11163 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11164 } else {
11165 dir->state_set(CDir::STATE_AUXSUBTREE);
11166 adjust_subtree_auth(dir, mds->get_nodeid());
11167 }
11168 }
7c673cae
FG
11169 }
11170}
11171
11172class C_MDC_FragmentMarking : public MDCacheContext {
11173 MDRequestRef mdr;
11174public:
11175 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11176 void finish(int r) override {
11177 mdcache->fragment_mark_and_complete(mdr);
11178 }
11179};
11180
11181void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11182{
11183 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11184 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11185 if (it == fragments.end() || it->second.mdr != mdr) {
11186 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11187 request_finish(mdr);
11188 return;
11189 }
11190
11191 fragment_info_t& info = it->second;
11192 CInode *diri = info.dirs.front()->get_inode();
11193 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11194
11195 MDSGatherBuilder gather(g_ceph_context);
11196
11197 for (list<CDir*>::iterator p = info.dirs.begin();
11198 p != info.dirs.end();
11199 ++p) {
11200 CDir *dir = *p;
11201
11202 bool ready = true;
11203 if (!dir->is_complete()) {
11204 dout(15) << " fetching incomplete " << *dir << dendl;
11205 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11206 ready = false;
11207 } else if (dir->get_frag() == frag_t()) {
11208      // The COMPLETE flag gets lost if we fragment a new dirfrag and then roll back
11209      // the operation. To avoid CDir::fetch() complaining about a missing object,
11210      // we commit the new dirfrag first.
11211 if (dir->state_test(CDir::STATE_CREATING)) {
11212 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11213 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11214 ready = false;
11215 } else if (dir->is_new()) {
11216 dout(15) << " committing new " << *dir << dendl;
11fdf7f2 11217 ceph_assert(dir->is_dirty());
7c673cae
FG
11218 dir->commit(0, gather.new_sub(), true);
11219 ready = false;
11220 }
11221 }
11222 if (!ready)
11223 continue;
11224
11225 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11226 dout(15) << " marking " << *dir << dendl;
94b18763
FG
11227 for (auto &p : dir->items) {
11228 CDentry *dn = p.second;
7c673cae 11229 dn->get(CDentry::PIN_FRAGMENTING);
11fdf7f2 11230 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11231 dn->state_set(CDentry::STATE_FRAGMENTING);
11232 }
11233 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11234 dir->auth_unpin(dir);
11235 } else {
11236 dout(15) << " already marked " << *dir << dendl;
11237 }
11238 }
11239 if (gather.has_subs()) {
11240 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11241 gather.activate();
11242 return;
11243 }
11244
11245 for (list<CDir*>::iterator p = info.dirs.begin();
11246 p != info.dirs.end();
11247 ++p) {
11248 CDir *dir = *p;
11249 if (!dir->is_frozen_dir()) {
11fdf7f2 11250 ceph_assert(dir->is_freezing_dir());
7c673cae
FG
11251 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11252 }
11253 }
11254 if (gather.has_subs()) {
11255 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11256 gather.activate();
11257 // flush log so that request auth_pins are retired
11258 mds->mdlog->flush();
11259 return;
11260 }
11261
11262 fragment_frozen(mdr, 0);
11263}
11264
11265void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
11266{
11267 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11268 for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
11269 CDir *dir = *p;
11270 dout(10) << " frag " << *dir << dendl;
11271
11fdf7f2 11272 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
7c673cae
FG
11273 dir->state_clear(CDir::STATE_FRAGMENTING);
11274
11275 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11276 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11277
94b18763
FG
11278 for (auto &p : dir->items) {
11279 CDentry *dn = p.second;
11fdf7f2 11280 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11281 dn->state_clear(CDentry::STATE_FRAGMENTING);
11282 dn->put(CDentry::PIN_FRAGMENTING);
11283 }
11284 } else {
11285 dir->auth_unpin(dir);
11286 }
11287
11288 dir->unfreeze_dir();
11289 }
11290}
11291
11292bool MDCache::fragment_are_all_frozen(CDir *dir)
11293{
11fdf7f2 11294 ceph_assert(dir->is_frozen_dir());
7c673cae
FG
11295 map<dirfrag_t,fragment_info_t>::iterator p;
11296 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11297 p != fragments.end() && p->first.ino == dir->ino();
11298 ++p) {
11299 if (p->first.frag.contains(dir->get_frag()))
11300 return p->second.all_frozen;
11301 }
11302 ceph_abort();
11303 return false;
11304}
11305
11306void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11307{
11308 map<dirfrag_t,fragment_info_t>::iterator p;
11309 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11310 p != fragments.end() && p->first.ino == dir->ino();
11311 ++p) {
11312 if (p->first.frag.contains(dir->get_frag())) {
11313 p->second.num_remote_waiters++;
11314 return;
11315 }
11316 }
11317 ceph_abort();
11318}
11319
11320void MDCache::find_stale_fragment_freeze()
11321{
11322 dout(10) << "find_stale_fragment_freeze" << dendl;
11323 // see comment in Migrator::find_stale_export_freeze()
11324 utime_t now = ceph_clock_now();
11325 utime_t cutoff = now;
11fdf7f2 11326 cutoff -= g_conf()->mds_freeze_tree_timeout;
7c673cae
FG
11327
11328 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11329 p != fragments.end(); ) {
11330 dirfrag_t df = p->first;
11331 fragment_info_t& info = p->second;
11332 ++p;
11333 if (info.all_frozen)
11334 continue;
11335 CDir *dir;
11336 int total_auth_pins = 0;
11337 for (list<CDir*>::iterator q = info.dirs.begin();
11338 q != info.dirs.end();
11339 ++q) {
11340 dir = *q;
11341 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11342 total_auth_pins = -1;
11343 break;
11344 }
11345 if (dir->is_frozen_dir())
11346 continue;
11347 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11348 }
11349 if (total_auth_pins < 0)
11350 continue;
11351 if (info.last_cum_auth_pins != total_auth_pins) {
11352 info.last_cum_auth_pins = total_auth_pins;
11353 info.last_cum_auth_pins_change = now;
11354 continue;
11355 }
11356 if (info.last_cum_auth_pins_change >= cutoff)
11357 continue;
11358 dir = info.dirs.front();
11359 if (info.num_remote_waiters > 0 ||
11360 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11361 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11362 list<CDir*> dirs;
11363 info.dirs.swap(dirs);
11364 fragments.erase(df);
11365 fragment_unmark_unfreeze_dirs(dirs);
11366 }
11367 }
11368}
11369
11370class C_MDC_FragmentPrep : public MDCacheLogContext {
11371 MDRequestRef mdr;
11372public:
11373 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11374 void finish(int r) override {
11375 mdcache->_fragment_logged(mdr);
11376 }
11377};
11378
11379class C_MDC_FragmentStore : public MDCacheContext {
11380 MDRequestRef mdr;
11381public:
11382 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11383 void finish(int r) override {
11384 mdcache->_fragment_stored(mdr);
11385 }
11386};
11387
11388class C_MDC_FragmentCommit : public MDCacheLogContext {
11389 dirfrag_t basedirfrag;
a8e16298 11390 MDRequestRef mdr;
7c673cae 11391public:
a8e16298
TL
11392 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11393 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
7c673cae 11394 void finish(int r) override {
a8e16298 11395 mdcache->_fragment_committed(basedirfrag, mdr);
7c673cae
FG
11396 }
11397};
11398
a8e16298 11399class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
7c673cae 11400 dirfrag_t basedirfrag;
a8e16298
TL
11401 int bits;
11402 MDRequestRef mdr;
7c673cae 11403public:
a8e16298
TL
11404 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11405 const MDRequestRef& r) :
11406 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
7c673cae 11407 void finish(int r) override {
11fdf7f2 11408 ceph_assert(r == 0 || r == -ENOENT);
a8e16298 11409 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
7c673cae 11410 }
91327a77 11411 void print(ostream& out) const override {
a8e16298 11412 out << "fragment_purge_old(" << basedirfrag << ")";
91327a77 11413 }
7c673cae
FG
11414};
11415
11416void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11417{
11418 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11419 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11420 if (it == fragments.end() || it->second.mdr != mdr) {
11421 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11422 request_finish(mdr);
11423 return;
11424 }
11425
11fdf7f2 11426 ceph_assert(r == 0);
7c673cae
FG
11427 fragment_info_t& info = it->second;
11428 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11429 << " on " << info.dirs.front()->get_inode() << dendl;
11430
11431 info.all_frozen = true;
11432 dispatch_fragment_dir(mdr);
11433}
11434
11435void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11436{
11437 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11438 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11439 if (it == fragments.end() || it->second.mdr != mdr) {
11440 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11441 request_finish(mdr);
11442 return;
11443 }
11444
11445 fragment_info_t& info = it->second;
11446 CInode *diri = info.dirs.front()->get_inode();
11447
11448 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11449 << " on " << *diri << dendl;
11450 if (!mdr->aborted) {
11fdf7f2
TL
11451 MutationImpl::LockOpVec lov;
11452 lov.add_wrlock(&diri->dirfragtreelock);
7c673cae 11453 // prevent a racing gather on any other scatterlocks too
11fdf7f2
TL
11454 lov.add_wrlock(&diri->nestlock);
11455 lov.add_wrlock(&diri->filelock);
11456 if (!mds->locker->acquire_locks(mdr, lov, NULL, true))
7c673cae
FG
11457 if (!mdr->aborted)
11458 return;
11459 }
11460
11461 if (mdr->aborted) {
11462 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11463 << info.dirs.front()->dirfrag() << dendl;
11464 if (info.bits > 0)
11465 mds->balancer->queue_split(info.dirs.front(), false);
11466 else
11467 mds->balancer->queue_merge(info.dirs.front());
11468 fragment_unmark_unfreeze_dirs(info.dirs);
11469 fragments.erase(it);
11470 request_finish(mdr);
11471 return;
11472 }
11473
11474 mdr->ls = mds->mdlog->get_current_segment();
11475 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11476 mds->mdlog->start_entry(le);
11477
11478 for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
11479 CDir *dir = *p;
11480 dirfrag_rollback rollback;
11481 rollback.fnode = dir->fnode;
11482 le->add_orig_frag(dir->get_frag(), &rollback);
11483 }
11484
11485 // refragment
11fdf7f2 11486 MDSContext::vec waiters;
7c673cae
FG
11487 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11488 info.resultfrags, waiters, false);
11fdf7f2 11489 if (g_conf()->mds_debug_frag)
7c673cae
FG
11490 diri->verify_dirfrags();
11491 mds->queue_waiters(waiters);
11492
11fdf7f2
TL
11493 for (const auto& fg : le->orig_frags)
11494 ceph_assert(!diri->dirfragtree.is_leaf(fg));
7c673cae
FG
11495
11496 le->metablob.add_dir_context(*info.resultfrags.begin());
11497 for (list<CDir*>::iterator p = info.resultfrags.begin();
11498 p != info.resultfrags.end();
11499 ++p) {
11500 if (diri->is_auth()) {
11501 le->metablob.add_fragmented_dir(*p, false, false);
11502 } else {
11503 (*p)->state_set(CDir::STATE_DIRTYDFT);
11504 le->metablob.add_fragmented_dir(*p, false, true);
11505 }
11506 }
11507
11508 // dft lock
11509 if (diri->is_auth()) {
11510 // journal dirfragtree
94b18763
FG
11511 auto &pi = diri->project_inode();
11512 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11513 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11514 } else {
11515 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11516 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11517 mdr->add_updated_lock(&diri->dirfragtreelock);
11518 }
11519
11520 /*
11521 // filelock
11522 mds->locker->mark_updated_scatterlock(&diri->filelock);
11523 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11524 mut->add_updated_lock(&diri->filelock);
11525
11526 // dirlock
11527 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11528 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11529 mut->add_updated_lock(&diri->nestlock);
11530 */
11531
11532 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11533 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11534 mdr, __func__);
11535 mds->mdlog->flush();
11536}
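// Illustrative sketch (not part of the original MDCache.cc): dispatch_fragment_dir()
// above journals an EFragment OP_PREPARE and calls adjust_dir_fragments() with
// info.bits.  A split by b bits turns one dirfrag into 2^b children, and a merge is
// expressed with negative bits.  The helper below only restates that arithmetic; it
// deliberately avoids the real frag_t API and its name is made up for this note.
#include <cstdio>
static void sketch_fragment_counts(int bits) {
  if (bits > 0)
    std::printf("split by %d bits: 1 dirfrag -> %d dirfrags\n", bits, 1 << bits);
  else if (bits < 0)
    std::printf("merge by %d bits: %d dirfrags -> 1 dirfrag\n", -bits, 1 << -bits);
  else
    std::printf("bits == 0: no refragmentation\n");
}
// e.g. sketch_fragment_counts(3) reports a split of one dirfrag into 8.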
11537
11538void MDCache::_fragment_logged(MDRequestRef& mdr)
11539{
11540 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298 11541 auto& info = fragments.at(basedirfrag);
7c673cae
FG
11542 CInode *diri = info.resultfrags.front()->get_inode();
11543
11544 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11545 << " on " << *diri << dendl;
a8e16298 11546 mdr->mark_event("prepare logged");
7c673cae
FG
11547
11548 if (diri->is_auth())
11549 diri->pop_and_dirty_projected_inode(mdr->ls);
11550
11551 mdr->apply(); // mark scatterlock
11552
11553 // store resulting frags
11554 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11555
11556 for (list<CDir*>::iterator p = info.resultfrags.begin();
11557 p != info.resultfrags.end();
11558 ++p) {
11559 CDir *dir = *p;
11560 dout(10) << " storing result frag " << *dir << dendl;
11561
11562 // freeze and store them too
11563 dir->auth_pin(this);
11564 dir->state_set(CDir::STATE_FRAGMENTING);
11565 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11566 }
11567
11568 gather.activate();
11569}
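// Illustrative sketch (not part of the original MDCache.cc): _fragment_logged()
// above commits every resulting frag and uses MDSGatherBuilder so that
// C_MDC_FragmentStore runs only after the last commit completes.  The class below
// is a simplified stand-in for that gather idea (hand out sub-callbacks, fire the
// finisher when the final one completes); the real MDSGatherBuilder layers MDS
// context plumbing on top of this, so treat the names here as hypothetical.
#include <functional>
#include <utility>
struct SketchGather {
  explicit SketchGather(std::function<void(int)> fin) : onfinish(std::move(fin)) {}
  std::function<void(int)> new_sub() {
    ++pending;                                 // one more completion to wait for
    return [this](int r) { sub_finish(r); };
  }
  void activate() {
    activated = true;
    if (pending == 0)
      onfinish(0);                             // nothing outstanding: finish now
  }
private:
  void sub_finish(int) {
    if (--pending == 0 && activated)
      onfinish(0);                             // last sub completed after activate()
  }
  std::function<void(int)> onfinish;
  int pending = 0;
  bool activated = false;
};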
11570
11571void MDCache::_fragment_stored(MDRequestRef& mdr)
11572{
11573 dirfrag_t basedirfrag = mdr->more()->fragment_base;
a8e16298
TL
11574 fragment_info_t &info = fragments.at(basedirfrag);
11575 CDir *first = info.resultfrags.front();
11576 CInode *diri = first->get_inode();
7c673cae
FG
11577
11578 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11579 << " on " << *diri << dendl;
a8e16298 11580 mdr->mark_event("new frags stored");
7c673cae
FG
11581
11582 // tell peers
a8e16298
TL
11583 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11584 diri->authority().first : CDIR_AUTH_UNKNOWN;
181888fb
FG
11585 for (const auto &p : first->get_replicas()) {
11586 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11587 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11588 rejoin_gather.count(p.first)))
7c673cae
FG
11589 continue;
11590
11fdf7f2 11591 auto notify = MMDSFragmentNotify::create(basedirfrag, info.bits, mdr->reqid.tid);
a8e16298
TL
11592 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11593 diri_auth != p.first) { // not auth mds of diri
11594 /*
11595 * In the normal case, an mds does not trim a dir inode whose child dirfrags
11596 * are likely being fragmented (see trim_inode()). But when fragmenting
11597 * subtree roots, the following race can happen:
11598 *
11599 * - mds.a (auth mds of dirfrag) sends the fragment_notify message to
11600 * mds.c and drops the wrlock on dirfragtreelock.
11601 * - mds.b (auth mds of dir inode) changes the dirfragtreelock state to
11602 * SYNC and sends a lock message to mds.c
11603 * - mds.c receives the lock message and changes its dirfragtreelock state
11604 * to SYNC
11605 * - mds.c trims the dirfrag and dir inode from its cache
11606 * - mds.c receives the fragment_notify message
11607 *
11608 * So we need to ensure replicas have received the notify, then unlock
11609 * the dirfragtreelock.
11610 */
11611 notify->mark_ack_wanted();
11612 info.notify_ack_waiting.insert(p.first);
11613 }
7c673cae
FG
11614
11615 // freshly replicate new dirs to peers
11616 for (list<CDir*>::iterator q = info.resultfrags.begin();
11617 q != info.resultfrags.end();
11618 ++q)
181888fb 11619 replicate_dir(*q, p.first, notify->basebl);
7c673cae 11620
181888fb 11621 mds->send_message_mds(notify, p.first);
7c673cae
FG
11622 }
11623
11624 // journal commit
11625 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
a8e16298 11626 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
7c673cae 11627
7c673cae
FG
11628
11629 // unfreeze resulting frags
11630 for (list<CDir*>::iterator p = info.resultfrags.begin();
11631 p != info.resultfrags.end();
11632 ++p) {
11633 CDir *dir = *p;
11634 dout(10) << " result frag " << *dir << dendl;
11635
94b18763
FG
11636 for (auto &p : dir->items) {
11637 CDentry *dn = p.second;
11fdf7f2 11638 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
7c673cae
FG
11639 dn->state_clear(CDentry::STATE_FRAGMENTING);
11640 dn->put(CDentry::PIN_FRAGMENTING);
11641 }
11642
11643 // unfreeze
11644 dir->unfreeze_dir();
11645 }
11646
a8e16298
TL
11647 if (info.notify_ack_waiting.empty()) {
11648 fragment_drop_locks(info);
11649 } else {
11650 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
11651 }
7c673cae
FG
11652}
11653
a8e16298 11654void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
7c673cae
FG
11655{
11656 dout(10) << "fragment_committed " << basedirfrag << dendl;
a8e16298
TL
11657 if (mdr)
11658 mdr->mark_event("commit logged");
11659
11660 ufragment &uf = uncommitted_fragments.at(basedirfrag);
7c673cae
FG
11661
11662 // remove old frags
11663 C_GatherBuilder gather(
11664 g_ceph_context,
11665 new C_OnFinisher(
a8e16298 11666 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
7c673cae
FG
11667 mds->finisher));
11668
11669 SnapContext nullsnapc;
11670 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
11fdf7f2
TL
11671 for (const auto& fg : uf.old_frags) {
11672 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
7c673cae 11673 ObjectOperation op;
11fdf7f2 11674 if (fg == frag_t()) {
7c673cae
FG
11675 // backtrace object
11676 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11677 op.truncate(0);
11678 op.omap_clear();
11679 } else {
11680 dout(10) << " removing orphan dirfrag " << oid << dendl;
11681 op.remove();
11682 }
11683 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11684 ceph::real_clock::now(),
11685 0, gather.new_sub());
11686 }
11687
11fdf7f2 11688 ceph_assert(gather.has_subs());
7c673cae
FG
11689 gather.activate();
11690}
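// Illustrative sketch (not part of the original MDCache.cc): _fragment_committed()
// above cleans up the on-disk objects of the replaced dirfrags.  The base object
// (frag_t()) is only truncated and omap-cleared, because it also carries the inode
// backtrace, while every other old fragment object is removed outright.  The enum
// and helper below merely restate that per-object decision with made-up names.
enum class SketchFragCleanup { TruncateAndClearOmap, Remove };
static SketchFragCleanup sketch_frag_cleanup(bool is_base_frag) {
  // base object: keep it (the backtrace lives there), just empty it;
  // any other old fragment object: delete it
  return is_base_frag ? SketchFragCleanup::TruncateAndClearOmap
                      : SketchFragCleanup::Remove;
}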
11691
a8e16298 11692void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
7c673cae 11693{
a8e16298
TL
11694 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
11695 if (mdr)
11696 mdr->mark_event("old frags purged");
11697
11698 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
11699 mds->mdlog->start_submit_entry(le);
11700
11701 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11702
11703 if (mds->logger) {
11704 if (bits > 0) {
11705 mds->logger->inc(l_mds_dir_split);
11706 } else {
11707 mds->logger->inc(l_mds_dir_merge);
11708 }
11709 }
11710
11711 if (mdr) {
11712 auto it = fragments.find(basedirfrag);
11713 ceph_assert(it != fragments.end());
11714 it->second.finishing = true;
11715 if (it->second.notify_ack_waiting.empty())
11716 fragment_maybe_finish(it);
11717 else
11718 mdr->mark_event("waiting for notify acks");
11719 }
11720}
11721
11722void MDCache::fragment_drop_locks(fragment_info_t& info)
11723{
11724 mds->locker->drop_locks(info.mdr.get());
11725 request_finish(info.mdr);
11726 //info.mdr.reset();
11727}
11728
11729void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
11730{
11731 if (!it->second.finishing)
11732 return;
7c673cae
FG
11733
11734 // unmark & auth_unpin
a8e16298 11735 for (const auto &dir : it->second.resultfrags) {
7c673cae
FG
11736 dir->state_clear(CDir::STATE_FRAGMENTING);
11737 dir->auth_unpin(this);
11738
11739 // In case the resulting fragments are beyond the split size,
11740 // we might need to split them again right away (they could
11741 // have been taking inserts between unfreezing and getting
11742 // here)
11743 mds->balancer->maybe_fragment(dir, false);
11744 }
11745
a8e16298
TL
11746 fragments.erase(it);
11747}
11748
11749
11fdf7f2 11750void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &ack)
a8e16298
TL
11751{
11752 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
11753 mds_rank_t from = mds_rank_t(ack->get_source().num());
11754
11755 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
a8e16298 11756 return;
7c673cae
FG
11757 }
11758
a8e16298
TL
11759 auto it = fragments.find(ack->get_base_dirfrag());
11760 if (it == fragments.end() ||
11761 it->second.get_tid() != ack->get_tid()) {
11762 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
a8e16298
TL
11763 return;
11764 }
7c673cae 11765
a8e16298
TL
11766 if (it->second.notify_ack_waiting.erase(from) &&
11767 it->second.notify_ack_waiting.empty()) {
11768 fragment_drop_locks(it->second);
11769 fragment_maybe_finish(it);
11770 }
7c673cae
FG
11771}
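// Illustrative sketch (not part of the original MDCache.cc): handle_fragment_notify_ack()
// above erases the acking rank from notify_ack_waiting and drops the fragment locks
// only when that set becomes empty.  The standalone helper below shows the same
// bookkeeping with plain std types; int ranks stand in for mds_rank_t.
#include <set>
static bool sketch_ack_received(std::set<int>& waiting, int from) {
  // returns true exactly once: when the last expected ack arrives
  return waiting.erase(from) && waiting.empty();
}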
11772
11fdf7f2 11773void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref &notify)
7c673cae
FG
11774{
11775 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
a8e16298 11776 mds_rank_t from = mds_rank_t(notify->get_source().num());
7c673cae
FG
11777
11778 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7c673cae
FG
11779 return;
11780 }
11781
11782 CInode *diri = get_inode(notify->get_ino());
11783 if (diri) {
11784 frag_t base = notify->get_basefrag();
11785 int bits = notify->get_bits();
11786
11787/*
11788 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11789 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11790 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11791 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
7c673cae
FG
11792 return;
11793 }
11794*/
11795
11796 // refragment
11fdf7f2 11797 MDSContext::vec waiters;
7c673cae
FG
11798 list<CDir*> resultfrags;
11799 adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
11fdf7f2 11800 if (g_conf()->mds_debug_frag)
7c673cae
FG
11801 diri->verify_dirfrags();
11802
11803 for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
11804 diri->take_dir_waiting((*p)->get_frag(), waiters);
11805
11806 // add new replica dirs values
11fdf7f2 11807 auto p = notify->basebl.cbegin();
7c673cae 11808 while (!p.end())
a8e16298 11809 add_replica_dir(p, diri, from, waiters);
7c673cae
FG
11810
11811 mds->queue_waiters(waiters);
11812 } else {
11813 ceph_abort();
11814 }
11815
a8e16298 11816 if (notify->is_ack_wanted()) {
11fdf7f2
TL
11817 auto ack = MMDSFragmentNotifyAck::create(notify->get_base_dirfrag(),
11818 notify->get_bits(), notify->get_tid());
a8e16298
TL
11819 mds->send_message_mds(ack, from);
11820 }
7c673cae
FG
11821}
11822
11fdf7f2 11823void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
7c673cae
FG
11824 LogSegment *ls, bufferlist *rollback)
11825{
11826 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
11fdf7f2 11827 ceph_assert(!uncommitted_fragments.count(basedirfrag));
7c673cae
FG
11828 ufragment& uf = uncommitted_fragments[basedirfrag];
11829 uf.old_frags = old_frags;
11830 uf.bits = bits;
11831 uf.ls = ls;
11832 ls->uncommitted_fragments.insert(basedirfrag);
11833 if (rollback)
11834 uf.rollback.swap(*rollback);
11835}
11836
11837void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
11838{
11839 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11840 << " op " << EFragment::op_name(op) << dendl;
11841 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11842 if (it != uncommitted_fragments.end()) {
11843 ufragment& uf = it->second;
11844 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
11845 uf.committed = true;
11846 } else {
11847 uf.ls->uncommitted_fragments.erase(basedirfrag);
11848 mds->queue_waiters(uf.waiters);
11849 uncommitted_fragments.erase(it);
11850 }
11851 }
11852}
11853
11fdf7f2 11854void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
7c673cae
FG
11855{
11856 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11857 << " old_frags (" << old_frags << ")" << dendl;
11858 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
11859 if (it != uncommitted_fragments.end()) {
11860 ufragment& uf = it->second;
11861 if (!uf.old_frags.empty()) {
11fdf7f2 11862 uf.old_frags = std::move(old_frags);
7c673cae
FG
11863 uf.committed = true;
11864 } else {
11865 uf.ls->uncommitted_fragments.erase(basedirfrag);
11866 uncommitted_fragments.erase(it);
11867 }
11868 }
11869}
11870
11871void MDCache::rollback_uncommitted_fragments()
11872{
11873 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
11874 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
11875 p != uncommitted_fragments.end();
11876 ++p) {
11877 ufragment &uf = p->second;
11878 CInode *diri = get_inode(p->first.ino);
11fdf7f2 11879 ceph_assert(diri);
7c673cae
FG
11880
11881 if (uf.committed) {
a8e16298 11882 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
11883 continue;
11884 }
11885
11886 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
11887
11888 LogSegment *ls = mds->mdlog->get_current_segment();
11889 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
11890 mds->mdlog->start_entry(le);
11891 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
11892
11fdf7f2 11893 frag_vec_t old_frags;
7c673cae
FG
11894 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
11895
11896 list<CDir*> resultfrags;
11897 if (uf.old_frags.empty()) {
11898 // created by old format EFragment
11fdf7f2 11899 MDSContext::vec waiters;
7c673cae
FG
11900 adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
11901 } else {
11fdf7f2
TL
11902 auto bp = uf.rollback.cbegin();
11903 for (const auto& fg : uf.old_frags) {
11904 CDir *dir = force_dir_fragment(diri, fg);
7c673cae
FG
11905 resultfrags.push_back(dir);
11906
11907 dirfrag_rollback rollback;
11fdf7f2 11908 decode(rollback, bp);
7c673cae
FG
11909
11910 dir->set_version(rollback.fnode.version);
11911 dir->fnode = rollback.fnode;
11912
11913 dir->_mark_dirty(ls);
11914
11915 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
11916 dout(10) << " dirty nestinfo on " << *dir << dendl;
11917 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
11918 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
11919 }
11920 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
11921 dout(10) << " dirty fragstat on " << *dir << dendl;
11922 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
11923 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
11924 }
11925
11926 le->add_orig_frag(dir->get_frag());
11927 le->metablob.add_dir_context(dir);
11928 if (diri_auth) {
11929 le->metablob.add_fragmented_dir(dir, true, false);
11930 } else {
11931 dout(10) << " dirty dirfragtree on " << *dir << dendl;
11932 dir->state_set(CDir::STATE_DIRTYDFT);
11933 le->metablob.add_fragmented_dir(dir, true, true);
11934 }
11935 }
11936 }
11937
11938 if (diri_auth) {
94b18763
FG
11939 auto &pi = diri->project_inode();
11940 pi.inode.version = diri->pre_dirty();
7c673cae
FG
11941 diri->pop_and_dirty_projected_inode(ls); // hacky
11942 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
11943 } else {
11944 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11945 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11946 }
11947
11fdf7f2 11948 if (g_conf()->mds_debug_frag)
7c673cae
FG
11949 diri->verify_dirfrags();
11950
11fdf7f2
TL
11951 for (const auto& leaf : old_frags) {
11952 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
11953 }
7c673cae 11954
7c673cae
FG
11955 mds->mdlog->submit_entry(le);
11956
11957 uf.old_frags.swap(old_frags);
a8e16298 11958 _fragment_committed(p->first, MDRequestRef());
7c673cae
FG
11959 }
11960}
11961
11962void MDCache::force_readonly()
11963{
11964 if (is_readonly())
11965 return;
11966
11967 dout(1) << "force file system read-only" << dendl;
11968 mds->clog->warn() << "force file system read-only";
11969
11970 set_readonly();
11971
11972 mds->server->force_clients_readonly();
11973
11974 // revoke write caps
81eedcae 11975 int count = 0;
94b18763 11976 for (auto &p : inode_map) {
b32b8144 11977 CInode *in = p.second;
7c673cae
FG
11978 if (in->is_head())
11979 mds->locker->eval(in, CEPH_CAP_LOCKS);
81eedcae
TL
11980 if (!(++count % 1000))
11981 mds->heartbeat_reset();
7c673cae
FG
11982 }
11983
11984 mds->mdlog->flush();
11985}
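// Illustrative sketch (not part of the original MDCache.cc): force_readonly() above
// walks every inode and calls heartbeat_reset() every 1000 iterations so the long
// cache walk does not trip the MDS internal heartbeat.  The template below shows
// that "do periodic upkeep every N items" shape with a placeholder callback in
// place of the real heartbeat call; all names here are invented for this note.
#include <cstddef>
#include <functional>
#include <vector>
template <typename T>
static void sketch_walk_with_heartbeat(const std::vector<T>& items,
                                       const std::function<void(const T&)>& work,
                                       const std::function<void()>& heartbeat,
                                       std::size_t every = 1000) {
  std::size_t count = 0;
  for (const auto& item : items) {
    work(item);
    if (!(++count % every))
      heartbeat();   // keep the watchdog happy during a long walk
  }
}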
11986
11987
11988// ==============================================================
11989// debug crap
11990
81eedcae 11991void MDCache::show_subtrees(int dbl, bool force_print)
7c673cae 11992{
11fdf7f2 11993 if (g_conf()->mds_thrash_exports)
7c673cae
FG
11994 dbl += 15;
11995
11996 //dout(10) << "show_subtrees" << dendl;
11997
11fdf7f2 11998 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
7c673cae
FG
11999 return; // i won't print anything.
12000
12001 if (subtrees.empty()) {
11fdf7f2
TL
12002 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12003 << dendl;
7c673cae
FG
12004 return;
12005 }
12006
81eedcae
TL
12007 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12008 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12009 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12010 "printing subtrees" << dendl;
12011 return;
12012 }
12013
7c673cae
FG
12014 // root frags
12015 list<CDir*> basefrags;
12016 for (set<CInode*>::iterator p = base_inodes.begin();
12017 p != base_inodes.end();
12018 ++p)
12019 (*p)->get_dirfrags(basefrags);
12020 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12021 dout(15) << "show_subtrees" << dendl;
12022
12023 // queue stuff
12024 list<pair<CDir*,int> > q;
12025 string indent;
12026 set<CDir*> seen;
12027
12028 // calc max depth
12029 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
12030 q.push_back(pair<CDir*,int>(*p, 0));
12031
12032 set<CDir*> subtrees_seen;
12033
81eedcae 12034 unsigned int depth = 0;
7c673cae
FG
12035 while (!q.empty()) {
12036 CDir *dir = q.front().first;
81eedcae 12037 unsigned int d = q.front().second;
7c673cae
FG
12038 q.pop_front();
12039
12040 if (subtrees.count(dir) == 0) continue;
12041
12042 subtrees_seen.insert(dir);
12043
12044 if (d > depth) depth = d;
12045
12046 // sanity check
12047 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12048 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
11fdf7f2 12049 ceph_assert(seen.count(dir) == 0);
7c673cae
FG
12050 seen.insert(dir);
12051
12052 // nested items?
12053 if (!subtrees[dir].empty()) {
12054 for (set<CDir*>::iterator p = subtrees[dir].begin();
12055 p != subtrees[dir].end();
12056 ++p) {
12057 //dout(25) << " saw sub " << **p << dendl;
12058 q.push_front(pair<CDir*,int>(*p, d+1));
12059 }
12060 }
12061 }
12062
81eedcae
TL
12063 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12064 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12065 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12066 "subtrees" << dendl;
12067 return;
12068 }
7c673cae
FG
12069
12070 // print tree
12071 for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
12072 q.push_back(pair<CDir*,int>(*p, 0));
12073
12074 while (!q.empty()) {
12075 CDir *dir = q.front().first;
12076 int d = q.front().second;
12077 q.pop_front();
12078
12079 if (subtrees.count(dir) == 0) continue;
12080
12081 // adjust indenter
12082 while ((unsigned)d < indent.size())
12083 indent.resize(d);
12084
12085 // pad
12086 string pad = "______________________________________";
12087 pad.resize(depth*2+1-indent.size());
12088 if (!subtrees[dir].empty())
12089 pad[0] = '.'; // parent
12090
12091
12092 string auth;
12093 if (dir->is_auth())
12094 auth = "auth ";
12095 else
12096 auth = " rep ";
12097
12098 char s[10];
12099 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12100 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12101 else
12102 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12103
12104 // print
11fdf7f2
TL
12105 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12106 << " " << auth << *dir << dendl;
7c673cae
FG
12107
12108 if (dir->ino() == MDS_INO_ROOT)
11fdf7f2 12109 ceph_assert(dir->inode == root);
7c673cae 12110 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
11fdf7f2 12111 ceph_assert(dir->inode == myin);
7c673cae 12112 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
11fdf7f2 12113 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
7c673cae
FG
12114
12115 // nested items?
12116 if (!subtrees[dir].empty()) {
12117 // more at my level?
12118 if (!q.empty() && q.front().second == d)
12119 indent += "| ";
12120 else
12121 indent += " ";
12122
12123 for (set<CDir*>::iterator p = subtrees[dir].begin();
12124 p != subtrees[dir].end();
12125 ++p)
12126 q.push_front(pair<CDir*,int>(*p, d+2));
12127 }
12128 }
12129
12130 // verify there isn't stray crap in subtree map
12131 int lost = 0;
12132 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12133 p != subtrees.end();
12134 ++p) {
12135 if (subtrees_seen.count(p->first)) continue;
12136 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12137 lost++;
12138 }
11fdf7f2 12139 ceph_assert(lost == 0);
7c673cae
FG
12140}
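// Illustrative sketch (not part of the original MDCache.cc): show_subtrees() above
// makes two passes over the subtree map, one to find the maximum nesting depth (so
// the padding lines up) and one to print each subtree root indented under its
// parent.  The recursive helper below reproduces only the indent-by-depth output
// for a toy tree of strings; it is meant to show the traversal shape, not the real
// CDir/auth bookkeeping, and its names are made up.
#include <iostream>
#include <map>
#include <set>
#include <string>
static void sketch_show_subtrees(const std::map<std::string, std::set<std::string>>& subtrees,
                                 const std::string& root, int depth = 0) {
  std::cout << std::string(depth * 2, ' ') << "|_" << root << '\n';
  auto it = subtrees.find(root);
  if (it == subtrees.end())
    return;
  for (const auto& child : it->second)
    sketch_show_subtrees(subtrees, child, depth + 1);
}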
12141
7c673cae
FG
12142void MDCache::show_cache()
12143{
12144 dout(7) << "show_cache" << dendl;
b32b8144
FG
12145
12146 auto show_func = [this](CInode *in) {
7c673cae 12147 // unlinked?
b32b8144
FG
12148 if (!in->parent)
12149 dout(7) << " unlinked " << *in << dendl;
12150
7c673cae
FG
12151 // dirfrags?
12152 list<CDir*> dfs;
b32b8144 12153 in->get_dirfrags(dfs);
7c673cae
FG
12154 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
12155 CDir *dir = *p;
12156 dout(7) << " dirfrag " << *dir << dendl;
b32b8144 12157
94b18763
FG
12158 for (auto &p : dir->items) {
12159 CDentry *dn = p.second;
7c673cae
FG
12160 dout(7) << " dentry " << *dn << dendl;
12161 CDentry::linkage_t *dnl = dn->get_linkage();
12162 if (dnl->is_primary() && dnl->get_inode())
12163 dout(7) << " inode " << *dnl->get_inode() << dendl;
12164 }
12165 }
b32b8144
FG
12166 };
12167
94b18763 12168 for (auto &p : inode_map)
b32b8144 12169 show_func(p.second);
94b18763 12170 for (auto &p : snap_inode_map)
b32b8144 12171 show_func(p.second);
7c673cae
FG
12172}
12173
f64942e4 12174void MDCache::cache_status(Formatter *f)
181888fb
FG
12175{
12176 f->open_object_section("cache");
12177
12178 f->open_object_section("pool");
12179 mempool::get_pool(mempool::mds_co::id).dump(f);
12180 f->close_section();
12181
12182 f->close_section();
181888fb
FG
12183}
12184
11fdf7f2 12185void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
7c673cae 12186{
11fdf7f2
TL
12187 ceph_assert(in);
12188 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12189 return;
12190 }
12191 list<CDir*> ls;
12192 in->get_dirfrags(ls);
12193 for (const auto &subdir : ls) {
12194 for (const auto &p : subdir->items) {
12195 CDentry *dn = p.second;
12196 CInode *in = dn->get_linkage()->get_inode();
12197 if (in) {
12198 dump_tree(in, cur_depth + 1, max_depth, f);
12199 }
12200 }
12201 }
12202 f->open_object_section("inode");
12203 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12204 f->close_section();
7c673cae
FG
12205}
12206
11fdf7f2 12207int MDCache::dump_cache(std::string_view file_name)
7c673cae 12208{
11fdf7f2 12209 return dump_cache(file_name, NULL);
7c673cae
FG
12210}
12211
11fdf7f2 12212int MDCache::dump_cache(Formatter *f)
7c673cae 12213{
11fdf7f2 12214 return dump_cache(std::string_view(""), f);
7c673cae
FG
12215}
12216
12217/**
12218 * Dump the metadata cache, either to a Formatter, if
12219 * provided, else to a plain text file.
12220 */
11fdf7f2 12221int MDCache::dump_cache(std::string_view fn, Formatter *f)
7c673cae
FG
12222{
12223 int r = 0;
f64942e4
AA
12224
12225 // dumping large caches may cause the mds to hang or, worse, get killed.
12226 // so, disallow the dump if the cache size exceeds the configured
12227 // threshold, which is 1G for formatter and unlimited for file (note
12228 // that this can be jacked up by the admin... and is nothing but foot
12229 // shooting, but the option itself is for devs and hence dangerous to
12230 // tune). TODO: remove this when fixed.
12231 uint64_t threshold = f ?
11fdf7f2
TL
12232 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12233 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
f64942e4
AA
12234
12235 if (threshold && cache_size() > threshold) {
12236 if (f) {
12237 std::stringstream ss;
12238 ss << "cache usage exceeds dump threshold";
12239 f->open_object_section("result");
12240 f->dump_string("error", ss.str());
12241 f->close_section();
12242 } else {
12243 derr << "cache usage exceeds dump threshold" << dendl;
12244 r = -EINVAL;
12245 }
12246 return r;
12247 }
12248
12249 r = 0;
7c673cae
FG
12250 int fd = -1;
12251
12252 if (f) {
12253 f->open_array_section("inodes");
12254 } else {
94b18763
FG
12255 char path[PATH_MAX] = "";
12256 if (fn.length()) {
12257 snprintf(path, sizeof path, "%s", fn.data());
12258 } else {
12259 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
7c673cae
FG
12260 }
12261
94b18763 12262 dout(1) << "dump_cache to " << path << dendl;
7c673cae 12263
91327a77 12264 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
7c673cae 12265 if (fd < 0) {
94b18763 12266 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
31f18b77 12267 return errno;
7c673cae
FG
12268 }
12269 }
12270
11fdf7f2 12271 auto dump_func = [fd, f](CInode *in) {
b32b8144 12272 int r;
7c673cae
FG
12273 if (f) {
12274 f->open_object_section("inode");
11fdf7f2
TL
12275 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12276 f->close_section();
12277 return 1;
12278 }
12279 ostringstream ss;
12280 ss << *in << std::endl;
12281 std::string s = ss.str();
12282 r = safe_write(fd, s.c_str(), s.length());
12283 if (r < 0)
12284 return r;
7c673cae
FG
12285 list<CDir*> dfs;
12286 in->get_dirfrags(dfs);
11fdf7f2
TL
12287 for (auto &dir : dfs) {
12288 ostringstream tt;
12289 tt << " " << *dir << std::endl;
12290 std::string t = tt.str();
12291 r = safe_write(fd, t.c_str(), t.length());
12292 if (r < 0)
12293 return r;
94b18763
FG
12294 for (auto &p : dir->items) {
12295 CDentry *dn = p.second;
11fdf7f2
TL
12296 ostringstream uu;
12297 uu << " " << *dn << std::endl;
12298 std::string u = uu.str();
12299 r = safe_write(fd, u.c_str(), u.length());
12300 if (r < 0)
12301 return r;
7c673cae
FG
12302 }
12303 dir->check_rstats();
7c673cae 12304 }
b32b8144
FG
12305 return 1;
12306 };
12307
94b18763 12308 for (auto &p : inode_map) {
b32b8144
FG
12309 r = dump_func(p.second);
12310 if (r < 0)
12311 goto out;
12312 }
94b18763 12313 for (auto &p : snap_inode_map) {
b32b8144
FG
12314 r = dump_func(p.second);
12315 if (r < 0)
12316 goto out;
7c673cae 12317 }
b32b8144 12318 r = 0;
7c673cae
FG
12319
12320 out:
12321 if (f) {
12322 f->close_section(); // inodes
12323 } else {
12324 ::close(fd);
12325 }
31f18b77 12326 return r;
7c673cae
FG
12327}
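// Illustrative sketch (not part of the original MDCache.cc): dump_cache() above
// refuses to dump when the cache exceeds the configured threshold
// (mds_dump_cache_threshold_formatter / mds_dump_cache_threshold_file), since
// dumping a huge cache can stall the MDS.  The helper below restates only that
// guard; a threshold of 0 means "no limit", matching the check in the code above.
#include <cstdint>
static bool sketch_dump_allowed(uint64_t cache_size, uint64_t threshold) {
  return threshold == 0 || cache_size <= threshold;
}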
12328
12329
12330
12331C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12332 : MDSInternalContext(c->mds), cache(c), mdr(r)
12333{}
12334
12335void C_MDS_RetryRequest::finish(int r)
12336{
12337 mdr->retry++;
12338 cache->dispatch_request(mdr);
12339}
12340
12341
12342class C_MDS_EnqueueScrub : public Context
12343{
11fdf7f2 12344 std::string tag;
7c673cae
FG
12345 Formatter *formatter;
12346 Context *on_finish;
12347public:
12348 ScrubHeaderRef header;
11fdf7f2
TL
12349 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12350 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
7c673cae
FG
12351
12352 Context *take_finisher() {
12353 Context *fin = on_finish;
12354 on_finish = NULL;
12355 return fin;
12356 }
12357
12358 void finish(int r) override {
11fdf7f2
TL
12359 if (r == 0) {
12360 // since recursive scrub is asynchronous, dump minimal output
12361 // to not upset cli tools.
12362 if (header && header->get_recursive()) {
12363 formatter->open_object_section("results");
12364 formatter->dump_int("return_code", 0);
12365 formatter->dump_string("scrub_tag", tag);
12366 formatter->dump_string("mode", "asynchronous");
12367 formatter->close_section(); // results
12368 }
12369 } else { // we failed the lookup or something; dump ourselves
7c673cae
FG
12370 formatter->open_object_section("results");
12371 formatter->dump_int("return_code", r);
12372 formatter->close_section(); // results
11fdf7f2 12373 r = 0; // already dumped in formatter
7c673cae
FG
12374 }
12375 if (on_finish)
12376 on_finish->complete(r);
12377 }
12378};
12379
12380void MDCache::enqueue_scrub(
11fdf7f2
TL
12381 std::string_view path,
12382 std::string_view tag,
7c673cae
FG
12383 bool force, bool recursive, bool repair,
12384 Formatter *f, Context *fin)
12385{
11fdf7f2 12386 dout(10) << __func__ << " " << path << dendl;
7c673cae 12387 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
11fdf7f2
TL
12388 if (path == "~mdsdir") {
12389 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12390 mdr->set_filepath(fp);
12391 } else {
12392 filepath fp(path);
12393 mdr->set_filepath(path);
12394 }
12395
12396 bool is_internal = false;
12397 std::string tag_str(tag);
12398 if (tag_str.empty()) {
12399 uuid_d uuid_gen;
12400 uuid_gen.generate_random();
12401 tag_str = uuid_gen.to_string();
12402 is_internal = true;
12403 }
7c673cae 12404
11fdf7f2 12405 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
7c673cae 12406 cs->header = std::make_shared<ScrubHeader>(
11fdf7f2 12407 tag_str, is_internal, force, recursive, repair, f);
7c673cae
FG
12408
12409 mdr->internal_op_finish = cs;
12410 enqueue_scrub_work(mdr);
12411}
12412
12413void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12414{
11fdf7f2
TL
12415 MutationImpl::LockOpVec lov;
12416 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
7c673cae
FG
12417 if (NULL == in)
12418 return;
12419
12420 // TODO: Remove this restriction
11fdf7f2 12421 ceph_assert(in->is_auth());
7c673cae 12422
11fdf7f2 12423 bool locked = mds->locker->acquire_locks(mdr, lov);
7c673cae
FG
12424 if (!locked)
12425 return;
12426
12427 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
11fdf7f2 12428 ScrubHeaderRef header = cs->header;
7c673cae
FG
12429
12430 // Cannot scrub same dentry twice at same time
11fdf7f2 12431 if (in->scrub_is_in_progress()) {
7c673cae
FG
12432 mds->server->respond_to_request(mdr, -EBUSY);
12433 return;
12434 } else {
12435 in->scrub_info();
12436 }
12437
12438 header->set_origin(in);
12439
11fdf7f2
TL
12440 Context *fin;
12441 if (header->get_recursive()) {
12442 header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
12443 fin = new MDSInternalContextWrapper(mds,
12444 new FunctionContext([this, header](int r) {
12445 recursive_scrub_finish(header);
12446 header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
12447 })
12448 );
12449 } else {
b32b8144
FG
12450 fin = cs->take_finisher();
12451 }
12452
12453 // If the scrub did some repair, then flush the journal at the end of
12454 // the scrub. Otherwise, in the case of e.g. rewriting a backtrace,
12455 // the on-disk state will still look damaged.
28e407b8
AA
12456 auto scrub_finish = new FunctionContext([this, header, fin](int r){
12457 if (!header->get_repaired()) {
12458 if (fin)
12459 fin->complete(r);
12460 return;
12461 }
12462
12463 auto flush_finish = new FunctionContext([this, fin](int r){
12464 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12465 mds->mdlog->trim_all();
12466
12467 if (fin) {
12468 MDSGatherBuilder gather(g_ceph_context);
12469 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12470 for (auto logseg : expiring_segments)
12471 logseg->wait_for_expiry(gather.new_sub());
11fdf7f2 12472 ceph_assert(gather.has_subs());
28e407b8
AA
12473 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12474 gather.activate();
b32b8144 12475 }
28e407b8
AA
12476 });
12477
12478 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12479 mds->mdlog->start_new_segment();
12480 mds->mdlog->flush();
12481 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
b32b8144
FG
12482 });
12483
7c673cae 12484 if (!header->get_recursive()) {
7c673cae 12485 mds->scrubstack->enqueue_inode_top(in, header,
28e407b8 12486 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144
FG
12487 } else {
12488 mds->scrubstack->enqueue_inode_bottom(in, header,
28e407b8 12489 new MDSInternalContextWrapper(mds, scrub_finish));
b32b8144 12490 }
7c673cae
FG
12491
12492 mds->server->respond_to_request(mdr, 0);
12493 return;
12494}
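// Illustrative sketch (not part of the original MDCache.cc): the scrub_finish lambda
// in enqueue_scrub_work() above conditionally inserts an extra asynchronous step --
// if the scrub repaired anything, flush and trim the journal first, and only then
// complete the original finisher.  The sketch below shows that shape with
// std::function in place of the MDS context classes; all names are hypothetical.
#include <functional>
static void sketch_scrub_finish(bool repaired,
                                const std::function<void(std::function<void(int)>)>& flush_journal,
                                const std::function<void(int)>& fin,
                                int r) {
  if (!repaired) {
    fin(r);   // nothing was repaired: complete the finisher immediately
    return;
  }
  // repaired: flush the journal first, then complete the original finisher
  flush_journal([fin](int flush_r) { fin(flush_r); });
}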
12495
11fdf7f2
TL
12496void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12497{
12498 if (header->get_origin()->is_base() &&
12499 header->get_force() && header->get_repair()) {
12500 // notify snapserver that base directory is recursively scrubbed.
12501 // After both root and mdsdir are recursively scrubbed, snapserver
12502 // knows that all old format snaprealms are converted to the new
12503 // format.
12504 if (mds->mdsmap->get_num_in_mds() == 1 &&
12505 mds->mdsmap->get_num_failed_mds() == 0 &&
12506 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12507 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12508 }
12509 }
12510}
12511
12512struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
7c673cae 12513 MDRequestRef mdr;
11fdf7f2 12514 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
7c673cae
FG
12515 MDCacheLogContext(c), mdr(m) {}
12516 void finish(int r) override {
12517 mdr->apply();
12518 get_mds()->server->respond_to_request(mdr, r);
12519 }
12520};
12521
12522void MDCache::repair_dirfrag_stats(CDir *dir)
12523{
12524 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12525 mdr->pin(dir);
12526 mdr->internal_op_private = dir;
12527 mdr->internal_op_finish = new C_MDSInternalNoop;
12528 repair_dirfrag_stats_work(mdr);
12529}
12530
12531void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12532{
12533 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12534 dout(10) << __func__ << " " << *dir << dendl;
12535
12536 if (!dir->is_auth()) {
12537 mds->server->respond_to_request(mdr, -ESTALE);
12538 return;
12539 }
12540
12541 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
224ce89b
WB
12542 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12543
7c673cae
FG
12544 mds->locker->drop_locks(mdr.get());
12545 mdr->drop_local_auth_pins();
224ce89b
WB
12546 if (!mdr->remote_auth_pins.empty())
12547 mds->locker->notify_freeze_waiter(dir);
7c673cae
FG
12548 return;
12549 }
12550
12551 mdr->auth_pin(dir);
12552
11fdf7f2 12553 MutationImpl::LockOpVec lov;
7c673cae 12554 CInode *diri = dir->inode;
11fdf7f2
TL
12555 lov.add_rdlock(&diri->dirfragtreelock);
12556 lov.add_wrlock(&diri->nestlock);
12557 lov.add_wrlock(&diri->filelock);
12558 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12559 return;
12560
12561 if (!dir->is_complete()) {
12562 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12563 return;
12564 }
12565
12566 frag_info_t frag_info;
12567 nest_info_t nest_info;
94b18763 12568 for (auto it = dir->begin(); it != dir->end(); ++it) {
7c673cae
FG
12569 CDentry *dn = it->second;
12570 if (dn->last != CEPH_NOSNAP)
12571 continue;
12572 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12573 if (dnl->is_primary()) {
12574 CInode *in = dnl->get_inode();
12575 nest_info.add(in->get_projected_inode()->accounted_rstat);
12576 if (in->is_dir())
12577 frag_info.nsubdirs++;
12578 else
12579 frag_info.nfiles++;
12580 } else if (dnl->is_remote())
12581 frag_info.nfiles++;
12582 }
12583
12584 fnode_t *pf = dir->get_projected_fnode();
12585 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12586 bool good_rstat = nest_info.same_sums(pf->rstat);
12587 if (good_fragstat && good_rstat) {
12588 dout(10) << __func__ << " no corruption found" << dendl;
12589 mds->server->respond_to_request(mdr, 0);
12590 return;
12591 }
12592
12593 pf = dir->project_fnode();
12594 pf->version = dir->pre_dirty();
12595 mdr->add_projected_fnode(dir);
12596
12597 mdr->ls = mds->mdlog->get_current_segment();
12598 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12599 mds->mdlog->start_entry(le);
12600
12601 if (!good_fragstat) {
12602 if (pf->fragstat.mtime > frag_info.mtime)
12603 frag_info.mtime = pf->fragstat.mtime;
12604 if (pf->fragstat.change_attr > frag_info.change_attr)
12605 frag_info.change_attr = pf->fragstat.change_attr;
12606 pf->fragstat = frag_info;
12607 mds->locker->mark_updated_scatterlock(&diri->filelock);
12608 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12609 mdr->add_updated_lock(&diri->filelock);
12610 }
12611
12612 if (!good_rstat) {
12613 if (pf->rstat.rctime > nest_info.rctime)
12614 nest_info.rctime = pf->rstat.rctime;
12615 pf->rstat = nest_info;
12616 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12617 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12618 mdr->add_updated_lock(&diri->nestlock);
12619 }
12620
12621 le->metablob.add_dir_context(dir);
12622 le->metablob.add_dir(dir, true);
12623
11fdf7f2 12624 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
7c673cae
FG
12625}
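// Illustrative sketch (not part of the original MDCache.cc): repair_dirfrag_stats_work()
// above recomputes fragstat by scanning the head (non-snapshot) dentries: primary
// links to directories count toward nsubdirs, primary links to files and remote
// links count toward nfiles, and the result is compared against the stored fragstat.
// The toy recount below captures that rule with plain structs standing in for
// CDentry and frag_info_t; the names are invented for this note.
#include <cstdint>
#include <vector>
struct SketchDentry { bool is_primary; bool is_remote; bool inode_is_dir; };
struct SketchFragInfo { int64_t nfiles = 0; int64_t nsubdirs = 0; };
static SketchFragInfo sketch_recount(const std::vector<SketchDentry>& dentries) {
  SketchFragInfo out;
  for (const auto& dn : dentries) {
    if (dn.is_primary) {
      if (dn.inode_is_dir)
        ++out.nsubdirs;
      else
        ++out.nfiles;
    } else if (dn.is_remote) {
      ++out.nfiles;   // remote links are counted as files, matching the loop above
    }
    // null dentries contribute nothing
  }
  return out;
}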
12626
12627void MDCache::repair_inode_stats(CInode *diri)
12628{
12629 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12630 mdr->pin(diri);
12631 mdr->internal_op_private = diri;
12632 mdr->internal_op_finish = new C_MDSInternalNoop;
12633 repair_inode_stats_work(mdr);
12634}
12635
12636void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12637{
12638 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12639 dout(10) << __func__ << " " << *diri << dendl;
12640
12641 if (!diri->is_auth()) {
12642 mds->server->respond_to_request(mdr, -ESTALE);
12643 return;
12644 }
12645 if (!diri->is_dir()) {
12646 mds->server->respond_to_request(mdr, -ENOTDIR);
12647 return;
12648 }
12649
11fdf7f2 12650 MutationImpl::LockOpVec lov;
7c673cae
FG
12651
12652 if (mdr->ls) // already marked filelock/nestlock dirty ?
12653 goto do_rdlocks;
12654
11fdf7f2
TL
12655 lov.add_rdlock(&diri->dirfragtreelock);
12656 lov.add_wrlock(&diri->nestlock);
12657 lov.add_wrlock(&diri->filelock);
12658 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12659 return;
12660
12661 // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
12662 // the scatter-gather process, which will fix any fragstat/rstat errors.
11fdf7f2
TL
12663 {
12664 frag_vec_t leaves;
12665 diri->dirfragtree.get_leaves(leaves);
12666 for (const auto& leaf : leaves) {
12667 CDir *dir = diri->get_dirfrag(leaf);
12668 if (!dir) {
12669 ceph_assert(mdr->is_auth_pinned(diri));
12670 dir = diri->get_or_open_dirfrag(this, leaf);
12671 }
12672 if (dir->get_version() == 0) {
12673 ceph_assert(dir->is_auth());
12674 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12675 return;
12676 }
7c673cae
FG
12677 }
12678 }
12679
12680 diri->state_set(CInode::STATE_REPAIRSTATS);
12681 mdr->ls = mds->mdlog->get_current_segment();
12682 mds->locker->mark_updated_scatterlock(&diri->filelock);
12683 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12684 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12685 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12686
12687 mds->locker->drop_locks(mdr.get());
12688
12689do_rdlocks:
12690 // force the scatter-gather process
11fdf7f2
TL
12691 lov.clear();
12692 lov.add_rdlock(&diri->dirfragtreelock);
12693 lov.add_rdlock(&diri->nestlock);
12694 lov.add_rdlock(&diri->filelock);
12695 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
12696 return;
12697
12698 diri->state_clear(CInode::STATE_REPAIRSTATS);
12699
12700 frag_info_t dir_info;
12701 nest_info_t nest_info;
11fdf7f2
TL
12702 nest_info.rsubdirs = 1; // it gets one to account for self
12703 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
12704 nest_info.rsnaps = srnode->snaps.size();
7c673cae 12705
11fdf7f2
TL
12706 {
12707 frag_vec_t leaves;
12708 diri->dirfragtree.get_leaves(leaves);
12709 for (const auto& leaf : leaves) {
12710 CDir *dir = diri->get_dirfrag(leaf);
12711 ceph_assert(dir);
12712 ceph_assert(dir->get_version() > 0);
12713 dir_info.add(dir->fnode.accounted_fragstat);
12714 nest_info.add(dir->fnode.accounted_rstat);
12715 }
7c673cae
FG
12716 }
12717
12718 if (!dir_info.same_sums(diri->inode.dirstat) ||
12719 !nest_info.same_sums(diri->inode.rstat)) {
12720 dout(10) << __func__ << " failed to fix fragstat/rstat on "
12721 << *diri << dendl;
12722 }
12723
12724 mds->server->respond_to_request(mdr, 0);
12725}
12726
11fdf7f2
TL
12727void MDCache::upgrade_inode_snaprealm(CInode *in)
12728{
12729 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
12730 mdr->pin(in);
12731 mdr->internal_op_private = in;
12732 mdr->internal_op_finish = new C_MDSInternalNoop;
12733 upgrade_inode_snaprealm_work(mdr);
12734}
12735
12736void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
12737{
12738 CInode *in = static_cast<CInode*>(mdr->internal_op_private);
12739 dout(10) << __func__ << " " << *in << dendl;
12740
12741 if (!in->is_auth()) {
12742 mds->server->respond_to_request(mdr, -ESTALE);
12743 return;
12744 }
12745
12746 MutationImpl::LockOpVec lov;
12747 mds->locker->include_snap_rdlocks(in, lov);
12748 lov.erase_rdlock(&in->snaplock);
12749 lov.add_xlock(&in->snaplock);
12750
12751 if (!mds->locker->acquire_locks(mdr, lov))
12752 return;
12753
12754 // project_snaprealm() upgrades snaprealm format
12755 auto &pi = in->project_inode(false, true);
12756 mdr->add_projected_inode(in);
12757 pi.inode.version = in->pre_dirty();
12758
12759 mdr->ls = mds->mdlog->get_current_segment();
12760 EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
12761 mds->mdlog->start_entry(le);
12762
12763 if (in->is_base()) {
12764 le->metablob.add_root(true, in);
12765 } else {
12766 CDentry *pdn = in->get_projected_parent_dn();
12767 le->metablob.add_dir_context(pdn->get_dir());
12768 le->metablob.add_primary_dentry(pdn, in, true);
12769 }
12770
12771 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
12772}
12773
12774void MDCache::flush_dentry(std::string_view path, Context *fin)
7c673cae
FG
12775{
12776 if (is_readonly()) {
12777 dout(10) << __func__ << ": read-only FS" << dendl;
12778 fin->complete(-EROFS);
12779 return;
12780 }
12781 dout(10) << "flush_dentry " << path << dendl;
12782 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
94b18763 12783 filepath fp(path);
7c673cae
FG
12784 mdr->set_filepath(fp);
12785 mdr->internal_op_finish = fin;
12786 flush_dentry_work(mdr);
12787}
12788
11fdf7f2 12789class C_FinishIOMDR : public MDSContext {
7c673cae
FG
12790protected:
12791 MDSRank *mds;
12792 MDRequestRef mdr;
12793 MDSRank *get_mds() override { return mds; }
12794public:
12795 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
12796 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
12797};
12798
12799void MDCache::flush_dentry_work(MDRequestRef& mdr)
12800{
11fdf7f2
TL
12801 MutationImpl::LockOpVec lov;
12802 CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
7c673cae
FG
12803 if (NULL == in)
12804 return;
12805
12806 // TODO: Is this necessary? Fix it if so
11fdf7f2
TL
12807 ceph_assert(in->is_auth());
12808 bool locked = mds->locker->acquire_locks(mdr, lov);
7c673cae
FG
12809 if (!locked)
12810 return;
12811 in->flush(new C_FinishIOMDR(mds, mdr));
12812}
12813
12814
12815/**
12816 * Initialize performance counters with global perfcounter
12817 * collection.
12818 */
12819void MDCache::register_perfcounters()
12820{
91327a77
AA
12821 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
12822
12823 // Stray/purge statistics
12824 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
12825 PerfCountersBuilder::PRIO_INTERESTING);
12826 pcb.add_u64(l_mdc_num_recovering_enqueued,
12827 "num_recovering_enqueued", "Files waiting for recovery", "recy",
12828 PerfCountersBuilder::PRIO_INTERESTING);
12829 pcb.add_u64_counter(l_mdc_recovery_completed,
12830 "recovery_completed", "File recoveries completed", "recd",
12831 PerfCountersBuilder::PRIO_INTERESTING);
12832
12833 // useful recovery queue statistics
12834 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
12835 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
12836 "Files currently being recovered");
12837 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
12838 "Files waiting for recovery with elevated priority");
12839 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
12840 "File recoveries started");
12841
12842 // along with other stray dentries stats
12843 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
12844 "Stray dentries delayed");
12845 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
12846 "Stray dentries enqueuing for purge");
12847 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
12848 "Stray dentries created");
7c673cae 12849 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
91327a77
AA
12850 "Stray dentries enqueued for purge");
12851 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
12852 "Stray dentries reintegrated");
12853 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
12854 "Stray dentries migrated");
7c673cae 12855
91327a77 12856 // low prio internal request stats
d2e6a577 12857 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
91327a77 12858 "Internal Request type enqueue scrub");
d2e6a577 12859 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
91327a77 12860 "Internal Request type export dir");
d2e6a577 12861 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
91327a77 12862 "Internal Request type flush");
d2e6a577 12863 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
91327a77 12864 "Internal Request type fragmentdir");
d2e6a577 12865 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
91327a77 12866 "Internal Request type frag stats");
d2e6a577 12867 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
91327a77 12868 "Internal Request type inode stats");
d2e6a577 12869
7c673cae
FG
12870 logger.reset(pcb.create_perf_counters());
12871 g_ceph_context->get_perfcounters_collection()->add(logger.get());
12872 recovery_queue.set_logger(logger.get());
12873 stray_manager.set_logger(logger.get());
12874}
12875
7c673cae
FG
12876/**
12877 * Call this when putting references to an inode/dentry or
12878 * when attempting to trim it.
12879 *
12880 * If this inode is no longer linked by anyone, and this MDS
12881 * rank holds the primary dentry, and that dentry is in a stray
12882 * directory, then give up the dentry to the StrayManager, never
12883 * to be seen again by MDCache.
12884 *
12885 * @param delay if true, then purgeable inodes are stashed til
12886 * the next trim(), rather than being purged right
12887 * away.
12888 */
12889void MDCache::maybe_eval_stray(CInode *in, bool delay) {
224ce89b
WB
12890 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
12891 mds->get_state() <= MDSMap::STATE_REJOIN)
7c673cae 12892 return;
224ce89b 12893
7c673cae
FG
12894 CDentry *dn = in->get_projected_parent_dn();
12895
12896 if (dn->state_test(CDentry::STATE_PURGING)) {
12897 /* We have already entered the purging process, no need
12898 * to re-evaluate me ! */
12899 return;
12900 }
12901
11fdf7f2
TL
12902 if (dn->get_dir()->get_inode()->is_stray()) {
12903 if (delay)
12904 stray_manager.queue_delayed(dn);
12905 else
12906 stray_manager.eval_stray(dn);
7c673cae
FG
12907 }
12908}
12909
31f18b77
FG
12910void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
12911 dout(10) << __func__ << " " << *diri << dendl;
11fdf7f2 12912 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
31f18b77
FG
12913 list<CDir*> ls;
12914 diri->get_dirfrags(ls);
94b18763 12915 for (auto &p : ls) {
31f18b77
FG
12916 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
12917 p->try_remove_dentries_for_stray();
12918 }
12919 if (!diri->snaprealm) {
12920 if (diri->is_auth())
12921 diri->clear_dirty_rstat();
12922 diri->clear_scatter_dirty();
12923 }
12924}
12925
11fdf7f2
TL
12926bool MDCache::dump_inode(Formatter *f, uint64_t number) {
12927 CInode *in = get_inode(number);
12928 if (!in) {
12929 return false;
12930 }
12931 f->open_object_section("inode");
12932 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
12933 f->close_section();
12934 return true;
12935}