]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
update ceph source to reef 18.2.0
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <ostream>
17 #include <string>
18 #include <string_view>
19 #include <map>
20
21 #include "MDCache.h"
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDLog.h"
26 #include "MDBalancer.h"
27 #include "Migrator.h"
28 #include "ScrubStack.h"
29
30 #include "SnapClient.h"
31
32 #include "MDSMap.h"
33
34 #include "CInode.h"
35 #include "CDir.h"
36
37 #include "Mutation.h"
38
39 #include "include/ceph_fs.h"
40 #include "include/filepath.h"
41 #include "include/util.h"
42
43 #include "messages/MClientCaps.h"
44
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
47
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
52
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
55
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/EPeerUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/EPurged.h"
63 #include "events/ESessions.h"
64
65 #include "InoTable.h"
66 #include "fscrypt.h"
67
68 #include "common/Timer.h"
69
70 #include "perfglue/heap_profiler.h"
71
72
73 #include "common/config.h"
74 #include "include/ceph_assert.h"
75
76 #define dout_context g_ceph_context
77 #define dout_subsys ceph_subsys_mds
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, mds)
80
81 using namespace std;
82
83 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
84 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
85 }
86
// Out-of-line definition of SimpleLock's shared static empty gather set
// (declared in the SimpleLock header).
set<int> SimpleLock::empty_gather_set;
88
89
/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSContext {
protected:
  MDCache *mdcache;  // owning cache; must outlive this context
  // Resolve the MDSRank through the cache pointer; asserts the cache
  // reference is still valid when the context runs.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};
105
// Counterpart of MDCacheContext for contexts derived from
// MDSLogContextBase (completed via the journaler machinery).
class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;  // owning cache; must outlive this context
  // Resolve the MDSRank through the cache pointer.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};
117
// Construct the cache for rank `m`, snapshotting all relevant config
// values and starting the background upkeep (trim) thread.
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // Per-commit size cap in bytes; when mds_dir_max_commit_size is unset
  // (0), fall back to 90% of the OSD max write size.
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
                        (g_conf()->mds_dir_max_commit_size << 20) :
                        (0.9 *(g_conf()->osd_max_write_size << 20));

  // Cache sizing / health thresholds (also updated in handle_conf_change).
  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");

  // Ephemeral export-pin policy knobs.
  export_ephemeral_distributed_config =  g_conf().get_val<bool>("mds_export_ephemeral_distributed");
  export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
  export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");

  symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  // bottom_lru holds items that should be trimmed first; no midpoint.
  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // Background thread performing periodic cache trimming; joined in ~MDCache.
  upkeeper = std::thread(&MDCache::upkeep_main, this);
}
150
// Tear down the cache: unregister perf counters and join the upkeep thread.
MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  // upkeep_main exits once upkeep_trim_shutdown is set (see shutdown()).
  if (upkeeper.joinable())
    upkeeper.join();
}
159
// Apply runtime config changes relevant to the cache, then forward the
// change set to the migrator and balancer.
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
  dout(20) << "config changes: " << changed << dendl;
  if (changed.count("mds_cache_memory_limit"))
    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  if (changed.count("mds_cache_reservation"))
    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");

  // Toggling either ephemeral-pin policy requires re-evaluating every
  // currently pinned inode.
  bool ephemeral_pin_config_changed = false;
  if (changed.count("mds_export_ephemeral_distributed")) {
    export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
    dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
    /* copy to vector to avoid removals during iteration */
    ephemeral_pin_config_changed = true;
  }
  if (changed.count("mds_export_ephemeral_random")) {
    export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
    dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
    /* copy to vector to avoid removals during iteration */
    ephemeral_pin_config_changed = true;
  }
  if (ephemeral_pin_config_changed) {
    // maybe_export_pin() may drop inodes from export_ephemeral_pins, so
    // iterate over a snapshot rather than the live container.
    std::vector<CInode*> migrate;
    migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_export_pin(true);
    }
  }
  if (changed.count("mds_export_ephemeral_random_max")) {
    export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
  }
  if (changed.count("mds_health_cache_threshold"))
    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  if (changed.count("mds_cache_mid"))
    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
  if (changed.count("mds_cache_trim_decay_rate")) {
    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
  }
  if (changed.count("mds_symlink_recovery")) {
    symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
    dout(10) << "Storing symlink targets on file object's head " << symlink_recovery << dendl;
  }

  // Propagate to subsystems that track their own config.
  migrator->handle_conf_change(changed, mdsmap);
  mds->balancer->handle_conf_change(changed, mdsmap);
}
206
// Publish current cache statistics (LRU occupancy, caps, root rstats)
// to the MDS perf counters.
void MDCache::log_stat()
{
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
  // Root recursive stats are only meaningful once the root inode is cached.
  if (root) {
    mds->logger->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
    mds->logger->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
    mds->logger->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
  }
}
222
223
224 //
225
226 bool MDCache::shutdown()
227 {
228 {
229 std::scoped_lock lock(upkeep_mutex);
230 upkeep_trim_shutdown = true;
231 upkeep_cvar.notify_one();
232 }
233 if (lru.lru_get_size() > 0) {
234 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
235 //show_cache();
236 show_subtrees();
237 //dump();
238 }
239 return true;
240 }
241
242
243 // ====================================================================
244 // some inode functions
245
// Insert a newly created/loaded CInode into the cache maps, and record
// it in the relevant system-inode slots (root, mydir, strays, bases).
void MDCache::add_inode(CInode *in)
{
  // add to inode map: head inodes are keyed by ino, snapshotted inodes
  // by vinodeno (ino + snapid).
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  }

  // System inodes (below MDS_INO_SYSTEM_BASE) get cached in dedicated slots.
  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == CEPH_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      // Only track stray dirs owned by this rank.
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
	strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }
}
273
// Fully remove an inode from the cache and delete it. The inode must
// already be clean and unreferenced (asserted at the end).
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
  }

  // Clear any remaining dirty state before destruction.
  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->clear_clientwriteable();

  o->item_open_file.remove_myself();

  // Drop the inode from pending export-pin queues, if queued.
  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  o->clear_ephemeral_pin(true, true);

  // remove from inode map (head vs snap map mirrors add_inode()).
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  clear_taken_inos(o->ino());

  // Clear system-inode slots that pointed at this inode.
  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
	strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}
330
331 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
332 {
333 file_layout_t result = file_layout_t::get_default();
334 result.pool_id = mdsmap.get_first_data_pool();
335 return result;
336 }
337
338 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
339 {
340 file_layout_t result = file_layout_t::get_default();
341 result.pool_id = mdsmap.get_metadata_pool();
342 if (g_conf()->mds_log_segment_size > 0) {
343 result.object_size = g_conf()->mds_log_segment_size;
344 result.stripe_unit = g_conf()->mds_log_segment_size;
345 }
346 return result;
347 }
348
// Initialize the cached default file and journal layouts from the
// current MDSMap.
void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}
354
// Initialize `in` as a fresh system inode (root, mdsdir, stray, ...)
// with the given ino and mode bits. The inode is not linked into any
// directory and is not registered in the cache maps by this call.
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
{
  auto _inode = in->_get_inode();
  _inode->ino = ino;
  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->mode = 0500 | mode;  // system inodes are owner-read/execute only
  _inode->size = 0;
  _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
  _inode->nlink = 1;
  _inode->truncate_size = -1ull;
  _inode->change_attr = 0;
  _inode->export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    _inode->rstat.rsubdirs = 1; /* itself */
    _inode->rstat.rctime = in->get_inode()->ctime;
  } else {
    _inode->layout = default_file_layout;
    ++_inode->rstat.rfiles;
  }
  _inode->accounted_rstat = _inode->rstat;

  if (in->is_base()) {
    // Base inodes are their own subtree roots: root is authed here,
    // other base inodes (mdsdirs) derive their auth from the ino offset.
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}
391
392 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
393 {
394 dout(0) << "creating system inode with ino:" << ino << dendl;
395 CInode *in = new CInode(this);
396 create_unlinked_system_inode(in, ino, mode);
397 add_inode(in);
398 return in;
399 }
400
// Create the root inode ("/") with configured ownership and a file
// layout pointing at the first data pool.
CInode *MDCache::create_root_inode()
{
  CInode *in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
  auto _inode = in->_get_inode();
  _inode->uid = g_conf()->mds_root_ino_uid;
  _inode->gid = g_conf()->mds_root_ino_gid;
  _inode->layout = default_file_layout;
  _inode->layout.pool_id = mds->mdsmap->get_first_data_pool();
  return in;
}
411
// Bootstrap a brand-new filesystem: create the root inode and an empty
// root dirfrag, mark them dirty, and queue their initial commits/stores
// on `gather`.
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  // A fresh dirfrag must have internally consistent (all-zero) stats.
  ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
  ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
  ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
  /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
   * assume version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->_get_fnode()->version = rootdir->pre_dirty();
  rootdir->mark_dirty(mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  // Persist the root inode and its backtrace as well.
  root->store(gather->new_sub());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->store_backtrace(gather->new_sub());
}
438
// Bootstrap this rank's private ~mdsN directory: create the mdsdir
// inode, its dirfrag, and NUM_STRAY stray subdirectories, accumulating
// stats up into the mdsdir, and queue all commits/stores on `gather`.
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  auto mydir_fnode = mydir->_get_fnode();

  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;

    // Roll the new stray's stats into the parent dirfrag.
    mydir_fnode->rstat.add(stray->get_inode()->rstat);
    mydir_fnode->fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->_get_fnode()->version = straydir->pre_dirty();
    straydir->mark_dirty(ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  // Freshly created stats are, by construction, fully accounted.
  mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
  mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;

  // Mirror the dirfrag stats up into the mdsdir inode (+1 rsubdirs for
  // the directory itself).
  auto inode = myin->_get_inode();
  inode->dirstat = mydir->get_fnode()->fragstat;
  inode->rstat = mydir->get_fnode()->rstat;
  ++inode->rstat.rsubdirs;
  inode->accounted_rstat = inode->rstat;

  mydir->mark_complete();
  mydir_fnode->version = mydir->pre_dirty();
  mydir->mark_dirty(ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}
489
// Journal-commit callback for _create_system_file(): finishes the
// mutation once the EUpdate has been persisted.
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;   // the pending mutation (locks, log segment)
  CDentry *dn;       // dentry being created
  version_t dpv;     // projected dentry version from pre_dirty()
  MDSContext *fin;   // caller's completion, run at the end
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};
501
// Link system inode `in` into `dir` under `name`, journaling the
// creation as an EUpdate. `fin` is completed (via
// C_MDC_CreateSystemFile -> _create_system_file_finish) once the log
// entry is safe.
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  auto inode = in->_get_inode();
  if (in->is_dir()) {
    inode->rstat.rsubdirs = 1;

    // A directory gets an (empty, complete) dirfrag immediately.
    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->_get_fnode()->version = mdir->pre_dirty();
  } else {
    inode->rstat.rfiles = 1;
  }

  inode->version = dn->pre_dirty();

  // New dentry/inode start at the snaprealm's next sequence number.
  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks.  hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    // Regular case: journal as a primary dentry.
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // An mdsdir is a base inode; link it via a remote dentry and journal
    // the inode itself as a root entry.
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}
553
// Journal-commit half of _create_system_file(): apply the projected
// linkage, dirty the new inode/dirfrag in the mutation's log segment,
// release locks, and run the caller's completion.
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->mark_dirty(mut->ls);

  if (in->is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}
580
581
582
// Completion that re-drives open_root() after an async fetch/discover;
// a fetch failure marks the rank damaged instead of retrying.
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention.  Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};
599
600 void MDCache::open_root_inode(MDSContext *c)
601 {
602 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
603 CInode *in;
604 in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
605 in->fetch(c);
606 } else {
607 discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
608 }
609 }
610
// Begin loading this rank's ~mdsN inode from disk; `c` is completed
// when the fetch finishes.
void MDCache::open_mydir_inode(MDSContext *c)
{
  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
  in->fetch(c);
}
616
// Load the ~mdsN inode and then its root dirfrag; `c` is completed when
// the dirfrag fetch finishes (or with the inode-fetch error).
void MDCache::open_mydir_frag(MDSContext *c)
{
  open_mydir_inode(
      new MDSInternalContextWrapper(mds,
	new LambdaContext([this, c](int r) {
	    if (r < 0) {
	      c->complete(r);
	      return;
	    }
	    // Inode loaded; claim subtree auth and fetch the dirfrag.
	    CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
	    ceph_assert(mydir);
	    adjust_subtree_auth(mydir, mds->get_nodeid());
	    mydir->fetch(c);
	  })
	)
      );
}
634
// Drive the multi-step process of opening root and ~mdsN. Each async
// step re-enters this function via C_MDS_RetryOpenRoot until everything
// is loaded, then falls through to populate_mydir().
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    // We are auth for root: make sure its dirfrag is a subtree root and
    // fully loaded.
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    // Replica: open the remote root dirfrag from its auth rank.
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}
673
// Rotate to the next stray directory, skipping the one currently being
// fragmented, and opportunistically schedule split/merge on a future
// stray dir so it is ready by the time rotation reaches it.
void MDCache::advance_stray() {
  // check whether the directory has been fragmented
  if (stray_fragmenting_index >= 0) {
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->state_test(CDir::STATE_FRAGMENTING) ||
	  mds->balancer->is_fragment_pending(dir->dirfrag())) {
	any_fragmenting = true;
	break;
      }
    }
    // All frags settled: fragmentation of that stray dir is done.
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  // Advance stray_index to the next slot that is not being fragmented.
  for (int i = 1; i < NUM_STRAY; i++){
    stray_index = (stray_index + i) % NUM_STRAY;
    if (stray_index != stray_fragmenting_index)
      break;
  }

  if (stray_fragmenting_index == -1 && is_open()) {
    // Fragment later stray dir in advance. We don't choose past
    // stray dir because in-flight requests may still use it.
    stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->should_split()) {
	mds->balancer->queue_split(dir, true);
	any_fragmenting = true;
      } else if (dir->should_merge()) {
	mds->balancer->queue_merge(dir);
	any_fragmenting = true;
      }
    }
    // Nothing queued: no fragmentation in flight after all.
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  dout(10) << "advance_stray to index " << stray_index
	   << " fragmenting index " << stray_fragmenting_index << dendl;
}
718
// Open (or create) the contents of ~mdsN: the dirfrag itself and all
// stray directories. Re-entered via C_MDS_RetryOpenRoot after each
// async fetch/create; on full success marks the cache open, wakes
// waiters, and activates the stray manager.
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it.  Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->_get_fnode()->version = mydir->pre_dirty();
    mydir->mark_dirty(ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *straydn = mydir->lookup(css->str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      // Missing stray dir: create it, then restart this whole pass.
      _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
			  new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
	dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
	dir->fetch_keys({}, new C_MDS_RetryOpenRoot(this));
	return;
      }

      if (dir->get_frag_size() > 0)
	num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}
804
// Discover another rank's ~mdsN base inode; the owning rank is encoded
// in the low bits of the mdsdir ino.
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
809
810 CDir *MDCache::get_stray_dir(CInode *in)
811 {
812 string straydname;
813 in->name_stray_dentry(straydname);
814
815 CInode *strayi = get_stray();
816 ceph_assert(strayi);
817 frag_t fg = strayi->pick_dirfrag(straydname);
818 CDir *straydir = strayi->get_dirfrag(fg);
819 ceph_assert(straydir);
820 return straydir;
821 }
822
823 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
824 {
825 // inode?
826 if (info.ino)
827 return get_inode(info.ino, info.snapid);
828
829 // dir or dentry.
830 CDir *dir = get_dirfrag(info.dirfrag);
831 if (!dir) return 0;
832
833 if (info.dname.length())
834 return dir->lookup(info.dname, info.snapid);
835 else
836 return dir;
837 }
838
839
840 // ====================================================================
841 // consistent hash ring
842
/*
 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
 */
// Map (ino, frag) deterministically onto a rank in [0, max_mds), such
// that changing max_mds moves a minimal number of keys between buckets.
mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  uint64_t hash = rjhash64(ino);
  // Mix the fragment into the hash only for non-root frags.
  if (fg)
    hash = rjhash64(hash + rjhash64(fg.value()));

  // Jump consistent hash: b tracks the last accepted bucket while j
  // jumps forward pseudo-randomly until it passes max_mds.
  int64_t b = -1, j = 0;
  while (j < max_mds) {
    b = j;
    hash = hash*2862933555777941757ULL + 1;
    j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
  }
  // verify bounds before returning
  auto result = mds_rank_t(b);
  ceph_assert(result >= 0 && result < max_mds);
  return result;
}
864
865
866 // ====================================================================
867 // subtree management
868
/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if is it appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir << dendl;

  show_subtrees();

  // Find the subtree root that currently contains `dir`.
  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (the old root's bounds that now resolve to `dir` become my bounds.)
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << "  claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    // (subtract my auth-subtree load from ancestors up to their subtree root.)
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.sub(dir->pop_auth_subtree);
	if (p->is_subtree_root()) break;
	p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}
941
942
943 void MDCache::try_subtree_merge(CDir *dir)
944 {
945 dout(7) << "try_subtree_merge " << *dir << dendl;
946 // record my old bounds
947 auto oldbounds = subtrees.at(dir);
948
949 set<CInode*> to_eval;
950 // try merge at my root
951 try_subtree_merge_at(dir, &to_eval);
952
953 // try merge at my old bounds
954 for (auto bound : oldbounds)
955 try_subtree_merge_at(bound, &to_eval);
956
957 if (!(mds->is_any_replay() || mds->is_resolve())) {
958 for(auto in : to_eval)
959 eval_subtree_root(in);
960 }
961 }
962
// Merge the subtree rooted at `dir` into its parent subtree when their
// authorities match. Skipped for ambiguous auth, export bounds, and aux
// subtrees. Auth subtree-root inodes affected by a merge are collected
// into `to_eval` for later lock re-evaluation.
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  // Cannot merge while auth is ambiguous (mid-migration) or the dir is
  // pinned as an export bound / auxiliary subtree.
  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&				// we have a parent,
      parent->dir_auth == dir->dir_auth) {	// auth matches,
    // merge with parent.
    dout(10) << "  subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    // (add my auth-subtree load back into ancestors up to their subtree root.)
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.add(dir->pop_auth_subtree);
	p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
	if (p->is_subtree_root()) break;
	cur = p;
	p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}
1013
// Re-evaluate the scatter locks (file + nest) on an auth subtree-root
// inode after subtree boundaries have changed.
void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  //  (we should scatter the filelock on subtree bounds)
  ceph_assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}
1021
1022
/*
 * Set the authority of the subtree rooted at 'dir' to 'auth', establishing
 * exactly the given 'bounds' as its subtree bounds.  'dir' is promoted to a
 * subtree root if it is not one already; missing bounds are created (at their
 * old authority) and any intervening or stray subtrees between dir and its
 * bounds are swallowed (set to 'auth' and merged away).  On return the
 * recorded bound set for dir matches 'bounds' exactly.
 */
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir
	  << " bounds " << bounds
	  << dendl;

  show_subtrees();

  // find the subtree root that currently contains dir.
  CDir *root;
  if (dir->ino() == CEPH_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  // remember dir's pre-change authority; new bounds nested under dir
  // keep this old authority.
  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (any bound of the old root whose nearest subtree root is now dir
    // becomes a bound of dir instead.)
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << " claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  // subtree-root inodes whose locks should be re-evaluated at the end.
  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
	// bound is directly inside dir: make it a subtree at the old auth.
	dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
	adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      }
      else {
	dout(10) << " want bound " << *bound << dendl;
	CDir *t = get_subtree_root(bound->get_parent_dir());
	if (subtrees[t].count(bound) == 0) {
	  ceph_assert(t != dir);
	  dout(10) << " new bound " << *bound << dendl;
	  adjust_subtree_auth(bound, t->authority());
	}
	// make sure it's nested beneath ambiguous subtree(s)
	// swallow (re-auth + merge) every subtree between dir and bound
	// until bound's parent chain reaches dir directly.
	while (1) {
	  while (subtrees[dir].count(t) == 0)
	    t = get_subtree_root(t->get_parent_dir());
	  dout(10) << " swallowing intervening subtree at " << *t << dendl;
	  adjust_subtree_auth(t, auth);
	  try_subtree_merge_at(t, &to_eval);
	  t = get_subtree_root(bound->get_parent_dir());
	  if (t == dir) break;
	}
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  // any recorded bound not in the requested set is absorbed into dir.
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
	CDir *stray = *p;
	dout(10) << " swallowing extra subtree at " << *stray << dendl;
	adjust_subtree_auth(stray, auth);
	try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  // lock evaluation is deferred during replay/resolve.
  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}
1145
1146
/*
 * return a set of CDir*'s that correspond to the given bound set.  Only adjust
 * fragmentation as necessary to get an equivalent bounding set.  That is, only
 * split if one of our frags spans the provided bounding set.  Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert_raw(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    p->second.simplify();
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;  // inode not in cache: nothing to bound here.
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // build a temporary fragtree in which every requested frag is a leaf,
    // used below to approximate splits when our local tree is coarser.
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
	// our local fragtree has no leaves under fg: it is coarser than the
	// requested bound, so split our frags to match (never merge).
	frag_t approx_fg = diri->dirfragtree[fg.value()];
	frag_vec_t approx_leaves;
	tmpdft.get_leaves_under(approx_fg, approx_leaves);
	for (const auto& leaf : approx_leaves) {
	  if (p->second.get().count(leaf) == 0) {
	    // not bound, so the resolve message is from auth MDS of the dirfrag
	    force_dir_fragment(diri, leaf);
	  }
	}
      }

      // collect every cached dirfrag under fg as a bound.
      auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
      for (const auto& sib : sibs)
	bounds.insert(sib);
    }
  }
}
1195
1196 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1197 {
1198 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1199 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1200
1201 set<CDir*> bounds;
1202 get_force_dirfrag_bound_set(bound_dfs, bounds);
1203 adjust_bounded_subtree_auth(dir, bounds, auth);
1204 }
1205
1206 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1207 {
1208 dout(10) << "map_dirfrag_set " << dfs << dendl;
1209
1210 // group by inode
1211 map<inodeno_t, fragset_t> ino_fragset;
1212 for (const auto &df : dfs) {
1213 ino_fragset[df.ino].insert_raw(df.frag);
1214 }
1215 // get frags
1216 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1217 p != ino_fragset.end();
1218 ++p) {
1219 p->second.simplify();
1220 CInode *in = get_inode(p->first);
1221 if (!in)
1222 continue;
1223
1224 frag_vec_t fgs;
1225 for (const auto& fg : p->second) {
1226 in->dirfragtree.get_leaves_under(fg, fgs);
1227 }
1228
1229 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1230 << " on " << *in << dendl;
1231
1232 for (const auto& fg : fgs) {
1233 CDir *dir = in->get_dirfrag(fg);
1234 if (dir)
1235 result.insert(dir);
1236 }
1237 }
1238 }
1239
1240
1241
1242 CDir *MDCache::get_subtree_root(CDir *dir)
1243 {
1244 // find the underlying dir that delegates (or is about to delegate) auth
1245 while (true) {
1246 if (dir->is_subtree_root())
1247 return dir;
1248 dir = dir->get_inode()->get_parent_dir();
1249 if (!dir)
1250 return 0; // none
1251 }
1252 }
1253
1254 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1255 {
1256 // find the underlying dir that delegates (or is about to delegate) auth
1257 while (true) {
1258 if (dir->is_subtree_root())
1259 return dir;
1260 dir = dir->get_inode()->get_projected_parent_dir();
1261 if (!dir)
1262 return 0; // none
1263 }
1264 }
1265
1266 void MDCache::remove_subtree(CDir *dir)
1267 {
1268 dout(10) << "remove_subtree " << *dir << dendl;
1269 auto it = subtrees.find(dir);
1270 ceph_assert(it != subtrees.end());
1271 subtrees.erase(it);
1272 dir->put(CDir::PIN_SUBTREE);
1273 if (dir->get_parent_dir()) {
1274 CDir *p = get_subtree_root(dir->get_parent_dir());
1275 auto it = subtrees.find(p);
1276 ceph_assert(it != subtrees.end());
1277 auto count = it->second.erase(dir);
1278 ceph_assert(count == 1);
1279 }
1280 }
1281
1282 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1283 {
1284 ceph_assert(subtrees.count(dir));
1285 bounds = subtrees[dir];
1286 }
1287
1288 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1289 {
1290 if (subtrees.count(dir)) {
1291 // just copy them, dir is a subtree.
1292 get_subtree_bounds(dir, bounds);
1293 } else {
1294 // find them
1295 CDir *root = get_subtree_root(dir);
1296 for (set<CDir*>::iterator p = subtrees[root].begin();
1297 p != subtrees[root].end();
1298 ++p) {
1299 CDir *t = *p;
1300 while (t != root) {
1301 t = t->get_parent_dir();
1302 ceph_assert(t);
1303 if (t == dir) {
1304 bounds.insert(*p);
1305 continue;
1306 }
1307 }
1308 }
1309 }
1310 }
1311
1312 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1313 {
1314 // for debugging only.
1315 ceph_assert(subtrees.count(dir));
1316 if (bounds != subtrees[dir]) {
1317 dout(0) << "verify_subtree_bounds failed" << dendl;
1318 set<CDir*> b = bounds;
1319 for (auto &cd : subtrees[dir]) {
1320 if (bounds.count(cd)) {
1321 b.erase(cd);
1322 continue;
1323 }
1324 dout(0) << " missing bound " << *cd << dendl;
1325 }
1326 for (const auto &cd : b)
1327 dout(0) << " extra bound " << *cd << dendl;
1328 }
1329 ceph_assert(bounds == subtrees[dir]);
1330 }
1331
1332 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1333 {
1334 // for debugging only.
1335 ceph_assert(subtrees.count(dir));
1336
1337 // make sure that any bounds i do have are properly noted as such.
1338 int failed = 0;
1339 for (const auto &fg : bounds) {
1340 CDir *bd = get_dirfrag(fg);
1341 if (!bd) continue;
1342 if (subtrees[dir].count(bd) == 0) {
1343 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1344 failed++;
1345 }
1346 }
1347 ceph_assert(failed == 0);
1348 }
1349
1350 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1351 {
1352 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1353 << " to " << *newdir << dendl;
1354 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1355 }
1356
/*
 * Fix up the subtree map after 'diri' has been renamed out of 'olddir'
 * into its (new) current parent dir.  If 'pop' is set, the matching entry
 * queued by project_subtree_rename() is consumed and verified.  Subtree
 * roots under diri are re-parented to the new enclosing subtree, bounds
 * are migrated, and popularity is shifted via the balancer.
 */
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    // consume the projected rename queued earlier; it must match exactly.
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    ceph_assert(p != projected_subtree_renames.end());
    ceph_assert(!p->second.empty());
    ceph_assert(p->second.front().first == olddir);
    ceph_assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust total auth pin of freezing subtree
  if (olddir != newdir) {
    auto&& dfls = diri->get_nested_dirfrags();
    for (const auto& dir : dfls)
      olddir->adjust_freeze_after_rename(dir);
  }

  // adjust subtree
  // N.B. make sure subtree dirfrags are at the front of the list
  auto dfls = diri->get_subtree_dirfrags();
  diri->get_nested_dirfrags(dfls);
  for (const auto& dir : dfls) {
    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    auto& oldbounds = subtrees[oldparent];
    auto& newbounds = subtrees[newparent];

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, false);

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine. change parent.
      // dir stays a subtree root; it just becomes a bound of newparent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      {
	auto n = oldbounds.erase(dir);
	ceph_assert(n == 1);
      }
      newbounds.insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      std::vector<CDir*> tomove;
      for (const auto& bound : oldbounds) {
	CDir *broot = get_subtree_root(bound->get_parent_dir());
	if (broot != oldparent) {
	  ceph_assert(broot == newparent);
	  tomove.push_back(bound);
	}
      }
      for (const auto& bound : tomove) {
	dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
	oldbounds.erase(bound);
	newbounds.insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
	// keep dir at the old authority by making it a subtree root itself.
	adjust_subtree_auth(dir, oldparent->authority(), false);
	// caller is responsible for 'eval diri'
	try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
  }

  show_subtrees();
}
1442
1443 // ===================================
1444 // journal and snap/cow helpers
1445
1446
1447 /*
1448 * find first inode in cache that follows given snapid. otherwise, return current.
1449 */
1450 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1451 {
1452 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1453 ceph_assert(in->last == CEPH_NOSNAP);
1454
1455 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1456 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1457 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1458 in = p->second;
1459 }
1460
1461 return in;
1462 }
1463
1464
1465 /*
1466 * note: i'm currently cheating wrt dirty and inode.version on cow
1467 * items. instead of doing a full dir predirty, i just take the
1468 * original item's version, and set the dirty flag (via
1469 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1470 * means a special case in the dir commit clean sweep assertions.
1471 * bah.
1472 */
/*
 * Copy-on-write 'in' for snapshots: clone its previous projected state
 * into a new snapped inode covering [in->first, last], then advance
 * in->first past 'last'.  Cap clients holding writable caps are flagged
 * for snapflush on the clone.  Returns the newly created old inode.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  ceph_assert(last >= in->first);

  // clone the pre-projection inode/xattr state into the snapped inode.
  CInode *oldin = new CInode(this, true, in->first, last);
  auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode());
  _inode->trim_client_ranges(last);
  oldin->reset_inode(std::move(_inode));
  auto _xattrs = in->get_previous_projected_xattrs();
  oldin->reset_xattrs(std::move(_xattrs));

  oldin->symlink = in->symlink;

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    // 'in' is itself a snapped inode: split its pending snapflush state
    // between the new clone and itself.
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      // clone keeps snapflushes: mirror the gathering lock state onto it.
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
	for (int i = 0; i < num_cinode_locks; i++) {
	  SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
	  ceph_assert(lock);
	  if (lock->get_state() != LOCK_SNAP_SYNC) {
	    ceph_assert(lock->is_stable());
	    lock->set_state(LOCK_SNAP_SYNC);  // gathering
	    oldin->auth_pin(lock);
	  }
	  lock->get_wrlock(true);
	}
      }
    }
    if (!ret.second) {
      // 'in' no longer needs snapflushes: release its gathering wrlocks
      // and wake any waiters once each lock fully drains.
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
	MDSContext::vec finished;
	for (int i = 0; i < num_cinode_locks; i++) {
	  SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
	  ceph_assert(lock);
	  ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
	  lock->put_wrlock();
	  if (!lock->get_num_wrlocks()) {
	    lock->set_state(LOCK_SYNC);
	    lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
	    in->auth_unpin(lock);
	  }
	}
	mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
	  cap->client_follows < last) {
	// client may have dirty data covering the snap: expect snapflushes.
	dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
	oldin->client_snap_caps.insert(client);
	cap->client_follows = last;

	// we need snapflushes for any intervening snaps
	dout(10) << " snaps " << snaps << dendl;
	for (auto q = snaps.lower_bound(oldin->first);
	     q != snaps.end() && *q <= last;
	     ++q) {
	  in->add_need_snapflush(oldin, *q, client);
	}
      } else {
	dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      // put the clone's locks into the gathering state until snapflushes land.
      for (int i = 0; i < num_cinode_locks; i++) {
	SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
	ceph_assert(lock);
	if (lock->get_state() != LOCK_SNAP_SYNC) {
	  ceph_assert(lock->is_stable());
	  lock->set_state(LOCK_SNAP_SYNC);  // gathering
	  oldin->auth_pin(lock);
	}
	lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}
1578
/*
 * Copy-on-write 'dn' (and, for a primary link, its inode) if any snapshot
 * exists in the range being overwritten, journaling the resulting old
 * dentry/inode in 'metablob'.  'follows' bounds the snapshot range
 * (CEPH_NOSNAP means "use the newest global snap seq").  If a clone is
 * made, *pcow_inode (when non-null) receives it.  'dnl' may override the
 * projected linkage to examine.
 */
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
				 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    // mid-migration: cow the head version rather than the projected one.
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      // dn is a remote link to the inode; cow the dentry itself against
      // the dir's realm, then fall through to cow the inode.
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
	snapid_t oldfirst = dn->first;
	dn->first = dir_follows+1;
	if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
	  CDir *dir = dn->dir;
	  CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows);
	  dout(10) << " olddn " << *olddn << dendl;
	  ceph_assert(dir->is_projected());
	  olddn->set_projected_version(dir->get_projected_version());
	  metablob->add_remote_dentry(olddn, true);
	  mut->add_cow_dentry(olddn);
	  // FIXME: adjust link count here? hmm.

	  if (dir_follows+1 > in->first)
	    in->cow_old_inode(dir_follows, cow_head);
	}
      }

      follows = dir_follows;
      if (in->snaprealm) {
	realm = in->snaprealm;
	ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
	follows = get_global_snaprealm()->get_newest_seq();
	ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      // no snapshot covers the overwritten range: just advance 'first'.
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    // simple (non-multiversion) case: cow the dentry, cloning the inode
    // for a primary link.
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
	in->first = follows+1;
      return;
    }

    dout(10) << " dn " << *dn << dendl;
    CDir *dir = dn->get_dir();
    ceph_assert(dir->is_projected());

    if (in) {
      CInode *oldin = cow_inode(in, follows);
      ceph_assert(in->is_projected());
      mut->add_cow_inode(oldin);
      if (pcow_inode)
	*pcow_inode = oldin;
      CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows);
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
	// keep the clone on the open-files list until clients flush.
	mut->ls->open_files.push_back(&oldin->item_open_file);
	mds->locker->mark_need_snapflush_inode(oldin);
      }
      olddn->set_projected_version(dir->get_projected_version());
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows);
      dout(10) << " olddn " << *olddn << dendl;

      olddn->set_projected_version(dir->get_projected_version());
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}
1711
1712 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1713 {
1714 if (in->is_base()) {
1715 metablob->add_root(true, in);
1716 } else {
1717 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1718 follows = in->first - 1;
1719 CDentry *dn = in->get_projected_parent_dn();
1720 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1721 journal_cow_dentry(mut, metablob, dn, follows);
1722 if (in->get_projected_inode()->is_backtrace_updated()) {
1723 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1724 in->get_previous_projected_inode()->layout.pool_id;
1725 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1726 } else {
1727 metablob->add_primary_dentry(dn, in, true);
1728 }
1729 }
1730 }
1731
1732
1733
1734 // nested ---------------------------------------------------------------
1735
/*
 * Propagate 'cur's (projected) rstat delta into its parent dirfrag's
 * fnode, covering the snapid range [first, cur->last] clipped by the
 * parent dentry's 'first' (the floor).  linkunlink is +1/-1/0 for
 * link/unlink/plain-update.  Dirty old (snapped) rstats are propagated
 * too when mds_snap_rstat is enabled.
 */
void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
					  CInode *cur, CDir *parent, snapid_t first,
					  int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
	   << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME. this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename). but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    // snapped inode: skip entirely if no snapshot falls in its range.
    ceph_assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the peer rename prep case. so we should
      // avoid updating the inode.
      ceph_assert(linkunlink < 0);
      ceph_assert(cur->is_frozen_inode());
      update = false;
    }
    // hacky
    // pick the right inode version: the one projected by this mutation
    // (which we may update), or a read-only view otherwise.
    const CInode::mempool_inode *pi;
    if (update && mut->is_projected(cur)) {
      pi = cur->_get_projected_inode();
    } else {
      pi = cur->get_projected_inode().get();
      if (update) {
	// new inode
	ceph_assert(pi->rstat == pi->accounted_rstat);
	update = false;
      }
    }
    _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent,
				 linkunlink, update);
  }

  if (g_conf()->mds_snap_rstat) {
    // also propagate any dirty rstats recorded for old (snapped) versions.
    for (const auto &p : cur->dirty_old_rstats) {
      const auto &old = cur->get_old_inodes()->at(p);
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
	continue;  // no snapshot in this old version's range.
      if (p >= floor)
	_project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}
1807
1808
/*
 * Apply the inode's unaccounted rstat delta to the parent dirfrag's fnode
 * over the snapid interval [ofirst, last], splitting the frag's (sparse)
 * dirty_old_rstat map as needed so each iteration updates exactly one
 * segment ending at 'last'.  If update_inode is set, the inode's
 * accounted_rstat is synced to rstat afterwards.
 */
void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
					  CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " inode rstat " << inode->rstat << dendl;
  dout(20) << " inode accounted_rstat " << inode->accounted_rstat << dendl;
  // delta = what the inode contributes that hasn't been accounted yet;
  // on link we add only the new rstat, on unlink we retract the accounted.
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode->rstat);
    delta.sub(inode->accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode->accounted_rstat);
  } else {
    delta.add(inode->rstat);
  }
  dout(20) << " delta " << delta << dendl;


  while (last >= ofirst) {
    /*
     * pick fnode version to update. at each iteration, we want to
     * pick a segment ending in 'last' to update. split as necessary
     * to make that work. then, adjust first up so that we only
     * update one segment at a time. then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    auto pf = parent->_get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      // updating the live (head) fnode.
      if (g_conf()->mds_snap_rstat)
	first = std::max(ofirst, parent->first);
      else
	first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
	  !(pf->rstat == pf->accounted_rstat)) {
	dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
		  << parent->first << "," << (first-1) << "] "
		  << " " << *prstat << "/" << pf->accounted_rstat
		  << dendl;
	parent->dirty_old_rstat[first-1].first = parent->first;
	parent->dirty_old_rstat[first-1].rstat = pf->rstat;
	parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf()->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      // [first,last] overlaps the head segment: cow head into
      // dirty_old_rstat[last] and update that copy.
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
	       << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
	dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
	if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
	  dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
	  first = parent->dirty_old_rstat.rbegin()->first+1;
	}
      } else {
	// *it last is >= last
	if (it->second.first <= last) {
	  // *it intersects [first,last]
	  if (it->second.first < first) {
	    dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
	    parent->dirty_old_rstat[first-1] = it->second;
	    it->second.first = first;
	  }
	  if (it->second.first > first)
	    first = it->second.first;
	  if (last < it->first) {
	    dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
	    parent->dirty_old_rstat[last] = it->second;
	    it->second.first = last+1;
	  }
	} else {
	  // *it is to the _right_ of [first,last]
	  it = parent->dirty_old_rstat.lower_bound(first);
	  // new *it last is >= first
	  if (it->second.first <= last &&  // new *it isn't also to the right, and
	      it->first >= first) {        // it intersects our first bit,
	    dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
	    first = it->first+1;
	  }
	  dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
	}
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
    ceph_assert(last >= first);
    prstat->add(delta);
    dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    // continue with the next-older segment.
    last = first-1;
  }

  if (update_inode) {
    // the delta is now accounted for in the frag; mark it accounted on
    // the inode (cast away const on the projected inode we own).
    auto _inode = const_cast<CInode::mempool_inode*>(inode);
    _inode->accounted_rstat = _inode->rstat;
  }
}
1928
/*
 * Apply a dirfrag's unaccounted rstat delta upward into the parent inode
 * 'pin' over the snapid interval [ofirst, last], copy-on-writing and
 * splitting the inode's old_inodes entries so each loop iteration updates
 * one version segment ending at 'last'.  cow_head selects head-style COW
 * (see cow_old_inode).
 */
void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat,
					  const nest_info_t& accounted_rstat,
					  snapid_t ofirst, snapid_t last,
					  CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " frag rstat " << rstat << dendl;
  dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << " delta " << delta << dendl;

  // lazily-created working copy of pin's old_inodes map; written back at
  // the end only if we actually touched old versions.
  CInode::old_inode_map_ptr _old_inodes;
  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      // updating pin's current (projected) version.
      pi = pin->_get_projected_inode();
      first = std::max(ofirst, pin->first);
      if (first > pin->first) {
	// range starts after pin->first: preserve the older span first.
	auto& old = pin->cow_old_inode(first-1, cow_head);
	dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (!_old_inodes) {
	_old_inodes = CInode::allocate_old_inode_map();
	if (pin->is_any_old_inodes())
	  *_old_inodes = *pin->get_old_inodes();
      }
      if (last >= pin->first) {
	// overlaps the live version: cow it into an old_inode at 'last'.
	first = pin->first;
	pin->cow_old_inode(last, cow_head);
      } else {
	// our life is easier here because old_inodes is not sparse
	// (although it may not begin at snapid 1)
	auto it = _old_inodes->lower_bound(last);
	if (it == _old_inodes->end()) {
	  dout(10) << " no old_inode <= " << last << ", done." << dendl;
	  break;
	}
	first = it->second.first;
	if (first > last) {
	  dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
	  //assert(p == pin->old_inodes.begin());
	  break;
	}
	if (it->first > last) {
	  // split so the segment we update ends exactly at 'last'.
	  dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
		   << (last+1) << "," << it->first << "]" << dendl;
	  (*_old_inodes)[last] = it->second;
	  it->second.first = last+1;
	  pin->dirty_old_rstats.insert(it->first);
	}
      }
      if (first < ofirst) {
	// split so the segment we update starts exactly at 'ofirst'.
	dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
		 << first << "," << ofirst-1 << "]" << dendl;
	(*_old_inodes)[ofirst-1] = (*_old_inodes)[last];
	pin->dirty_old_rstats.insert(ofirst-1);
	(*_old_inodes)[last].first = first = ofirst;
      }
      pi = &(*_old_inodes)[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;

    // continue with the next-older segment.
    last = first-1;
  }
  if (_old_inodes)
    pin->reset_old_inodes(std::move(_old_inodes));
}
2002
/*
 * Send updated quota/rstat information to clients holding caps on this
 * quota root, and ask replica MDSs to gather caps back to us.
 *
 * Updates are throttled per-cap: a client is only re-notified when usage
 * has drifted meaningfully towards the configured limits since the last
 * message (see the shift heuristics below), unless it is being force-
 * updated via exclude_ct.
 *
 * @param in           the quota root inode
 * @param exclude_ct   if >= 0, force an update to every client EXCEPT this
 *                     one (skipping the drift heuristics)
 * @param quota_change true if the quota itself changed; bypasses the
 *                     is_enabled() check so a cleared quota is broadcast too
 */
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
  if (!(mds->is_active() || mds->is_stopping()))
    return;

  // only the auth, unfrozen inode broadcasts
  if (!in->is_auth() || in->is_frozen())
    return;

  const auto& pi = in->get_projected_inode();
  if (!pi->quota.is_enabled() && !quota_change)
    return;

  // create snaprealm for quota inode (quota was set before mimic)
  if (!in->get_projected_srnode())
    mds->server->create_quota_realm(in);

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    if (cap->is_noquota())
      continue;

    // forced refresh for everyone but the excluded client
    if (exclude_ct >= 0 && exclude_ct != p.first)
      goto update;

    // nothing changed since the last message to this client
    if (cap->last_rbytes == pi->rstat.rbytes &&
        cap->last_rsize == pi->rstat.rsize())
      continue;

    if (pi->quota.max_files > 0) {
      // already at/over the file limit: always notify
      if (pi->rstat.rsize() >= pi->quota.max_files)
        goto update;

      // notify if rsize moved more than 1/16 of the previous distance
      // to the file limit
      if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
          abs(cap->last_rsize - pi->rstat.rsize()))
        goto update;
    }

    if (pi->quota.max_bytes > 0) {
      // within the last 1/8 of the byte limit: always notify
      if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
        goto update;

      // notify if rbytes moved more than 1/16 of the previous distance
      // to the byte limit
      if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
          abs(cap->last_rbytes - pi->rstat.rbytes))
        goto update;
    }

    continue;

update:
    // remember what we told this client so the drift checks above work
    cap->last_rsize = pi->rstat.rsize();
    cap->last_rbytes = pi->rstat.rbytes;

    auto msg = make_message<MClientQuota>();
    msg->ino = in->ino();
    msg->rstat = pi->rstat;
    msg->quota = pi->quota;
    mds->send_message_client_counted(msg, cap->get_session());
  }
  // ask replicas to gather caps so future quota updates flow through us
  for (const auto &it : in->get_replicas()) {
    auto msg = make_message<MGatherCaps>();
    msg->ino = in->ino();
    mds->send_message_mds(msg, it.first);
  }
}
2067
2068 /*
2069 * NOTE: we _have_ to delay the scatter if we are called during a
2070 * rejoin, because we can't twiddle locks between when the
2071 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2073 * (no requests), and a survivor acks immediately. _except_ that
2074 * during rejoin_(weak|strong) processing, we may complete a lock
2075 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2076 * scatterlock state in that case or the lock states will get out of
2077 * sync between the auth and replica.
2078 *
2079 * the simple solution is to never do the scatter here. instead, put
2080 * the scatterlock on a list if it isn't already wrlockable. this is
2081 * probably the best plan anyway, since we avoid too many
2082 * scatters/locks under normal usage.
2083 */
2084 /*
2085 * some notes on dirlock/nestlock scatterlock semantics:
2086 *
2087 * the fragstat (dirlock) will never be updated without
2088 * dirlock+nestlock wrlock held by the caller.
2089 *
2090 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2091 * data is pushed up the tree. this could be changed with some
2092 * restructuring here, but in its current form we ensure that the
 * fragstat+rstat _always_ reflect an accurate summation over the dir
2094 * frag, which is nice. and, we only need to track frags that need to
2095 * be nudged (and not inodes with pending rstat changes that need to
2096 * be pushed into the frag). a consequence of this is that the
2097 * accounted_rstat on scatterlock sync may not match our current
2098 * rstat. this is normal and expected.
2099 */
/*
 * Project a pending change on 'in' up through its chain of parents,
 * updating each ancestor dirfrag's fragstat/rstat and each ancestor
 * inode's dirstat/rstat, and recording the dirtied metadata in 'blob'
 * for journaling.
 *
 * The walk ascends until it hits the base inode, a non-auth or
 * unpinnable ancestor, a nestlock/versionlock it may not take (in which
 * case the scatterlock is marked updated so the change propagates
 * later), or the dirstat propagation throttle.
 *
 * @param mut        mutation carrying locks, pins, and projections
 * @param blob       journal metablob to populate
 * @param in         inode being changed
 * @param parent     parent dirfrag; if null, derived from in's projected
 *                   parent dentry (primary_dn must be set)
 * @param flags      PREDIRTY_PRIMARY (in is linked by a primary dentry),
 *                   PREDIRTY_DIR (update parent fragstat mtime/size),
 *                   PREDIRTY_SHALLOW (NOTE(review): only referenced in
 *                   the debug log within this view — confirm intent)
 * @param linkunlink +1/-1 when a link is being added/removed, else 0
 * @param cfollows   snapid the change follows, or CEPH_NOSNAP
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
				       CInode *in, CDir *parent,
				       int flags, int linkunlink,
				       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  ceph_assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  // base inodes (root, mdsdir) have no parent to predirty
  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
	   << (do_parent_mtime ? " do_parent_mtime":"")
	   << " linkunlink=" << linkunlink
	   << (primary_dn ? " primary_dn":" remote_dn")
	   << (shallow ? " SHALLOW":"")
	   << " follows " << cfollows
	   << " " << *in << dendl;

  if (!parent) {
    ceph_assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  while (parent) {
    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
    ceph_assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);

    auto pf = parent->project_fnode(mut);
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      // these updates are only legal on the first (deepest) level, with
      // the frag's filelock and nestlock already wrlocked
      ceph_assert(mut->is_wrlocked(&pin->filelock));
      ceph_assert(mut->is_wrlocked(&pin->nestlock));
      ceph_assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
	pf->fragstat.mtime = mut->get_op_stamp();
	pf->fragstat.change_attr++;
	dout(10) << "predirty_journal_parents bumping fragstat change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
	if (pf->fragstat.mtime > pf->rstat.rctime) {
	  dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
	  pf->rstat.rctime = pf->fragstat.mtime;
	} else {
	  // op stamp is older than the recursive ctime; leave rctime alone
	  dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
	}
      }
      if (linkunlink) {
	dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
	if (in->is_dir()) {
	  pf->fragstat.nsubdirs += linkunlink;
	  //pf->rstat.rsubdirs += linkunlink;
	} else {
	  pf->fragstat.nfiles += linkunlink;
	  //pf->rstat.rfiles += linkunlink;
	}
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
				pin->versionlock.can_wrlock())) {
      // can't write the parent's nestlock right now; defer by marking the
      // inode's rstat dirty so a later pass pushes it into the frag
      dout(20) << " unwritable parent nestlock " << pin->nestlock
	       << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
	ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer());
      }

      if (!mut->is_wrlocked(&pin->nestlock)) {
	dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
	mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff into the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
	follows = prealm->get_newest_seq();

      // NOTE: this snapid_t 'first' shadows the outer bool 'first'
      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(mut, cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    if (!stop && !first &&
	g_conf()->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop
		 << " < " << g_conf()->mds_dirstat_min_interval
		 << ", stopping" << dendl;
	stop = true;
      } else {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    if (!stop &&
	!mut->is_wrlocked(&pin->nestlock) &&
	(!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
	 !mds->locker->wrlock_try(&pin->nestlock, mut)
	 )) { // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
	       << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      // propagation stops here; mark the scatterlocks updated so the
      // pending deltas are pushed upward later (see header comments)
      dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
	mds->locker->mark_updated_scatterlock(&pin->filelock);
	mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
	mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->is_wrlocked(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    lsi.push_front(pin);

    pin->pre_cow_old_inode(); // avoid cow mayhem!

    auto pi = pin->project_inode(mut);
    pi.inode->version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
	pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime;
      if (touched_chattr)
	pi.inode->change_attr++;
      dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
	// single-frag dir: inode dirstat must mirror the frag exactly
	if (pi.inode->dirstat.size() < 0)
	  ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
	if (pi.inode->dirstat.size() != pf->fragstat.size()) {
	  mds->clog->error() << "unmatched fragstat size on single dirfrag "
	     << parent->dirfrag() << ", inode has " << pi.inode->dirstat
	     << ", dirfrag has " << pf->fragstat;

	  // trust the dirfrag for now
	  pi.inode->dirstat = pf->fragstat;

	  ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
	}
      }
    }

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf()->mds_snap_rstat) {
      // push per-snapshot rstat deltas into the corresponding old inodes
      for (auto &p : parent->dirty_old_rstat) {
	project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
				    p.first, pin, true);
      }
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi.inode->rstat.rbytes != pf->rstat.rbytes) {
	mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
	  << parent->dirfrag() << ", inode has " << pi.inode->rstat
	  << ", dirfrag has " << pf->rstat;

	// trust the dirfrag for now
	pi.inode->rstat = pf->rstat;

	ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    if (pin->is_base())
      break;
    // next parent!
    cur = pin;
    parentdn = pin->get_projected_parent_dn();
    ceph_assert(parentdn);
    parent = parentdn->get_dir();
    // only the deepest level carries the fragstat/link-count change
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  ceph_assert(parent);
  ceph_assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (const auto& in : lsi) {
    journal_dirty_inode(mut.get(), blob, in);
  }

}
2367
2368
2369
2370
2371
2372 // ===================================
2373 // peer requests
2374
2375
2376 /*
2377 * some handlers for leader requests with peers. we need to make
2378 * sure leader journal commits before we forget we leadered them and
2379 * remove them from the uncommitted_leaders map (used during recovery
2380 * to commit|abort peers).
2381 */
// Journal completion context: runs once the ECommitted event for a leader
// request is durable, then finishes the commit bookkeeping.
struct C_MDC_CommittedLeader : public MDCacheLogContext {
  metareqid_t reqid;  // leader request being committed
  C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_leader_commit(reqid);
  }
};
2389
// Journal an ECommitted event for a leader request; once it is durable,
// C_MDC_CommittedLeader drops the request from uncommitted_leaders.
void MDCache::log_leader_commit(metareqid_t reqid)
{
  dout(10) << "log_leader_commit " << reqid << dendl;
  uncommitted_leaders[reqid].committing = true;
  mds->mdlog->start_submit_entry(new ECommitted(reqid),
				 new C_MDC_CommittedLeader(this, reqid));
}
2397
2398 void MDCache::_logged_leader_commit(metareqid_t reqid)
2399 {
2400 dout(10) << "_logged_leader_commit " << reqid << dendl;
2401 ceph_assert(uncommitted_leaders.count(reqid));
2402 uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2403 mds->queue_waiters(uncommitted_leaders[reqid].waiters);
2404 uncommitted_leaders.erase(reqid);
2405 }
2406
2407 // while active...
2408
2409 void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from)
2410 {
2411 dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl;
2412 ceph_assert(uncommitted_leaders.count(r));
2413 uncommitted_leaders[r].peers.erase(from);
2414 if (!uncommitted_leaders[r].recovering && uncommitted_leaders[r].peers.empty())
2415 log_leader_commit(r);
2416 }
2417
2418 void MDCache::logged_leader_update(metareqid_t reqid)
2419 {
2420 dout(10) << "logged_leader_update " << reqid << dendl;
2421 ceph_assert(uncommitted_leaders.count(reqid));
2422 uncommitted_leaders[reqid].safe = true;
2423 auto p = pending_leaders.find(reqid);
2424 if (p != pending_leaders.end()) {
2425 pending_leaders.erase(p);
2426 if (pending_leaders.empty())
2427 process_delayed_resolve();
2428 }
2429 }
2430
2431 /*
2432 * Leader may crash after receiving all peers' commit acks, but before journalling
2433 * the final commit. Peers may crash after journalling the peer commit, but before
2434 * sending commit ack to the leader. Commit leaders with no uncommitted peer when
2435 * resolve finishes.
2436 */
2437 void MDCache::finish_committed_leaders()
2438 {
2439 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
2440 p != uncommitted_leaders.end();
2441 ++p) {
2442 p->second.recovering = false;
2443 if (!p->second.committing && p->second.peers.empty()) {
2444 dout(10) << "finish_committed_leaders " << p->first << dendl;
2445 log_leader_commit(p->first);
2446 }
2447 }
2448 }
2449
2450 /*
2451 * at end of resolve... we must journal a commit|abort for all peer
2452 * updates, before moving on.
2453 *
2454 * this is so that the leader can safely journal ECommitted on ops it
2455 * leaders when it reaches up:active (all other recovering nodes must
2456 * complete resolve before that happens).
2457 */
// Journal completion context: runs once our peer-side commit record is
// durable, then acks the commit back to the leader.
struct C_MDC_PeerCommit : public MDCacheLogContext {
  mds_rank_t from;    // leader mds awaiting our OP_COMMITTED ack
  metareqid_t reqid;  // the peer request that committed
  C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_peer_commit(from, reqid);
  }
};
2466
// Our peer-side commit is journaled; send OP_COMMITTED to the leader so
// it can count our ack (see committed_leader_peer on the leader side).
void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid)
{
  dout(10) << "_logged_peer_commit from mds." << from << " " << reqid << dendl;

  // send a message
  auto req = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED);
  mds->send_message_mds(req, from);
}
2475
2476
2477
2478
2479
2480
2481 // ====================================================================
2482 // import map, recovery
2483
2484 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2485 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2486 {
2487 if (subtrees.count(oldparent)) {
2488 vector<dirfrag_t>& v = subtrees[oldparent];
2489 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2490 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2491 if (*it == df) {
2492 v.erase(it);
2493 break;
2494 }
2495 }
2496 if (subtrees.count(newparent)) {
2497 vector<dirfrag_t>& v = subtrees[newparent];
2498 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2499 v.push_back(df);
2500 }
2501 }
2502
/*
 * Build an ESubtreeMap journal event describing our current auth
 * subtrees and their bounds (plus a spanning tree of dirfrags tying
 * them to the root), adjusted for any projected renames and simplified
 * so replay sees a minimal, unambiguous map.
 *
 * Returns a started (but not yet submitted) log entry; the caller owns
 * submission.
 */
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
	   << num_subtrees_fullauth() << " fullauth"
	   << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  // every dirfrag mentioned in the map gets journaled (with context) below
  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (auto& [dir, bounds] : subtrees) {
    // journal subtree as "ours" if we are
    // me, -2
    // me, me
    // me, !me (may be importing and ambiguous!)

    // so not
    // !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
	my_ambiguous_imports.count(dir->dirfrag())) {
      // mid-import: mark so replay knows auth is not yet settled
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " auth subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds
    size_t nbounds = bounds.size();
    if (nbounds > 3) {
      // avoid log spam for subtrees with many bounds
      dout(15) << " subtree has " << nbounds << " bounds" << dendl;
    }
    for (auto& bound : bounds) {
      if (nbounds <= 3) {
	dout(15) << " subtree bound " << *bound << dendl;
      }
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (const auto& [diri, renames] : projected_subtree_renames) {
    for (const auto& [olddir, newdir] : renames) {
      dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      auto&& dfls = diri->get_dirfrags();
      for (const auto& dir : dfls) {
	dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
	CDir *oldparent = get_projected_subtree_root(olddir);
	dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
	CDir *newparent = get_projected_subtree_root(newdir);
	dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

	if (oldparent == newparent) {
	  dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
		   << oldparent->dirfrag() << dendl;
	  continue;
	}

	if (dir->is_subtree_root()) {
	  if (le->subtrees.count(newparent->dirfrag()) &&
	      oldparent->get_dir_auth() != newparent->get_dir_auth())
	    dirs_to_add[dir->dirfrag()] = dir;
	  // children are fine. change parent.
	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				  le->subtrees);
	} else {
	  // mid-subtree.

	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
	    // the rename crosses an auth boundary, so this dirfrag
	    // becomes a subtree root in the projected map
	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
	    // if oldparent is auth, subtree is mine; include it.
	    if (le->subtrees.count(oldparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[dir->dirfrag()].clear();
	    }
	    // if newparent is auth, subtree is a new bound
	    if (le->subtrees.count(newparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
	    }
	    newparent = dir;
	  }

	  // see if any old bounds move to the new parent.
	  for (auto& bound : subtrees.at(oldparent)) {
	    if (dir->contains(bound->get_parent_dir()))
	      _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				      le->subtrees);
	  }
	}
      }
    }
  }

  // simplify the journaled map. our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing. but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (auto& [frag, bfrags] : le->subtrees) {
    if (le->ambiguous_subtrees.count(frag))
      continue;
    unsigned i = 0;
    while (i < bfrags.size()) {
      dirfrag_t b = bfrags[i];
      if (le->subtrees.count(b) &&
	  le->ambiguous_subtrees.count(b) == 0) {
	// b is itself an (unambiguous) journaled subtree: merge it into
	// frag by inheriting its bounds and dropping it from the map.
	// (erasing b from the std::map only invalidates b's iterator,
	// so the outer range-for remains valid)
	auto& bb = le->subtrees.at(b);
	dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
	for (auto& r : bb) {
	  bfrags.push_back(r);
	}
	dirs_to_add.erase(b);
	le->subtrees.erase(b);
	bfrags.erase(bfrags.begin() + i);
      } else {
	++i;
      }
    }
  }

  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
2653
2654 void MDCache::dump_resolve_status(Formatter *f) const
2655 {
2656 f->open_object_section("resolve_status");
2657 f->dump_stream("resolve_gather") << resolve_gather;
2658 f->dump_stream("resolve_ack_gather") << resolve_gather;
2659 f->close_section();
2660 }
2661
/*
 * Enter the resolve phase.  Records the completion callback, resets the
 * root subtree's authority to UNKNOWN if we are not the root mds (so
 * mds0's resolve can reclaim exactly what it owns), and initializes the
 * set of peers we must gather resolves from.
 *
 * NOTE(review): dereferences `root` unconditionally — presumably the
 * root inode is always instantiated by this point; confirm.
 */
void MDCache::resolve_start(MDSContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  ceph_assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN. during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // unknown.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  // we must hear a resolve from every other recovering/surviving mds
  resolve_gather = recovery_set;

  // snap table updates we have journaled but not yet seen committed
  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
2681
/*
 * Drive resolve-phase messaging: (re)send peer resolves first, then —
 * once all resolve acks and rollbacks are in, and (for a survivor) the
 * snap cache has synced — send the subtree resolves.
 */
void MDCache::send_resolves()
{
  send_peer_resolves();

  if (!resolve_done) {
    // I'm survivor: refresh snap cache
    mds->snapclient->sync(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    // re-check the gather sets once the cache is synced
	    maybe_finish_peer_resolve();
	    })
	  )
	);
    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
    return;
  }
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
	     << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!resolve_need_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
	     << resolve_need_rollback << ")" << dendl;
    return;
  }

  send_subtree_resolves();
}
2711
/*
 * Tell each leader mds about the peer updates we hold for it, so it can
 * decide to commit or abort them.
 *
 * If we are in the resolve state ourselves, everything we know about is
 * in uncommitted_peers (recovered from the journal).  Otherwise (we are
 * a survivor) we scan active_requests for prepared/committing peer
 * requests whose leader is resolving, or whose update is ambiguous.
 * Every mds we send to is added to resolve_ack_gather.
 */
void MDCache::send_peer_resolves()
{
  dout(10) << "send_peer_resolves" << dendl;

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;

  if (mds->is_resolve()) {
    // recovering: report all journaled, uncommitted peer updates
    for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
	 p != uncommitted_peers.end();
	 ++p) {
      mds_rank_t leader = p->second.leader;
      auto &m = resolves[leader];
      if (!m) m = make_message<MMDSResolve>();
      m->add_peer_request(p->first, false);
    }
  } else {
    // survivor: only report to leaders that are themselves resolving
    // (or for updates whose outcome is ambiguous)
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (!mdr->is_peer())
	continue;
      // only requests that reached prepare (or are committing) matter
      if (!mdr->peer_did_prepare() && !mdr->committing) {
	continue;
      }
      mds_rank_t leader = mdr->peer_to_mds;
      if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) {
	dout(10) << " including uncommitted " << *mdr << dendl;
	if (!resolves.count(leader))
	  resolves[leader] = make_message<MMDSResolve>();
	if (!mdr->committing &&
	    mdr->has_more() && mdr->more()->is_inode_exporter) {
	  // re-send cap exports
	  CInode *in = mdr->more()->rename_inode;
	  map<client_t, Capability::Export> cap_map;
	  in->export_client_caps(cap_map);
	  bufferlist bl;
	  MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
	  encode(inode_caps, bl);
	  resolves[leader]->add_peer_request(p->first, bl);
	} else {
	  resolves[leader]->add_peer_request(p->first, mdr->committing);
	}
      }
    }
  }

  for (auto &p : resolves) {
    dout(10) << "sending peer resolve to mds." << p.first << dendl;
    mds->send_message_mds(p.second, p.first);
    // expect an ack from each leader we reported to
    resolve_ack_gather.insert(p.first);
  }
}
2767
2768 void MDCache::send_subtree_resolves()
2769 {
2770 dout(10) << "send_subtree_resolves" << dendl;
2771
2772 if (migrator->is_exporting() || migrator->is_importing()) {
2773 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2774 migrator->show_importing();
2775 migrator->show_exporting();
2776 resolves_pending = true;
2777 return; // not now
2778 }
2779
2780 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2781 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2782 p != recovery_set.end();
2783 ++p) {
2784 if (*p == mds->get_nodeid())
2785 continue;
2786 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2787 resolves[*p] = make_message<MMDSResolve>();
2788 }
2789
2790 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2791 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2792
2793 // known
2794 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2795 p != subtrees.end();
2796 ++p) {
2797 CDir *dir = p->first;
2798
2799 // only our subtrees
2800 if (dir->authority().first != mds->get_nodeid())
2801 continue;
2802
2803 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2804 continue; // we'll add it below
2805
2806 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2807 // ambiguous (mid-import)
2808 set<CDir*> bounds;
2809 get_subtree_bounds(dir, bounds);
2810 vector<dirfrag_t> dfls;
2811 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2812 dfls.push_back((*q)->dirfrag());
2813
2814 my_ambig_imports[dir->dirfrag()] = dfls;
2815 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2816 } else {
2817 // not ambiguous.
2818 for (auto &q : resolves) {
2819 resolves[q.first]->add_subtree(dir->dirfrag());
2820 }
2821 // bounds too
2822 vector<dirfrag_t> dfls;
2823 for (set<CDir*>::iterator q = subtrees[dir].begin();
2824 q != subtrees[dir].end();
2825 ++q) {
2826 CDir *bound = *q;
2827 dfls.push_back(bound->dirfrag());
2828 }
2829
2830 my_subtrees[dir->dirfrag()] = dfls;
2831 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2832 }
2833 }
2834
2835 // ambiguous
2836 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2837 p != my_ambiguous_imports.end();
2838 ++p) {
2839 my_ambig_imports[p->first] = p->second;
2840 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2841 }
2842
2843 // simplify the claimed subtree.
2844 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2845 unsigned i = 0;
2846 while (i < p->second.size()) {
2847 dirfrag_t b = p->second[i];
2848 if (my_subtrees.count(b)) {
2849 vector<dirfrag_t>& bb = my_subtrees[b];
2850 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2851 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2852 p->second.push_back(*r);
2853 my_subtrees.erase(b);
2854 p->second.erase(p->second.begin() + i);
2855 } else {
2856 ++i;
2857 }
2858 }
2859 }
2860
2861 // send
2862 for (auto &p : resolves) {
2863 const ref_t<MMDSResolve> &m = p.second;
2864 if (mds->is_resolve()) {
2865 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2866 } else {
2867 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2868 }
2869 m->subtrees = my_subtrees;
2870 m->ambiguous_imports = my_ambig_imports;
2871 dout(10) << "sending subtee resolve to mds." << p.first << dendl;
2872 mds->send_message_mds(m, p.first);
2873 }
2874 resolves_pending = false;
2875 }
2876
// Called when a resolve ack arrives, a rollback commits, or the snap
// cache syncs: if nothing remains outstanding, proceed to subtree
// resolves and run any resolves we had delayed.
void MDCache::maybe_finish_peer_resolve() {
  if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
    // snap cache get synced or I'm in resolve state
    if (mds->snapclient->is_synced() || resolve_done)
      send_subtree_resolves();
    process_delayed_resolve();
  }
}
2885
2886 void MDCache::handle_mds_failure(mds_rank_t who)
2887 {
2888 dout(7) << "handle_mds_failure mds." << who << dendl;
2889
2890 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2891
2892 resolve_gather.insert(who);
2893 discard_delayed_resolve(who);
2894 ambiguous_peer_updates.erase(who);
2895
2896 rejoin_gather.insert(who);
2897 rejoin_sent.erase(who); // i need to send another
2898 rejoin_ack_sent.erase(who); // i need to send another
2899 rejoin_ack_gather.erase(who); // i'll need/get another.
2900
2901 dout(10) << " resolve_gather " << resolve_gather << dendl;
2902 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2903 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2904 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2905 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2906
2907
2908 // tell the migrator too.
2909 migrator->handle_mds_failure_or_stop(who);
2910
2911 // tell the balancer too.
2912 mds->balancer->handle_mds_failure(who);
2913
2914 // clean up any requests peer to/from this node
2915 list<MDRequestRef> finish;
2916 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2917 p != active_requests.end();
2918 ++p) {
2919 MDRequestRef& mdr = p->second;
2920 // peer to the failed node?
2921 if (mdr->peer_to_mds == who) {
2922 if (mdr->peer_did_prepare()) {
2923 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2924 if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
2925 remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);
2926
2927 if (!mdr->more()->waiting_on_peer.empty()) {
2928 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2929 // will rollback, no need to wait
2930 mdr->reset_peer_request();
2931 mdr->more()->waiting_on_peer.clear();
2932 }
2933 } else if (!mdr->committing) {
2934 dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
2935 if (mdr->peer_request || mdr->peer_rolling_back())
2936 mdr->aborted = true;
2937 else
2938 finish.push_back(mdr);
2939 }
2940 }
2941
2942 if (mdr->is_peer() && mdr->peer_did_prepare()) {
2943 if (mdr->more()->waiting_on_peer.count(who)) {
2944 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2945 dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds."
2946 << who << dendl;
2947 mdr->more()->waiting_on_peer.erase(who);
2948 if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
2949 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2950 }
2951
2952 if (mdr->more()->srcdn_auth_mds == who &&
2953 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
2954 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2955 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2956 add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
2957 }
2958 } else if (mdr->peer_request) {
2959 const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
2960 // FIXME: Peer rename request can arrive after we notice mds failure.
2961 // This can cause mds to crash (does not affect integrity of FS).
2962 if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
2963 peer_req->srcdn_auth == who)
2964 peer_req->mark_interrupted();
2965 }
2966
2967 // failed node is peer?
2968 if (mdr->is_leader() && !mdr->committing) {
2969 if (mdr->more()->srcdn_auth_mds == who) {
2970 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
2971 << who << " to recover" << dendl;
2972 ceph_assert(mdr->more()->witnessed.count(who) == 0);
2973 if (mdr->more()->is_ambiguous_auth)
2974 mdr->clear_ambiguous_auth();
2975 // rename srcdn's auth mds failed, all witnesses will rollback
2976 mdr->more()->witnessed.clear();
2977 pending_leaders.erase(p->first);
2978 }
2979
2980 if (mdr->more()->witnessed.count(who)) {
2981 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2982 if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
2983 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
2984 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2985 // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
2986 // until either the request is committing or the peer also fails.
2987 ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
2988 pending_leaders.insert(p->first);
2989 } else {
2990 dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
2991 << who << " to recover" << dendl;
2992 if (srcdn_auth >= 0)
2993 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2994
2995 // discard this peer's prepare (if any)
2996 mdr->more()->witnessed.erase(who);
2997 }
2998 }
2999
3000 if (mdr->more()->waiting_on_peer.count(who)) {
3001 dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
3002 << " to recover" << dendl;
3003 // retry request when peer recovers
3004 mdr->more()->waiting_on_peer.erase(who);
3005 if (mdr->more()->waiting_on_peer.empty())
3006 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3007 }
3008
3009 if (mdr->locking && mdr->locking_target_mds == who)
3010 mdr->finish_locking(mdr->locking);
3011 }
3012 }
3013
3014 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
3015 p != uncommitted_leaders.end();
3016 ++p) {
3017 // The failed MDS may have already committed the peer update
3018 if (p->second.peers.count(who)) {
3019 p->second.recovering = true;
3020 p->second.peers.erase(who);
3021 }
3022 }
3023
3024 while (!finish.empty()) {
3025 dout(10) << "cleaning up peer request " << *finish.front() << dendl;
3026 request_finish(finish.front());
3027 finish.pop_front();
3028 }
3029
3030 kick_find_ino_peers(who);
3031 kick_open_ino_peers(who);
3032
3033 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3034 p != fragments.end(); ) {
3035 dirfrag_t df = p->first;
3036 fragment_info_t& info = p->second;
3037
3038 if (info.is_fragmenting()) {
3039 if (info.notify_ack_waiting.erase(who) &&
3040 info.notify_ack_waiting.empty()) {
3041 fragment_drop_locks(info);
3042 fragment_maybe_finish(p++);
3043 } else {
3044 ++p;
3045 }
3046 continue;
3047 }
3048
3049 ++p;
3050 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3051 std::vector<CDir*> dirs;
3052 info.dirs.swap(dirs);
3053 fragments.erase(df);
3054 fragment_unmark_unfreeze_dirs(dirs);
3055 }
3056
3057 // MDCache::shutdown_export_strays() always exports strays to mds.0
3058 if (who == mds_rank_t(0))
3059 shutdown_exporting_strays.clear();
3060
3061 show_subtrees();
3062 }
3063
3064 /*
3065 * handle_mds_recovery - called on another node's transition
3066 * from resolve -> active.
3067 */
// Another rank transitioned resolve -> active: wake everything that was
// waiting on it.  Walks every subtree whose authority is the recovered
// rank, collects dir/dentry/inode waiters, and re-queues them.
void MDCache::handle_mds_recovery(mds_rank_t who)
{
  dout(7) << "handle_mds_recovery mds." << who << dendl;

  // exclude all discover waiters. kick_discovers() will do the job
  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;

  MDSContext::vec waiters;

  // wake up any waiters in their subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only care about subtrees authoritatively owned by the recovered rank
    // (and not pending migration to us)
    if (dir->authority().first != who ||
        dir->authority().second == mds->get_nodeid())
      continue;
    ceph_assert(!dir->is_auth());

    // wake any waiters: breadth-first walk of the subtree, stopping at
    // nested subtree roots (they have a different authority)
    std::queue<CDir*> q;
    q.push(dir);

    while (!q.empty()) {
      CDir *d = q.front();
      q.pop();
      d->take_waiting(d_mask, waiters);

      // inode waiters too
      for (auto &p : d->items) {
        CDentry *dn = p.second;
        CDentry::linkage_t *dnl = dn->get_linkage();
        if (dnl->is_primary()) {
          dnl->get_inode()->take_waiting(i_mask, waiters);

          // recurse?
          auto&& ls = dnl->get_inode()->get_dirfrags();
          for (const auto& subdir : ls) {
            if (!subdir->is_subtree_root())
              q.push(subdir);
          }
        }
      }
    }
  }

  kick_open_ino_peers(who);
  kick_find_ino_peers(who);

  // queue them up.
  mds->queue_waiters(waiters);
}
3122
// Record the set of ranks participating in recovery (the mds cluster at the
// time of the first failure).  Replaces any previous set wholesale.
void MDCache::set_recovery_set(set<mds_rank_t>& s)
{
  dout(7) << "set_recovery_set " << s << dendl;
  recovery_set = s;
}
3128
3129
/*
 * during resolve state, we share resolves to determine who
 * is authoritative for which trees. we expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function may queue the passed message for later processing before returning.
 */
// Process a subtree-resolve message from another rank: answer ambiguous
// peer requests (commit/abort), settle our own ambiguous imports against
// the sender's subtree claims, absorb the sender's subtree map and
// ambiguous-import list, and note its pending snaptable commits.
void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
{
  dout(7) << "handle_resolve from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
      return;
    }
    // wait until we reach the resolve stage!
    return;
  }

  // any previously-delayed copy of this rank's resolve is superseded
  discard_delayed_resolve(from);

  // ambiguous peer requests?
  if (!m->peer_requests.empty()) {
    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
      // survivor: can't answer for updates whose ECommitted isn't safe yet
      for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) {
        if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) {
          ceph_assert(!p->second.committing);
          pending_leaders.insert(p->first);
        }
      }

      if (!pending_leaders.empty()) {
        dout(10) << " still have pending updates, delay processing peer resolve" << dendl;
        delayed_resolve[from] = m;
        return;
      }
    }

    // answer each ambiguous peer request with COMMIT or ABORT
    auto ack = make_message<MMDSResolveAck>();
    for (const auto &p : m->peer_requests) {
      if (uncommitted_leaders.count(p.first)) {  //mds->sessionmap.have_completed_request(p.first)) {
        // COMMIT
        if (p.second.committing) {
          // already committing, waiting for the OP_COMMITTED peer reply
          dout(10) << " already committing peer request " << p << " noop "<< dendl;
        } else {
          dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl;
          ack->add_commit(p.first);
        }
        uncommitted_leaders[p.first].peers.insert(from);  // wait for peer OP_COMMITTED before we log ECommitted

        if (p.second.inode_caps.length() > 0) {
          // peer wants to export caps (rename)
          ceph_assert(mds->is_resolve());
          MMDSResolve::peer_inode_cap inode_caps;
          auto q = p.second.inode_caps.cbegin();
          decode(inode_caps, q);
          inodeno_t ino = inode_caps.ino;
          map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
          ceph_assert(get_inode(ino));

          // register an import for each exported client cap; new cap IDs
          // are assigned here and echoed back to the peer below
          for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
               q != cap_exports.end();
               ++q) {
            Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
            im.cap_id = ++last_cap_id; // assign a new cap ID
            im.issue_seq = 1;
            im.mseq = q->second.mseq;

            Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
            if (session)
              rejoin_client_map.emplace(q->first, session->info.inst);
          }

          // will process these caps in rejoin stage
          rejoin_peer_exports[ino].first = from;
          rejoin_peer_exports[ino].second.swap(cap_exports);

          // send information of imported caps back to peer
          encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
        }
      } else {
        // ABORT
        dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl;
        ceph_assert(!p.second.committing);
        ack->add_abort(p.first);
      }
    }
    mds->send_message(ack, m->get_connection());
    return;
  }

  // subtree resolves must wait until our own peer-resolve work is done
  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
    dout(10) << "delay processing subtree resolve" << dendl;
    delayed_resolve[from] = m;
    return;
  }

  bool survivor = false;
  // am i a surviving ambiguous importer?
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // check for any import success/failure (from this node)
    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
    while (p != my_ambiguous_imports.end()) {
      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
      ++next;
      CDir *dir = get_dirfrag(p->first);
      ceph_assert(dir);
      dout(10) << "checking ambiguous import " << *dir << dendl;
      if (migrator->is_importing(dir->dirfrag()) &&
          migrator->get_import_peer(dir->dirfrag()) == from) {
        ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);

        // check if sender claims the subtree
        bool claimed_by_sender = false;
        for (const auto &q : m->subtrees) {
          // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
          CDir *base = get_force_dirfrag(q.first, false);
          if (!base || !base->contains(dir))
            continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.

          // the sender claims dir only if dir lies strictly inside the
          // claimed subtree, i.e. inside the base but not beyond a bound
          bool inside = true;
          set<CDir*> bounds;
          get_force_dirfrag_bound_set(q.second, bounds);
          for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
            CDir *bound = *p;
            if (bound->contains(dir)) {
              inside = false;  // nope, bound is dir or parent of dir, not inside.
              break;
            }
          }
          if (inside)
            claimed_by_sender = true;
        }

        my_ambiguous_imports.erase(p);  // no longer ambiguous.
        if (claimed_by_sender) {
          // the exporter still owns it: our import did not complete
          dout(7) << "ambiguous import failed on " << *dir << dendl;
          migrator->import_reverse(dir);
        } else {
          dout(7) << "ambiguous import succeeded on " << *dir << dendl;
          migrator->import_finish(dir, true);
        }
      }
      p = next;
    }
  }

  // update my dir_auth values
  //   need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
  //   migrations between other nodes)
  for (const auto& p : m->subtrees) {
    dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
    CDir *dir = get_force_dirfrag(p.first, !survivor);
    if (!dir)
      continue;
    adjust_bounded_subtree_auth(dir, p.second, from);
    try_subtree_merge(dir);
  }

  show_subtrees();

  // note ambiguous imports too
  for (const auto& p : m->ambiguous_imports) {
    dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
    other_ambiguous_imports[from][p.first] = p.second;
  }

  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
  for (const auto& p : m->table_clients) {
    dout(10) << " noting " << get_mdstable_name(p.type)
             << " pending_commits " << p.pending_commits << dendl;
    MDSTableClient *client = mds->get_table_client(p.type);
    for (const auto& q : p.pending_commits)
      client->notify_commit(q);
  }

  // did i get them all?
  resolve_gather.erase(from);

  maybe_resolve_finish();
}
3317
3318 void MDCache::process_delayed_resolve()
3319 {
3320 dout(10) << "process_delayed_resolve" << dendl;
3321 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3322 tmp.swap(delayed_resolve);
3323 for (auto &p : tmp) {
3324 handle_resolve(p.second);
3325 }
3326 }
3327
3328 void MDCache::discard_delayed_resolve(mds_rank_t who)
3329 {
3330 delayed_resolve.erase(who);
3331 }
3332
// Finish the resolve stage once subtree resolves from every recovering
// peer have arrived.  Must only be called after all resolve acks and
// rollbacks are done (asserted below).
void MDCache::maybe_resolve_finish()
{
  ceph_assert(resolve_ack_gather.empty());
  ceph_assert(resolve_need_rollback.empty());

  if (!resolve_gather.empty()) {
    dout(10) << "maybe_resolve_finish still waiting for resolves ("
             << resolve_gather << ")" << dendl;
    return;
  }

  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
  // settle subtree ownership, then wrap up committed leader updates
  disambiguate_my_imports();
  finish_committed_leaders();

  if (resolve_done) {
    // recovering rank: clean the cache and recompute auth bits before
    // completing the resolve-state transition
    ceph_assert(mds->is_resolve());
    trim_unlinked_inodes();
    recalc_auth_bits(false);
    resolve_done.release()->complete(0);
  } else {
    // I am survivor.
    maybe_send_pending_rejoins();
  }
}
3358
// Process a resolve-ack from another rank: for each of our ambiguous peer
// requests it tells us to either COMMIT (journal an EPeerUpdate commit, or
// finish the surviving MDRequest) or ABORT (roll the prepared update back).
void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
{
  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // ignore stale acks: we must still be expecting one from this rank,
  // and the rank must still be at least in resolve
  if (!resolve_ack_gather.count(from) ||
      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
    return;
  }

  if (ambiguous_peer_updates.count(from)) {
    // only a surviving rank tracks ambiguous peer updates
    ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
    ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  }

  for (const auto &p : ack->commit) {
    dout(10) << " commit on peer " << p.first << dendl;

    if (ambiguous_peer_updates.count(from)) {
      remove_ambiguous_peer_update(p.first, from);
      continue;
    }

    if (mds->is_resolve()) {
      // replay
      MDPeerUpdate *su = get_uncommitted_peer(p.first, from);
      ceph_assert(su);

      // log commit
      mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from,
                                                     EPeerUpdate::OP_COMMIT, su->origop),
                                     new C_MDC_PeerCommit(this, from, p.first));
      mds->mdlog->flush();

      finish_uncommitted_peer(p.first);
    } else {
      // survivor: the request is still in flight; pass back any cap-import
      // info and finish it
      MDRequestRef mdr = request_get(p.first);
      // information about leader imported caps
      if (p.second.length() > 0)
        mdr->more()->inode_import.share(p.second);

      ceph_assert(mdr->peer_request == 0);  // shouldn't be doing anything!
      request_finish(mdr);
    }
  }

  for (const auto &metareq : ack->abort) {
    dout(10) << " abort on peer " << metareq << dendl;

    if (mds->is_resolve()) {
      MDPeerUpdate *su = get_uncommitted_peer(metareq, from);
      ceph_assert(su);

      // perform rollback (and journal a rollback entry)
      // note: this will hold up the resolve a bit, until the rollback entries journal.
      MDRequestRef null_ref;
      switch (su->origop) {
      case EPeerUpdate::LINK:
        mds->server->do_link_rollback(su->rollback, from, null_ref);
        break;
      case EPeerUpdate::RENAME:
        mds->server->do_rename_rollback(su->rollback, from, null_ref);
        break;
      case EPeerUpdate::RMDIR:
        mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
        break;
      default:
        ceph_abort();
      }
    } else {
      MDRequestRef mdr = request_get(metareq);
      mdr->aborted = true;
      if (mdr->peer_request) {
        if (mdr->peer_did_prepare())  // journaling peer prepare ?
          add_rollback(metareq, from);
      } else {
        request_finish(mdr);
      }
    }
  }

  // done with this rank's ack unless ambiguous updates are still pending
  if (!ambiguous_peer_updates.count(from)) {
    resolve_ack_gather.erase(from);
    maybe_finish_peer_resolve();
  }
}
3445
3446 void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su)
3447 {
3448 auto const &ret = uncommitted_peers.emplace(std::piecewise_construct,
3449 std::forward_as_tuple(reqid),
3450 std::forward_as_tuple());
3451 ceph_assert(ret.second);
3452 ls->uncommitted_peers.insert(reqid);
3453 upeer &u = ret.first->second;
3454 u.leader = leader;
3455 u.ls = ls;
3456 u.su = su;
3457 if (su == nullptr) {
3458 return;
3459 }
3460 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3461 uncommitted_peer_rename_olddir[*p]++;
3462 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3463 uncommitted_peer_unlink[*p]++;
3464 }
3465
// Drop the uncommitted-peer record for 'reqid' (after commit or rollback):
// wake its waiters, unpin the inodes the update touched, and trim any
// now-orphaned non-auth subtrees / unlinked inodes.
void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
{
  auto it = uncommitted_peers.find(reqid);
  if (it == uncommitted_peers.end()) {
    ceph_assert(!assert_exist);
    return;
  }
  upeer &u = it->second;
  MDPeerUpdate* su = u.su;

  if (!u.waiters.empty()) {
    mds->queue_waiters(u.waiters);
  }
  u.ls->uncommitted_peers.erase(reqid);
  uncommitted_peers.erase(it);

  if (su == nullptr) {
    // no associated peer update: nothing pinned, nothing to trim
    return;
  }
  // discard the non-auth subtree we renamed out of
  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
    CInode *diri = *p;
    map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);
    ceph_assert(it != uncommitted_peer_rename_olddir.end());
    it->second--;
    if (it->second == 0) {
      // last reference: try to trim the subtree(s) rooted here if their
      // authority is undefined (we renamed them away)
      uncommitted_peer_rename_olddir.erase(it);
      auto&& ls = diri->get_dirfrags();
      for (const auto& dir : ls) {
        CDir *root = get_subtree_root(dir);
        if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
          try_trim_non_auth_subtree(root);
          if (dir != root)
            break;
        }
      }
    } else
      ceph_assert(it->second > 0);
  }
  // remove the inodes that were unlinked by peer update
  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
    CInode *in = *p;
    map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);
    ceph_assert(it != uncommitted_peer_unlink.end());
    it->second--;
    if (it->second == 0) {
      uncommitted_peer_unlink.erase(it);
      // truly unlinked (no projected parent): drop it and everything below
      if (!in->get_projected_parent_dn())
        mds->mdcache->remove_inode_recursive(in);
    } else
      ceph_assert(it->second > 0);
  }
  delete su;
}
3520
3521 MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader)
3522 {
3523
3524 MDPeerUpdate* su = nullptr;
3525 auto it = uncommitted_peers.find(reqid);
3526 if (it != uncommitted_peers.end() &&
3527 it->second.leader == leader) {
3528 su = it->second.su;
3529 }
3530 return su;
3531 }
3532
// Called once the rollback for 'reqid' has been performed: drop the
// uncommitted-peer record, clear the rollback bookkeeping, and re-check
// whether the peer resolve stage can now complete.
void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
  auto p = resolve_need_rollback.find(reqid);
  ceph_assert(p != resolve_need_rollback.end());
  if (mds->is_resolve()) {
    // recovering: the record may or may not exist
    finish_uncommitted_peer(reqid, false);
  } else if (mdr) {
    // survivor: the record must exist iff the peer prepare was journaled
    finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled);
  }
  resolve_need_rollback.erase(p);
  maybe_finish_peer_resolve();
}
3544
3545 void MDCache::disambiguate_other_imports()
3546 {
3547 dout(10) << "disambiguate_other_imports" << dendl;
3548
3549 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3550 // other nodes' ambiguous imports
3551 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3552 p != other_ambiguous_imports.end();
3553 ++p) {
3554 mds_rank_t who = p->first;
3555 dout(10) << "ambiguous imports for mds." << who << dendl;
3556
3557 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3558 q != p->second.end();
3559 ++q) {
3560 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3561 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3562 CDir *dir = get_force_dirfrag(q->first, recovering);
3563 if (!dir) continue;
3564
3565 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3566 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3567 dout(10) << " mds." << who << " did import " << *dir << dendl;
3568 adjust_bounded_subtree_auth(dir, q->second, who);
3569 try_subtree_merge(dir);
3570 } else {
3571 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3572 }
3573 }
3574 }
3575 other_ambiguous_imports.clear();
3576 }
3577
// Resolve our OWN ambiguous imports (only meaningful on a recovering
// rank): an import whose subtree authority is no longer "me/me" was
// claimed by someone else and must be cancelled; otherwise it is ours and
// is finished.  Each outcome is journaled as an EImportFinish.
void MDCache::disambiguate_my_imports()
{
  dout(10) << "disambiguate_my_imports" << dendl;

  if (!mds->is_resolve()) {
    // survivors never carry ambiguous imports at this point
    ceph_assert(my_ambiguous_imports.empty());
    return;
  }

  disambiguate_other_imports();

  // my ambiguous imports
  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
  while (!my_ambiguous_imports.empty()) {
    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();

    CDir *dir = get_dirfrag(q->first);
    ceph_assert(dir);

    if (dir->authority() != me_ambig) {
      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
      cancel_ambiguous_import(dir);

      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));

      // subtree may have been swallowed by another node claiming dir
      // as their own.
      CDir *root = get_subtree_root(dir);
      if (root != dir)
        dout(10) << " subtree root is " << *root << dendl;
      ceph_assert(root->dir_auth.first != mds->get_nodeid());  // no us!
      try_trim_non_auth_subtree(root);
    } else {
      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
      finish_ambiguous_import(q->first);
      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
    }
  }
  ceph_assert(my_ambiguous_imports.empty());
  mds->mdlog->flush();

  // verify all my subtrees are unambiguous!
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (dir->is_ambiguous_dir_auth()) {
      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
    }
    ceph_assert(!dir->is_ambiguous_dir_auth());
  }

  show_subtrees();
}
3632
3633
3634 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3635 {
3636 ceph_assert(my_ambiguous_imports.count(base) == 0);
3637 my_ambiguous_imports[base] = bounds;
3638 }
3639
3640
3641 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3642 {
3643 // make a list
3644 vector<dirfrag_t> binos;
3645 for (set<CDir*>::iterator p = bounds.begin();
3646 p != bounds.end();
3647 ++p)
3648 binos.push_back((*p)->dirfrag());
3649
3650 // note: this can get called twice if the exporter fails during recovery
3651 if (my_ambiguous_imports.count(base->dirfrag()))
3652 my_ambiguous_imports.erase(base->dirfrag());
3653
3654 add_ambiguous_import(base->dirfrag(), binos);
3655 }
3656
3657 void MDCache::cancel_ambiguous_import(CDir *dir)
3658 {
3659 dirfrag_t df = dir->dirfrag();
3660 ceph_assert(my_ambiguous_imports.count(df));
3661 dout(10) << "cancel_ambiguous_import " << df
3662 << " bounds " << my_ambiguous_imports[df]
3663 << " " << *dir
3664 << dendl;
3665 my_ambiguous_imports.erase(df);
3666 }
3667
3668 void MDCache::finish_ambiguous_import(dirfrag_t df)
3669 {
3670 ceph_assert(my_ambiguous_imports.count(df));
3671 vector<dirfrag_t> bounds;
3672 bounds.swap(my_ambiguous_imports[df]);
3673 my_ambiguous_imports.erase(df);
3674
3675 dout(10) << "finish_ambiguous_import " << df
3676 << " bounds " << bounds
3677 << dendl;
3678 CDir *dir = get_dirfrag(df);
3679 ceph_assert(dir);
3680
3681 // adjust dir_auth, import maps
3682 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3683 try_subtree_merge(dir);
3684 }
3685
// Remove 'in' and everything beneath it from the cache: unlink and delete
// every dentry in each of its dirfrags (recursing into primary-linked
// inodes first), close the dirfrags, then drop the inode itself.
void MDCache::remove_inode_recursive(CInode *in)
{
  dout(10) << "remove_inode_recursive " << *in << dendl;
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    dout(10) << " removing dirfrag " << *subdir << dendl;
    auto it = subdir->items.begin();
    while (it != subdir->items.end()) {
      CDentry *dn = it->second;
      ++it;  // advance before the dentry is removed below
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        // unlink first, then recurse into the child inode
        CInode *tin = dnl->get_inode();
        subdir->unlink_inode(dn, false);
        remove_inode_recursive(tin);
      }
      subdir->remove_dentry(dn);
    }

    if (subdir->is_subtree_root())
      remove_subtree(subdir);
    in->close_dirfrag(subdir->dirfrag().frag);
  }
  remove_inode(in);
}
3711
// Try to expire the (non-auth) inode 'in' and everything beneath it,
// accumulating expirable dentries into 'expiremap'.
// Returns true if expiry must be ABORTED (something under 'in' is still
// needed: a subtree root, a linked remote stray, or an unexpirable
// dentry); false if everything was expired.
bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
{
  ceph_assert(!in->is_auth());

  dout(10) << __func__ << ":" << *in << dendl;

  // Recurse into any dirfrags beneath this inode
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
      return true;
    }

    for (auto it = subdir->items.begin(); it != subdir->items.end();) {
      CDentry *dn = it->second;
      it++;  // advance before dn may be trimmed below
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        CInode *tin = dnl->get_inode();

        /* Remote strays with linkage (i.e. hardlinks) should not be
         * expired, because they may be the target of
         * a rename() as the owning MDS shuts down */
        if (!tin->is_stray() && tin->get_inode()->nlink) {
          dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
          return true;
        }

        const bool abort = expire_recursive(tin, expiremap);
        if (abort) {
          return true;
        }
      }
      if (dn->lru_is_expireable()) {
        trim_dentry(dn, expiremap);
      } else {
        dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
        return true;
      }
    }
  }

  return false;
}
3757
3758 void MDCache::trim_unlinked_inodes()
3759 {
3760 dout(7) << "trim_unlinked_inodes" << dendl;
3761 int count = 0;
3762 vector<CInode*> q;
3763 for (auto &p : inode_map) {
3764 CInode *in = p.second;
3765 if (in->get_parent_dn() == NULL && !in->is_base()) {
3766 dout(7) << " will trim from " << *in << dendl;
3767 q.push_back(in);
3768 }
3769
3770 if (!(++count % mds->heartbeat_reset_grace()))
3771 mds->heartbeat_reset();
3772 }
3773 for (auto& in : q) {
3774 remove_inode_recursive(in);
3775
3776 if (!(++count % mds->heartbeat_reset_grace()))
3777 mds->heartbeat_reset();
3778 }
3779 }
3780
3781 /** recalc_auth_bits()
3782 * once subtree auth is disambiguated, we need to adjust all the
3783 * auth and dirty bits in our cache before moving on.
3784 */
/** recalc_auth_bits()
 * once subtree auth is disambiguated, we need to adjust all the
 * auth and dirty bits in our cache before moving on.
 *
 * Walks every subtree and propagates its (now-settled) authority down to
 * each dirfrag, dentry, and inode within it.  For items we are NOT auth
 * for: mark them REJOINING (unless 'replay'), clear dirty state, and close
 * empty non-auth dirfrags.
 */
void MDCache::recalc_auth_bits(bool replay)
{
  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;

  // root inode's auth comes straight from the mdsmap
  if (root) {
    root->inode_auth.first = mds->mdsmap->get_root();
    bool auth = mds->get_nodeid() == root->inode_auth.first;
    if (auth) {
      root->state_set(CInode::STATE_AUTH);
    } else {
      root->state_clear(CInode::STATE_AUTH);
      if (!replay)
        root->state_set(CInode::STATE_REJOINING);
    }
  }

  // remember which inodes are our own subtree roots; their scatterlocks
  // must not be touched below
  set<CInode*> subtree_inodes;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->dir_auth.first == mds->get_nodeid())
      subtree_inodes.insert(p->first->inode);
  }

  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    // an mdsdir inode is auth iff it is OUR mdsdir
    if (p->first->inode->is_mdsdir()) {
      CInode *in = p->first->inode;
      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
      if (auth) {
        in->state_set(CInode::STATE_AUTH);
      } else {
        in->state_clear(CInode::STATE_AUTH);
        if (!replay)
          in->state_set(CInode::STATE_REJOINING);
      }
    }

    std::queue<CDir*> dfq;  // dirfrag queue
    dfq.push(p->first);

    // the whole subtree shares the subtree root's auth
    bool auth = p->first->authority().first == mds->get_nodeid();
    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;

    while (!dfq.empty()) {
      CDir *dir = dfq.front();
      dfq.pop();

      // dir
      if (auth) {
        dir->state_set(CDir::STATE_AUTH);
      } else {
        dir->state_clear(CDir::STATE_AUTH);
        if (!replay) {
          // close empty non-auth dirfrag
          if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
            dir->inode->close_dirfrag(dir->get_frag());
            continue;
          }
          dir->state_set(CDir::STATE_REJOINING);
          dir->state_clear(CDir::STATE_COMPLETE);
          if (dir->is_dirty())
            dir->mark_clean();
        }
      }

      // dentries in this dir
      for (auto &p : dir->items) {
        // dn
        CDentry *dn = p.second;
        CDentry::linkage_t *dnl = dn->get_linkage();
        if (auth) {
          dn->mark_auth();
        } else {
          dn->clear_auth();
          if (!replay) {
            dn->state_set(CDentry::STATE_REJOINING);
            if (dn->is_dirty())
              dn->mark_clean();
          }
        }

        if (dnl->is_primary()) {
          // inode
          CInode *in = dnl->get_inode();
          if (auth) {
            in->state_set(CInode::STATE_AUTH);
          } else {
            in->state_clear(CInode::STATE_AUTH);
            if (!replay) {
              in->state_set(CInode::STATE_REJOINING);
              if (in->is_dirty())
                in->mark_clean();
              if (in->is_dirty_parent())
                in->clear_dirty_parent();
              // avoid touching scatterlocks for our subtree roots!
              if (subtree_inodes.count(in) == 0)
                in->clear_scatter_dirty();
            }
          }
          // recurse?
          if (in->is_dir()) {
            auto&& dfv = in->get_nested_dirfrags();
            for (const auto& dir : dfv) {
              dfq.push(dir);
            }
          }
        }
      }
    }
  }

  show_subtrees();
  show_cache();
}
3901
3902
3903
3904 // ===========================================================================
3905 // REJOIN
3906
3907 /*
3908 * notes on scatterlock recovery:
3909 *
3910 * - recovering inode replica sends scatterlock data for any subtree
3911 * roots (the only ones that are possibly dirty).
3912 *
3913 * - surviving auth incorporates any provided scatterlock data. any
3914 * pending gathers are then finished, as with the other lock types.
3915 *
3916 * that takes care of surviving auth + (recovering replica)*.
3917 *
3918 * - surviving replica sends strong_inode, which includes current
3919 * scatterlock state, AND any dirty scatterlock data. this
3920 * provides the recovering auth with everything it might need.
3921 *
3922 * - recovering auth must pick initial scatterlock state based on
3923 * (weak|strong) rejoins.
3924 * - always assimilate scatterlock data (it can't hurt)
3925 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3926 * - include base inode in ack for all inodes that saw scatterlock content
3927 *
3928 * also, for scatter gather,
3929 *
3930 * - auth increments {frag,r}stat.version on completion of any gather.
3931 *
3932 * - auth incorporates changes in a gather _only_ if the version
3933 * matches.
3934 *
3935 * - replica discards changes any time the scatterlock syncs, and
3936 * after recovery.
3937 */
3938
// Dump current rejoin progress into the given Formatter: the set of ranks
// we still expect rejoins from (rejoin_gather), the set of ranks whose acks
// are still outstanding (rejoin_ack_gather), and the number of cap inodes
// still being opened before rejoins can be sent.
void MDCache::dump_rejoin_status(Formatter *f) const
{
  f->open_object_section("rejoin_status");
  f->dump_stream("rejoin_gather") << rejoin_gather;
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
3947
// Begin the rejoin phase.  Takes ownership of the completion callback
// (rejoin_done_) and initializes rejoin_gather with every rank in the
// recovery set.  Our own rank is inserted too: imported cap inodes must
// finish opening (kicked off by process_imported_caps()) before our cache
// rejoins can be sent, and our rank is removed from rejoin_gather when
// that completes.
void MDCache::rejoin_start(MDSContext *rejoin_done_)
{
  dout(10) << "rejoin_start" << dendl;
  ceph_assert(!rejoin_done);  // must not already be mid-rejoin
  rejoin_done.reset(rejoin_done_);

  rejoin_gather = recovery_set;
  // need finish opening cap inodes before sending cache rejoins
  rejoin_gather.insert(mds->get_nodeid());
  process_imported_caps();
}
3959
3960 /*
3961 * rejoin phase!
3962 *
3963 * this initiates rejoin. it should be called before we get any
3964 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3965 *
3966 * we start out by sending rejoins to everyone in the recovery set.
3967 *
3968 * if we are rejoin, send for all regions in our cache.
3969 * if we are active|stopping, send only to nodes that are rejoining.
3970 */
3971 void MDCache::rejoin_send_rejoins()
3972 {
3973 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3974
3975 if (rejoin_gather.count(mds->get_nodeid())) {
3976 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3977 rejoins_pending = true;
3978 return;
3979 }
3980 if (!resolve_gather.empty()) {
3981 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3982 << resolve_gather << ")" << dendl;
3983 rejoins_pending = true;
3984 return;
3985 }
3986
3987 ceph_assert(!migrator->is_importing());
3988 ceph_assert(!migrator->is_exporting());
3989
3990 if (!mds->is_rejoin()) {
3991 disambiguate_other_imports();
3992 }
3993
3994 map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
3995
3996
3997 // if i am rejoining, send a rejoin to everyone.
3998 // otherwise, just send to others who are rejoining.
3999 for (const auto& rank : recovery_set) {
4000 if (rank == mds->get_nodeid()) continue; // nothing to myself!
4001 if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node!
4002 if (mds->is_rejoin())
4003 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
4004 else if (mds->mdsmap->is_rejoin(rank))
4005 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
4006 }
4007
4008 if (mds->is_rejoin()) {
4009 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4010 for (auto& p : cap_exports) {
4011 mds_rank_t target = p.second.first;
4012 if (rejoins.count(target) == 0)
4013 continue;
4014 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4015 Session *session = nullptr;
4016 auto it = client_exports.find(q->first);
4017 if (it != client_exports.end()) {
4018 session = it->second.first;
4019 if (session)
4020 it->second.second.insert(target);
4021 } else {
4022 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4023 auto& r = client_exports[q->first];
4024 r.first = session;
4025 if (session)
4026 r.second.insert(target);
4027 }
4028 if (session) {
4029 ++q;
4030 } else {
4031 // remove reconnect with no session
4032 p.second.second.erase(q++);
4033 }
4034 }
4035 rejoins[target]->cap_exports[p.first] = p.second.second;
4036 }
4037 for (auto& p : client_exports) {
4038 Session *session = p.second.first;
4039 for (auto& q : p.second.second) {
4040 auto rejoin = rejoins[q];
4041 rejoin->client_map[p.first] = session->info.inst;
4042 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4043 }
4044 }
4045 }
4046
4047
4048 // check all subtrees
4049 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4050 p != subtrees.end();
4051 ++p) {
4052 CDir *dir = p->first;
4053 ceph_assert(dir->is_subtree_root());
4054 if (dir->is_ambiguous_dir_auth()) {
4055 // exporter is recovering, importer is survivor.
4056 ceph_assert(rejoins.count(dir->authority().first));
4057 ceph_assert(!rejoins.count(dir->authority().second));
4058 continue;
4059 }
4060
4061 // my subtree?
4062 if (dir->is_auth())
4063 continue; // skip my own regions!
4064
4065 mds_rank_t auth = dir->get_dir_auth().first;
4066 ceph_assert(auth >= 0);
4067 if (rejoins.count(auth) == 0)
4068 continue; // don't care about this node's subtrees
4069
4070 rejoin_walk(dir, rejoins[auth]);
4071 }
4072
4073 // rejoin root inodes, too
4074 for (auto &p : rejoins) {
4075 if (mds->is_rejoin()) {
4076 // weak
4077 if (p.first == 0 && root) {
4078 p.second->add_weak_inode(root->vino());
4079 if (root->is_dirty_scattered()) {
4080 dout(10) << " sending scatterlock state on root " << *root << dendl;
4081 p.second->add_scatterlock_state(root);
4082 }
4083 }
4084 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4085 if (in)
4086 p.second->add_weak_inode(in->vino());
4087 }
4088 } else {
4089 // strong
4090 if (p.first == 0 && root) {
4091 p.second->add_strong_inode(root->vino(),
4092 root->get_replica_nonce(),
4093 root->get_caps_wanted(),
4094 root->filelock.get_state(),
4095 root->nestlock.get_state(),
4096 root->dirfragtreelock.get_state());
4097 root->state_set(CInode::STATE_REJOINING);
4098 if (root->is_dirty_scattered()) {
4099 dout(10) << " sending scatterlock state on root " << *root << dendl;
4100 p.second->add_scatterlock_state(root);
4101 }
4102 }
4103
4104 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4105 p.second->add_strong_inode(in->vino(),
4106 in->get_replica_nonce(),
4107 in->get_caps_wanted(),
4108 in->filelock.get_state(),
4109 in->nestlock.get_state(),
4110 in->dirfragtreelock.get_state());
4111 in->state_set(CInode::STATE_REJOINING);
4112 }
4113 }
4114 }
4115
4116 if (!mds->is_rejoin()) {
4117 // i am survivor. send strong rejoin.
4118 // note request remote_auth_pins, xlocks
4119 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4120 p != active_requests.end();
4121 ++p) {
4122 MDRequestRef& mdr = p->second;
4123 if (mdr->is_peer())
4124 continue;
4125 // auth pins
4126 for (const auto& q : mdr->object_states) {
4127 if (q.second.remote_auth_pinned == MDS_RANK_NONE)
4128 continue;
4129 if (!q.first->is_auth()) {
4130 mds_rank_t target = q.second.remote_auth_pinned;
4131 ceph_assert(target == q.first->authority().first);
4132 if (rejoins.count(target) == 0) continue;
4133 const auto& rejoin = rejoins[target];
4134
4135 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
4136 MDSCacheObjectInfo i;
4137 q.first->set_object_info(i);
4138 if (i.ino)
4139 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4140 else
4141 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4142
4143 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4144 mdr->more()->rename_inode == q.first)
4145 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4146 mdr->reqid, mdr->attempt);
4147 }
4148 }
4149 // xlocks
4150 for (const auto& q : mdr->locks) {
4151 auto lock = q.lock;
4152 auto obj = lock->get_parent();
4153 if (q.is_xlock() && !obj->is_auth()) {
4154 mds_rank_t who = obj->authority().first;
4155 if (rejoins.count(who) == 0) continue;
4156 const auto& rejoin = rejoins[who];
4157
4158 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
4159 MDSCacheObjectInfo i;
4160 obj->set_object_info(i);
4161 if (i.ino)
4162 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4163 mdr->reqid, mdr->attempt);
4164 else
4165 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4166 mdr->reqid, mdr->attempt);
4167 } else if (q.is_remote_wrlock()) {
4168 mds_rank_t who = q.wrlock_target;
4169 if (rejoins.count(who) == 0) continue;
4170 const auto& rejoin = rejoins[who];
4171
4172 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4173 MDSCacheObjectInfo i;
4174 obj->set_object_info(i);
4175 ceph_assert(i.ino);
4176 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4177 mdr->reqid, mdr->attempt);
4178 }
4179 }
4180 }
4181 }
4182
4183 // send the messages
4184 for (auto &p : rejoins) {
4185 ceph_assert(rejoin_sent.count(p.first) == 0);
4186 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4187 rejoin_sent.insert(p.first);
4188 rejoin_ack_gather.insert(p.first);
4189 mds->send_message_mds(p.second, p.first);
4190 }
4191 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4192 rejoins_pending = false;
4193
4194 // nothing?
4195 if (mds->is_rejoin() && rejoin_gather.empty()) {
4196 dout(10) << "nothing to rejoin" << dendl;
4197 rejoin_gather_finish();
4198 }
4199 }
4200
4201
4202 /**
4203 * rejoin_walk - build rejoin declarations for a subtree
4204 *
4205 * @param dir subtree root
4206 * @param rejoin rejoin message
4207 *
4208 * from a rejoining node:
4209 * weak dirfrag
4210 * weak dentries (w/ connectivity)
4211 *
4212 * from a surviving node:
4213 * strong dirfrag
4214 * strong dentries (no connectivity!)
4215 * strong inodes
4216 */
void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
{
  dout(10) << "rejoin_walk " << *dir << dendl;

  std::vector<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    // WEAK
    // Recovering node: declare the dirfrag and its primary dentries weakly.
    // Weak dentries carry connectivity (parent ino + name) so the auth can
    // reconstruct linkage.
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->last == CEPH_NOSNAP);
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      ceph_assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      // only directory inodes are expected on the weak walk path
      ceph_assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
      {
	// queue child dirfrags for the recursive pass at the bottom
	auto&& dirs = in->get_nested_dirfrags();
	nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
      }
      if (in->is_dirty_scattered()) {
	dout(10) << " sending scatterlock state on " << *in << dendl;
	rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    // STRONG
    // Surviving node: declare dirfrag/dentries/inodes with nonces and lock
    // states (no connectivity needed; the recovering auth already has it).
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    // iterator is advanced before possible dentry removal below, so erasure
    // of `dn` from dir->items is safe
    for (auto it = dir->items.begin(); it != dir->items.end(); ) {
      CDentry *dn = it->second;
      ++it;
      dn->state_set(CDentry::STATE_REJOINING);
      CDentry::linkage_t *dnl = dn->get_linkage();
      CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

      // trim snap dentries. because they may have been pruned by
      // their auth mds (snap deleted)
      if (dn->last != CEPH_NOSNAP) {
	if (in && !in->remote_parents.empty()) {
	  // unlink any stale remote snap dentry.
	  // (advance it2 before unlink_remote mutates remote_parents)
	  for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
	    CDentry *remote_dn = *it2;
	    ++it2;
	    ceph_assert(remote_dn->last != CEPH_NOSNAP);
	    remote_dn->unlink_remote(remote_dn->get_linkage());
	  }
	}
	if (dn->lru_is_expireable()) {
	  // drop the snap dentry (and its inode) entirely rather than
	  // declaring possibly-stale state to the recovering auth
	  if (!dnl->is_null())
	    dir->unlink_inode(dn, false);
	  if (in)
	    remove_inode(in);
	  dir->remove_dentry(dn);
	  continue;
	} else {
	  // Inventing null/remote dentry shouldn't cause problem
	  ceph_assert(!dnl->is_primary());
	}
      }

      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
				dn->first, dn->last,
				dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_d_type():0,
				dn->get_replica_nonce(),
				dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(15) << " add_strong_inode " << *in << dendl;
	rejoin->add_strong_inode(in->vino(),
				 in->get_replica_nonce(),
				 in->get_caps_wanted(),
				 in->filelock.get_state(),
				 in->nestlock.get_state(),
				 in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
	{
	  // queue child dirfrags for the recursive pass at the bottom
	  auto&& dirs = in->get_nested_dirfrags();
	  nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
	}
	if (in->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on " << *in << dendl;
	  rejoin->add_scatterlock_state(in);
	}
      }
    }
  }

  // recurse into nested dirs
  for (const auto& dir : nested) {
    rejoin_walk(dir, rejoin);
  }
}
4318
4319
4320 /*
4321 * i got a rejoin.
4322 * - reply with the lockstate
4323 *
4324 * if i am active|stopping,
4325 * - remove source from replica list for everything not referenced here.
4326 */
4327 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4328 {
4329 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4330 << " (" << m->get_payload().length() << " bytes)"
4331 << dendl;
4332
4333 switch (m->op) {
4334 case MMDSCacheRejoin::OP_WEAK:
4335 handle_cache_rejoin_weak(m);
4336 break;
4337 case MMDSCacheRejoin::OP_STRONG:
4338 handle_cache_rejoin_strong(m);
4339 break;
4340 case MMDSCacheRejoin::OP_ACK:
4341 handle_cache_rejoin_ack(m);
4342 break;
4343
4344 default:
4345 ceph_abort();
4346 }
4347 }
4348
4349
4350 /*
4351 * handle_cache_rejoin_weak
4352 *
4353 * the sender
4354 * - is recovering from their journal.
4355 * - may have incorrect (out of date) inode contents
4356 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4357 *
4358 * if the sender didn't trim_non_auth(), they
4359 * - may have incorrect (out of date) dentry/inode linkage
4360 * - may have deleted/purged inodes
4361 * and i may have to go to disk to get accurate inode contents. yuck.
4362 */
void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
{
  mds_rank_t from = mds_rank_t(weak->get_source().num());

  // possible response(s)
  ref_t<MMDSCacheRejoin> ack;      // if survivor
  set<vinodeno_t> acked_inodes;    // if survivor
  set<SimpleLock *> gather_locks;  // if survivor
  bool survivor = false;  // am i a survivor?

  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    // Survivor path: we can process the weak rejoin and ack immediately.
    survivor = true;
    dout(10) << "i am a surivivor, and will ack immediately" << dendl;
    ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);

    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;

    // check cap exports
    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
	Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
	Capability::Import& im = imported_caps[p->first][q->first];
	if (cap) {
	  im.cap_id = cap->get_cap_id();
	  im.issue_seq = cap->get_last_seq();
	  im.mseq = cap->get_mseq();
	} else {
	  // all are zero
	}
      }
      // re-evaluate cap-related locks now that caps moved to us
      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
    }

    encode(imported_caps, ack->imported_caps);
  } else {
    // Recovering path: just record everything; processing and the ack are
    // deferred until rejoin_gather_finish().
    ceph_assert(mds->is_rejoin());

    // we may have already received a strong rejoin from the sender.
    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
    ceph_assert(gather_locks.empty());

    // check cap exports.
    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
    rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
				      weak->client_metadata_map.end());

    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      // note
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
	cap_imports[p->first][q->first][from] = q->second;
      }
    }
  }

  // assimilate any potentially dirty scatterlock state
  for (const auto &p : weak->inode_scatterlocks) {
    CInode *in = get_inode(p.first);
    ceph_assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
    if (!survivor)
      rejoin_potential_updated_scatterlocks.insert(in);
  }

  // recovering peer may send incorrect dirfrags here.  we need to
  // infer which dirfrag they meant.  the ack will include a
  // strong_dirfrag that will set them straight on the fragmentation.

  // walk weak map
  set<CDir*> dirs_to_share;
  for (const auto &p : weak->weak_dirfrags) {
    CInode *diri = get_inode(p.ino);
    if (!diri)
      dout(0) << " missing dir ino " << p.ino << dendl;
    ceph_assert(diri);

    // map the sender's (possibly stale) frag onto our current leaf frags
    frag_vec_t leaves;
    if (diri->dirfragtree.is_leaf(p.frag)) {
      leaves.push_back(p.frag);
    } else {
      diri->dirfragtree.get_leaves_under(p.frag, leaves);
      if (leaves.empty())
	leaves.push_back(diri->dirfragtree[p.frag.value()]);
    }
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
	continue;
      }
      ceph_assert(dir);
      if (dirs_to_share.count(dir)) {
	dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
      } else {
	dirs_to_share.insert(dir);
	unsigned nonce = dir->add_replica(from);
	dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
	if (ack) {
	  ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
	  ack->add_dirfrag_base(dir);
	}
      }
    }
  }

  for (const auto &p : weak->weak) {
    CInode *diri = get_inode(p.first);
    if (!diri)
      dout(0) << " missing dir ino " << p.first << dendl;
    ceph_assert(diri);

    // weak dentries
    CDir *dir = 0;
    for (const auto &q : p.second) {
      // locate proper dirfrag.
      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
      frag_t fg = diri->pick_dirfrag(q.first.name);
      if (!dir || dir->get_frag() != fg) {
	dir = diri->get_dirfrag(fg);
	if (!dir)
	  dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
	ceph_assert(dir);
	ceph_assert(dirs_to_share.count(dir));
      }

      // and dentry
      CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
      ceph_assert(dn);
      CDentry::linkage_t *dnl = dn->get_linkage();
      ceph_assert(dnl->is_primary());

      // survivors reset the sender's replica record (fresh nonce) and may
      // need to re-evaluate locks whose gather set changed
      if (survivor && dn->is_replica(from))
	dentry_remove_replica(dn, from, gather_locks);
      unsigned dnonce = dn->add_replica(from);
      dout(10) << " have " << *dn << dendl;
      if (ack)
	ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
			       dn->first, dn->last,
			       dnl->get_inode()->ino(), inodeno_t(0), 0,
			       dnonce, dn->lock.get_replica_state());

      // inode
      CInode *in = dnl->get_inode();
      ceph_assert(in);

      if (survivor && in->is_replica(from))
	inode_remove_replica(in, from, true, gather_locks);
      unsigned inonce = in->add_replica(from);
      dout(10) << " have " << *in << dendl;

      // scatter the dirlock, just in case?
      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
	in->filelock.set_state(LOCK_MIX);

      if (ack) {
	acked_inodes.insert(in->vino());
	ack->add_inode_base(in, mds->mdsmap->get_up_features());
	bufferlist bl;
	in->_encode_locks_state_for_rejoin(bl, from);
	ack->add_inode_locks(in, inonce, bl);
      }
    }
  }

  // weak base inodes?  (root, stray, etc.)
  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
       p != weak->weak_inodes.end();
       ++p) {
    CInode *in = get_inode(*p);
    ceph_assert(in);   // hmm fixme wrt stray?
    if (survivor && in->is_replica(from))
      inode_remove_replica(in, from, true, gather_locks);
    unsigned inonce = in->add_replica(from);
    dout(10) << " have base " << *in << dendl;

    if (ack) {
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
      bufferlist bl;
      in->_encode_locks_state_for_rejoin(bl, from);
      ack->add_inode_locks(in, inonce, bl);
    }
  }

  ceph_assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (survivor) {
    // survivor.  do everything now.
    for (const auto &p : weak->inode_scatterlocks) {
      CInode *in = get_inode(p.first);
      ceph_assert(in);
      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
    }

    // drop the sender's replica records on anything the weak rejoin did not
    // mention, then reply
    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
    mds->send_message(ack, weak->get_connection());

    // locks whose gather sets shrank may now be able to make progress
    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
      if (!(*p)->is_stable())
	mds->locker->eval_gather(*p);
    }
  } else {
    // done?
    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
      rejoin_gather_finish();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
    }
  }
}
4582
4583 /*
4584 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4585 *
4586 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4587 * ack, the replica dne, and we can remove it from our replica maps.
4588 */
// Walk every cached inode (and its dirfrags/dentries) and drop `from` from
// the replica list of any auth object that the peer's rejoin did not
// mention.  `ack` is the reply being built (NULL when called from the
// recovering path, in which case only inodes listed in acked_inodes are
// preserved); locks whose gather sets changed are collected in gather_locks
// for the caller to re-evaluate.
void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
					     set<vinodeno_t>& acked_inodes,
					     set<SimpleLock *>& gather_locks)
{
  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;

  auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
    // inode?
    // (scour only auth objects; non-auth replicas are handled by their auth)
    if (in->is_auth() &&
	in->is_replica(from) &&
	(ack == NULL || acked_inodes.count(in->vino()) == 0)) {
      inode_remove_replica(in, from, false, gather_locks);
      dout(10) << " rem " << *in << dendl;
    }

    if (!in->is_dir())
      return;

    const auto&& dfs = in->get_dirfrags();
    for (const auto& dir : dfs) {
      if (!dir->is_auth())
	continue;

      // dirfrag not named in the ack's strong_dirfrags -> peer lost it
      if (dir->is_replica(from) &&
	  (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
	dir->remove_replica(from);
	dout(10) << " rem " << *dir << dendl;
      }

      // dentries
      for (auto &p : dir->items) {
	CDentry *dn = p.second;

	if (dn->is_replica(from)) {
	  // keep the replica record only if the ack names this exact
	  // (name, snap) dentry in this dirfrag
	  if (ack) {
	    const auto it = ack->strong_dentries.find(dir->dirfrag());
	    if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
	      continue;
	    }
	  }
	  dentry_remove_replica(dn, from, gather_locks);
	  dout(10) << " rem " << *dn << dendl;
	}
      }
    }
  };

  // apply to both the live and snapshot inode maps
  for (auto &p : inode_map)
    scour_func(p.second);
  for (auto &p : snap_inode_map)
    scour_func(p.second);
}
4641
4642
4643 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4644 {
4645 CInode *in = new CInode(this, true, 2, last);
4646 in->_get_inode()->ino = ino;
4647 in->state_set(CInode::STATE_REJOINUNDEF);
4648 add_inode(in);
4649 rejoin_undef_inodes.insert(in);
4650 dout(10) << " invented " << *in << dendl;
4651 return in;
4652 }
4653
4654 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4655 {
4656 CInode *in = get_inode(df.ino);
4657 if (!in)
4658 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4659 if (!in->is_dir()) {
4660 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4661 in->_get_inode()->mode = S_IFDIR;
4662 in->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4663 }
4664 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4665 dir->state_set(CDir::STATE_REJOINUNDEF);
4666 rejoin_undef_dirfrags.insert(dir);
4667 dout(10) << " invented " << *dir << dendl;
4668 return dir;
4669 }
4670
4671 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4672 {
4673 mds_rank_t from = mds_rank_t(strong->get_source().num());
4674
4675 // only a recovering node will get a strong rejoin.
4676 if (!mds->is_rejoin()) {
4677 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4678 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4679 return;
4680 }
4681 ceph_abort_msg("got unexpected rejoin message during recovery");
4682 }
4683
4684 // assimilate any potentially dirty scatterlock state
4685 for (const auto &p : strong->inode_scatterlocks) {
4686 CInode *in = get_inode(p.first);
4687 ceph_assert(in);
4688 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4689 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4690 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4691 rejoin_potential_updated_scatterlocks.insert(in);
4692 }
4693
4694 rejoin_unlinked_inodes[from].clear();
4695
4696 // surviving peer may send incorrect dirfrag here (maybe they didn't
4697 // get the fragment notify, or maybe we rolled back?). we need to
4698 // infer the right frag and get them with the program. somehow.
4699 // we don't normally send ACK.. so we'll need to bundle this with
4700 // MISSING or something.
4701
4702 // strong dirfrags/dentries.
4703 // also process auth_pins, xlocks.
4704 for (const auto &p : strong->strong_dirfrags) {
4705 auto& dirfrag = p.first;
4706 CInode *diri = get_inode(dirfrag.ino);
4707 if (!diri)
4708 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4709 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4710 bool refragged = false;
4711 if (dir) {
4712 dout(10) << " have " << *dir << dendl;
4713 } else {
4714 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4715 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4716 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4717 dir = rejoin_invent_dirfrag(dirfrag);
4718 }
4719 if (dir) {
4720 dir->add_replica(from, p.second.nonce);
4721 dir->dir_rep = p.second.dir_rep;
4722 } else {
4723 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4724 frag_vec_t leaves;
4725 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4726 if (leaves.empty())
4727 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4728 dout(10) << " maps to frag(s) " << leaves << dendl;
4729 for (const auto& leaf : leaves) {
4730 CDir *dir = diri->get_dirfrag(leaf);
4731 if (!dir)
4732 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4733 else
4734 dout(10) << " have(approx) " << *dir << dendl;
4735 dir->add_replica(from, p.second.nonce);
4736 dir->dir_rep = p.second.dir_rep;
4737 }
4738 refragged = true;
4739 }
4740
4741 const auto it = strong->strong_dentries.find(dirfrag);
4742 if (it != strong->strong_dentries.end()) {
4743 const auto& dmap = it->second;
4744 for (const auto &q : dmap) {
4745 const string_snap_t& ss = q.first;
4746 const MMDSCacheRejoin::dn_strong& d = q.second;
4747 CDentry *dn;
4748 if (!refragged)
4749 dn = dir->lookup(ss.name, ss.snapid);
4750 else {
4751 frag_t fg = diri->pick_dirfrag(ss.name);
4752 dir = diri->get_dirfrag(fg);
4753 ceph_assert(dir);
4754 dn = dir->lookup(ss.name, ss.snapid);
4755 }
4756 if (!dn) {
4757 if (d.is_remote()) {
4758 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4759 } else if (d.is_null()) {
4760 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4761 } else {
4762 CInode *in = get_inode(d.ino, ss.snapid);
4763 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4764 dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4765 }
4766 dout(10) << " invented " << *dn << dendl;
4767 }
4768 CDentry::linkage_t *dnl = dn->get_linkage();
4769
4770 // dn auth_pin?
4771 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4772 if (pinned_it != strong->authpinned_dentries.end()) {
4773 const auto peer_reqid_it = pinned_it->second.find(ss);
4774 if (peer_reqid_it != pinned_it->second.end()) {
4775 for (const auto &r : peer_reqid_it->second) {
4776 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4777
4778 // get/create peer mdrequest
4779 MDRequestRef mdr;
4780 if (have_request(r.reqid))
4781 mdr = request_get(r.reqid);
4782 else
4783 mdr = request_start_peer(r.reqid, r.attempt, strong);
4784 mdr->auth_pin(dn);
4785 }
4786 }
4787 }
4788
4789 // dn xlock?
4790 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4791 if (xlocked_it != strong->xlocked_dentries.end()) {
4792 const auto ss_req_it = xlocked_it->second.find(ss);
4793 if (ss_req_it != xlocked_it->second.end()) {
4794 const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second;
4795 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4796 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4797 ceph_assert(mdr->is_auth_pinned(dn));
4798 if (!mdr->is_xlocked(&dn->versionlock)) {
4799 ceph_assert(dn->versionlock.can_xlock_local());
4800 dn->versionlock.get_xlock(mdr, mdr->get_client());
4801 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4802 }
4803 if (dn->lock.is_stable())
4804 dn->auth_pin(&dn->lock);
4805 dn->lock.set_state(LOCK_XLOCK);
4806 dn->lock.get_xlock(mdr, mdr->get_client());
4807 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4808 }
4809 }
4810
4811 dn->add_replica(from, d.nonce);
4812 dout(10) << " have " << *dn << dendl;
4813
4814 if (dnl->is_primary()) {
4815 if (d.is_primary()) {
4816 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4817 // the survivor missed MDentryUnlink+MDentryLink messages ?
4818 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4819 CInode *in = get_inode(d.ino, ss.snapid);
4820 ceph_assert(in);
4821 ceph_assert(in->get_parent_dn());
4822 rejoin_unlinked_inodes[from].insert(in);
4823 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4824 }
4825 } else {
4826 // the survivor missed MDentryLink message ?
4827 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4828 dout(7) << " sender doesn't have primay dentry" << dendl;
4829 }
4830 } else {
4831 if (d.is_primary()) {
4832 // the survivor missed MDentryUnlink message ?
4833 CInode *in = get_inode(d.ino, ss.snapid);
4834 ceph_assert(in);
4835 ceph_assert(in->get_parent_dn());
4836 rejoin_unlinked_inodes[from].insert(in);
4837 dout(7) << " sender has primary dentry but we don't" << dendl;
4838 }
4839 }
4840 }
4841 }
4842 }
4843
4844 for (const auto &p : strong->strong_inodes) {
4845 CInode *in = get_inode(p.first);
4846 ceph_assert(in);
4847 in->add_replica(from, p.second.nonce);
4848 dout(10) << " have " << *in << dendl;
4849
4850 const MMDSCacheRejoin::inode_strong& is = p.second;
4851
4852 // caps_wanted
4853 if (is.caps_wanted) {
4854 in->set_mds_caps_wanted(from, is.caps_wanted);
4855 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4856 << " on " << *in << dendl;
4857 }
4858
4859 // scatterlocks?
4860 // infer state from replica state:
4861 // * go to MIX if they might have wrlocks
4862 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4863 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4864 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4865 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4866
4867 // auth pin?
4868 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4869 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4870 for (const auto& r : authpinned_inodes_it->second) {
4871 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4872
4873 // get/create peer mdrequest
4874 MDRequestRef mdr;
4875 if (have_request(r.reqid))
4876 mdr = request_get(r.reqid);
4877 else
4878 mdr = request_start_peer(r.reqid, r.attempt, strong);
4879 if (strong->frozen_authpin_inodes.count(in->vino())) {
4880 ceph_assert(!in->get_num_auth_pins());
4881 mdr->freeze_auth_pin(in);
4882 } else {
4883 ceph_assert(!in->is_frozen_auth_pin());
4884 }
4885 mdr->auth_pin(in);
4886 }
4887 }
4888 // xlock(s)?
4889 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4890 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4891 for (const auto &q : xlocked_inodes_it->second) {
4892 SimpleLock *lock = in->get_lock(q.first);
4893 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4894 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4895 ceph_assert(mdr->is_auth_pinned(in));
4896 if (!mdr->is_xlocked(&in->versionlock)) {
4897 ceph_assert(in->versionlock.can_xlock_local());
4898 in->versionlock.get_xlock(mdr, mdr->get_client());
4899 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4900 }
4901 if (lock->is_stable())
4902 in->auth_pin(lock);
4903 lock->set_state(LOCK_XLOCK);
4904 if (lock == &in->filelock)
4905 in->loner_cap = -1;
4906 lock->get_xlock(mdr, mdr->get_client());
4907 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4908 }
4909 }
4910 }
4911 // wrlock(s)?
4912 for (const auto &p : strong->wrlocked_inodes) {
4913 CInode *in = get_inode(p.first);
4914 for (const auto &q : p.second) {
4915 SimpleLock *lock = in->get_lock(q.first);
4916 for (const auto &r : q.second) {
4917 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4918 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4919 if (in->is_auth())
4920 ceph_assert(mdr->is_auth_pinned(in));
4921 lock->set_state(LOCK_MIX);
4922 if (lock == &in->filelock)
4923 in->loner_cap = -1;
4924 lock->get_wrlock(true);
4925 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4926 }
4927 }
4928 }
4929
4930 // done?
4931 ceph_assert(rejoin_gather.count(from));
4932 rejoin_gather.erase(from);
4933 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4934 rejoin_gather_finish();
4935 } else {
4936 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4937 }
4938 }
4939
/**
 * Handle an MMDSCacheRejoin ACK from the auth MDS `from`.
 *
 * The ack describes, from the sender's (authoritative) point of view, the
 * dirfrags/dentries/inodes we replicate from it: replica nonces, lock states,
 * full inode/dirfrag bases, and the caps it imported from us.  We reconcile
 * our replica cache with that view, clear the *_REJOINING flags, and bounce
 * any caps the sender took over to the affected clients.
 *
 * Runs on both recovering ranks and survivors; `survivor` selects the
 * appropriate lock-state handling and completion path.
 */
void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
{
  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
  // a survivor is any rank not itself in the rejoin state
  bool survivor = !mds->is_rejoin();

  // for sending cache expire message
  set<CInode*> isolated_inodes;   // barebones inodes not yet linked anywhere
  set<CInode*> refragged_inodes;  // inodes whose dirfrags we force-matched
  list<pair<CInode*,int> > updated_realms;  // (inode, CEPH_SNAP_OP_*) to notify

  // dirs
  for (const auto &p : ack->strong_dirfrags) {
    // we may have had incorrect dir fragmentation; refragment based
    // on what the auth tells us.
    CDir *dir = get_dirfrag(p.first);
    if (!dir) {
      // force our fragtree to produce a frag matching the sender's
      dir = get_force_dirfrag(p.first, false);
      if (dir)
        refragged_inodes.insert(dir->get_inode());
    }
    if (!dir) {
      CInode *diri = get_inode(p.first.ino);
      if (!diri) {
        // barebones inode; the full inode loop below will clean up.
        diri = new CInode(this, false);
        auto _inode = diri->_get_inode();
        _inode->ino = p.first.ino;
        _inode->mode = S_IFDIR;
        _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;

        add_inode(diri);
        if (MDS_INO_MDSDIR(from) == p.first.ino) {
          // the sender's own mdsdir: we know its auth immediately
          diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
          dout(10) << " add inode " << *diri << dendl;
        } else {
          // track it; the dentry loop below should re-link it, and we
          // assert further down that none remain isolated
          diri->inode_auth = CDIR_AUTH_DEFAULT;
          isolated_inodes.insert(diri);
          dout(10) << " unconnected dirfrag " << p.first << dendl;
        }
      }
      // barebones dirfrag; the full dirfrag loop below will clean up.
      dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
      if (MDS_INO_MDSDIR(from) == p.first.ino ||
          (dir->authority() != CDIR_AUTH_UNDEF &&
           dir->authority().first != from))
        adjust_subtree_auth(dir, from);
      dout(10) << " add dirfrag " << *dir << dendl;
    }

    dir->set_replica_nonce(p.second.nonce);
    dir->state_clear(CDir::STATE_REJOINING);
    dout(10) << " got " << *dir << dendl;

    // dentries
    auto it = ack->strong_dentries.find(p.first);
    if (it != ack->strong_dentries.end()) {
      for (const auto &q : it->second) {
        CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
        if(!dn)
          dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);

        CDentry::linkage_t *dnl = dn->get_linkage();

        ceph_assert(dn->last == q.first.snapid);
        if (dn->first != q.second.first) {
          dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
          dn->first = q.second.first;
        }

        // may have bad linkage if we missed dentry link/unlink messages
        if (dnl->is_primary()) {
          CInode *in = dnl->get_inode();
          if (!q.second.is_primary() ||
              vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
            dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
            dir->unlink_inode(dn);
          }
        } else if (dnl->is_remote()) {
          if (!q.second.is_remote() ||
              q.second.remote_ino != dnl->get_remote_ino() ||
              q.second.remote_d_type != dnl->get_remote_d_type()) {
            dout(10) << " had bad linkage for " << *dn << dendl;
            dir->unlink_inode(dn);
          }
        } else {
          if (!q.second.is_null())
            dout(10) << " had bad linkage for " << *dn << dendl;
        }

        // hmm, did we have the proper linkage here?
        // (re-link the dentry the way the auth says it should be linked)
        if (dnl->is_null() && !q.second.is_null()) {
          if (q.second.is_remote()) {
            dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
          } else {
            CInode *in = get_inode(q.second.ino, q.first.snapid);
            if (!in) {
              // barebones inode; assume it's dir, the full inode loop below will clean up.
              in = new CInode(this, false, q.second.first, q.first.snapid);
              auto _inode = in->_get_inode();
              _inode->ino = q.second.ino;
              _inode->mode = S_IFDIR;
              _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
              add_inode(in);
              dout(10) << " add inode " << *in << dendl;
            } else if (in->get_parent_dn()) {
              // inode is linked somewhere else in our cache; detach it first
              dout(10) << " had bad linkage for " << *(in->get_parent_dn())
                       << ", unlinking " << *in << dendl;
              in->get_parent_dir()->unlink_inode(in->get_parent_dn());
            }
            dn->dir->link_primary_inode(dn, in);
            // it has a parent now, so it's no longer isolated
            isolated_inodes.erase(in);
          }
        }

        dn->set_replica_nonce(q.second.nonce);
        dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
        dn->state_clear(CDentry::STATE_REJOINING);
        dout(10) << " got " << *dn << dendl;
      }
    }
  }

  // close stray frags (from our old fragmentation) that the auth didn't
  // mention and that we don't own ourselves
  for (const auto& in : refragged_inodes) {
    auto&& ls = in->get_nested_dirfrags();
    for (const auto& dir : ls) {
      if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
        continue;
      ceph_assert(dir->get_num_any() == 0);
      in->close_dirfrag(dir->get_frag());
    }
  }

  // full dirfrags
  for (const auto &p : ack->dirfrag_bases) {
    CDir *dir = get_dirfrag(p.first);
    ceph_assert(dir);
    auto q = p.second.cbegin();
    dir->_decode_base(q);
    dout(10) << " got dir replica " << *dir << dendl;
  }

  // full inodes (base metadata; may update the inode's snaprealm)
  auto p = ack->inode_base.cbegin();
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    bufferlist basebl;
    decode(ino, p);
    decode(last, p);
    decode(basebl, p);
    CInode *in = get_inode(ino, last);
    ceph_assert(in);
    auto q = basebl.cbegin();
    snapid_t sseq = 0;
    if (in->snaprealm)
      sseq = in->snaprealm->srnode.seq;
    in->_decode_base(q);
    // realm seq changed while decoding => clients must be told about the
    // new snap state (UPDATE if the realm already existed, SPLIT if new)
    if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
      int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
      updated_realms.push_back(pair<CInode*,int>(in, snap_op));
    }
    dout(10) << " got inode base " << *in << dendl;
  }

  // inodes (replica nonces + lock states)
  p = ack->inode_locks.cbegin();
  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    __u32 nonce;
    bufferlist lockbl;
    decode(ino, p);
    decode(last, p);
    decode(nonce, p);
    decode(lockbl, p);

    CInode *in = get_inode(ino, last);
    ceph_assert(in);
    in->set_replica_nonce(nonce);
    auto q = lockbl.cbegin();
    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
    in->state_clear(CInode::STATE_REJOINING);
    dout(10) << " got inode locks " << *in << dendl;
  }

  // FIXME: This can happen if entire subtree, together with the inode subtree root
  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
  ceph_assert(isolated_inodes.empty());

  // caps the sender imported from us: tell the affected clients their caps
  // moved (CEPH_CAP_OP_EXPORT) and drop our export bookkeeping
  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
  auto bp = ack->imported_caps.cbegin();
  decode(peer_imported, bp);

  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
       p != peer_imported.end();
       ++p) {
    auto& ex = cap_exports.at(p->first);
    ceph_assert(ex.first == from);
    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      auto r = ex.second.find(q->first);
      ceph_assert(r != ex.second.end());

      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
      if (!session) {
        dout(10) << " no session for client." << p->first << dendl;
        ex.second.erase(r);
        continue;
      }

      // mark client caps stale.
      auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
                                         r->second.capinfo.cap_id, 0,
                                         mds->get_osd_epoch_barrier());
      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
                      (q->second.cap_id > 0 ? from : -1), 0);
      mds->send_message_client_counted(m, session);

      ex.second.erase(r);
    }
    ceph_assert(ex.second.empty());
  }

  // propagate snaprealm changes discovered while decoding inode bases
  for (auto p : updated_realms) {
    CInode *in = p.first;
    bool notify_clients;
    if (mds->is_rejoin()) {
      // recovering: defer client notification until snaprealms are opened
      if (!rejoin_pending_snaprealms.count(in)) {
        in->get(CInode::PIN_OPENINGSNAPPARENTS);
        rejoin_pending_snaprealms.insert(in);
      }
      notify_clients = false;
    } else {
      // notify clients if I'm survivor
      notify_clients = true;
    }
    do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
  }

  // done?
  ceph_assert(rejoin_ack_gather.count(from));
  rejoin_ack_gather.erase(from);
  if (!survivor) {
    if (rejoin_gather.empty()) {
      // eval unstable scatter locks after all wrlocks are rejoined.
      while (!rejoin_eval_locks.empty()) {
        SimpleLock *lock = rejoin_eval_locks.front();
        rejoin_eval_locks.pop_front();
        if (!lock->is_stable())
          mds->locker->eval_gather(lock);
      }
    }

    if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
        rejoin_ack_gather.empty()) {
      // finally, kickstart past snap parent opens
      open_snaprealms();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
              << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
    }
  } else {
    // survivor.
    mds->queue_waiters(rejoin_waiters);
  }
}
5212
5213 /**
5214 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5215 *
5216 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5217 * messages that clean these guys up...
5218 */
5219 void MDCache::rejoin_trim_undef_inodes()
5220 {
5221 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5222
5223 while (!rejoin_undef_inodes.empty()) {
5224 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5225 CInode *in = *p;
5226 rejoin_undef_inodes.erase(p);
5227
5228 in->clear_replica_map();
5229
5230 // close out dirfrags
5231 if (in->is_dir()) {
5232 const auto&& dfls = in->get_dirfrags();
5233 for (const auto& dir : dfls) {
5234 dir->clear_replica_map();
5235
5236 for (auto &p : dir->items) {
5237 CDentry *dn = p.second;
5238 dn->clear_replica_map();
5239
5240 dout(10) << " trimming " << *dn << dendl;
5241 dir->remove_dentry(dn);
5242 }
5243
5244 dout(10) << " trimming " << *dir << dendl;
5245 in->close_dirfrag(dir->dirfrag().frag);
5246 }
5247 }
5248
5249 CDentry *dn = in->get_parent_dn();
5250 if (dn) {
5251 dn->clear_replica_map();
5252 dout(10) << " trimming " << *dn << dendl;
5253 dn->dir->remove_dentry(dn);
5254 } else {
5255 dout(10) << " trimming " << *in << dendl;
5256 remove_inode(in);
5257 }
5258 }
5259
5260 ceph_assert(rejoin_undef_inodes.empty());
5261 }
5262
/**
 * Called once every rejoin message we were waiting for has arrived.
 * Finishes local recovery work and sends our rejoin acks.  The first two
 * steps may kick off asynchronous work; if so they return true and arrange
 * to re-invoke this function when done.
 */
void MDCache::rejoin_gather_finish()
{
  dout(10) << "rejoin_gather_finish" << dendl;
  ceph_assert(mds->is_rejoin());
  ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));

  // may need to fetch undefined inodes/dirfrags first (async; re-entered)
  if (open_undef_inodes_dirfrags())
    return;

  // may need to open missing cap inodes / force-open sessions (async)
  if (process_imported_caps())
    return;

  choose_lock_states_and_reconnect_caps();

  identify_files_to_recover();
  rejoin_send_acks();

  // signal completion of fetches, rejoin_gather_finish, etc.
  rejoin_ack_gather.erase(mds->get_nodeid());

  // did we already get our acks too?
  if (rejoin_ack_gather.empty()) {
    // finally, open snaprealms
    open_snaprealms();
  }
}
5289
5290 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5291 inodeno_t ino;
5292 public:
5293 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5294 void finish(int r) override {
5295 mdcache->rejoin_open_ino_finish(ino, r);
5296 }
5297 };
5298
5299 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5300 {
5301 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5302
5303 if (ret < 0) {
5304 cap_imports_missing.insert(ino);
5305 } else if (ret == mds->get_nodeid()) {
5306 ceph_assert(get_inode(ino));
5307 } else {
5308 auto p = cap_imports.find(ino);
5309 ceph_assert(p != cap_imports.end());
5310 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5311 ceph_assert(q->second.count(MDS_RANK_NONE));
5312 ceph_assert(q->second.size() == 1);
5313 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5314 }
5315 cap_imports.erase(p);
5316 }
5317
5318 ceph_assert(cap_imports_num_opening > 0);
5319 cap_imports_num_opening--;
5320
5321 if (cap_imports_num_opening == 0) {
5322 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5323 rejoin_gather_finish();
5324 else if (rejoin_gather.count(mds->get_nodeid()))
5325 process_imported_caps();
5326 }
5327 }
5328
// Log-flush completion for the ESessions entry written by
// process_imported_caps(): once the forced-open client sessions are
// journaled, hand the session map back to the cache to continue rejoin.
class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
public:
  // client -> (session, sequence) filled in by
  // Server::prepare_force_open_sessions(); consumed by the finisher.
  // NOTE(review): the uint64_t appears to be a session open/push sequence —
  // confirm against Server::finish_force_open_sessions().
  map<client_t,pair<Session*,uint64_t> > session_map;
  C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);  // journal submission is not expected to fail
    mdcache->rejoin_open_sessions_finish(session_map);
  }
};
5338
/**
 * Finisher for C_MDC_RejoinSessionsOpened: complete the forced-open of the
 * journaled client sessions, keep the map for later cap-import processing,
 * and resume rejoin if this was the last thing it was waiting on.
 */
void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
{
  dout(10) << "rejoin_open_sessions_finish" << dendl;
  mds->server->finish_force_open_sessions(session_map);
  // take ownership of the map; process_imported_caps() reads it later
  rejoin_session_map.swap(session_map);
  if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
    rejoin_gather_finish();
}
5347
5348 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5349 {
5350 auto p = cap_imports.find(ino);
5351 if (p != cap_imports.end()) {
5352 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5353 if (ret < 0) {
5354 cap_imports_missing.insert(ino);
5355 } else if (ret != mds->get_nodeid()) {
5356 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5357 ceph_assert(q->second.count(MDS_RANK_NONE));
5358 ceph_assert(q->second.size() == 1);
5359 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5360 }
5361 cap_imports.erase(p);
5362 }
5363 }
5364 }
5365
/**
 * Process the client cap reconnect information accumulated in cap_imports.
 *
 * Returns true if asynchronous work was started (open-file-table prefetch,
 * open_ino() lookups, or journaling forced-open sessions) — the pending
 * callbacks will eventually re-drive rejoin.  Returns false once all cap
 * imports that can be processed now have been processed.
 */
bool MDCache::process_imported_caps()
{
  dout(10) << "process_imported_caps" << dendl;

  // wait (async) for the open file table prefetch before resolving inos
  if (!open_file_table.is_prefetched() &&
      open_file_table.prefetch_inodes()) {
    open_file_table.wait_for_prefetch(
        new MDSInternalContextWrapper(mds,
          new LambdaContext([this](int r) {
            ceph_assert(rejoin_gather.count(mds->get_nodeid()));
            process_imported_caps();
            })
          )
        );
    return true;
  }

  open_ino_batch_start();

  // issue open_ino() for every cap whose inode isn't in cache yet
  for (auto& p : cap_imports) {
    CInode *in = get_inode(p.first);
    if (in) {
      ceph_assert(in->is_auth());
      cap_imports_missing.erase(p.first);
      continue;
    }
    if (cap_imports_missing.count(p.first) > 0)
      continue;  // already known missing; don't retry

    // try to extract a (parent ino, dentry name) hint from any client's
    // reconnect path to speed up the backtrace lookup; only a single
    // path component (no '/') is usable as an ancestor hint
    uint64_t parent_ino = 0;
    std::string_view d_name;
    for (auto& q : p.second) {
      for (auto& r : q.second) {
        auto &icr = r.second;
        if (icr.capinfo.pathbase &&
            icr.path.length() > 0 &&
            icr.path.find('/') == string::npos) {
          parent_ino = icr.capinfo.pathbase;
          d_name = icr.path;
          break;
        }
      }
      if (parent_ino)
        break;
    }

    dout(10) << " opening missing ino " << p.first << dendl;
    cap_imports_num_opening++;
    auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
    if (parent_ino) {
      vector<inode_backpointer_t> ancestors;
      ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
      open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
    } else {
      open_ino(p.first, (int64_t)-1, fin, false);
    }
    // keep the heartbeat alive while iterating a potentially huge map
    if (!(cap_imports_num_opening % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

  open_ino_batch_submit();

  if (cap_imports_num_opening > 0)
    return true;

  // called by rejoin_gather_finish() ?
  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    // sessions for reconnecting clients must exist before caps can be
    // attached; journal their forced open first (async)
    if (!rejoin_client_map.empty() &&
        rejoin_session_map.empty()) {
      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
                                                              rejoin_client_metadata_map,
                                                              finish->session_map);
      ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
                                    std::move(rejoin_client_metadata_map));
      mds->mdlog->start_submit_entry(le, finish);
      mds->mdlog->flush();
      rejoin_client_map.clear();
      rejoin_client_metadata_map.clear();
      return true;
    }

    // process caps that were exported by peer rename
    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
         p != rejoin_peer_exports.end();
         ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(in);
      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
           q != p->second.second.end();
           ++q) {
        auto r = rejoin_session_map.find(q->first);
        if (r == rejoin_session_map.end())
          continue;

        Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        if (!cap) {
          cap = in->add_client_cap(q->first, session);
          // add empty item to reconnected_caps
          (void)reconnected_caps[p->first][q->first];
        }
        cap->merge(q->second, true);

        Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
        ceph_assert(cap->get_last_seq() == im.issue_seq);
        ceph_assert(cap->get_mseq() == im.mseq);
        cap->set_cap_id(im.cap_id);
        // send cap import because we assigned a new cap ID
        do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
                      p->second.first, CEPH_CAP_FLAG_AUTH);
      }
    }
    rejoin_peer_exports.clear();
    rejoin_imported_caps.clear();

    // process cap imports
    //  ino -> client -> frommds -> capex
    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
      CInode *in = get_inode(p->first);
      if (!in) {
        dout(10) << " still missing ino " << p->first
                 << ", will try again after replayed client requests" << dendl;
        ++p;
        continue;
      }
      ceph_assert(in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        Session *session;
        {
          auto r = rejoin_session_map.find(q->first);
          session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
        }

        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
          if (!session) {
            // no session: still record the (zeroed) import for the peer
            if (r->first >= 0)
              (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
            continue;
          }

          Capability *cap = in->reconnect_cap(q->first, r->second, session);
          add_reconnected_cap(q->first, in->ino(), r->second);
          if (r->first >= 0) {
            if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
              cap->inc_mseq();
            do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);

            Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
            im.cap_id = cap->get_cap_id();
            im.issue_seq = cap->get_last_seq();
            im.mseq = cap->get_mseq();
          }
        }
      }
      // post-increment before erase keeps the iterator valid
      cap_imports.erase(p++);  // remove and move on
    }
  } else {
    // still gathering our own rejoin: just trim what we don't own and
    // let the rejoin pipeline continue
    trim_non_auth();

    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
    rejoin_gather.erase(mds->get_nodeid());
    ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
    maybe_send_pending_rejoins();
  }
  return false;
}
5533
/**
 * Rebuild the "need snapflush" bookkeeping for one client on one head inode
 * after recovery: walk the snapshotted (pre-head) inode versions covering
 * snapids in (snap_follows, head_in->first) and mark each one as awaiting a
 * snap flush from `client`, wrlocking the relevant inode locks so writeback
 * of the snapped metadata is held until the client flushes.
 */
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
                                     client_t client, snapid_t snap_follows)
{
  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;

  // nothing to do if no snapshots exist in the interesting range
  if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
    return;

  const set<snapid_t>& snaps = realm->get_snaps();
  snapid_t follows = snap_follows;

  // walk successive snapped versions of the inode, oldest first, until we
  // reach the head inode itself
  while (true) {
    CInode *in = pick_inode_snap(head_in, follows);
    if (in == head_in)
      break;

    // register every realm snapid that falls within [in->first, in->last]
    // and is newer than `follows`
    bool need_snapflush = false;
    for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
         p != snaps.end() && *p <= in->last;
         ++p) {
      head_in->add_need_snapflush(in, *p, client);
      need_snapflush = true;
    }
    follows = in->last;
    if (!need_snapflush)
      continue;

    dout(10) << " need snapflush from client." << client << " on " << *in << dendl;

    // first client to need a flush on this snapped inode: wrlock each
    // cap-related lock (LOCK_SNAP_SYNC) so the snapped state can't be
    // written back before the flush arrives
    if (in->client_snap_caps.empty()) {
      for (int i = 0; i < num_cinode_locks; i++) {
        int lockid = cinode_lock_info[i].lock;
        SimpleLock *lock = in->get_lock(lockid);
        ceph_assert(lock);
        in->auth_pin(lock);
        lock->set_state(LOCK_SNAP_SYNC);
        lock->get_wrlock(true);
      }
    }
    in->client_snap_caps.insert(client);
    mds->locker->mark_need_snapflush_inode(in);
  }
}
5577
5578 /*
5579 * choose lock states based on reconnected caps
5580 */
5581 void MDCache::choose_lock_states_and_reconnect_caps()
5582 {
5583 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5584
5585 int count = 0;
5586 for (auto p : inode_map) {
5587 CInode *in = p.second;
5588 if (in->last != CEPH_NOSNAP)
5589 continue;
5590
5591 if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat())
5592 in->mark_dirty_rstat();
5593
5594 int dirty_caps = 0;
5595 auto q = reconnected_caps.find(in->ino());
5596 if (q != reconnected_caps.end()) {
5597 for (const auto &it : q->second)
5598 dirty_caps |= it.second.dirty_caps;
5599 }
5600 in->choose_lock_states(dirty_caps);
5601 dout(15) << " chose lock states on " << *in << dendl;
5602
5603 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5604 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5605 rejoin_pending_snaprealms.insert(in);
5606 }
5607
5608 if (!(++count % mds->heartbeat_reset_grace()))
5609 mds->heartbeat_reset();
5610 }
5611 }
5612
5613 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5614 map<client_t,ref_t<MClientSnap>>& splits)
5615 {
5616 ref_t<MClientSnap> snap;
5617 auto it = splits.find(client);
5618 if (it != splits.end()) {
5619 snap = it->second;
5620 snap->head.op = CEPH_SNAP_OP_SPLIT;
5621 } else {
5622 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5623 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5624 snap->head.split = realm->inode->ino();
5625 snap->bl = mds->server->get_snap_trace(client, realm);
5626
5627 for (const auto& child : realm->open_children)
5628 snap->split_realms.push_back(child->inode->ino());
5629 }
5630 snap->split_inos.push_back(ino);
5631 }
5632
5633 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5634 map<client_t,ref_t<MClientSnap>>& splits)
5635 {
5636 ceph_assert(parent_realm);
5637
5638 vector<inodeno_t> split_inos;
5639 vector<inodeno_t> split_realms;
5640
5641 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p)
5642 split_inos.push_back((*p)->ino());
5643 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5644 p != realm->open_children.end();
5645 ++p)
5646 split_realms.push_back((*p)->inode->ino());
5647
5648 for (const auto& p : realm->client_caps) {
5649 ceph_assert(!p.second->empty());
5650 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5651 if (em.second) {
5652 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5653 update->head.split = parent_realm->inode->ino();
5654 update->split_inos = split_inos;
5655 update->split_realms = split_realms;
5656 update->bl = mds->server->get_snap_trace(p.first, parent_realm);
5657 em.first->second = std::move(update);
5658 }
5659 }
5660 }
5661
5662 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5663 {
5664 dout(10) << "send_snaps" << dendl;
5665
5666 for (auto &p : splits) {
5667 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5668 if (session) {
5669 dout(10) << " client." << p.first
5670 << " split " << p.second->head.split
5671 << " inos " << p.second->split_inos
5672 << dendl;
5673 mds->send_message_client_counted(p.second, session);
5674 } else {
5675 dout(10) << " no session for client." << p.first << dendl;
5676 }
5677 }
5678 splits.clear();
5679 }
5680
5681
5682 /*
5683 * remove any items from logsegment open_file lists that don't have
5684 * any caps
5685 */
5686 void MDCache::clean_open_file_lists()
5687 {
5688 dout(10) << "clean_open_file_lists" << dendl;
5689
5690 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5691 p != mds->mdlog->segments.end();
5692 ++p) {
5693 LogSegment *ls = p->second;
5694
5695 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5696 while (!q.end()) {
5697 CInode *in = *q;
5698 ++q;
5699 if (in->last == CEPH_NOSNAP) {
5700 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5701 in->item_open_file.remove_myself();
5702 } else {
5703 if (in->client_snap_caps.empty()) {
5704 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5705 in->item_open_file.remove_myself();
5706 }
5707 }
5708 }
5709 }
5710 }
5711
5712 void MDCache::dump_openfiles(Formatter *f)
5713 {
5714 f->open_array_section("openfiles");
5715 for (auto p = mds->mdlog->segments.begin();
5716 p != mds->mdlog->segments.end();
5717 ++p) {
5718 LogSegment *ls = p->second;
5719
5720 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5721 while (!q.end()) {
5722 CInode *in = *q;
5723 ++q;
5724 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5725 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5726 continue;
5727 f->open_object_section("file");
5728 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5729 f->close_section();
5730 }
5731 }
5732 f->close_section();
5733 }
5734
5735 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5736 {
5737 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5738 << " on " << *in << dendl;
5739 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5740 if (!session) {
5741 dout(10) << " no session for client." << client << dendl;
5742 return NULL;
5743 }
5744
5745 Capability *cap = in->reconnect_cap(client, icr, session);
5746
5747 if (frommds >= 0) {
5748 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5749 cap->inc_mseq();
5750 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5751 }
5752
5753 return cap;
5754 }
5755
5756 void MDCache::export_remaining_imported_caps()
5757 {
5758 dout(10) << "export_remaining_imported_caps" << dendl;
5759
5760 CachedStackStringStream css;
5761
5762 int count = 0;
5763 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5764 *css << " ino " << p->first << "\n";
5765 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5766 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5767 if (session) {
5768 // mark client caps stale.
5769 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5770 0, 0, 0,
5771 mds->get_osd_epoch_barrier());
5772 stale->set_cap_peer(0, 0, 0, -1, 0);
5773 mds->send_message_client_counted(stale, q->first);
5774 }
5775 }
5776
5777 if (!(++count % mds->heartbeat_reset_grace()))
5778 mds->heartbeat_reset();
5779 }
5780
5781 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5782 p != cap_reconnect_waiters.end();
5783 ++p)
5784 mds->queue_waiters(p->second);
5785
5786 cap_imports.clear();
5787 cap_reconnect_waiters.clear();
5788
5789 if (css->strv().length()) {
5790 mds->clog->warn() << "failed to reconnect caps for missing inodes:"
5791 << css->strv();
5792 }
5793 }
5794
/*
 * Try to apply a cap reconnect recorded during replay for (in, client).
 * Returns the reconnected Capability, or nullptr when no reconnect
 * record exists for this inode/client pair.
 */
Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
{
  client_t client = session->info.get_client();
  Capability *cap = nullptr;
  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
  if (rc) {
    cap = in->reconnect_cap(client, *rc, session);
    dout(10) << "try_reconnect_cap client." << client
	     << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
	     << " issue " << ccap_string(rc->capinfo.issued)
	     << " on " << *in << dendl;
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      // unreplicated: pick lock states based on whatever caps the client
      // reported dirty at reconnect time (if any were recorded)
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
	auto q = p->second.find(client);
	if (q != p->second.end())
	  dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    // wake anyone who was waiting for this inode's caps to reconnect
    map<inodeno_t, MDSContext::vec >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
  return cap;
}
5831
5832
5833
5834 // -------
5835 // cap imports and delayed snap parent opens
5836
/*
 * Send a CEPH_CAP_OP_IMPORT to the client for its cap on `in`, telling
 * it this MDS is now the cap's home.  The p_* arguments describe the
 * cap's previous state on the exporting peer (peer < 0 means none).
 */
void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
			    int peer, int p_flags)
{
  SnapRealm *realm = in->find_snaprealm();
  dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
  if (cap->get_last_seq() == 0) // reconnected cap
    cap->inc_last_seq();
  cap->set_last_issue();
  cap->set_last_issue_stamp(ceph_clock_now());
  cap->clear_new();
  auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
					in->ino(), realm->inode->ino(), cap->get_cap_id(),
					cap->get_last_seq(), cap->pending(), cap->wanted(),
					0, cap->get_mseq(), mds->get_osd_epoch_barrier());
  in->encode_cap_message(reap, cap);
  // include the snap trace so the client can bring its realm view up to date
  reap->snapbl = mds->server->get_snap_trace(session, realm);
  reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
  mds->send_message_client_counted(reap, session);
}
5857
5858 void MDCache::do_delayed_cap_imports()
5859 {
5860 dout(10) << "do_delayed_cap_imports" << dendl;
5861
5862 ceph_assert(delayed_imported_caps.empty());
5863 }
5864
5865 struct C_MDC_OpenSnapRealms : public MDCacheContext {
5866 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
5867 void finish(int r) override {
5868 mdcache->open_snaprealms();
5869 }
5870 };
5871
5872 void MDCache::open_snaprealms()
5873 {
5874 dout(10) << "open_snaprealms" << dendl;
5875
5876 auto it = rejoin_pending_snaprealms.begin();
5877 while (it != rejoin_pending_snaprealms.end()) {
5878 CInode *in = *it;
5879 SnapRealm *realm = in->snaprealm;
5880 ceph_assert(realm);
5881
5882 map<client_t,ref_t<MClientSnap>> splits;
5883 // finish off client snaprealm reconnects?
5884 auto q = reconnected_snaprealms.find(in->ino());
5885 if (q != reconnected_snaprealms.end()) {
5886 for (const auto& r : q->second)
5887 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5888 reconnected_snaprealms.erase(q);
5889 }
5890
5891 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
5892 CInode *child = *p;
5893 auto q = reconnected_caps.find(child->ino());
5894 ceph_assert(q != reconnected_caps.end());
5895 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5896 Capability *cap = child->get_client_cap(r->first);
5897 if (!cap)
5898 continue;
5899 if (r->second.snap_follows > 0) {
5900 if (r->second.snap_follows < child->first - 1) {
5901 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5902 } else if (r->second.snapflush) {
5903 // When processing a cap flush message that is re-sent, it's possble
5904 // that the sender has already released all WR caps. So we should
5905 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5906 cap->mark_needsnapflush();
5907 }
5908 }
5909 // make sure client's cap is in the correct snaprealm.
5910 if (r->second.realm_ino != in->ino()) {
5911 prepare_realm_split(realm, r->first, child->ino(), splits);
5912 }
5913 }
5914 }
5915
5916 rejoin_pending_snaprealms.erase(it++);
5917 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5918
5919 send_snaps(splits);
5920 }
5921
5922 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5923
5924 if (!reconnected_snaprealms.empty()) {
5925 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5926 for (auto& p : reconnected_snaprealms) {
5927 CachedStackStringStream css;
5928 *css << " " << p.first << " {";
5929 bool first = true;
5930 for (auto& q : p.second) {
5931 if (!first)
5932 *css << ", ";
5933 *css << "client." << q.first << "/" << q.second;
5934 }
5935 *css << "}";
5936 dout(5) << css->strv() << dendl;
5937 }
5938 }
5939 ceph_assert(rejoin_waiters.empty());
5940 ceph_assert(rejoin_pending_snaprealms.empty());
5941 dout(10) << "open_snaprealms - all open" << dendl;
5942 do_delayed_cap_imports();
5943
5944 ceph_assert(rejoin_done);
5945 rejoin_done.release()->complete(0);
5946 reconnected_caps.clear();
5947 }
5948
5949 bool MDCache::open_undef_inodes_dirfrags()
5950 {
5951 dout(10) << "open_undef_inodes_dirfrags "
5952 << rejoin_undef_inodes.size() << " inodes "
5953 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5954
5955 // dirfrag -> (fetch_complete, keys_to_fetch)
5956 map<CDir*, pair<bool, std::vector<dentry_key_t> > > fetch_queue;
5957 for (auto& dir : rejoin_undef_dirfrags) {
5958 ceph_assert(dir->get_version() == 0);
5959 fetch_queue.emplace(std::piecewise_construct, std::make_tuple(dir), std::make_tuple());
5960 }
5961
5962 if (g_conf().get_val<bool>("mds_dir_prefetch")) {
5963 for (auto& in : rejoin_undef_inodes) {
5964 ceph_assert(!in->is_base());
5965 ceph_assert(in->get_parent_dir());
5966 fetch_queue.emplace(std::piecewise_construct, std::make_tuple(in->get_parent_dir()), std::make_tuple());
5967 }
5968 } else {
5969 for (auto& in : rejoin_undef_inodes) {
5970 assert(!in->is_base());
5971 CDentry *dn = in->get_parent_dn();
5972 auto& p = fetch_queue[dn->get_dir()];
5973
5974 if (dn->last != CEPH_NOSNAP) {
5975 p.first = true;
5976 p.second.clear();
5977 } else if (!p.first) {
5978 p.second.push_back(dn->key());
5979 }
5980 }
5981 }
5982
5983 if (fetch_queue.empty())
5984 return false;
5985
5986 MDSGatherBuilder gather(g_ceph_context,
5987 new MDSInternalContextWrapper(mds,
5988 new LambdaContext([this](int r) {
5989 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5990 rejoin_gather_finish();
5991 })
5992 )
5993 );
5994
5995 for (auto& p : fetch_queue) {
5996 CDir *dir = p.first;
5997 CInode *diri = dir->get_inode();
5998 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5999 continue;
6000 if (dir->state_test(CDir::STATE_REJOINUNDEF))
6001 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
6002 if (p.second.first || p.second.second.empty()) {
6003 dir->fetch(gather.new_sub());
6004 } else {
6005 dir->fetch_keys(p.second.second, gather.new_sub());
6006 }
6007 }
6008 ceph_assert(gather.has_subs());
6009 gather.activate();
6010 return true;
6011 }
6012
/*
 * Called once a previously-undefined inode has been loaded during
 * rejoin: drop it from the undef set and, for directories, rebuild the
 * dirfrag set from the now-known fragtree.
 */
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
    if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      // a frag_t() dirfrag exists but the tree says the root frag is
      // split: replace it with the correct leaf fragments
      CDir *dir = in->get_dirfrag(frag_t());
      ceph_assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      // the new leaf dirfrags still need to be fetched
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
        rejoin_undef_dirfrags.insert(dir);
      }
    }
  }
}
6031
6032 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6033 map<client_t,ref_t<MClientSnap>>& updates)
6034 {
6035 if (seq < realm->get_newest_seq()) {
6036 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
6037 << realm->get_newest_seq() << " on " << *realm << dendl;
6038 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
6039 snap->bl = mds->server->get_snap_trace(client, realm);
6040 for (const auto& child : realm->open_children)
6041 snap->split_realms.push_back(child->inode->ino());
6042 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
6043 } else {
6044 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6045 << " on " << *realm << dendl;
6046 }
6047 }
6048
6049
6050
/*
 * Send rejoin ACKs to every rank in the recovery set that has not been
 * acked yet.  The ack carries strong state (dirfrags, dentries, inodes,
 * lock states) for everything the peer replicates from us, walking each
 * auth subtree breadth-first.  Note: ++r.second bumps the stored
 * per-replica counter as each item is re-sent with the ack.
 */
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  // (walk each unlinked inode up toward the root, replicating its
  // ancestry to the peer until we hit something already replicated)
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree (breadth-first via dq)
    std::queue<CDir*> dq;
    dq.push(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
					dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  for (const auto& dir : dirs) {
	    dq.push(dir);
	  }
	}
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
6218
6219 class C_MDC_ReIssueCaps : public MDCacheContext {
6220 CInode *in;
6221 public:
6222 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6223 MDCacheContext(mdc), in(i)
6224 {
6225 in->get(CInode::PIN_PTRWAITER);
6226 }
6227 void finish(int r) override {
6228 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6229 mdcache->mds->locker->issue_caps(in);
6230 in->put(CInode::PIN_PTRWAITER);
6231 }
6232 };
6233
/*
 * Re-evaluate locks and reissue caps on every head inode with client
 * caps.  Called by MDSRank::active_start(); frozen inodes retry via a
 * waiter.  Heartbeat is reset periodically since inode_map can be huge.
 */
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  int count = 0;
  for (auto &p : inode_map) {
    // n = 1 for the inode itself, plus however many caps were issued;
    // used to weight the heartbeat accounting below
    int n = 1;
    CInode *in = p.second;
    if (in->is_head() && in->is_any_caps()) {
      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
      if (in->is_frozen_inode()) {
        in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
        continue;
      }
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
        n += mds->locker->issue_caps(in);
    }

    // reset the heartbeat whenever the running count crosses a grace boundary
    if ((count % mds->heartbeat_reset_grace()) + n >= mds->heartbeat_reset_grace())
      mds->heartbeat_reset();
    count += n;
  }
}
6257
6258
6259 // ===============================================================================
6260
6261 struct C_MDC_QueuedCow : public MDCacheContext {
6262 CInode *in;
6263 MutationRef mut;
6264 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6265 MDCacheContext(mdc), in(i), mut(m) {}
6266 void finish(int r) override {
6267 mdcache->_queued_file_recover_cow(in, mut);
6268 }
6269 };
6270
6271
/*
 * Enqueue an auth inode for file-size recovery.  A COW path for snapped
 * inodes existed here but is disabled (kept below for reference).
 */
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    auto pi = in->project_inode(mut);
    pi.inode.version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}
6315
// Completion for the (currently disabled) COW path above: apply the
// journaled mutation, release its locks, and clean it up.
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
6322
6323
6324 /*
6325 * called after recovery to recover file sizes for previously opened (for write)
6326 * files. that is, those where max_size > size.
6327 */
void MDCache::identify_files_to_recover()
{
  dout(10) << "identify_files_to_recover" << dendl;
  int count = 0;

  // Clear the recover and check queues in case the monitor sends rejoin mdsmap twice.
  rejoin_recover_q.clear();
  rejoin_check_q.clear();

  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (!in->is_auth())
      continue;

    if (in->last != CEPH_NOSNAP)
      continue;

    // Only normal files need file size recovery
    if (!in->is_file()) {
      continue;
    }

    // a client_range without a matching writeable cap means the writer is
    // gone: the file's size must be probed (recovered) from the objects
    bool recover = false;
    const auto& client_ranges = in->get_projected_inode()->client_ranges;
    if (!client_ranges.empty()) {
      in->mark_clientwriteable();
      for (auto& p : client_ranges) {
	Capability *cap = in->get_client_cap(p.first);
	if (cap) {
	  cap->mark_clientwriteable();
	} else {
	  dout(10) << " client." << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
	  recover = true;
	  break;
	}
      }
    }

    if (recover) {
      if (in->filelock.is_stable()) {
	in->auth_pin(&in->filelock);
      } else {
	ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
      }
      in->filelock.set_state(LOCK_PRE_SCAN);
      rejoin_recover_q.push_back(in);
    } else {
      // no recovery needed, but max_size still has to be re-checked later
      rejoin_check_q.push_back(in);
    }

    // long walk over the whole inode map; keep the heartbeat alive
    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }
}
6382
6383 void MDCache::start_files_to_recover()
6384 {
6385 int count = 0;
6386 for (CInode *in : rejoin_check_q) {
6387 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6388 mds->locker->issue_caps(in);
6389 mds->locker->check_inode_max_size(in);
6390 if (!(++count % mds->heartbeat_reset_grace()))
6391 mds->heartbeat_reset();
6392 }
6393 rejoin_check_q.clear();
6394 for (CInode *in : rejoin_recover_q) {
6395 mds->locker->file_recover(&in->filelock);
6396 if (!(++count % mds->heartbeat_reset_grace()))
6397 mds->heartbeat_reset();
6398 }
6399 if (!rejoin_recover_q.empty()) {
6400 rejoin_recover_q.clear();
6401 do_file_recover();
6402 }
6403 }
6404
// Advance the file recovery queue (start/continue queued recoveries).
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6409
6410 // ===============================================================================
6411
6412
6413 // ----------------------------
6414 // truncate
6415
6416 class C_MDC_RetryTruncate : public MDCacheContext {
6417 CInode *in;
6418 LogSegment *ls;
6419 public:
6420 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6421 MDCacheContext(c), in(i), ls(l) {}
6422 void finish(int r) override {
6423 mdcache->_truncate_inode(in, ls);
6424 }
6425 };
6426
/*
 * Begin truncating `in` (journaled in segment `ls`): pin the inode,
 * then either defer until pending snapflush data is written back, or
 * proceed directly to _truncate_inode().
 */
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  // buffered snap data must hit the OSDs before we truncate the objects;
  // wait for the xlock snap sync, then retry
  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    ceph_assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
6449
6450 struct C_IO_MDC_TruncateWriteFinish : public MDCacheIOContext {
6451 CInode *in;
6452 LogSegment *ls;
6453 uint32_t block_size;
6454 C_IO_MDC_TruncateWriteFinish(MDCache *c, CInode *i, LogSegment *l, uint32_t bs) :
6455 MDCacheIOContext(c, false), in(i), ls(l), block_size(bs) {
6456 }
6457 void finish(int r) override {
6458 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
6459 mdcache->truncate_inode_write_finish(in, ls, block_size);
6460 }
6461 void print(ostream& out) const override {
6462 out << "file_truncate_write(" << in->ino() << ")";
6463 }
6464 };
6465
6466 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6467 CInode *in;
6468 LogSegment *ls;
6469 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6470 MDCacheIOContext(c, false), in(i), ls(l) {
6471 }
6472 void finish(int r) override {
6473 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
6474 mdcache->truncate_inode_finish(in, ls);
6475 }
6476 void print(ostream& out) const override {
6477 out << "file_truncate(" << in->ino() << ")";
6478 }
6479 };
6480
/*
 * Issue the OSD-side work for a truncate.  With fscrypt, the (encrypted)
 * last block may need to be rewritten first; otherwise the object range
 * [truncate_size, truncate_from) is truncated directly.
 */
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " fscrypt last block length is " << pi->fscrypt_last_block.length()
	   << " on " << *in << dendl;

  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  // equal from/size is only legal when an fscrypt last block is queued
  ceph_assert(pi->truncate_size < pi->truncate_from ||
              (pi->truncate_size == pi->truncate_from &&
	       pi->fscrypt_last_block.length()));


  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in
	   << " fscrypt_last_block length is " << pi->fscrypt_last_block.length()
	   << dendl;
  auto layout = pi->layout;
  struct ceph_fscrypt_last_block_header header;
  memset(&header, 0, sizeof(header));
  bufferlist data;
  // decode the queued fscrypt last-block header (and, if present, the
  // block payload itself) from the projected inode
  if (pi->fscrypt_last_block.length()) {
    auto bl = pi->fscrypt_last_block.cbegin();
    DECODE_START(1, bl);
    decode(header.change_attr, bl);
    decode(header.file_offset, bl);
    decode(header.block_size, bl);

    /*
     * The block_size will be in unit of KB, so if the last block is not
     * located in a file hole, the struct_len should be larger than the
     * header.block_size.
     */
    if (struct_len > header.block_size) {
      bl.copy(header.block_size, data);
    }
    DECODE_FINISH(bl);
  }

  if (data.length()) {
    // rewrite the re-encrypted last block, then truncate from the
    // write-finish callback
    dout(10) << "_truncate_inode write on inode " << *in << " change_attr: "
             << header.change_attr << " offset: " << header.file_offset << " blen: "
             << header.block_size << dendl;
    filer.write(in->ino(), &layout, *snapc, header.file_offset, header.block_size,
                data, ceph::real_time::min(), 0,
                new C_OnFinisher(new C_IO_MDC_TruncateWriteFinish(this, in, ls,
                                                                  header.block_size),
                                 mds->finisher));
  } else { // located in file hole.
    uint64_t length = pi->truncate_from - pi->truncate_size;

    /*
     * When the fscrypt is enabled the truncate_from and truncate_size
     * possibly equal and both are aligned up to header.block_size. In
     * this case we will always request a larger length to make sure the
     * OSD won't miss truncating the last object.
     */
    if (pi->fscrypt_last_block.length()) {
      dout(10) << "_truncate_inode truncate on inode " << *in << " hits a hole!" << dendl;
      length += header.block_size;
    }
    ceph_assert(length);

    dout(10) << "_truncate_inode truncate on inode " << *in << dendl;
    filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length,
                   pi->truncate_seq, ceph::real_time::min(), 0,
                   new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
                                    mds->finisher));
  }

}
6565
6566 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6567 CInode *in;
6568 MutationRef mut;
6569 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6570 MDCacheLogContext(m), in(i), mut(mu) {}
6571 void finish(int r) override {
6572 mdcache->truncate_inode_logged(in, mut);
6573 }
6574 };
6575
/*
 * Called after the fscrypt last-block rewrite completed; now truncate
 * the remaining object range.  `block_size` is the fscrypt block size
 * decoded in _truncate_inode().
 */
void MDCache::truncate_inode_write_finish(CInode *in, LogSegment *ls,
                                          uint32_t block_size)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode_write "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  // equal from/size is only legal when an fscrypt last block is queued
  ceph_assert(pi->truncate_size < pi->truncate_from ||
              (pi->truncate_size == pi->truncate_from &&
	       pi->fscrypt_last_block.length()));


  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode_write snapc " << snapc << " on " << *in
	   << " fscrypt_last_block length is " << pi->fscrypt_last_block.length()
	   << dendl;
  auto layout = pi->layout;
  /*
   * When the fscrypt is enabled the truncate_from and truncate_size
   * possibly equal and both are aligned up to header.block_size. In
   * this case we will always request a larger length to make sure the
   * OSD won't miss truncating the last object.
   */
  uint64_t length = pi->truncate_from - pi->truncate_size + block_size;
  filer.truncate(in->ino(), &layout, *snapc, pi->truncate_size, length,
                 pi->truncate_seq, ceph::real_time::min(), 0,
                 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
                                  mds->finisher));
}
6619
/*
 * The OSD-side truncate is done: clear the inode's truncating state and
 * journal a "truncate finish" event.  The projected update is applied
 * once the entry commits (C_MDC_TruncateLogged).
 */
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();

  // update: clear the truncate-in-progress fields
  auto pi = in->project_inode(mut);
  pi.inode->version = in->pre_dirty();
  pi.inode->truncate_from = 0;
  pi.inode->truncate_pending--;
  pi.inode->fscrypt_last_block = bufferlist();

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);

  predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  journal_dirty_inode(mut.get(), &le->metablob, in);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
6651
6652 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6653 {
6654 dout(10) << "truncate_inode_logged " << *in << dendl;
6655 mut->apply();
6656 mds->locker->drop_locks(mut.get());
6657 mut->cleanup();
6658
6659 in->put(CInode::PIN_TRUNCATING);
6660 in->auth_unpin(this);
6661
6662 MDSContext::vec waiters;
6663 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6664 mds->queue_waiters(waiters);
6665 }
6666
6667
6668 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6669 {
6670 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6671 << ls->seq << "/" << ls->offset << dendl;
6672 ls->truncating_inodes.insert(in);
6673 in->get(CInode::PIN_TRUNCATING);
6674 }
6675
6676 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6677 {
6678 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6679 << ls->seq << "/" << ls->offset << dendl;
6680 // if we have the logseg the truncate started in, it must be in our list.
6681 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6682 ceph_assert(p != ls->truncating_inodes.end());
6683 ls->truncating_inodes.erase(p);
6684 in->put(CInode::PIN_TRUNCATING);
6685 }
6686
/*
 * After journal replay, resume every truncate that was in flight when
 * the MDS failed, deferring (as in truncate_inode) any inode with
 * buffered snap data outstanding.
 */
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      // wait for buffered snap data to be written back before truncating
      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	ceph_assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
6713
6714
/*
 * Journal-commit callback for an EPurged entry: applies the projected
 * inotable release (when inotablev != 0) and marks the inos purged in
 * their log segment.
 */
class C_MDS_purge_completed_finish : public MDCacheLogContext {
  interval_set<inodeno_t> inos;   // ino ranges that were purged
  LogSegment *ls;                 // segment that recorded the purge
  version_t inotablev;            // projected inotable version (0 = none)
public:
  C_MDS_purge_completed_finish(MDCache *m, const interval_set<inodeno_t>& _inos,
			       LogSegment *_ls, version_t iv)
    : MDCacheLogContext(m), inos(_inos), ls(_ls), inotablev(iv) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    if (inotablev) {
      // commit the ino release projected in purge_inodes()
      get_mds()->inotable->apply_release_ids(inos);
      ceph_assert(get_mds()->inotable->get_version() == inotablev);
    }
    ls->purge_inodes_finish(inos);
  }
};
6732
6733 void MDCache::start_purge_inodes(){
6734 dout(10) << "start_purge_inodes" << dendl;
6735 for (auto& p : mds->mdlog->segments){
6736 LogSegment *ls = p.second;
6737 if (ls->purging_inodes.size()){
6738 purge_inodes(ls->purging_inodes, ls);
6739 }
6740 }
6741 }
6742
/*
 * Purge the object data of the given ino ranges, then (once all purges
 * finish) project the ino release in the inotable and journal an
 * EPurged entry; C_MDS_purge_completed_finish commits the release.
 */
void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
{
  dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl;
  // FIXME: handle non-default data pool and namespace

  auto cb = new LambdaContext([this, inos, ls](int r){
      // r == -2: presumably -ENOENT (objects already gone) — TODO confirm
      ceph_assert(r == 0 || r == -2);
      mds->inotable->project_release_ids(inos);
      version_t piv = mds->inotable->get_projected_version();
      ceph_assert(piv != 0);
      mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv),
				     new C_MDS_purge_completed_finish(this, inos, ls, piv));
      mds->mdlog->flush();
    });
  
  C_GatherBuilder gather(g_ceph_context,
			  new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher));
  SnapContext nullsnapc;
  // one purge_range per ino; object 0..1 of each (default layout/pool)
  for (const auto& [start, len] : inos) {
    for (auto i = start; i < start + len ; i += 1) {
      filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1,
			ceph::real_clock::now(), 0, gather.new_sub());
    }
  }
  gather.activate();
}
6769
6770 // ================================================================================
6771 // cache trimming
6772
/**
 * Expire dentries from the two cache LRUs.
 *
 * @param count     number of dentries to try to trim from the main LRU
 *                  (0 means "only trim while the cache is too full")
 * @param expiremap accumulates MCacheExpire messages for replica peers
 * @return (throttled, trimmed): whether the trim-rate throttle stopped us
 *         early, and how many dentries were actually trimmed
 */
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;
  uint64_t trimmed = 0;

  // rate limit: max dentries trimmed per trim_counter decay interval
  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
          << " items from LRU"
          << " size=" << lru.lru_get_size()
          << " mid=" << lru.lru_get_top()
          << " pintail=" << lru.lru_get_pintail()
          << " pinned=" << lru.lru_get_num_pinned()
          << dendl;

  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  // first drain bottom_lru (null/unused dentries) unconditionally,
  // subject only to the throttle
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      // could not be expired right now; re-insert after the loop so we
      // don't pop it again in this pass
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // (in standby_replay, keep going regardless of count so inodes get
  // pushed toward the bottom of the lru; see below)
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if (is_standby_replay && dn->get_linkage()->inode) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}
6840
/*
 * note: only called while MDS is active or stopping... NOT during recovery.
 * however, we may expire a replica whose authority is recovering.
 *
 * @param count is number of dentries to try to expire
 */
std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
{
  uint64_t used = cache_size();
  uint64_t limit = cache_memory_limit;
  expiremap expiremap;  // per-rank MCacheExpire messages built up during this pass

  dout(7) << "trim bytes_used=" << bytes2str(used)
          << " limit=" << bytes2str(limit)
          << " reservation=" << cache_reservation
          << "% count=" << count << dendl;

  // process delayed eval_stray()
  stray_manager.advance_delayed();

  auto result = trim_lru(count, expiremap);
  auto& trimmed = result.second;

  // trim non-auth, non-bound subtrees
  for (auto p = subtrees.begin(); p != subtrees.end();) {
    CDir *dir = p->first;
    ++p;  // advance now: dir may be removed from subtrees below
    CInode *diri = dir->get_inode();
    if (dir->is_auth()) {
      if (diri->is_auth() && !diri->is_base()) {
        /* this situation should correspond to an export pin */
        if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
          /* pinned empty subtree, try to drop */
          if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
            dout(20) << "trimming empty pinned subtree " << *dir << dendl;
            dir->state_clear(CDir::STATE_AUXSUBTREE);
            remove_subtree(dir);
            diri->close_dirfrag(dir->dirfrag().frag);
          }
        }
      } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
        // auth dirfrag of a non-auth inode: an empty import we can hand back,
        // unless it is busy (exporting/frozen) or we're not in a steady state
        if (dir->state_test(CDir::STATE_EXPORTING) ||
            !(mds->is_active() || mds->is_stopping()) ||
            dir->is_freezing() || dir->is_frozen())
          continue;

        migrator->export_empty_import(dir);
        ++trimmed;
      }
    } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
      // only subtree pin
      if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
        continue;
      }

      // don't trim subtree root if its auth MDS is recovering.
      // This simplify the cache rejoin code.
      if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
        continue;
      trim_dirfrag(dir, 0, expiremap);
      ++trimmed;
    }
  }

  // trim root?
  if (mds->is_stopping() && root) {
    auto&& ls = root->get_dirfrags();
    for (const auto& dir : ls) {
      if (dir->get_num_ref() == 1) { // subtree pin
        trim_dirfrag(dir, 0, expiremap);
        ++trimmed;
      }
    }
    if (root->get_num_ref() == 0) {
      trim_inode(0, root, 0, expiremap);
      ++trimmed;
    }
  }

  // help other ranks that are stopping to empty their mdsdir replicas
  std::set<mds_rank_t> stopping;
  mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
  stopping.erase(mds->get_nodeid());
  for (auto rank : stopping) {
    CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
    if (!mdsdir_in)
      continue;

    // ensure an expire message exists for that rank even if nothing was
    // trimmed yet this pass
    auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
    if (em.second) {
      em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
    }

    dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds->get_nodeid() << dendl;

    const bool aborted = expire_recursive(mdsdir_in, expiremap);
    if (!aborted) {
      dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
      auto&& ls = mdsdir_in->get_dirfrags();
      for (auto dir : ls) {
        if (dir->get_num_ref() == 1) { // subtree pin
          trim_dirfrag(dir, dir, expiremap);
          ++trimmed;
        }
      }
      if (mdsdir_in->get_num_ref() == 0) {
        trim_inode(NULL, mdsdir_in, NULL, expiremap);
        ++trimmed;
      }
    } else {
      // something in the mdsdir is still pinned; try again next pass
      dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
    }
  }

  // Other rank's base inodes (when I'm stopping)
  if (mds->is_stopping()) {
    for (set<CInode*>::iterator p = base_inodes.begin();
         p != base_inodes.end();) {
      CInode *base_in = *p;
      ++p;  // advance first: trim_inode may remove base_in from base_inodes
      if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
	  MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
        dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
        if (base_in->get_num_ref() == 0) {
          trim_inode(NULL, base_in, NULL, expiremap);
          ++trimmed;
        }
      }
    }
  }

  // send any expire messages
  send_expire_messages(expiremap);

  return result;
}
6976
6977 void MDCache::send_expire_messages(expiremap& expiremap)
6978 {
6979 // send expires
6980 for (const auto &p : expiremap) {
6981 if (mds->is_cluster_degraded() &&
6982 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6983 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6984 rejoin_sent.count(p.first) == 0))) {
6985 continue;
6986 }
6987 dout(7) << "sending cache_expire to " << p.first << dendl;
6988 mds->send_message_mds(p.second, p.first);
6989 }
6990 expiremap.clear();
6991 }
6992
6993
/**
 * Try to expire a single dentry (and, for a primary link, its inode).
 *
 * @param dn        dentry to trim
 * @param expiremap accumulates expire notifications for replica peers
 * @return true if the dentry could NOT be trimmed and remains in cache,
 *         false if it was removed
 */
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;
  
  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);
  
  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (dn->is_auth() && !(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();
    
    // iterate over both possible auth ranks (pair during migration)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.
      
      dout(12) << "  sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  if (clear_complete) {
    if (dn->last == CEPH_NOSNAP)
      dir->add_to_bloom(dn);
    dir->state_clear(CDir::STATE_COMPLETE);
  }

  // remove dentry
  dir->remove_dentry(dn);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
7076
7077
/**
 * Expire a dirfrag from the cache (it must already be unreferenced).
 * For replicas, queue an expire notification to the auth rank(s).
 *
 * @param dir       dirfrag to close; must have zero refs after any
 *                  subtree-root pin is removed here
 * @param con       containing subtree root used to address the expire
 * @param expiremap accumulates expire notifications for replica peers
 */
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    ceph_assert(!dir->is_auth() ||
		(!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();
    
    // was this an auth delegation?  (if so, slightly modified container)
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else {
      condf = con->dirfrag();
    }

    // iterate over both possible auth ranks (pair during migration)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue;  // on export, ignore myself.

      dout(12) << "  sending expire to mds." << a << " on   " << *dir << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }
  
  in->close_dirfrag(dir->dirfrag().frag);
}
7123
/**
 * Try trimming an inode from the cache
 *
 * @param dn        the inode's primary dentry (may be NULL for base inodes)
 * @param in        inode to trim; must currently have zero refs
 * @param con       containing subtree root used to address replica expires
 * @param expiremap accumulates expire notifications for replica peers
 * @return true if the inode is still in cache, else false if it was trimmed
 */
bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_inode " << *in << dendl;
  ceph_assert(in->get_num_ref() == 0);

  if (in->is_dir()) {
    // If replica inode's dirfragtreelock is not readable, it's likely
    // some dirfrags of the inode are being fragmented and we will receive
    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
    // This is because that unconnected replicas are problematic for
    // subtree migration.
    //
    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
      return true;
    }

    // DIR
    auto&& dfls = in->get_dirfrags();
    for (const auto& dir : dfls) {
      ceph_assert(!dir->is_subtree_root());
      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use *p
    }
  }
  
  // INODE
  if (in->is_auth()) {
    // eval stray after closing dirfrags
    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
      maybe_eval_stray(in);
      // eval may have started purging or re-pinned the dentry; keep in cache
      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
	return true;
    }
  } else {
    mds_authority_t auth = in->authority();
    
    dirfrag_t df;
    if (con)
      df = con->dirfrag();
    else
      df = dirfrag_t(0,frag_t());   // must be a root or stray inode.

    // iterate over both possible auth ranks (pair during migration)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (con && mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << "  sending expire to mds." << a << " on " << *in << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
    }
  }

  /*
  if (in->is_auth()) {
    if (in->hack_accessed)
      mds->logger->inc("outt");
    else {
      mds->logger->inc("outut");
      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
    }
  }
  */
      
  // unlink
  if (dn)
    dn->get_dir()->unlink_inode(dn, false);
  remove_inode(in);
  return false;
}
7205
7206
/**
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the 
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to 
 * attach our own subtrees to the root.  
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 *  we may not have accurate linkage for non-auth items.  which means we
 *  will not know which subtree it falls into, and can not be sure to
 *  declare it to the correct authority.
 */
void MDCache::trim_non_auth()
{
  dout(7) << "trim_non_auth" << dendl;
  
  // temporarily pin all subtree roots
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) 
    p->first->get(CDir::PIN_SUBTREETEMP);

  list<CDentry*> auth_list;
  
  // trim non-auth items from the lru
  for (;;) {
    CDentry *dn = NULL;
    if (bottom_lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn && lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
	break;

    CDentry::linkage_t *dnl = dn->get_linkage();

    if (dn->is_auth()) {
      // add back into lru (at the top)
      auth_list.push_back(dn);

      // drop remote links to non-auth inodes; the target may be trimmed
      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
	dn->unlink_remote(dnl);
    } else {
      // non-auth.  expire.
      CDir *dir = dn->get_dir();
      ceph_assert(dir);

      // unlink the dentry
      dout(10) << " removing " << *dn << dendl;
      if (dnl->is_remote()) {
	dir->unlink_inode(dn, false);
      } 
      else if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(10) << " removing " << *in << dendl;
	auto&& ls = in->get_dirfrags();
	for (const auto& subdir : ls) {
	  ceph_assert(!subdir->is_subtree_root());
	  in->close_dirfrag(subdir->dirfrag().frag);
	}
	dir->unlink_inode(dn, false);
	remove_inode(in);
      } 
      else {
	ceph_assert(dnl->is_null());
      }

      ceph_assert(!dir->has_bloom());
      dir->remove_dentry(dn);
      // adjust the dir state
      dir->state_clear(CDir::STATE_COMPLETE);  // dir incomplete!
      // close empty non-auth dirfrag
      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
	dir->inode->close_dirfrag(dir->get_frag());
    }
  }

  // re-insert the auth dentries we popped above
  for (const auto& dn : auth_list) {
      if (dn->state_test(CDentry::STATE_BOTTOMLRU))
	bottom_lru.lru_insert_mid(dn);
      else
	lru.lru_insert_top(dn);
  }

  // move everything in the pintail to the top bit of the lru.
  lru.lru_touch_entire_pintail();

  // unpin all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) 
    p->first->put(CDir::PIN_SUBTREETEMP);

  if (lru.lru_get_size() == 0 &&
      bottom_lru.lru_get_size() == 0) {
    // root, stray, etc.?
    auto p = inode_map.begin();
    while (p != inode_map.end()) {
      CInode *in = p->second;
      ++p;  // advance before remove_inode invalidates the iterator
      if (!in->is_auth()) {
	auto&& ls = in->get_dirfrags();
	for (const auto& dir : ls) {
	  dout(10) << " removing " << *dir << dendl;
	  ceph_assert(dir->get_num_ref() == 1);  // SUBTREE
	  remove_subtree(dir);
	  in->close_dirfrag(dir->dirfrag().frag);
	}
	dout(10) << " removing " << *in << dendl;
	ceph_assert(!in->get_parent_dn());
	ceph_assert(in->get_num_ref() == 0);
	remove_inode(in);
      }
    }
  }

  show_subtrees();
}
7329
7330 /**
7331 * Recursively trim the subtree rooted at directory to remove all
7332 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7333 * of those links. This is used to clear invalid data out of the cache.
7334 * Note that it doesn't clear the passed-in directory, since that's not
7335 * always safe.
7336 */
7337 bool MDCache::trim_non_auth_subtree(CDir *dir)
7338 {
7339 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7340
7341 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7342
7343 auto j = dir->begin();
7344 auto i = j;
7345 while (j != dir->end()) {
7346 i = j++;
7347 CDentry *dn = i->second;
7348 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7349 CDentry::linkage_t *dnl = dn->get_linkage();
7350 if (dnl->is_primary()) { // check for subdirectories, etc
7351 CInode *in = dnl->get_inode();
7352 bool keep_inode = false;
7353 if (in->is_dir()) {
7354 auto&& subdirs = in->get_dirfrags();
7355 for (const auto& subdir : subdirs) {
7356 if (subdir->is_subtree_root()) {
7357 keep_inode = true;
7358 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7359 } else {
7360 if (trim_non_auth_subtree(subdir))
7361 keep_inode = true;
7362 else {
7363 in->close_dirfrag(subdir->get_frag());
7364 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7365 }
7366 }
7367 }
7368
7369 }
7370 if (!keep_inode) { // remove it!
7371 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
7372 dir->unlink_inode(dn, false);
7373 remove_inode(in);
7374 ceph_assert(!dir->has_bloom());
7375 dir->remove_dentry(dn);
7376 } else {
7377 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7378 dn->clear_auth();
7379 in->state_clear(CInode::STATE_AUTH);
7380 }
7381 } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
7382 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7383 } else { // just remove it
7384 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7385 if (dnl->is_remote())
7386 dir->unlink_inode(dn, false);
7387 dir->remove_dentry(dn);
7388 }
7389 }
7390 dir->state_clear(CDir::STATE_AUTH);
7391 /**
7392 * We've now checked all our children and deleted those that need it.
7393 * Now return to caller, and tell them if *we're* a keeper.
7394 */
7395 return keep_dir || dir->get_num_any();
7396 }
7397
/*
 * during replay, when we determine a subtree is no longer ours, we
 * try to trim it from our cache.  because subtrees must be connected
 * to the root, the fact that we can trim this tree may mean that our
 * children or parents can also be trimmed.
 */
void MDCache::try_trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;

  // can we now trim child subtrees?
  set<CDir*> bounds;
  get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
    CDir *bd = *p;
    if (bd->get_dir_auth().first != mds->get_nodeid() &&  // we are not auth
	bd->get_num_any() == 0 && // and empty
	can_trim_non_auth_dirfrag(bd)) {
      CInode *bi = bd->get_inode();
      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
      remove_subtree(bd);
      bd->mark_clean();
      bi->close_dirfrag(bd->get_frag());
    }
  }

  if (trim_non_auth_subtree(dir)) {
    // keep
    try_subtree_merge(dir);
  } else {
    // can we trim this subtree (and possibly our ancestors) too?
    // walk up closing empty non-auth subtrees until we hit a base inode
    // or an auth parent subtree.
    while (true) {
      CInode *diri = dir->get_inode();
      if (diri->is_base()) {
	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
	  remove_subtree(dir);
	  dir->mark_clean();
	  diri->close_dirfrag(dir->get_frag());

	  dout(10) << " removing " << *diri << dendl;
	  ceph_assert(!diri->get_parent_dn());
	  ceph_assert(diri->get_num_ref() == 0);
	  remove_inode(diri);
	}
	break;
      }

      CDir *psub = get_subtree_root(diri->get_parent_dir());
      dout(10) << " parent subtree is " << *psub << dendl;
      if (psub->get_dir_auth().first == mds->get_nodeid())
	break;  // we are auth, keep.

      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
      remove_subtree(dir);
      dir->mark_clean();
      diri->close_dirfrag(dir->get_frag());

      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
      if (trim_non_auth_subtree(psub))
	break;
      dir = psub;
    }
  }

  show_subtrees();
}
7465
/**
 * While in standby-replay we never write back, so "expiring" a log
 * segment just means dropping the dirty state it tracks and pushing
 * now-unreferenced dentries to the bottom of the LRU so trim_lru can
 * reclaim them.
 */
void MDCache::standby_trim_segment(LogSegment *ls)
{
  // if the inode (via its primary dentry) is now fully unpinned and not
  // an open file, move it to the LRU bottom for reclamation
  auto try_trim_inode = [this](CInode *in) {
    if (in->get_num_ref() == 0 &&
	!in->item_open_file.is_on_list() &&
	in->parent != NULL &&
	in->parent->get_num_ref() == 0){
      touch_dentry_bottom(in->parent);
    }
  };

  // same idea for a bare dentry (skip if its inode is an open file)
  auto try_trim_dentry = [this](CDentry *dn) {
    if (dn->get_num_ref() > 0)
      return;
    auto in = dn->get_linkage()->inode;
    if(in && in->item_open_file.is_on_list())
      return;
    touch_dentry_bottom(dn);
  };
  
  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  // each mark_clean/remove_dirty pops the item off the segment's list,
  // so these loops terminate
  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
    if (dir->inode)
      try_trim_inode(dir->inode);
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
    try_trim_inode(in);
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
    try_trim_dentry(dn);
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->truncating_inodes.empty()) {
    auto it = ls->truncating_inodes.begin();
    CInode *in = *it;
    ls->truncating_inodes.erase(it);
    in->put(CInode::PIN_TRUNCATING);
    try_trim_inode(in);
  }
}
7533
/**
 * Handle an MCacheExpire from a peer: the sender has dropped its
 * replicas of the listed inodes/dirfrags/dentries, so remove it from
 * our replica maps — but only if the nonce matches (stale expires for
 * re-replicated objects are ignored). Expires for dirfrags we are
 * mid-export on are deferred via delayed_expire.
 */
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());
  
  dout(7) << "cache_expire from mds." << from << dendl;

  // too early in recovery to act on expires; sender will be handled by rejoin
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode);  // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
	export_state = migrator->get_export_state(parent_dir);
	ceph_assert(export_state >= 0);
      }

      if (!parent_dir->is_auth() ||
	  (export_state != -1 &&
	   ((export_state == Migrator::EXPORT_WARNING &&
	     migrator->export_has_warned(parent_dir,from)) ||
	    export_state == Migrator::EXPORT_EXPORTING ||
	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
	    (export_state == Migrator::EXPORT_NOTIFYING &&
	     !migrator->export_has_notified(parent_dir,from))))) {

	// not auth.
	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
	ceph_assert(parent_dir->is_frozen_tree_root());
	
	// make a message container

	auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
	if (em.second)
	  em.first->second = make_message<MCacheExpire>(from); /* new */
	
	// merge these expires into it
	em.first->second->add_realm(p.first, p.second);
	continue;
      }
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
             (export_state == Migrator::EXPORT_WARNING &&
              !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;
      
      if (!in) {
	dout(0) << " inode expire on " << q.first << " from " << from 
		<< ", don't have it" << dendl;
	ceph_assert(in);
      }        
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;
      
      // check nonce
      if (nonce == in->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " inode expire on " << *in << " from mds." << from 
		<< " cached_by was " << in->get_replicas() << dendl;
	inode_remove_replica(in, from, false, gather_locks);
      } 
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " with old nonce " << nonce
		<< " (current " << in->get_replica_nonce(from) << "), dropping" 
		<< dendl;
      }
    }
    
    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;
      
      if (!dir) {
	CInode *diri = get_inode(q.first.ino);
	if (diri) {
	  if (mds->is_rejoin() &&
	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
	      !diri->is_replica(from)) {
	    // fragmentation during rejoin: the exact frag may not exist;
	    // drop the sender from any nested dirfrags it still replicates
	    auto&& ls = diri->get_nested_dirfrags();
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " while rejoining, inode isn't replicated" << dendl;
	    for (const auto& d : ls) {
	      dir = d;
	      if (dir->is_replica(from)) {
		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
		dir->remove_replica(from);
	      }
	    }
	    continue;
	  }
	  CDir *other = diri->get_approx_dirfrag(q.first.frag);
	  if (other) {
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " have " << *other << ", mismatched frags, dropping" << dendl;
	    continue;
	  }
	}
	dout(0) << " dir expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " replicas was " << dir->get_replicas() << dendl;
	dir->remove_replica(from);
      } 
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " dir expire on " << *dir << " from mds." << from 
		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
		<< "), dropping" << dendl;
      }
    }
    
    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);
      
      if (!dir) {
	dout(0) << " dn expires on " << pd.first << " from " << from
		<< ", must have refragmented" << dendl;
      } else {
	ceph_assert(dir->is_auth());
      }
      
      for (const auto &p : pd.second) {
	unsigned nonce = p.second;
	CDentry *dn;
	
	if (dir) {
	  dn = dir->lookup(p.first.first, p.first.second);
	} else {
	  // which dirfrag for this dentry?
	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
	  ceph_assert(dir); 
	  ceph_assert(dir->is_auth());
	  dn = dir->lookup(p.first.first, p.first.second);
	}

	if (!dn) { 
	  if (dir)
	    dout(0) << "  missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
	  else
	    dout(0) << "  missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
	}
	ceph_assert(dn);
	
	if (nonce == dn->get_replica_nonce(from)) {
	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from << dendl;
	  dentry_remove_replica(dn, from, gather_locks);
	} 
	else {
	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from
		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
		  << "), dropping" << dendl;
	}
      }
    }
  }

  // re-evaluate locks whose gather set may now be satisfied
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7728
7729 void MDCache::process_delayed_expire(CDir *dir)
7730 {
7731 dout(7) << "process_delayed_expire on " << *dir << dendl;
7732 for (const auto &p : delayed_expire[dir]) {
7733 handle_cache_expire(p.second);
7734 }
7735 delayed_expire.erase(dir);
7736 }
7737
// Drop any cache-expire messages queued against this dirfrag without
// processing them (contrast with process_delayed_expire(), which replays
// them through handle_cache_expire()).
void MDCache::discard_delayed_expire(CDir *dir)
{
  dout(7) << "discard_delayed_expire on " << *dir << dendl;
  delayed_expire.erase(dir);
}
7743
// Forget that mds.from holds a replica of 'in'.  Every per-inode lock is
// told the replica is gone; any lock whose remove_replica() reports a
// change is added to 'gather_locks' so the caller can re-evaluate it
// (e.g. handle_cache_expire() runs eval_gather on unstable ones).
void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
				   set<SimpleLock *>& gather_locks)
{
  in->remove_replica(from);
  in->set_mds_caps_wanted(from, 0);  // a gone replica wants no caps from us

  // note: this code calls _eval more often than it needs to!
  // fix lock
  if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
  if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
  if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
  if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
  if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
  if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);

  // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
  // Don't remove the recovering mds from lock's gathering list because
  // it may hold rejoined wrlocks.
  if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
  if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
  if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
}
7766
7767 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7768 {
7769 dn->remove_replica(from);
7770
7771 // fix lock
7772 if (dn->lock.remove_replica(from))
7773 gather_locks.insert(&dn->lock);
7774
7775 // Replicated strays might now be elegible for purge
7776 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7777 if (dnl->is_primary()) {
7778 maybe_eval_stray(dnl->get_inode());
7779 }
7780 }
7781
// Expire client dentry leases whose ttl has passed, one lease pool at a
// time, logging how many each pool dropped.
void MDCache::trim_client_leases()
{
  utime_t now = ceph_clock_now();

  dout(10) << "trim_client_leases" << dendl;

  std::size_t pool = 0;
  for (const auto& list : client_leases) {
    pool += 1;
    if (list.empty())
      continue;

    auto before = list.size();
    while (!list.empty()) {
      ClientLease *r = list.front();
      // Stop at the first non-expired lease; assumes the list is ordered
      // oldest-ttl first — TODO confirm insertion order guarantees this.
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      // NOTE(review): loop progress relies on remove_client_lease()
      // unlinking 'r' from this list — verify in CDentry.
      dn->remove_client_lease(r, mds->locker);
    }
    auto after = list.size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
  }
}
7807
// Sample process memory usage, sanity-check the inode/cap accounting,
// log a summary line, and push rss/heap values into the perf counters.
void MDCache::check_memory_usage()
{
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  // 'baseline' is a function-local static initialized from the FIRST
  // sample taken above (static init runs exactly once, after the first
  // mm.sample call) — later calls compare against it.
  static MemoryModel::snap baseline = last;

  // check client caps
  // cache-wide invariant: every live CInode is in one of the maps or is a
  // counted shadow inode.
  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();

  dout(2) << "Memory usage: "
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
	  << dendl;

  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());
}
7834
7835
7836
7837 // =========================================================================================
7838 // shutdown
7839
// Timer callback that re-invokes MDCache::shutdown_check(); the check
// itself re-arms the timer, so this keeps firing periodically while the
// rank is shutting down.
class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};
7847
// Debugging aid while a rank is shutting down: dump the cache at high
// verbosity, report LRU/log/objecter state, and re-arm itself.
void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  // cache
  // Temporarily raise debug_mds to 10 so show_cache() logs everything,
  // then restore the previous level.
  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf().get_val("debug_mds", &o, sizeof(old_val));
  g_conf().set_val("debug_mds", "10");
  g_conf().apply_changes(nullptr);
  show_cache();
  g_conf().set_val("debug_mds", old_val);
  g_conf().apply_changes(nullptr);
  // re-arm: keep checking until shutdown completes
  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  // this
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;


  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}
7873
7874
// Begin cache shutdown.  Only arms the optional periodic shutdown_check
// dump; the actual teardown is driven by repeated shutdown_pass() calls.
void MDCache::shutdown_start()
{
  dout(5) << "shutdown_start" << dendl;

  if (g_conf()->mds_shutdown_check)
    mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  //  g_conf()->debug_mds = 10;
}
7884
7885
7886
// One pass of the shutdown state machine.  Each call pushes shutdown
// forward as far as it can (export strays and subtrees, close sessions,
// trim/cap the log, flush the journal header, drain the cache, drop the
// ~mdsN subtree) and returns:
//   true  - the cache is fully shut down
//   false - more work is pending; call again later
bool MDCache::shutdown_pass()
{
  dout(7) << "shutdown_pass" << dendl;

  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
    show_cache();
    show_subtrees();
    return true;
  }

  // empty stray dir
  bool strays_all_exported = shutdown_export_strays();

  // trim cache
  trim(UINT64_MAX);
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;

  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() && mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export" << dendl;
    std::vector<CDir*> ls;
    for (auto& [dir, bounds] : subtrees) {
      dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
      if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
	continue;
      num_auth_subtree++;
      // skip subtrees that cannot be exported right now
      if (dir->is_frozen() ||
	  dir->is_freezing() ||
	  dir->is_ambiguous_dir_auth() ||
	  dir->state_test(CDir::STATE_EXPORTING) ||
	  dir->get_inode()->is_ephemerally_pinned()) {
	continue;
      }
      ls.push_back(dir);
    }

    migrator->clear_export_queue();
    // stopping mds does not call MDBalancer::tick()
    mds->balancer->handle_export_pins();
    for (const auto& dir : ls) {
      mds_rank_t dest = dir->get_inode()->authority().first;
      // fall back to rank 0 if the preferred destination is not active
      if (dest > 0 && !mds->mdsmap->is_active(dest))
	dest = 0;
      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
      migrator->export_dir_nicely(dir, dest);
    }
  }

  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
    return false;
  }

  if (num_auth_subtree > 0) {
    ceph_assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
    show_subtrees();
    return false;
  }

  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
    return false;
  }

  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  if (mds->mdlog->get_num_segments() > 0) {
    auto ls = mds->mdlog->get_current_segment();
    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
      // Current segment contains events other than subtreemap or
      // there are dirty dirfrags (see CDir::log_mark_dirty())
      mds->mdlog->start_new_segment();
      mds->mdlog->flush();
    }
  }
  mds->mdlog->trim_all();
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
    return false;
  }

  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i] &&
	strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
    }
  }

  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
    mydir = NULL;

  // subtrees map not empty yet?
  // (our own ~mdsN subtree, if still present, is the only one allowed)
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    show_subtrees();
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
      show_cache();
    return false;
  }
  ceph_assert(!migrator->is_exporting());
  ceph_assert(!migrator->is_importing());

  // replicas may dirty scatter locks
  if (myin && myin->is_replicated()) {
    dout(7) << "still have replicated objects" << dendl;
    return false;
  }

  if ((myin && myin->get_num_auth_pins()) ||
      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
    dout(7) << "still have auth pinned objects" << dendl;
    return false;
  }

  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the mdlog" << dendl;
    mds->mdlog->cap();
  }

  if (!mds->mdlog->empty())
    mds->mdlog->trim(0);

  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
    return false;
  }

  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    ceph_assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
    return false;
  }

  // filer active?
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
    return false;
  }

  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
    show_cache();
    //dump();
    return false;
  }

  // make mydir subtree go away
  if (mydir) {
    if (mydir->get_num_ref() > 1) { // subtree pin
      dout(7) << "there's still reference to mydir " << *mydir << dendl;
      show_cache();
      return false;
    }

    remove_subtree(mydir);
    myin->close_dirfrag(mydir->get_frag());
  }
  ceph_assert(subtrees.empty());

  if (myin) {
    remove_inode(myin);
    ceph_assert(!myin);
  }

  if (global_snaprealm) {
    remove_inode(global_snaprealm->inode);
    global_snaprealm = nullptr;
  }

  // done!
  dout(5) << "shutdown done." << dendl;
  return true;
}
8080
// During shutdown, migrate this rank's stray dentries to rank 0.
// 'shutdown_export_next' is a resumable cursor (dirfrag + last dentry
// name) so the scan can be continued across calls; in-flight exports are
// tracked in 'shutdown_exporting_strays' and throttled around
// MAX_EXPORTING.  Returns true once every stray has been exported.
bool MDCache::shutdown_export_strays()
{
  static const unsigned MAX_EXPORTING = 100;

  // rank 0 is the destination; it has nothing to export
  if (mds->get_nodeid() == 0)
    return true;

  // throttle: don't start more work while ~2/3 of the budget is in flight
  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
    return false;

  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
	   << " '" << shutdown_export_next.second << "'" << dendl;

  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
  bool all_exported = false;

again:
  auto next = shutdown_export_next;

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *strayi = strays[i];
    if (!strayi ||
	!strayi->state_test(CInode::STATE_STRAYPINNED))
      continue;
    // skip stray dirs that come before the cursor
    if (strayi->ino() < next.first.ino)
      continue;

    deque<CDir*> dfls;
    strayi->get_dirfrags(dfls);

    while (!dfls.empty()) {
      CDir *dir = dfls.front();
      dfls.pop_front();

      if (dir->dirfrag() < next.first)
	continue;
      if (next.first < dir->dirfrag()) {
	// advancing to a new dirfrag resets the dentry-name cursor
	next.first = dir->dirfrag();
	next.second.clear();
      }

      if (!dir->is_complete()) {
	// need the full dirfrag contents; fetch and resume later.  Only
	// attach a retry callback when nothing is in flight, otherwise the
	// completing exports will drive the next call.
	MDSContext *fin = nullptr;
	if (shutdown_exporting_strays.empty()) {
	  fin = new MDSInternalContextWrapper(mds,
		  new LambdaContext([this](int r) {
		    shutdown_export_strays();
		  })
		);
	}
	dir->fetch(fin);
	goto done;
      }

      // position the iterator at the cursor's dentry name (if any)
      CDir::dentry_key_map::iterator it;
      if (next.second.empty()) {
	it = dir->begin();
      } else {
	auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
	it = dir->lower_bound(dentry_key_t(0, next.second, hash));
      }

      for (; it != dir->end(); ++it) {
	CDentry *dn = it->second;
	CDentry::linkage_t *dnl = dn->get_projected_linkage();
	if (dnl->is_null())
	  continue;

	// can't migrate to an inactive rank 0; remember where to resume
	if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
	  next.second = it->first.name;
	  goto done;
	}

	auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
	if (!ret.second) {
	  dout(10) << "already exporting/purging " << *dn << dendl;
	  continue;
	}

	// Don't try to migrate anything that is actually
	// being purged right now
	if (!dn->state_test(CDentry::STATE_PURGING))
	  stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!

	// budget exhausted: record the resume point and stop
	if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
	  ++it;
	  if (it != dir->end()) {
	    next.second = it->first.name;
	  } else {
	    if (dfls.empty())
	      next.first.ino.val++;
	    else
	      next.first = dfls.front()->dirfrag();
	    next.second.clear();
	  }
	  goto done;
	}
      }
    }
  }

  if (shutdown_exporting_strays.empty()) {
    // nothing in flight; if the cursor isn't at the very beginning, rescan
    // from the first stray dirfrag to catch anything we skipped, otherwise
    // we're finished.
    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
    if (first_df < shutdown_export_next.first ||
	!shutdown_export_next.second.empty()) {
      shutdown_export_next.first = first_df;
      shutdown_export_next.second.clear();
      goto again;
    }
    all_exported = true;
  }

done:
  shutdown_export_next = next;
  return all_exported;
}
8197
8198 // ========= messaging ==============
8199
// Entry point for all inter-MDS cache messages: routes each message type
// to its handler.  Unknown types abort — the dispatcher above us should
// only hand us MDS cache traffic.
void MDCache::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {

    // RESOLVE
  case MSG_MDS_RESOLVE:
    handle_resolve(ref_cast<MMDSResolve>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
    break;

    // REJOIN
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
    break;

  case MSG_MDS_DISCOVER:
    handle_discover(ref_cast<MDiscover>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(ref_cast<MDiscoverReply>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(ref_cast<MDirUpdate>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(ref_cast<MCacheExpire>(m));
    break;

  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(ref_cast<MDentryLink>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
    break;
  case MSG_MDS_DENTRYUNLINK_ACK:
    handle_dentry_unlink_ack(ref_cast<MDentryUnlinkAck>(m));
    break;


  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
    break;
  case MSG_MDS_FRAGMENTNOTIFYACK:
    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
    break;

  case MSG_MDS_FINDINO:
    handle_find_ino(ref_cast<MMDSFindIno>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(ref_cast<MMDSOpenIno>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
    break;

  case MSG_MDS_SNAPUPDATE:
    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    ceph_abort_msg("cache unknown message");
  }
}
8273
/**
 * path_traverse -- walk 'path' component by component from its base ino.
 *
 * Behaviour is controlled by the MDS_TRAVERSE_* bits in 'flags': whether
 * to discover from or forward to the auth mds when we hit a non-auth
 * dirfrag, which locks to take along the way (snap/path/dentry), and
 * whether the caller wants the tail dentry and/or inode.
 *
 * @param mdr    current request (may be null for internal traversals;
 *               required when forwarding)
 * @param cf     factory for wait/retry contexts queued on stalls
 * @param path   path to walk (base ino + components)
 * @param flags  MDS_TRAVERSE_* bits
 * @param pdnvec if non-null, filled with the dentry trace
 * @param pin    if non-null, set to the deepest inode reached
 *
 * @return  0  success (trace/target populated)
 *          1  caller must wait; a context built from 'cf' was queued
 *          2  request was forwarded to the auth mds
 *         <0  a -CEPHFS_* error code
 */
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
			   const filepath& path, int flags,
			   vector<CDentry*> *pdnvec, CInode **pin)
{
  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
  bool forward = !discover;  // no DISCOVER => forward to auth instead
  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
  bool want_inode = (flags & MDS_TRAVERSE_WANT_INODE);
  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);

  if (forward)
    ceph_assert(mdr);  // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = mdr ? mdr->get_client() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (!cur) {
    // base inode not in cache: special-case mdsdir and stray inos,
    // otherwise the caller's path is stale
    if (MDS_INO_IS_MDSDIR(path.get_ino())) {
      open_foreign_mdsdir(path.get_ino(), cf.build());
      return 1;
    }
    if (MDS_INO_IS_STRAY(path.get_ino())) {
      // re-resolve the stray dir via its owner's ~mdsdir
      mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
      unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
      filepath path(strays[idx]->get_parent_dn()->get_name(),
		    MDS_INO_MDSDIR(rank));
      MDRequestRef null_ref;
      return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
    }
    return -CEPHFS_ESTALE;
  }
  if (cur->state_test(CInode::STATE_PURGING))
    return -CEPHFS_ESTALE;

  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
    mds->locker->find_and_attach_lock_cache(mdr, cur);

  if (mdr && mdr->lock_cache) {
    // a lock cache already covers the path; just grab the layout if asked
    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
  } else if (rdlock_snap) {
    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
	(n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
	return 1;
    }
  }

  // start trace
  if (pdnvec)
    pdnvec->clear();
  if (pin)
    *pin = cur;

  CInode *target_inode = nullptr;
  MutationImpl::LockOpVec lov;
  int r;

  for (unsigned depth = 0; depth < path.depth(); ) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
	     << "' snapid " << snapid << dendl;

    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -CEPHFS_ENOTDIR;
    }

    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr || depth > 0)  // snapdir must be the first component
	return -CEPHFS_EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
      depth++;
      continue;
    }
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      if (!mdr)
	return -CEPHFS_EINVAL;
      // resolve the snapshot name to a concrete snapid
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      if (!snapid) {
	if (pdnvec)
	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	return -CEPHFS_ENOENT;
      }
      if (depth == path.depth() - 1)
	target_inode = cur;
      mdr->snapid = snapid;
      depth++;
      continue;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
	// parent dir frozen_dir?
	if (cur->is_frozen()) {
	  dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
	  cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
	  return 1;
	}
	curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
	// discover?
	dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
	discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
	return 1;
      }
    }
    ceph_assert(curdir);

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    // frozen?
    /*
    if (curdir->is_frozen()) {
      // doh!
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      if (onfinish) delete onfinish;
      return 1;
    }
    */

    // Defer the auth check until the target inode is determined not to exist
    // if want_inode is true.
    if (want_auth && want_dentry && !want_inode && depth == path.depth() - 1 &&
	(r = maybe_request_forward_to_auth(mdr, cf, curdir)) != 0)
      return r;

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
	      << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
      return -CEPHFS_EIO;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    if (dn) {
      if (dn->state_test(CDentry::STATE_PURGING))
	return -CEPHFS_ENOENT;

      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      // If an auth check was deferred before and the target inode is found
      // not to exist now, do the auth check here if necessary.
      if (want_auth && want_dentry && want_inode && depth == path.depth() - 1 &&
	  dnl->is_null() && (r = maybe_request_forward_to_auth(mdr, cf, dn)) != 0)
	return r;

      if (rdlock_path) {
	lov.clear();
	// do not xlock the tail dentry if target inode exists and caller wants it
	if (xlock_dentry && (dnl->is_null() || !want_inode) &&
	    depth == path.depth() - 1) {
	  ceph_assert(dn->is_auth());
	  if (depth > 0 || !mdr->lock_cache) {
	    lov.add_wrlock(&cur->filelock);
	    lov.add_wrlock(&cur->nestlock);
	    if (rdlock_authlock)
	      lov.add_rdlock(&cur->authlock);
	  }
	  lov.add_xlock(&dn->lock);
	} else {
	  // force client to flush async dir operation if necessary
	  if (cur->filelock.is_cached())
	    lov.add_wrlock(&cur->filelock);
	  lov.add_rdlock(&dn->lock);
	}
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
	  return 1;
	}
      } else if (!path_locked &&
		 !dn->lock.can_read(client) &&
		 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
	dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
	dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
	if (mds->logger)
	  mds->logger->inc(l_mds_traverse_lock);
	// nudge the log so the lock can stabilize sooner
	if (dn->is_auth() && dn->lock.is_unstable_and_locked())
	  mds->mdlog->flush();
	return 1;
      }

      if (pdnvec)
	pdnvec->push_back(dn);

      // can we conclude CEPHFS_ENOENT?
      if (dnl->is_null()) {
	dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
	if (depth == path.depth() - 1) {
	  if (want_dentry)
	    break;
	} else {
	  if (pdnvec)
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	}
	return -CEPHFS_ENOENT;
      }

      // do we have inode?
      CInode *in = dnl->get_inode();
      if (!in) {
	ceph_assert(dnl->is_remote());
	// do i have it?
	in = get_inode(dnl->get_remote_ino());
	if (in) {
	  dout(7) << "linking in remote in " << *in << dendl;
	  dn->link_remote(dnl, in);
	} else {
	  dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
	  ceph_assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
	  if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
	    dout(4) << "traverse: remote dentry points to damaged ino "
		    << *dn << dendl;
	    return -CEPHFS_EIO;
	  }
	  open_remote_dentry(dn, true, cf.build(),
			     (path_locked && depth == path.depth() - 1));
	  if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
	  return 1;
	}
      }

      cur = in;

      // take the snaplock on each inode along the way (except a tail
      // dentry the caller only wants the dentry of)
      if (rdlock_snap && !(want_dentry && !want_inode && depth == path.depth() - 1)) {
	lov.clear();
	lov.add_rdlock(&cur->snaplock);
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
	  return 1;
	}
      }

      if (depth == path.depth() - 1)
	target_inode = cur;

      // add to trace, continue.
      touch_inode(cur);
      if (pin)
	*pin = cur;
      depth++;
      continue;
    }

    ceph_assert(!dn);

    // MISS. dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
	  (snapid == CEPH_NOSNAP &&
	   curdir->has_bloom() &&
	   !curdir->is_in_bloom(path[depth]))) {
	// file not found
	if (pdnvec) {
	  // instantiate a null dn?
	  if (depth < path.depth() - 1) {
	    dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
	  } else if (snapid < CEPH_MAXSNAP) {
	    dout(20) << " not adding null for snapid " << snapid << dendl;
	  } else if (curdir->is_frozen()) {
	    dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
	    curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
	    return 1;
	  } else {
	    // create a null dentry
	    dn = curdir->add_null_dentry(path[depth]);
	    dout(20) << " added null " << *dn << dendl;

	    if (rdlock_path) {
	      lov.clear();
	      if (xlock_dentry) {
		if (depth > 0 || !mdr->lock_cache) {
		  lov.add_wrlock(&cur->filelock);
		  lov.add_wrlock(&cur->nestlock);
		  if (rdlock_authlock)
		    lov.add_rdlock(&cur->authlock);
		}
		lov.add_xlock(&dn->lock);
	      } else {
		// force client to flush async dir operation if necessary
		if (cur->filelock.is_cached())
		  lov.add_wrlock(&cur->filelock);
		lov.add_rdlock(&dn->lock);
	      }
	      if (!mds->locker->acquire_locks(mdr, lov)) {
		dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
		return 1;
	      }
	    }
	  }
	  if (dn) {
	    pdnvec->push_back(dn);
	    if (want_dentry)
	      break;
	  } else {
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	  }
	}
	return -CEPHFS_ENOENT;
      } else {

	// Check DamageTable for missing fragments before trying to fetch
	// this
	if (mds->damage_table.is_dirfrag_damaged(curdir)) {
	  dout(4) << "traverse: damaged dirfrag " << *curdir
		  << ", blocking fetch" << dendl;
	  return -CEPHFS_EIO;
	}

	// directory isn't complete; reload
	dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
	touch_inode(cur);
	curdir->fetch(path[depth], snapid, cf.build());
	if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
	return 1;
      }
    } else {
      // dirfrag/dentry is not mine.

      // if the client already got forwarded past this depth, prefer
      // discovering over bouncing the request around again
      if (forward &&
	  mdr && mdr->client_request &&
	  (int)depth < mdr->client_request->get_num_fwd()){
	dout(7) << "traverse: snap " << snapid << " and depth " << depth
		<< " < fwd " << mdr->client_request->get_num_fwd()
		<< ", discovering instead of forwarding" << dendl;
	discover = true;
      }

      if ((discover)) {
	dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
	discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
	return 1;
      }
      if (forward) {
	// forward
	dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

	r = maybe_request_forward_to_auth(mdr, cf, curdir);
	ceph_assert(r != 0);

	if (r == 2 && mds->logger)
	  mds->logger->inc(l_mds_traverse_forward);

	return r;
      }
    }

    ceph_abort();  // i shouldn't get here
  }

  if (path.depth() == 0) {
    dout(7) << "no tail dentry, base " << *cur << dendl;
    if (want_dentry && !want_inode) {
      return -CEPHFS_ENOENT;
    }
    target_inode = cur;
  }

  if (target_inode) {
    dout(7) << "found target " << *target_inode << dendl;
    if (want_auth && !(want_dentry && !want_inode) &&
	(r = maybe_request_forward_to_auth(mdr, cf, target_inode)) != 0)
      return r;
  }

  // success.
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    ceph_assert(mdr->snapid == snapid);

  // record which locks this traversal ended up holding
  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;

  if (rdlock_path)
    mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return 0;
}
8690
8691 int MDCache::maybe_request_forward_to_auth(MDRequestRef& mdr, MDSContextFactory& cf,
8692 MDSCacheObject *p)
8693 {
8694 if (p->is_ambiguous_auth()) {
8695 dout(7) << "waiting for single auth on " << *p << dendl;
8696 p->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8697 return 1;
8698 }
8699 if (!p->is_auth()) {
8700 dout(7) << "fw to auth for " << *p << dendl;
8701 request_forward(mdr, p->authority().first);
8702 return 2;
8703 }
8704 return 0;
8705 }
8706
8707 CInode *MDCache::cache_traverse(const filepath& fp)
8708 {
8709 dout(10) << "cache_traverse " << fp << dendl;
8710
8711 CInode *in;
8712 unsigned depth = 0;
8713 char mdsdir_name[16];
8714 sprintf(mdsdir_name, "~mds%d", mds->get_nodeid());
8715
8716 if (fp.get_ino()) {
8717 in = get_inode(fp.get_ino());
8718 } else if (fp.depth() > 0 && (fp[0] == "~mdsdir" || fp[0] == mdsdir_name)) {
8719 in = myin;
8720 depth = 1;
8721 } else {
8722 in = root;
8723 }
8724 if (!in)
8725 return NULL;
8726
8727 for (; depth < fp.depth(); depth++) {
8728 std::string_view dname = fp[depth];
8729 frag_t fg = in->pick_dirfrag(dname);
8730 dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
8731 CDir *curdir = in->get_dirfrag(fg);
8732 if (!curdir)
8733 return NULL;
8734 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8735 if (!dn)
8736 return NULL;
8737 in = dn->get_linkage()->get_inode();
8738 if (!in)
8739 return NULL;
8740 }
8741 dout(10) << " got " << *in << dendl;
8742 return in;
8743 }
8744
8745
8746 /**
8747 * open_remote_dir -- open up a remote dirfrag
8748 *
8749 * @param diri base inode
8750 * @param approxfg approximate fragment.
8751 * @param fin completion callback
8752 */
8753 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
8754 {
8755 dout(10) << "open_remote_dir on " << *diri << dendl;
8756 ceph_assert(diri->is_dir());
8757 ceph_assert(!diri->is_auth());
8758 ceph_assert(diri->get_dirfrag(approxfg) == 0);
8759
8760 discover_dir_frag(diri, approxfg, fin);
8761 }
8762
8763
8764 /**
8765 * get_dentry_inode - get or open inode
8766 *
8767 * @param dn the dentry
8768 * @param mdr current request
8769 *
8770 * will return inode for primary, or link up/open up remote link's inode as necessary.
8771 * If it's not available right now, puts mdr on wait list and returns null.
8772 */
8773 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8774 {
8775 CDentry::linkage_t *dnl;
8776 if (projected)
8777 dnl = dn->get_projected_linkage();
8778 else
8779 dnl = dn->get_linkage();
8780
8781 ceph_assert(!dnl->is_null());
8782
8783 if (dnl->is_primary())
8784 return dnl->inode;
8785
8786 ceph_assert(dnl->is_remote());
8787 CInode *in = get_inode(dnl->get_remote_ino());
8788 if (in) {
8789 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8790 dn->link_remote(dnl, in);
8791 return in;
8792 } else {
8793 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8794 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8795 return 0;
8796 }
8797 }
8798
// Completion for open_remote_dentry(): holds a PTRWAITER pin on the
// dentry for the lifetime of the context so it cannot be trimmed while
// the open_ino is in flight.
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;          // dentry whose remote inode is being opened
  inodeno_t ino;        // remote ino we asked open_ino() for
  MDSContext *onfinish; // caller's completion, forwarded to the finish path
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    dn->get(MDSCacheObject::PIN_PTRWAITER);  // pin for the duration
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);  // drop pin after the callback
  }
};
8813
8814 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
8815 {
8816 dout(10) << "open_remote_dentry " << *dn << dendl;
8817 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8818 inodeno_t ino = dnl->get_remote_ino();
8819 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1;
8820 open_ino(ino, pool,
8821 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8822 }
8823
// Completion for open_remote_dentry(): r is the open_ino() result.
// On failure, if the dentry still points at the same remote ino, the
// link is bad: mark it, record the damage, and possibly respawn.
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
                                         bool want_xlocked, int r)
{
  if (r < 0) {
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
      dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
      dn->state_set(CDentry::STATE_BADREMOTEINO);

      // build a human-readable path for the damage table entry
      std::string path;
      CDir *dir = dn->get_dir();
      if (dir) {
        dir->get_inode()->make_path_string(path);
        path += "/";
        path += dn->get_name();
      }

      bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
      if (fatal) {
        mds->damaged();
        ceph_abort();  // unreachable, damaged() respawns us
      }
    } else {
      // dentry was relinked while the open was in flight; the failure no
      // longer applies to this dentry, so report success
      r = 0;
    }
  }
  fin->complete(r < 0 ? r : 0);
}
8852
8853
8854 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8855 {
8856 // empty trace if we're a base inode
8857 if (in->is_base())
8858 return;
8859
8860 CInode *parent = in->get_parent_inode();
8861 ceph_assert(parent);
8862 make_trace(trace, parent);
8863
8864 CDentry *dn = in->get_parent_dn();
8865 dout(15) << "make_trace adding " << *dn << dendl;
8866 trace.push_back(dn);
8867 }
8868
8869
8870 // -------------------------------------------------------------------------------
8871 // Open inode by inode number
8872
8873 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8874 inodeno_t ino;
8875 public:
8876 bufferlist bl;
8877 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8878 MDCacheIOContext(c), ino(i) {}
8879 void finish(int r) override {
8880 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8881 }
8882 void print(ostream& out) const override {
8883 out << "openino_backtrace_fetch" << ino << ")";
8884 }
8885 };
8886
// Retry context for the open_ino traversal: re-enters either
// handle_open_ino() (peer-driven lookup, when `msg` is set) or
// _open_ino_traverse_dir() (local lookup) after a dirfrag fetch,
// discover, or unfreeze completes.
struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
  inodeno_t ino;
  cref_t<MMDSOpenIno> msg;  // non-null when servicing a peer's MMDSOpenIno
  bool parent;              // true if this waits on the immediate parent (ancestor 0)
  public:
  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
    MDCacheContext(c), ino(i), msg(m), parent(p) {}
  void finish(int r) override {
    // a failure on a non-parent ancestor is retryable, not fatal
    if (r < 0 && !parent)
      r = -CEPHFS_EAGAIN;
    if (msg) {
      mdcache->handle_open_ino(msg, r);
      return;
    }
    auto& info = mdcache->opening_inodes.at(ino);
    mdcache->_open_ino_traverse_dir(ino, info, r);
  }
};
8905
// Completion for the recursive open_ino() on an inode's parent directory;
// forwards the result (auth rank or error) to _open_ino_parent_opened().
struct C_MDC_OpenInoParentOpened : public MDCacheContext {
  inodeno_t ino;
  public:
  C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_parent_opened(ino, r);
  }
};
8914
8915 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8916 {
8917 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8918
8919 open_ino_info_t& info = opening_inodes.at(ino);
8920
8921 CInode *in = get_inode(ino);
8922 if (in) {
8923 dout(10) << " found cached " << *in << dendl;
8924 open_ino_finish(ino, info, in->authority().first);
8925 return;
8926 }
8927
8928 inode_backtrace_t backtrace;
8929 if (err == 0) {
8930 try {
8931 decode(backtrace, bl);
8932 } catch (const buffer::error &decode_exc) {
8933 derr << "corrupt backtrace on ino x0" << std::hex << ino
8934 << std::dec << ": " << decode_exc.what() << dendl;
8935 open_ino_finish(ino, info, -CEPHFS_EIO);
8936 return;
8937 }
8938 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8939 dout(10) << " old object in pool " << info.pool
8940 << ", retrying pool " << backtrace.pool << dendl;
8941 info.pool = backtrace.pool;
8942 C_IO_MDC_OpenInoBacktraceFetched *fin =
8943 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8944 fetch_backtrace(ino, info.pool, fin->bl,
8945 new C_OnFinisher(fin, mds->finisher));
8946 return;
8947 }
8948 } else if (err == -CEPHFS_ENOENT) {
8949 int64_t meta_pool = mds->get_metadata_pool();
8950 if (info.pool != meta_pool) {
8951 dout(10) << " no object in pool " << info.pool
8952 << ", retrying pool " << meta_pool << dendl;
8953 info.pool = meta_pool;
8954 C_IO_MDC_OpenInoBacktraceFetched *fin =
8955 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8956 fetch_backtrace(ino, info.pool, fin->bl,
8957 new C_OnFinisher(fin, mds->finisher));
8958 return;
8959 }
8960 err = 0; // backtrace.ancestors.empty() is checked below
8961 }
8962
8963 if (err == 0) {
8964 if (backtrace.ancestors.empty()) {
8965 dout(10) << " got empty backtrace " << dendl;
8966 err = -CEPHFS_ESTALE;
8967 } else if (!info.ancestors.empty()) {
8968 if (info.ancestors[0] == backtrace.ancestors[0]) {
8969 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8970 err = -CEPHFS_EINVAL;
8971 } else {
8972 info.last_err = 0;
8973 }
8974 }
8975 }
8976 if (err) {
8977 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8978 if (info.last_err)
8979 err = info.last_err;
8980 open_ino_finish(ino, info, err);
8981 return;
8982 }
8983
8984 dout(10) << " got backtrace " << backtrace << dendl;
8985 info.ancestors = backtrace.ancestors;
8986
8987 _open_ino_traverse_dir(ino, info, 0);
8988 }
8989
// Completion for the recursive open_ino() on an ancestor directory.
// `ret` is either the rank that has the parent (>= 0) or an error.
void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
{
  dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;

  open_ino_info_t& info = opening_inodes.at(ino);

  // target may have arrived in cache while we opened the parent
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret == mds->get_nodeid()) {
    // we hold the parent locally: continue the traversal here
    _open_ino_traverse_dir(ino, info, 0);
  } else {
    if (ret >= 0) {
      // parent is on another rank; ask that rank next
      mds_rank_t checked_rank = mds_rank_t(ret);
      info.check_peers = true;
      info.auth_hint = checked_rank;
      info.checked.erase(checked_rank);  // allow re-querying the hinted rank
    }
    do_open_ino(ino, info, ret);
  }
}
9015
// Drive one round of the local ancestor traversal for an open_ino.
// ret != 0 indicates a failure from a prior async step and is passed
// through to do_open_ino() for retry/peer logic.
void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;

  // the inode may have appeared in cache since the last step
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret) {
    do_open_ino(ino, info, ret);
    return;
  }

  mds_rank_t hint = info.auth_hint;
  ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
                              info.discover, info.want_xlocked, &hint);
  if (ret > 0)
    return;  // traversal is waiting on an async fetch/discover; it will re-enter
  if (hint != mds->get_nodeid())
    info.auth_hint = hint;  // remember where traversal says the auth likely is
  do_open_ino(ino, info, ret);
}
9041
// Fetch a dentry (or whole dirfrag) needed by the open_ino traversal.
// When batch mode is on and a specific dentry is named, the fetch is
// queued into open_ino_batched_fetch and issued later by
// open_ino_batch_submit() as one keys-fetch per dirfrag.
void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, bool parent,
                                  CDir *dir, std::string_view dname)
{
  // during rejoin an undef dirfrag must still be a leaf of the fragtree
  if (dir->state_test(CDir::STATE_REJOINUNDEF))
    ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));

  auto fin = new C_MDC_OpenInoTraverseDir(this, ino, m, parent);
  if (open_ino_batch && !dname.empty()) {
    // defer: accumulate (dname, waiter) pairs per dirfrag
    auto& p = open_ino_batched_fetch[dir];
    p.first.emplace_back(dname);
    p.second.emplace_back(fin);
    return;
  }

  dir->fetch(dname, CEPH_NOSNAP, fin);
  if (mds->logger)
    mds->logger->inc(l_mds_openino_dir_fetch);
}
9060
/**
 * Walk the backtrace ancestor list looking for the target ino.
 *
 * Ancestors are ordered nearest-first: ancestors[0] is the immediate
 * parent.  For each ancestor we try to resolve the (dirino, dname) pair
 * in cache, optionally issuing discovers/fetches.
 *
 * @param ino target inode number
 * @param m   non-null when servicing a peer MMDSOpenIno (re-dispatched
 *            via C_MDC_OpenInoTraverseDir on async completion)
 * @param discover whether we may discover from other ranks
 * @param want_xlocked pass xlock-want on discover of the parent dentry
 * @param hint out: rank that likely holds the relevant dirfrag
 * @return > 0 if waiting on an async operation (a retry context was
 *         queued), 0 on "not found here, no error", negative error for
 *         a definitive local failure on the immediate parent.
 */
int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
                                   const vector<inode_backpointer_t>& ancestors,
                                   bool discover, bool want_xlocked, mds_rank_t *hint)
{
  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
  int err = 0;
  for (unsigned i = 0; i < ancestors.size(); i++) {
    const auto& ancestor = ancestors.at(i);
    CInode *diri = get_inode(ancestor.dirino);

    if (!diri) {
      // mdsdir ancestors can always be opened from their owner rank
      if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
        open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
        return 1;
      }
      continue;  // try a farther ancestor
    }

    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
      // undef inode from rejoin: walk up to the first well-defined
      // dirfrag and fetch from there to materialize the chain
      CDentry *dn = diri->get_parent_dn();
      CDir *dir = dn->get_dir();
      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
             dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
        dn = dir->get_inode()->get_parent_dn();
        dir = dn->get_dir();
      }
      _open_ino_fetch_dir(ino, m, i == 0, dir, dn->name);
      return 1;
    }

    if (!diri->is_dir()) {
      dout(10) << " " << *diri << " is not dir" << dendl;
      if (i == 0)
        err = -CEPHFS_ENOTDIR;  // only definitive for the immediate parent
      break;
    }

    const string& name = ancestor.dname;
    frag_t fg = diri->pick_dirfrag(name);
    CDir *dir = diri->get_dirfrag(fg);
    if (!dir) {
      if (diri->is_auth()) {
        if (diri->is_frozen()) {
          dout(10) << " " << *diri << " is frozen, waiting " << dendl;
          diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
          return 1;
        }
        dir = diri->get_or_open_dirfrag(this, fg);
      } else if (discover) {
        open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
        return 1;
      }
    }
    if (dir) {
      // the ino this dentry should link to: the previous (closer)
      // ancestor's dir, or the target itself for the immediate parent
      inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
      CDentry *dn = dir->lookup(name);
      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
      if (dir->is_auth()) {
        if (dnl && dnl->is_primary() &&
            dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
          dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
          _open_ino_fetch_dir(ino, m, i == 0, dir, name);
          return 1;
        }

        // dentry absent but the dirfrag isn't fully loaded (and the
        // bloom filter doesn't rule the name out): fetch it
        if (!dnl && !dir->is_complete() &&
            (!dir->has_bloom() || dir->is_in_bloom(name))) {
          dout(10) << " fetching incomplete " << *dir << dendl;
          _open_ino_fetch_dir(ino, m, i == 0, dir, name);
          return 1;
        }

        dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
        if (i == 0)
          err = -CEPHFS_ENOENT;
      } else if (discover) {
        if (!dnl) {
          filepath path(name, 0);
          discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
                        (i == 0 && want_xlocked));
          return 1;
        }
        if (dnl->is_null() && !dn->lock.can_read(-1)) {
          dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
          dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
          return 1;
        }
        dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
        if (i == 0)
          err = -CEPHFS_ENOENT;
      }
    }
    // report who owns the closest resolvable piece as the auth hint
    if (hint && i == 0)
      *hint = dir ? dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}
9159
9160 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
9161 {
9162 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9163
9164 MDSContext::vec waiters;
9165 waiters.swap(info.waiters);
9166 opening_inodes.erase(ino);
9167 finish_contexts(g_ceph_context, waiters, ret);
9168 }
9169
// Central retry state machine for open_ino.  Depending on `info` state,
// either re-checks peers, re-fetches the backtrace, or recursively opens
// the parent directory.  `err` < 0 (other than EAGAIN) resets the state
// so every source is retried from scratch.
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -CEPHFS_EAGAIN) {
    // hard failure: forget what we knew and start over
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    // ENOENT/ENOTDIR are expected transient outcomes; only remember
    // other errors for the final report
    if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    // read the backtrace object from RADOS
    info.check_peers = true;   // after the fetch, peers are next
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
                    new C_OnFinisher(fin, mds->finisher));
  } else {
    // we have ancestors: recursively open the immediate parent dir
    ceph_assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(),
             new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}
9211
// Ask another rank whether it has the inode.  Prefers the auth hint if
// that rank is active; otherwise round-robins over active ranks we have
// not checked yet.  If every peer has been checked, falls back to
// do_open_ino() to try another strategy.
void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  // during rejoin we can also query rejoining peers; otherwise only
  // clientreplay-or-later ranks can answer
  if (mds->get_state() == MDSMap::STATE_REJOIN)
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
  else
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "do_open_ino_peer " << ino << " active " << active
           << " all " << all << " checked " << info.checked << dendl;

  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t peer = MDS_RANK_NONE;
  if (info.auth_hint >= 0 && info.auth_hint != whoami) {
    if (active.count(info.auth_hint)) {
      peer = info.auth_hint;
      info.auth_hint = MDS_RANK_NONE;  // hint is consumed once
    }
  } else {
    // first active rank (other than us) not yet checked
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != whoami && info.checked.count(*p) == 0) {
        peer = *p;
        break;
      }
  }
  if (peer < 0) {
    all.erase(whoami);
    if (all != info.checked) {
      // some ranks are not active yet; kick_open_ino_peers() will retry
      dout(10) << " waiting for more peers to be active" << dendl;
    } else {
      dout(10) << " all MDS peers have been checked " << dendl;
      do_open_ino(ino, info, 0);
    }
  } else {
    info.checking = peer;
    vector<inode_backpointer_t> *pa = NULL;
    // got backtrace from peer or backtrace just fetched
    if (info.discover || !info.fetch_backtrace)
      pa = &info.ancestors;
    mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
    if (mds->logger)
      mds->logger->inc(l_mds_openino_peer_discover);
  }
}
9257
// Handle a peer's MMDSOpenIno query: reply with our ancestor chain if we
// have the inode as auth, a hint if we have a replica, or the result of a
// local (non-discovering) traversal of the supplied ancestors.
void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
{
  // too early in startup to answer usefully; sender will retry via
  // kick_open_ino_peers when we become active
  if (mds->get_state() < MDSMap::STATE_REJOIN &&
      mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
    return;
  }

  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;

  auto from = mds_rank_t(m->get_source().num());
  inodeno_t ino = m->ino;
  ref_t<MMDSOpenInoReply> reply;
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " have " << *in << dendl;
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
    if (in->is_auth()) {
      touch_inode(in);
      // collect the full ancestor chain, nearest parent first
      while (1) {
        CDentry *pdn = in->get_parent_dn();
        if (!pdn)
          break;
        CInode *diri = pdn->get_dir()->get_inode();
        reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
                                                       in->get_version()));
        in = diri;
      }
    } else {
      // only a replica: point the sender at the auth rank
      reply->hint = in->authority().first;
    }
  } else if (err < 0) {
    // a previous async step (fetch/discover) on behalf of this query failed
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
  } else {
    // try resolving locally using the ancestors the sender provided
    mds_rank_t hint = MDS_RANK_NONE;
    int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
    if (ret > 0)
      return;  // waiting on async work; this handler re-runs on completion
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
  }
  mds->send_message_mds(reply, from);
}
9299
// Handle a peer's answer to our MMDSOpenIno.  Only a reply from the rank
// we are currently checking is acted on; stale replies are ignored.
void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
{
  dout(10) << "handle_open_ino_reply " << *m << dendl;

  inodeno_t ino = m->ino;
  mds_rank_t from = mds_rank_t(m->get_source().num());
  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end() && it->second.checking == from) {
    open_ino_info_t& info = it->second;
    info.checking = MDS_RANK_NONE;
    info.checked.insert(from);

    CInode *in = get_inode(ino);
    if (in) {
      // inode arrived in cache while the query was in flight
      dout(10) << " found cached " << *in << dendl;
      open_ino_finish(ino, info, in->authority().first);
    } else if (!m->ancestors.empty()) {
      // peer has the inode and sent its ancestor chain
      dout(10) << " found ino " << ino << " on mds." << from << dendl;
      if (!info.want_replica) {
        // caller only wanted to locate it, not replicate it
        open_ino_finish(ino, info, from);
        return;
      }

      // we want a local replica: traverse the peer's ancestors with
      // discovery enabled
      info.ancestors = m->ancestors;
      info.auth_hint = from;
      info.checking = mds->get_nodeid();
      info.discover = true;
      _open_ino_traverse_dir(ino, info, 0);
    } else if (m->error) {
      dout(10) << " error " << m->error << " from mds." << from << dendl;
      do_open_ino(ino, info, m->error);
    } else {
      // peer doesn't have it; maybe it hinted at another rank
      if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
        info.auth_hint = m->hint;
        info.checked.erase(m->hint);  // allow querying the hinted rank
      }
      do_open_ino_peer(ino, info);
    }
  }
}
9340
9341 void MDCache::kick_open_ino_peers(mds_rank_t who)
9342 {
9343 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9344
9345 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9346 p != opening_inodes.end();
9347 ++p) {
9348 open_ino_info_t& info = p->second;
9349 if (info.checking == who) {
9350 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9351 info.checking = MDS_RANK_NONE;
9352 do_open_ino_peer(p->first, info);
9353 } else if (info.checking == MDS_RANK_NONE) {
9354 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9355 do_open_ino_peer(p->first, info);
9356 }
9357 }
9358 }
9359
// Enter batch mode: subsequent _open_ino_fetch_dir() calls with a dentry
// name are queued instead of issued, until open_ino_batch_submit().
void MDCache::open_ino_batch_start()
{
  dout(10) << __func__ << dendl;
  open_ino_batch = true;
}
9365
// Leave batch mode and issue one combined fetch_keys() per dirfrag for
// all dentries queued since open_ino_batch_start(), waking the queued
// open_ino waiters when each fetch completes.
void MDCache::open_ino_batch_submit()
{
  dout(10) << __func__ << dendl;
  open_ino_batch = false;

  for (auto& [dir, p] : open_ino_batched_fetch) {
    CInode *in = dir->inode;
    // p.first: dentry names, p.second: waiters (parallel vectors)
    std::vector<dentry_key_t> keys;
    for (auto& dname : p.first)
      keys.emplace_back(CEPH_NOSNAP, dname, in->hash_dentry_name(dname));
    dir->fetch_keys(keys,
        new MDSInternalContextWrapper(mds,
          new LambdaContext([this, waiters = std::move(p.second)](int r) mutable {
            // re-queue at the front so the traversals resume promptly
            mds->queue_waiters_front(waiters);
          })
        )
      );
    if (mds->logger)
      mds->logger->inc(l_mds_openino_dir_fetch);
  }
  open_ino_batched_fetch.clear();
}
9388
/**
 * Open an inode by number (public entry point).
 *
 * @param ino inode to open
 * @param pool pool where the backtrace object lives; < 0 means the
 *        default file layout's data pool
 * @param fin completed with the auth rank (>= 0) or a negative error
 * @param want_replica replicate the inode locally, not just locate it
 * @param want_xlocked pass xlock-want when discovering the parent dentry
 * @param ancestors_hint known ancestor chain, skips the backtrace fetch
 * @param auth_hint rank believed to hold the inode
 *
 * Concurrent opens of the same ino are coalesced onto the existing
 * open_ino_info_t entry.
 */
void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
                       bool want_replica, bool want_xlocked,
                       vector<inode_backpointer_t> *ancestors_hint,
                       mds_rank_t auth_hint)
{
  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
           << want_replica << dendl;

  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end()) {
    // an open is already in flight: merge our requirements into it
    open_ino_info_t& info = it->second;
    if (want_replica) {
      info.want_replica = true;
      if (want_xlocked && !info.want_xlocked) {
        if (!info.ancestors.empty()) {
          // the in-flight discover didn't ask for xlock-want; issue an
          // extra discover of the parent dentry with it set
          CInode *diri = get_inode(info.ancestors[0].dirino);
          if (diri) {
            frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
            CDir *dir = diri->get_dirfrag(fg);
            if (dir && !dir->is_auth()) {
              filepath path(info.ancestors[0].dname, 0);
              discover_path(dir, CEPH_NOSNAP, path, NULL, true);
            }
          }
        }
        info.want_xlocked = true;
      }
    }
    info.waiters.push_back(fin);
  } else {
    // first opener: create tracking state and start the state machine
    open_ino_info_t& info = opening_inodes[ino];
    info.want_replica = want_replica;
    info.want_xlocked = want_xlocked;
    info.tid = ++open_ino_last_tid;
    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
    info.waiters.push_back(fin);
    if (auth_hint != MDS_RANK_NONE)
      info.auth_hint = auth_hint;
    if (ancestors_hint) {
      // caller already knows the ancestors: skip the backtrace fetch
      info.ancestors = std::move(*ancestors_hint);
      info.fetch_backtrace = false;
      info.checking = mds->get_nodeid();
      _open_ino_traverse_dir(ino, info, 0);
    } else {
      do_open_ino(ino, info, 0);
    }
  }
}
9437
9438 /* ---------------------------- */
9439
/*
 * Search for a given inode on MDS peers.  Optionally start with the given
 * rank as a hint.  The callback completes with 0 once the inode is in our
 * cache, or -CEPHFS_ESTALE if no peer has it.

 TODO
  - recover from mds node failure, recovery
  - traverse path

 */
void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
                             mds_rank_t hint, bool path_locked)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  CInode *in = get_inode(ino);
  // an inode being purged is as good as gone; report stale immediately
  if (in && in->state_test(CInode::STATE_PURGING)) {
    c->complete(-CEPHFS_ESTALE);
    return;
  }
  // callers must only use this when the inode is not in cache
  ceph_assert(!in);

  // register the search and kick off the first peer query
  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];
  fip.ino = ino;
  fip.tid = tid;
  fip.fin = c;
  fip.path_locked = path_locked;
  fip.hint = hint;
  _do_find_ino_peer(fip);
}
9469
9470 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9471 {
9472 set<mds_rank_t> all, active;
9473 mds->mdsmap->get_mds_set(all);
9474 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9475
9476 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9477 << " active " << active << " all " << all
9478 << " checked " << fip.checked
9479 << dendl;
9480
9481 mds_rank_t m = MDS_RANK_NONE;
9482 if (fip.hint >= 0) {
9483 m = fip.hint;
9484 fip.hint = MDS_RANK_NONE;
9485 } else {
9486 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9487 if (*p != mds->get_nodeid() &&
9488 fip.checked.count(*p) == 0) {
9489 m = *p;
9490 break;
9491 }
9492 }
9493 if (m == MDS_RANK_NONE) {
9494 all.erase(mds->get_nodeid());
9495 if (all != fip.checked) {
9496 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9497 } else {
9498 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9499 fip.fin->complete(-CEPHFS_ESTALE);
9500 find_ino_peer.erase(fip.tid);
9501 }
9502 } else {
9503 fip.checking = m;
9504 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
9505 }
9506 }
9507
// Handle a peer's MMDSFindIno: reply with the inode's path if we have it
// cached, or an empty path otherwise.
void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
{
  // too early in startup to answer; sender retries via kick_find_ino_peers
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  dout(10) << "handle_find_ino " << *m << dendl;
  auto r = make_message<MMDSFindInoReply>(m->tid);
  CInode *in = get_inode(m->ino);
  if (in) {
    in->make_path(r->path);
    dout(10) << " have " << r->path << " " << *in << dendl;

    /*
     * If the the CInode was just created by using openc in current
     * auth MDS, but the client just sends a getattr request to another
     * replica MDS. Then here it will make a path of '#INODE-NUMBER'
     * only because the CInode hasn't been linked yet, and the replica
     * MDS will keep retrying until the auth MDS flushes the mdlog and
     * the C_MDS_openc_finish and link_primary_inode are called at most
     * 5 seconds later.
     */
    if (!in->get_parent_dn() && in->is_auth()) {
      mds->mdlog->flush();
    }
  }
  mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
}
9536
9537
// Handle a peer's MMDSFindInoReply.  If a path came back, traverse it
// (with discovery) to pull the inode into our cache; otherwise move on
// to the next peer.
void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
{
  auto p = find_ino_peer.find(m->tid);
  if (p != find_ino_peer.end()) {
    dout(10) << "handle_find_ino_reply " << *m << dendl;
    find_ino_peer_info_t& fip = p->second;

    // success?
    if (get_inode(fip.ino)) {
      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
      mds->queue_waiter(fip.fin);
      find_ino_peer.erase(p);
      return;
    }

    mds_rank_t from = mds_rank_t(m->get_source().num());
    if (fip.checking == from)
      fip.checking = MDS_RANK_NONE;
    fip.checked.insert(from);

    if (!m->path.empty()) {
      // we got a path!
      vector<CDentry*> trace;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int flags = MDS_TRAVERSE_DISCOVER;
      if (fip.path_locked)
        flags |= MDS_TRAVERSE_PATH_LOCKED;
      int r = path_traverse(null_ref, cf, m->path, flags, &trace);
      if (r > 0)
        return;  // traversal is waiting; this handler re-runs on completion
      // traversal failed (e.g. path raced with a rename): restart the
      // search from scratch
      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
              << ", retrying" << dendl;
      fip.checked.clear();
      _do_find_ino_peer(fip);
    } else {
      // nope, continue.
      _do_find_ino_peer(fip);
    }
  } else {
    // reply for a search we already finished or abandoned
    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
  }
}
9581
9582 void MDCache::kick_find_ino_peers(mds_rank_t who)
9583 {
9584 // find_ino_peers requests we should move on from
9585 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9586 p != find_ino_peer.end();
9587 ++p) {
9588 find_ino_peer_info_t& fip = p->second;
9589 if (fip.checking == who) {
9590 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9591 fip.checking = MDS_RANK_NONE;
9592 _do_find_ino_peer(fip);
9593 } else if (fip.checking == MDS_RANK_NONE) {
9594 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9595 _do_find_ino_peer(fip);
9596 }
9597 }
9598 }
9599
9600 /* ---------------------------- */
9601
9602 int MDCache::get_num_client_requests()
9603 {
9604 int count = 0;
9605 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9606 p != active_requests.end();
9607 ++p) {
9608 MDRequestRef& mdr = p->second;
9609 if (mdr->reqid.name.is_client() && !mdr->is_peer())
9610 count++;
9611 }
9612 return count;
9613 }
9614
// Register a new client request.  Returns a null ref if a request with
// the same reqid is already active (e.g. we lost/won a forward race with
// a peer), in which case the message is either queued or dropped.
MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
{
  // did we win a forward race against a peer?
  if (active_requests.count(req->get_reqid())) {
    MDRequestRef& mdr = active_requests[req->get_reqid()];
    ceph_assert(mdr);
    if (mdr->is_peer()) {
      // the peer sub-request must finish before we can process the
      // client's copy; retry the message afterwards
      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
    } else {
      // duplicate delivery of a request we're already processing
      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
    }
    return MDRequestRef();
  }

  // register new client request
  MDRequestImpl::Params params;
  params.reqid = req->get_reqid();
  params.attempt = req->get_num_fwd();
  params.client_req = req;
  // carry the message's timing milestones into the op tracker
  params.initiated = req->get_recv_stamp();
  params.throttled = req->get_throttle_stamp();
  params.all_read = req->get_recv_complete_stamp();
  params.dispatched = req->get_dispatch_stamp();

  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  active_requests[params.reqid] = mdr;
  mdr->set_op_stamp(req->get_stamp());
  dout(7) << "request_start " << *mdr << dendl;
  return mdr;
}
9647
// Register a peer (replica-side) request initiated by another MDS rank
// via message `m`.  A given reqid may only be started once.
MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
{
  int by = m->get_source().num();  // rank that leads this operation
  MDRequestImpl::Params params;
  params.reqid = ri;
  params.attempt = attempt;
  params.triggering_peer_req = m;
  params.peer_to = by;
  // carry the message's timing milestones into the op tracker
  params.initiated = m->get_recv_stamp();
  params.throttled = m->get_throttle_stamp();
  params.all_read = m->get_recv_complete_stamp();
  params.dispatched = m->get_dispatch_stamp();
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  ceph_assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_peer " << *mdr << " by mds." << by << dendl;
  return mdr;
}
9667
// Register an internally generated request (fragment, scrub, export,
// ...).  The reqid is minted from our own rank and tid, so a collision
// with an existing request indicates a serious bug and aborts.
MDRequestRef MDCache::request_start_internal(int op)
{
  utime_t now = ceph_clock_now();
  MDRequestImpl::Params params;
  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
  params.reqid.tid = mds->issue_tid();
  // internal ops have no message; all milestones are "now"
  params.initiated = now;
  params.throttled = now;
  params.all_read = now;
  params.dispatched = now;
  params.internal_op = op;
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);

  if (active_requests.count(mdr->reqid)) {
    auto& _mdr = active_requests[mdr->reqid];
    dout(0) << __func__ << " existing " << *_mdr << " op " << _mdr->internal_op << dendl;
    dout(0) << __func__ << " new " << *mdr << " op " << op << dendl;
    ceph_abort();
  }
  active_requests[mdr->reqid] = mdr;
  dout(7) << __func__ << " " << *mdr << " op " << op << dendl;
  return mdr;
}
9692
9693 MDRequestRef MDCache::request_get(metareqid_t rid)
9694 {
9695 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9696 ceph_assert(p != active_requests.end());
9697 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9698 return p->second;
9699 }
9700
// Finish a request: run any pending peer commit/rollback first (which
// re-enters this function), bump internal-op counters, then clean up.
void MDCache::request_finish(MDRequestRef& mdr)
{
  dout(7) << "request_finish " << *mdr << dendl;
  mdr->mark_event("finishing request");

  // peer finisher?
  if (mdr->has_more() && mdr->more()->peer_commit) {
    Context *fin = mdr->more()->peer_commit;
    mdr->more()->peer_commit = 0;  // consume it so re-entry falls through
    int ret;
    if (mdr->aborted) {
      // leader asked us to roll back instead of commit
      mdr->aborted = false;
      ret = -1;
      mdr->more()->peer_rolling_back = true;
    } else {
      ret = 0;
      mdr->committing = true;
    }
    fin->complete(ret);  // this must re-call request_finish.
    return;
  }

  // per-internal-op perf counters
  switch(mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      logger->inc(l_mdss_ireq_fragmentdir);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      logger->inc(l_mdss_ireq_exportdir);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      logger->inc(l_mdss_ireq_enqueue_scrub);
      break;
    case CEPH_MDS_OP_FLUSH:
      logger->inc(l_mdss_ireq_flush);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      logger->inc(l_mdss_ireq_fragstats);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      logger->inc(l_mdss_ireq_inodestats);
      break;
  }

  request_cleanup(mdr);
}
9746
9747
// Forward a request to another MDS rank and drop our local state.
// Client requests are re-sent to `who` (batch heads forward the whole
// batch); internal ops cannot be forwarded and are cancelled with
// -CEPHFS_EXDEV; requests that originated from another MDS are dropped.
// The `port` parameter is unused here.
9748 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9749 {
9750 CachedStackStringStream css;
9751 *css << "forwarding request to mds." << who;
9752 mdr->mark_event(css->strv());
9753 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9754 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9755 << *mdr->client_request << dendl;
9756 if (mdr->is_batch_head()) {
9757 mdr->release_batch_op()->forward(who);
9758 } else {
9759 mds->forward_message_mds(mdr->release_client_request(), who);
9760 }
9761 if (mds->logger) mds->logger->inc(l_mds_forward);
9762 } else if (mdr->internal_op >= 0) {
9763 dout(10) << "request_forward on internal op; cancelling" << dendl;
// NOTE(review): assumes internal_op_finish is non-null for every
// forwardable internal op -- TODO confirm against all internal-op callers.
9764 mdr->internal_op_finish->complete(-CEPHFS_EXDEV);
9765 } else {
9766 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9767 << " was from mds" << dendl;
9768 }
9769 request_cleanup(mdr);
9770 }
9771
9772
9773 void MDCache::dispatch_request(MDRequestRef& mdr)
9774 {
9775 if (mdr->client_request) {
9776 mds->server->dispatch_client_request(mdr);
9777 } else if (mdr->peer_request) {
9778 mds->server->dispatch_peer_request(mdr);
9779 } else {
9780 switch (mdr->internal_op) {
9781 case CEPH_MDS_OP_FRAGMENTDIR:
9782 dispatch_fragment_dir(mdr);
9783 break;
9784 case CEPH_MDS_OP_EXPORTDIR:
9785 migrator->dispatch_export_dir(mdr, 0);
9786 break;
9787 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9788 enqueue_scrub_work(mdr);
9789 break;
9790 case CEPH_MDS_OP_FLUSH:
9791 flush_dentry_work(mdr);
9792 break;
9793 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9794 repair_dirfrag_stats_work(mdr);
9795 break;
9796 case CEPH_MDS_OP_REPAIR_INODESTATS:
9797 repair_inode_stats_work(mdr);
9798 break;
9799 case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
9800 rdlock_dirfrags_stats_work(mdr);
9801 break;
9802 default:
9803 ceph_abort();
9804 }
9805 }
9806 }
9807
9808
// Release everything this request holds on OTHER ranks: send OP_FINISH to
// every peer MDS (which implicitly drops remote dentry pins and locks on
// their side), then strip the corresponding foreign xlocks / remote wrlocks
// from our local lock list.  Safe to call more than once.
9809 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9810 {
9811 if (!mdr->has_more())
9812 return;
9813
9814 // clean up peers
9815 // (will implicitly drop remote dn pins)
9816 for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin();
9817 p != mdr->more()->peers.end();
9818 ++p) {
9819 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
9820 MMDSPeerRequest::OP_FINISH);
9821
// A killed-but-not-committing request tells peers to abort; otherwise, if
// this peer is the rename srcdn auth, hand back the imported-caps blob.
9822 if (mdr->killed && !mdr->committing) {
9823 r->mark_abort();
9824 } else if (mdr->more()->srcdn_auth_mds == *p &&
9825 mdr->more()->inode_import.length() > 0) {
9826 // information about rename imported caps
9827 r->inode_export = std::move(mdr->more()->inode_import);
9828 }
9829
9830 mds->send_message_mds(r, *p);
9831 }
9832
9833 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9834 * implicitly. Note that we don't call the finishers -- there shouldn't
9835 * be any on a remote lock and the request finish wakes up all
9836 * the waiters anyway! */
9837
// Erase-while-iterating: `erase(it++)` advances before invalidation.
9838 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9839 SimpleLock *lock = it->lock;
9840 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9841 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9842 << " on " << lock->get_parent() << dendl;
9843 lock->put_xlock();
9844 mdr->locks.erase(it++);
9845 } else if (it->is_remote_wrlock()) {
9846 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9847 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
// Keep the entry if a local wrlock remains; otherwise drop it entirely.
9848 if (it->is_wrlock()) {
9849 it->clear_remote_wrlock();
9850 ++it;
9851 } else {
9852 mdr->locks.erase(it++);
9853 }
9854 } else {
9855 ++it;
9856 }
9857 }
9858
9859 mdr->more()->peers.clear(); /* we no longer have requests out to them, and
9860 * leaving them in can cause double-notifies as
9861 * this function can get called more than once */
9862 }
9863
// Drop foreign locks plus all local locks except rdlocks.
9864 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9865 {
9866 request_drop_foreign_locks(mdr);
9867 mds->locker->drop_non_rdlocks(mdr.get());
9868 }
9869
// Drop foreign locks plus ALL local locks held by this request.
9870 void MDCache::request_drop_locks(MDRequestRef& mdr)
9871 {
9872 request_drop_foreign_locks(mdr);
9873 mds->locker->drop_locks(mdr.get());
9874 }
9875
// Tear down an MDRequest: wake any waiters, release locks, pins and
// stickydirs, detach from the session, and deregister from
// active_requests.  The teardown order mirrors acquisition order in
// reverse (locks before auth pins before cache pins).
9876 void MDCache::request_cleanup(MDRequestRef& mdr)
9877 {
9878 dout(15) << "request_cleanup " << *mdr << dendl;
9879
9880 if (mdr->has_more()) {
9881 if (mdr->more()->is_ambiguous_auth)
9882 mdr->clear_ambiguous_auth();
9883 if (!mdr->more()->waiting_for_finish.empty())
9884 mds->queue_waiters(mdr->more()->waiting_for_finish);
9885 }
9886
9887 request_drop_locks(mdr);
9888
9889 // drop (local) auth pins
9890 mdr->drop_local_auth_pins();
9891
9892 // drop stickydirs
9893 mdr->put_stickydirs();
9894
// Releasing locks may unblock client cap releases; give Locker a nudge.
9895 mds->locker->kick_cap_releases(mdr);
9896
9897 // drop cache pins
9898 mdr->drop_pins();
9899
9900 // remove from session
9901 mdr->item_session_request.remove_myself();
9902
9903 // remove from map
9904 active_requests.erase(mdr->reqid);
9905
9906 if (mds->logger)
9907 log_stat();
9908
9909 mdr->mark_event("cleaned up request");
9910 }
9911
9912 void MDCache::request_kill(MDRequestRef& mdr)
9913 {
9914 // rollback peer requests is tricky. just let the request proceed.
9915 if (mdr->has_more() &&
9916 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) {
9917 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9918 ceph_assert(mdr->more()->witnessed.empty());
9919 mdr->aborted = true;
9920 dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl;
9921 } else {
9922 dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl;
9923 }
9924
9925 ceph_assert(mdr->used_prealloc_ino == 0);
9926 ceph_assert(mdr->prealloc_inos.empty());
9927
9928 mdr->session = NULL;
9929 mdr->item_session_request.remove_myself();
9930 return;
9931 }
9932
9933 mdr->killed = true;
9934 mdr->mark_event("killing request");
9935
9936 if (mdr->committing) {
9937 dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
9938 mdr->item_session_request.remove_myself();
9939 } else {
9940 dout(10) << "request_kill " << *mdr << dendl;
9941 request_cleanup(mdr);
9942 }
9943 }
9944
9945 // -------------------------------------------------------------------------------
9946 // SNAPREALMS
9947
9948 void MDCache::create_global_snaprealm()
9949 {
9950 CInode *in = new CInode(this); // dummy inode
9951 create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9952 add_inode(in);
9953 global_snaprealm = in->snaprealm;
9954 }
9955
// Invalidate cached snap metadata for in's snaprealm and every descendant
// realm (BFS via `q`), and optionally send each affected client a single
// MClientSnap describing the change.  For a SPLIT, the message also lists
// the inodes-with-caps and child realms that moved into the new realm.
9956 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
9957 {
9958 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9959
9960 vector<inodeno_t> split_inos;
9961 vector<inodeno_t> split_realms;
9962
9963 if (notify_clients) {
9964 if (snapop == CEPH_SNAP_OP_SPLIT) {
9965 // notify clients of update|split
9966 for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p)
9967 split_inos.push_back((*p)->ino());
9968
9969 for (auto& r : in->snaprealm->open_children)
9970 split_realms.push_back(r->inode->ino());
9971 }
9972 }
9973
// One update message per client, shared across all realms visited below.
9974 map<client_t, ref_t<MClientSnap>> updates;
9975 list<SnapRealm*> q;
9976 q.push_back(in->snaprealm);
9977 while (!q.empty()) {
9978 SnapRealm *realm = q.front();
9979 q.pop_front();
9980
9981 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9982 realm->invalidate_cached_snaps();
9983
9984 if (notify_clients) {
9985 for (const auto& p : realm->client_caps) {
9986 const auto& client = p.first;
9987 const auto& caps = p.second;
9988 ceph_assert(!caps->empty());
9989
// emplace() only builds the message the first time we see this client.
9990 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9991 if (em.second) {
9992 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
9993 update->head.split = in->ino();
9994 update->split_inos = split_inos;
9995 update->split_realms = split_realms;
9996 update->bl = mds->server->get_snap_trace(em.first->first, in->snaprealm);
9997 em.first->second = std::move(update);
9998 }
9999 }
10000 }
10001
10002 // notify for active children, too.
10003 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
10004 for (auto& r : realm->open_children)
10005 q.push_back(r);
10006 }
10007
10008 if (notify_clients)
10009 send_snaps(updates);
10010 }
10011
// Propagate a snap change on an auth inode to other MDS ranks.  With a
// snaptable transaction id (stid > 0) the update goes to all ranks at or
// beyond RESOLVE (excluding ourselves); otherwise only to ranks holding a
// replica of the inode.  A table-backed update also notifies clients of
// the global snaprealm change.
10012 void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
10013 {
10014 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
10015 ceph_assert(in->is_auth());
10016
10017 set<mds_rank_t> mds_set;
10018 if (stid > 0) {
10019 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
10020 mds_set.erase(mds->get_nodeid());
10021 } else {
10022 in->list_replicas(mds_set);
10023 }
10024
10025 if (!mds_set.empty()) {
// Encode the snap blob once and share it across all outgoing messages.
10026 bufferlist snap_blob;
10027 in->encode_snap(snap_blob);
10028
10029 for (auto p : mds_set) {
10030 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
10031 m->snap_blob = snap_blob;
10032 mds->send_message_mds(m, p);
10033 }
10034 }
10035
10036 if (stid > 0)
10037 notify_global_snaprealm_update(snap_op);
10038 }
10039
// Apply a snap update received from another rank to our replica of the
// inode, gated on our MDS state: ignored before RESOLVE, and during
// rejoin the realm is queued for later snap-parent opening instead of
// notifying clients immediately.
10040 void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
10041 {
10042 mds_rank_t from = mds_rank_t(m->get_source().num());
10043 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
10044
10045 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
10046 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
10047 return;
10048 }
10049
10050 // null rejoin_done means open_snaprealms() has already been called
10051 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
10052 (mds->is_rejoin() && !rejoin_done);
10053
10054 if (m->get_tid() > 0) {
// Acknowledge the snaptable transaction even if we don't have the inode.
10055 mds->snapclient->notify_commit(m->get_tid());
10056 if (notify_clients)
10057 notify_global_snaprealm_update(m->get_snap_op());
10058 }
10059
10060 CInode *in = get_inode(m->get_ino());
10061 if (in) {
10062 ceph_assert(!in->is_auth());
10063 if (mds->get_state() > MDSMap::STATE_REJOIN ||
10064 (mds->is_rejoin() && !in->is_rejoining())) {
10065 auto p = m->snap_blob.cbegin();
10066 in->decode_snap(p);
10067
// Still rejoining: pin the inode and defer snap-parent opening.
10068 if (!notify_clients) {
10069 if (!rejoin_pending_snaprealms.count(in)) {
10070 in->get(CInode::PIN_OPENINGSNAPPARENTS);
10071 rejoin_pending_snaprealms.insert(in);
10072 }
10073 }
10074 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
10075 }
10076 }
10077 }
10078
10079 void MDCache::notify_global_snaprealm_update(int snap_op)
10080 {
10081 if (snap_op != CEPH_SNAP_OP_DESTROY)
10082 snap_op = CEPH_SNAP_OP_UPDATE;
10083 set<Session*> sessions;
10084 mds->sessionmap.get_client_session_set(sessions);
10085 for (auto &session : sessions) {
10086 if (!session->is_open() && !session->is_stale())
10087 continue;
10088 auto update = make_message<MClientSnap>(snap_op);
10089 update->head.split = global_snaprealm->inode->ino();
10090 update->bl = mds->server->get_snap_trace(session, global_snaprealm);
10091 mds->send_message_client_counted(update, session);
10092 }
10093 }
10094
10095 // -------------------------------------------------------------------------------
10096 // STRAYS
10097
// Completion context that resumes a stray-directory scan at dirfrag `next`
// (used when a frag is frozen or its contents must be fetched first).
10098 struct C_MDC_RetryScanStray : public MDCacheContext {
10099 dirfrag_t next;
10100 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
10101 void finish(int r) override {
10102 mdcache->scan_stray_dir(next);
10103 }
10104 };
10105
// Walk all stray directories, marking dentries STRAY (and zero-nlink
// primary inodes ORPHAN) and re-evaluating each stray inode.  `next` is a
// resume cursor: the scan restarts from that dirfrag after waiting on an
// unfreezable or incomplete frag via C_MDC_RetryScanStray.
10106 void MDCache::scan_stray_dir(dirfrag_t next)
10107 {
10108 dout(10) << "scan_stray_dir " << next << dendl;
10109
// Re-resolve the cursor frag in case the dirfragtree changed while waiting.
10110 if (next.ino)
10111 next.frag = strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()];
10112
10113 for (int i = 0; i < NUM_STRAY; ++i) {
// Skip stray dirs that precede the resume point.
10114 if (strays[i]->ino() < next.ino)
10115 continue;
10116
10117 std::vector<CDir*> ls;
10118 strays[i]->get_dirfrags(ls);
10119
10120 for (const auto& dir : ls) {
10121 if (dir->get_frag() < next.frag)
10122 continue;
10123
// Cannot auth-pin (frozen/freezing): wait and resume at this frag.
10124 if (!dir->can_auth_pin()) {
10125 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag()));
10126 return;
10127 }
10128
// Contents not loaded yet: fetch, then resume at this frag.
10129 if (!dir->is_complete()) {
10130 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
10131 return;
10132 }
10133
10134 for (auto &p : dir->items) {
10135 CDentry *dn = p.second;
10136 dn->state_set(CDentry::STATE_STRAY);
10137 CDentry::linkage_t *dnl = dn->get_projected_linkage();
10138 if (dnl->is_primary()) {
10139 CInode *in = dnl->get_inode();
10140 if (in->get_inode()->nlink == 0)
10141 in->state_set(CInode::STATE_ORPHAN);
10142 maybe_eval_stray(in);
10143 }
10144 }
10145 }
// Finished this stray dir; subsequent dirs start from their first frag.
10146 next.frag = frag_t();
10147 }
10148 }
10149
// Asynchronously read the "parent" xattr (the backtrace) of ino's first
// object in `pool` into `bl`, completing `fin` when done.
10150 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
10151 {
10152 object_t oid = CInode::get_object_name(ino, frag_t(), "");
10153 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
10154 if (mds->logger)
10155 mds->logger->inc(l_mds_openino_backtrace_fetch);
10156 }
10157
10158
10159
10160
10161
10162 // ========================================================================================
10163 // DISCOVER
10164 /*
10165
10166 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10167 to the parent metadata object in the cache (pinning it).
10168
10169 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10170
10171 */
10172
10173 void MDCache::_send_discover(discover_info_t& d)
10174 {
10175 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
10176 d.want_base_dir, d.path_locked);
10177 logger->inc(l_mdc_dir_send_discover);
10178 dis->set_tid(d.tid);
10179 mds->send_message_mds(dis, d.mds);
10180 }
10181
// Discover a base inode (e.g. root or a stray) from `from`.  Only the
// first waiter for a given (rank, ino) actually sends a discover; later
// callers just queue on the existing one.
10182 void MDCache::discover_base_ino(inodeno_t want_ino,
10183 MDSContext *onfinish,
10184 mds_rank_t from)
10185 {
10186 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
10187 if (waiting_for_base_ino[from].count(want_ino) == 0) {
10188 discover_info_t& d = _create_discover(from);
10189 d.ino = want_ino;
10190 _send_discover(d);
10191 }
10192 waiting_for_base_ino[from][want_ino].push_back(onfinish);
10193 }
10194
10195
// Discover a dirfrag of `base` from another rank (the inode's auth when
// from < 0).  A new discover is only sent when nobody is already waiting
// on this frag, or when the caller supplied no waiter; the waiter (if
// any) is always registered on the frag.
10196 void MDCache::discover_dir_frag(CInode *base,
10197 frag_t approx_fg,
10198 MDSContext *onfinish,
10199 mds_rank_t from)
10200 {
10201 if (from < 0)
10202 from = base->authority().first;
10203
10204 dirfrag_t df(base->ino(), approx_fg);
10205 dout(7) << "discover_dir_frag " << df
10206 << " from mds." << from << dendl;
10207
10208 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
10209 discover_info_t& d = _create_discover(from);
10210 d.pin_base(base);
10211 d.ino = base->ino();
10212 d.frag = approx_fg;
10213 d.want_base_dir = true;
10214 _send_discover(d);
10215 }
10216
10217 if (onfinish)
10218 base->add_dir_waiter(approx_fg, onfinish);
10219 }
10220
// Completion context that retries an inode-based discover_path() once the
// base inode's ambiguous authority is resolved.
10221 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10222 CInode *base;
10223 snapid_t snapid;
10224 filepath path;
10225 mds_rank_t from;
10226 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10227 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10228 void finish(int r) override {
10229 mdcache->discover_path(base, snapid, path, 0, from);
10230 }
10231 };
10232
// Discover a path starting at inode `base` (including the base dirfrag).
// Waits out ambiguous auth; becomes a no-op wakeup if we turn out to be
// the auth ourselves.  As elsewhere, a discover is only sent when no one
// is already waiting on the target frag (or no waiter was supplied).
10233 void MDCache::discover_path(CInode *base,
10234 snapid_t snap,
10235 filepath want_path,
10236 MDSContext *onfinish,
10237 bool path_locked,
10238 mds_rank_t from)
10239 {
10240 if (from < 0)
10241 from = base->authority().first;
10242
10243 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
10244 << (path_locked ? " path_locked":"")
10245 << dendl;
10246
10247 if (base->is_ambiguous_auth()) {
10248 dout(10) << " waiting for single auth on " << *base << dendl;
10249 if (!onfinish)
10250 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
10251 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
10252 return;
10253 } else if (from == mds->get_nodeid()) {
// We are the auth: nothing to discover, just wake existing dir waiters.
10254 MDSContext::vec finished;
10255 base->take_waiting(CInode::WAIT_DIR, finished);
10256 mds->queue_waiters(finished);
10257 return;
10258 }
10259
10260 frag_t fg = base->pick_dirfrag(want_path[0]);
// A path-locked single-component discover always goes out, so xlocked
// tails can still be resolved.
10261 if ((path_locked && want_path.depth() == 1) ||
10262 !base->is_waiting_for_dir(fg) || !onfinish) {
10263 discover_info_t& d = _create_discover(from);
10264 d.ino = base->ino();
10265 d.pin_base(base);
10266 d.frag = fg;
10267 d.snap = snap;
10268 d.want_path = want_path;
10269 d.want_base_dir = true;
10270 d.path_locked = path_locked;
10271 _send_discover(d);
10272 }
10273
10274 // register + wait
10275 if (onfinish)
10276 base->add_dir_waiter(fg, onfinish);
10277 }
10278
// Completion context that retries a dirfrag-based discover_path() once the
// base dirfrag's ambiguous authority is resolved.
10279 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
10280 CDir *base;
10281 snapid_t snapid;
10282 filepath path;
10283 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
10284 MDCacheContext(c), base(b), snapid(s), path(p) {}
10285 void finish(int r) override {
10286 mdcache->discover_path(base, snapid, path, 0);
10287 }
10288 };
10289
// Discover a path starting at dirfrag `base` (base dir itself not
// wanted).  Waits out ambiguous auth; otherwise sends a discover unless a
// waiter is already registered on the first dentry, then queues the
// supplied waiter on that dentry.
10290 void MDCache::discover_path(CDir *base,
10291 snapid_t snap,
10292 filepath want_path,
10293 MDSContext *onfinish,
10294 bool path_locked)
10295 {
10296 mds_rank_t from = base->authority().first;
10297
10298 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
10299 << (path_locked ? " path_locked":"")
10300 << dendl;
10301
10302 if (base->is_ambiguous_auth()) {
10303 dout(7) << " waiting for single auth on " << *base << dendl;
10304 if (!onfinish)
10305 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
10306 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
10307 return;
10308 }
10309
// A path-locked single-component discover always goes out, so xlocked
// tails can still be resolved.
10310 if ((path_locked && want_path.depth() == 1) ||
10311 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
10312 discover_info_t& d = _create_discover(from);
10313 d.ino = base->ino();
10314 d.pin_base(base->inode);
10315 d.frag = base->get_frag();
10316 d.snap = snap;
10317 d.want_path = want_path;
10318 d.want_base_dir = false;
10319 d.path_locked = path_locked;
10320 _send_discover(d);
10321 }
10322
10323 // register + wait
10324 if (onfinish)
10325 base->add_dentry_waiter(want_path[0], snap, onfinish);
10326 }
10327
10328 void MDCache::kick_discovers(mds_rank_t who)
10329 {
10330 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10331 p != discovers.end();
10332 ++p) {
10333 if (p->second.mds != who)
10334 continue;
10335 _send_discover(p->second);
10336 }
10337 }
10338
10339
// Serve a discover from another MDS: walk the requested path from the base
// inode, appending (dir, dentry, inode) replicas to the reply trace.  The
// walk stops early on errors (non-dir, purging dentry), auth boundaries
// (with a dir_auth_hint), or frozen/xlocked objects -- in which case we
// either wait (if the reply is still empty) or send the partial trace.
10340 void MDCache::handle_discover(const cref_t<MDiscover> &dis)
10341 {
10342 mds_rank_t whoami = mds->get_nodeid();
10343 mds_rank_t from = mds_rank_t(dis->get_source().num());
10344
10345 ceph_assert(from != whoami);
10346
10347 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10348 if (mds->get_state() < MDSMap::STATE_REJOIN &&
10349 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
10350 return;
10351 }
10352
10353 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10354 // delay processing request from survivor because we may not yet choose lock states.
10355 if (!mds->mdsmap->is_rejoin(from)) {
10356 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
10357 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10358 return;
10359 }
10360 }
10361
10362
10363 CInode *cur = 0;
10364 auto reply = make_message<MDiscoverReply>(*dis);
10365
10366 snapid_t snapid = dis->get_snapid();
10367
10368 logger->inc(l_mdc_dir_handle_discover);
10369
10370 // get started.
10371 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10372 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10373 // wants root
10374 dout(7) << "handle_discover from mds." << from
10375 << " wants base + " << dis->get_want().get_path()
10376 << " snap " << snapid
10377 << dendl;
10378
10379 cur = get_inode(dis->get_base_ino());
10380 ceph_assert(cur);
10381
10382 // add root
10383 reply->starts_with = MDiscoverReply::INODE;
10384 encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
10385 dout(10) << "added base " << *cur << dendl;
10386 }
10387 else {
10388 // there's a base inode
10389 cur = get_inode(dis->get_base_ino(), snapid);
// For a snapped lookup, fall back to the head inode if it is multiversion.
10390 if (!cur && snapid != CEPH_NOSNAP) {
10391 cur = get_inode(dis->get_base_ino());
10392 if (cur && !cur->is_multiversion())
10393 cur = NULL; // nope!
10394 }
10395
10396 if (!cur) {
10397 dout(7) << "handle_discover mds." << from
10398 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10399 << dendl;
10400 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10401 reply->set_error_dentry(dis->get_dentry(0));
10402 reply->set_flag_error_dir();
10403 } else if (dis->wants_base_dir()) {
10404 dout(7) << "handle_discover mds." << from
10405 << " wants basedir+" << dis->get_want().get_path()
10406 << " has " << *cur
10407 << dendl;
10408 } else {
10409 dout(7) << "handle_discover mds." << from
10410 << " wants " << dis->get_want().get_path()
10411 << " has " << *cur
10412 << dendl;
10413 }
10414 }
10415
10416 ceph_assert(reply);
10417
10418 // add content
10419 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10420 for (unsigned i = 0;
10421 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10422 i++) {
10423
10424 // -- figure out the dir
10425
10426 // is *cur even a dir at all?
10427 if (!cur->is_dir()) {
10428 dout(7) << *cur << " not a dir" << dendl;
10429 reply->set_flag_error_dir();
10430 break;
10431 }
10432
10433 // pick frag
10434 frag_t fg;
10435 if (dis->get_want().depth()) {
10436 // dentry specifies
10437 fg = cur->pick_dirfrag(dis->get_dentry(i));
10438 } else {
10439 // requester explicitly specified the frag
10440 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
10441 fg = dis->get_base_dir_frag();
10442 if (!cur->dirfragtree.is_leaf(fg))
10443 fg = cur->dirfragtree[fg.value()];
10444 }
10445 CDir *curdir = cur->get_dirfrag(fg);
10446
// Not our dirfrag: hand back an auth hint (only on an empty reply) and stop.
10447 if ((!curdir && !cur->is_auth()) ||
10448 (curdir && !curdir->is_auth())) {
10449
10450 /* before:
10451 * ONLY set flag if empty!!
10452 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10453 * resulting in duplicate discovers in flight,
10454 * which can wreak havoc when discovering rename srcdn (which may move)
10455 */
10456
10457 if (reply->is_empty()) {
10458 // only hint if empty.
10459 // someday this could be better, but right now the waiter logic isn't smart enough.
10460
10461 // hint
10462 if (curdir) {
10463 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10464 reply->set_dir_auth_hint(curdir->authority().first);
10465 } else {
10466 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10467 << *cur << dendl;
10468 reply->set_dir_auth_hint(cur->authority().first);
10469 }
10470
10471 // note error dentry, if any
10472 // NOTE: important, as it allows requester to issue an equivalent discover
10473 // to whomever we hint at.
10474 if (dis->get_want().depth() > i)
10475 reply->set_error_dentry(dis->get_dentry(i));
10476 }
10477
10478 break;
10479 }
10480
10481 if (!curdir) { // open dir?
10482 if (cur->is_frozen()) {
10483 if (!reply->is_empty()) {
10484 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10485 break;
10486 }
10487 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10488 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10489 return;
10490 }
10491 curdir = cur->get_or_open_dirfrag(this, fg);
10492 } else if (curdir->is_frozen_tree() ||
10493 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
10494 if (!reply->is_empty()) {
10495 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10496 break;
10497 }
10498 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10499 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10500 reply->set_flag_error_dir();
10501 break;
10502 }
10503 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10504 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10505 return;
10506 }
10507
10508 // add dir
10509 if (curdir->get_version() == 0) {
10510 // fetch newly opened dir
10511 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10512 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10513 // make sure the base frag is correct, though, in case there was a refragment since the
10514 // original request was sent.
10515 reply->set_base_dir_frag(curdir->get_frag());
10516 } else {
10517 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
10518 if (!reply->trace.length())
10519 reply->starts_with = MDiscoverReply::DIR;
10520 encode_replica_dir(curdir, from, reply->trace);
10521 dout(7) << "handle_discover added dir " << *curdir << dendl;
10522 }
10523
10524 // lookup
10525 CDentry *dn = 0;
10526 std::string_view dname;
10527 if (dis->get_want().depth() > 0)
10528 dname = dis->get_dentry(i);
10529 if (curdir->get_version() == 0) {
10530 // fetch newly opened dir
10531 ceph_assert(!curdir->has_bloom());
10532 } else if (dname.size() > 0) {
10533 // lookup dentry
10534 dn = curdir->lookup(dname, snapid);
10535 } else
10536 break; // done!
10537
10538 // incomplete dir?
10539 if (!dn) {
// The dentry may just not be loaded yet; fetch unless the bloom filter
// proves it does not exist.
10540 if (!curdir->is_complete() &&
10541 !(dname.size() > 0 &&
10542 snapid == CEPH_NOSNAP &&
10543 curdir->has_bloom() &&
10544 !curdir->is_in_bloom(dname))) {
10545 // readdir
10546 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10547 if (reply->is_empty()) {
10548 // fetch and wait
10549 curdir->fetch(dname, snapid, new C_MDS_RetryMessage(mds, dis),
10550 dis->wants_base_dir() && curdir->get_version() == 0);
10551 return;
10552 } else {
10553 // initiate fetch, but send what we have so far
10554 curdir->fetch(dname, snapid, nullptr);
10555 break;
10556 }
10557 }
10558
10559 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10560 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10561 << " dne, non-empty reply, stopping" << dendl;
10562 break;
10563 }
10564
10565 // send null dentry
10566 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10567 << *curdir << dendl;
10568 if (snapid == CEPH_NOSNAP)
10569 dn = curdir->add_null_dentry(dis->get_dentry(i));
10570 else
10571 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
10572 }
10573 ceph_assert(dn);
10574
10575 // don't add replica to purging dentry/inode
10576 if (dn->state_test(CDentry::STATE_PURGING)) {
10577 if (reply->is_empty())
10578 reply->set_flag_error_dn(dis->get_dentry(i));
10579 break;
10580 }
10581
10582 CDentry::linkage_t *dnl = dn->get_linkage();
10583
10584 // xlocked dentry?
10585 // ...always block on non-tail items (they are unrelated)
10586 // ...allow xlocked tail discovery _only_ if explicitly requested
10587 if (dn->lock.is_xlocked()) {
10588 // is this the last (tail) item in the discover traversal?
10589 if (dis->is_path_locked()) {
10590 dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
10591 } else if (reply->is_empty()) {
10592 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10593 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
10594 return;
10595 } else {
10596 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10597 break;
10598 }
10599 }
10600
10601 // frozen inode?
10602 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
10603 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10604 if (tailitem && dis->is_path_locked()) {
10605 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10606 } else if (reply->is_empty()) {
10607 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10608 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10609 return;
10610 } else {
10611 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10612 break;
10613 }
10614 }
10615
10616 // add dentry
10617 if (!reply->trace.length())
10618 reply->starts_with = MDiscoverReply::DENTRY;
10619 encode_replica_dentry(dn, from, reply->trace);
10620 dout(7) << "handle_discover added dentry " << *dn << dendl;
10621
10622 if (!dnl->is_primary()) break; // stop on null or remote link.
10623
10624 // add inode
10625 CInode *next = dnl->get_inode();
10626 ceph_assert(next->is_auth());
10627
10628 encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10629 dout(7) << "handle_discover added inode " << *next << dendl;
10630
10631 // descend, keep going.
10632 cur = next;
10633 continue;
10634 }
10635
10636 // how did we do?
10637 ceph_assert(!reply->is_empty());
10638 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10639 mds->send_message(reply, dis->get_connection());
10640 }
10641
10642 void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
10643 {
10644 /*
10645 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10646 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10647 return;
10648 }
10649 */
10650 dout(7) << "discover_reply " << *m << dendl;
10651 if (m->is_flag_error_dir())
10652 dout(7) << " flag error, dir" << dendl;
10653 if (m->is_flag_error_dn())
10654 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10655
10656 MDSContext::vec finished, error;
10657 mds_rank_t from = mds_rank_t(m->get_source().num());
10658
10659 // starting point
10660 CInode *cur = get_inode(m->get_base_ino());
10661 auto p = m->trace.cbegin();
10662
10663 int next = m->starts_with;
10664
10665 // decrement discover counters
10666 if (m->get_tid()) {
10667 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10668 if (p != discovers.end()) {
10669 dout(10) << " found tid " << m->get_tid() << dendl;
10670 discovers.erase(p);
10671 } else {
10672 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10673 }
10674 }
10675
10676 // discover may start with an inode
10677 if (!p.end() && next == MDiscoverReply::INODE) {
10678 decode_replica_inode(cur, p, NULL, finished);
10679 dout(7) << "discover_reply got base inode " << *cur << dendl;
10680 ceph_assert(cur->is_base());
10681
10682 next = MDiscoverReply::DIR;
10683
10684 // take waiters?
10685 if (cur->is_base() &&
10686 waiting_for_base_ino[from].count(cur->ino())) {
10687 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10688 waiting_for_base_ino[from].erase(cur->ino());
10689 }
10690 }
10691 ceph_assert(cur);
10692
10693 // loop over discover results.
10694 // indexes follow each ([[dir] dentry] inode)
10695 // can start, end with any type.
10696 while (!p.end()) {
10697 // dir
10698 frag_t fg;
10699 CDir *curdir = nullptr;
10700 if (next == MDiscoverReply::DIR) {
10701 decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
10702 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10703 ceph_assert(m->get_wanted_base_dir());
10704 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10705 }
10706 } else {
10707 // note: this can only happen our first way around this loop.
10708 if (p.end() && m->is_flag_error_dn()) {
10709 fg = cur->pick_dirfrag(m->get_error_dentry());
10710 curdir = cur->get_dirfrag(fg);
10711 } else
10712 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10713 }
10714
10715 if (p.end())
10716 break;
10717
10718 // dentry
10719 CDentry *dn = nullptr;
10720 decode_replica_dentry(dn, p, curdir, finished);
10721
10722 if (p.end())
10723 break;
10724
10725 // inode
10726 decode_replica_inode(cur, p, dn, finished);
10727
10728 next = MDiscoverReply::DIR;
10729 }
10730
10731 // dir error?
10732 // or dir_auth hint?
10733 if (m->is_flag_error_dir() && !cur->is_dir()) {
10734 // not a dir.
10735 cur->take_waiting(CInode::WAIT_DIR, error);
10736 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10737 mds_rank_t who = m->get_dir_auth_hint();
10738 if (who == mds->get_nodeid()) who = -1;
10739 if (who >= 0)
10740 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10741
10742
10743 if (m->get_wanted_base_dir()) {
10744 frag_t fg = m->get_base_dir_frag();
10745 CDir *dir = cur->get_dirfrag(fg);
10746
10747 if (cur->is_waiting_for_dir(fg)) {
10748 if (cur->is_auth())
10749 cur->take_waiting(CInode::WAIT_DIR, finished);
10750 else if (dir || !cur->dirfragtree.is_leaf(fg))
10751 cur->take_dir_waiting(fg, finished);
10752 else
10753 discover_dir_frag(cur, fg, 0, who);
10754 } else
10755 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10756 }
10757
10758 // try again?
10759 if (m->get_error_dentry().length()) {
10760 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10761 CDir *dir = cur->get_dirfrag(fg);
10762 // wanted a dentry
10763 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10764 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10765 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10766 m->get_wanted_snapid(), finished);
10767 } else {
10768 filepath relpath(m->get_error_dentry(), 0);
10769 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
10770 }
10771 } else
10772 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10773 << m->get_error_dentry() << dendl;
10774 }
10775 } else if (m->is_flag_error_dn()) {
10776 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10777 CDir *dir = cur->get_dirfrag(fg);
10778 if (dir && !dir->is_auth()) {
10779 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10780 m->get_wanted_snapid(), error);
10781 }
10782 }
10783
10784 // waiters
10785 finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT); // finish errors directly
10786 mds->queue_waiters(finished);
10787 }
10788
10789
10790
10791 // ----------------------------
10792 // REPLICAS
10793
10794
// Serialize a replica of 'dir' for MDS rank 'to'.
// NOTE: add_replica() registers 'to' as a replica and returns the nonce,
// so calling this commits us to actually sending the message.  The decode
// counterpart is decode_replica_dir(); field order here is the wire format.
void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  dirfrag_t df = dir->dirfrag();
  encode(df, bl);
  __u32 nonce = dir->add_replica(to);  // side effect: records 'to' as a replica
  encode(nonce, bl);
  dir->_encode_base(bl);
  ENCODE_FINISH(bl);
}
10805
// Serialize a replica of dentry 'dn' for MDS rank 'to'.
// Wire format v2 (v1 compat); v2 appended alternate_name.  Field order must
// match decode_replica_dentry().  add_replica() registers 'to' as a replica.
void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(2, 1, bl);
  encode(dn->get_name(), bl);
  encode(dn->last, bl);

  __u32 nonce = dn->add_replica(to);  // side effect: records 'to' as a replica
  encode(nonce, bl);
  encode(dn->first, bl);
  // remote linkage (ino 0 if not a remote link)
  encode(dn->linkage.remote_ino, bl);
  encode(dn->linkage.remote_d_type, bl);
  dn->lock.encode_state_for_replica(bl);
  // if we are not yet active, our lock state may still need recovery on the
  // replica side; tell it so it can mark the lock accordingly
  bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
  encode(need_recover, bl);
  encode(dn->alternate_name, bl);  // v2
  ENCODE_FINISH(bl);
}
10823
// Serialize a replica of inode 'in' for MDS rank 'to'.
// Wire format v2 (v1 compat); v2 appended the replicated state bits.
// Only the auth MDS may hand out replicas.  Field order must match
// decode_replica_inode().
void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
			      uint64_t features)
{
  ceph_assert(in->is_auth());

  ENCODE_START(2, 1, bl);
  encode(in->ino(), bl);  // bleh, minor assymetry here
  encode(in->last, bl);

  __u32 nonce = in->add_replica(to);  // side effect: records 'to' as a replica
  encode(nonce, bl);

  in->_encode_base(bl, features);
  // pass need_recover=true while we are not yet active (rejoin/recovery)
  in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);

  // v2: state bits; the decoder masks with MASK_STATE_REPLICATED
  __u32 state = in->state;
  encode(state, bl);

  ENCODE_FINISH(bl);
}
10844
// Decode a dirfrag replica encoded by encode_replica_dir().
// On return 'dir' points at the (possibly newly created) replica dirfrag
// under 'diri'; any contexts waiting for this dirfrag are moved to
// 'finished' (caller queues them).  'from' is the sending (auth) rank.
void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
				 MDSContext::vec& finished)
{
  DECODE_START(1, p);
  dirfrag_t df;
  decode(df, p);

  ceph_assert(diri->ino() == df.ino);

  // add it (_replica_)
  dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    // (we may have a stale fragtree if the auth refragmented while we were
    // discovering; the auth's view of df.frag wins)
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
	      << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }
    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    // is this a dir_auth delegation boundary?
    if (from != diri->authority().first ||
	diri->is_ambiguous_auth() ||
	diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }
  DECODE_FINISH(p);
}
10889
// Decode a dentry replica encoded by encode_replica_dentry().
// On return 'dn' points at the (possibly newly created) replica dentry in
// 'dir'; contexts waiting for this dentry are moved to 'finished'.
void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
  DECODE_START(1, p);
  string name;
  snapid_t last;
  decode(name, p);
  decode(last, p);

  dn = dir->lookup(name, last);

  // have it?
  bool is_new = false;
  if (dn) {
    is_new = false;
    dout(7) << __func__ << " had " << *dn << dendl;
  } else {
    is_new = true;
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dout(7) << __func__ << " added " << *dn << dendl;
  }

  __u32 nonce;
  decode(nonce, p);
  dn->set_replica_nonce(nonce);
  decode(dn->first, p);  // the real 'first', replacing the placeholder above

  // remote linkage info; rino == 0 means the dentry is not a remote link
  inodeno_t rino;
  unsigned char rdtype;
  decode(rino, p);
  decode(rdtype, p);
  dn->lock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);

  // v2 appended alternate_name (struct_v comes from DECODE_START)
  mempool::mds_co::string alternate_name;
  if (struct_v >= 2) {
    decode(alternate_name, p);
  }

  if (is_new) {
    dn->set_alternate_name(std::move(alternate_name));
    if (rino)
      dir->link_remote_inode(dn, rino, rdtype);
    if (need_recover)
      dn->lock.mark_need_recover();
  } else {
    // existing replica must agree with the auth about alternate_name
    ceph_assert(dn->alternate_name == alternate_name);
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);
  DECODE_FINISH(p);
}
10943
// Decode an inode replica encoded by encode_replica_inode().
// On return 'in' points at the (possibly newly created) replica inode; if a
// 'dn' is given and the inode is new, the dentry is linked as its primary.
void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
  DECODE_START(2, p);
  inodeno_t ino;
  snapid_t last;
  __u32 nonce;
  decode(ino, p);
  decode(last, p);
  decode(nonce, p);
  in = get_inode(ino, last);
  if (!in) {
    // new replica; first=2 matches the CInode constructor default
    // (presumably because lower snapids are reserved -- see CInode)
    in = new CInode(this, false, 2, last);
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, true);
    add_inode(in);
    // base inodes have well-known auth ranks
    if (in->ino() == CEPH_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << __func__ << " added " << *in << dendl;
    if (dn) {
      ceph_assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    // already had it; refresh nonce/base/lock state
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, false);
    dout(10) << __func__ << " had " << *in << dendl;
  }

  if (dn) {
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
  }

  // v2 appended replicated state bits
  if (struct_v >= 2) {
    __u32 s;
    decode(s, p);
    s &= CInode::MASK_STATE_REPLICATED;
    if (s & CInode::STATE_RANDEPHEMERALPIN) {
      dout(10) << "replica inode is random ephemeral pinned" << dendl;
      in->set_ephemeral_pin(false, true);
    }
  }

  DECODE_FINISH(p);
}
10993
10994
// Serialize everything rank 'who' needs to replicate stray dentry 'straydn':
// the full ancestry chain from our mdsdir inode down to the stray dentry
// itself.  Order matters and must match decode_replica_stray():
//   mdsdir inode -> mdsdir dirfrag -> straydir dentry -> straydir inode
//   -> stray dirfrag -> stray dentry [-> linked inode, v2].
void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  ceph_assert(straydn->get_num_auth_pins());
  ENCODE_START(2, 1, bl);
  uint64_t features = mds->mdsmap->get_up_features();
  encode_replica_inode(get_myin(), who, bl, features);
  encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
  encode_replica_dir(straydn->get_dir(), who, bl);
  encode_replica_dentry(straydn, who, bl);
  // v2: the inode the stray dentry points at, if any
  if (!straydn->get_projected_linkage()->is_null()) {
    encode_replica_inode(straydn->get_projected_linkage()->get_inode(), who, bl, features);
  }
  ENCODE_FINISH(bl);
}
11011
// Decode the chain produced by encode_replica_stray(), instantiating replica
// objects top-down (mdsdir inode, mdsdir frag, straydir dentry, straydir
// inode, stray frag, stray dentry, and -- v2, if requested via 'in' -- the
// linked inode).  Waiters collected along the way are queued before return.
void MDCache::decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from)
{
  MDSContext::vec finished;
  auto p = bl.cbegin();

  DECODE_START(2, p);
  CInode *mdsin = nullptr;
  decode_replica_inode(mdsin, p, NULL, finished);
  CDir *mdsdir = nullptr;
  decode_replica_dir(mdsdir, p, mdsin, from, finished);
  CDentry *straydirdn = nullptr;
  decode_replica_dentry(straydirdn, p, mdsdir, finished);
  CInode *strayin = nullptr;
  decode_replica_inode(strayin, p, straydirdn, finished);
  CDir *straydir = nullptr;
  decode_replica_dir(straydir, p, strayin, from, finished);

  decode_replica_dentry(straydn, p, straydir, finished);
  if (struct_v >= 2 && in) {
    decode_replica_inode(*in, p, straydn, finished);
  }
  if (!finished.empty())
    mds->queue_waiters(finished);
  DECODE_FINISH(p);
}
11037
11038
11039 int MDCache::send_dir_updates(CDir *dir, bool bcast)
11040 {
11041 // this is an FYI, re: replication
11042
11043 set<mds_rank_t> who;
11044 if (bcast) {
11045 set<mds_rank_t> mds_set;
11046 mds->get_mds_map()->get_active_mds_set(mds_set);
11047
11048 set<mds_rank_t> replica_set;
11049 for (const auto &p : dir->get_replicas()) {
11050 replica_set.insert(p.first);
11051 }
11052
11053 std::set_difference(mds_set.begin(), mds_set.end(),
11054 replica_set.begin(), replica_set.end(),
11055 std::inserter(who, who.end()));
11056 } else {
11057 for (const auto &p : dir->get_replicas()) {
11058 who.insert(p.first);
11059 }
11060 }
11061
11062 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
11063
11064 filepath path;
11065 dir->inode->make_path(path);
11066
11067 std::set<int32_t> dir_rep_set;
11068 for (const auto &r : dir->dir_rep_by) {
11069 dir_rep_set.insert(r);
11070 }
11071
11072 mds_rank_t whoami = mds->get_nodeid();
11073 for (set<mds_rank_t>::iterator it = who.begin();
11074 it != who.end();
11075 ++it) {
11076 if (*it == whoami) continue;
11077 //if (*it == except) continue;
11078 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
11079
11080 logger->inc(l_mdc_dir_update);
11081 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, dir_rep_set, path, bcast), *it);
11082 }
11083
11084 return 0;
11085 }
11086
// Handle an MDirUpdate (replication FYI) from another rank.  If we do not
// have the dirfrag we may discover it (once) and retry the message; if we
// do, refresh our copy of its replication info.
void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
{
  dirfrag_t df = m->get_dirfrag();
  CDir *dir = get_dirfrag(df);
  logger->inc(l_mdc_dir_update_receipt);
  if (!dir) {
    dout(5) << "dir_update on " << df << ", don't have it" << dendl;

    // discover it?
    if (m->should_discover()) {
      // only try once!
      // this is key to avoid a fragtree update race, among other things.
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      logger->inc(l_mdc_dir_try_discover);
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
      if (r > 0)
	return;  // traversal in progress; message will be retried
      if (r == 0 &&
	  in->ino() == df.ino &&
	  in->get_approx_dirfrag(df.frag) == NULL) {
	// have the inode but not the frag: open it, then retry this message
	open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
	return;
      }
    }

    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }
}
11131
11132
11133
11134
11135
11136 // LINK
11137
// Serialize a remote-link payload (target ino + d_type) for a dentry link
// message.  Decoded by decode_remote_dentry_link().
void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  inodeno_t ino = dnl->get_remote_ino();
  encode(ino, bl);
  __u8 d_type = dnl->get_remote_d_type();
  encode(d_type, bl);
  ENCODE_FINISH(bl);
}
11147
// Decode a remote-link payload (see encode_remote_dentry_link) and link
// 'dn' in 'dir' as a remote dentry pointing at the decoded ino.
void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
{
  DECODE_START(1, p);
  inodeno_t ino;
  __u8 d_type;
  decode(ino, p);
  decode(d_type, p);
  dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
  dir->link_remote_inode(dn, ino, d_type);
  DECODE_FINISH(p);
}
11159
// Tell replicas of 'dn' that it is now linked (primary or remote).
// Skips rename witnesses (they already know) and ranks not far enough
// through rejoin to process the message.
void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
{
  dout(7) << __func__ << " " << *dn << dendl;

  CDir *subtree = get_subtree_root(dn->get_dir());
  for (const auto &p : dn->get_replicas()) {
    // don't tell (rename) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(p.first))
      continue;
    // skip ranks that have not yet rejoined (or are mid-rejoin-gather)
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
	(mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
	 rejoin_gather.count(p.first)))
      continue;
    CDentry::linkage_t *dnl = dn->get_linkage();
    auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
    if (dnl->is_primary()) {
      // primary link: ship a full inode replica
      dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
      encode_replica_inode(dnl->get_inode(), p.first, m->bl,
			   mds->mdsmap->get_up_features());
    } else if (dnl->is_remote()) {
      // remote link: just ino + d_type
      encode_remote_dentry_link(dnl, m->bl);
    } else
      ceph_abort(); // aie, bad caller!
    mds->send_message_mds(m, p.first);
  }
}
11186
// Handle an MDentryLink from the auth: link our replica dentry (which must
// exist and be null) either primary (decode the inode replica) or remote.
void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
{
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // we must be a replica, and the dentry must still be null here
      ceph_assert(!dn->is_auth());
      ceph_assert(dnl->is_null());
    }
  }

  auto p = m->bl.cbegin();
  MDSContext::vec finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      CInode *in = nullptr;
      decode_replica_inode(in, p, dn, finished);
    } else {
      // remote link, easy enough.
      decode_remote_dentry_link(dir, dn, p);
    }
  } else {
    // NOTE: the auth should not be sending us links for dentries we don't
    // replicate, so a miss above is fatal
    ceph_abort();
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  return;
}
11226
11227
11228 // UNLINK
11229
11230 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn,
11231 MDRequestRef& mdr, bool unlinking)
11232 {
11233 dout(10) << __func__ << " " << *dn << dendl;
11234 // share unlink news with replicas
11235 set<mds_rank_t> replicas;
11236 dn->list_replicas(replicas);
11237 bufferlist snapbl;
11238 if (straydn) {
11239 straydn->list_replicas(replicas);
11240 CInode *strayin = straydn->get_linkage()->get_inode();
11241 strayin->encode_snap_blob(snapbl);
11242 }
11243
11244 if (unlinking) {
11245 ceph_assert(!straydn);
11246 dn->replica_unlinking_ref = 0;
11247 }
11248 for (set<mds_rank_t>::iterator it = replicas.begin();
11249 it != replicas.end();
11250 ++it) {
11251 // don't tell (rmdir) witnesses; they already know
11252 if (mdr.get() && mdr->more()->witnessed.count(*it))
11253 continue;
11254
11255 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11256 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11257 rejoin_gather.count(*it)))
11258 continue;
11259
11260 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(),
11261 dn->get_name(), unlinking);
11262 if (straydn) {
11263 encode_replica_stray(straydn, *it, unlink->straybl);
11264 unlink->snapbl = snapbl;
11265 }
11266 mds->send_message_mds(unlink, *it);
11267 if (unlinking) {
11268 dn->replica_unlinking_ref++;
11269 dn->get(CDentry::PIN_WAITUNLINKSTATE);
11270 }
11271 }
11272
11273 if (unlinking && dn->replica_unlinking_ref) {
11274 dn->add_waiter(CDentry::WAIT_UNLINK_STATE, new C_MDS_RetryRequest(this, mdr));
11275 }
11276 }
11277
// Handle an MDentryUnlink from the auth.  Two modes:
//  - unlinking==true: first phase of a two-phase unlink; just mark the
//    replica dentry UNLINKING and ack.
//  - otherwise: actually unlink our replica dentry, moving a primary inode
//    to the provided stray and updating snaprealm/subtree/caps state.
void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
{
  // straydn
  CDentry *straydn = nullptr;
  CInode *strayin = nullptr;

  if (m->straybl.length())
    decode_replica_stray(straydn, &strayin, m->straybl, mds_rank_t(m->get_source().num()));

  // declared up front because of the 'goto ack' paths below
  boost::intrusive_ptr<MDentryUnlinkAck> ack;
  CDentry::linkage_t *dnl;
  CDentry *dn;
  CInode *in;
  bool hadrealm;

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
    if (m->is_unlinking())
      goto ack;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
      if (m->is_unlinking())
	goto ack;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;

      if (m->is_unlinking()) {
	// phase one: just record that the unlink is in flight
	dn->state_set(CDentry::STATE_UNLINKING);
	goto ack;
      }

      dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
	in = dnl->get_inode();
	dn->dir->unlink_inode(dn);
	ceph_assert(straydn);
	straydn->dir->link_primary_inode(straydn, in);

	// in->first is lazily updated on replica; drag it forward so
	// that we always keep it in sync with the dnq
	ceph_assert(straydn->first >= in->first);
	in->first = straydn->first;

	// update subtree map?
	if (in->is_dir()) {
	  adjust_subtree_after_rename(in, dir, false);
	}

	if (m->snapbl.length()) {
	  hadrealm = (in->snaprealm ? true : false);
	  in->decode_snap_blob(m->snapbl);
	  ceph_assert(in->snaprealm);
	  if (!hadrealm)
	    do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
	}

	// send caps to auth (if we're not already)
	if (in->is_any_caps() &&
	    !in->state_test(CInode::STATE_EXPORTINGCAPS))
	  migrator->export_caps(in);

	straydn = NULL;  // consumed; don't trim it below
      } else {
	ceph_assert(!straydn);
	ceph_assert(dnl->is_remote());
	dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
      dn->state_clear(CDentry::STATE_UNLINKING);

      // wake anyone who was waiting for the unlink to finish
      MDSContext::vec finished;
      dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished);
      mds->queue_waiters(finished);

    }
  }

  // race with trim_dentry()
  // (an unused straydn replica we just instantiated can be dropped again)
  if (straydn) {
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
  return;

 ack:
  ack = make_message<MDentryUnlinkAck>(m->get_dirfrag(), m->get_dn());
  mds->send_message(ack, m->get_connection());
}
11374
11375 void MDCache::handle_dentry_unlink_ack(const cref_t<MDentryUnlinkAck> &m)
11376 {
11377 CDir *dir = get_dirfrag(m->get_dirfrag());
11378 if (!dir) {
11379 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
11380 } else {
11381 CDentry *dn = dir->lookup(m->get_dn());
11382 if (!dn) {
11383 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
11384 } else {
11385 dout(7) << __func__ << " on " << *dn << " ref "
11386 << dn->replica_unlinking_ref << " -> "
11387 << dn->replica_unlinking_ref - 1 << dendl;
11388 dn->replica_unlinking_ref--;
11389 if (!dn->replica_unlinking_ref) {
11390 MDSContext::vec finished;
11391 dn->take_waiting(CDentry::WAIT_UNLINK_STATE, finished);
11392 mds->queue_waiters(finished);
11393 }
11394 dn->put(CDentry::PIN_WAITUNLINKSTATE);
11395 }
11396 }
11397 }
11398
11399
11400
11401
11402
11403 // ===================================================================
11404
11405
11406
11407 // ===================================================================
11408 // FRAGMENT
11409
11410
11411 /**
11412 * adjust_dir_fragments -- adjust fragmentation for a directory
11413 *
11414 * @param diri directory inode
11415 * @param basefrag base fragment
11416 * @param bits bit adjustment. positive for split, negative for merge.
11417 */
11418 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11419 std::vector<CDir*>* resultfrags,
11420 MDSContext::vec& waiters,
11421 bool replay)
11422 {
11423 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11424 << " on " << *diri << dendl;
11425
11426 auto&& p = diri->get_dirfrags_under(basefrag);
11427
11428 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11429 }
11430
// Ensure a CDir object exists for fragment 'fg' of 'diri', creating it by
// splitting an ancestor frag or merging descendant frags as needed.
// May return nullptr if neither a parent nor any children of 'fg' exist.
CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;  // already have it

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  std::vector<CDir*> src, result;
  MDSContext::vec waiters;

  // split a parent?
  // walk up the fragtree from fg looking for an instantiated ancestor frag
  // we can split down to fg
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
	dout(10) << "force_dir_fragment result " << *dir << dendl;
	break;
      }
    }
    if (parent == frag_t())
      break;  // reached the root frag without success
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
    // no splittable ancestor: try merging whatever frags exist under fg
    {
      auto&& p = diri->get_dirfrags_under(fg);
      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
    }
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}
11483
// Core split/merge of dirfrags under 'diri' at 'basefrag'.
//  bits > 0: split the single srcfrag into 2^bits children.
//  bits <= 0: merge srcfrags into one dirfrag at basefrag.
// Updates diri's fragtree and keeps the subtree map consistent when the
// affected frags are subtree roots.  Resulting frags go to *resultfrags;
// contexts displaced by the operation go to 'waiters'.
void MDCache::adjust_dir_fragments(CInode *diri,
				   const std::vector<CDir*>& srcfrags,
				   frag_t basefrag, int bits,
				   std::vector<CDir*>* resultfrags,
				   MDSContext::vec& waiters,
				   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
	   << " srcfrags " << srcfrags
	   << " on " << *diri << dendl;

  // adjust fragtree
  // yuck. we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (const auto& dir : *resultfrags) {
	subtrees[dir].clear();   // new frag is now its own subtree
      }

      // was i a bound?
      // if so, replace the old frag with the new frags in the parent's bounds
      if (parent_subtree) {
	ceph_assert(subtrees[parent_subtree].count(dir));
	subtrees[parent_subtree].erase(dir);
	for (const auto& dir : *resultfrags) {
	  ceph_assert(dir->is_subtree_root());
	  subtrees[parent_subtree].insert(dir);
	}
      }

      // adjust my bounds.
      // re-home each of the old frag's bounds under whichever new frag
      // is now its subtree root
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
	   p != bounds.end();
	   ++p) {
	CDir *frag = get_subtree_root((*p)->get_parent_dir());
	subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees?  if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
	any_subtree = true;
      else
	any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree)  {
      for (const auto& dir : srcfrags) {
	// this simplifies the code that find subtrees underneath the dirfrag
	if (!dir->is_subtree_root()) {
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	  adjust_subtree_auth(dir, mds->get_nodeid());
	}
      }

      // collect all bounds from the source subtrees; they become the
      // merged frag's bounds
      for (const auto& dir : srcfrags) {
	ceph_assert(dir->is_subtree_root());
	dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
	map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
	set<CDir*>::iterator r = q->second.begin();
	while (r != subtrees[dir].end()) {
	  new_bounds.insert(*r);
	  subtrees[dir].erase(r++);
	}
	subtrees.erase(q);

	// remove myself as my parent's bound
	if (parent_subtree)
	  subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
	subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags->push_back(f);
  }
}
11611
11612
// Completion fired once all dirfrags for a fragment operation are frozen;
// resumes the operation via MDCache::fragment_frozen().
class C_MDC_FragmentFrozen : public MDSInternalContext {
  MDCache *mdcache;
  MDRequestRef mdr;  // the internal FRAGMENTDIR request being driven
public:
  C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
    MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_frozen(mdr, r);
  }
};
11623
11624 bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
11625 {
11626 if (is_readonly()) {
11627 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11628 return false;
11629 }
11630 if (mds->is_cluster_degraded()) {
11631 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11632 return false;
11633 }
11634 if (diri->get_parent_dir() &&
11635 diri->get_parent_dir()->get_inode()->is_stray()) {
11636 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11637 return false;
11638 }
11639 if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
11640 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
11641 return false;
11642 }
11643
11644 for (const auto& dir : dirs) {
11645 if (dir->scrub_is_in_progress()) {
11646 dout(7) << "can_fragment: scrub in progress " << *dir << dendl;
11647 return false;
11648 }
11649
11650 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11651 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11652 return false;
11653 }
11654 if (!dir->is_auth()) {
11655 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11656 return false;
11657 }
11658 if (dir->is_bad()) {
11659 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11660 return false;
11661 }
11662 if (dir->is_frozen() ||
11663 dir->is_freezing()) {
11664 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11665 return false;
11666 }
11667 }
11668
11669 return true;
11670 }
11671
11672 void MDCache::split_dir(CDir *dir, int bits)
11673 {
11674 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11675 ceph_assert(dir->is_auth());
11676 CInode *diri = dir->inode;
11677
11678 std::vector<CDir*> dirs;
11679 dirs.push_back(dir);
11680
11681 if (!can_fragment(diri, dirs)) {
11682 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11683 return;
11684 }
11685
11686 if (dir->frag.bits() + bits > 24) {
11687 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11688 return;
11689 }
11690
11691 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11692 mdr->more()->fragment_base = dir->dirfrag();
11693
11694 ceph_assert(fragments.count(dir->dirfrag()) == 0);
11695 fragment_info_t& info = fragments[dir->dirfrag()];
11696 info.mdr = mdr;
11697 info.dirs.push_back(dir);
11698 info.bits = bits;
11699 info.last_cum_auth_pins_change = ceph_clock_now();
11700
11701 fragment_freeze_dirs(dirs);
11702 // initial mark+complete pass
11703 fragment_mark_and_complete(mdr);
11704 }
11705
// Kick off a merge of all dirfrags of 'diri' under 'frag' into a single
// frag.  Mirrors split_dir(): registers an internal FRAGMENTDIR request and
// fragment_info_t (with negative bits), freezes the frags, and starts the
// mark+complete pass.  Silently drops when merging is not possible.
void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  // we can only merge if every constituent frag is instantiated locally
  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
  if (!all) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  // how many bits we are merging away (negative bits == merge)
  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  // track the in-flight operation; only one per base dirfrag
  ceph_assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11743
11744 void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
11745 {
11746 bool any_subtree = false, any_non_subtree = false;
11747 for (const auto& dir : dirs) {
11748 dir->auth_pin(dir); // until we mark and complete them
11749 dir->state_set(CDir::STATE_FRAGMENTING);
11750 dir->freeze_dir();
11751 ceph_assert(dir->is_freezing_dir());
11752
11753 if (dir->is_subtree_root())
11754 any_subtree = true;
11755 else
11756 any_non_subtree = true;
11757 }
11758
11759 if (any_subtree && any_non_subtree) {
11760 // either all dirfrags are subtree roots or all are not.
11761 for (const auto& dir : dirs) {
11762 if (dir->is_subtree_root()) {
11763 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11764 } else {
11765 dir->state_set(CDir::STATE_AUXSUBTREE);
11766 adjust_subtree_auth(dir, mds->get_nodeid());
11767 }
11768 }
11769 }
11770 }
11771
11772 class C_MDC_FragmentMarking : public MDCacheContext {
11773 MDRequestRef mdr;
11774 public:
11775 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11776 void finish(int r) override {
11777 mdcache->fragment_mark_and_complete(mdr);
11778 }
11779 };
11780
// Second phase of a split/merge: make every participating dirfrag complete
// in memory, then pin all of its dentries (PIN_FRAGMENTING) so they cannot
// be trimmed while the operation is in flight.  Re-entered (via
// C_MDC_FragmentMarking) until all dirfrags are ready, then waits for the
// freeze to finish and proceeds to fragment_frozen().
11781 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11782 {
11783 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11784 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
// the op may have been cancelled (e.g. by find_stale_fragment_freeze())
11785 if (it == fragments.end() || it->second.mdr != mdr) {
11786 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11787 request_finish(mdr);
11788 return;
11789 }
11790 
11791 fragment_info_t& info = it->second;
11792 CInode *diri = info.dirs.front()->get_inode();
11793 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11794 
11795 MDSGatherBuilder gather(g_ceph_context);
11796 
11797 for (const auto& dir : info.dirs) {
11798 bool ready = true;
11799 if (!dir->is_complete()) {
11800 dout(15) << " fetching incomplete " << *dir << dendl;
11801 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11802 ready = false;
11803 } else if (dir->get_frag() == frag_t()) {
11804 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11805 // the operation. To avoid CDir::fetch() complaining about missing object,
11806 // we commit new dirfrag first.
11807 if (dir->state_test(CDir::STATE_CREATING)) {
11808 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11809 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11810 ready = false;
11811 } else if (dir->is_new()) {
11812 dout(15) << " committing new " << *dir << dendl;
11813 ceph_assert(dir->is_dirty());
11814 dir->commit(0, gather.new_sub(), true);
11815 ready = false;
11816 }
11817 }
11818 if (!ready)
11819 continue;
11820 
// dirfrag is complete: pin every dentry so it survives until the op ends;
// DNPINNEDFRAG marks the dirfrag as already processed on re-entry
11821 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11822 dout(15) << " marking " << *dir << dendl;
11823 for (auto &p : dir->items) {
11824 CDentry *dn = p.second;
11825 dn->get(CDentry::PIN_FRAGMENTING);
11826 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11827 dn->state_set(CDentry::STATE_FRAGMENTING);
11828 }
11829 dir->state_set(CDir::STATE_DNPINNEDFRAG);
// drop the auth_pin taken in fragment_freeze_dirs(); the dentry pins
// now keep the dirfrag in place and the freeze can complete
11830 dir->auth_unpin(dir);
11831 } else {
11832 dout(15) << " already marked " << *dir << dendl;
11833 }
11834 }
// some dirfrags still need fetch/commit: come back when they are done
11835 if (gather.has_subs()) {
11836 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11837 gather.activate();
11838 return;
11839 }
11840 
// all marked; now wait for any dirfrags that are still freezing
11841 for (const auto& dir : info.dirs) {
11842 if (!dir->is_frozen_dir()) {
11843 ceph_assert(dir->is_freezing_dir());
11844 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11845 }
11846 }
11847 if (gather.has_subs()) {
11848 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11849 gather.activate();
11850 // flush log so that request auth_pins are retired
11851 mds->mdlog->flush();
11852 return;
11853 }
11854 
// everything already frozen; continue synchronously
11855 fragment_frozen(mdr, 0);
11856 }
11857
11858 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
11859 {
11860 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11861 for (const auto& dir : dirs) {
11862 dout(10) << " frag " << *dir << dendl;
11863
11864 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
11865 dir->state_clear(CDir::STATE_FRAGMENTING);
11866
11867 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11868 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11869
11870 for (auto &p : dir->items) {
11871 CDentry *dn = p.second;
11872 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11873 dn->state_clear(CDentry::STATE_FRAGMENTING);
11874 dn->put(CDentry::PIN_FRAGMENTING);
11875 }
11876 } else {
11877 dir->auth_unpin(dir);
11878 }
11879
11880 dir->unfreeze_dir();
11881 }
11882 }
11883
11884 bool MDCache::fragment_are_all_frozen(CDir *dir)
11885 {
11886 ceph_assert(dir->is_frozen_dir());
11887 map<dirfrag_t,fragment_info_t>::iterator p;
11888 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11889 p != fragments.end() && p->first.ino == dir->ino();
11890 ++p) {
11891 if (p->first.frag.contains(dir->get_frag()))
11892 return p->second.all_frozen;
11893 }
11894 ceph_abort();
11895 return false;
11896 }
11897
11898 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11899 {
11900 map<dirfrag_t,fragment_info_t>::iterator p;
11901 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11902 p != fragments.end() && p->first.ino == dir->ino();
11903 ++p) {
11904 if (p->first.frag.contains(dir->get_frag())) {
11905 p->second.num_remote_waiters++;
11906 return;
11907 }
11908 }
11909 ceph_abort();
11910 }
11911
// Periodic scan for fragment operations whose freeze has been stuck for
// longer than mds_freeze_tree_timeout.  A freeze can deadlock against
// auth_pinned requests or a freezing ancestor; in that case we cancel the
// operation and unfreeze its dirfrags.
11912 void MDCache::find_stale_fragment_freeze()
11913 {
11914 dout(10) << "find_stale_fragment_freeze" << dendl;
11915 // see comment in Migrator::find_stale_export_freeze()
11916 utime_t now = ceph_clock_now();
11917 utime_t cutoff = now;
11918 cutoff -= g_conf()->mds_freeze_tree_timeout;
11919 
11920 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11921 p != fragments.end(); ) {
11922 dirfrag_t df = p->first;
11923 fragment_info_t& info = p->second;
// advance before possibly erasing the current entry below
11924 ++p;
11925 if (info.all_frozen)
11926 continue;
11927 CDir *dir;
11928 int total_auth_pins = 0;
11929 for (const auto& d : info.dirs) {
11930 dir = d;
// -1 flags "not all dirfrags marked yet" -- skip this op this round
11931 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11932 total_auth_pins = -1;
11933 break;
11934 }
11935 if (dir->is_frozen_dir())
11936 continue;
11937 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11938 }
11939 if (total_auth_pins < 0)
11940 continue;
// pin count still changing => progress is being made; reset the clock
11941 if (info.last_cum_auth_pins != total_auth_pins) {
11942 info.last_cum_auth_pins = total_auth_pins;
11943 info.last_cum_auth_pins_change = now;
11944 continue;
11945 }
11946 if (info.last_cum_auth_pins_change >= cutoff)
11947 continue;
// stale: cancel only if someone is actually blocked on us (remote
// waiters, or a freezing parent that our pins would deadlock)
11948 dir = info.dirs.front();
11949 if (info.num_remote_waiters > 0 ||
11950 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11951 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
// move the dir list out before erasing the map entry that owns it
11952 std::vector<CDir*> dirs;
11953 info.dirs.swap(dirs);
11954 fragments.erase(df);
11955 fragment_unmark_unfreeze_dirs(dirs);
11956 }
11957 }
11958 }
11959
11960 class C_MDC_FragmentPrep : public MDCacheLogContext {
11961 MDRequestRef mdr;
11962 public:
11963 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11964 void finish(int r) override {
11965 mdcache->_fragment_logged(mdr);
11966 }
11967 };
11968
11969 class C_MDC_FragmentStore : public MDCacheContext {
11970 MDRequestRef mdr;
11971 public:
11972 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11973 void finish(int r) override {
11974 mdcache->_fragment_stored(mdr);
11975 }
11976 };
11977
11978 class C_MDC_FragmentCommit : public MDCacheLogContext {
11979 dirfrag_t basedirfrag;
11980 MDRequestRef mdr;
11981 public:
11982 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11983 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
11984 void finish(int r) override {
11985 mdcache->_fragment_committed(basedirfrag, mdr);
11986 }
11987 };
11988
11989 class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
11990 dirfrag_t basedirfrag;
11991 int bits;
11992 MDRequestRef mdr;
11993 public:
11994 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11995 const MDRequestRef& r) :
11996 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
11997 void finish(int r) override {
11998 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
11999 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
12000 }
12001 void print(ostream& out) const override {
12002 out << "fragment_purge_old(" << basedirfrag << ")";
12003 }
12004 };
12005
12006 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
12007 {
12008 dirfrag_t basedirfrag = mdr->more()->fragment_base;
12009 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
12010 if (it == fragments.end() || it->second.mdr != mdr) {
12011 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
12012 request_finish(mdr);
12013 return;
12014 }
12015
12016 ceph_assert(r == 0);
12017 fragment_info_t& info = it->second;
12018 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
12019 << " on " << info.dirs.front()->get_inode() << dendl;
12020
12021 info.all_frozen = true;
12022 dispatch_fragment_dir(mdr);
12023 }
12024
// Core of the split/merge: acquire the scatterlocks on the parent inode,
// journal an EFragment OP_PREPARE, and perform the in-memory refragment
// (adjust_dir_fragments).  On abort (peer error or failed lock acquire)
// the whole operation is unwound and the dirfrag re-queued with the
// balancer.
12025 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
12026 {
12027 dirfrag_t basedirfrag = mdr->more()->fragment_base;
12028 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
12029 if (it == fragments.end() || it->second.mdr != mdr) {
12030 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
12031 request_finish(mdr);
12032 return;
12033 }
12034 
12035 fragment_info_t& info = it->second;
12036 CInode *diri = info.dirs.front()->get_inode();
12037 
12038 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
12039 << " on " << *diri << dendl;
12040 
12041 if (mdr->more()->peer_error)
12042 mdr->aborted = true;
12043 
12044 if (!mdr->aborted) {
12045 MutationImpl::LockOpVec lov;
12046 lov.add_wrlock(&diri->dirfragtreelock);
12047 // prevent a racing gather on any other scatterlocks too
12048 lov.lock_scatter_gather(&diri->nestlock);
12049 lov.lock_scatter_gather(&diri->filelock);
// acquire_locks() may set mdr->aborted itself; only return early if we
// are genuinely waiting for the locks
12050 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
12051 if (!mdr->aborted)
12052 return;
12053 }
12054 }
12055 
// abort path: cancel the op, unfreeze, and ask the balancer to retry later
12056 if (mdr->aborted) {
12057 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
12058 << info.dirs.front()->dirfrag() << dendl;
12059 if (info.bits > 0)
12060 mds->balancer->queue_split(info.dirs.front(), false);
12061 else
12062 mds->balancer->queue_merge(info.dirs.front());
12063 fragment_unmark_unfreeze_dirs(info.dirs);
12064 fragments.erase(it);
12065 request_finish(mdr);
12066 return;
12067 }
12068 
// journal the prepare, recording each source dirfrag's fnode for rollback
12069 mdr->ls = mds->mdlog->get_current_segment();
12070 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
12071 mds->mdlog->start_entry(le);
12072 
12073 for (const auto& dir : info.dirs) {
12074 dirfrag_rollback rollback;
12075 rollback.fnode = dir->fnode;
12076 le->add_orig_frag(dir->get_frag(), &rollback);
12077 }
12078 
12079 // refragment
12080 MDSContext::vec waiters;
12081 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
12082 &info.resultfrags, waiters, false);
12083 if (g_conf()->mds_debug_frag)
12084 diri->verify_dirfrags();
12085 mds->queue_waiters(waiters);
12086 
// sanity: every source frag must have been replaced in the fragtree
12087 for (const auto& fg : le->orig_frags)
12088 ceph_assert(!diri->dirfragtree.is_leaf(fg));
12089 
12090 le->metablob.add_dir_context(info.resultfrags.front());
12091 for (const auto& dir : info.resultfrags) {
12092 if (diri->is_auth()) {
12093 le->metablob.add_fragmented_dir(dir, false, false);
12094 } else {
// non-auth: remember the dirfragtree change until it is journaled
12095 dir->state_set(CDir::STATE_DIRTYDFT);
12096 le->metablob.add_fragmented_dir(dir, false, true);
12097 }
12098 }
12099 
12100 // dft lock
12101 if (diri->is_auth()) {
12102 // journal dirfragtree
12103 auto pi = diri->project_inode(mdr);
12104 pi.inode->version = diri->pre_dirty();
12105 predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
12106 journal_dirty_inode(mdr.get(), &le->metablob, diri);
12107 } else {
// not auth of the inode: just mark the scatterlock dirty so the auth
// picks up the new dirfragtree
12108 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12109 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12110 mdr->add_updated_lock(&diri->dirfragtreelock);
12111 }
12112 
12113 /*
12114 // filelock
12115 mds->locker->mark_updated_scatterlock(&diri->filelock);
12116 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12117 mut->add_updated_lock(&diri->filelock);
12118 
12119 // dirlock
12120 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12121 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12122 mut->add_updated_lock(&diri->nestlock);
12123 */
12124 
12125 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
12126 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
12127 mdr, __func__);
12128 mds->mdlog->flush();
12129 }
12130
// Continuation after the OP_PREPARE event is journaled: apply the
// projected changes, then mark the new dirfrags dirty and commit them to
// the metadata pool.  _fragment_stored() runs once all commits complete.
12131 void MDCache::_fragment_logged(MDRequestRef& mdr)
12132 {
12133 dirfrag_t basedirfrag = mdr->more()->fragment_base;
12134 auto& info = fragments.at(basedirfrag);
12135 CInode *diri = info.resultfrags.front()->get_inode();
12136 
12137 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
12138 << " on " << *diri << dendl;
12139 mdr->mark_event("prepare logged");
12140 
12141 mdr->apply(); // mark scatterlock
12142 
12143 // store resulting frags
12144 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
12145 
12146 for (const auto& dir : info.resultfrags) {
12147 dout(10) << " storing result frag " << *dir << dendl;
12148 
12149 dir->mark_dirty(mdr->ls);
12150 dir->mark_new(mdr->ls);
12151 
12152 // freeze and store them too
12153 dir->auth_pin(this);
12154 dir->state_set(CDir::STATE_FRAGMENTING);
12155 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
12156 }
12157 
12158 gather.activate();
12159 }
12160
// Continuation after the new dirfrags are stored: notify replicas of the
// refragmentation, journal OP_COMMIT, and unfreeze/unpin the result
// frags.  When the base dirfrag is a subtree root we also demand acks
// from replicas (see the race comment below) before dropping locks.
12161 void MDCache::_fragment_stored(MDRequestRef& mdr)
12162 {
12163 dirfrag_t basedirfrag = mdr->more()->fragment_base;
12164 fragment_info_t &info = fragments.at(basedirfrag);
12165 CDir *first = info.resultfrags.front();
12166 CInode *diri = first->get_inode();
12167 
12168 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
12169 << " on " << *diri << dendl;
12170 mdr->mark_event("new frags stored");
12171 
12172 // tell peers
// only relevant when fragmenting a subtree root we don't own the inode of
12173 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
12174 diri->authority().first : CDIR_AUTH_UNKNOWN;
12175 for (const auto &p : first->get_replicas()) {
// skip peers that have not rejoined far enough to process the notify
12176 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
12177 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
12178 rejoin_gather.count(p.first)))
12179 continue;
12180 
12181 auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
12182 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
12183 diri_auth != p.first) { // not auth mds of diri
12184 /*
12185 * In the nornal case, mds does not trim dir inode whose child dirfrags
12186 * are likely being fragmented (see trim_inode()). But when fragmenting
12187 * subtree roots, following race can happen:
12188 *
12189 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
12190 * mds.c and drops wrlock on dirfragtreelock.
12191 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
12192 * SYNC and send lock message mds.c
12193 * - mds.c receives the lock message and changes dirfragtreelock state
12194 * to SYNC
12195 * - mds.c trim dirfrag and dir inode from its cache
12196 * - mds.c receives the fragment_notify message
12197 *
12198 * So we need to ensure replicas have received the notify, then unlock
12199 * the dirfragtreelock.
12200 */
12201 notify->mark_ack_wanted();
12202 info.notify_ack_waiting.insert(p.first);
12203 }
12204 
12205 // freshly replicate new dirs to peers
12206 for (const auto& dir : info.resultfrags) {
12207 encode_replica_dir(dir, p.first, notify->basebl);
12208 }
12209 
12210 mds->send_message_mds(notify, p.first);
12211 }
12212 
12213 // journal commit
12214 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
12215 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
12216 
12217 
12218 // unfreeze resulting frags
12219 for (const auto& dir : info.resultfrags) {
12220 dout(10) << " result frag " << *dir << dendl;
12221 
// release the PIN_FRAGMENTING refs taken in fragment_mark_and_complete()
12222 for (auto &p : dir->items) {
12223 CDentry *dn = p.second;
12224 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
12225 dn->state_clear(CDentry::STATE_FRAGMENTING);
12226 dn->put(CDentry::PIN_FRAGMENTING);
12227 }
12228 
12229 // unfreeze
12230 dir->unfreeze_dir();
12231 }
12232 
// if acks are outstanding, keep the dirfragtreelock until they arrive
// (handle_fragment_notify_ack() will drop the locks)
12233 if (info.notify_ack_waiting.empty()) {
12234 fragment_drop_locks(info);
12235 } else {
12236 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
12237 }
12238 }
12239
// Continuation after OP_COMMIT is journaled: delete (or truncate) the
// pre-fragmentation dirfrag objects from the metadata pool.  `mdr` may be
// null when invoked from rollback_uncommitted_fragments() during replay.
12240 void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
12241 {
12242 dout(10) << "fragment_committed " << basedirfrag << dendl;
12243 if (mdr)
12244 mdr->mark_event("commit logged");
12245 
12246 ufragment &uf = uncommitted_fragments.at(basedirfrag);
12247 
12248 // remove old frags
12249 C_GatherBuilder gather(
12250 g_ceph_context,
12251 new C_OnFinisher(
12252 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
12253 mds->finisher));
12254 
12255 SnapContext nullsnapc;
12256 object_locator_t oloc(mds->get_metadata_pool());
12257 for (const auto& fg : uf.old_frags) {
12258 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
12259 ObjectOperation op;
12260 if (fg == frag_t()) {
// the root frag object also stores the inode backtrace, so keep the
// object itself and just empty it
12261 // backtrace object
12262 dout(10) << " truncate orphan dirfrag " << oid << dendl;
12263 op.truncate(0);
12264 op.omap_clear();
12265 } else {
12266 dout(10) << " removing orphan dirfrag " << oid << dendl;
12267 op.remove();
12268 }
12269 mds->objecter->mutate(oid, oloc, op, nullsnapc,
12270 ceph::real_clock::now(),
12271 0, gather.new_sub());
12272 }
12273 
12274 ceph_assert(gather.has_subs());
12275 gather.activate();
12276 }
12277
12278 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
12279 {
12280 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12281 if (mdr)
12282 mdr->mark_event("old frags purged");
12283
12284 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12285 mds->mdlog->start_submit_entry(le);
12286
12287 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12288
12289 if (mds->logger) {
12290 if (bits > 0) {
12291 mds->logger->inc(l_mds_dir_split);
12292 } else {
12293 mds->logger->inc(l_mds_dir_merge);
12294 }
12295 }
12296
12297 if (mdr) {
12298 auto it = fragments.find(basedirfrag);
12299 ceph_assert(it != fragments.end());
12300 it->second.finishing = true;
12301 if (it->second.notify_ack_waiting.empty())
12302 fragment_maybe_finish(it);
12303 else
12304 mdr->mark_event("wating for notify acks");
12305 }
12306 }
12307
12308 void MDCache::fragment_drop_locks(fragment_info_t& info)
12309 {
12310 mds->locker->drop_locks(info.mdr.get());
12311 request_finish(info.mdr);
12312 //info.mdr.reset();
12313 }
12314
12315 void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12316 {
12317 if (!it->second.finishing)
12318 return;
12319
12320 // unmark & auth_unpin
12321 for (const auto &dir : it->second.resultfrags) {
12322 dir->state_clear(CDir::STATE_FRAGMENTING);
12323 dir->auth_unpin(this);
12324
12325 // In case the resulting fragments are beyond the split size,
12326 // we might need to split them again right away (they could
12327 // have been taking inserts between unfreezing and getting
12328 // here)
12329 mds->balancer->maybe_fragment(dir, false);
12330 }
12331
12332 fragments.erase(it);
12333 }
12334
12335
12336 void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
12337 {
12338 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12339 mds_rank_t from = mds_rank_t(ack->get_source().num());
12340
12341 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
12342 return;
12343 }
12344
12345 auto it = fragments.find(ack->get_base_dirfrag());
12346 if (it == fragments.end() ||
12347 it->second.get_tid() != ack->get_tid()) {
12348 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
12349 return;
12350 }
12351
12352 if (it->second.notify_ack_waiting.erase(from) &&
12353 it->second.notify_ack_waiting.empty()) {
12354 fragment_drop_locks(it->second);
12355 fragment_maybe_finish(it);
12356 }
12357 }
12358
// Replica-side handler: the auth MDS fragmented a dirfrag we replicate.
// Apply the same refragmentation locally, decode the freshly replicated
// result dirfrags, and ack if the sender asked for one.
12359 void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
12360 {
12361 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
12362 mds_rank_t from = mds_rank_t(notify->get_source().num());
12363 
12364 if (mds->get_state() < MDSMap::STATE_REJOIN) {
12365 return;
12366 }
12367 
12368 CInode *diri = get_inode(notify->get_ino());
12369 if (diri) {
12370 frag_t base = notify->get_basefrag();
12371 int bits = notify->get_bits();
12372 
12373 /*
12374 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12375 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12376 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12377 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12378 return;
12379 }
12380 */
12381 
12382 // refragment
12383 MDSContext::vec waiters;
12384 std::vector<CDir*> resultfrags;
12385 adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
12386 if (g_conf()->mds_debug_frag)
12387 diri->verify_dirfrags();
12388 
// pick up anyone who was waiting for the (new) frags to appear
12389 for (const auto& dir : resultfrags) {
12390 diri->take_dir_waiting(dir->get_frag(), waiters);
12391 }
12392 
12393 // add new replica dirs values
12394 auto p = notify->basebl.cbegin();
12395 while (!p.end()) {
// decode_replica_dir locates the dirfrag by ino/frag; tmp_dir is just an
// out-parameter we don't need afterwards
12396 CDir *tmp_dir = nullptr;
12397 decode_replica_dir(tmp_dir, p, diri, from, waiters);
12398 }
12399 
12400 mds->queue_waiters(waiters);
12401 } else {
// we should never receive a notify for an inode we don't replicate
12402 ceph_abort();
12403 }
12404 
12405 if (notify->is_ack_wanted()) {
12406 auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
12407 notify->get_bits(), notify->get_tid());
12408 mds->send_message_mds(ack, from);
12409 }
12410 }
12411
12412 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12413 LogSegment *ls, bufferlist *rollback)
12414 {
12415 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12416 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12417 ufragment& uf = uncommitted_fragments[basedirfrag];
12418 uf.old_frags = old_frags;
12419 uf.bits = bits;
12420 uf.ls = ls;
12421 ls->uncommitted_fragments.insert(basedirfrag);
12422 if (rollback)
12423 uf.rollback.swap(*rollback);
12424 }
12425
12426 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12427 {
12428 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12429 << " op " << EFragment::op_name(op) << dendl;
12430 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12431 if (it != uncommitted_fragments.end()) {
12432 ufragment& uf = it->second;
12433 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12434 uf.committed = true;
12435 } else {
12436 uf.ls->uncommitted_fragments.erase(basedirfrag);
12437 mds->queue_waiters(uf.waiters);
12438 uncommitted_fragments.erase(it);
12439 }
12440 }
12441 }
12442
12443 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
12444 {
12445 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12446 << " old_frags (" << old_frags << ")" << dendl;
12447 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12448 if (it != uncommitted_fragments.end()) {
12449 ufragment& uf = it->second;
12450 if (!uf.old_frags.empty()) {
12451 uf.old_frags = std::move(old_frags);
12452 uf.committed = true;
12453 } else {
12454 uf.ls->uncommitted_fragments.erase(basedirfrag);
12455 uncommitted_fragments.erase(it);
12456 }
12457 }
12458 }
12459
12460 void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
12461 {
12462 MDSGatherBuilder gather(g_ceph_context, finisher);
12463 for (auto& p : uncommitted_fragments) {
12464 p.second.waiters.push_back(gather.new_sub());
12465 }
12466 gather.activate();
12467 }
12468
12469 struct C_MDC_FragmentRollback : public MDCacheLogContext {
12470 MutationRef mut;
12471 C_MDC_FragmentRollback(MDCache *c, MutationRef& m) :
12472 MDCacheLogContext(c), mut(m) {}
12473 void finish(int r) override {
12474 mut->apply();
12475 get_mds()->locker->drop_locks(mut.get());
12476 mut->cleanup();
12477 }
12478 };
12479
// During resolve/replay, undo every fragment operation that was prepared
// but never committed: restore the original dirfrags (and their fnodes,
// if we have the encoded rollback data), journal an OP_ROLLBACK, and then
// purge the objects left over from the abandoned refragmentation.
12480 void MDCache::rollback_uncommitted_fragments()
12481 {
12482 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
12483 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
12484 p != uncommitted_fragments.end();
12485 ++p) {
12486 ufragment &uf = p->second;
12487 CInode *diri = get_inode(p->first.ino);
12488 ceph_assert(diri);
12489 
// already committed: only the old-object purge remains
12490 if (uf.committed) {
12491 _fragment_committed(p->first, MDRequestRef());
12492 continue;
12493 }
12494 
12495 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
12496 
12497 MutationRef mut(new MutationImpl());
12498 mut->ls = mds->mdlog->get_current_segment();
12499 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
12500 mds->mdlog->start_entry(le);
12501 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
12502 
// the frags currently in the tree -- i.e. the result of the op we are
// undoing; they become the "old" frags to purge afterwards
12503 frag_vec_t old_frags;
12504 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
12505 
12506 std::vector<CDir*> resultfrags;
12507 if (uf.old_frags.empty()) {
12508 // created by old format EFragment
12509 MDSContext::vec waiters;
12510 adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
12511 } else {
// restore each original frag and its saved fnode from the rollback blob
12512 auto bp = uf.rollback.cbegin();
12513 for (const auto& fg : uf.old_frags) {
12514 CDir *dir = force_dir_fragment(diri, fg);
12515 resultfrags.push_back(dir);
12516 
12517 dirfrag_rollback rollback;
12518 decode(rollback, bp);
12519 
12520 dir->fnode = rollback.fnode;
12521 
12522 dir->mark_dirty(mut->ls);
12523 
// restored stats may disagree with what was propagated upstream; mark
// the relevant scatterlocks dirty so they get reconciled
12524 if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
12525 dout(10) << " dirty nestinfo on " << *dir << dendl;
12526 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12527 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12528 mut->add_updated_lock(&diri->nestlock);
12529 }
12530 if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
12531 dout(10) << " dirty fragstat on " << *dir << dendl;
12532 mds->locker->mark_updated_scatterlock(&diri->filelock);
12533 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12534 mut->add_updated_lock(&diri->filelock);
12535 }
12536 
12537 le->add_orig_frag(dir->get_frag());
12538 le->metablob.add_dir_context(dir);
12539 if (diri_auth) {
12540 le->metablob.add_fragmented_dir(dir, true, false);
12541 } else {
12542 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12543 dir->state_set(CDir::STATE_DIRTYDFT);
12544 le->metablob.add_fragmented_dir(dir, true, true);
12545 }
12546 }
12547 }
12548 
12549 if (diri_auth) {
12550 auto pi = diri->project_inode(mut);
12551 pi.inode->version = diri->pre_dirty();
12552 predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
12553 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12554 } else {
12555 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12556 mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12557 mut->add_updated_lock(&diri->dirfragtreelock);
12558 }
12559 
12560 if (g_conf()->mds_debug_frag)
12561 diri->verify_dirfrags();
12562 
12563 for (const auto& leaf : old_frags) {
12564 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12565 }
12566 
12567 mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut));
12568 
// the abandoned result frags are now the ones to purge from the pool
12569 uf.old_frags.swap(old_frags);
12570 _fragment_committed(p->first, MDRequestRef());
12571 }
12572 }
12573
12574 void MDCache::force_readonly()
12575 {
12576 if (is_readonly())
12577 return;
12578
12579 dout(1) << "force file system read-only" << dendl;
12580 mds->clog->warn() << "force file system read-only";
12581
12582 set_readonly();
12583
12584 mds->server->force_clients_readonly();
12585
12586 // revoke write caps
12587 int count = 0;
12588 for (auto &p : inode_map) {
12589 CInode *in = p.second;
12590 if (in->is_head())
12591 mds->locker->eval(in, CEPH_CAP_LOCKS);
12592 if (!(++count % mds->heartbeat_reset_grace()))
12593 mds->heartbeat_reset();
12594 }
12595
12596 mds->mdlog->flush();
12597 }
12598
12599
12600 // ==============================================================
12601 // debug crap
12602
// Debug dump of the subtree map as an indented tree, at debug level `dbl`.
// Skips the (expensive) dump when the level is not being gathered, or when
// the map is very large/deep, unless `force_print`.  Also sanity-checks
// the subtree map: well-known dirfrags must map to the expected inodes,
// and no entry may be unreachable from the base inodes.
12603 void MDCache::show_subtrees(int dbl, bool force_print)
12604 {
12605 if (g_conf()->mds_thrash_exports)
12606 dbl += 15;
12607 
12608 //dout(10) << "show_subtrees" << dendl;
12609 
12610 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
12611 return; // i won't print anything.
12612 
12613 if (subtrees.empty()) {
12614 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12615 << dendl;
12616 return;
12617 }
12618 
// large maps are only dumped when explicitly asked (force) or at very
// high debug levels
12619 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12620 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12621 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12622 "printing subtrees" << dendl;
12623 return;
12624 }
12625 
12626 // root frags
12627 std::vector<CDir*> basefrags;
12628 for (set<CInode*>::iterator p = base_inodes.begin();
12629 p != base_inodes.end();
12630 ++p)
12631 (*p)->get_dirfrags(basefrags);
12632 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12633 dout(15) << "show_subtrees" << dendl;
12634 
12635 // queue stuff
12636 list<pair<CDir*,int> > q;
12637 string indent;
12638 set<CDir*> seen;
12639 
// first pass: DFS over the subtree map to find the maximum nesting depth
// (needed to size the padding) and to record which entries are reachable
12640 // calc max depth
12641 for (const auto& dir : basefrags) {
12642 q.emplace_back(dir, 0);
12643 }
12644 
12645 set<CDir*> subtrees_seen;
12646 
12647 unsigned int depth = 0;
12648 while (!q.empty()) {
12649 CDir *dir = q.front().first;
12650 unsigned int d = q.front().second;
12651 q.pop_front();
12652 
12653 if (subtrees.count(dir) == 0) continue;
12654 
12655 subtrees_seen.insert(dir);
12656 
12657 if (d > depth) depth = d;
12658 
12659 // sanity check
12660 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12661 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
12662 ceph_assert(seen.count(dir) == 0);
12663 seen.insert(dir);
12664 
12665 // nested items?
12666 if (!subtrees[dir].empty()) {
12667 for (set<CDir*>::iterator p = subtrees[dir].begin();
12668 p != subtrees[dir].end();
12669 ++p) {
12670 //dout(25) << " saw sub " << **p << dendl;
12671 q.push_front(pair<CDir*,int>(*p, d+1));
12672 }
12673 }
12674 }
12675 
12676 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12677 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12678 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12679 "subtrees" << dendl;
12680 return;
12681 }
12682 
// second pass: actually print, reusing the (now empty) queue; children
// are pushed with depth+2 so siblings at the same depth share an indent
12683 // print tree
12684 for (const auto& dir : basefrags) {
12685 q.emplace_back(dir, 0);
12686 }
12687 
12688 while (!q.empty()) {
12689 CDir *dir = q.front().first;
12690 int d = q.front().second;
12691 q.pop_front();
12692 
12693 if (subtrees.count(dir) == 0) continue;
12694 
12695 // adjust indenter
12696 while ((unsigned)d < indent.size())
12697 indent.resize(d);
12698 
12699 // pad
12700 string pad = "______________________________________";
12701 pad.resize(depth*2+1-indent.size());
12702 if (!subtrees[dir].empty())
12703 pad[0] = '.'; // parent
12704 
12705 
12706 string auth;
12707 if (dir->is_auth())
12708 auth = "auth ";
12709 else
12710 auth = " rep ";
12711 
// format "auth[,second-auth]" -- second rank shown mid-migration
12712 char s[10];
12713 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12714 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12715 else
12716 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12717 
12718 // print
12719 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12720 << " " << auth << *dir << dendl;
12721 
// well-known dirfrags must be attached to the expected inodes
12722 if (dir->ino() == CEPH_INO_ROOT)
12723 ceph_assert(dir->inode == root);
12724 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
12725 ceph_assert(dir->inode == myin);
12726 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
12727 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
12728 
12729 // nested items?
12730 if (!subtrees[dir].empty()) {
12731 // more at my level?
12732 if (!q.empty() && q.front().second == d)
12733 indent += "| ";
12734 else
12735 indent += " ";
12736 
12737 for (set<CDir*>::iterator p = subtrees[dir].begin();
12738 p != subtrees[dir].end();
12739 ++p)
12740 q.push_front(pair<CDir*,int>(*p, d+2));
12741 }
12742 }
12743 
12744 // verify there isn't stray crap in subtree map
12745 int lost = 0;
12746 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12747 p != subtrees.end();
12748 ++p) {
12749 if (subtrees_seen.count(p->first)) continue;
12750 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12751 lost++;
12752 }
12753 ceph_assert(lost == 0);
12754 }
12755
12756 void MDCache::show_cache()
12757 {
12758 if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
12759 return;
12760 dout(7) << "show_cache" << dendl;
12761
12762 auto show_func = [this](CInode *in) {
12763 // unlinked?
12764 if (!in->parent)
12765 dout(7) << " unlinked " << *in << dendl;
12766
12767 // dirfrags?
12768 auto&& dfs = in->get_dirfrags();
12769 for (const auto& dir : dfs) {
12770 dout(7) << " dirfrag " << *dir << dendl;
12771
12772 for (auto &p : dir->items) {
12773 CDentry *dn = p.second;
12774 dout(7) << " dentry " << *dn << dendl;
12775 CDentry::linkage_t *dnl = dn->get_linkage();
12776 if (dnl->is_primary() && dnl->get_inode())
12777 dout(7) << " inode " << *dnl->get_inode() << dendl;
12778 }
12779 }
12780 };
12781
12782 for (auto &p : inode_map)
12783 show_func(p.second);
12784 for (auto &p : snap_inode_map)
12785 show_func(p.second);
12786 }
12787
12788 void MDCache::cache_status(Formatter *f)
12789 {
12790 f->open_object_section("cache");
12791
12792 f->open_object_section("pool");
12793 mempool::get_pool(mempool::mds_co::id).dump(f);
12794 f->close_section();
12795
12796 f->close_section();
12797 }
12798
12799 void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
12800 {
12801 ceph_assert(in);
12802 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12803 return;
12804 }
12805 auto&& ls = in->get_dirfrags();
12806 for (const auto &subdir : ls) {
12807 for (const auto &p : subdir->items) {
12808 CDentry *dn = p.second;
12809 CInode *in = dn->get_linkage()->get_inode();
12810 if (in) {
12811 dump_tree(in, cur_depth + 1, max_depth, f);
12812 }
12813 }
12814 }
12815 f->open_object_section("inode");
12816 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12817 f->close_section();
12818 }
12819
12820 int MDCache::dump_cache(std::string_view file_name, double timeout)
12821 {
12822 return dump_cache(file_name, NULL, timeout);
12823 }
12824
12825 int MDCache::dump_cache(Formatter *f, double timeout)
12826 {
12827 return dump_cache(std::string_view(""), f, timeout);
12828 }
12829
12830 /**
12831 * Dump the metadata cache, either to a Formatter, if
12832 * provided, else to a plain text file.
12833 */
12834 int MDCache::dump_cache(std::string_view fn, Formatter *f, double timeout)
12835 {
12836 int r = 0;
12837
12838 // dumping large caches may cause mds to hang or worse get killed.
12839 // so, disallow the dump if the cache size exceeds the configured
12840 // threshold, which is 1G for formatter and unlimited for file (note
12841 // that this can be jacked up by the admin... and is nothing but foot
12842 // shooting, but the option itself is for devs and hence dangerous to
12843 // tune). TODO: remove this when fixed.
12844 uint64_t threshold = f ?
12845 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12846 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
12847
12848 if (threshold && cache_size() > threshold) {
12849 if (f) {
12850 CachedStackStringStream css;
12851 *css << "cache usage exceeds dump threshold";
12852 f->open_object_section("result");
12853 f->dump_string("error", css->strv());
12854 f->close_section();
12855 } else {
12856 derr << "cache usage exceeds dump threshold" << dendl;
12857 r = -CEPHFS_EINVAL;
12858 }
12859 return r;
12860 }
12861
12862 r = 0;
12863 int fd = -1;
12864
12865 if (f) {
12866 f->open_array_section("inodes");
12867 } else {
12868 char path[PATH_MAX] = "";
12869 if (fn.length()) {
12870 snprintf(path, sizeof path, "%s", fn.data());
12871 } else {
12872 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
12873 }
12874
12875 dout(1) << "dump_cache to " << path << dendl;
12876
12877 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
12878 if (fd < 0) {
12879 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
12880 return errno;
12881 }
12882 }
12883
12884 auto dump_func = [fd, f](CInode *in) {
12885 int r;
12886 if (f) {
12887 f->open_object_section("inode");
12888 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12889 f->close_section();
12890 return 1;
12891 }
12892 CachedStackStringStream css;
12893 *css << *in << std::endl;
12894 auto sv = css->strv();
12895 r = safe_write(fd, sv.data(), sv.size());
12896 if (r < 0)
12897 return r;
12898 auto&& dfs = in->get_dirfrags();
12899 for (auto &dir : dfs) {
12900 CachedStackStringStream css2;
12901 *css2 << " " << *dir << std::endl;
12902 auto sv = css2->strv();
12903 r = safe_write(fd, sv.data(), sv.size());
12904 if (r < 0)
12905 return r;
12906 for (auto &p : dir->items) {
12907 CDentry *dn = p.second;
12908 CachedStackStringStream css3;
12909 *css3 << " " << *dn << std::endl;
12910 auto sv = css3->strv();
12911 r = safe_write(fd, sv.data(), sv.size());
12912 if (r < 0)
12913 return r;
12914 }
12915 dir->check_rstats();
12916 }
12917 return 1;
12918 };
12919
12920 auto start = mono_clock::now();
12921 int64_t count = 0;
12922 for (auto &p : inode_map) {
12923 r = dump_func(p.second);
12924 if (r < 0)
12925 goto out;
12926 if (!(++count % 1000) &&
12927 timeout > 0 &&
12928 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12929 r = -ETIMEDOUT;
12930 goto out;
12931 }
12932 }
12933 for (auto &p : snap_inode_map) {
12934 r = dump_func(p.second);
12935 if (r < 0)
12936 goto out;
12937 if (!(++count % 1000) &&
12938 timeout > 0 &&
12939 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12940 r = -ETIMEDOUT;
12941 goto out;
12942 }
12943
12944 }
12945 r = 0;
12946
12947 out:
12948 if (f) {
12949 if (r == -ETIMEDOUT)
12950 {
12951 f->close_section();
12952 f->open_object_section("result");
12953 f->dump_string("error", "the operation timeout");
12954 }
12955 f->close_section(); // inodes
12956 } else {
12957 if (r == -ETIMEDOUT)
12958 {
12959 CachedStackStringStream css;
12960 *css << "error : the operation timeout" << std::endl;
12961 auto sv = css->strv();
12962 r = safe_write(fd, sv.data(), sv.size());
12963 }
12964 ::close(fd);
12965 }
12966 return r;
12967 }
12968
void C_MDS_RetryRequest::finish(int r)
{
  // Re-dispatch the stashed request; bump the retry counter for diagnostics.
  mdr->retry++;
  cache->dispatch_request(mdr);
}
12974
MDSContext *CF_MDS_RetryRequestFactory::build()
{
  // Build the retry context.  When requested, first release the request's
  // locks and local auth pins so they are not held while the retry waits.
  if (drop_locks) {
    mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
    mdr->drop_local_auth_pins();
  }
  return new C_MDS_RetryRequest(mdcache, mdr);
}
12983
12984 class C_MDS_EnqueueScrub : public Context
12985 {
12986 std::string tag;
12987 Formatter *formatter;
12988 Context *on_finish;
12989 public:
12990 ScrubHeaderRef header;
12991 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12992 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
12993
12994 void finish(int r) override {
12995 formatter->open_object_section("results");
12996 formatter->dump_int("return_code", r);
12997 if (r == 0) {
12998 formatter->dump_string("scrub_tag", tag);
12999 formatter->dump_string("mode", "asynchronous");
13000 }
13001 formatter->close_section();
13002
13003 r = 0;
13004 if (on_finish)
13005 on_finish->complete(r);
13006 }
13007 };
13008
13009 void MDCache::enqueue_scrub(
13010 std::string_view path,
13011 std::string_view tag,
13012 bool force, bool recursive, bool repair,
13013 Formatter *f, Context *fin)
13014 {
13015 dout(10) << __func__ << " " << path << dendl;
13016
13017 filepath fp;
13018 if (path.compare(0, 4, "~mds") == 0) {
13019 mds_rank_t rank;
13020 if (path == "~mdsdir") {
13021 rank = mds->get_nodeid();
13022 } else {
13023 std::string err;
13024 rank = strict_strtoll(path.substr(4), 10, &err);
13025 if (!err.empty())
13026 rank = MDS_RANK_NONE;
13027 }
13028 if (rank >= 0 && rank < MAX_MDS)
13029 fp.set_path("", MDS_INO_MDSDIR(rank));
13030 }
13031 if (fp.get_ino() == inodeno_t(0))
13032 fp.set_path(path);
13033
13034 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
13035 mdr->set_filepath(fp);
13036
13037 bool is_internal = false;
13038 std::string tag_str(tag);
13039 if (tag_str.empty()) {
13040 uuid_d uuid_gen;
13041 uuid_gen.generate_random();
13042 tag_str = uuid_gen.to_string();
13043 is_internal = true;
13044 }
13045
13046 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
13047 cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, recursive, repair);
13048
13049 mdr->internal_op_finish = cs;
13050 enqueue_scrub_work(mdr);
13051 }
13052
13053 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
13054 {
13055 CInode *in;
13056 CF_MDS_RetryRequestFactory cf(this, mdr, true);
13057 int r = path_traverse(mdr, cf, mdr->get_filepath(),
13058 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH,
13059 nullptr, &in);
13060 if (r > 0)
13061 return;
13062 if (r < 0) {
13063 mds->server->respond_to_request(mdr, r);
13064 return;
13065 }
13066
13067 // Cannot scrub same dentry twice at same time
13068 if (in->scrub_is_in_progress()) {
13069 mds->server->respond_to_request(mdr, -CEPHFS_EBUSY);
13070 return;
13071 } else {
13072 in->scrub_info();
13073 }
13074
13075 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
13076 ScrubHeaderRef& header = cs->header;
13077
13078 r = mds->scrubstack->enqueue(in, header, !header->get_recursive());
13079
13080 mds->server->respond_to_request(mdr, r);
13081 }
13082
13083 struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
13084 MDRequestRef mdr;
13085 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
13086 MDCacheLogContext(c), mdr(m) {}
13087 void finish(int r) override {
13088 mdr->apply();
13089 get_mds()->server->respond_to_request(mdr, r);
13090 }
13091 };
13092
13093 struct C_MDC_ScrubRepaired : public MDCacheContext {
13094 ScrubHeaderRef header;
13095 public:
13096 C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h)
13097 : MDCacheContext(m), header(h) {
13098 header->inc_num_pending();
13099 }
13100 void finish(int r) override {
13101 header->dec_num_pending();
13102 }
13103 };
13104
13105 void MDCache::repair_dirfrag_stats(CDir *dir)
13106 {
13107 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
13108 mdr->pin(dir);
13109 mdr->internal_op_private = dir;
13110 if (dir->scrub_is_in_progress())
13111 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header());
13112 else
13113 mdr->internal_op_finish = new C_MDSInternalNoop;
13114 repair_dirfrag_stats_work(mdr);
13115 }
13116
// Recompute a dirfrag's fragstat/rstat from its head dentries and, if the
// stored values disagree, journal corrected values.  Re-entered via
// C_MDS_RetryRequest whenever it must wait (unfreeze, locks, fetch).
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  // only the auth MDS holds authoritative stats for this frag
  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    // frag is frozen/freezing: wait for unfreeze, dropping locks and local
    // pins in the meantime so we do not hold up the freeze
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (mdr->is_any_remote_auth_pin())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  // wrlock the inode's scatterlocks so we may project new frag stats
  MutationImpl::LockOpVec lov;
  CInode *diri = dir->inode;
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // all dentries must be in cache to sum them
  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  // Recompute fragstat (file/subdir counts) and rstat (recursive stats)
  // from the head (non-snapshot) dentries.
  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
        frag_info.nsubdirs++;
      else
        frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  auto pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  // project a corrected fnode and journal it
  auto _pf = dir->project_fnode(mdr);
  _pf->version = dir->pre_dirty();
  pf = _pf;

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    // never move mtime/change_attr backwards
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    _pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    // never move rctime backwards
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    _pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  // reply once the journal entry is durable
  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
13212
13213 void MDCache::repair_inode_stats(CInode *diri)
13214 {
13215 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
13216 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
13217 mdr->internal_op_private = diri;
13218 if (diri->scrub_is_in_progress())
13219 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header());
13220 else
13221 mdr->internal_op_finish = new C_MDSInternalNoop;
13222 repair_inode_stats_work(mdr);
13223 }
13224
// Repair a directory inode's dirstat/rstat.  Phase 1 (skipped on re-entry
// once mdr->ls is set) wrlocks the scatterlocks, ensures every dirfrag is
// loaded, and marks the locks dirty; phase 2 rdlocks them, which forces the
// scatter-gather to settle and fold corrected frag stats into the inode.
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  // only the auth MDS can drive the repair
  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  MutationImpl::LockOpVec lov;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
        ceph_assert(mdr->is_auth_pinned(diri));
        dir = diri->get_or_open_dirfrag(this, leaf);
      }
      if (dir->get_version() == 0) {
        // frag not loaded yet; fetch it and retry
        ceph_assert(dir->is_auth());
        dir->fetch_keys({}, new C_MDS_RetryRequest(this, mdr));
        return;
      }
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  mds->locker->drop_locks(mdr.get());

 do_rdlocks:
  // force the scatter-gather process
  lov.clear();
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_rdlock(&diri->nestlock);
  lov.add_rdlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  // verify the gathered frag sums now match the inode's stored stats
  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs = 1; // it gets one to account for self
  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
    nest_info.rsnaps = srnode->snaps.size();

  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      ceph_assert(dir);
      ceph_assert(dir->get_version() > 0);
      dir_info.add(dir->get_fnode()->accounted_fragstat);
      nest_info.add(dir->get_fnode()->accounted_rstat);
    }
  }

  if (!dir_info.same_sums(diri->get_inode()->dirstat) ||
      !nest_info.same_sums(diri->get_inode()->rstat)) {
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
             << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
13315
13316 void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin)
13317 {
13318 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS);
13319 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
13320 mdr->internal_op_private = diri;
13321 mdr->internal_op_finish = fin;
13322 return rdlock_dirfrags_stats_work(mdr);
13323 }
13324
13325 void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr)
13326 {
13327 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
13328 dout(10) << __func__ << " " << *diri << dendl;
13329 if (!diri->is_auth()) {
13330 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
13331 return;
13332 }
13333 if (!diri->is_dir()) {
13334 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
13335 return;
13336 }
13337
13338 MutationImpl::LockOpVec lov;
13339 lov.add_rdlock(&diri->dirfragtreelock);
13340 lov.add_rdlock(&diri->nestlock);
13341 lov.add_rdlock(&diri->filelock);
13342 if (!mds->locker->acquire_locks(mdr, lov))
13343 return;
13344 dout(10) << __func__ << " start dirfrags : " << *diri << dendl;
13345
13346 mds->server->respond_to_request(mdr, 0);
13347 return;
13348 }
13349
13350 void MDCache::flush_dentry(std::string_view path, Context *fin)
13351 {
13352 if (is_readonly()) {
13353 dout(10) << __func__ << ": read-only FS" << dendl;
13354 fin->complete(-CEPHFS_EROFS);
13355 return;
13356 }
13357 dout(10) << "flush_dentry " << path << dendl;
13358 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13359 filepath fp(path);
13360 mdr->set_filepath(fp);
13361 mdr->internal_op_finish = fin;
13362 flush_dentry_work(mdr);
13363 }
13364
13365 class C_FinishIOMDR : public MDSContext {
13366 protected:
13367 MDSRank *mds;
13368 MDRequestRef mdr;
13369 MDSRank *get_mds() override { return mds; }
13370 public:
13371 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13372 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13373 };
13374
13375 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13376 {
13377 MutationImpl::LockOpVec lov;
13378 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13379 if (!in)
13380 return;
13381
13382 ceph_assert(in->is_auth());
13383 in->flush(new C_FinishIOMDR(mds, mdr));
13384 }
13385
13386
13387 /**
13388 * Initialize performance counters with global perfcounter
13389 * collection.
13390 */
13391 void MDCache::register_perfcounters()
13392 {
13393 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
13394
13395 pcb.add_u64_counter(l_mdc_dir_update, "dir_update",
13396 "Directory replication directives");
13397 pcb.add_u64_counter(l_mdc_dir_update_receipt, "dir_update_receipt",
13398 "Directory replication directives received");
13399 pcb.add_u64_counter(l_mdc_dir_try_discover, "dir_try_discover",
13400 "Directory replication attempt to discover");
13401 pcb.add_u64_counter(l_mdc_dir_send_discover, "dir_send_discover",
13402 "Directory replication discovery message sent");
13403 pcb.add_u64_counter(l_mdc_dir_handle_discover, "dir_handle_discover",
13404 "Directory replication discovery message handled");
13405
13406 // Stray/purge statistics
13407 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
13408 PerfCountersBuilder::PRIO_INTERESTING);
13409 pcb.add_u64(l_mdc_num_recovering_enqueued,
13410 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13411 PerfCountersBuilder::PRIO_INTERESTING);
13412 pcb.add_u64_counter(l_mdc_recovery_completed,
13413 "recovery_completed", "File recoveries completed", "recd",
13414 PerfCountersBuilder::PRIO_INTERESTING);
13415
13416 // useful recovery queue statistics
13417 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
13418 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
13419 "Files currently being recovered");
13420 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
13421 "Files waiting for recovery with elevated priority");
13422 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
13423 "File recoveries started");
13424
13425 // along with other stray dentries stats
13426 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
13427 "Stray dentries delayed");
13428 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
13429 "Stray dentries enqueuing for purge");
13430 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
13431 "Stray dentries created");
13432 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
13433 "Stray dentries enqueued for purge");
13434 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
13435 "Stray dentries reintegrated");
13436 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
13437 "Stray dentries migrated");
13438
13439 // low prio internal request stats
13440 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
13441 "Internal Request type enqueue scrub");
13442 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
13443 "Internal Request type export dir");
13444 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
13445 "Internal Request type flush");
13446 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
13447 "Internal Request type fragmentdir");
13448 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
13449 "Internal Request type frag stats");
13450 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
13451 "Internal Request type inode stats");
13452
13453 logger.reset(pcb.create_perf_counters());
13454 g_ceph_context->get_perfcounters_collection()->add(logger.get());
13455 recovery_queue.set_logger(logger.get());
13456 stray_manager.set_logger(logger.get());
13457 }
13458
13459 /**
13460 * Call this when putting references to an inode/dentry or
13461 * when attempting to trim it.
13462 *
13463 * If this inode is no longer linked by anyone, and this MDS
13464 * rank holds the primary dentry, and that dentry is in a stray
13465 * directory, then give up the dentry to the StrayManager, never
13466 * to be seen again by MDCache.
13467 *
13468 * @param delay if true, then purgeable inodes are stashed til
13469 * the next trim(), rather than being purged right
13470 * away.
13471 */
13472 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13473 if (in->get_inode()->nlink > 0 || in->is_base() || is_readonly() ||
13474 mds->get_state() <= MDSMap::STATE_REJOIN)
13475 return;
13476
13477 CDentry *dn = in->get_projected_parent_dn();
13478
13479 if (dn->state_test(CDentry::STATE_PURGING)) {
13480 /* We have already entered the purging process, no need
13481 * to re-evaluate me ! */
13482 return;
13483 }
13484
13485 if (dn->get_dir()->get_inode()->is_stray()) {
13486 if (delay)
13487 stray_manager.queue_delayed(dn);
13488 else
13489 stray_manager.eval_stray(dn);
13490 }
13491 }
13492
13493 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13494 dout(10) << __func__ << " " << *diri << dendl;
13495 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13496 auto&& ls = diri->get_dirfrags();
13497 for (auto &p : ls) {
13498 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13499 p->try_remove_dentries_for_stray();
13500 }
13501 if (!diri->snaprealm) {
13502 if (diri->is_auth())
13503 diri->clear_dirty_rstat();
13504 diri->clear_scatter_dirty();
13505 }
13506 }
13507
13508 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13509 CInode *in = get_inode(number);
13510 if (!in) {
13511 return false;
13512 }
13513 f->open_object_section("inode");
13514 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13515 f->close_section();
13516 return true;
13517 }
13518
13519 void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
13520 const mds_rank_t max_mds = mdsmap.get_max_mds();
13521
13522 // process export_pin_delayed_queue whenever a new MDSMap received
13523 auto &q = export_pin_delayed_queue;
13524 for (auto it = q.begin(); it != q.end(); ) {
13525 auto *in = *it;
13526 mds_rank_t export_pin = in->get_export_pin(false);
13527 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13528 << " max_mds=" << max_mds << dendl;
13529 if (export_pin >= mdsmap.get_max_mds()) {
13530 it++;
13531 continue;
13532 }
13533
13534 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13535 it = q.erase(it);
13536 in->queue_export_pin(export_pin);
13537 }
13538
13539 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13540 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
13541 /* copy to vector to avoid removals during iteration */
13542 std::vector<CInode*> migrate;
13543 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
13544 for (auto& in : migrate) {
13545 in->maybe_export_pin();
13546 }
13547 }
13548
13549 if (max_mds <= 1) {
13550 export_ephemeral_dist_frag_bits = 0;
13551 } else {
13552 double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
13553 want *= max_mds;
13554 unsigned n = 0;
13555 while ((1U << n) < (unsigned)want)
13556 ++n;
13557 export_ephemeral_dist_frag_bits = n;
13558 }
13559 }
13560
// Background upkeep thread: periodically trims the cache and releases free
// heap memory back to the OS.  Sleeps until the earlier of the two deadlines
// (trim / release); upkeep_cvar is notified to wake it for shutdown.
void MDCache::upkeep_main(void)
{
  std::unique_lock lock(upkeep_mutex);
  while (!upkeep_trim_shutdown.load()) {
    auto now = clock::now();
    auto since = now-upkeep_last_trim;
    auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
    // fire slightly early (90% of the interval) so trims do not drift late
    if (since >= trim_interval*.90) {
      lock.unlock(); /* mds_lock -> upkeep_mutex */
      std::scoped_lock mds_lock(mds->mds_lock);
      lock.lock();
      // shutdown may have been requested while we were reacquiring locks
      if (upkeep_trim_shutdown.load())
        return;
      check_memory_usage();
      if (mds->is_cache_trimmable()) {
        dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
        bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
        if (active_with_clients) {
          trim_client_leases();
        }
        if (is_open()) {
          trim();
        }
        if (active_with_clients) {
          // when the cache is over target, also ask clients to trim caps
          auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          if (cache_toofull()) {
            recall_flags = recall_flags|Server::RecallFlags::TRIM;
          }
          mds->server->recall_client_state(nullptr, recall_flags);
        }
        upkeep_last_trim = now = clock::now();
      } else {
        dout(10) << "cache not ready for trimming" << dendl;
      }
    } else {
      // not due yet: shrink to the remaining time for the sleep below
      trim_interval -= since;
    }
    since = now-upkeep_last_release;
    auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
    if (since >= release_interval*.90) {
      /* XXX not necessary once MDCache uses PriorityCache */
      dout(10) << "releasing free memory" << dendl;
      ceph_heap_release_free_memory();
      upkeep_last_release = clock::now();
    } else {
      release_interval -= since;
    }
    // sleep until whichever deadline comes first
    auto interval = std::min(release_interval, trim_interval);
    dout(20) << "upkeep thread waiting interval " << interval << dendl;
    upkeep_cvar.wait_for(lock, interval);
  }
}