]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <ostream>
17 #include <string>
18 #include <string_view>
19 #include <map>
20
21 #include "MDCache.h"
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDLog.h"
26 #include "MDBalancer.h"
27 #include "Migrator.h"
28 #include "ScrubStack.h"
29
30 #include "SnapClient.h"
31
32 #include "MDSMap.h"
33
34 #include "CInode.h"
35 #include "CDir.h"
36
37 #include "Mutation.h"
38
39 #include "include/ceph_fs.h"
40 #include "include/filepath.h"
41 #include "include/util.h"
42
43 #include "messages/MClientCaps.h"
44
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
47
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
52
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
55
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/EPeerUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/EPurged.h"
63 #include "events/ESessions.h"
64
65 #include "InoTable.h"
66
67 #include "common/Timer.h"
68
69 #include "perfglue/heap_profiler.h"
70
71
72 #include "common/config.h"
73 #include "include/ceph_assert.h"
74
75 #define dout_context g_ceph_context
76 #define dout_subsys ceph_subsys_mds
77 #undef dout_prefix
78 #define dout_prefix _prefix(_dout, mds)
79
80 using namespace std;
81
82 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
83 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
84 }
85
86 set<int> SimpleLock::empty_gather_set;
87
88
89 /**
90 * All non-I/O contexts that require a reference
91 * to an MDCache instance descend from this.
92 */
93 class MDCacheContext : public virtual MDSContext {
94 protected:
95 MDCache *mdcache;
96 MDSRank *get_mds() override
97 {
98 ceph_assert(mdcache != NULL);
99 return mdcache->mds;
100 }
101 public:
102 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
103 };
104
105
106 /**
107 * Only for contexts called back from an I/O completion
108 *
109 * Note: duplication of members wrt MDCacheContext, because
110 * it'ls the lesser of two evils compared with introducing
111 * yet another piece of (multiple) inheritance.
112 */
113 class MDCacheIOContext : public virtual MDSIOContextBase {
114 protected:
115 MDCache *mdcache;
116 MDSRank *get_mds() override
117 {
118 ceph_assert(mdcache != NULL);
119 return mdcache->mds;
120 }
121 public:
122 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
123 MDSIOContextBase(track), mdcache(mdc_) {}
124 };
125
126 class MDCacheLogContext : public virtual MDSLogContextBase {
127 protected:
128 MDCache *mdcache;
129 MDSRank *get_mds() override
130 {
131 ceph_assert(mdcache != NULL);
132 return mdcache->mds;
133 }
134 public:
135 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
136 };
137
138 MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
139 mds(m),
140 open_file_table(m),
141 filer(m->objecter, m->finisher),
142 stray_manager(m, purge_queue_),
143 recovery_queue(m),
144 trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
145 {
146 migrator.reset(new Migrator(mds, this));
147
148 max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
149 (g_conf()->mds_dir_max_commit_size << 20) :
150 (0.9 *(g_conf()->osd_max_write_size << 20));
151
152 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
153 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
154 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
155
156 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
157 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
158 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
159
160 symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
161
162 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
163
164 bottom_lru.lru_set_midpoint(0);
165
166 decayrate.set_halflife(g_conf()->mds_decay_halflife);
167
168 upkeeper = std::thread(&MDCache::upkeep_main, this);
169 }
170
171 MDCache::~MDCache()
172 {
173 if (logger) {
174 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
175 }
176 if (upkeeper.joinable())
177 upkeeper.join();
178 }
179
180 void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
181 {
182 dout(20) << "config changes: " << changed << dendl;
183 if (changed.count("mds_cache_memory_limit"))
184 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
185 if (changed.count("mds_cache_reservation"))
186 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
187
188 bool ephemeral_pin_config_changed = false;
189 if (changed.count("mds_export_ephemeral_distributed")) {
190 export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
191 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
192 /* copy to vector to avoid removals during iteration */
193 ephemeral_pin_config_changed = true;
194 }
195 if (changed.count("mds_export_ephemeral_random")) {
196 export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
197 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
198 /* copy to vector to avoid removals during iteration */
199 ephemeral_pin_config_changed = true;
200 }
201 if (ephemeral_pin_config_changed) {
202 std::vector<CInode*> migrate;
203 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
204 for (auto& in : migrate) {
205 in->maybe_export_pin(true);
206 }
207 }
208 if (changed.count("mds_export_ephemeral_random_max")) {
209 export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
210 }
211 if (changed.count("mds_health_cache_threshold"))
212 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
213 if (changed.count("mds_cache_mid"))
214 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
215 if (changed.count("mds_cache_trim_decay_rate")) {
216 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
217 }
218 if (changed.count("mds_symlink_recovery")) {
219 symlink_recovery = g_conf().get_val<bool>("mds_symlink_recovery");
220 dout(10) << "Storing symlink targets on file object's head " << symlink_recovery << dendl;
221 }
222
223 migrator->handle_conf_change(changed, mdsmap);
224 mds->balancer->handle_conf_change(changed, mdsmap);
225 }
226
227 void MDCache::log_stat()
228 {
229 mds->logger->set(l_mds_inodes, lru.lru_get_size());
230 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
231 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
232 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
233 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
234 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
235 mds->logger->set(l_mds_caps, Capability::count());
236 if (root) {
237 mds->logger->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
238 mds->logger->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
239 mds->logger->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
240 }
241 }
242
243
244 //
245
246 bool MDCache::shutdown()
247 {
248 {
249 std::scoped_lock lock(upkeep_mutex);
250 upkeep_trim_shutdown = true;
251 upkeep_cvar.notify_one();
252 }
253 if (lru.lru_get_size() > 0) {
254 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
255 //show_cache();
256 show_subtrees();
257 //dump();
258 }
259 return true;
260 }
261
262
263 // ====================================================================
264 // some inode functions
265
266 void MDCache::add_inode(CInode *in)
267 {
268 // add to inode map
269 if (in->last == CEPH_NOSNAP) {
270 auto &p = inode_map[in->ino()];
271 ceph_assert(!p); // should be no dup inos!
272 p = in;
273 } else {
274 auto &p = snap_inode_map[in->vino()];
275 ceph_assert(!p); // should be no dup inos!
276 p = in;
277 }
278
279 if (in->ino() < MDS_INO_SYSTEM_BASE) {
280 if (in->ino() == CEPH_INO_ROOT)
281 root = in;
282 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
283 myin = in;
284 else if (in->is_stray()) {
285 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
286 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
287 }
288 }
289 if (in->is_base())
290 base_inodes.insert(in);
291 }
292 }
293
294 void MDCache::remove_inode(CInode *o)
295 {
296 dout(14) << "remove_inode " << *o << dendl;
297
298 if (o->get_parent_dn()) {
299 // FIXME: multiple parents?
300 CDentry *dn = o->get_parent_dn();
301 ceph_assert(!dn->is_dirty());
302 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
303 }
304
305 if (o->is_dirty())
306 o->mark_clean();
307 if (o->is_dirty_parent())
308 o->clear_dirty_parent();
309
310 o->clear_scatter_dirty();
311
312 o->clear_clientwriteable();
313
314 o->item_open_file.remove_myself();
315
316 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
317 export_pin_queue.erase(o);
318
319 if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
320 export_pin_delayed_queue.erase(o);
321
322 o->clear_ephemeral_pin(true, true);
323
324 // remove from inode map
325 if (o->last == CEPH_NOSNAP) {
326 inode_map.erase(o->ino());
327 } else {
328 o->item_caps.remove_myself();
329 snap_inode_map.erase(o->vino());
330 }
331
332 if (o->ino() < MDS_INO_SYSTEM_BASE) {
333 if (o == root) root = 0;
334 if (o == myin) myin = 0;
335 if (o->is_stray()) {
336 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
337 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
338 }
339 }
340 if (o->is_base())
341 base_inodes.erase(o);
342 }
343
344 // delete it
345 ceph_assert(o->get_num_ref() == 0);
346 delete o;
347 }
348
349 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
350 {
351 file_layout_t result = file_layout_t::get_default();
352 result.pool_id = mdsmap.get_first_data_pool();
353 return result;
354 }
355
356 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
357 {
358 file_layout_t result = file_layout_t::get_default();
359 result.pool_id = mdsmap.get_metadata_pool();
360 if (g_conf()->mds_log_segment_size > 0) {
361 result.object_size = g_conf()->mds_log_segment_size;
362 result.stripe_unit = g_conf()->mds_log_segment_size;
363 }
364 return result;
365 }
366
367 void MDCache::init_layouts()
368 {
369 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
370 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
371 }
372
373 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
374 {
375 auto _inode = in->_get_inode();
376 _inode->ino = ino;
377 _inode->version = 1;
378 _inode->xattr_version = 1;
379 _inode->mode = 0500 | mode;
380 _inode->size = 0;
381 _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
382 _inode->nlink = 1;
383 _inode->truncate_size = -1ull;
384 _inode->change_attr = 0;
385 _inode->export_pin = MDS_RANK_NONE;
386
387 // FIPS zeroization audit 20191117: this memset is not security related.
388 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
389 if (_inode->is_dir()) {
390 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
391 _inode->rstat.rsubdirs = 1; /* itself */
392 _inode->rstat.rctime = in->get_inode()->ctime;
393 } else {
394 _inode->layout = default_file_layout;
395 ++_inode->rstat.rfiles;
396 }
397 _inode->accounted_rstat = _inode->rstat;
398
399 if (in->is_base()) {
400 if (in->is_root())
401 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
402 else
403 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
404 in->open_snaprealm(); // empty snaprealm
405 ceph_assert(!in->snaprealm->parent); // created its own
406 in->snaprealm->srnode.seq = 1;
407 }
408 }
409
410 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
411 {
412 dout(0) << "creating system inode with ino:" << ino << dendl;
413 CInode *in = new CInode(this);
414 create_unlinked_system_inode(in, ino, mode);
415 add_inode(in);
416 return in;
417 }
418
419 CInode *MDCache::create_root_inode()
420 {
421 CInode *in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
422 auto _inode = in->_get_inode();
423 _inode->uid = g_conf()->mds_root_ino_uid;
424 _inode->gid = g_conf()->mds_root_ino_gid;
425 _inode->layout = default_file_layout;
426 _inode->layout.pool_id = mds->mdsmap->get_first_data_pool();
427 return in;
428 }
429
430 void MDCache::create_empty_hierarchy(MDSGather *gather)
431 {
432 // create root dir
433 CInode *root = create_root_inode();
434
435 // force empty root dir
436 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
437 adjust_subtree_auth(rootdir, mds->get_nodeid());
438 rootdir->dir_rep = CDir::REP_ALL; //NONE;
439
440 ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
441 ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
442 ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
443 /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
444 * assume version 0 is stale/invalid.
445 */
446
447 rootdir->mark_complete();
448 rootdir->_get_fnode()->version = rootdir->pre_dirty();
449 rootdir->mark_dirty(mds->mdlog->get_current_segment());
450 rootdir->commit(0, gather->new_sub());
451
452 root->store(gather->new_sub());
453 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
454 root->store_backtrace(gather->new_sub());
455 }
456
457 void MDCache::create_mydir_hierarchy(MDSGather *gather)
458 {
459 // create mds dir
460 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
461
462 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
463 auto mydir_fnode = mydir->_get_fnode();
464
465 adjust_subtree_auth(mydir, mds->get_nodeid());
466
467 LogSegment *ls = mds->mdlog->get_current_segment();
468
469 // stray dir
470 for (int i = 0; i < NUM_STRAY; ++i) {
471 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
472 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
473 CachedStackStringStream css;
474 *css << "stray" << i;
475 CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
476 sdn->_mark_dirty(mds->mdlog->get_current_segment());
477
478 stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;
479
480 mydir_fnode->rstat.add(stray->get_inode()->rstat);
481 mydir_fnode->fragstat.nsubdirs++;
482 // save them
483 straydir->mark_complete();
484 straydir->_get_fnode()->version = straydir->pre_dirty();
485 straydir->mark_dirty(ls);
486 straydir->commit(0, gather->new_sub());
487 stray->mark_dirty_parent(ls, true);
488 stray->store_backtrace(gather->new_sub());
489 }
490
491 mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
492 mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;
493
494 auto inode = myin->_get_inode();
495 inode->dirstat = mydir->get_fnode()->fragstat;
496 inode->rstat = mydir->get_fnode()->rstat;
497 ++inode->rstat.rsubdirs;
498 inode->accounted_rstat = inode->rstat;
499
500 mydir->mark_complete();
501 mydir_fnode->version = mydir->pre_dirty();
502 mydir->mark_dirty(ls);
503 mydir->commit(0, gather->new_sub());
504
505 myin->store(gather->new_sub());
506 }
507
508 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
509 MutationRef mut;
510 CDentry *dn;
511 version_t dpv;
512 MDSContext *fin;
513 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
514 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
515 void finish(int r) override {
516 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
517 }
518 };
519
520 void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
521 {
522 dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
523 CDentry *dn = dir->add_null_dentry(name);
524
525 dn->push_projected_linkage(in);
526 version_t dpv = dn->pre_dirty();
527
528 CDir *mdir = 0;
529 auto inode = in->_get_inode();
530 if (in->is_dir()) {
531 inode->rstat.rsubdirs = 1;
532
533 mdir = in->get_or_open_dirfrag(this, frag_t());
534 mdir->mark_complete();
535 mdir->_get_fnode()->version = mdir->pre_dirty();
536 } else {
537 inode->rstat.rfiles = 1;
538 }
539
540 inode->version = dn->pre_dirty();
541
542 SnapRealm *realm = dir->get_inode()->find_snaprealm();
543 dn->first = in->first = realm->get_newest_seq() + 1;
544
545 MutationRef mut(new MutationImpl());
546
547 // force some locks. hacky.
548 mds->locker->wrlock_force(&dir->inode->filelock, mut);
549 mds->locker->wrlock_force(&dir->inode->nestlock, mut);
550
551 mut->ls = mds->mdlog->get_current_segment();
552 EUpdate *le = new EUpdate(mds->mdlog, "create system file");
553 mds->mdlog->start_entry(le);
554
555 if (!in->is_mdsdir()) {
556 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
557 le->metablob.add_primary_dentry(dn, in, true);
558 } else {
559 predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
560 journal_dirty_inode(mut.get(), &le->metablob, in);
561 dn->push_projected_linkage(in->ino(), in->d_type());
562 le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
563 le->metablob.add_root(true, in);
564 }
565 if (mdir)
566 le->metablob.add_new_dir(mdir); // dirty AND complete AND new
567
568 mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
569 mds->mdlog->flush();
570 }
571
572 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
573 {
574 dout(10) << "_create_system_file_finish " << *dn << dendl;
575
576 dn->pop_projected_linkage();
577 dn->mark_dirty(dpv, mut->ls);
578
579 CInode *in = dn->get_linkage()->get_inode();
580 in->mark_dirty(mut->ls);
581
582 if (in->is_dir()) {
583 CDir *dir = in->get_dirfrag(frag_t());
584 ceph_assert(dir);
585 dir->mark_dirty(mut->ls);
586 dir->mark_new(mut->ls);
587 }
588
589 mut->apply();
590 mds->locker->drop_locks(mut.get());
591 mut->cleanup();
592
593 fin->complete(0);
594
595 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
596 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
597 }
598
599
600
601 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
602 MDCache *cache;
603 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
604 void finish(int r) override {
605 if (r < 0) {
606 // If we can't open root, something disastrous has happened: mark
607 // this rank damaged for operator intervention. Note that
608 // it is not okay to call suicide() here because we are in
609 // a Finisher callback.
610 cache->mds->damaged();
611 ceph_abort(); // damaged should never return
612 } else {
613 cache->open_root();
614 }
615 }
616 };
617
618 void MDCache::open_root_inode(MDSContext *c)
619 {
620 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
621 CInode *in;
622 in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
623 in->fetch(c);
624 } else {
625 discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
626 }
627 }
628
629 void MDCache::open_mydir_inode(MDSContext *c)
630 {
631 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
632 in->fetch(c);
633 }
634
635 void MDCache::open_mydir_frag(MDSContext *c)
636 {
637 open_mydir_inode(
638 new MDSInternalContextWrapper(mds,
639 new LambdaContext([this, c](int r) {
640 if (r < 0) {
641 c->complete(r);
642 return;
643 }
644 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
645 ceph_assert(mydir);
646 adjust_subtree_auth(mydir, mds->get_nodeid());
647 mydir->fetch(c);
648 })
649 )
650 );
651 }
652
653 void MDCache::open_root()
654 {
655 dout(10) << "open_root" << dendl;
656
657 if (!root) {
658 open_root_inode(new C_MDS_RetryOpenRoot(this));
659 return;
660 }
661 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
662 ceph_assert(root->is_auth());
663 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
664 ceph_assert(rootdir);
665 if (!rootdir->is_subtree_root())
666 adjust_subtree_auth(rootdir, mds->get_nodeid());
667 if (!rootdir->is_complete()) {
668 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
669 return;
670 }
671 } else {
672 ceph_assert(!root->is_auth());
673 CDir *rootdir = root->get_dirfrag(frag_t());
674 if (!rootdir) {
675 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
676 return;
677 }
678 }
679
680 if (!myin) {
681 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
682 in->fetch(new C_MDS_RetryOpenRoot(this));
683 return;
684 }
685 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
686 ceph_assert(mydir);
687 adjust_subtree_auth(mydir, mds->get_nodeid());
688
689 populate_mydir();
690 }
691
692 void MDCache::advance_stray() {
693 // check whether the directory has been fragmented
694 if (stray_fragmenting_index >= 0) {
695 auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
696 bool any_fragmenting = false;
697 for (const auto& dir : dfs) {
698 if (dir->state_test(CDir::STATE_FRAGMENTING) ||
699 mds->balancer->is_fragment_pending(dir->dirfrag())) {
700 any_fragmenting = true;
701 break;
702 }
703 }
704 if (!any_fragmenting)
705 stray_fragmenting_index = -1;
706 }
707
708 for (int i = 1; i < NUM_STRAY; i++){
709 stray_index = (stray_index + i) % NUM_STRAY;
710 if (stray_index != stray_fragmenting_index)
711 break;
712 }
713
714 if (stray_fragmenting_index == -1 && is_open()) {
715 // Fragment later stray dir in advance. We don't choose past
716 // stray dir because in-flight requests may still use it.
717 stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
718 auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
719 bool any_fragmenting = false;
720 for (const auto& dir : dfs) {
721 if (dir->should_split()) {
722 mds->balancer->queue_split(dir, true);
723 any_fragmenting = true;
724 } else if (dir->should_merge()) {
725 mds->balancer->queue_merge(dir);
726 any_fragmenting = true;
727 }
728 }
729 if (!any_fragmenting)
730 stray_fragmenting_index = -1;
731 }
732
733 dout(10) << "advance_stray to index " << stray_index
734 << " fragmenting index " << stray_fragmenting_index << dendl;
735 }
736
737 void MDCache::populate_mydir()
738 {
739 ceph_assert(myin);
740 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
741 ceph_assert(mydir);
742
743 dout(10) << "populate_mydir " << *mydir << dendl;
744
745 if (!mydir->is_complete()) {
746 mydir->fetch(new C_MDS_RetryOpenRoot(this));
747 return;
748 }
749
750 if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
751 // A missing dirfrag, we will recreate it. Before that, we must dirty
752 // it before dirtying any of the strays we create within it.
753 mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
754 "recreating it now";
755 LogSegment *ls = mds->mdlog->get_current_segment();
756 mydir->state_clear(CDir::STATE_BADFRAG);
757 mydir->mark_complete();
758 mydir->_get_fnode()->version = mydir->pre_dirty();
759 mydir->mark_dirty(ls);
760 }
761
762 // open or create stray
763 uint64_t num_strays = 0;
764 for (int i = 0; i < NUM_STRAY; ++i) {
765 CachedStackStringStream css;
766 *css << "stray" << i;
767 CDentry *straydn = mydir->lookup(css->str());
768
769 // allow for older fs's with stray instead of stray0
770 if (straydn == NULL && i == 0)
771 straydn = mydir->lookup("stray");
772
773 if (!straydn || !straydn->get_linkage()->get_inode()) {
774 _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
775 new C_MDS_RetryOpenRoot(this));
776 return;
777 }
778 ceph_assert(straydn);
779 ceph_assert(strays[i]);
780 // we make multiple passes through this method; make sure we only pin each stray once.
781 if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
782 strays[i]->get(CInode::PIN_STRAY);
783 strays[i]->state_set(CInode::STATE_STRAYPINNED);
784 strays[i]->get_stickydirs();
785 }
786 dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
787
788 // open all frags
789 frag_vec_t leaves;
790 strays[i]->dirfragtree.get_leaves(leaves);
791 for (const auto& leaf : leaves) {
792 CDir *dir = strays[i]->get_dirfrag(leaf);
793 if (!dir) {
794 dir = strays[i]->get_or_open_dirfrag(this, leaf);
795 }
796
797 // DamageTable applies special handling to strays: it will
798 // have damaged() us out if one is damaged.
799 ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));
800
801 if (dir->get_version() == 0) {
802 dir->fetch(new C_MDS_RetryOpenRoot(this));
803 return;
804 }
805
806 if (dir->get_frag_size() > 0)
807 num_strays += dir->get_frag_size();
808 }
809 }
810
811 // okay!
812 dout(10) << "populate_mydir done" << dendl;
813 ceph_assert(!open);
814 open = true;
815 mds->queue_waiters(waiting_for_open);
816
817 stray_manager.set_num_strays(num_strays);
818 stray_manager.activate();
819
820 scan_stray_dir();
821 }
822
823 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
824 {
825 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
826 }
827
828 CDir *MDCache::get_stray_dir(CInode *in)
829 {
830 string straydname;
831 in->name_stray_dentry(straydname);
832
833 CInode *strayi = get_stray();
834 ceph_assert(strayi);
835 frag_t fg = strayi->pick_dirfrag(straydname);
836 CDir *straydir = strayi->get_dirfrag(fg);
837 ceph_assert(straydir);
838 return straydir;
839 }
840
841 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
842 {
843 // inode?
844 if (info.ino)
845 return get_inode(info.ino, info.snapid);
846
847 // dir or dentry.
848 CDir *dir = get_dirfrag(info.dirfrag);
849 if (!dir) return 0;
850
851 if (info.dname.length())
852 return dir->lookup(info.dname, info.snapid);
853 else
854 return dir;
855 }
856
857
858 // ====================================================================
859 // consistent hash ring
860
861 /*
862 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
863 */
864 mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
865 {
866 const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
867 uint64_t hash = rjhash64(ino);
868 if (fg)
869 hash = rjhash64(hash + rjhash64(fg.value()));
870
871 int64_t b = -1, j = 0;
872 while (j < max_mds) {
873 b = j;
874 hash = hash*2862933555777941757ULL + 1;
875 j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
876 }
877 // verify bounds before returning
878 auto result = mds_rank_t(b);
879 ceph_assert(result >= 0 && result < max_mds);
880 return result;
881 }
882
883
884 // ====================================================================
885 // subtree management
886
887 /*
888 * adjust the dir_auth of a subtree.
889 * merge with parent and/or child subtrees, if is it appropriate.
890 * merge can ONLY happen if both parent and child have unambiguous auth.
891 */
892 void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
893 {
894 dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
895 << " on " << *dir << dendl;
896
897 show_subtrees();
898
899 CDir *root;
900 if (dir->inode->is_base()) {
901 root = dir; // bootstrap hack.
902 if (subtrees.count(root) == 0) {
903 subtrees[root];
904 root->get(CDir::PIN_SUBTREE);
905 }
906 } else {
907 root = get_subtree_root(dir); // subtree root
908 }
909 ceph_assert(root);
910 ceph_assert(subtrees.count(root));
911 dout(7) << " current root is " << *root << dendl;
912
913 if (root == dir) {
914 // i am already a subtree.
915 dir->set_dir_auth(auth);
916 } else {
917 // i am a new subtree.
918 dout(10) << " new subtree at " << *dir << dendl;
919 ceph_assert(subtrees.count(dir) == 0);
920 subtrees[dir]; // create empty subtree bounds list for me.
921 dir->get(CDir::PIN_SUBTREE);
922
923 // set dir_auth
924 dir->set_dir_auth(auth);
925
926 // move items nested beneath me, under me.
927 set<CDir*>::iterator p = subtrees[root].begin();
928 while (p != subtrees[root].end()) {
929 set<CDir*>::iterator next = p;
930 ++next;
931 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
932 // move under me
933 dout(10) << " claiming child bound " << **p << dendl;
934 subtrees[dir].insert(*p);
935 subtrees[root].erase(p);
936 }
937 p = next;
938 }
939
940 // i am a bound of the parent subtree.
941 subtrees[root].insert(dir);
942
943 // i am now the subtree root.
944 root = dir;
945
946 // adjust recursive pop counters
947 if (adjust_pop && dir->is_auth()) {
948 CDir *p = dir->get_parent_dir();
949 while (p) {
950 p->pop_auth_subtree.sub(dir->pop_auth_subtree);
951 if (p->is_subtree_root()) break;
952 p = p->inode->get_parent_dir();
953 }
954 }
955 }
956
957 show_subtrees();
958 }
959
960
961 void MDCache::try_subtree_merge(CDir *dir)
962 {
963 dout(7) << "try_subtree_merge " << *dir << dendl;
964 // record my old bounds
965 auto oldbounds = subtrees.at(dir);
966
967 set<CInode*> to_eval;
968 // try merge at my root
969 try_subtree_merge_at(dir, &to_eval);
970
971 // try merge at my old bounds
972 for (auto bound : oldbounds)
973 try_subtree_merge_at(bound, &to_eval);
974
975 if (!(mds->is_any_replay() || mds->is_resolve())) {
976 for(auto in : to_eval)
977 eval_subtree_root(in);
978 }
979 }
980
981 void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
982 {
983 dout(10) << "try_subtree_merge_at " << *dir << dendl;
984
985 if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
986 dir->state_test(CDir::STATE_EXPORTBOUND) ||
987 dir->state_test(CDir::STATE_AUXSUBTREE))
988 return;
989
990 auto it = subtrees.find(dir);
991 ceph_assert(it != subtrees.end());
992
993 // merge with parent?
994 CDir *parent = dir;
995 if (!dir->inode->is_base())
996 parent = get_subtree_root(dir->get_parent_dir());
997
998 if (parent != dir && // we have a parent,
999 parent->dir_auth == dir->dir_auth) { // auth matches,
1000 // merge with parent.
1001 dout(10) << " subtree merge at " << *dir << dendl;
1002 dir->set_dir_auth(CDIR_AUTH_DEFAULT);
1003
1004 // move our bounds under the parent
1005 subtrees[parent].insert(it->second.begin(), it->second.end());
1006
1007 // we are no longer a subtree or bound
1008 dir->put(CDir::PIN_SUBTREE);
1009 subtrees.erase(it);
1010 subtrees[parent].erase(dir);
1011
1012 // adjust popularity?
1013 if (adjust_pop && dir->is_auth()) {
1014 CDir *cur = dir;
1015 CDir *p = dir->get_parent_dir();
1016 while (p) {
1017 p->pop_auth_subtree.add(dir->pop_auth_subtree);
1018 p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
1019 if (p->is_subtree_root()) break;
1020 cur = p;
1021 p = p->inode->get_parent_dir();
1022 }
1023 }
1024
1025 if (to_eval && dir->get_inode()->is_auth())
1026 to_eval->insert(dir->get_inode());
1027
1028 show_subtrees(15);
1029 }
1030 }
1031
1032 void MDCache::eval_subtree_root(CInode *diri)
1033 {
1034 // evaluate subtree inode filelock?
1035 // (we should scatter the filelock on subtree bounds)
1036 ceph_assert(diri->is_auth());
1037 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1038 }
1039
1040
1041 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
1042 {
1043 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1044 << " on " << *dir
1045 << " bounds " << bounds
1046 << dendl;
1047
1048 show_subtrees();
1049
1050 CDir *root;
1051 if (dir->ino() == CEPH_INO_ROOT) {
1052 root = dir; // bootstrap hack.
1053 if (subtrees.count(root) == 0) {
1054 subtrees[root];
1055 root->get(CDir::PIN_SUBTREE);
1056 }
1057 } else {
1058 root = get_subtree_root(dir); // subtree root
1059 }
1060 ceph_assert(root);
1061 ceph_assert(subtrees.count(root));
1062 dout(7) << " current root is " << *root << dendl;
1063
1064 mds_authority_t oldauth = dir->authority();
1065
1066 if (root == dir) {
1067 // i am already a subtree.
1068 dir->set_dir_auth(auth);
1069 } else {
1070 // i am a new subtree.
1071 dout(10) << " new subtree at " << *dir << dendl;
1072 ceph_assert(subtrees.count(dir) == 0);
1073 subtrees[dir]; // create empty subtree bounds list for me.
1074 dir->get(CDir::PIN_SUBTREE);
1075
1076 // set dir_auth
1077 dir->set_dir_auth(auth);
1078
1079 // move items nested beneath me, under me.
1080 set<CDir*>::iterator p = subtrees[root].begin();
1081 while (p != subtrees[root].end()) {
1082 set<CDir*>::iterator next = p;
1083 ++next;
1084 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1085 // move under me
1086 dout(10) << " claiming child bound " << **p << dendl;
1087 subtrees[dir].insert(*p);
1088 subtrees[root].erase(p);
1089 }
1090 p = next;
1091 }
1092
1093 // i am a bound of the parent subtree.
1094 subtrees[root].insert(dir);
1095
1096 // i am now the subtree root.
1097 root = dir;
1098 }
1099
1100 set<CInode*> to_eval;
1101
1102 // verify/adjust bounds.
1103 // - these may be new, or
1104 // - beneath existing ambiguous bounds (which will be collapsed),
1105 // - but NOT beneath unambiguous bounds.
1106 for (const auto& bound : bounds) {
1107 // new bound?
1108 if (subtrees[dir].count(bound) == 0) {
1109 if (get_subtree_root(bound) == dir) {
1110 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1111 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1112 }
1113 else {
1114 dout(10) << " want bound " << *bound << dendl;
1115 CDir *t = get_subtree_root(bound->get_parent_dir());
1116 if (subtrees[t].count(bound) == 0) {
1117 ceph_assert(t != dir);
1118 dout(10) << " new bound " << *bound << dendl;
1119 adjust_subtree_auth(bound, t->authority());
1120 }
1121 // make sure it's nested beneath ambiguous subtree(s)
1122 while (1) {
1123 while (subtrees[dir].count(t) == 0)
1124 t = get_subtree_root(t->get_parent_dir());
1125 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1126 adjust_subtree_auth(t, auth);
1127 try_subtree_merge_at(t, &to_eval);
1128 t = get_subtree_root(bound->get_parent_dir());
1129 if (t == dir) break;
1130 }
1131 }
1132 }
1133 else {
1134 dout(10) << " already have bound " << *bound << dendl;
1135 }
1136 }
1137 // merge stray bounds?
1138 while (!subtrees[dir].empty()) {
1139 set<CDir*> copy = subtrees[dir];
1140 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1141 if (bounds.count(*p) == 0) {
1142 CDir *stray = *p;
1143 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1144 adjust_subtree_auth(stray, auth);
1145 try_subtree_merge_at(stray, &to_eval);
1146 }
1147 }
1148 // swallowing subtree may add new subtree bounds
1149 if (copy == subtrees[dir])
1150 break;
1151 }
1152
1153 // bound should now match.
1154 verify_subtree_bounds(dir, bounds);
1155
1156 show_subtrees();
1157
1158 if (!(mds->is_any_replay() || mds->is_resolve())) {
1159 for(auto in : to_eval)
1160 eval_subtree_root(in);
1161 }
1162 }
1163
1164
1165 /*
1166 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1167 * fragmentation as necessary to get an equivalent bounding set. That is, only
1168 * split if one of our frags spans the provided bounding set. Never merge.
1169 */
1170 void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1171 {
1172 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1173
1174 // sort by ino
1175 map<inodeno_t, fragset_t> byino;
1176 for (auto& frag : dfs) {
1177 byino[frag.ino].insert_raw(frag.frag);
1178 }
1179 dout(10) << " by ino: " << byino << dendl;
1180
1181 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1182 p->second.simplify();
1183 CInode *diri = get_inode(p->first);
1184 if (!diri)
1185 continue;
1186 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1187
1188 fragtree_t tmpdft;
1189 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1190 tmpdft.force_to_leaf(g_ceph_context, *q);
1191
1192 for (const auto& fg : p->second) {
1193 frag_vec_t leaves;
1194 diri->dirfragtree.get_leaves_under(fg, leaves);
1195 if (leaves.empty()) {
1196 frag_t approx_fg = diri->dirfragtree[fg.value()];
1197 frag_vec_t approx_leaves;
1198 tmpdft.get_leaves_under(approx_fg, approx_leaves);
1199 for (const auto& leaf : approx_leaves) {
1200 if (p->second.get().count(leaf) == 0) {
1201 // not bound, so the resolve message is from auth MDS of the dirfrag
1202 force_dir_fragment(diri, leaf);
1203 }
1204 }
1205 }
1206
1207 auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
1208 for (const auto& sib : sibs)
1209 bounds.insert(sib);
1210 }
1211 }
1212 }
1213
1214 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1215 {
1216 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1217 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1218
1219 set<CDir*> bounds;
1220 get_force_dirfrag_bound_set(bound_dfs, bounds);
1221 adjust_bounded_subtree_auth(dir, bounds, auth);
1222 }
1223
1224 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1225 {
1226 dout(10) << "map_dirfrag_set " << dfs << dendl;
1227
1228 // group by inode
1229 map<inodeno_t, fragset_t> ino_fragset;
1230 for (const auto &df : dfs) {
1231 ino_fragset[df.ino].insert_raw(df.frag);
1232 }
1233 // get frags
1234 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1235 p != ino_fragset.end();
1236 ++p) {
1237 p->second.simplify();
1238 CInode *in = get_inode(p->first);
1239 if (!in)
1240 continue;
1241
1242 frag_vec_t fgs;
1243 for (const auto& fg : p->second) {
1244 in->dirfragtree.get_leaves_under(fg, fgs);
1245 }
1246
1247 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1248 << " on " << *in << dendl;
1249
1250 for (const auto& fg : fgs) {
1251 CDir *dir = in->get_dirfrag(fg);
1252 if (dir)
1253 result.insert(dir);
1254 }
1255 }
1256 }
1257
1258
1259
1260 CDir *MDCache::get_subtree_root(CDir *dir)
1261 {
1262 // find the underlying dir that delegates (or is about to delegate) auth
1263 while (true) {
1264 if (dir->is_subtree_root())
1265 return dir;
1266 dir = dir->get_inode()->get_parent_dir();
1267 if (!dir)
1268 return 0; // none
1269 }
1270 }
1271
1272 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1273 {
1274 // find the underlying dir that delegates (or is about to delegate) auth
1275 while (true) {
1276 if (dir->is_subtree_root())
1277 return dir;
1278 dir = dir->get_inode()->get_projected_parent_dir();
1279 if (!dir)
1280 return 0; // none
1281 }
1282 }
1283
1284 void MDCache::remove_subtree(CDir *dir)
1285 {
1286 dout(10) << "remove_subtree " << *dir << dendl;
1287 auto it = subtrees.find(dir);
1288 ceph_assert(it != subtrees.end());
1289 subtrees.erase(it);
1290 dir->put(CDir::PIN_SUBTREE);
1291 if (dir->get_parent_dir()) {
1292 CDir *p = get_subtree_root(dir->get_parent_dir());
1293 auto it = subtrees.find(p);
1294 ceph_assert(it != subtrees.end());
1295 auto count = it->second.erase(dir);
1296 ceph_assert(count == 1);
1297 }
1298 }
1299
1300 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1301 {
1302 ceph_assert(subtrees.count(dir));
1303 bounds = subtrees[dir];
1304 }
1305
1306 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1307 {
1308 if (subtrees.count(dir)) {
1309 // just copy them, dir is a subtree.
1310 get_subtree_bounds(dir, bounds);
1311 } else {
1312 // find them
1313 CDir *root = get_subtree_root(dir);
1314 for (set<CDir*>::iterator p = subtrees[root].begin();
1315 p != subtrees[root].end();
1316 ++p) {
1317 CDir *t = *p;
1318 while (t != root) {
1319 t = t->get_parent_dir();
1320 ceph_assert(t);
1321 if (t == dir) {
1322 bounds.insert(*p);
1323 continue;
1324 }
1325 }
1326 }
1327 }
1328 }
1329
1330 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1331 {
1332 // for debugging only.
1333 ceph_assert(subtrees.count(dir));
1334 if (bounds != subtrees[dir]) {
1335 dout(0) << "verify_subtree_bounds failed" << dendl;
1336 set<CDir*> b = bounds;
1337 for (auto &cd : subtrees[dir]) {
1338 if (bounds.count(cd)) {
1339 b.erase(cd);
1340 continue;
1341 }
1342 dout(0) << " missing bound " << *cd << dendl;
1343 }
1344 for (const auto &cd : b)
1345 dout(0) << " extra bound " << *cd << dendl;
1346 }
1347 ceph_assert(bounds == subtrees[dir]);
1348 }
1349
1350 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1351 {
1352 // for debugging only.
1353 ceph_assert(subtrees.count(dir));
1354
1355 // make sure that any bounds i do have are properly noted as such.
1356 int failed = 0;
1357 for (const auto &fg : bounds) {
1358 CDir *bd = get_dirfrag(fg);
1359 if (!bd) continue;
1360 if (subtrees[dir].count(bd) == 0) {
1361 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1362 failed++;
1363 }
1364 }
1365 ceph_assert(failed == 0);
1366 }
1367
1368 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1369 {
1370 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1371 << " to " << *newdir << dendl;
1372 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1373 }
1374
1375 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1376 {
1377 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1378
1379 CDir *newdir = diri->get_parent_dir();
1380
1381 if (pop) {
1382 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1383 ceph_assert(p != projected_subtree_renames.end());
1384 ceph_assert(!p->second.empty());
1385 ceph_assert(p->second.front().first == olddir);
1386 ceph_assert(p->second.front().second == newdir);
1387 p->second.pop_front();
1388 if (p->second.empty())
1389 projected_subtree_renames.erase(p);
1390 }
1391
1392 // adjust total auth pin of freezing subtree
1393 if (olddir != newdir) {
1394 auto&& dfls = diri->get_nested_dirfrags();
1395 for (const auto& dir : dfls)
1396 olddir->adjust_freeze_after_rename(dir);
1397 }
1398
1399 // adjust subtree
1400 // N.B. make sure subtree dirfrags are at the front of the list
1401 auto dfls = diri->get_subtree_dirfrags();
1402 diri->get_nested_dirfrags(dfls);
1403 for (const auto& dir : dfls) {
1404 dout(10) << "dirfrag " << *dir << dendl;
1405 CDir *oldparent = get_subtree_root(olddir);
1406 dout(10) << " old parent " << *oldparent << dendl;
1407 CDir *newparent = get_subtree_root(newdir);
1408 dout(10) << " new parent " << *newparent << dendl;
1409
1410 auto& oldbounds = subtrees[oldparent];
1411 auto& newbounds = subtrees[newparent];
1412
1413 if (olddir != newdir)
1414 mds->balancer->adjust_pop_for_rename(olddir, dir, false);
1415
1416 if (oldparent == newparent) {
1417 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1418 } else if (dir->is_subtree_root()) {
1419 // children are fine. change parent.
1420 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1421 {
1422 auto n = oldbounds.erase(dir);
1423 ceph_assert(n == 1);
1424 }
1425 newbounds.insert(dir);
1426 // caller is responsible for 'eval diri'
1427 try_subtree_merge_at(dir, NULL, false);
1428 } else {
1429 // mid-subtree.
1430
1431 // see if any old bounds move to the new parent.
1432 std::vector<CDir*> tomove;
1433 for (const auto& bound : oldbounds) {
1434 CDir *broot = get_subtree_root(bound->get_parent_dir());
1435 if (broot != oldparent) {
1436 ceph_assert(broot == newparent);
1437 tomove.push_back(bound);
1438 }
1439 }
1440 for (const auto& bound : tomove) {
1441 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1442 oldbounds.erase(bound);
1443 newbounds.insert(bound);
1444 }
1445
1446 // did auth change?
1447 if (oldparent->authority() != newparent->authority()) {
1448 adjust_subtree_auth(dir, oldparent->authority(), false);
1449 // caller is responsible for 'eval diri'
1450 try_subtree_merge_at(dir, NULL, false);
1451 }
1452 }
1453
1454 if (olddir != newdir)
1455 mds->balancer->adjust_pop_for_rename(newdir, dir, true);
1456 }
1457
1458 show_subtrees();
1459 }
1460
1461 // ===================================
1462 // journal and snap/cow helpers
1463
1464
1465 /*
1466 * find first inode in cache that follows given snapid. otherwise, return current.
1467 */
1468 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1469 {
1470 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1471 ceph_assert(in->last == CEPH_NOSNAP);
1472
1473 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1474 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1475 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1476 in = p->second;
1477 }
1478
1479 return in;
1480 }
1481
1482
1483 /*
1484 * note: i'm currently cheating wrt dirty and inode.version on cow
1485 * items. instead of doing a full dir predirty, i just take the
1486 * original item's version, and set the dirty flag (via
1487 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1488 * means a special case in the dir commit clean sweep assertions.
1489 * bah.
1490 */
1491 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1492 {
1493 ceph_assert(last >= in->first);
1494
1495 CInode *oldin = new CInode(this, true, in->first, last);
1496 auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode());
1497 _inode->trim_client_ranges(last);
1498 oldin->reset_inode(std::move(_inode));
1499 auto _xattrs = in->get_previous_projected_xattrs();
1500 oldin->reset_xattrs(std::move(_xattrs));
1501
1502 oldin->symlink = in->symlink;
1503
1504 if (in->first < in->oldest_snap)
1505 in->oldest_snap = in->first;
1506
1507 in->first = last+1;
1508
1509 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1510 add_inode(oldin);
1511
1512 if (in->last != CEPH_NOSNAP) {
1513 CInode *head_in = get_inode(in->ino());
1514 ceph_assert(head_in);
1515 auto ret = head_in->split_need_snapflush(oldin, in);
1516 if (ret.first) {
1517 oldin->client_snap_caps = in->client_snap_caps;
1518 if (!oldin->client_snap_caps.empty()) {
1519 for (int i = 0; i < num_cinode_locks; i++) {
1520 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1521 ceph_assert(lock);
1522 if (lock->get_state() != LOCK_SNAP_SYNC) {
1523 ceph_assert(lock->is_stable());
1524 lock->set_state(LOCK_SNAP_SYNC); // gathering
1525 oldin->auth_pin(lock);
1526 }
1527 lock->get_wrlock(true);
1528 }
1529 }
1530 }
1531 if (!ret.second) {
1532 auto client_snap_caps = std::move(in->client_snap_caps);
1533 in->client_snap_caps.clear();
1534 in->item_open_file.remove_myself();
1535 in->item_caps.remove_myself();
1536
1537 if (!client_snap_caps.empty()) {
1538 MDSContext::vec finished;
1539 for (int i = 0; i < num_cinode_locks; i++) {
1540 SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
1541 ceph_assert(lock);
1542 ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
1543 lock->put_wrlock();
1544 if (!lock->get_num_wrlocks()) {
1545 lock->set_state(LOCK_SYNC);
1546 lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
1547 in->auth_unpin(lock);
1548 }
1549 }
1550 mds->queue_waiters(finished);
1551 }
1552 }
1553 return oldin;
1554 }
1555
1556 if (!in->client_caps.empty()) {
1557 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1558 // clone caps?
1559 for (auto &p : in->client_caps) {
1560 client_t client = p.first;
1561 Capability *cap = &p.second;
1562 int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
1563 if ((issued & CEPH_CAP_ANY_WR) &&
1564 cap->client_follows < last) {
1565 dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
1566 oldin->client_snap_caps.insert(client);
1567 cap->client_follows = last;
1568
1569 // we need snapflushes for any intervening snaps
1570 dout(10) << " snaps " << snaps << dendl;
1571 for (auto q = snaps.lower_bound(oldin->first);
1572 q != snaps.end() && *q <= last;
1573 ++q) {
1574 in->add_need_snapflush(oldin, *q, client);
1575 }
1576 } else {
1577 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1578 }
1579 }
1580
1581 if (!oldin->client_snap_caps.empty()) {
1582 for (int i = 0; i < num_cinode_locks; i++) {
1583 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1584 ceph_assert(lock);
1585 if (lock->get_state() != LOCK_SNAP_SYNC) {
1586 ceph_assert(lock->is_stable());
1587 lock->set_state(LOCK_SNAP_SYNC); // gathering
1588 oldin->auth_pin(lock);
1589 }
1590 lock->get_wrlock(true);
1591 }
1592 }
1593 }
1594 return oldin;
1595 }
1596
/*
 * Copy-on-write a dentry (and its primary inode, if any) for the snapshots
 * up to 'follows', recording the clones in 'metablob' and 'mut'.
 *
 * mut        - mutation that tracks the cloned dentry/inode
 * metablob   - journal blob that receives the cloned dentries
 * dn         - dentry to cow; a null pointer is logged and ignored.
 *              must be auth.
 * follows    - last snapid the old version must cover; CEPH_NOSNAP means
 *              "use the newest seq of the global snaprealm"
 * pcow_inode - optional; set to the inode clone when one is created by
 *              cow_inode() (the non-multiversion primary case only)
 * dnl        - linkage to use; when null, dn's projected linkage is used.
 *              must not be a null linkage.
 *
 * Two cases:
 *  - primary inode that is multiversion, or is in STATE_AMBIGUOUSAUTH
 *    (cow_head): the old version is kept via in->cow_old_inode().
 *  - everything else: an old dentry covering [oldfirst, follows] is added
 *    to the dir, linked either to a cow_inode() clone or to the same
 *    remote ino.
 * In both cases nothing is cloned if 'follows' precedes the current
 * 'first', or if the realm has no snaps in the affected range (then only
 * 'first' is advanced).
 */
1597 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1598 CDentry *dn, snapid_t follows,
1599 CInode **pcow_inode, CDentry::linkage_t *dnl)
1600 {
1601 if (!dn) {
1602 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1603 return;
1604 }
1605 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1606 ceph_assert(dn->is_auth());
1607
1608 // nothing to cow on a null dentry, fix caller
1609 if (!dnl)
1610 dnl = dn->get_projected_linkage();
1611 ceph_assert(!dnl->is_null());
1612
// only a primary linkage gives us an inode to cow
1613 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1614 bool cow_head = false;
1615 if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
1616 ceph_assert(in->is_frozen_inode());
1617 cow_head = true;
1618 }
1619 if (in && (in->is_multiversion() || cow_head)) {
1620 // multiversion inode.
1621 SnapRealm *realm = NULL;
1622
// dn is not the inode's projected parent dentry
1623 if (in->get_projected_parent_dn() != dn) {
1624 ceph_assert(follows == CEPH_NOSNAP);
1625 realm = dn->dir->inode->find_snaprealm();
1626 snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
1627 ceph_assert(dir_follows >= realm->get_newest_seq());
1628
1629 if (dir_follows+1 > dn->first) {
1630 snapid_t oldfirst = dn->first;
1631 dn->first = dir_follows+1;
1632 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
// keep a remote dentry for [oldfirst, dir_follows] pointing at the same ino
1633 CDir *dir = dn->dir;
1634 CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows);
1635 dout(10) << " olddn " << *olddn << dendl;
1636 ceph_assert(dir->is_projected());
1637 olddn->set_projected_version(dir->get_projected_version());
1638 metablob->add_remote_dentry(olddn, true);
1639 mut->add_cow_dentry(olddn);
1640 // FIXME: adjust link count here? hmm.
1641
1642 if (dir_follows+1 > in->first)
1643 in->cow_old_inode(dir_follows, cow_head);
1644 }
1645 }
1646
1647 follows = dir_follows;
// prefer the inode's own snaprealm for the checks below, if it has one
1648 if (in->snaprealm) {
1649 realm = in->snaprealm;
1650 ceph_assert(follows >= realm->get_newest_seq());
1651 }
1652 } else {
1653 realm = in->find_snaprealm();
1654 if (follows == CEPH_NOSNAP) {
1655 follows = get_global_snaprealm()->get_newest_seq();
1656 ceph_assert(follows >= realm->get_newest_seq());
1657 }
1658 }
1659
1660 // already cloned?
1661 if (follows < in->first) {
1662 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1663 return;
1664 }
1665
// no snapshot covers [in->first, follows]: just advance 'first'
1666 if (!realm->has_snaps_in_range(in->first, follows)) {
1667 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1668 in->first = follows + 1;
1669 return;
1670 }
1671
1672 in->cow_old_inode(follows, cow_head);
1673
1674 } else {
// not multiversion (or not primary): clone the dentry itself
1675 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1676 if (follows == CEPH_NOSNAP) {
1677 follows = get_global_snaprealm()->get_newest_seq();
1678 ceph_assert(follows >= realm->get_newest_seq());
1679 }
1680
1681 // already cloned?
1682 if (follows < dn->first) {
1683 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1684 return;
1685 }
1686
1687 // update dn.first before adding old dentry to cdir's map
1688 snapid_t oldfirst = dn->first;
1689 dn->first = follows+1;
1690
// no snapshot covers [oldfirst, follows]: advance the inode too and stop
1691 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1692 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1693 if (in)
1694 in->first = follows+1;
1695 return;
1696 }
1697
1698 dout(10) << " dn " << *dn << dendl;
1699 CDir *dir = dn->get_dir();
1700 ceph_assert(dir->is_projected());
1701
1702 if (in) {
// primary: clone the inode and link the clone under an old primary dentry
1703 CInode *oldin = cow_inode(in, follows);
1704 ceph_assert(in->is_projected());
1705 mut->add_cow_inode(oldin);
1706 if (pcow_inode)
1707 *pcow_inode = oldin;
1708 CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows);
1709 dout(10) << " olddn " << *olddn << dendl;
// the clone recorded client snap caps in cow_inode()
1710 bool need_snapflush = !oldin->client_snap_caps.empty();
1711 if (need_snapflush) {
1712 mut->ls->open_files.push_back(&oldin->item_open_file);
1713 mds->locker->mark_need_snapflush_inode(oldin);
1714 }
1715 olddn->set_projected_version(dir->get_projected_version());
1716 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1717 mut->add_cow_dentry(olddn);
1718 } else {
// remote: the old dentry refers to the same remote ino and d_type
1719 ceph_assert(dnl->is_remote());
1720 CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows);
1721 dout(10) << " olddn " << *olddn << dendl;
1722
1723 olddn->set_projected_version(dir->get_projected_version());
1724 metablob->add_remote_dentry(olddn, true);
1725 mut->add_cow_dentry(olddn);
1726 }
1727 }
1728 }
1729
1730 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1731 {
1732 if (in->is_base()) {
1733 metablob->add_root(true, in);
1734 } else {
1735 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1736 follows = in->first - 1;
1737 CDentry *dn = in->get_projected_parent_dn();
1738 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1739 journal_cow_dentry(mut, metablob, dn, follows);
1740 if (in->get_projected_inode()->is_backtrace_updated()) {
1741 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1742 in->get_previous_projected_inode()->layout.pool_id;
1743 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1744 } else {
1745 metablob->add_primary_dentry(dn, in, true);
1746 }
1747 }
1748 }
1749
1750
1751
1752 // nested ---------------------------------------------------------------
1753
1754 void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
1755 CInode *cur, CDir *parent, snapid_t first,
1756 int linkunlink, SnapRealm *prealm)
1757 {
1758 CDentry *parentdn = cur->get_projected_parent_dn();
1759
1760 if (cur->first > first)
1761 first = cur->first;
1762
1763 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1764 << " " << *cur << dendl;
1765 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1766 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1767
1768 /*
1769 * FIXME. this incompletely propagates rstats to _old_ parents
1770 * (i.e. shortly after a directory rename). but we need full
1771 * blown hard link backpointers to make this work properly...
1772 */
1773 snapid_t floor = parentdn->first;
1774 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1775
1776 if (!prealm)
1777 prealm = parent->inode->find_snaprealm();
1778 const set<snapid_t> snaps = prealm->get_snaps();
1779
1780 if (cur->last != CEPH_NOSNAP) {
1781 ceph_assert(cur->dirty_old_rstats.empty());
1782 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
1783 if (q == snaps.end() || *q > cur->last)
1784 return;
1785 }
1786
1787 if (cur->last >= floor) {
1788 bool update = true;
1789 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1790 // rename src inode is not projected in the peer rename prep case. so we should
1791 // avoid updateing the inode.
1792 ceph_assert(linkunlink < 0);
1793 ceph_assert(cur->is_frozen_inode());
1794 update = false;
1795 }
1796 // hacky
1797 const CInode::mempool_inode *pi;
1798 if (update && mut->is_projected(cur)) {
1799 pi = cur->_get_projected_inode();
1800 } else {
1801 pi = cur->get_projected_inode().get();
1802 if (update) {
1803 // new inode
1804 ceph_assert(pi->rstat == pi->accounted_rstat);
1805 update = false;
1806 }
1807 }
1808 _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent,
1809 linkunlink, update);
1810 }
1811
1812 if (g_conf()->mds_snap_rstat) {
1813 for (const auto &p : cur->dirty_old_rstats) {
1814 const auto &old = cur->get_old_inodes()->at(p);
1815 snapid_t ofirst = std::max(old.first, floor);
1816 auto it = snaps.lower_bound(ofirst);
1817 if (it == snaps.end() || *it > p)
1818 continue;
1819 if (p >= floor)
1820 _project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false);
1821 }
1822 }
1823 cur->dirty_old_rstats.clear();
1824 }
1825
1826
/*
 * Apply one inode version's rstat change to the parent dirfrag over the
 * snapid interval [ofirst, last].
 *
 * inode        - inode version supplying rstat / accounted_rstat
 * ofirst, last - snapid interval to update
 * parent       - dirfrag whose projected fnode / dirty_old_rstat is updated
 * linkunlink   - selects the delta:
 *                   0 : rstat - accounted_rstat
 *                 < 0 : -accounted_rstat
 *                 > 0 : +rstat
 * update_inode - when true, inode->accounted_rstat is set to inode->rstat
 *                afterwards (this writes through the const pointer)
 *
 * The interval is consumed from 'last' downwards, one segment per loop
 * iteration.  A segment ending at CEPH_NOSNAP goes to the projected fnode's
 * rstat; older segments go to entries of parent->dirty_old_rstat, which are
 * split as needed.  If mds_snap_rstat is off, non-head segments are
 * dropped.
 */
1827 void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
1828 CDir *parent, int linkunlink, bool update_inode)
1829 {
1830 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1831 dout(20) << " inode rstat " << inode->rstat << dendl;
1832 dout(20) << " inode accounted_rstat " << inode->accounted_rstat << dendl;
1833 nest_info_t delta;
1834 if (linkunlink == 0) {
1835 delta.add(inode->rstat);
1836 delta.sub(inode->accounted_rstat);
1837 } else if (linkunlink < 0) {
1838 delta.sub(inode->accounted_rstat);
1839 } else {
1840 delta.add(inode->rstat);
1841 }
1842 dout(20) << " delta " << delta << dendl;
1843
1844
1845 while (last >= ofirst) {
1846 /*
1847 * pick fnode version to update. at each iteration, we want to
1848 * pick a segment ending in 'last' to update. split as necessary
1849 * to make that work. then, adjust first up so that we only
1850 * update one segment at a time. then loop to cover the whole
1851 * [ofirst,last] interval.
1852 */
1853 nest_info_t *prstat;
1854 snapid_t first;
1855 auto pf = parent->_get_projected_fnode();
1856 if (last == CEPH_NOSNAP) {
// head segment: update the projected fnode itself
1857 if (g_conf()->mds_snap_rstat)
1858 first = std::max(ofirst, parent->first);
1859 else
1860 first = parent->first;
1861 prstat = &pf->rstat;
1862 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1863
// save the current head values as a dirty_old_rstat entry for
// [parent->first, first-1] before the head moves forward
1864 if (first > parent->first &&
1865 !(pf->rstat == pf->accounted_rstat)) {
1866 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1867 << parent->first << "," << (first-1) << "] "
1868 << " " << *prstat << "/" << pf->accounted_rstat
1869 << dendl;
1870 parent->dirty_old_rstat[first-1].first = parent->first;
1871 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1872 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1873 }
1874 parent->first = first;
1875 } else if (!g_conf()->mds_snap_rstat) {
1876 // drop snapshots' rstats
1877 break;
1878 } else if (last >= parent->first) {
// snapped segment overlapping the head range: create the entry
// [parent->first, last] seeded from the current head values
1879 first = parent->first;
1880 parent->dirty_old_rstat[last].first = first;
1881 parent->dirty_old_rstat[last].rstat = pf->rstat;
1882 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1883 prstat = &parent->dirty_old_rstat[last].rstat;
1884 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1885 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1886 } else {
1887 // be careful, dirty_old_rstat is a _sparse_ map.
1888 // sorry, this is ugly.
1889 first = ofirst;
1890
1891 // find any intersection with last
1892 auto it = parent->dirty_old_rstat.lower_bound(last);
1893 if (it == parent->dirty_old_rstat.end()) {
1894 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
// start just after the newest existing entry if it reaches into our range
1895 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1896 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1897 first = parent->dirty_old_rstat.rbegin()->first+1;
1898 }
1899 } else {
1900 // *it last is >= last
1901 if (it->second.first <= last) {
1902 // *it intersects [first,last]
1903 if (it->second.first < first) {
1904 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1905 parent->dirty_old_rstat[first-1] = it->second;
1906 it->second.first = first;
1907 }
1908 if (it->second.first > first)
1909 first = it->second.first;
1910 if (last < it->first) {
1911 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1912 parent->dirty_old_rstat[last] = it->second;
1913 it->second.first = last+1;
1914 }
1915 } else {
1916 // *it is to the _right_ of [first,last]
1917 it = parent->dirty_old_rstat.lower_bound(first);
1918 // new *it last is >= first
1919 if (it->second.first <= last && // new *it isn't also to the right, and
1920 it->first >= first) { // it intersects our first bit,
1921 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1922 first = it->first+1;
1923 }
1924 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1925 }
1926 }
1927 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1928 parent->dirty_old_rstat[last].first = first;
1929 prstat = &parent->dirty_old_rstat[last].rstat;
1930 }
1931
1932 // apply
1933 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1934 ceph_assert(last >= first);
1935 prstat->add(delta);
1936 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1937
// continue with whatever lies below the segment just handled
1938 last = first-1;
1939 }
1940
1941 if (update_inode) {
// mark everything as accounted; the inode is only const in our signature
1942 auto _inode = const_cast<CInode::mempool_inode*>(inode);
1943 _inode->accounted_rstat = _inode->rstat;
1944 }
1945 }
1946
/*
 * Apply a dirfrag's unaccounted rstat (rstat - accounted_rstat) to the
 * inode 'pin' over the snapid interval [ofirst, last].
 *
 * rstat, accounted_rstat - frag values; their difference is the delta
 * ofirst, last           - snapid interval to update
 * pin                    - inode receiving the delta
 * cow_head               - passed through to pin->cow_old_inode()
 *
 * The interval is consumed from 'last' downwards, one segment per loop
 * iteration.  A segment ending at pin->last is applied to the projected
 * inode; older segments are applied to old_inode entries, which are split
 * as needed and recorded in pin->dirty_old_rstats.  Old-inode edits are
 * made on a private copy of the map that is installed with
 * reset_old_inodes() at the end.
 */
1947 void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat,
1948 const nest_info_t& accounted_rstat,
1949 snapid_t ofirst, snapid_t last,
1950 CInode *pin, bool cow_head)
1951 {
1952 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1953 dout(20) << " frag rstat " << rstat << dendl;
1954 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1955 nest_info_t delta = rstat;
1956 delta.sub(accounted_rstat);
1957 dout(20) << " delta " << delta << dendl;
1958
// private copy of the old_inode map, created on first use below
1959 CInode::old_inode_map_ptr _old_inodes;
1960 while (last >= ofirst) {
1961 CInode::mempool_inode *pi;
1962 snapid_t first;
1963 if (last == pin->last) {
// segment ends at the inode's own 'last': update the projected inode
1964 pi = pin->_get_projected_inode();
1965 first = std::max(ofirst, pin->first);
1966 if (first > pin->first) {
// split off [pin->first, first-1] as an old inode first
1967 auto& old = pin->cow_old_inode(first-1, cow_head);
1968 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1969 }
1970 } else {
1971 if (!_old_inodes) {
1972 _old_inodes = CInode::allocate_old_inode_map();
1973 if (pin->is_any_old_inodes())
1974 *_old_inodes = *pin->get_old_inodes();
1975 }
1976 if (last >= pin->first) {
1977 first = pin->first;
1978 pin->cow_old_inode(last, cow_head);
1979 } else {
1980 // our life is easier here because old_inodes is not sparse
1981 // (although it may not begin at snapid 1)
1982 auto it = _old_inodes->lower_bound(last);
1983 if (it == _old_inodes->end()) {
1984 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1985 break;
1986 }
1987 first = it->second.first;
1988 if (first > last) {
1989 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
1990 //assert(p == pin->old_inodes.begin());
1991 break;
1992 }
1993 if (it->first > last) {
1994 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1995 << (last+1) << "," << it->first << "]" << dendl;
1996 (*_old_inodes)[last] = it->second;
1997 it->second.first = last+1;
1998 pin->dirty_old_rstats.insert(it->first);
1999 }
2000 }
2001 if (first < ofirst) {
2002 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
2003 << first << "," << ofirst-1 << "]" << dendl;
2004 (*_old_inodes)[ofirst-1] = (*_old_inodes)[last];
2005 pin->dirty_old_rstats.insert(ofirst-1);
2006 (*_old_inodes)[last].first = first = ofirst;
2007 }
2008 pi = &(*_old_inodes)[last].inode;
2009 pin->dirty_old_rstats.insert(last);
2010 }
2011 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
2012 pi->rstat.add(delta);
2013 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
2014
// continue with whatever lies below the segment just handled
2015 last = first-1;
2016 }
// install the edited copy, if one was made
2017 if (_old_inodes)
2018 pin->reset_old_inodes(std::move(_old_inodes));
2019 }
2020
// Send quota/rstat information for inode `in` to clients holding caps on it,
// and ask every replica MDS to gather caps.
//
// Nothing is done unless this MDS is active or stopping, `in` is auth and
// not frozen, and either the projected inode has a quota enabled or
// `quota_change` is set.  For each client cap (except those flagged
// noquota) an MClientQuota message is sent when one of the conditions in
// the loop below selects the `update` label.
//
// @param in            inode whose projected rstat/quota is reported
// @param exclude_ct    when non-negative, every client other than this one is
//                      updated unconditionally; this client still goes through
//                      the threshold checks
// @param quota_change  proceed even when no quota is currently enabled
2021 void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
2022 {
2023 if (!(mds->is_active() || mds->is_stopping()))
2024 return;
2025
2026 if (!in->is_auth() || in->is_frozen())
2027 return;
2028
2029 const auto& pi = in->get_projected_inode();
2030 if (!pi->quota.is_enable() && !quota_change)
2031 return;
2032
2033 // create snaprealm for quota inode (quota was set before mimic)
2034 if (!in->get_projected_srnode())
2035 mds->server->create_quota_realm(in);
2036
2037 for (auto &p : in->client_caps) {
2038 Capability *cap = &p.second;
2039 if (cap->is_noquota())
2040 continue;
2041
2042 if (exclude_ct >= 0 && exclude_ct != p.first)
2043 goto update;
2044
// Nothing changed since the values last sent on this cap.
2045 if (cap->last_rbytes == pi->rstat.rbytes &&
2046 cap->last_rsize == pi->rstat.rsize())
2047 continue;
2048
2049 if (pi->quota.max_files > 0) {
// At or over the file limit.
2050 if (pi->rstat.rsize() >= pi->quota.max_files)
2051 goto update;
2052
// Change since the last report exceeds 1/16 of the distance that
// remained between the last reported value and the limit.
2053 if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
2054 abs(cap->last_rsize - pi->rstat.rsize()))
2055 goto update;
2056 }
2057
2058 if (pi->quota.max_bytes > 0) {
// Usage is above max_bytes minus one eighth of max_bytes.
2059 if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
2060 goto update;
2061
// Same 1/16 rule as for files, applied to bytes.
2062 if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
2063 abs(cap->last_rbytes - pi->rstat.rbytes))
2064 goto update;
2065 }
2066
2067 continue;
2068
2069 update:
// Remember what is being reported so later calls can compare against it.
2070 cap->last_rsize = pi->rstat.rsize();
2071 cap->last_rbytes = pi->rstat.rbytes;
2072
2073 auto msg = make_message<MClientQuota>();
2074 msg->ino = in->ino();
2075 msg->rstat = pi->rstat;
2076 msg->quota = pi->quota;
2077 mds->send_message_client_counted(msg, cap->get_session());
2078 }
// One MGatherCaps per replica of this inode.
2079 for (const auto &it : in->get_replicas()) {
2080 auto msg = make_message<MGatherCaps>();
2081 msg->ino = in->ino();
2082 mds->send_message_mds(msg, it.first);
2083 }
2084 }
2085
2086 /*
2087 * NOTE: we _have_ to delay the scatter if we are called during a
2088 * rejoin, because we can't twiddle locks between when the
2089 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2090 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2091 * (no requests), and a survivor acks immediately. _except_ that
2092 * during rejoin_(weak|strong) processing, we may complete a lock
2093 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2094 * scatterlock state in that case or the lock states will get out of
2095 * sync between the auth and replica.
2096 *
2097 * the simple solution is to never do the scatter here. instead, put
2098 * the scatterlock on a list if it isn't already wrlockable. this is
2099 * probably the best plan anyway, since we avoid too many
2100 * scatters/locks under normal usage.
2101 */
2102 /*
2103 * some notes on dirlock/nestlock scatterlock semantics:
2104 *
2105 * the fragstat (dirlock) will never be updated without
2106 * dirlock+nestlock wrlock held by the caller.
2107 *
2108 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2109 * data is pushed up the tree. this could be changed with some
2110 * restructuring here, but in its current form we ensure that the
2111 * fragstat+rstat _always_ reflect an accurate summation over the dir
2112 * frag, which is nice. and, we only need to track frags that need to
2113 * be nudged (and not inodes with pending rstat changes that need to
2114 * be pushed into the frag). a consequence of this is that the
2115 * accounted_rstat on scatterlock sync may not match our current
2116 * rstat. this is normal and expected.
2117 */
// Project and journal dirstat/rstat updates from `in` up through its
// ancestor dirfrags and directory inodes.
//
// Starting at `parent` (or in's projected parent dir when `parent` is null),
// each loop iteration projects the dirfrag's fnode, optionally bumps its
// fragstat (mtime/change_attr, nfiles/nsubdirs), pushes the child inode's
// rstat into the frag, and then -- unless propagation stops -- projects the
// directory inode, folds the frag's stats into it and moves up one level.
// When propagation stops, the relevant scatterlocks are marked updated
// instead.  Finally the last dirfrag reached and every projected directory
// inode are added to `blob`.
//
// @param mut         mutation collecting auth pins, locks and projections
// @param blob        metablob receiving dir context, dir and dirtied inodes
// @param in          inode whose change is being propagated
// @param parent      dirfrag to start from; may be null only with PREDIRTY_PRIMARY
// @param flags       bitmask tested against PREDIRTY_PRIMARY, PREDIRTY_DIR
//                    and PREDIRTY_SHALLOW
// @param linkunlink  signed amount added to the first frag's nfiles/nsubdirs
// @param cfollows    snapid to follow; CEPH_NOSNAP selects the parent realm's
//                    newest seq
2118 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2119 CInode *in, CDir *parent,
2120 int flags, int linkunlink,
2121 snapid_t cfollows)
2122 {
2123 bool primary_dn = flags & PREDIRTY_PRIMARY;
2124 bool do_parent_mtime = flags & PREDIRTY_DIR;
// NOTE(review): within this function `shallow` is only used in the log
// line below.
2125 bool shallow = flags & PREDIRTY_SHALLOW;
2126
2127 ceph_assert(mds->mdlog->entry_is_open());
2128
2129 // make sure stamp is set
2130 if (mut->get_mds_stamp() == utime_t())
2131 mut->set_mds_stamp(ceph_clock_now());
2132
2133 if (in->is_base())
2134 return;
2135
2136 dout(10) << "predirty_journal_parents"
2137 << (do_parent_mtime ? " do_parent_mtime":"")
2138 << " linkunlink=" << linkunlink
2139 << (primary_dn ? " primary_dn":" remote_dn")
2140 << (shallow ? " SHALLOW":"")
2141 << " follows " << cfollows
2142 << " " << *in << dendl;
2143
2144 if (!parent) {
2145 ceph_assert(primary_dn);
2146 parent = in->get_projected_parent_dn()->get_dir();
2147 }
2148
2149 if (flags == 0 && linkunlink == 0) {
2150 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2151 blob->add_dir_context(parent);
2152 return;
2153 }
2154
2155 // build list of inodes to wrlock, dirty, and update
2156 list<CInode*> lsi;
2157 CInode *cur = in;
2158 CDentry *parentdn = NULL;
// True only for the first (lowest) level; used to skip the
// mds_dirstat_min_interval throttle on that level.
2159 bool first = true;
2160 while (parent) {
2161 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2162 ceph_assert(parent->is_auth());
2163
2164 // opportunistically adjust parent dirfrag
2165 CInode *pin = parent->get_inode();
2166
2167 // inode -> dirfrag
2168 mut->auth_pin(parent);
2169
2170 auto pf = parent->project_fnode(mut);
2171 pf->version = parent->pre_dirty();
2172
2173 if (do_parent_mtime || linkunlink) {
2174 ceph_assert(mut->is_wrlocked(&pin->filelock));
2175 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2176 ceph_assert(cfollows == CEPH_NOSNAP);
2177
2178 // update stale fragstat/rstat?
2179 parent->resync_accounted_fragstat();
2180 parent->resync_accounted_rstat();
2181
2182 if (do_parent_mtime) {
2183 pf->fragstat.mtime = mut->get_op_stamp();
2184 pf->fragstat.change_attr++;
// NOTE(review): this message streams the CDir pointer `parent`, whereas the
// neighbouring messages stream `*parent` -- confirm whether that is intended.
2185 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2186 if (pf->fragstat.mtime > pf->rstat.rctime) {
2187 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2188 pf->rstat.rctime = pf->fragstat.mtime;
2189 } else {
2190 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2191 }
2192 }
2193 if (linkunlink) {
2194 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2195 if (in->is_dir()) {
2196 pf->fragstat.nsubdirs += linkunlink;
2197 //pf->rstat.rsubdirs += linkunlink;
2198 } else {
2199 pf->fragstat.nfiles += linkunlink;
2200 //pf->rstat.rfiles += linkunlink;
2201 }
2202 }
2203 }
2204
2205 // rstat
2206 if (!primary_dn) {
2207 // don't update parent this pass
2208 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2209 pin->versionlock.can_wrlock())) {
2210 dout(20) << " unwritable parent nestlock " << pin->nestlock
2211 << ", marking dirty rstat on " << *cur << dendl;
2212 cur->mark_dirty_rstat();
2213 } else {
2214 // if we don't hold a wrlock reference on this nestlock, take one,
2215 // because we are about to write into the dirfrag fnode and that needs
2216 // to commit before the lock can cycle.
2217 if (linkunlink) {
2218 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer());
2219 }
2220
2221 if (!mut->is_wrlocked(&pin->nestlock)) {
2222 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2223 mds->locker->wrlock_force(&pin->nestlock, mut);
2224 }
2225
2226 // now we can project the inode rstat diff the dirfrag
2227 SnapRealm *prealm = pin->find_snaprealm();
2228
2229 snapid_t follows = cfollows;
2230 if (follows == CEPH_NOSNAP)
2231 follows = prealm->get_newest_seq();
2232
// This local `first` (a snapid) shadows the outer bool `first` inside
// this else-branch only.
2233 snapid_t first = follows+1;
2234
2235 // first, if the frag is stale, bring it back in sync.
2236 parent->resync_accounted_rstat();
2237
2238 // now push inode rstats into frag
2239 project_rstat_inode_to_frag(mut, cur, parent, first, linkunlink, prealm);
2240 cur->clear_dirty_rstat();
2241 }
2242
// Decide whether propagation continues into the directory inode `pin`.
2243 bool stop = false;
2244 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2245 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2246 stop = true;
2247 }
2248
2249 // delay propagating until later?
2250 if (!stop && !first &&
2251 g_conf()->mds_dirstat_min_interval > 0) {
2252 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2253 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
2254 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2255 << " < " << g_conf()->mds_dirstat_min_interval
2256 << ", stopping" << dendl;
2257 stop = true;
2258 } else {
2259 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2260 }
2261 }
2262
2263 // can cast only because i'm passing nowait=true in the sole user
2264 if (!stop &&
2265 !mut->is_wrlocked(&pin->nestlock) &&
2266 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2267 !mds->locker->wrlock_try(&pin->nestlock, mut)
2268 )) { // ** do not initiate.. see above comment **
2269 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2270 << " on " << *pin << dendl;
2271 stop = true;
2272 }
2273 if (stop) {
// Not propagating further: mark the scatterlock(s) updated and record
// them on the mutation and its log segment, then leave the loop.
2274 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2275 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2276 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2277 mut->add_updated_lock(&pin->nestlock);
2278 if (do_parent_mtime || linkunlink) {
2279 mds->locker->mark_updated_scatterlock(&pin->filelock);
2280 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2281 mut->add_updated_lock(&pin->filelock);
2282 }
2283 break;
2284 }
2285 if (!mut->is_wrlocked(&pin->versionlock))
2286 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2287
2288 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer());
2289
2290 pin->last_dirstat_prop = mut->get_mds_stamp();
2291
2292 // dirfrag -> diri
2293 mut->auth_pin(pin);
// push_front: inodes reached later (higher in the tree) end up earlier in
// the list that is journaled after the loop.
2294 lsi.push_front(pin);
2295
2296 pin->pre_cow_old_inode(); // avoid cow mayhem!
2297
2298 auto pi = pin->project_inode(mut);
2299 pi.inode->version = pin->pre_dirty();
2300
2301 // dirstat
2302 if (do_parent_mtime || linkunlink) {
2303 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2304 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2305 bool touched_mtime = false, touched_chattr = false;
2306 pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2307 pf->accounted_fragstat = pf->fragstat;
2308 if (touched_mtime)
2309 pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime;
2310 if (touched_chattr)
2311 pi.inode->change_attr = pi.inode->dirstat.change_attr;
2312 dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl;
2313
2314 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
// The asserts below compare !"literal" (always false) with
// mds_verify_scatter, so they fail only when that option is set.
2315 if (pi.inode->dirstat.size() < 0)
2316 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
2317 if (pi.inode->dirstat.size() != pf->fragstat.size()) {
2318 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2319 << parent->dirfrag() << ", inode has " << pi.inode->dirstat
2320 << ", dirfrag has " << pf->fragstat;
2321
2322 // trust the dirfrag for now
2323 pi.inode->dirstat = pf->fragstat;
2324
2325 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
2326 }
2327 }
2328 }
2329
2330 // rstat
2331 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2332
2333 // first, if the frag is stale, bring it back in sync.
2334 parent->resync_accounted_rstat();
2335
2336 if (g_conf()->mds_snap_rstat) {
2337 for (auto &p : parent->dirty_old_rstat) {
2338 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2339 p.first, pin, true);
2340 }
2341 }
2342 parent->dirty_old_rstat.clear();
2343 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2344
2345 pf->accounted_rstat = pf->rstat;
2346
2347 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2348 if (pi.inode->rstat.rbytes != pf->rstat.rbytes) {
2349 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2350 << parent->dirfrag() << ", inode has " << pi.inode->rstat
2351 << ", dirfrag has " << pf->rstat;
2352
2353 // trust the dirfrag for now
2354 pi.inode->rstat = pf->rstat;
2355
2356 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
2357 }
2358 }
2359
2360 parent->check_rstats();
2361 broadcast_quota_to_client(pin);
2362 if (pin->is_base())
2363 break;
2364 // next parent!
2365 cur = pin;
2366 parentdn = pin->get_projected_parent_dn();
2367 ceph_assert(parentdn);
2368 parent = parentdn->get_dir();
// Above the first level only rstat is propagated: no link count change,
// no mtime bump, and the dentry is treated as primary.
2369 linkunlink = 0;
2370 do_parent_mtime = false;
2371 primary_dn = true;
2372 first = false;
2373 }
2374
2375 // now, stick it in the blob
2376 ceph_assert(parent);
2377 ceph_assert(parent->is_auth());
2378 blob->add_dir_context(parent);
2379 blob->add_dir(parent, true);
2380 for (const auto& in : lsi) {
2381 journal_dirty_inode(mut.get(), blob, in);
2382 }
2383
2384 }
2385
2386
2387
2388
2389
2390 // ===================================
2391 // peer requests
2392
2393
2394 /*
2395 * some handlers for leader requests with peers. we need to make
2396 * sure leader journal commits before we forget we leadered them and
2397 * remove them from the uncommitted_leaders map (used during recovery
2398 * to commit|abort peers).
2399 */
2400 struct C_MDC_CommittedLeader : public MDCacheLogContext {
2401 metareqid_t reqid;
2402 C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2403 void finish(int r) override {
2404 mdcache->_logged_leader_commit(reqid);
2405 }
2406 };
2407
2408 void MDCache::log_leader_commit(metareqid_t reqid)
2409 {
2410 dout(10) << "log_leader_commit " << reqid << dendl;
2411 uncommitted_leaders[reqid].committing = true;
2412 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2413 new C_MDC_CommittedLeader(this, reqid));
2414 }
2415
2416 void MDCache::_logged_leader_commit(metareqid_t reqid)
2417 {
2418 dout(10) << "_logged_leader_commit " << reqid << dendl;
2419 ceph_assert(uncommitted_leaders.count(reqid));
2420 uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2421 mds->queue_waiters(uncommitted_leaders[reqid].waiters);
2422 uncommitted_leaders.erase(reqid);
2423 }
2424
2425 // while active...
2426
2427 void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from)
2428 {
2429 dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl;
2430 ceph_assert(uncommitted_leaders.count(r));
2431 uncommitted_leaders[r].peers.erase(from);
2432 if (!uncommitted_leaders[r].recovering && uncommitted_leaders[r].peers.empty())
2433 log_leader_commit(r);
2434 }
2435
2436 void MDCache::logged_leader_update(metareqid_t reqid)
2437 {
2438 dout(10) << "logged_leader_update " << reqid << dendl;
2439 ceph_assert(uncommitted_leaders.count(reqid));
2440 uncommitted_leaders[reqid].safe = true;
2441 auto p = pending_leaders.find(reqid);
2442 if (p != pending_leaders.end()) {
2443 pending_leaders.erase(p);
2444 if (pending_leaders.empty())
2445 process_delayed_resolve();
2446 }
2447 }
2448
2449 /*
2450 * Leader may crash after receiving all peers' commit acks, but before journalling
2451 * the final commit. Peers may crash after journalling the peer commit, but before
2452 * sending commit ack to the leader. Commit leaders with no uncommitted peer when
2453 * resolve finishes.
2454 */
2455 void MDCache::finish_committed_leaders()
2456 {
2457 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
2458 p != uncommitted_leaders.end();
2459 ++p) {
2460 p->second.recovering = false;
2461 if (!p->second.committing && p->second.peers.empty()) {
2462 dout(10) << "finish_committed_leaders " << p->first << dendl;
2463 log_leader_commit(p->first);
2464 }
2465 }
2466 }
2467
2468 /*
2469 * at end of resolve... we must journal a commit|abort for all peer
2470 * updates, before moving on.
2471 *
2472 * this is so that the leader can safely journal ECommitted on ops it
2473 * leaders when it reaches up:active (all other recovering nodes must
2474 * complete resolve before that happens).
2475 */
2476 struct C_MDC_PeerCommit : public MDCacheLogContext {
2477 mds_rank_t from;
2478 metareqid_t reqid;
2479 C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2480 void finish(int r) override {
2481 mdcache->_logged_peer_commit(from, reqid);
2482 }
2483 };
2484
2485 void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid)
2486 {
2487 dout(10) << "_logged_peer_commit from mds." << from << " " << reqid << dendl;
2488
2489 // send a message
2490 auto req = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED);
2491 mds->send_message_mds(req, from);
2492 }
2493
2494
2495
2496
2497
2498
2499 // ====================================================================
2500 // import map, recovery
2501
2502 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2503 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2504 {
2505 if (subtrees.count(oldparent)) {
2506 vector<dirfrag_t>& v = subtrees[oldparent];
2507 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2508 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2509 if (*it == df) {
2510 v.erase(it);
2511 break;
2512 }
2513 }
2514 if (subtrees.count(newparent)) {
2515 vector<dirfrag_t>& v = subtrees[newparent];
2516 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2517 v.push_back(df);
2518 }
2519 }
2520
// Build an ESubtreeMap log event describing the subtrees this MDS journals
// as its own.
//
// The function starts a new log entry via mdlog->_start_entry(), records
// every subtree whose dir_auth.first is this rank together with its bounds,
// adjusts the journaled map for projected renames, collapses nested
// non-ambiguous subtrees into their parents, and finally adds each
// referenced dirfrag (with context up to the root) to the event's metablob.
//
// @return the event, allocated with new and returned as a raw pointer
2521 ESubtreeMap *MDCache::create_subtree_map()
2522 {
2523 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2524 << num_subtrees_fullauth() << " fullauth"
2525 << dendl;
2526
2527 show_subtrees();
2528
2529 ESubtreeMap *le = new ESubtreeMap();
2530 mds->mdlog->_start_entry(le);
2531
// Dirfrags that must appear in the metablob, keyed by dirfrag id.
2532 map<dirfrag_t, CDir*> dirs_to_add;
2533
2534 if (myin) {
// NOTE(review): mydir is dereferenced without a null check -- presumably
// the default frag of myin always exists here; confirm.
2535 CDir* mydir = myin->get_dirfrag(frag_t());
2536 dirs_to_add[mydir->dirfrag()] = mydir;
2537 }
2538
2539 // include all auth subtrees, and their bounds.
2540 // and a spanning tree to tie it to the root.
2541 for (auto& [dir, bounds] : subtrees) {
2542 // journal subtree as "ours" if we are
2543 // me, -2
2544 // me, me
2545 // me, !me (may be importing and ambiguous!)
2546
2547 // so not
2548 // !me, *
2549 if (dir->get_dir_auth().first != mds->get_nodeid())
2550 continue;
2551
2552 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2553 my_ambiguous_imports.count(dir->dirfrag())) {
2554 dout(15) << " ambig subtree " << *dir << dendl;
2555 le->ambiguous_subtrees.insert(dir->dirfrag());
2556 } else {
2557 dout(15) << " auth subtree " << *dir << dendl;
2558 }
2559
2560 dirs_to_add[dir->dirfrag()] = dir;
// Ensure an entry exists for this subtree with an empty bound list.
2561 le->subtrees[dir->dirfrag()].clear();
2562
2563 // bounds
// Individual bounds are only logged when there are at most 3 of them;
// otherwise just the count is logged.
2564 size_t nbounds = bounds.size();
2565 if (nbounds > 3) {
2566 dout(15) << " subtree has " << nbounds << " bounds" << dendl;
2567 }
2568 for (auto& bound : bounds) {
2569 if (nbounds <= 3) {
2570 dout(15) << " subtree bound " << *bound << dendl;
2571 }
2572 dirs_to_add[bound->dirfrag()] = bound;
2573 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2574 }
2575 }
2576
2577 // apply projected renames
2578 for (const auto& [diri, renames] : projected_subtree_renames) {
2579 for (const auto& [olddir, newdir] : renames) {
2580 dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2581
2582 auto&& dfls = diri->get_dirfrags();
2583 for (const auto& dir : dfls) {
2584 dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2585 CDir *oldparent = get_projected_subtree_root(olddir);
2586 dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2587 CDir *newparent = get_projected_subtree_root(newdir);
2588 dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2589
2590 if (oldparent == newparent) {
2591 dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
2592 << oldparent->dirfrag() << dendl;
2593 continue;
2594 }
2595
2596 if (dir->is_subtree_root()) {
2597 if (le->subtrees.count(newparent->dirfrag()) &&
2598 oldparent->get_dir_auth() != newparent->get_dir_auth())
2599 dirs_to_add[dir->dirfrag()] = dir;
2600 // children are fine. change parent.
2601 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2602 le->subtrees);
2603 } else {
2604 // mid-subtree.
2605
2606 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2607 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2608 // if oldparent is auth, subtree is mine; include it.
2609 if (le->subtrees.count(oldparent->dirfrag())) {
2610 dirs_to_add[dir->dirfrag()] = dir;
2611 le->subtrees[dir->dirfrag()].clear();
2612 }
2613 // if newparent is auth, subtree is a new bound
2614 if (le->subtrees.count(newparent->dirfrag())) {
2615 dirs_to_add[dir->dirfrag()] = dir;
2616 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2617 }
// From here on, bounds found under `dir` are moved to `dir` itself.
2618 newparent = dir;
2619 }
2620
2621 // see if any old bounds move to the new parent.
// Iterates the in-memory subtrees map (keyed by CDir*); only the
// journaled map le->subtrees is modified inside the loop.
2622 for (auto& bound : subtrees.at(oldparent)) {
2623 if (dir->contains(bound->get_parent_dir()))
2624 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2625 le->subtrees);
2626 }
2627 }
2628 }
2629 }
2630 }
2631
2632 // simplify the journaled map. our in memory map may have more
2633 // subtrees than needed due to migrations that are just getting
2634 // started or just completing. but on replay, the "live" map will
2635 // be simple and we can do a straight comparison.
2636 for (auto& [frag, bfrags] : le->subtrees) {
2637 if (le->ambiguous_subtrees.count(frag))
2638 continue;
2639 unsigned i = 0;
2640 while (i < bfrags.size()) {
2641 dirfrag_t b = bfrags[i];
2642 if (le->subtrees.count(b) &&
2643 le->ambiguous_subtrees.count(b) == 0) {
// Bound b is itself a non-ambiguous journaled subtree: append its
// bounds to this subtree, drop b's own entry, and remove b from the
// bound list.  `i` is not advanced, so the element that slides into
// position i is examined next.
2644 auto& bb = le->subtrees.at(b);
2645 dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
2646 for (auto& r : bb) {
2647 bfrags.push_back(r);
2648 }
2649 dirs_to_add.erase(b);
2650 le->subtrees.erase(b);
2651 bfrags.erase(bfrags.begin() + i);
2652 } else {
2653 ++i;
2654 }
2655 }
2656 }
2657
2658 for (auto &p : dirs_to_add) {
2659 CDir *dir = p.second;
2660 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2661 le->metablob.add_dir(dir, false);
2662 }
2663
2664 dout(15) << " subtrees " << le->subtrees << dendl;
2665 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2666
2667 //le->metablob.print(cout);
2668 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2669 return le;
2670 }
2671
2672 void MDCache::dump_resolve_status(Formatter *f) const
2673 {
2674 f->open_object_section("resolve_status");
2675 f->dump_stream("resolve_gather") << resolve_gather;
2676 f->dump_stream("resolve_ack_gather") << resolve_gather;
2677 f->close_section();
2678 }
2679
2680 void MDCache::resolve_start(MDSContext *resolve_done_)
2681 {
2682 dout(10) << "resolve_start" << dendl;
2683 ceph_assert(!resolve_done);
2684 resolve_done.reset(resolve_done_);
2685
2686 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2687 // if we don't have the root dir, adjust it to UNKNOWN. during
2688 // resolve we want mds0 to explicit claim the portion of it that
2689 // it owns, so that anything beyond its bounds get left as
2690 // unknown.
2691 CDir *rootdir = root->get_dirfrag(frag_t());
2692 if (rootdir)
2693 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2694 }
2695 resolve_gather = recovery_set;
2696
2697 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
2698 }
2699
2700 void MDCache::send_resolves()
2701 {
2702 send_peer_resolves();
2703
2704 if (!resolve_done) {
2705 // I'm survivor: refresh snap cache
2706 mds->snapclient->sync(
2707 new MDSInternalContextWrapper(mds,
2708 new LambdaContext([this](int r) {
2709 maybe_finish_peer_resolve();
2710 })
2711 )
2712 );
2713 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2714 return;
2715 }
2716 if (!resolve_ack_gather.empty()) {
2717 dout(10) << "send_resolves still waiting for resolve ack from ("
2718 << resolve_ack_gather << ")" << dendl;
2719 return;
2720 }
2721 if (!resolve_need_rollback.empty()) {
2722 dout(10) << "send_resolves still waiting for rollback to commit on ("
2723 << resolve_need_rollback << ")" << dendl;
2724 return;
2725 }
2726
2727 send_subtree_resolves();
2728 }
2729
2730 void MDCache::send_peer_resolves()
2731 {
2732 dout(10) << "send_peer_resolves" << dendl;
2733
2734 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2735
2736 if (mds->is_resolve()) {
2737 for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
2738 p != uncommitted_peers.end();
2739 ++p) {
2740 mds_rank_t leader = p->second.leader;
2741 auto &m = resolves[leader];
2742 if (!m) m = make_message<MMDSResolve>();
2743 m->add_peer_request(p->first, false);
2744 }
2745 } else {
2746 set<mds_rank_t> resolve_set;
2747 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2748 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2749 p != active_requests.end();
2750 ++p) {
2751 MDRequestRef& mdr = p->second;
2752 if (!mdr->is_peer())
2753 continue;
2754 if (!mdr->peer_did_prepare() && !mdr->committing) {
2755 continue;
2756 }
2757 mds_rank_t leader = mdr->peer_to_mds;
2758 if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) {
2759 dout(10) << " including uncommitted " << *mdr << dendl;
2760 if (!resolves.count(leader))
2761 resolves[leader] = make_message<MMDSResolve>();
2762 if (!mdr->committing &&
2763 mdr->has_more() && mdr->more()->is_inode_exporter) {
2764 // re-send cap exports
2765 CInode *in = mdr->more()->rename_inode;
2766 map<client_t, Capability::Export> cap_map;
2767 in->export_client_caps(cap_map);
2768 bufferlist bl;
2769 MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
2770 encode(inode_caps, bl);
2771 resolves[leader]->add_peer_request(p->first, bl);
2772 } else {
2773 resolves[leader]->add_peer_request(p->first, mdr->committing);
2774 }
2775 }
2776 }
2777 }
2778
2779 for (auto &p : resolves) {
2780 dout(10) << "sending peer resolve to mds." << p.first << dendl;
2781 mds->send_message_mds(p.second, p.first);
2782 resolve_ack_gather.insert(p.first);
2783 }
2784 }
2785
2786 void MDCache::send_subtree_resolves()
2787 {
2788 dout(10) << "send_subtree_resolves" << dendl;
2789
2790 if (migrator->is_exporting() || migrator->is_importing()) {
2791 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2792 migrator->show_importing();
2793 migrator->show_exporting();
2794 resolves_pending = true;
2795 return; // not now
2796 }
2797
2798 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2799 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2800 p != recovery_set.end();
2801 ++p) {
2802 if (*p == mds->get_nodeid())
2803 continue;
2804 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2805 resolves[*p] = make_message<MMDSResolve>();
2806 }
2807
2808 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2809 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2810
2811 // known
2812 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2813 p != subtrees.end();
2814 ++p) {
2815 CDir *dir = p->first;
2816
2817 // only our subtrees
2818 if (dir->authority().first != mds->get_nodeid())
2819 continue;
2820
2821 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2822 continue; // we'll add it below
2823
2824 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2825 // ambiguous (mid-import)
2826 set<CDir*> bounds;
2827 get_subtree_bounds(dir, bounds);
2828 vector<dirfrag_t> dfls;
2829 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2830 dfls.push_back((*q)->dirfrag());
2831
2832 my_ambig_imports[dir->dirfrag()] = dfls;
2833 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2834 } else {
2835 // not ambiguous.
2836 for (auto &q : resolves) {
2837 resolves[q.first]->add_subtree(dir->dirfrag());
2838 }
2839 // bounds too
2840 vector<dirfrag_t> dfls;
2841 for (set<CDir*>::iterator q = subtrees[dir].begin();
2842 q != subtrees[dir].end();
2843 ++q) {
2844 CDir *bound = *q;
2845 dfls.push_back(bound->dirfrag());
2846 }
2847
2848 my_subtrees[dir->dirfrag()] = dfls;
2849 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2850 }
2851 }
2852
2853 // ambiguous
2854 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2855 p != my_ambiguous_imports.end();
2856 ++p) {
2857 my_ambig_imports[p->first] = p->second;
2858 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2859 }
2860
2861 // simplify the claimed subtree.
2862 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2863 unsigned i = 0;
2864 while (i < p->second.size()) {
2865 dirfrag_t b = p->second[i];
2866 if (my_subtrees.count(b)) {
2867 vector<dirfrag_t>& bb = my_subtrees[b];
2868 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2869 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2870 p->second.push_back(*r);
2871 my_subtrees.erase(b);
2872 p->second.erase(p->second.begin() + i);
2873 } else {
2874 ++i;
2875 }
2876 }
2877 }
2878
2879 // send
2880 for (auto &p : resolves) {
2881 const ref_t<MMDSResolve> &m = p.second;
2882 if (mds->is_resolve()) {
2883 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2884 } else {
2885 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2886 }
2887 m->subtrees = my_subtrees;
2888 m->ambiguous_imports = my_ambig_imports;
2889 dout(10) << "sending subtee resolve to mds." << p.first << dendl;
2890 mds->send_message_mds(m, p.first);
2891 }
2892 resolves_pending = false;
2893 }
2894
2895 void MDCache::maybe_finish_peer_resolve() {
2896 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2897 // snap cache get synced or I'm in resolve state
2898 if (mds->snapclient->is_synced() || resolve_done)
2899 send_subtree_resolves();
2900 process_delayed_resolve();
2901 }
2902 }
2903
2904 void MDCache::handle_mds_failure(mds_rank_t who)
2905 {
2906 dout(7) << "handle_mds_failure mds." << who << dendl;
2907
2908 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2909
2910 resolve_gather.insert(who);
2911 discard_delayed_resolve(who);
2912 ambiguous_peer_updates.erase(who);
2913
2914 rejoin_gather.insert(who);
2915 rejoin_sent.erase(who); // i need to send another
2916 rejoin_ack_sent.erase(who); // i need to send another
2917 rejoin_ack_gather.erase(who); // i'll need/get another.
2918
2919 dout(10) << " resolve_gather " << resolve_gather << dendl;
2920 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2921 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2922 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2923 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2924
2925
2926 // tell the migrator too.
2927 migrator->handle_mds_failure_or_stop(who);
2928
2929 // tell the balancer too.
2930 mds->balancer->handle_mds_failure(who);
2931
2932 // clean up any requests peer to/from this node
2933 list<MDRequestRef> finish;
2934 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2935 p != active_requests.end();
2936 ++p) {
2937 MDRequestRef& mdr = p->second;
2938 // peer to the failed node?
2939 if (mdr->peer_to_mds == who) {
2940 if (mdr->peer_did_prepare()) {
2941 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2942 if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
2943 remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);
2944
2945 if (!mdr->more()->waiting_on_peer.empty()) {
2946 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2947 // will rollback, no need to wait
2948 mdr->reset_peer_request();
2949 mdr->more()->waiting_on_peer.clear();
2950 }
2951 } else if (!mdr->committing) {
2952 dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
2953 if (mdr->peer_request || mdr->peer_rolling_back())
2954 mdr->aborted = true;
2955 else
2956 finish.push_back(mdr);
2957 }
2958 }
2959
2960 if (mdr->is_peer() && mdr->peer_did_prepare()) {
2961 if (mdr->more()->waiting_on_peer.count(who)) {
2962 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2963 dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds."
2964 << who << dendl;
2965 mdr->more()->waiting_on_peer.erase(who);
2966 if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
2967 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2968 }
2969
2970 if (mdr->more()->srcdn_auth_mds == who &&
2971 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
2972 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2973 dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2974 add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
2975 }
2976 } else if (mdr->peer_request) {
2977 const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
2978 // FIXME: Peer rename request can arrive after we notice mds failure.
2979 // This can cause mds to crash (does not affect integrity of FS).
2980 if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
2981 peer_req->srcdn_auth == who)
2982 peer_req->mark_interrupted();
2983 }
2984
2985 // failed node is peer?
2986 if (mdr->is_leader() && !mdr->committing) {
2987 if (mdr->more()->srcdn_auth_mds == who) {
2988 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
2989 << who << " to recover" << dendl;
2990 ceph_assert(mdr->more()->witnessed.count(who) == 0);
2991 if (mdr->more()->is_ambiguous_auth)
2992 mdr->clear_ambiguous_auth();
2993 // rename srcdn's auth mds failed, all witnesses will rollback
2994 mdr->more()->witnessed.clear();
2995 pending_leaders.erase(p->first);
2996 }
2997
2998 if (mdr->more()->witnessed.count(who)) {
2999 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
3000 if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
3001 dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
3002 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
3003 // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
3004 // until either the request is committing or the peer also fails.
3005 ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
3006 pending_leaders.insert(p->first);
3007 } else {
3008 dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
3009 << who << " to recover" << dendl;
3010 if (srcdn_auth >= 0)
3011 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
3012
3013 // discard this peer's prepare (if any)
3014 mdr->more()->witnessed.erase(who);
3015 }
3016 }
3017
3018 if (mdr->more()->waiting_on_peer.count(who)) {
3019 dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
3020 << " to recover" << dendl;
3021 // retry request when peer recovers
3022 mdr->more()->waiting_on_peer.erase(who);
3023 if (mdr->more()->waiting_on_peer.empty())
3024 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3025 }
3026
3027 if (mdr->locking && mdr->locking_target_mds == who)
3028 mdr->finish_locking(mdr->locking);
3029 }
3030 }
3031
3032 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
3033 p != uncommitted_leaders.end();
3034 ++p) {
3035 // The failed MDS may have already committed the peer update
3036 if (p->second.peers.count(who)) {
3037 p->second.recovering = true;
3038 p->second.peers.erase(who);
3039 }
3040 }
3041
3042 while (!finish.empty()) {
3043 dout(10) << "cleaning up peer request " << *finish.front() << dendl;
3044 request_finish(finish.front());
3045 finish.pop_front();
3046 }
3047
3048 kick_find_ino_peers(who);
3049 kick_open_ino_peers(who);
3050
3051 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3052 p != fragments.end(); ) {
3053 dirfrag_t df = p->first;
3054 fragment_info_t& info = p->second;
3055
3056 if (info.is_fragmenting()) {
3057 if (info.notify_ack_waiting.erase(who) &&
3058 info.notify_ack_waiting.empty()) {
3059 fragment_drop_locks(info);
3060 fragment_maybe_finish(p++);
3061 } else {
3062 ++p;
3063 }
3064 continue;
3065 }
3066
3067 ++p;
3068 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3069 std::vector<CDir*> dirs;
3070 info.dirs.swap(dirs);
3071 fragments.erase(df);
3072 fragment_unmark_unfreeze_dirs(dirs);
3073 }
3074
3075 // MDCache::shutdown_export_strays() always exports strays to mds.0
3076 if (who == mds_rank_t(0))
3077 shutdown_exporting_strays.clear();
3078
3079 show_subtrees();
3080 }
3081
3082 /*
3083 * handle_mds_recovery - called on another node's transition
3084 * from resolve -> active.
3085 */
3086 void MDCache::handle_mds_recovery(mds_rank_t who)
3087 {
3088 dout(7) << "handle_mds_recovery mds." << who << dendl;
3089
3090 // exclude all discover waiters. kick_discovers() will do the job
3091 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3092 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3093
3094 MDSContext::vec waiters;
3095
3096 // wake up any waiters in their subtrees
3097 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3098 p != subtrees.end();
3099 ++p) {
3100 CDir *dir = p->first;
3101
3102 if (dir->authority().first != who ||
3103 dir->authority().second == mds->get_nodeid())
3104 continue;
3105 ceph_assert(!dir->is_auth());
3106
3107 // wake any waiters
3108 std::queue<CDir*> q;
3109 q.push(dir);
3110
3111 while (!q.empty()) {
3112 CDir *d = q.front();
3113 q.pop();
3114 d->take_waiting(d_mask, waiters);
3115
3116 // inode waiters too
3117 for (auto &p : d->items) {
3118 CDentry *dn = p.second;
3119 CDentry::linkage_t *dnl = dn->get_linkage();
3120 if (dnl->is_primary()) {
3121 dnl->get_inode()->take_waiting(i_mask, waiters);
3122
3123 // recurse?
3124 auto&& ls = dnl->get_inode()->get_dirfrags();
3125 for (const auto& subdir : ls) {
3126 if (!subdir->is_subtree_root())
3127 q.push(subdir);
3128 }
3129 }
3130 }
3131 }
3132 }
3133
3134 kick_open_ino_peers(who);
3135 kick_find_ino_peers(who);
3136
3137 // queue them up.
3138 mds->queue_waiters(waiters);
3139 }
3140
3141 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3142 {
3143 dout(7) << "set_recovery_set " << s << dendl;
3144 recovery_set = s;
3145 }
3146
3147
3148 /*
3149 * during resolve state, we share resolves to determine who
3150 * is authoritative for which trees. we expect to get an resolve
3151 * from _everyone_ in the recovery_set (the mds cluster at the time of
3152 * the first failure).
3153 *
3154 * This functions puts the passed message before returning
3155 */
3156 void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
3157 {
3158 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3159 mds_rank_t from = mds_rank_t(m->get_source().num());
3160
3161 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3162 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3163 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3164 return;
3165 }
3166 // wait until we reach the resolve stage!
3167 return;
3168 }
3169
3170 discard_delayed_resolve(from);
3171
3172 // ambiguous peer requests?
3173 if (!m->peer_requests.empty()) {
3174 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3175 for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) {
3176 if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) {
3177 ceph_assert(!p->second.committing);
3178 pending_leaders.insert(p->first);
3179 }
3180 }
3181
3182 if (!pending_leaders.empty()) {
3183 dout(10) << " still have pending updates, delay processing peer resolve" << dendl;
3184 delayed_resolve[from] = m;
3185 return;
3186 }
3187 }
3188
3189 auto ack = make_message<MMDSResolveAck>();
3190 for (const auto &p : m->peer_requests) {
3191 if (uncommitted_leaders.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
3192 // COMMIT
3193 if (p.second.committing) {
3194 // already committing, waiting for the OP_COMMITTED peer reply
3195 dout(10) << " already committing peer request " << p << " noop "<< dendl;
3196 } else {
3197 dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl;
3198 ack->add_commit(p.first);
3199 }
3200 uncommitted_leaders[p.first].peers.insert(from); // wait for peer OP_COMMITTED before we log ECommitted
3201
3202 if (p.second.inode_caps.length() > 0) {
3203 // peer wants to export caps (rename)
3204 ceph_assert(mds->is_resolve());
3205 MMDSResolve::peer_inode_cap inode_caps;
3206 auto q = p.second.inode_caps.cbegin();
3207 decode(inode_caps, q);
3208 inodeno_t ino = inode_caps.ino;
3209 map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
3210 ceph_assert(get_inode(ino));
3211
3212 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3213 q != cap_exports.end();
3214 ++q) {
3215 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3216 im.cap_id = ++last_cap_id; // assign a new cap ID
3217 im.issue_seq = 1;
3218 im.mseq = q->second.mseq;
3219
3220 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3221 if (session)
3222 rejoin_client_map.emplace(q->first, session->info.inst);
3223 }
3224
3225 // will process these caps in rejoin stage
3226 rejoin_peer_exports[ino].first = from;
3227 rejoin_peer_exports[ino].second.swap(cap_exports);
3228
3229 // send information of imported caps back to peer
3230 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
3231 }
3232 } else {
3233 // ABORT
3234 dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl;
3235 ceph_assert(!p.second.committing);
3236 ack->add_abort(p.first);
3237 }
3238 }
3239 mds->send_message(ack, m->get_connection());
3240 return;
3241 }
3242
3243 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
3244 dout(10) << "delay processing subtree resolve" << dendl;
3245 delayed_resolve[from] = m;
3246 return;
3247 }
3248
3249 bool survivor = false;
3250 // am i a surviving ambiguous importer?
3251 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3252 survivor = true;
3253 // check for any import success/failure (from this node)
3254 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3255 while (p != my_ambiguous_imports.end()) {
3256 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3257 ++next;
3258 CDir *dir = get_dirfrag(p->first);
3259 ceph_assert(dir);
3260 dout(10) << "checking ambiguous import " << *dir << dendl;
3261 if (migrator->is_importing(dir->dirfrag()) &&
3262 migrator->get_import_peer(dir->dirfrag()) == from) {
3263 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3264
3265 // check if sender claims the subtree
3266 bool claimed_by_sender = false;
3267 for (const auto &q : m->subtrees) {
3268 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3269 CDir *base = get_force_dirfrag(q.first, false);
3270 if (!base || !base->contains(dir))
3271 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3272
3273 bool inside = true;
3274 set<CDir*> bounds;
3275 get_force_dirfrag_bound_set(q.second, bounds);
3276 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3277 CDir *bound = *p;
3278 if (bound->contains(dir)) {
3279 inside = false; // nope, bound is dir or parent of dir, not inside.
3280 break;
3281 }
3282 }
3283 if (inside)
3284 claimed_by_sender = true;
3285 }
3286
3287 my_ambiguous_imports.erase(p); // no longer ambiguous.
3288 if (claimed_by_sender) {
3289 dout(7) << "ambiguous import failed on " << *dir << dendl;
3290 migrator->import_reverse(dir);
3291 } else {
3292 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3293 migrator->import_finish(dir, true);
3294 }
3295 }
3296 p = next;
3297 }
3298 }
3299
3300 // update my dir_auth values
3301 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3302 // migrations between other nodes)
3303 for (const auto& p : m->subtrees) {
3304 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3305 CDir *dir = get_force_dirfrag(p.first, !survivor);
3306 if (!dir)
3307 continue;
3308 adjust_bounded_subtree_auth(dir, p.second, from);
3309 try_subtree_merge(dir);
3310 }
3311
3312 show_subtrees();
3313
3314 // note ambiguous imports too
3315 for (const auto& p : m->ambiguous_imports) {
3316 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3317 other_ambiguous_imports[from][p.first] = p.second;
3318 }
3319
3320 // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload
3321 // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
3322 for (const auto& p : m->table_clients) {
3323 dout(10) << " noting " << get_mdstable_name(p.type)
3324 << " pending_commits " << p.pending_commits << dendl;
3325 MDSTableClient *client = mds->get_table_client(p.type);
3326 for (const auto& q : p.pending_commits)
3327 client->notify_commit(q);
3328 }
3329
3330 // did i get them all?
3331 resolve_gather.erase(from);
3332
3333 maybe_resolve_finish();
3334 }
3335
3336 void MDCache::process_delayed_resolve()
3337 {
3338 dout(10) << "process_delayed_resolve" << dendl;
3339 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3340 tmp.swap(delayed_resolve);
3341 for (auto &p : tmp) {
3342 handle_resolve(p.second);
3343 }
3344 }
3345
3346 void MDCache::discard_delayed_resolve(mds_rank_t who)
3347 {
3348 delayed_resolve.erase(who);
3349 }
3350
3351 void MDCache::maybe_resolve_finish()
3352 {
3353 ceph_assert(resolve_ack_gather.empty());
3354 ceph_assert(resolve_need_rollback.empty());
3355
3356 if (!resolve_gather.empty()) {
3357 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3358 << resolve_gather << ")" << dendl;
3359 return;
3360 }
3361
3362 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3363 disambiguate_my_imports();
3364 finish_committed_leaders();
3365
3366 if (resolve_done) {
3367 ceph_assert(mds->is_resolve());
3368 trim_unlinked_inodes();
3369 recalc_auth_bits(false);
3370 resolve_done.release()->complete(0);
3371 } else {
3372 // I am survivor.
3373 maybe_send_pending_rejoins();
3374 }
3375 }
3376
3377 void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
3378 {
3379 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3380 mds_rank_t from = mds_rank_t(ack->get_source().num());
3381
3382 if (!resolve_ack_gather.count(from) ||
3383 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3384 return;
3385 }
3386
3387 if (ambiguous_peer_updates.count(from)) {
3388 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3389 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3390 }
3391
3392 for (const auto &p : ack->commit) {
3393 dout(10) << " commit on peer " << p.first << dendl;
3394
3395 if (ambiguous_peer_updates.count(from)) {
3396 remove_ambiguous_peer_update(p.first, from);
3397 continue;
3398 }
3399
3400 if (mds->is_resolve()) {
3401 // replay
3402 MDPeerUpdate *su = get_uncommitted_peer(p.first, from);
3403 ceph_assert(su);
3404
3405 // log commit
3406 mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from,
3407 EPeerUpdate::OP_COMMIT, su->origop),
3408 new C_MDC_PeerCommit(this, from, p.first));
3409 mds->mdlog->flush();
3410
3411 finish_uncommitted_peer(p.first);
3412 } else {
3413 MDRequestRef mdr = request_get(p.first);
3414 // information about leader imported caps
3415 if (p.second.length() > 0)
3416 mdr->more()->inode_import.share(p.second);
3417
3418 ceph_assert(mdr->peer_request == 0); // shouldn't be doing anything!
3419 request_finish(mdr);
3420 }
3421 }
3422
3423 for (const auto &metareq : ack->abort) {
3424 dout(10) << " abort on peer " << metareq << dendl;
3425
3426 if (mds->is_resolve()) {
3427 MDPeerUpdate *su = get_uncommitted_peer(metareq, from);
3428 ceph_assert(su);
3429
3430 // perform rollback (and journal a rollback entry)
3431 // note: this will hold up the resolve a bit, until the rollback entries journal.
3432 MDRequestRef null_ref;
3433 switch (su->origop) {
3434 case EPeerUpdate::LINK:
3435 mds->server->do_link_rollback(su->rollback, from, null_ref);
3436 break;
3437 case EPeerUpdate::RENAME:
3438 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3439 break;
3440 case EPeerUpdate::RMDIR:
3441 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3442 break;
3443 default:
3444 ceph_abort();
3445 }
3446 } else {
3447 MDRequestRef mdr = request_get(metareq);
3448 mdr->aborted = true;
3449 if (mdr->peer_request) {
3450 if (mdr->peer_did_prepare()) // journaling peer prepare ?
3451 add_rollback(metareq, from);
3452 } else {
3453 request_finish(mdr);
3454 }
3455 }
3456 }
3457
3458 if (!ambiguous_peer_updates.count(from)) {
3459 resolve_ack_gather.erase(from);
3460 maybe_finish_peer_resolve();
3461 }
3462 }
3463
3464 void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su)
3465 {
3466 auto const &ret = uncommitted_peers.emplace(std::piecewise_construct,
3467 std::forward_as_tuple(reqid),
3468 std::forward_as_tuple());
3469 ceph_assert(ret.second);
3470 ls->uncommitted_peers.insert(reqid);
3471 upeer &u = ret.first->second;
3472 u.leader = leader;
3473 u.ls = ls;
3474 u.su = su;
3475 if (su == nullptr) {
3476 return;
3477 }
3478 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3479 uncommitted_peer_rename_olddir[*p]++;
3480 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3481 uncommitted_peer_unlink[*p]++;
3482 }
3483
3484 void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
3485 {
3486 auto it = uncommitted_peers.find(reqid);
3487 if (it == uncommitted_peers.end()) {
3488 ceph_assert(!assert_exist);
3489 return;
3490 }
3491 upeer &u = it->second;
3492 MDPeerUpdate* su = u.su;
3493
3494 if (!u.waiters.empty()) {
3495 mds->queue_waiters(u.waiters);
3496 }
3497 u.ls->uncommitted_peers.erase(reqid);
3498 uncommitted_peers.erase(it);
3499
3500 if (su == nullptr) {
3501 return;
3502 }
3503 // discard the non-auth subtree we renamed out of
3504 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3505 CInode *diri = *p;
3506 map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);
3507 ceph_assert(it != uncommitted_peer_rename_olddir.end());
3508 it->second--;
3509 if (it->second == 0) {
3510 uncommitted_peer_rename_olddir.erase(it);
3511 auto&& ls = diri->get_dirfrags();
3512 for (const auto& dir : ls) {
3513 CDir *root = get_subtree_root(dir);
3514 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3515 try_trim_non_auth_subtree(root);
3516 if (dir != root)
3517 break;
3518 }
3519 }
3520 } else
3521 ceph_assert(it->second > 0);
3522 }
3523 // removed the inodes that were unlinked by peer update
3524 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3525 CInode *in = *p;
3526 map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);
3527 ceph_assert(it != uncommitted_peer_unlink.end());
3528 it->second--;
3529 if (it->second == 0) {
3530 uncommitted_peer_unlink.erase(it);
3531 if (!in->get_projected_parent_dn())
3532 mds->mdcache->remove_inode_recursive(in);
3533 } else
3534 ceph_assert(it->second > 0);
3535 }
3536 delete su;
3537 }
3538
3539 MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader)
3540 {
3541
3542 MDPeerUpdate* su = nullptr;
3543 auto it = uncommitted_peers.find(reqid);
3544 if (it != uncommitted_peers.end() &&
3545 it->second.leader == leader) {
3546 su = it->second.su;
3547 }
3548 return su;
3549 }
3550
3551 void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
3552 auto p = resolve_need_rollback.find(reqid);
3553 ceph_assert(p != resolve_need_rollback.end());
3554 if (mds->is_resolve()) {
3555 finish_uncommitted_peer(reqid, false);
3556 } else if (mdr) {
3557 finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled);
3558 }
3559 resolve_need_rollback.erase(p);
3560 maybe_finish_peer_resolve();
3561 }
3562
3563 void MDCache::disambiguate_other_imports()
3564 {
3565 dout(10) << "disambiguate_other_imports" << dendl;
3566
3567 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3568 // other nodes' ambiguous imports
3569 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3570 p != other_ambiguous_imports.end();
3571 ++p) {
3572 mds_rank_t who = p->first;
3573 dout(10) << "ambiguous imports for mds." << who << dendl;
3574
3575 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3576 q != p->second.end();
3577 ++q) {
3578 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3579 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3580 CDir *dir = get_force_dirfrag(q->first, recovering);
3581 if (!dir) continue;
3582
3583 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3584 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3585 dout(10) << " mds." << who << " did import " << *dir << dendl;
3586 adjust_bounded_subtree_auth(dir, q->second, who);
3587 try_subtree_merge(dir);
3588 } else {
3589 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3590 }
3591 }
3592 }
3593 other_ambiguous_imports.clear();
3594 }
3595
3596 void MDCache::disambiguate_my_imports()
3597 {
3598 dout(10) << "disambiguate_my_imports" << dendl;
3599
3600 if (!mds->is_resolve()) {
3601 ceph_assert(my_ambiguous_imports.empty());
3602 return;
3603 }
3604
3605 disambiguate_other_imports();
3606
3607 // my ambiguous imports
3608 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3609 while (!my_ambiguous_imports.empty()) {
3610 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3611
3612 CDir *dir = get_dirfrag(q->first);
3613 ceph_assert(dir);
3614
3615 if (dir->authority() != me_ambig) {
3616 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3617 cancel_ambiguous_import(dir);
3618
3619 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3620
3621 // subtree may have been swallowed by another node claiming dir
3622 // as their own.
3623 CDir *root = get_subtree_root(dir);
3624 if (root != dir)
3625 dout(10) << " subtree root is " << *root << dendl;
3626 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3627 try_trim_non_auth_subtree(root);
3628 } else {
3629 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3630 finish_ambiguous_import(q->first);
3631 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3632 }
3633 }
3634 ceph_assert(my_ambiguous_imports.empty());
3635 mds->mdlog->flush();
3636
3637 // verify all my subtrees are unambiguous!
3638 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3639 p != subtrees.end();
3640 ++p) {
3641 CDir *dir = p->first;
3642 if (dir->is_ambiguous_dir_auth()) {
3643 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3644 }
3645 ceph_assert(!dir->is_ambiguous_dir_auth());
3646 }
3647
3648 show_subtrees();
3649 }
3650
3651
3652 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3653 {
3654 ceph_assert(my_ambiguous_imports.count(base) == 0);
3655 my_ambiguous_imports[base] = bounds;
3656 }
3657
3658
3659 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3660 {
3661 // make a list
3662 vector<dirfrag_t> binos;
3663 for (set<CDir*>::iterator p = bounds.begin();
3664 p != bounds.end();
3665 ++p)
3666 binos.push_back((*p)->dirfrag());
3667
3668 // note: this can get called twice if the exporter fails during recovery
3669 if (my_ambiguous_imports.count(base->dirfrag()))
3670 my_ambiguous_imports.erase(base->dirfrag());
3671
3672 add_ambiguous_import(base->dirfrag(), binos);
3673 }
3674
3675 void MDCache::cancel_ambiguous_import(CDir *dir)
3676 {
3677 dirfrag_t df = dir->dirfrag();
3678 ceph_assert(my_ambiguous_imports.count(df));
3679 dout(10) << "cancel_ambiguous_import " << df
3680 << " bounds " << my_ambiguous_imports[df]
3681 << " " << *dir
3682 << dendl;
3683 my_ambiguous_imports.erase(df);
3684 }
3685
3686 void MDCache::finish_ambiguous_import(dirfrag_t df)
3687 {
3688 ceph_assert(my_ambiguous_imports.count(df));
3689 vector<dirfrag_t> bounds;
3690 bounds.swap(my_ambiguous_imports[df]);
3691 my_ambiguous_imports.erase(df);
3692
3693 dout(10) << "finish_ambiguous_import " << df
3694 << " bounds " << bounds
3695 << dendl;
3696 CDir *dir = get_dirfrag(df);
3697 ceph_assert(dir);
3698
3699 // adjust dir_auth, import maps
3700 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3701 try_subtree_merge(dir);
3702 }
3703
3704 void MDCache::remove_inode_recursive(CInode *in)
3705 {
3706 dout(10) << "remove_inode_recursive " << *in << dendl;
3707 auto&& ls = in->get_dirfrags();
3708 for (const auto& subdir : ls) {
3709 dout(10) << " removing dirfrag " << *subdir << dendl;
3710 auto it = subdir->items.begin();
3711 while (it != subdir->items.end()) {
3712 CDentry *dn = it->second;
3713 ++it;
3714 CDentry::linkage_t *dnl = dn->get_linkage();
3715 if (dnl->is_primary()) {
3716 CInode *tin = dnl->get_inode();
3717 subdir->unlink_inode(dn, false);
3718 remove_inode_recursive(tin);
3719 }
3720 subdir->remove_dentry(dn);
3721 }
3722
3723 if (subdir->is_subtree_root())
3724 remove_subtree(subdir);
3725 in->close_dirfrag(subdir->dirfrag().frag);
3726 }
3727 remove_inode(in);
3728 }
3729
3730 bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
3731 {
3732 ceph_assert(!in->is_auth());
3733
3734 dout(10) << __func__ << ":" << *in << dendl;
3735
3736 // Recurse into any dirfrags beneath this inode
3737 auto&& ls = in->get_dirfrags();
3738 for (const auto& subdir : ls) {
3739 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3740 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3741 return true;
3742 }
3743
3744 for (auto &it : subdir->items) {
3745 CDentry *dn = it.second;
3746 CDentry::linkage_t *dnl = dn->get_linkage();
3747 if (dnl->is_primary()) {
3748 CInode *tin = dnl->get_inode();
3749
3750 /* Remote strays with linkage (i.e. hardlinks) should not be
3751 * expired, because they may be the target of
3752 * a rename() as the owning MDS shuts down */
3753 if (!tin->is_stray() && tin->get_inode()->nlink) {
3754 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3755 return true;
3756 }
3757
3758 const bool abort = expire_recursive(tin, expiremap);
3759 if (abort) {
3760 return true;
3761 }
3762 }
3763 if (dn->lru_is_expireable()) {
3764 trim_dentry(dn, expiremap);
3765 } else {
3766 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3767 return true;
3768 }
3769 }
3770 }
3771
3772 return false;
3773 }
3774
3775 void MDCache::trim_unlinked_inodes()
3776 {
3777 dout(7) << "trim_unlinked_inodes" << dendl;
3778 int count = 0;
3779 vector<CInode*> q;
3780 for (auto &p : inode_map) {
3781 CInode *in = p.second;
3782 if (in->get_parent_dn() == NULL && !in->is_base()) {
3783 dout(7) << " will trim from " << *in << dendl;
3784 q.push_back(in);
3785 }
3786
3787 if (!(++count % 1000))
3788 mds->heartbeat_reset();
3789 }
3790 for (auto& in : q) {
3791 remove_inode_recursive(in);
3792
3793 if (!(++count % 1000))
3794 mds->heartbeat_reset();
3795 }
3796 }
3797
3798 /** recalc_auth_bits()
3799 * once subtree auth is disambiguated, we need to adjust all the
3800 * auth and dirty bits in our cache before moving on.
3801 */
3802 void MDCache::recalc_auth_bits(bool replay)
3803 {
3804 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3805
3806 if (root) {
3807 root->inode_auth.first = mds->mdsmap->get_root();
3808 bool auth = mds->get_nodeid() == root->inode_auth.first;
3809 if (auth) {
3810 root->state_set(CInode::STATE_AUTH);
3811 } else {
3812 root->state_clear(CInode::STATE_AUTH);
3813 if (!replay)
3814 root->state_set(CInode::STATE_REJOINING);
3815 }
3816 }
3817
3818 set<CInode*> subtree_inodes;
3819 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3820 p != subtrees.end();
3821 ++p) {
3822 if (p->first->dir_auth.first == mds->get_nodeid())
3823 subtree_inodes.insert(p->first->inode);
3824 }
3825
3826 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3827 p != subtrees.end();
3828 ++p) {
3829 if (p->first->inode->is_mdsdir()) {
3830 CInode *in = p->first->inode;
3831 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3832 if (auth) {
3833 in->state_set(CInode::STATE_AUTH);
3834 } else {
3835 in->state_clear(CInode::STATE_AUTH);
3836 if (!replay)
3837 in->state_set(CInode::STATE_REJOINING);
3838 }
3839 }
3840
3841 std::queue<CDir*> dfq; // dirfrag queue
3842 dfq.push(p->first);
3843
3844 bool auth = p->first->authority().first == mds->get_nodeid();
3845 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3846
3847 while (!dfq.empty()) {
3848 CDir *dir = dfq.front();
3849 dfq.pop();
3850
3851 // dir
3852 if (auth) {
3853 dir->state_set(CDir::STATE_AUTH);
3854 } else {
3855 dir->state_clear(CDir::STATE_AUTH);
3856 if (!replay) {
3857 // close empty non-auth dirfrag
3858 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3859 dir->inode->close_dirfrag(dir->get_frag());
3860 continue;
3861 }
3862 dir->state_set(CDir::STATE_REJOINING);
3863 dir->state_clear(CDir::STATE_COMPLETE);
3864 if (dir->is_dirty())
3865 dir->mark_clean();
3866 }
3867 }
3868
3869 // dentries in this dir
3870 for (auto &p : dir->items) {
3871 // dn
3872 CDentry *dn = p.second;
3873 CDentry::linkage_t *dnl = dn->get_linkage();
3874 if (auth) {
3875 dn->state_set(CDentry::STATE_AUTH);
3876 } else {
3877 dn->state_clear(CDentry::STATE_AUTH);
3878 if (!replay) {
3879 dn->state_set(CDentry::STATE_REJOINING);
3880 if (dn->is_dirty())
3881 dn->mark_clean();
3882 }
3883 }
3884
3885 if (dnl->is_primary()) {
3886 // inode
3887 CInode *in = dnl->get_inode();
3888 if (auth) {
3889 in->state_set(CInode::STATE_AUTH);
3890 } else {
3891 in->state_clear(CInode::STATE_AUTH);
3892 if (!replay) {
3893 in->state_set(CInode::STATE_REJOINING);
3894 if (in->is_dirty())
3895 in->mark_clean();
3896 if (in->is_dirty_parent())
3897 in->clear_dirty_parent();
3898 // avoid touching scatterlocks for our subtree roots!
3899 if (subtree_inodes.count(in) == 0)
3900 in->clear_scatter_dirty();
3901 }
3902 }
3903 // recurse?
3904 if (in->is_dir()) {
3905 auto&& dfv = in->get_nested_dirfrags();
3906 for (const auto& dir : dfv) {
3907 dfq.push(dir);
3908 }
3909 }
3910 }
3911 }
3912 }
3913 }
3914
3915 show_subtrees();
3916 show_cache();
3917 }
3918
3919
3920
3921 // ===========================================================================
3922 // REJOIN
3923
3924 /*
3925 * notes on scatterlock recovery:
3926 *
3927 * - recovering inode replica sends scatterlock data for any subtree
3928 * roots (the only ones that are possibly dirty).
3929 *
3930 * - surviving auth incorporates any provided scatterlock data. any
3931 * pending gathers are then finished, as with the other lock types.
3932 *
3933 * that takes care of surviving auth + (recovering replica)*.
3934 *
3935 * - surviving replica sends strong_inode, which includes current
3936 * scatterlock state, AND any dirty scatterlock data. this
3937 * provides the recovering auth with everything it might need.
3938 *
3939 * - recovering auth must pick initial scatterlock state based on
3940 * (weak|strong) rejoins.
3941 * - always assimilate scatterlock data (it can't hurt)
3942 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3943 * - include base inode in ack for all inodes that saw scatterlock content
3944 *
3945 * also, for scatter gather,
3946 *
3947 * - auth increments {frag,r}stat.version on completion of any gather.
3948 *
3949 * - auth incorporates changes in a gather _only_ if the version
3950 * matches.
3951 *
3952 * - replica discards changes any time the scatterlock syncs, and
3953 * after recovery.
3954 */
3955
3956 void MDCache::dump_rejoin_status(Formatter *f) const
3957 {
3958 f->open_object_section("rejoin_status");
3959 f->dump_stream("rejoin_gather") << rejoin_gather;
3960 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3961 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3962 f->close_section();
3963 }
3964
3965 void MDCache::rejoin_start(MDSContext *rejoin_done_)
3966 {
3967 dout(10) << "rejoin_start" << dendl;
3968 ceph_assert(!rejoin_done);
3969 rejoin_done.reset(rejoin_done_);
3970
3971 rejoin_gather = recovery_set;
3972 // need finish opening cap inodes before sending cache rejoins
3973 rejoin_gather.insert(mds->get_nodeid());
3974 process_imported_caps();
3975 }
3976
3977 /*
3978 * rejoin phase!
3979 *
3980 * this initiates rejoin. it should be called before we get any
3981 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3982 *
3983 * we start out by sending rejoins to everyone in the recovery set.
3984 *
3985 * if we are in rejoin, send for all regions in our cache.
3986 * if we are active|stopping, send only to nodes that are rejoining.
3987 */
3988 void MDCache::rejoin_send_rejoins()
3989 {
3990 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3991
3992 if (rejoin_gather.count(mds->get_nodeid())) {
3993 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3994 rejoins_pending = true;
3995 return;
3996 }
3997 if (!resolve_gather.empty()) {
3998 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3999 << resolve_gather << ")" << dendl;
4000 rejoins_pending = true;
4001 return;
4002 }
4003
4004 ceph_assert(!migrator->is_importing());
4005 ceph_assert(!migrator->is_exporting());
4006
4007 if (!mds->is_rejoin()) {
4008 disambiguate_other_imports();
4009 }
4010
4011 map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
4012
4013
4014 // if i am rejoining, send a rejoin to everyone.
4015 // otherwise, just send to others who are rejoining.
4016 for (const auto& rank : recovery_set) {
4017 if (rank == mds->get_nodeid()) continue; // nothing to myself!
4018 if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node!
4019 if (mds->is_rejoin())
4020 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
4021 else if (mds->mdsmap->is_rejoin(rank))
4022 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
4023 }
4024
4025 if (mds->is_rejoin()) {
4026 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4027 for (auto& p : cap_exports) {
4028 mds_rank_t target = p.second.first;
4029 if (rejoins.count(target) == 0)
4030 continue;
4031 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4032 Session *session = nullptr;
4033 auto it = client_exports.find(q->first);
4034 if (it != client_exports.end()) {
4035 session = it->second.first;
4036 if (session)
4037 it->second.second.insert(target);
4038 } else {
4039 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4040 auto& r = client_exports[q->first];
4041 r.first = session;
4042 if (session)
4043 r.second.insert(target);
4044 }
4045 if (session) {
4046 ++q;
4047 } else {
4048 // remove reconnect with no session
4049 p.second.second.erase(q++);
4050 }
4051 }
4052 rejoins[target]->cap_exports[p.first] = p.second.second;
4053 }
4054 for (auto& p : client_exports) {
4055 Session *session = p.second.first;
4056 for (auto& q : p.second.second) {
4057 auto rejoin = rejoins[q];
4058 rejoin->client_map[p.first] = session->info.inst;
4059 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4060 }
4061 }
4062 }
4063
4064
4065 // check all subtrees
4066 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4067 p != subtrees.end();
4068 ++p) {
4069 CDir *dir = p->first;
4070 ceph_assert(dir->is_subtree_root());
4071 if (dir->is_ambiguous_dir_auth()) {
4072 // exporter is recovering, importer is survivor.
4073 ceph_assert(rejoins.count(dir->authority().first));
4074 ceph_assert(!rejoins.count(dir->authority().second));
4075 continue;
4076 }
4077
4078 // my subtree?
4079 if (dir->is_auth())
4080 continue; // skip my own regions!
4081
4082 mds_rank_t auth = dir->get_dir_auth().first;
4083 ceph_assert(auth >= 0);
4084 if (rejoins.count(auth) == 0)
4085 continue; // don't care about this node's subtrees
4086
4087 rejoin_walk(dir, rejoins[auth]);
4088 }
4089
4090 // rejoin root inodes, too
4091 for (auto &p : rejoins) {
4092 if (mds->is_rejoin()) {
4093 // weak
4094 if (p.first == 0 && root) {
4095 p.second->add_weak_inode(root->vino());
4096 if (root->is_dirty_scattered()) {
4097 dout(10) << " sending scatterlock state on root " << *root << dendl;
4098 p.second->add_scatterlock_state(root);
4099 }
4100 }
4101 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4102 if (in)
4103 p.second->add_weak_inode(in->vino());
4104 }
4105 } else {
4106 // strong
4107 if (p.first == 0 && root) {
4108 p.second->add_strong_inode(root->vino(),
4109 root->get_replica_nonce(),
4110 root->get_caps_wanted(),
4111 root->filelock.get_state(),
4112 root->nestlock.get_state(),
4113 root->dirfragtreelock.get_state());
4114 root->state_set(CInode::STATE_REJOINING);
4115 if (root->is_dirty_scattered()) {
4116 dout(10) << " sending scatterlock state on root " << *root << dendl;
4117 p.second->add_scatterlock_state(root);
4118 }
4119 }
4120
4121 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4122 p.second->add_strong_inode(in->vino(),
4123 in->get_replica_nonce(),
4124 in->get_caps_wanted(),
4125 in->filelock.get_state(),
4126 in->nestlock.get_state(),
4127 in->dirfragtreelock.get_state());
4128 in->state_set(CInode::STATE_REJOINING);
4129 }
4130 }
4131 }
4132
4133 if (!mds->is_rejoin()) {
4134 // i am survivor. send strong rejoin.
4135 // note request remote_auth_pins, xlocks
4136 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4137 p != active_requests.end();
4138 ++p) {
4139 MDRequestRef& mdr = p->second;
4140 if (mdr->is_peer())
4141 continue;
4142 // auth pins
4143 for (const auto& q : mdr->object_states) {
4144 if (q.second.remote_auth_pinned == MDS_RANK_NONE)
4145 continue;
4146 if (!q.first->is_auth()) {
4147 mds_rank_t target = q.second.remote_auth_pinned;
4148 ceph_assert(target == q.first->authority().first);
4149 if (rejoins.count(target) == 0) continue;
4150 const auto& rejoin = rejoins[target];
4151
4152 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
4153 MDSCacheObjectInfo i;
4154 q.first->set_object_info(i);
4155 if (i.ino)
4156 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4157 else
4158 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4159
4160 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4161 mdr->more()->rename_inode == q.first)
4162 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4163 mdr->reqid, mdr->attempt);
4164 }
4165 }
4166 // xlocks
4167 for (const auto& q : mdr->locks) {
4168 auto lock = q.lock;
4169 auto obj = lock->get_parent();
4170 if (q.is_xlock() && !obj->is_auth()) {
4171 mds_rank_t who = obj->authority().first;
4172 if (rejoins.count(who) == 0) continue;
4173 const auto& rejoin = rejoins[who];
4174
4175 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
4176 MDSCacheObjectInfo i;
4177 obj->set_object_info(i);
4178 if (i.ino)
4179 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4180 mdr->reqid, mdr->attempt);
4181 else
4182 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4183 mdr->reqid, mdr->attempt);
4184 } else if (q.is_remote_wrlock()) {
4185 mds_rank_t who = q.wrlock_target;
4186 if (rejoins.count(who) == 0) continue;
4187 const auto& rejoin = rejoins[who];
4188
4189 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4190 MDSCacheObjectInfo i;
4191 obj->set_object_info(i);
4192 ceph_assert(i.ino);
4193 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4194 mdr->reqid, mdr->attempt);
4195 }
4196 }
4197 }
4198 }
4199
4200 // send the messages
4201 for (auto &p : rejoins) {
4202 ceph_assert(rejoin_sent.count(p.first) == 0);
4203 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4204 rejoin_sent.insert(p.first);
4205 rejoin_ack_gather.insert(p.first);
4206 mds->send_message_mds(p.second, p.first);
4207 }
4208 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4209 rejoins_pending = false;
4210
4211 // nothing?
4212 if (mds->is_rejoin() && rejoin_gather.empty()) {
4213 dout(10) << "nothing to rejoin" << dendl;
4214 rejoin_gather_finish();
4215 }
4216 }
4217
4218
4219 /**
4220 * rejoin_walk - build rejoin declarations for a subtree
4221 *
4222 * @param dir subtree root
4223 * @param rejoin rejoin message
4224 *
4225 * from a rejoining node:
4226 * weak dirfrag
4227 * weak dentries (w/ connectivity)
4228 *
4229 * from a surviving node:
4230 * strong dirfrag
4231 * strong dentries (no connectivity!)
4232 * strong inodes
4233 */
4234 void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
4235 {
4236 dout(10) << "rejoin_walk " << *dir << dendl;
4237
4238 std::vector<CDir*> nested; // finish this dir, then do nested items
4239
4240 if (mds->is_rejoin()) {
4241 // WEAK
4242 rejoin->add_weak_dirfrag(dir->dirfrag());
4243 for (auto &p : dir->items) {
4244 CDentry *dn = p.second;
4245 ceph_assert(dn->last == CEPH_NOSNAP);
4246 CDentry::linkage_t *dnl = dn->get_linkage();
4247 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4248 ceph_assert(dnl->is_primary());
4249 CInode *in = dnl->get_inode();
4250 ceph_assert(dnl->get_inode()->is_dir());
4251 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
4252 {
4253 auto&& dirs = in->get_nested_dirfrags();
4254 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4255 }
4256 if (in->is_dirty_scattered()) {
4257 dout(10) << " sending scatterlock state on " << *in << dendl;
4258 rejoin->add_scatterlock_state(in);
4259 }
4260 }
4261 } else {
4262 // STRONG
4263 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4264 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4265 dir->state_set(CDir::STATE_REJOINING);
4266
4267 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
4268 CDentry *dn = it->second;
4269 ++it;
4270 dn->state_set(CDentry::STATE_REJOINING);
4271 CDentry::linkage_t *dnl = dn->get_linkage();
4272 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4273
4274 // trim snap dentries. because they may have been pruned by
4275 // their auth mds (snap deleted)
4276 if (dn->last != CEPH_NOSNAP) {
4277 if (in && !in->remote_parents.empty()) {
4278 // unlink any stale remote snap dentry.
4279 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4280 CDentry *remote_dn = *it2;
4281 ++it2;
4282 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4283 remote_dn->unlink_remote(remote_dn->get_linkage());
4284 }
4285 }
4286 if (dn->lru_is_expireable()) {
4287 if (!dnl->is_null())
4288 dir->unlink_inode(dn, false);
4289 if (in)
4290 remove_inode(in);
4291 dir->remove_dentry(dn);
4292 continue;
4293 } else {
4294 // Inventing null/remote dentry shouldn't cause problem
4295 ceph_assert(!dnl->is_primary());
4296 }
4297 }
4298
4299 dout(15) << " add_strong_dentry " << *dn << dendl;
4300 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
4301 dn->first, dn->last,
4302 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4303 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4304 dnl->is_remote() ? dnl->get_remote_d_type():0,
4305 dn->get_replica_nonce(),
4306 dn->lock.get_state());
4307 dn->state_set(CDentry::STATE_REJOINING);
4308 if (dnl->is_primary()) {
4309 CInode *in = dnl->get_inode();
4310 dout(15) << " add_strong_inode " << *in << dendl;
4311 rejoin->add_strong_inode(in->vino(),
4312 in->get_replica_nonce(),
4313 in->get_caps_wanted(),
4314 in->filelock.get_state(),
4315 in->nestlock.get_state(),
4316 in->dirfragtreelock.get_state());
4317 in->state_set(CInode::STATE_REJOINING);
4318 {
4319 auto&& dirs = in->get_nested_dirfrags();
4320 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4321 }
4322 if (in->is_dirty_scattered()) {
4323 dout(10) << " sending scatterlock state on " << *in << dendl;
4324 rejoin->add_scatterlock_state(in);
4325 }
4326 }
4327 }
4328 }
4329
4330 // recurse into nested dirs
4331 for (const auto& dir : nested) {
4332 rejoin_walk(dir, rejoin);
4333 }
4334 }
4335
4336
4337 /*
4338 * i got a rejoin.
4339 * - reply with the lockstate
4340 *
4341 * if i am active|stopping,
4342 * - remove source from replica list for everything not referenced here.
4343 */
4344 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4345 {
4346 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4347 << " (" << m->get_payload().length() << " bytes)"
4348 << dendl;
4349
4350 switch (m->op) {
4351 case MMDSCacheRejoin::OP_WEAK:
4352 handle_cache_rejoin_weak(m);
4353 break;
4354 case MMDSCacheRejoin::OP_STRONG:
4355 handle_cache_rejoin_strong(m);
4356 break;
4357 case MMDSCacheRejoin::OP_ACK:
4358 handle_cache_rejoin_ack(m);
4359 break;
4360
4361 default:
4362 ceph_abort();
4363 }
4364 }
4365
4366
4367 /*
4368 * handle_cache_rejoin_weak
4369 *
4370 * the sender
4371 * - is recovering from their journal.
4372 * - may have incorrect (out of date) inode contents
4373 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4374 *
4375 * if the sender didn't trim_non_auth(), they
4376 * - may have incorrect (out of date) dentry/inode linkage
4377 * - may have deleted/purged inodes
4378 * and i may have to go to disk to get accurate inode contents. yuck.
4379 */
4380 void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
4381 {
4382 mds_rank_t from = mds_rank_t(weak->get_source().num());
4383
4384 // possible response(s)
4385 ref_t<MMDSCacheRejoin> ack; // if survivor
4386 set<vinodeno_t> acked_inodes; // if survivor
4387 set<SimpleLock *> gather_locks; // if survivor
4388 bool survivor = false; // am i a survivor?
4389
4390 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4391 survivor = true;
4392 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
4393 ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
4394
4395 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4396
4397 // check cap exports
4398 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4399 CInode *in = get_inode(p->first);
4400 ceph_assert(!in || in->is_auth());
4401 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4402 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4403 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4404 Capability::Import& im = imported_caps[p->first][q->first];
4405 if (cap) {
4406 im.cap_id = cap->get_cap_id();
4407 im.issue_seq = cap->get_last_seq();
4408 im.mseq = cap->get_mseq();
4409 } else {
4410 // all are zero
4411 }
4412 }
4413 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4414 }
4415
4416 encode(imported_caps, ack->imported_caps);
4417 } else {
4418 ceph_assert(mds->is_rejoin());
4419
4420 // we may have already received a strong rejoin from the sender.
4421 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4422 ceph_assert(gather_locks.empty());
4423
4424 // check cap exports.
4425 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4426 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4427 weak->client_metadata_map.end());
4428
4429 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4430 CInode *in = get_inode(p->first);
4431 ceph_assert(!in || in->is_auth());
4432 // note
4433 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4434 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4435 cap_imports[p->first][q->first][from] = q->second;
4436 }
4437 }
4438 }
4439
4440 // assimilate any potentially dirty scatterlock state
4441 for (const auto &p : weak->inode_scatterlocks) {
4442 CInode *in = get_inode(p.first);
4443 ceph_assert(in);
4444 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4445 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4446 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4447 if (!survivor)
4448 rejoin_potential_updated_scatterlocks.insert(in);
4449 }
4450
4451 // recovering peer may send incorrect dirfrags here. we need to
4452 // infer which dirfrag they meant. the ack will include a
4453 // strong_dirfrag that will set them straight on the fragmentation.
4454
4455 // walk weak map
4456 set<CDir*> dirs_to_share;
4457 for (const auto &p : weak->weak_dirfrags) {
4458 CInode *diri = get_inode(p.ino);
4459 if (!diri)
4460 dout(0) << " missing dir ino " << p.ino << dendl;
4461 ceph_assert(diri);
4462
4463 frag_vec_t leaves;
4464 if (diri->dirfragtree.is_leaf(p.frag)) {
4465 leaves.push_back(p.frag);
4466 } else {
4467 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4468 if (leaves.empty())
4469 leaves.push_back(diri->dirfragtree[p.frag.value()]);
4470 }
4471 for (const auto& leaf : leaves) {
4472 CDir *dir = diri->get_dirfrag(leaf);
4473 if (!dir) {
4474 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
4475 continue;
4476 }
4477 ceph_assert(dir);
4478 if (dirs_to_share.count(dir)) {
4479 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4480 } else {
4481 dirs_to_share.insert(dir);
4482 unsigned nonce = dir->add_replica(from);
4483 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4484 if (ack) {
4485 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4486 ack->add_dirfrag_base(dir);
4487 }
4488 }
4489 }
4490 }
4491
4492 for (const auto &p : weak->weak) {
4493 CInode *diri = get_inode(p.first);
4494 if (!diri)
4495 dout(0) << " missing dir ino " << p.first << dendl;
4496 ceph_assert(diri);
4497
4498 // weak dentries
4499 CDir *dir = 0;
4500 for (const auto &q : p.second) {
4501 // locate proper dirfrag.
4502 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4503 frag_t fg = diri->pick_dirfrag(q.first.name);
4504 if (!dir || dir->get_frag() != fg) {
4505 dir = diri->get_dirfrag(fg);
4506 if (!dir)
4507 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4508 ceph_assert(dir);
4509 ceph_assert(dirs_to_share.count(dir));
4510 }
4511
4512 // and dentry
4513 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4514 ceph_assert(dn);
4515 CDentry::linkage_t *dnl = dn->get_linkage();
4516 ceph_assert(dnl->is_primary());
4517
4518 if (survivor && dn->is_replica(from))
4519 dentry_remove_replica(dn, from, gather_locks);
4520 unsigned dnonce = dn->add_replica(from);
4521 dout(10) << " have " << *dn << dendl;
4522 if (ack)
4523 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
4524 dn->first, dn->last,
4525 dnl->get_inode()->ino(), inodeno_t(0), 0,
4526 dnonce, dn->lock.get_replica_state());
4527
4528 // inode
4529 CInode *in = dnl->get_inode();
4530 ceph_assert(in);
4531
4532 if (survivor && in->is_replica(from))
4533 inode_remove_replica(in, from, true, gather_locks);
4534 unsigned inonce = in->add_replica(from);
4535 dout(10) << " have " << *in << dendl;
4536
4537 // scatter the dirlock, just in case?
4538 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4539 in->filelock.set_state(LOCK_MIX);
4540
4541 if (ack) {
4542 acked_inodes.insert(in->vino());
4543 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4544 bufferlist bl;
4545 in->_encode_locks_state_for_rejoin(bl, from);
4546 ack->add_inode_locks(in, inonce, bl);
4547 }
4548 }
4549 }
4550
4551 // weak base inodes? (root, stray, etc.)
4552 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4553 p != weak->weak_inodes.end();
4554 ++p) {
4555 CInode *in = get_inode(*p);
4556 ceph_assert(in); // hmm fixme wrt stray?
4557 if (survivor && in->is_replica(from))
4558 inode_remove_replica(in, from, true, gather_locks);
4559 unsigned inonce = in->add_replica(from);
4560 dout(10) << " have base " << *in << dendl;
4561
4562 if (ack) {
4563 acked_inodes.insert(in->vino());
4564 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4565 bufferlist bl;
4566 in->_encode_locks_state_for_rejoin(bl, from);
4567 ack->add_inode_locks(in, inonce, bl);
4568 }
4569 }
4570
4571 ceph_assert(rejoin_gather.count(from));
4572 rejoin_gather.erase(from);
4573 if (survivor) {
4574 // survivor. do everything now.
4575 for (const auto &p : weak->inode_scatterlocks) {
4576 CInode *in = get_inode(p.first);
4577 ceph_assert(in);
4578 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4579 acked_inodes.insert(in->vino());
4580 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4581 }
4582
4583 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4584 mds->send_message(ack, weak->get_connection());
4585
4586 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4587 if (!(*p)->is_stable())
4588 mds->locker->eval_gather(*p);
4589 }
4590 } else {
4591 // done?
4592 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4593 rejoin_gather_finish();
4594 } else {
4595 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4596 }
4597 }
4598 }
4599
4600 /*
4601 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4602 *
4603 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4604 * ack, the replica dne, and we can remove it from our replica maps.
4605 */
4606 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
4607 set<vinodeno_t>& acked_inodes,
4608 set<SimpleLock *>& gather_locks)
4609 {
4610 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4611
4612 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4613 // inode?
4614 if (in->is_auth() &&
4615 in->is_replica(from) &&
4616 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4617 inode_remove_replica(in, from, false, gather_locks);
4618 dout(10) << " rem " << *in << dendl;
4619 }
4620
4621 if (!in->is_dir())
4622 return;
4623
4624 const auto&& dfs = in->get_dirfrags();
4625 for (const auto& dir : dfs) {
4626 if (!dir->is_auth())
4627 continue;
4628
4629 if (dir->is_replica(from) &&
4630 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4631 dir->remove_replica(from);
4632 dout(10) << " rem " << *dir << dendl;
4633 }
4634
4635 // dentries
4636 for (auto &p : dir->items) {
4637 CDentry *dn = p.second;
4638
4639 if (dn->is_replica(from)) {
4640 if (ack) {
4641 const auto it = ack->strong_dentries.find(dir->dirfrag());
4642 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4643 continue;
4644 }
4645 }
4646 dentry_remove_replica(dn, from, gather_locks);
4647 dout(10) << " rem " << *dn << dendl;
4648 }
4649 }
4650 }
4651 };
4652
4653 for (auto &p : inode_map)
4654 scour_func(p.second);
4655 for (auto &p : snap_inode_map)
4656 scour_func(p.second);
4657 }
4658
4659
4660 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4661 {
4662 CInode *in = new CInode(this, true, 2, last);
4663 in->_get_inode()->ino = ino;
4664 in->state_set(CInode::STATE_REJOINUNDEF);
4665 add_inode(in);
4666 rejoin_undef_inodes.insert(in);
4667 dout(10) << " invented " << *in << dendl;
4668 return in;
4669 }
4670
4671 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4672 {
4673 CInode *in = get_inode(df.ino);
4674 if (!in)
4675 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4676 if (!in->is_dir()) {
4677 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4678 in->_get_inode()->mode = S_IFDIR;
4679 in->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4680 }
4681 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4682 dir->state_set(CDir::STATE_REJOINUNDEF);
4683 rejoin_undef_dirfrags.insert(dir);
4684 dout(10) << " invented " << *dir << dendl;
4685 return dir;
4686 }
4687
4688 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4689 {
4690 mds_rank_t from = mds_rank_t(strong->get_source().num());
4691
4692 // only a recovering node will get a strong rejoin.
4693 if (!mds->is_rejoin()) {
4694 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4695 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4696 return;
4697 }
4698 ceph_abort_msg("got unexpected rejoin message during recovery");
4699 }
4700
4701 // assimilate any potentially dirty scatterlock state
4702 for (const auto &p : strong->inode_scatterlocks) {
4703 CInode *in = get_inode(p.first);
4704 ceph_assert(in);
4705 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4706 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4707 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4708 rejoin_potential_updated_scatterlocks.insert(in);
4709 }
4710
4711 rejoin_unlinked_inodes[from].clear();
4712
4713 // surviving peer may send incorrect dirfrag here (maybe they didn't
4714 // get the fragment notify, or maybe we rolled back?). we need to
4715 // infer the right frag and get them with the program. somehow.
4716 // we don't normally send ACK.. so we'll need to bundle this with
4717 // MISSING or something.
4718
4719 // strong dirfrags/dentries.
4720 // also process auth_pins, xlocks.
4721 for (const auto &p : strong->strong_dirfrags) {
4722 auto& dirfrag = p.first;
4723 CInode *diri = get_inode(dirfrag.ino);
4724 if (!diri)
4725 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4726 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4727 bool refragged = false;
4728 if (dir) {
4729 dout(10) << " have " << *dir << dendl;
4730 } else {
4731 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4732 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4733 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4734 dir = rejoin_invent_dirfrag(dirfrag);
4735 }
4736 if (dir) {
4737 dir->add_replica(from, p.second.nonce);
4738 dir->dir_rep = p.second.dir_rep;
4739 } else {
4740 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4741 frag_vec_t leaves;
4742 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4743 if (leaves.empty())
4744 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4745 dout(10) << " maps to frag(s) " << leaves << dendl;
4746 for (const auto& leaf : leaves) {
4747 CDir *dir = diri->get_dirfrag(leaf);
4748 if (!dir)
4749 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4750 else
4751 dout(10) << " have(approx) " << *dir << dendl;
4752 dir->add_replica(from, p.second.nonce);
4753 dir->dir_rep = p.second.dir_rep;
4754 }
4755 refragged = true;
4756 }
4757
4758 const auto it = strong->strong_dentries.find(dirfrag);
4759 if (it != strong->strong_dentries.end()) {
4760 const auto& dmap = it->second;
4761 for (const auto &q : dmap) {
4762 const string_snap_t& ss = q.first;
4763 const MMDSCacheRejoin::dn_strong& d = q.second;
4764 CDentry *dn;
4765 if (!refragged)
4766 dn = dir->lookup(ss.name, ss.snapid);
4767 else {
4768 frag_t fg = diri->pick_dirfrag(ss.name);
4769 dir = diri->get_dirfrag(fg);
4770 ceph_assert(dir);
4771 dn = dir->lookup(ss.name, ss.snapid);
4772 }
4773 if (!dn) {
4774 if (d.is_remote()) {
4775 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4776 } else if (d.is_null()) {
4777 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4778 } else {
4779 CInode *in = get_inode(d.ino, ss.snapid);
4780 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4781 dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4782 }
4783 dout(10) << " invented " << *dn << dendl;
4784 }
4785 CDentry::linkage_t *dnl = dn->get_linkage();
4786
4787 // dn auth_pin?
4788 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4789 if (pinned_it != strong->authpinned_dentries.end()) {
4790 const auto peer_reqid_it = pinned_it->second.find(ss);
4791 if (peer_reqid_it != pinned_it->second.end()) {
4792 for (const auto &r : peer_reqid_it->second) {
4793 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4794
4795 // get/create peer mdrequest
4796 MDRequestRef mdr;
4797 if (have_request(r.reqid))
4798 mdr = request_get(r.reqid);
4799 else
4800 mdr = request_start_peer(r.reqid, r.attempt, strong);
4801 mdr->auth_pin(dn);
4802 }
4803 }
4804 }
4805
4806 // dn xlock?
4807 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4808 if (xlocked_it != strong->xlocked_dentries.end()) {
4809 const auto ss_req_it = xlocked_it->second.find(ss);
4810 if (ss_req_it != xlocked_it->second.end()) {
4811 const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second;
4812 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4813 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4814 ceph_assert(mdr->is_auth_pinned(dn));
4815 if (!mdr->is_xlocked(&dn->versionlock)) {
4816 ceph_assert(dn->versionlock.can_xlock_local());
4817 dn->versionlock.get_xlock(mdr, mdr->get_client());
4818 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4819 }
4820 if (dn->lock.is_stable())
4821 dn->auth_pin(&dn->lock);
4822 dn->lock.set_state(LOCK_XLOCK);
4823 dn->lock.get_xlock(mdr, mdr->get_client());
4824 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4825 }
4826 }
4827
4828 dn->add_replica(from, d.nonce);
4829 dout(10) << " have " << *dn << dendl;
4830
4831 if (dnl->is_primary()) {
4832 if (d.is_primary()) {
4833 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4834 // the survivor missed MDentryUnlink+MDentryLink messages ?
4835 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4836 CInode *in = get_inode(d.ino, ss.snapid);
4837 ceph_assert(in);
4838 ceph_assert(in->get_parent_dn());
4839 rejoin_unlinked_inodes[from].insert(in);
4840 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4841 }
4842 } else {
4843 // the survivor missed MDentryLink message ?
4844 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4845 dout(7) << " sender doesn't have primay dentry" << dendl;
4846 }
4847 } else {
4848 if (d.is_primary()) {
4849 // the survivor missed MDentryUnlink message ?
4850 CInode *in = get_inode(d.ino, ss.snapid);
4851 ceph_assert(in);
4852 ceph_assert(in->get_parent_dn());
4853 rejoin_unlinked_inodes[from].insert(in);
4854 dout(7) << " sender has primary dentry but we don't" << dendl;
4855 }
4856 }
4857 }
4858 }
4859 }
4860
4861 for (const auto &p : strong->strong_inodes) {
4862 CInode *in = get_inode(p.first);
4863 ceph_assert(in);
4864 in->add_replica(from, p.second.nonce);
4865 dout(10) << " have " << *in << dendl;
4866
4867 const MMDSCacheRejoin::inode_strong& is = p.second;
4868
4869 // caps_wanted
4870 if (is.caps_wanted) {
4871 in->set_mds_caps_wanted(from, is.caps_wanted);
4872 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4873 << " on " << *in << dendl;
4874 }
4875
4876 // scatterlocks?
4877 // infer state from replica state:
4878 // * go to MIX if they might have wrlocks
4879 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4880 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4881 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4882 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4883
4884 // auth pin?
4885 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4886 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4887 for (const auto& r : authpinned_inodes_it->second) {
4888 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4889
4890 // get/create peer mdrequest
4891 MDRequestRef mdr;
4892 if (have_request(r.reqid))
4893 mdr = request_get(r.reqid);
4894 else
4895 mdr = request_start_peer(r.reqid, r.attempt, strong);
4896 if (strong->frozen_authpin_inodes.count(in->vino())) {
4897 ceph_assert(!in->get_num_auth_pins());
4898 mdr->freeze_auth_pin(in);
4899 } else {
4900 ceph_assert(!in->is_frozen_auth_pin());
4901 }
4902 mdr->auth_pin(in);
4903 }
4904 }
4905 // xlock(s)?
4906 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4907 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4908 for (const auto &q : xlocked_inodes_it->second) {
4909 SimpleLock *lock = in->get_lock(q.first);
4910 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4911 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4912 ceph_assert(mdr->is_auth_pinned(in));
4913 if (!mdr->is_xlocked(&in->versionlock)) {
4914 ceph_assert(in->versionlock.can_xlock_local());
4915 in->versionlock.get_xlock(mdr, mdr->get_client());
4916 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4917 }
4918 if (lock->is_stable())
4919 in->auth_pin(lock);
4920 lock->set_state(LOCK_XLOCK);
4921 if (lock == &in->filelock)
4922 in->loner_cap = -1;
4923 lock->get_xlock(mdr, mdr->get_client());
4924 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4925 }
4926 }
4927 }
4928 // wrlock(s)?
4929 for (const auto &p : strong->wrlocked_inodes) {
4930 CInode *in = get_inode(p.first);
4931 for (const auto &q : p.second) {
4932 SimpleLock *lock = in->get_lock(q.first);
4933 for (const auto &r : q.second) {
4934 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4935 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4936 if (in->is_auth())
4937 ceph_assert(mdr->is_auth_pinned(in));
4938 lock->set_state(LOCK_MIX);
4939 if (lock == &in->filelock)
4940 in->loner_cap = -1;
4941 lock->get_wrlock(true);
4942 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4943 }
4944 }
4945 }
4946
4947 // done?
4948 ceph_assert(rejoin_gather.count(from));
4949 rejoin_gather.erase(from);
4950 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4951 rejoin_gather_finish();
4952 } else {
4953 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4954 }
4955 }
4956
4957 void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
4958 {
4959 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4960 mds_rank_t from = mds_rank_t(ack->get_source().num());
4961
4962 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4963 bool survivor = !mds->is_rejoin();
4964
4965 // for sending cache expire message
4966 set<CInode*> isolated_inodes;
4967 set<CInode*> refragged_inodes;
4968 list<pair<CInode*,int> > updated_realms;
4969
4970 // dirs
4971 for (const auto &p : ack->strong_dirfrags) {
4972 // we may have had incorrect dir fragmentation; refragment based
4973 // on what they auth tells us.
4974 CDir *dir = get_dirfrag(p.first);
4975 if (!dir) {
4976 dir = get_force_dirfrag(p.first, false);
4977 if (dir)
4978 refragged_inodes.insert(dir->get_inode());
4979 }
4980 if (!dir) {
4981 CInode *diri = get_inode(p.first.ino);
4982 if (!diri) {
4983 // barebones inode; the full inode loop below will clean up.
4984 diri = new CInode(this, false);
4985 auto _inode = diri->_get_inode();
4986 _inode->ino = p.first.ino;
4987 _inode->mode = S_IFDIR;
4988 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4989
4990 add_inode(diri);
4991 if (MDS_INO_MDSDIR(from) == p.first.ino) {
4992 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4993 dout(10) << " add inode " << *diri << dendl;
4994 } else {
4995 diri->inode_auth = CDIR_AUTH_DEFAULT;
4996 isolated_inodes.insert(diri);
4997 dout(10) << " unconnected dirfrag " << p.first << dendl;
4998 }
4999 }
5000 // barebones dirfrag; the full dirfrag loop below will clean up.
5001 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
5002 if (MDS_INO_MDSDIR(from) == p.first.ino ||
5003 (dir->authority() != CDIR_AUTH_UNDEF &&
5004 dir->authority().first != from))
5005 adjust_subtree_auth(dir, from);
5006 dout(10) << " add dirfrag " << *dir << dendl;
5007 }
5008
5009 dir->set_replica_nonce(p.second.nonce);
5010 dir->state_clear(CDir::STATE_REJOINING);
5011 dout(10) << " got " << *dir << dendl;
5012
5013 // dentries
5014 auto it = ack->strong_dentries.find(p.first);
5015 if (it != ack->strong_dentries.end()) {
5016 for (const auto &q : it->second) {
5017 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
5018 if(!dn)
5019 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
5020
5021 CDentry::linkage_t *dnl = dn->get_linkage();
5022
5023 ceph_assert(dn->last == q.first.snapid);
5024 if (dn->first != q.second.first) {
5025 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5026 dn->first = q.second.first;
5027 }
5028
5029 // may have bad linkage if we missed dentry link/unlink messages
5030 if (dnl->is_primary()) {
5031 CInode *in = dnl->get_inode();
5032 if (!q.second.is_primary() ||
5033 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5034 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5035 dir->unlink_inode(dn);
5036 }
5037 } else if (dnl->is_remote()) {
5038 if (!q.second.is_remote() ||
5039 q.second.remote_ino != dnl->get_remote_ino() ||
5040 q.second.remote_d_type != dnl->get_remote_d_type()) {
5041 dout(10) << " had bad linkage for " << *dn << dendl;
5042 dir->unlink_inode(dn);
5043 }
5044 } else {
5045 if (!q.second.is_null())
5046 dout(10) << " had bad linkage for " << *dn << dendl;
5047 }
5048
5049 // hmm, did we have the proper linkage here?
5050 if (dnl->is_null() && !q.second.is_null()) {
5051 if (q.second.is_remote()) {
5052 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5053 } else {
5054 CInode *in = get_inode(q.second.ino, q.first.snapid);
5055 if (!in) {
5056 // barebones inode; assume it's dir, the full inode loop below will clean up.
5057 in = new CInode(this, false, q.second.first, q.first.snapid);
5058 auto _inode = in->_get_inode();
5059 _inode->ino = q.second.ino;
5060 _inode->mode = S_IFDIR;
5061 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5062 add_inode(in);
5063 dout(10) << " add inode " << *in << dendl;
5064 } else if (in->get_parent_dn()) {
5065 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5066 << ", unlinking " << *in << dendl;
5067 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5068 }
5069 dn->dir->link_primary_inode(dn, in);
5070 isolated_inodes.erase(in);
5071 }
5072 }
5073
5074 dn->set_replica_nonce(q.second.nonce);
5075 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5076 dn->state_clear(CDentry::STATE_REJOINING);
5077 dout(10) << " got " << *dn << dendl;
5078 }
5079 }
5080 }
5081
5082 for (const auto& in : refragged_inodes) {
5083 auto&& ls = in->get_nested_dirfrags();
5084 for (const auto& dir : ls) {
5085 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
5086 continue;
5087 ceph_assert(dir->get_num_any() == 0);
5088 in->close_dirfrag(dir->get_frag());
5089 }
5090 }
5091
5092 // full dirfrags
5093 for (const auto &p : ack->dirfrag_bases) {
5094 CDir *dir = get_dirfrag(p.first);
5095 ceph_assert(dir);
5096 auto q = p.second.cbegin();
5097 dir->_decode_base(q);
5098 dout(10) << " got dir replica " << *dir << dendl;
5099 }
5100
5101 // full inodes
5102 auto p = ack->inode_base.cbegin();
5103 while (!p.end()) {
5104 inodeno_t ino;
5105 snapid_t last;
5106 bufferlist basebl;
5107 decode(ino, p);
5108 decode(last, p);
5109 decode(basebl, p);
5110 CInode *in = get_inode(ino, last);
5111 ceph_assert(in);
5112 auto q = basebl.cbegin();
5113 snapid_t sseq = 0;
5114 if (in->snaprealm)
5115 sseq = in->snaprealm->srnode.seq;
5116 in->_decode_base(q);
5117 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5118 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5119 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5120 }
5121 dout(10) << " got inode base " << *in << dendl;
5122 }
5123
5124 // inodes
5125 p = ack->inode_locks.cbegin();
5126 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5127 while (!p.end()) {
5128 inodeno_t ino;
5129 snapid_t last;
5130 __u32 nonce;
5131 bufferlist lockbl;
5132 decode(ino, p);
5133 decode(last, p);
5134 decode(nonce, p);
5135 decode(lockbl, p);
5136
5137 CInode *in = get_inode(ino, last);
5138 ceph_assert(in);
5139 in->set_replica_nonce(nonce);
5140 auto q = lockbl.cbegin();
5141 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5142 in->state_clear(CInode::STATE_REJOINING);
5143 dout(10) << " got inode locks " << *in << dendl;
5144 }
5145
5146 // FIXME: This can happen if entire subtree, together with the inode subtree root
5147 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5148 ceph_assert(isolated_inodes.empty());
5149
5150 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5151 auto bp = ack->imported_caps.cbegin();
5152 decode(peer_imported, bp);
5153
5154 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5155 p != peer_imported.end();
5156 ++p) {
5157 auto& ex = cap_exports.at(p->first);
5158 ceph_assert(ex.first == from);
5159 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5160 q != p->second.end();
5161 ++q) {
5162 auto r = ex.second.find(q->first);
5163 ceph_assert(r != ex.second.end());
5164
5165 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5166 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5167 if (!session) {
5168 dout(10) << " no session for client." << p->first << dendl;
5169 ex.second.erase(r);
5170 continue;
5171 }
5172
5173 // mark client caps stale.
5174 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
5175 r->second.capinfo.cap_id, 0,
5176 mds->get_osd_epoch_barrier());
5177 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5178 (q->second.cap_id > 0 ? from : -1), 0);
5179 mds->send_message_client_counted(m, session);
5180
5181 ex.second.erase(r);
5182 }
5183 ceph_assert(ex.second.empty());
5184 }
5185
5186 for (auto p : updated_realms) {
5187 CInode *in = p.first;
5188 bool notify_clients;
5189 if (mds->is_rejoin()) {
5190 if (!rejoin_pending_snaprealms.count(in)) {
5191 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5192 rejoin_pending_snaprealms.insert(in);
5193 }
5194 notify_clients = false;
5195 } else {
5196 // notify clients if I'm survivor
5197 notify_clients = true;
5198 }
5199 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
5200 }
5201
5202 // done?
5203 ceph_assert(rejoin_ack_gather.count(from));
5204 rejoin_ack_gather.erase(from);
5205 if (!survivor) {
5206 if (rejoin_gather.empty()) {
5207 // eval unstable scatter locks after all wrlocks are rejoined.
5208 while (!rejoin_eval_locks.empty()) {
5209 SimpleLock *lock = rejoin_eval_locks.front();
5210 rejoin_eval_locks.pop_front();
5211 if (!lock->is_stable())
5212 mds->locker->eval_gather(lock);
5213 }
5214 }
5215
5216 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5217 rejoin_ack_gather.empty()) {
5218 // finally, kickstart past snap parent opens
5219 open_snaprealms();
5220 } else {
5221 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5222 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5223 }
5224 } else {
5225 // survivor.
5226 mds->queue_waiters(rejoin_waiters);
5227 }
5228 }
5229
5230 /**
5231 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5232 *
5233 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5234 * messages that clean these guys up...
5235 */
5236 void MDCache::rejoin_trim_undef_inodes()
5237 {
5238 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5239
5240 while (!rejoin_undef_inodes.empty()) {
5241 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5242 CInode *in = *p;
5243 rejoin_undef_inodes.erase(p);
5244
5245 in->clear_replica_map();
5246
5247 // close out dirfrags
5248 if (in->is_dir()) {
5249 const auto&& dfls = in->get_dirfrags();
5250 for (const auto& dir : dfls) {
5251 dir->clear_replica_map();
5252
5253 for (auto &p : dir->items) {
5254 CDentry *dn = p.second;
5255 dn->clear_replica_map();
5256
5257 dout(10) << " trimming " << *dn << dendl;
5258 dir->remove_dentry(dn);
5259 }
5260
5261 dout(10) << " trimming " << *dir << dendl;
5262 in->close_dirfrag(dir->dirfrag().frag);
5263 }
5264 }
5265
5266 CDentry *dn = in->get_parent_dn();
5267 if (dn) {
5268 dn->clear_replica_map();
5269 dout(10) << " trimming " << *dn << dendl;
5270 dn->dir->remove_dentry(dn);
5271 } else {
5272 dout(10) << " trimming " << *in << dendl;
5273 remove_inode(in);
5274 }
5275 }
5276
5277 ceph_assert(rejoin_undef_inodes.empty());
5278 }
5279
5280 void MDCache::rejoin_gather_finish()
5281 {
5282 dout(10) << "rejoin_gather_finish" << dendl;
5283 ceph_assert(mds->is_rejoin());
5284 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
5285
5286 if (open_undef_inodes_dirfrags())
5287 return;
5288
5289 if (process_imported_caps())
5290 return;
5291
5292 choose_lock_states_and_reconnect_caps();
5293
5294 identify_files_to_recover();
5295 rejoin_send_acks();
5296
5297 // signal completion of fetches, rejoin_gather_finish, etc.
5298 rejoin_ack_gather.erase(mds->get_nodeid());
5299
5300 // did we already get our acks too?
5301 if (rejoin_ack_gather.empty()) {
5302 // finally, open snaprealms
5303 open_snaprealms();
5304 }
5305 }
5306
5307 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5308 inodeno_t ino;
5309 public:
5310 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5311 void finish(int r) override {
5312 mdcache->rejoin_open_ino_finish(ino, r);
5313 }
5314 };
5315
5316 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5317 {
5318 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5319
5320 if (ret < 0) {
5321 cap_imports_missing.insert(ino);
5322 } else if (ret == mds->get_nodeid()) {
5323 ceph_assert(get_inode(ino));
5324 } else {
5325 auto p = cap_imports.find(ino);
5326 ceph_assert(p != cap_imports.end());
5327 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5328 ceph_assert(q->second.count(MDS_RANK_NONE));
5329 ceph_assert(q->second.size() == 1);
5330 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5331 }
5332 cap_imports.erase(p);
5333 }
5334
5335 ceph_assert(cap_imports_num_opening > 0);
5336 cap_imports_num_opening--;
5337
5338 if (cap_imports_num_opening == 0) {
5339 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5340 rejoin_gather_finish();
5341 else if (rejoin_gather.count(mds->get_nodeid()))
5342 process_imported_caps();
5343 }
5344 }
5345
5346 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5347 public:
5348 map<client_t,pair<Session*,uint64_t> > session_map;
5349 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
5350 void finish(int r) override {
5351 ceph_assert(r == 0);
5352 mdcache->rejoin_open_sessions_finish(session_map);
5353 }
5354 };
5355
5356 void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
5357 {
5358 dout(10) << "rejoin_open_sessions_finish" << dendl;
5359 mds->server->finish_force_open_sessions(session_map);
5360 rejoin_session_map.swap(session_map);
5361 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5362 rejoin_gather_finish();
5363 }
5364
5365 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5366 {
5367 auto p = cap_imports.find(ino);
5368 if (p != cap_imports.end()) {
5369 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5370 if (ret < 0) {
5371 cap_imports_missing.insert(ino);
5372 } else if (ret != mds->get_nodeid()) {
5373 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5374 ceph_assert(q->second.count(MDS_RANK_NONE));
5375 ceph_assert(q->second.size() == 1);
5376 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5377 }
5378 cap_imports.erase(p);
5379 }
5380 }
5381 }
5382
5383 bool MDCache::process_imported_caps()
5384 {
5385 dout(10) << "process_imported_caps" << dendl;
5386
5387 if (!open_file_table.is_prefetched() &&
5388 open_file_table.prefetch_inodes()) {
5389 open_file_table.wait_for_prefetch(
5390 new MDSInternalContextWrapper(mds,
5391 new LambdaContext([this](int r) {
5392 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5393 process_imported_caps();
5394 })
5395 )
5396 );
5397 return true;
5398 }
5399
5400 for (auto& p : cap_imports) {
5401 CInode *in = get_inode(p.first);
5402 if (in) {
5403 ceph_assert(in->is_auth());
5404 cap_imports_missing.erase(p.first);
5405 continue;
5406 }
5407 if (cap_imports_missing.count(p.first) > 0)
5408 continue;
5409
5410 uint64_t parent_ino = 0;
5411 std::string_view d_name;
5412 for (auto& q : p.second) {
5413 for (auto& r : q.second) {
5414 auto &icr = r.second;
5415 if (icr.capinfo.pathbase &&
5416 icr.path.length() > 0 &&
5417 icr.path.find('/') == string::npos) {
5418 parent_ino = icr.capinfo.pathbase;
5419 d_name = icr.path;
5420 break;
5421 }
5422 }
5423 if (parent_ino)
5424 break;
5425 }
5426
5427 dout(10) << " opening missing ino " << p.first << dendl;
5428 cap_imports_num_opening++;
5429 auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
5430 if (parent_ino) {
5431 vector<inode_backpointer_t> ancestors;
5432 ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
5433 open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
5434 } else {
5435 open_ino(p.first, (int64_t)-1, fin, false);
5436 }
5437 if (!(cap_imports_num_opening % 1000))
5438 mds->heartbeat_reset();
5439 }
5440
5441 if (cap_imports_num_opening > 0)
5442 return true;
5443
5444 // called by rejoin_gather_finish() ?
5445 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5446 if (!rejoin_client_map.empty() &&
5447 rejoin_session_map.empty()) {
5448 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5449 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5450 rejoin_client_metadata_map,
5451 finish->session_map);
5452 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5453 std::move(rejoin_client_metadata_map));
5454 mds->mdlog->start_submit_entry(le, finish);
5455 mds->mdlog->flush();
5456 rejoin_client_map.clear();
5457 rejoin_client_metadata_map.clear();
5458 return true;
5459 }
5460
5461 // process caps that were exported by peer rename
5462 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
5463 p != rejoin_peer_exports.end();
5464 ++p) {
5465 CInode *in = get_inode(p->first);
5466 ceph_assert(in);
5467 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5468 q != p->second.second.end();
5469 ++q) {
5470 auto r = rejoin_session_map.find(q->first);
5471 if (r == rejoin_session_map.end())
5472 continue;
5473
5474 Session *session = r->second.first;
5475 Capability *cap = in->get_client_cap(q->first);
5476 if (!cap) {
5477 cap = in->add_client_cap(q->first, session);
5478 // add empty item to reconnected_caps
5479 (void)reconnected_caps[p->first][q->first];
5480 }
5481 cap->merge(q->second, true);
5482
5483 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5484 ceph_assert(cap->get_last_seq() == im.issue_seq);
5485 ceph_assert(cap->get_mseq() == im.mseq);
5486 cap->set_cap_id(im.cap_id);
5487 // send cap import because we assigned a new cap ID
5488 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5489 p->second.first, CEPH_CAP_FLAG_AUTH);
5490 }
5491 }
5492 rejoin_peer_exports.clear();
5493 rejoin_imported_caps.clear();
5494
5495 // process cap imports
5496 // ino -> client -> frommds -> capex
5497 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5498 CInode *in = get_inode(p->first);
5499 if (!in) {
5500 dout(10) << " still missing ino " << p->first
5501 << ", will try again after replayed client requests" << dendl;
5502 ++p;
5503 continue;
5504 }
5505 ceph_assert(in->is_auth());
5506 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5507 Session *session;
5508 {
5509 auto r = rejoin_session_map.find(q->first);
5510 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5511 }
5512
5513 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5514 if (!session) {
5515 if (r->first >= 0)
5516 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5517 continue;
5518 }
5519
5520 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5521 add_reconnected_cap(q->first, in->ino(), r->second);
5522 if (r->first >= 0) {
5523 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5524 cap->inc_mseq();
5525 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5526
5527 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5528 im.cap_id = cap->get_cap_id();
5529 im.issue_seq = cap->get_last_seq();
5530 im.mseq = cap->get_mseq();
5531 }
5532 }
5533 }
5534 cap_imports.erase(p++); // remove and move on
5535 }
5536 } else {
5537 trim_non_auth();
5538
5539 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5540 rejoin_gather.erase(mds->get_nodeid());
5541 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
5542 maybe_send_pending_rejoins();
5543 }
5544 return false;
5545 }
5546
5547 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5548 client_t client, snapid_t snap_follows)
5549 {
5550 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5551
5552 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5553 return;
5554
5555 const set<snapid_t>& snaps = realm->get_snaps();
5556 snapid_t follows = snap_follows;
5557
5558 while (true) {
5559 CInode *in = pick_inode_snap(head_in, follows);
5560 if (in == head_in)
5561 break;
5562
5563 bool need_snapflush = false;
5564 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5565 p != snaps.end() && *p <= in->last;
5566 ++p) {
5567 head_in->add_need_snapflush(in, *p, client);
5568 need_snapflush = true;
5569 }
5570 follows = in->last;
5571 if (!need_snapflush)
5572 continue;
5573
5574 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5575
5576 if (in->client_snap_caps.empty()) {
5577 for (int i = 0; i < num_cinode_locks; i++) {
5578 int lockid = cinode_lock_info[i].lock;
5579 SimpleLock *lock = in->get_lock(lockid);
5580 ceph_assert(lock);
5581 in->auth_pin(lock);
5582 lock->set_state(LOCK_SNAP_SYNC);
5583 lock->get_wrlock(true);
5584 }
5585 }
5586 in->client_snap_caps.insert(client);
5587 mds->locker->mark_need_snapflush_inode(in);
5588 }
5589 }
5590
5591 /*
5592 * choose lock states based on reconnected caps
5593 */
5594 void MDCache::choose_lock_states_and_reconnect_caps()
5595 {
5596 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5597
5598 int count = 0;
5599 for (auto p : inode_map) {
5600 CInode *in = p.second;
5601 if (in->last != CEPH_NOSNAP)
5602 continue;
5603
5604 if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat())
5605 in->mark_dirty_rstat();
5606
5607 int dirty_caps = 0;
5608 auto q = reconnected_caps.find(in->ino());
5609 if (q != reconnected_caps.end()) {
5610 for (const auto &it : q->second)
5611 dirty_caps |= it.second.dirty_caps;
5612 }
5613 in->choose_lock_states(dirty_caps);
5614 dout(15) << " chose lock states on " << *in << dendl;
5615
5616 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5617 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5618 rejoin_pending_snaprealms.insert(in);
5619 }
5620
5621 if (!(++count % 1000))
5622 mds->heartbeat_reset();
5623 }
5624 }
5625
5626 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5627 map<client_t,ref_t<MClientSnap>>& splits)
5628 {
5629 ref_t<MClientSnap> snap;
5630 auto it = splits.find(client);
5631 if (it != splits.end()) {
5632 snap = it->second;
5633 snap->head.op = CEPH_SNAP_OP_SPLIT;
5634 } else {
5635 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5636 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5637 snap->head.split = realm->inode->ino();
5638 snap->bl = realm->get_snap_trace();
5639
5640 for (const auto& child : realm->open_children)
5641 snap->split_realms.push_back(child->inode->ino());
5642 }
5643 snap->split_inos.push_back(ino);
5644 }
5645
5646 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5647 map<client_t,ref_t<MClientSnap>>& splits)
5648 {
5649 ceph_assert(parent_realm);
5650
5651 vector<inodeno_t> split_inos;
5652 vector<inodeno_t> split_realms;
5653
5654 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p)
5655 split_inos.push_back((*p)->ino());
5656 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5657 p != realm->open_children.end();
5658 ++p)
5659 split_realms.push_back((*p)->inode->ino());
5660
5661 for (const auto& p : realm->client_caps) {
5662 ceph_assert(!p.second->empty());
5663 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5664 if (em.second) {
5665 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5666 update->head.split = parent_realm->inode->ino();
5667 update->split_inos = split_inos;
5668 update->split_realms = split_realms;
5669 update->bl = parent_realm->get_snap_trace();
5670 em.first->second = std::move(update);
5671 }
5672 }
5673 }
5674
5675 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5676 {
5677 dout(10) << "send_snaps" << dendl;
5678
5679 for (auto &p : splits) {
5680 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5681 if (session) {
5682 dout(10) << " client." << p.first
5683 << " split " << p.second->head.split
5684 << " inos " << p.second->split_inos
5685 << dendl;
5686 mds->send_message_client_counted(p.second, session);
5687 } else {
5688 dout(10) << " no session for client." << p.first << dendl;
5689 }
5690 }
5691 splits.clear();
5692 }
5693
5694
5695 /*
5696 * remove any items from logsegment open_file lists that don't have
5697 * any caps
5698 */
5699 void MDCache::clean_open_file_lists()
5700 {
5701 dout(10) << "clean_open_file_lists" << dendl;
5702
5703 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5704 p != mds->mdlog->segments.end();
5705 ++p) {
5706 LogSegment *ls = p->second;
5707
5708 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5709 while (!q.end()) {
5710 CInode *in = *q;
5711 ++q;
5712 if (in->last == CEPH_NOSNAP) {
5713 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5714 in->item_open_file.remove_myself();
5715 } else {
5716 if (in->client_snap_caps.empty()) {
5717 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5718 in->item_open_file.remove_myself();
5719 }
5720 }
5721 }
5722 }
5723 }
5724
5725 void MDCache::dump_openfiles(Formatter *f)
5726 {
5727 f->open_array_section("openfiles");
5728 for (auto p = mds->mdlog->segments.begin();
5729 p != mds->mdlog->segments.end();
5730 ++p) {
5731 LogSegment *ls = p->second;
5732
5733 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5734 while (!q.end()) {
5735 CInode *in = *q;
5736 ++q;
5737 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5738 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5739 continue;
5740 f->open_object_section("file");
5741 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5742 f->close_section();
5743 }
5744 }
5745 f->close_section();
5746 }
5747
5748 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5749 {
5750 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5751 << " on " << *in << dendl;
5752 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5753 if (!session) {
5754 dout(10) << " no session for client." << client << dendl;
5755 return NULL;
5756 }
5757
5758 Capability *cap = in->reconnect_cap(client, icr, session);
5759
5760 if (frommds >= 0) {
5761 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5762 cap->inc_mseq();
5763 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5764 }
5765
5766 return cap;
5767 }
5768
5769 void MDCache::export_remaining_imported_caps()
5770 {
5771 dout(10) << "export_remaining_imported_caps" << dendl;
5772
5773 CachedStackStringStream css;
5774
5775 int count = 0;
5776 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5777 *css << " ino " << p->first << "\n";
5778 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5779 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5780 if (session) {
5781 // mark client caps stale.
5782 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5783 0, 0, 0,
5784 mds->get_osd_epoch_barrier());
5785 stale->set_cap_peer(0, 0, 0, -1, 0);
5786 mds->send_message_client_counted(stale, q->first);
5787 }
5788 }
5789
5790 if (!(++count % 1000))
5791 mds->heartbeat_reset();
5792 }
5793
5794 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5795 p != cap_reconnect_waiters.end();
5796 ++p)
5797 mds->queue_waiters(p->second);
5798
5799 cap_imports.clear();
5800 cap_reconnect_waiters.clear();
5801
5802 if (css->strv().length()) {
5803 mds->clog->warn() << "failed to reconnect caps for missing inodes:"
5804 << css->strv();
5805 }
5806 }
5807
5808 Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
5809 {
5810 client_t client = session->info.get_client();
5811 Capability *cap = nullptr;
5812 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5813 if (rc) {
5814 cap = in->reconnect_cap(client, *rc, session);
5815 dout(10) << "try_reconnect_cap client." << client
5816 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5817 << " issue " << ccap_string(rc->capinfo.issued)
5818 << " on " << *in << dendl;
5819 remove_replay_cap_reconnect(in->ino(), client);
5820
5821 if (in->is_replicated()) {
5822 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5823 } else {
5824 int dirty_caps = 0;
5825 auto p = reconnected_caps.find(in->ino());
5826 if (p != reconnected_caps.end()) {
5827 auto q = p->second.find(client);
5828 if (q != p->second.end())
5829 dirty_caps = q->second.dirty_caps;
5830 }
5831 in->choose_lock_states(dirty_caps);
5832 dout(15) << " chose lock states on " << *in << dendl;
5833 }
5834
5835 map<inodeno_t, MDSContext::vec >::iterator it =
5836 cap_reconnect_waiters.find(in->ino());
5837 if (it != cap_reconnect_waiters.end()) {
5838 mds->queue_waiters(it->second);
5839 cap_reconnect_waiters.erase(it);
5840 }
5841 }
5842 return cap;
5843 }
5844
5845
5846
5847 // -------
5848 // cap imports and delayed snap parent opens
5849
5850 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5851 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5852 int peer, int p_flags)
5853 {
5854 SnapRealm *realm = in->find_snaprealm();
5855 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5856 if (cap->get_last_seq() == 0) // reconnected cap
5857 cap->inc_last_seq();
5858 cap->set_last_issue();
5859 cap->set_last_issue_stamp(ceph_clock_now());
5860 cap->clear_new();
5861 auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
5862 in->ino(), realm->inode->ino(), cap->get_cap_id(),
5863 cap->get_last_seq(), cap->pending(), cap->wanted(),
5864 0, cap->get_mseq(), mds->get_osd_epoch_barrier());
5865 in->encode_cap_message(reap, cap);
5866 reap->snapbl = realm->get_snap_trace();
5867 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5868 mds->send_message_client_counted(reap, session);
5869 }
5870
/*
 * Logs the call and asserts that delayed_imported_caps is empty; no imports
 * are actually performed here.
 */
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  ceph_assert(delayed_imported_caps.empty());
}
5877
// Completion context that calls MDCache::open_snaprealms() when finished.
// The result code passed to finish() is ignored.
struct C_MDC_OpenSnapRealms : public MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};
5884
5885 void MDCache::open_snaprealms()
5886 {
5887 dout(10) << "open_snaprealms" << dendl;
5888
5889 auto it = rejoin_pending_snaprealms.begin();
5890 while (it != rejoin_pending_snaprealms.end()) {
5891 CInode *in = *it;
5892 SnapRealm *realm = in->snaprealm;
5893 ceph_assert(realm);
5894
5895 map<client_t,ref_t<MClientSnap>> splits;
5896 // finish off client snaprealm reconnects?
5897 auto q = reconnected_snaprealms.find(in->ino());
5898 if (q != reconnected_snaprealms.end()) {
5899 for (const auto& r : q->second)
5900 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5901 reconnected_snaprealms.erase(q);
5902 }
5903
5904 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
5905 CInode *child = *p;
5906 auto q = reconnected_caps.find(child->ino());
5907 ceph_assert(q != reconnected_caps.end());
5908 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5909 Capability *cap = child->get_client_cap(r->first);
5910 if (!cap)
5911 continue;
5912 if (r->second.snap_follows > 0) {
5913 if (r->second.snap_follows < child->first - 1) {
5914 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5915 } else if (r->second.snapflush) {
5916 // When processing a cap flush message that is re-sent, it's possble
5917 // that the sender has already released all WR caps. So we should
5918 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5919 cap->mark_needsnapflush();
5920 }
5921 }
5922 // make sure client's cap is in the correct snaprealm.
5923 if (r->second.realm_ino != in->ino()) {
5924 prepare_realm_split(realm, r->first, child->ino(), splits);
5925 }
5926 }
5927 }
5928
5929 rejoin_pending_snaprealms.erase(it++);
5930 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5931
5932 send_snaps(splits);
5933 }
5934
5935 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5936
5937 if (!reconnected_snaprealms.empty()) {
5938 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5939 for (auto& p : reconnected_snaprealms) {
5940 CachedStackStringStream css;
5941 *css << " " << p.first << " {";
5942 bool first = true;
5943 for (auto& q : p.second) {
5944 if (!first)
5945 *css << ", ";
5946 *css << "client." << q.first << "/" << q.second;
5947 }
5948 *css << "}";
5949 dout(5) << css->strv() << dendl;
5950 }
5951 }
5952 ceph_assert(rejoin_waiters.empty());
5953 ceph_assert(rejoin_pending_snaprealms.empty());
5954 dout(10) << "open_snaprealms - all open" << dendl;
5955 do_delayed_cap_imports();
5956
5957 ceph_assert(rejoin_done);
5958 rejoin_done.release()->complete(0);
5959 reconnected_caps.clear();
5960 }
5961
5962 bool MDCache::open_undef_inodes_dirfrags()
5963 {
5964 dout(10) << "open_undef_inodes_dirfrags "
5965 << rejoin_undef_inodes.size() << " inodes "
5966 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5967
5968 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5969
5970 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5971 p != rejoin_undef_inodes.end();
5972 ++p) {
5973 CInode *in = *p;
5974 ceph_assert(!in->is_base());
5975 ceph_assert(in->get_parent_dir());
5976 fetch_queue.insert(in->get_parent_dir());
5977 }
5978
5979 if (fetch_queue.empty())
5980 return false;
5981
5982 MDSGatherBuilder gather(g_ceph_context,
5983 new MDSInternalContextWrapper(mds,
5984 new LambdaContext([this](int r) {
5985 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid()))
5986 rejoin_gather_finish();
5987 })
5988 )
5989 );
5990
5991 for (set<CDir*>::iterator p = fetch_queue.begin();
5992 p != fetch_queue.end();
5993 ++p) {
5994 CDir *dir = *p;
5995 CInode *diri = dir->get_inode();
5996 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5997 continue;
5998 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5999 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
6000 dir->fetch(gather.new_sub());
6001 }
6002 ceph_assert(gather.has_subs());
6003 gather.activate();
6004 return true;
6005 }
6006
6007 void MDCache::opened_undef_inode(CInode *in) {
6008 dout(10) << "opened_undef_inode " << *in << dendl;
6009 rejoin_undef_inodes.erase(in);
6010 if (in->is_dir()) {
6011 // FIXME: re-hash dentries if necessary
6012 ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
6013 if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
6014 CDir *dir = in->get_dirfrag(frag_t());
6015 ceph_assert(dir);
6016 rejoin_undef_dirfrags.erase(dir);
6017 in->force_dirfrags();
6018 auto&& ls = in->get_dirfrags();
6019 for (const auto& dir : ls) {
6020 rejoin_undef_dirfrags.insert(dir);
6021 }
6022 }
6023 }
6024 }
6025
6026 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6027 map<client_t,ref_t<MClientSnap>>& updates)
6028 {
6029 if (seq < realm->get_newest_seq()) {
6030 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
6031 << realm->get_newest_seq() << " on " << *realm << dendl;
6032 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
6033 snap->bl = realm->get_snap_trace();
6034 for (const auto& child : realm->open_children)
6035 snap->split_realms.push_back(child->inode->ino());
6036 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
6037 } else {
6038 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6039 << " on " << *realm << dendl;
6040 }
6041 }
6042
6043
6044
/*
 * Build and send one MMDSCacheRejoin OP_ACK message to every rank in
 * recovery_set that is not already in rejoin_ack_sent.
 *
 * Steps:
 *  1. For each unlinked inode recorded per rank, add that rank as a replica
 *     of the inode's ancestors (dentry, dirfrag, inode) as needed.
 *  2. Create the ack messages and mark the whole recovery set as acked.
 *  3. Walk every auth subtree breadth-first and add the dirfrags, dentries
 *     and primary inodes (with lock state) replicated by an acked rank.
 *  4. Add the base inodes (root if auth, myin) and the inodes listed in
 *     rejoin_potential_updated_scatterlocks.
 *  5. Encode the per-rank imported caps into each message and send it.
 */
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
        continue;
      // Walk upwards (dentry -> dirfrag -> inode), adding rank p->first as a
      // replica at each level; stop at the first object that already lists
      // the rank as a replica, or after adding it to a base inode.
      while (1) {
        CDentry *dn = in->get_parent_dn();
        if (dn->is_replica(p->first))
          break;
        dn->add_replica(p->first);
        CDir *dir = dn->get_dir();
        if (dir->is_replica(p->first))
          break;
        dir->add_replica(p->first);
        in = dir->get_inode();
        if (in->is_replica(p->first))
          break;
        in->add_replica(p->first);
        if (in->is_base())
          break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    // ranks that were already acked get no message this time
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree
    std::queue<CDir*> dq;
    dq.push(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop();

      // dir
      // Note: ++r.second increments the value stored for that replica and
      // passes the incremented value on (presumably the replica nonce).
      for (auto &r : dir->get_replicas()) {
        auto it = acks.find(r.first);
        if (it == acks.end())
          continue;
        it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
        it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        CDentry::linkage_t *dnl = dn->get_linkage();

        // inode
        CInode *in = NULL;
        if (dnl->is_primary())
          in = dnl->get_inode();

        // dentry
        for (auto &r : dn->get_replicas()) {
          auto it = acks.find(r.first);
          if (it == acks.end())
            continue;
          it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
                                        dn->first, dn->last,
                                        dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
                                        dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
                                        dnl->is_remote() ? dnl->get_remote_d_type():0,
                                        ++r.second,
                                        dn->lock.get_replica_state());
          // peer missed MDentrylink message ?
          if (in && !in->is_replica(r.first))
            in->add_replica(r.first);
        }

        // only primary-linked inodes are processed further
        if (!in)
          continue;

        for (auto &r : in->get_replicas()) {
          auto it = acks.find(r.first);
          if (it == acks.end())
            continue;
          it->second->add_inode_base(in, mds->mdsmap->get_up_features());
          bufferlist bl;
          in->_encode_locks_state_for_rejoin(bl, r.first);
          it->second->add_inode_locks(in, ++r.second, bl);
        }

        // subdirs in this subtree?
        {
          auto&& dirs = in->get_nested_dirfrags();
          for (const auto& dir : dirs) {
            dq.push(dir);
          }
        }
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
        continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
        continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
        continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    // attach the caps imported on behalf of this rank before sending
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
6212
// Deferred cap re-issue for a single inode.
// The constructor takes a PIN_PTRWAITER reference on the inode so the
// pointer stays usable until finish() runs; finish() drops it again.
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    // issue caps explicitly only when eval() returned false
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};
6227
6228 void MDCache::reissue_all_caps()
6229 {
6230 dout(10) << "reissue_all_caps" << dendl;
6231
6232 int count = 0;
6233 for (auto &p : inode_map) {
6234 int n = 1;
6235 CInode *in = p.second;
6236 if (in->is_head() && in->is_any_caps()) {
6237 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6238 if (in->is_frozen_inode()) {
6239 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6240 continue;
6241 }
6242 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6243 n += mds->locker->issue_caps(in);
6244 }
6245
6246 if ((count % 1000) + n >= 1000)
6247 mds->heartbeat_reset();
6248 count += n;
6249 }
6250 }
6251
6252
6253 // ===============================================================================
6254
// Completion context that forwards to MDCache::_queued_file_recover_cow()
// with the stored inode and mutation. The result code is ignored.
struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
6264
6265
/*
 * Enqueue an auth inode on the file recovery queue.
 *
 * The inode must be auth (asserted). The copy-on-write handling below is
 * commented out, so the only effect besides logging is
 * recovery_queue.enqueue(in).
 */
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  // NOTE: the block below is disabled code, kept for reference only.
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    auto pi = in->project_inode(mut);
    pi.inode.version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}
6309
/*
 * Completion step for C_MDC_QueuedCow: applies the mutation, drops its
 * locks and cleans it up. The inode argument is not used here.
 */
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
6316
6317
6318 /*
6319 * called after recovery to recover file sizes for previously opened (for write)
6320 * files. that is, those where max_size > size.
6321 */
6322 void MDCache::identify_files_to_recover()
6323 {
6324 dout(10) << "identify_files_to_recover" << dendl;
6325 int count = 0;
6326 for (auto &p : inode_map) {
6327 CInode *in = p.second;
6328 if (!in->is_auth())
6329 continue;
6330
6331 if (in->last != CEPH_NOSNAP)
6332 continue;
6333
6334 // Only normal files need file size recovery
6335 if (!in->is_file()) {
6336 continue;
6337 }
6338
6339 bool recover = false;
6340 const auto& client_ranges = in->get_projected_inode()->client_ranges;
6341 if (!client_ranges.empty()) {
6342 in->mark_clientwriteable();
6343 for (auto& p : client_ranges) {
6344 Capability *cap = in->get_client_cap(p.first);
6345 if (cap) {
6346 cap->mark_clientwriteable();
6347 } else {
6348 dout(10) << " client." << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
6349 recover = true;
6350 break;
6351 }
6352 }
6353 }
6354
6355 if (recover) {
6356 if (in->filelock.is_stable()) {
6357 in->auth_pin(&in->filelock);
6358 } else {
6359 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6360 }
6361 in->filelock.set_state(LOCK_PRE_SCAN);
6362 rejoin_recover_q.push_back(in);
6363 } else {
6364 rejoin_check_q.push_back(in);
6365 }
6366
6367 if (!(++count % 1000))
6368 mds->heartbeat_reset();
6369 }
6370 }
6371
6372 void MDCache::start_files_to_recover()
6373 {
6374 int count = 0;
6375 for (CInode *in : rejoin_check_q) {
6376 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6377 mds->locker->issue_caps(in);
6378 mds->locker->check_inode_max_size(in);
6379 if (!(++count % 1000))
6380 mds->heartbeat_reset();
6381 }
6382 rejoin_check_q.clear();
6383 for (CInode *in : rejoin_recover_q) {
6384 mds->locker->file_recover(&in->filelock);
6385 if (!(++count % 1000))
6386 mds->heartbeat_reset();
6387 }
6388 if (!rejoin_recover_q.empty()) {
6389 rejoin_recover_q.clear();
6390 do_file_recover();
6391 }
6392 }
6393
// Advance the file recovery queue (thin wrapper around recovery_queue.advance()).
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6398
6399 // ===============================================================================
6400
6401
6402 // ----------------------------
6403 // truncate
6404
// Context that calls MDCache::_truncate_inode() for the stored inode and log
// segment when finished. The result code is ignored.
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};
6415
6416 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6417 {
6418 const auto& pi = in->get_projected_inode();
6419 dout(10) << "truncate_inode "
6420 << pi->truncate_from << " -> " << pi->truncate_size
6421 << " on " << *in
6422 << dendl;
6423
6424 ls->truncating_inodes.insert(in);
6425 in->get(CInode::PIN_TRUNCATING);
6426 in->auth_pin(this);
6427
6428 if (!in->client_need_snapflush.empty() &&
6429 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6430 ceph_assert(in->filelock.is_xlocked());
6431 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6432 mds->locker->issue_caps(in);
6433 return;
6434 }
6435
6436 _truncate_inode(in, ls);
6437 }
6438
// I/O completion for a truncate: asserts the result is 0 or -CEPHFS_ENOENT
// and then calls MDCache::truncate_inode_finish() for the stored inode and
// log segment.
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  // Describes this context as "file_truncate(<ino>)".
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};
6453
6454 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6455 {
6456 const auto& pi = in->get_inode();
6457 dout(10) << "_truncate_inode "
6458 << pi->truncate_from << " -> " << pi->truncate_size
6459 << " on " << *in << dendl;
6460
6461 ceph_assert(pi->is_truncating());
6462 ceph_assert(pi->truncate_size < (1ULL << 63));
6463 ceph_assert(pi->truncate_from < (1ULL << 63));
6464 ceph_assert(pi->truncate_size < pi->truncate_from);
6465
6466
6467 SnapRealm *realm = in->find_snaprealm();
6468 SnapContext nullsnap;
6469 const SnapContext *snapc;
6470 if (realm) {
6471 dout(10) << " realm " << *realm << dendl;
6472 snapc = &realm->get_snap_context();
6473 } else {
6474 dout(10) << " NO realm, using null context" << dendl;
6475 snapc = &nullsnap;
6476 ceph_assert(in->last == CEPH_NOSNAP);
6477 }
6478 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6479 auto layout = pi->layout;
6480 filer.truncate(in->ino(), &layout, *snapc,
6481 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6482 pi->truncate_seq, ceph::real_time::min(), 0,
6483 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6484 mds->finisher));
6485 }
6486
// Log completion context that calls MDCache::truncate_inode_logged() with
// the stored inode and mutation. The result code is ignored.
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
6496
6497 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6498 {
6499 dout(10) << "truncate_inode_finish " << *in << dendl;
6500
6501 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6502 ceph_assert(p != ls->truncating_inodes.end());
6503 ls->truncating_inodes.erase(p);
6504
6505 MutationRef mut(new MutationImpl());
6506 mut->ls = mds->mdlog->get_current_segment();
6507
6508 // update
6509 auto pi = in->project_inode(mut);
6510 pi.inode->version = in->pre_dirty();
6511 pi.inode->truncate_from = 0;
6512 pi.inode->truncate_pending--;
6513
6514 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6515 mds->mdlog->start_entry(le);
6516
6517 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6518 journal_dirty_inode(mut.get(), &le->metablob, in);
6519 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6520 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6521
6522 // flush immediately if there are readers/writers waiting
6523 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6524 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6525 mds->mdlog->flush();
6526 }
6527
/*
 * Called once the "truncate finish" update has been logged.
 *
 * Applies the mutation, drops its locks and cleans it up, releases the
 * PIN_TRUNCATING pin and the auth pin on the inode, and queues all contexts
 * waiting on WAIT_TRUNC.
 */
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  MDSContext::vec waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
6542
6543
/*
 * Register an inode as truncating on a log segment: inserts it into
 * ls->truncating_inodes and takes a PIN_TRUNCATING reference.
 */
void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "add_recovered_truncate " << *in << " in log segment "
           << ls->seq << "/" << ls->offset << dendl;
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
}
6551
/*
 * Undo add_recovered_truncate(): remove the inode from
 * ls->truncating_inodes (asserting it is present) and drop the
 * PIN_TRUNCATING reference.
 */
void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
           << ls->seq << "/" << ls->offset << dendl;
  // if we have the logseg the truncate started in, it must be in our list.
  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);
  in->put(CInode::PIN_TRUNCATING);
}
6562
6563 void MDCache::start_recovered_truncates()
6564 {
6565 dout(10) << "start_recovered_truncates" << dendl;
6566 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6567 p != mds->mdlog->segments.end();
6568 ++p) {
6569 LogSegment *ls = p->second;
6570 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6571 q != ls->truncating_inodes.end();
6572 ++q) {
6573 CInode *in = *q;
6574 in->auth_pin(this);
6575
6576 if (!in->client_need_snapflush.empty() &&
6577 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6578 ceph_assert(in->filelock.is_stable());
6579 in->filelock.set_state(LOCK_XLOCKDONE);
6580 in->auth_pin(&in->filelock);
6581 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6582 // start_files_to_recover will revoke caps
6583 continue;
6584 }
6585 _truncate_inode(in, ls);
6586 }
6587 }
6588 }
6589
6590
// Log completion for a purge of a set of inode numbers.
// finish() requires r == 0. When inotablev is non-zero it applies the
// release of the inode numbers to the inotable and asserts the table version
// then equals inotablev. Finally it notifies the log segment via
// purge_inodes_finish().
class C_MDS_purge_completed_finish : public MDCacheLogContext {
  interval_set<inodeno_t> inos;
  LogSegment *ls;
  version_t inotablev;
public:
  C_MDS_purge_completed_finish(MDCache *m, const interval_set<inodeno_t>& _inos,
                               LogSegment *_ls, version_t iv)
    : MDCacheLogContext(m), inos(_inos), ls(_ls), inotablev(iv) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    if (inotablev) {
      get_mds()->inotable->apply_release_ids(inos);
      ceph_assert(get_mds()->inotable->get_version() == inotablev);
    }
    ls->purge_inodes_finish(inos);
  }
};
6608
6609 void MDCache::start_purge_inodes(){
6610 dout(10) << "start_purge_inodes" << dendl;
6611 for (auto& p : mds->mdlog->segments){
6612 LogSegment *ls = p.second;
6613 if (ls->purging_inodes.size()){
6614 purge_inodes(ls->purging_inodes, ls);
6615 }
6616 }
6617 }
6618
6619 void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6620 {
6621 dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl;
6622 // FIXME: handle non-default data pool and namespace
6623
6624 auto cb = new LambdaContext([this, inos, ls](int r){
6625 ceph_assert(r == 0 || r == -2);
6626 mds->inotable->project_release_ids(inos);
6627 version_t piv = mds->inotable->get_projected_version();
6628 ceph_assert(piv != 0);
6629 mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv),
6630 new C_MDS_purge_completed_finish(this, inos, ls, piv));
6631 mds->mdlog->flush();
6632 });
6633
6634 C_GatherBuilder gather(g_ceph_context,
6635 new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher));
6636 SnapContext nullsnapc;
6637 for (const auto& [start, len] : inos) {
6638 for (auto i = start; i < start + len ; i += 1) {
6639 filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1,
6640 ceph::real_clock::now(), 0, gather.new_sub());
6641 }
6642 }
6643 gather.activate();
6644 }
6645
6646 // ================================================================================
6647 // cache trimming
6648
6649 std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
6650 {
6651 bool is_standby_replay = mds->is_standby_replay();
6652 std::vector<CDentry *> unexpirables;
6653 uint64_t trimmed = 0;
6654
6655 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
6656
6657 dout(7) << "trim_lru trimming " << count
6658 << " items from LRU"
6659 << " size=" << lru.lru_get_size()
6660 << " mid=" << lru.lru_get_top()
6661 << " pintail=" << lru.lru_get_pintail()
6662 << " pinned=" << lru.lru_get_num_pinned()
6663 << dendl;
6664
6665 const uint64_t trim_counter_start = trim_counter.get();
6666 bool throttled = false;
6667 while (1) {
6668 throttled |= trim_counter_start+trimmed >= trim_threshold;
6669 if (throttled) break;
6670 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6671 if (!dn)
6672 break;
6673 if (trim_dentry(dn, expiremap)) {
6674 unexpirables.push_back(dn);
6675 } else {
6676 trimmed++;
6677 }
6678 }
6679
6680 for (auto &dn : unexpirables) {
6681 bottom_lru.lru_insert_mid(dn);
6682 }
6683 unexpirables.clear();
6684
6685 // trim dentries from the LRU until count is reached
6686 // if mds is in standby_replay and skip trimming the inodes
6687 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
6688 throttled |= trim_counter_start+trimmed >= trim_threshold;
6689 if (throttled) break;
6690 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6691 if (!dn) {
6692 break;
6693 }
6694 if (is_standby_replay && dn->get_linkage()->inode) {
6695 // we move the inodes that need to be trimmed to the end of the lru queue.
6696 // refer to MDCache::standby_trim_segment
6697 lru.lru_insert_bot(dn);
6698 break;
6699 } else if (trim_dentry(dn, expiremap)) {
6700 unexpirables.push_back(dn);
6701 } else {
6702 trimmed++;
6703 if (count > 0) count--;
6704 }
6705 }
6706 trim_counter.hit(trimmed);
6707
6708 for (auto &dn : unexpirables) {
6709 lru.lru_insert_mid(dn);
6710 }
6711 unexpirables.clear();
6712
6713 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6714 return std::pair<bool, uint64_t>(throttled, trimmed);
6715 }
6716
6717 /*
6718 * note: only called while MDS is active or stopping... NOT during recovery.
6719 * however, we may expire a replica whose authority is recovering.
6720 *
6721 * @param count is number of dentries to try to expire
6722 */
6723 std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
6724 {
6725 uint64_t used = cache_size();
6726 uint64_t limit = cache_memory_limit;
6727 expiremap expiremap;
6728
6729 dout(7) << "trim bytes_used=" << bytes2str(used)
6730 << " limit=" << bytes2str(limit)
6731 << " reservation=" << cache_reservation
6732 << "% count=" << count << dendl;
6733
6734 // process delayed eval_stray()
6735 stray_manager.advance_delayed();
6736
6737 auto result = trim_lru(count, expiremap);
6738 auto& trimmed = result.second;
6739
6740 // trim non-auth, non-bound subtrees
6741 for (auto p = subtrees.begin(); p != subtrees.end();) {
6742 CDir *dir = p->first;
6743 ++p;
6744 CInode *diri = dir->get_inode();
6745 if (dir->is_auth()) {
6746 if (diri->is_auth() && !diri->is_base()) {
6747 /* this situation should correspond to an export pin */
6748 if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
6749 /* pinned empty subtree, try to drop */
6750 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
6751 dout(20) << "trimming empty pinned subtree " << *dir << dendl;
6752 dir->state_clear(CDir::STATE_AUXSUBTREE);
6753 remove_subtree(dir);
6754 diri->close_dirfrag(dir->dirfrag().frag);
6755 }
6756 }
6757 } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
6758 if (dir->state_test(CDir::STATE_EXPORTING) ||
6759 !(mds->is_active() || mds->is_stopping()) ||
6760 dir->is_freezing() || dir->is_frozen())
6761 continue;
6762
6763 migrator->export_empty_import(dir);
6764 ++trimmed;
6765 }
6766 } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
6767 // only subtree pin
6768 if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
6769 continue;
6770 }
6771
6772 // don't trim subtree root if its auth MDS is recovering.
6773 // This simplify the cache rejoin code.
6774 if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
6775 continue;
6776 trim_dirfrag(dir, 0, expiremap);
6777 ++trimmed;
6778 }
6779 }
6780
6781 // trim root?
6782 if (mds->is_stopping() && root) {
6783 auto&& ls = root->get_dirfrags();
6784 for (const auto& dir : ls) {
6785 if (dir->get_num_ref() == 1) { // subtree pin
6786 trim_dirfrag(dir, 0, expiremap);
6787 ++trimmed;
6788 }
6789 }
6790 if (root->get_num_ref() == 0) {
6791 trim_inode(0, root, 0, expiremap);
6792 ++trimmed;
6793 }
6794 }
6795
6796 std::set<mds_rank_t> stopping;
6797 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6798 stopping.erase(mds->get_nodeid());
6799 for (auto rank : stopping) {
6800 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6801 if (!mdsdir_in)
6802 continue;
6803
6804 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6805 if (em.second) {
6806 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6807 }
6808
6809 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds->get_nodeid() << dendl;
6810
6811 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6812 if (!aborted) {
6813 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6814 auto&& ls = mdsdir_in->get_dirfrags();
6815 for (auto dir : ls) {
6816 if (dir->get_num_ref() == 1) { // subtree pin
6817 trim_dirfrag(dir, dir, expiremap);
6818 ++trimmed;
6819 }
6820 }
6821 if (mdsdir_in->get_num_ref() == 0) {
6822 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6823 ++trimmed;
6824 }
6825 } else {
6826 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6827 }
6828 }
6829
6830 // Other rank's base inodes (when I'm stopping)
6831 if (mds->is_stopping()) {
6832 for (set<CInode*>::iterator p = base_inodes.begin();
6833 p != base_inodes.end();) {
6834 CInode *base_in = *p;
6835 ++p;
6836 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6837 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6838 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6839 if (base_in->get_num_ref() == 0) {
6840 trim_inode(NULL, base_in, NULL, expiremap);
6841 ++trimmed;
6842 }
6843 }
6844 }
6845 }
6846
6847 // send any expire messages
6848 send_expire_messages(expiremap);
6849
6850 return result;
6851 }
6852
6853 void MDCache::send_expire_messages(expiremap& expiremap)
6854 {
6855 // send expires
6856 for (const auto &p : expiremap) {
6857 if (mds->is_cluster_degraded() &&
6858 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6859 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6860 rejoin_sent.count(p.first) == 0))) {
6861 continue;
6862 }
6863 dout(7) << "sending cache_expire to " << p.first << dendl;
6864 mds->send_message_mds(p.second, p.first);
6865 }
6866 expiremap.clear();
6867 }
6868
6869
6870 bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
6871 {
6872 dout(12) << "trim_dentry " << *dn << dendl;
6873
6874 CDentry::linkage_t *dnl = dn->get_linkage();
6875
6876 CDir *dir = dn->get_dir();
6877 ceph_assert(dir);
6878
6879 CDir *con = get_subtree_root(dir);
6880 if (con)
6881 dout(12) << " in container " << *con << dendl;
6882 else {
6883 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6884 ceph_assert(dn->is_auth());
6885 }
6886
6887 // If replica dentry is not readable, it's likely we will receive
6888 // MDentryLink/MDentryUnlink message soon (It's possible we first
6889 // receive a MDentryUnlink message, then MDentryLink message)
6890 // MDentryLink message only replicates an inode, so we should
6891 // avoid trimming the inode's parent dentry. This is because that
6892 // unconnected replicas are problematic for subtree migration.
6893 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6894 !dn->get_dir()->get_inode()->is_stray())
6895 return true;
6896
6897 // adjust the dir state
6898 // NOTE: we can safely remove a clean, null dentry without effecting
6899 // directory completeness.
6900 // (check this _before_ we unlink the inode, below!)
6901 bool clear_complete = false;
6902 if (!(dnl->is_null() && dn->is_clean()))
6903 clear_complete = true;
6904
6905 // unlink the dentry
6906 if (dnl->is_remote()) {
6907 // just unlink.
6908 dir->unlink_inode(dn, false);
6909 } else if (dnl->is_primary()) {
6910 // expire the inode, too.
6911 CInode *in = dnl->get_inode();
6912 ceph_assert(in);
6913 if (trim_inode(dn, in, con, expiremap))
6914 return true; // purging stray instead of trimming
6915 } else {
6916 ceph_assert(dnl->is_null());
6917 }
6918
6919 if (!dn->is_auth()) {
6920 // notify dentry authority.
6921 mds_authority_t auth = dn->authority();
6922
6923 for (int p=0; p<2; p++) {
6924 mds_rank_t a = auth.first;
6925 if (p) a = auth.second;
6926 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6927 if (mds->get_nodeid() == auth.second &&
6928 con->is_importing()) break; // don't send any expire while importing.
6929 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6930
6931 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6932 ceph_assert(a != mds->get_nodeid());
6933 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6934 if (em.second)
6935 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6936 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
6937 }
6938 }
6939
6940 // remove dentry
6941 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6942 dir->add_to_bloom(dn);
6943 dir->remove_dentry(dn);
6944
6945 if (clear_complete)
6946 dir->state_clear(CDir::STATE_COMPLETE);
6947
6948 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6949 return false;
6950 }
6951
6952
6953 void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
6954 {
6955 dout(15) << "trim_dirfrag " << *dir << dendl;
6956
6957 if (dir->is_subtree_root()) {
6958 ceph_assert(!dir->is_auth() ||
6959 (!dir->is_replicated() && dir->inode->is_base()));
6960 remove_subtree(dir); // remove from subtree map
6961 }
6962 ceph_assert(dir->get_num_ref() == 0);
6963
6964 CInode *in = dir->get_inode();
6965
6966 if (!dir->is_auth()) {
6967 mds_authority_t auth = dir->authority();
6968
6969 // was this an auth delegation? (if so, slightly modified container)
6970 dirfrag_t condf;
6971 if (dir->is_subtree_root()) {
6972 dout(12) << " subtree root, container is " << *dir << dendl;
6973 con = dir;
6974 condf = dir->dirfrag();
6975 } else {
6976 condf = con->dirfrag();
6977 }
6978
6979 for (int p=0; p<2; p++) {
6980 mds_rank_t a = auth.first;
6981 if (p) a = auth.second;
6982 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6983 if (mds->get_nodeid() == auth.second &&
6984 con->is_importing()) break; // don't send any expire while importing.
6985 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6986
6987 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6988 ceph_assert(a != mds->get_nodeid());
6989 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6990 if (em.second)
6991 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
6992 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6993 }
6994 }
6995
6996 in->close_dirfrag(dir->dirfrag().frag);
6997 }
6998
6999 /**
7000 * Try trimming an inode from the cache
7001 *
7002 * @return true if the inode is still in cache, else false if it was trimmed
7003 */
7004 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
7005 {
7006 dout(15) << "trim_inode " << *in << dendl;
7007 ceph_assert(in->get_num_ref() == 0);
7008
7009 if (in->is_dir()) {
7010 // If replica inode's dirfragtreelock is not readable, it's likely
7011 // some dirfrags of the inode are being fragmented and we will receive
7012 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7013 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
7014 // This is because that unconnected replicas are problematic for
7015 // subtree migration.
7016 //
7017 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
7018 return true;
7019 }
7020
7021 // DIR
7022 auto&& dfls = in->get_dirfrags();
7023 for (const auto& dir : dfls) {
7024 ceph_assert(!dir->is_subtree_root());
7025 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
7026 }
7027 }
7028
7029 // INODE
7030 if (in->is_auth()) {
7031 // eval stray after closing dirfrags
7032 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
7033 maybe_eval_stray(in);
7034 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
7035 return true;
7036 }
7037 } else {
7038 mds_authority_t auth = in->authority();
7039
7040 dirfrag_t df;
7041 if (con)
7042 df = con->dirfrag();
7043 else
7044 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
7045
7046 for (int p=0; p<2; p++) {
7047 mds_rank_t a = auth.first;
7048 if (p) a = auth.second;
7049 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7050 if (con && mds->get_nodeid() == auth.second &&
7051 con->is_importing()) break; // don't send any expire while importing.
7052 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7053
7054 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
7055 ceph_assert(a != mds->get_nodeid());
7056 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7057 if (em.second)
7058 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
7059 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7060 }
7061 }
7062
7063 /*
7064 if (in->is_auth()) {
7065 if (in->hack_accessed)
7066 mds->logger->inc("outt");
7067 else {
7068 mds->logger->inc("outut");
7069 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7070 }
7071 }
7072 */
7073
7074 // unlink
7075 if (dn)
7076 dn->get_dir()->unlink_inode(dn, false);
7077 remove_inode(in);
7078 return false;
7079 }
7080
7081
7082 /**
7083 * trim_non_auth - remove any non-auth items from our cache
7084 *
7085 * this reduces the amount of non-auth metadata in our cache, reducing the
7086 * load incurred by the rejoin phase.
7087 *
7088 * the only non-auth items that remain are those that are needed to
7089 * attach our own subtrees to the root.
7090 *
7091 * when we are done, all dentries will be in the top bit of the lru.
7092 *
7093 * why we have to do this:
7094 * we may not have accurate linkage for non-auth items. which means we will
7095 * know which subtree it falls into, and can not be sure to declare it to the
7096 * correct authority.
7097 */
7098 void MDCache::trim_non_auth()
7099 {
7100 dout(7) << "trim_non_auth" << dendl;
7101
7102 // temporarily pin all subtree roots
7103 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7104 p != subtrees.end();
7105 ++p)
7106 p->first->get(CDir::PIN_SUBTREETEMP);
7107
7108 list<CDentry*> auth_list;
7109
7110 // trim non-auth items from the lru
7111 for (;;) {
7112 CDentry *dn = NULL;
7113 if (bottom_lru.lru_get_size() > 0)
7114 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7115 if (!dn && lru.lru_get_size() > 0)
7116 dn = static_cast<CDentry*>(lru.lru_expire());
7117 if (!dn)
7118 break;
7119
7120 CDentry::linkage_t *dnl = dn->get_linkage();
7121
7122 if (dn->is_auth()) {
7123 // add back into lru (at the top)
7124 auth_list.push_back(dn);
7125
7126 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7127 dn->unlink_remote(dnl);
7128 } else {
7129 // non-auth. expire.
7130 CDir *dir = dn->get_dir();
7131 ceph_assert(dir);
7132
7133 // unlink the dentry
7134 dout(10) << " removing " << *dn << dendl;
7135 if (dnl->is_remote()) {
7136 dir->unlink_inode(dn, false);
7137 }
7138 else if (dnl->is_primary()) {
7139 CInode *in = dnl->get_inode();
7140 dout(10) << " removing " << *in << dendl;
7141 auto&& ls = in->get_dirfrags();
7142 for (const auto& subdir : ls) {
7143 ceph_assert(!subdir->is_subtree_root());
7144 in->close_dirfrag(subdir->dirfrag().frag);
7145 }
7146 dir->unlink_inode(dn, false);
7147 remove_inode(in);
7148 }
7149 else {
7150 ceph_assert(dnl->is_null());
7151 }
7152
7153 ceph_assert(!dir->has_bloom());
7154 dir->remove_dentry(dn);
7155 // adjust the dir state
7156 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7157 // close empty non-auth dirfrag
7158 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7159 dir->inode->close_dirfrag(dir->get_frag());
7160 }
7161 }
7162
7163 for (const auto& dn : auth_list) {
7164 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7165 bottom_lru.lru_insert_mid(dn);
7166 else
7167 lru.lru_insert_top(dn);
7168 }
7169
7170 // move everything in the pintail to the top bit of the lru.
7171 lru.lru_touch_entire_pintail();
7172
7173 // unpin all subtrees
7174 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7175 p != subtrees.end();
7176 ++p)
7177 p->first->put(CDir::PIN_SUBTREETEMP);
7178
7179 if (lru.lru_get_size() == 0 &&
7180 bottom_lru.lru_get_size() == 0) {
7181 // root, stray, etc.?
7182 auto p = inode_map.begin();
7183 while (p != inode_map.end()) {
7184 CInode *in = p->second;
7185 ++p;
7186 if (!in->is_auth()) {
7187 auto&& ls = in->get_dirfrags();
7188 for (const auto& dir : ls) {
7189 dout(10) << " removing " << *dir << dendl;
7190 ceph_assert(dir->get_num_ref() == 1); // SUBTREE
7191 remove_subtree(dir);
7192 in->close_dirfrag(dir->dirfrag().frag);
7193 }
7194 dout(10) << " removing " << *in << dendl;
7195 ceph_assert(!in->get_parent_dn());
7196 ceph_assert(in->get_num_ref() == 0);
7197 remove_inode(in);
7198 }
7199 }
7200 }
7201
7202 show_subtrees();
7203 }
7204
7205 /**
7206 * Recursively trim the subtree rooted at directory to remove all
7207 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7208 * of those links. This is used to clear invalid data out of the cache.
7209 * Note that it doesn't clear the passed-in directory, since that's not
7210 * always safe.
7211 */
7212 bool MDCache::trim_non_auth_subtree(CDir *dir)
7213 {
7214 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7215
7216 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7217
7218 auto j = dir->begin();
7219 auto i = j;
7220 while (j != dir->end()) {
7221 i = j++;
7222 CDentry *dn = i->second;
7223 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7224 CDentry::linkage_t *dnl = dn->get_linkage();
7225 if (dnl->is_primary()) { // check for subdirectories, etc
7226 CInode *in = dnl->get_inode();
7227 bool keep_inode = false;
7228 if (in->is_dir()) {
7229 auto&& subdirs = in->get_dirfrags();
7230 for (const auto& subdir : subdirs) {
7231 if (subdir->is_subtree_root()) {
7232 keep_inode = true;
7233 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7234 } else {
7235 if (trim_non_auth_subtree(subdir))
7236 keep_inode = true;
7237 else {
7238 in->close_dirfrag(subdir->get_frag());
7239 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7240 }
7241 }
7242 }
7243
7244 }
7245 if (!keep_inode) { // remove it!
7246 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
7247 dir->unlink_inode(dn, false);
7248 remove_inode(in);
7249 ceph_assert(!dir->has_bloom());
7250 dir->remove_dentry(dn);
7251 } else {
7252 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7253 dn->state_clear(CDentry::STATE_AUTH);
7254 in->state_clear(CInode::STATE_AUTH);
7255 }
7256 } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
7257 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7258 } else { // just remove it
7259 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7260 if (dnl->is_remote())
7261 dir->unlink_inode(dn, false);
7262 dir->remove_dentry(dn);
7263 }
7264 }
7265 dir->state_clear(CDir::STATE_AUTH);
7266 /**
7267 * We've now checked all our children and deleted those that need it.
7268 * Now return to caller, and tell them if *we're* a keeper.
7269 */
7270 return keep_dir || dir->get_num_any();
7271 }
7272
7273 /*
7274 * during replay, when we determine a subtree is no longer ours, we
7275 * try to trim it from our cache. because subtrees must be connected
7276 * to the root, the fact that we can trim this tree may mean that our
7277 * children or parents can also be trimmed.
7278 */
7279 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7280 {
7281 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7282
7283 // can we now trim child subtrees?
7284 set<CDir*> bounds;
7285 get_subtree_bounds(dir, bounds);
7286 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7287 CDir *bd = *p;
7288 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7289 bd->get_num_any() == 0 && // and empty
7290 can_trim_non_auth_dirfrag(bd)) {
7291 CInode *bi = bd->get_inode();
7292 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7293 remove_subtree(bd);
7294 bd->mark_clean();
7295 bi->close_dirfrag(bd->get_frag());
7296 }
7297 }
7298
7299 if (trim_non_auth_subtree(dir)) {
7300 // keep
7301 try_subtree_merge(dir);
7302 } else {
7303 // can we trim this subtree (and possibly our ancestors) too?
7304 while (true) {
7305 CInode *diri = dir->get_inode();
7306 if (diri->is_base()) {
7307 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7308 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7309 remove_subtree(dir);
7310 dir->mark_clean();
7311 diri->close_dirfrag(dir->get_frag());
7312
7313 dout(10) << " removing " << *diri << dendl;
7314 ceph_assert(!diri->get_parent_dn());
7315 ceph_assert(diri->get_num_ref() == 0);
7316 remove_inode(diri);
7317 }
7318 break;
7319 }
7320
7321 CDir *psub = get_subtree_root(diri->get_parent_dir());
7322 dout(10) << " parent subtree is " << *psub << dendl;
7323 if (psub->get_dir_auth().first == mds->get_nodeid())
7324 break; // we are auth, keep.
7325
7326 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7327 remove_subtree(dir);
7328 dir->mark_clean();
7329 diri->close_dirfrag(dir->get_frag());
7330
7331 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7332 if (trim_non_auth_subtree(psub))
7333 break;
7334 dir = psub;
7335 }
7336 }
7337
7338 show_subtrees();
7339 }
7340
7341 void MDCache::standby_trim_segment(LogSegment *ls)
7342 {
7343 auto try_trim_inode = [this](CInode *in) {
7344 if (in->get_num_ref() == 0 &&
7345 !in->item_open_file.is_on_list() &&
7346 in->parent != NULL &&
7347 in->parent->get_num_ref() == 0){
7348 touch_dentry_bottom(in->parent);
7349 }
7350 };
7351
7352 auto try_trim_dentry = [this](CDentry *dn) {
7353 if (dn->get_num_ref() > 0)
7354 return;
7355 auto in = dn->get_linkage()->inode;
7356 if(in && in->item_open_file.is_on_list())
7357 return;
7358 touch_dentry_bottom(dn);
7359 };
7360
7361 ls->new_dirfrags.clear_list();
7362 ls->open_files.clear_list();
7363
7364 while (!ls->dirty_dirfrags.empty()) {
7365 CDir *dir = ls->dirty_dirfrags.front();
7366 dir->mark_clean();
7367 if (dir->inode)
7368 try_trim_inode(dir->inode);
7369 }
7370 while (!ls->dirty_inodes.empty()) {
7371 CInode *in = ls->dirty_inodes.front();
7372 in->mark_clean();
7373 try_trim_inode(in);
7374 }
7375 while (!ls->dirty_dentries.empty()) {
7376 CDentry *dn = ls->dirty_dentries.front();
7377 dn->mark_clean();
7378 try_trim_dentry(dn);
7379 }
7380 while (!ls->dirty_parent_inodes.empty()) {
7381 CInode *in = ls->dirty_parent_inodes.front();
7382 in->clear_dirty_parent();
7383 try_trim_inode(in);
7384 }
7385 while (!ls->dirty_dirfrag_dir.empty()) {
7386 CInode *in = ls->dirty_dirfrag_dir.front();
7387 in->filelock.remove_dirty();
7388 try_trim_inode(in);
7389 }
7390 while (!ls->dirty_dirfrag_nest.empty()) {
7391 CInode *in = ls->dirty_dirfrag_nest.front();
7392 in->nestlock.remove_dirty();
7393 try_trim_inode(in);
7394 }
7395 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7396 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7397 in->dirfragtreelock.remove_dirty();
7398 try_trim_inode(in);
7399 }
7400 while (!ls->truncating_inodes.empty()) {
7401 auto it = ls->truncating_inodes.begin();
7402 CInode *in = *it;
7403 ls->truncating_inodes.erase(it);
7404 in->put(CInode::PIN_TRUNCATING);
7405 try_trim_inode(in);
7406 }
7407 }
7408
7409 void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
7410 {
7411 mds_rank_t from = mds_rank_t(m->get_from());
7412
7413 dout(7) << "cache_expire from mds." << from << dendl;
7414
7415 if (mds->get_state() < MDSMap::STATE_REJOIN) {
7416 return;
7417 }
7418
7419 set<SimpleLock *> gather_locks;
7420 // loop over realms
7421 for (const auto &p : m->realms) {
7422 // check container?
7423 if (p.first.ino > 0) {
7424 CInode *expired_inode = get_inode(p.first.ino);
7425 ceph_assert(expired_inode); // we had better have this.
7426 CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
7427 ceph_assert(parent_dir);
7428
7429 int export_state = -1;
7430 if (parent_dir->is_auth() && parent_dir->is_exporting()) {
7431 export_state = migrator->get_export_state(parent_dir);
7432 ceph_assert(export_state >= 0);
7433 }
7434
7435 if (!parent_dir->is_auth() ||
7436 (export_state != -1 &&
7437 ((export_state == Migrator::EXPORT_WARNING &&
7438 migrator->export_has_warned(parent_dir,from)) ||
7439 export_state == Migrator::EXPORT_EXPORTING ||
7440 export_state == Migrator::EXPORT_LOGGINGFINISH ||
7441 (export_state == Migrator::EXPORT_NOTIFYING &&
7442 !migrator->export_has_notified(parent_dir,from))))) {
7443
7444 // not auth.
7445 dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
7446 ceph_assert(parent_dir->is_frozen_tree_root());
7447
7448 // make a message container
7449
7450 auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
7451 if (em.second)
7452 em.first->second = make_message<MCacheExpire>(from); /* new */
7453
7454 // merge these expires into it
7455 em.first->second->add_realm(p.first, p.second);
7456 continue;
7457 }
7458 ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
7459 (export_state == Migrator::EXPORT_WARNING &&
7460 !migrator->export_has_warned(parent_dir, from)));
7461
7462 dout(7) << "expires for " << *parent_dir << dendl;
7463 } else {
7464 dout(7) << "containerless expires (root, stray inodes)" << dendl;
7465 }
7466
7467 // INODES
7468 for (const auto &q : p.second.inodes) {
7469 CInode *in = get_inode(q.first);
7470 unsigned nonce = q.second;
7471
7472 if (!in) {
7473 dout(0) << " inode expire on " << q.first << " from " << from
7474 << ", don't have it" << dendl;
7475 ceph_assert(in);
7476 }
7477 ceph_assert(in->is_auth());
7478 dout(20) << __func__ << ": expiring inode " << *in << dendl;
7479
7480 // check nonce
7481 if (nonce == in->get_replica_nonce(from)) {
7482 // remove from our cached_by
7483 dout(7) << " inode expire on " << *in << " from mds." << from
7484 << " cached_by was " << in->get_replicas() << dendl;
7485 inode_remove_replica(in, from, false, gather_locks);
7486 }
7487 else {
7488 // this is an old nonce, ignore expire.
7489 dout(7) << " inode expire on " << *in << " from mds." << from
7490 << " with old nonce " << nonce
7491 << " (current " << in->get_replica_nonce(from) << "), dropping"
7492 << dendl;
7493 }
7494 }
7495
7496 // DIRS
7497 for (const auto &q : p.second.dirs) {
7498 CDir *dir = get_dirfrag(q.first);
7499 unsigned nonce = q.second;
7500
7501 if (!dir) {
7502 CInode *diri = get_inode(q.first.ino);
7503 if (diri) {
7504 if (mds->is_rejoin() &&
7505 rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
7506 !diri->is_replica(from)) {
7507 auto&& ls = diri->get_nested_dirfrags();
7508 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7509 << " while rejoining, inode isn't replicated" << dendl;
7510 for (const auto& d : ls) {
7511 dir = d;
7512 if (dir->is_replica(from)) {
7513 dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
7514 dir->remove_replica(from);
7515 }
7516 }
7517 continue;
7518 }
7519 CDir *other = diri->get_approx_dirfrag(q.first.frag);
7520 if (other) {
7521 dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
7522 << " have " << *other << ", mismatched frags, dropping" << dendl;
7523 continue;
7524 }
7525 }
7526 dout(0) << " dir expire on " << q.first << " from " << from
7527 << ", don't have it" << dendl;
7528 ceph_assert(dir);
7529 }
7530 dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;
7531
7532 ceph_assert(dir->is_auth());
7533
7534 // check nonce
7535 if (nonce == dir->get_replica_nonce(from)) {
7536 // remove from our cached_by
7537 dout(7) << " dir expire on " << *dir << " from mds." << from
7538 << " replicas was " << dir->get_replicas() << dendl;
7539 dir->remove_replica(from);
7540 }
7541 else {
7542 // this is an old nonce, ignore expire.
7543 dout(7) << " dir expire on " << *dir << " from mds." << from
7544 << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
7545 << "), dropping" << dendl;
7546 }
7547 }
7548
7549 // DENTRIES
7550 for (const auto &pd : p.second.dentries) {
7551 dout(10) << " dn expires in dir " << pd.first << dendl;
7552 CInode *diri = get_inode(pd.first.ino);
7553 ceph_assert(diri);
7554 CDir *dir = diri->get_dirfrag(pd.first.frag);
7555
7556 if (!dir) {
7557 dout(0) << " dn expires on " << pd.first << " from " << from
7558 << ", must have refragmented" << dendl;
7559 } else {
7560 ceph_assert(dir->is_auth());
7561 }
7562
7563 for (const auto &p : pd.second) {
7564 unsigned nonce = p.second;
7565 CDentry *dn;
7566
7567 if (dir) {
7568 dn = dir->lookup(p.first.first, p.first.second);
7569 } else {
7570 // which dirfrag for this dentry?
7571 CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
7572 ceph_assert(dir);
7573 ceph_assert(dir->is_auth());
7574 dn = dir->lookup(p.first.first, p.first.second);
7575 }
7576
7577 if (!dn) {
7578 if (dir)
7579 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
7580 else
7581 dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
7582 }
7583 ceph_assert(dn);
7584
7585 if (nonce == dn->get_replica_nonce(from)) {
7586 dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
7587 dentry_remove_replica(dn, from, gather_locks);
7588 }
7589 else {
7590 dout(7) << " dentry_expire on " << *dn << " from mds." << from
7591 << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
7592 << "), dropping" << dendl;
7593 }
7594 }
7595 }
7596 }
7597
7598 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
7599 if (!(*p)->is_stable())
7600 mds->locker->eval_gather(*p);
7601 }
7602 }
7603
7604 void MDCache::process_delayed_expire(CDir *dir)
7605 {
7606 dout(7) << "process_delayed_expire on " << *dir << dendl;
7607 for (const auto &p : delayed_expire[dir]) {
7608 handle_cache_expire(p.second);
7609 }
7610 delayed_expire.erase(dir);
7611 }
7612
7613 void MDCache::discard_delayed_expire(CDir *dir)
7614 {
7615 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7616 delayed_expire.erase(dir);
7617 }
7618
7619 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7620 set<SimpleLock *>& gather_locks)
7621 {
7622 in->remove_replica(from);
7623 in->set_mds_caps_wanted(from, 0);
7624
7625 // note: this code calls _eval more often than it needs to!
7626 // fix lock
7627 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7628 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7629 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7630 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7631 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7632 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7633
7634 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7635 // Don't remove the recovering mds from lock's gathering list because
7636 // it may hold rejoined wrlocks.
7637 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7638 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7639 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7640 }
7641
7642 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7643 {
7644 dn->remove_replica(from);
7645
7646 // fix lock
7647 if (dn->lock.remove_replica(from))
7648 gather_locks.insert(&dn->lock);
7649
7650 // Replicated strays might now be elegible for purge
7651 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7652 if (dnl->is_primary()) {
7653 maybe_eval_stray(dnl->get_inode());
7654 }
7655 }
7656
7657 void MDCache::trim_client_leases()
7658 {
7659 utime_t now = ceph_clock_now();
7660
7661 dout(10) << "trim_client_leases" << dendl;
7662
7663 std::size_t pool = 0;
7664 for (const auto& list : client_leases) {
7665 pool += 1;
7666 if (list.empty())
7667 continue;
7668
7669 auto before = list.size();
7670 while (!list.empty()) {
7671 ClientLease *r = list.front();
7672 if (r->ttl > now) break;
7673 CDentry *dn = static_cast<CDentry*>(r->parent);
7674 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7675 dn->remove_client_lease(r, mds->locker);
7676 }
7677 auto after = list.size();
7678 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7679 << (before-after) << " leases, " << after << " left" << dendl;
7680 }
7681 }
7682
7683 void MDCache::check_memory_usage()
7684 {
7685 static MemoryModel mm(g_ceph_context);
7686 static MemoryModel::snap last;
7687 mm.sample(&last);
7688 static MemoryModel::snap baseline = last;
7689
7690 // check client caps
7691 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
7692 double caps_per_inode = 0.0;
7693 if (CInode::count())
7694 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7695
7696 dout(2) << "Memory usage: "
7697 << " total " << last.get_total()
7698 << ", rss " << last.get_rss()
7699 << ", heap " << last.get_heap()
7700 << ", baseline " << baseline.get_heap()
7701 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7702 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7703 << dendl;
7704
7705 mds->update_mlogger();
7706 mds->mlogger->set(l_mdm_rss, last.get_rss());
7707 mds->mlogger->set(l_mdm_heap, last.get_heap());
7708 }
7709
7710
7711
7712 // =========================================================================================
7713 // shutdown
7714
7715 class C_MDC_ShutdownCheck : public MDCacheContext {
7716 public:
7717 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7718 void finish(int) override {
7719 mdcache->shutdown_check();
7720 }
7721 };
7722
7723 void MDCache::shutdown_check()
7724 {
7725 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7726
7727 // cache
7728 char old_val[32] = { 0 };
7729 char *o = old_val;
7730 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7731 g_conf().set_val("debug_mds", "10");
7732 g_conf().apply_changes(nullptr);
7733 show_cache();
7734 g_conf().set_val("debug_mds", old_val);
7735 g_conf().apply_changes(nullptr);
7736 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7737
7738 // this
7739 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7740 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7741
7742
7743 if (mds->objecter->is_active()) {
7744 dout(0) << "objecter still active" << dendl;
7745 mds->objecter->dump_active();
7746 }
7747 }
7748
7749
7750 void MDCache::shutdown_start()
7751 {
7752 dout(5) << "shutdown_start" << dendl;
7753
7754 if (g_conf()->mds_shutdown_check)
7755 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7756
7757 // g_conf()->debug_mds = 10;
7758 }
7759
7760
7761
// Run one pass of the MDS shutdown sequence.
//
// Returns true when mds->is_stopped() already holds, or when every stage
// below has completed and the cache has been emptied (subtrees empty, myin
// and the global snaprealm inode removed).  Returns false as soon as a stage
// still has outstanding work; all stages are re-evaluated from the top on
// the next call.
// NOTE(review): presumably invoked repeatedly by the caller until it returns
// true -- the caller is not visible from here.
7762 bool MDCache::shutdown_pass()
7763 {
7764 dout(7) << "shutdown_pass" << dendl;
7765
7766 if (mds->is_stopped()) {
7767 dout(7) << " already shut down" << dendl;
7768 show_cache();
7769 show_subtrees();
7770 return true;
7771 }
7772
7773 // empty stray dir
// (the result is only acted upon after the subtree export pass below)
7774 bool strays_all_exported = shutdown_export_strays();
7775
7776 // trim cache
7777 trim(UINT64_MAX);
7778 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7779
7780 // Export all subtrees to another active (usually rank 0) if not rank 0
7781 int num_auth_subtree = 0;
7782 if (!subtrees.empty() && mds->get_nodeid() != 0) {
7783 dout(7) << "looking for subtrees to export" << dendl;
7784 std::vector<CDir*> ls;
7785 for (auto& [dir, bounds] : subtrees) {
7786 dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
7787 if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
7788 continue;
// counted even if it cannot be exported right now, so that the pass keeps
// returning false while any auth subtree remains
7789 num_auth_subtree++;
7790 if (dir->is_frozen() ||
7791 dir->is_freezing() ||
7792 dir->is_ambiguous_dir_auth() ||
7793 dir->state_test(CDir::STATE_EXPORTING) ||
7794 dir->get_inode()->is_ephemerally_pinned()) {
7795 continue;
7796 }
7797 ls.push_back(dir);
7798 }
7799
7800 migrator->clear_export_queue();
7801 // stopping mds does not call MDBalancer::tick()
7802 mds->balancer->handle_export_pins();
7803 for (const auto& dir : ls) {
// prefer the authority of the subtree's inode; fall back to rank 0 when that
// rank is not active
7804 mds_rank_t dest = dir->get_inode()->authority().first;
7805 if (dest > 0 && !mds->mdsmap->is_active(dest))
7806 dest = 0;
7807 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7808 migrator->export_dir_nicely(dir, dest);
7809 }
7810 }
7811
7812 if (!strays_all_exported) {
7813 dout(7) << "waiting for strays to migrate" << dendl;
7814 return false;
7815 }
7816
7817 if (num_auth_subtree > 0) {
// num_auth_subtree is only ever incremented in the non-rank-0 branch above
7818 ceph_assert(mds->get_nodeid() > 0);
7819 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7820 show_subtrees();
7821 return false;
7822 }
7823
7824 // close out any sessions (and open files!) before we try to trim the log, etc.
7825 if (mds->sessionmap.have_unclosed_sessions()) {
7826 if (!mds->server->terminating_sessions)
7827 mds->server->terminate_sessions();
7828 return false;
7829 }
7830
7831 // Fully trim the log so that all objects in cache are clean and may be
7832 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7833 // trim the log such that the cache eventually becomes clean.
7834 if (mds->mdlog->get_num_segments() > 0) {
7835 auto ls = mds->mdlog->get_current_segment();
7836 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7837 // Current segment contains events other than subtreemap or
7838 // there are dirty dirfrags (see CDir::log_mark_dirty())
7839 mds->mdlog->start_new_segment();
7840 mds->mdlog->flush();
7841 }
7842 }
7843 mds->mdlog->trim_all();
7844 if (mds->mdlog->get_num_segments() > 1) {
7845 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7846 return false;
7847 }
7848
7849 // drop our reference to our stray dir inode
7850 for (int i = 0; i < NUM_STRAY; ++i) {
7851 if (strays[i] &&
7852 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7853 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7854 strays[i]->put(CInode::PIN_STRAY);
7855 strays[i]->put_stickydirs();
7856 }
7857 }
7858
// mydir is only tracked here while it is still a subtree root; it is the one
// subtree allowed to remain in the check below
7859 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7860 if (mydir && !mydir->is_subtree_root())
7861 mydir = NULL;
7862
7863 // subtrees map not empty yet?
7864 if (subtrees.size() > (mydir ? 1 : 0)) {
7865 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7866 show_subtrees();
7867 migrator->show_importing();
7868 migrator->show_exporting();
7869 if (!migrator->is_importing() && !migrator->is_exporting())
7870 show_cache();
7871 return false;
7872 }
7873 ceph_assert(!migrator->is_exporting());
7874 ceph_assert(!migrator->is_importing());
7875
7876 // replicas may dirty scatter locks
7877 if (myin && myin->is_replicated()) {
7878 dout(7) << "still have replicated objects" << dendl;
7879 return false;
7880 }
7881
7882 if ((myin && myin->get_num_auth_pins()) ||
7883 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
7884 dout(7) << "still have auth pinned objects" << dendl;
7885 return false;
7886 }
7887
7888 // (only do this once!)
7889 if (!mds->mdlog->is_capped()) {
7890 dout(7) << "capping the log" << dendl;
7891 mds->mdlog->cap();
7892 }
7893
7894 if (!mds->mdlog->empty())
7895 mds->mdlog->trim(0);
7896
7897 if (!mds->mdlog->empty()) {
7898 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7899 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7900 return false;
7901 }
7902
// did_shutdown_log_cap makes sure the journal header is written only once
7903 if (!did_shutdown_log_cap) {
7904 // flush journal header
7905 dout(7) << "writing header for (now-empty) journal" << dendl;
7906 ceph_assert(mds->mdlog->empty());
7907 mds->mdlog->write_head(0);
7908 // NOTE: filer active checker below will block us until this completes.
7909 did_shutdown_log_cap = true;
7910 return false;
7911 }
7912
7913 // filer active?
7914 if (mds->objecter->is_active()) {
7915 dout(7) << "objecter still active" << dendl;
7916 mds->objecter->dump_active();
7917 return false;
7918 }
7919
7920 // trim what we can from the cache
7921 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7922 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7923 show_cache();
7924 //dump();
7925 return false;
7926 }
7927
7928 // make mydir subtree go away
7929 if (mydir) {
7930 if (mydir->get_num_ref() > 1) { // subtree pin
7931 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7932 show_cache();
7933 return false;
7934 }
7935
7936 remove_subtree(mydir);
7937 myin->close_dirfrag(mydir->get_frag());
7938 }
7939 ceph_assert(subtrees.empty());
7940
7941 if (myin) {
// remove_inode() is expected to reset the myin member (asserted right after)
7942 remove_inode(myin);
7943 ceph_assert(!myin);
7944 }
7945
7946 if (global_snaprealm) {
7947 remove_inode(global_snaprealm->inode);
7948 global_snaprealm = nullptr;
7949 }
7950
7951 // done!
7952 dout(5) << "shutdown done." << dendl;
7953 return true;
7954 }
7955
// Hand the stray dentries of a stopping, non-zero rank over to rank 0.
//
// Work is batched: inodes being migrated (or purged) are tracked in
// shutdown_exporting_strays, a scan stops once that set reaches
// MAX_EXPORTING entries, and no new scan is started while the set is at
// least two-thirds of MAX_EXPORTING.  shutdown_export_next is the resume
// cursor (dirfrag, dentry name) carried between calls.
//
// Returns true on rank 0, or when a scan that started at the first stray
// dirfrag ran to the end with shutdown_exporting_strays empty; returns
// false in every other case.
7956 bool MDCache::shutdown_export_strays()
7957 {
7958 static const unsigned MAX_EXPORTING = 100;
7959
7960 if (mds->get_nodeid() == 0)
7961 return true;
7962
// back-pressure: size >= 2/3 * MAX_EXPORTING
7963 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7964 return false;
7965
7966 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7967 << " '" << shutdown_export_next.second << "'" << dendl;
7968
7969 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7970 bool all_exported = false;
7971
7972 again:
// working copy of the cursor; written back to shutdown_export_next at done:
7973 auto next = shutdown_export_next;
7974
7975 for (int i = 0; i < NUM_STRAY; ++i) {
7976 CInode *strayi = strays[i];
7977 if (!strayi ||
7978 !strayi->state_test(CInode::STATE_STRAYPINNED))
7979 continue;
// skip stray dirs that lie before the cursor
7980 if (strayi->ino() < next.first.ino)
7981 continue;
7982
7983 deque<CDir*> dfls;
7984 strayi->get_dirfrags(dfls);
7985
7986 while (!dfls.empty()) {
7987 CDir *dir = dfls.front();
7988 dfls.pop_front();
7989
7990 if (dir->dirfrag() < next.first)
7991 continue;
// moved on to a later dirfrag: restart from its first dentry
7992 if (next.first < dir->dirfrag()) {
7993 next.first = dir->dirfrag();
7994 next.second.clear();
7995 }
7996
7997 if (!dir->is_complete()) {
// When nothing is in flight, re-run this function once the fetch completes;
// otherwise fin stays null and the fetch has no completion callback.
7998 MDSContext *fin = nullptr;
7999 if (shutdown_exporting_strays.empty()) {
8000 fin = new MDSInternalContextWrapper(mds,
8001 new LambdaContext([this](int r) {
8002 shutdown_export_strays();
8003 })
8004 );
8005 }
8006 dir->fetch(fin);
8007 goto done;
8008 }
8009
8010 CDir::dentry_key_map::iterator it;
8011 if (next.second.empty()) {
8012 it = dir->begin();
8013 } else {
// resume at the dentry name saved in the cursor
8014 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
8015 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
8016 }
8017
8018 for (; it != dir->end(); ++it) {
8019 CDentry *dn = it->second;
8020 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8021 if (dnl->is_null())
8022 continue;
8023
// rank 0 is not active: stop at the first dentry that is not being purged
// and remember it as the resume point
8024 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
8025 next.second = it->first.name;
8026 goto done;
8027 }
8028
8029 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
8030 if (!ret.second) {
8031 dout(10) << "already exporting/purging " << *dn << dendl;
8032 continue;
8033 }
8034
8035 // Don't try to migrate anything that is actually
8036 // being purged right now
8037 if (!dn->state_test(CDentry::STATE_PURGING))
8038 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
8039
// batch is full: advance the cursor to the next dentry, the next dirfrag of
// this stray dir, or past this stray inode, then stop
8040 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
8041 ++it;
8042 if (it != dir->end()) {
8043 next.second = it->first.name;
8044 } else {
8045 if (dfls.empty())
8046 next.first.ino.val++;
8047 else
8048 next.first = dfls.front()->dirfrag();
8049 next.second.clear();
8050 }
8051 goto done;
8052 }
8053 }
8054 }
8055 }
8056
// The scan ran to the end.  If nothing is in flight but the scan did not
// start at the very first stray dirfrag, rewind the cursor and scan again;
// only a full scan with nothing in flight counts as "all exported".
8057 if (shutdown_exporting_strays.empty()) {
8058 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
8059 if (first_df < shutdown_export_next.first ||
8060 !shutdown_export_next.second.empty()) {
8061 shutdown_export_next.first = first_df;
8062 shutdown_export_next.second.clear();
8063 goto again;
8064 }
8065 all_exported = true;
8066 }
8067
8068 done:
8069 shutdown_export_next = next;
8070 return all_exported;
8071 }
8072
8073 // ========= messaging ==============
8074
8075 void MDCache::dispatch(const cref_t<Message> &m)
8076 {
8077 switch (m->get_type()) {
8078
8079 // RESOLVE
8080 case MSG_MDS_RESOLVE:
8081 handle_resolve(ref_cast<MMDSResolve>(m));
8082 break;
8083 case MSG_MDS_RESOLVEACK:
8084 handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
8085 break;
8086
8087 // REJOIN
8088 case MSG_MDS_CACHEREJOIN:
8089 handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
8090 break;
8091
8092 case MSG_MDS_DISCOVER:
8093 handle_discover(ref_cast<MDiscover>(m));
8094 break;
8095 case MSG_MDS_DISCOVERREPLY:
8096 handle_discover_reply(ref_cast<MDiscoverReply>(m));
8097 break;
8098
8099 case MSG_MDS_DIRUPDATE:
8100 handle_dir_update(ref_cast<MDirUpdate>(m));
8101 break;
8102
8103 case MSG_MDS_CACHEEXPIRE:
8104 handle_cache_expire(ref_cast<MCacheExpire>(m));
8105 break;
8106
8107 case MSG_MDS_DENTRYLINK:
8108 handle_dentry_link(ref_cast<MDentryLink>(m));
8109 break;
8110 case MSG_MDS_DENTRYUNLINK:
8111 handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
8112 break;
8113
8114 case MSG_MDS_FRAGMENTNOTIFY:
8115 handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
8116 break;
8117 case MSG_MDS_FRAGMENTNOTIFYACK:
8118 handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
8119 break;
8120
8121 case MSG_MDS_FINDINO:
8122 handle_find_ino(ref_cast<MMDSFindIno>(m));
8123 break;
8124 case MSG_MDS_FINDINOREPLY:
8125 handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
8126 break;
8127
8128 case MSG_MDS_OPENINO:
8129 handle_open_ino(ref_cast<MMDSOpenIno>(m));
8130 break;
8131 case MSG_MDS_OPENINOREPLY:
8132 handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
8133 break;
8134
8135 case MSG_MDS_SNAPUPDATE:
8136 handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
8137 break;
8138
8139 default:
8140 derr << "cache unknown message " << m->get_type() << dendl;
8141 ceph_abort_msg("cache unknown message");
8142 }
8143 }
8144
// Walk `path` one component at a time, starting from its base inode
// (path.get_ino()).
//
// flags is a bitmask of MDS_TRAVERSE_* values, decoded into the booleans at
// the top of the function.  If pdnvec is given it is cleared and then filled
// with the dentries visited; if pin is given it is updated to the last inode
// reached.  mdr may be a null ref only for discover-style traversals
// (asserted below when forwarding is enabled).
//
// Return values as produced by the code below:
//   0   traversal finished
//   1   not finished yet: a context built by cf was handed to a waiter list,
//       a discover, a fetch or an open, or a lock could not be acquired
//   2   the request was passed to request_forward()
//   <0  a -CEPHFS_* error (ESTALE, ENOTDIR, EINVAL, ENOENT, EIO)
8145 int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
8146 const filepath& path, int flags,
8147 vector<CDentry*> *pdnvec, CInode **pin)
8148 {
8149 bool discover = (flags & MDS_TRAVERSE_DISCOVER);
// forwarding is the default whenever discover was not requested; discover
// may still be switched on later for a non-auth miss (see below)
8150 bool forward = !discover;
8151 bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
8152 bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
8153 bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
8154 bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
8155 bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
8156 bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
8157 bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
8158
8159 if (forward)
8160 ceph_assert(mdr); // forward requires a request
8161
8162 snapid_t snapid = CEPH_NOSNAP;
8163 if (mdr)
8164 mdr->snapid = snapid;
8165
8166 client_t client = mdr ? mdr->get_client() : -1;
8167
8168 if (mds->logger) mds->logger->inc(l_mds_traverse);
8169
8170 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8171 CInode *cur = get_inode(path.get_ino());
8172 if (!cur) {
8173 if (MDS_INO_IS_MDSDIR(path.get_ino())) {
8174 open_foreign_mdsdir(path.get_ino(), cf.build());
8175 return 1;
8176 }
8177 if (MDS_INO_IS_STRAY(path.get_ino())) {
// Base is a stray dir that is not in cache: run a discover traversal for the
// stray dir by name under the owning rank's mdsdir.  The name is taken from
// the local strays[idx] entry.  Note that this `path` shadows the parameter.
8178 mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
8179 unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
8180 filepath path(strays[idx]->get_parent_dn()->get_name(),
8181 MDS_INO_MDSDIR(rank));
8182 MDRequestRef null_ref;
8183 return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
8184 }
8185 return -CEPHFS_ESTALE;
8186 }
8187 if (cur->state_test(CInode::STATE_PURGING))
8188 return -CEPHFS_ESTALE;
8189
8190 if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
8191 mds->locker->find_and_attach_lock_cache(mdr, cur);
8192
8193 if (mdr && mdr->lock_cache) {
8194 if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
8195 mdr->dir_layout = mdr->lock_cache->get_dir_layout();
8196 } else if (rdlock_snap) {
// n selects which of the two snap-lock state bits applies (SNAP2 -> 1)
// NOTE(review): mdr is dereferenced here without a null check -- callers
// presumably only pass the RDLOCK_SNAP flags together with a valid mdr.
8197 int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
8198 if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
8199 (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
8200 bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
8201 if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
8202 return 1;
8203 }
8204 }
8205
8206 // start trace
8207 if (pdnvec)
8208 pdnvec->clear();
8209 if (pin)
8210 *pin = cur;
8211
8212 MutationImpl::LockOpVec lov;
8213
// depth is advanced explicitly inside the loop body; every iteration either
// increments it and continues, breaks, or returns
8214 for (unsigned depth = 0; depth < path.depth(); ) {
8215 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8216 << "' snapid " << snapid << dendl;
8217
8218 if (!cur->is_dir()) {
8219 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8220 return -CEPHFS_ENOTDIR;
8221 }
8222
8223 // walk into snapdir?
// (an empty path component selects the snapdir)
8224 if (path[depth].length() == 0) {
8225 dout(10) << "traverse: snapdir" << dendl;
8226 if (!mdr || depth > 0) // snapdir must be the first component
8227 return -CEPHFS_EINVAL;
8228 snapid = CEPH_SNAPDIR;
8229 mdr->snapid = snapid;
8230 depth++;
8231 continue;
8232 }
8233 // walk thru snapdir?
// (the component following the snapdir is a snapshot name, resolved to a
// snapid through the inode's snaprealm)
8234 if (snapid == CEPH_SNAPDIR) {
8235 if (!mdr)
8236 return -CEPHFS_EINVAL;
8237 SnapRealm *realm = cur->find_snaprealm();
8238 snapid = realm->resolve_snapname(path[depth], cur->ino());
8239 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
8240 if (!snapid) {
8241 if (pdnvec)
8242 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8243 return -CEPHFS_ENOENT;
8244 }
8245 mdr->snapid = snapid;
8246 depth++;
8247 continue;
8248 }
8249
8250 // open dir
8251 frag_t fg = cur->pick_dirfrag(path[depth]);
8252 CDir *curdir = cur->get_dirfrag(fg);
8253 if (!curdir) {
8254 if (cur->is_auth()) {
8255 // parent dir frozen_dir?
8256 if (cur->is_frozen()) {
8257 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
8258 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8259 return 1;
8260 }
8261 curdir = cur->get_or_open_dirfrag(this, fg);
8262 } else {
8263 // discover?
8264 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
8265 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
8266 path_locked);
8267 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8268 return 1;
8269 }
8270 }
8271 ceph_assert(curdir);
8272
8273 #ifdef MDS_VERIFY_FRAGSTAT
8274 if (curdir->is_complete())
8275 curdir->verify_fragstat();
8276 #endif
8277
8278 // frozen?
8279 /*
8280 if (curdir->is_frozen()) {
8281 // doh!
8282 // FIXME: traverse is allowed?
8283 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8284 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8285 if (onfinish) delete onfinish;
8286 return 1;
8287 }
8288 */
8289
// The caller wants the auth copy of the final dentry: on the last component,
// wait for a single authority or forward to the dirfrag's auth first.
8290 if (want_auth && want_dentry && depth == path.depth() - 1) {
8291 if (curdir->is_ambiguous_auth()) {
8292 dout(10) << "waiting for single auth on " << *curdir << dendl;
8293 curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8294 return 1;
8295 }
8296 if (!curdir->is_auth()) {
8297 dout(10) << "fw to auth for " << *curdir << dendl;
8298 request_forward(mdr, curdir->authority().first);
8299 return 2;
8300 }
8301 }
8302
8303 // Before doing dirfrag->dn lookup, compare with DamageTable's
8304 // record of which dentries were unreadable
8305 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8306 dout(4) << "traverse: stopped lookup at damaged dentry "
8307 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8308 return -CEPHFS_EIO;
8309 }
8310
8311 // dentry
8312 CDentry *dn = curdir->lookup(path[depth], snapid);
8313 if (dn) {
8314 if (dn->state_test(CDentry::STATE_PURGING))
8315 return -CEPHFS_ENOENT;
8316
8317 if (rdlock_path) {
8318 lov.clear();
// last component with XLOCK_DENTRY: xlock the dentry (plus wrlocks on the
// parent unless this is the first component and a lock cache is attached);
// every other component only gets a dentry rdlock
8319 if (xlock_dentry && depth == path.depth() - 1) {
8320 if (depth > 0 || !mdr->lock_cache) {
8321 lov.add_wrlock(&cur->filelock);
8322 lov.add_wrlock(&cur->nestlock);
8323 if (rdlock_authlock)
8324 lov.add_rdlock(&cur->authlock);
8325 }
8326 lov.add_xlock(&dn->lock);
8327 } else {
8328 // force client to flush async dir operation if necessary
8329 if (cur->filelock.is_cached())
8330 lov.add_wrlock(&cur->filelock);
8331 lov.add_rdlock(&dn->lock);
8332 }
8333 if (!mds->locker->acquire_locks(mdr, lov)) {
8334 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8335 return 1;
8336 }
8337 } else if (!path_locked &&
8338 !dn->lock.can_read(client) &&
8339 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
// not taking locks ourselves: the dentry must at least be readable by this
// client (or be xlocked by this very request), otherwise wait for it
8340 dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
8341 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8342 if (mds->logger)
8343 mds->logger->inc(l_mds_traverse_lock);
8344 if (dn->is_auth() && dn->lock.is_unstable_and_locked())
8345 mds->mdlog->flush();
8346 return 1;
8347 }
8348
8349 if (pdnvec)
8350 pdnvec->push_back(dn);
8351
8352 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8353 // can we conclude CEPHFS_ENOENT?
8354 if (dnl->is_null()) {
8355 dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
8356 if (depth == path.depth() - 1) {
// a null final dentry counts as success when the caller asked for the
// dentry; the break leads to the success path after the loop
8357 if (want_dentry)
8358 break;
8359 } else {
8360 if (pdnvec)
8361 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8362 }
8363 return -CEPHFS_ENOENT;
8364 }
8365
8366 // do we have inode?
8367 CInode *in = dnl->get_inode();
8368 if (!in) {
8369 ceph_assert(dnl->is_remote());
8370 // do i have it?
8371 in = get_inode(dnl->get_remote_ino());
8372 if (in) {
8373 dout(7) << "linking in remote in " << *in << dendl;
8374 dn->link_remote(dnl, in);
8375 } else {
8376 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8377 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8378 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8379 dout(4) << "traverse: remote dentry points to damaged ino "
8380 << *dn << dendl;
8381 return -CEPHFS_EIO;
8382 }
8383 open_remote_dentry(dn, true, cf.build(),
8384 (path_locked && depth == path.depth() - 1));
8385 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8386 return 1;
8387 }
8388 }
8389
8390 cur = in;
8391
8392 if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
8393 lov.clear();
8394 lov.add_rdlock(&cur->snaplock);
8395 if (!mds->locker->acquire_locks(mdr, lov)) {
8396 dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
8397 return 1;
8398 }
8399 }
8400
8401 // add to trace, continue.
8402 touch_inode(cur);
8403 if (pin)
8404 *pin = cur;
8405 depth++;
8406 continue;
8407 }
8408
8409 ceph_assert(!dn);
8410
8411 // MISS. dentry doesn't exist.
8412 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8413
8414 if (curdir->is_auth()) {
8415 // dentry is mine.
// The miss is definitive if the dirfrag is complete, or (head only) if the
// dirfrag has a bloom filter and the name is not in it.
8416 if (curdir->is_complete() ||
8417 (snapid == CEPH_NOSNAP &&
8418 curdir->has_bloom() &&
8419 !curdir->is_in_bloom(path[depth]))) {
8420 // file not found
8421 if (pdnvec) {
8422 // instantiate a null dn?
8423 if (depth < path.depth() - 1) {
8424 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8425 } else if (snapid < CEPH_MAXSNAP) {
8426 dout(20) << " not adding null for snapid " << snapid << dendl;
8427 } else if (curdir->is_frozen()) {
8428 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8429 curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8430 return 1;
8431 } else {
8432 // create a null dentry
8433 dn = curdir->add_null_dentry(path[depth]);
8434 dout(20) << " added null " << *dn << dendl;
8435
8436 if (rdlock_path) {
8437 lov.clear();
8438 if (xlock_dentry) {
8439 if (depth > 0 || !mdr->lock_cache) {
8440 lov.add_wrlock(&cur->filelock);
8441 lov.add_wrlock(&cur->nestlock);
8442 if (rdlock_authlock)
8443 lov.add_rdlock(&cur->authlock);
8444 }
8445 lov.add_xlock(&dn->lock);
8446 } else {
8447 // force client to flush async dir operation if necessary
8448 if (cur->filelock.is_cached())
8449 lov.add_wrlock(&cur->filelock);
8450 lov.add_rdlock(&dn->lock);
8451 }
8452 if (!mds->locker->acquire_locks(mdr, lov)) {
8453 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8454 return 1;
8455 }
8456 }
8457 }
// dn is non-null only if a null dentry was created just above
8458 if (dn) {
8459 pdnvec->push_back(dn);
8460 if (want_dentry)
8461 break;
8462 } else {
8463 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8464 }
8465 }
8466 return -CEPHFS_ENOENT;
8467 } else {
8468
8469 // Check DamageTable for missing fragments before trying to fetch
8470 // this
8471 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8472 dout(4) << "traverse: damaged dirfrag " << *curdir
8473 << ", blocking fetch" << dendl;
8474 return -CEPHFS_EIO;
8475 }
8476
8477 // directory isn't complete; reload
8478 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8479 touch_inode(cur);
8480 curdir->fetch(cf.build(), path[depth]);
8481 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8482 return 1;
8483 }
8484 } else {
8485 // dirfrag/dentry is not mine.
8486 mds_authority_t dauth = curdir->authority();
8487
// a client request that has been forwarded more times than the current
// depth is discovered locally rather than forwarded again
8488 if (forward &&
8489 mdr && mdr->client_request &&
8490 (int)depth < mdr->client_request->get_num_fwd()){
8491 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8492 << " < fwd " << mdr->client_request->get_num_fwd()
8493 << ", discovering instead of forwarding" << dendl;
8494 discover = true;
8495 }
8496
8497 if ((discover)) {
8498 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8499 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
8500 path_locked);
8501 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8502 return 1;
8503 }
8504 if (forward) {
8505 // forward
8506 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8507
8508 if (curdir->is_ambiguous_auth()) {
8509 // wait
8510 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8511 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
8512 return 1;
8513 }
8514
8515 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8516
8517 request_forward(mdr, dauth.first);
8518
8519 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8520 return 2;
8521 }
8522 }
8523
8524 ceph_abort(); // i shouldn't get here
8525 }
8526
// The caller wants the auth copy of the final inode (not just its dentry):
// wait for a single authority or forward to it.
8527 if (want_auth && !want_dentry) {
8528 if (cur->is_ambiguous_auth()) {
8529 dout(10) << "waiting for single auth on " << *cur << dendl;
8530 cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8531 return 1;
8532 }
8533 if (!cur->is_auth()) {
8534 dout(10) << "fw to auth for " << *cur << dendl;
8535 request_forward(mdr, cur->authority().first);
8536 return 2;
8537 }
8538 }
8539
8540 // success.
8541 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8542 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8543 if (mdr)
8544 ceph_assert(mdr->snapid == snapid);
8545
// record in the request which lock sets this traversal has now taken
8546 if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
8547 mdr->locking_state |= MutationImpl::SNAP_LOCKED;
8548 else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
8549 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
8550
8551 if (rdlock_path)
8552 mdr->locking_state |= MutationImpl::PATH_LOCKED;
8553
8554 return 0;
8555 }
8556
8557 CInode *MDCache::cache_traverse(const filepath& fp)
8558 {
8559 dout(10) << "cache_traverse " << fp << dendl;
8560
8561 CInode *in;
8562 unsigned depth = 0;
8563 char mdsdir_name[16];
8564 sprintf(mdsdir_name, "~mds%d", mds->get_nodeid());
8565
8566 if (fp.get_ino()) {
8567 in = get_inode(fp.get_ino());
8568 } else if (fp.depth() > 0 && (fp[0] == "~mdsdir" || fp[0] == mdsdir_name)) {
8569 in = myin;
8570 depth = 1;
8571 } else {
8572 in = root;
8573 }
8574 if (!in)
8575 return NULL;
8576
8577 for (; depth < fp.depth(); depth++) {
8578 std::string_view dname = fp[depth];
8579 frag_t fg = in->pick_dirfrag(dname);
8580 dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
8581 CDir *curdir = in->get_dirfrag(fg);
8582 if (!curdir)
8583 return NULL;
8584 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8585 if (!dn)
8586 return NULL;
8587 in = dn->get_linkage()->get_inode();
8588 if (!in)
8589 return NULL;
8590 }
8591 dout(10) << " got " << *in << dendl;
8592 return in;
8593 }
8594
8595
8596 /**
8597 * open_remote_dir -- open up a remote dirfrag
8598 *
8599 * @param diri base inode
8600 * @param approxfg approximate fragment.
8601 * @param fin completion callback
8602 */
8603 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
8604 {
8605 dout(10) << "open_remote_dir on " << *diri << dendl;
8606 ceph_assert(diri->is_dir());
8607 ceph_assert(!diri->is_auth());
8608 ceph_assert(diri->get_dirfrag(approxfg) == 0);
8609
8610 discover_dir_frag(diri, approxfg, fin);
8611 }
8612
8613
8614 /**
8615 * get_dentry_inode - get or open inode
8616 *
8617 * @param dn the dentry
8618 * @param mdr current request
8619 *
8620 * will return inode for primary, or link up/open up remote link's inode as necessary.
8621 * If it's not available right now, puts mdr on wait list and returns null.
8622 */
8623 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8624 {
8625 CDentry::linkage_t *dnl;
8626 if (projected)
8627 dnl = dn->get_projected_linkage();
8628 else
8629 dnl = dn->get_linkage();
8630
8631 ceph_assert(!dnl->is_null());
8632
8633 if (dnl->is_primary())
8634 return dnl->inode;
8635
8636 ceph_assert(dnl->is_remote());
8637 CInode *in = get_inode(dnl->get_remote_ino());
8638 if (in) {
8639 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8640 dn->link_remote(dnl, in);
8641 return in;
8642 } else {
8643 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8644 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8645 return 0;
8646 }
8647 }
8648
8649 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8650 CDentry *dn;
8651 inodeno_t ino;
8652 MDSContext *onfinish;
8653 bool want_xlocked;
8654 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
8655 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8656 dn->get(MDSCacheObject::PIN_PTRWAITER);
8657 }
8658 void finish(int r) override {
8659 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8660 dn->put(MDSCacheObject::PIN_PTRWAITER);
8661 }
8662 };
8663
8664 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
8665 {
8666 dout(10) << "open_remote_dentry " << *dn << dendl;
8667 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8668 inodeno_t ino = dnl->get_remote_ino();
8669 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1;
8670 open_ino(ino, pool,
8671 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8672 }
8673
8674 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
8675 bool want_xlocked, int r)
8676 {
8677 if (r < 0) {
8678 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8679 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8680 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8681 dn->state_set(CDentry::STATE_BADREMOTEINO);
8682
8683 std::string path;
8684 CDir *dir = dn->get_dir();
8685 if (dir) {
8686 dir->get_inode()->make_path_string(path);
8687 path += "/";
8688 path += dn->get_name();
8689 }
8690
8691 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8692 if (fatal) {
8693 mds->damaged();
8694 ceph_abort(); // unreachable, damaged() respawns us
8695 }
8696 } else {
8697 r = 0;
8698 }
8699 }
8700 fin->complete(r < 0 ? r : 0);
8701 }
8702
8703
8704 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8705 {
8706 // empty trace if we're a base inode
8707 if (in->is_base())
8708 return;
8709
8710 CInode *parent = in->get_parent_inode();
8711 ceph_assert(parent);
8712 make_trace(trace, parent);
8713
8714 CDentry *dn = in->get_parent_dn();
8715 dout(15) << "make_trace adding " << *dn << dendl;
8716 trace.push_back(dn);
8717 }
8718
8719
8720 // -------------------------------------------------------------------------------
8721 // Open inode by inode number
8722
8723 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8724 inodeno_t ino;
8725 public:
8726 bufferlist bl;
8727 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8728 MDCacheIOContext(c), ino(i) {}
8729 void finish(int r) override {
8730 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8731 }
8732 void print(ostream& out) const override {
8733 out << "openino_backtrace_fetch" << ino << ")";
8734 }
8735 };
8736
8737 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8738 inodeno_t ino;
8739 cref_t<MMDSOpenIno> msg;
8740 bool parent;
8741 public:
8742 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
8743 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8744 void finish(int r) override {
8745 if (r < 0 && !parent)
8746 r = -CEPHFS_EAGAIN;
8747 if (msg) {
8748 mdcache->handle_open_ino(msg, r);
8749 return;
8750 }
8751 auto& info = mdcache->opening_inodes.at(ino);
8752 mdcache->_open_ino_traverse_dir(ino, info, r);
8753 }
8754 };
8755
8756 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8757 inodeno_t ino;
8758 public:
8759 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8760 void finish(int r) override {
8761 mdcache->_open_ino_parent_opened(ino, r);
8762 }
8763 };
8764
8765 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8766 {
8767 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8768
8769 open_ino_info_t& info = opening_inodes.at(ino);
8770
8771 CInode *in = get_inode(ino);
8772 if (in) {
8773 dout(10) << " found cached " << *in << dendl;
8774 open_ino_finish(ino, info, in->authority().first);
8775 return;
8776 }
8777
8778 inode_backtrace_t backtrace;
8779 if (err == 0) {
8780 try {
8781 decode(backtrace, bl);
8782 } catch (const buffer::error &decode_exc) {
8783 derr << "corrupt backtrace on ino x0" << std::hex << ino
8784 << std::dec << ": " << decode_exc.what() << dendl;
8785 open_ino_finish(ino, info, -CEPHFS_EIO);
8786 return;
8787 }
8788 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8789 dout(10) << " old object in pool " << info.pool
8790 << ", retrying pool " << backtrace.pool << dendl;
8791 info.pool = backtrace.pool;
8792 C_IO_MDC_OpenInoBacktraceFetched *fin =
8793 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8794 fetch_backtrace(ino, info.pool, fin->bl,
8795 new C_OnFinisher(fin, mds->finisher));
8796 return;
8797 }
8798 } else if (err == -CEPHFS_ENOENT) {
8799 int64_t meta_pool = mds->get_metadata_pool();
8800 if (info.pool != meta_pool) {
8801 dout(10) << " no object in pool " << info.pool
8802 << ", retrying pool " << meta_pool << dendl;
8803 info.pool = meta_pool;
8804 C_IO_MDC_OpenInoBacktraceFetched *fin =
8805 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8806 fetch_backtrace(ino, info.pool, fin->bl,
8807 new C_OnFinisher(fin, mds->finisher));
8808 return;
8809 }
8810 err = 0; // backtrace.ancestors.empty() is checked below
8811 }
8812
8813 if (err == 0) {
8814 if (backtrace.ancestors.empty()) {
8815 dout(10) << " got empty backtrace " << dendl;
8816 err = -CEPHFS_ESTALE;
8817 } else if (!info.ancestors.empty()) {
8818 if (info.ancestors[0] == backtrace.ancestors[0]) {
8819 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8820 err = -CEPHFS_EINVAL;
8821 } else {
8822 info.last_err = 0;
8823 }
8824 }
8825 }
8826 if (err) {
8827 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8828 if (info.last_err)
8829 err = info.last_err;
8830 open_ino_finish(ino, info, err);
8831 return;
8832 }
8833
8834 dout(10) << " got backtrace " << backtrace << dendl;
8835 info.ancestors = backtrace.ancestors;
8836
8837 _open_ino_traverse_dir(ino, info, 0);
8838 }
8839
8840 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8841 {
8842 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8843
8844 open_ino_info_t& info = opening_inodes.at(ino);
8845
8846 CInode *in = get_inode(ino);
8847 if (in) {
8848 dout(10) << " found cached " << *in << dendl;
8849 open_ino_finish(ino, info, in->authority().first);
8850 return;
8851 }
8852
8853 if (ret == mds->get_nodeid()) {
8854 _open_ino_traverse_dir(ino, info, 0);
8855 } else {
8856 if (ret >= 0) {
8857 mds_rank_t checked_rank = mds_rank_t(ret);
8858 info.check_peers = true;
8859 info.auth_hint = checked_rank;
8860 info.checked.erase(checked_rank);
8861 }
8862 do_open_ino(ino, info, ret);
8863 }
8864 }
8865
8866 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8867 {
8868 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8869
8870 CInode *in = get_inode(ino);
8871 if (in) {
8872 dout(10) << " found cached " << *in << dendl;
8873 open_ino_finish(ino, info, in->authority().first);
8874 return;
8875 }
8876
8877 if (ret) {
8878 do_open_ino(ino, info, ret);
8879 return;
8880 }
8881
8882 mds_rank_t hint = info.auth_hint;
8883 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8884 info.discover, info.want_xlocked, &hint);
8885 if (ret > 0)
8886 return;
8887 if (hint != mds->get_nodeid())
8888 info.auth_hint = hint;
8889 do_open_ino(ino, info, ret);
8890 }
8891
8892 void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
8893 {
8894 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8895 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8896 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8897 if (mds->logger)
8898 mds->logger->inc(l_mds_openino_dir_fetch);
8899 }
8900
// Walk the ancestor chain of 'ino' (ancestors[0] is handled as the immediate
// parent, see the i == 0 checks) looking for the first ancestor directory
// inode that is present in cache, and try to make progress from there.
//
// m:            peer request that triggered the walk; it is only handed on to
//               the C_MDC_OpenInoTraverseDir retry contexts created here.
// discover:     when true, remote dirfrags/dentries/mdsdirs are discovered.
// want_xlocked: handed to discover_path() for the immediate parent only.
// hint:         optional out-parameter; receives the authority of the dirfrag
//               (or of the directory inode) when the loop ends on ancestor 0.
//
// Returns 1 when a retry context was queued (the caller must wait), otherwise
// 0 or a negative error (-CEPHFS_ENOTDIR / -CEPHFS_ENOENT, both only set for
// ancestor 0).
8901 int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
8902 const vector<inode_backpointer_t>& ancestors,
8903 bool discover, bool want_xlocked, mds_rank_t *hint)
8904 {
8905 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8906 int err = 0;
8907 for (unsigned i = 0; i < ancestors.size(); i++) {
8908 const auto& ancestor = ancestors.at(i);
8909 CInode *diri = get_inode(ancestor.dirino);
8910
// ancestor not in cache: open a foreign mdsdir if allowed, else try the
// next ancestor in the list
8911 if (!diri) {
8912 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8913 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8914 return 1;
8915 }
8916 continue;
8917 }
8918
// ancestor inode is still REJOINUNDEF: climb parent dirfrags while both the
// dirfrag and its inode are REJOINUNDEF, then fetch the dirfrag reached
8919 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8920 CDir *dir = diri->get_parent_dir();
8921 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8922 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8923 dir = dir->get_inode()->get_parent_dir();
8924 _open_ino_fetch_dir(ino, m, dir, i == 0);
8925 return 1;
8926 }
8927
8928 if (!diri->is_dir()) {
8929 dout(10) << " " << *diri << " is not dir" << dendl;
8930 if (i == 0)
8931 err = -CEPHFS_ENOTDIR;
8932 break;
8933 }
8934
// locate the dirfrag that would hold the dentry named in the backpointer
8935 const string& name = ancestor.dname;
8936 frag_t fg = diri->pick_dirfrag(name);
8937 CDir *dir = diri->get_dirfrag(fg);
8938 if (!dir) {
8939 if (diri->is_auth()) {
8940 if (diri->is_frozen()) {
8941 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8942 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8943 return 1;
8944 }
8945 dir = diri->get_or_open_dirfrag(this, fg);
8946 } else if (discover) {
8947 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8948 return 1;
8949 }
8950 }
8951 if (dir) {
// next_ino is the inode expected below this ancestor; it is only used in the
// log messages below
8952 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
8953 CDentry *dn = dir->lookup(name);
8954 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8955 if (dir->is_auth()) {
8956 if (dnl && dnl->is_primary() &&
8957 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8958 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8959 _open_ino_fetch_dir(ino, m, dir, i == 0);
8960 return 1;
8961 }
8962
// no linkage in an incomplete dirfrag: fetch it unless the bloom filter
// rules the name out
8963 if (!dnl && !dir->is_complete() &&
8964 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8965 dout(10) << " fetching incomplete " << *dir << dendl;
8966 _open_ino_fetch_dir(ino, m, dir, i == 0);
8967 return 1;
8968 }
8969
8970 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8971 if (i == 0)
8972 err = -CEPHFS_ENOENT;
8973 } else if (discover) {
8974 if (!dnl) {
8975 filepath path(name, 0);
8976 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8977 (i == 0 && want_xlocked));
8978 return 1;
8979 }
8980 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8981 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8982 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8983 return 1;
8984 }
8985 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8986 if (i == 0)
8987 err = -CEPHFS_ENOENT;
8988 }
8989 }
8990 if (hint && i == 0)
8991 *hint = dir ? dir->authority().first : diri->authority().first;
// the first cached ancestor that did not trigger a wait ends the walk
8992 break;
8993 }
8994 return err;
8995 }
8996
8997 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8998 {
8999 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9000
9001 MDSContext::vec waiters;
9002 waiters.swap(info.waiters);
9003 opening_inodes.erase(ino);
9004 finish_contexts(g_ceph_context, waiters, ret);
9005 }
9006
9007 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
9008 {
9009 if (err < 0 && err != -CEPHFS_EAGAIN) {
9010 info.checked.clear();
9011 info.checking = MDS_RANK_NONE;
9012 info.check_peers = true;
9013 info.fetch_backtrace = true;
9014 if (info.discover) {
9015 info.discover = false;
9016 info.ancestors.clear();
9017 }
9018 if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR)
9019 info.last_err = err;
9020 }
9021
9022 if (info.check_peers || info.discover) {
9023 if (info.discover) {
9024 // got backtrace from peer, but failed to find inode. re-check peers
9025 info.discover = false;
9026 info.ancestors.clear();
9027 info.checked.clear();
9028 }
9029 info.check_peers = false;
9030 info.checking = MDS_RANK_NONE;
9031 do_open_ino_peer(ino, info);
9032 } else if (info.fetch_backtrace) {
9033 info.check_peers = true;
9034 info.fetch_backtrace = false;
9035 info.checking = mds->get_nodeid();
9036 info.checked.clear();
9037 C_IO_MDC_OpenInoBacktraceFetched *fin =
9038 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
9039 fetch_backtrace(ino, info.pool, fin->bl,
9040 new C_OnFinisher(fin, mds->finisher));
9041 } else {
9042 ceph_assert(!info.ancestors.empty());
9043 info.checking = mds->get_nodeid();
9044 open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(),
9045 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
9046 }
9047 }
9048
9049 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
9050 {
9051 set<mds_rank_t> all, active;
9052 mds->mdsmap->get_mds_set(all);
9053 if (mds->get_state() == MDSMap::STATE_REJOIN)
9054 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
9055 else
9056 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9057
9058 dout(10) << "do_open_ino_peer " << ino << " active " << active
9059 << " all " << all << " checked " << info.checked << dendl;
9060
9061 mds_rank_t whoami = mds->get_nodeid();
9062 mds_rank_t peer = MDS_RANK_NONE;
9063 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
9064 if (active.count(info.auth_hint)) {
9065 peer = info.auth_hint;
9066 info.auth_hint = MDS_RANK_NONE;
9067 }
9068 } else {
9069 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9070 if (*p != whoami && info.checked.count(*p) == 0) {
9071 peer = *p;
9072 break;
9073 }
9074 }
9075 if (peer < 0) {
9076 all.erase(whoami);
9077 if (all != info.checked) {
9078 dout(10) << " waiting for more peers to be active" << dendl;
9079 } else {
9080 dout(10) << " all MDS peers have been checked " << dendl;
9081 do_open_ino(ino, info, 0);
9082 }
9083 } else {
9084 info.checking = peer;
9085 vector<inode_backpointer_t> *pa = NULL;
9086 // got backtrace from peer or backtrace just fetched
9087 if (info.discover || !info.fetch_backtrace)
9088 pa = &info.ancestors;
9089 mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
9090 if (mds->logger)
9091 mds->logger->inc(l_mds_openino_peer_discover);
9092 }
9093 }
9094
9095 void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
9096 {
9097 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9098 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
9099 return;
9100 }
9101
9102 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
9103
9104 auto from = mds_rank_t(m->get_source().num());
9105 inodeno_t ino = m->ino;
9106 ref_t<MMDSOpenInoReply> reply;
9107 CInode *in = get_inode(ino);
9108 if (in) {
9109 dout(10) << " have " << *in << dendl;
9110 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
9111 if (in->is_auth()) {
9112 touch_inode(in);
9113 while (1) {
9114 CDentry *pdn = in->get_parent_dn();
9115 if (!pdn)
9116 break;
9117 CInode *diri = pdn->get_dir()->get_inode();
9118 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
9119 in->get_version()));
9120 in = diri;
9121 }
9122 } else {
9123 reply->hint = in->authority().first;
9124 }
9125 } else if (err < 0) {
9126 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
9127 } else {
9128 mds_rank_t hint = MDS_RANK_NONE;
9129 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
9130 if (ret > 0)
9131 return;
9132 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
9133 }
9134 mds->send_message_mds(reply, from);
9135 }
9136
9137 void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
9138 {
9139 dout(10) << "handle_open_ino_reply " << *m << dendl;
9140
9141 inodeno_t ino = m->ino;
9142 mds_rank_t from = mds_rank_t(m->get_source().num());
9143 auto it = opening_inodes.find(ino);
9144 if (it != opening_inodes.end() && it->second.checking == from) {
9145 open_ino_info_t& info = it->second;
9146 info.checking = MDS_RANK_NONE;
9147 info.checked.insert(from);
9148
9149 CInode *in = get_inode(ino);
9150 if (in) {
9151 dout(10) << " found cached " << *in << dendl;
9152 open_ino_finish(ino, info, in->authority().first);
9153 } else if (!m->ancestors.empty()) {
9154 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9155 if (!info.want_replica) {
9156 open_ino_finish(ino, info, from);
9157 return;
9158 }
9159
9160 info.ancestors = m->ancestors;
9161 info.auth_hint = from;
9162 info.checking = mds->get_nodeid();
9163 info.discover = true;
9164 _open_ino_traverse_dir(ino, info, 0);
9165 } else if (m->error) {
9166 dout(10) << " error " << m->error << " from mds." << from << dendl;
9167 do_open_ino(ino, info, m->error);
9168 } else {
9169 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9170 info.auth_hint = m->hint;
9171 info.checked.erase(m->hint);
9172 }
9173 do_open_ino_peer(ino, info);
9174 }
9175 }
9176 }
9177
9178 void MDCache::kick_open_ino_peers(mds_rank_t who)
9179 {
9180 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9181
9182 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9183 p != opening_inodes.end();
9184 ++p) {
9185 open_ino_info_t& info = p->second;
9186 if (info.checking == who) {
9187 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9188 info.checking = MDS_RANK_NONE;
9189 do_open_ino_peer(p->first, info);
9190 } else if (info.checking == MDS_RANK_NONE) {
9191 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9192 do_open_ino_peer(p->first, info);
9193 }
9194 }
9195 }
9196
9197 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
9198 bool want_replica, bool want_xlocked,
9199 vector<inode_backpointer_t> *ancestors_hint,
9200 mds_rank_t auth_hint)
9201 {
9202 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
9203 << want_replica << dendl;
9204
9205 auto it = opening_inodes.find(ino);
9206 if (it != opening_inodes.end()) {
9207 open_ino_info_t& info = it->second;
9208 if (want_replica) {
9209 info.want_replica = true;
9210 if (want_xlocked && !info.want_xlocked) {
9211 if (!info.ancestors.empty()) {
9212 CInode *diri = get_inode(info.ancestors[0].dirino);
9213 if (diri) {
9214 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
9215 CDir *dir = diri->get_dirfrag(fg);
9216 if (dir && !dir->is_auth()) {
9217 filepath path(info.ancestors[0].dname, 0);
9218 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
9219 }
9220 }
9221 }
9222 info.want_xlocked = true;
9223 }
9224 }
9225 info.waiters.push_back(fin);
9226 } else {
9227 open_ino_info_t& info = opening_inodes[ino];
9228 info.want_replica = want_replica;
9229 info.want_xlocked = want_xlocked;
9230 info.tid = ++open_ino_last_tid;
9231 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9232 info.waiters.push_back(fin);
9233 if (auth_hint != MDS_RANK_NONE)
9234 info.auth_hint = auth_hint;
9235 if (ancestors_hint) {
9236 info.ancestors = std::move(*ancestors_hint);
9237 info.fetch_backtrace = false;
9238 info.checking = mds->get_nodeid();
9239 _open_ino_traverse_dir(ino, info, 0);
9240 } else {
9241 do_open_ino(ino, info, 0);
9242 }
9243 }
9244 }
9245
9246 /* ---------------------------- */
9247
9248 /*
9249 * search for a given inode on MDS peers. optionally start with the given node.
9250
9251
9252 TODO
9253 - recover from mds node failure, recovery
9254 - traverse path
9255
9256 */
9257 void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
9258 mds_rank_t hint, bool path_locked)
9259 {
9260 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
9261 CInode *in = get_inode(ino);
9262 if (in && in->state_test(CInode::STATE_PURGING)) {
9263 c->complete(-CEPHFS_ESTALE);
9264 return;
9265 }
9266 ceph_assert(!in);
9267
9268 ceph_tid_t tid = ++find_ino_peer_last_tid;
9269 find_ino_peer_info_t& fip = find_ino_peer[tid];
9270 fip.ino = ino;
9271 fip.tid = tid;
9272 fip.fin = c;
9273 fip.path_locked = path_locked;
9274 fip.hint = hint;
9275 _do_find_ino_peer(fip);
9276 }
9277
9278 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9279 {
9280 set<mds_rank_t> all, active;
9281 mds->mdsmap->get_mds_set(all);
9282 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9283
9284 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9285 << " active " << active << " all " << all
9286 << " checked " << fip.checked
9287 << dendl;
9288
9289 mds_rank_t m = MDS_RANK_NONE;
9290 if (fip.hint >= 0) {
9291 m = fip.hint;
9292 fip.hint = MDS_RANK_NONE;
9293 } else {
9294 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9295 if (*p != mds->get_nodeid() &&
9296 fip.checked.count(*p) == 0) {
9297 m = *p;
9298 break;
9299 }
9300 }
9301 if (m == MDS_RANK_NONE) {
9302 all.erase(mds->get_nodeid());
9303 if (all != fip.checked) {
9304 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9305 } else {
9306 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9307 fip.fin->complete(-CEPHFS_ESTALE);
9308 find_ino_peer.erase(fip.tid);
9309 }
9310 } else {
9311 fip.checking = m;
9312 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
9313 }
9314 }
9315
9316 void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
9317 {
9318 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9319 return;
9320 }
9321
9322 dout(10) << "handle_find_ino " << *m << dendl;
9323 auto r = make_message<MMDSFindInoReply>(m->tid);
9324 CInode *in = get_inode(m->ino);
9325 if (in) {
9326 in->make_path(r->path);
9327 dout(10) << " have " << r->path << " " << *in << dendl;
9328 }
9329 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
9330 }
9331
9332
9333 void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
9334 {
9335 auto p = find_ino_peer.find(m->tid);
9336 if (p != find_ino_peer.end()) {
9337 dout(10) << "handle_find_ino_reply " << *m << dendl;
9338 find_ino_peer_info_t& fip = p->second;
9339
9340 // success?
9341 if (get_inode(fip.ino)) {
9342 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9343 mds->queue_waiter(fip.fin);
9344 find_ino_peer.erase(p);
9345 return;
9346 }
9347
9348 mds_rank_t from = mds_rank_t(m->get_source().num());
9349 if (fip.checking == from)
9350 fip.checking = MDS_RANK_NONE;
9351 fip.checked.insert(from);
9352
9353 if (!m->path.empty()) {
9354 // we got a path!
9355 vector<CDentry*> trace;
9356 CF_MDS_RetryMessageFactory cf(mds, m);
9357 MDRequestRef null_ref;
9358 int flags = MDS_TRAVERSE_DISCOVER;
9359 if (fip.path_locked)
9360 flags |= MDS_TRAVERSE_PATH_LOCKED;
9361 int r = path_traverse(null_ref, cf, m->path, flags, &trace);
9362 if (r > 0)
9363 return;
9364 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9365 << ", retrying" << dendl;
9366 fip.checked.clear();
9367 _do_find_ino_peer(fip);
9368 } else {
9369 // nope, continue.
9370 _do_find_ino_peer(fip);
9371 }
9372 } else {
9373 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9374 }
9375 }
9376
9377 void MDCache::kick_find_ino_peers(mds_rank_t who)
9378 {
9379 // find_ino_peers requests we should move on from
9380 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9381 p != find_ino_peer.end();
9382 ++p) {
9383 find_ino_peer_info_t& fip = p->second;
9384 if (fip.checking == who) {
9385 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9386 fip.checking = MDS_RANK_NONE;
9387 _do_find_ino_peer(fip);
9388 } else if (fip.checking == MDS_RANK_NONE) {
9389 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9390 _do_find_ino_peer(fip);
9391 }
9392 }
9393 }
9394
9395 /* ---------------------------- */
9396
9397 int MDCache::get_num_client_requests()
9398 {
9399 int count = 0;
9400 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9401 p != active_requests.end();
9402 ++p) {
9403 MDRequestRef& mdr = p->second;
9404 if (mdr->reqid.name.is_client() && !mdr->is_peer())
9405 count++;
9406 }
9407 return count;
9408 }
9409
9410 MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
9411 {
9412 // did we win a forward race against a peer?
9413 if (active_requests.count(req->get_reqid())) {
9414 MDRequestRef& mdr = active_requests[req->get_reqid()];
9415 ceph_assert(mdr);
9416 if (mdr->is_peer()) {
9417 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9418 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9419 } else {
9420 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9421 }
9422 return MDRequestRef();
9423 }
9424
9425 // register new client request
9426 MDRequestImpl::Params params;
9427 params.reqid = req->get_reqid();
9428 params.attempt = req->get_num_fwd();
9429 params.client_req = req;
9430 params.initiated = req->get_recv_stamp();
9431 params.throttled = req->get_throttle_stamp();
9432 params.all_read = req->get_recv_complete_stamp();
9433 params.dispatched = req->get_dispatch_stamp();
9434
9435 MDRequestRef mdr =
9436 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9437 active_requests[params.reqid] = mdr;
9438 mdr->set_op_stamp(req->get_stamp());
9439 dout(7) << "request_start " << *mdr << dendl;
9440 return mdr;
9441 }
9442
9443 MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
9444 {
9445 int by = m->get_source().num();
9446 MDRequestImpl::Params params;
9447 params.reqid = ri;
9448 params.attempt = attempt;
9449 params.triggering_peer_req = m;
9450 params.peer_to = by;
9451 params.initiated = m->get_recv_stamp();
9452 params.throttled = m->get_throttle_stamp();
9453 params.all_read = m->get_recv_complete_stamp();
9454 params.dispatched = m->get_dispatch_stamp();
9455 MDRequestRef mdr =
9456 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9457 ceph_assert(active_requests.count(mdr->reqid) == 0);
9458 active_requests[mdr->reqid] = mdr;
9459 dout(7) << "request_start_peer " << *mdr << " by mds." << by << dendl;
9460 return mdr;
9461 }
9462
9463 MDRequestRef MDCache::request_start_internal(int op)
9464 {
9465 utime_t now = ceph_clock_now();
9466 MDRequestImpl::Params params;
9467 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9468 params.reqid.tid = mds->issue_tid();
9469 params.initiated = now;
9470 params.throttled = now;
9471 params.all_read = now;
9472 params.dispatched = now;
9473 params.internal_op = op;
9474 MDRequestRef mdr =
9475 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9476
9477 ceph_assert(active_requests.count(mdr->reqid) == 0);
9478 active_requests[mdr->reqid] = mdr;
9479 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9480 return mdr;
9481 }
9482
9483 MDRequestRef MDCache::request_get(metareqid_t rid)
9484 {
9485 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9486 ceph_assert(p != active_requests.end());
9487 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9488 return p->second;
9489 }
9490
9491 void MDCache::request_finish(MDRequestRef& mdr)
9492 {
9493 dout(7) << "request_finish " << *mdr << dendl;
9494 mdr->mark_event("finishing request");
9495
9496 // peer finisher?
9497 if (mdr->has_more() && mdr->more()->peer_commit) {
9498 Context *fin = mdr->more()->peer_commit;
9499 mdr->more()->peer_commit = 0;
9500 int ret;
9501 if (mdr->aborted) {
9502 mdr->aborted = false;
9503 ret = -1;
9504 mdr->more()->peer_rolling_back = true;
9505 } else {
9506 ret = 0;
9507 mdr->committing = true;
9508 }
9509 fin->complete(ret); // this must re-call request_finish.
9510 return;
9511 }
9512
9513 switch(mdr->internal_op) {
9514 case CEPH_MDS_OP_FRAGMENTDIR:
9515 logger->inc(l_mdss_ireq_fragmentdir);
9516 break;
9517 case CEPH_MDS_OP_EXPORTDIR:
9518 logger->inc(l_mdss_ireq_exportdir);
9519 break;
9520 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9521 logger->inc(l_mdss_ireq_enqueue_scrub);
9522 break;
9523 case CEPH_MDS_OP_FLUSH:
9524 logger->inc(l_mdss_ireq_flush);
9525 break;
9526 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9527 logger->inc(l_mdss_ireq_fragstats);
9528 break;
9529 case CEPH_MDS_OP_REPAIR_INODESTATS:
9530 logger->inc(l_mdss_ireq_inodestats);
9531 break;
9532 }
9533
9534 request_cleanup(mdr);
9535 }
9536
9537
9538 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9539 {
9540 CachedStackStringStream css;
9541 *css << "forwarding request to mds." << who;
9542 mdr->mark_event(css->strv());
9543 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9544 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9545 << *mdr->client_request << dendl;
9546 if (mdr->is_batch_head()) {
9547 mdr->release_batch_op()->forward(who);
9548 } else {
9549 mds->forward_message_mds(mdr->release_client_request(), who);
9550 }
9551 if (mds->logger) mds->logger->inc(l_mds_forward);
9552 } else if (mdr->internal_op >= 0) {
9553 dout(10) << "request_forward on internal op; cancelling" << dendl;
9554 mdr->internal_op_finish->complete(-CEPHFS_EXDEV);
9555 } else {
9556 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9557 << " was from mds" << dendl;
9558 }
9559 request_cleanup(mdr);
9560 }
9561
9562
9563 void MDCache::dispatch_request(MDRequestRef& mdr)
9564 {
9565 if (mdr->client_request) {
9566 mds->server->dispatch_client_request(mdr);
9567 } else if (mdr->peer_request) {
9568 mds->server->dispatch_peer_request(mdr);
9569 } else {
9570 switch (mdr->internal_op) {
9571 case CEPH_MDS_OP_FRAGMENTDIR:
9572 dispatch_fragment_dir(mdr);
9573 break;
9574 case CEPH_MDS_OP_EXPORTDIR:
9575 migrator->dispatch_export_dir(mdr, 0);
9576 break;
9577 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9578 enqueue_scrub_work(mdr);
9579 break;
9580 case CEPH_MDS_OP_FLUSH:
9581 flush_dentry_work(mdr);
9582 break;
9583 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9584 repair_dirfrag_stats_work(mdr);
9585 break;
9586 case CEPH_MDS_OP_REPAIR_INODESTATS:
9587 repair_inode_stats_work(mdr);
9588 break;
9589 case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
9590 rdlock_dirfrags_stats_work(mdr);
9591 break;
9592 default:
9593 ceph_abort();
9594 }
9595 }
9596 }
9597
9598
9599 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9600 {
9601 if (!mdr->has_more())
9602 return;
9603
9604 // clean up peers
9605 // (will implicitly drop remote dn pins)
9606 for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin();
9607 p != mdr->more()->peers.end();
9608 ++p) {
9609 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
9610 MMDSPeerRequest::OP_FINISH);
9611
9612 if (mdr->killed && !mdr->committing) {
9613 r->mark_abort();
9614 } else if (mdr->more()->srcdn_auth_mds == *p &&
9615 mdr->more()->inode_import.length() > 0) {
9616 // information about rename imported caps
9617 r->inode_export = std::move(mdr->more()->inode_import);
9618 }
9619
9620 mds->send_message_mds(r, *p);
9621 }
9622
9623 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9624 * implicitly. Note that we don't call the finishers -- there shouldn't
9625 * be any on a remote lock and the request finish wakes up all
9626 * the waiters anyway! */
9627
9628 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9629 SimpleLock *lock = it->lock;
9630 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9631 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9632 << " on " << lock->get_parent() << dendl;
9633 lock->put_xlock();
9634 mdr->locks.erase(it++);
9635 } else if (it->is_remote_wrlock()) {
9636 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9637 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9638 if (it->is_wrlock()) {
9639 it->clear_remote_wrlock();
9640 ++it;
9641 } else {
9642 mdr->locks.erase(it++);
9643 }
9644 } else {
9645 ++it;
9646 }
9647 }
9648
9649 mdr->more()->peers.clear(); /* we no longer have requests out to them, and
9650 * leaving them in can cause double-notifies as
9651 * this function can get called more than once */
9652 }
9653
9654 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9655 {
9656 request_drop_foreign_locks(mdr);
9657 mds->locker->drop_non_rdlocks(mdr.get());
9658 }
9659
9660 void MDCache::request_drop_locks(MDRequestRef& mdr)
9661 {
9662 request_drop_foreign_locks(mdr);
9663 mds->locker->drop_locks(mdr.get());
9664 }
9665
9666 void MDCache::request_cleanup(MDRequestRef& mdr)
9667 {
9668 dout(15) << "request_cleanup " << *mdr << dendl;
9669
9670 if (mdr->has_more()) {
9671 if (mdr->more()->is_ambiguous_auth)
9672 mdr->clear_ambiguous_auth();
9673 if (!mdr->more()->waiting_for_finish.empty())
9674 mds->queue_waiters(mdr->more()->waiting_for_finish);
9675 }
9676
9677 request_drop_locks(mdr);
9678
9679 // drop (local) auth pins
9680 mdr->drop_local_auth_pins();
9681
9682 // drop stickydirs
9683 mdr->put_stickydirs();
9684
9685 mds->locker->kick_cap_releases(mdr);
9686
9687 // drop cache pins
9688 mdr->drop_pins();
9689
9690 // remove from session
9691 mdr->item_session_request.remove_myself();
9692
9693 // remove from map
9694 active_requests.erase(mdr->reqid);
9695
9696 if (mds->logger)
9697 log_stat();
9698
9699 mdr->mark_event("cleaned up request");
9700 }
9701
9702 void MDCache::request_kill(MDRequestRef& mdr)
9703 {
9704 // rollback peer requests is tricky. just let the request proceed.
9705 if (mdr->has_more() &&
9706 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) {
9707 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9708 ceph_assert(mdr->more()->witnessed.empty());
9709 mdr->aborted = true;
9710 dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl;
9711 } else {
9712 dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl;
9713 }
9714
9715 ceph_assert(mdr->used_prealloc_ino == 0);
9716 ceph_assert(mdr->prealloc_inos.empty());
9717
9718 mdr->session = NULL;
9719 mdr->item_session_request.remove_myself();
9720 return;
9721 }
9722
9723 mdr->killed = true;
9724 mdr->mark_event("killing request");
9725
9726 if (mdr->committing) {
9727 dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
9728 mdr->item_session_request.remove_myself();
9729 } else {
9730 dout(10) << "request_kill " << *mdr << dendl;
9731 request_cleanup(mdr);
9732 }
9733 }
9734
9735 // -------------------------------------------------------------------------------
9736 // SNAPREALMS
9737
9738 void MDCache::create_global_snaprealm()
9739 {
9740 CInode *in = new CInode(this); // dummy inode
9741 create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9742 add_inode(in);
9743 global_snaprealm = in->snaprealm;
9744 }
9745
9746 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
9747 {
9748 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9749
9750 vector<inodeno_t> split_inos;
9751 vector<inodeno_t> split_realms;
9752
9753 if (notify_clients) {
9754 if (snapop == CEPH_SNAP_OP_SPLIT) {
9755 // notify clients of update|split
9756 for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p)
9757 split_inos.push_back((*p)->ino());
9758
9759 for (auto& r : in->snaprealm->open_children)
9760 split_realms.push_back(r->inode->ino());
9761 }
9762 }
9763
9764 map<client_t, ref_t<MClientSnap>> updates;
9765 list<SnapRealm*> q;
9766 q.push_back(in->snaprealm);
9767 while (!q.empty()) {
9768 SnapRealm *realm = q.front();
9769 q.pop_front();
9770
9771 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9772 realm->invalidate_cached_snaps();
9773
9774 if (notify_clients) {
9775 for (const auto& p : realm->client_caps) {
9776 const auto& client = p.first;
9777 const auto& caps = p.second;
9778 ceph_assert(!caps->empty());
9779
9780 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9781 if (em.second) {
9782 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
9783 update->head.split = in->ino();
9784 update->split_inos = split_inos;
9785 update->split_realms = split_realms;
9786 update->bl = in->snaprealm->get_snap_trace();
9787 em.first->second = std::move(update);
9788 }
9789 }
9790 }
9791
9792 // notify for active children, too.
9793 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9794 for (auto& r : realm->open_children)
9795 q.push_back(r);
9796 }
9797
9798 if (notify_clients)
9799 send_snaps(updates);
9800 }
9801
9802 void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
9803 {
9804 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9805 ceph_assert(in->is_auth());
9806
9807 set<mds_rank_t> mds_set;
9808 if (stid > 0) {
9809 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9810 mds_set.erase(mds->get_nodeid());
9811 } else {
9812 in->list_replicas(mds_set);
9813 }
9814
9815 if (!mds_set.empty()) {
9816 bufferlist snap_blob;
9817 in->encode_snap(snap_blob);
9818
9819 for (auto p : mds_set) {
9820 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
9821 m->snap_blob = snap_blob;
9822 mds->send_message_mds(m, p);
9823 }
9824 }
9825
9826 if (stid > 0)
9827 notify_global_snaprealm_update(snap_op);
9828 }
9829
9830 void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
9831 {
9832 mds_rank_t from = mds_rank_t(m->get_source().num());
9833 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
9834
9835 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9836 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9837 return;
9838 }
9839
9840 // null rejoin_done means open_snaprealms() has already been called
9841 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9842 (mds->is_rejoin() && !rejoin_done);
9843
9844 if (m->get_tid() > 0) {
9845 mds->snapclient->notify_commit(m->get_tid());
9846 if (notify_clients)
9847 notify_global_snaprealm_update(m->get_snap_op());
9848 }
9849
9850 CInode *in = get_inode(m->get_ino());
9851 if (in) {
9852 ceph_assert(!in->is_auth());
9853 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9854 (mds->is_rejoin() && !in->is_rejoining())) {
9855 auto p = m->snap_blob.cbegin();
9856 in->decode_snap(p);
9857
9858 if (!notify_clients) {
9859 if (!rejoin_pending_snaprealms.count(in)) {
9860 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9861 rejoin_pending_snaprealms.insert(in);
9862 }
9863 }
9864 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9865 }
9866 }
9867 }
9868
9869 void MDCache::notify_global_snaprealm_update(int snap_op)
9870 {
9871 if (snap_op != CEPH_SNAP_OP_DESTROY)
9872 snap_op = CEPH_SNAP_OP_UPDATE;
9873 set<Session*> sessions;
9874 mds->sessionmap.get_client_session_set(sessions);
9875 for (auto &session : sessions) {
9876 if (!session->is_open() && !session->is_stale())
9877 continue;
9878 auto update = make_message<MClientSnap>(snap_op);
9879 update->head.split = global_snaprealm->inode->ino();
9880 update->bl = global_snaprealm->get_snap_trace();
9881 mds->send_message_client_counted(update, session);
9882 }
9883 }
9884
9885 // -------------------------------------------------------------------------------
9886 // STRAYS
9887
9888 struct C_MDC_RetryScanStray : public MDCacheContext {
9889 dirfrag_t next;
9890 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9891 void finish(int r) override {
9892 mdcache->scan_stray_dir(next);
9893 }
9894 };
9895
9896 void MDCache::scan_stray_dir(dirfrag_t next)
9897 {
9898 dout(10) << "scan_stray_dir " << next << dendl;
9899
9900 if (next.ino)
9901 next.frag = strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()];
9902
9903 for (int i = 0; i < NUM_STRAY; ++i) {
9904 if (strays[i]->ino() < next.ino)
9905 continue;
9906
9907 std::vector<CDir*> ls;
9908 strays[i]->get_dirfrags(ls);
9909
9910 for (const auto& dir : ls) {
9911 if (dir->get_frag() < next.frag)
9912 continue;
9913
9914 if (!dir->can_auth_pin()) {
9915 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag()));
9916 return;
9917 }
9918
9919 if (!dir->is_complete()) {
9920 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9921 return;
9922 }
9923
9924 for (auto &p : dir->items) {
9925 CDentry *dn = p.second;
9926 dn->state_set(CDentry::STATE_STRAY);
9927 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9928 if (dnl->is_primary()) {
9929 CInode *in = dnl->get_inode();
9930 if (in->get_inode()->nlink == 0)
9931 in->state_set(CInode::STATE_ORPHAN);
9932 maybe_eval_stray(in);
9933 }
9934 }
9935 }
9936 }
9937 }
9938
9939 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9940 {
9941 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9942 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9943 if (mds->logger)
9944 mds->logger->inc(l_mds_openino_backtrace_fetch);
9945 }
9946
9947
9948
9949
9950
9951 // ========================================================================================
9952 // DISCOVER
9953 /*
9954
9955 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9956 to the parent metadata object in the cache (pinning it).
9957
9958 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9959
9960 */
9961
9962 void MDCache::_send_discover(discover_info_t& d)
9963 {
9964 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
9965 d.want_base_dir, d.path_locked);
9966 dis->set_tid(d.tid);
9967 mds->send_message_mds(dis, d.mds);
9968 }
9969
9970 void MDCache::discover_base_ino(inodeno_t want_ino,
9971 MDSContext *onfinish,
9972 mds_rank_t from)
9973 {
9974 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9975 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9976 discover_info_t& d = _create_discover(from);
9977 d.ino = want_ino;
9978 _send_discover(d);
9979 }
9980 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9981 }
9982
9983
9984 void MDCache::discover_dir_frag(CInode *base,
9985 frag_t approx_fg,
9986 MDSContext *onfinish,
9987 mds_rank_t from)
9988 {
9989 if (from < 0)
9990 from = base->authority().first;
9991
9992 dirfrag_t df(base->ino(), approx_fg);
9993 dout(7) << "discover_dir_frag " << df
9994 << " from mds." << from << dendl;
9995
9996 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9997 discover_info_t& d = _create_discover(from);
9998 d.pin_base(base);
9999 d.ino = base->ino();
10000 d.frag = approx_fg;
10001 d.want_base_dir = true;
10002 _send_discover(d);
10003 }
10004
10005 if (onfinish)
10006 base->add_dir_waiter(approx_fg, onfinish);
10007 }
10008
10009 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10010 CInode *base;
10011 snapid_t snapid;
10012 filepath path;
10013 mds_rank_t from;
10014 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10015 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10016 void finish(int r) override {
10017 mdcache->discover_path(base, snapid, path, 0, from);
10018 }
10019 };
10020
10021 void MDCache::discover_path(CInode *base,
10022 snapid_t snap,
10023 filepath want_path,
10024 MDSContext *onfinish,
10025 bool path_locked,
10026 mds_rank_t from)
10027 {
10028 if (from < 0)
10029 from = base->authority().first;
10030
10031 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
10032 << (path_locked ? " path_locked":"")
10033 << dendl;
10034
10035 if (base->is_ambiguous_auth()) {
10036 dout(10) << " waiting for single auth on " << *base << dendl;
10037 if (!onfinish)
10038 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
10039 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
10040 return;
10041 } else if (from == mds->get_nodeid()) {
10042 MDSContext::vec finished;
10043 base->take_waiting(CInode::WAIT_DIR, finished);
10044 mds->queue_waiters(finished);
10045 return;
10046 }
10047
10048 frag_t fg = base->pick_dirfrag(want_path[0]);
10049 if ((path_locked && want_path.depth() == 1) ||
10050 !base->is_waiting_for_dir(fg) || !onfinish) {
10051 discover_info_t& d = _create_discover(from);
10052 d.ino = base->ino();
10053 d.pin_base(base);
10054 d.frag = fg;
10055 d.snap = snap;
10056 d.want_path = want_path;
10057 d.want_base_dir = true;
10058 d.path_locked = path_locked;
10059 _send_discover(d);
10060 }
10061
10062 // register + wait
10063 if (onfinish)
10064 base->add_dir_waiter(fg, onfinish);
10065 }
10066
10067 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
10068 CDir *base;
10069 snapid_t snapid;
10070 filepath path;
10071 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
10072 MDCacheContext(c), base(b), snapid(s), path(p) {}
10073 void finish(int r) override {
10074 mdcache->discover_path(base, snapid, path, 0);
10075 }
10076 };
10077
10078 void MDCache::discover_path(CDir *base,
10079 snapid_t snap,
10080 filepath want_path,
10081 MDSContext *onfinish,
10082 bool path_locked)
10083 {
10084 mds_rank_t from = base->authority().first;
10085
10086 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
10087 << (path_locked ? " path_locked":"")
10088 << dendl;
10089
10090 if (base->is_ambiguous_auth()) {
10091 dout(7) << " waiting for single auth on " << *base << dendl;
10092 if (!onfinish)
10093 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
10094 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
10095 return;
10096 } else if (from == mds->get_nodeid()) {
10097 MDSContext::vec finished;
10098 base->take_sub_waiting(finished);
10099 mds->queue_waiters(finished);
10100 return;
10101 }
10102
10103 if ((path_locked && want_path.depth() == 1) ||
10104 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
10105 discover_info_t& d = _create_discover(from);
10106 d.ino = base->ino();
10107 d.pin_base(base->inode);
10108 d.frag = base->get_frag();
10109 d.snap = snap;
10110 d.want_path = want_path;
10111 d.want_base_dir = false;
10112 d.path_locked = path_locked;
10113 _send_discover(d);
10114 }
10115
10116 // register + wait
10117 if (onfinish)
10118 base->add_dentry_waiter(want_path[0], snap, onfinish);
10119 }
10120
10121 void MDCache::kick_discovers(mds_rank_t who)
10122 {
10123 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10124 p != discovers.end();
10125 ++p) {
10126 if (p->second.mds != who)
10127 continue;
10128 _send_discover(p->second);
10129 }
10130 }
10131
10132
// Handle an MDiscover request from another MDS rank.
//
// Starting at the requested base inode, this walks the wanted path (or just
// the requested base dirfrag) and appends replica encodings of each dirfrag,
// dentry and inode to the MDiscoverReply trace, registering the requester as
// a replica via the encode_replica_*() helpers.
//
// The walk stops early (sending whatever was gathered so far) when it reaches
// something it cannot serve: a non-directory, a dirfrag this rank is not auth
// for, a frozen object, an incomplete dirfrag, a purging or xlocked dentry,
// or a null/remote dentry. When nothing has been added to the reply yet, the
// same conditions instead either set an error/hint in the reply or park the
// request on a waiter (C_MDS_RetryMessage) and return without replying.
//
// The reply is sent on the connection the request arrived on and is asserted
// to be non-empty.
10133 void MDCache::handle_discover(const cref_t<MDiscover> &dis)
10134 {
10135 mds_rank_t whoami = mds->get_nodeid();
10136 mds_rank_t from = mds_rank_t(dis->get_source().num());
10137
10138 ceph_assert(from != whoami);
10139
// While this rank is at or before REJOIN: drop the request if we are before
// REJOIN and not heading there; otherwise only serve requesters that are
// themselves in rejoin, and delay everyone else.
10140 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10141 if (mds->get_state() < MDSMap::STATE_REJOIN &&
10142 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
10143 return;
10144 }
10145
10146 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10147 // delay processing request from survivor because we may not yet choose lock states.
10148 if (!mds->mdsmap->is_rejoin(from)) {
10149 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
10150 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10151 return;
10152 }
10153 }
10154
10155
// cur is the inode the traversal currently stands on; it stays null when the
// base inode is not in cache, which skips the traversal loop below.
10156 CInode *cur = 0;
10157 auto reply = make_message<MDiscoverReply>(*dis);
10158
10159 snapid_t snapid = dis->get_snapid();
10160
10161 // get started.
10162 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10163 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10164 // wants root
10165 dout(7) << "handle_discover from mds." << from
10166 << " wants base + " << dis->get_want().get_path()
10167 << " snap " << snapid
10168 << dendl;
10169
10170 cur = get_inode(dis->get_base_ino());
10171 ceph_assert(cur);
10172
10173 // add root
10174 reply->starts_with = MDiscoverReply::INODE;
10175 encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
10176 dout(10) << "added base " << *cur << dendl;
10177 }
10178 else {
10179 // there's a base inode
// Look up the base at the requested snapid first; for a snapshot request fall
// back to the head inode, but only if it is multiversion.
10180 cur = get_inode(dis->get_base_ino(), snapid);
10181 if (!cur && snapid != CEPH_NOSNAP) {
10182 cur = get_inode(dis->get_base_ino());
10183 if (cur && !cur->is_multiversion())
10184 cur = NULL; // nope!
10185 }
10186
10187 if (!cur) {
10188 dout(7) << "handle_discover mds." << from
10189 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10190 << dendl;
10191 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10192 reply->set_error_dentry(dis->get_dentry(0));
10193 reply->set_flag_error_dir();
10194 } else if (dis->wants_base_dir()) {
10195 dout(7) << "handle_discover mds." << from
10196 << " wants basedir+" << dis->get_want().get_path()
10197 << " has " << *cur
10198 << dendl;
10199 } else {
10200 dout(7) << "handle_discover mds." << from
10201 << " wants " << dis->get_want().get_path()
10202 << " has " << *cur
10203 << dendl;
10204 }
10205 }
10206
10207 ceph_assert(reply);
10208
10209 // add content
10210 // do some fidgeting to include a dir if they asked for the base dir, or just root.
// One iteration per wanted path component. With an empty wanted path the loop
// body still runs (depth() == 0 keeps the condition true) and leaves through
// one of the breaks/returns below once the dirfrag has been handled.
10211 for (unsigned i = 0;
10212 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10213 i++) {
10214
10215 // -- figure out the dir
10216
10217 // is *cur even a dir at all?
10218 if (!cur->is_dir()) {
10219 dout(7) << *cur << " not a dir" << dendl;
10220 reply->set_flag_error_dir();
10221 break;
10222 }
10223
10224 // pick frag
10225 frag_t fg;
10226 if (dis->get_want().depth()) {
10227 // dentry specifies
10228 fg = cur->pick_dirfrag(dis->get_dentry(i));
10229 } else {
10230 // requester explicitly specified the frag
10231 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
10232 fg = dis->get_base_dir_frag();
10233 if (!cur->dirfragtree.is_leaf(fg))
10234 fg = cur->dirfragtree[fg.value()];
10235 }
10236 CDir *curdir = cur->get_dirfrag(fg);
10237
// Not authoritative for this dirfrag (or it is not open and we are not auth
// for the inode): stop here, possibly leaving an auth hint for the requester.
10238 if ((!curdir && !cur->is_auth()) ||
10239 (curdir && !curdir->is_auth())) {
10240
10241 /* before:
10242 * ONLY set flag if empty!!
10243 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10244 * resulting in duplicate discovers in flight,
10245 * which can wreak havoc when discovering rename srcdn (which may move)
10246 */
10247
10248 if (reply->is_empty()) {
10249 // only hint if empty.
10250 // someday this could be better, but right now the waiter logic isn't smart enough.
10251
10252 // hint
10253 if (curdir) {
10254 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10255 reply->set_dir_auth_hint(curdir->authority().first);
10256 } else {
10257 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10258 << *cur << dendl;
10259 reply->set_dir_auth_hint(cur->authority().first);
10260 }
10261
10262 // note error dentry, if any
10263 // NOTE: important, as it allows requester to issue an equivalent discover
10264 // to whomever we hint at.
10265 if (dis->get_want().depth() > i)
10266 reply->set_error_dentry(dis->get_dentry(i));
10267 }
10268
10269 break;
10270 }
10271
// Frozen handling: with a non-empty reply, send what we have; with an empty
// reply, wait for the unfreeze and retry the whole message.
10272 if (!curdir) { // open dir?
10273 if (cur->is_frozen()) {
10274 if (!reply->is_empty()) {
10275 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10276 break;
10277 }
10278 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10279 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10280 return;
10281 }
10282 curdir = cur->get_or_open_dirfrag(this, fg);
10283 } else if (curdir->is_frozen_tree() ||
10284 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
10285 if (!reply->is_empty()) {
10286 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10287 break;
10288 }
10289 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10290 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10291 reply->set_flag_error_dir();
10292 break;
10293 }
10294 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10295 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10296 return;
10297 }
10298
10299 // add dir
// A dirfrag at version 0 is not encoded here; the lookup section below leaves
// dn null for it, which leads into the fetch path.
10300 if (curdir->get_version() == 0) {
10301 // fetch newly opened dir
10302 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10303 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10304 // make sure the base frag is correct, though, in case there was a refragment since the
10305 // original request was sent.
10306 reply->set_base_dir_frag(curdir->get_frag());
10307 } else {
10308 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
10309 if (!reply->trace.length())
10310 reply->starts_with = MDiscoverReply::DIR;
10311 encode_replica_dir(curdir, from, reply->trace);
10312 dout(7) << "handle_discover added dir " << *curdir << dendl;
10313 }
10314
10315 // lookup
10316 CDentry *dn = 0;
10317 if (curdir->get_version() == 0) {
10318 // fetch newly opened dir
10319 ceph_assert(!curdir->has_bloom());
10320 } else if (dis->get_want().depth() > 0) {
10321 // lookup dentry
10322 dn = curdir->lookup(dis->get_dentry(i), snapid);
10323 } else
10324 break; // done!
10325
10326 // incomplete dir?
10327 if (!dn) {
// Fetch the dirfrag unless it is complete, or (for head lookups) its bloom
// filter says the name is not present.
10328 if (!curdir->is_complete() &&
10329 !(snapid == CEPH_NOSNAP &&
10330 curdir->has_bloom() &&
10331 !curdir->is_in_bloom(dis->get_dentry(i)))) {
10332 // readdir
10333 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10334 if (reply->is_empty()) {
10335 // fetch and wait
10336 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10337 dis->wants_base_dir() && curdir->get_version() == 0);
10338 return;
10339 } else {
10340 // initiate fetch, but send what we have so far
10341 curdir->fetch(0);
10342 break;
10343 }
10344 }
10345
10346 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10347 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10348 << " dne, non-empty reply, stopping" << dendl;
10349 break;
10350 }
10351
10352 // send null dentry
10353 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10354 << *curdir << dendl;
10355 if (snapid == CEPH_NOSNAP)
10356 dn = curdir->add_null_dentry(dis->get_dentry(i));
10357 else
10358 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
10359 }
10360 ceph_assert(dn);
10361
10362 // don't add replica to purging dentry/inode
10363 if (dn->state_test(CDentry::STATE_PURGING)) {
10364 if (reply->is_empty())
10365 reply->set_flag_error_dn(dis->get_dentry(i));
10366 break;
10367 }
10368
10369 CDentry::linkage_t *dnl = dn->get_linkage();
10370
10371 // xlocked dentry?
10372 // ...always block on non-tail items (they are unrelated)
10373 // ...allow xlocked tail discovery _only_ if explicitly requested
10374 if (dn->lock.is_xlocked()) {
10375 // is this the last (tail) item in the discover traversal?
10376 if (dis->is_path_locked()) {
10377 dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
10378 } else if (reply->is_empty()) {
10379 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10380 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
10381 return;
10382 } else {
10383 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10384 break;
10385 }
10386 }
10387
10388 // frozen inode?
10389 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
10390 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10391 if (tailitem && dis->is_path_locked()) {
10392 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10393 } else if (reply->is_empty()) {
10394 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
// NOTE(review): the waiter is added to an inode but uses the CDir::WAIT_UNFREEZE
// constant (the frozen-inode path above uses CInode::WAIT_UNFREEZE) -- confirm
// that the two constants have the same value.
10395 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10396 return;
10397 } else {
10398 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10399 break;
10400 }
10401 }
10402
10403 // add dentry
10404 if (!reply->trace.length())
10405 reply->starts_with = MDiscoverReply::DENTRY;
10406 encode_replica_dentry(dn, from, reply->trace);
10407 dout(7) << "handle_discover added dentry " << *dn << dendl;
10408
10409 if (!dnl->is_primary()) break; // stop on null or remote link.
10410
10411 // add inode
10412 CInode *next = dnl->get_inode();
10413 ceph_assert(next->is_auth());
10414
10415 encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10416 dout(7) << "handle_discover added inode " << *next << dendl;
10417
10418 // descend, keep going.
10419 cur = next;
10420 continue;
10421 }
10422
10423 // how did we do?
10424 ceph_assert(!reply->is_empty());
10425 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10426 mds->send_message(reply, dis->get_connection());
10427 }
10428
// Handle an MDiscoverReply from another MDS rank.
//
// The matching entry in `discovers` (by tid) is dropped, then the reply trace
// is decoded into local replicas via the decode_replica_*() helpers, starting
// from the base inode named in the message. Waiters collected while decoding
// go into `finished`. Afterwards the error flags / dir_auth_hint in the reply
// decide whether waiters are woken, failed with -CEPHFS_ENOENT (the `error`
// list), or whether a follow-up discover is sent.
10429 void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
10430 {
10431 /*
10432 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10433 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10434 return;
10435 }
10436 */
10437 dout(7) << "discover_reply " << *m << dendl;
10438 if (m->is_flag_error_dir())
10439 dout(7) << " flag error, dir" << dendl;
10440 if (m->is_flag_error_dn())
10441 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10442
// finished: waiters to requeue normally; error: waiters completed with
// -CEPHFS_ENOENT at the end of this function.
10443 MDSContext::vec finished, error;
10444 mds_rank_t from = mds_rank_t(m->get_source().num());
10445
10446 // starting point
10447 CInode *cur = get_inode(m->get_base_ino());
10448 auto p = m->trace.cbegin();
10449
10450 int next = m->starts_with;
10451
10452 // decrement discover counters
10453 if (m->get_tid()) {
// Note: this map iterator `p` shadows the trace iterator `p` declared above,
// for the duration of this block only.
10454 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10455 if (p != discovers.end()) {
10456 dout(10) << " found tid " << m->get_tid() << dendl;
10457 discovers.erase(p);
10458 } else {
10459 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10460 }
10461 }
10462
10463 // discover may start with an inode
10464 if (!p.end() && next == MDiscoverReply::INODE) {
10465 decode_replica_inode(cur, p, NULL, finished);
10466 dout(7) << "discover_reply got base inode " << *cur << dendl;
10467 ceph_assert(cur->is_base());
10468
10469 next = MDiscoverReply::DIR;
10470
10471 // take waiters?
10472 if (cur->is_base() &&
10473 waiting_for_base_ino[from].count(cur->ino())) {
10474 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10475 waiting_for_base_ino[from].erase(cur->ino());
10476 }
10477 }
10478 ceph_assert(cur);
10479
10480 // loop over discover results.
10481 // indexes follow each ([[dir] dentry] inode)
10482 // can start, end with any type.
10483 while (!p.end()) {
10484 // dir
10485 frag_t fg;
10486 CDir *curdir = nullptr;
10487 if (next == MDiscoverReply::DIR) {
10488 decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
// The decoded frag differs from the one the reply names as its base frag:
// wake the waiters registered on that base frag as well.
10489 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10490 ceph_assert(m->get_wanted_base_dir());
10491 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10492 }
10493 } else {
10494 // note: this can only happen our first way around this loop.
// NOTE(review): nothing advances p between the loop condition (!p.end()) and
// this test, so `p.end() && ...` looks like it can never be true here and the
// else branch is always taken -- confirm.
10495 if (p.end() && m->is_flag_error_dn()) {
10496 fg = cur->pick_dirfrag(m->get_error_dentry());
10497 curdir = cur->get_dirfrag(fg);
10498 } else
10499 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10500 }
10501
10502 if (p.end())
10503 break;
10504
10505 // dentry
10506 CDentry *dn = nullptr;
10507 decode_replica_dentry(dn, p, curdir, finished);
10508
10509 if (p.end())
10510 break;
10511
10512 // inode
10513 decode_replica_inode(cur, p, dn, finished);
10514
10515 next = MDiscoverReply::DIR;
10516 }
10517
10518 // dir error?
10519 // or dir_auth hint?
10520 if (m->is_flag_error_dir() && !cur->is_dir()) {
10521 // not a dir.
10522 cur->take_waiting(CInode::WAIT_DIR, error);
10523 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
// A hint that points back at this rank is discarded (who = -1); the follow-up
// discover_dir_frag() call below then falls back to the inode's authority.
10524 mds_rank_t who = m->get_dir_auth_hint();
10525 if (who == mds->get_nodeid()) who = -1;
10526 if (who >= 0)
10527 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10528
10529
10530 if (m->get_wanted_base_dir()) {
10531 frag_t fg = m->get_base_dir_frag();
10532 CDir *dir = cur->get_dirfrag(fg);
10533
10534 if (cur->is_waiting_for_dir(fg)) {
10535 if (cur->is_auth())
10536 cur->take_waiting(CInode::WAIT_DIR, finished);
10537 else if (dir || !cur->dirfragtree.is_leaf(fg))
10538 cur->take_dir_waiting(fg, finished);
10539 else
10540 discover_dir_frag(cur, fg, 0, who);
10541 } else
10542 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10543 }
10544
10545 // try again?
10546 if (m->get_error_dentry().length()) {
10547 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10548 CDir *dir = cur->get_dirfrag(fg);
10549 // wanted a dentry
10550 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10551 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10552 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10553 m->get_wanted_snapid(), finished);
10554 } else {
10555 filepath relpath(m->get_error_dentry(), 0);
10556 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
10557 }
10558 } else
// This message is also reached when `dir` is null, not only when nobody waits.
10559 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10560 << m->get_error_dentry() << dendl;
10561 }
10562 } else if (m->is_flag_error_dn()) {
10563 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10564 CDir *dir = cur->get_dirfrag(fg);
10565 if (dir) {
10566 if (dir->is_auth()) {
10567 dir->take_sub_waiting(finished);
10568 } else {
10569 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10570 m->get_wanted_snapid(), error);
10571 }
10572 }
10573 }
10574
10575 // waiters
10576 finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT); // finish errors directly
10577 mds->queue_waiters(finished);
10578 }
10579
10580
10581
10582 // ----------------------------
10583 // REPLICAS
10584
10585
10586 void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10587 {
10588 ENCODE_START(1, 1, bl);
10589 dirfrag_t df = dir->dirfrag();
10590 encode(df, bl);
10591 __u32 nonce = dir->add_replica(to);
10592 encode(nonce, bl);
10593 dir->_encode_base(bl);
10594 ENCODE_FINISH(bl);
10595 }
10596
10597 void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10598 {
10599 ENCODE_START(2, 1, bl);
10600 encode(dn->get_name(), bl);
10601 encode(dn->last, bl);
10602
10603 __u32 nonce = dn->add_replica(to);
10604 encode(nonce, bl);
10605 encode(dn->first, bl);
10606 encode(dn->linkage.remote_ino, bl);
10607 encode(dn->linkage.remote_d_type, bl);
10608 dn->lock.encode_state_for_replica(bl);
10609 bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
10610 encode(need_recover, bl);
10611 encode(dn->alternate_name, bl);
10612 ENCODE_FINISH(bl);
10613 }
10614
10615 void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10616 uint64_t features)
10617 {
10618 ceph_assert(in->is_auth());
10619
10620 ENCODE_START(2, 1, bl);
10621 encode(in->ino(), bl); // bleh, minor assymetry here
10622 encode(in->last, bl);
10623
10624 __u32 nonce = in->add_replica(to);
10625 encode(nonce, bl);
10626
10627 in->_encode_base(bl, features);
10628 in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10629
10630 __u32 state = in->state;
10631 encode(state, bl);
10632
10633 ENCODE_FINISH(bl);
10634 }
10635
10636 void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
10637 MDSContext::vec& finished)
10638 {
10639 DECODE_START(1, p);
10640 dirfrag_t df;
10641 decode(df, p);
10642
10643 ceph_assert(diri->ino() == df.ino);
10644
10645 // add it (_replica_)
10646 dir = diri->get_dirfrag(df.frag);
10647
10648 if (dir) {
10649 // had replica. update w/ new nonce.
10650 __u32 nonce;
10651 decode(nonce, p);
10652 dir->set_replica_nonce(nonce);
10653 dir->_decode_base(p);
10654 dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
10655 } else {
10656 // force frag to leaf in the diri tree
10657 if (!diri->dirfragtree.is_leaf(df.frag)) {
10658 dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
10659 << diri->dirfragtree << dendl;
10660 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10661 }
10662 // add replica.
10663 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10664 __u32 nonce;
10665 decode(nonce, p);
10666 dir->set_replica_nonce(nonce);
10667 dir->_decode_base(p);
10668 // is this a dir_auth delegation boundary?
10669 if (from != diri->authority().first ||
10670 diri->is_ambiguous_auth() ||
10671 diri->is_base())
10672 adjust_subtree_auth(dir, from);
10673
10674 dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
10675 // get waiters
10676 diri->take_dir_waiting(df.frag, finished);
10677 }
10678 DECODE_FINISH(p);
10679 }
10680
10681 void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
10682 {
10683 DECODE_START(1, p);
10684 string name;
10685 snapid_t last;
10686 decode(name, p);
10687 decode(last, p);
10688
10689 dn = dir->lookup(name, last);
10690
10691 // have it?
10692 bool is_new = false;
10693 if (dn) {
10694 is_new = false;
10695 dout(7) << __func__ << " had " << *dn << dendl;
10696 } else {
10697 is_new = true;
10698 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10699 dout(7) << __func__ << " added " << *dn << dendl;
10700 }
10701
10702 __u32 nonce;
10703 decode(nonce, p);
10704 dn->set_replica_nonce(nonce);
10705 decode(dn->first, p);
10706
10707 inodeno_t rino;
10708 unsigned char rdtype;
10709 decode(rino, p);
10710 decode(rdtype, p);
10711 dn->lock.decode_state(p, is_new);
10712
10713 bool need_recover;
10714 decode(need_recover, p);
10715
10716 mempool::mds_co::string alternate_name;
10717 if (struct_v >= 2) {
10718 decode(alternate_name, p);
10719 }
10720
10721 if (is_new) {
10722 dn->set_alternate_name(std::move(alternate_name));
10723 if (rino)
10724 dir->link_remote_inode(dn, rino, rdtype);
10725 if (need_recover)
10726 dn->lock.mark_need_recover();
10727 } else {
10728 ceph_assert(dn->alternate_name == alternate_name);
10729 }
10730
10731 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10732 DECODE_FINISH(p);
10733 }
10734
10735 void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
10736 {
10737 DECODE_START(2, p);
10738 inodeno_t ino;
10739 snapid_t last;
10740 __u32 nonce;
10741 decode(ino, p);
10742 decode(last, p);
10743 decode(nonce, p);
10744 in = get_inode(ino, last);
10745 if (!in) {
10746 in = new CInode(this, false, 2, last);
10747 in->set_replica_nonce(nonce);
10748 in->_decode_base(p);
10749 in->_decode_locks_state_for_replica(p, true);
10750 add_inode(in);
10751 if (in->ino() == CEPH_INO_ROOT)
10752 in->inode_auth.first = 0;
10753 else if (in->is_mdsdir())
10754 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10755 dout(10) << __func__ << " added " << *in << dendl;
10756 if (dn) {
10757 ceph_assert(dn->get_linkage()->is_null());
10758 dn->dir->link_primary_inode(dn, in);
10759 }
10760 } else {
10761 in->set_replica_nonce(nonce);
10762 in->_decode_base(p);
10763 in->_decode_locks_state_for_replica(p, false);
10764 dout(10) << __func__ << " had " << *in << dendl;
10765 }
10766
10767 if (dn) {
10768 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10769 dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
10770 }
10771
10772 if (struct_v >= 2) {
10773 __u32 s;
10774 decode(s, p);
10775 s &= CInode::MASK_STATE_REPLICATED;
10776 if (s & CInode::STATE_RANDEPHEMERALPIN) {
10777 dout(10) << "replica inode is random ephemeral pinned" << dendl;
10778 in->set_ephemeral_pin(false, true);
10779 }
10780 }
10781
10782 DECODE_FINISH(p);
10783 }
10784
10785
10786 void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10787 {
10788 ceph_assert(straydn->get_num_auth_pins());
10789 ENCODE_START(1, 1, bl);
10790 uint64_t features = mds->mdsmap->get_up_features();
10791 encode_replica_inode(get_myin(), who, bl, features);
10792 encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10793 encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10794 encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
10795 encode_replica_dir(straydn->get_dir(), who, bl);
10796 encode_replica_dentry(straydn, who, bl);
10797 ENCODE_FINISH(bl);
10798 }
10799
10800 void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
10801 {
10802 MDSContext::vec finished;
10803 auto p = bl.cbegin();
10804
10805 DECODE_START(1, p);
10806 CInode *mdsin = nullptr;
10807 decode_replica_inode(mdsin, p, NULL, finished);
10808 CDir *mdsdir = nullptr;
10809 decode_replica_dir(mdsdir, p, mdsin, from, finished);
10810 CDentry *straydirdn = nullptr;
10811 decode_replica_dentry(straydirdn, p, mdsdir, finished);
10812 CInode *strayin = nullptr;
10813 decode_replica_inode(strayin, p, straydirdn, finished);
10814 CDir *straydir = nullptr;
10815 decode_replica_dir(straydir, p, strayin, from, finished);
10816
10817 decode_replica_dentry(straydn, p, straydir, finished);
10818 if (!finished.empty())
10819 mds->queue_waiters(finished);
10820 DECODE_FINISH(p);
10821 }
10822
10823
10824 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10825 {
10826 // this is an FYI, re: replication
10827
10828 set<mds_rank_t> who;
10829 if (bcast) {
10830 set<mds_rank_t> mds_set;
10831 mds->get_mds_map()->get_active_mds_set(mds_set);
10832
10833 set<mds_rank_t> replica_set;
10834 for (const auto &p : dir->get_replicas()) {
10835 replica_set.insert(p.first);
10836 }
10837
10838 std::set_difference(mds_set.begin(), mds_set.end(),
10839 replica_set.begin(), replica_set.end(),
10840 std::inserter(who, who.end()));
10841 } else {
10842 for (const auto &p : dir->get_replicas()) {
10843 who.insert(p.first);
10844 }
10845 }
10846
10847 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10848
10849 filepath path;
10850 dir->inode->make_path(path);
10851
10852 std::set<int32_t> dir_rep_set;
10853 for (const auto &r : dir->dir_rep_by) {
10854 dir_rep_set.insert(r);
10855 }
10856
10857 mds_rank_t whoami = mds->get_nodeid();
10858 for (set<mds_rank_t>::iterator it = who.begin();
10859 it != who.end();
10860 ++it) {
10861 if (*it == whoami) continue;
10862 //if (*it == except) continue;
10863 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10864
10865 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, dir_rep_set, path, bcast), *it);
10866 }
10867
10868 return 0;
10869 }
10870
10871 void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
10872 {
10873 dirfrag_t df = m->get_dirfrag();
10874 CDir *dir = get_dirfrag(df);
10875 if (!dir) {
10876 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10877
10878 // discover it?
10879 if (m->should_discover()) {
10880 // only try once!
10881 // this is key to avoid a fragtree update race, among other things.
10882 m->inc_tried_discover();
10883 vector<CDentry*> trace;
10884 CInode *in;
10885 filepath path = m->get_path();
10886 dout(5) << "trying discover on dir_update for " << path << dendl;
10887 CF_MDS_RetryMessageFactory cf(mds, m);
10888 MDRequestRef null_ref;
10889 int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
10890 if (r > 0)
10891 return;
10892 if (r == 0 &&
10893 in->ino() == df.ino &&
10894 in->get_approx_dirfrag(df.frag) == NULL) {
10895 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10896 return;
10897 }
10898 }
10899
10900 return;
10901 }
10902
10903 if (!m->has_tried_discover()) {
10904 // Update if it already exists. Othwerwise it got updated by discover reply.
10905 dout(5) << "dir_update on " << *dir << dendl;
10906 dir->dir_rep = m->get_dir_rep();
10907 dir->dir_rep_by.clear();
10908 for (const auto &e : m->get_dir_rep_by()) {
10909 dir->dir_rep_by.insert(e);
10910 }
10911 }
10912 }
10913
10914
10915
10916
10917
10918 // LINK
10919
10920 void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10921 {
10922 ENCODE_START(1, 1, bl);
10923 inodeno_t ino = dnl->get_remote_ino();
10924 encode(ino, bl);
10925 __u8 d_type = dnl->get_remote_d_type();
10926 encode(d_type, bl);
10927 ENCODE_FINISH(bl);
10928 }
10929
10930 void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10931 {
10932 DECODE_START(1, p);
10933 inodeno_t ino;
10934 __u8 d_type;
10935 decode(ino, p);
10936 decode(d_type, p);
10937 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
10938 dir->link_remote_inode(dn, ino, d_type);
10939 DECODE_FINISH(p);
10940 }
10941
10942 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10943 {
10944 dout(7) << __func__ << " " << *dn << dendl;
10945
10946 CDir *subtree = get_subtree_root(dn->get_dir());
10947 for (const auto &p : dn->get_replicas()) {
10948 // don't tell (rename) witnesses; they already know
10949 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10950 continue;
10951 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10952 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10953 rejoin_gather.count(p.first)))
10954 continue;
10955 CDentry::linkage_t *dnl = dn->get_linkage();
10956 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
10957 if (dnl->is_primary()) {
10958 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
10959 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
10960 mds->mdsmap->get_up_features());
10961 } else if (dnl->is_remote()) {
10962 encode_remote_dentry_link(dnl, m->bl);
10963 } else
10964 ceph_abort(); // aie, bad caller!
10965 mds->send_message_mds(m, p.first);
10966 }
10967 }
10968
10969 void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
10970 {
10971 CDentry *dn = NULL;
10972 CDir *dir = get_dirfrag(m->get_dirfrag());
10973 if (!dir) {
10974 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
10975 } else {
10976 dn = dir->lookup(m->get_dn());
10977 if (!dn) {
10978 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
10979 } else {
10980 dout(7) << __func__ << " on " << *dn << dendl;
10981 CDentry::linkage_t *dnl = dn->get_linkage();
10982
10983 ceph_assert(!dn->is_auth());
10984 ceph_assert(dnl->is_null());
10985 }
10986 }
10987
10988 auto p = m->bl.cbegin();
10989 MDSContext::vec finished;
10990 if (dn) {
10991 if (m->get_is_primary()) {
10992 // primary link.
10993 CInode *in = nullptr;
10994 decode_replica_inode(in, p, dn, finished);
10995 } else {
10996 // remote link, easy enough.
10997 decode_remote_dentry_link(dir, dn, p);
10998 }
10999 } else {
11000 ceph_abort();
11001 }
11002
11003 if (!finished.empty())
11004 mds->queue_waiters(finished);
11005
11006 return;
11007 }
11008
11009
11010 // UNLINK
11011
11012 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11013 {
11014 dout(10) << __func__ << " " << *dn << dendl;
11015 // share unlink news with replicas
11016 set<mds_rank_t> replicas;
11017 dn->list_replicas(replicas);
11018 bufferlist snapbl;
11019 if (straydn) {
11020 straydn->list_replicas(replicas);
11021 CInode *strayin = straydn->get_linkage()->get_inode();
11022 strayin->encode_snap_blob(snapbl);
11023 }
11024 for (set<mds_rank_t>::iterator it = replicas.begin();
11025 it != replicas.end();
11026 ++it) {
11027 // don't tell (rmdir) witnesses; they already know
11028 if (mdr.get() && mdr->more()->witnessed.count(*it))
11029 continue;
11030
11031 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11032 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11033 rejoin_gather.count(*it)))
11034 continue;
11035
11036 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11037 if (straydn) {
11038 encode_replica_stray(straydn, *it, unlink->straybl);
11039 unlink->snapbl = snapbl;
11040 }
11041 mds->send_message_mds(unlink, *it);
11042 }
11043 }
11044
11045 void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
11046 {
11047 // straydn
11048 CDentry *straydn = nullptr;
11049 if (m->straybl.length())
11050 decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));
11051
11052 CDir *dir = get_dirfrag(m->get_dirfrag());
11053 if (!dir) {
11054 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
11055 } else {
11056 CDentry *dn = dir->lookup(m->get_dn());
11057 if (!dn) {
11058 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
11059 } else {
11060 dout(7) << __func__ << " on " << *dn << dendl;
11061 CDentry::linkage_t *dnl = dn->get_linkage();
11062
11063 // open inode?
11064 if (dnl->is_primary()) {
11065 CInode *in = dnl->get_inode();
11066 dn->dir->unlink_inode(dn);
11067 ceph_assert(straydn);
11068 straydn->dir->link_primary_inode(straydn, in);
11069
11070 // in->first is lazily updated on replica; drag it forward so
11071 // that we always keep it in sync with the dnq
11072 ceph_assert(straydn->first >= in->first);
11073 in->first = straydn->first;
11074
11075 // update subtree map?
11076 if (in->is_dir())
11077 adjust_subtree_after_rename(in, dir, false);
11078
11079 if (m->snapbl.length()) {
11080 bool hadrealm = (in->snaprealm ? true : false);
11081 in->decode_snap_blob(m->snapbl);
11082 ceph_assert(in->snaprealm);
11083 if (!hadrealm)
11084 do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
11085 }
11086
11087 // send caps to auth (if we're not already)
11088 if (in->is_any_caps() &&
11089 !in->state_test(CInode::STATE_EXPORTINGCAPS))
11090 migrator->export_caps(in);
11091
11092 straydn = NULL;
11093 } else {
11094 ceph_assert(!straydn);
11095 ceph_assert(dnl->is_remote());
11096 dn->dir->unlink_inode(dn);
11097 }
11098 ceph_assert(dnl->is_null());
11099 }
11100 }
11101
11102 // race with trim_dentry()
11103 if (straydn) {
11104 ceph_assert(straydn->get_num_ref() == 0);
11105 ceph_assert(straydn->get_linkage()->is_null());
11106 expiremap ex;
11107 trim_dentry(straydn, ex);
11108 send_expire_messages(ex);
11109 }
11110 }
11111
11112
11113
11114
11115
11116
11117 // ===================================================================
11118
11119
11120
11121 // ===================================================================
11122 // FRAGMENT
11123
11124
11125 /**
11126 * adjust_dir_fragments -- adjust fragmentation for a directory
11127 *
11128 * @param diri directory inode
11129 * @param basefrag base fragment
11130 * @param bits bit adjustment. positive for split, negative for merge.
11131 */
11132 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11133 std::vector<CDir*>* resultfrags,
11134 MDSContext::vec& waiters,
11135 bool replay)
11136 {
11137 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11138 << " on " << *diri << dendl;
11139
11140 auto&& p = diri->get_dirfrags_under(basefrag);
11141
11142 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11143 }
11144
11145 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
11146 {
11147 CDir *dir = diri->get_dirfrag(fg);
11148 if (dir)
11149 return dir;
11150
11151 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
11152
11153 std::vector<CDir*> src, result;
11154 MDSContext::vec waiters;
11155
11156 // split a parent?
11157 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
11158 while (1) {
11159 CDir *pdir = diri->get_dirfrag(parent);
11160 if (pdir) {
11161 int split = fg.bits() - parent.bits();
11162 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
11163 src.push_back(pdir);
11164 adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
11165 dir = diri->get_dirfrag(fg);
11166 if (dir) {
11167 dout(10) << "force_dir_fragment result " << *dir << dendl;
11168 break;
11169 }
11170 }
11171 if (parent == frag_t())
11172 break;
11173 frag_t last = parent;
11174 parent = parent.parent();
11175 dout(10) << " " << last << " parent is " << parent << dendl;
11176 }
11177
11178 if (!dir) {
11179 // hoover up things under fg?
11180 {
11181 auto&& p = diri->get_dirfrags_under(fg);
11182 src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
11183 }
11184 if (src.empty()) {
11185 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
11186 } else {
11187 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
11188 adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
11189 dir = result.front();
11190 dout(10) << "force_dir_fragment result " << *dir << dendl;
11191 }
11192 }
11193 if (!replay)
11194 mds->queue_waiters(waiters);
11195 return dir;
11196 }
11197
11198 void MDCache::adjust_dir_fragments(CInode *diri,
11199 const std::vector<CDir*>& srcfrags,
11200 frag_t basefrag, int bits,
11201 std::vector<CDir*>* resultfrags,
11202 MDSContext::vec& waiters,
11203 bool replay)
11204 {
11205 dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
11206 << " srcfrags " << srcfrags
11207 << " on " << *diri << dendl;
11208
11209 // adjust fragtree
11210 // yuck. we may have discovered the inode while it was being fragmented.
11211 if (!diri->dirfragtree.is_leaf(basefrag))
11212 diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);
11213
11214 if (bits > 0)
11215 diri->dirfragtree.split(basefrag, bits);
11216 dout(10) << " new fragtree is " << diri->dirfragtree << dendl;
11217
11218 if (srcfrags.empty())
11219 return;
11220
11221 // split
11222 CDir *parent_dir = diri->get_parent_dir();
11223 CDir *parent_subtree = 0;
11224 if (parent_dir)
11225 parent_subtree = get_subtree_root(parent_dir);
11226
11227 ceph_assert(srcfrags.size() >= 1);
11228 if (bits > 0) {
11229 // SPLIT
11230 ceph_assert(srcfrags.size() == 1);
11231 CDir *dir = srcfrags.front();
11232
11233 dir->split(bits, resultfrags, waiters, replay);
11234
11235 // did i change the subtree map?
11236 if (dir->is_subtree_root()) {
11237 // new frags are now separate subtrees
11238 for (const auto& dir : *resultfrags) {
11239 subtrees[dir].clear(); // new frag is now its own subtree
11240 }
11241
11242 // was i a bound?
11243 if (parent_subtree) {
11244 ceph_assert(subtrees[parent_subtree].count(dir));
11245 subtrees[parent_subtree].erase(dir);
11246 for (const auto& dir : *resultfrags) {
11247 ceph_assert(dir->is_subtree_root());
11248 subtrees[parent_subtree].insert(dir);
11249 }
11250 }
11251
11252 // adjust my bounds.
11253 set<CDir*> bounds;
11254 bounds.swap(subtrees[dir]);
11255 subtrees.erase(dir);
11256 for (set<CDir*>::iterator p = bounds.begin();
11257 p != bounds.end();
11258 ++p) {
11259 CDir *frag = get_subtree_root((*p)->get_parent_dir());
11260 subtrees[frag].insert(*p);
11261 }
11262
11263 show_subtrees(10);
11264 }
11265
11266 diri->close_dirfrag(dir->get_frag());
11267
11268 } else {
11269 // MERGE
11270
11271 // are my constituent bits subtrees? if so, i will be too.
11272 // (it's all or none, actually.)
11273 bool any_subtree = false, any_non_subtree = false;
11274 for (const auto& dir : srcfrags) {
11275 if (dir->is_subtree_root())
11276 any_subtree = true;
11277 else
11278 any_non_subtree = true;
11279 }
11280 ceph_assert(!any_subtree || !any_non_subtree);
11281
11282 set<CDir*> new_bounds;
11283 if (any_subtree) {
11284 for (const auto& dir : srcfrags) {
11285 // this simplifies the code that find subtrees underneath the dirfrag
11286 if (!dir->is_subtree_root()) {
11287 dir->state_set(CDir::STATE_AUXSUBTREE);
11288 adjust_subtree_auth(dir, mds->get_nodeid());
11289 }
11290 }
11291
11292 for (const auto& dir : srcfrags) {
11293 ceph_assert(dir->is_subtree_root());
11294 dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
11295 map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
11296 set<CDir*>::iterator r = q->second.begin();
11297 while (r != subtrees[dir].end()) {
11298 new_bounds.insert(*r);
11299 subtrees[dir].erase(r++);
11300 }
11301 subtrees.erase(q);
11302
11303 // remove myself as my parent's bound
11304 if (parent_subtree)
11305 subtrees[parent_subtree].erase(dir);
11306 }
11307 }
11308
11309 // merge
11310 CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
11311 f->merge(srcfrags, waiters, replay);
11312
11313 if (any_subtree) {
11314 ceph_assert(f->is_subtree_root());
11315 subtrees[f].swap(new_bounds);
11316 if (parent_subtree)
11317 subtrees[parent_subtree].insert(f);
11318
11319 show_subtrees(10);
11320 }
11321
11322 resultfrags->push_back(f);
11323 }
11324 }
11325
11326
11327 class C_MDC_FragmentFrozen : public MDSInternalContext {
11328 MDCache *mdcache;
11329 MDRequestRef mdr;
11330 public:
11331 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11332 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11333 void finish(int r) override {
11334 mdcache->fragment_frozen(mdr, r);
11335 }
11336 };
11337
11338 bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
11339 {
11340 if (is_readonly()) {
11341 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11342 return false;
11343 }
11344 if (mds->is_cluster_degraded()) {
11345 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11346 return false;
11347 }
11348 if (diri->get_parent_dir() &&
11349 diri->get_parent_dir()->get_inode()->is_stray()) {
11350 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11351 return false;
11352 }
11353 if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
11354 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
11355 return false;
11356 }
11357
11358 for (const auto& dir : dirs) {
11359 if (dir->scrub_is_in_progress()) {
11360 dout(7) << "can_fragment: scrub in progress " << *dir << dendl;
11361 return false;
11362 }
11363
11364 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11365 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11366 return false;
11367 }
11368 if (!dir->is_auth()) {
11369 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11370 return false;
11371 }
11372 if (dir->is_bad()) {
11373 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11374 return false;
11375 }
11376 if (dir->is_frozen() ||
11377 dir->is_freezing()) {
11378 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11379 return false;
11380 }
11381 }
11382
11383 return true;
11384 }
11385
11386 void MDCache::split_dir(CDir *dir, int bits)
11387 {
11388 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11389 ceph_assert(dir->is_auth());
11390 CInode *diri = dir->inode;
11391
11392 std::vector<CDir*> dirs;
11393 dirs.push_back(dir);
11394
11395 if (!can_fragment(diri, dirs)) {
11396 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11397 return;
11398 }
11399
11400 if (dir->frag.bits() + bits > 24) {
11401 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11402 return;
11403 }
11404
11405 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11406 mdr->more()->fragment_base = dir->dirfrag();
11407
11408 ceph_assert(fragments.count(dir->dirfrag()) == 0);
11409 fragment_info_t& info = fragments[dir->dirfrag()];
11410 info.mdr = mdr;
11411 info.dirs.push_back(dir);
11412 info.bits = bits;
11413 info.last_cum_auth_pins_change = ceph_clock_now();
11414
11415 fragment_freeze_dirs(dirs);
11416 // initial mark+complete pass
11417 fragment_mark_and_complete(mdr);
11418 }
11419
11420 void MDCache::merge_dir(CInode *diri, frag_t frag)
11421 {
11422 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11423
11424 auto&& [all, dirs] = diri->get_dirfrags_under(frag);
11425 if (!all) {
11426 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11427 return;
11428 }
11429
11430 if (diri->dirfragtree.is_leaf(frag)) {
11431 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11432 return;
11433 }
11434
11435 if (!can_fragment(diri, dirs))
11436 return;
11437
11438 CDir *first = dirs.front();
11439 int bits = first->get_frag().bits() - frag.bits();
11440 dout(10) << " we are merging by " << bits << " bits" << dendl;
11441
11442 dirfrag_t basedirfrag(diri->ino(), frag);
11443 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11444 mdr->more()->fragment_base = basedirfrag;
11445
11446 ceph_assert(fragments.count(basedirfrag) == 0);
11447 fragment_info_t& info = fragments[basedirfrag];
11448 info.mdr = mdr;
11449 info.dirs = dirs;
11450 info.bits = -bits;
11451 info.last_cum_auth_pins_change = ceph_clock_now();
11452
11453 fragment_freeze_dirs(dirs);
11454 // initial mark+complete pass
11455 fragment_mark_and_complete(mdr);
11456 }
11457
11458 void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
11459 {
11460 bool any_subtree = false, any_non_subtree = false;
11461 for (const auto& dir : dirs) {
11462 dir->auth_pin(dir); // until we mark and complete them
11463 dir->state_set(CDir::STATE_FRAGMENTING);
11464 dir->freeze_dir();
11465 ceph_assert(dir->is_freezing_dir());
11466
11467 if (dir->is_subtree_root())
11468 any_subtree = true;
11469 else
11470 any_non_subtree = true;
11471 }
11472
11473 if (any_subtree && any_non_subtree) {
11474 // either all dirfrags are subtree roots or all are not.
11475 for (const auto& dir : dirs) {
11476 if (dir->is_subtree_root()) {
11477 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11478 } else {
11479 dir->state_set(CDir::STATE_AUXSUBTREE);
11480 adjust_subtree_auth(dir, mds->get_nodeid());
11481 }
11482 }
11483 }
11484 }
11485
11486 class C_MDC_FragmentMarking : public MDCacheContext {
11487 MDRequestRef mdr;
11488 public:
11489 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11490 void finish(int r) override {
11491 mdcache->fragment_mark_and_complete(mdr);
11492 }
11493 };
11494
11495 void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
11496 {
11497 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11498 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11499 if (it == fragments.end() || it->second.mdr != mdr) {
11500 dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
11501 request_finish(mdr);
11502 return;
11503 }
11504
11505 fragment_info_t& info = it->second;
11506 CInode *diri = info.dirs.front()->get_inode();
11507 dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
11508
11509 MDSGatherBuilder gather(g_ceph_context);
11510
11511 for (const auto& dir : info.dirs) {
11512 bool ready = true;
11513 if (!dir->is_complete()) {
11514 dout(15) << " fetching incomplete " << *dir << dendl;
11515 dir->fetch(gather.new_sub(), true); // ignore authpinnability
11516 ready = false;
11517 } else if (dir->get_frag() == frag_t()) {
11518 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11519 // the operation. To avoid CDir::fetch() complaining about missing object,
11520 // we commit new dirfrag first.
11521 if (dir->state_test(CDir::STATE_CREATING)) {
11522 dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
11523 dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
11524 ready = false;
11525 } else if (dir->is_new()) {
11526 dout(15) << " committing new " << *dir << dendl;
11527 ceph_assert(dir->is_dirty());
11528 dir->commit(0, gather.new_sub(), true);
11529 ready = false;
11530 }
11531 }
11532 if (!ready)
11533 continue;
11534
11535 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11536 dout(15) << " marking " << *dir << dendl;
11537 for (auto &p : dir->items) {
11538 CDentry *dn = p.second;
11539 dn->get(CDentry::PIN_FRAGMENTING);
11540 ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
11541 dn->state_set(CDentry::STATE_FRAGMENTING);
11542 }
11543 dir->state_set(CDir::STATE_DNPINNEDFRAG);
11544 dir->auth_unpin(dir);
11545 } else {
11546 dout(15) << " already marked " << *dir << dendl;
11547 }
11548 }
11549 if (gather.has_subs()) {
11550 gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
11551 gather.activate();
11552 return;
11553 }
11554
11555 for (const auto& dir : info.dirs) {
11556 if (!dir->is_frozen_dir()) {
11557 ceph_assert(dir->is_freezing_dir());
11558 dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
11559 }
11560 }
11561 if (gather.has_subs()) {
11562 gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
11563 gather.activate();
11564 // flush log so that request auth_pins are retired
11565 mds->mdlog->flush();
11566 return;
11567 }
11568
11569 fragment_frozen(mdr, 0);
11570 }
11571
11572 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
11573 {
11574 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11575 for (const auto& dir : dirs) {
11576 dout(10) << " frag " << *dir << dendl;
11577
11578 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
11579 dir->state_clear(CDir::STATE_FRAGMENTING);
11580
11581 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11582 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11583
11584 for (auto &p : dir->items) {
11585 CDentry *dn = p.second;
11586 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11587 dn->state_clear(CDentry::STATE_FRAGMENTING);
11588 dn->put(CDentry::PIN_FRAGMENTING);
11589 }
11590 } else {
11591 dir->auth_unpin(dir);
11592 }
11593
11594 dir->unfreeze_dir();
11595 }
11596 }
11597
11598 bool MDCache::fragment_are_all_frozen(CDir *dir)
11599 {
11600 ceph_assert(dir->is_frozen_dir());
11601 map<dirfrag_t,fragment_info_t>::iterator p;
11602 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11603 p != fragments.end() && p->first.ino == dir->ino();
11604 ++p) {
11605 if (p->first.frag.contains(dir->get_frag()))
11606 return p->second.all_frozen;
11607 }
11608 ceph_abort();
11609 return false;
11610 }
11611
11612 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11613 {
11614 map<dirfrag_t,fragment_info_t>::iterator p;
11615 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11616 p != fragments.end() && p->first.ino == dir->ino();
11617 ++p) {
11618 if (p->first.frag.contains(dir->get_frag())) {
11619 p->second.num_remote_waiters++;
11620 return;
11621 }
11622 }
11623 ceph_abort();
11624 }
11625
11626 void MDCache::find_stale_fragment_freeze()
11627 {
11628 dout(10) << "find_stale_fragment_freeze" << dendl;
11629 // see comment in Migrator::find_stale_export_freeze()
11630 utime_t now = ceph_clock_now();
11631 utime_t cutoff = now;
11632 cutoff -= g_conf()->mds_freeze_tree_timeout;
11633
11634 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
11635 p != fragments.end(); ) {
11636 dirfrag_t df = p->first;
11637 fragment_info_t& info = p->second;
11638 ++p;
11639 if (info.all_frozen)
11640 continue;
11641 CDir *dir;
11642 int total_auth_pins = 0;
11643 for (const auto& d : info.dirs) {
11644 dir = d;
11645 if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11646 total_auth_pins = -1;
11647 break;
11648 }
11649 if (dir->is_frozen_dir())
11650 continue;
11651 total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
11652 }
11653 if (total_auth_pins < 0)
11654 continue;
11655 if (info.last_cum_auth_pins != total_auth_pins) {
11656 info.last_cum_auth_pins = total_auth_pins;
11657 info.last_cum_auth_pins_change = now;
11658 continue;
11659 }
11660 if (info.last_cum_auth_pins_change >= cutoff)
11661 continue;
11662 dir = info.dirs.front();
11663 if (info.num_remote_waiters > 0 ||
11664 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
11665 dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
11666 std::vector<CDir*> dirs;
11667 info.dirs.swap(dirs);
11668 fragments.erase(df);
11669 fragment_unmark_unfreeze_dirs(dirs);
11670 }
11671 }
11672 }
11673
11674 class C_MDC_FragmentPrep : public MDCacheLogContext {
11675 MDRequestRef mdr;
11676 public:
11677 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11678 void finish(int r) override {
11679 mdcache->_fragment_logged(mdr);
11680 }
11681 };
11682
11683 class C_MDC_FragmentStore : public MDCacheContext {
11684 MDRequestRef mdr;
11685 public:
11686 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11687 void finish(int r) override {
11688 mdcache->_fragment_stored(mdr);
11689 }
11690 };
11691
11692 class C_MDC_FragmentCommit : public MDCacheLogContext {
11693 dirfrag_t basedirfrag;
11694 MDRequestRef mdr;
11695 public:
11696 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11697 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
11698 void finish(int r) override {
11699 mdcache->_fragment_committed(basedirfrag, mdr);
11700 }
11701 };
11702
11703 class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
11704 dirfrag_t basedirfrag;
11705 int bits;
11706 MDRequestRef mdr;
11707 public:
11708 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11709 const MDRequestRef& r) :
11710 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
11711 void finish(int r) override {
11712 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
11713 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
11714 }
11715 void print(ostream& out) const override {
11716 out << "fragment_purge_old(" << basedirfrag << ")";
11717 }
11718 };
11719
11720 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11721 {
11722 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11723 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11724 if (it == fragments.end() || it->second.mdr != mdr) {
11725 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11726 request_finish(mdr);
11727 return;
11728 }
11729
11730 ceph_assert(r == 0);
11731 fragment_info_t& info = it->second;
11732 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11733 << " on " << info.dirs.front()->get_inode() << dendl;
11734
11735 info.all_frozen = true;
11736 dispatch_fragment_dir(mdr);
11737 }
11738
11739 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11740 {
11741 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11742 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11743 if (it == fragments.end() || it->second.mdr != mdr) {
11744 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11745 request_finish(mdr);
11746 return;
11747 }
11748
11749 fragment_info_t& info = it->second;
11750 CInode *diri = info.dirs.front()->get_inode();
11751
11752 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11753 << " on " << *diri << dendl;
11754
11755 if (mdr->more()->peer_error)
11756 mdr->aborted = true;
11757
11758 if (!mdr->aborted) {
11759 MutationImpl::LockOpVec lov;
11760 lov.add_wrlock(&diri->dirfragtreelock);
11761 // prevent a racing gather on any other scatterlocks too
11762 lov.lock_scatter_gather(&diri->nestlock);
11763 lov.lock_scatter_gather(&diri->filelock);
11764 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
11765 if (!mdr->aborted)
11766 return;
11767 }
11768 }
11769
11770 if (mdr->aborted) {
11771 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11772 << info.dirs.front()->dirfrag() << dendl;
11773 if (info.bits > 0)
11774 mds->balancer->queue_split(info.dirs.front(), false);
11775 else
11776 mds->balancer->queue_merge(info.dirs.front());
11777 fragment_unmark_unfreeze_dirs(info.dirs);
11778 fragments.erase(it);
11779 request_finish(mdr);
11780 return;
11781 }
11782
11783 mdr->ls = mds->mdlog->get_current_segment();
11784 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11785 mds->mdlog->start_entry(le);
11786
11787 for (const auto& dir : info.dirs) {
11788 dirfrag_rollback rollback;
11789 rollback.fnode = dir->fnode;
11790 le->add_orig_frag(dir->get_frag(), &rollback);
11791 }
11792
11793 // refragment
11794 MDSContext::vec waiters;
11795 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11796 &info.resultfrags, waiters, false);
11797 if (g_conf()->mds_debug_frag)
11798 diri->verify_dirfrags();
11799 mds->queue_waiters(waiters);
11800
11801 for (const auto& fg : le->orig_frags)
11802 ceph_assert(!diri->dirfragtree.is_leaf(fg));
11803
11804 le->metablob.add_dir_context(info.resultfrags.front());
11805 for (const auto& dir : info.resultfrags) {
11806 if (diri->is_auth()) {
11807 le->metablob.add_fragmented_dir(dir, false, false);
11808 } else {
11809 dir->state_set(CDir::STATE_DIRTYDFT);
11810 le->metablob.add_fragmented_dir(dir, false, true);
11811 }
11812 }
11813
11814 // dft lock
11815 if (diri->is_auth()) {
11816 // journal dirfragtree
11817 auto pi = diri->project_inode(mdr);
11818 pi.inode->version = diri->pre_dirty();
11819 predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
11820 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11821 } else {
11822 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11823 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11824 mdr->add_updated_lock(&diri->dirfragtreelock);
11825 }
11826
11827 /*
11828 // filelock
11829 mds->locker->mark_updated_scatterlock(&diri->filelock);
11830 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11831 mut->add_updated_lock(&diri->filelock);
11832
11833 // dirlock
11834 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11835 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11836 mut->add_updated_lock(&diri->nestlock);
11837 */
11838
11839 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11840 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11841 mdr, __func__);
11842 mds->mdlog->flush();
11843 }
11844
11845 void MDCache::_fragment_logged(MDRequestRef& mdr)
11846 {
11847 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11848 auto& info = fragments.at(basedirfrag);
11849 CInode *diri = info.resultfrags.front()->get_inode();
11850
11851 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11852 << " on " << *diri << dendl;
11853 mdr->mark_event("prepare logged");
11854
11855 mdr->apply(); // mark scatterlock
11856
11857 // store resulting frags
11858 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11859
11860 for (const auto& dir : info.resultfrags) {
11861 dout(10) << " storing result frag " << *dir << dendl;
11862
11863 dir->mark_dirty(mdr->ls);
11864 dir->mark_new(mdr->ls);
11865
11866 // freeze and store them too
11867 dir->auth_pin(this);
11868 dir->state_set(CDir::STATE_FRAGMENTING);
11869 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11870 }
11871
11872 gather.activate();
11873 }
11874
11875 void MDCache::_fragment_stored(MDRequestRef& mdr)
11876 {
11877 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11878 fragment_info_t &info = fragments.at(basedirfrag);
11879 CDir *first = info.resultfrags.front();
11880 CInode *diri = first->get_inode();
11881
11882 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11883 << " on " << *diri << dendl;
11884 mdr->mark_event("new frags stored");
11885
11886 // tell peers
11887 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11888 diri->authority().first : CDIR_AUTH_UNKNOWN;
11889 for (const auto &p : first->get_replicas()) {
11890 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11891 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11892 rejoin_gather.count(p.first)))
11893 continue;
11894
11895 auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
11896 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11897 diri_auth != p.first) { // not auth mds of diri
11898 /*
11899 * In the nornal case, mds does not trim dir inode whose child dirfrags
11900 * are likely being fragmented (see trim_inode()). But when fragmenting
11901 * subtree roots, following race can happen:
11902 *
11903 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
11904 * mds.c and drops wrlock on dirfragtreelock.
11905 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
11906 * SYNC and send lock message mds.c
11907 * - mds.c receives the lock message and changes dirfragtreelock state
11908 * to SYNC
11909 * - mds.c trim dirfrag and dir inode from its cache
11910 * - mds.c receives the fragment_notify message
11911 *
11912 * So we need to ensure replicas have received the notify, then unlock
11913 * the dirfragtreelock.
11914 */
11915 notify->mark_ack_wanted();
11916 info.notify_ack_waiting.insert(p.first);
11917 }
11918
11919 // freshly replicate new dirs to peers
11920 for (const auto& dir : info.resultfrags) {
11921 encode_replica_dir(dir, p.first, notify->basebl);
11922 }
11923
11924 mds->send_message_mds(notify, p.first);
11925 }
11926
11927 // journal commit
11928 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11929 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
11930
11931
11932 // unfreeze resulting frags
11933 for (const auto& dir : info.resultfrags) {
11934 dout(10) << " result frag " << *dir << dendl;
11935
11936 for (auto &p : dir->items) {
11937 CDentry *dn = p.second;
11938 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11939 dn->state_clear(CDentry::STATE_FRAGMENTING);
11940 dn->put(CDentry::PIN_FRAGMENTING);
11941 }
11942
11943 // unfreeze
11944 dir->unfreeze_dir();
11945 }
11946
11947 if (info.notify_ack_waiting.empty()) {
11948 fragment_drop_locks(info);
11949 } else {
11950 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
11951 }
11952 }
11953
11954 void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
11955 {
11956 dout(10) << "fragment_committed " << basedirfrag << dendl;
11957 if (mdr)
11958 mdr->mark_event("commit logged");
11959
11960 ufragment &uf = uncommitted_fragments.at(basedirfrag);
11961
11962 // remove old frags
11963 C_GatherBuilder gather(
11964 g_ceph_context,
11965 new C_OnFinisher(
11966 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
11967 mds->finisher));
11968
11969 SnapContext nullsnapc;
11970 object_locator_t oloc(mds->get_metadata_pool());
11971 for (const auto& fg : uf.old_frags) {
11972 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
11973 ObjectOperation op;
11974 if (fg == frag_t()) {
11975 // backtrace object
11976 dout(10) << " truncate orphan dirfrag " << oid << dendl;
11977 op.truncate(0);
11978 op.omap_clear();
11979 } else {
11980 dout(10) << " removing orphan dirfrag " << oid << dendl;
11981 op.remove();
11982 }
11983 mds->objecter->mutate(oid, oloc, op, nullsnapc,
11984 ceph::real_clock::now(),
11985 0, gather.new_sub());
11986 }
11987
11988 ceph_assert(gather.has_subs());
11989 gather.activate();
11990 }
11991
11992 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
11993 {
11994 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
11995 if (mdr)
11996 mdr->mark_event("old frags purged");
11997
11998 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
11999 mds->mdlog->start_submit_entry(le);
12000
12001 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12002
12003 if (mds->logger) {
12004 if (bits > 0) {
12005 mds->logger->inc(l_mds_dir_split);
12006 } else {
12007 mds->logger->inc(l_mds_dir_merge);
12008 }
12009 }
12010
12011 if (mdr) {
12012 auto it = fragments.find(basedirfrag);
12013 ceph_assert(it != fragments.end());
12014 it->second.finishing = true;
12015 if (it->second.notify_ack_waiting.empty())
12016 fragment_maybe_finish(it);
12017 else
12018 mdr->mark_event("wating for notify acks");
12019 }
12020 }
12021
12022 void MDCache::fragment_drop_locks(fragment_info_t& info)
12023 {
12024 mds->locker->drop_locks(info.mdr.get());
12025 request_finish(info.mdr);
12026 //info.mdr.reset();
12027 }
12028
12029 void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12030 {
12031 if (!it->second.finishing)
12032 return;
12033
12034 // unmark & auth_unpin
12035 for (const auto &dir : it->second.resultfrags) {
12036 dir->state_clear(CDir::STATE_FRAGMENTING);
12037 dir->auth_unpin(this);
12038
12039 // In case the resulting fragments are beyond the split size,
12040 // we might need to split them again right away (they could
12041 // have been taking inserts between unfreezing and getting
12042 // here)
12043 mds->balancer->maybe_fragment(dir, false);
12044 }
12045
12046 fragments.erase(it);
12047 }
12048
12049
12050 void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
12051 {
12052 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12053 mds_rank_t from = mds_rank_t(ack->get_source().num());
12054
12055 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
12056 return;
12057 }
12058
12059 auto it = fragments.find(ack->get_base_dirfrag());
12060 if (it == fragments.end() ||
12061 it->second.get_tid() != ack->get_tid()) {
12062 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
12063 return;
12064 }
12065
12066 if (it->second.notify_ack_waiting.erase(from) &&
12067 it->second.notify_ack_waiting.empty()) {
12068 fragment_drop_locks(it->second);
12069 fragment_maybe_finish(it);
12070 }
12071 }
12072
12073 void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
12074 {
12075 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
12076 mds_rank_t from = mds_rank_t(notify->get_source().num());
12077
12078 if (mds->get_state() < MDSMap::STATE_REJOIN) {
12079 return;
12080 }
12081
12082 CInode *diri = get_inode(notify->get_ino());
12083 if (diri) {
12084 frag_t base = notify->get_basefrag();
12085 int bits = notify->get_bits();
12086
12087 /*
12088 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12089 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12090 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12091 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12092 return;
12093 }
12094 */
12095
12096 // refragment
12097 MDSContext::vec waiters;
12098 std::vector<CDir*> resultfrags;
12099 adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
12100 if (g_conf()->mds_debug_frag)
12101 diri->verify_dirfrags();
12102
12103 for (const auto& dir : resultfrags) {
12104 diri->take_dir_waiting(dir->get_frag(), waiters);
12105 }
12106
12107 // add new replica dirs values
12108 auto p = notify->basebl.cbegin();
12109 while (!p.end()) {
12110 CDir *tmp_dir = nullptr;
12111 decode_replica_dir(tmp_dir, p, diri, from, waiters);
12112 }
12113
12114 mds->queue_waiters(waiters);
12115 } else {
12116 ceph_abort();
12117 }
12118
12119 if (notify->is_ack_wanted()) {
12120 auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
12121 notify->get_bits(), notify->get_tid());
12122 mds->send_message_mds(ack, from);
12123 }
12124 }
12125
12126 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12127 LogSegment *ls, bufferlist *rollback)
12128 {
12129 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12130 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12131 ufragment& uf = uncommitted_fragments[basedirfrag];
12132 uf.old_frags = old_frags;
12133 uf.bits = bits;
12134 uf.ls = ls;
12135 ls->uncommitted_fragments.insert(basedirfrag);
12136 if (rollback)
12137 uf.rollback.swap(*rollback);
12138 }
12139
12140 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12141 {
12142 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12143 << " op " << EFragment::op_name(op) << dendl;
12144 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12145 if (it != uncommitted_fragments.end()) {
12146 ufragment& uf = it->second;
12147 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12148 uf.committed = true;
12149 } else {
12150 uf.ls->uncommitted_fragments.erase(basedirfrag);
12151 mds->queue_waiters(uf.waiters);
12152 uncommitted_fragments.erase(it);
12153 }
12154 }
12155 }
12156
12157 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
12158 {
12159 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12160 << " old_frags (" << old_frags << ")" << dendl;
12161 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12162 if (it != uncommitted_fragments.end()) {
12163 ufragment& uf = it->second;
12164 if (!uf.old_frags.empty()) {
12165 uf.old_frags = std::move(old_frags);
12166 uf.committed = true;
12167 } else {
12168 uf.ls->uncommitted_fragments.erase(basedirfrag);
12169 uncommitted_fragments.erase(it);
12170 }
12171 }
12172 }
12173
12174 void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
12175 {
12176 MDSGatherBuilder gather(g_ceph_context, finisher);
12177 for (auto& p : uncommitted_fragments) {
12178 p.second.waiters.push_back(gather.new_sub());
12179 }
12180 gather.activate();
12181 }
12182
12183 struct C_MDC_FragmentRollback : public MDCacheLogContext {
12184 MutationRef mut;
12185 C_MDC_FragmentRollback(MDCache *c, MutationRef& m) :
12186 MDCacheLogContext(c), mut(m) {}
12187 void finish(int r) override {
12188 mut->apply();
12189 get_mds()->locker->drop_locks(mut.get());
12190 mut->cleanup();
12191 }
12192 };
12193
12194 void MDCache::rollback_uncommitted_fragments()
12195 {
12196 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
12197 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
12198 p != uncommitted_fragments.end();
12199 ++p) {
12200 ufragment &uf = p->second;
12201 CInode *diri = get_inode(p->first.ino);
12202 ceph_assert(diri);
12203
12204 if (uf.committed) {
12205 _fragment_committed(p->first, MDRequestRef());
12206 continue;
12207 }
12208
12209 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
12210
12211 MutationRef mut(new MutationImpl());
12212 mut->ls = mds->mdlog->get_current_segment();
12213 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
12214 mds->mdlog->start_entry(le);
12215 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
12216
12217 frag_vec_t old_frags;
12218 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
12219
12220 std::vector<CDir*> resultfrags;
12221 if (uf.old_frags.empty()) {
12222 // created by old format EFragment
12223 MDSContext::vec waiters;
12224 adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
12225 } else {
12226 auto bp = uf.rollback.cbegin();
12227 for (const auto& fg : uf.old_frags) {
12228 CDir *dir = force_dir_fragment(diri, fg);
12229 resultfrags.push_back(dir);
12230
12231 dirfrag_rollback rollback;
12232 decode(rollback, bp);
12233
12234 dir->fnode = rollback.fnode;
12235
12236 dir->mark_dirty(mut->ls);
12237
12238 if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
12239 dout(10) << " dirty nestinfo on " << *dir << dendl;
12240 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12241 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12242 mut->add_updated_lock(&diri->nestlock);
12243 }
12244 if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
12245 dout(10) << " dirty fragstat on " << *dir << dendl;
12246 mds->locker->mark_updated_scatterlock(&diri->filelock);
12247 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12248 mut->add_updated_lock(&diri->filelock);
12249 }
12250
12251 le->add_orig_frag(dir->get_frag());
12252 le->metablob.add_dir_context(dir);
12253 if (diri_auth) {
12254 le->metablob.add_fragmented_dir(dir, true, false);
12255 } else {
12256 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12257 dir->state_set(CDir::STATE_DIRTYDFT);
12258 le->metablob.add_fragmented_dir(dir, true, true);
12259 }
12260 }
12261 }
12262
12263 if (diri_auth) {
12264 auto pi = diri->project_inode(mut);
12265 pi.inode->version = diri->pre_dirty();
12266 predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
12267 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12268 } else {
12269 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12270 mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12271 mut->add_updated_lock(&diri->dirfragtreelock);
12272 }
12273
12274 if (g_conf()->mds_debug_frag)
12275 diri->verify_dirfrags();
12276
12277 for (const auto& leaf : old_frags) {
12278 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12279 }
12280
12281 mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut));
12282
12283 uf.old_frags.swap(old_frags);
12284 _fragment_committed(p->first, MDRequestRef());
12285 }
12286 }
12287
12288 void MDCache::force_readonly()
12289 {
12290 if (is_readonly())
12291 return;
12292
12293 dout(1) << "force file system read-only" << dendl;
12294 mds->clog->warn() << "force file system read-only";
12295
12296 set_readonly();
12297
12298 mds->server->force_clients_readonly();
12299
12300 // revoke write caps
12301 int count = 0;
12302 for (auto &p : inode_map) {
12303 CInode *in = p.second;
12304 if (in->is_head())
12305 mds->locker->eval(in, CEPH_CAP_LOCKS);
12306 if (!(++count % 1000))
12307 mds->heartbeat_reset();
12308 }
12309
12310 mds->mdlog->flush();
12311 }
12312
12313
12314 // ==============================================================
12315 // debug crap
12316
12317 void MDCache::show_subtrees(int dbl, bool force_print)
12318 {
12319 if (g_conf()->mds_thrash_exports)
12320 dbl += 15;
12321
12322 //dout(10) << "show_subtrees" << dendl;
12323
12324 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
12325 return; // i won't print anything.
12326
12327 if (subtrees.empty()) {
12328 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12329 << dendl;
12330 return;
12331 }
12332
12333 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12334 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12335 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12336 "printing subtrees" << dendl;
12337 return;
12338 }
12339
12340 // root frags
12341 std::vector<CDir*> basefrags;
12342 for (set<CInode*>::iterator p = base_inodes.begin();
12343 p != base_inodes.end();
12344 ++p)
12345 (*p)->get_dirfrags(basefrags);
12346 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12347 dout(15) << "show_subtrees" << dendl;
12348
12349 // queue stuff
12350 list<pair<CDir*,int> > q;
12351 string indent;
12352 set<CDir*> seen;
12353
12354 // calc max depth
12355 for (const auto& dir : basefrags) {
12356 q.emplace_back(dir, 0);
12357 }
12358
12359 set<CDir*> subtrees_seen;
12360
12361 unsigned int depth = 0;
12362 while (!q.empty()) {
12363 CDir *dir = q.front().first;
12364 unsigned int d = q.front().second;
12365 q.pop_front();
12366
12367 if (subtrees.count(dir) == 0) continue;
12368
12369 subtrees_seen.insert(dir);
12370
12371 if (d > depth) depth = d;
12372
12373 // sanity check
12374 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12375 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
12376 ceph_assert(seen.count(dir) == 0);
12377 seen.insert(dir);
12378
12379 // nested items?
12380 if (!subtrees[dir].empty()) {
12381 for (set<CDir*>::iterator p = subtrees[dir].begin();
12382 p != subtrees[dir].end();
12383 ++p) {
12384 //dout(25) << " saw sub " << **p << dendl;
12385 q.push_front(pair<CDir*,int>(*p, d+1));
12386 }
12387 }
12388 }
12389
12390 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12391 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12392 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12393 "subtrees" << dendl;
12394 return;
12395 }
12396
12397 // print tree
12398 for (const auto& dir : basefrags) {
12399 q.emplace_back(dir, 0);
12400 }
12401
12402 while (!q.empty()) {
12403 CDir *dir = q.front().first;
12404 int d = q.front().second;
12405 q.pop_front();
12406
12407 if (subtrees.count(dir) == 0) continue;
12408
12409 // adjust indenter
12410 while ((unsigned)d < indent.size())
12411 indent.resize(d);
12412
12413 // pad
12414 string pad = "______________________________________";
12415 pad.resize(depth*2+1-indent.size());
12416 if (!subtrees[dir].empty())
12417 pad[0] = '.'; // parent
12418
12419
12420 string auth;
12421 if (dir->is_auth())
12422 auth = "auth ";
12423 else
12424 auth = " rep ";
12425
12426 char s[10];
12427 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12428 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12429 else
12430 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12431
12432 // print
12433 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12434 << " " << auth << *dir << dendl;
12435
12436 if (dir->ino() == CEPH_INO_ROOT)
12437 ceph_assert(dir->inode == root);
12438 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
12439 ceph_assert(dir->inode == myin);
12440 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
12441 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
12442
12443 // nested items?
12444 if (!subtrees[dir].empty()) {
12445 // more at my level?
12446 if (!q.empty() && q.front().second == d)
12447 indent += "| ";
12448 else
12449 indent += " ";
12450
12451 for (set<CDir*>::iterator p = subtrees[dir].begin();
12452 p != subtrees[dir].end();
12453 ++p)
12454 q.push_front(pair<CDir*,int>(*p, d+2));
12455 }
12456 }
12457
12458 // verify there isn't stray crap in subtree map
12459 int lost = 0;
12460 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12461 p != subtrees.end();
12462 ++p) {
12463 if (subtrees_seen.count(p->first)) continue;
12464 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12465 lost++;
12466 }
12467 ceph_assert(lost == 0);
12468 }
12469
12470 void MDCache::show_cache()
12471 {
12472 if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
12473 return;
12474 dout(7) << "show_cache" << dendl;
12475
12476 auto show_func = [this](CInode *in) {
12477 // unlinked?
12478 if (!in->parent)
12479 dout(7) << " unlinked " << *in << dendl;
12480
12481 // dirfrags?
12482 auto&& dfs = in->get_dirfrags();
12483 for (const auto& dir : dfs) {
12484 dout(7) << " dirfrag " << *dir << dendl;
12485
12486 for (auto &p : dir->items) {
12487 CDentry *dn = p.second;
12488 dout(7) << " dentry " << *dn << dendl;
12489 CDentry::linkage_t *dnl = dn->get_linkage();
12490 if (dnl->is_primary() && dnl->get_inode())
12491 dout(7) << " inode " << *dnl->get_inode() << dendl;
12492 }
12493 }
12494 };
12495
12496 for (auto &p : inode_map)
12497 show_func(p.second);
12498 for (auto &p : snap_inode_map)
12499 show_func(p.second);
12500 }
12501
12502 void MDCache::cache_status(Formatter *f)
12503 {
12504 f->open_object_section("cache");
12505
12506 f->open_object_section("pool");
12507 mempool::get_pool(mempool::mds_co::id).dump(f);
12508 f->close_section();
12509
12510 f->close_section();
12511 }
12512
12513 void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
12514 {
12515 ceph_assert(in);
12516 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12517 return;
12518 }
12519 auto&& ls = in->get_dirfrags();
12520 for (const auto &subdir : ls) {
12521 for (const auto &p : subdir->items) {
12522 CDentry *dn = p.second;
12523 CInode *in = dn->get_linkage()->get_inode();
12524 if (in) {
12525 dump_tree(in, cur_depth + 1, max_depth, f);
12526 }
12527 }
12528 }
12529 f->open_object_section("inode");
12530 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12531 f->close_section();
12532 }
12533
12534 int MDCache::dump_cache(std::string_view file_name, double timeout)
12535 {
12536 return dump_cache(file_name, NULL, timeout);
12537 }
12538
12539 int MDCache::dump_cache(Formatter *f, double timeout)
12540 {
12541 return dump_cache(std::string_view(""), f, timeout);
12542 }
12543
12544 /**
12545 * Dump the metadata cache, either to a Formatter, if
12546 * provided, else to a plain text file.
12547 */
12548 int MDCache::dump_cache(std::string_view fn, Formatter *f, double timeout)
12549 {
12550 int r = 0;
12551
12552 // dumping large caches may cause mds to hang or worse get killed.
12553 // so, disallow the dump if the cache size exceeds the configured
12554 // threshold, which is 1G for formatter and unlimited for file (note
12555 // that this can be jacked up by the admin... and is nothing but foot
12556 // shooting, but the option itself is for devs and hence dangerous to
12557 // tune). TODO: remove this when fixed.
12558 uint64_t threshold = f ?
12559 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12560 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
12561
12562 if (threshold && cache_size() > threshold) {
12563 if (f) {
12564 CachedStackStringStream css;
12565 *css << "cache usage exceeds dump threshold";
12566 f->open_object_section("result");
12567 f->dump_string("error", css->strv());
12568 f->close_section();
12569 } else {
12570 derr << "cache usage exceeds dump threshold" << dendl;
12571 r = -CEPHFS_EINVAL;
12572 }
12573 return r;
12574 }
12575
12576 r = 0;
12577 int fd = -1;
12578
12579 if (f) {
12580 f->open_array_section("inodes");
12581 } else {
12582 char path[PATH_MAX] = "";
12583 if (fn.length()) {
12584 snprintf(path, sizeof path, "%s", fn.data());
12585 } else {
12586 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
12587 }
12588
12589 dout(1) << "dump_cache to " << path << dendl;
12590
12591 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
12592 if (fd < 0) {
12593 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
12594 return errno;
12595 }
12596 }
12597
12598 auto dump_func = [fd, f](CInode *in) {
12599 int r;
12600 if (f) {
12601 f->open_object_section("inode");
12602 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12603 f->close_section();
12604 return 1;
12605 }
12606 CachedStackStringStream css;
12607 *css << *in << std::endl;
12608 auto sv = css->strv();
12609 r = safe_write(fd, sv.data(), sv.size());
12610 if (r < 0)
12611 return r;
12612 auto&& dfs = in->get_dirfrags();
12613 for (auto &dir : dfs) {
12614 CachedStackStringStream css2;
12615 *css2 << " " << *dir << std::endl;
12616 auto sv = css2->strv();
12617 r = safe_write(fd, sv.data(), sv.size());
12618 if (r < 0)
12619 return r;
12620 for (auto &p : dir->items) {
12621 CDentry *dn = p.second;
12622 CachedStackStringStream css3;
12623 *css3 << " " << *dn << std::endl;
12624 auto sv = css3->strv();
12625 r = safe_write(fd, sv.data(), sv.size());
12626 if (r < 0)
12627 return r;
12628 }
12629 dir->check_rstats();
12630 }
12631 return 1;
12632 };
12633
12634 auto start = mono_clock::now();
12635 int64_t count = 0;
12636 for (auto &p : inode_map) {
12637 r = dump_func(p.second);
12638 if (r < 0)
12639 goto out;
12640 if (!(++count % 1000) &&
12641 timeout > 0 &&
12642 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12643 r = -ETIMEDOUT;
12644 goto out;
12645 }
12646 }
12647 for (auto &p : snap_inode_map) {
12648 r = dump_func(p.second);
12649 if (r < 0)
12650 goto out;
12651 if (!(++count % 1000) &&
12652 timeout > 0 &&
12653 std::chrono::duration<double>(mono_clock::now() - start).count() > timeout) {
12654 r = -ETIMEDOUT;
12655 goto out;
12656 }
12657
12658 }
12659 r = 0;
12660
12661 out:
12662 if (f) {
12663 if (r == -ETIMEDOUT)
12664 {
12665 f->close_section();
12666 f->open_object_section("result");
12667 f->dump_string("error", "the operation timeout");
12668 }
12669 f->close_section(); // inodes
12670 } else {
12671 if (r == -ETIMEDOUT)
12672 {
12673 CachedStackStringStream css;
12674 *css << "error : the operation timeout" << std::endl;
12675 auto sv = css->strv();
12676 r = safe_write(fd, sv.data(), sv.size());
12677 }
12678 ::close(fd);
12679 }
12680 return r;
12681 }
12682
12683 void C_MDS_RetryRequest::finish(int r)
12684 {
12685 mdr->retry++;
12686 cache->dispatch_request(mdr);
12687 }
12688
12689 MDSContext *CF_MDS_RetryRequestFactory::build()
12690 {
12691 if (drop_locks) {
12692 mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
12693 mdr->drop_local_auth_pins();
12694 }
12695 return new C_MDS_RetryRequest(mdcache, mdr);
12696 }
12697
12698 class C_MDS_EnqueueScrub : public Context
12699 {
12700 std::string tag;
12701 Formatter *formatter;
12702 Context *on_finish;
12703 public:
12704 ScrubHeaderRef header;
12705 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12706 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
12707
12708 void finish(int r) override {
12709 formatter->open_object_section("results");
12710 formatter->dump_int("return_code", r);
12711 if (r == 0) {
12712 formatter->dump_string("scrub_tag", tag);
12713 formatter->dump_string("mode", "asynchronous");
12714 }
12715 formatter->close_section();
12716
12717 r = 0;
12718 if (on_finish)
12719 on_finish->complete(r);
12720 }
12721 };
12722
12723 void MDCache::enqueue_scrub(
12724 std::string_view path,
12725 std::string_view tag,
12726 bool force, bool recursive, bool repair,
12727 Formatter *f, Context *fin)
12728 {
12729 dout(10) << __func__ << " " << path << dendl;
12730
12731 filepath fp;
12732 if (path.compare(0, 4, "~mds") == 0) {
12733 mds_rank_t rank;
12734 if (path == "~mdsdir") {
12735 rank = mds->get_nodeid();
12736 } else {
12737 std::string err;
12738 rank = strict_strtoll(path.substr(4), 10, &err);
12739 if (!err.empty())
12740 rank = MDS_RANK_NONE;
12741 }
12742 if (rank >= 0 && rank < MAX_MDS)
12743 fp.set_path("", MDS_INO_MDSDIR(rank));
12744 }
12745 if (fp.get_ino() == inodeno_t(0))
12746 fp.set_path(path);
12747
12748 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12749 mdr->set_filepath(fp);
12750
12751 bool is_internal = false;
12752 std::string tag_str(tag);
12753 if (tag_str.empty()) {
12754 uuid_d uuid_gen;
12755 uuid_gen.generate_random();
12756 tag_str = uuid_gen.to_string();
12757 is_internal = true;
12758 }
12759
12760 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
12761 cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, recursive, repair);
12762
12763 mdr->internal_op_finish = cs;
12764 enqueue_scrub_work(mdr);
12765 }
12766
12767 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12768 {
12769 CInode *in;
12770 CF_MDS_RetryRequestFactory cf(this, mdr, true);
12771 int r = path_traverse(mdr, cf, mdr->get_filepath(),
12772 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH,
12773 nullptr, &in);
12774 if (r > 0)
12775 return;
12776 if (r < 0) {
12777 mds->server->respond_to_request(mdr, r);
12778 return;
12779 }
12780
12781 // Cannot scrub same dentry twice at same time
12782 if (in->scrub_is_in_progress()) {
12783 mds->server->respond_to_request(mdr, -CEPHFS_EBUSY);
12784 return;
12785 } else {
12786 in->scrub_info();
12787 }
12788
12789 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12790 ScrubHeaderRef& header = cs->header;
12791
12792 r = mds->scrubstack->enqueue(in, header, !header->get_recursive());
12793
12794 mds->server->respond_to_request(mdr, r);
12795 }
12796
12797 struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
12798 MDRequestRef mdr;
12799 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
12800 MDCacheLogContext(c), mdr(m) {}
12801 void finish(int r) override {
12802 mdr->apply();
12803 get_mds()->server->respond_to_request(mdr, r);
12804 }
12805 };
12806
12807 struct C_MDC_ScrubRepaired : public MDCacheContext {
12808 ScrubHeaderRef header;
12809 public:
12810 C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h)
12811 : MDCacheContext(m), header(h) {
12812 header->inc_num_pending();
12813 }
12814 void finish(int r) override {
12815 header->dec_num_pending();
12816 }
12817 };
12818
12819 void MDCache::repair_dirfrag_stats(CDir *dir)
12820 {
12821 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12822 mdr->pin(dir);
12823 mdr->internal_op_private = dir;
12824 if (dir->scrub_is_in_progress())
12825 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header());
12826 else
12827 mdr->internal_op_finish = new C_MDSInternalNoop;
12828 repair_dirfrag_stats_work(mdr);
12829 }
12830
12831 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12832 {
12833 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12834 dout(10) << __func__ << " " << *dir << dendl;
12835
12836 if (!dir->is_auth()) {
12837 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
12838 return;
12839 }
12840
12841 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12842 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12843
12844 mds->locker->drop_locks(mdr.get());
12845 mdr->drop_local_auth_pins();
12846 if (mdr->is_any_remote_auth_pin())
12847 mds->locker->notify_freeze_waiter(dir);
12848 return;
12849 }
12850
12851 mdr->auth_pin(dir);
12852
12853 MutationImpl::LockOpVec lov;
12854 CInode *diri = dir->inode;
12855 lov.add_rdlock(&diri->dirfragtreelock);
12856 lov.add_wrlock(&diri->nestlock);
12857 lov.add_wrlock(&diri->filelock);
12858 if (!mds->locker->acquire_locks(mdr, lov))
12859 return;
12860
12861 if (!dir->is_complete()) {
12862 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12863 return;
12864 }
12865
12866 frag_info_t frag_info;
12867 nest_info_t nest_info;
12868 for (auto it = dir->begin(); it != dir->end(); ++it) {
12869 CDentry *dn = it->second;
12870 if (dn->last != CEPH_NOSNAP)
12871 continue;
12872 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12873 if (dnl->is_primary()) {
12874 CInode *in = dnl->get_inode();
12875 nest_info.add(in->get_projected_inode()->accounted_rstat);
12876 if (in->is_dir())
12877 frag_info.nsubdirs++;
12878 else
12879 frag_info.nfiles++;
12880 } else if (dnl->is_remote())
12881 frag_info.nfiles++;
12882 }
12883
12884 auto pf = dir->get_projected_fnode();
12885 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12886 bool good_rstat = nest_info.same_sums(pf->rstat);
12887 if (good_fragstat && good_rstat) {
12888 dout(10) << __func__ << " no corruption found" << dendl;
12889 mds->server->respond_to_request(mdr, 0);
12890 return;
12891 }
12892
12893 auto _pf = dir->project_fnode(mdr);
12894 _pf->version = dir->pre_dirty();
12895 pf = _pf;
12896
12897 mdr->ls = mds->mdlog->get_current_segment();
12898 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12899 mds->mdlog->start_entry(le);
12900
12901 if (!good_fragstat) {
12902 if (pf->fragstat.mtime > frag_info.mtime)
12903 frag_info.mtime = pf->fragstat.mtime;
12904 if (pf->fragstat.change_attr > frag_info.change_attr)
12905 frag_info.change_attr = pf->fragstat.change_attr;
12906 _pf->fragstat = frag_info;
12907 mds->locker->mark_updated_scatterlock(&diri->filelock);
12908 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12909 mdr->add_updated_lock(&diri->filelock);
12910 }
12911
12912 if (!good_rstat) {
12913 if (pf->rstat.rctime > nest_info.rctime)
12914 nest_info.rctime = pf->rstat.rctime;
12915 _pf->rstat = nest_info;
12916 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12917 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12918 mdr->add_updated_lock(&diri->nestlock);
12919 }
12920
12921 le->metablob.add_dir_context(dir);
12922 le->metablob.add_dir(dir, true);
12923
12924 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
12925 }
12926
12927 void MDCache::repair_inode_stats(CInode *diri)
12928 {
12929 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12930 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
12931 mdr->internal_op_private = diri;
12932 if (diri->scrub_is_in_progress())
12933 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header());
12934 else
12935 mdr->internal_op_finish = new C_MDSInternalNoop;
12936 repair_inode_stats_work(mdr);
12937 }
12938
12939 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12940 {
12941 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12942 dout(10) << __func__ << " " << *diri << dendl;
12943
12944 if (!diri->is_auth()) {
12945 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
12946 return;
12947 }
12948 if (!diri->is_dir()) {
12949 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
12950 return;
12951 }
12952
12953 MutationImpl::LockOpVec lov;
12954
12955 if (mdr->ls) // already marked filelock/nestlock dirty ?
12956 goto do_rdlocks;
12957
12958 lov.add_rdlock(&diri->dirfragtreelock);
12959 lov.add_wrlock(&diri->nestlock);
12960 lov.add_wrlock(&diri->filelock);
12961 if (!mds->locker->acquire_locks(mdr, lov))
12962 return;
12963
12964 // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
12965 // the scatter-gather process, which will fix any fragstat/rstat errors.
12966 {
12967 frag_vec_t leaves;
12968 diri->dirfragtree.get_leaves(leaves);
12969 for (const auto& leaf : leaves) {
12970 CDir *dir = diri->get_dirfrag(leaf);
12971 if (!dir) {
12972 ceph_assert(mdr->is_auth_pinned(diri));
12973 dir = diri->get_or_open_dirfrag(this, leaf);
12974 }
12975 if (dir->get_version() == 0) {
12976 ceph_assert(dir->is_auth());
12977 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12978 return;
12979 }
12980 }
12981 }
12982
12983 diri->state_set(CInode::STATE_REPAIRSTATS);
12984 mdr->ls = mds->mdlog->get_current_segment();
12985 mds->locker->mark_updated_scatterlock(&diri->filelock);
12986 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12987 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12988 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12989
12990 mds->locker->drop_locks(mdr.get());
12991
12992 do_rdlocks:
12993 // force the scatter-gather process
12994 lov.clear();
12995 lov.add_rdlock(&diri->dirfragtreelock);
12996 lov.add_rdlock(&diri->nestlock);
12997 lov.add_rdlock(&diri->filelock);
12998 if (!mds->locker->acquire_locks(mdr, lov))
12999 return;
13000
13001 diri->state_clear(CInode::STATE_REPAIRSTATS);
13002
13003 frag_info_t dir_info;
13004 nest_info_t nest_info;
13005 nest_info.rsubdirs = 1; // it gets one to account for self
13006 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
13007 nest_info.rsnaps = srnode->snaps.size();
13008
13009 {
13010 frag_vec_t leaves;
13011 diri->dirfragtree.get_leaves(leaves);
13012 for (const auto& leaf : leaves) {
13013 CDir *dir = diri->get_dirfrag(leaf);
13014 ceph_assert(dir);
13015 ceph_assert(dir->get_version() > 0);
13016 dir_info.add(dir->get_fnode()->accounted_fragstat);
13017 nest_info.add(dir->get_fnode()->accounted_rstat);
13018 }
13019 }
13020
13021 if (!dir_info.same_sums(diri->get_inode()->dirstat) ||
13022 !nest_info.same_sums(diri->get_inode()->rstat)) {
13023 dout(10) << __func__ << " failed to fix fragstat/rstat on "
13024 << *diri << dendl;
13025 }
13026
13027 mds->server->respond_to_request(mdr, 0);
13028 }
13029
13030 void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin)
13031 {
13032 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS);
13033 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
13034 mdr->internal_op_private = diri;
13035 mdr->internal_op_finish = fin;
13036 return rdlock_dirfrags_stats_work(mdr);
13037 }
13038
13039 void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr)
13040 {
13041 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
13042 dout(10) << __func__ << " " << *diri << dendl;
13043 if (!diri->is_auth()) {
13044 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
13045 return;
13046 }
13047 if (!diri->is_dir()) {
13048 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
13049 return;
13050 }
13051
13052 MutationImpl::LockOpVec lov;
13053 lov.add_rdlock(&diri->dirfragtreelock);
13054 lov.add_rdlock(&diri->nestlock);
13055 lov.add_rdlock(&diri->filelock);
13056 if (!mds->locker->acquire_locks(mdr, lov))
13057 return;
13058 dout(10) << __func__ << " start dirfrags : " << *diri << dendl;
13059
13060 mds->server->respond_to_request(mdr, 0);
13061 return;
13062 }
13063
13064 void MDCache::flush_dentry(std::string_view path, Context *fin)
13065 {
13066 if (is_readonly()) {
13067 dout(10) << __func__ << ": read-only FS" << dendl;
13068 fin->complete(-CEPHFS_EROFS);
13069 return;
13070 }
13071 dout(10) << "flush_dentry " << path << dendl;
13072 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13073 filepath fp(path);
13074 mdr->set_filepath(fp);
13075 mdr->internal_op_finish = fin;
13076 flush_dentry_work(mdr);
13077 }
13078
13079 class C_FinishIOMDR : public MDSContext {
13080 protected:
13081 MDSRank *mds;
13082 MDRequestRef mdr;
13083 MDSRank *get_mds() override { return mds; }
13084 public:
13085 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13086 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13087 };
13088
13089 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13090 {
13091 MutationImpl::LockOpVec lov;
13092 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13093 if (!in)
13094 return;
13095
13096 ceph_assert(in->is_auth());
13097 in->flush(new C_FinishIOMDR(mds, mdr));
13098 }
13099
13100
13101 /**
13102 * Initialize performance counters with global perfcounter
13103 * collection.
13104 */
13105 void MDCache::register_perfcounters()
13106 {
13107 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
13108
13109 // Stray/purge statistics
13110 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
13111 PerfCountersBuilder::PRIO_INTERESTING);
13112 pcb.add_u64(l_mdc_num_recovering_enqueued,
13113 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13114 PerfCountersBuilder::PRIO_INTERESTING);
13115 pcb.add_u64_counter(l_mdc_recovery_completed,
13116 "recovery_completed", "File recoveries completed", "recd",
13117 PerfCountersBuilder::PRIO_INTERESTING);
13118
13119 // useful recovery queue statistics
13120 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
13121 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
13122 "Files currently being recovered");
13123 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
13124 "Files waiting for recovery with elevated priority");
13125 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
13126 "File recoveries started");
13127
13128 // along with other stray dentries stats
13129 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
13130 "Stray dentries delayed");
13131 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
13132 "Stray dentries enqueuing for purge");
13133 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
13134 "Stray dentries created");
13135 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
13136 "Stray dentries enqueued for purge");
13137 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
13138 "Stray dentries reintegrated");
13139 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
13140 "Stray dentries migrated");
13141
13142 // low prio internal request stats
13143 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
13144 "Internal Request type enqueue scrub");
13145 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
13146 "Internal Request type export dir");
13147 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
13148 "Internal Request type flush");
13149 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
13150 "Internal Request type fragmentdir");
13151 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
13152 "Internal Request type frag stats");
13153 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
13154 "Internal Request type inode stats");
13155
13156 logger.reset(pcb.create_perf_counters());
13157 g_ceph_context->get_perfcounters_collection()->add(logger.get());
13158 recovery_queue.set_logger(logger.get());
13159 stray_manager.set_logger(logger.get());
13160 }
13161
13162 /**
13163 * Call this when putting references to an inode/dentry or
13164 * when attempting to trim it.
13165 *
13166 * If this inode is no longer linked by anyone, and this MDS
13167 * rank holds the primary dentry, and that dentry is in a stray
13168 * directory, then give up the dentry to the StrayManager, never
13169 * to be seen again by MDCache.
13170 *
13171 * @param delay if true, then purgeable inodes are stashed til
13172 * the next trim(), rather than being purged right
13173 * away.
13174 */
13175 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13176 if (in->get_inode()->nlink > 0 || in->is_base() || is_readonly() ||
13177 mds->get_state() <= MDSMap::STATE_REJOIN)
13178 return;
13179
13180 CDentry *dn = in->get_projected_parent_dn();
13181
13182 if (dn->state_test(CDentry::STATE_PURGING)) {
13183 /* We have already entered the purging process, no need
13184 * to re-evaluate me ! */
13185 return;
13186 }
13187
13188 if (dn->get_dir()->get_inode()->is_stray()) {
13189 if (delay)
13190 stray_manager.queue_delayed(dn);
13191 else
13192 stray_manager.eval_stray(dn);
13193 }
13194 }
13195
13196 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13197 dout(10) << __func__ << " " << *diri << dendl;
13198 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13199 auto&& ls = diri->get_dirfrags();
13200 for (auto &p : ls) {
13201 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13202 p->try_remove_dentries_for_stray();
13203 }
13204 if (!diri->snaprealm) {
13205 if (diri->is_auth())
13206 diri->clear_dirty_rstat();
13207 diri->clear_scatter_dirty();
13208 }
13209 }
13210
13211 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13212 CInode *in = get_inode(number);
13213 if (!in) {
13214 return false;
13215 }
13216 f->open_object_section("inode");
13217 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13218 f->close_section();
13219 return true;
13220 }
13221
13222 void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
13223 const mds_rank_t max_mds = mdsmap.get_max_mds();
13224
13225 // process export_pin_delayed_queue whenever a new MDSMap received
13226 auto &q = export_pin_delayed_queue;
13227 for (auto it = q.begin(); it != q.end(); ) {
13228 auto *in = *it;
13229 mds_rank_t export_pin = in->get_export_pin(false);
13230 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13231 << " max_mds=" << max_mds << dendl;
13232 if (export_pin >= mdsmap.get_max_mds()) {
13233 it++;
13234 continue;
13235 }
13236
13237 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13238 it = q.erase(it);
13239 in->queue_export_pin(export_pin);
13240 }
13241
13242 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13243 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
13244 /* copy to vector to avoid removals during iteration */
13245 std::vector<CInode*> migrate;
13246 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
13247 for (auto& in : migrate) {
13248 in->maybe_export_pin();
13249 }
13250 }
13251
13252 if (max_mds <= 1) {
13253 export_ephemeral_dist_frag_bits = 0;
13254 } else {
13255 double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
13256 want *= max_mds;
13257 unsigned n = 0;
13258 while ((1U << n) < (unsigned)want)
13259 ++n;
13260 export_ephemeral_dist_frag_bits = n;
13261 }
13262 }
13263
13264 void MDCache::upkeep_main(void)
13265 {
13266 std::unique_lock lock(upkeep_mutex);
13267 while (!upkeep_trim_shutdown.load()) {
13268 auto now = clock::now();
13269 auto since = now-upkeep_last_trim;
13270 auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
13271 if (since >= trim_interval*.90) {
13272 lock.unlock(); /* mds_lock -> upkeep_mutex */
13273 std::scoped_lock mds_lock(mds->mds_lock);
13274 lock.lock();
13275 if (upkeep_trim_shutdown.load())
13276 return;
13277 check_memory_usage();
13278 if (mds->is_cache_trimmable()) {
13279 dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
13280 bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
13281 if (active_with_clients) {
13282 trim_client_leases();
13283 }
13284 if (is_open()) {
13285 trim();
13286 }
13287 if (active_with_clients) {
13288 auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
13289 if (cache_toofull()) {
13290 recall_flags = recall_flags|Server::RecallFlags::TRIM;
13291 }
13292 mds->server->recall_client_state(nullptr, recall_flags);
13293 }
13294 upkeep_last_trim = now = clock::now();
13295 } else {
13296 dout(10) << "cache not ready for trimming" << dendl;
13297 }
13298 } else {
13299 trim_interval -= since;
13300 }
13301 since = now-upkeep_last_release;
13302 auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
13303 if (since >= release_interval*.90) {
13304 /* XXX not necessary once MDCache uses PriorityCache */
13305 dout(10) << "releasing free memory" << dendl;
13306 ceph_heap_release_free_memory();
13307 upkeep_last_release = clock::now();
13308 } else {
13309 release_interval -= since;
13310 }
13311 auto interval = std::min(release_interval, trim_interval);
13312 dout(20) << "upkeep thread waiting interval " << interval << dendl;
13313 upkeep_cvar.wait_for(lock, interval);
13314 }
13315 }