]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <ostream>
17 #include <string>
18 #include <string_view>
19 #include <map>
20
21 #include "MDCache.h"
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDLog.h"
26 #include "MDBalancer.h"
27 #include "Migrator.h"
28 #include "ScrubStack.h"
29
30 #include "SnapClient.h"
31
32 #include "MDSMap.h"
33
34 #include "CInode.h"
35 #include "CDir.h"
36
37 #include "Mutation.h"
38
39 #include "include/ceph_fs.h"
40 #include "include/filepath.h"
41 #include "include/util.h"
42
43 #include "messages/MClientCaps.h"
44
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
47
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
52
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
55
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/EPeerUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/EPurged.h"
63 #include "events/ESessions.h"
64
65 #include "InoTable.h"
66
67 #include "common/Timer.h"
68
69 #include "perfglue/heap_profiler.h"
70
71
72 #include "common/config.h"
73 #include "include/ceph_assert.h"
74
75 #define dout_context g_ceph_context
76 #define dout_subsys ceph_subsys_mds
77 #undef dout_prefix
78 #define dout_prefix _prefix(_dout, mds)
79 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
80 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
81 }
82
83 set<int> SimpleLock::empty_gather_set;
84
85
/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSContext {
protected:
  MDCache *mdcache;   // non-owning; must outlive the context
  MDSRank *get_mds() override
  {
    // the owning rank is always reached through the cache pointer
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};
101
102
/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;   // non-owning; must outlive the context
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  // track=false opts this context out of MDSIOContextBase's tracking
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};
122
/**
 * Context completed when a journal (MDLog) entry this cache submitted
 * has been safely persisted.
 */
class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;   // non-owning; must outlive the context
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};
134
// Construct the cache for rank m; reads all tunables once and starts the
// background upkeep (trim) thread as the final step.
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // explicit commit-size cap, or default to 90% of what an OSD will accept
  // in a single write (both configs are in MB, hence << 20)
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
                        (g_conf()->mds_dir_max_commit_size << 20) :
                        (0.9 *(g_conf()->osd_max_write_size << 20));

  // cached copies of config values; refreshed in handle_conf_change()
  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");

  export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
  export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
  export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  // bottom_lru holds items that should be trimmed ASAP; no "top" segment
  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // start last, so the thread never sees a partially constructed cache
  upkeeper = std::thread(&MDCache::upkeep_main, this);
}
165
MDCache::~MDCache()
{
  // unregister perf counters before the collection outlives us
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
  // reap the upkeep thread.  NOTE(review): this relies on upkeep_main
  // exiting (shutdown() sets upkeep_trim_shutdown and signals it) --
  // confirm upkeep_main also terminates if shutdown() was never called.
  if (upkeeper.joinable())
    upkeeper.join();
}
174
// Re-read every tunable this cache caches locally when its config key
// changes, then fan the change set out to the migrator and balancer.
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
  dout(20) << "config changes: " << changed << dendl;
  if (changed.count("mds_cache_memory_limit"))
    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  if (changed.count("mds_cache_reservation"))
    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");

  // toggling either ephemeral-pin mode requires revisiting every inode
  // currently pinned that way
  bool ephemeral_pin_config_changed = false;
  if (changed.count("mds_export_ephemeral_distributed")) {
    export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
    dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
    ephemeral_pin_config_changed = true;
  }
  if (changed.count("mds_export_ephemeral_random")) {
    export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
    dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
    ephemeral_pin_config_changed = true;
  }
  if (ephemeral_pin_config_changed) {
    /* copy to vector to avoid removals during iteration */
    std::vector<CInode*> migrate;
    migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_export_pin(true);
    }
  }
  if (changed.count("mds_export_ephemeral_random_max")) {
    export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
  }
  if (changed.count("mds_health_cache_threshold"))
    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  if (changed.count("mds_cache_mid"))
    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
  if (changed.count("mds_cache_trim_decay_rate")) {
    // replace the counter wholesale; a DecayCounter's rate is fixed at
    // construction
    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
  }

  migrator->handle_conf_change(changed, mdsmap);
  mds->balancer->handle_conf_change(changed, mdsmap);
}
217
218 void MDCache::log_stat()
219 {
220 mds->logger->set(l_mds_inodes, lru.lru_get_size());
221 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
222 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
223 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
224 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
225 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
226 mds->logger->set(l_mds_caps, Capability::count());
227 if (root) {
228 mds->logger->set(l_mds_root_rfiles, root->get_inode()->rstat.rfiles);
229 mds->logger->set(l_mds_root_rbytes, root->get_inode()->rstat.rbytes);
230 mds->logger->set(l_mds_root_rsnaps, root->get_inode()->rstat.rsnaps);
231 }
232 }
233
234
235 //
236
// Signal the upkeep thread to stop; warn if the cache is not yet empty.
// Always returns true.
bool MDCache::shutdown()
{
  {
    // flag must be set under the mutex so the upkeep thread cannot miss
    // the notify between its check and its wait
    std::scoped_lock lock(upkeep_mutex);
    upkeep_trim_shutdown = true;
    upkeep_cvar.notify_one();
  }
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
    //show_cache();
    show_subtrees();
    //dump();
  }
  return true;
}
252
253
254 // ====================================================================
255 // some inode functions
256
// Register a newly created/loaded inode in the cache maps and, for
// system inodes, the special root/myin/stray slots.
void MDCache::add_inode(CInode *in)
{
  // head inodes and snapshotted inodes live in separate maps
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  }

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == CEPH_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      // only cache the stray slots owned by this rank
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }
}
284
// Fully remove an unreferenced inode from the cache: unlink from its
// dentry, clear dirty/queued state, drop it from all maps, and delete it.
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
  }

  // dirty state must be cleared before delete, or the dirty lists would
  // hold a dangling pointer
  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->clear_clientwriteable();

  o->item_open_file.remove_myself();

  // drop from any pending export-pin work queues
  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  o->clear_ephemeral_pin(true, true);

  // remove from inode map
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  // clear the special system-inode slots if this was one of them
  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}
339
340 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
341 {
342 file_layout_t result = file_layout_t::get_default();
343 result.pool_id = mdsmap.get_first_data_pool();
344 return result;
345 }
346
347 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
348 {
349 file_layout_t result = file_layout_t::get_default();
350 result.pool_id = mdsmap.get_metadata_pool();
351 if (g_conf()->mds_log_segment_size > 0) {
352 result.object_size = g_conf()->mds_log_segment_size;
353 result.stripe_unit = g_conf()->mds_log_segment_size;
354 }
355 return result;
356 }
357
358 void MDCache::init_layouts()
359 {
360 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
361 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
362 }
363
// Initialize the in-memory metadata of a system inode (root, mdsdir,
// stray, ...) that is not yet linked into any directory.  The inode is
// owner-read-only (0500 | mode) and carries a self-consistent rstat.
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino, int mode) const
{
  auto _inode = in->_get_inode();
  _inode->ino = ino;
  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->mode = 0500 | mode;
  _inode->size = 0;
  _inode->ctime = _inode->mtime = _inode->btime = ceph_clock_now();
  _inode->nlink = 1;
  _inode->truncate_size = -1ull;
  _inode->change_attr = 0;
  _inode->export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    _inode->rstat.rsubdirs = 1; /* itself */
    _inode->rstat.rctime = in->get_inode()->ctime;
  } else {
    _inode->layout = default_file_layout;
    ++_inode->rstat.rfiles;
  }
  // freshly created: accounted == actual, nothing pending
  _inode->accounted_rstat = _inode->rstat;

  if (in->is_base()) {
    // base inodes are their own subtree roots; non-root base inodes are
    // per-rank mdsdirs, whose auth rank is encoded in the ino
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}
400
401 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
402 {
403 dout(0) << "creating system inode with ino:" << ino << dendl;
404 CInode *in = new CInode(this);
405 create_unlinked_system_inode(in, ino, mode);
406 add_inode(in);
407 return in;
408 }
409
410 CInode *MDCache::create_root_inode()
411 {
412 CInode *in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);
413 auto _inode = in->_get_inode();
414 _inode->uid = g_conf()->mds_root_ino_uid;
415 _inode->gid = g_conf()->mds_root_ino_gid;
416 _inode->layout = default_file_layout;
417 _inode->layout.pool_id = mds->mdsmap->get_first_data_pool();
418 return in;
419 }
420
// Build a brand-new, empty root directory (fresh filesystem) and queue
// its persistence (dirfrag commit, inode store, backtrace) on gather.
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  // a freshly created frag must already be internally consistent
  ceph_assert(rootdir->get_fnode()->accounted_fragstat == rootdir->get_fnode()->fragstat);
  ceph_assert(rootdir->get_fnode()->fragstat == root->get_inode()->dirstat);
  ceph_assert(rootdir->get_fnode()->accounted_rstat == rootdir->get_fnode()->rstat);
  /* Do not update rootdir rstat information of the fragment; rstat upkeep magic
   * assumes version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->_get_fnode()->version = rootdir->pre_dirty();
  rootdir->mark_dirty(mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->store(gather->new_sub());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->store_backtrace(gather->new_sub());
}
447
// Build this rank's private ~mdsN directory with its NUM_STRAY stray
// subdirectories, keeping mydir's fragstat/rstat consistent with the
// strays created beneath it; persistence is queued on gather.
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  auto mydir_fnode = mydir->_get_fnode();

  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(css->str(), stray, "");
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->_get_inode()->dirstat = straydir->get_fnode()->fragstat;

    // roll each stray's stats up into mydir
    mydir_fnode->rstat.add(stray->get_inode()->rstat);
    mydir_fnode->fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->_get_fnode()->version = straydir->pre_dirty();
    straydir->mark_dirty(ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  // nothing is in flight yet, so accounted == actual
  mydir_fnode->accounted_fragstat = mydir->get_fnode()->fragstat;
  mydir_fnode->accounted_rstat = mydir->get_fnode()->rstat;

  auto inode = myin->_get_inode();
  inode->dirstat = mydir->get_fnode()->fragstat;
  inode->rstat = mydir->get_fnode()->rstat;
  ++inode->rstat.rsubdirs;   // count mydir itself
  inode->accounted_rstat = inode->rstat;

  mydir->mark_complete();
  mydir_fnode->version = mydir->pre_dirty();
  mydir->mark_dirty(ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}
498
// Journal-commit callback for _create_system_file(): finalizes the
// dentry/inode once the EUpdate is safely logged, then completes fin.
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;   // mutation holding the forced locks
  CDentry *dn;       // dentry being created
  version_t dpv;     // projected dentry version to mark dirty
  MDSContext *fin;   // caller's completion
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};
510
// Link a pre-created system inode into dir under name, journal the
// creation as an EUpdate, and call fin once it is durable (via
// C_MDC_CreateSystemFile).  Mdsdir inodes are linked as remote dentries;
// everything else as primary.
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  auto inode = in->_get_inode();
  if (in->is_dir()) {
    inode->rstat.rsubdirs = 1;

    // a new directory gets an empty, complete dirfrag right away
    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->_get_fnode()->version = mdir->pre_dirty();
  } else {
    inode->rstat.rfiles = 1;
  }

  inode->version = dn->pre_dirty();

  // new link must start after the latest snapshot
  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks.  hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // mdsdirs are base inodes journaled as roots; the dentry is only a
    // remote link to them
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}
562
// Journal-safe half of _create_system_file(): apply the projected
// linkage, mark everything dirty in the right log segment, release the
// forced locks, and complete the caller's context.
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->mark_dirty(mut->ls);

  if (in->is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}
589
590
591
// Retry context used throughout open_root()/populate_mydir(): on success
// re-enters open_root() (which is re-entrant and resumes where it left
// off); on failure marks the rank damaged.
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention.  Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};
608
609 void MDCache::open_root_inode(MDSContext *c)
610 {
611 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
612 CInode *in;
613 in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
614 in->fetch(c);
615 } else {
616 discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
617 }
618 }
619
620 void MDCache::open_mydir_inode(MDSContext *c)
621 {
622 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
623 in->fetch(c);
624 }
625
// Load ~mdsN's root dirfrag: first open the inode, then (on success)
// claim subtree auth over the frag and fetch its contents, completing c.
void MDCache::open_mydir_frag(MDSContext *c)
{
  open_mydir_inode(
      new MDSInternalContextWrapper(mds,
	new LambdaContext([this, c](int r) {
	    if (r < 0) {
	      // inode load failed; propagate the error to the caller
	      c->complete(r);
	      return;
	    }
	    CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
	    ceph_assert(mydir);
	    adjust_subtree_auth(mydir, mds->get_nodeid());
	    mydir->fetch(c);
	  })
	)
      );
}
643
// Re-entrant state machine that brings root and ~mdsN fully into cache.
// Each missing piece is fetched/discovered asynchronously with a
// C_MDS_RetryOpenRoot that re-enters this function, which then falls
// through the already-satisfied steps and finishes in populate_mydir().
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  // step 1: the root inode itself
  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  // step 2: root's dirfrag (fetch if auth, replicate otherwise)
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  // step 3: this rank's ~mdsN inode
  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  // step 4: strays; populate_mydir() continues the retry chain
  populate_mydir();
}
682
// Rotate stray_index to the next stray dir (skipping one that is being
// fragmented), and opportunistically pick a future stray dir to
// split/merge in advance so fragmentation never blocks the active one.
void MDCache::advance_stray() {
  // check whether the directory has been fragmented
  if (stray_fragmenting_index >= 0) {
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->state_test(CDir::STATE_FRAGMENTING) ||
	  mds->balancer->is_fragment_pending(dir->dirfrag())) {
	any_fragmenting = true;
	break;
      }
    }
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  // advance to the next index that is not currently fragmenting; at most
  // one index can be fragmenting, so this takes at most two iterations
  for (int i = 1; i < NUM_STRAY; i++){
    stray_index = (stray_index + i) % NUM_STRAY;
    if (stray_index != stray_fragmenting_index)
      break;
  }

  if (stray_fragmenting_index == -1 && is_open()) {
    // Fragment later stray dir in advance.  We don't choose past
    // stray dir because in-flight requests may still use it.
    stray_fragmenting_index = (stray_index + 3) % NUM_STRAY;
    auto&& dfs = strays[stray_fragmenting_index]->get_dirfrags();
    bool any_fragmenting = false;
    for (const auto& dir : dfs) {
      if (dir->should_split()) {
	mds->balancer->queue_split(dir, true);
	any_fragmenting = true;
      } else if (dir->should_merge()) {
	mds->balancer->queue_merge(dir);
	any_fragmenting = true;
      }
    }
    // nothing queued: no dir actually needed resizing
    if (!any_fragmenting)
      stray_fragmenting_index = -1;
  }

  dout(10) << "advance_stray to index " << stray_index
	   << " fragmenting index " << stray_fragmenting_index << dendl;
}
727
// Final stage of open_root(): ensure ~mdsN is complete, all strayN
// dentries exist (creating any missing ones) and every stray dirfrag is
// loaded, then declare the cache open.  Re-entrant: each async step
// retries through C_MDS_RetryOpenRoot, which re-enters open_root().
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it.  Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->_get_fnode()->version = mydir->pre_dirty();
    mydir->mark_dirty(ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    CachedStackStringStream css;
    *css << "stray" << i;
    CDentry *straydn = mydir->lookup(css->str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      // missing stray dir: create it and restart this whole pass once
      // the creation is journaled
      _create_system_file(mydir, css->strv(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
			  new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
	dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
	dir->fetch(new C_MDS_RetryOpenRoot(this));
	return;
      }

      if (dir->get_frag_size() > 0)
	num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}
813
// Discover another rank's ~mdsN base inode.  The owner rank is encoded
// in the low bits of the mdsdir ino, so mask with MAX_MDS-1 to find it.
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
818
819 CDir *MDCache::get_stray_dir(CInode *in)
820 {
821 string straydname;
822 in->name_stray_dentry(straydname);
823
824 CInode *strayi = get_stray();
825 ceph_assert(strayi);
826 frag_t fg = strayi->pick_dirfrag(straydname);
827 CDir *straydir = strayi->get_dirfrag(fg);
828 ceph_assert(straydir);
829 return straydir;
830 }
831
832 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
833 {
834 // inode?
835 if (info.ino)
836 return get_inode(info.ino, info.snapid);
837
838 // dir or dentry.
839 CDir *dir = get_dirfrag(info.dirfrag);
840 if (!dir) return 0;
841
842 if (info.dname.length())
843 return dir->lookup(info.dname, info.snapid);
844 else
845 return dir;
846 }
847
848
849 // ====================================================================
850 // consistent hash ring
851
852 /*
853 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
854 */
// Map (ino, frag) deterministically onto a rank in [0, max_mds) using
// Lamping & Veach's Jump Consistent Hash, so that changing max_mds only
// remaps ~1/max_mds of the keys.
mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino, frag_t fg)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  uint64_t hash = rjhash64(ino);
  if (fg)
    hash = rjhash64(hash + rjhash64(fg.value()));

  // jump consistent hash: b tracks the last bucket the key "jumped" to;
  // the loop ends when the next jump would land at or beyond max_mds
  int64_t b = -1, j = 0;
  while (j < max_mds) {
    b = j;
    hash = hash*2862933555777941757ULL + 1;   // LCG step from the paper
    j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
  }
  // verify bounds before returning
  auto result = mds_rank_t(b);
  ceph_assert(result >= 0 && result < max_mds);
  return result;
}
873
874
875 // ====================================================================
876 // subtree management
877
878 /*
879 * adjust the dir_auth of a subtree.
880 * merge with parent and/or child subtrees, if is it appropriate.
881 * merge can ONLY happen if both parent and child have unambiguous auth.
882 */
/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if is it appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir << dendl;

  show_subtrees();

  // find the subtree root that currently contains dir
  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (any of root's bounds whose nearest subtree root is now dir
    //  become dir's bounds instead)
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << " claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    // (dir's popularity no longer counts toward its old subtree)
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.sub(dir->pop_auth_subtree);
	if (p->is_subtree_root()) break;
	p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}
950
951
952 void MDCache::try_subtree_merge(CDir *dir)
953 {
954 dout(7) << "try_subtree_merge " << *dir << dendl;
955 // record my old bounds
956 auto oldbounds = subtrees.at(dir);
957
958 set<CInode*> to_eval;
959 // try merge at my root
960 try_subtree_merge_at(dir, &to_eval);
961
962 // try merge at my old bounds
963 for (auto bound : oldbounds)
964 try_subtree_merge_at(bound, &to_eval);
965
966 if (!(mds->is_any_replay() || mds->is_resolve())) {
967 for(auto in : to_eval)
968 eval_subtree_root(in);
969 }
970 }
971
// Merge the subtree rooted at dir into its parent subtree when their
// auths match; no-op if dir's auth is ambiguous or dir is an export
// bound / aux subtree.  Auth inodes of merged roots are collected into
// to_eval for later lock re-evaluation.
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  // ambiguous auth or pinned subtree boundaries can never merge
  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                              // we have a parent,
      parent->dir_auth == dir->dir_auth) {          // auth matches,
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    // (fold dir's popularity back into every ancestor up to the new
    //  subtree root)
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.add(dir->pop_auth_subtree);
	p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
	if (p->is_subtree_root()) break;
	cur = p;
	p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}
1022
1023 void MDCache::eval_subtree_root(CInode *diri)
1024 {
1025 // evaluate subtree inode filelock?
1026 // (we should scatter the filelock on subtree bounds)
1027 ceph_assert(diri->is_auth());
1028 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1029 }
1030
1031
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  // Make 'dir' a subtree root with authority 'auth', bounded below by the
  // dirfrags in 'bounds'.  Any intervening or stray subtrees between dir and
  // the given bounds have their auth adjusted and are merged away; at the end
  // subtrees[dir] must equal 'bounds' exactly (verified below).
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  // find the subtree that currently contains 'dir'
  CDir *root;
  if (dir->ino() == CEPH_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  // remember the pre-adjustment authority; new bounds discovered below are
  // reset to this so their delegation stays correct
  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (any of root's bounds whose nearest subtree root is now 'dir')
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  // subtree root inodes whose locks may need re-evaluation once we're done
  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      }
      else {
        dout(10) << " want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          ceph_assert(t != dir);
          dout(10) << " new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        // swallow (adjust+merge) any subtree roots between dir and the bound
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << " swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  // any recorded bound that the caller did not list gets swallowed too;
  // iterate over a copy since the merge mutates subtrees[dir]
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << " swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  // don't touch locks during replay/resolve
  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}
1154
1155
/*
 * return a set of CDir*'s that correspond to the given bound set. Only adjust
 * fragmentation as necessary to get an equivalent bounding set. That is, only
 * split if one of our frags spans the provided bounding set. Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert_raw(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    p->second.simplify();
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // build a temporary fragtree in which each requested frag is a leaf,
    // used below to approximate bounds our own fragtree doesn't have
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
        // no leaves under fg in our fragtree: one of our frags spans the
        // requested bound, so split our frag to match
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        frag_vec_t approx_leaves;
        tmpdft.get_leaves_under(approx_fg, approx_leaves);
        for (const auto& leaf : approx_leaves) {
          if (p->second.get().count(leaf) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, leaf);
          }
        }
      }

      // collect whatever dirfrags we now have under fg as the bounds
      auto&& [complete, sibs] = diri->get_dirfrags_under(fg);
      for (const auto& sib : sibs)
        bounds.insert(sib);
    }
  }
}
1204
1205 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1206 {
1207 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1208 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1209
1210 set<CDir*> bounds;
1211 get_force_dirfrag_bound_set(bound_dfs, bounds);
1212 adjust_bounded_subtree_auth(dir, bounds, auth);
1213 }
1214
1215 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1216 {
1217 dout(10) << "map_dirfrag_set " << dfs << dendl;
1218
1219 // group by inode
1220 map<inodeno_t, fragset_t> ino_fragset;
1221 for (const auto &df : dfs) {
1222 ino_fragset[df.ino].insert_raw(df.frag);
1223 }
1224 // get frags
1225 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1226 p != ino_fragset.end();
1227 ++p) {
1228 p->second.simplify();
1229 CInode *in = get_inode(p->first);
1230 if (!in)
1231 continue;
1232
1233 frag_vec_t fgs;
1234 for (const auto& fg : p->second) {
1235 in->dirfragtree.get_leaves_under(fg, fgs);
1236 }
1237
1238 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1239 << " on " << *in << dendl;
1240
1241 for (const auto& fg : fgs) {
1242 CDir *dir = in->get_dirfrag(fg);
1243 if (dir)
1244 result.insert(dir);
1245 }
1246 }
1247 }
1248
1249
1250
1251 CDir *MDCache::get_subtree_root(CDir *dir)
1252 {
1253 // find the underlying dir that delegates (or is about to delegate) auth
1254 while (true) {
1255 if (dir->is_subtree_root())
1256 return dir;
1257 dir = dir->get_inode()->get_parent_dir();
1258 if (!dir)
1259 return 0; // none
1260 }
1261 }
1262
1263 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1264 {
1265 // find the underlying dir that delegates (or is about to delegate) auth
1266 while (true) {
1267 if (dir->is_subtree_root())
1268 return dir;
1269 dir = dir->get_inode()->get_projected_parent_dir();
1270 if (!dir)
1271 return 0; // none
1272 }
1273 }
1274
1275 void MDCache::remove_subtree(CDir *dir)
1276 {
1277 dout(10) << "remove_subtree " << *dir << dendl;
1278 auto it = subtrees.find(dir);
1279 ceph_assert(it != subtrees.end());
1280 subtrees.erase(it);
1281 dir->put(CDir::PIN_SUBTREE);
1282 if (dir->get_parent_dir()) {
1283 CDir *p = get_subtree_root(dir->get_parent_dir());
1284 auto it = subtrees.find(p);
1285 ceph_assert(it != subtrees.end());
1286 auto count = it->second.erase(dir);
1287 ceph_assert(count == 1);
1288 }
1289 }
1290
1291 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1292 {
1293 ceph_assert(subtrees.count(dir));
1294 bounds = subtrees[dir];
1295 }
1296
1297 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1298 {
1299 if (subtrees.count(dir)) {
1300 // just copy them, dir is a subtree.
1301 get_subtree_bounds(dir, bounds);
1302 } else {
1303 // find them
1304 CDir *root = get_subtree_root(dir);
1305 for (set<CDir*>::iterator p = subtrees[root].begin();
1306 p != subtrees[root].end();
1307 ++p) {
1308 CDir *t = *p;
1309 while (t != root) {
1310 t = t->get_parent_dir();
1311 ceph_assert(t);
1312 if (t == dir) {
1313 bounds.insert(*p);
1314 continue;
1315 }
1316 }
1317 }
1318 }
1319 }
1320
1321 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1322 {
1323 // for debugging only.
1324 ceph_assert(subtrees.count(dir));
1325 if (bounds != subtrees[dir]) {
1326 dout(0) << "verify_subtree_bounds failed" << dendl;
1327 set<CDir*> b = bounds;
1328 for (auto &cd : subtrees[dir]) {
1329 if (bounds.count(cd)) {
1330 b.erase(cd);
1331 continue;
1332 }
1333 dout(0) << " missing bound " << *cd << dendl;
1334 }
1335 for (const auto &cd : b)
1336 dout(0) << " extra bound " << *cd << dendl;
1337 }
1338 ceph_assert(bounds == subtrees[dir]);
1339 }
1340
1341 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1342 {
1343 // for debugging only.
1344 ceph_assert(subtrees.count(dir));
1345
1346 // make sure that any bounds i do have are properly noted as such.
1347 int failed = 0;
1348 for (const auto &fg : bounds) {
1349 CDir *bd = get_dirfrag(fg);
1350 if (!bd) continue;
1351 if (subtrees[dir].count(bd) == 0) {
1352 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1353 failed++;
1354 }
1355 }
1356 ceph_assert(failed == 0);
1357 }
1358
1359 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1360 {
1361 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1362 << " to " << *newdir << dendl;
1363 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1364 }
1365
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  // Fix up the subtree map after 'diri' has been renamed out of 'olddir'
  // into its current parent dirfrag.  When 'pop' is set, consume the
  // matching pending entry queued by project_subtree_rename().
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    // the front entry must match this exact (olddir -> newdir) move
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    ceph_assert(p != projected_subtree_renames.end());
    ceph_assert(!p->second.empty());
    ceph_assert(p->second.front().first == olddir);
    ceph_assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust total auth pin of freezing subtree
  if (olddir != newdir) {
    auto&& dfls = diri->get_nested_dirfrags();
    for (const auto& dir : dfls)
      olddir->adjust_freeze_after_rename(dir);
  }

  // adjust subtree
  // N.B. make sure subtree dirfrags are at the front of the list
  auto dfls = diri->get_subtree_dirfrags();
  diri->get_nested_dirfrags(dfls);
  for (const auto& dir : dfls) {
    dout(10) << "dirfrag " << *dir << dendl;
    // NOTE: roots are recomputed per iteration since earlier iterations
    // may have changed the subtree structure
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    auto& oldbounds = subtrees[oldparent];
    auto& newbounds = subtrees[newparent];

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, false);

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine. change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      {
        auto n = oldbounds.erase(dir);
        ceph_assert(n == 1);
      }
      newbounds.insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      // (collect first, then move, to avoid mutating oldbounds mid-iteration)
      std::vector<CDir*> tomove;
      for (const auto& bound : oldbounds) {
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          ceph_assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (const auto& bound : tomove) {
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        oldbounds.erase(bound);
        newbounds.insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority(), false);
        // caller is responsible for 'eval diri'
        try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
  }

  show_subtrees();
}
1451
1452 // ===================================
1453 // journal and snap/cow helpers
1454
1455
1456 /*
1457 * find first inode in cache that follows given snapid. otherwise, return current.
1458 */
1459 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1460 {
1461 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1462 ceph_assert(in->last == CEPH_NOSNAP);
1463
1464 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1465 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1466 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1467 in = p->second;
1468 }
1469
1470 return in;
1471 }
1472
1473
/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items. instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  // Clone 'in' into a new past-version inode covering [in->first, last],
  // add it to the cache, and advance in->first past 'last'.  Also transfers
  // or sets up snapflush tracking (client_snap_caps + SNAP_SYNC lock state)
  // as needed.  Returns the new old-version inode.
  ceph_assert(last >= in->first);

  // build the clone from the previous projected inode/xattr state
  CInode *oldin = new CInode(this, true, in->first, last);
  auto _inode = CInode::allocate_inode(*in->get_previous_projected_inode());
  _inode->trim_client_ranges(last);
  oldin->reset_inode(std::move(_inode));
  auto _xattrs = in->get_previous_projected_xattrs();
  oldin->reset_xattrs(std::move(_xattrs));

  oldin->symlink = in->symlink;

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  // the live inode now begins after the cloned range
  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    // 'in' is itself a snapped (non-head) inode: split the head inode's
    // need-snapflush tracking between oldin and in
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      // oldin still needs snapflushes: mirror in's snap caps and put the
      // relevant locks into the SNAP_SYNC gathering state
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          if (lock->get_state() != LOCK_SNAP_SYNC) {
            ceph_assert(lock->is_stable());
            lock->set_state(LOCK_SNAP_SYNC);  // gathering
            oldin->auth_pin(lock);
          }
          lock->get_wrlock(true);
        }
      }
    }
    if (!ret.second) {
      // 'in' no longer needs snapflushes: release its wrlocks and wake
      // any waiters once each lock fully drains
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
        MDSContext::vec finished;
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
          lock->put_wrlock();
          if (!lock->get_num_wrlocks()) {
            lock->set_state(LOCK_SYNC);
            lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
            in->auth_unpin(lock);
          }
        }
        mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  // head inode: decide which client caps need snapflushes for the
  // intervening snapshots covered by the clone
  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
          cap->client_follows < last) {
        dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
        oldin->client_snap_caps.insert(client);
        cap->client_follows = last;

        // we need snapflushes for any intervening snaps
        dout(10) << "  snaps " << snaps << dendl;
        for (auto q = snaps.lower_bound(oldin->first);
             q != snaps.end() && *q <= last;
             ++q) {
          in->add_need_snapflush(oldin, *q, client);
        }
      } else {
        dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      // put oldin's locks into the SNAP_SYNC gathering state while we
      // wait for the snapflushes
      for (int i = 0; i < num_cinode_locks; i++) {
        SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
        ceph_assert(lock);
        if (lock->get_state() != LOCK_SNAP_SYNC) {
          ceph_assert(lock->is_stable());
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          oldin->auth_pin(lock);
        }
        lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}
1587
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  // Copy-on-write 'dn' (and, for a primary link, its inode) for snapshots
  // up to 'follows', journaling the resulting old dentry/inode into
  // 'metablob' and registering them on 'mut' for later apply.
  // If pcow_inode is non-null it receives the cloned inode (primary case).
  // If dnl is null the projected linkage is used.
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    // mid-migration inode: cow the head version rather than the projection
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      // 'dn' is a remote link to a multiversion inode; cow the dentry
      // against the *directory's* snaprealm, then the inode if needed
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDir *dir = dn->dir;
          CDentry *olddn = dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(), dn->alternate_name, oldfirst, dir_follows);
          dout(10) << " olddn " << *olddn << dendl;
          ceph_assert(dir->is_projected());
          olddn->set_projected_version(dir->get_projected_version());
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here?  hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, cow_head);
        }
      }

      follows = dir_follows;
      if (in->snaprealm) {
        realm = in->snaprealm;
        ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
        follows = get_global_snaprealm()->get_newest_seq();
        ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      // no snapshot actually covers the range; just advance first
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    // non-multiversion: cow the dentry itself (and a primary inode, if any)
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      // no snapshot covers the range; just advance first on dn (and inode)
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << "    dn " << *dn << dendl;
    CDir *dir = dn->get_dir();
    ceph_assert(dir->is_projected());

    if (in) {
      // primary link: clone the inode and add an old primary dentry
      CInode *oldin = cow_inode(in, follows);
      ceph_assert(in->is_projected());
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dir->add_primary_dentry(dn->get_name(), oldin, dn->alternate_name, oldfirst, follows);
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
        // track the old inode until its snapflushes arrive
        mut->ls->open_files.push_back(&oldin->item_open_file);
        mds->locker->mark_need_snapflush_inode(oldin);
      }
      olddn->set_projected_version(dir->get_projected_version());
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      // remote link: just add an old remote dentry
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(), dn->alternate_name, oldfirst, follows);
      dout(10) << " olddn " << *olddn << dendl;

      olddn->set_projected_version(dir->get_projected_version());
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}
1720
void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  // Journal the dirty (projected) state of 'in' into 'metablob',
  // copy-on-writing the parent dentry first where snapshots require it.
  if (in->is_base()) {
    metablob->add_root(true, in);
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows = in->first - 1;  // non-head inode: follow everything before its range
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      // backtrace changed; also record whether the data pool changed
      // relative to the previous projected inode
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
                        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}
1740
1741
1742
1743 // nested ---------------------------------------------------------------
1744
void MDCache::project_rstat_inode_to_frag(const MutationRef& mut,
                                          CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  // Propagate cur's (projected) rstat delta into its parent dirfrag's
  // projected fnode, over the snapid interval [first, cur->last].
  // linkunlink: 0 = plain update, >0 = being linked, <0 = being unlinked.
  CDentry *parentdn = cur->get_projected_parent_dn();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME.  this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename).  but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    // non-head inode: skip entirely if no snapshot falls in its range
    ceph_assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the peer rename prep case. so we should
      // avoid updateing the inode.
      ceph_assert(linkunlink < 0);
      ceph_assert(cur->is_frozen_inode());
      update = false;
    }
    // hacky
    // pick which inode version supplies the rstat delta: the mutation's
    // projection if it has one, otherwise the current projected inode
    const CInode::mempool_inode *pi;
    if (update && mut->is_projected(cur)) {
      pi = cur->_get_projected_inode();
    } else {
      pi = cur->get_projected_inode().get();
      if (update) {
        // new inode
        ceph_assert(pi->rstat == pi->accounted_rstat);
        update = false;
      }
    }
    _project_rstat_inode_to_frag(pi, std::max(first, floor), cur->last, parent,
                                 linkunlink, update);
  }

  if (g_conf()->mds_snap_rstat) {
    // also propagate any dirty old (snapped) rstats within the floor
    for (const auto &p : cur->dirty_old_rstats) {
      const auto &old = cur->get_old_inodes()->at(p);
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
        continue;  // no snapshot in this old interval
      if (p >= floor)
        _project_rstat_inode_to_frag(&old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}
1816
1817
void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
                                           CDir *parent, int linkunlink, bool update_inode)
{
  // Apply the inode's unaccounted rstat delta onto the parent dirfrag's
  // fnode (and its dirty_old_rstat history) over the snapid interval
  // [ofirst, last], splitting intervals as needed.  If update_inode is
  // set, mark the inode's rstat fully accounted afterwards.
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  inode           rstat " << inode->rstat << dendl;
  dout(20) << "  inode accounted_rstat " << inode->accounted_rstat << dendl;
  // delta depends on link/unlink: a link contributes the full rstat, an
  // unlink retracts the accounted portion, a plain update is the difference
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode->rstat);
    delta.sub(inode->accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode->accounted_rstat);
  } else {
    delta.add(inode->rstat);
  }
  dout(20) << "                  delta " << delta << dendl;


  while (last >= ofirst) {
    /*
     * pick fnode version to update.  at each iteration, we want to
     * pick a segment ending in 'last' to update.  split as necessary
     * to make that work.  then, adjust first up so that we only
     * update one segment at a time.  then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    auto pf = parent->_get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      // head segment: update the live fnode rstat
      if (g_conf()->mds_snap_rstat)
        first = std::max(ofirst, parent->first);
      else
        first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
          !(pf->rstat == pf->accounted_rstat)) {
        dout(10) << "  target snapped and not fully accounted, cow to dirty_old_rstat ["
                 << parent->first << "," << (first-1) << "] "
                 << " " << *prstat << "/" << pf->accounted_rstat
                 << dendl;
        parent->dirty_old_rstat[first-1].first = parent->first;
        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf()->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      // snapped segment overlapping the head interval: split off a
      // dirty_old_rstat entry covering [parent->first, last]
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
        dout(20) << "  no dirty_old_rstat with last >= last " << last << dendl;
        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
          dout(20) << "  last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
          first = parent->dirty_old_rstat.rbegin()->first+1;
        }
      } else {
        // *it last is >= last
        if (it->second.first <= last) {
          // *it intersects [first,last]
          if (it->second.first < first) {
            dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
            parent->dirty_old_rstat[first-1] = it->second;
            it->second.first = first;
          }
          if (it->second.first > first)
            first = it->second.first;
          if (last < it->first) {
            dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
            parent->dirty_old_rstat[last] = it->second;
            it->second.first = last+1;
          }
        } else {
          // *it is to the _right_ of [first,last]
          it = parent->dirty_old_rstat.lower_bound(first);
          // new *it last is >= first
          if (it->second.first <= last &&  // new *it isn't also to the right, and
              it->first >= first) {        // it intersects our first bit,
            dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
            first = it->first+1;
          }
          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
        }
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << "  project to [" << first << "," << last << "] " << *prstat << dendl;
    ceph_assert(last >= first);
    prstat->add(delta);
    dout(20) << "      result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    // continue with the remaining, earlier part of the interval
    last = first-1;
  }

  if (update_inode) {
    // the delta is now fully reflected in the frag; mark it accounted
    auto _inode = const_cast<CInode::mempool_inode*>(inode);
    _inode->accounted_rstat = _inode->rstat;
  }
}
1937
void MDCache::project_rstat_frag_to_inode(const nest_info_t& rstat,
                                          const nest_info_t& accounted_rstat,
                                          snapid_t ofirst, snapid_t last,
                                          CInode *pin, bool cow_head)
{
  // Apply a dirfrag's unaccounted rstat delta up into its parent inode
  // 'pin' over the snapid interval [ofirst, last], copy-on-writing and
  // splitting old_inode entries as needed to cover the interval.
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  frag           rstat " << rstat << dendl;
  dout(20) << "  frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << "                 delta " << delta << dendl;

  // working copy of pin's old_inodes; swapped back in at the end
  CInode::old_inode_map_ptr _old_inodes;
  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      // head segment: update the projected inode, cloning an old_inode
      // for [pin->first, first-1] if the interval doesn't reach back
      pi = pin->_get_projected_inode();
      first = std::max(ofirst, pin->first);
      if (first > pin->first) {
        auto& old = pin->cow_old_inode(first-1, cow_head);
        dout(20) << "   cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (!_old_inodes) {
        _old_inodes = CInode::allocate_old_inode_map();
        if (pin->is_any_old_inodes())
          *_old_inodes = *pin->get_old_inodes();
      }
      if (last >= pin->first) {
        // snapped segment overlapping the head interval: cow it off
        first = pin->first;
        pin->cow_old_inode(last, cow_head);
      } else {
        // our life is easier here because old_inodes is not sparse
        // (although it may not begin at snapid 1)
        auto it = _old_inodes->lower_bound(last);
        if (it == _old_inodes->end()) {
          dout(10) << " no old_inode <= " << last << ", done." << dendl;
          break;
        }
        first = it->second.first;
        if (first > last) {
          dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
          //assert(p == pin->old_inodes.begin());
          break;
        }
        if (it->first > last) {
          // split so the entry we update ends exactly at 'last'
          dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
                   << (last+1) << "," << it->first << "]" << dendl;
          (*_old_inodes)[last] = it->second;
          it->second.first = last+1;
          pin->dirty_old_rstats.insert(it->first);
        }
      }
      if (first < ofirst) {
        // split so the entry we update starts exactly at 'ofirst'
        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
                 << first << "," << ofirst-1 << "]" << dendl;
        (*_old_inodes)[ofirst-1] = (*_old_inodes)[last];
        pin->dirty_old_rstats.insert(ofirst-1);
        (*_old_inodes)[last].first = first = ofirst;
      }
      pi = &(*_old_inodes)[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << "        result [" << first << "," << last << "] " << pi->rstat << dendl;

    // continue with the remaining, earlier part of the interval
    last = first-1;
  }
  if (_old_inodes)
    pin->reset_old_inodes(std::move(_old_inodes));
}
2011
/*
 * Send updated quota/rstat information (MClientQuota) to clients holding
 * caps on 'in', and ask replica MDSs to gather their caps back to us.
 *
 * Updates are throttled: a client is only notified when its usage has
 * moved meaningfully relative to the configured limits since the last
 * message we sent it (tracked in cap->last_rsize / last_rbytes).
 *
 * @param in           inode whose quota/rstat changed (must be auth here)
 * @param exclude_ct   if >= 0, force updates to every client EXCEPT this one
 * @param quota_change true if the quota itself changed (send even if disabled)
 */
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
  // only broadcast while we are serving clients
  if (!(mds->is_active() || mds->is_stopping()))
    return;

  // non-auth or frozen inodes don't broadcast
  if (!in->is_auth() || in->is_frozen())
    return;

  const auto& pi = in->get_projected_inode();
  if (!pi->quota.is_enable() && !quota_change)
    return;

  // create snaprealm for quota inode (quota was set before mimic)
  if (!in->get_projected_srnode())
    mds->server->create_quota_realm(in);

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    if (cap->is_noquota())
      continue;

    // excluding one client forces an update to all the others
    if (exclude_ct >= 0 && exclude_ct != p.first)
      goto update;

    // nothing changed since the last message we sent this client
    if (cap->last_rbytes == pi->rstat.rbytes &&
	cap->last_rsize == pi->rstat.rsize())
      continue;

    if (pi->quota.max_files > 0) {
      // at or over the file limit: always update
      if (pi->rstat.rsize() >= pi->quota.max_files)
	goto update;

      // usage moved more than 1/16 of the last-reported distance to the
      // limit: worth telling the client
      if ((abs(cap->last_rsize - pi->quota.max_files) >> 4) <
	  abs(cap->last_rsize - pi->rstat.rsize()))
	goto update;
    }

    if (pi->quota.max_bytes > 0) {
      // within 1/8 of the byte limit: always update
      if (pi->rstat.rbytes > pi->quota.max_bytes - (pi->quota.max_bytes >> 3))
	goto update;

      // same 1/16-of-remaining-distance heuristic as for files
      if ((abs(cap->last_rbytes - pi->quota.max_bytes) >> 4) <
	  abs(cap->last_rbytes - pi->rstat.rbytes))
	goto update;
    }

    continue;

update:
    // record what we are about to report, then send it
    cap->last_rsize = pi->rstat.rsize();
    cap->last_rbytes = pi->rstat.rbytes;

    auto msg = make_message<MClientQuota>();
    msg->ino = in->ino();
    msg->rstat = pi->rstat;
    msg->quota = pi->quota;
    mds->send_message_client_counted(msg, cap->get_session());
  }
  // tell replicas to gather their client caps back to us
  for (const auto &it : in->get_replicas()) {
    auto msg = make_message<MGatherCaps>();
    msg->ino = in->ino();
    mds->send_message_mds(msg, it.first);
  }
}
2076
2077 /*
2078 * NOTE: we _have_ to delay the scatter if we are called during a
2079 * rejoin, because we can't twiddle locks between when the
2080 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2082 * (no requests), and a survivor acks immediately. _except_ that
2083 * during rejoin_(weak|strong) processing, we may complete a lock
2084 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2085 * scatterlock state in that case or the lock states will get out of
2086 * sync between the auth and replica.
2087 *
2088 * the simple solution is to never do the scatter here. instead, put
2089 * the scatterlock on a list if it isn't already wrlockable. this is
2090 * probably the best plan anyway, since we avoid too many
2091 * scatters/locks under normal usage.
2092 */
2093 /*
2094 * some notes on dirlock/nestlock scatterlock semantics:
2095 *
2096 * the fragstat (dirlock) will never be updated without
2097 * dirlock+nestlock wrlock held by the caller.
2098 *
2099 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2100 * data is pushed up the tree. this could be changed with some
2101 * restructuring here, but in its current form we ensure that the
 * fragstat+rstat _always_ reflect an accurate summation over the dir
2103 * frag, which is nice. and, we only need to track frags that need to
2104 * be nudged (and not inodes with pending rstat changes that need to
2105 * be pushed into the frag). a consequence of this is that the
2106 * accounted_rstat on scatterlock sync may not match our current
2107 * rstat. this is normal and expected.
2108 */
/*
 * Project and journal the fragstat/rstat consequences of a change to 'in'
 * up its ancestor chain.
 *
 * For each ancestor dirfrag we project the fnode (fragstat/rstat), then
 * project the dirfrag's rstat delta into the parent inode, continuing up
 * the tree until we hit a base inode, a non-auth/unpinnable inode, an
 * unwritable nestlock, or the dirstat-propagation throttle.  Everything
 * touched is added to 'blob' for journaling; ancestors are left projected
 * and wrlocked on 'mut' (see the long comments above about why we must
 * never initiate scatterlock twiddling here).
 *
 * @param mut        mutation carrying locks/pins/projections
 * @param blob       metablob to journal dirty dirs/inodes into
 * @param in         the inode that changed
 * @param parent     dirfrag to start from (nullptr: use projected parent)
 * @param flags      PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW
 * @param linkunlink +1/-1 when a dentry was linked/unlinked, else 0
 * @param cfollows   snapid the change follows (CEPH_NOSNAP: realm's newest)
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
				       CInode *in, CDir *parent,
				       int flags, int linkunlink,
				       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  ceph_assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  // base inodes (root, mdsdirs) have no parent to predirty
  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
	   << (do_parent_mtime ? " do_parent_mtime":"")
	   << " linkunlink=" << linkunlink
	   << (primary_dn ? " primary_dn":" remote_dn")
	   << (shallow ? " SHALLOW":"")
	   << " follows " << cfollows
	   << " " << *in << dendl;

  if (!parent) {
    ceph_assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  // walk up the ancestor chain, one dirfrag+inode pair per iteration
  while (parent) {
    //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
    ceph_assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);

    auto pf = parent->project_fnode(mut);
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      // fragstat changes require both scatterlocks wrlocked by the caller
      ceph_assert(mut->is_wrlocked(&pin->filelock));
      ceph_assert(mut->is_wrlocked(&pin->nestlock));
      ceph_assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
	pf->fragstat.mtime = mut->get_op_stamp();
	pf->fragstat.change_attr++;
	dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
	if (pf->fragstat.mtime > pf->rstat.rctime) {
	  dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
	  pf->rstat.rctime = pf->fragstat.mtime;
	} else {
	  // mtime moved backwards relative to rctime; leave rctime alone
	  dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
	}
      }
      if (linkunlink) {
	dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
	if (in->is_dir()) {
	  pf->fragstat.nsubdirs += linkunlink;
	  //pf->rstat.rsubdirs += linkunlink;
	} else {
	  pf->fragstat.nfiles += linkunlink;
	  //pf->rstat.rfiles += linkunlink;
	}
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
				pin->versionlock.can_wrlock())) {
      // can't take the locks we'd need; defer by marking the inode's
      // rstat dirty so a later pass pushes it into the frag
      dout(20) << " unwritable parent nestlock " << pin->nestlock
	       << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
	ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_peer());
      }

      if (!mut->is_wrlocked(&pin->nestlock)) {
	dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
	mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
	follows = prealm->get_newest_seq();

      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(mut, cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    if (!stop && !first &&
	g_conf()->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop
		 << " < " << g_conf()->mds_dirstat_min_interval
		 << ", stopping" << dendl;
	stop = true;
      } else {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    if (!stop &&
	!mut->is_wrlocked(&pin->nestlock) &&
	(!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
	 !mds->locker->wrlock_try(&pin->nestlock, mut)
	 )) { // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
	       << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      // can't continue upward: flag the scatterlocks so the pending
      // deltas get nudged into the parent later
      dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
	mds->locker->mark_updated_scatterlock(&pin->filelock);
	mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
	mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->is_wrlocked(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_peer());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    lsi.push_front(pin);

    pin->pre_cow_old_inode(); // avoid cow mayhem!

    auto pi = pin->project_inode(mut);
    pi.inode->version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi.inode->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
	pi.inode->mtime = pi.inode->ctime = pi.inode->dirstat.mtime;
      if (touched_chattr)
	pi.inode->change_attr = pi.inode->dirstat.change_attr;
      dout(20) << "predirty_journal_parents gives " << pi.inode->dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
	if (pi.inode->dirstat.size() < 0)
	  ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
	if (pi.inode->dirstat.size() != pf->fragstat.size()) {
	  mds->clog->error() << "unmatched fragstat size on single dirfrag "
	     << parent->dirfrag() << ", inode has " << pi.inode->dirstat
	     << ", dirfrag has " << pf->fragstat;

	  // trust the dirfrag for now
	  pi.inode->dirstat = pf->fragstat;

	  ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
	}
      }
    }

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf()->mds_snap_rstat) {
      // propagate per-snapshot rstats too
      for (auto &p : parent->dirty_old_rstat) {
	project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
				    p.first, pin, true);
      }
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi.inode->rstat.rbytes != pf->rstat.rbytes) {
	mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
	  << parent->dirfrag() << ", inode has " << pi.inode->rstat
	  << ", dirfrag has " << pf->rstat;

	// trust the dirfrag for now
	pi.inode->rstat = pf->rstat;

	ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    if (pin->is_base())
      break;
    // next parent!
    cur = pin;
    parentdn = pin->get_projected_parent_dn();
    ceph_assert(parentdn);
    parent = parentdn->get_dir();
    // only the first (bottom) level carries the link/mtime change
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  ceph_assert(parent);
  ceph_assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (const auto& in : lsi) {
    journal_dirty_inode(mut.get(), blob, in);
  }

}
2376
2377
2378
2379
2380
2381 // ===================================
2382 // peer requests
2383
2384
2385 /*
2386 * some handlers for leader requests with peers. we need to make
2387 * sure leader journal commits before we forget we leadered them and
2388 * remove them from the uncommitted_leaders map (used during recovery
2389 * to commit|abort peers).
2390 */
/*
 * Log context: fires once the ECommitted event for a leadered request is
 * durable, letting MDCache drop it from uncommitted_leaders.
 */
struct C_MDC_CommittedLeader : public MDCacheLogContext {
  metareqid_t reqid;  // the leadered request being committed
  C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_leader_commit(reqid);
  }
};
2398
/*
 * Journal an ECommitted event for a leadered request.  The entry stays in
 * uncommitted_leaders (with committing set) until the event is durable; the
 * C_MDC_CommittedLeader callback then runs _logged_leader_commit().
 */
void MDCache::log_leader_commit(metareqid_t reqid)
{
  dout(10) << "log_leader_commit " << reqid << dendl;
  uncommitted_leaders[reqid].committing = true;
  mds->mdlog->start_submit_entry(new ECommitted(reqid),
				 new C_MDC_CommittedLeader(this, reqid));
}
2406
2407 void MDCache::_logged_leader_commit(metareqid_t reqid)
2408 {
2409 dout(10) << "_logged_leader_commit " << reqid << dendl;
2410 ceph_assert(uncommitted_leaders.count(reqid));
2411 uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2412 mds->queue_waiters(uncommitted_leaders[reqid].waiters);
2413 uncommitted_leaders.erase(reqid);
2414 }
2415
2416 // while active...
2417
2418 void MDCache::committed_leader_peer(metareqid_t r, mds_rank_t from)
2419 {
2420 dout(10) << "committed_leader_peer mds." << from << " on " << r << dendl;
2421 ceph_assert(uncommitted_leaders.count(r));
2422 uncommitted_leaders[r].peers.erase(from);
2423 if (!uncommitted_leaders[r].recovering && uncommitted_leaders[r].peers.empty())
2424 log_leader_commit(r);
2425 }
2426
2427 void MDCache::logged_leader_update(metareqid_t reqid)
2428 {
2429 dout(10) << "logged_leader_update " << reqid << dendl;
2430 ceph_assert(uncommitted_leaders.count(reqid));
2431 uncommitted_leaders[reqid].safe = true;
2432 auto p = pending_leaders.find(reqid);
2433 if (p != pending_leaders.end()) {
2434 pending_leaders.erase(p);
2435 if (pending_leaders.empty())
2436 process_delayed_resolve();
2437 }
2438 }
2439
2440 /*
2441 * Leader may crash after receiving all peers' commit acks, but before journalling
2442 * the final commit. Peers may crash after journalling the peer commit, but before
2443 * sending commit ack to the leader. Commit leaders with no uncommitted peer when
2444 * resolve finishes.
2445 */
2446 void MDCache::finish_committed_leaders()
2447 {
2448 for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
2449 p != uncommitted_leaders.end();
2450 ++p) {
2451 p->second.recovering = false;
2452 if (!p->second.committing && p->second.peers.empty()) {
2453 dout(10) << "finish_committed_leaders " << p->first << dendl;
2454 log_leader_commit(p->first);
2455 }
2456 }
2457 }
2458
2459 /*
2460 * at end of resolve... we must journal a commit|abort for all peer
2461 * updates, before moving on.
2462 *
2463 * this is so that the leader can safely journal ECommitted on ops it
2464 * leaders when it reaches up:active (all other recovering nodes must
2465 * complete resolve before that happens).
2466 */
/*
 * Log context: fires once our peer-side commit is durable, so we can ack
 * OP_COMMITTED back to the leader.
 */
struct C_MDC_PeerCommit : public MDCacheLogContext {
  mds_rank_t from;     // leader mds to ack
  metareqid_t reqid;   // request that was committed
  C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_peer_commit(from, reqid);
  }
};
2475
/*
 * Our peer-side commit for 'reqid' is durable in the journal; tell the
 * leader with an OP_COMMITTED peer request.
 */
void MDCache::_logged_peer_commit(mds_rank_t from, metareqid_t reqid)
{
  dout(10) << "_logged_peer_commit from mds." << from << " " << reqid << dendl;

  // send a message
  auto req = make_message<MMDSPeerRequest>(reqid, 0, MMDSPeerRequest::OP_COMMITTED);
  mds->send_message_mds(req, from);
}
2484
2485
2486
2487
2488
2489
2490 // ====================================================================
2491 // import map, recovery
2492
2493 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2494 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2495 {
2496 if (subtrees.count(oldparent)) {
2497 vector<dirfrag_t>& v = subtrees[oldparent];
2498 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2499 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2500 if (*it == df) {
2501 v.erase(it);
2502 break;
2503 }
2504 }
2505 if (subtrees.count(newparent)) {
2506 vector<dirfrag_t>& v = subtrees[newparent];
2507 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2508 v.push_back(df);
2509 }
2510 }
2511
/*
 * Build an ESubtreeMap journal event describing our current auth subtrees
 * (and their bounds), adjusted for any projected (not-yet-journaled)
 * renames, and simplified so replay sees the same map a fresh MDS would.
 *
 * @return newly allocated ESubtreeMap, already attached to the mdlog entry.
 */
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
	   << num_subtrees_fullauth() << " fullauth"
	   << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  // all dirfrags whose metadata must be journaled with the map
  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (auto& [dir, bounds] : subtrees) {
    // journal subtree as "ours" if we are
    // me, -2
    // me, me
    // me, !me (may be importing and ambiguous!)

    // so not
    // !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
	my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " auth subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds
    size_t nbounds = bounds.size();
    if (nbounds > 3) {
      // summarize instead of logging each bound
      dout(15) << " subtree has " << nbounds << " bounds" << dendl;
    }
    for (auto& bound : bounds) {
      if (nbounds <= 3) {
	dout(15) << " subtree bound " << *bound << dendl;
      }
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (const auto& [diri, renames] : projected_subtree_renames) {
    for (const auto& [olddir, newdir] : renames) {
      dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      auto&& dfls = diri->get_dirfrags();
      for (const auto& dir : dfls) {
	dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
	CDir *oldparent = get_projected_subtree_root(olddir);
	dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
	CDir *newparent = get_projected_subtree_root(newdir);
	dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

	if (oldparent == newparent) {
	  dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
		   << oldparent->dirfrag() << dendl;
	  continue;
	}

	if (dir->is_subtree_root()) {
	  if (le->subtrees.count(newparent->dirfrag()) &&
	      oldparent->get_dir_auth() != newparent->get_dir_auth())
	    dirs_to_add[dir->dirfrag()] = dir;
	  // children are fine. change parent.
	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				  le->subtrees);
	} else {
	  // mid-subtree.

	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
	    // rename crosses an auth boundary: dir becomes a subtree root
	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
	    // if oldparent is auth, subtree is mine; include it.
	    if (le->subtrees.count(oldparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[dir->dirfrag()].clear();
	    }
	    // if newparent is auth, subtree is a new bound
	    if (le->subtrees.count(newparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
	    }
	    newparent = dir;
	  }

	  // see if any old bounds move to the new parent.
	  for (auto& bound : subtrees.at(oldparent)) {
	    if (dir->contains(bound->get_parent_dir()))
	      _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				      le->subtrees);
	  }
	}
      }
    }
  }

  // simplify the journaled map. our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing. but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (auto& [frag, bfrags] : le->subtrees) {
    if (le->ambiguous_subtrees.count(frag))
      continue;
    unsigned i = 0;
    while (i < bfrags.size()) {
      dirfrag_t b = bfrags[i];
      if (le->subtrees.count(b) &&
	  le->ambiguous_subtrees.count(b) == 0) {
	// b is an unambiguous subtree of ours: merge its bounds into
	// frag's and drop b as a separate subtree
	auto& bb = le->subtrees.at(b);
	dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
	for (auto& r : bb) {
	  bfrags.push_back(r);
	}
	dirs_to_add.erase(b);
	le->subtrees.erase(b);
	bfrags.erase(bfrags.begin() + i);
      } else {
	++i;
      }
    }
  }

  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
2662
2663 void MDCache::dump_resolve_status(Formatter *f) const
2664 {
2665 f->open_object_section("resolve_status");
2666 f->dump_stream("resolve_gather") << resolve_gather;
2667 f->dump_stream("resolve_ack_gather") << resolve_gather;
2668 f->close_section();
2669 }
2670
/*
 * Enter the resolve stage of recovery: record the completion callback,
 * drop our claim on the root subtree if we are not the root mds, and set
 * up the gather sets for incoming resolves and snap table commits.
 * NOTE(review): assumes 'root' is non-null whenever get_root() != us during
 * resolve — confirm against rejoin/replay ordering.
 */
void MDCache::resolve_start(MDSContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  ceph_assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN. during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // unknown.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  // wait for resolves from every other recovering rank
  resolve_gather = recovery_set;

  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
2690
/*
 * Send resolve messages: peer (uncommitted request) resolves immediately,
 * then subtree resolves once we are ready — a survivor first syncs its
 * snap cache, and everyone must first finish resolve acks and rollbacks.
 */
void MDCache::send_resolves()
{
  send_peer_resolves();

  if (!resolve_done) {
    // I'm survivor: refresh snap cache
    mds->snapclient->sync(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    maybe_finish_peer_resolve();
	    })
	  )
	);
    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
    return;
  }
  if (!resolve_ack_gather.empty()) {
    // peer resolves we sent are not all acked yet
    dout(10) << "send_resolves still waiting for resolve ack from ("
	     << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!resolve_need_rollback.empty()) {
    // rollbacks of aborted peer updates still journaling
    dout(10) << "send_resolves still waiting for rollback to commit on ("
	     << resolve_need_rollback << ")" << dendl;
    return;
  }

  send_subtree_resolves();
}
2720
2721 void MDCache::send_peer_resolves()
2722 {
2723 dout(10) << "send_peer_resolves" << dendl;
2724
2725 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2726
2727 if (mds->is_resolve()) {
2728 for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
2729 p != uncommitted_peers.end();
2730 ++p) {
2731 mds_rank_t leader = p->second.leader;
2732 auto &m = resolves[leader];
2733 if (!m) m = make_message<MMDSResolve>();
2734 m->add_peer_request(p->first, false);
2735 }
2736 } else {
2737 set<mds_rank_t> resolve_set;
2738 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2739 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2740 p != active_requests.end();
2741 ++p) {
2742 MDRequestRef& mdr = p->second;
2743 if (!mdr->is_peer())
2744 continue;
2745 if (!mdr->peer_did_prepare() && !mdr->committing) {
2746 continue;
2747 }
2748 mds_rank_t leader = mdr->peer_to_mds;
2749 if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) {
2750 dout(10) << " including uncommitted " << *mdr << dendl;
2751 if (!resolves.count(leader))
2752 resolves[leader] = make_message<MMDSResolve>();
2753 if (!mdr->committing &&
2754 mdr->has_more() && mdr->more()->is_inode_exporter) {
2755 // re-send cap exports
2756 CInode *in = mdr->more()->rename_inode;
2757 map<client_t, Capability::Export> cap_map;
2758 in->export_client_caps(cap_map);
2759 bufferlist bl;
2760 MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
2761 encode(inode_caps, bl);
2762 resolves[leader]->add_peer_request(p->first, bl);
2763 } else {
2764 resolves[leader]->add_peer_request(p->first, mdr->committing);
2765 }
2766 }
2767 }
2768 }
2769
2770 for (auto &p : resolves) {
2771 dout(10) << "sending peer resolve to mds." << p.first << dendl;
2772 mds->send_message_mds(p.second, p.first);
2773 resolve_ack_gather.insert(p.first);
2774 }
2775 }
2776
2777 void MDCache::send_subtree_resolves()
2778 {
2779 dout(10) << "send_subtree_resolves" << dendl;
2780
2781 if (migrator->is_exporting() || migrator->is_importing()) {
2782 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2783 migrator->show_importing();
2784 migrator->show_exporting();
2785 resolves_pending = true;
2786 return; // not now
2787 }
2788
2789 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2790 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2791 p != recovery_set.end();
2792 ++p) {
2793 if (*p == mds->get_nodeid())
2794 continue;
2795 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2796 resolves[*p] = make_message<MMDSResolve>();
2797 }
2798
2799 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2800 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2801
2802 // known
2803 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2804 p != subtrees.end();
2805 ++p) {
2806 CDir *dir = p->first;
2807
2808 // only our subtrees
2809 if (dir->authority().first != mds->get_nodeid())
2810 continue;
2811
2812 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2813 continue; // we'll add it below
2814
2815 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2816 // ambiguous (mid-import)
2817 set<CDir*> bounds;
2818 get_subtree_bounds(dir, bounds);
2819 vector<dirfrag_t> dfls;
2820 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2821 dfls.push_back((*q)->dirfrag());
2822
2823 my_ambig_imports[dir->dirfrag()] = dfls;
2824 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2825 } else {
2826 // not ambiguous.
2827 for (auto &q : resolves) {
2828 resolves[q.first]->add_subtree(dir->dirfrag());
2829 }
2830 // bounds too
2831 vector<dirfrag_t> dfls;
2832 for (set<CDir*>::iterator q = subtrees[dir].begin();
2833 q != subtrees[dir].end();
2834 ++q) {
2835 CDir *bound = *q;
2836 dfls.push_back(bound->dirfrag());
2837 }
2838
2839 my_subtrees[dir->dirfrag()] = dfls;
2840 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2841 }
2842 }
2843
2844 // ambiguous
2845 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2846 p != my_ambiguous_imports.end();
2847 ++p) {
2848 my_ambig_imports[p->first] = p->second;
2849 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2850 }
2851
2852 // simplify the claimed subtree.
2853 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2854 unsigned i = 0;
2855 while (i < p->second.size()) {
2856 dirfrag_t b = p->second[i];
2857 if (my_subtrees.count(b)) {
2858 vector<dirfrag_t>& bb = my_subtrees[b];
2859 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2860 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2861 p->second.push_back(*r);
2862 my_subtrees.erase(b);
2863 p->second.erase(p->second.begin() + i);
2864 } else {
2865 ++i;
2866 }
2867 }
2868 }
2869
2870 // send
2871 for (auto &p : resolves) {
2872 const ref_t<MMDSResolve> &m = p.second;
2873 if (mds->is_resolve()) {
2874 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2875 } else {
2876 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2877 }
2878 m->subtrees = my_subtrees;
2879 m->ambiguous_imports = my_ambig_imports;
2880 dout(10) << "sending subtee resolve to mds." << p.first << dendl;
2881 mds->send_message_mds(m, p.first);
2882 }
2883 resolves_pending = false;
2884 }
2885
2886 void MDCache::maybe_finish_peer_resolve() {
2887 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2888 // snap cache get synced or I'm in resolve state
2889 if (mds->snapclient->is_synced() || resolve_done)
2890 send_subtree_resolves();
2891 process_delayed_resolve();
2892 }
2893 }
2894
// Handle the failure of mds.who: reset resolve/rejoin gather state for that
// rank, notify the migrator and balancer, and walk every active request to
// clean up leader/peer state that involves the failed rank. Also cancels
// in-flight dirfrag fragmentation operations touched by the failure.
void MDCache::handle_mds_failure(mds_rank_t who)
{
  dout(7) << "handle_mds_failure mds." << who << dendl;

  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;

  // we must gather a fresh resolve from this rank once it restarts
  resolve_gather.insert(who);
  discard_delayed_resolve(who);
  ambiguous_peer_updates.erase(who);

  // rejoin state with this rank must be re-exchanged from scratch
  rejoin_gather.insert(who);
  rejoin_sent.erase(who);        // i need to send another
  rejoin_ack_sent.erase(who);    // i need to send another
  rejoin_ack_gather.erase(who);  // i'll need/get another.

  dout(10) << " resolve_gather " << resolve_gather << dendl;
  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;


  // tell the migrator too.
  migrator->handle_mds_failure_or_stop(who);

  // tell the balancer too.
  mds->balancer->handle_mds_failure(who);

  // clean up any requests peer to/from this node
  list<MDRequestRef> finish;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
    MDRequestRef& mdr = p->second;
    // peer to the failed node?
    if (mdr->peer_to_mds == who) {
      if (mdr->peer_did_prepare()) {
	// prepared update: the resolve protocol will decide commit/abort
	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
	  remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);

	if (!mdr->more()->waiting_on_peer.empty()) {
	  ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	  // will rollback, no need to wait
	  mdr->reset_peer_request();
	  mdr->more()->waiting_on_peer.clear();
	}
      } else if (!mdr->committing) {
	dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
	if (mdr->peer_request || mdr->peer_rolling_back())
	  mdr->aborted = true;  // request still in flight; mark aborted instead of finishing now
	else
	  finish.push_back(mdr);
      }
    }

    if (mdr->is_peer() && mdr->peer_did_prepare()) {
      if (mdr->more()->waiting_on_peer.count(who)) {
	ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds."
		 << who << dendl;
	mdr->more()->waiting_on_peer.erase(who);
	if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
	  mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->more()->srcdn_auth_mds == who &&
	  mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
	// rename srcdn's auth mds failed, resolve even I'm a survivor.
	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
      }
    } else if (mdr->peer_request) {
      const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
      // FIXME: Peer rename request can arrive after we notice mds failure.
      // This can cause mds to crash (does not affect integrity of FS).
      if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
	  peer_req->srcdn_auth == who)
	peer_req->mark_interrupted();
    }

    // failed node is peer?
    if (mdr->is_leader() && !mdr->committing) {
      if (mdr->more()->srcdn_auth_mds == who) {
	dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
		 << who << " to recover" << dendl;
	ceph_assert(mdr->more()->witnessed.count(who) == 0);
	if (mdr->more()->is_ambiguous_auth)
	  mdr->clear_ambiguous_auth();
	// rename srcdn's auth mds failed, all witnesses will rollback
	mdr->more()->witnessed.clear();
	pending_leaders.erase(p->first);
      }

      if (mdr->more()->witnessed.count(who)) {
	mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
	if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
	  dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
		   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
	  // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
	  // until either the request is committing or the peer also fails.
	  ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
	  pending_leaders.insert(p->first);
	} else {
	  dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
		   << who << " to recover" << dendl;
	  if (srcdn_auth >= 0)
	    ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);

	  // discard this peer's prepare (if any)
	  mdr->more()->witnessed.erase(who);
	}
      }

      if (mdr->more()->waiting_on_peer.count(who)) {
	dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
		 << " to recover" << dendl;
	// retry request when peer recovers
	mdr->more()->waiting_on_peer.erase(who);
	if (mdr->more()->waiting_on_peer.empty())
	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->locking && mdr->locking_target_mds == who)
	mdr->finish_locking(mdr->locking);
    }
  }

  for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
       p != uncommitted_leaders.end();
       ++p) {
    // The failed MDS may have already committed the peer update
    if (p->second.peers.count(who)) {
      p->second.recovering = true;
      p->second.peers.erase(who);
    }
  }

  // finish requests deferred above (can't finish them while iterating active_requests)
  while (!finish.empty()) {
    dout(10) << "cleaning up peer request " << *finish.front() << dendl;
    request_finish(finish.front());
    finish.pop_front();
  }

  kick_find_ino_peers(who);
  kick_open_ino_peers(who);

  // cancel or complete fragmentation ops that involved the failed rank
  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;

    if (info.is_fragmenting()) {
      // already past the point of no return; just stop waiting for the
      // failed rank's notify ack
      if (info.notify_ack_waiting.erase(who) &&
	  info.notify_ack_waiting.empty()) {
	fragment_drop_locks(info);
	fragment_maybe_finish(p++);
      } else {
	++p;
      }
      continue;
    }

    ++p;
    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
    std::vector<CDir*> dirs;
    info.dirs.swap(dirs);
    fragments.erase(df);
    fragment_unmark_unfreeze_dirs(dirs);
  }

  // MDCache::shutdown_export_strays() always exports strays to mds.0
  if (who == mds_rank_t(0))
    shutdown_exporting_strays.clear();

  show_subtrees();
}
3072
/*
 * handle_mds_recovery - called on another node's transition
 * from resolve -> active.
 *
 * Wakes up anything waiting on dirfrags/inodes inside subtrees whose
 * authority is the recovered rank, and kicks open_ino/find_ino peers.
 */
void MDCache::handle_mds_recovery(mds_rank_t who)
{
  dout(7) << "handle_mds_recovery mds." << who << dendl;

  // exclude all discover waiters. kick_discovers() will do the job
  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;

  MDSContext::vec waiters;

  // wake up any waiters in their subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only non-auth subtrees whose first authority is the recovered rank
    if (dir->authority().first != who ||
	dir->authority().second == mds->get_nodeid())
      continue;
    ceph_assert(!dir->is_auth());

    // wake any waiters — BFS through the subtree, stopping at nested
    // subtree roots (they belong to a different subtree entry)
    std::queue<CDir*> q;
    q.push(dir);

    while (!q.empty()) {
      CDir *d = q.front();
      q.pop();
      d->take_waiting(d_mask, waiters);

      // inode waiters too
      for (auto &p : d->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (dnl->is_primary()) {
	  dnl->get_inode()->take_waiting(i_mask, waiters);

	  // recurse?
	  auto&& ls = dnl->get_inode()->get_dirfrags();
	  for (const auto& subdir : ls) {
	    if (!subdir->is_subtree_root())
	      q.push(subdir);
	  }
	}
      }
    }
  }

  kick_open_ino_peers(who);
  kick_find_ino_peers(who);

  // queue them up.
  mds->queue_waiters(waiters);
}
3131
3132 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3133 {
3134 dout(7) << "set_recovery_set " << s << dendl;
3135 recovery_set = s;
3136 }
3137
3138
/*
 * during resolve state, we share resolves to determine who
 * is authoritative for which trees. we expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function may store (delay) the passed message before returning
 */
// Process an MMDSResolve from another rank: answer ambiguous peer requests
// with commit/abort decisions, settle our own ambiguous imports against the
// sender's subtree claims, adopt the sender's dir_auth claims, note its
// ambiguous imports, and record its pending snaptable commits. The message
// may be stashed in delayed_resolve and re-processed later.
void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
{
  dout(7) << "handle_resolve from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
      return;
    }
    // wait until we reach the resolve stage!
    return;
  }

  discard_delayed_resolve(from);

  // ambiguous peer requests?
  if (!m->peer_requests.empty()) {
    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
      // survivor: can't answer while our own uncommitted leader updates
      // for these requests are still unsafe — delay instead
      for (auto p = m->peer_requests.begin(); p != m->peer_requests.end(); ++p) {
	if (uncommitted_leaders.count(p->first) && !uncommitted_leaders[p->first].safe) {
	  ceph_assert(!p->second.committing);
	  pending_leaders.insert(p->first);
	}
      }

      if (!pending_leaders.empty()) {
	dout(10) << " still have pending updates, delay processing peer resolve" << dendl;
	delayed_resolve[from] = m;
	return;
      }
    }

    auto ack = make_message<MMDSResolveAck>();
    for (const auto &p : m->peer_requests) {
      if (uncommitted_leaders.count(p.first)) {  //mds->sessionmap.have_completed_request(p.first)) {
	// COMMIT
	if (p.second.committing) {
	  // already committing, waiting for the OP_COMMITTED peer reply
	  dout(10) << " already committing peer request " << p << " noop "<< dendl;
	} else {
	  dout(10) << " ambiguous peer request " << p << " will COMMIT" << dendl;
	  ack->add_commit(p.first);
	}
	uncommitted_leaders[p.first].peers.insert(from);   // wait for peer OP_COMMITTED before we log ECommitted

	if (p.second.inode_caps.length() > 0) {
	  // peer wants to export caps (rename)
	  ceph_assert(mds->is_resolve());
	  MMDSResolve::peer_inode_cap inode_caps;
	  auto q = p.second.inode_caps.cbegin();
	  decode(inode_caps, q);
	  inodeno_t ino = inode_caps.ino;
	  map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
	  ceph_assert(get_inode(ino));

	  for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
	       q != cap_exports.end();
	       ++q) {
	    Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
	    im.cap_id = ++last_cap_id; // assign a new cap ID
	    im.issue_seq = 1;
	    im.mseq = q->second.mseq;

	    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	    if (session)
	      rejoin_client_map.emplace(q->first, session->info.inst);
	  }

	  // will process these caps in rejoin stage
	  rejoin_peer_exports[ino].first = from;
	  rejoin_peer_exports[ino].second.swap(cap_exports);

	  // send information of imported caps back to peer
	  encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
	}
      } else {
	// ABORT
	dout(10) << " ambiguous peer request " << p << " will ABORT" << dendl;
	ceph_assert(!p.second.committing);
	ack->add_abort(p.first);
      }
    }
    mds->send_message(ack, m->get_connection());
    return;
  }

  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
    dout(10) << "delay processing subtree resolve" << dendl;
    delayed_resolve[from] = m;
    return;
  }

  bool survivor = false;
  // am i a surviving ambiguous importer?
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // check for any import success/failure (from this node)
    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
    while (p != my_ambiguous_imports.end()) {
      // save successor now: body may erase *p
      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
      ++next;
      CDir *dir = get_dirfrag(p->first);
      ceph_assert(dir);
      dout(10) << "checking ambiguous import " << *dir << dendl;
      if (migrator->is_importing(dir->dirfrag()) &&
	  migrator->get_import_peer(dir->dirfrag()) == from) {
	ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);

	// check if sender claims the subtree
	bool claimed_by_sender = false;
	for (const auto &q : m->subtrees) {
	  // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
	  CDir *base = get_force_dirfrag(q.first, false);
	  if (!base || !base->contains(dir))
	    continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.

	  bool inside = true;
	  set<CDir*> bounds;
	  get_force_dirfrag_bound_set(q.second, bounds);
	  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
	    CDir *bound = *p;
	    if (bound->contains(dir)) {
	      inside = false;  // nope, bound is dir or parent of dir, not inside.
	      break;
	    }
	  }
	  if (inside)
	    claimed_by_sender = true;
	}

	my_ambiguous_imports.erase(p);  // no longer ambiguous.
	if (claimed_by_sender) {
	  // sender still owns it: our import never completed on their side
	  dout(7) << "ambiguous import failed on " << *dir << dendl;
	  migrator->import_reverse(dir);
	} else {
	  dout(7) << "ambiguous import succeeded on " << *dir << dendl;
	  migrator->import_finish(dir, true);
	}
      }
      p = next;
    }
  }

  // update my dir_auth values
  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
  //   migrations between other nodes)
  for (const auto& p : m->subtrees) {
    dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
    CDir *dir = get_force_dirfrag(p.first, !survivor);
    if (!dir)
      continue;
    adjust_bounded_subtree_auth(dir, p.second, from);
    try_subtree_merge(dir);
  }

  show_subtrees();

  // note ambiguous imports too
  for (const auto& p : m->ambiguous_imports) {
    dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
    other_ambiguous_imports[from][p.first] = p.second;
  }

  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
  for (const auto& p : m->table_clients) {
    dout(10) << " noting " << get_mdstable_name(p.type)
	     << " pending_commits " << p.pending_commits << dendl;
    MDSTableClient *client = mds->get_table_client(p.type);
    for (const auto& q : p.pending_commits)
      client->notify_commit(q);
  }

  // did i get them all?
  resolve_gather.erase(from);

  maybe_resolve_finish();
}
3326
3327 void MDCache::process_delayed_resolve()
3328 {
3329 dout(10) << "process_delayed_resolve" << dendl;
3330 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3331 tmp.swap(delayed_resolve);
3332 for (auto &p : tmp) {
3333 handle_resolve(p.second);
3334 }
3335 }
3336
3337 void MDCache::discard_delayed_resolve(mds_rank_t who)
3338 {
3339 delayed_resolve.erase(who);
3340 }
3341
3342 void MDCache::maybe_resolve_finish()
3343 {
3344 ceph_assert(resolve_ack_gather.empty());
3345 ceph_assert(resolve_need_rollback.empty());
3346
3347 if (!resolve_gather.empty()) {
3348 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3349 << resolve_gather << ")" << dendl;
3350 return;
3351 }
3352
3353 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3354 disambiguate_my_imports();
3355 finish_committed_leaders();
3356
3357 if (resolve_done) {
3358 ceph_assert(mds->is_resolve());
3359 trim_unlinked_inodes();
3360 recalc_auth_bits(false);
3361 resolve_done.release()->complete(0);
3362 } else {
3363 // I am survivor.
3364 maybe_send_pending_rejoins();
3365 }
3366 }
3367
// Process the leader's commit/abort verdicts for our uncommitted peer
// updates. Commits are journaled (EPeerUpdate::OP_COMMIT) when recovering, or
// finish the live request when surviving; aborts trigger the appropriate
// rollback. Once no ambiguous updates from this rank remain, the ack gather
// is cleared and peer resolve may finish.
void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
{
  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // ignore stale acks from ranks we're not gathering from, or that have
  // since dropped below resolve state
  if (!resolve_ack_gather.count(from) ||
      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
    return;
  }

  if (ambiguous_peer_updates.count(from)) {
    ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
    ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  }

  for (const auto &p : ack->commit) {
    dout(10) << " commit on peer " << p.first << dendl;

    if (ambiguous_peer_updates.count(from)) {
      remove_ambiguous_peer_update(p.first, from);
      continue;
    }

    if (mds->is_resolve()) {
      // replay
      MDPeerUpdate *su = get_uncommitted_peer(p.first, from);
      ceph_assert(su);

      // log commit
      mds->mdlog->start_submit_entry(new EPeerUpdate(mds->mdlog, "unknown", p.first, from,
						     EPeerUpdate::OP_COMMIT, su->origop),
				     new C_MDC_PeerCommit(this, from, p.first));
      mds->mdlog->flush();

      finish_uncommitted_peer(p.first);
    } else {
      MDRequestRef mdr = request_get(p.first);
      // information about leader imported caps
      if (p.second.length() > 0)
	mdr->more()->inode_import.share(p.second);

      ceph_assert(mdr->peer_request == 0);  // shouldn't be doing anything!
      request_finish(mdr);
    }
  }

  for (const auto &metareq : ack->abort) {
    dout(10) << " abort on peer " << metareq << dendl;

    if (mds->is_resolve()) {
      MDPeerUpdate *su = get_uncommitted_peer(metareq, from);
      ceph_assert(su);

      // perform rollback (and journal a rollback entry)
      // note: this will hold up the resolve a bit, until the rollback entries journal.
      MDRequestRef null_ref;
      switch (su->origop) {
      case EPeerUpdate::LINK:
	mds->server->do_link_rollback(su->rollback, from, null_ref);
	break;
      case EPeerUpdate::RENAME:
	mds->server->do_rename_rollback(su->rollback, from, null_ref);
	break;
      case EPeerUpdate::RMDIR:
	mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
	break;
      default:
	ceph_abort();
      }
    } else {
      MDRequestRef mdr = request_get(metareq);
      mdr->aborted = true;
      if (mdr->peer_request) {
	if (mdr->peer_did_prepare()) // journaling peer prepare ?
	  add_rollback(metareq, from);
      } else {
	request_finish(mdr);
      }
    }
  }

  // only finish the gather once every ambiguous update from this rank is settled
  if (!ambiguous_peer_updates.count(from)) {
    resolve_ack_gather.erase(from);
    maybe_finish_peer_resolve();
  }
}
3454
3455 void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t leader, MDPeerUpdate *su)
3456 {
3457 auto const &ret = uncommitted_peers.emplace(std::piecewise_construct,
3458 std::forward_as_tuple(reqid),
3459 std::forward_as_tuple());
3460 ceph_assert(ret.second);
3461 ls->uncommitted_peers.insert(reqid);
3462 upeer &u = ret.first->second;
3463 u.leader = leader;
3464 u.ls = ls;
3465 u.su = su;
3466 if (su == nullptr) {
3467 return;
3468 }
3469 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3470 uncommitted_peer_rename_olddir[*p]++;
3471 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3472 uncommitted_peer_unlink[*p]++;
3473 }
3474
// Retire a recorded uncommitted peer update: wake its waiters, unhook it
// from the log segment, and — when this was the last update referencing
// them — trim renamed-out non-auth subtrees and remove inodes unlinked by
// the update. If assert_exist, the reqid must be present.
void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist)
{
  auto it = uncommitted_peers.find(reqid);
  if (it == uncommitted_peers.end()) {
    ceph_assert(!assert_exist);
    return;
  }
  upeer &u = it->second;
  MDPeerUpdate* su = u.su;

  if (!u.waiters.empty()) {
    mds->queue_waiters(u.waiters);
  }
  u.ls->uncommitted_peers.erase(reqid);
  uncommitted_peers.erase(it);

  if (su == nullptr) {
    return;
  }
  // discard the non-auth subtree we renamed out of
  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
    CInode *diri = *p;
    map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri);
    ceph_assert(it != uncommitted_peer_rename_olddir.end());
    it->second--;  // drop this update's reference
    if (it->second == 0) {
      uncommitted_peer_rename_olddir.erase(it);
      auto&& ls = diri->get_dirfrags();
      for (const auto& dir : ls) {
	CDir *root = get_subtree_root(dir);
	if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
	  try_trim_non_auth_subtree(root);
	  // all of diri's dirfrags share this subtree root; done
	  if (dir != root)
	    break;
	}
      }
    } else
      ceph_assert(it->second > 0);
  }
  // remove the inodes that were unlinked by peer update
  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
    CInode *in = *p;
    map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in);
    ceph_assert(it != uncommitted_peer_unlink.end());
    it->second--;  // drop this update's reference
    if (it->second == 0) {
      uncommitted_peer_unlink.erase(it);
      // only remove if no (projected) relink happened meanwhile
      if (!in->get_projected_parent_dn())
	mds->mdcache->remove_inode_recursive(in);
    } else
      ceph_assert(it->second > 0);
  }
  delete su;
}
3529
3530 MDPeerUpdate* MDCache::get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader)
3531 {
3532
3533 MDPeerUpdate* su = nullptr;
3534 auto it = uncommitted_peers.find(reqid);
3535 if (it != uncommitted_peers.end() &&
3536 it->second.leader == leader) {
3537 su = it->second.su;
3538 }
3539 return su;
3540 }
3541
3542 void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
3543 auto p = resolve_need_rollback.find(reqid);
3544 ceph_assert(p != resolve_need_rollback.end());
3545 if (mds->is_resolve()) {
3546 finish_uncommitted_peer(reqid, false);
3547 } else if (mdr) {
3548 finish_uncommitted_peer(mdr->reqid, mdr->more()->peer_update_journaled);
3549 }
3550 resolve_need_rollback.erase(p);
3551 maybe_finish_peer_resolve();
3552 }
3553
3554 void MDCache::disambiguate_other_imports()
3555 {
3556 dout(10) << "disambiguate_other_imports" << dendl;
3557
3558 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3559 // other nodes' ambiguous imports
3560 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3561 p != other_ambiguous_imports.end();
3562 ++p) {
3563 mds_rank_t who = p->first;
3564 dout(10) << "ambiguous imports for mds." << who << dendl;
3565
3566 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3567 q != p->second.end();
3568 ++q) {
3569 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3570 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3571 CDir *dir = get_force_dirfrag(q->first, recovering);
3572 if (!dir) continue;
3573
3574 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3575 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3576 dout(10) << " mds." << who << " did import " << *dir << dendl;
3577 adjust_bounded_subtree_auth(dir, q->second, who);
3578 try_subtree_merge(dir);
3579 } else {
3580 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3581 }
3582 }
3583 }
3584 other_ambiguous_imports.clear();
3585 }
3586
// Resolve all of our own ambiguous imports now that every peer's resolve has
// been processed: for each entry, either cancel the import (another rank
// claimed the subtree) and journal EImportFinish(false), or claim it
// ourselves and journal EImportFinish(true). Only meaningful on a rank in
// the resolve state; survivors must have no ambiguous imports.
void MDCache::disambiguate_my_imports()
{
  dout(10) << "disambiguate_my_imports" << dendl;

  if (!mds->is_resolve()) {
    ceph_assert(my_ambiguous_imports.empty());
    return;
  }

  disambiguate_other_imports();

  // my ambiguous imports
  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
  while (!my_ambiguous_imports.empty()) {
    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();

    CDir *dir = get_dirfrag(q->first);
    ceph_assert(dir);

    if (dir->authority() != me_ambig) {
      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
      cancel_ambiguous_import(dir);

      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));

      // subtree may have been swallowed by another node claiming dir
      // as their own.
      CDir *root = get_subtree_root(dir);
      if (root != dir)
	dout(10) << "  subtree root is " << *root << dendl;
      ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
      try_trim_non_auth_subtree(root);
    } else {
      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
      finish_ambiguous_import(q->first);
      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
    }
  }
  ceph_assert(my_ambiguous_imports.empty());
  mds->mdlog->flush();

  // verify all my subtrees are unambiguous!
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (dir->is_ambiguous_dir_auth()) {
      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
    }
    ceph_assert(!dir->is_ambiguous_dir_auth());
  }

  show_subtrees();
}
3641
3642
3643 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3644 {
3645 ceph_assert(my_ambiguous_imports.count(base) == 0);
3646 my_ambiguous_imports[base] = bounds;
3647 }
3648
3649
3650 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3651 {
3652 // make a list
3653 vector<dirfrag_t> binos;
3654 for (set<CDir*>::iterator p = bounds.begin();
3655 p != bounds.end();
3656 ++p)
3657 binos.push_back((*p)->dirfrag());
3658
3659 // note: this can get called twice if the exporter fails during recovery
3660 if (my_ambiguous_imports.count(base->dirfrag()))
3661 my_ambiguous_imports.erase(base->dirfrag());
3662
3663 add_ambiguous_import(base->dirfrag(), binos);
3664 }
3665
3666 void MDCache::cancel_ambiguous_import(CDir *dir)
3667 {
3668 dirfrag_t df = dir->dirfrag();
3669 ceph_assert(my_ambiguous_imports.count(df));
3670 dout(10) << "cancel_ambiguous_import " << df
3671 << " bounds " << my_ambiguous_imports[df]
3672 << " " << *dir
3673 << dendl;
3674 my_ambiguous_imports.erase(df);
3675 }
3676
3677 void MDCache::finish_ambiguous_import(dirfrag_t df)
3678 {
3679 ceph_assert(my_ambiguous_imports.count(df));
3680 vector<dirfrag_t> bounds;
3681 bounds.swap(my_ambiguous_imports[df]);
3682 my_ambiguous_imports.erase(df);
3683
3684 dout(10) << "finish_ambiguous_import " << df
3685 << " bounds " << bounds
3686 << dendl;
3687 CDir *dir = get_dirfrag(df);
3688 ceph_assert(dir);
3689
3690 // adjust dir_auth, import maps
3691 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3692 try_subtree_merge(dir);
3693 }
3694
// Remove an inode and everything beneath it from the cache: unlink and
// delete every dentry in each dirfrag (recursing into primary-linked
// inodes), drop any subtree-root dirfrags from the subtree map, close the
// dirfrags, and finally remove the inode itself.
void MDCache::remove_inode_recursive(CInode *in)
{
  dout(10) << "remove_inode_recursive " << *in << dendl;
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    dout(10) << " removing dirfrag " << *subdir << dendl;
    auto it = subdir->items.begin();
    while (it != subdir->items.end()) {
      CDentry *dn = it->second;
      ++it;  // advance before remove_dentry() invalidates the iterator
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *tin = dnl->get_inode();
	subdir->unlink_inode(dn, false);
	remove_inode_recursive(tin);
      }
      subdir->remove_dentry(dn);
    }

    if (subdir->is_subtree_root())
      remove_subtree(subdir);
    in->close_dirfrag(subdir->dirfrag().frag);
  }
  remove_inode(in);
}
3720
// Try to expire a non-auth inode and everything beneath it, trimming
// expireable dentries into expiremap as we go. Returns true to abort the
// expiry: the tree still contains a subtree root, a hardlinked non-stray
// inode, or a dentry that is not yet expireable.
bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
{
  ceph_assert(!in->is_auth());

  dout(10) << __func__ << ":" << *in << dendl;

  // Recurse into any dirfrags beneath this inode
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    // a subtree root under a stray means someone else still depends on it
    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
      return true;
    }

    for (auto &it : subdir->items) {
      CDentry *dn = it.second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *tin = dnl->get_inode();

	/* Remote strays with linkage (i.e. hardlinks) should not be
	 * expired, because they may be the target of
	 * a rename() as the owning MDS shuts down */
	if (!tin->is_stray() && tin->get_inode()->nlink) {
	  dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
	  return true;
	}

	const bool abort = expire_recursive(tin, expiremap);
	if (abort) {
	  return true;
	}
      }
      if (dn->lru_is_expireable()) {
	trim_dentry(dn, expiremap);
      } else {
	dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
	return true;
      }
    }
  }

  // everything under this inode was expired
  return false;
}
3765
3766 void MDCache::trim_unlinked_inodes()
3767 {
3768 dout(7) << "trim_unlinked_inodes" << dendl;
3769 int count = 0;
3770 vector<CInode*> q;
3771 for (auto &p : inode_map) {
3772 CInode *in = p.second;
3773 if (in->get_parent_dn() == NULL && !in->is_base()) {
3774 dout(7) << " will trim from " << *in << dendl;
3775 q.push_back(in);
3776 }
3777
3778 if (!(++count % 1000))
3779 mds->heartbeat_reset();
3780 }
3781 for (auto& in : q) {
3782 remove_inode_recursive(in);
3783
3784 if (!(++count % 1000))
3785 mds->heartbeat_reset();
3786 }
3787 }
3788
3789 /** recalc_auth_bits()
3790 * once subtree auth is disambiguated, we need to adjust all the
3791 * auth and dirty bits in our cache before moving on.
3792 */
// Recompute AUTH state bits on every cached inode, dirfrag, and dentry after
// subtree authority has been disambiguated. For newly non-auth items outside
// replay, set REJOINING and clear dirty state (and close empty non-auth
// dirfrags); scatterlocks on our own subtree-root inodes are left alone.
void MDCache::recalc_auth_bits(bool replay)
{
  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") <<  dendl;

  if (root) {
    root->inode_auth.first = mds->mdsmap->get_root();
    bool auth = mds->get_nodeid() == root->inode_auth.first;
    if (auth) {
      root->state_set(CInode::STATE_AUTH);
    } else {
      root->state_clear(CInode::STATE_AUTH);
      if (!replay)
	root->state_set(CInode::STATE_REJOINING);
    }
  }

  // remember which inodes are our own subtree roots so we don't touch
  // their scatterlocks below
  set<CInode*> subtree_inodes;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->dir_auth.first == mds->get_nodeid())
      subtree_inodes.insert(p->first->inode);
  }

  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    // an mdsdir inode is auth iff it is our own rank's mdsdir
    if (p->first->inode->is_mdsdir()) {
      CInode *in = p->first->inode;
      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
      if (auth) {
	in->state_set(CInode::STATE_AUTH);
      } else {
	in->state_clear(CInode::STATE_AUTH);
	if (!replay)
	  in->state_set(CInode::STATE_REJOINING);
      }
    }

    std::queue<CDir*> dfq;  // dirfrag queue
    dfq.push(p->first);

    // every dirfrag within the subtree inherits the subtree's auth
    bool auth = p->first->authority().first == mds->get_nodeid();
    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;

    while (!dfq.empty()) {
      CDir *dir = dfq.front();
      dfq.pop();

      // dir
      if (auth) {
	dir->state_set(CDir::STATE_AUTH);
      } else {
	dir->state_clear(CDir::STATE_AUTH);
	if (!replay) {
	  // close empty non-auth dirfrag
	  if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
	    dir->inode->close_dirfrag(dir->get_frag());
	    continue;
	  }
	  dir->state_set(CDir::STATE_REJOINING);
	  dir->state_clear(CDir::STATE_COMPLETE);
	  if (dir->is_dirty())
	    dir->mark_clean();
	}
      }

      // dentries in this dir
      for (auto &p : dir->items) {
	// dn
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (auth) {
	  dn->state_set(CDentry::STATE_AUTH);
	} else {
	  dn->state_clear(CDentry::STATE_AUTH);
	  if (!replay) {
	    dn->state_set(CDentry::STATE_REJOINING);
	    if (dn->is_dirty())
	      dn->mark_clean();
	  }
	}

	if (dnl->is_primary()) {
	  // inode
	  CInode *in = dnl->get_inode();
	  if (auth) {
	    in->state_set(CInode::STATE_AUTH);
	  } else {
	    in->state_clear(CInode::STATE_AUTH);
	    if (!replay) {
	      in->state_set(CInode::STATE_REJOINING);
	      if (in->is_dirty())
		in->mark_clean();
	      if (in->is_dirty_parent())
		in->clear_dirty_parent();
	      // avoid touching scatterlocks for our subtree roots!
	      if (subtree_inodes.count(in) == 0)
		in->clear_scatter_dirty();
	    }
	  }
	  // recurse?
	  if (in->is_dir()) {
	    auto&& dfv = in->get_nested_dirfrags();
	    for (const auto& dir : dfv) {
	      dfq.push(dir);
	    }
	  }
	}
      }
    }
  }

  show_subtrees();
  show_cache();
}
3909
3910
3911
3912 // ===========================================================================
3913 // REJOIN
3914
3915 /*
3916 * notes on scatterlock recovery:
3917 *
3918 * - recovering inode replica sends scatterlock data for any subtree
3919 * roots (the only ones that are possibly dirty).
3920 *
3921 * - surviving auth incorporates any provided scatterlock data. any
3922 * pending gathers are then finished, as with the other lock types.
3923 *
3924 * that takes care of surviving auth + (recovering replica)*.
3925 *
3926 * - surviving replica sends strong_inode, which includes current
3927 * scatterlock state, AND any dirty scatterlock data. this
3928 * provides the recovering auth with everything it might need.
3929 *
3930 * - recovering auth must pick initial scatterlock state based on
3931 * (weak|strong) rejoins.
3932 * - always assimilate scatterlock data (it can't hurt)
3933 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3934 * - include base inode in ack for all inodes that saw scatterlock content
3935 *
3936 * also, for scatter gather,
3937 *
3938 * - auth increments {frag,r}stat.version on completion of any gather.
3939 *
3940 * - auth incorporates changes in a gather _only_ if the version
3941 * matches.
3942 *
3943 * - replica discards changes any time the scatterlock syncs, and
3944 * after recovery.
3945 */
3946
// Dump current rejoin progress to the given formatter: the ranks we still
// expect a cache-rejoin message from (rejoin_gather), the ranks we still
// expect a rejoin ack from (rejoin_ack_gather), and the count of
// cap-import inodes still being opened.
void MDCache::dump_rejoin_status(Formatter *f) const
{
  f->open_object_section("rejoin_status");
  f->dump_stream("rejoin_gather") << rejoin_gather;
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
3955
3956 void MDCache::rejoin_start(MDSContext *rejoin_done_)
3957 {
3958 dout(10) << "rejoin_start" << dendl;
3959 ceph_assert(!rejoin_done);
3960 rejoin_done.reset(rejoin_done_);
3961
3962 rejoin_gather = recovery_set;
3963 // need finish opening cap inodes before sending cache rejoins
3964 rejoin_gather.insert(mds->get_nodeid());
3965 process_imported_caps();
3966 }
3967
3968 /*
3969 * rejoin phase!
3970 *
3971 * this initiates rejoin. it should be called before we get any
3972 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3973 *
3974 * we start out by sending rejoins to everyone in the recovery set.
3975 *
3976 * if we are rejoining, send for all regions in our cache.
3977 * if we are active|stopping, send only to nodes that are rejoining.
3978 */
void MDCache::rejoin_send_rejoins()
{
  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;

  // Not ready yet?  Remember that a send is wanted; whoever clears the
  // obstruction (cap import processing, resolve completion) retries via
  // rejoins_pending.
  if (rejoin_gather.count(mds->get_nodeid())) {
    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
    rejoins_pending = true;
    return;
  }
  if (!resolve_gather.empty()) {
    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
	    << resolve_gather << ")" << dendl;
    rejoins_pending = true;
    return;
  }

  // migrations must have been resolved/aborted before rejoin
  ceph_assert(!migrator->is_importing());
  ceph_assert(!migrator->is_exporting());

  if (!mds->is_rejoin()) {
    disambiguate_other_imports();
  }

  // one message per target rank; built up below, sent at the end
  map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;


  // if i am rejoining, send a rejoin to everyone.
  // otherwise, just send to others who are rejoining.
  for (const auto& rank : recovery_set) {
    if (rank == mds->get_nodeid())  continue;  // nothing to myself!
    if (rejoin_sent.count(rank)) continue;     // already sent a rejoin to this node!
    if (mds->is_rejoin())
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
    else if (mds->mdsmap->is_rejoin(rank))
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
  }

  if (mds->is_rejoin()) {
    // Weak rejoin: bundle our cap exports (per-client reconnect state) and
    // the corresponding client session info for each target rank.
    map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
    for (auto& p : cap_exports) {
      mds_rank_t target = p.second.first;
      if (rejoins.count(target) == 0)
	continue;
      // note: erase-while-iterating pattern below (q advanced manually)
      for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
	Session *session = nullptr;
	auto it = client_exports.find(q->first);
	if (it != client_exports.end()) {
	  // already looked this client up; reuse cached session pointer
	  session = it->second.first;
	  if (session)
	    it->second.second.insert(target);
	} else {
	  session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	  auto& r = client_exports[q->first];
	  r.first = session;
	  if (session)
	    r.second.insert(target);
	}
	if (session) {
	  ++q;
	} else {
	  // remove reconnect with no session
	  p.second.second.erase(q++);
	}
      }
      rejoins[target]->cap_exports[p.first] = p.second.second;
    }
    // attach session identity/metadata for every client we referenced
    for (auto& p : client_exports) {
      Session *session = p.second.first;
      for (auto& q : p.second.second) {
	auto rejoin = rejoins[q];
	rejoin->client_map[p.first] = session->info.inst;
	rejoin->client_metadata_map[p.first] = session->info.client_metadata;
      }
    }
  }


  // check all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    ceph_assert(dir->is_subtree_root());
    if (dir->is_ambiguous_dir_auth()) {
      // exporter is recovering, importer is survivor.
      // (authority().first is the exporter, .second the importer)
      ceph_assert(rejoins.count(dir->authority().first));
      ceph_assert(!rejoins.count(dir->authority().second));
      continue;
    }

    // my subtree?
    if (dir->is_auth())
      continue;  // skip my own regions!

    mds_rank_t auth = dir->get_dir_auth().first;
    ceph_assert(auth >= 0);
    if (rejoins.count(auth) == 0)
      continue;   // don't care about this node's subtrees

    rejoin_walk(dir, rejoins[auth]);
  }

  // rejoin root inodes, too
  //  (root belongs to rank 0; each rank's mdsdir belongs to that rank)
  for (auto &p : rejoins) {
    if (mds->is_rejoin()) {
      // weak
      if (p.first == 0 && root) {
	p.second->add_weak_inode(root->vino());
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }
      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	// (inner check is redundant given the condition above)
	if (in)
	  p.second->add_weak_inode(in->vino());
      }
    } else {
      // strong
      if (p.first == 0 && root) {
	p.second->add_strong_inode(root->vino(),
				   root->get_replica_nonce(),
				   root->get_caps_wanted(),
				   root->filelock.get_state(),
				   root->nestlock.get_state(),
				   root->dirfragtreelock.get_state());
	root->state_set(CInode::STATE_REJOINING);
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }

      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	p.second->add_strong_inode(in->vino(),
				   in->get_replica_nonce(),
				   in->get_caps_wanted(),
				   in->filelock.get_state(),
				   in->nestlock.get_state(),
				   in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
      }
    }
  }

  if (!mds->is_rejoin()) {
    // i am survivor.  send strong rejoin.
    // note request remote_auth_pins, xlocks
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (mdr->is_peer())
	continue;
      // auth pins
      for (const auto& q : mdr->object_states) {
	if (q.second.remote_auth_pinned == MDS_RANK_NONE)
	  continue;
	if (!q.first->is_auth()) {
	  mds_rank_t target = q.second.remote_auth_pinned;
	  ceph_assert(target == q.first->authority().first);
	  if (rejoins.count(target) == 0) continue;
	  const auto& rejoin = rejoins[target];

	  dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
	  MDSCacheObjectInfo i;
	  q.first->set_object_info(i);
	  // i.ino set => the pinned object is an inode; else it's a dentry
	  if (i.ino)
	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);

	  if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
	      mdr->more()->rename_inode == q.first)
	    rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
					     mdr->reqid, mdr->attempt);
	}
      }
      // xlocks
      for (const auto& q : mdr->locks) {
	auto lock = q.lock;
	auto obj = lock->get_parent();
	if (q.is_xlock() && !obj->is_auth()) {
	  mds_rank_t who = obj->authority().first;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  if (i.ino)
	    rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				    mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
				     mdr->reqid, mdr->attempt);
	} else if (q.is_remote_wrlock()) {
	  mds_rank_t who = q.wrlock_target;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  // remote wrlocks are only taken on inodes
	  ceph_assert(i.ino);
	  rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				   mdr->reqid, mdr->attempt);
	}
      }
    }
  }

  // send the messages
  for (auto &p : rejoins) {
    ceph_assert(rejoin_sent.count(p.first) == 0);
    ceph_assert(rejoin_ack_gather.count(p.first) == 0);
    rejoin_sent.insert(p.first);
    rejoin_ack_gather.insert(p.first);
    mds->send_message_mds(p.second, p.first);
  }
  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
  rejoins_pending = false;

  // nothing?
  if (mds->is_rejoin() && rejoin_gather.empty()) {
    dout(10) << "nothing to rejoin" << dendl;
    rejoin_gather_finish();
  }
}
4208
4209
4210 /**
4211 * rejoin_walk - build rejoin declarations for a subtree
4212 *
4213 * @param dir subtree root
4214 * @param rejoin rejoin message
4215 *
4216 * from a rejoining node:
4217 * weak dirfrag
4218 * weak dentries (w/ connectivity)
4219 *
4220 * from a surviving node:
4221 * strong dirfrag
4222 * strong dentries (no connectivity!)
4223 * strong inodes
4224 */
void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
{
  dout(10) << "rejoin_walk " << *dir << dendl;

  std::vector<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    // WEAK
    // weak rejoin: declare the dirfrag and each primary dentry; the auth
    // will reply with authoritative state.
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      // a rejoining cache only holds head (CEPH_NOSNAP) primary dentries
      // for directories here, per the asserts below
      ceph_assert(dn->last == CEPH_NOSNAP);
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      ceph_assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      ceph_assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
      {
	auto&& dirs = in->get_nested_dirfrags();
	nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
      }
      if (in->is_dirty_scattered()) {
	dout(10) << " sending scatterlock state on " << *in << dendl;
	rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    // STRONG
    // strong rejoin: declare dirfrag, dentries and inodes with replica
    // nonces and lock states, marking everything REJOINING.
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    // iterator is advanced before any possible removal of the dentry below
    for (auto it = dir->items.begin(); it != dir->items.end(); ) {
      CDentry *dn = it->second;
      ++it;
      dn->state_set(CDentry::STATE_REJOINING);
      CDentry::linkage_t *dnl = dn->get_linkage();
      CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

      // trim snap dentries. because they may have been pruned by
      // their auth mds (snap deleted)
      if (dn->last != CEPH_NOSNAP) {
	if (in && !in->remote_parents.empty()) {
	  // unlink any stale remote snap dentry.
	  for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
	    CDentry *remote_dn = *it2;
	    ++it2;
	    ceph_assert(remote_dn->last != CEPH_NOSNAP);
	    remote_dn->unlink_remote(remote_dn->get_linkage());
	  }
	}
	if (dn->lru_is_expireable()) {
	  if (!dnl->is_null())
	    dir->unlink_inode(dn, false);
	  if (in)
	    remove_inode(in);
	  dir->remove_dentry(dn);
	  continue;
	} else {
	  // Inventing null/remote dentry shouldn't cause problem
	  ceph_assert(!dnl->is_primary());
	}
      }

      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
				dn->first, dn->last,
				dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_d_type():0,
				dn->get_replica_nonce(),
				dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(15) << " add_strong_inode " << *in << dendl;
	rejoin->add_strong_inode(in->vino(),
				 in->get_replica_nonce(),
				 in->get_caps_wanted(),
				 in->filelock.get_state(),
				 in->nestlock.get_state(),
				 in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
	}
	if (in->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on " << *in << dendl;
	  rejoin->add_scatterlock_state(in);
	}
      }
    }
  }

  // recurse into nested dirs
  for (const auto& dir : nested) {
    rejoin_walk(dir, rejoin);
  }
}
4326
4327
4328 /*
4329 * i got a rejoin.
4330 * - reply with the lockstate
4331 *
4332 * if i am active|stopping,
4333 * - remove source from replica list for everything not referenced here.
4334 */
4335 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4336 {
4337 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4338 << " (" << m->get_payload().length() << " bytes)"
4339 << dendl;
4340
4341 switch (m->op) {
4342 case MMDSCacheRejoin::OP_WEAK:
4343 handle_cache_rejoin_weak(m);
4344 break;
4345 case MMDSCacheRejoin::OP_STRONG:
4346 handle_cache_rejoin_strong(m);
4347 break;
4348 case MMDSCacheRejoin::OP_ACK:
4349 handle_cache_rejoin_ack(m);
4350 break;
4351
4352 default:
4353 ceph_abort();
4354 }
4355 }
4356
4357
4358 /*
4359 * handle_cache_rejoin_weak
4360 *
4361 * the sender
4362 * - is recovering from their journal.
4363 * - may have incorrect (out of date) inode contents
4364 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4365 *
4366 * if the sender didn't trim_non_auth(), they
4367 * - may have incorrect (out of date) dentry/inode linkage
4368 * - may have deleted/purged inodes
4369 * and i may have to go to disk to get accurate inode contents. yuck.
4370 */
void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
{
  mds_rank_t from = mds_rank_t(weak->get_source().num());

  // possible response(s)
  ref_t<MMDSCacheRejoin> ack;      // if survivor
  set<vinodeno_t> acked_inodes;    // if survivor
  set<SimpleLock *> gather_locks;  // if survivor
  bool survivor = false;  // am i a survivor?

  // A survivor acks immediately below; a recovering (rejoin-state) MDS
  // instead records what it learns (cap_imports, client maps) and only
  // acks later, once its own rejoin_gather completes.
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    dout(10) << "i am a surivivor, and will ack immediately" << dendl;
    ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);

    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;

    // check cap exports
    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      // NOTE(review): the assert above admits in == nullptr, yet *in is
      // streamed and rejoin_import_cap(in, ...) is called below —
      // presumably weak cap_exports only name inodes we hold; confirm.
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
	Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
	Capability::Import& im = imported_caps[p->first][q->first];
	if (cap) {
	  im.cap_id = cap->get_cap_id();
	  im.issue_seq = cap->get_last_seq();
	  im.mseq = cap->get_mseq();
	} else {
	  // all are zero
	}
      }
      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
    }

    encode(imported_caps, ack->imported_caps);
  } else {
    ceph_assert(mds->is_rejoin());

    // we may have already received a strong rejoin from the sender.
    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
    ceph_assert(gather_locks.empty());

    // check cap exports.
    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
    rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
				      weak->client_metadata_map.end());

    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      // note for later processing; caps are imported in a later pass
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
	cap_imports[p->first][q->first][from] = q->second;
      }
    }
  }

  // assimilate any potentially dirty scatterlock state
  for (const auto &p : weak->inode_scatterlocks) {
    CInode *in = get_inode(p.first);
    ceph_assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
    if (!survivor)
      rejoin_potential_updated_scatterlocks.insert(in);
  }

  // recovering peer may send incorrect dirfrags here.  we need to
  // infer which dirfrag they meant.  the ack will include a
  // strong_dirfrag that will set them straight on the fragmentation.

  // walk weak map
  set<CDir*> dirs_to_share;
  for (const auto &p : weak->weak_dirfrags) {
    CInode *diri = get_inode(p.ino);
    if (!diri)
      dout(0) << " missing dir ino " << p.ino << dendl;
    ceph_assert(diri);

    // map the peer's (possibly stale) frag onto our current leaves
    frag_vec_t leaves;
    if (diri->dirfragtree.is_leaf(p.frag)) {
      leaves.push_back(p.frag);
    } else {
      diri->dirfragtree.get_leaves_under(p.frag, leaves);
      if (leaves.empty())
	leaves.push_back(diri->dirfragtree[p.frag.value()]);
    }
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
	continue;
      }
      ceph_assert(dir);
      if (dirs_to_share.count(dir)) {
	dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
      } else {
	dirs_to_share.insert(dir);
	unsigned nonce = dir->add_replica(from);
	dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
	if (ack) {
	  ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
	  ack->add_dirfrag_base(dir);
	}
      }
    }
  }

  for (const auto &p : weak->weak) {
    CInode *diri = get_inode(p.first);
    if (!diri)
      dout(0) << " missing dir ino " << p.first << dendl;
    ceph_assert(diri);

    // weak dentries
    CDir *dir = 0;
    for (const auto &q : p.second) {
      // locate proper dirfrag.
      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
      frag_t fg = diri->pick_dirfrag(q.first.name);
      if (!dir || dir->get_frag() != fg) {
	dir = diri->get_dirfrag(fg);
	if (!dir)
	  dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
	ceph_assert(dir);
	ceph_assert(dirs_to_share.count(dir));
      }

      // and dentry
      CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
      ceph_assert(dn);
      CDentry::linkage_t *dnl = dn->get_linkage();
      ceph_assert(dnl->is_primary());

      // drop any stale replica record before re-adding with a fresh nonce
      if (survivor && dn->is_replica(from))
	dentry_remove_replica(dn, from, gather_locks);
      unsigned dnonce = dn->add_replica(from);
      dout(10) << " have " << *dn << dendl;
      if (ack)
	ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
			       dn->first, dn->last,
			       dnl->get_inode()->ino(), inodeno_t(0), 0,
			       dnonce, dn->lock.get_replica_state());

      // inode
      CInode *in = dnl->get_inode();
      ceph_assert(in);

      if (survivor && in->is_replica(from))
	inode_remove_replica(in, from, true, gather_locks);
      unsigned inonce = in->add_replica(from);
      dout(10) << " have " << *in << dendl;

      // scatter the dirlock, just in case?
      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
	in->filelock.set_state(LOCK_MIX);

      if (ack) {
	acked_inodes.insert(in->vino());
	ack->add_inode_base(in, mds->mdsmap->get_up_features());
	bufferlist bl;
	in->_encode_locks_state_for_rejoin(bl, from);
	ack->add_inode_locks(in, inonce, bl);
      }
    }
  }

  // weak base inodes?  (root, stray, etc.)
  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
       p != weak->weak_inodes.end();
       ++p) {
    CInode *in = get_inode(*p);
    ceph_assert(in);   // hmm fixme wrt stray?
    if (survivor && in->is_replica(from))
      inode_remove_replica(in, from, true, gather_locks);
    unsigned inonce = in->add_replica(from);
    dout(10) << " have base " << *in << dendl;

    if (ack) {
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
      bufferlist bl;
      in->_encode_locks_state_for_rejoin(bl, from);
      ack->add_inode_locks(in, inonce, bl);
    }
  }

  ceph_assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (survivor) {
    // survivor.  do everything now.
    for (const auto &p : weak->inode_scatterlocks) {
      CInode *in = get_inode(p.first);
      ceph_assert(in);
      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
    }

    // remove the peer from any replica lists it no longer appears on,
    // then reply
    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
    mds->send_message(ack, weak->get_connection());

    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
      if (!(*p)->is_stable())
	mds->locker->eval_gather(*p);
    }
  } else {
    // done?
    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
      rejoin_gather_finish();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
    }
  }
}
4590
4591 /*
4592 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
4593 *
4594 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4595 * ack, the replica dne, and we can remove it from our replica maps.
4596 */
void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
					     set<vinodeno_t>& acked_inodes,
					     set<SimpleLock *>& gather_locks)
{
  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;

  // For each auth object replicated to 'from' that the ack does not
  // mention, drop 'from' from its replica list.  With ack == NULL every
  // replica record for 'from' is dropped.  Locks whose state may need
  // re-evaluation after replica removal are collected in gather_locks.
  auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
    // inode?
    if (in->is_auth() &&
	in->is_replica(from) &&
	(ack == NULL || acked_inodes.count(in->vino()) == 0)) {
      inode_remove_replica(in, from, false, gather_locks);
      dout(10) << " rem " << *in << dendl;
    }

    if (!in->is_dir())
      return;

    // dirfrags of this inode
    const auto&& dfs = in->get_dirfrags();
    for (const auto& dir : dfs) {
      if (!dir->is_auth())
	continue;

      if (dir->is_replica(from) &&
	  (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
	dir->remove_replica(from);
	dout(10) << " rem " << *dir << dendl;
      }

      // dentries
      for (auto &p : dir->items) {
	CDentry *dn = p.second;

	if (dn->is_replica(from)) {
	  // keep the replica if the ack names this dentry (by name+snap)
	  if (ack) {
	    const auto it = ack->strong_dentries.find(dir->dirfrag());
	    if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
	      continue;
	    }
	  }
	  dentry_remove_replica(dn, from, gather_locks);
	  dout(10) << " rem " << *dn << dendl;
	}
      }
    }
  };

  // scour both head and snapshot inodes
  for (auto &p : inode_map)
    scour_func(p.second);
  for (auto &p : snap_inode_map)
    scour_func(p.second);
}
4649
4650
4651 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4652 {
4653 CInode *in = new CInode(this, true, 2, last);
4654 in->_get_inode()->ino = ino;
4655 in->state_set(CInode::STATE_REJOINUNDEF);
4656 add_inode(in);
4657 rejoin_undef_inodes.insert(in);
4658 dout(10) << " invented " << *in << dendl;
4659 return in;
4660 }
4661
4662 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4663 {
4664 CInode *in = get_inode(df.ino);
4665 if (!in)
4666 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4667 if (!in->is_dir()) {
4668 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4669 in->_get_inode()->mode = S_IFDIR;
4670 in->_get_inode()->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4671 }
4672 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4673 dir->state_set(CDir::STATE_REJOINUNDEF);
4674 rejoin_undef_dirfrags.insert(dir);
4675 dout(10) << " invented " << *dir << dendl;
4676 return dir;
4677 }
4678
4679 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4680 {
4681 mds_rank_t from = mds_rank_t(strong->get_source().num());
4682
4683 // only a recovering node will get a strong rejoin.
4684 if (!mds->is_rejoin()) {
4685 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4686 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4687 return;
4688 }
4689 ceph_abort_msg("got unexpected rejoin message during recovery");
4690 }
4691
4692 // assimilate any potentially dirty scatterlock state
4693 for (const auto &p : strong->inode_scatterlocks) {
4694 CInode *in = get_inode(p.first);
4695 ceph_assert(in);
4696 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4697 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4698 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4699 rejoin_potential_updated_scatterlocks.insert(in);
4700 }
4701
4702 rejoin_unlinked_inodes[from].clear();
4703
4704 // surviving peer may send incorrect dirfrag here (maybe they didn't
4705 // get the fragment notify, or maybe we rolled back?). we need to
4706 // infer the right frag and get them with the program. somehow.
4707 // we don't normally send ACK.. so we'll need to bundle this with
4708 // MISSING or something.
4709
4710 // strong dirfrags/dentries.
4711 // also process auth_pins, xlocks.
4712 for (const auto &p : strong->strong_dirfrags) {
4713 auto& dirfrag = p.first;
4714 CInode *diri = get_inode(dirfrag.ino);
4715 if (!diri)
4716 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4717 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4718 bool refragged = false;
4719 if (dir) {
4720 dout(10) << " have " << *dir << dendl;
4721 } else {
4722 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4723 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4724 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4725 dir = rejoin_invent_dirfrag(dirfrag);
4726 }
4727 if (dir) {
4728 dir->add_replica(from, p.second.nonce);
4729 dir->dir_rep = p.second.dir_rep;
4730 } else {
4731 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4732 frag_vec_t leaves;
4733 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4734 if (leaves.empty())
4735 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4736 dout(10) << " maps to frag(s) " << leaves << dendl;
4737 for (const auto& leaf : leaves) {
4738 CDir *dir = diri->get_dirfrag(leaf);
4739 if (!dir)
4740 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4741 else
4742 dout(10) << " have(approx) " << *dir << dendl;
4743 dir->add_replica(from, p.second.nonce);
4744 dir->dir_rep = p.second.dir_rep;
4745 }
4746 refragged = true;
4747 }
4748
4749 const auto it = strong->strong_dentries.find(dirfrag);
4750 if (it != strong->strong_dentries.end()) {
4751 const auto& dmap = it->second;
4752 for (const auto &q : dmap) {
4753 const string_snap_t& ss = q.first;
4754 const MMDSCacheRejoin::dn_strong& d = q.second;
4755 CDentry *dn;
4756 if (!refragged)
4757 dn = dir->lookup(ss.name, ss.snapid);
4758 else {
4759 frag_t fg = diri->pick_dirfrag(ss.name);
4760 dir = diri->get_dirfrag(fg);
4761 ceph_assert(dir);
4762 dn = dir->lookup(ss.name, ss.snapid);
4763 }
4764 if (!dn) {
4765 if (d.is_remote()) {
4766 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4767 } else if (d.is_null()) {
4768 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4769 } else {
4770 CInode *in = get_inode(d.ino, ss.snapid);
4771 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4772 dn = dir->add_primary_dentry(ss.name, in, mempool::mds_co::string(d.alternate_name), d.first, ss.snapid);
4773 }
4774 dout(10) << " invented " << *dn << dendl;
4775 }
4776 CDentry::linkage_t *dnl = dn->get_linkage();
4777
4778 // dn auth_pin?
4779 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4780 if (pinned_it != strong->authpinned_dentries.end()) {
4781 const auto peer_reqid_it = pinned_it->second.find(ss);
4782 if (peer_reqid_it != pinned_it->second.end()) {
4783 for (const auto &r : peer_reqid_it->second) {
4784 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4785
4786 // get/create peer mdrequest
4787 MDRequestRef mdr;
4788 if (have_request(r.reqid))
4789 mdr = request_get(r.reqid);
4790 else
4791 mdr = request_start_peer(r.reqid, r.attempt, strong);
4792 mdr->auth_pin(dn);
4793 }
4794 }
4795 }
4796
4797 // dn xlock?
4798 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4799 if (xlocked_it != strong->xlocked_dentries.end()) {
4800 const auto ss_req_it = xlocked_it->second.find(ss);
4801 if (ss_req_it != xlocked_it->second.end()) {
4802 const MMDSCacheRejoin::peer_reqid& r = ss_req_it->second;
4803 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4804 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4805 ceph_assert(mdr->is_auth_pinned(dn));
4806 if (!mdr->is_xlocked(&dn->versionlock)) {
4807 ceph_assert(dn->versionlock.can_xlock_local());
4808 dn->versionlock.get_xlock(mdr, mdr->get_client());
4809 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4810 }
4811 if (dn->lock.is_stable())
4812 dn->auth_pin(&dn->lock);
4813 dn->lock.set_state(LOCK_XLOCK);
4814 dn->lock.get_xlock(mdr, mdr->get_client());
4815 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4816 }
4817 }
4818
4819 dn->add_replica(from, d.nonce);
4820 dout(10) << " have " << *dn << dendl;
4821
4822 if (dnl->is_primary()) {
4823 if (d.is_primary()) {
4824 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4825 // the survivor missed MDentryUnlink+MDentryLink messages ?
4826 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4827 CInode *in = get_inode(d.ino, ss.snapid);
4828 ceph_assert(in);
4829 ceph_assert(in->get_parent_dn());
4830 rejoin_unlinked_inodes[from].insert(in);
4831 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4832 }
4833 } else {
4834 // the survivor missed MDentryLink message ?
4835 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4836 dout(7) << " sender doesn't have primay dentry" << dendl;
4837 }
4838 } else {
4839 if (d.is_primary()) {
4840 // the survivor missed MDentryUnlink message ?
4841 CInode *in = get_inode(d.ino, ss.snapid);
4842 ceph_assert(in);
4843 ceph_assert(in->get_parent_dn());
4844 rejoin_unlinked_inodes[from].insert(in);
4845 dout(7) << " sender has primary dentry but we don't" << dendl;
4846 }
4847 }
4848 }
4849 }
4850 }
4851
4852 for (const auto &p : strong->strong_inodes) {
4853 CInode *in = get_inode(p.first);
4854 ceph_assert(in);
4855 in->add_replica(from, p.second.nonce);
4856 dout(10) << " have " << *in << dendl;
4857
4858 const MMDSCacheRejoin::inode_strong& is = p.second;
4859
4860 // caps_wanted
4861 if (is.caps_wanted) {
4862 in->set_mds_caps_wanted(from, is.caps_wanted);
4863 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4864 << " on " << *in << dendl;
4865 }
4866
4867 // scatterlocks?
4868 // infer state from replica state:
4869 // * go to MIX if they might have wrlocks
4870 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4871 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4872 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4873 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4874
4875 // auth pin?
4876 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4877 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4878 for (const auto& r : authpinned_inodes_it->second) {
4879 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4880
4881 // get/create peer mdrequest
4882 MDRequestRef mdr;
4883 if (have_request(r.reqid))
4884 mdr = request_get(r.reqid);
4885 else
4886 mdr = request_start_peer(r.reqid, r.attempt, strong);
4887 if (strong->frozen_authpin_inodes.count(in->vino())) {
4888 ceph_assert(!in->get_num_auth_pins());
4889 mdr->freeze_auth_pin(in);
4890 } else {
4891 ceph_assert(!in->is_frozen_auth_pin());
4892 }
4893 mdr->auth_pin(in);
4894 }
4895 }
4896 // xlock(s)?
4897 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4898 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4899 for (const auto &q : xlocked_inodes_it->second) {
4900 SimpleLock *lock = in->get_lock(q.first);
4901 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4902 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4903 ceph_assert(mdr->is_auth_pinned(in));
4904 if (!mdr->is_xlocked(&in->versionlock)) {
4905 ceph_assert(in->versionlock.can_xlock_local());
4906 in->versionlock.get_xlock(mdr, mdr->get_client());
4907 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4908 }
4909 if (lock->is_stable())
4910 in->auth_pin(lock);
4911 lock->set_state(LOCK_XLOCK);
4912 if (lock == &in->filelock)
4913 in->loner_cap = -1;
4914 lock->get_xlock(mdr, mdr->get_client());
4915 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4916 }
4917 }
4918 }
4919 // wrlock(s)?
4920 for (const auto &p : strong->wrlocked_inodes) {
4921 CInode *in = get_inode(p.first);
4922 for (const auto &q : p.second) {
4923 SimpleLock *lock = in->get_lock(q.first);
4924 for (const auto &r : q.second) {
4925 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4926 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4927 if (in->is_auth())
4928 ceph_assert(mdr->is_auth_pinned(in));
4929 lock->set_state(LOCK_MIX);
4930 if (lock == &in->filelock)
4931 in->loner_cap = -1;
4932 lock->get_wrlock(true);
4933 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4934 }
4935 }
4936 }
4937
4938 // done?
4939 ceph_assert(rejoin_gather.count(from));
4940 rejoin_gather.erase(from);
4941 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4942 rejoin_gather_finish();
4943 } else {
4944 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4945 }
4946 }
4947
// Handle a cache-rejoin ACK from the authoritative MDS `from`.  The ack
// carries the auth's view of dirfrags, dentries, inode bases, lock states
// and cap import results accumulated during the rejoin phase; this rank
// applies that state to its replicas, clearing the *_REJOINING flags as it
// goes.  When the last expected ack arrives (and we are a recovering rank),
// snaprealm parent opens are kicked off via open_snaprealms().
4948 void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
4949 {
4950 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4951 mds_rank_t from = mds_rank_t(ack->get_source().num());
4952
4953 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
  // survivor == we are not in the rejoin state ourselves; affects whether
  // lock-state changes wake waiters now or get deferred, and whether
  // clients are notified of snaprealm updates below.
4954 bool survivor = !mds->is_rejoin();
4955
4956 // for sending cache expire message
4957 set<CInode*> isolated_inodes;
4958 set<CInode*> refragged_inodes;
4959 list<pair<CInode*,int> > updated_realms;
4960
4961 // dirs
4962 for (const auto &p : ack->strong_dirfrags) {
4963 // we may have had incorrect dir fragmentation; refragment based
4964 // on what they auth tells us.
4965 CDir *dir = get_dirfrag(p.first);
4966 if (!dir) {
4967 dir = get_force_dirfrag(p.first, false);
4968 if (dir)
4969 refragged_inodes.insert(dir->get_inode());
4970 }
4971 if (!dir) {
4972 CInode *diri = get_inode(p.first.ino);
4973 if (!diri) {
4974 // barebones inode; the full inode loop below will clean up.
4975 diri = new CInode(this, false);
4976 auto _inode = diri->_get_inode();
4977 _inode->ino = p.first.ino;
4978 _inode->mode = S_IFDIR;
4979 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4980
4981 add_inode(diri);
4982 if (MDS_INO_MDSDIR(from) == p.first.ino) {
4983 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4984 dout(10) << " add inode " << *diri << dendl;
4985 } else {
      // track inodes we invented with no parent linkage; they must be
      // linked up by the dentry processing below or the assert at the
      // end of this function fires.
4986 diri->inode_auth = CDIR_AUTH_DEFAULT;
4987 isolated_inodes.insert(diri);
4988 dout(10) << " unconnected dirfrag " << p.first << dendl;
4989 }
4990 }
4991 // barebones dirfrag; the full dirfrag loop below will clean up.
4992 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4993 if (MDS_INO_MDSDIR(from) == p.first.ino ||
4994 (dir->authority() != CDIR_AUTH_UNDEF &&
4995 dir->authority().first != from))
4996 adjust_subtree_auth(dir, from);
4997 dout(10) << " add dirfrag " << *dir << dendl;
4998 }
4999
5000 dir->set_replica_nonce(p.second.nonce);
5001 dir->state_clear(CDir::STATE_REJOINING);
5002 dout(10) << " got " << *dir << dendl;
5003
5004 // dentries
5005 auto it = ack->strong_dentries.find(p.first);
5006 if (it != ack->strong_dentries.end()) {
5007 for (const auto &q : it->second) {
5008 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
5009 if(!dn)
5010 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
5011
5012 CDentry::linkage_t *dnl = dn->get_linkage();
5013
5014 ceph_assert(dn->last == q.first.snapid);
5015 if (dn->first != q.second.first) {
5016 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5017 dn->first = q.second.first;
5018 }
5019
5020 // may have bad linkage if we missed dentry link/unlink messages
5021 if (dnl->is_primary()) {
5022 CInode *in = dnl->get_inode();
5023 if (!q.second.is_primary() ||
5024 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5025 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5026 dir->unlink_inode(dn);
5027 }
5028 } else if (dnl->is_remote()) {
5029 if (!q.second.is_remote() ||
5030 q.second.remote_ino != dnl->get_remote_ino() ||
5031 q.second.remote_d_type != dnl->get_remote_d_type()) {
5032 dout(10) << " had bad linkage for " << *dn << dendl;
5033 dir->unlink_inode(dn);
5034 }
5035 } else {
5036 if (!q.second.is_null())
5037 dout(10) << " had bad linkage for " << *dn << dendl;
5038 }
5039
5040 // hmm, did we have the proper linkage here?
5041 if (dnl->is_null() && !q.second.is_null()) {
5042 if (q.second.is_remote()) {
5043 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5044 } else {
5045 CInode *in = get_inode(q.second.ino, q.first.snapid);
5046 if (!in) {
5047 // barebones inode; assume it's dir, the full inode loop below will clean up.
5048 in = new CInode(this, false, q.second.first, q.first.snapid);
5049 auto _inode = in->_get_inode();
5050 _inode->ino = q.second.ino;
5051 _inode->mode = S_IFDIR;
5052 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5053 add_inode(in);
5054 dout(10) << " add inode " << *in << dendl;
5055 } else if (in->get_parent_dn()) {
5056 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5057 << ", unlinking " << *in << dendl;
5058 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5059 }
        // linking here resolves any "unconnected dirfrag" inode invented
        // in the strong_dirfrags loop above.
5060 dn->dir->link_primary_inode(dn, in);
5061 isolated_inodes.erase(in);
5062 }
5063 }
5064
5065 dn->set_replica_nonce(q.second.nonce);
5066 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5067 dn->state_clear(CDentry::STATE_REJOINING);
5068 dout(10) << " got " << *dn << dendl;
5069 }
5070 }
5071 }
5072
  // close dirfrags that existed only under our stale fragmentation and
  // were not re-acked by the auth
5073 for (const auto& in : refragged_inodes) {
5074 auto&& ls = in->get_nested_dirfrags();
5075 for (const auto& dir : ls) {
5076 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
5077 continue;
5078 ceph_assert(dir->get_num_any() == 0);
5079 in->close_dirfrag(dir->get_frag());
5080 }
5081 }
5082
5083 // full dirfrags
5084 for (const auto &p : ack->dirfrag_bases) {
5085 CDir *dir = get_dirfrag(p.first);
5086 ceph_assert(dir);
5087 auto q = p.second.cbegin();
5088 dir->_decode_base(q);
5089 dout(10) << " got dir replica " << *dir << dendl;
5090 }
5091
5092 // full inodes
5093 auto p = ack->inode_base.cbegin();
5094 while (!p.end()) {
5095 inodeno_t ino;
5096 snapid_t last;
5097 bufferlist basebl;
5098 decode(ino, p);
5099 decode(last, p);
5100 decode(basebl, p);
5101 CInode *in = get_inode(ino, last);
5102 ceph_assert(in);
5103 auto q = basebl.cbegin();
    // remember the snaprealm seq before decode so we can detect a
    // realm change and queue a client notification below
5104 snapid_t sseq = 0;
5105 if (in->snaprealm)
5106 sseq = in->snaprealm->srnode.seq;
5107 in->_decode_base(q);
5108 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5109 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5110 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5111 }
5112 dout(10) << " got inode base " << *in << dendl;
5113 }
5114
5115 // inodes
5116 p = ack->inode_locks.cbegin();
5117 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5118 while (!p.end()) {
5119 inodeno_t ino;
5120 snapid_t last;
5121 __u32 nonce;
5122 bufferlist lockbl;
5123 decode(ino, p);
5124 decode(last, p);
5125 decode(nonce, p);
5126 decode(lockbl, p);
5127
5128 CInode *in = get_inode(ino, last);
5129 ceph_assert(in);
5130 in->set_replica_nonce(nonce);
5131 auto q = lockbl.cbegin();
5132 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5133 in->state_clear(CInode::STATE_REJOINING);
5134 dout(10) << " got inode locks " << *in << dendl;
5135 }
5136
5137 // FIXME: This can happen if entire subtree, together with the inode subtree root
5138 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5139 ceph_assert(isolated_inodes.empty());
5140
  // caps this rank exported that the auth has now imported; tell the
  // affected clients their caps moved (CEPH_CAP_OP_EXPORT)
5141 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5142 auto bp = ack->imported_caps.cbegin();
5143 decode(peer_imported, bp);
5144
5145 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5146 p != peer_imported.end();
5147 ++p) {
5148 auto& ex = cap_exports.at(p->first);
5149 ceph_assert(ex.first == from);
5150 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5151 q != p->second.end();
5152 ++q) {
5153 auto r = ex.second.find(q->first);
5154 ceph_assert(r != ex.second.end());
5155
5156 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5157 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5158 if (!session) {
5159 dout(10) << " no session for client." << p->first << dendl;
5160 ex.second.erase(r);
5161 continue;
5162 }
5163
5164 // mark client caps stale.
5165 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
5166 r->second.capinfo.cap_id, 0,
5167 mds->get_osd_epoch_barrier());
5168 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5169 (q->second.cap_id > 0 ? from : -1), 0);
5170 mds->send_message_client_counted(m, session);
5171
5172 ex.second.erase(r);
5173 }
5174 ceph_assert(ex.second.empty());
5175 }
5176
5177 for (auto p : updated_realms) {
5178 CInode *in = p.first;
5179 bool notify_clients;
5180 if (mds->is_rejoin()) {
      // defer: snaprealm parents are opened after rejoin completes
5181 if (!rejoin_pending_snaprealms.count(in)) {
5182 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5183 rejoin_pending_snaprealms.insert(in);
5184 }
5185 notify_clients = false;
5186 } else {
5187 // notify clients if I'm survivor
5188 notify_clients = true;
5189 }
5190 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
5191 }
5192
5193 // done?
5194 ceph_assert(rejoin_ack_gather.count(from));
5195 rejoin_ack_gather.erase(from);
5196 if (!survivor) {
5197 if (rejoin_gather.empty()) {
5198 // eval unstable scatter locks after all wrlocks are rejoined.
5199 while (!rejoin_eval_locks.empty()) {
5200 SimpleLock *lock = rejoin_eval_locks.front();
5201 rejoin_eval_locks.pop_front();
5202 if (!lock->is_stable())
5203 mds->locker->eval_gather(lock);
5204 }
5205 }
5206
5207 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5208 rejoin_ack_gather.empty()) {
5209 // finally, kickstart past snap parent opens
5210 open_snaprealms();
5211 } else {
5212 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5213 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5214 }
5215 } else {
5216 // survivor.
5217 mds->queue_waiters(rejoin_waiters);
5218 }
5219 }
5220
5221 /**
5222 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5223 *
5224 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5225 * messages that clean these guys up...
5226 */
5227 void MDCache::rejoin_trim_undef_inodes()
5228 {
5229 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5230
  // drain the set one inode at a time; erase before trimming so the
  // loop terminates even if trimming touches the set
5231 while (!rejoin_undef_inodes.empty()) {
5232 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5233 CInode *in = *p;
5234 rejoin_undef_inodes.erase(p);
5235
5236 in->clear_replica_map();
5237
5238 // close out dirfrags
5239 if (in->is_dir()) {
5240 const auto&& dfls = in->get_dirfrags();
5241 for (const auto& dir : dfls) {
5242 dir->clear_replica_map();
5243
      // NOTE(review): this removes dentries from dir->items while
      // range-iterating it — presumably safe with the container/removal
      // semantics used here, but worth confirming against CDir::remove_dentry
5244 for (auto &p : dir->items) {
5245 CDentry *dn = p.second;
5246 dn->clear_replica_map();
5247
5248 dout(10) << " trimming " << *dn << dendl;
5249 dir->remove_dentry(dn);
5250 }
5251
5252 dout(10) << " trimming " << *dir << dendl;
5253 in->close_dirfrag(dir->dirfrag().frag);
5254 }
5255 }
5256
  // finally drop the inode itself: either via its parent dentry, or
  // directly if it has no parent linkage
5257 CDentry *dn = in->get_parent_dn();
5258 if (dn) {
5259 dn->clear_replica_map();
5260 dout(10) << " trimming " << *dn << dendl;
5261 dn->dir->remove_dentry(dn);
5262 } else {
5263 dout(10) << " trimming " << *in << dendl;
5264 remove_inode(in);
5265 }
5266 }
5267
5268 ceph_assert(rejoin_undef_inodes.empty());
5269 }
5270
// Run the post-gather stage of rejoin once all expected rejoin messages
// have arrived.  Both open_undef_inodes_dirfrags() and
// process_imported_caps() may need to do async work; when they return
// true this function bails out and is re-entered by their completion
// callbacks.
5271 void MDCache::rejoin_gather_finish()
5272 {
5273 dout(10) << "rejoin_gather_finish" << dendl;
5274 ceph_assert(mds->is_rejoin());
5275 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
5276
  // true => async fetch in progress; we'll be called again when it finishes
5277 if (open_undef_inodes_dirfrags())
5278 return;
5279
5280 if (process_imported_caps())
5281 return;
5282
5283 choose_lock_states_and_reconnect_caps();
5284
5285 identify_files_to_recover();
5286 rejoin_send_acks();
5287
5288 // signal completion of fetches, rejoin_gather_finish, etc.
5289 rejoin_ack_gather.erase(mds->get_nodeid());
5290
5291 // did we already get our acks too?
5292 if (rejoin_ack_gather.empty()) {
5293 // finally, open snaprealms
5294 open_snaprealms();
5295 }
5296 }
5297
// Completion context for open_ino() calls issued during rejoin cap
// processing; forwards the result to MDCache::rejoin_open_ino_finish().
5298 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5299 inodeno_t ino;
5300 public:
5301 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5302 void finish(int r) override {
5303 mdcache->rejoin_open_ino_finish(ino, r);
5304 }
5305 };
5306
5307 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5308 {
5309 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5310
5311 if (ret < 0) {
5312 cap_imports_missing.insert(ino);
5313 } else if (ret == mds->get_nodeid()) {
5314 ceph_assert(get_inode(ino));
5315 } else {
5316 auto p = cap_imports.find(ino);
5317 ceph_assert(p != cap_imports.end());
5318 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5319 ceph_assert(q->second.count(MDS_RANK_NONE));
5320 ceph_assert(q->second.size() == 1);
5321 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5322 }
5323 cap_imports.erase(p);
5324 }
5325
5326 ceph_assert(cap_imports_num_opening > 0);
5327 cap_imports_num_opening--;
5328
5329 if (cap_imports_num_opening == 0) {
5330 if (rejoin_gather.empty())
5331 rejoin_gather_finish();
5332 else if (rejoin_gather.count(mds->get_nodeid()))
5333 process_imported_caps();
5334 }
5335 }
5336
// Log-commit context for the ESessions entry written while force-opening
// reconnecting clients' sessions; session_map is filled in by
// Server::prepare_force_open_sessions() before the entry is submitted.
5337 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5338 public:
5339 map<client_t,pair<Session*,uint64_t> > session_map;
5340 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
5341 void finish(int r) override {
5342 ceph_assert(r == 0);
5343 mdcache->rejoin_open_sessions_finish(session_map);
5344 }
5345 };
5346
// Called after the ESessions log entry commits: finalize the force-opened
// sessions, stash the map for later cap processing, and resume
// rejoin_gather_finish() if nothing else is outstanding.
5347 void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
5348 {
5349 dout(10) << "rejoin_open_sessions_finish" << dendl;
5350 mds->server->finish_force_open_sessions(session_map);
  // keep the opened sessions; process_imported_caps() looks clients up here
5351 rejoin_session_map.swap(session_map);
5352 if (rejoin_gather.empty())
5353 rejoin_gather_finish();
5354 }
5355
// Completion of an open-file-table prefetch lookup for `ino`.  Mirrors the
// non-prefetch path in rejoin_open_ino_finish(): record failures as
// missing, and hand caps over to the auth rank when it is someone else.
// Does nothing if no cap import is pending for this ino.
5356 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5357 {
5358 auto p = cap_imports.find(ino);
5359 if (p != cap_imports.end()) {
5360 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5361 if (ret < 0) {
5362 cap_imports_missing.insert(ino);
5363 } else if (ret != mds->get_nodeid()) {
5364 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
      // only entries with no known source mds can be pending here
5365 ceph_assert(q->second.count(MDS_RANK_NONE));
5366 ceph_assert(q->second.size() == 1);
5367 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5368 }
5369 cap_imports.erase(p);
5370 }
5371 }
5372 }
5373
// Process client caps gathered during reconnect/rejoin.
//
// Returns true if async work was started (open-file-table prefetch,
// open_ino lookups, or session journaling) and the caller must wait for
// the completion callback to re-enter; returns false when all processable
// cap imports have been handled.
5374 bool MDCache::process_imported_caps()
5375 {
5376 dout(10) << "process_imported_caps" << dendl;
5377
  // make sure open-file-table inodes are prefetched first; re-enter via
  // the lambda when the prefetch completes
5378 if (!open_file_table.is_prefetched() &&
5379 open_file_table.prefetch_inodes()) {
5380 open_file_table.wait_for_prefetch(
5381 new MDSInternalContextWrapper(mds,
5382 new LambdaContext([this](int r) {
5383 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5384 process_imported_caps();
5385 })
5386 )
5387 );
5388 return true;
5389 }
5390
  // kick off open_ino() for every cap-import inode not yet in cache
5391 for (auto& p : cap_imports) {
5392 CInode *in = get_inode(p.first);
5393 if (in) {
5394 ceph_assert(in->is_auth());
5395 cap_imports_missing.erase(p.first);
5396 continue;
5397 }
5398 if (cap_imports_missing.count(p.first) > 0)
5399 continue;
5400
    // try to pull a (parent ino, dentry name) hint out of any reconnect
    // record whose path is a single component, to speed the lookup
5401 uint64_t parent_ino = 0;
5402 std::string_view d_name;
5403 for (auto& q : p.second) {
5404 for (auto& r : q.second) {
5405 auto &icr = r.second;
5406 if (icr.capinfo.pathbase &&
5407 icr.path.length() > 0 &&
5408 icr.path.find('/') == string::npos) {
5409 parent_ino = icr.capinfo.pathbase;
5410 d_name = icr.path;
5411 break;
5412 }
5413 }
5414 if (parent_ino)
5415 break;
5416 }
5417
5418 dout(10) << " opening missing ino " << p.first << dendl;
5419 cap_imports_num_opening++;
5420 auto fin = new C_MDC_RejoinOpenInoFinish(this, p.first);
5421 if (parent_ino) {
5422 vector<inode_backpointer_t> ancestors;
5423 ancestors.push_back(inode_backpointer_t(parent_ino, string{d_name}, 0));
5424 open_ino(p.first, (int64_t)-1, fin, false, false, &ancestors);
5425 } else {
5426 open_ino(p.first, (int64_t)-1, fin, false);
5427 }
    // long walk over many inos; keep the MDS beacon alive
5428 if (!(cap_imports_num_opening % 1000))
5429 mds->heartbeat_reset();
5430 }
5431
5432 if (cap_imports_num_opening > 0)
5433 return true;
5434
5435 // called by rejoin_gather_finish() ?
5436 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    // first, journal+open sessions for reconnecting clients (async; the
    // C_MDC_RejoinSessionsOpened callback re-enters when it commits)
5437 if (!rejoin_client_map.empty() &&
5438 rejoin_session_map.empty()) {
5439 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5440 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5441 rejoin_client_metadata_map,
5442 finish->session_map);
5443 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5444 std::move(rejoin_client_metadata_map));
5445 mds->mdlog->start_submit_entry(le, finish);
5446 mds->mdlog->flush();
5447 rejoin_client_map.clear();
5448 rejoin_client_metadata_map.clear();
5449 return true;
5450 }
5451
5452 // process caps that were exported by peer rename
5453 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin();
5454 p != rejoin_peer_exports.end();
5455 ++p) {
5456 CInode *in = get_inode(p->first);
5457 ceph_assert(in);
5458 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5459 q != p->second.second.end();
5460 ++q) {
5461 auto r = rejoin_session_map.find(q->first);
5462 if (r == rejoin_session_map.end())
5463 continue;
5464
5465 Session *session = r->second.first;
5466 Capability *cap = in->get_client_cap(q->first);
5467 if (!cap) {
5468 cap = in->add_client_cap(q->first, session);
5469 // add empty item to reconnected_caps
5470 (void)reconnected_caps[p->first][q->first];
5471 }
5472 cap->merge(q->second, true);
5473
5474 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5475 ceph_assert(cap->get_last_seq() == im.issue_seq);
5476 ceph_assert(cap->get_mseq() == im.mseq);
5477 cap->set_cap_id(im.cap_id);
5478 // send cap import because we assigned a new cap ID
5479 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5480 p->second.first, CEPH_CAP_FLAG_AUTH);
5481 }
5482 }
5483 rejoin_peer_exports.clear();
5484 rejoin_imported_caps.clear();
5485
5486 // process cap imports
5487 // ino -> client -> frommds -> capex
5488 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5489 CInode *in = get_inode(p->first);
5490 if (!in) {
5491 dout(10) << " still missing ino " << p->first
5492 << ", will try again after replayed client requests" << dendl;
5493 ++p;
5494 continue;
5495 }
5496 ceph_assert(in->is_auth());
5497 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5498 Session *session;
5499 {
5500 auto r = rejoin_session_map.find(q->first);
5501 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5502 }
5503
5504 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5505 if (!session) {
          // no session: record an empty import so the source mds still
          // gets an answer in our ack
5506 if (r->first >= 0)
5507 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5508 continue;
5509 }
5510
5511 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5512 add_reconnected_cap(q->first, in->ino(), r->second);
5513 if (r->first >= 0) {
5514 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5515 cap->inc_mseq();
5516 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5517
5518 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5519 im.cap_id = cap->get_cap_id();
5520 im.issue_seq = cap->get_last_seq();
5521 im.mseq = cap->get_mseq();
5522 }
5523 }
5524 }
5525 cap_imports.erase(p++); // remove and move on
5526 }
5527 } else {
    // still gathering our own rejoin: just drop non-auth cache and mark
    // our own gather entry done
5528 trim_non_auth();
5529
5530 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5531 rejoin_gather.erase(mds->get_nodeid());
5532 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
5533 maybe_send_pending_rejoins();
5534 }
5535 return false;
5536 }
5537
// Rebuild the need-snapflush bookkeeping for `client` on `head_in` after
// reconnect: walk the snapshotted (non-head) inodes covering snapids in
// (snap_follows, head_in->first), register each as needing a snapflush
// from the client, and put the relevant locks into LOCK_SNAP_SYNC with a
// wrlock held until the flush arrives.
5538 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5539 client_t client, snapid_t snap_follows)
5540 {
5541 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5542
  // nothing to do if no snapshots fall in the unflushed range
5543 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5544 return;
5545
5546 const set<snapid_t>& snaps = realm->get_snaps();
5547 snapid_t follows = snap_follows;
5548
  // advance through successive snap inodes until we reach the head
5549 while (true) {
5550 CInode *in = pick_inode_snap(head_in, follows);
5551 if (in == head_in)
5552 break;
5553
5554 bool need_snapflush = false;
5555 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5556 p != snaps.end() && *p <= in->last;
5557 ++p) {
5558 head_in->add_need_snapflush(in, *p, client);
5559 need_snapflush = true;
5560 }
5561 follows = in->last;
5562 if (!need_snapflush)
5563 continue;
5564
5565 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5566
    // first client to need a flush on this snap inode: pin and wrlock
    // every cap-related lock until the snapflush arrives
5567 if (in->client_snap_caps.empty()) {
5568 for (int i = 0; i < num_cinode_locks; i++) {
5569 int lockid = cinode_lock_info[i].lock;
5570 SimpleLock *lock = in->get_lock(lockid);
5571 ceph_assert(lock);
5572 in->auth_pin(lock);
5573 lock->set_state(LOCK_SNAP_SYNC);
5574 lock->get_wrlock(true);
5575 }
5576 }
5577 in->client_snap_caps.insert(client);
5578 mds->locker->mark_need_snapflush_inode(in);
5579 }
5580 }
5581
5582 /*
5583 * choose lock states based on reconnected caps
5584 */
5585 void MDCache::choose_lock_states_and_reconnect_caps()
5586 {
5587 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5588
5589 int count = 0;
5590 for (auto p : inode_map) {
5591 CInode *in = p.second;
5592 if (in->last != CEPH_NOSNAP)
5593 continue;
5594
5595 if (in->is_auth() && !in->is_base() && in->get_inode()->is_dirty_rstat())
5596 in->mark_dirty_rstat();
5597
5598 int dirty_caps = 0;
5599 auto q = reconnected_caps.find(in->ino());
5600 if (q != reconnected_caps.end()) {
5601 for (const auto &it : q->second)
5602 dirty_caps |= it.second.dirty_caps;
5603 }
5604 in->choose_lock_states(dirty_caps);
5605 dout(15) << " chose lock states on " << *in << dendl;
5606
5607 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5608 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5609 rejoin_pending_snaprealms.insert(in);
5610 }
5611
5612 if (!(++count % 1000))
5613 mds->heartbeat_reset();
5614 }
5615 }
5616
5617 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5618 map<client_t,ref_t<MClientSnap>>& splits)
5619 {
5620 ref_t<MClientSnap> snap;
5621 auto it = splits.find(client);
5622 if (it != splits.end()) {
5623 snap = it->second;
5624 snap->head.op = CEPH_SNAP_OP_SPLIT;
5625 } else {
5626 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5627 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5628 snap->head.split = realm->inode->ino();
5629 snap->bl = realm->get_snap_trace();
5630
5631 for (const auto& child : realm->open_children)
5632 snap->split_realms.push_back(child->inode->ino());
5633 }
5634 snap->split_inos.push_back(ino);
5635 }
5636
// Prepare per-client MClientSnap messages announcing that `realm` is being
// merged into `parent_realm`: every ino with caps in the dying realm and
// every open child realm is listed as "split" into the parent.  Messages
// are accumulated in `splits` (one per client) and sent later via
// send_snaps().
5637 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5638 map<client_t,ref_t<MClientSnap>>& splits)
5639 {
5640 ceph_assert(parent_realm);
5641
  // shared payload for every client's message
5642 vector<inodeno_t> split_inos;
5643 vector<inodeno_t> split_realms;
5644
5645 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p)
5646 split_inos.push_back((*p)->ino());
5647 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5648 p != realm->open_children.end();
5649 ++p)
5650 split_realms.push_back((*p)->inode->ino());
5651
5652 for (const auto& p : realm->client_caps) {
5653 ceph_assert(!p.second->empty());
    // only build a message if this client doesn't already have one queued
5654 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5655 if (em.second) {
5656 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5657 update->head.split = parent_realm->inode->ino();
5658 update->split_inos = split_inos;
5659 update->split_realms = split_realms;
5660 update->bl = parent_realm->get_snap_trace();
5661 em.first->second = std::move(update);
5662 }
5663 }
5664 }
5665
5666 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5667 {
5668 dout(10) << "send_snaps" << dendl;
5669
5670 for (auto &p : splits) {
5671 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5672 if (session) {
5673 dout(10) << " client." << p.first
5674 << " split " << p.second->head.split
5675 << " inos " << p.second->split_inos
5676 << dendl;
5677 mds->send_message_client_counted(p.second, session);
5678 } else {
5679 dout(10) << " no session for client." << p.first << dendl;
5680 }
5681 }
5682 splits.clear();
5683 }
5684
5685
5686 /*
5687 * remove any items from logsegment open_file lists that don't have
5688 * any caps
5689 */
5690 void MDCache::clean_open_file_lists()
5691 {
5692 dout(10) << "clean_open_file_lists" << dendl;
5693
5694 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5695 p != mds->mdlog->segments.end();
5696 ++p) {
5697 LogSegment *ls = p->second;
5698
5699 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5700 while (!q.end()) {
5701 CInode *in = *q;
      // advance before (possibly) unlinking the current entry so the
      // iterator stays valid
5702 ++q;
5703 if (in->last == CEPH_NOSNAP) {
5704 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5705 in->item_open_file.remove_myself();
5706 } else {
      // snapshotted inode: only keep it listed while a client still owes
      // a snapflush (client_snap_caps non-empty)
5707 if (in->client_snap_caps.empty()) {
5708 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5709 in->item_open_file.remove_myself();
5710 }
5711 }
5712 }
5713 }
5714 }
5715
// Dump all still-interesting open-file entries from every log segment as a
// formatted "openfiles" array (path, inode store base, caps per entry).
// Uses the same keep/skip criteria as clean_open_file_lists(): head inodes
// with wanted caps, or snap inodes awaiting snapflushes.
5716 void MDCache::dump_openfiles(Formatter *f)
5717 {
5718 f->open_array_section("openfiles");
5719 for (auto p = mds->mdlog->segments.begin();
5720 p != mds->mdlog->segments.end();
5721 ++p) {
5722 LogSegment *ls = p->second;
5723
5724 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5725 while (!q.end()) {
5726 CInode *in = *q;
5727 ++q;
5728 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5729 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5730 continue;
5731 f->open_object_section("file");
5732 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5733 f->close_section();
5734 }
5735 }
5736 f->close_section();
5737 }
5738
5739 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5740 {
5741 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5742 << " on " << *in << dendl;
5743 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5744 if (!session) {
5745 dout(10) << " no session for client." << client << dendl;
5746 return NULL;
5747 }
5748
5749 Capability *cap = in->reconnect_cap(client, icr, session);
5750
5751 if (frommds >= 0) {
5752 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5753 cap->inc_mseq();
5754 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5755 }
5756
5757 return cap;
5758 }
5759
// Give up on cap imports whose inodes never materialized: mark the clients'
// caps stale with a zeroed EXPORT message, wake anyone waiting on a
// reconnect for those inos, drop the bookkeeping, and emit one cluster-log
// warning listing the missing inodes.
5760 void MDCache::export_remaining_imported_caps()
5761 {
5762 dout(10) << "export_remaining_imported_caps" << dendl;
5763
  // accumulates the list of missing inos for the clog warning below
5764 CachedStackStringStream css;
5765
5766 int count = 0;
5767 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5768 *css << " ino " << p->first << "\n";
5769 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5770 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5771 if (session) {
5772 // mark client caps stale.
5773 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5774 0, 0, 0,
5775 mds->get_osd_epoch_barrier());
      // peer (-1): no other mds holds these caps
5776 stale->set_cap_peer(0, 0, 0, -1, 0);
5777 mds->send_message_client_counted(stale, q->first);
5778 }
5779 }
5780
    // potentially a long walk; keep the MDS beacon alive
5781 if (!(++count % 1000))
5782 mds->heartbeat_reset();
5783 }
5784
5785 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5786 p != cap_reconnect_waiters.end();
5787 ++p)
5788 mds->queue_waiters(p->second);
5789
5790 cap_imports.clear();
5791 cap_reconnect_waiters.clear();
5792
5793 if (css->strv().length()) {
5794 mds->clog->warn() << "failed to reconnect caps for missing inodes:"
5795 << css->strv();
5796 }
5797 }
5798
/**
 * If this client filed a cap reconnect for this inode during the reconnect
 * phase, re-establish the cap now that the inode is in cache.
 *
 * @return the reconnected cap, or nullptr if no reconnect was recorded
 */
Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
{
  client_t client = session->info.get_client();
  Capability *cap = nullptr;
  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
  if (rc) {
    cap = in->reconnect_cap(client, *rc, session);
    dout(10) << "try_reconnect_cap client." << client
	     << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
	     << " issue " << ccap_string(rc->capinfo.issued)
	     << " on " << *in << dendl;
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      // unreplicated: choose lock states compatible with the dirty caps
      // this client recorded at reconnect time (if any)
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
	auto q = p->second.find(client);
	if (q != p->second.end())
	  dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    // wake anyone waiting for this inode's cap to be reconnected
    map<inodeno_t, MDSContext::vec >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
  return cap;
}
5835
5836
5837
5838 // -------
5839 // cap imports and delayed snap parent opens
5840
/**
 * Send a CEPH_CAP_OP_IMPORT to the client for a cap that now lives on
 * this MDS, carrying current issued/wanted state, the realm's snap
 * trace, and the peer (exporter) identity.
 */
void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
			    int peer, int p_flags)
{
  SnapRealm *realm = in->find_snaprealm();
  dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
  if (cap->get_last_seq() == 0) // reconnected cap
    cap->inc_last_seq();
  cap->set_last_issue();
  cap->set_last_issue_stamp(ceph_clock_now());
  cap->clear_new();
  auto reap = make_message<MClientCaps>(CEPH_CAP_OP_IMPORT,
					in->ino(), realm->inode->ino(), cap->get_cap_id(),
					cap->get_last_seq(), cap->pending(), cap->wanted(),
					0, cap->get_mseq(), mds->get_osd_epoch_barrier());
  in->encode_cap_message(reap, cap);
  reap->snapbl = realm->get_snap_trace();
  reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
  mds->send_message_client_counted(reap, session);
}
5861
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  // nothing is actually delayed any more; this is kept as a sanity
  // check that no delayed imports were left behind.
  ceph_assert(delayed_imported_caps.empty());
}
5868
// waiter context: retry open_snaprealms() once blocking work completes
struct C_MDC_OpenSnapRealms : public MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};
5875
5876 void MDCache::open_snaprealms()
5877 {
5878 dout(10) << "open_snaprealms" << dendl;
5879
5880 auto it = rejoin_pending_snaprealms.begin();
5881 while (it != rejoin_pending_snaprealms.end()) {
5882 CInode *in = *it;
5883 SnapRealm *realm = in->snaprealm;
5884 ceph_assert(realm);
5885
5886 map<client_t,ref_t<MClientSnap>> splits;
5887 // finish off client snaprealm reconnects?
5888 auto q = reconnected_snaprealms.find(in->ino());
5889 if (q != reconnected_snaprealms.end()) {
5890 for (const auto& r : q->second)
5891 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5892 reconnected_snaprealms.erase(q);
5893 }
5894
5895 for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) {
5896 CInode *child = *p;
5897 auto q = reconnected_caps.find(child->ino());
5898 ceph_assert(q != reconnected_caps.end());
5899 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5900 Capability *cap = child->get_client_cap(r->first);
5901 if (!cap)
5902 continue;
5903 if (r->second.snap_follows > 0) {
5904 if (r->second.snap_follows < child->first - 1) {
5905 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5906 } else if (r->second.snapflush) {
5907 // When processing a cap flush message that is re-sent, it's possble
5908 // that the sender has already released all WR caps. So we should
5909 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5910 cap->mark_needsnapflush();
5911 }
5912 }
5913 // make sure client's cap is in the correct snaprealm.
5914 if (r->second.realm_ino != in->ino()) {
5915 prepare_realm_split(realm, r->first, child->ino(), splits);
5916 }
5917 }
5918 }
5919
5920 rejoin_pending_snaprealms.erase(it++);
5921 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5922
5923 send_snaps(splits);
5924 }
5925
5926 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5927
5928 if (!reconnected_snaprealms.empty()) {
5929 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5930 for (auto& p : reconnected_snaprealms) {
5931 CachedStackStringStream css;
5932 *css << " " << p.first << " {";
5933 bool first = true;
5934 for (auto& q : p.second) {
5935 if (!first)
5936 *css << ", ";
5937 *css << "client." << q.first << "/" << q.second;
5938 }
5939 *css << "}";
5940 dout(5) << css->strv() << dendl;
5941 }
5942 }
5943 ceph_assert(rejoin_waiters.empty());
5944 ceph_assert(rejoin_pending_snaprealms.empty());
5945 dout(10) << "open_snaprealms - all open" << dendl;
5946 do_delayed_cap_imports();
5947
5948 ceph_assert(rejoin_done);
5949 rejoin_done.release()->complete(0);
5950 reconnected_caps.clear();
5951 }
5952
5953 bool MDCache::open_undef_inodes_dirfrags()
5954 {
5955 dout(10) << "open_undef_inodes_dirfrags "
5956 << rejoin_undef_inodes.size() << " inodes "
5957 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5958
5959 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5960
5961 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5962 p != rejoin_undef_inodes.end();
5963 ++p) {
5964 CInode *in = *p;
5965 ceph_assert(!in->is_base());
5966 ceph_assert(in->get_parent_dir());
5967 fetch_queue.insert(in->get_parent_dir());
5968 }
5969
5970 if (fetch_queue.empty())
5971 return false;
5972
5973 MDSGatherBuilder gather(g_ceph_context,
5974 new MDSInternalContextWrapper(mds,
5975 new LambdaContext([this](int r) {
5976 if (rejoin_gather.empty())
5977 rejoin_gather_finish();
5978 })
5979 )
5980 );
5981
5982 for (set<CDir*>::iterator p = fetch_queue.begin();
5983 p != fetch_queue.end();
5984 ++p) {
5985 CDir *dir = *p;
5986 CInode *diri = dir->get_inode();
5987 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5988 continue;
5989 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5990 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5991 dir->fetch(gather.new_sub());
5992 }
5993 ceph_assert(gather.has_subs());
5994 gather.activate();
5995 return true;
5996 }
5997
// An undefined inode has been loaded from its parent dirfrag; drop it
// from the undef set and, for directories, rebuild its dirfrags now that
// the real fragtree is known.
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    ceph_assert(in->get_inode()->dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
    if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      // the undef dir was opened with a single frag_t() dirfrag, but the
      // real fragtree is split: replace it with the proper fragments
      CDir *dir = in->get_dirfrag(frag_t());
      ceph_assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      // the resulting fragments are undef too and must be fetched
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
	rejoin_undef_dirfrags.insert(dir);
      }
    }
  }
}
6016
6017 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6018 map<client_t,ref_t<MClientSnap>>& updates)
6019 {
6020 if (seq < realm->get_newest_seq()) {
6021 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
6022 << realm->get_newest_seq() << " on " << *realm << dendl;
6023 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
6024 snap->bl = realm->get_snap_trace();
6025 for (const auto& child : realm->open_children)
6026 snap->split_realms.push_back(child->inode->ino());
6027 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
6028 } else {
6029 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6030 << " on " << *realm << dendl;
6031 }
6032 }
6033
6034
6035
/**
 * Send rejoin ACKs to every recovering MDS we haven't acked yet. Each
 * ACK carries the authoritative state (dirfrags, dentries, inodes, lock
 * states) for everything the peer replicates in our auth subtrees, plus
 * the caps we imported from it.
 */
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      // walk up the ancestry, replicating dentry/dir/inode until we hit
      // something the peer already replicates (or a base inode)
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree (breadth-first over nested dirfrags)
    std::queue<CDir*> dq;
    dq.push(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->get_alternate_name(),
					dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  for (const auto& dir : dirs) {
	    dq.push(dir);
	  }
	}
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
6203
// waiter context: re-evaluate locks / reissue caps on an inode once it
// unfreezes. Holds a PTRWAITER pin so the CInode stays valid meanwhile.
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    // eval() returns false if it didn't issue; then issue explicitly
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};
6218
6219 void MDCache::reissue_all_caps()
6220 {
6221 dout(10) << "reissue_all_caps" << dendl;
6222
6223 int count = 0;
6224 for (auto &p : inode_map) {
6225 int n = 1;
6226 CInode *in = p.second;
6227 if (in->is_head() && in->is_any_caps()) {
6228 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6229 if (in->is_frozen_inode()) {
6230 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6231 continue;
6232 }
6233 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6234 n += mds->locker->issue_caps(in);
6235 }
6236
6237 if ((count % 1000) + n >= 1000)
6238 mds->heartbeat_reset();
6239 count += n;
6240 }
6241 }
6242
6243
6244 // ===============================================================================
6245
// journal-completion context for the (currently disabled) copy-on-write
// path in queue_file_recover(); finishes the mutation afterwards
struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
6255
6256
/**
 * Queue an auth inode for file recovery (probing object sizes/mtimes).
 * The snapshot copy-on-write handling below is disabled (commented out);
 * currently the inode is enqueued directly.
 */
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    auto pi = in->project_inode(mut);
    pi.inode.version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}
6300
// finish for the disabled cow path above: apply the journaled mutation,
// then release its locks and clean up
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
6307
6308
6309 /*
6310 * called after recovery to recover file sizes for previously opened (for write)
6311 * files. that is, those where max_size > size.
6312 */
6313 void MDCache::identify_files_to_recover()
6314 {
6315 dout(10) << "identify_files_to_recover" << dendl;
6316 int count = 0;
6317 for (auto &p : inode_map) {
6318 CInode *in = p.second;
6319 if (!in->is_auth())
6320 continue;
6321
6322 if (in->last != CEPH_NOSNAP)
6323 continue;
6324
6325 // Only normal files need file size recovery
6326 if (!in->is_file()) {
6327 continue;
6328 }
6329
6330 bool recover = false;
6331 const auto& client_ranges = in->get_projected_inode()->client_ranges;
6332 if (!client_ranges.empty()) {
6333 in->mark_clientwriteable();
6334 for (auto& p : client_ranges) {
6335 Capability *cap = in->get_client_cap(p.first);
6336 if (cap) {
6337 cap->mark_clientwriteable();
6338 } else {
6339 dout(10) << " client." << p.first << " has range " << p.second << " but no cap on " << *in << dendl;
6340 recover = true;
6341 break;
6342 }
6343 }
6344 }
6345
6346 if (recover) {
6347 if (in->filelock.is_stable()) {
6348 in->auth_pin(&in->filelock);
6349 } else {
6350 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6351 }
6352 in->filelock.set_state(LOCK_PRE_SCAN);
6353 rejoin_recover_q.push_back(in);
6354 } else {
6355 rejoin_check_q.push_back(in);
6356 }
6357
6358 if (!(++count % 1000))
6359 mds->heartbeat_reset();
6360 }
6361 }
6362
6363 void MDCache::start_files_to_recover()
6364 {
6365 int count = 0;
6366 for (CInode *in : rejoin_check_q) {
6367 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6368 mds->locker->issue_caps(in);
6369 mds->locker->check_inode_max_size(in);
6370 if (!(++count % 1000))
6371 mds->heartbeat_reset();
6372 }
6373 rejoin_check_q.clear();
6374 for (CInode *in : rejoin_recover_q) {
6375 mds->locker->file_recover(&in->filelock);
6376 if (!(++count % 1000))
6377 mds->heartbeat_reset();
6378 }
6379 if (!rejoin_recover_q.empty()) {
6380 rejoin_recover_q.clear();
6381 do_file_recover();
6382 }
6383 }
6384
// advance the file size recovery queue
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6389
6390 // ===============================================================================
6391
6392
6393 // ----------------------------
6394 // truncate
6395
// waiter context: retry _truncate_inode() once the blocking snapflush
// (xlock snap sync) completes
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};
6406
/**
 * Begin truncating an inode's data objects (truncate_from -> truncate_size).
 * Records the in-flight truncate in log segment @p ls and pins the inode;
 * both are released in truncate_inode_logged(). If clients still hold
 * buffered snap data, the object deletion is deferred until the snapflush.
 */
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    // wait for buffered snap data to be flushed before deleting objects
    ceph_assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
6429
// IO completion for the Filer truncate issued by _truncate_inode();
// -CEPHFS_ENOENT is fine (the objects were already gone)
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};
6444
/**
 * Issue the actual object truncation for an in-progress inode truncate:
 * delete byte range [truncate_size, truncate_from) from the data pool
 * under the inode's snap context, completing in C_IO_MDC_TruncateFinish.
 */
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  // sanity: must be an active shrink with sizes that fit in 63 bits
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from);


  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    // no realm: only legal for head inodes, use an empty snap context
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  auto layout = pi->layout;
  filer.truncate(in->ino(), &layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
				  mds->finisher));
}
6477
// journal-completion context for the "truncate finish" EUpdate; applies
// the mutation and releases the truncate pins
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
6487
/**
 * The data objects have been truncated; journal the completion: clear the
 * segment's record, project the inode with the truncate no longer pending,
 * and submit a "truncate finish" EUpdate.
 */
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();

  // update the inode: this truncate is no longer pending
  auto pi = in->project_inode(mut);
  pi.inode->version = in->pre_dirty();
  pi.inode->truncate_from = 0;
  pi.inode->truncate_pending--;

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);

  predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  journal_dirty_inode(mut.get(), &le->metablob, in);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
6518
// The "truncate finish" event is durable: apply the mutation, release the
// pins taken in truncate_inode(), and wake waiters stalled on the truncate.
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  MDSContext::vec waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
6533
6534
6535 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6536 {
6537 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6538 << ls->seq << "/" << ls->offset << dendl;
6539 ls->truncating_inodes.insert(in);
6540 in->get(CInode::PIN_TRUNCATING);
6541 }
6542
6543 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6544 {
6545 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6546 << ls->seq << "/" << ls->offset << dendl;
6547 // if we have the logseg the truncate started in, it must be in our list.
6548 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6549 ceph_assert(p != ls->truncating_inodes.end());
6550 ls->truncating_inodes.erase(p);
6551 in->put(CInode::PIN_TRUNCATING);
6552 }
6553
6554 void MDCache::start_recovered_truncates()
6555 {
6556 dout(10) << "start_recovered_truncates" << dendl;
6557 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6558 p != mds->mdlog->segments.end();
6559 ++p) {
6560 LogSegment *ls = p->second;
6561 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6562 q != ls->truncating_inodes.end();
6563 ++q) {
6564 CInode *in = *q;
6565 in->auth_pin(this);
6566
6567 if (!in->client_need_snapflush.empty() &&
6568 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6569 ceph_assert(in->filelock.is_stable());
6570 in->filelock.set_state(LOCK_XLOCKDONE);
6571 in->auth_pin(&in->filelock);
6572 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6573 // start_files_to_recover will revoke caps
6574 continue;
6575 }
6576 _truncate_inode(in, ls);
6577 }
6578 }
6579 }
6580
6581
6582 class C_MDS_purge_completed_finish : public MDCacheLogContext {
6583 interval_set<inodeno_t> inos;
6584 LogSegment *ls;
6585 version_t inotablev;
6586 public:
6587 C_MDS_purge_completed_finish(MDCache *m, const interval_set<inodeno_t>& _inos,
6588 LogSegment *_ls, version_t iv)
6589 : MDCacheLogContext(m), inos(_inos), ls(_ls), inotablev(iv) {}
6590 void finish(int r) override {
6591 assert(r == 0);
6592 if (inotablev) {
6593 get_mds()->inotable->apply_release_ids(inos);
6594 assert(get_mds()->inotable->get_version() == inotablev);
6595 }
6596 ls->purge_inodes_finish(inos);
6597 }
6598 };
6599
6600 void MDCache::start_purge_inodes(){
6601 dout(10) << "start_purge_inodes" << dendl;
6602 for (auto& p : mds->mdlog->segments){
6603 LogSegment *ls = p.second;
6604 if (ls->purging_inodes.size()){
6605 purge_inodes(ls->purging_inodes, ls);
6606 }
6607 }
6608 }
6609
6610 void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6611 {
6612 dout(10) << __func__ << " purging inos " << inos << " logseg " << ls->seq << dendl;
6613 // FIXME: handle non-default data pool and namespace
6614
6615 auto cb = new LambdaContext([this, inos, ls](int r){
6616 assert(r == 0 || r == -2);
6617 mds->inotable->project_release_ids(inos);
6618 version_t piv = mds->inotable->get_projected_version();
6619 assert(piv != 0);
6620 mds->mdlog->start_submit_entry(new EPurged(inos, ls->seq, piv),
6621 new C_MDS_purge_completed_finish(this, inos, ls, piv));
6622 mds->mdlog->flush();
6623 });
6624
6625 C_GatherBuilder gather(g_ceph_context,
6626 new C_OnFinisher(new MDSIOContextWrapper(mds, cb), mds->finisher));
6627 SnapContext nullsnapc;
6628 for (const auto& [start, len] : inos) {
6629 for (auto i = start; i < start + len ; i += 1) {
6630 filer.purge_range(i, &default_file_layout, nullsnapc, 0, 1,
6631 ceph::real_clock::now(), 0, gather.new_sub());
6632 }
6633 }
6634 gather.activate();
6635 }
6636
6637 // ================================================================================
6638 // cache trimming
6639
/**
 * Trim dentries from the cache LRUs.
 *
 * @param count target number of dentries to expire from the main LRU
 *	  (the loop also runs while the cache is over its memory limit)
 * @param expiremap accumulates expire messages for replica peers
 * @return {throttled, trimmed}: whether the per-tick trim budget
 *	   (mds_cache_trim_threshold) was exhausted, and how many
 *	   dentries were actually trimmed
 */
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;
  uint64_t trimmed = 0;

  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
	  << " items from LRU"
	  << " size=" << lru.lru_get_size()
	  << " mid=" << lru.lru_get_top()
	  << " pintail=" << lru.lru_get_pintail()
	  << " pinned=" << lru.lru_get_num_pinned()
	  << dendl;

  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  // drain the bottom LRU first, stopping if the trim budget runs out
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  // put back whatever turned out not to be expirable
  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // if mds is in standby_replay and skip trimming the inodes
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if (is_standby_replay && dn->get_linkage()->inode) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}
6707
6708 /*
6709 * note: only called while MDS is active or stopping... NOT during recovery.
6710 * however, we may expire a replica whose authority is recovering.
6711 *
6712 * @param count is number of dentries to try to expire
6713 */
6714 std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
6715 {
6716 uint64_t used = cache_size();
6717 uint64_t limit = cache_memory_limit;
6718 expiremap expiremap;
6719
6720 dout(7) << "trim bytes_used=" << bytes2str(used)
6721 << " limit=" << bytes2str(limit)
6722 << " reservation=" << cache_reservation
6723 << "% count=" << count << dendl;
6724
6725 // process delayed eval_stray()
6726 stray_manager.advance_delayed();
6727
6728 auto result = trim_lru(count, expiremap);
6729 auto& trimmed = result.second;
6730
6731 // trim non-auth, non-bound subtrees
6732 for (auto p = subtrees.begin(); p != subtrees.end();) {
6733 CDir *dir = p->first;
6734 ++p;
6735 CInode *diri = dir->get_inode();
6736 if (dir->is_auth()) {
6737 if (diri->is_auth() && !diri->is_base()) {
6738 /* this situation should correspond to an export pin */
6739 if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
6740 /* pinned empty subtree, try to drop */
6741 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
6742 dout(20) << "trimming empty pinned subtree " << *dir << dendl;
6743 dir->state_clear(CDir::STATE_AUXSUBTREE);
6744 remove_subtree(dir);
6745 diri->close_dirfrag(dir->dirfrag().frag);
6746 }
6747 }
6748 } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
6749 if (dir->state_test(CDir::STATE_EXPORTING) ||
6750 !(mds->is_active() || mds->is_stopping()) ||
6751 dir->is_freezing() || dir->is_frozen())
6752 continue;
6753
6754 migrator->export_empty_import(dir);
6755 ++trimmed;
6756 }
6757 } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
6758 // only subtree pin
6759 if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
6760 continue;
6761 }
6762
6763 // don't trim subtree root if its auth MDS is recovering.
6764 // This simplify the cache rejoin code.
6765 if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
6766 continue;
6767 trim_dirfrag(dir, 0, expiremap);
6768 ++trimmed;
6769 }
6770 }
6771
6772 // trim root?
6773 if (mds->is_stopping() && root) {
6774 auto&& ls = root->get_dirfrags();
6775 for (const auto& dir : ls) {
6776 if (dir->get_num_ref() == 1) { // subtree pin
6777 trim_dirfrag(dir, 0, expiremap);
6778 ++trimmed;
6779 }
6780 }
6781 if (root->get_num_ref() == 0) {
6782 trim_inode(0, root, 0, expiremap);
6783 ++trimmed;
6784 }
6785 }
6786
6787 std::set<mds_rank_t> stopping;
6788 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6789 stopping.erase(mds->get_nodeid());
6790 for (auto rank : stopping) {
6791 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6792 if (!mdsdir_in)
6793 continue;
6794
6795 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6796 if (em.second) {
6797 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6798 }
6799
6800 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6801
6802 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6803 if (!aborted) {
6804 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6805 auto&& ls = mdsdir_in->get_dirfrags();
6806 for (auto dir : ls) {
6807 if (dir->get_num_ref() == 1) { // subtree pin
6808 trim_dirfrag(dir, dir, expiremap);
6809 ++trimmed;
6810 }
6811 }
6812 if (mdsdir_in->get_num_ref() == 0) {
6813 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6814 ++trimmed;
6815 }
6816 } else {
6817 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6818 }
6819 }
6820
6821 // Other rank's base inodes (when I'm stopping)
6822 if (mds->is_stopping()) {
6823 for (set<CInode*>::iterator p = base_inodes.begin();
6824 p != base_inodes.end();) {
6825 CInode *base_in = *p;
6826 ++p;
6827 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6828 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6829 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6830 if (base_in->get_num_ref() == 0) {
6831 trim_inode(NULL, base_in, NULL, expiremap);
6832 ++trimmed;
6833 }
6834 }
6835 }
6836 }
6837
6838 // send any expire messages
6839 send_expire_messages(expiremap);
6840
6841 return result;
6842 }
6843
6844 void MDCache::send_expire_messages(expiremap& expiremap)
6845 {
6846 // send expires
6847 for (const auto &p : expiremap) {
6848 if (mds->is_cluster_degraded() &&
6849 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6850 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6851 rejoin_sent.count(p.first) == 0))) {
6852 continue;
6853 }
6854 dout(7) << "sending cache_expire to " << p.first << dendl;
6855 mds->send_message_mds(p.second, p.first);
6856 }
6857 expiremap.clear();
6858 }
6859
6860
/**
 * Expire a single dentry (and, for a primary link, its inode) from the cache.
 *
 * @param dn dentry to trim
 * @param expiremap per-rank map of MCacheExpire messages; expires for
 *        non-auth items are appended here for later sending
 * @return true if the dentry was NOT removed (trim aborted, e.g. replica
 *         not readable or the inode is being purged as a stray); false if
 *         the dentry was removed from its dirfrag.
 */
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);

  // the subtree root is used as the "container" in any expire messages below
  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  // directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.  send to both ranks of the authority pair
    // when they differ (e.g. during migration).
    mds_authority_t auth = dn->authority();

    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      // lazily create the MCacheExpire for this rank on first use
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  // remove dentry
  // record the name in the dirfrag's bloom filter so a future lookup can
  // still answer "definitely not present" without a fetch
  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
6942
6943
/**
 * Expire a dirfrag from the cache: detach it from the subtree map if it is
 * a subtree root, queue expire messages to its authority if it is a
 * replica, and close it on its inode.
 *
 * @param dir dirfrag to trim (must have no remaining refs besides any
 *        subtree pin released here)
 * @param con container dirfrag used in expire messages
 * @param expiremap per-rank map of MCacheExpire messages to append to
 */
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    // an auth subtree root may only be trimmed when unreplicated and
    // rooted at a base inode (root/mdsdir)
    ceph_assert(!dir->is_auth() ||
		(!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation? (if so, slightly modified container)
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else {
      condf = con->dirfrag();
    }

    // notify both ranks of the authority pair when they differ
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
      ceph_assert(a != mds->get_nodeid());
      // lazily create the MCacheExpire for this rank on first use
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }

  // finally drop the dirfrag object itself
  in->close_dirfrag(dir->dirfrag().frag);
}
6989
6990 /**
6991 * Try trimming an inode from the cache
6992 *
6993 * @return true if the inode is still in cache, else false if it was trimmed
6994 */
6995 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
6996 {
6997 dout(15) << "trim_inode " << *in << dendl;
6998 ceph_assert(in->get_num_ref() == 0);
6999
7000 if (in->is_dir()) {
7001 // If replica inode's dirfragtreelock is not readable, it's likely
7002 // some dirfrags of the inode are being fragmented and we will receive
7003 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7004 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
7005 // This is because that unconnected replicas are problematic for
7006 // subtree migration.
7007 //
7008 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
7009 return true;
7010 }
7011
7012 // DIR
7013 auto&& dfls = in->get_dirfrags();
7014 for (const auto& dir : dfls) {
7015 ceph_assert(!dir->is_subtree_root());
7016 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
7017 }
7018 }
7019
7020 // INODE
7021 if (in->is_auth()) {
7022 // eval stray after closing dirfrags
7023 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
7024 maybe_eval_stray(in);
7025 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
7026 return true;
7027 }
7028 } else {
7029 mds_authority_t auth = in->authority();
7030
7031 dirfrag_t df;
7032 if (con)
7033 df = con->dirfrag();
7034 else
7035 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
7036
7037 for (int p=0; p<2; p++) {
7038 mds_rank_t a = auth.first;
7039 if (p) a = auth.second;
7040 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7041 if (con && mds->get_nodeid() == auth.second &&
7042 con->is_importing()) break; // don't send any expire while importing.
7043 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7044
7045 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
7046 ceph_assert(a != mds->get_nodeid());
7047 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7048 if (em.second)
7049 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
7050 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7051 }
7052 }
7053
7054 /*
7055 if (in->is_auth()) {
7056 if (in->hack_accessed)
7057 mds->logger->inc("outt");
7058 else {
7059 mds->logger->inc("outut");
7060 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7061 }
7062 }
7063 */
7064
7065 // unlink
7066 if (dn)
7067 dn->get_dir()->unlink_inode(dn, false);
7068 remove_inode(in);
7069 return false;
7070 }
7071
7072
7073 /**
7074 * trim_non_auth - remove any non-auth items from our cache
7075 *
7076 * this reduces the amount of non-auth metadata in our cache, reducing the
7077 * load incurred by the rejoin phase.
7078 *
7079 * the only non-auth items that remain are those that are needed to
7080 * attach our own subtrees to the root.
7081 *
7082 * when we are done, all dentries will be in the top bit of the lru.
7083 *
7084 * why we have to do this:
7085 * we may not have accurate linkage for non-auth items. which means we will
7086 * know which subtree it falls into, and can not be sure to declare it to the
7087 * correct authority.
7088 */
7089 void MDCache::trim_non_auth()
7090 {
7091 dout(7) << "trim_non_auth" << dendl;
7092
7093 // temporarily pin all subtree roots
7094 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7095 p != subtrees.end();
7096 ++p)
7097 p->first->get(CDir::PIN_SUBTREETEMP);
7098
7099 list<CDentry*> auth_list;
7100
7101 // trim non-auth items from the lru
7102 for (;;) {
7103 CDentry *dn = NULL;
7104 if (bottom_lru.lru_get_size() > 0)
7105 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7106 if (!dn && lru.lru_get_size() > 0)
7107 dn = static_cast<CDentry*>(lru.lru_expire());
7108 if (!dn)
7109 break;
7110
7111 CDentry::linkage_t *dnl = dn->get_linkage();
7112
7113 if (dn->is_auth()) {
7114 // add back into lru (at the top)
7115 auth_list.push_back(dn);
7116
7117 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7118 dn->unlink_remote(dnl);
7119 } else {
7120 // non-auth. expire.
7121 CDir *dir = dn->get_dir();
7122 ceph_assert(dir);
7123
7124 // unlink the dentry
7125 dout(10) << " removing " << *dn << dendl;
7126 if (dnl->is_remote()) {
7127 dir->unlink_inode(dn, false);
7128 }
7129 else if (dnl->is_primary()) {
7130 CInode *in = dnl->get_inode();
7131 dout(10) << " removing " << *in << dendl;
7132 auto&& ls = in->get_dirfrags();
7133 for (const auto& subdir : ls) {
7134 ceph_assert(!subdir->is_subtree_root());
7135 in->close_dirfrag(subdir->dirfrag().frag);
7136 }
7137 dir->unlink_inode(dn, false);
7138 remove_inode(in);
7139 }
7140 else {
7141 ceph_assert(dnl->is_null());
7142 }
7143
7144 ceph_assert(!dir->has_bloom());
7145 dir->remove_dentry(dn);
7146 // adjust the dir state
7147 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7148 // close empty non-auth dirfrag
7149 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7150 dir->inode->close_dirfrag(dir->get_frag());
7151 }
7152 }
7153
7154 for (const auto& dn : auth_list) {
7155 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7156 bottom_lru.lru_insert_mid(dn);
7157 else
7158 lru.lru_insert_top(dn);
7159 }
7160
7161 // move everything in the pintail to the top bit of the lru.
7162 lru.lru_touch_entire_pintail();
7163
7164 // unpin all subtrees
7165 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7166 p != subtrees.end();
7167 ++p)
7168 p->first->put(CDir::PIN_SUBTREETEMP);
7169
7170 if (lru.lru_get_size() == 0 &&
7171 bottom_lru.lru_get_size() == 0) {
7172 // root, stray, etc.?
7173 auto p = inode_map.begin();
7174 while (p != inode_map.end()) {
7175 CInode *in = p->second;
7176 ++p;
7177 if (!in->is_auth()) {
7178 auto&& ls = in->get_dirfrags();
7179 for (const auto& dir : ls) {
7180 dout(10) << " removing " << *dir << dendl;
7181 ceph_assert(dir->get_num_ref() == 1); // SUBTREE
7182 remove_subtree(dir);
7183 in->close_dirfrag(dir->dirfrag().frag);
7184 }
7185 dout(10) << " removing " << *in << dendl;
7186 ceph_assert(!in->get_parent_dn());
7187 ceph_assert(in->get_num_ref() == 0);
7188 remove_inode(in);
7189 }
7190 }
7191 }
7192
7193 show_subtrees();
7194 }
7195
7196 /**
7197 * Recursively trim the subtree rooted at directory to remove all
7198 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7199 * of those links. This is used to clear invalid data out of the cache.
7200 * Note that it doesn't clear the passed-in directory, since that's not
7201 * always safe.
7202 */
7203 bool MDCache::trim_non_auth_subtree(CDir *dir)
7204 {
7205 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7206
7207 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7208
7209 auto j = dir->begin();
7210 auto i = j;
7211 while (j != dir->end()) {
7212 i = j++;
7213 CDentry *dn = i->second;
7214 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7215 CDentry::linkage_t *dnl = dn->get_linkage();
7216 if (dnl->is_primary()) { // check for subdirectories, etc
7217 CInode *in = dnl->get_inode();
7218 bool keep_inode = false;
7219 if (in->is_dir()) {
7220 auto&& subdirs = in->get_dirfrags();
7221 for (const auto& subdir : subdirs) {
7222 if (subdir->is_subtree_root()) {
7223 keep_inode = true;
7224 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7225 } else {
7226 if (trim_non_auth_subtree(subdir))
7227 keep_inode = true;
7228 else {
7229 in->close_dirfrag(subdir->get_frag());
7230 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7231 }
7232 }
7233 }
7234
7235 }
7236 if (!keep_inode) { // remove it!
7237 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
7238 dir->unlink_inode(dn, false);
7239 remove_inode(in);
7240 ceph_assert(!dir->has_bloom());
7241 dir->remove_dentry(dn);
7242 } else {
7243 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7244 dn->state_clear(CDentry::STATE_AUTH);
7245 in->state_clear(CInode::STATE_AUTH);
7246 }
7247 } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
7248 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7249 } else { // just remove it
7250 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7251 if (dnl->is_remote())
7252 dir->unlink_inode(dn, false);
7253 dir->remove_dentry(dn);
7254 }
7255 }
7256 dir->state_clear(CDir::STATE_AUTH);
7257 /**
7258 * We've now checked all our children and deleted those that need it.
7259 * Now return to caller, and tell them if *we're* a keeper.
7260 */
7261 return keep_dir || dir->get_num_any();
7262 }
7263
7264 /*
7265 * during replay, when we determine a subtree is no longer ours, we
7266 * try to trim it from our cache. because subtrees must be connected
7267 * to the root, the fact that we can trim this tree may mean that our
7268 * children or parents can also be trimmed.
7269 */
7270 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7271 {
7272 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7273
7274 // can we now trim child subtrees?
7275 set<CDir*> bounds;
7276 get_subtree_bounds(dir, bounds);
7277 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7278 CDir *bd = *p;
7279 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7280 bd->get_num_any() == 0 && // and empty
7281 can_trim_non_auth_dirfrag(bd)) {
7282 CInode *bi = bd->get_inode();
7283 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7284 remove_subtree(bd);
7285 bd->mark_clean();
7286 bi->close_dirfrag(bd->get_frag());
7287 }
7288 }
7289
7290 if (trim_non_auth_subtree(dir)) {
7291 // keep
7292 try_subtree_merge(dir);
7293 } else {
7294 // can we trim this subtree (and possibly our ancestors) too?
7295 while (true) {
7296 CInode *diri = dir->get_inode();
7297 if (diri->is_base()) {
7298 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7299 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7300 remove_subtree(dir);
7301 dir->mark_clean();
7302 diri->close_dirfrag(dir->get_frag());
7303
7304 dout(10) << " removing " << *diri << dendl;
7305 ceph_assert(!diri->get_parent_dn());
7306 ceph_assert(diri->get_num_ref() == 0);
7307 remove_inode(diri);
7308 }
7309 break;
7310 }
7311
7312 CDir *psub = get_subtree_root(diri->get_parent_dir());
7313 dout(10) << " parent subtree is " << *psub << dendl;
7314 if (psub->get_dir_auth().first == mds->get_nodeid())
7315 break; // we are auth, keep.
7316
7317 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7318 remove_subtree(dir);
7319 dir->mark_clean();
7320 diri->close_dirfrag(dir->get_frag());
7321
7322 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7323 if (trim_non_auth_subtree(psub))
7324 break;
7325 dir = psub;
7326 }
7327 }
7328
7329 show_subtrees();
7330 }
7331
/**
 * Release a standby-replay MDS's hold on the objects pinned by a log
 * segment, so the cache can trim them.  Each dirty item is marked clean
 * (or has its dirty flag cleared), then its parent dentry is pushed to
 * the bottom of the LRU if nothing else references it.
 *
 * NOTE: each while-loop below terminates because the corresponding
 * mark_clean()/clear_dirty_parent()/remove_dirty()/erase() call is
 * presumably what unlinks the front element from the segment's list —
 * TODO(review): confirm against CDir/CInode/CDentry implementations.
 */
void MDCache::standby_trim_segment(LogSegment *ls)
{
  // push an unreferenced inode's parent dentry to the LRU bottom so it is
  // trimmed first; skip inodes that are on the open-files list
  auto try_trim_inode = [this](CInode *in) {
    if (in->get_num_ref() == 0 &&
	!in->item_open_file.is_on_list() &&
	in->parent != NULL &&
	in->parent->get_num_ref() == 0){
      touch_dentry_bottom(in->parent);
    }
  };

  // same idea for a dentry: only demote it if unreferenced and its linked
  // inode (if any) is not an open file
  auto try_trim_dentry = [this](CDentry *dn) {
    if (dn->get_num_ref() > 0)
      return;
    auto in = dn->get_linkage()->inode;
    if(in && in->item_open_file.is_on_list())
      return;
    touch_dentry_bottom(dn);
  };

  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
    if (dir->inode)
      try_trim_inode(dir->inode);
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
    try_trim_inode(in);
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
    try_trim_dentry(dn);
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->truncating_inodes.empty()) {
    auto it = ls->truncating_inodes.begin();
    CInode *in = *it;
    ls->truncating_inodes.erase(it);
    in->put(CInode::PIN_TRUNCATING);
    try_trim_inode(in);
  }
}
7399
/**
 * Handle an MCacheExpire from a peer rank: the peer has dropped its
 * replicas of the listed inodes/dirfrags/dentries, so remove it from the
 * corresponding replica lists (subject to nonce checks, which discard
 * stale expires that raced with a newer replication).
 *
 * Expires for a container that is mid-export (or not auth here) are
 * queued in delayed_expire and replayed by process_delayed_expire().
 */
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  // too early in startup to act on expires
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  // locks whose replica removal may allow progress; evaluated at the end
  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode); // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
	export_state = migrator->get_export_state(parent_dir);
	ceph_assert(export_state >= 0);
      }

      // delay if we are not auth, or the export has progressed far enough
      // that authority is ambiguous from the sender's point of view
      if (!parent_dir->is_auth() ||
	  (export_state != -1 &&
	   ((export_state == Migrator::EXPORT_WARNING &&
	     migrator->export_has_warned(parent_dir,from)) ||
	    export_state == Migrator::EXPORT_EXPORTING ||
	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
	    (export_state == Migrator::EXPORT_NOTIFYING &&
	     !migrator->export_has_notified(parent_dir,from))))) {

	// not auth.
	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
	ceph_assert(parent_dir->is_frozen_tree_root());

	// make a message container

	auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
	if (em.second)
	  em.first->second = make_message<MCacheExpire>(from); /* new */

	// merge these expires into it
	em.first->second->add_realm(p.first, p.second);
	continue;
      }
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
		  (export_state == Migrator::EXPORT_WARNING &&
		   !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;

      if (!in) {
	dout(0) << " inode expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(in);
      }
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce
      if (nonce == in->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " cached_by was " << in->get_replicas() << dendl;
	inode_remove_replica(in, from, false, gather_locks);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " with old nonce " << nonce
		<< " (current " << in->get_replica_nonce(from) << "), dropping"
		<< dendl;
      }
    }

    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;

      if (!dir) {
	// exact dirfrag not found: handle refragmentation races
	CInode *diri = get_inode(q.first.ino);
	if (diri) {
	  if (mds->is_rejoin() &&
	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
	      !diri->is_replica(from)) {
	    auto&& ls = diri->get_nested_dirfrags();
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " while rejoining, inode isn't replicated" << dendl;
	    for (const auto& d : ls) {
	      dir = d;
	      if (dir->is_replica(from)) {
		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
		dir->remove_replica(from);
	      }
	    }
	    continue;
	  }
	  CDir *other = diri->get_approx_dirfrag(q.first.frag);
	  if (other) {
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " have " << *other << ", mismatched frags, dropping" << dendl;
	    continue;
	  }
	}
	dout(0) << " dir expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " replicas was " << dir->get_replicas() << dendl;
	dir->remove_replica(from);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
		<< "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);

      if (!dir) {
	dout(0) << " dn expires on " << pd.first << " from " << from
		<< ", must have refragmented" << dendl;
      } else {
	ceph_assert(dir->is_auth());
      }

      for (const auto &p : pd.second) {
	unsigned nonce = p.second;
	CDentry *dn;

	if (dir) {
	  dn = dir->lookup(p.first.first, p.first.second);
	} else {
	  // which dirfrag for this dentry?
	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
	  ceph_assert(dir);
	  ceph_assert(dir->is_auth());
	  dn = dir->lookup(p.first.first, p.first.second);
	}

	if (!dn) {
	  if (dir)
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
	  else
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
	}
	ceph_assert(dn);

	if (nonce == dn->get_replica_nonce(from)) {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
	  dentry_remove_replica(dn, from, gather_locks);
	}
	else {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from
		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
		  << "), dropping" << dendl;
	}
      }
    }
  }

  // re-evaluate any locks that may now be able to make progress
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7594
7595 void MDCache::process_delayed_expire(CDir *dir)
7596 {
7597 dout(7) << "process_delayed_expire on " << *dir << dendl;
7598 for (const auto &p : delayed_expire[dir]) {
7599 handle_cache_expire(p.second);
7600 }
7601 delayed_expire.erase(dir);
7602 }
7603
7604 void MDCache::discard_delayed_expire(CDir *dir)
7605 {
7606 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7607 delayed_expire.erase(dir);
7608 }
7609
7610 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7611 set<SimpleLock *>& gather_locks)
7612 {
7613 in->remove_replica(from);
7614 in->set_mds_caps_wanted(from, 0);
7615
7616 // note: this code calls _eval more often than it needs to!
7617 // fix lock
7618 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7619 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7620 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7621 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7622 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7623 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7624
7625 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7626 // Don't remove the recovering mds from lock's gathering list because
7627 // it may hold rejoined wrlocks.
7628 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7629 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7630 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7631 }
7632
7633 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7634 {
7635 dn->remove_replica(from);
7636
7637 // fix lock
7638 if (dn->lock.remove_replica(from))
7639 gather_locks.insert(&dn->lock);
7640
7641 // Replicated strays might now be elegible for purge
7642 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7643 if (dnl->is_primary()) {
7644 maybe_eval_stray(dnl->get_inode());
7645 }
7646 }
7647
/**
 * Expire client dentry leases whose TTL has passed.  Each pool's list is
 * ordered by expiry, so we stop at the first lease that is still valid.
 */
void MDCache::trim_client_leases()
{
  utime_t now = ceph_clock_now();

  dout(10) << "trim_client_leases" << dendl;

  std::size_t pool = 0;
  for (const auto& list : client_leases) {
    pool += 1;
    if (list.empty())
      continue;

    auto before = list.size();
    // loop terminates because remove_client_lease() unlinks the lease
    // from this list (and we break at the first unexpired one)
    while (!list.empty()) {
      ClientLease *r = list.front();
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    auto after = list.size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
  }
}
7673
/**
 * Sample process memory usage and cap statistics, log them, and update
 * the memory perf counters.
 */
void MDCache::check_memory_usage()
{
  // 'last' is re-sampled on every call; 'baseline' is initialized from the
  // very first sample (function-local static init runs once) and kept as a
  // reference point for heap growth.
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  static MemoryModel::snap baseline = last;

  // check client caps
  // invariant: every CInode is accounted for in one of these maps
  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();

  dout(2) << "Memory usage: "
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
	  << dendl;

  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());
}
7700
7701
7702
7703 // =========================================================================================
7704 // shutdown
7705
// Timer callback that periodically re-runs MDCache::shutdown_check()
// while the MDS is shutting down (rescheduled from shutdown_check itself).
class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};
7713
/**
 * Periodic shutdown diagnostic: dump the cache contents at debug level 10
 * (temporarily raising and then restoring debug_mds), report LRU/log
 * sizes and objecter activity, and reschedule itself.
 */
void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  // cache
  // save the current debug_mds value, bump it to 10 for show_cache(),
  // then restore the saved value
  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf().get_val("debug_mds", &o, sizeof(old_val));
  g_conf().set_val("debug_mds", "10");
  g_conf().apply_changes(nullptr);
  show_cache();
  g_conf().set_val("debug_mds", old_val);
  g_conf().apply_changes(nullptr);
  // re-arm the periodic check
  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  // this
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;


  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}
7739
7740
7741 void MDCache::shutdown_start()
7742 {
7743 dout(5) << "shutdown_start" << dendl;
7744
7745 if (g_conf()->mds_shutdown_check)
7746 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7747
7748 // g_conf()->debug_mds = 10;
7749 }
7750
7751
7752
// One pass of the cooperative shutdown state machine.  Called repeatedly
// until it returns true.  Each call pushes shutdown forward as far as it
// can and returns false whenever it must wait for asynchronous work
// (stray export, subtree migration, session teardown, log trim/flush,
// objecter I/O).  The ordering of the phases below is significant.
bool MDCache::shutdown_pass()
{
  dout(7) << "shutdown_pass" << dendl;

  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
    show_cache();
    show_subtrees();
    return true;
  }

  // empty stray dir: start/continue migrating strays to rank 0
  bool strays_all_exported = shutdown_export_strays();

  // trim cache as aggressively as possible
  trim(UINT64_MAX);
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;

  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() && mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export" << dendl;
    std::vector<CDir*> ls;
    for (auto& [dir, bounds] : subtrees) {
      dout(10) << " examining " << *dir << " bounds " << bounds << dendl;
      // skip mdsdir subtrees and subtrees we are not auth for
      if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
	continue;
      num_auth_subtree++;
      // skip subtrees that cannot be exported right now
      if (dir->is_frozen() ||
	  dir->is_freezing() ||
	  dir->is_ambiguous_dir_auth() ||
	  dir->state_test(CDir::STATE_EXPORTING) ||
	  dir->get_inode()->is_ephemerally_pinned()) {
	continue;
      }
      ls.push_back(dir);
    }

    migrator->clear_export_queue();
    // stopping mds does not call MDBalancer::tick()
    mds->balancer->handle_export_pins();
    for (const auto& dir : ls) {
      // send each subtree back to its authoritative rank, falling back
      // to rank 0 if that rank is no longer active
      mds_rank_t dest = dir->get_inode()->authority().first;
      if (dest > 0 && !mds->mdsmap->is_active(dest))
	dest = 0;
      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
      migrator->export_dir_nicely(dir, dest);
    }
  }

  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
    return false;
  }

  if (num_auth_subtree > 0) {
    // only a non-zero rank can still hold auth subtrees at this point
    ceph_assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
    show_subtrees();
    return false;
  }

  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
    return false;
  }

  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  if (mds->mdlog->get_num_segments() > 0) {
    auto ls = mds->mdlog->get_current_segment();
    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
      // Current segment contains events other than subtreemap or
      // there are dirty dirfrags (see CDir::log_mark_dirty())
      mds->mdlog->start_new_segment();
      mds->mdlog->flush();
    }
  }
  mds->mdlog->trim_all();
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
    return false;
  }

  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i] &&
	strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
    }
  }

  // mydir is the ~mdsdir dirfrag (if still open and a subtree root)
  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
    mydir = NULL;

  // subtrees map not empty yet?
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    show_subtrees();
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
      show_cache();
    return false;
  }
  ceph_assert(!migrator->is_exporting());
  ceph_assert(!migrator->is_importing());

  // replicas may dirty scatter locks
  if (myin && myin->is_replicated()) {
    dout(7) << "still have replicated objects" << dendl;
    return false;
  }

  if ((myin && myin->get_num_auth_pins()) ||
      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
    dout(7) << "still have auth pinned objects" << dendl;
    return false;
  }

  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the log" << dendl;
    mds->mdlog->cap();
  }

  if (!mds->mdlog->empty())
    mds->mdlog->trim(0);

  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
    return false;
  }

  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    ceph_assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
    return false;
  }

  // filer active?
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
    return false;
  }

  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
    show_cache();
    //dump();
    return false;
  }

  // make mydir subtree go away
  if (mydir) {
    if (mydir->get_num_ref() > 1) { // subtree pin
      dout(7) << "there's still reference to mydir " << *mydir << dendl;
      show_cache();
      return false;
    }

    remove_subtree(mydir);
    myin->close_dirfrag(mydir->get_frag());
  }
  ceph_assert(subtrees.empty());

  if (myin) {
    remove_inode(myin);
    ceph_assert(!myin);
  }

  if (global_snaprealm) {
    remove_inode(global_snaprealm->inode);
    global_snaprealm = nullptr;
  }

  // done!
  dout(5) << "shutdown done." << dendl;
  return true;
}
7946
// Migrate this rank's stray inodes to rank 0 during shutdown, in batches.
// The scan is resumable: shutdown_export_next records the (dirfrag, dentry
// name) cursor so a later call picks up where this one stopped.  At most
// MAX_EXPORTING inodes are in flight (tracked in shutdown_exporting_strays).
// Returns true only once every stray has been exported/purged.
bool MDCache::shutdown_export_strays()
{
  static const unsigned MAX_EXPORTING = 100;

  // rank 0 is the destination; it has nothing to export
  if (mds->get_nodeid() == 0)
    return true;

  // throttle: don't queue more work while >2/3 of the batch is in flight
  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
    return false;

  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
	   << " '" << shutdown_export_next.second << "'" << dendl;

  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
  bool all_exported = false;

again:
  auto next = shutdown_export_next;

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *strayi = strays[i];
    if (!strayi ||
	!strayi->state_test(CInode::STATE_STRAYPINNED))
      continue;
    // skip stray dirs already fully scanned (before the cursor)
    if (strayi->ino() < next.first.ino)
      continue;

    deque<CDir*> dfls;
    strayi->get_dirfrags(dfls);

    while (!dfls.empty()) {
      CDir *dir = dfls.front();
      dfls.pop_front();

      if (dir->dirfrag() < next.first)
	continue;
      if (next.first < dir->dirfrag()) {
	// advance the cursor to this dirfrag, restart at its first dentry
	next.first = dir->dirfrag();
	next.second.clear();
      }

      if (!dir->is_complete()) {
	// need the full dirfrag contents; fetch and resume later.  Only
	// attach a resume callback when nothing is in flight, otherwise
	// completion of an in-flight export will re-trigger us.
	MDSContext *fin = nullptr;
	if (shutdown_exporting_strays.empty()) {
	  fin = new MDSInternalContextWrapper(mds,
		  new LambdaContext([this](int r) {
		    shutdown_export_strays();
		  })
		);
	}
	dir->fetch(fin);
	goto done;
      }

      // position an iterator at the cursor's dentry (or the start)
      CDir::dentry_key_map::iterator it;
      if (next.second.empty()) {
	it = dir->begin();
      } else {
	auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
	it = dir->lower_bound(dentry_key_t(0, next.second, hash));
      }

      for (; it != dir->end(); ++it) {
	CDentry *dn = it->second;
	CDentry::linkage_t *dnl = dn->get_projected_linkage();
	if (dnl->is_null())
	  continue;

	// can't migrate while rank 0 is inactive; remember where we were
	if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
	  next.second = it->first.name;
	  goto done;
	}

	auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
	if (!ret.second) {
	  dout(10) << "already exporting/purging " << *dn << dendl;
	  continue;
	}

	// Don't try to migrate anything that is actually
	// being purged right now
	if (!dn->state_test(CDentry::STATE_PURGING))
	  stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!

	if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
	  // batch full: record the next dentry/dirfrag to resume from
	  ++it;
	  if (it != dir->end()) {
	    next.second = it->first.name;
	  } else {
	    if (dfls.empty())
	      next.first.ino.val++;
	    else
	      next.first = dfls.front()->dirfrag();
	    next.second.clear();
	  }
	  goto done;
	}
      }
    }
  }

  if (shutdown_exporting_strays.empty()) {
    // nothing in flight: rescan from the first stray dirfrag once, in case
    // new strays appeared behind the cursor; if the cursor is already at
    // the start, we are truly done.
    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
    if (first_df < shutdown_export_next.first ||
	!shutdown_export_next.second.empty()) {
      shutdown_export_next.first = first_df;
      shutdown_export_next.second.clear();
      goto again;
    }
    all_exported = true;
  }

done:
  shutdown_export_next = next;
  return all_exported;
}
8063
8064 // ========= messaging ==============
8065
// Entry point for inter-MDS cache messages: routes each message type to
// its handler.  An unknown type is a programming error and aborts.
void MDCache::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {

    // RESOLVE
  case MSG_MDS_RESOLVE:
    handle_resolve(ref_cast<MMDSResolve>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
    break;

    // REJOIN
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
    break;

    // discovery (remote cache lookups)
  case MSG_MDS_DISCOVER:
    handle_discover(ref_cast<MDiscover>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(ref_cast<MDiscoverReply>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(ref_cast<MDirUpdate>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(ref_cast<MCacheExpire>(m));
    break;

    // dentry link/unlink notifications from the auth MDS
  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(ref_cast<MDentryLink>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
    break;

    // dirfrag fragmentation
  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
    break;
  case MSG_MDS_FRAGMENTNOTIFYACK:
    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
    break;

    // find/open inode by number
  case MSG_MDS_FINDINO:
    handle_find_ino(ref_cast<MMDSFindIno>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(ref_cast<MMDSOpenIno>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
    break;

  case MSG_MDS_SNAPUPDATE:
    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    ceph_abort_msg("cache unknown message");
  }
}
8135
/**
 * path_traverse - walk a filepath through the cache, component by component.
 *
 * @param mdr    the request driving the traversal (may be null for some
 *               internal traversals; required when forwarding)
 * @param cf     factory for wait/retry contexts
 * @param path   base ino + components to resolve
 * @param flags  MDS_TRAVERSE_* bitmask controlling discovery vs forwarding,
 *               locking (rdlock/xlock of path, snap, auth locks), and
 *               whether the final dentry (vs its inode) is wanted
 * @param pdnvec if non-null, filled with the dentry trace; cleared when a
 *               definite CEPHFS_ENOENT mid-path makes the trace unusable
 * @param pin    if non-null, set to the deepest inode reached
 *
 * @return 0 on success; 1 when waiting (a waiter/discover/fetch has been
 *         queued and the caller will be retried); 2 when the request was
 *         forwarded to another rank; negative CEPHFS_* error otherwise.
 */
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
			   const filepath& path, int flags,
			   vector<CDentry*> *pdnvec, CInode **pin)
{
  // decode the flag bitmask into the individual behaviors
  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
  bool forward = !discover;
  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);

  if (forward)
    ceph_assert(mdr);  // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = mdr ? mdr->get_client() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (!cur) {
    // base inode not in cache; handle the special base inos
    if (MDS_INO_IS_MDSDIR(path.get_ino())) {
      open_foreign_mdsdir(path.get_ino(), cf.build());
      return 1;
    }
    if (MDS_INO_IS_STRAY(path.get_ino())) {
      // re-resolve the stray dir via its parent mdsdir path
      mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
      unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
      filepath path(strays[idx]->get_parent_dn()->get_name(),
		    MDS_INO_MDSDIR(rank));
      MDRequestRef null_ref;
      return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
    }
    return -CEPHFS_ESTALE;
  }
  if (cur->state_test(CInode::STATE_PURGING))
    return -CEPHFS_ESTALE;

  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
    mds->locker->find_and_attach_lock_cache(mdr, cur);

  if (mdr && mdr->lock_cache) {
    // an attached lock cache already covers the snap/layout locks
    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
  } else if (rdlock_snap) {
    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
	(n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
	return 1;
    }
  }

  // start trace
  if (pdnvec)
    pdnvec->clear();
  if (pin)
    *pin = cur;

  MutationImpl::LockOpVec lov;

  for (unsigned depth = 0; depth < path.depth(); ) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
	     << "' snapid " << snapid << dendl;
    
    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -CEPHFS_ENOTDIR;
    }

    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr || depth > 0) // snapdir must be the first component
	return -CEPHFS_EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
      depth++;
      continue;
    }
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      if (!mdr)
	return -CEPHFS_EINVAL;
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      if (!snapid) {
	if (pdnvec)
	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	return -CEPHFS_ENOENT;
      }
      mdr->snapid = snapid;
      depth++;
      continue;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
        // parent dir frozen_dir?
        if (cur->is_frozen()) {
          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
          cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
          return 1;
        }
        curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
        // discover?
	dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
	discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
    }
    ceph_assert(curdir);

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    // frozen?
    /*
    if (curdir->is_frozen()) {
    // doh!
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      if (onfinish) delete onfinish;
      return 1;
    }
    */

    // want the auth copy of the final dentry's dirfrag?
    if (want_auth && want_dentry && depth == path.depth() - 1) {
      if (curdir->is_ambiguous_auth()) {
	dout(10) << "waiting for single auth on " << *curdir << dendl;
	curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
	return 1;
      }
      if (!curdir->is_auth()) {
	dout(10) << "fw to auth for " << *curdir << dendl;
	request_forward(mdr, curdir->authority().first);
	return 2;
      }
    }

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
              << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
      return -CEPHFS_EIO;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    if (dn) {
      if (dn->state_test(CDentry::STATE_PURGING))
	return -CEPHFS_ENOENT;

      if (rdlock_path) {
	lov.clear();
	// xlock the final dentry (e.g. create/unlink); rdlock intermediates
	if (xlock_dentry && depth == path.depth() - 1) {
	  if (depth > 0 || !mdr->lock_cache) {
	    lov.add_wrlock(&cur->filelock);
	    lov.add_wrlock(&cur->nestlock);
	    if (rdlock_authlock)
	      lov.add_rdlock(&cur->authlock);
	  }
	  lov.add_xlock(&dn->lock);
	} else {
	  // force client to flush async dir operation if necessary
	  if (cur->filelock.is_cached())
	    lov.add_wrlock(&cur->filelock);
	  lov.add_rdlock(&dn->lock);
	}
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
	  return 1;
	}
      } else if (!path_locked &&
		 !dn->lock.can_read(client) &&
		 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
	dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
	dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
	if (mds->logger)
	  mds->logger->inc(l_mds_traverse_lock);
	if (dn->is_auth() && dn->lock.is_unstable_and_locked())
	  mds->mdlog->flush();
	return 1;
      }

      if (pdnvec)
	pdnvec->push_back(dn);

      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      // can we conclude CEPHFS_ENOENT?
      if (dnl->is_null()) {
	dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
	if (depth == path.depth() - 1) {
	  if (want_dentry)
	    break;
	} else {
	  if (pdnvec)
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	}
	return -CEPHFS_ENOENT;
      }

      // do we have inode?
      CInode *in = dnl->get_inode();
      if (!in) {
        ceph_assert(dnl->is_remote());
        // do i have it?
        in = get_inode(dnl->get_remote_ino());
        if (in) {
	  dout(7) << "linking in remote in " << *in << dendl;
	  dn->link_remote(dnl, in);
	} else {
          dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
	  ceph_assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
          if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
            dout(4) << "traverse: remote dentry points to damaged ino "
                    << *dn << dendl;
            return -CEPHFS_EIO;
          }
          open_remote_dentry(dn, true, cf.build(),
			     (path_locked && depth == path.depth() - 1));
	  if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
          return 1;
        }        
      }

      cur = in;

      if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
	lov.clear();
	lov.add_rdlock(&cur->snaplock);
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
	  return 1;
	}
      }

      // add to trace, continue.
      touch_inode(cur);
      if (pin)
	*pin = cur;
      depth++;
      continue;
    }

    ceph_assert(!dn);

    // MISS.  dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
	  (snapid == CEPH_NOSNAP &&
	   curdir->has_bloom() &&
	   !curdir->is_in_bloom(path[depth]))) {
        // file not found
	if (pdnvec) {
	  // instantiate a null dn?
	  if (depth < path.depth() - 1) {
	    dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
	  } else if (snapid < CEPH_MAXSNAP) {
	    dout(20) << " not adding null for snapid " << snapid << dendl;
	  } else if (curdir->is_frozen()) {
	    dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
	    curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
	    return 1;
	  } else {
	    // create a null dentry
	    dn = curdir->add_null_dentry(path[depth]);
	    dout(20) << " added null " << *dn << dendl;

	    if (rdlock_path) {
	      lov.clear();
	      if (xlock_dentry) {
		if (depth > 0 || !mdr->lock_cache) {
		  lov.add_wrlock(&cur->filelock);
		  lov.add_wrlock(&cur->nestlock);
		  if (rdlock_authlock)
		    lov.add_rdlock(&cur->authlock);
		}
		lov.add_xlock(&dn->lock);
	      } else {
		// force client to flush async dir operation if necessary
		if (cur->filelock.is_cached())
		  lov.add_wrlock(&cur->filelock);
		lov.add_rdlock(&dn->lock);
	      }
	      if (!mds->locker->acquire_locks(mdr, lov)) {
		dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
		return 1;
	      }
	    }
	  }
	  if (dn) {
	    pdnvec->push_back(dn);
	    if (want_dentry)
	      break;
	  } else {
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	  }
	}
        return -CEPHFS_ENOENT;
      } else {

	// Check DamageTable for missing fragments before trying to fetch
	// this
	if (mds->damage_table.is_dirfrag_damaged(curdir)) {
	  dout(4) << "traverse: damaged dirfrag " << *curdir
		  << ", blocking fetch" << dendl;
	  return -CEPHFS_EIO;
	}

	// directory isn't complete; reload
        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
        touch_inode(cur);
        curdir->fetch(cf.build(), path[depth]);
	if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
        return 1;
      }
    } else {
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();

      // prefer discovery over forwarding if the request was already
      // forwarded at least `depth` times (avoids forwarding ping-pong)
      if (forward &&
	  mdr && mdr->client_request &&
	  (int)depth < mdr->client_request->get_num_fwd()){
	dout(7) << "traverse: snap " << snapid << " and depth " << depth
		<< " < fwd " << mdr->client_request->get_num_fwd()
		<< ", discovering instead of forwarding" << dendl;
	discover = true;
      }

      if ((discover)) {
	dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
	discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      } 
      if (forward) {
        // forward
        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

	if (curdir->is_ambiguous_auth()) {
	  // wait
	  dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
	  curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
	  return 1;
	} 

	dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;

	request_forward(mdr, dauth.first);

	if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
	return 2;
      }
    }

    ceph_abort();  // i shouldn't get here
  }

  if (want_auth && !want_dentry) {
    if (cur->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *cur << dendl;
      cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
      return 1;
    }
    if (!cur->is_auth()) {
      dout(10) << "fw to auth for " << *cur << dendl;
      request_forward(mdr, cur->authority().first);
      return 2;
    }
  }

  // success.
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    ceph_assert(mdr->snapid == snapid);

  // record which locks the traversal took so later code won't re-take them
  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;

  if (rdlock_path)
    mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return 0;
}
8547
8548 CInode *MDCache::cache_traverse(const filepath& fp)
8549 {
8550 dout(10) << "cache_traverse " << fp << dendl;
8551
8552 CInode *in;
8553 unsigned depth = 0;
8554
8555 if (fp.get_ino()) {
8556 in = get_inode(fp.get_ino());
8557 } else if (fp.depth() > 0 && fp[0] == "~mdsdir") {
8558 in = myin;
8559 depth = 1;
8560 } else {
8561 in = root;
8562 }
8563 if (!in)
8564 return NULL;
8565
8566 for (; depth < fp.depth(); depth++) {
8567 std::string_view dname = fp[depth];
8568 frag_t fg = in->pick_dirfrag(dname);
8569 dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
8570 CDir *curdir = in->get_dirfrag(fg);
8571 if (!curdir)
8572 return NULL;
8573 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8574 if (!dn)
8575 return NULL;
8576 in = dn->get_linkage()->get_inode();
8577 if (!in)
8578 return NULL;
8579 }
8580 dout(10) << " got " << *in << dendl;
8581 return in;
8582 }
8583
8584
8585 /**
8586 * open_remote_dir -- open up a remote dirfrag
8587 *
8588 * @param diri base inode
8589 * @param approxfg approximate fragment.
8590 * @param fin completion callback
8591 */
8592 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
8593 {
8594 dout(10) << "open_remote_dir on " << *diri << dendl;
8595 ceph_assert(diri->is_dir());
8596 ceph_assert(!diri->is_auth());
8597 ceph_assert(diri->get_dirfrag(approxfg) == 0);
8598
8599 discover_dir_frag(diri, approxfg, fin);
8600 }
8601
8602
8603 /**
8604 * get_dentry_inode - get or open inode
8605 *
8606 * @param dn the dentry
8607 * @param mdr current request
8608 *
8609 * will return inode for primary, or link up/open up remote link's inode as necessary.
8610 * If it's not available right now, puts mdr on wait list and returns null.
8611 */
8612 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8613 {
8614 CDentry::linkage_t *dnl;
8615 if (projected)
8616 dnl = dn->get_projected_linkage();
8617 else
8618 dnl = dn->get_linkage();
8619
8620 ceph_assert(!dnl->is_null());
8621
8622 if (dnl->is_primary())
8623 return dnl->inode;
8624
8625 ceph_assert(dnl->is_remote());
8626 CInode *in = get_inode(dnl->get_remote_ino());
8627 if (in) {
8628 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8629 dn->link_remote(dnl, in);
8630 return in;
8631 } else {
8632 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8633 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8634 return 0;
8635 }
8636 }
8637
8638 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8639 CDentry *dn;
8640 inodeno_t ino;
8641 MDSContext *onfinish;
8642 bool want_xlocked;
8643 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
8644 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8645 dn->get(MDSCacheObject::PIN_PTRWAITER);
8646 }
8647 void finish(int r) override {
8648 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8649 dn->put(MDSCacheObject::PIN_PTRWAITER);
8650 }
8651 };
8652
8653 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
8654 {
8655 dout(10) << "open_remote_dentry " << *dn << dendl;
8656 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8657 inodeno_t ino = dnl->get_remote_ino();
8658 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1;
8659 open_ino(ino, pool,
8660 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8661 }
8662
8663 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
8664 bool want_xlocked, int r)
8665 {
8666 if (r < 0) {
8667 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8668 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8669 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8670 dn->state_set(CDentry::STATE_BADREMOTEINO);
8671
8672 std::string path;
8673 CDir *dir = dn->get_dir();
8674 if (dir) {
8675 dir->get_inode()->make_path_string(path);
8676 path += "/";
8677 path += dn->get_name();
8678 }
8679
8680 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8681 if (fatal) {
8682 mds->damaged();
8683 ceph_abort(); // unreachable, damaged() respawns us
8684 }
8685 } else {
8686 r = 0;
8687 }
8688 }
8689 fin->complete(r < 0 ? r : 0);
8690 }
8691
8692
8693 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8694 {
8695 // empty trace if we're a base inode
8696 if (in->is_base())
8697 return;
8698
8699 CInode *parent = in->get_parent_inode();
8700 ceph_assert(parent);
8701 make_trace(trace, parent);
8702
8703 CDentry *dn = in->get_parent_dn();
8704 dout(15) << "make_trace adding " << *dn << dendl;
8705 trace.push_back(dn);
8706 }
8707
8708
8709 // -------------------------------------------------------------------------------
8710 // Open inode by inode number
8711
8712 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8713 inodeno_t ino;
8714 public:
8715 bufferlist bl;
8716 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8717 MDCacheIOContext(c), ino(i) {}
8718 void finish(int r) override {
8719 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8720 }
8721 void print(ostream& out) const override {
8722 out << "openino_backtrace_fetch" << ino << ")";
8723 }
8724 };
8725
8726 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8727 inodeno_t ino;
8728 cref_t<MMDSOpenIno> msg;
8729 bool parent;
8730 public:
8731 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
8732 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8733 void finish(int r) override {
8734 if (r < 0 && !parent)
8735 r = -CEPHFS_EAGAIN;
8736 if (msg) {
8737 mdcache->handle_open_ino(msg, r);
8738 return;
8739 }
8740 auto& info = mdcache->opening_inodes.at(ino);
8741 mdcache->_open_ino_traverse_dir(ino, info, r);
8742 }
8743 };
8744
8745 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8746 inodeno_t ino;
8747 public:
8748 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8749 void finish(int r) override {
8750 mdcache->_open_ino_parent_opened(ino, r);
8751 }
8752 };
8753
8754 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8755 {
8756 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8757
8758 open_ino_info_t& info = opening_inodes.at(ino);
8759
8760 CInode *in = get_inode(ino);
8761 if (in) {
8762 dout(10) << " found cached " << *in << dendl;
8763 open_ino_finish(ino, info, in->authority().first);
8764 return;
8765 }
8766
8767 inode_backtrace_t backtrace;
8768 if (err == 0) {
8769 try {
8770 decode(backtrace, bl);
8771 } catch (const buffer::error &decode_exc) {
8772 derr << "corrupt backtrace on ino x0" << std::hex << ino
8773 << std::dec << ": " << decode_exc.what() << dendl;
8774 open_ino_finish(ino, info, -CEPHFS_EIO);
8775 return;
8776 }
8777 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8778 dout(10) << " old object in pool " << info.pool
8779 << ", retrying pool " << backtrace.pool << dendl;
8780 info.pool = backtrace.pool;
8781 C_IO_MDC_OpenInoBacktraceFetched *fin =
8782 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8783 fetch_backtrace(ino, info.pool, fin->bl,
8784 new C_OnFinisher(fin, mds->finisher));
8785 return;
8786 }
8787 } else if (err == -CEPHFS_ENOENT) {
8788 int64_t meta_pool = mds->get_metadata_pool();
8789 if (info.pool != meta_pool) {
8790 dout(10) << " no object in pool " << info.pool
8791 << ", retrying pool " << meta_pool << dendl;
8792 info.pool = meta_pool;
8793 C_IO_MDC_OpenInoBacktraceFetched *fin =
8794 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8795 fetch_backtrace(ino, info.pool, fin->bl,
8796 new C_OnFinisher(fin, mds->finisher));
8797 return;
8798 }
8799 err = 0; // backtrace.ancestors.empty() is checked below
8800 }
8801
8802 if (err == 0) {
8803 if (backtrace.ancestors.empty()) {
8804 dout(10) << " got empty backtrace " << dendl;
8805 err = -CEPHFS_ESTALE;
8806 } else if (!info.ancestors.empty()) {
8807 if (info.ancestors[0] == backtrace.ancestors[0]) {
8808 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8809 err = -CEPHFS_EINVAL;
8810 } else {
8811 info.last_err = 0;
8812 }
8813 }
8814 }
8815 if (err) {
8816 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8817 if (info.last_err)
8818 err = info.last_err;
8819 open_ino_finish(ino, info, err);
8820 return;
8821 }
8822
8823 dout(10) << " got backtrace " << backtrace << dendl;
8824 info.ancestors = backtrace.ancestors;
8825
8826 _open_ino_traverse_dir(ino, info, 0);
8827 }
8828
/*
 * Continuation of open_ino: called after the attempt to open ino's
 * parent directory inode completed with result 'ret' (a rank on
 * success, or an error code).
 */
void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
{
  dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;

  open_ino_info_t& info = opening_inodes.at(ino);

  // the target inode may have appeared in cache in the meantime
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret == mds->get_nodeid()) {
    // parent is local: keep walking down the ancestor chain
    _open_ino_traverse_dir(ino, info, 0);
  } else {
    if (ret >= 0) {
      // parent lives on another rank; record it as the auth hint and
      // allow that peer to be queried again
      mds_rank_t checked_rank = mds_rank_t(ret);
      info.check_peers = true;
      info.auth_hint = checked_rank;
      info.checked.erase(checked_rank);
    }
    do_open_ino(ino, info, ret);
  }
}
8854
/*
 * Resume the open_ino state machine by walking the cached ancestry of
 * 'ino'.  'ret' is the result of the previous step (0 = keep going).
 */
void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;

  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret) {
    // previous step failed; let do_open_ino() decide how to retry
    do_open_ino(ino, info, ret);
    return;
  }

  mds_rank_t hint = info.auth_hint;
  ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
			      info.discover, info.want_xlocked, &hint);
  if (ret > 0)
    return; // blocked; a retry context has been queued
  if (hint != mds->get_nodeid())
    info.auth_hint = hint;
  do_open_ino(ino, info, ret);
}
8880
8881 void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
8882 {
8883 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8884 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8885 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8886 if (mds->logger)
8887 mds->logger->inc(l_mds_openino_dir_fetch);
8888 }
8889
/*
 * Walk the backpointer chain 'ancestors' (entry 0 is the immediate
 * parent of 'ino') trying to load the dirfrag that should contain it.
 *
 * Return value:
 *   > 0 : blocked; a retry context has been queued (dir fetch,
 *         discover, unfreeze or lock waiter)
 *     0 : no definite determination made here
 *   < 0 : definite failure (-CEPHFS_ENOENT / -CEPHFS_ENOTDIR)
 *
 * 'discover' permits pulling missing dirfrags/paths from other ranks;
 * '*hint' (if non-null) receives the rank believed to be authoritative
 * for the first ancestor we could inspect.
 */
int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
				   const vector<inode_backpointer_t>& ancestors,
				   bool discover, bool want_xlocked, mds_rank_t *hint)
{
  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
  int err = 0;
  for (unsigned i = 0; i < ancestors.size(); i++) {
    const auto& ancestor = ancestors.at(i);
    CInode *diri = get_inode(ancestor.dirino);

    if (!diri) {
      // an mdsdir can always be opened remotely; any other missing
      // ancestor is skipped, hoping a higher one is in cache
      if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
	open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
      continue;
    }

    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
      // ancestor is a rejoin placeholder: fetch the nearest defined
      // parent dirfrag to materialize it
      CDir *dir = diri->get_parent_dir();
      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
	     dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
	dir = dir->get_inode()->get_parent_dir();
      _open_ino_fetch_dir(ino, m, dir, i == 0);
      return 1;
    }

    if (!diri->is_dir()) {
      dout(10) << " " << *diri << " is not dir" << dendl;
      if (i == 0)
	err = -CEPHFS_ENOTDIR;
      break;
    }

    const string& name = ancestor.dname;
    frag_t fg = diri->pick_dirfrag(name);
    CDir *dir = diri->get_dirfrag(fg);
    if (!dir) {
      if (diri->is_auth()) {
	if (diri->is_frozen()) {
	  dout(10) << " " << *diri << " is frozen, waiting " << dendl;
	  diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dir = diri->get_or_open_dirfrag(this, fg);
      } else if (discover) {
	open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
    }
    if (dir) {
      // the inode this dentry is expected to link to
      inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
      CDentry *dn = dir->lookup(name);
      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
      if (dir->is_auth()) {
	if (dnl && dnl->is_primary() &&
	    dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
	  dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	// the dentry may simply not be loaded yet; fetch unless the
	// bloom filter proves it cannot exist
	if (!dnl && !dir->is_complete() &&
	    (!dir->has_bloom() || dir->is_in_bloom(name))) {
	  dout(10) << " fetching incomplete " << *dir << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -CEPHFS_ENOENT;
      } else if (discover) {
	if (!dnl) {
	  filepath path(name, 0);
	  discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
			(i == 0 && want_xlocked));
	  return 1;
	}
	if (dnl->is_null() && !dn->lock.can_read(-1)) {
	  dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
	  dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -CEPHFS_ENOENT;
      }
    }
    if (hint && i == 0)
      *hint = dir ? dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}
8985
8986 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8987 {
8988 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8989
8990 MDSContext::vec waiters;
8991 waiters.swap(info.waiters);
8992 opening_inodes.erase(ino);
8993 finish_contexts(g_ceph_context, waiters, ret);
8994 }
8995
/*
 * Decide the next step for resolving 'ino': query peer MDSs, fetch the
 * backtrace object, or open the parent inode and traverse from there.
 * 'err' is the result of the previous attempt (0 on the first call).
 */
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -CEPHFS_EAGAIN) {
    // hard failure: reset the state machine and start over
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    // remember the more interesting error for the final completion
    if (err != -CEPHFS_ENOENT && err != -CEPHFS_ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    // read the backtrace object from info.pool; continues in
    // _open_ino_backtrace_fetched()
    info.check_peers = true;
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
		    new C_OnFinisher(fin, mds->finisher));
  } else {
    // we have an ancestry: open the immediate parent and walk down
    ceph_assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->get_metadata_pool(),
	     new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}
9037
/*
 * Pick the next peer MDS to ask about 'ino' and send it an MMDSOpenIno.
 * If every eligible peer has already been checked, restart the state
 * machine via do_open_ino().
 */
void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  // while we are rejoining, peers that are merely rejoining may answer;
  // otherwise require at least clientreplay
  if (mds->get_state() == MDSMap::STATE_REJOIN)
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
  else
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "do_open_ino_peer " << ino << " active " << active
	   << " all " << all << " checked " << info.checked << dendl;

  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t peer = MDS_RANK_NONE;
  if (info.auth_hint >= 0 && info.auth_hint != whoami) {
    // prefer the hinted authoritative rank, if it is currently usable
    if (active.count(info.auth_hint)) {
      peer = info.auth_hint;
      info.auth_hint = MDS_RANK_NONE;
    }
  } else {
    // otherwise take the first active peer we have not asked yet
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != whoami && info.checked.count(*p) == 0) {
	peer = *p;
	break;
      }
  }
  if (peer < 0) {
    all.erase(whoami);
    if (all != info.checked) {
      // some ranks exist but are not yet queryable; stay pending
      dout(10) << " waiting for more peers to be active" << dendl;
    } else {
      dout(10) << " all MDS peers have been checked " << dendl;
      do_open_ino(ino, info, 0);
    }
  } else {
    info.checking = peer;
    vector<inode_backpointer_t> *pa = NULL;
    // got backtrace from peer or backtrace just fetched
    if (info.discover || !info.fetch_backtrace)
      pa = &info.ancestors;
    mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
    if (mds->logger)
      mds->logger->inc(l_mds_openino_peer_discover);
  }
}
9083
/*
 * Handle an MMDSOpenIno query from a peer: reply with our ancestry for
 * the inode if we are auth, an auth hint if we only have a replica, or
 * the result of traversing the supplied ancestors otherwise.  A
 * negative 'err' forces an error reply.
 */
void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
{
  if (mds->get_state() < MDSMap::STATE_REJOIN &&
      mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
    return; // too early in startup to answer
  }

  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;

  auto from = mds_rank_t(m->get_source().num());
  inodeno_t ino = m->ino;
  ref_t<MMDSOpenInoReply> reply;
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " have " << *in << dendl;
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
    if (in->is_auth()) {
      touch_inode(in);
      // collect the backpointer chain from the inode up to the root
      while (1) {
	CDentry *pdn = in->get_parent_dn();
	if (!pdn)
	  break;
	CInode *diri = pdn->get_dir()->get_inode();
	reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
						       in->get_version()));
	in = diri;
      }
    } else {
      // we only hold a replica; point the requester at the auth rank
      reply->hint = in->authority().first;
    }
  } else if (err < 0) {
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
  } else {
    // try to resolve using the ancestors supplied in the message
    mds_rank_t hint = MDS_RANK_NONE;
    int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
    if (ret > 0)
      return; // blocked; the handler will be re-invoked when ready
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
  }
  mds->send_message_mds(reply, from);
}
9125
/*
 * Handle the reply to an MMDSOpenIno we sent: finish (inode found or
 * fatal error), adopt the returned ancestry and traverse locally, or
 * move on to the next peer.
 */
void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
{
  dout(10) << "handle_open_ino_reply " << *m << dendl;

  inodeno_t ino = m->ino;
  mds_rank_t from = mds_rank_t(m->get_source().num());
  auto it = opening_inodes.find(ino);
  // ignore stale replies (entry gone, or we are not waiting on 'from')
  if (it != opening_inodes.end() && it->second.checking == from) {
    open_ino_info_t& info = it->second;
    info.checking = MDS_RANK_NONE;
    info.checked.insert(from);

    CInode *in = get_inode(ino);
    if (in) {
      dout(10) << " found cached " << *in << dendl;
      open_ino_finish(ino, info, in->authority().first);
    } else if (!m->ancestors.empty()) {
      dout(10) << " found ino " << ino << " on mds." << from << dendl;
      if (!info.want_replica) {
	// caller only needs the location, not a local replica
	open_ino_finish(ino, info, from);
	return;
      }

      // adopt the peer's ancestry and discover a replica along it
      info.ancestors = m->ancestors;
      info.auth_hint = from;
      info.checking = mds->get_nodeid();
      info.discover = true;
      _open_ino_traverse_dir(ino, info, 0);
    } else if (m->error) {
      dout(10) << " error " << m->error << " from mds." << from << dendl;
      do_open_ino(ino, info, m->error);
    } else {
      if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
	// follow the peer's auth hint on the next query
	info.auth_hint = m->hint;
	info.checked.erase(m->hint);
      }
      do_open_ino_peer(ino, info);
    }
  }
}
9166
9167 void MDCache::kick_open_ino_peers(mds_rank_t who)
9168 {
9169 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9170
9171 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9172 p != opening_inodes.end();
9173 ++p) {
9174 open_ino_info_t& info = p->second;
9175 if (info.checking == who) {
9176 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9177 info.checking = MDS_RANK_NONE;
9178 do_open_ino_peer(p->first, info);
9179 } else if (info.checking == MDS_RANK_NONE) {
9180 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9181 do_open_ino_peer(p->first, info);
9182 }
9183 }
9184 }
9185
/*
 * Resolve inode number 'ino' to a CInode (optionally replicating it
 * locally) and complete 'fin' with the result (the auth rank on
 * success, or a negative error code).
 *
 * @param pool           data pool to read the backtrace from (< 0 means
 *                       the default file layout's pool)
 * @param want_replica   also pull a replica of the inode into our cache
 * @param want_xlocked   discover through xlocked dentries as well
 * @param ancestors_hint known backpointer chain (consumed if provided)
 * @param auth_hint      rank believed to be authoritative, if any
 */
void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
		       bool want_replica, bool want_xlocked,
		       vector<inode_backpointer_t> *ancestors_hint,
		       mds_rank_t auth_hint)
{
  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
	   << want_replica << dendl;

  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end()) {
    // an open_ino for this inode is already in flight; piggyback on it
    open_ino_info_t& info = it->second;
    if (want_replica) {
      info.want_replica = true;
      if (want_xlocked && !info.want_xlocked) {
	if (!info.ancestors.empty()) {
	  // re-issue the discover for the last hop with xlocked allowed
	  CInode *diri = get_inode(info.ancestors[0].dirino);
	  if (diri) {
	    frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
	    CDir *dir = diri->get_dirfrag(fg);
	    if (dir && !dir->is_auth()) {
	      filepath path(info.ancestors[0].dname, 0);
	      discover_path(dir, CEPH_NOSNAP, path, NULL, true);
	    }
	  }
	}
	info.want_xlocked = true;
      }
    }
    info.waiters.push_back(fin);
  } else {
    // start a new open_ino state machine
    open_ino_info_t& info = opening_inodes[ino];
    info.want_replica = want_replica;
    info.want_xlocked = want_xlocked;
    info.tid = ++open_ino_last_tid;
    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
    info.waiters.push_back(fin);
    if (auth_hint != MDS_RANK_NONE)
      info.auth_hint = auth_hint;
    if (ancestors_hint) {
      // caller already knows the ancestry; skip the backtrace fetch
      info.ancestors = std::move(*ancestors_hint);
      info.fetch_backtrace = false;
      info.checking = mds->get_nodeid();
      _open_ino_traverse_dir(ino, info, 0);
    } else {
      do_open_ino(ino, info, 0);
    }
  }
}
9234
9235 /* ---------------------------- */
9236
/*
 * Search for a given inode on MDS peers, optionally starting with a
 * hinted node.
 *
 * TODO:
 *  - recover from MDS node failure / during recovery
 *  - traverse the path
 */
void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
			     mds_rank_t hint, bool path_locked)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  CInode *in = get_inode(ino);
  // a purging inode is treated as already gone
  if (in && in->state_test(CInode::STATE_PURGING)) {
    c->complete(-CEPHFS_ESTALE);
    return;
  }
  // a cached, non-purging inode here means the caller should not have
  // needed a peer search at all
  ceph_assert(!in);

  // register the search and kick off the first peer query
  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];
  fip.ino = ino;
  fip.tid = tid;
  fip.fin = c;
  fip.path_locked = path_locked;
  fip.hint = hint;
  _do_find_ino_peer(fip);
}
9266
/*
 * Send the next MMDSFindIno query for 'fip': use the caller's hint
 * first, then iterate over active peers we have not checked yet.  Once
 * every peer has been checked, fail the search with -CEPHFS_ESTALE.
 */
void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
	   << " active " << active << " all " << all
	   << " checked " << fip.checked
	   << dendl;

  mds_rank_t m = MDS_RANK_NONE;
  if (fip.hint >= 0) {
    // use the caller-provided hint exactly once
    m = fip.hint;
    fip.hint = MDS_RANK_NONE;
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() &&
	  fip.checked.count(*p) == 0) {
	m = *p;
	break;
      }
  }
  if (m == MDS_RANK_NONE) {
    all.erase(mds->get_nodeid());
    if (all != fip.checked) {
      // ranks remain that are not yet active; keep the request pending
      dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
    } else {
      dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
      fip.fin->complete(-CEPHFS_ESTALE);
      find_ino_peer.erase(fip.tid);
    }
  } else {
    fip.checking = m;
    mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
  }
}
9304
9305 void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
9306 {
9307 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9308 return;
9309 }
9310
9311 dout(10) << "handle_find_ino " << *m << dendl;
9312 auto r = make_message<MMDSFindInoReply>(m->tid);
9313 CInode *in = get_inode(m->ino);
9314 if (in) {
9315 in->make_path(r->path);
9316 dout(10) << " have " << r->path << " " << *in << dendl;
9317 }
9318 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
9319 }
9320
9321
/*
 * Handle a peer's MMDSFindInoReply: if the peer returned a path,
 * traverse it (with discovery) to pull the inode in; otherwise try the
 * next peer.
 */
void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
{
  auto p = find_ino_peer.find(m->tid);
  if (p != find_ino_peer.end()) {
    dout(10) << "handle_find_ino_reply " << *m << dendl;
    find_ino_peer_info_t& fip = p->second;

    // success?
    if (get_inode(fip.ino)) {
      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
      mds->queue_waiter(fip.fin);
      find_ino_peer.erase(p);
      return;
    }

    mds_rank_t from = mds_rank_t(m->get_source().num());
    if (fip.checking == from)
      fip.checking = MDS_RANK_NONE;
    fip.checked.insert(from);

    if (!m->path.empty()) {
      // we got a path!
      vector<CDentry*> trace;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int flags = MDS_TRAVERSE_DISCOVER;
      if (fip.path_locked)
	flags |= MDS_TRAVERSE_PATH_LOCKED;
      int r = path_traverse(null_ref, cf, m->path, flags, &trace);
      if (r > 0)
	return; // blocked; this message will be redelivered to retry
      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
	      << ", retrying" << dendl;
      // traversal failed outright; restart the peer search from scratch
      fip.checked.clear();
      _do_find_ino_peer(fip);
    } else {
      // nope, continue.
      _do_find_ino_peer(fip);
    }
  } else {
    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
  }
}
9365
9366 void MDCache::kick_find_ino_peers(mds_rank_t who)
9367 {
9368 // find_ino_peers requests we should move on from
9369 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9370 p != find_ino_peer.end();
9371 ++p) {
9372 find_ino_peer_info_t& fip = p->second;
9373 if (fip.checking == who) {
9374 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9375 fip.checking = MDS_RANK_NONE;
9376 _do_find_ino_peer(fip);
9377 } else if (fip.checking == MDS_RANK_NONE) {
9378 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9379 _do_find_ino_peer(fip);
9380 }
9381 }
9382 }
9383
9384 /* ---------------------------- */
9385
9386 int MDCache::get_num_client_requests()
9387 {
9388 int count = 0;
9389 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9390 p != active_requests.end();
9391 ++p) {
9392 MDRequestRef& mdr = p->second;
9393 if (mdr->reqid.name.is_client() && !mdr->is_peer())
9394 count++;
9395 }
9396 return count;
9397 }
9398
9399 MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
9400 {
9401 // did we win a forward race against a peer?
9402 if (active_requests.count(req->get_reqid())) {
9403 MDRequestRef& mdr = active_requests[req->get_reqid()];
9404 ceph_assert(mdr);
9405 if (mdr->is_peer()) {
9406 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9407 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9408 } else {
9409 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9410 }
9411 return MDRequestRef();
9412 }
9413
9414 // register new client request
9415 MDRequestImpl::Params params;
9416 params.reqid = req->get_reqid();
9417 params.attempt = req->get_num_fwd();
9418 params.client_req = req;
9419 params.initiated = req->get_recv_stamp();
9420 params.throttled = req->get_throttle_stamp();
9421 params.all_read = req->get_recv_complete_stamp();
9422 params.dispatched = req->get_dispatch_stamp();
9423
9424 MDRequestRef mdr =
9425 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9426 active_requests[params.reqid] = mdr;
9427 mdr->set_op_stamp(req->get_stamp());
9428 dout(7) << "request_start " << *mdr << dendl;
9429 return mdr;
9430 }
9431
// Register a request driven by a peer (leader) MDS; 'm' is the message
// that triggered it.
MDRequestRef MDCache::request_start_peer(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
{
  int by = m->get_source().num();  // rank of the initiating MDS
  MDRequestImpl::Params params;
  params.reqid = ri;
  params.attempt = attempt;
  params.triggering_peer_req = m;
  params.peer_to = by;
  // timing stamps are inherited from the triggering message
  params.initiated = m->get_recv_stamp();
  params.throttled = m->get_throttle_stamp();
  params.all_read = m->get_recv_complete_stamp();
  params.dispatched = m->get_dispatch_stamp();
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  ceph_assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_peer " << *mdr << " by mds." << by << dendl;
  return mdr;
}
9451
// Register a self-initiated (internal) request of type 'op'; all timing
// phases are stamped with the current time.
MDRequestRef MDCache::request_start_internal(int op)
{
  utime_t now = ceph_clock_now();
  MDRequestImpl::Params params;
  // internal requests are named after this MDS rank, with a fresh tid
  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
  params.reqid.tid = mds->issue_tid();
  params.initiated = now;
  params.throttled = now;
  params.all_read = now;
  params.dispatched = now;
  params.internal_op = op;
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);

  ceph_assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
  return mdr;
}
9471
9472 MDRequestRef MDCache::request_get(metareqid_t rid)
9473 {
9474 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9475 ceph_assert(p != active_requests.end());
9476 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9477 return p->second;
9478 }
9479
/*
 * Complete a request.  For a peer request with a pending commit
 * callback, run the callback (which must call request_finish() again);
 * otherwise bump the per-op counters and tear the request down.
 */
void MDCache::request_finish(MDRequestRef& mdr)
{
  dout(7) << "request_finish " << *mdr << dendl;
  mdr->mark_event("finishing request");

  // peer finisher?
  if (mdr->has_more() && mdr->more()->peer_commit) {
    Context *fin = mdr->more()->peer_commit;
    mdr->more()->peer_commit = 0;
    int ret;
    if (mdr->aborted) {
      // leader asked us to abort: roll back instead of committing
      mdr->aborted = false;
      ret = -1;
      mdr->more()->peer_rolling_back = true;
    } else {
      ret = 0;
      mdr->committing = true;
    }
    fin->complete(ret);   // this must re-call request_finish.
    return;
  }

  // account internal ops in the perf counters
  switch(mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      logger->inc(l_mdss_ireq_fragmentdir);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      logger->inc(l_mdss_ireq_exportdir);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      logger->inc(l_mdss_ireq_enqueue_scrub);
      break;
    case CEPH_MDS_OP_FLUSH:
      logger->inc(l_mdss_ireq_flush);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      logger->inc(l_mdss_ireq_fragstats);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      logger->inc(l_mdss_ireq_inodestats);
      break;
  }

  request_cleanup(mdr);
}
9525
9526
/*
 * Forward a client request to rank 'who' and clean up our local state
 * for it.  ('port' is not referenced in this body.)
 */
void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
{
  CachedStackStringStream css;
  *css << "forwarding request to mds." << who;
  mdr->mark_event(css->strv());
  if (mdr->client_request && mdr->client_request->get_source().is_client()) {
    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
	    << *mdr->client_request << dendl;
    if (mdr->is_batch_head()) {
      // the batch op forwards on behalf of all batched requests
      mdr->release_batch_op()->forward(who);
    } else {
      mds->forward_message_mds(mdr->release_client_request(), who);
    }
    if (mds->logger) mds->logger->inc(l_mds_forward);
  } else if (mdr->internal_op >= 0) {
    // internal ops cannot be forwarded; cancel with EXDEV.
    // NOTE(review): internal_op_finish is dereferenced without a null
    // check — confirm every forwardable internal op sets it.
    dout(10) << "request_forward on internal op; cancelling" << dendl;
    mdr->internal_op_finish->complete(-CEPHFS_EXDEV);
  } else {
    // requests that came from another MDS are simply dropped
    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
	    << " was from mds" << dendl;
  }
  request_cleanup(mdr);
}
9550
9551
/*
 * Route a request to its handler: client and peer requests go to the
 * Server; internal ops are dispatched here by opcode.
 */
void MDCache::dispatch_request(MDRequestRef& mdr)
{
  if (mdr->client_request) {
    mds->server->dispatch_client_request(mdr);
  } else if (mdr->peer_request) {
    mds->server->dispatch_peer_request(mdr);
  } else {
    switch (mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      dispatch_fragment_dir(mdr);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      migrator->dispatch_export_dir(mdr, 0);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      enqueue_scrub_work(mdr);
      break;
    case CEPH_MDS_OP_FLUSH:
      flush_dentry_work(mdr);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      repair_dirfrag_stats_work(mdr);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      repair_inode_stats_work(mdr);
      break;
    case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
      rdlock_dirfrags_stats_work(mdr);
      break;
    default:
      ceph_abort();  // unknown internal op
    }
  }
}
9586
9587
/*
 * Notify all peer ranks involved in this request that it is finished
 * (which implicitly drops remote pins and locks on their side), then
 * strip the corresponding foreign lock state out of mdr's lock list.
 */
void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
{
  if (!mdr->has_more())
    return;

  // clean up peers
  // (will implicitly drop remote dn pins)
  for (set<mds_rank_t>::iterator p = mdr->more()->peers.begin();
       p != mdr->more()->peers.end();
       ++p) {
    auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
					   MMDSPeerRequest::OP_FINISH);

    if (mdr->killed && !mdr->committing) {
      r->mark_abort();
    } else if (mdr->more()->srcdn_auth_mds == *p &&
	       mdr->more()->inode_import.length() > 0) {
      // information about rename imported caps
      r->inode_export = std::move(mdr->more()->inode_import);
    }

    mds->send_message_mds(r, *p);
  }

  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
   * implicitly. Note that we don't call the finishers -- there shouldn't
   * be any on a remote lock and the request finish wakes up all
   * the waiters anyway! */

  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
    SimpleLock *lock = it->lock;
    if (it->is_xlock() && !lock->get_parent()->is_auth()) {
      dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
	       << " on " << lock->get_parent() << dendl;
      lock->put_xlock();
      mdr->locks.erase(it++);
    } else if (it->is_remote_wrlock()) {
      dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
	       << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
      if (it->is_wrlock()) {
	// also locally wrlocked: keep the entry, clear only the remote part
	it->clear_remote_wrlock();
	++it;
      } else {
	mdr->locks.erase(it++);
      }
    } else {
      ++it;
    }
  }

  mdr->more()->peers.clear(); /* we no longer have requests out to them, and
                               * leaving them in can cause double-notifies as
                               * this function can get called more than once */
}
9642
// Drop everything except rdlocks: foreign lock state first, then the
// local non-rdlocks via the Locker.
void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_non_rdlocks(mdr.get());
}
9648
// Drop all locks held by the request: foreign lock state first, then
// the local locks via the Locker.
void MDCache::request_drop_locks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_locks(mdr.get());
}
9654
/*
 * Release everything a request holds — locks, auth pins, sticky dirs,
 * cache pins — and unregister it from the session and the active map.
 */
void MDCache::request_cleanup(MDRequestRef& mdr)
{
  dout(15) << "request_cleanup " << *mdr << dendl;

  if (mdr->has_more()) {
    if (mdr->more()->is_ambiguous_auth)
      mdr->clear_ambiguous_auth();
    // wake contexts queued to run when this request finishes
    if (!mdr->more()->waiting_for_finish.empty())
      mds->queue_waiters(mdr->more()->waiting_for_finish);
  }

  request_drop_locks(mdr);

  // drop (local) auth pins
  mdr->drop_local_auth_pins();

  // drop stickydirs
  mdr->put_stickydirs();

  mds->locker->kick_cap_releases(mdr);

  // drop cache pins
  mdr->drop_pins();

  // remove from session
  mdr->item_session_request.remove_myself();

  // remove from map
  active_requests.erase(mdr->reqid);

  if (mds->logger)
    log_stat();

  mdr->mark_event("cleaned up request");
}
9690
9691 void MDCache::request_kill(MDRequestRef& mdr)
9692 {
9693 // rollback peer requests is tricky. just let the request proceed.
9694 if (mdr->has_more() &&
9695 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_peer.empty())) {
9696 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9697 ceph_assert(mdr->more()->witnessed.empty());
9698 mdr->aborted = true;
9699 dout(10) << "request_kill " << *mdr << " -- waiting for peer reply, delaying" << dendl;
9700 } else {
9701 dout(10) << "request_kill " << *mdr << " -- already started peer prep, no-op" << dendl;
9702 }
9703
9704 ceph_assert(mdr->used_prealloc_ino == 0);
9705 ceph_assert(mdr->prealloc_inos.empty());
9706
9707 mdr->session = NULL;
9708 mdr->item_session_request.remove_myself();
9709 return;
9710 }
9711
9712 mdr->killed = true;
9713 mdr->mark_event("killing request");
9714
9715 if (mdr->committing) {
9716 dout(10) << "request_kill " << *mdr << " -- already committing, remove it from sesssion requests" << dendl;
9717 mdr->item_session_request.remove_myself();
9718 } else {
9719 dout(10) << "request_kill " << *mdr << dendl;
9720 request_cleanup(mdr);
9721 }
9722 }
9723
9724 // -------------------------------------------------------------------------------
9725 // SNAPREALMS
9726
9727 void MDCache::create_global_snaprealm()
9728 {
9729 CInode *in = new CInode(this); // dummy inode
9730 create_unlinked_system_inode(in, CEPH_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9731 add_inode(in);
9732 global_snaprealm = in->snaprealm;
9733 }
9734
/*
 * Invalidate the cached snap state of in's snaprealm and every
 * descendant realm, and (optionally) send MClientSnap updates to each
 * client holding caps in any affected realm.
 */
void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
{
  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;

  vector<inodeno_t> split_inos;
  vector<inodeno_t> split_realms;

  if (notify_clients) {
    if (snapop == CEPH_SNAP_OP_SPLIT) {
      // notify clients of update|split
      for (auto p = in->snaprealm->inodes_with_caps.begin(); !p.end(); ++p)
	split_inos.push_back((*p)->ino());

      for (auto& r : in->snaprealm->open_children)
	split_realms.push_back(r->inode->ino());
    }
  }

  map<client_t, ref_t<MClientSnap>> updates;
  // breadth-first walk over the realm and all its open children
  list<SnapRealm*> q;
  q.push_back(in->snaprealm);
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
    realm->invalidate_cached_snaps();

    if (notify_clients) {
      // build (at most) one message per client, shared across realms
      for (const auto& p : realm->client_caps) {
	const auto& client = p.first;
	const auto& caps = p.second;
	ceph_assert(!caps->empty());

	auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
	if (em.second) {
	  auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
	  update->head.split = in->ino();
	  update->split_inos = split_inos;
	  update->split_realms = split_realms;
	  update->bl = in->snaprealm->get_snap_trace();
	  em.first->second = std::move(update);
	}
      }
    }

    // notify for active children, too.
    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
    for (auto& r : realm->open_children)
      q.push_back(r);
  }

  if (notify_clients)
    send_snaps(updates);
}
9790
/*
 * Broadcast a snap change on auth inode 'in' to other ranks.  For a
 * snap-table transaction (stid > 0) notify every resolve-or-later rank
 * and also refresh clients' view of the global snaprealm; otherwise
 * notify only the inode's replicas.
 */
void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
{
  dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
  ceph_assert(in->is_auth());

  set<mds_rank_t> mds_set;
  if (stid > 0) {
    mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
    mds_set.erase(mds->get_nodeid());
  } else {
    in->list_replicas(mds_set);
  }

  if (!mds_set.empty()) {
    // encode the snap state once and share the blob across all messages
    bufferlist snap_blob;
    in->encode_snap(snap_blob);

    for (auto p : mds_set) {
      auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
      m->snap_blob = snap_blob;
      mds->send_message_mds(m, p);
    }
  }

  if (stid > 0)
    notify_global_snaprealm_update(snap_op);
}
9818
/*
 * Handle a snap update broadcast from another MDS (the non-auth side of
 * send_snap_update()).  Updates the local replica's snap state and, when
 * we are far enough along in recovery, pushes the change to clients.
 */
void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;

  // too early in recovery to process this; drop it
  if (mds->get_state() < MDSMap::STATE_RESOLVE &&
      mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
    return;
  }

  // null rejoin_done means open_snaprealms() has already been called
  bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
			(mds->is_rejoin() && !rejoin_done);

  if (m->get_tid() > 0) {
    // acknowledge the snap-table transaction before touching the inode
    mds->snapclient->notify_commit(m->get_tid());
    if (notify_clients)
      notify_global_snaprealm_update(m->get_snap_op());
  }

  CInode *in = get_inode(m->get_ino());
  if (in) {
    ceph_assert(!in->is_auth());
    // only apply the update once this inode is past its rejoin phase
    if (mds->get_state() > MDSMap::STATE_REJOIN ||
	(mds->is_rejoin() && !in->is_rejoining())) {
      auto p = m->snap_blob.cbegin();
      in->decode_snap(p);

      if (!notify_clients) {
	// defer the client notification: pin the inode and queue it for
	// open_snaprealms() to handle when rejoin completes
	if (!rejoin_pending_snaprealms.count(in)) {
	  in->get(CInode::PIN_OPENINGSNAPPARENTS);
	  rejoin_pending_snaprealms.insert(in);
	}
      }
      do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
    }
  }
}
9857
9858 void MDCache::notify_global_snaprealm_update(int snap_op)
9859 {
9860 if (snap_op != CEPH_SNAP_OP_DESTROY)
9861 snap_op = CEPH_SNAP_OP_UPDATE;
9862 set<Session*> sessions;
9863 mds->sessionmap.get_client_session_set(sessions);
9864 for (auto &session : sessions) {
9865 if (!session->is_open() && !session->is_stale())
9866 continue;
9867 auto update = make_message<MClientSnap>(snap_op);
9868 update->head.split = global_snaprealm->inode->ino();
9869 update->bl = global_snaprealm->get_snap_trace();
9870 mds->send_message_client_counted(update, session);
9871 }
9872 }
9873
9874 // -------------------------------------------------------------------------------
9875 // STRAYS
9876
// Completion context that resumes scan_stray_dir() at a saved dirfrag once
// a blocking condition (auth-pin unavailable, dirfrag fetch) has cleared.
struct C_MDC_RetryScanStray : public MDCacheContext {
  dirfrag_t next;  // resume point handed back to scan_stray_dir()
  C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
  void finish(int r) override {
    mdcache->scan_stray_dir(next);
  }
};
9884
/*
 * Walk the dentries of all stray directories, marking each dentry STRAY and
 * feeding primary-linked inodes to maybe_eval_stray() (inodes with nlink==0
 * are additionally flagged ORPHAN).
 *
 * @param next resume point: when a dirfrag cannot be auth-pinned or is
 *        incomplete we register a C_MDC_RetryScanStray waiter/fetch and
 *        return; the retry re-enters here with that dirfrag.  The default
 *        dirfrag_t() starts from the beginning.
 */
void MDCache::scan_stray_dir(dirfrag_t next)
{
  dout(10) << "scan_stray_dir " << next << dendl;

  // re-resolve the saved frag against the current fragtree, in case the
  // stray dir was refragmented while we were waiting
  if (next.ino)
    next.frag = strays[MDS_INO_STRAY_INDEX(next.ino)]->dirfragtree[next.frag.value()];

  for (int i = 0; i < NUM_STRAY; ++i) {
    // skip stray dirs we already finished on a previous pass
    if (strays[i]->ino() < next.ino)
      continue;

    std::vector<CDir*> ls;
    strays[i]->get_dirfrags(ls);

    for (const auto& dir : ls) {
      // NOTE(review): next.frag was saved for the stray dir matching
      // next.ino, but this skip also applies to later stray dirs — confirm
      // that is intended (frags below next.frag in later strays are skipped).
      if (dir->get_frag() < next.frag)
	continue;

      if (!dir->can_auth_pin()) {
	// frozen/freezing: resume at this dirfrag once it thaws
	dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_RetryScanStray(this, dir->dirfrag()));
	return;
      }

      if (!dir->is_complete()) {
	// fetch the dirfrag contents, then resume at this dirfrag
	dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
	return;
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	dn->state_set(CDentry::STATE_STRAY);
	CDentry::linkage_t *dnl = dn->get_projected_linkage();
	if (dnl->is_primary()) {
	  CInode *in = dnl->get_inode();
	  if (in->get_inode()->nlink == 0)
	    in->state_set(CInode::STATE_ORPHAN);
	  maybe_eval_stray(in);
	}
      }
    }
  }
}
9927
/*
 * Asynchronously read the backtrace for @ino: a getxattr of the "parent"
 * attribute on the inode's first object in data pool @pool.  @fin fires
 * when the read completes, with the raw backtrace bytes left in @bl.
 */
void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
{
  object_t oid = CInode::get_object_name(ino, frag_t(), "");
  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
  if (mds->logger)
    mds->logger->inc(l_mds_openino_backtrace_fetch);
}
9935
9936
9937
9938
9939
9940 // ========================================================================================
9941 // DISCOVER
9942 /*
9943
9944 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9945 to the parent metadata object in the cache (pinning it).
9946
9947 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9948
9949 */
9950
9951 void MDCache::_send_discover(discover_info_t& d)
9952 {
9953 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
9954 d.want_base_dir, d.path_locked);
9955 dis->set_tid(d.tid);
9956 mds->send_message_mds(dis, d.mds);
9957 }
9958
9959 void MDCache::discover_base_ino(inodeno_t want_ino,
9960 MDSContext *onfinish,
9961 mds_rank_t from)
9962 {
9963 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
9964 if (waiting_for_base_ino[from].count(want_ino) == 0) {
9965 discover_info_t& d = _create_discover(from);
9966 d.ino = want_ino;
9967 _send_discover(d);
9968 }
9969 waiting_for_base_ino[from][want_ino].push_back(onfinish);
9970 }
9971
9972
9973 void MDCache::discover_dir_frag(CInode *base,
9974 frag_t approx_fg,
9975 MDSContext *onfinish,
9976 mds_rank_t from)
9977 {
9978 if (from < 0)
9979 from = base->authority().first;
9980
9981 dirfrag_t df(base->ino(), approx_fg);
9982 dout(7) << "discover_dir_frag " << df
9983 << " from mds." << from << dendl;
9984
9985 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
9986 discover_info_t& d = _create_discover(from);
9987 d.pin_base(base);
9988 d.ino = base->ino();
9989 d.frag = approx_fg;
9990 d.want_base_dir = true;
9991 _send_discover(d);
9992 }
9993
9994 if (onfinish)
9995 base->add_dir_waiter(approx_fg, onfinish);
9996 }
9997
// Completion context that re-issues a CInode-based discover_path() once the
// base inode's ambiguous-auth condition clears (queued with no waiter of
// its own, hence the 0 onfinish).
struct C_MDC_RetryDiscoverPath : public MDCacheContext {
  CInode *base;      // inode the path is relative to
  snapid_t snapid;   // snapid the original discover targeted
  filepath path;     // remaining path to discover
  mds_rank_t from;   // rank the original discover targeted
  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
    MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0, from);
  }
};
10009
/*
 * Discover a path relative to a base CInode from another rank.
 *
 * @param base        inode the path is relative to
 * @param snap        snapid to discover at
 * @param want_path   path components to traverse
 * @param onfinish    optional waiter queued on the first frag; may be null
 * @param path_locked allow discovery of an xlocked tail dentry (rename
 *                    srcdn discovery); see handle_discover()
 * @param from        target rank, or <0 to use base's auth
 */
void MDCache::discover_path(CInode *base,
			    snapid_t snap,
			    filepath want_path,
			    MDSContext *onfinish,
			    bool path_locked,
			    mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
	  << (path_locked ? " path_locked":"")
	  << dendl;

  if (base->is_ambiguous_auth()) {
    // auth is in flux (e.g. mid-migration): retry once it settles
    dout(10) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we became the auth ourselves; just wake anyone waiting for the dir
    MDSContext::vec finished;
    base->take_waiting(CInode::WAIT_DIR, finished);
    mds->queue_waiters(finished);
    return;
  }

  frag_t fg = base->pick_dirfrag(want_path[0]);
  // send unless an equivalent discover is already in flight (someone waits
  // on the frag) and we have a waiter to park; a path_locked single-dentry
  // discover always sends, as does a fire-and-forget (null onfinish) call
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dir(fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base);
    d.frag = fg;
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = true;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dir_waiter(fg, onfinish);
}
10055
// Completion context that re-issues a CDir-based discover_path() once the
// base dirfrag's ambiguous-auth condition clears.
struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
  CDir *base;        // dirfrag the path is relative to
  snapid_t snapid;   // snapid the original discover targeted
  filepath path;     // remaining path to discover
  C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
    MDCacheContext(c), base(b), snapid(s), path(p) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0);
  }
};
10066
/*
 * Discover a path relative to a base CDir from that dirfrag's auth.
 * CDir counterpart of the CInode overload above; here the base dir is
 * already known, so want_base_dir is false and waiters park on the first
 * dentry instead of a frag.
 */
void MDCache::discover_path(CDir *base,
			    snapid_t snap,
			    filepath want_path,
			    MDSContext *onfinish,
			    bool path_locked)
{
  mds_rank_t from = base->authority().first;

  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
	  << (path_locked ? " path_locked":"")
	  << dendl;

  if (base->is_ambiguous_auth()) {
    // auth is in flux: retry once it settles
    dout(7) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we became the auth ourselves; wake everything parked under this dir
    MDSContext::vec finished;
    base->take_sub_waiting(finished);
    mds->queue_waiters(finished);
    return;
  }

  // send unless an equivalent discover is already in flight (someone waits
  // on the first dentry) and we have a waiter to park; path_locked
  // single-dentry and fire-and-forget (null onfinish) calls always send
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base->inode);
    d.frag = base->get_frag();
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = false;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dentry_waiter(want_path[0], snap, onfinish);
}
10109
10110 void MDCache::kick_discovers(mds_rank_t who)
10111 {
10112 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10113 p != discovers.end();
10114 ++p) {
10115 if (p->second.mds != who)
10116 continue;
10117 _send_discover(p->second);
10118 }
10119 }
10120
10121
/*
 * Serve an MDiscover from another rank: walk the requested path from the
 * base inode, appending ([dir] dentry inode) triples to the reply trace and
 * adding the requester as a replica of each object encoded.
 *
 * The walk stops early (or waits and retries) on frozen/xlocked/incomplete
 * objects; the rule throughout is: if the reply is still empty we may block
 * and retry the whole message, but once it carries data we send what we
 * have, since the requester can continue from there.  Error/auth-hint flags
 * on the reply tell the requester how to proceed.
 */
void MDCache::handle_discover(const cref_t<MDiscover> &dis)
{
  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t from = mds_rank_t(dis->get_source().num());

  ceph_assert(from != whoami);

  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
    // too early to serve anything; drop
    if (mds->get_state() < MDSMap::STATE_REJOIN &&
	mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
      return;
    }

    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
    // delay processing request from survivor because we may not yet choose lock states.
    if (!mds->mdsmap->is_rejoin(from)) {
      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
      return;
    }
  }


  CInode *cur = 0;
  auto reply = make_message<MDiscoverReply>(*dis);

  snapid_t snapid = dis->get_snapid();

  // get started.
  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
    // wants root
    dout(7) << "handle_discover from mds." << from
	    << " wants base + " << dis->get_want().get_path()
	    << " snap " << snapid
	    << dendl;

    cur = get_inode(dis->get_base_ino());
    ceph_assert(cur);

    // add root
    reply->starts_with = MDiscoverReply::INODE;
    encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
    dout(10) << "added base " << *cur << dendl;
  }
  else {
    // there's a base inode
    cur = get_inode(dis->get_base_ino(), snapid);
    if (!cur && snapid != CEPH_NOSNAP) {
      // fall back to the head inode if it covers this snap
      cur = get_inode(dis->get_base_ino());
      if (cur && !cur->is_multiversion())
	cur = NULL;  // nope!
    }

    if (!cur) {
      dout(7) << "handle_discover mds." << from
	      << " don't have base ino " << dis->get_base_ino() << "." << snapid
	      << dendl;
      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
	reply->set_error_dentry(dis->get_dentry(0));
      reply->set_flag_error_dir();
    } else if (dis->wants_base_dir()) {
      dout(7) << "handle_discover mds." << from
	      << " wants basedir+" << dis->get_want().get_path()
	      << " has " << *cur
	      << dendl;
    } else {
      dout(7) << "handle_discover mds." << from
	      << " wants " << dis->get_want().get_path()
	      << " has " << *cur
	      << dendl;
    }
  }

  ceph_assert(reply);

  // add content
  // do some fidgeting to include a dir if they asked for the base dir, or just root.
  for (unsigned i = 0;
       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
       i++) {

    // -- figure out the dir

    // is *cur even a dir at all?
    if (!cur->is_dir()) {
      dout(7) << *cur << " not a dir" << dendl;
      reply->set_flag_error_dir();
      break;
    }

    // pick frag
    frag_t fg;
    if (dis->get_want().depth()) {
      // dentry specifies
      fg = cur->pick_dirfrag(dis->get_dentry(i));
    } else {
      // requester explicitly specified the frag
      ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
      fg = dis->get_base_dir_frag();
      if (!cur->dirfragtree.is_leaf(fg))
	fg = cur->dirfragtree[fg.value()];
    }
    CDir *curdir = cur->get_dirfrag(fg);

    if ((!curdir && !cur->is_auth()) ||
	(curdir && !curdir->is_auth())) {

      /* before:
       * ONLY set flag if empty!!
       * otherwise requester will wake up waiter(s) _and_ continue with discover,
       * resulting in duplicate discovers in flight,
       * which can wreak havoc when discovering rename srcdn (which may move)
       */

      if (reply->is_empty()) {
	// only hint if empty.
	//  someday this could be better, but right now the waiter logic isn't smart enough.

	// hint
	if (curdir) {
	  dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
	  reply->set_dir_auth_hint(curdir->authority().first);
	} else {
	  dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
		  << *cur << dendl;
	  reply->set_dir_auth_hint(cur->authority().first);
	}

	// note error dentry, if any
	//  NOTE: important, as it allows requester to issue an equivalent discover
	//        to whomever we hint at.
	if (dis->get_want().depth() > i)
	  reply->set_error_dentry(dis->get_dentry(i));
      }

      break;
    }

    if (!curdir) { // open dir?
      if (cur->is_frozen()) {
	if (!reply->is_empty()) {
	  dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
	  break;
	}
	dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
	cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
	return;
      }
      curdir = cur->get_or_open_dirfrag(this, fg);
    } else if (curdir->is_frozen_tree() ||
	       (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
      if (!reply->is_empty()) {
	dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
	break;
      }
      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
	dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
	reply->set_flag_error_dir();
	break;
      }
      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
      return;
    }

    // add dir
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
    } else if (reply->is_empty() && !dis->wants_base_dir()) {
      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
      // make sure the base frag is correct, though, in case there was a refragment since the
      // original request was sent.
      reply->set_base_dir_frag(curdir->get_frag());
    } else {
      ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
      if (!reply->trace.length())
	reply->starts_with = MDiscoverReply::DIR;
      encode_replica_dir(curdir, from, reply->trace);
      dout(7) << "handle_discover added dir " << *curdir << dendl;
    }

    // lookup
    CDentry *dn = 0;
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
      ceph_assert(!curdir->has_bloom());
    } else if (dis->get_want().depth() > 0) {
      // lookup dentry
      dn = curdir->lookup(dis->get_dentry(i), snapid);
    } else
      break; // done!

    // incomplete dir?
    if (!dn) {
      if (!curdir->is_complete() &&
	  !(snapid == CEPH_NOSNAP &&
	    curdir->has_bloom() &&
	    !curdir->is_in_bloom(dis->get_dentry(i)))) {
	// readdir
	dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
	if (reply->is_empty()) {
	  // fetch and wait
	  curdir->fetch(new C_MDS_RetryMessage(mds, dis),
			dis->wants_base_dir() && curdir->get_version() == 0);
	  return;
	} else {
	  // initiate fetch, but send what we have so far
	  curdir->fetch(0);
	  break;
	}
      }

      if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
	dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
		<< " dne, non-empty reply, stopping" << dendl;
	break;
      }

      // send null dentry
      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
	      << *curdir << dendl;
      if (snapid == CEPH_NOSNAP)
	dn = curdir->add_null_dentry(dis->get_dentry(i));
      else
	dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
    }
    ceph_assert(dn);

    // don't add replica to purging dentry/inode
    if (dn->state_test(CDentry::STATE_PURGING)) {
      if (reply->is_empty())
	reply->set_flag_error_dn(dis->get_dentry(i));
      break;
    }

    CDentry::linkage_t *dnl = dn->get_linkage();

    // xlocked dentry?
    //  ...always block on non-tail items (they are unrelated)
    //  ...allow xlocked tail discovery _only_ if explicitly requested
    if (dn->lock.is_xlocked()) {
      // is this the last (tail) item in the discover traversal?
      if (dis->is_path_locked()) {
	dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
      } else if (reply->is_empty()) {
	dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
	dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
	return;
      } else {
	dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
	break;
      }
    }

    // frozen inode?
    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
      if (tailitem && dis->is_path_locked()) {
	dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
      } else if (reply->is_empty()) {
	dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
	dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
	return;
      } else {
	dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
	break;
      }
    }

    // add dentry
    if (!reply->trace.length())
      reply->starts_with = MDiscoverReply::DENTRY;
    encode_replica_dentry(dn, from, reply->trace);
    dout(7) << "handle_discover added dentry " << *dn << dendl;

    if (!dnl->is_primary()) break;  // stop on null or remote link.

    // add inode
    CInode *next = dnl->get_inode();
    ceph_assert(next->is_auth());

    encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
    dout(7) << "handle_discover added inode " << *next << dendl;

    // descend, keep going.
    cur = next;
    continue;
  }

  // how did we do?
  ceph_assert(!reply->is_empty());
  dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
  mds->send_message(reply, dis->get_connection());
}
10417
/*
 * Process an MDiscoverReply: decode the ([dir] dentry inode) trace into our
 * cache as replicas, wake the waiters that piled up on the discover, and
 * handle the error / dir_auth_hint cases by either waking waiters with
 * -CEPHFS_ENOENT or re-issuing a discover to the hinted auth.
 */
void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
{
  /*
  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
    return;
  }
  */
  dout(7) << "discover_reply " << *m << dendl;
  if (m->is_flag_error_dir())
    dout(7) << " flag error, dir" << dendl;
  if (m->is_flag_error_dn())
    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;

  MDSContext::vec finished, error;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // starting point
  CInode *cur = get_inode(m->get_base_ino());
  auto p = m->trace.cbegin();

  int next = m->starts_with;

  // decrement discover counters
  if (m->get_tid()) {
    // (this inner iterator shadows the trace iterator `p` above)
    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
    if (p != discovers.end()) {
      dout(10) << " found tid " << m->get_tid() << dendl;
      discovers.erase(p);
    } else {
      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
    }
  }

  // discover may start with an inode
  if (!p.end() && next == MDiscoverReply::INODE) {
    decode_replica_inode(cur, p, NULL, finished);
    dout(7) << "discover_reply got base inode " << *cur << dendl;
    ceph_assert(cur->is_base());

    next = MDiscoverReply::DIR;

    // take waiters?
    if (cur->is_base() &&
	waiting_for_base_ino[from].count(cur->ino())) {
      finished.swap(waiting_for_base_ino[from][cur->ino()]);
      waiting_for_base_ino[from].erase(cur->ino());
    }
  }
  ceph_assert(cur);

  // loop over discover results.
  // indexes follow each ([[dir] dentry] inode)
  // can start, end with any type.
  while (!p.end()) {
    // dir
    frag_t fg;
    CDir *curdir = nullptr;
    if (next == MDiscoverReply::DIR) {
      decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
	// the auth corrected our stale base frag; move the waiters over
	ceph_assert(m->get_wanted_base_dir());
	cur->take_dir_waiting(m->get_base_dir_frag(), finished);
      }
    } else {
      // note: this can only happen our first way around this loop.
      if (p.end() && m->is_flag_error_dn()) {
	fg = cur->pick_dirfrag(m->get_error_dentry());
	curdir = cur->get_dirfrag(fg);
      } else
	curdir = cur->get_dirfrag(m->get_base_dir_frag());
    }

    if (p.end())
      break;

    // dentry
    CDentry *dn = nullptr;
    decode_replica_dentry(dn, p, curdir, finished);
    
    if (p.end())
      break;

    // inode
    decode_replica_inode(cur, p, dn, finished);

    next = MDiscoverReply::DIR;
  }

  // dir error?
  // or dir_auth hint?
  if (m->is_flag_error_dir() && !cur->is_dir()) {
    // not a dir.
    cur->take_waiting(CInode::WAIT_DIR, error);
  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
    mds_rank_t who = m->get_dir_auth_hint();
    if (who == mds->get_nodeid()) who = -1;
    if (who >= 0)
      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;


    if (m->get_wanted_base_dir()) {
      frag_t fg = m->get_base_dir_frag();
      CDir *dir = cur->get_dirfrag(fg);

      if (cur->is_waiting_for_dir(fg)) {
	if (cur->is_auth())
	  cur->take_waiting(CInode::WAIT_DIR, finished);
	else if (dir || !cur->dirfragtree.is_leaf(fg))
	  cur->take_dir_waiting(fg, finished);
	else
	  // retry the frag discover against the hinted auth
	  discover_dir_frag(cur, fg, 0, who);
      } else
	dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
    }

    // try again?
    if (m->get_error_dentry().length()) {
      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
      CDir *dir = cur->get_dirfrag(fg);
      // wanted a dentry
      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
	if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
	  dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
				   m->get_wanted_snapid(), finished);
	} else {
	  // retry the dentry discover from the dir's auth
	  filepath relpath(m->get_error_dentry(), 0);
	  discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
	}
      } else
	dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
		<< m->get_error_dentry() << dendl;
    }
  } else if (m->is_flag_error_dn()) {
    // dentry cannot be replicated (e.g. purging): fail the waiters
    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
    CDir *dir = cur->get_dirfrag(fg);
    if (dir) {
      if (dir->is_auth()) {
	dir->take_sub_waiting(finished);
      } else {
	dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
				 m->get_wanted_snapid(), error);
      }
    }
  }

  // waiters
  finish_contexts(g_ceph_context, error, -CEPHFS_ENOENT);  // finish errors directly
  mds->queue_waiters(finished);
}
10568
10569
10570
10571 // ----------------------------
10572 // REPLICAS
10573
10574
/*
 * Encode a dirfrag replica for rank @to: dirfrag id, a fresh replica nonce
 * (registering @to as a replica), then the CDir base state.  Decoded by
 * decode_replica_dir(); the field order is the wire format — do not reorder.
 */
void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  dirfrag_t df = dir->dirfrag();
  encode(df, bl);
  __u32 nonce = dir->add_replica(to);
  encode(nonce, bl);
  dir->_encode_base(bl);
  ENCODE_FINISH(bl);
}
10585
/*
 * Encode a dentry replica for rank @to (registers @to as a replica via the
 * nonce).  v2 added alternate_name at the end.  Field order is the wire
 * format consumed by decode_replica_dentry() — do not reorder.
 */
void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(2, 1, bl);
  encode(dn->get_name(), bl);
  encode(dn->last, bl);

  __u32 nonce = dn->add_replica(to);
  encode(nonce, bl);
  encode(dn->first, bl);
  // remote-link target (0 / unset for null and primary links)
  encode(dn->linkage.remote_ino, bl);
  encode(dn->linkage.remote_d_type, bl);
  dn->lock.encode_state_for_replica(bl);
  // before ACTIVE our lock state may still change; tell the replica
  bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
  encode(need_recover, bl);
  encode(dn->alternate_name, bl);
  ENCODE_FINISH(bl);
}
10603
/*
 * Encode an inode replica for rank @to (registers @to as a replica via the
 * nonce).  v2 appended the inode state word (masked to replicated bits on
 * decode).  Field order is the wire format consumed by
 * decode_replica_inode() — do not reorder.
 */
void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
			      uint64_t features)
{
  ceph_assert(in->is_auth());

  ENCODE_START(2, 1, bl);
  encode(in->ino(), bl);  // bleh, minor assymetry here
  encode(in->last, bl);

  __u32 nonce = in->add_replica(to);
  encode(nonce, bl);

  in->_encode_base(bl, features);
  // second arg: replica must recover lock state if we're not ACTIVE yet
  in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);

  __u32 state = in->state;
  encode(state, bl);

  ENCODE_FINISH(bl);
}
10624
/*
 * Decode a dirfrag replica encoded by encode_replica_dir() into @dir,
 * creating the CDir under @diri if we don't already have it.  Both branches
 * must decode nonce + base in the same order — the wire layout is fixed.
 *
 * @param dir      out: the (existing or newly added) replica CDir
 * @param diri     parent inode the frag belongs to
 * @param from     sender rank (becomes subtree auth for a new boundary frag)
 * @param finished out: waiters on this frag, woken by the caller
 */
void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
			    MDSContext::vec& finished)
{
  DECODE_START(1, p);
  dirfrag_t df;
  decode(df, p);

  ceph_assert(diri->ino() == df.ino);

  // add it (_replica_)
  dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
	      << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }
    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    // is this a dir_auth delegation boundary?
    if (from != diri->authority().first ||
	diri->is_ambiguous_auth() ||
	diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }
  DECODE_FINISH(p);
}
10669
/*
 * Decode a dentry replica encoded by encode_replica_dentry() into @dn,
 * creating a null dentry under @dir if we don't already have it.  All
 * fields are always decoded (wire layout is fixed); remote linkage,
 * need_recover, and alternate_name are only *applied* to newly created
 * dentries — an existing dentry keeps its state, with an assert that the
 * alternate_name agrees.
 *
 * @param dn       out: the (existing or newly added) replica CDentry
 * @param dir      dirfrag the dentry lives in
 * @param finished out: waiters on this dentry, woken by the caller
 */
void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
  DECODE_START(1, p);
  string name;
  snapid_t last;
  decode(name, p);
  decode(last, p);

  dn = dir->lookup(name, last);

  // have it?
  bool is_new = false;
  if (dn) {
    is_new = false;
    dout(7) << __func__ << " had " << *dn << dendl;
  } else {
    is_new = true;
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dout(7) << __func__ << " added " << *dn << dendl;
  }

  __u32 nonce;
  decode(nonce, p);
  dn->set_replica_nonce(nonce);
  decode(dn->first, p);

  inodeno_t rino;
  unsigned char rdtype;
  decode(rino, p);
  decode(rdtype, p);
  dn->lock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);

  mempool::mds_co::string alternate_name;
  if (struct_v >= 2) {
    decode(alternate_name, p);
  }

  if (is_new) {
    dn->set_alternate_name(std::move(alternate_name));
    if (rino)
      dir->link_remote_inode(dn, rino, rdtype);
    if (need_recover)
      dn->lock.mark_need_recover();
  } else {
    ceph_assert(dn->alternate_name == alternate_name);
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);
  DECODE_FINISH(p);
}
10723
/*
 * Decode an inode replica encoded by encode_replica_inode() into @in,
 * creating and registering a new non-auth CInode if we don't have one
 * (linking it under @dn when @dn is a null dentry).  v2 appended the state
 * word; only the replicated bits are honored.
 *
 * @param in       out: the (existing or newly created) replica CInode
 * @param dn       dentry to link a new inode under, or NULL for a base inode
 * @param finished out: waiters collected while decoding, woken by the caller
 */
void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
  DECODE_START(2, p);
  inodeno_t ino;
  snapid_t last;
  __u32 nonce;
  decode(ino, p);
  decode(last, p);
  decode(nonce, p);
  in = get_inode(ino, last);
  if (!in) {
    // new non-auth replica; ctor args presumably (auth=false, first=2,
    // last) — TODO confirm against the CInode constructor
    in = new CInode(this, false, 2, last);
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, true);
    add_inode(in);
    // base inodes have well-known auth ranks
    if (in->ino() == CEPH_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << __func__ << " added " << *in << dendl;
    if (dn) {
      ceph_assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, false);
    dout(10) << __func__ << " had " << *in << dendl;
  }

  if (dn) {
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
  }

  if (struct_v >= 2) {
    __u32 s;
    decode(s, p);
    s &= CInode::MASK_STATE_REPLICATED;
    if (s & CInode::STATE_RANDEPHEMERALPIN) {
      dout(10) << "replica inode is random ephemeral pinned" << dendl;
      in->set_ephemeral_pin(false, true);
    }
  }

  DECODE_FINISH(p);
}
10773
10774
/*
 * Encode the full replica trace needed for rank @who to instantiate
 * @straydn: our mdsdir inode, the mdsdir dirfrag, the stray dir's dentry
 * and inode, the stray dirfrag, and finally the stray dentry itself.
 * decode_replica_stray() consumes these in exactly this order.
 */
void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  ceph_assert(straydn->get_num_auth_pins());
  ENCODE_START(1, 1, bl);
  uint64_t features = mds->mdsmap->get_up_features();
  encode_replica_inode(get_myin(), who, bl, features);
  encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
  encode_replica_dir(straydn->get_dir(), who, bl);
  encode_replica_dentry(straydn, who, bl);
  ENCODE_FINISH(bl);
}
10788
// Decode the stray-dentry bundle written by encode_replica_stray(),
// replicating each object along the path in the exact order it was
// encoded, and queue any waiters that became runnable.
void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
{
  MDSContext::vec finished;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  // sender's mdsdir inode
  CInode *mdsin = nullptr;
  decode_replica_inode(mdsin, p, NULL, finished);
  // dirfrag containing the stray directory
  CDir *mdsdir = nullptr;
  decode_replica_dir(mdsdir, p, mdsin, from, finished);
  // the stray directory's dentry and inode
  CDentry *straydirdn = nullptr;
  decode_replica_dentry(straydirdn, p, mdsdir, finished);
  CInode *strayin = nullptr;
  decode_replica_inode(strayin, p, straydirdn, finished);
  // the stray dirfrag itself
  CDir *straydir = nullptr;
  decode_replica_dir(straydir, p, strayin, from, finished);

  // finally, the stray dentry (returned to the caller)
  decode_replica_dentry(straydn, p, straydir, finished);
  if (!finished.empty())
    mds->queue_waiters(finished);
  DECODE_FINISH(p);
}
10811
10812
10813 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10814 {
10815 // this is an FYI, re: replication
10816
10817 set<mds_rank_t> who;
10818 if (bcast) {
10819 set<mds_rank_t> mds_set;
10820 mds->get_mds_map()->get_active_mds_set(mds_set);
10821
10822 set<mds_rank_t> replica_set;
10823 for (const auto &p : dir->get_replicas()) {
10824 replica_set.insert(p.first);
10825 }
10826
10827 std::set_difference(mds_set.begin(), mds_set.end(),
10828 replica_set.begin(), replica_set.end(),
10829 std::inserter(who, who.end()));
10830 } else {
10831 for (const auto &p : dir->get_replicas()) {
10832 who.insert(p.first);
10833 }
10834 }
10835
10836 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10837
10838 filepath path;
10839 dir->inode->make_path(path);
10840
10841 std::set<int32_t> dir_rep_set;
10842 for (const auto &r : dir->dir_rep_by) {
10843 dir_rep_set.insert(r);
10844 }
10845
10846 mds_rank_t whoami = mds->get_nodeid();
10847 for (set<mds_rank_t>::iterator it = who.begin();
10848 it != who.end();
10849 ++it) {
10850 if (*it == whoami) continue;
10851 //if (*it == except) continue;
10852 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10853
10854 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, dir_rep_set, path, bcast), *it);
10855 }
10856
10857 return 0;
10858 }
10859
// Handle a dir_update from another rank: refresh our replica's
// dir_rep/dir_rep_by hints, optionally discovering the dirfrag first.
void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
{
  dirfrag_t df = m->get_dirfrag();
  CDir *dir = get_dirfrag(df);
  if (!dir) {
    dout(5) << "dir_update on " << df << ", don't have it" << dendl;

    // discover it?
    if (m->should_discover()) {
      // only try once!
      // this is key to avoid a fragtree update race, among other things.
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
      if (r > 0)
        // traversal is in flight; the retry factory will redeliver m
        return;
      if (r == 0 &&
	  in->ino() == df.ino &&
	  in->get_approx_dirfrag(df.frag) == NULL) {
	// we found the inode but not the dirfrag; open it remotely,
	// then retry this message
	open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
	return;
      }
    }

    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }
}
10902
10903
10904
10905
10906
10907 // LINK
10908
10909 void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10910 {
10911 ENCODE_START(1, 1, bl);
10912 inodeno_t ino = dnl->get_remote_ino();
10913 encode(ino, bl);
10914 __u8 d_type = dnl->get_remote_d_type();
10915 encode(d_type, bl);
10916 ENCODE_FINISH(bl);
10917 }
10918
10919 void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10920 {
10921 DECODE_START(1, p);
10922 inodeno_t ino;
10923 __u8 d_type;
10924 decode(ino, p);
10925 decode(d_type, p);
10926 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
10927 dir->link_remote_inode(dn, ino, d_type);
10928 DECODE_FINISH(p);
10929 }
10930
10931 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10932 {
10933 dout(7) << __func__ << " " << *dn << dendl;
10934
10935 CDir *subtree = get_subtree_root(dn->get_dir());
10936 for (const auto &p : dn->get_replicas()) {
10937 // don't tell (rename) witnesses; they already know
10938 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10939 continue;
10940 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10941 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10942 rejoin_gather.count(p.first)))
10943 continue;
10944 CDentry::linkage_t *dnl = dn->get_linkage();
10945 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
10946 if (dnl->is_primary()) {
10947 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
10948 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
10949 mds->mdsmap->get_up_features());
10950 } else if (dnl->is_remote()) {
10951 encode_remote_dentry_link(dnl, m->bl);
10952 } else
10953 ceph_abort(); // aie, bad caller!
10954 mds->send_message_mds(m, p.first);
10955 }
10956 }
10957
// Replica-side handling of a dentry-link broadcast from the auth MDS:
// attach the (primary or remote) inode to our copy of the dentry.
void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
{
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // this should only arrive for a replica (non-auth) null dentry
      ceph_assert(!dn->is_auth());
      ceph_assert(dnl->is_null());
    }
  }

  auto p = m->bl.cbegin();
  MDSContext::vec finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      CInode *in = nullptr;
      decode_replica_inode(in, p, dn, finished);
    } else {
      // remote link, easy enough.
      decode_remote_dentry_link(dir, dn, p);
    }
  } else {
    // dir/dentry lookup failed above; we shouldn't ever get here
    ceph_abort();
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  return;
}
10997
10998
10999 // UNLINK
11000
11001 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11002 {
11003 dout(10) << __func__ << " " << *dn << dendl;
11004 // share unlink news with replicas
11005 set<mds_rank_t> replicas;
11006 dn->list_replicas(replicas);
11007 bufferlist snapbl;
11008 if (straydn) {
11009 straydn->list_replicas(replicas);
11010 CInode *strayin = straydn->get_linkage()->get_inode();
11011 strayin->encode_snap_blob(snapbl);
11012 }
11013 for (set<mds_rank_t>::iterator it = replicas.begin();
11014 it != replicas.end();
11015 ++it) {
11016 // don't tell (rmdir) witnesses; they already know
11017 if (mdr.get() && mdr->more()->witnessed.count(*it))
11018 continue;
11019
11020 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11021 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11022 rejoin_gather.count(*it)))
11023 continue;
11024
11025 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11026 if (straydn) {
11027 encode_replica_stray(straydn, *it, unlink->straybl);
11028 unlink->snapbl = snapbl;
11029 }
11030 mds->send_message_mds(unlink, *it);
11031 }
11032 }
11033
// Replica-side handling of a dentry-unlink broadcast: detach the inode
// from our copy of the dentry, moving a primary inode to the bundled
// stray dentry.
void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
{
  // straydn
  CDentry *straydn = nullptr;
  if (m->straybl.length())
    decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
	// primary link: relink the inode under the stray dentry
	CInode *in = dnl->get_inode();
	dn->dir->unlink_inode(dn);
	ceph_assert(straydn);
	straydn->dir->link_primary_inode(straydn, in);

	// in->first is lazily updated on replica; drag it forward so
	// that we always keep it in sync with the dnq
	ceph_assert(straydn->first >= in->first);
	in->first = straydn->first;

	// update subtree map?
	if (in->is_dir())
	  adjust_subtree_after_rename(in, dir, false);

	if (m->snapbl.length()) {
	  // unlink may have opened a snaprealm on the inode
	  bool hadrealm = (in->snaprealm ? true : false);
	  in->decode_snap_blob(m->snapbl);
	  ceph_assert(in->snaprealm);
	  if (!hadrealm)
	    do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
	}

	// send caps to auth (if we're not already)
	if (in->is_any_caps() &&
	    !in->state_test(CInode::STATE_EXPORTINGCAPS))
	  migrator->export_caps(in);

	straydn = NULL;  // consumed; don't trim it below
      } else {
	ceph_assert(!straydn);
	ceph_assert(dnl->is_remote());
	dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  if (straydn) {
    // the stray dentry went unused (no primary inode was moved into it);
    // drop the replica we just created
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
}
11100
11101
11102
11103
11104
11105
11106 // ===================================================================
11107
11108
11109
11110 // ===================================================================
11111 // FRAGMENT
11112
11113
11114 /**
11115 * adjust_dir_fragments -- adjust fragmentation for a directory
11116 *
11117 * @param diri directory inode
11118 * @param basefrag base fragment
11119 * @param bits bit adjustment. positive for split, negative for merge.
11120 */
11121 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11122 std::vector<CDir*>* resultfrags,
11123 MDSContext::vec& waiters,
11124 bool replay)
11125 {
11126 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11127 << " on " << *diri << dendl;
11128
11129 auto&& p = diri->get_dirfrags_under(basefrag);
11130
11131 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11132 }
11133
// Materialize dirfrag fg of diri by refragmenting the open dirfrags:
// first try splitting an open ancestor fragment down to fg; failing that,
// merge the open fragments underneath fg.  Returns nullptr if no relevant
// dirfrags are open.
CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  std::vector<CDir*> src, result;
  MDSContext::vec waiters;

  // split a parent?
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
	dout(10) << "force_dir_fragment result " << *dir << dendl;
	break;
      }
    }
    if (parent == frag_t())
      break;  // walked all the way up to the root fragment; give up on splitting
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
    {
      auto&& p = diri->get_dirfrags_under(fg);
      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
    }
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      // merge everything under fg into a single dirfrag (bits == 0)
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}
11186
// Core refragmentation: split srcfrags (bits > 0) or merge them into
// basefrag (bits <= 0), updating diri's fragtree and keeping the subtree
// map consistent across the change.
void MDCache::adjust_dir_fragments(CInode *diri,
				   const std::vector<CDir*>& srcfrags,
				   frag_t basefrag, int bits,
				   std::vector<CDir*>* resultfrags,
				   MDSContext::vec& waiters,
				   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
	   << " srcfrags " << srcfrags
	   << " on " << *diri << dendl;

  // adjust fragtree
  // yuck. we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (const auto& dir : *resultfrags) {
	subtrees[dir].clear();   // new frag is now its own subtree
      }

      // was i a bound?
      if (parent_subtree) {
	// replace the old frag with the new frags in the parent's bounds
	ceph_assert(subtrees[parent_subtree].count(dir));
	subtrees[parent_subtree].erase(dir);
	for (const auto& dir : *resultfrags) {
	  ceph_assert(dir->is_subtree_root());
	  subtrees[parent_subtree].insert(dir);
	}
      }

      // adjust my bounds.
      // each old bound re-attaches to whichever new frag is now its root
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
	   p != bounds.end();
	   ++p) {
	CDir *frag = get_subtree_root((*p)->get_parent_dir());
	subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees? if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
	any_subtree = true;
      else
	any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (const auto& dir : srcfrags) {
	// this simplifies the code that find subtrees underneath the dirfrag
	if (!dir->is_subtree_root()) {
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	  adjust_subtree_auth(dir, mds->get_nodeid());
	}
      }

      // collect all the source frags' bounds; they become the bounds of
      // the merged frag
      for (const auto& dir : srcfrags) {
	ceph_assert(dir->is_subtree_root());
	dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
	map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
	set<CDir*>::iterator r = q->second.begin();
	while (r != subtrees[dir].end()) {
	  new_bounds.insert(*r);
	  subtrees[dir].erase(r++);
	}
	subtrees.erase(q);

	// remove myself as my parent's bound
	if (parent_subtree)
	  subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      // the merged frag inherits the collected bounds and becomes a bound
      // of the parent subtree
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
	subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags->push_back(f);
  }
}
11314
11315
11316 class C_MDC_FragmentFrozen : public MDSInternalContext {
11317 MDCache *mdcache;
11318 MDRequestRef mdr;
11319 public:
11320 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11321 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11322 void finish(int r) override {
11323 mdcache->fragment_frozen(mdr, r);
11324 }
11325 };
11326
11327 bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
11328 {
11329 if (is_readonly()) {
11330 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11331 return false;
11332 }
11333 if (mds->is_cluster_degraded()) {
11334 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11335 return false;
11336 }
11337 if (diri->get_parent_dir() &&
11338 diri->get_parent_dir()->get_inode()->is_stray()) {
11339 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11340 return false;
11341 }
11342 if (diri->is_mdsdir() || diri->ino() == CEPH_INO_CEPH) {
11343 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl;
11344 return false;
11345 }
11346
11347 for (const auto& dir : dirs) {
11348 if (dir->scrub_is_in_progress()) {
11349 dout(7) << "can_fragment: scrub in progress " << *dir << dendl;
11350 return false;
11351 }
11352
11353 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11354 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11355 return false;
11356 }
11357 if (!dir->is_auth()) {
11358 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11359 return false;
11360 }
11361 if (dir->is_bad()) {
11362 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11363 return false;
11364 }
11365 if (dir->is_frozen() ||
11366 dir->is_freezing()) {
11367 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11368 return false;
11369 }
11370 }
11371
11372 return true;
11373 }
11374
// Kick off an asynchronous split of dir into 2^bits child fragments:
// register a fragment_info_t, freeze the dirfrag, and start the
// mark+complete pass.  Silently drops out if preconditions fail.
void MDCache::split_dir(CDir *dir, int bits)
{
  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
  ceph_assert(dir->is_auth());
  CInode *diri = dir->inode;

  std::vector<CDir*> dirs;
  dirs.push_back(dir);

  if (!can_fragment(diri, dirs)) {
    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
    return;
  }

  // cap the total fragment depth at 24 bits
  // (presumably the frag_t representation limit — confirm against frag_t)
  if (dir->frag.bits() + bits > 24) {
    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
    return;
  }

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = dir->dirfrag();

  // register the pending fragment op
  ceph_assert(fragments.count(dir->dirfrag()) == 0);
  fragment_info_t& info = fragments[dir->dirfrag()];
  info.mdr = mdr;
  info.dirs.push_back(dir);
  info.bits = bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11408
// Kick off merging all open dirfrags under `frag` back into a single
// dirfrag.  Requires every frag under `frag` to be open in cache.
void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
  if (!all) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  // register the pending fragment op; negative bits means merge
  ceph_assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11446
// Freeze every dirfrag participating in a fragment op.  Each dir gets an
// auth pin (released later by fragment_mark_and_complete) plus the
// FRAGMENTING state.  If the set mixes subtree roots and non-roots, the
// non-roots are promoted to auxiliary subtree roots so the whole set is
// uniform (adjust_dir_fragments asserts all-or-none).
void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
{
  bool any_subtree = false, any_non_subtree = false;
  for (const auto& dir : dirs) {
    dir->auth_pin(dir); // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    ceph_assert(dir->is_freezing_dir());

    if (dir->is_subtree_root())
      any_subtree = true;
    else
      any_non_subtree = true;
  }

  if (any_subtree && any_non_subtree) {
    // either all dirfrags are subtree roots or all are not.
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root()) {
	ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
      } else {
	dir->state_set(CDir::STATE_AUXSUBTREE);
	adjust_subtree_auth(dir, mds->get_nodeid());
      }
    }
  }
}
11474
11475 class C_MDC_FragmentMarking : public MDCacheContext {
11476 MDRequestRef mdr;
11477 public:
11478 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11479 void finish(int r) override {
11480 mdcache->fragment_mark_and_complete(mdr);
11481 }
11482 };
11483
// Second phase of a fragment op: make every source dirfrag complete in
// memory, pin all its dentries (so they can't be trimmed mid-fragment),
// then wait for the freezes to land before proceeding to fragment_frozen.
// Re-entered via C_MDC_FragmentMarking until all dirs are ready.
void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    // the op was cancelled while we were waiting
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  MDSGatherBuilder gather(g_ceph_context);

  for (const auto& dir : info.dirs) {
    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
	dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
	dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
	ready = false;
      } else if (dir->is_new()) {
	dout(15) << " committing new " << *dir << dendl;
	ceph_assert(dir->is_dirty());
	dir->commit(0, gather.new_sub(), true);
	ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      // pin every dentry for the duration of the fragment op
      dout(15) << " marking " << *dir << dendl;
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	dn->get(CDentry::PIN_FRAGMENTING);
	ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
	dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      dir->auth_unpin(dir);  // release the pin taken in fragment_freeze_dirs()
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  if (gather.has_subs()) {
    // some dirs still need fetch/journal/commit; retry when done
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  // all dirs marked and complete; now wait for the freezes to finish
  for (const auto& dir : info.dirs) {
    if (!dir->is_frozen_dir()) {
      ceph_assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  // already frozen; proceed immediately
  fragment_frozen(mdr, 0);
}
11560
// Undo fragment_freeze_dirs()/fragment_mark_and_complete(): clear the
// FRAGMENTING state, release the dentry pins (or, if the dir was never
// marked, the auth pin still held from the freeze), and unfreeze each dir.
void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
{
  dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
  for (const auto& dir : dirs) {
    dout(10) << " frag " << *dir << dendl;

    ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
    dir->state_clear(CDir::STATE_FRAGMENTING);

    if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dir->state_clear(CDir::STATE_DNPINNEDFRAG);

      // drop the per-dentry pins taken in fragment_mark_and_complete()
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
	dn->state_clear(CDentry::STATE_FRAGMENTING);
	dn->put(CDentry::PIN_FRAGMENTING);
      }
    } else {
      // never marked: still holds the auth pin from fragment_freeze_dirs()
      dir->auth_unpin(dir);
    }

    dir->unfreeze_dir();
  }
}
11586
11587 bool MDCache::fragment_are_all_frozen(CDir *dir)
11588 {
11589 ceph_assert(dir->is_frozen_dir());
11590 map<dirfrag_t,fragment_info_t>::iterator p;
11591 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11592 p != fragments.end() && p->first.ino == dir->ino();
11593 ++p) {
11594 if (p->first.frag.contains(dir->get_frag()))
11595 return p->second.all_frozen;
11596 }
11597 ceph_abort();
11598 return false;
11599 }
11600
11601 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11602 {
11603 map<dirfrag_t,fragment_info_t>::iterator p;
11604 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11605 p != fragments.end() && p->first.ino == dir->ino();
11606 ++p) {
11607 if (p->first.frag.contains(dir->get_frag())) {
11608 p->second.num_remote_waiters++;
11609 return;
11610 }
11611 }
11612 ceph_abort();
11613 }
11614
// Periodic check: cancel fragment ops whose freeze has made no auth-pin
// progress past mds_freeze_tree_timeout while something (a remote waiter,
// or a freezing parent) is blocked on them.
void MDCache::find_stale_fragment_freeze()
{
  dout(10) << "find_stale_fragment_freeze" << dendl;
  // see comment in Migrator::find_stale_export_freeze()
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf()->mds_freeze_tree_timeout;

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    ++p;  // advance now; the current entry may be erased below
    if (info.all_frozen)
      continue;
    CDir *dir;
    int total_auth_pins = 0;
    for (const auto& d : info.dirs) {
      dir = d;
      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
	// not yet marked; pin counts aren't meaningful, skip this op
	total_auth_pins = -1;
	break;
      }
      if (dir->is_frozen_dir())
	continue;
      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
    }
    if (total_auth_pins < 0)
      continue;
    if (info.last_cum_auth_pins != total_auth_pins) {
      // pin count still changing: restart the staleness clock
      info.last_cum_auth_pins = total_auth_pins;
      info.last_cum_auth_pins_change = now;
      continue;
    }
    if (info.last_cum_auth_pins_change >= cutoff)
      continue;
    dir = info.dirs.front();
    if (info.num_remote_waiters > 0 ||
	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      // stuck and blocking someone: cancel the op and unfreeze everything
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      std::vector<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}
11662
11663 class C_MDC_FragmentPrep : public MDCacheLogContext {
11664 MDRequestRef mdr;
11665 public:
11666 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11667 void finish(int r) override {
11668 mdcache->_fragment_logged(mdr);
11669 }
11670 };
11671
11672 class C_MDC_FragmentStore : public MDCacheContext {
11673 MDRequestRef mdr;
11674 public:
11675 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11676 void finish(int r) override {
11677 mdcache->_fragment_stored(mdr);
11678 }
11679 };
11680
11681 class C_MDC_FragmentCommit : public MDCacheLogContext {
11682 dirfrag_t basedirfrag;
11683 MDRequestRef mdr;
11684 public:
11685 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11686 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
11687 void finish(int r) override {
11688 mdcache->_fragment_committed(basedirfrag, mdr);
11689 }
11690 };
11691
11692 class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
11693 dirfrag_t basedirfrag;
11694 int bits;
11695 MDRequestRef mdr;
11696 public:
11697 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11698 const MDRequestRef& r) :
11699 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
11700 void finish(int r) override {
11701 ceph_assert(r == 0 || r == -CEPHFS_ENOENT);
11702 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
11703 }
11704 void print(ostream& out) const override {
11705 out << "fragment_purge_old(" << basedirfrag << ")";
11706 }
11707 };
11708
11709 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11710 {
11711 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11712 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11713 if (it == fragments.end() || it->second.mdr != mdr) {
11714 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11715 request_finish(mdr);
11716 return;
11717 }
11718
11719 ceph_assert(r == 0);
11720 fragment_info_t& info = it->second;
11721 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11722 << " on " << info.dirs.front()->get_inode() << dendl;
11723
11724 info.all_frozen = true;
11725 dispatch_fragment_dir(mdr);
11726 }
11727
// Third phase of a fragment op: take the dirfragtree/nest/file locks on
// the dir inode, perform the in-memory refragmentation, and journal an
// EFragment PREPARE carrying rollback state.  On lock failure the op is
// unwound and the split/merge requeued with the balancer.
void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    // the op was cancelled while we were freezing
    dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();

  dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
	   << " on " << *diri << dendl;

  if (mdr->more()->peer_error)
    mdr->aborted = true;

  if (!mdr->aborted) {
    MutationImpl::LockOpVec lov;
    lov.add_wrlock(&diri->dirfragtreelock);
    // prevent a racing gather on any other scatterlocks too
    lov.lock_scatter_gather(&diri->nestlock);
    lov.lock_scatter_gather(&diri->filelock);
    if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
      if (!mdr->aborted)
	return;
    }
  }

  if (mdr->aborted) {
    // couldn't get the locks: unwind and ask the balancer to retry later
    dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
	     << info.dirs.front()->dirfrag() << dendl;
    if (info.bits > 0)
      mds->balancer->queue_split(info.dirs.front(), false);
    else
      mds->balancer->queue_merge(info.dirs.front());
    fragment_unmark_unfreeze_dirs(info.dirs);
    fragments.erase(it);
    request_finish(mdr);
    return;
  }

  mdr->ls = mds->mdlog->get_current_segment();
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
  mds->mdlog->start_entry(le);

  // record rollback state (the fnode) for each original fragment
  for (const auto& dir : info.dirs) {
    dirfrag_rollback rollback;
    rollback.fnode = dir->fnode;
    le->add_orig_frag(dir->get_frag(), &rollback);
  }

  // refragment
  MDSContext::vec waiters;
  adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
		       &info.resultfrags, waiters, false);
  if (g_conf()->mds_debug_frag)
    diri->verify_dirfrags();
  mds->queue_waiters(waiters);

  // every original frag must be gone from the fragtree now
  for (const auto& fg : le->orig_frags)
    ceph_assert(!diri->dirfragtree.is_leaf(fg));

  le->metablob.add_dir_context(info.resultfrags.front());
  for (const auto& dir : info.resultfrags) {
    if (diri->is_auth()) {
      le->metablob.add_fragmented_dir(dir, false, false);
    } else {
      // not auth for the inode: flag dirty-dft state on the frag itself
      dir->state_set(CDir::STATE_DIRTYDFT);
      le->metablob.add_fragmented_dir(dir, false, true);
    }
  }

  // dft lock
  if (diri->is_auth()) {
    // journal dirfragtree
    auto pi = diri->project_inode(mdr);
    pi.inode->version = diri->pre_dirty();
    predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
    journal_dirty_inode(mdr.get(), &le->metablob, diri);
  } else {
    // non-auth: mark the dirfragtree scatterlock updated so the change
    // propagates back to the auth
    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    mdr->add_updated_lock(&diri->dirfragtreelock);
  }

  /*
  // filelock
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mut->add_updated_lock(&diri->filelock);

  // dirlock
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
  mut->add_updated_lock(&diri->nestlock);
  */

  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
                                  mdr, __func__);
  mds->mdlog->flush();
}
11833
/*
 * Journal callback for the EFragment OP_PREPARE event: apply the mutation
 * and persist the resulting dirfrags to the metadata pool.
 * _fragment_stored() runs once every resulting frag has been committed.
 */
void MDCache::_fragment_logged(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  auto& info = fragments.at(basedirfrag);
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("prepare logged");

  mdr->apply();  // mark scatterlock

  // store resulting frags
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (const auto& dir : info.resultfrags) {
    dout(10) << " storing result frag " << *dir << dendl;

    // new frags are dirty and brand new in this log segment
    dir->mark_dirty(mdr->ls);
    dir->mark_new(mdr->ls);

    // freeze and store them too
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true); // ignore authpinnability
  }

  gather.activate();
}
11863
/*
 * All resulting dirfrags are now durable in the metadata pool: notify
 * replicas, journal the EFragment OP_COMMIT, and unfreeze the new frags.
 * Locks are dropped immediately unless we must wait for notify acks from
 * replicas (subtree-root case, see the race comment below).
 */
void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  fragment_info_t &info = fragments.at(basedirfrag);
  CDir *first = info.resultfrags.front();
  CInode *diri = first->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("new frags stored");

  // tell peers
  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
                          diri->authority().first : CDIR_AUTH_UNKNOWN;
  for (const auto &p : first->get_replicas()) {
    // skip replicas that have not reached rejoin (they will learn the new
    // layout during rejoin instead)
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;

    auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
        diri_auth != p.first) { // not auth mds of diri
      /*
       * In the normal case, mds does not trim dir inode whose child dirfrags
       * are likely being fragmented (see trim_inode()). But when fragmenting
       * subtree roots, following race can happen:
       *
       * - mds.a (auth mds of dirfrag) sends fragment_notify message to
       *   mds.c and drops wrlock on dirfragtreelock.
       * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
       *   SYNC and sends lock message to mds.c
       * - mds.c receives the lock message and changes dirfragtreelock state
       *   to SYNC
       * - mds.c trim dirfrag and dir inode from its cache
       * - mds.c receives the fragment_notify message
       *
       * So we need to ensure replicas have received the notify, then unlock
       * the dirfragtreelock.
       */
      notify->mark_ack_wanted();
      info.notify_ack_waiting.insert(p.first);
    }

    // freshly replicate new dirs to peers
    for (const auto& dir : info.resultfrags) {
      encode_replica_dir(dir, p.first, notify->basebl);
    }

    mds->send_message_mds(notify, p.first);
  }

  // journal commit
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));


  // unfreeze resulting frags
  for (const auto& dir : info.resultfrags) {
    dout(10) << " result frag " << *dir << dendl;

    // release the per-dentry pins taken when the split/merge started
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
      dn->state_clear(CDentry::STATE_FRAGMENTING);
      dn->put(CDentry::PIN_FRAGMENTING);
    }

    // unfreeze
    dir->unfreeze_dir();
  }

  if (info.notify_ack_waiting.empty()) {
    // nobody to wait for: release all locks and finish the request now
    fragment_drop_locks(info);
  } else {
    // keep dirfragtreelock held until acks arrive (see race above), but
    // drop whatever is safe to release after unfreezing
    mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
  }
}
11942
/*
 * The EFragment OP_COMMIT is durable: delete the pre-fragment dirfrag
 * objects from the metadata pool.  `mdr` is null when this is driven by
 * journal replay (rollback_uncommitted_fragments) rather than a live
 * request.  _fragment_old_purged() runs when every object op completes.
 */
void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
{
  dout(10) << "fragment_committed " << basedirfrag << dendl;
  if (mdr)
    mdr->mark_event("commit logged");

  ufragment &uf = uncommitted_fragments.at(basedirfrag);

  // remove old frags
  C_GatherBuilder gather(
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->get_metadata_pool());
  for (const auto& fg : uf.old_frags) {
    object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
    ObjectOperation op;
    if (fg == frag_t()) {
      // backtrace object: keep the object (it holds the backtrace) but
      // drop its data and omap entries
      dout(10) << " truncate orphan dirfrag " << oid << dendl;
      op.truncate(0);
      op.omap_clear();
    } else {
      dout(10) << " removing orphan dirfrag " << oid << dendl;
      op.remove();
    }
    mds->objecter->mutate(oid, oloc, op, nullsnapc,
                          ceph::real_clock::now(),
                          0, gather.new_sub());
  }

  ceph_assert(gather.has_subs());
  gather.activate();
}
11980
11981 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
11982 {
11983 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
11984 if (mdr)
11985 mdr->mark_event("old frags purged");
11986
11987 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
11988 mds->mdlog->start_submit_entry(le);
11989
11990 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
11991
11992 if (mds->logger) {
11993 if (bits > 0) {
11994 mds->logger->inc(l_mds_dir_split);
11995 } else {
11996 mds->logger->inc(l_mds_dir_merge);
11997 }
11998 }
11999
12000 if (mdr) {
12001 auto it = fragments.find(basedirfrag);
12002 ceph_assert(it != fragments.end());
12003 it->second.finishing = true;
12004 if (it->second.notify_ack_waiting.empty())
12005 fragment_maybe_finish(it);
12006 else
12007 mdr->mark_event("wating for notify acks");
12008 }
12009 }
12010
12011 void MDCache::fragment_drop_locks(fragment_info_t& info)
12012 {
12013 mds->locker->drop_locks(info.mdr.get());
12014 request_finish(info.mdr);
12015 //info.mdr.reset();
12016 }
12017
/*
 * Final cleanup of a fragment op: only proceeds once the OP_FINISH has
 * been journaled (it->second.finishing).  Clears the FRAGMENTING state,
 * drops the auth pins taken in _fragment_logged(), and removes the
 * fragment_info.  Callers invoke this both from _fragment_old_purged()
 * and from the last notify-ack, whichever comes second.
 */
void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
{
  if (!it->second.finishing)
    return;

  // unmark & auth_unpin
  for (const auto &dir : it->second.resultfrags) {
    dir->state_clear(CDir::STATE_FRAGMENTING);
    dir->auth_unpin(this);

    // In case the resulting fragments are beyond the split size,
    // we might need to split them again right away (they could
    // have been taking inserts between unfreezing and getting
    // here)
    mds->balancer->maybe_fragment(dir, false);
  }

  fragments.erase(it);
}
12037
12038
/*
 * Handle a replica's MMDSFragmentNotifyAck.  Once the last expected ack
 * arrives it is finally safe to drop the dirfragtreelock (see the race
 * description in _fragment_stored) and let the op finish.
 */
void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
{
  dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    return;
  }

  // the tid ties the ack to a specific fragment attempt; a stale or
  // already-finished op just drops the message
  auto it = fragments.find(ack->get_base_dirfrag());
  if (it == fragments.end() ||
      it->second.get_tid() != ack->get_tid()) {
    dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
    return;
  }

  if (it->second.notify_ack_waiting.erase(from) &&
      it->second.notify_ack_waiting.empty()) {
    fragment_drop_locks(it->second);
    fragment_maybe_finish(it);
  }
}
12061
/*
 * Replica-side handler for MMDSFragmentNotify: mirror the auth mds's
 * dirfrag split/merge in our cache and decode the freshly replicated
 * dirfrags carried in the message.  Sends an ack back when the auth mds
 * requested one (subtree-root case).
 */
void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
{
  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
  mds_rank_t from = mds_rank_t(notify->get_source().num());

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  CInode *diri = get_inode(notify->get_ino());
  if (diri) {
    frag_t base = notify->get_basefrag();
    int bits = notify->get_bits();

    /*
    if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
        (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
      dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
               << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
      return;
    }
    */

    // refragment
    MDSContext::vec waiters;
    std::vector<CDir*> resultfrags;
    adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    // collect waiters parked on the (previously missing) result frags
    for (const auto& dir : resultfrags) {
      diri->take_dir_waiting(dir->get_frag(), waiters);
    }

    // add new replica dirs values
    auto p = notify->basebl.cbegin();
    while (!p.end()) {
      CDir *tmp_dir = nullptr;
      decode_replica_dir(tmp_dir, p, diri, from, waiters);
    }

    mds->queue_waiters(waiters);
  } else {
    // notifies are only sent to mds that replicate the dirfrag, so the
    // base inode is expected to be in cache
    ceph_abort();
  }

  if (notify->is_ack_wanted()) {
    auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
                                                   notify->get_bits(), notify->get_tid());
    mds->send_message_mds(ack, from);
  }
}
12114
12115 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12116 LogSegment *ls, bufferlist *rollback)
12117 {
12118 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12119 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12120 ufragment& uf = uncommitted_fragments[basedirfrag];
12121 uf.old_frags = old_frags;
12122 uf.bits = bits;
12123 uf.ls = ls;
12124 ls->uncommitted_fragments.insert(basedirfrag);
12125 if (rollback)
12126 uf.rollback.swap(*rollback);
12127 }
12128
/*
 * Advance an uncommitted fragment record after a journal event.
 * For a non-FINISH op that still has old frags to purge we only mark the
 * record committed (the old dirfrag objects still need deleting); a
 * FINISH op (or a record with nothing left to purge) fully retires the
 * record and wakes anyone waiting on it.
 */
void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
{
  dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
           << " op " << EFragment::op_name(op) << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      mds->queue_waiters(uf.waiters);
      uncommitted_fragments.erase(it);
    }
  }
}
12145
/*
 * Handle an EFragment OP_ROLLBACK replay for one record: replace the
 * record's old_frags with the rolled-back set and mark it committed so the
 * stale objects still get purged; if there was nothing to purge the record
 * is dropped outright.
 */
void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
{
  dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
           << " old_frags (" << old_frags << ")" << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (!uf.old_frags.empty()) {
      uf.old_frags = std::move(old_frags);
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      uncommitted_fragments.erase(it);
    }
  }
}
12162
12163 void MDCache::wait_for_uncommitted_fragments(MDSContext* finisher)
12164 {
12165 MDSGatherBuilder gather(g_ceph_context, finisher);
12166 for (auto& p : uncommitted_fragments) {
12167 p.second.waiters.push_back(gather.new_sub());
12168 }
12169 gather.activate();
12170 }
12171
// Journal completion for an EFragment OP_ROLLBACK: applies the rollback
// mutation, releases its locks, and cleans the mutation up.
struct C_MDC_FragmentRollback : public MDCacheLogContext {
  MutationRef mut;
  C_MDC_FragmentRollback(MDCache *c, MutationRef& m) :
    MDCacheLogContext(c), mut(m) {}
  void finish(int r) override {
    mut->apply();
    get_mds()->locker->drop_locks(mut.get());
    mut->cleanup();
  }
};
12182
/*
 * Called during recovery: walk every uncommitted fragment record and
 * either finish it (already committed — just purge the stale objects) or
 * undo the refragment in cache, restoring the recorded pre-fragment
 * fnodes, and journal an EFragment OP_ROLLBACK.
 */
void MDCache::rollback_uncommitted_fragments()
{
  dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
  for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
       p != uncommitted_fragments.end();
       ++p) {
    ufragment &uf = p->second;
    CInode *diri = get_inode(p->first.ino);
    ceph_assert(diri);

    if (uf.committed) {
      // op already committed before the failure: only the old dirfrag
      // objects remain to be purged
      _fragment_committed(p->first, MDRequestRef());
      continue;
    }

    dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;

    MutationRef mut(new MutationImpl());
    mut->ls = mds->mdlog->get_current_segment();
    EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
    mds->mdlog->start_entry(le);
    bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);

    // the frags that exist now (result of the aborted op) become the
    // "old" set that will need purging after the rollback commits
    frag_vec_t old_frags;
    diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);

    std::vector<CDir*> resultfrags;
    if (uf.old_frags.empty()) {
      // created by old format EFragment
      MDSContext::vec waiters;
      adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
    } else {
      // restore each original frag from the encoded rollback state
      auto bp = uf.rollback.cbegin();
      for (const auto& fg : uf.old_frags) {
        CDir *dir = force_dir_fragment(diri, fg);
        resultfrags.push_back(dir);

        dirfrag_rollback rollback;
        decode(rollback, bp);

        dir->fnode = rollback.fnode;

        dir->mark_dirty(mut->ls);

        // flag scatterlocks whose accounted values no longer match, so
        // the stats get propagated again
        if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
          dout(10) << "    dirty nestinfo on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&diri->nestlock);
          mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
          mut->add_updated_lock(&diri->nestlock);
        }
        if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
          dout(10) << "    dirty fragstat on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&diri->filelock);
          mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
          mut->add_updated_lock(&diri->filelock);
        }

        le->add_orig_frag(dir->get_frag());
        le->metablob.add_dir_context(dir);
        if (diri_auth) {
          le->metablob.add_fragmented_dir(dir, true, false);
        } else {
          dout(10) << "    dirty dirfragtree on " << *dir << dendl;
          dir->state_set(CDir::STATE_DIRTYDFT);
          le->metablob.add_fragmented_dir(dir, true, true);
        }
      }
    }

    if (diri_auth) {
      // journal the restored dirfragtree on the inode itself
      auto pi = diri->project_inode(mut);
      pi.inode->version = diri->pre_dirty();
      predirty_journal_parents(mut, &le->metablob, diri, 0, PREDIRTY_PRIMARY);
      le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
    } else {
      mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
      mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
      mut->add_updated_lock(&diri->dirfragtreelock);
    }

    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    // the frags we rolled away from must no longer be leaves
    for (const auto& leaf : old_frags) {
      ceph_assert(!diri->dirfragtree.is_leaf(leaf));
    }

    mds->mdlog->submit_entry(le, new C_MDC_FragmentRollback(this, mut));

    uf.old_frags.swap(old_frags);
    _fragment_committed(p->first, MDRequestRef());
  }
}
12276
12277 void MDCache::force_readonly()
12278 {
12279 if (is_readonly())
12280 return;
12281
12282 dout(1) << "force file system read-only" << dendl;
12283 mds->clog->warn() << "force file system read-only";
12284
12285 set_readonly();
12286
12287 mds->server->force_clients_readonly();
12288
12289 // revoke write caps
12290 int count = 0;
12291 for (auto &p : inode_map) {
12292 CInode *in = p.second;
12293 if (in->is_head())
12294 mds->locker->eval(in, CEPH_CAP_LOCKS);
12295 if (!(++count % 1000))
12296 mds->heartbeat_reset();
12297 }
12298
12299 mds->mdlog->flush();
12300 }
12301
12302
12303 // ==============================================================
12304 // debug crap
12305
/*
 * Debug dump of the subtree map at debug level `dbl`.  Prints nothing
 * unless the mds subsystem gathers at that level; unless force_print is
 * set, very large or very deep subtree maps are skipped to keep logging
 * cheap.  Also sanity-checks that every entry in `subtrees` is reachable
 * from a base dirfrag.
 */
void MDCache::show_subtrees(int dbl, bool force_print)
{
  if (g_conf()->mds_thrash_exports)
    dbl += 15;

  //dout(10) << "show_subtrees" << dendl;

  if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
    return;  // i won't print anything.

  if (subtrees.empty()) {
    dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
                                        << dendl;
    return;
  }

  if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
        "printing subtrees" << dendl;
    return;
  }

  // root frags
  std::vector<CDir*> basefrags;
  for (set<CInode*>::iterator p = base_inodes.begin();
       p != base_inodes.end();
       ++p)
    (*p)->get_dirfrags(basefrags);
  //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
  dout(15) << "show_subtrees" << dendl;

  // queue stuff
  list<pair<CDir*,int> > q;
  string indent;
  set<CDir*> seen;

  // calc max depth
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  set<CDir*> subtrees_seen;

  // first pass: depth-first walk just to find the max nesting depth and
  // which subtree roots are reachable from the base frags
  unsigned int depth = 0;
  while (!q.empty()) {
    CDir *dir = q.front().first;
    unsigned int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    subtrees_seen.insert(dir);

    if (d > depth) depth = d;

    // sanity check
    //dout(25) << "saw depth " << d << " " << *dir << dendl;
    if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
    ceph_assert(seen.count(dir) == 0);
    seen.insert(dir);

    // nested items?
    if (!subtrees[dir].empty()) {
      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p) {
        //dout(25) << " saw sub " << **p << dendl;
        q.push_front(pair<CDir*,int>(*p, d+1));
      }
    }
  }

  if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
        "subtrees" << dendl;
    return;
  }

  // print tree
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  // second pass: actually print, indenting by two columns per level
  while (!q.empty()) {
    CDir *dir = q.front().first;
    int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    // adjust indenter
    while ((unsigned)d < indent.size())
      indent.resize(d);

    // pad
    string pad = "______________________________________";
    pad.resize(depth*2+1-indent.size());
    if (!subtrees[dir].empty())
      pad[0] = '.'; // parent


    string auth;
    if (dir->is_auth())
      auth = "auth ";
    else
      auth = " rep ";

    // dir_auth as "a" or "a,b" (pending auth during migration)
    char s[10];
    if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
      snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
    else
      snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));

    // print
    dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
                                        << " " << auth << *dir << dendl;

    // base dirfrags must map back to the expected inodes
    if (dir->ino() == CEPH_INO_ROOT)
      ceph_assert(dir->inode == root);
    if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      ceph_assert(dir->inode == myin);
    if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
      ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);

    // nested items?
    if (!subtrees[dir].empty()) {
      // more at my level?
      if (!q.empty() && q.front().second == d)
        indent += "| ";
      else
        indent += " ";

      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p)
        q.push_front(pair<CDir*,int>(*p, d+2));
    }
  }

  // verify there isn't stray crap in subtree map
  int lost = 0;
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (subtrees_seen.count(p->first)) continue;
    dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
    lost++;
  }
  ceph_assert(lost == 0);
}
12458
12459 void MDCache::show_cache()
12460 {
12461 if (!g_conf()->subsys.should_gather<ceph_subsys_mds, 7>())
12462 return;
12463 dout(7) << "show_cache" << dendl;
12464
12465 auto show_func = [this](CInode *in) {
12466 // unlinked?
12467 if (!in->parent)
12468 dout(7) << " unlinked " << *in << dendl;
12469
12470 // dirfrags?
12471 auto&& dfs = in->get_dirfrags();
12472 for (const auto& dir : dfs) {
12473 dout(7) << " dirfrag " << *dir << dendl;
12474
12475 for (auto &p : dir->items) {
12476 CDentry *dn = p.second;
12477 dout(7) << " dentry " << *dn << dendl;
12478 CDentry::linkage_t *dnl = dn->get_linkage();
12479 if (dnl->is_primary() && dnl->get_inode())
12480 dout(7) << " inode " << *dnl->get_inode() << dendl;
12481 }
12482 }
12483 };
12484
12485 for (auto &p : inode_map)
12486 show_func(p.second);
12487 for (auto &p : snap_inode_map)
12488 show_func(p.second);
12489 }
12490
// Emit cache memory-usage status (the mds_co mempool stats) into the
// formatter, wrapped in a "cache" object section.
void MDCache::cache_status(Formatter *f)
{
  f->open_object_section("cache");

  f->open_object_section("pool");
  mempool::get_pool(mempool::mds_co::id).dump(f);
  f->close_section();

  f->close_section();
}
12501
/*
 * Recursively dump `in` and everything beneath it into the formatter;
 * children are emitted before the inode itself.  A negative max_depth
 * means no depth limit.
 */
void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
{
  ceph_assert(in);
  if ((max_depth >= 0) && (cur_depth > max_depth)) {
    return;
  }
  auto&& ls = in->get_dirfrags();
  for (const auto &subdir : ls) {
    for (const auto &p : subdir->items) {
      CDentry *dn = p.second;
      CInode *in = dn->get_linkage()->get_inode();  // NOTE: shadows the parameter `in`
      if (in) {
        dump_tree(in, cur_depth + 1, max_depth, f);
      }
    }
  }
  f->open_object_section("inode");
  in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
  f->close_section();
}
12522
12523 int MDCache::dump_cache(std::string_view file_name)
12524 {
12525 return dump_cache(file_name, NULL);
12526 }
12527
12528 int MDCache::dump_cache(Formatter *f)
12529 {
12530 return dump_cache(std::string_view(""), f);
12531 }
12532
12533 /**
12534 * Dump the metadata cache, either to a Formatter, if
12535 * provided, else to a plain text file.
12536 */
12537 int MDCache::dump_cache(std::string_view fn, Formatter *f)
12538 {
12539 int r = 0;
12540
12541 // dumping large caches may cause mds to hang or worse get killed.
12542 // so, disallow the dump if the cache size exceeds the configured
12543 // threshold, which is 1G for formatter and unlimited for file (note
12544 // that this can be jacked up by the admin... and is nothing but foot
12545 // shooting, but the option itself is for devs and hence dangerous to
12546 // tune). TODO: remove this when fixed.
12547 uint64_t threshold = f ?
12548 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12549 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
12550
12551 if (threshold && cache_size() > threshold) {
12552 if (f) {
12553 CachedStackStringStream css;
12554 *css << "cache usage exceeds dump threshold";
12555 f->open_object_section("result");
12556 f->dump_string("error", css->strv());
12557 f->close_section();
12558 } else {
12559 derr << "cache usage exceeds dump threshold" << dendl;
12560 r = -CEPHFS_EINVAL;
12561 }
12562 return r;
12563 }
12564
12565 r = 0;
12566 int fd = -1;
12567
12568 if (f) {
12569 f->open_array_section("inodes");
12570 } else {
12571 char path[PATH_MAX] = "";
12572 if (fn.length()) {
12573 snprintf(path, sizeof path, "%s", fn.data());
12574 } else {
12575 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
12576 }
12577
12578 dout(1) << "dump_cache to " << path << dendl;
12579
12580 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
12581 if (fd < 0) {
12582 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
12583 return errno;
12584 }
12585 }
12586
12587 auto dump_func = [fd, f](CInode *in) {
12588 int r;
12589 if (f) {
12590 f->open_object_section("inode");
12591 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12592 f->close_section();
12593 return 1;
12594 }
12595 CachedStackStringStream css;
12596 *css << *in << std::endl;
12597 auto sv = css->strv();
12598 r = safe_write(fd, sv.data(), sv.size());
12599 if (r < 0)
12600 return r;
12601 auto&& dfs = in->get_dirfrags();
12602 for (auto &dir : dfs) {
12603 CachedStackStringStream css2;
12604 *css2 << " " << *dir << std::endl;
12605 auto sv = css2->strv();
12606 r = safe_write(fd, sv.data(), sv.size());
12607 if (r < 0)
12608 return r;
12609 for (auto &p : dir->items) {
12610 CDentry *dn = p.second;
12611 CachedStackStringStream css3;
12612 *css3 << " " << *dn << std::endl;
12613 auto sv = css3->strv();
12614 r = safe_write(fd, sv.data(), sv.size());
12615 if (r < 0)
12616 return r;
12617 }
12618 dir->check_rstats();
12619 }
12620 return 1;
12621 };
12622
12623 for (auto &p : inode_map) {
12624 r = dump_func(p.second);
12625 if (r < 0)
12626 goto out;
12627 }
12628 for (auto &p : snap_inode_map) {
12629 r = dump_func(p.second);
12630 if (r < 0)
12631 goto out;
12632 }
12633 r = 0;
12634
12635 out:
12636 if (f) {
12637 f->close_section(); // inodes
12638 } else {
12639 ::close(fd);
12640 }
12641 return r;
12642 }
12643
// Re-dispatch the request through the cache, counting the retry.
void C_MDS_RetryRequest::finish(int r)
{
  mdr->retry++;
  cache->dispatch_request(mdr);
}
12649
// Build a retry context for the request; when drop_locks is set, release
// the request's locks and local auth pins first so the retry starts clean.
MDSContext *CF_MDS_RetryRequestFactory::build()
{
  if (drop_locks) {
    mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
    mdr->drop_local_auth_pins();
  }
  return new C_MDS_RetryRequest(mdcache, mdr);
}
12658
/*
 * Completion for the "scrub start" admin command: formats the result
 * (return code, and on success the scrub tag and mode) and then fires the
 * caller-supplied on_finish context.
 */
class C_MDS_EnqueueScrub : public Context
{
  std::string tag;        // scrub tag reported back to the caller
  Formatter *formatter;   // where the command result is written
  Context *on_finish;     // optional caller completion
public:
  ScrubHeaderRef header;  // filled in by enqueue_scrub() before dispatch
  C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
    tag(tag), formatter(f), on_finish(fin), header(nullptr) {}

  void finish(int r) override {
    formatter->open_object_section("results");
    formatter->dump_int("return_code", r);
    if (r == 0) {
      formatter->dump_string("scrub_tag", tag);
      formatter->dump_string("mode", "asynchronous");
    }
    formatter->close_section();

    // on_finish always completes with 0; the real status is in the output
    r = 0;
    if (on_finish)
      on_finish->complete(r);
  }
};
12683
/*
 * Entry point for "scrub start": resolve `path` (including the ~mdsdir /
 * ~mdsN shorthands) to a filepath, create an internal ENQUEUE_SCRUB
 * request and hand it to enqueue_scrub_work().  `f`/`fin` receive the
 * formatted result once the scrub has been queued.
 */
void MDCache::enqueue_scrub(
    std::string_view path,
    std::string_view tag,
    bool force, bool recursive, bool repair,
    Formatter *f, Context *fin)
{
  dout(10) << __func__ << " " << path << dendl;

  filepath fp;
  if (path.compare(0, 4, "~mds") == 0) {
    mds_rank_t rank;
    if (path == "~mdsdir") {
      rank = mds->get_nodeid();  // our own mdsdir
    } else {
      // "~mdsN": parse the rank number after the prefix
      std::string err;
      rank = strict_strtoll(path.substr(4), 10, &err);
      if (!err.empty())
        rank = MDS_RANK_NONE;
    }
    if (rank >= 0 && rank < MAX_MDS)
      fp.set_path("", MDS_INO_MDSDIR(rank));
  }
  // anything that did not resolve to an mdsdir is treated as a normal path
  if (fp.get_ino() == inodeno_t(0))
    fp.set_path(path);

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
  mdr->set_filepath(fp);

  bool is_internal = false;
  std::string tag_str(tag);
  if (tag_str.empty()) {
    // no user-supplied tag: generate one so the scrub is still identifiable
    uuid_d uuid_gen;
    uuid_gen.generate_random();
    tag_str = uuid_gen.to_string();
    is_internal = true;
  }

  C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
  cs->header = std::make_shared<ScrubHeader>(tag_str, is_internal, force, recursive, repair);

  mdr->internal_op_finish = cs;
  enqueue_scrub_work(mdr);
}
12727
/*
 * Resolve the scrub request's path to an inode (rdlocking the path) and
 * hand the inode to the scrub stack.  Responds to the internal request
 * with the enqueue result, or with -CEPHFS_EBUSY if the inode is already
 * being scrubbed.
 */
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  CInode *in;
  CF_MDS_RetryRequestFactory cf(this, mdr, true);
  int r = path_traverse(mdr, cf, mdr->get_filepath(),
                        MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_RDLOCK_PATH,
                        nullptr, &in);
  if (r > 0)
    return;  // traversal in progress; we will be retried
  if (r < 0) {
    mds->server->respond_to_request(mdr, r);
    return;
  }

  // Cannot scrub same dentry twice at same time
  if (in->scrub_is_in_progress()) {
    mds->server->respond_to_request(mdr, -CEPHFS_EBUSY);
    return;
  } else {
    in->scrub_info();
  }

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef& header = cs->header;

  // non-recursive scrubs go to the top of the stack
  r = mds->scrubstack->enqueue(in, header, !header->get_recursive());

  mds->server->respond_to_request(mdr, r);
}
12757
// Journal completion that applies an internal request's mutation and then
// responds to it with the given result code.
struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
  MDRequestRef mdr;
  C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
    MDCacheLogContext(c), mdr(m) {}
  void finish(int r) override {
    mdr->apply();
    get_mds()->server->respond_to_request(mdr, r);
  }
};
12767
// Keeps a scrub header's pending-repair count raised for the lifetime of
// an in-flight repair; drops it when the repair completes.
struct C_MDC_ScrubRepaired : public MDCacheContext {
  ScrubHeaderRef header;
public:
  C_MDC_ScrubRepaired(MDCache *m, const ScrubHeaderRef& h)
    : MDCacheContext(m), header(h) {
    header->inc_num_pending();
  }
  void finish(int r) override {
    header->dec_num_pending();
  }
};
12779
/*
 * Start an internal request to recompute and repair a dirfrag's
 * fragstat/rstat.  When the dirfrag belongs to an active scrub, the
 * completion keeps the scrub's pending-repair count raised.
 */
void MDCache::repair_dirfrag_stats(CDir *dir)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
  mdr->pin(dir);
  mdr->internal_op_private = dir;
  if (dir->scrub_is_in_progress())
    mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, dir->get_scrub_header());
  else
    mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_dirfrag_stats_work(mdr);
}
12791
// Recompute a dirfrag's fragstat/rstat from its in-memory dentries and,
// if they disagree with the stored fnode values, journal corrected ones.
// Runs as an internal CEPH_MDS_OP_REPAIR_FRAGSTATS request; replies to
// the request when done, or re-queues itself while waiting for
// unfreeze, locks or a dirfrag fetch.
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  // Only the auth MDS may repair; authority may have moved since queuing.
  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    // Dirfrag is frozen/freezing: park a retry on unfreeze, and drop our
    // locks/pins so we don't stall the freeze itself.
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (mdr->is_any_remote_auth_pin())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  // Locks needed to read the frag tree and mutate dirstat/rstat.
  MutationImpl::LockOpVec lov;
  CInode *diri = dir->inode;
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return; // acquire_locks queued a retry

  // Every dentry must be in memory to recount; fetch then retry.
  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  // Recount fragstat (file/subdir counts) and rstat (recursive stats)
  // from the head (non-snapshot) dentries of this dirfrag.
  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue; // skip snapshotted dentries
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
	frag_info.nsubdirs++;
      else
	frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  auto pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  // Stats disagree: project a corrected fnode and journal it.
  auto _pf = dir->project_fnode(mdr);
  _pf->version = dir->pre_dirty();
  pf = _pf;

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    // Keep mtime/change_attr monotonic while replacing the counts.
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    _pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    // rctime must not go backwards.
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    _pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  // Apply the projected fnode and reply once the update is journaled.
  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
12887
12888 void MDCache::repair_inode_stats(CInode *diri)
12889 {
12890 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12891 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
12892 mdr->internal_op_private = diri;
12893 if (diri->scrub_is_in_progress())
12894 mdr->internal_op_finish = new C_MDC_ScrubRepaired(this, diri->get_scrub_header());
12895 else
12896 mdr->internal_op_finish = new C_MDSInternalNoop;
12897 repair_inode_stats_work(mdr);
12898 }
12899
// Repair a directory inode's dirstat/rstat by dirtying all of its
// dirfrags' scatterlocks and then rdlocking them, which forces a
// scatter-gather that re-aggregates the per-frag accounted stats.
// Runs in two passes distinguished by mdr->ls, which is set at the end
// of the first pass (the request retries into the second pass).
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  // Authority may have moved since the request was queued.
  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  MutationImpl::LockOpVec lov;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  // Pass 1: wrlock the scatterlocks so we can mark them dirty.
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return; // acquire_locks queued a retry

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	ceph_assert(mdr->is_auth_pinned(diri));
	dir = diri->get_or_open_dirfrag(this, leaf);
      }
      if (dir->get_version() == 0) {
	ceph_assert(dir->is_auth());
	dir->fetch(new C_MDS_RetryRequest(this, mdr));
	return; // retry once the dirfrag is loaded
      }
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  // Release the wrlocks so the rdlocks below can gather the scattered
  // state back into the inode.
  mds->locker->drop_locks(mdr.get());

 do_rdlocks:
  // Pass 2: force the scatter-gather process
  lov.clear();
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_rdlock(&diri->nestlock);
  lov.add_rdlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  // Verify the repair: re-sum the per-frag accounted stats and compare
  // against the inode's (now gathered) dirstat/rstat.
  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs = 1; // it gets one to account for self
  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
    nest_info.rsnaps = srnode->snaps.size();

  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      ceph_assert(dir);
      ceph_assert(dir->get_version() > 0);
      dir_info.add(dir->get_fnode()->accounted_fragstat);
      nest_info.add(dir->get_fnode()->accounted_rstat);
    }
  }

  if (!dir_info.same_sums(diri->get_inode()->dirstat) ||
      !nest_info.same_sums(diri->get_inode()->rstat)) {
    // Repair did not converge; log it but still complete the request.
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
	     << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
12990
12991 void MDCache::rdlock_dirfrags_stats(CInode *diri, MDSInternalContext* fin)
12992 {
12993 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS);
12994 mdr->auth_pin(diri); // already auth pinned by CInode::validate_disk_state()
12995 mdr->internal_op_private = diri;
12996 mdr->internal_op_finish = fin;
12997 return rdlock_dirfrags_stats_work(mdr);
12998 }
12999
13000 void MDCache::rdlock_dirfrags_stats_work(MDRequestRef& mdr)
13001 {
13002 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
13003 dout(10) << __func__ << " " << *diri << dendl;
13004 if (!diri->is_auth()) {
13005 mds->server->respond_to_request(mdr, -CEPHFS_ESTALE);
13006 return;
13007 }
13008 if (!diri->is_dir()) {
13009 mds->server->respond_to_request(mdr, -CEPHFS_ENOTDIR);
13010 return;
13011 }
13012
13013 MutationImpl::LockOpVec lov;
13014 lov.add_rdlock(&diri->dirfragtreelock);
13015 lov.add_rdlock(&diri->nestlock);
13016 lov.add_rdlock(&diri->filelock);
13017 if (!mds->locker->acquire_locks(mdr, lov))
13018 return;
13019 dout(10) << __func__ << " start dirfrags : " << *diri << dendl;
13020
13021 mds->server->respond_to_request(mdr, 0);
13022 return;
13023 }
13024
13025 void MDCache::flush_dentry(std::string_view path, Context *fin)
13026 {
13027 if (is_readonly()) {
13028 dout(10) << __func__ << ": read-only FS" << dendl;
13029 fin->complete(-CEPHFS_EROFS);
13030 return;
13031 }
13032 dout(10) << "flush_dentry " << path << dendl;
13033 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13034 filepath fp(path);
13035 mdr->set_filepath(fp);
13036 mdr->internal_op_finish = fin;
13037 flush_dentry_work(mdr);
13038 }
13039
13040 class C_FinishIOMDR : public MDSContext {
13041 protected:
13042 MDSRank *mds;
13043 MDRequestRef mdr;
13044 MDSRank *get_mds() override { return mds; }
13045 public:
13046 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13047 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13048 };
13049
13050 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13051 {
13052 MutationImpl::LockOpVec lov;
13053 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13054 if (!in)
13055 return;
13056
13057 ceph_assert(in->is_auth());
13058 in->flush(new C_FinishIOMDR(mds, mdr));
13059 }
13060
13061
/**
 * Initialize performance counters with global perfcounter
 * collection.
 *
 * Builds the "mds_cache" counter set (stray/purge, recovery-queue and
 * internal-request statistics), registers it with the global
 * collection, and hands the logger to the subsystems that update these
 * counters directly (recovery queue, stray manager).
 */
void MDCache::register_perfcounters()
{
  PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);

  // Stray/purge statistics
  pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_recovering_enqueued,
              "num_recovering_enqueued", "Files waiting for recovery", "recy",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64_counter(l_mdc_recovery_completed,
                      "recovery_completed", "File recoveries completed", "recd",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // useful recovery queue statistics
  // (everything from here down defaults to PRIO_USEFUL)
  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
              "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
              "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
                      "File recoveries started");

  // along with other stray dentries stats
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
              "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
              "Stray dentries enqueuing for purge");
  pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
                      "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
                      "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
                      "Stray dentries migrated");

  // low prio internal request stats
  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                      "Internal Request type enqueue scrub");
  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
                      "Internal Request type export dir");
  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
                      "Internal Request type flush");
  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
                      "Internal Request type fragmentdir");
  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
                      "Internal Request type frag stats");
  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
                      "Internal Request type inode stats");

  // Publish the counters and share the logger with the subsystems that
  // maintain them.
  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
13122
13123 /**
13124 * Call this when putting references to an inode/dentry or
13125 * when attempting to trim it.
13126 *
13127 * If this inode is no longer linked by anyone, and this MDS
13128 * rank holds the primary dentry, and that dentry is in a stray
13129 * directory, then give up the dentry to the StrayManager, never
13130 * to be seen again by MDCache.
13131 *
13132 * @param delay if true, then purgeable inodes are stashed til
13133 * the next trim(), rather than being purged right
13134 * away.
13135 */
13136 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13137 if (in->get_inode()->nlink > 0 || in->is_base() || is_readonly() ||
13138 mds->get_state() <= MDSMap::STATE_REJOIN)
13139 return;
13140
13141 CDentry *dn = in->get_projected_parent_dn();
13142
13143 if (dn->state_test(CDentry::STATE_PURGING)) {
13144 /* We have already entered the purging process, no need
13145 * to re-evaluate me ! */
13146 return;
13147 }
13148
13149 if (dn->get_dir()->get_inode()->is_stray()) {
13150 if (delay)
13151 stray_manager.queue_delayed(dn);
13152 else
13153 stray_manager.eval_stray(dn);
13154 }
13155 }
13156
13157 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13158 dout(10) << __func__ << " " << *diri << dendl;
13159 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13160 auto&& ls = diri->get_dirfrags();
13161 for (auto &p : ls) {
13162 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13163 p->try_remove_dentries_for_stray();
13164 }
13165 if (!diri->snaprealm) {
13166 if (diri->is_auth())
13167 diri->clear_dirty_rstat();
13168 diri->clear_scatter_dirty();
13169 }
13170 }
13171
13172 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13173 CInode *in = get_inode(number);
13174 if (!in) {
13175 return false;
13176 }
13177 f->open_object_section("inode");
13178 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13179 f->close_section();
13180 return true;
13181 }
13182
13183 void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
13184 const mds_rank_t max_mds = mdsmap.get_max_mds();
13185
13186 // process export_pin_delayed_queue whenever a new MDSMap received
13187 auto &q = export_pin_delayed_queue;
13188 for (auto it = q.begin(); it != q.end(); ) {
13189 auto *in = *it;
13190 mds_rank_t export_pin = in->get_export_pin(false);
13191 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13192 << " max_mds=" << max_mds << dendl;
13193 if (export_pin >= mdsmap.get_max_mds()) {
13194 it++;
13195 continue;
13196 }
13197
13198 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13199 it = q.erase(it);
13200 in->queue_export_pin(export_pin);
13201 }
13202
13203 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13204 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
13205 /* copy to vector to avoid removals during iteration */
13206 std::vector<CInode*> migrate;
13207 migrate.assign(export_ephemeral_pins.begin(), export_ephemeral_pins.end());
13208 for (auto& in : migrate) {
13209 in->maybe_export_pin();
13210 }
13211 }
13212
13213 if (max_mds <= 1) {
13214 export_ephemeral_dist_frag_bits = 0;
13215 } else {
13216 double want = g_conf().get_val<double>("mds_export_ephemeral_distributed_factor");
13217 want *= max_mds;
13218 unsigned n = 0;
13219 while ((1U << n) < (unsigned)want)
13220 ++n;
13221 export_ephemeral_dist_frag_bits = n;
13222 }
13223 }
13224
// Body of the cache upkeep thread: periodically trims the cache and
// releases free heap memory back to the OS until
// upkeep_trim_shutdown is set. Both periodic tasks run slightly early
// (at 90% of their configured interval) and the thread sleeps until
// whichever task is due next.
void MDCache::upkeep_main(void)
{
  std::unique_lock lock(upkeep_mutex);
  while (!upkeep_trim_shutdown.load()) {
    auto now = clock::now();
    auto since = now-upkeep_last_trim;
    auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
    if (since >= trim_interval*.90) {
      // Lock ordering requires mds_lock before upkeep_mutex, so drop
      // and re-take our lock around acquiring mds_lock.
      lock.unlock(); /* mds_lock -> upkeep_mutex */
      std::scoped_lock mds_lock(mds->mds_lock);
      lock.lock();
      // Re-check shutdown: it may have been requested while unlocked.
      if (upkeep_trim_shutdown.load())
        return;
      check_memory_usage();
      if (mds->is_cache_trimmable()) {
        dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
        bool active_with_clients = mds->is_active() || mds->is_clientreplay() || mds->is_stopping();
        if (active_with_clients) {
          trim_client_leases();
        }
        trim();
        if (active_with_clients) {
          // Recall client caps; also request TRIM when over target size.
          auto recall_flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          if (cache_toofull()) {
            recall_flags = recall_flags|Server::RecallFlags::TRIM;
          }
          mds->server->recall_client_state(nullptr, recall_flags);
        }
        upkeep_last_trim = now = clock::now();
      } else {
        dout(10) << "cache not ready for trimming" << dendl;
      }
    } else {
      trim_interval -= since; // time remaining until the next trim
    }
    since = now-upkeep_last_release;
    auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
    if (since >= release_interval*.90) {
      /* XXX not necessary once MDCache uses PriorityCache */
      dout(10) << "releasing free memory" << dendl;
      ceph_heap_release_free_memory();
      upkeep_last_release = clock::now();
    } else {
      release_interval -= since; // time remaining until the next release
    }
    // Sleep until the earlier of the two deadlines, or until the condvar
    // is signalled (e.g. shutdown).
    auto interval = std::min(release_interval, trim_interval);
    dout(20) << "upkeep thread waiting interval " << interval << dendl;
    upkeep_cvar.wait_for(lock, interval);
  }
}