]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
325cbfc88ecef8e58bd869805537dbaa38b61607
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <string_view>
21 #include <map>
22
23 #include "MDCache.h"
24 #include "MDSRank.h"
25 #include "Server.h"
26 #include "Locker.h"
27 #include "MDLog.h"
28 #include "MDBalancer.h"
29 #include "Migrator.h"
30 #include "ScrubStack.h"
31
32 #include "SnapClient.h"
33
34 #include "MDSMap.h"
35
36 #include "CInode.h"
37 #include "CDir.h"
38
39 #include "Mutation.h"
40
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
44
45 #include "messages/MClientCaps.h"
46
47 #include "msg/Message.h"
48 #include "msg/Messenger.h"
49
50 #include "common/MemoryModel.h"
51 #include "common/errno.h"
52 #include "common/perf_counters.h"
53 #include "common/safe_io.h"
54
55 #include "osdc/Journaler.h"
56 #include "osdc/Filer.h"
57
58 #include "events/ESubtreeMap.h"
59 #include "events/EUpdate.h"
60 #include "events/ESlaveUpdate.h"
61 #include "events/EImportFinish.h"
62 #include "events/EFragment.h"
63 #include "events/ECommitted.h"
64 #include "events/EPurged.h"
65 #include "events/ESessions.h"
66
67 #include "InoTable.h"
68
69 #include "common/Timer.h"
70
71 #include "perfglue/heap_profiler.h"
72
73
74 #include "common/config.h"
75 #include "include/ceph_assert.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_mds
79 #undef dout_prefix
80 #define dout_prefix _prefix(_dout, mds)
81 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
82 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
83 }
84
// Out-of-line definition of the static member SimpleLock::empty_gather_set.
set<int> SimpleLock::empty_gather_set;
86
87
88 /**
89 * All non-I/O contexts that require a reference
90 * to an MDCache instance descend from this.
91 */
92 class MDCacheContext : public virtual MDSContext {
93 protected:
94 MDCache *mdcache;
95 MDSRank *get_mds() override
96 {
97 ceph_assert(mdcache != NULL);
98 return mdcache->mds;
99 }
100 public:
101 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
102 };
103
104
105 /**
106 * Only for contexts called back from an I/O completion
107 *
108 * Note: duplication of members wrt MDCacheContext, because
109 * it'ls the lesser of two evils compared with introducing
110 * yet another piece of (multiple) inheritance.
111 */
112 class MDCacheIOContext : public virtual MDSIOContextBase {
113 protected:
114 MDCache *mdcache;
115 MDSRank *get_mds() override
116 {
117 ceph_assert(mdcache != NULL);
118 return mdcache->mds;
119 }
120 public:
121 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
122 MDSIOContextBase(track), mdcache(mdc_) {}
123 };
124
125 class MDCacheLogContext : public virtual MDSLogContextBase {
126 protected:
127 MDCache *mdcache;
128 MDSRank *get_mds() override
129 {
130 ceph_assert(mdcache != NULL);
131 return mdcache->mds;
132 }
133 public:
134 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
135 };
136
/**
 * Build the cache for rank @p m.
 *
 * Constructs the members that need the rank (open file table, filer, stray
 * manager, recovery queue), creates the Migrator, loads the cache tunables
 * from the configuration and finally starts the background upkeep thread,
 * which periodically trims the cache and releases free heap memory until
 * upkeep_trim_shutdown is set.
 *
 * @param m             the rank this cache belongs to
 * @param purge_queue_  purge queue handed to the stray manager
 */
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // A non-zero mds_dir_max_commit_size is used shifted left by 20 bits
  // (presumably the option is expressed in MiB -- confirm against the option
  // definition); otherwise fall back to 90% of osd_max_write_size, shifted
  // the same way.
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
    (g_conf()->mds_dir_max_commit_size << 20) :
    (0.9 *(g_conf()->osd_max_write_size << 20));

  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // Background upkeep loop. upkeep_mutex is held throughout, except while
  // waiting on upkeep_cvar and during the short window below in which
  // mds_lock is acquired.
  upkeeper = std::thread([this]() {
    std::unique_lock lock(upkeep_mutex);
    while (!upkeep_trim_shutdown.load()) {
      auto now = clock::now();
      auto since = now-upkeep_last_trim;
      auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
      // Trim once at least 90% of the configured interval has elapsed.
      if (since >= trim_interval*.90) {
        // Release upkeep_mutex first so that mds_lock is always taken
        // before it, then re-take upkeep_mutex.
        lock.unlock(); /* mds_lock -> upkeep_mutex */
        std::scoped_lock mds_lock(mds->mds_lock);
        lock.lock();
        // The flag may have been set while upkeep_mutex was released.
        if (upkeep_trim_shutdown.load())
          return;
        if (mds->is_cache_trimmable()) {
          dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
          trim_client_leases();
          trim();
          check_memory_usage();
          auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          mds->server->recall_client_state(nullptr, flags);
          upkeep_last_trim = now = clock::now();
        } else {
          dout(10) << "cache not ready for trimming" << dendl;
        }
      } else {
        // Not due yet: keep only the time remaining until the next trim.
        trim_interval -= since;
      }
      since = now-upkeep_last_release;
      auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
      if (since >= release_interval) {
        /* XXX not necessary once MDCache uses PriorityCache */
        dout(10) << "releasing free memory" << dendl;
        ceph_heap_release_free_memory();
        upkeep_last_release = clock::now();
      } else {
        // Not due yet: keep only the time remaining until the next release.
        release_interval -= since;
      }
      // Sleep for the shorter of the two intervals, or until notified.
      auto interval = std::min(release_interval, trim_interval);
      dout(20) << "upkeep thread waiting interval " << interval << dendl;
      upkeep_cvar.wait_for(lock, interval);
    }
  });
}
204
205 MDCache::~MDCache()
206 {
207 if (logger) {
208 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
209 }
210 if (upkeeper.joinable())
211 upkeeper.join();
212 }
213
214 void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
215 {
216 if (changed.count("mds_cache_memory_limit"))
217 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
218 if (changed.count("mds_cache_reservation"))
219 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
220 if (changed.count("mds_health_cache_threshold"))
221 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
222 if (changed.count("mds_cache_mid"))
223 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
224 if (changed.count("mds_cache_trim_decay_rate")) {
225 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
226 }
227 if (changed.count("mds_forward_all_requests_to_auth")){
228 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
229 }
230
231 migrator->handle_conf_change(changed, mdsmap);
232 mds->balancer->handle_conf_change(changed, mdsmap);
233 }
234
235 void MDCache::log_stat()
236 {
237 mds->logger->set(l_mds_inodes, lru.lru_get_size());
238 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
239 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
240 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
241 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
242 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
243 mds->logger->set(l_mds_caps, Capability::count());
244 if (root) {
245 mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles);
246 mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes);
247 mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps);
248 }
249 }
250
251
252 //
253
254 bool MDCache::shutdown()
255 {
256 {
257 std::scoped_lock lock(upkeep_mutex);
258 upkeep_trim_shutdown = true;
259 upkeep_cvar.notify_one();
260 }
261 if (lru.lru_get_size() > 0) {
262 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
263 //show_cache();
264 show_subtrees();
265 //dump();
266 }
267 return true;
268 }
269
270
271 // ====================================================================
272 // some inode functions
273
274 void MDCache::add_inode(CInode *in)
275 {
276 // add to lru, inode map
277 if (in->last == CEPH_NOSNAP) {
278 auto &p = inode_map[in->ino()];
279 ceph_assert(!p); // should be no dup inos!
280 p = in;
281 } else {
282 auto &p = snap_inode_map[in->vino()];
283 ceph_assert(!p); // should be no dup inos!
284 p = in;
285 }
286
287 if (in->ino() < MDS_INO_SYSTEM_BASE) {
288 if (in->ino() == MDS_INO_ROOT)
289 root = in;
290 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
291 myin = in;
292 else if (in->is_stray()) {
293 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
294 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
295 }
296 }
297 if (in->is_base())
298 base_inodes.insert(in);
299 }
300
301 if (cache_toofull()) {
302 exceeded_size_limit = true;
303 }
304 }
305
306 void MDCache::remove_inode(CInode *o)
307 {
308 dout(14) << "remove_inode " << *o << dendl;
309
310 if (o->get_parent_dn()) {
311 // FIXME: multiple parents?
312 CDentry *dn = o->get_parent_dn();
313 ceph_assert(!dn->is_dirty());
314 dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
315 }
316
317 if (o->is_dirty())
318 o->mark_clean();
319 if (o->is_dirty_parent())
320 o->clear_dirty_parent();
321
322 o->clear_scatter_dirty();
323
324 o->item_open_file.remove_myself();
325
326 if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
327 export_pin_queue.erase(o);
328
329 if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
330 export_pin_delayed_queue.erase(o);
331
332 // remove from inode map
333 if (o->last == CEPH_NOSNAP) {
334 inode_map.erase(o->ino());
335 } else {
336 o->item_caps.remove_myself();
337 snap_inode_map.erase(o->vino());
338 }
339
340 if (o->ino() < MDS_INO_SYSTEM_BASE) {
341 if (o == root) root = 0;
342 if (o == myin) myin = 0;
343 if (o->is_stray()) {
344 if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
345 strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
346 }
347 }
348 if (o->is_base())
349 base_inodes.erase(o);
350 }
351
352 // delete it
353 ceph_assert(o->get_num_ref() == 0);
354 delete o;
355 }
356
357 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
358 {
359 file_layout_t result = file_layout_t::get_default();
360 result.pool_id = mdsmap.get_first_data_pool();
361 return result;
362 }
363
364 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
365 {
366 file_layout_t result = file_layout_t::get_default();
367 result.pool_id = mdsmap.get_metadata_pool();
368 if (g_conf()->mds_log_segment_size > 0) {
369 result.object_size = g_conf()->mds_log_segment_size;
370 result.stripe_unit = g_conf()->mds_log_segment_size;
371 }
372 return result;
373 }
374
375 void MDCache::init_layouts()
376 {
377 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
378 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
379 }
380
381 void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
382 int mode) const
383 {
384 in->inode.ino = ino;
385 in->inode.version = 1;
386 in->inode.xattr_version = 1;
387 in->inode.mode = 0500 | mode;
388 in->inode.size = 0;
389 in->inode.ctime =
390 in->inode.mtime =
391 in->inode.btime = ceph_clock_now();
392 in->inode.nlink = 1;
393 in->inode.truncate_size = -1ull;
394 in->inode.change_attr = 0;
395 in->inode.export_pin = MDS_RANK_NONE;
396
397 // FIPS zeroization audit 20191117: this memset is not security related.
398 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
399 if (in->inode.is_dir()) {
400 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
401 in->inode.rstat.rsubdirs = 1; /* itself */
402 in->inode.rstat.rctime = in->inode.ctime;
403 } else {
404 in->inode.layout = default_file_layout;
405 ++in->inode.rstat.rfiles;
406 }
407 in->inode.accounted_rstat = in->inode.rstat;
408
409 if (in->is_base()) {
410 if (in->is_root())
411 in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
412 else
413 in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
414 in->open_snaprealm(); // empty snaprealm
415 ceph_assert(!in->snaprealm->parent); // created its own
416 in->snaprealm->srnode.seq = 1;
417 }
418 }
419
420 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
421 {
422 dout(0) << "creating system inode with ino:" << ino << dendl;
423 CInode *in = new CInode(this);
424 create_unlinked_system_inode(in, ino, mode);
425 add_inode(in);
426 return in;
427 }
428
429 CInode *MDCache::create_root_inode()
430 {
431 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
432 i->inode.uid = g_conf()->mds_root_ino_uid;
433 i->inode.gid = g_conf()->mds_root_ino_gid;
434 i->inode.layout = default_file_layout;
435 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
436 return i;
437 }
438
439 void MDCache::create_empty_hierarchy(MDSGather *gather)
440 {
441 // create root dir
442 CInode *root = create_root_inode();
443
444 // force empty root dir
445 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
446 adjust_subtree_auth(rootdir, mds->get_nodeid());
447 rootdir->dir_rep = CDir::REP_ALL; //NONE;
448
449 ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
450 ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
451 ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
452 /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
453 * assume version 0 is stale/invalid.
454 */
455
456 rootdir->mark_complete();
457 rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
458 rootdir->commit(0, gather->new_sub());
459
460 root->mark_clean();
461 root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
462 root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
463 root->flush(gather->new_sub());
464 }
465
466 void MDCache::create_mydir_hierarchy(MDSGather *gather)
467 {
468 // create mds dir
469 CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
470
471 CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
472 adjust_subtree_auth(mydir, mds->get_nodeid());
473
474 LogSegment *ls = mds->mdlog->get_current_segment();
475
476 // stray dir
477 for (int i = 0; i < NUM_STRAY; ++i) {
478 CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
479 CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
480 stringstream name;
481 name << "stray" << i;
482 CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
483 sdn->_mark_dirty(mds->mdlog->get_current_segment());
484
485 stray->inode.dirstat = straydir->fnode.fragstat;
486
487 mydir->fnode.rstat.add(stray->inode.rstat);
488 mydir->fnode.fragstat.nsubdirs++;
489 // save them
490 straydir->mark_complete();
491 straydir->mark_dirty(straydir->pre_dirty(), ls);
492 straydir->commit(0, gather->new_sub());
493 stray->mark_dirty_parent(ls, true);
494 stray->store_backtrace(gather->new_sub());
495 }
496
497 mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
498 mydir->fnode.accounted_rstat = mydir->fnode.rstat;
499
500 myin->inode.dirstat = mydir->fnode.fragstat;
501 myin->inode.rstat = mydir->fnode.rstat;
502 ++myin->inode.rstat.rsubdirs;
503 myin->inode.accounted_rstat = myin->inode.rstat;
504
505 mydir->mark_complete();
506 mydir->mark_dirty(mydir->pre_dirty(), ls);
507 mydir->commit(0, gather->new_sub());
508
509 myin->store(gather->new_sub());
510 }
511
512 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
513 MutationRef mut;
514 CDentry *dn;
515 version_t dpv;
516 MDSContext *fin;
517 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
518 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
519 void finish(int r) override {
520 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
521 }
522 };
523
/**
 * Link inode @p in under @p name in @p dir and journal the creation.
 *
 * A null dentry is added and given a projected linkage to @p in, the parent
 * locks are force-taken, and an EUpdate entry is submitted to the log. The
 * rest of the work happens in _create_system_file_finish(), invoked through
 * C_MDC_CreateSystemFile when the log entry completes.
 *
 * @param dir   directory fragment receiving the new dentry
 * @param name  dentry name
 * @param in    inode to link
 * @param fin   caller context, passed on to _create_system_file_finish()
 */
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  // For directories, open the (complete) first dirfrag right away.
  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  // NOTE(review): pre_dirty() is called on the dentry a second time here, and
  // _create_system_file_finish() later decrements inode.version before
  // dirtying -- confirm the two are meant to pair up.
  in->inode.version = dn->pre_dirty();

  // Both dentry and inode start existing right after the newest snapshot
  // sequence of the parent's realm.
  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    // Regular case: journal the dentry as a primary link to the inode.
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // mdsdir case: the inode is journaled as a root and the dentry as a
    // remote link; a second (remote) projected linkage is pushed on top of
    // the one pushed above.
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}
572
573 void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
574 {
575 dout(10) << "_create_system_file_finish " << *dn << dendl;
576
577 dn->pop_projected_linkage();
578 dn->mark_dirty(dpv, mut->ls);
579
580 CInode *in = dn->get_linkage()->get_inode();
581 in->inode.version--;
582 in->mark_dirty(in->inode.version + 1, mut->ls);
583
584 if (in->inode.is_dir()) {
585 CDir *dir = in->get_dirfrag(frag_t());
586 ceph_assert(dir);
587 dir->mark_dirty(1, mut->ls);
588 dir->mark_new(mut->ls);
589 }
590
591 mut->apply();
592 mds->locker->drop_locks(mut.get());
593 mut->cleanup();
594
595 fin->complete(0);
596
597 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
598 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
599 }
600
601
602
603 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
604 MDCache *cache;
605 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
606 void finish(int r) override {
607 if (r < 0) {
608 // If we can't open root, something disastrous has happened: mark
609 // this rank damaged for operator intervention. Note that
610 // it is not okay to call suicide() here because we are in
611 // a Finisher callback.
612 cache->mds->damaged();
613 ceph_abort(); // damaged should never return
614 } else {
615 cache->open_root();
616 }
617 }
618 };
619
620 void MDCache::open_root_inode(MDSContext *c)
621 {
622 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
623 CInode *in;
624 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
625 in->fetch(c);
626 } else {
627 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
628 }
629 }
630
631 void MDCache::open_mydir_inode(MDSContext *c)
632 {
633 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
634 in->fetch(c);
635 }
636
637 void MDCache::open_mydir_frag(MDSContext *c)
638 {
639 open_mydir_inode(
640 new MDSInternalContextWrapper(mds,
641 new LambdaContext([this, c](int r) {
642 if (r < 0) {
643 c->complete(r);
644 return;
645 }
646 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
647 ceph_assert(mydir);
648 adjust_subtree_auth(mydir, mds->get_nodeid());
649 mydir->fetch(c);
650 })
651 )
652 );
653 }
654
655 void MDCache::open_root()
656 {
657 dout(10) << "open_root" << dendl;
658
659 if (!root) {
660 open_root_inode(new C_MDS_RetryOpenRoot(this));
661 return;
662 }
663 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
664 ceph_assert(root->is_auth());
665 CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
666 ceph_assert(rootdir);
667 if (!rootdir->is_subtree_root())
668 adjust_subtree_auth(rootdir, mds->get_nodeid());
669 if (!rootdir->is_complete()) {
670 rootdir->fetch(new C_MDS_RetryOpenRoot(this));
671 return;
672 }
673 } else {
674 ceph_assert(!root->is_auth());
675 CDir *rootdir = root->get_dirfrag(frag_t());
676 if (!rootdir) {
677 open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
678 return;
679 }
680 }
681
682 if (!myin) {
683 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
684 in->fetch(new C_MDS_RetryOpenRoot(this));
685 return;
686 }
687 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
688 ceph_assert(mydir);
689 adjust_subtree_auth(mydir, mds->get_nodeid());
690
691 populate_mydir();
692 }
693
/**
 * Make sure the mdsdir dirfrag and all NUM_STRAY stray directories are
 * loaded (creating missing strays), then mark the cache open.
 *
 * The function is restartable: whenever it has to wait for a fetch or a
 * journaled creation it passes a C_MDS_RetryOpenRoot and returns; that
 * context calls open_root(), which ends up here again.
 */
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it. Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    // Missing dentry or dentry without an inode: create the stray directory
    // and come back once the creation has been journaled.
    if (!straydn || !straydn->get_linkage()->get_inode()) {
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
			  new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
	dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      // Version 0 is treated as "not loaded yet": fetch and restart.
      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      // Sum the fragment sizes to seed the stray manager's counter below.
      if (dir->get_frag_size() > 0)
	num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}
778
779 void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
780 {
781 discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
782 }
783
784 CDir *MDCache::get_stray_dir(CInode *in)
785 {
786 string straydname;
787 in->name_stray_dentry(straydname);
788
789 CInode *strayi = get_stray();
790 ceph_assert(strayi);
791 frag_t fg = strayi->pick_dirfrag(straydname);
792 CDir *straydir = strayi->get_dirfrag(fg);
793 ceph_assert(straydir);
794 return straydir;
795 }
796
797 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
798 {
799 CDir *straydir = get_stray_dir(in);
800 string straydname;
801 in->name_stray_dentry(straydname);
802 CDentry *straydn = straydir->lookup(straydname);
803 if (!straydn) {
804 straydn = straydir->add_null_dentry(straydname);
805 straydn->mark_new();
806 } else {
807 ceph_assert(straydn->get_projected_linkage()->is_null());
808 }
809
810 straydn->state_set(CDentry::STATE_STRAY);
811 return straydn;
812 }
813
814
815
816 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
817 {
818 // inode?
819 if (info.ino)
820 return get_inode(info.ino, info.snapid);
821
822 // dir or dentry.
823 CDir *dir = get_dirfrag(info.dirfrag);
824 if (!dir) return 0;
825
826 if (info.dname.length())
827 return dir->lookup(info.dname, info.snapid);
828 else
829 return dir;
830 }
831
832
833
834
835 // ====================================================================
836 // subtree management
837
/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 *
 * If dir is not yet a subtree root, it becomes one: it gets an (initially
 * empty) bounds entry, takes over those bounds of its former root that lie
 * beneath it, and is itself recorded as a bound of that former root.
 *
 * dir        dirfrag whose authority is changed
 * auth       new authority
 * adjust_pop when set and dir is auth, subtract dir's pop_auth_subtree from
 *            its ancestors up to and including the enclosing subtree root
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir << dendl;

  show_subtrees();

  // Find the subtree root currently containing dir.
  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << "  new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (the successor is saved before a possible erase so that iteration
    // can continue afterwards)
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << "  claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      // walk up from the parent dirfrag, stopping after the first subtree root
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.sub(dir->pop_auth_subtree);
	if (p->is_subtree_root()) break;
	p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}
910
911
912 void MDCache::try_subtree_merge(CDir *dir)
913 {
914 dout(7) << "try_subtree_merge " << *dir << dendl;
915 // record my old bounds
916 auto oldbounds = subtrees.at(dir);
917
918 set<CInode*> to_eval;
919 // try merge at my root
920 try_subtree_merge_at(dir, &to_eval);
921
922 // try merge at my old bounds
923 for (auto bound : oldbounds)
924 try_subtree_merge_at(bound, &to_eval);
925
926 if (!(mds->is_any_replay() || mds->is_resolve())) {
927 for(auto in : to_eval)
928 eval_subtree_root(in);
929 }
930 }
931
932 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
933 CInode *in;
934 MutationRef mut;
935 public:
936 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
937 void finish(int r) override {
938 mdcache->subtree_merge_writebehind_finish(in, mut);
939 }
940 };
941
/**
 * Merge the subtree rooted at @p dir into its parent subtree if possible.
 *
 * Nothing happens when the second component of dir's dir_auth is not
 * CDIR_AUTH_UNKNOWN, when dir is an export bound or an aux subtree, when dir
 * has no parent subtree, or when the parent's dir_auth differs. Otherwise
 * dir's bounds move to the parent and dir stops being a subtree root.
 *
 * @param dir         subtree root to merge; must be present in subtrees
 * @param to_eval     optional; receives dir's inode when that inode is auth
 * @param adjust_pop  when set and dir is auth, add dir's pop_auth_subtree
 *                    back to its ancestors up to the enclosing subtree root
 */
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;  
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());
  
  if (parent != dir &&				// we have a parent,
      parent->dir_auth == dir->dir_auth) {	// auth matches,
    // merge with parent.
    dout(10) << "  subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);
    
    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());
    
    // we are no longer a subtree or bound
    // (the bounds were copied above, so it is safe to erase our entry now)
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      // walk up from the parent dirfrag, stopping after the first subtree root
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.add(dir->pop_auth_subtree);
	p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
	if (p->is_subtree_root()) break;
	cur = p;
	p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}
992
993 void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
994 {
995 dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
996 in->pop_and_dirty_projected_inode(mut->ls);
997
998 mut->apply();
999 mds->locker->drop_locks(mut.get());
1000 mut->cleanup();
1001
1002 in->auth_unpin(this);
1003 }
1004
1005 void MDCache::eval_subtree_root(CInode *diri)
1006 {
1007 // evaluate subtree inode filelock?
1008 // (we should scatter the filelock on subtree bounds)
1009 ceph_assert(diri->is_auth());
1010 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1011 }
1012
1013
1014 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
1015 {
1016 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1017 << " on " << *dir
1018 << " bounds " << bounds
1019 << dendl;
1020
1021 show_subtrees();
1022
1023 CDir *root;
1024 if (dir->ino() == MDS_INO_ROOT) {
1025 root = dir; // bootstrap hack.
1026 if (subtrees.count(root) == 0) {
1027 subtrees[root];
1028 root->get(CDir::PIN_SUBTREE);
1029 }
1030 } else {
1031 root = get_subtree_root(dir); // subtree root
1032 }
1033 ceph_assert(root);
1034 ceph_assert(subtrees.count(root));
1035 dout(7) << " current root is " << *root << dendl;
1036
1037 mds_authority_t oldauth = dir->authority();
1038
1039 if (root == dir) {
1040 // i am already a subtree.
1041 dir->set_dir_auth(auth);
1042 } else {
1043 // i am a new subtree.
1044 dout(10) << " new subtree at " << *dir << dendl;
1045 ceph_assert(subtrees.count(dir) == 0);
1046 subtrees[dir]; // create empty subtree bounds list for me.
1047 dir->get(CDir::PIN_SUBTREE);
1048
1049 // set dir_auth
1050 dir->set_dir_auth(auth);
1051
1052 // move items nested beneath me, under me.
1053 set<CDir*>::iterator p = subtrees[root].begin();
1054 while (p != subtrees[root].end()) {
1055 set<CDir*>::iterator next = p;
1056 ++next;
1057 if (get_subtree_root((*p)->get_parent_dir()) == dir) {
1058 // move under me
1059 dout(10) << " claiming child bound " << **p << dendl;
1060 subtrees[dir].insert(*p);
1061 subtrees[root].erase(p);
1062 }
1063 p = next;
1064 }
1065
1066 // i am a bound of the parent subtree.
1067 subtrees[root].insert(dir);
1068
1069 // i am now the subtree root.
1070 root = dir;
1071 }
1072
1073 set<CInode*> to_eval;
1074
1075 // verify/adjust bounds.
1076 // - these may be new, or
1077 // - beneath existing ambiguous bounds (which will be collapsed),
1078 // - but NOT beneath unambiguous bounds.
1079 for (const auto& bound : bounds) {
1080 // new bound?
1081 if (subtrees[dir].count(bound) == 0) {
1082 if (get_subtree_root(bound) == dir) {
1083 dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
1084 adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound.
1085 }
1086 else {
1087 dout(10) << " want bound " << *bound << dendl;
1088 CDir *t = get_subtree_root(bound->get_parent_dir());
1089 if (subtrees[t].count(bound) == 0) {
1090 ceph_assert(t != dir);
1091 dout(10) << " new bound " << *bound << dendl;
1092 adjust_subtree_auth(bound, t->authority());
1093 }
1094 // make sure it's nested beneath ambiguous subtree(s)
1095 while (1) {
1096 while (subtrees[dir].count(t) == 0)
1097 t = get_subtree_root(t->get_parent_dir());
1098 dout(10) << " swallowing intervening subtree at " << *t << dendl;
1099 adjust_subtree_auth(t, auth);
1100 try_subtree_merge_at(t, &to_eval);
1101 t = get_subtree_root(bound->get_parent_dir());
1102 if (t == dir) break;
1103 }
1104 }
1105 }
1106 else {
1107 dout(10) << " already have bound " << *bound << dendl;
1108 }
1109 }
1110 // merge stray bounds?
1111 while (!subtrees[dir].empty()) {
1112 set<CDir*> copy = subtrees[dir];
1113 for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
1114 if (bounds.count(*p) == 0) {
1115 CDir *stray = *p;
1116 dout(10) << " swallowing extra subtree at " << *stray << dendl;
1117 adjust_subtree_auth(stray, auth);
1118 try_subtree_merge_at(stray, &to_eval);
1119 }
1120 }
1121 // swallowing subtree may add new subtree bounds
1122 if (copy == subtrees[dir])
1123 break;
1124 }
1125
1126 // bound should now match.
1127 verify_subtree_bounds(dir, bounds);
1128
1129 show_subtrees();
1130
1131 if (!(mds->is_any_replay() || mds->is_resolve())) {
1132 for(auto in : to_eval)
1133 eval_subtree_root(in);
1134 }
1135 }
1136
1137
1138 /*
1139 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1140 * fragmentation as necessary to get an equivalent bounding set. That is, only
1141 * split if one of our frags spans the provided bounding set. Never merge.
1142 */
1143 void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
1144 {
1145 dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
1146
1147 // sort by ino
1148 map<inodeno_t, fragset_t> byino;
1149 for (auto& frag : dfs) {
1150 byino[frag.ino].insert_raw(frag.frag);
1151 }
1152 dout(10) << " by ino: " << byino << dendl;
1153
1154 for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
1155 p->second.simplify();
1156 CInode *diri = get_inode(p->first);
1157 if (!diri)
1158 continue;
1159 dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
1160
1161 fragtree_t tmpdft;
1162 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
1163 tmpdft.force_to_leaf(g_ceph_context, *q);
1164
1165 for (const auto& fg : p->second) {
1166 frag_vec_t leaves;
1167 diri->dirfragtree.get_leaves_under(fg, leaves);
1168 if (leaves.empty()) {
1169 bool all = true;
1170 frag_t approx_fg = diri->dirfragtree[fg.value()];
1171 frag_vec_t approx_leaves;
1172 tmpdft.get_leaves_under(approx_fg, approx_leaves);
1173 for (const auto& leaf : approx_leaves) {
1174 if (p->second.get().count(leaf) == 0) {
1175 // not bound, so the resolve message is from auth MDS of the dirfrag
1176 force_dir_fragment(diri, leaf);
1177 all = false;
1178 }
1179 }
1180 if (all)
1181 leaves.push_back(approx_fg);
1182 else
1183 diri->dirfragtree.get_leaves_under(fg, leaves);
1184 }
1185 dout(10) << " frag " << fg << " contains " << leaves << dendl;
1186 for (const auto& leaf : leaves) {
1187 CDir *dir = diri->get_dirfrag(leaf);
1188 if (dir)
1189 bounds.insert(dir);
1190 }
1191 }
1192 }
1193 }
1194
1195 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1196 {
1197 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1198 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1199
1200 set<CDir*> bounds;
1201 get_force_dirfrag_bound_set(bound_dfs, bounds);
1202 adjust_bounded_subtree_auth(dir, bounds, auth);
1203 }
1204
1205 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1206 {
1207 dout(10) << "map_dirfrag_set " << dfs << dendl;
1208
1209 // group by inode
1210 map<inodeno_t, fragset_t> ino_fragset;
1211 for (const auto &df : dfs) {
1212 ino_fragset[df.ino].insert_raw(df.frag);
1213 }
1214 // get frags
1215 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1216 p != ino_fragset.end();
1217 ++p) {
1218 p->second.simplify();
1219 CInode *in = get_inode(p->first);
1220 if (!in)
1221 continue;
1222
1223 frag_vec_t fgs;
1224 for (const auto& fg : p->second) {
1225 in->dirfragtree.get_leaves_under(fg, fgs);
1226 }
1227
1228 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1229 << " on " << *in << dendl;
1230
1231 for (const auto& fg : fgs) {
1232 CDir *dir = in->get_dirfrag(fg);
1233 if (dir)
1234 result.insert(dir);
1235 }
1236 }
1237 }
1238
1239
1240
1241 CDir *MDCache::get_subtree_root(CDir *dir)
1242 {
1243 // find the underlying dir that delegates (or is about to delegate) auth
1244 while (true) {
1245 if (dir->is_subtree_root())
1246 return dir;
1247 dir = dir->get_inode()->get_parent_dir();
1248 if (!dir)
1249 return 0; // none
1250 }
1251 }
1252
1253 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1254 {
1255 // find the underlying dir that delegates (or is about to delegate) auth
1256 while (true) {
1257 if (dir->is_subtree_root())
1258 return dir;
1259 dir = dir->get_inode()->get_projected_parent_dir();
1260 if (!dir)
1261 return 0; // none
1262 }
1263 }
1264
1265 void MDCache::remove_subtree(CDir *dir)
1266 {
1267 dout(10) << "remove_subtree " << *dir << dendl;
1268 ceph_assert(subtrees.count(dir));
1269 ceph_assert(subtrees[dir].empty());
1270 subtrees.erase(dir);
1271 dir->put(CDir::PIN_SUBTREE);
1272 if (dir->get_parent_dir()) {
1273 CDir *p = get_subtree_root(dir->get_parent_dir());
1274 ceph_assert(subtrees[p].count(dir));
1275 subtrees[p].erase(dir);
1276 }
1277 }
1278
1279 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1280 {
1281 ceph_assert(subtrees.count(dir));
1282 bounds = subtrees[dir];
1283 }
1284
1285 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1286 {
1287 if (subtrees.count(dir)) {
1288 // just copy them, dir is a subtree.
1289 get_subtree_bounds(dir, bounds);
1290 } else {
1291 // find them
1292 CDir *root = get_subtree_root(dir);
1293 for (set<CDir*>::iterator p = subtrees[root].begin();
1294 p != subtrees[root].end();
1295 ++p) {
1296 CDir *t = *p;
1297 while (t != root) {
1298 t = t->get_parent_dir();
1299 ceph_assert(t);
1300 if (t == dir) {
1301 bounds.insert(*p);
1302 continue;
1303 }
1304 }
1305 }
1306 }
1307 }
1308
1309 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1310 {
1311 // for debugging only.
1312 ceph_assert(subtrees.count(dir));
1313 if (bounds != subtrees[dir]) {
1314 dout(0) << "verify_subtree_bounds failed" << dendl;
1315 set<CDir*> b = bounds;
1316 for (auto &cd : subtrees[dir]) {
1317 if (bounds.count(cd)) {
1318 b.erase(cd);
1319 continue;
1320 }
1321 dout(0) << " missing bound " << *cd << dendl;
1322 }
1323 for (const auto &cd : b)
1324 dout(0) << " extra bound " << *cd << dendl;
1325 }
1326 ceph_assert(bounds == subtrees[dir]);
1327 }
1328
1329 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1330 {
1331 // for debugging only.
1332 ceph_assert(subtrees.count(dir));
1333
1334 // make sure that any bounds i do have are properly noted as such.
1335 int failed = 0;
1336 for (const auto &fg : bounds) {
1337 CDir *bd = get_dirfrag(fg);
1338 if (!bd) continue;
1339 if (subtrees[dir].count(bd) == 0) {
1340 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1341 failed++;
1342 }
1343 }
1344 ceph_assert(failed == 0);
1345 }
1346
1347 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1348 {
1349 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1350 << " to " << *newdir << dendl;
1351 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1352 }
1353
1354 void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
1355 {
1356 dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
1357
1358 CDir *newdir = diri->get_parent_dir();
1359
1360 if (pop) {
1361 map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
1362 ceph_assert(p != projected_subtree_renames.end());
1363 ceph_assert(!p->second.empty());
1364 ceph_assert(p->second.front().first == olddir);
1365 ceph_assert(p->second.front().second == newdir);
1366 p->second.pop_front();
1367 if (p->second.empty())
1368 projected_subtree_renames.erase(p);
1369 }
1370
1371 // adjust total auth pin of freezing subtree
1372 if (olddir != newdir) {
1373 auto&& dfls = diri->get_nested_dirfrags();
1374 for (const auto& dir : dfls)
1375 olddir->adjust_freeze_after_rename(dir);
1376 }
1377
1378 // adjust subtree
1379 // N.B. make sure subtree dirfrags are at the front of the list
1380 auto dfls = diri->get_subtree_dirfrags();
1381 diri->get_nested_dirfrags(dfls);
1382 for (const auto& dir : dfls) {
1383 dout(10) << "dirfrag " << *dir << dendl;
1384 CDir *oldparent = get_subtree_root(olddir);
1385 dout(10) << " old parent " << *oldparent << dendl;
1386 CDir *newparent = get_subtree_root(newdir);
1387 dout(10) << " new parent " << *newparent << dendl;
1388
1389 auto& oldbounds = subtrees[oldparent];
1390 auto& newbounds = subtrees[newparent];
1391
1392 if (olddir != newdir)
1393 mds->balancer->adjust_pop_for_rename(olddir, dir, false);
1394
1395 if (oldparent == newparent) {
1396 dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
1397 } else if (dir->is_subtree_root()) {
1398 // children are fine. change parent.
1399 dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
1400 {
1401 auto n = oldbounds.erase(dir);
1402 ceph_assert(n == 1);
1403 }
1404 newbounds.insert(dir);
1405 // caller is responsible for 'eval diri'
1406 try_subtree_merge_at(dir, NULL, false);
1407 } else {
1408 // mid-subtree.
1409
1410 // see if any old bounds move to the new parent.
1411 std::vector<CDir*> tomove;
1412 for (const auto& bound : oldbounds) {
1413 CDir *broot = get_subtree_root(bound->get_parent_dir());
1414 if (broot != oldparent) {
1415 ceph_assert(broot == newparent);
1416 tomove.push_back(bound);
1417 }
1418 }
1419 for (const auto& bound : tomove) {
1420 dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
1421 oldbounds.erase(bound);
1422 newbounds.insert(bound);
1423 }
1424
1425 // did auth change?
1426 if (oldparent->authority() != newparent->authority()) {
1427 adjust_subtree_auth(dir, oldparent->authority(), false);
1428 // caller is responsible for 'eval diri'
1429 try_subtree_merge_at(dir, NULL, false);
1430 }
1431 }
1432
1433 if (olddir != newdir)
1434 mds->balancer->adjust_pop_for_rename(newdir, dir, true);
1435 }
1436
1437 show_subtrees();
1438 }
1439
1440 // ===================================
1441 // journal and snap/cow helpers
1442
1443
1444 /*
1445 * find first inode in cache that follows given snapid. otherwise, return current.
1446 */
1447 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1448 {
1449 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1450 ceph_assert(in->last == CEPH_NOSNAP);
1451
1452 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1453 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1454 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1455 in = p->second;
1456 }
1457
1458 return in;
1459 }
1460
1461
1462 /*
1463 * note: i'm currently cheating wrt dirty and inode.version on cow
1464 * items. instead of doing a full dir predirty, i just take the
1465 * original item's version, and set the dirty flag (via
1466 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1467 * means a special case in the dir commit clean sweep assertions.
1468 * bah.
1469 */
1470 CInode *MDCache::cow_inode(CInode *in, snapid_t last)
1471 {
1472 ceph_assert(last >= in->first);
1473
1474 CInode *oldin = new CInode(this, true, in->first, last);
1475 oldin->inode = *in->get_previous_projected_inode();
1476 oldin->xattrs = *in->get_previous_projected_xattrs();
1477 oldin->symlink = in->symlink;
1478 oldin->inode.trim_client_ranges(last);
1479
1480 if (in->first < in->oldest_snap)
1481 in->oldest_snap = in->first;
1482
1483 in->first = last+1;
1484
1485 dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
1486 add_inode(oldin);
1487
1488 if (in->last != CEPH_NOSNAP) {
1489 CInode *head_in = get_inode(in->ino());
1490 ceph_assert(head_in);
1491 auto ret = head_in->split_need_snapflush(oldin, in);
1492 if (ret.first) {
1493 oldin->client_snap_caps = in->client_snap_caps;
1494 if (!oldin->client_snap_caps.empty()) {
1495 for (int i = 0; i < num_cinode_locks; i++) {
1496 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1497 ceph_assert(lock);
1498 if (lock->get_state() != LOCK_SNAP_SYNC) {
1499 ceph_assert(lock->is_stable());
1500 lock->set_state(LOCK_SNAP_SYNC); // gathering
1501 oldin->auth_pin(lock);
1502 }
1503 lock->get_wrlock(true);
1504 }
1505 }
1506 }
1507 if (!ret.second) {
1508 auto client_snap_caps = std::move(in->client_snap_caps);
1509 in->client_snap_caps.clear();
1510 in->item_open_file.remove_myself();
1511 in->item_caps.remove_myself();
1512
1513 if (!client_snap_caps.empty()) {
1514 MDSContext::vec finished;
1515 for (int i = 0; i < num_cinode_locks; i++) {
1516 SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
1517 ceph_assert(lock);
1518 ceph_assert(lock->get_state() == LOCK_SNAP_SYNC); // gathering
1519 lock->put_wrlock();
1520 if (!lock->get_num_wrlocks()) {
1521 lock->set_state(LOCK_SYNC);
1522 lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
1523 in->auth_unpin(lock);
1524 }
1525 }
1526 mds->queue_waiters(finished);
1527 }
1528 }
1529 return oldin;
1530 }
1531
1532 if (!in->client_caps.empty()) {
1533 const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
1534 // clone caps?
1535 for (auto &p : in->client_caps) {
1536 client_t client = p.first;
1537 Capability *cap = &p.second;
1538 int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
1539 if ((issued & CEPH_CAP_ANY_WR) &&
1540 cap->client_follows < last) {
1541 dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
1542 oldin->client_snap_caps.insert(client);
1543 cap->client_follows = last;
1544
1545 // we need snapflushes for any intervening snaps
1546 dout(10) << " snaps " << snaps << dendl;
1547 for (auto q = snaps.lower_bound(oldin->first);
1548 q != snaps.end() && *q <= last;
1549 ++q) {
1550 in->add_need_snapflush(oldin, *q, client);
1551 }
1552 } else {
1553 dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
1554 }
1555 }
1556
1557 if (!oldin->client_snap_caps.empty()) {
1558 for (int i = 0; i < num_cinode_locks; i++) {
1559 SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
1560 ceph_assert(lock);
1561 if (lock->get_state() != LOCK_SNAP_SYNC) {
1562 ceph_assert(lock->is_stable());
1563 lock->set_state(LOCK_SNAP_SYNC); // gathering
1564 oldin->auth_pin(lock);
1565 }
1566 lock->get_wrlock(true);
1567 }
1568 }
1569 }
1570 return oldin;
1571 }
1572
/*
 * Copy-on-write a dentry (and, for a primary linkage, its inode) so that the
 * state up to and including snapid 'follows' is preserved.  Old dentries are
 * added to 'metablob' and registered with 'mut'.
 *
 * @param mut        mutation that records the cow'd dentries/inodes
 * @param metablob   journal blob that receives the old dentries
 * @param dn         dentry to cow; must be auth.  A null pointer is logged
 *                   and ignored.
 * @param follows    last snapid the old version has to cover; CEPH_NOSNAP
 *                   means "use the global snaprealm's newest seq"
 * @param pcow_inode if non-null, receives the inode created by cow_inode();
 *                   it is only written on the path that calls cow_inode()
 * @param dnl        linkage to work on; dn's projected linkage is used when
 *                   this is null.  The linkage must not be null.
 *
 * Nothing is cloned when 'follows' is older than the item's 'first', or when
 * the snaprealm has no snapshot in the affected range (in the latter case
 * 'first' is just moved forward).
 */
1573 void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
1574 CDentry *dn, snapid_t follows,
1575 CInode **pcow_inode, CDentry::linkage_t *dnl)
1576 {
1577 if (!dn) {
1578 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
1579 return;
1580 }
1581 dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
1582 ceph_assert(dn->is_auth());
1583
1584 // nothing to cow on a null dentry, fix caller
1585 if (!dnl)
1586 dnl = dn->get_projected_linkage();
1587 ceph_assert(!dnl->is_null());
1588
// only a primary linkage gives us an inode to work on; otherwise in == NULL
1589 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
1590 bool cow_head = false;
// an ambiguous-auth inode must be frozen here; cow_head is then passed on to
// cow_old_inode() below
1591 if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
1592 ceph_assert(in->is_frozen_inode());
1593 cow_head = true;
1594 }
1595 if (in && (in->is_multiversion() || cow_head)) {
1596 // multiversion inode.
1597 SnapRealm *realm = NULL;
1598
// dn is not the inode's projected parent dentry: the old link is preserved
// as a remote dentry in dn's directory, using that directory's snaprealm
1599 if (in->get_projected_parent_dn() != dn) {
1600 ceph_assert(follows == CEPH_NOSNAP);
1601 realm = dn->dir->inode->find_snaprealm();
1602 snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
1603 ceph_assert(dir_follows >= realm->get_newest_seq());
1604
1605 if (dir_follows+1 > dn->first) {
1606 snapid_t oldfirst = dn->first;
1607 dn->first = dir_follows+1;
1608 if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
1609 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
1610 oldfirst, dir_follows);
1611 olddn->pre_dirty();
1612 dout(10) << " olddn " << *olddn << dendl;
1613 metablob->add_remote_dentry(olddn, true);
1614 mut->add_cow_dentry(olddn);
1615 // FIXME: adjust link count here? hmm.
1616
1617 if (dir_follows+1 > in->first)
1618 in->cow_old_inode(dir_follows, cow_head);
1619 }
1620 }
1621
// from here on work with the directory's 'follows'; the inode's own
// snaprealm (if it has one) takes precedence for the range check below
1622 follows = dir_follows;
1623 if (in->snaprealm) {
1624 realm = in->snaprealm;
1625 ceph_assert(follows >= realm->get_newest_seq());
1626 }
1627 } else {
1628 realm = in->find_snaprealm();
1629 if (follows == CEPH_NOSNAP) {
1630 follows = get_global_snaprealm()->get_newest_seq();
1631 ceph_assert(follows >= realm->get_newest_seq());
1632 }
1633 }
1634
1635 // already cloned?
1636 if (follows < in->first) {
1637 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
1638 return;
1639 }
1640
1641 if (!realm->has_snaps_in_range(in->first, follows)) {
1642 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
1643 in->first = follows + 1;
1644 return;
1645 }
1646
1647 in->cow_old_inode(follows, cow_head);
1648
1649 } else {
// not a multiversion inode (or a remote linkage): cow the dentry itself,
// and the inode too if the linkage is primary
1650 SnapRealm *realm = dn->dir->inode->find_snaprealm();
1651 if (follows == CEPH_NOSNAP) {
1652 follows = get_global_snaprealm()->get_newest_seq();
1653 ceph_assert(follows >= realm->get_newest_seq());
1654 }
1655
1656 // already cloned?
1657 if (follows < dn->first) {
1658 dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
1659 return;
1660 }
1661
1662 // update dn.first before adding old dentry to cdir's map
1663 snapid_t oldfirst = dn->first;
1664 dn->first = follows+1;
1665
1666 if (!realm->has_snaps_in_range(oldfirst, follows)) {
1667 dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
1668 if (in)
1669 in->first = follows+1;
1670 return;
1671 }
1672
1673 dout(10) << " dn " << *dn << dendl;
1674 if (in) {
// primary linkage: clone the inode and link the clone under an old primary
// dentry covering [oldfirst, follows]
1675 CInode *oldin = cow_inode(in, follows);
1676 mut->add_cow_inode(oldin);
1677 if (pcow_inode)
1678 *pcow_inode = oldin;
1679 CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
1680 oldin->inode.version = olddn->pre_dirty();
1681 dout(10) << " olddn " << *olddn << dendl;
// the clone still has client snap caps outstanding: record it in the log
// segment's open_files list and tell the locker
1682 bool need_snapflush = !oldin->client_snap_caps.empty();
1683 if (need_snapflush) {
1684 mut->ls->open_files.push_back(&oldin->item_open_file);
1685 mds->locker->mark_need_snapflush_inode(oldin);
1686 }
1687 metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
1688 mut->add_cow_dentry(olddn);
1689 } else {
1690 ceph_assert(dnl->is_remote());
1691 CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
1692 oldfirst, follows);
1693 olddn->pre_dirty();
1694 dout(10) << " olddn " << *olddn << dendl;
1695 metablob->add_remote_dentry(olddn, true);
1696 mut->add_cow_dentry(olddn);
1697 }
1698 }
1699 }
1700
1701
1702 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1703 CInode *in, snapid_t follows,
1704 CInode **pcow_inode)
1705 {
1706 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1707 CDentry *dn = in->get_projected_parent_dn();
1708 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1709 }
1710
1711 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1712 {
1713 if (in->is_base()) {
1714 metablob->add_root(true, in);
1715 } else {
1716 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1717 follows = in->first - 1;
1718 CDentry *dn = in->get_projected_parent_dn();
1719 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1720 journal_cow_dentry(mut, metablob, dn, follows);
1721 if (in->get_projected_inode()->is_backtrace_updated()) {
1722 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1723 in->get_previous_projected_inode()->layout.pool_id;
1724 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1725 } else {
1726 metablob->add_primary_dentry(dn, in, true);
1727 }
1728 }
1729 }
1730
1731
1732
1733 // nested ---------------------------------------------------------------
1734
1735 void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
1736 int linkunlink, SnapRealm *prealm)
1737 {
1738 CDentry *parentdn = cur->get_projected_parent_dn();
1739 CInode::mempool_inode *curi = cur->get_projected_inode();
1740
1741 if (cur->first > first)
1742 first = cur->first;
1743
1744 dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
1745 << " " << *cur << dendl;
1746 dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
1747 dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
1748
1749 /*
1750 * FIXME. this incompletely propagates rstats to _old_ parents
1751 * (i.e. shortly after a directory rename). but we need full
1752 * blown hard link backpointers to make this work properly...
1753 */
1754 snapid_t floor = parentdn->first;
1755 dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
1756
1757 if (!prealm)
1758 prealm = parent->inode->find_snaprealm();
1759 const set<snapid_t> snaps = prealm->get_snaps();
1760
1761 if (cur->last != CEPH_NOSNAP) {
1762 ceph_assert(cur->dirty_old_rstats.empty());
1763 set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
1764 if (q == snaps.end() || *q > cur->last)
1765 return;
1766 }
1767
1768 if (cur->last >= floor) {
1769 bool update = true;
1770 if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
1771 // rename src inode is not projected in the slave rename prep case. so we should
1772 // avoid updateing the inode.
1773 ceph_assert(linkunlink < 0);
1774 ceph_assert(cur->is_frozen_inode());
1775 update = false;
1776 }
1777 _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
1778 linkunlink, update);
1779 }
1780
1781 if (g_conf()->mds_snap_rstat) {
1782 for (const auto &p : cur->dirty_old_rstats) {
1783 auto &old = cur->old_inodes[p];
1784 snapid_t ofirst = std::max(old.first, floor);
1785 auto it = snaps.lower_bound(ofirst);
1786 if (it == snaps.end() || *it > p)
1787 continue;
1788 if (p >= floor)
1789 _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
1790 }
1791 }
1792 cur->dirty_old_rstats.clear();
1793 }
1794
1795
/*
 * Apply an inode's rstat change to the parent dirfrag for the snapid
 * interval [ofirst,last].
 *
 * The delta is selected by 'linkunlink':
 *    0  -> inode.rstat - inode.accounted_rstat
 *   <0  -> -inode.accounted_rstat
 *   >0  -> +inode.rstat
 *
 * The interval is consumed from 'last' downwards, one segment per loop
 * iteration.  The head segment (last == CEPH_NOSNAP) goes to the parent's
 * projected fnode; snapshotted segments go to entries of
 * parent->dirty_old_rstat, which are created or split as needed.  With
 * mds_snap_rstat disabled, snapshotted segments are dropped.
 *
 * If 'update_inode' is set, inode.accounted_rstat is set to inode.rstat.
 */
1796 void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
1797 CDir *parent, int linkunlink, bool update_inode)
1798 {
1799 dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
1800 dout(20) << " inode rstat " << inode.rstat << dendl;
1801 dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
1802 nest_info_t delta;
1803 if (linkunlink == 0) {
1804 delta.add(inode.rstat);
1805 delta.sub(inode.accounted_rstat);
1806 } else if (linkunlink < 0) {
1807 delta.sub(inode.accounted_rstat);
1808 } else {
1809 delta.add(inode.rstat);
1810 }
1811 dout(20) << " delta " << delta << dendl;
1812
// done before the loop as well as inside it: the loop can leave early (the
// mds_snap_rstat-disabled break) before reaching its own assignment
1813 if (update_inode)
1814 inode.accounted_rstat = inode.rstat;
1815
1816 while (last >= ofirst) {
1817 /*
1818 * pick fnode version to update. at each iteration, we want to
1819 * pick a segment ending in 'last' to update. split as necessary
1820 * to make that work. then, adjust first up so that we only
1821 * update one segment at a time. then loop to cover the whole
1822 * [ofirst,last] interval.
1823 */
1824 nest_info_t *prstat;
1825 snapid_t first;
1826 fnode_t *pf = parent->get_projected_fnode();
1827 if (last == CEPH_NOSNAP) {
// head segment: the target is the projected fnode's rstat
1828 if (g_conf()->mds_snap_rstat)
1829 first = std::max(ofirst, parent->first);
1830 else
1831 first = parent->first;
1832 prstat = &pf->rstat;
1833 dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
1834
// the head range is about to shrink to [first,head]; if the fnode still has
// unaccounted rstat, preserve it for [parent->first, first-1] first
1835 if (first > parent->first &&
1836 !(pf->rstat == pf->accounted_rstat)) {
1837 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1838 << parent->first << "," << (first-1) << "] "
1839 << " " << *prstat << "/" << pf->accounted_rstat
1840 << dendl;
1841 parent->dirty_old_rstat[first-1].first = parent->first;
1842 parent->dirty_old_rstat[first-1].rstat = pf->rstat;
1843 parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
1844 }
1845 parent->first = first;
1846 } else if (!g_conf()->mds_snap_rstat) {
1847 // drop snapshots' rstats
1848 break;
1849 } else if (last >= parent->first) {
// snapshotted segment that overlaps the current head range: split a new
// dirty_old_rstat entry [parent->first,last] off the projected fnode
1850 first = parent->first;
1851 parent->dirty_old_rstat[last].first = first;
1852 parent->dirty_old_rstat[last].rstat = pf->rstat;
1853 parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
1854 prstat = &parent->dirty_old_rstat[last].rstat;
1855 dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
1856 << " " << *prstat << "/" << pf->accounted_rstat << dendl;
1857 } else {
1858 // be careful, dirty_old_rstat is a _sparse_ map.
1859 // sorry, this is ugly.
1860 first = ofirst;
1861
1862 // find any intersection with last
1863 auto it = parent->dirty_old_rstat.lower_bound(last);
1864 if (it == parent->dirty_old_rstat.end()) {
1865 dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
1866 if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
1867 dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
1868 first = parent->dirty_old_rstat.rbegin()->first+1;
1869 }
1870 } else {
1871 // *it last is >= last
1872 if (it->second.first <= last) {
1873 // *it intersects [first,last]
1874 if (it->second.first < first) {
1875 dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
1876 parent->dirty_old_rstat[first-1] = it->second;
1877 it->second.first = first;
1878 }
1879 if (it->second.first > first)
1880 first = it->second.first;
1881 if (last < it->first) {
1882 dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
1883 parent->dirty_old_rstat[last] = it->second;
1884 it->second.first = last+1;
1885 }
1886 } else {
1887 // *it is to the _right_ of [first,last]
1888 it = parent->dirty_old_rstat.lower_bound(first);
1889 // new *it last is >= first
1890 if (it->second.first <= last && // new *it isn't also to the right, and
1891 it->first >= first) { // it intersects our first bit,
1892 dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
1893 first = it->first+1;
1894 }
1895 dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
1896 }
1897 }
1898 dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
1899 parent->dirty_old_rstat[last].first = first;
1900 prstat = &parent->dirty_old_rstat[last].rstat;
1901 }
1902
1903 // apply
1904 dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
1905 ceph_assert(last >= first);
1906 prstat->add(delta);
1907 if (update_inode)
1908 inode.accounted_rstat = inode.rstat;
1909 dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
1910
// continue with whatever part of [ofirst,last] lies below the segment that
// was just updated
1911 last = first-1;
1912 }
1913 }
1914
/*
 * Apply a dirfrag's unaccounted rstat (rstat - accounted_rstat) to the inode
 * 'pin' for the snapid interval [ofirst,last].
 *
 * The interval is consumed from 'last' downwards, one segment per loop
 * iteration.  A segment that ends at pin->last is applied to the projected
 * inode; older segments are applied to entries of pin->old_inodes, which are
 * created via cow_old_inode() or split as needed and recorded in
 * pin->dirty_old_rstats.  The walk stops early when no old_inode covers the
 * remaining range.
 *
 * 'rstat' and 'accounted_rstat' are only read here.  'cow_head' is passed
 * through to cow_old_inode().
 */
1915 void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
1916 snapid_t ofirst, snapid_t last,
1917 CInode *pin, bool cow_head)
1918 {
1919 dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
1920 dout(20) << " frag rstat " << rstat << dendl;
1921 dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
1922 nest_info_t delta = rstat;
1923 delta.sub(accounted_rstat);
1924 dout(20) << " delta " << delta << dendl;
1925
1926 while (last >= ofirst) {
1927 CInode::mempool_inode *pi;
1928 snapid_t first;
1929 if (last == pin->last) {
// segment ends at the inode's own 'last': update the projected inode, and
// cow the older part first if the segment starts after pin->first
1930 pi = pin->get_projected_inode();
1931 first = std::max(ofirst, pin->first);
1932 if (first > pin->first) {
1933 auto &old = pin->cow_old_inode(first-1, cow_head);
1934 dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
1935 }
1936 } else {
1937 if (last >= pin->first) {
// segment ends inside the inode's current range: cow an old_inode at 'last'
1938 first = pin->first;
1939 pin->cow_old_inode(last, cow_head);
1940 } else {
1941 // our life is easier here because old_inodes is not sparse
1942 // (although it may not begin at snapid 1)
1943 auto it = pin->old_inodes.lower_bound(last);
1944 if (it == pin->old_inodes.end()) {
1945 dout(10) << " no old_inode <= " << last << ", done." << dendl;
1946 break;
1947 }
1948 first = it->second.first;
1949 if (first > last) {
1950 dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
1951 //assert(p == pin->old_inodes.begin());
1952 break;
1953 }
1954 if (it->first > last) {
1955 dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
1956 << (last+1) << "," << it->first << "]" << dendl;
1957 pin->old_inodes[last] = it->second;
1958 it->second.first = last+1;
1959 pin->dirty_old_rstats.insert(it->first);
1960 }
1961 }
// the chosen old_inode starts before the requested interval: keep the part
// below ofirst as a separate old_inode
1962 if (first < ofirst) {
1963 dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
1964 << first << "," << ofirst-1 << "]" << dendl;
1965 pin->old_inodes[ofirst-1] = pin->old_inodes[last];
1966 pin->dirty_old_rstats.insert(ofirst-1);
1967 pin->old_inodes[last].first = first = ofirst;
1968 }
1969 pi = &pin->old_inodes[last].inode;
1970 pin->dirty_old_rstats.insert(last);
1971 }
1972 dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
1973 pi->rstat.add(delta);
1974 dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
1975
// continue with whatever part of [ofirst,last] lies below this segment
1976 last = first-1;
1977 }
1978 }
1979
1980 void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
1981 {
1982 if (!(mds->is_active() || mds->is_stopping()))
1983 return;
1984
1985 if (!in->is_auth() || in->is_frozen())
1986 return;
1987
1988 auto i = in->get_projected_inode();
1989
1990 if (!i->quota.is_enable() &&
1991 !quota_change)
1992 return;
1993
1994 // creaete snaprealm for quota inode (quota was set before mimic)
1995 if (!in->get_projected_srnode())
1996 mds->server->create_quota_realm(in);
1997
1998 for (auto &p : in->client_caps) {
1999 Capability *cap = &p.second;
2000 if (cap->is_noquota())
2001 continue;
2002
2003 if (exclude_ct >= 0 && exclude_ct != p.first)
2004 goto update;
2005
2006 if (cap->last_rbytes == i->rstat.rbytes &&
2007 cap->last_rsize == i->rstat.rsize())
2008 continue;
2009
2010 if (i->quota.max_files > 0) {
2011 if (i->rstat.rsize() >= i->quota.max_files)
2012 goto update;
2013
2014 if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
2015 abs(cap->last_rsize - i->rstat.rsize()))
2016 goto update;
2017 }
2018
2019 if (i->quota.max_bytes > 0) {
2020 if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
2021 goto update;
2022
2023 if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
2024 abs(cap->last_rbytes - i->rstat.rbytes))
2025 goto update;
2026 }
2027
2028 continue;
2029
2030 update:
2031 cap->last_rsize = i->rstat.rsize();
2032 cap->last_rbytes = i->rstat.rbytes;
2033
2034 auto msg = make_message<MClientQuota>();
2035 msg->ino = in->ino();
2036 msg->rstat = i->rstat;
2037 msg->quota = i->quota;
2038 mds->send_message_client_counted(msg, cap->get_session());
2039 }
2040 for (const auto &it : in->get_replicas()) {
2041 auto msg = make_message<MGatherCaps>();
2042 msg->ino = in->ino();
2043 mds->send_message_mds(msg, it.first);
2044 }
2045 }
2046
2047 /*
2048 * NOTE: we _have_ to delay the scatter if we are called during a
2049 * rejoin, because we can't twiddle locks between when the
2050 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2051 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2052 * (no requests), and a survivor acks immediately. _except_ that
2053 * during rejoin_(weak|strong) processing, we may complete a lock
2054 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2055 * scatterlock state in that case or the lock states will get out of
2056 * sync between the auth and replica.
2057 *
2058 * the simple solution is to never do the scatter here. instead, put
2059 * the scatterlock on a list if it isn't already wrlockable. this is
2060 * probably the best plan anyway, since we avoid too many
2061 * scatters/locks under normal usage.
2062 */
2063 /*
2064 * some notes on dirlock/nestlock scatterlock semantics:
2065 *
2066 * the fragstat (dirlock) will never be updated without
2067 * dirlock+nestlock wrlock held by the caller.
2068 *
2069 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2070 * data is pushed up the tree. this could be changed with some
2071 * restructuring here, but in its current form we ensure that the
2072 * fragstat+rstat _always_ reflect an accurate summation over the dir
2073 * frag, which is nice. and, we only need to track frags that need to
2074 * be nudged (and not inodes with pending rstat changes that need to
2075 * be pushed into the frag). a consequence of this is that the
2076 * accounted_rstat on scatterlock sync may not match our current
2077 * rstat. this is normal and expected.
2078 */
/*
 * Starting at 'parent' (the dirfrag containing 'in'), walk up the
 * hierarchy projecting fnode and inode updates (fragstat/rstat) for
 * each ancestor, and finally add the last dirfrag reached plus every
 * inode dirtied along the way to 'blob'.
 *
 * mut        - mutation that collects auth pins, projected fnodes/inodes
 *              and locks taken here
 * blob       - metablob receiving the dir context and dirtied inodes
 * in         - inode whose change is being propagated; nothing beyond
 *              setting the mds stamp is done when in->is_base()
 * parent     - dirfrag to start from; when null the projected parent
 *              dentry's dir is used (only allowed with PREDIRTY_PRIMARY)
 * flags      - PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW bits;
 *              PREDIRTY_SHALLOW is only reported in the log line here
 * linkunlink - delta added to nsubdirs/nfiles of the first parent's
 *              fragstat; reset to 0 for the ancestors above it
 * cfollows   - snapid to follow; CEPH_NOSNAP means use the parent
 *              realm's newest seq
 *
 * The mdlog entry must already be open (asserted).
 */
2079 void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
2080 CInode *in, CDir *parent,
2081 int flags, int linkunlink,
2082 snapid_t cfollows)
2083 {
2084 bool primary_dn = flags & PREDIRTY_PRIMARY;
2085 bool do_parent_mtime = flags & PREDIRTY_DIR;
2086 bool shallow = flags & PREDIRTY_SHALLOW;
2087
2088 ceph_assert(mds->mdlog->entry_is_open());
2089
2090 // make sure stamp is set
2091 if (mut->get_mds_stamp() == utime_t())
2092 mut->set_mds_stamp(ceph_clock_now());
2093
2094 if (in->is_base())
2095 return;
2096
2097 dout(10) << "predirty_journal_parents"
2098 << (do_parent_mtime ? " do_parent_mtime":"")
2099 << " linkunlink=" << linkunlink
2100 << (primary_dn ? " primary_dn":" remote_dn")
2101 << (shallow ? " SHALLOW":"")
2102 << " follows " << cfollows
2103 << " " << *in << dendl;
2104
2105 if (!parent) {
2106 ceph_assert(primary_dn);
2107 parent = in->get_projected_parent_dn()->get_dir();
2108 }
2109
// with no flags and no link count change there is nothing to project:
// only the dir context is recorded.
2110 if (flags == 0 && linkunlink == 0) {
2111 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
2112 blob->add_dir_context(parent);
2113 return;
2114 }
2115
2116 // build list of inodes to wrlock, dirty, and update
2117 list<CInode*> lsi;
2118 CInode *cur = in;
2119 CDentry *parentdn = NULL;
// 'first' is true only for the first loop iteration; it exempts that
// iteration from the mds_dirstat_min_interval delay check below.
2120 bool first = true;
2121 while (parent) {
2122 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2123 ceph_assert(parent->is_auth());
2124
2125 // opportunistically adjust parent dirfrag
2126 CInode *pin = parent->get_inode();
2127
2128 // inode -> dirfrag
2129 mut->auth_pin(parent);
2130 mut->add_projected_fnode(parent);
2131
2132 fnode_t *pf = parent->project_fnode();
2133 pf->version = parent->pre_dirty();
2134
// fragstat changes require the caller to already hold wrlocks on both
// filelock and nestlock, and are only allowed on the head (CEPH_NOSNAP).
2135 if (do_parent_mtime || linkunlink) {
2136 ceph_assert(mut->is_wrlocked(&pin->filelock));
2137 ceph_assert(mut->is_wrlocked(&pin->nestlock));
2138 ceph_assert(cfollows == CEPH_NOSNAP);
2139
2140 // update stale fragstat/rstat?
2141 parent->resync_accounted_fragstat();
2142 parent->resync_accounted_rstat();
2143
2144 if (do_parent_mtime) {
2145 pf->fragstat.mtime = mut->get_op_stamp();
2146 pf->fragstat.change_attr++;
// NOTE(review): this streams the CDir pointer ('parent'), whereas the
// neighbouring messages stream '*parent' -- confirm which is intended.
2147 dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
2148 if (pf->fragstat.mtime > pf->rstat.rctime) {
2149 dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
2150 pf->rstat.rctime = pf->fragstat.mtime;
2151 } else {
2152 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
2153 }
2154 }
2155 if (linkunlink) {
2156 dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
2157 if (in->is_dir()) {
2158 pf->fragstat.nsubdirs += linkunlink;
2159 //pf->rstat.rsubdirs += linkunlink;
2160 } else {
2161 pf->fragstat.nfiles += linkunlink;
2162 //pf->rstat.rfiles += linkunlink;
2163 }
2164 }
2165 }
2166
2167 // rstat
2168 if (!primary_dn) {
2169 // don't update parent this pass
2170 } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
2171 pin->versionlock.can_wrlock())) {
2172 dout(20) << " unwritable parent nestlock " << pin->nestlock
2173 << ", marking dirty rstat on " << *cur << dendl;
2174 cur->mark_dirty_rstat();
2175 } else {
2176 // if we don't hold a wrlock reference on this nestlock, take one,
2177 // because we are about to write into the dirfrag fnode and that needs
2178 // to commit before the lock can cycle.
2179 if (linkunlink) {
2180 ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
2181 }
2182
2183 if (!mut->is_wrlocked(&pin->nestlock)) {
2184 dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
2185 mds->locker->wrlock_force(&pin->nestlock, mut);
2186 }
2187
2188 // now we can project the inode rstat diff the dirfrag
2189 SnapRealm *prealm = pin->find_snaprealm();
2190
2191 snapid_t follows = cfollows;
2192 if (follows == CEPH_NOSNAP)
2193 follows = prealm->get_newest_seq();
2194
// note: this snapid_t 'first' shadows the loop-level 'bool first' for
// the remainder of this else-branch only.
2195 snapid_t first = follows+1;
2196
2197 // first, if the frag is stale, bring it back in sync.
2198 parent->resync_accounted_rstat();
2199
2200 // now push inode rstats into frag
2201 project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
2202 cur->clear_dirty_rstat();
2203 }
2204
// decide whether propagation stops at this dirfrag instead of
// continuing into its inode (pin).
2205 bool stop = false;
2206 if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
2207 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
2208 stop = true;
2209 }
2210
2211 // delay propagating until later?
2212 if (!stop && !first &&
2213 g_conf()->mds_dirstat_min_interval > 0) {
2214 double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
2215 if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
2216 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2217 << " < " << g_conf()->mds_dirstat_min_interval
2218 << ", stopping" << dendl;
2219 stop = true;
2220 } else {
2221 dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
2222 }
2223 }
2224
2225 // can cast only because i'm passing nowait=true in the sole user
2226 if (!stop &&
2227 !mut->is_wrlocked(&pin->nestlock) &&
2228 (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
2229 !mds->locker->wrlock_try(&pin->nestlock, mut)
2230 )) { // ** do not initiate.. see above comment **
2231 dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
2232 << " on " << *pin << dendl;
2233 stop = true;
2234 }
// when stopping, flag the scatterlock(s) as updated and queue the inode
// on the log segment's dirty lists, then leave the loop.
2235 if (stop) {
2236 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
2237 mds->locker->mark_updated_scatterlock(&pin->nestlock);
2238 mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
2239 mut->add_updated_lock(&pin->nestlock);
2240 if (do_parent_mtime || linkunlink) {
2241 mds->locker->mark_updated_scatterlock(&pin->filelock);
2242 mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
2243 mut->add_updated_lock(&pin->filelock);
2244 }
2245 break;
2246 }
2247 if (!mut->is_wrlocked(&pin->versionlock))
2248 mds->locker->local_wrlock_grab(&pin->versionlock, mut);
2249
2250 ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());
2251
2252 pin->last_dirstat_prop = mut->get_mds_stamp();
2253
2254 // dirfrag -> diri
2255 mut->auth_pin(pin);
2256 mut->add_projected_inode(pin);
// push_front: lsi ends up ordered from the topmost ancestor down.
2257 lsi.push_front(pin);
2258
2259 pin->pre_cow_old_inode(); // avoid cow mayhem!
2260
2261 auto &pi = pin->project_inode();
2262 pi.inode.version = pin->pre_dirty();
2263
2264 // dirstat
2265 if (do_parent_mtime || linkunlink) {
2266 dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
2267 dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
2268 bool touched_mtime = false, touched_chattr = false;
2269 pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2270 pf->accounted_fragstat = pf->fragstat;
2271 if (touched_mtime)
2272 pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
2273 if (touched_chattr)
2274 pi.inode.change_attr = pi.inode.dirstat.change_attr;
2275 dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;
2276
2277 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
// !"literal" is always false, so these asserts fire exactly when
// mds_verify_scatter is enabled.
2278 if (pi.inode.dirstat.size() < 0)
2279 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
2280 if (pi.inode.dirstat.size() != pf->fragstat.size()) {
2281 mds->clog->error() << "unmatched fragstat size on single dirfrag "
2282 << parent->dirfrag() << ", inode has " << pi.inode.dirstat
2283 << ", dirfrag has " << pf->fragstat;
2284
2285 // trust the dirfrag for now
2286 pi.inode.dirstat = pf->fragstat;
2287
2288 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
2289 }
2290 }
2291 }
2292
2293 /*
2294 * the rule here is to follow the _oldest_ parent with dirty rstat
2295 * data. if we don't propagate all data, we add ourselves to the
2296 * nudge list. that way all rstat data will (eventually) get
2297 * pushed up the tree.
2298 *
2299 * actually, no. for now, silently drop rstats for old parents. we need
2300 * hard link backpointers to do the above properly.
2301 */
2302
2303 // stop?
2304 if (pin->is_base())
2305 break;
2306 parentdn = pin->get_projected_parent_dn();
2307 ceph_assert(parentdn);
2308
2309 // rstat
2310 dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
2311
2312 // first, if the frag is stale, bring it back in sync.
2313 parent->resync_accounted_rstat();
2314
// old (snapshotted) rstats are projected only when mds_snap_rstat is
// set, but dirty_old_rstat is cleared either way.
2315 if (g_conf()->mds_snap_rstat) {
2316 for (auto &p : parent->dirty_old_rstat) {
2317 project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
2318 p.first, pin, true);
2319 }
2320 }
2321 parent->dirty_old_rstat.clear();
2322 project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
2323
2324 pf->accounted_rstat = pf->rstat;
2325
2326 if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
2327 if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
2328 mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
2329 << parent->dirfrag() << ", inode has " << pi.inode.rstat
2330 << ", dirfrag has " << pf->rstat;
2331
2332 // trust the dirfrag for now
2333 pi.inode.rstat = pf->rstat;
2334
2335 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
2336 }
2337 }
2338
2339 parent->check_rstats();
2340 broadcast_quota_to_client(pin);
2341 // next parent!
// above the first level only rstat is propagated: the link/mtime
// inputs are cleared and the dentry is treated as primary.
2342 cur = pin;
2343 parent = parentdn->get_dir();
2344 linkunlink = 0;
2345 do_parent_mtime = false;
2346 primary_dn = true;
2347 first = false;
2348 }
2349
2350 // now, stick it in the blob
2351 ceph_assert(parent);
2352 ceph_assert(parent->is_auth());
2353 blob->add_dir_context(parent);
2354 blob->add_dir(parent, true);
2355 for (const auto& in : lsi) {
2356 journal_dirty_inode(mut.get(), blob, in);
2357 }
2358
2359 }
2360
2361
2362
2363
2364
2365 // ===================================
2366 // slave requests
2367
2368
2369 /*
2370 * some handlers for master requests with slaves. we need to make
2371 * sure slaves journal commits before we forget we mastered them and
2372 * remove them from the uncommitted_masters map (used during recovery
2373 * to commit|abort slaves).
2374 */
2375 struct C_MDC_CommittedMaster : public MDCacheLogContext {
2376 metareqid_t reqid;
2377 C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
2378 void finish(int r) override {
2379 mdcache->_logged_master_commit(reqid);
2380 }
2381 };
2382
2383 void MDCache::log_master_commit(metareqid_t reqid)
2384 {
2385 dout(10) << "log_master_commit " << reqid << dendl;
2386 uncommitted_masters[reqid].committing = true;
2387 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2388 new C_MDC_CommittedMaster(this, reqid));
2389 }
2390
2391 void MDCache::_logged_master_commit(metareqid_t reqid)
2392 {
2393 dout(10) << "_logged_master_commit " << reqid << dendl;
2394 ceph_assert(uncommitted_masters.count(reqid));
2395 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2396 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2397 uncommitted_masters.erase(reqid);
2398 }
2399
2400 // while active...
2401
2402 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2403 {
2404 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2405 ceph_assert(uncommitted_masters.count(r));
2406 uncommitted_masters[r].slaves.erase(from);
2407 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2408 log_master_commit(r);
2409 }
2410
2411 void MDCache::logged_master_update(metareqid_t reqid)
2412 {
2413 dout(10) << "logged_master_update " << reqid << dendl;
2414 ceph_assert(uncommitted_masters.count(reqid));
2415 uncommitted_masters[reqid].safe = true;
2416 auto p = pending_masters.find(reqid);
2417 if (p != pending_masters.end()) {
2418 pending_masters.erase(p);
2419 if (pending_masters.empty())
2420 process_delayed_resolve();
2421 }
2422 }
2423
2424 /*
2425 * Master may crash after receiving all slaves' commit acks, but before journalling
2426 * the final commit. Slaves may crash after journalling the slave commit, but before
2427 * sending commit ack to the master. Commit masters with no uncommitted slave when
2428 * resolve finishes.
2429 */
2430 void MDCache::finish_committed_masters()
2431 {
2432 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2433 p != uncommitted_masters.end();
2434 ++p) {
2435 p->second.recovering = false;
2436 if (!p->second.committing && p->second.slaves.empty()) {
2437 dout(10) << "finish_committed_masters " << p->first << dendl;
2438 log_master_commit(p->first);
2439 }
2440 }
2441 }
2442
2443 /*
2444 * at end of resolve... we must journal a commit|abort for all slave
2445 * updates, before moving on.
2446 *
2447 * this is so that the master can safely journal ECommitted on ops it
2448 * masters when it reaches up:active (all other recovering nodes must
2449 * complete resolve before that happens).
2450 */
2451 struct C_MDC_SlaveCommit : public MDCacheLogContext {
2452 mds_rank_t from;
2453 metareqid_t reqid;
2454 C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
2455 void finish(int r) override {
2456 mdcache->_logged_slave_commit(from, reqid);
2457 }
2458 };
2459
2460 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2461 {
2462 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2463
2464 // send a message
2465 auto req = make_message<MMDSSlaveRequest>(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2466 mds->send_message_mds(req, from);
2467 }
2468
2469
2470
2471
2472
2473
2474 // ====================================================================
2475 // import map, recovery
2476
2477 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2478 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2479 {
2480 if (subtrees.count(oldparent)) {
2481 vector<dirfrag_t>& v = subtrees[oldparent];
2482 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2483 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2484 if (*it == df) {
2485 v.erase(it);
2486 break;
2487 }
2488 }
2489 if (subtrees.count(newparent)) {
2490 vector<dirfrag_t>& v = subtrees[newparent];
2491 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2492 v.push_back(df);
2493 }
2494 }
2495
/*
 * Build an ESubtreeMap journal event describing the subtrees whose
 * dir_auth.first is this MDS, together with their bounds.
 *
 * Steps:
 *  1. start a new log entry for the event;
 *  2. record each of our subtrees (flagging ambiguous imports) and its
 *     bounds;
 *  3. adjust the recorded map for projected subtree renames;
 *  4. simplify the map by letting a subtree swallow any bound that is
 *     itself one of our non-ambiguous subtrees;
 *  5. add dir context (to root) and the dir itself to the metablob for
 *     every dirfrag still of interest;
 *  6. stamp the journaler's current expire position.
 *
 * Returns the event allocated with 'new'; this function never deletes
 * it, so the caller is responsible for it.
 */
2496 ESubtreeMap *MDCache::create_subtree_map()
2497 {
2498 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2499 << num_subtrees_fullauth() << " fullauth"
2500 << dendl;
2501
2502 show_subtrees();
2503
2504 ESubtreeMap *le = new ESubtreeMap();
2505 mds->mdlog->_start_entry(le);
2506
// dirfrags whose context must be journaled in the metablob, keyed by
// dirfrag so that duplicates collapse.
2507 map<dirfrag_t, CDir*> dirs_to_add;
2508
2509 if (myin) {
// NOTE(review): the result of get_dirfrag() is dereferenced without a
// null check -- presumably ~mdsN's root frag always exists here; confirm.
2510 CDir* mydir = myin->get_dirfrag(frag_t());
2511 dirs_to_add[mydir->dirfrag()] = mydir;
2512 }
2513
2514 // include all auth subtrees, and their bounds.
2515 // and a spanning tree to tie it to the root.
2516 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
2517 p != subtrees.end();
2518 ++p) {
2519 CDir *dir = p->first;
2520
2521 // journal subtree as "ours" if we are
2522 // me, -2
2523 // me, me
2524 // me, !me (may be importing and ambiguous!)
2525
2526 // so not
2527 // !me, *
2528 if (dir->get_dir_auth().first != mds->get_nodeid())
2529 continue;
2530
2531 if (migrator->is_ambiguous_import(dir->dirfrag()) ||
2532 my_ambiguous_imports.count(dir->dirfrag())) {
2533 dout(15) << " ambig subtree " << *dir << dendl;
2534 le->ambiguous_subtrees.insert(dir->dirfrag());
2535 } else {
2536 dout(15) << " subtree " << *dir << dendl;
2537 }
2538
2539 dirs_to_add[dir->dirfrag()] = dir;
// operator[] creates the (empty) bounds vector so the subtree is
// recorded even when it has no bounds.
2540 le->subtrees[dir->dirfrag()].clear();
2541
2542
2543 // bounds
2544 for (set<CDir*>::iterator q = p->second.begin();
2545 q != p->second.end();
2546 ++q) {
2547 CDir *bound = *q;
2548 dout(15) << " subtree bound " << *bound << dendl;
2549 dirs_to_add[bound->dirfrag()] = bound;
2550 le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
2551 }
2552 }
2553
2554 // apply projected renames
2555 for (const auto& [diri, renames] : projected_subtree_renames) {
2556 for (const auto& [olddir, newdir] : renames) {
2557 dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
2558
2559 auto&& dfls = diri->get_dirfrags();
2560 for (const auto& dir : dfls) {
2561 dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
2562 CDir *oldparent = get_projected_subtree_root(olddir);
2563 dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
2564 CDir *newparent = get_projected_subtree_root(newdir);
2565 dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;
2566
2567 if (oldparent == newparent) {
2568 dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
2569 << oldparent->dirfrag() << dendl;
2570 continue;
2571 }
2572
2573 if (dir->is_subtree_root()) {
// the 'if' below guards only the dirs_to_add assignment; the bound
// move after it is unconditional.
2574 if (le->subtrees.count(newparent->dirfrag()) &&
2575 oldparent->get_dir_auth() != newparent->get_dir_auth())
2576 dirs_to_add[dir->dirfrag()] = dir;
2577 // children are fine. change parent.
2578 _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2579 le->subtrees);
2580 } else {
2581 // mid-subtree.
2582
2583 if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
2584 dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
2585 // if oldparent is auth, subtree is mine; include it.
2586 if (le->subtrees.count(oldparent->dirfrag())) {
2587 dirs_to_add[dir->dirfrag()] = dir;
2588 le->subtrees[dir->dirfrag()].clear();
2589 }
2590 // if newparent is auth, subtree is a new bound
2591 if (le->subtrees.count(newparent->dirfrag())) {
2592 dirs_to_add[dir->dirfrag()] = dir;
2593 le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
2594 }
// bounds moved below now go to the new subtree rooted at 'dir'.
2595 newparent = dir;
2596 }
2597
2598 // see if any old bounds move to the new parent.
// (this iterates the in-memory 'subtrees' member, while the moves are
// applied to the journaled copy in le->subtrees.)
2599 for (set<CDir*>::iterator p = subtrees[oldparent].begin();
2600 p != subtrees[oldparent].end();
2601 ++p) {
2602 CDir *bound = *p;
2603 if (dir->contains(bound->get_parent_dir()))
2604 _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
2605 le->subtrees);
2606 }
2607 }
2608 }
2609 }
2610 }
2611
2612 // simplify the journaled map. our in memory map may have more
2613 // subtrees than needed due to migrations that are just getting
2614 // started or just completing. but on replay, the "live" map will
2615 // be simple and we can do a straight comparison.
2616 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
2617 if (le->ambiguous_subtrees.count(p->first))
2618 continue;
2619 unsigned i = 0;
2620 while (i < p->second.size()) {
// copy the bound: p->second may reallocate when bounds are appended.
2621 dirfrag_t b = p->second[i];
2622 if (le->subtrees.count(b) &&
2623 le->ambiguous_subtrees.count(b) == 0) {
2624 vector<dirfrag_t>& bb = le->subtrees[b];
2625 dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2626 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2627 p->second.push_back(*r);
2628 dirs_to_add.erase(b);
2629 le->subtrees.erase(b);
// i is not advanced: the next bound slides into slot i, and the
// appended bounds are examined later in this same loop.
2630 p->second.erase(p->second.begin() + i);
2631 } else {
2632 ++i;
2633 }
2634 }
2635 }
2636
2637 for (auto &p : dirs_to_add) {
2638 CDir *dir = p.second;
2639 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
2640 le->metablob.add_dir(dir, false);
2641 }
2642
2643 dout(15) << " subtrees " << le->subtrees << dendl;
2644 dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
2645
2646 //le->metablob.print(cout);
2647 le->expire_pos = mds->mdlog->journaler->get_expire_pos();
2648 return le;
2649 }
2650
2651 void MDCache::dump_resolve_status(Formatter *f) const
2652 {
2653 f->open_object_section("resolve_status");
2654 f->dump_stream("resolve_gather") << resolve_gather;
2655 f->dump_stream("resolve_ack_gather") << resolve_gather;
2656 f->close_section();
2657 }
2658
2659 void MDCache::resolve_start(MDSContext *resolve_done_)
2660 {
2661 dout(10) << "resolve_start" << dendl;
2662 ceph_assert(!resolve_done);
2663 resolve_done.reset(resolve_done_);
2664
2665 if (mds->mdsmap->get_root() != mds->get_nodeid()) {
2666 // if we don't have the root dir, adjust it to UNKNOWN. during
2667 // resolve we want mds0 to explicit claim the portion of it that
2668 // it owns, so that anything beyond its bounds get left as
2669 // unknown.
2670 CDir *rootdir = root->get_dirfrag(frag_t());
2671 if (rootdir)
2672 adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
2673 }
2674 resolve_gather = recovery_set;
2675
2676 resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
2677 }
2678
2679 void MDCache::send_resolves()
2680 {
2681 send_slave_resolves();
2682
2683 if (!resolve_done) {
2684 // I'm survivor: refresh snap cache
2685 mds->snapclient->sync(
2686 new MDSInternalContextWrapper(mds,
2687 new LambdaContext([this](int r) {
2688 maybe_finish_slave_resolve();
2689 })
2690 )
2691 );
2692 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
2693 return;
2694 }
2695 if (!resolve_ack_gather.empty()) {
2696 dout(10) << "send_resolves still waiting for resolve ack from ("
2697 << resolve_ack_gather << ")" << dendl;
2698 return;
2699 }
2700 if (!resolve_need_rollback.empty()) {
2701 dout(10) << "send_resolves still waiting for rollback to commit on ("
2702 << resolve_need_rollback << ")" << dendl;
2703 return;
2704 }
2705
2706 send_subtree_resolves();
2707 }
2708
2709 void MDCache::send_slave_resolves()
2710 {
2711 dout(10) << "send_slave_resolves" << dendl;
2712
2713 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2714
2715 if (mds->is_resolve()) {
2716 for (map<metareqid_t, uslave>::iterator p = uncommitted_slaves.begin();
2717 p != uncommitted_slaves.end();
2718 ++p) {
2719 mds_rank_t master = p->second.master;
2720 auto &m = resolves[master];
2721 if (!m) m = make_message<MMDSResolve>();
2722 m->add_slave_request(p->first, false);
2723 }
2724 } else {
2725 set<mds_rank_t> resolve_set;
2726 mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
2727 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2728 p != active_requests.end();
2729 ++p) {
2730 MDRequestRef& mdr = p->second;
2731 if (!mdr->is_slave())
2732 continue;
2733 if (!mdr->slave_did_prepare() && !mdr->committing) {
2734 continue;
2735 }
2736 mds_rank_t master = mdr->slave_to_mds;
2737 if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
2738 dout(10) << " including uncommitted " << *mdr << dendl;
2739 if (!resolves.count(master))
2740 resolves[master] = make_message<MMDSResolve>();
2741 if (!mdr->committing &&
2742 mdr->has_more() && mdr->more()->is_inode_exporter) {
2743 // re-send cap exports
2744 CInode *in = mdr->more()->rename_inode;
2745 map<client_t, Capability::Export> cap_map;
2746 in->export_client_caps(cap_map);
2747 bufferlist bl;
2748 MMDSResolve::slave_inode_cap inode_caps(in->ino(), cap_map);
2749 encode(inode_caps, bl);
2750 resolves[master]->add_slave_request(p->first, bl);
2751 } else {
2752 resolves[master]->add_slave_request(p->first, mdr->committing);
2753 }
2754 }
2755 }
2756 }
2757
2758 for (auto &p : resolves) {
2759 dout(10) << "sending slave resolve to mds." << p.first << dendl;
2760 mds->send_message_mds(p.second, p.first);
2761 resolve_ack_gather.insert(p.first);
2762 }
2763 }
2764
2765 void MDCache::send_subtree_resolves()
2766 {
2767 dout(10) << "send_subtree_resolves" << dendl;
2768
2769 if (migrator->is_exporting() || migrator->is_importing()) {
2770 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2771 migrator->show_importing();
2772 migrator->show_exporting();
2773 resolves_pending = true;
2774 return; // not now
2775 }
2776
2777 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2778 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2779 p != recovery_set.end();
2780 ++p) {
2781 if (*p == mds->get_nodeid())
2782 continue;
2783 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2784 resolves[*p] = make_message<MMDSResolve>();
2785 }
2786
2787 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2788 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2789
2790 // known
2791 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2792 p != subtrees.end();
2793 ++p) {
2794 CDir *dir = p->first;
2795
2796 // only our subtrees
2797 if (dir->authority().first != mds->get_nodeid())
2798 continue;
2799
2800 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2801 continue; // we'll add it below
2802
2803 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2804 // ambiguous (mid-import)
2805 set<CDir*> bounds;
2806 get_subtree_bounds(dir, bounds);
2807 vector<dirfrag_t> dfls;
2808 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2809 dfls.push_back((*q)->dirfrag());
2810
2811 my_ambig_imports[dir->dirfrag()] = dfls;
2812 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2813 } else {
2814 // not ambiguous.
2815 for (auto &q : resolves) {
2816 resolves[q.first]->add_subtree(dir->dirfrag());
2817 }
2818 // bounds too
2819 vector<dirfrag_t> dfls;
2820 for (set<CDir*>::iterator q = subtrees[dir].begin();
2821 q != subtrees[dir].end();
2822 ++q) {
2823 CDir *bound = *q;
2824 dfls.push_back(bound->dirfrag());
2825 }
2826
2827 my_subtrees[dir->dirfrag()] = dfls;
2828 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2829 }
2830 }
2831
2832 // ambiguous
2833 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2834 p != my_ambiguous_imports.end();
2835 ++p) {
2836 my_ambig_imports[p->first] = p->second;
2837 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2838 }
2839
2840 // simplify the claimed subtree.
2841 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2842 unsigned i = 0;
2843 while (i < p->second.size()) {
2844 dirfrag_t b = p->second[i];
2845 if (my_subtrees.count(b)) {
2846 vector<dirfrag_t>& bb = my_subtrees[b];
2847 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2848 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2849 p->second.push_back(*r);
2850 my_subtrees.erase(b);
2851 p->second.erase(p->second.begin() + i);
2852 } else {
2853 ++i;
2854 }
2855 }
2856 }
2857
2858 // send
2859 for (auto &p : resolves) {
2860 const ref_t<MMDSResolve> &m = p.second;
2861 if (mds->is_resolve()) {
2862 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2863 } else {
2864 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2865 }
2866 m->subtrees = my_subtrees;
2867 m->ambiguous_imports = my_ambig_imports;
2868 dout(10) << "sending subtee resolve to mds." << p.first << dendl;
2869 mds->send_message_mds(m, p.first);
2870 }
2871 resolves_pending = false;
2872 }
2873
2874 void MDCache::maybe_finish_slave_resolve() {
2875 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2876 // snap cache get synced or I'm in resolve state
2877 if (mds->snapclient->is_synced() || resolve_done)
2878 send_subtree_resolves();
2879 process_delayed_resolve();
2880 }
2881 }
2882
2883 void MDCache::handle_mds_failure(mds_rank_t who)
2884 {
2885 dout(7) << "handle_mds_failure mds." << who << dendl;
2886
2887 dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
2888
2889 resolve_gather.insert(who);
2890 discard_delayed_resolve(who);
2891 ambiguous_slave_updates.erase(who);
2892
2893 rejoin_gather.insert(who);
2894 rejoin_sent.erase(who); // i need to send another
2895 rejoin_ack_sent.erase(who); // i need to send another
2896 rejoin_ack_gather.erase(who); // i'll need/get another.
2897
2898 dout(10) << " resolve_gather " << resolve_gather << dendl;
2899 dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
2900 dout(10) << " rejoin_sent " << rejoin_sent << dendl;
2901 dout(10) << " rejoin_gather " << rejoin_gather << dendl;
2902 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
2903
2904
2905 // tell the migrator too.
2906 migrator->handle_mds_failure_or_stop(who);
2907
2908 // tell the balancer too.
2909 mds->balancer->handle_mds_failure(who);
2910
2911 // clean up any requests slave to/from this node
2912 list<MDRequestRef> finish;
2913 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
2914 p != active_requests.end();
2915 ++p) {
2916 MDRequestRef& mdr = p->second;
2917 // slave to the failed node?
2918 if (mdr->slave_to_mds == who) {
2919 if (mdr->slave_did_prepare()) {
2920 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2921 if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
2922 remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2923
2924 if (!mdr->more()->waiting_on_slave.empty()) {
2925 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2926 // will rollback, no need to wait
2927 mdr->reset_slave_request();
2928 mdr->more()->waiting_on_slave.clear();
2929 }
2930 } else if (!mdr->committing) {
2931 dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
2932 if (mdr->slave_request || mdr->slave_rolling_back())
2933 mdr->aborted = true;
2934 else
2935 finish.push_back(mdr);
2936 }
2937 }
2938
2939 if (mdr->is_slave() && mdr->slave_did_prepare()) {
2940 if (mdr->more()->waiting_on_slave.count(who)) {
2941 ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
2942 dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
2943 << who << dendl;
2944 mdr->more()->waiting_on_slave.erase(who);
2945 if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
2946 mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
2947 }
2948
2949 if (mdr->more()->srcdn_auth_mds == who &&
2950 mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
2951 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2952 dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
2953 add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
2954 }
2955 } else if (mdr->slave_request) {
2956 const cref_t<MMDSSlaveRequest> &slave_req = mdr->slave_request;
2957 // FIXME: Slave rename request can arrive after we notice mds failure.
2958 // This can cause mds to crash (does not affect integrity of FS).
2959 if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
2960 slave_req->srcdn_auth == who)
2961 slave_req->mark_interrupted();
2962 }
2963
2964 // failed node is slave?
2965 if (mdr->is_master() && !mdr->committing) {
2966 if (mdr->more()->srcdn_auth_mds == who) {
2967 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2968 << who << " to recover" << dendl;
2969 ceph_assert(mdr->more()->witnessed.count(who) == 0);
2970 if (mdr->more()->is_ambiguous_auth)
2971 mdr->clear_ambiguous_auth();
2972 // rename srcdn's auth mds failed, all witnesses will rollback
2973 mdr->more()->witnessed.clear();
2974 pending_masters.erase(p->first);
2975 }
2976
2977 if (mdr->more()->witnessed.count(who)) {
2978 mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
2979 if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
2980 dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
2981 << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
2982 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
2983 // until either the request is committing or the slave also fails.
2984 ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
2985 pending_masters.insert(p->first);
2986 } else {
2987 dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
2988 << who << " to recover" << dendl;
2989 if (srcdn_auth >= 0)
2990 ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);
2991
2992 // discard this peer's prepare (if any)
2993 mdr->more()->witnessed.erase(who);
2994 }
2995 }
2996
2997 if (mdr->more()->waiting_on_slave.count(who)) {
2998 dout(10) << " master request " << *mdr << " waiting for slave mds." << who
2999 << " to recover" << dendl;
3000 // retry request when peer recovers
3001 mdr->more()->waiting_on_slave.erase(who);
3002 if (mdr->more()->waiting_on_slave.empty())
3003 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
3004 }
3005
3006 if (mdr->locking && mdr->locking_target_mds == who)
3007 mdr->finish_locking(mdr->locking);
3008 }
3009 }
3010
3011 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
3012 p != uncommitted_masters.end();
3013 ++p) {
3014 // The failed MDS may have already committed the slave update
3015 if (p->second.slaves.count(who)) {
3016 p->second.recovering = true;
3017 p->second.slaves.erase(who);
3018 }
3019 }
3020
3021 while (!finish.empty()) {
3022 dout(10) << "cleaning up slave request " << *finish.front() << dendl;
3023 request_finish(finish.front());
3024 finish.pop_front();
3025 }
3026
3027 kick_find_ino_peers(who);
3028 kick_open_ino_peers(who);
3029
3030 for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
3031 p != fragments.end(); ) {
3032 dirfrag_t df = p->first;
3033 fragment_info_t& info = p->second;
3034
3035 if (info.is_fragmenting()) {
3036 if (info.notify_ack_waiting.erase(who) &&
3037 info.notify_ack_waiting.empty()) {
3038 fragment_drop_locks(info);
3039 fragment_maybe_finish(p++);
3040 } else {
3041 ++p;
3042 }
3043 continue;
3044 }
3045
3046 ++p;
3047 dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
3048 std::vector<CDir*> dirs;
3049 info.dirs.swap(dirs);
3050 fragments.erase(df);
3051 fragment_unmark_unfreeze_dirs(dirs);
3052 }
3053
3054 // MDCache::shutdown_export_strays() always exports strays to mds.0
3055 if (who == mds_rank_t(0))
3056 shutdown_exporting_strays.clear();
3057
3058 show_subtrees();
3059 }
3060
3061 /*
3062 * handle_mds_recovery - called on another node's transition
3063 * from resolve -> active.
3064 */
3065 void MDCache::handle_mds_recovery(mds_rank_t who)
3066 {
3067 dout(7) << "handle_mds_recovery mds." << who << dendl;
3068
3069 // exclude all discover waiters. kick_discovers() will do the job
3070 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3071 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3072
3073 MDSContext::vec waiters;
3074
3075 // wake up any waiters in their subtrees
3076 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3077 p != subtrees.end();
3078 ++p) {
3079 CDir *dir = p->first;
3080
3081 if (dir->authority().first != who ||
3082 dir->authority().second == mds->get_nodeid())
3083 continue;
3084 ceph_assert(!dir->is_auth());
3085
3086 // wake any waiters
3087 std::queue<CDir*> q;
3088 q.push(dir);
3089
3090 while (!q.empty()) {
3091 CDir *d = q.front();
3092 q.pop();
3093 d->take_waiting(d_mask, waiters);
3094
3095 // inode waiters too
3096 for (auto &p : d->items) {
3097 CDentry *dn = p.second;
3098 CDentry::linkage_t *dnl = dn->get_linkage();
3099 if (dnl->is_primary()) {
3100 dnl->get_inode()->take_waiting(i_mask, waiters);
3101
3102 // recurse?
3103 auto&& ls = dnl->get_inode()->get_dirfrags();
3104 for (const auto& subdir : ls) {
3105 if (!subdir->is_subtree_root())
3106 q.push(subdir);
3107 }
3108 }
3109 }
3110 }
3111 }
3112
3113 kick_open_ino_peers(who);
3114 kick_find_ino_peers(who);
3115
3116 // queue them up.
3117 mds->queue_waiters(waiters);
3118 }
3119
3120 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3121 {
3122 dout(7) << "set_recovery_set " << s << dendl;
3123 recovery_set = s;
3124 }
3125
3126
3127 /*
3128 * during resolve state, we share resolves to determine who
3129 * is authoritative for which trees. we expect to get an resolve
3130 * from _everyone_ in the recovery_set (the mds cluster at the time of
3131 * the first failure).
3132 *
3133 * This functions puts the passed message before returning
3134 */
3135 void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
3136 {
3137 dout(7) << "handle_resolve from " << m->get_source() << dendl;
3138 mds_rank_t from = mds_rank_t(m->get_source().num());
3139
3140 if (mds->get_state() < MDSMap::STATE_RESOLVE) {
3141 if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
3142 mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
3143 return;
3144 }
3145 // wait until we reach the resolve stage!
3146 return;
3147 }
3148
3149 discard_delayed_resolve(from);
3150
3151 // ambiguous slave requests?
3152 if (!m->slave_requests.empty()) {
3153 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3154 for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
3155 if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
3156 ceph_assert(!p->second.committing);
3157 pending_masters.insert(p->first);
3158 }
3159 }
3160
3161 if (!pending_masters.empty()) {
3162 dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
3163 delayed_resolve[from] = m;
3164 return;
3165 }
3166 }
3167
3168 auto ack = make_message<MMDSResolveAck>();
3169 for (const auto &p : m->slave_requests) {
3170 if (uncommitted_masters.count(p.first)) { //mds->sessionmap.have_completed_request(p.first)) {
3171 // COMMIT
3172 if (p.second.committing) {
3173 // already committing, waiting for the OP_COMMITTED slave reply
3174 dout(10) << " already committing slave request " << p << " noop "<< dendl;
3175 } else {
3176 dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
3177 ack->add_commit(p.first);
3178 }
3179 uncommitted_masters[p.first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted
3180
3181 if (p.second.inode_caps.length() > 0) {
3182 // slave wants to export caps (rename)
3183 ceph_assert(mds->is_resolve());
3184 MMDSResolve::slave_inode_cap inode_caps;
3185 auto q = p.second.inode_caps.cbegin();
3186 decode(inode_caps, q);
3187 inodeno_t ino = inode_caps.ino;
3188 map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
3189 ceph_assert(get_inode(ino));
3190
3191 for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
3192 q != cap_exports.end();
3193 ++q) {
3194 Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
3195 im.cap_id = ++last_cap_id; // assign a new cap ID
3196 im.issue_seq = 1;
3197 im.mseq = q->second.mseq;
3198
3199 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
3200 if (session)
3201 rejoin_client_map.emplace(q->first, session->info.inst);
3202 }
3203
3204 // will process these caps in rejoin stage
3205 rejoin_slave_exports[ino].first = from;
3206 rejoin_slave_exports[ino].second.swap(cap_exports);
3207
3208 // send information of imported caps back to slave
3209 encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
3210 }
3211 } else {
3212 // ABORT
3213 dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
3214 ceph_assert(!p.second.committing);
3215 ack->add_abort(p.first);
3216 }
3217 }
3218 mds->send_message(ack, m->get_connection());
3219 return;
3220 }
3221
3222 if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
3223 dout(10) << "delay processing subtree resolve" << dendl;
3224 delayed_resolve[from] = m;
3225 return;
3226 }
3227
3228 bool survivor = false;
3229 // am i a surviving ambiguous importer?
3230 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
3231 survivor = true;
3232 // check for any import success/failure (from this node)
3233 map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
3234 while (p != my_ambiguous_imports.end()) {
3235 map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
3236 ++next;
3237 CDir *dir = get_dirfrag(p->first);
3238 ceph_assert(dir);
3239 dout(10) << "checking ambiguous import " << *dir << dendl;
3240 if (migrator->is_importing(dir->dirfrag()) &&
3241 migrator->get_import_peer(dir->dirfrag()) == from) {
3242 ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);
3243
3244 // check if sender claims the subtree
3245 bool claimed_by_sender = false;
3246 for (const auto &q : m->subtrees) {
3247 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3248 CDir *base = get_force_dirfrag(q.first, false);
3249 if (!base || !base->contains(dir))
3250 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3251
3252 bool inside = true;
3253 set<CDir*> bounds;
3254 get_force_dirfrag_bound_set(q.second, bounds);
3255 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
3256 CDir *bound = *p;
3257 if (bound->contains(dir)) {
3258 inside = false; // nope, bound is dir or parent of dir, not inside.
3259 break;
3260 }
3261 }
3262 if (inside)
3263 claimed_by_sender = true;
3264 }
3265
3266 my_ambiguous_imports.erase(p); // no longer ambiguous.
3267 if (claimed_by_sender) {
3268 dout(7) << "ambiguous import failed on " << *dir << dendl;
3269 migrator->import_reverse(dir);
3270 } else {
3271 dout(7) << "ambiguous import succeeded on " << *dir << dendl;
3272 migrator->import_finish(dir, true);
3273 }
3274 }
3275 p = next;
3276 }
3277 }
3278
3279 // update my dir_auth values
3280 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3281 // migrations between other nodes)
3282 for (const auto& p : m->subtrees) {
3283 dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
3284 CDir *dir = get_force_dirfrag(p.first, !survivor);
3285 if (!dir)
3286 continue;
3287 adjust_bounded_subtree_auth(dir, p.second, from);
3288 try_subtree_merge(dir);
3289 }
3290
3291 show_subtrees();
3292
3293 // note ambiguous imports too
3294 for (const auto& p : m->ambiguous_imports) {
3295 dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
3296 other_ambiguous_imports[from][p.first] = p.second;
3297 }
3298
3299 // learn other mds' pendina snaptable commits. later when resolve finishes, we will reload
3300 // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
3301 for (const auto& p : m->table_clients) {
3302 dout(10) << " noting " << get_mdstable_name(p.type)
3303 << " pending_commits " << p.pending_commits << dendl;
3304 MDSTableClient *client = mds->get_table_client(p.type);
3305 for (const auto& q : p.pending_commits)
3306 client->notify_commit(q);
3307 }
3308
3309 // did i get them all?
3310 resolve_gather.erase(from);
3311
3312 maybe_resolve_finish();
3313 }
3314
3315 void MDCache::process_delayed_resolve()
3316 {
3317 dout(10) << "process_delayed_resolve" << dendl;
3318 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3319 tmp.swap(delayed_resolve);
3320 for (auto &p : tmp) {
3321 handle_resolve(p.second);
3322 }
3323 }
3324
3325 void MDCache::discard_delayed_resolve(mds_rank_t who)
3326 {
3327 delayed_resolve.erase(who);
3328 }
3329
3330 void MDCache::maybe_resolve_finish()
3331 {
3332 ceph_assert(resolve_ack_gather.empty());
3333 ceph_assert(resolve_need_rollback.empty());
3334
3335 if (!resolve_gather.empty()) {
3336 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3337 << resolve_gather << ")" << dendl;
3338 return;
3339 }
3340
3341 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
3342 disambiguate_my_imports();
3343 finish_committed_masters();
3344
3345 if (resolve_done) {
3346 ceph_assert(mds->is_resolve());
3347 trim_unlinked_inodes();
3348 recalc_auth_bits(false);
3349 resolve_done.release()->complete(0);
3350 } else {
3351 // I am survivor.
3352 maybe_send_pending_rejoins();
3353 }
3354 }
3355
3356 void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
3357 {
3358 dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
3359 mds_rank_t from = mds_rank_t(ack->get_source().num());
3360
3361 if (!resolve_ack_gather.count(from) ||
3362 mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
3363 return;
3364 }
3365
3366 if (ambiguous_slave_updates.count(from)) {
3367 ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
3368 ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3369 }
3370
3371 for (const auto &p : ack->commit) {
3372 dout(10) << " commit on slave " << p.first << dendl;
3373
3374 if (ambiguous_slave_updates.count(from)) {
3375 remove_ambiguous_slave_update(p.first, from);
3376 continue;
3377 }
3378
3379 if (mds->is_resolve()) {
3380 // replay
3381 MDSlaveUpdate *su = get_uncommitted_slave(p.first, from);
3382 ceph_assert(su);
3383
3384 // log commit
3385 mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
3386 ESlaveUpdate::OP_COMMIT, su->origop),
3387 new C_MDC_SlaveCommit(this, from, p.first));
3388 mds->mdlog->flush();
3389
3390 finish_uncommitted_slave(p.first);
3391 } else {
3392 MDRequestRef mdr = request_get(p.first);
3393 // information about master imported caps
3394 if (p.second.length() > 0)
3395 mdr->more()->inode_import.share(p.second);
3396
3397 ceph_assert(mdr->slave_request == 0); // shouldn't be doing anything!
3398 request_finish(mdr);
3399 }
3400 }
3401
3402 for (const auto &metareq : ack->abort) {
3403 dout(10) << " abort on slave " << metareq << dendl;
3404
3405 if (mds->is_resolve()) {
3406 MDSlaveUpdate *su = get_uncommitted_slave(metareq, from);
3407 ceph_assert(su);
3408
3409 // perform rollback (and journal a rollback entry)
3410 // note: this will hold up the resolve a bit, until the rollback entries journal.
3411 MDRequestRef null_ref;
3412 switch (su->origop) {
3413 case ESlaveUpdate::LINK:
3414 mds->server->do_link_rollback(su->rollback, from, null_ref);
3415 break;
3416 case ESlaveUpdate::RENAME:
3417 mds->server->do_rename_rollback(su->rollback, from, null_ref);
3418 break;
3419 case ESlaveUpdate::RMDIR:
3420 mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
3421 break;
3422 default:
3423 ceph_abort();
3424 }
3425 } else {
3426 MDRequestRef mdr = request_get(metareq);
3427 mdr->aborted = true;
3428 if (mdr->slave_request) {
3429 if (mdr->slave_did_prepare()) // journaling slave prepare ?
3430 add_rollback(metareq, from);
3431 } else {
3432 request_finish(mdr);
3433 }
3434 }
3435 }
3436
3437 if (!ambiguous_slave_updates.count(from)) {
3438 resolve_ack_gather.erase(from);
3439 maybe_finish_slave_resolve();
3440 }
3441 }
3442
3443 void MDCache::add_uncommitted_slave(metareqid_t reqid, LogSegment *ls, mds_rank_t master, MDSlaveUpdate *su)
3444 {
3445 auto const &ret = uncommitted_slaves.emplace(std::piecewise_construct,
3446 std::forward_as_tuple(reqid),
3447 std::forward_as_tuple());
3448 ceph_assert(ret.second);
3449 ls->uncommitted_slaves.insert(reqid);
3450 uslave &u = ret.first->second;
3451 u.master = master;
3452 u.ls = ls;
3453 u.su = su;
3454 if (su == nullptr) {
3455 return;
3456 }
3457 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3458 uncommitted_slave_rename_olddir[*p]++;
3459 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3460 uncommitted_slave_unlink[*p]++;
3461 }
3462
3463 void MDCache::finish_uncommitted_slave(metareqid_t reqid, bool assert_exist)
3464 {
3465 auto it = uncommitted_slaves.find(reqid);
3466 if (it == uncommitted_slaves.end()) {
3467 ceph_assert(!assert_exist);
3468 return;
3469 }
3470 uslave &u = it->second;
3471 MDSlaveUpdate* su = u.su;
3472
3473 if (!u.waiters.empty()) {
3474 mds->queue_waiters(u.waiters);
3475 }
3476 u.ls->uncommitted_slaves.erase(reqid);
3477 uncommitted_slaves.erase(it);
3478
3479 if (su == nullptr) {
3480 return;
3481 }
3482 // discard the non-auth subtree we renamed out of
3483 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3484 CInode *diri = *p;
3485 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3486 ceph_assert(it != uncommitted_slave_rename_olddir.end());
3487 it->second--;
3488 if (it->second == 0) {
3489 uncommitted_slave_rename_olddir.erase(it);
3490 auto&& ls = diri->get_dirfrags();
3491 for (const auto& dir : ls) {
3492 CDir *root = get_subtree_root(dir);
3493 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3494 try_trim_non_auth_subtree(root);
3495 if (dir != root)
3496 break;
3497 }
3498 }
3499 } else
3500 ceph_assert(it->second > 0);
3501 }
3502 // removed the inodes that were unlinked by slave update
3503 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3504 CInode *in = *p;
3505 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3506 ceph_assert(it != uncommitted_slave_unlink.end());
3507 it->second--;
3508 if (it->second == 0) {
3509 uncommitted_slave_unlink.erase(it);
3510 if (!in->get_projected_parent_dn())
3511 mds->mdcache->remove_inode_recursive(in);
3512 } else
3513 ceph_assert(it->second > 0);
3514 }
3515 delete su;
3516 }
3517
3518 MDSlaveUpdate* MDCache::get_uncommitted_slave(metareqid_t reqid, mds_rank_t master)
3519 {
3520
3521 MDSlaveUpdate* su = nullptr;
3522 auto it = uncommitted_slaves.find(reqid);
3523 if (it != uncommitted_slaves.end() &&
3524 it->second.master == master) {
3525 su = it->second.su;
3526 }
3527 return su;
3528 }
3529
3530 void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
3531 auto p = resolve_need_rollback.find(mdr->reqid);
3532 ceph_assert(p != resolve_need_rollback.end());
3533 if (mds->is_resolve()) {
3534 finish_uncommitted_slave(reqid, false);
3535 } else if (mdr) {
3536 finish_uncommitted_slave(mdr->reqid, mdr->more()->slave_update_journaled);
3537 }
3538 resolve_need_rollback.erase(p);
3539 maybe_finish_slave_resolve();
3540 }
3541
3542 void MDCache::disambiguate_other_imports()
3543 {
3544 dout(10) << "disambiguate_other_imports" << dendl;
3545
3546 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3547 // other nodes' ambiguous imports
3548 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3549 p != other_ambiguous_imports.end();
3550 ++p) {
3551 mds_rank_t who = p->first;
3552 dout(10) << "ambiguous imports for mds." << who << dendl;
3553
3554 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3555 q != p->second.end();
3556 ++q) {
3557 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3558 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3559 CDir *dir = get_force_dirfrag(q->first, recovering);
3560 if (!dir) continue;
3561
3562 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3563 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3564 dout(10) << " mds." << who << " did import " << *dir << dendl;
3565 adjust_bounded_subtree_auth(dir, q->second, who);
3566 try_subtree_merge(dir);
3567 } else {
3568 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3569 }
3570 }
3571 }
3572 other_ambiguous_imports.clear();
3573 }
3574
3575 void MDCache::disambiguate_my_imports()
3576 {
3577 dout(10) << "disambiguate_my_imports" << dendl;
3578
3579 if (!mds->is_resolve()) {
3580 ceph_assert(my_ambiguous_imports.empty());
3581 return;
3582 }
3583
3584 disambiguate_other_imports();
3585
3586 // my ambiguous imports
3587 mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
3588 while (!my_ambiguous_imports.empty()) {
3589 map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();
3590
3591 CDir *dir = get_dirfrag(q->first);
3592 ceph_assert(dir);
3593
3594 if (dir->authority() != me_ambig) {
3595 dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
3596 cancel_ambiguous_import(dir);
3597
3598 mds->mdlog->start_submit_entry(new EImportFinish(dir, false));
3599
3600 // subtree may have been swallowed by another node claiming dir
3601 // as their own.
3602 CDir *root = get_subtree_root(dir);
3603 if (root != dir)
3604 dout(10) << " subtree root is " << *root << dendl;
3605 ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
3606 try_trim_non_auth_subtree(root);
3607 } else {
3608 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
3609 finish_ambiguous_import(q->first);
3610 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
3611 }
3612 }
3613 ceph_assert(my_ambiguous_imports.empty());
3614 mds->mdlog->flush();
3615
3616 // verify all my subtrees are unambiguous!
3617 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3618 p != subtrees.end();
3619 ++p) {
3620 CDir *dir = p->first;
3621 if (dir->is_ambiguous_dir_auth()) {
3622 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
3623 }
3624 ceph_assert(!dir->is_ambiguous_dir_auth());
3625 }
3626
3627 show_subtrees();
3628 }
3629
3630
3631 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3632 {
3633 ceph_assert(my_ambiguous_imports.count(base) == 0);
3634 my_ambiguous_imports[base] = bounds;
3635 }
3636
3637
3638 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3639 {
3640 // make a list
3641 vector<dirfrag_t> binos;
3642 for (set<CDir*>::iterator p = bounds.begin();
3643 p != bounds.end();
3644 ++p)
3645 binos.push_back((*p)->dirfrag());
3646
3647 // note: this can get called twice if the exporter fails during recovery
3648 if (my_ambiguous_imports.count(base->dirfrag()))
3649 my_ambiguous_imports.erase(base->dirfrag());
3650
3651 add_ambiguous_import(base->dirfrag(), binos);
3652 }
3653
3654 void MDCache::cancel_ambiguous_import(CDir *dir)
3655 {
3656 dirfrag_t df = dir->dirfrag();
3657 ceph_assert(my_ambiguous_imports.count(df));
3658 dout(10) << "cancel_ambiguous_import " << df
3659 << " bounds " << my_ambiguous_imports[df]
3660 << " " << *dir
3661 << dendl;
3662 my_ambiguous_imports.erase(df);
3663 }
3664
3665 void MDCache::finish_ambiguous_import(dirfrag_t df)
3666 {
3667 ceph_assert(my_ambiguous_imports.count(df));
3668 vector<dirfrag_t> bounds;
3669 bounds.swap(my_ambiguous_imports[df]);
3670 my_ambiguous_imports.erase(df);
3671
3672 dout(10) << "finish_ambiguous_import " << df
3673 << " bounds " << bounds
3674 << dendl;
3675 CDir *dir = get_dirfrag(df);
3676 ceph_assert(dir);
3677
3678 // adjust dir_auth, import maps
3679 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3680 try_subtree_merge(dir);
3681 }
3682
3683 void MDCache::remove_inode_recursive(CInode *in)
3684 {
3685 dout(10) << "remove_inode_recursive " << *in << dendl;
3686 auto&& ls = in->get_dirfrags();
3687 for (const auto& subdir : ls) {
3688 dout(10) << " removing dirfrag " << *subdir << dendl;
3689 auto it = subdir->items.begin();
3690 while (it != subdir->items.end()) {
3691 CDentry *dn = it->second;
3692 ++it;
3693 CDentry::linkage_t *dnl = dn->get_linkage();
3694 if (dnl->is_primary()) {
3695 CInode *tin = dnl->get_inode();
3696 subdir->unlink_inode(dn, false);
3697 remove_inode_recursive(tin);
3698 }
3699 subdir->remove_dentry(dn);
3700 }
3701
3702 if (subdir->is_subtree_root())
3703 remove_subtree(subdir);
3704 in->close_dirfrag(subdir->dirfrag().frag);
3705 }
3706 remove_inode(in);
3707 }
3708
3709 bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
3710 {
3711 ceph_assert(!in->is_auth());
3712
3713 dout(10) << __func__ << ":" << *in << dendl;
3714
3715 // Recurse into any dirfrags beneath this inode
3716 auto&& ls = in->get_dirfrags();
3717 for (const auto& subdir : ls) {
3718 if (!in->is_mdsdir() && subdir->is_subtree_root()) {
3719 dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
3720 return true;
3721 }
3722
3723 for (auto &it : subdir->items) {
3724 CDentry *dn = it.second;
3725 CDentry::linkage_t *dnl = dn->get_linkage();
3726 if (dnl->is_primary()) {
3727 CInode *tin = dnl->get_inode();
3728
3729 /* Remote strays with linkage (i.e. hardlinks) should not be
3730 * expired, because they may be the target of
3731 * a rename() as the owning MDS shuts down */
3732 if (!tin->is_stray() && tin->inode.nlink) {
3733 dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
3734 return true;
3735 }
3736
3737 const bool abort = expire_recursive(tin, expiremap);
3738 if (abort) {
3739 return true;
3740 }
3741 }
3742 if (dn->lru_is_expireable()) {
3743 trim_dentry(dn, expiremap);
3744 } else {
3745 dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
3746 return true;
3747 }
3748 }
3749 }
3750
3751 return false;
3752 }
3753
3754 void MDCache::trim_unlinked_inodes()
3755 {
3756 dout(7) << "trim_unlinked_inodes" << dendl;
3757 int count = 0;
3758 vector<CInode*> q;
3759 for (auto &p : inode_map) {
3760 CInode *in = p.second;
3761 if (in->get_parent_dn() == NULL && !in->is_base()) {
3762 dout(7) << " will trim from " << *in << dendl;
3763 q.push_back(in);
3764 }
3765
3766 if (!(++count % 1000))
3767 mds->heartbeat_reset();
3768 }
3769 for (auto& in : q) {
3770 remove_inode_recursive(in);
3771
3772 if (!(++count % 1000))
3773 mds->heartbeat_reset();
3774 }
3775 }
3776
3777 /** recalc_auth_bits()
3778 * once subtree auth is disambiguated, we need to adjust all the
3779 * auth and dirty bits in our cache before moving on.
3780 */
3781 void MDCache::recalc_auth_bits(bool replay)
3782 {
3783 dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;
3784
3785 if (root) {
3786 root->inode_auth.first = mds->mdsmap->get_root();
3787 bool auth = mds->get_nodeid() == root->inode_auth.first;
3788 if (auth) {
3789 root->state_set(CInode::STATE_AUTH);
3790 } else {
3791 root->state_clear(CInode::STATE_AUTH);
3792 if (!replay)
3793 root->state_set(CInode::STATE_REJOINING);
3794 }
3795 }
3796
3797 set<CInode*> subtree_inodes;
3798 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3799 p != subtrees.end();
3800 ++p) {
3801 if (p->first->dir_auth.first == mds->get_nodeid())
3802 subtree_inodes.insert(p->first->inode);
3803 }
3804
3805 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3806 p != subtrees.end();
3807 ++p) {
3808 if (p->first->inode->is_mdsdir()) {
3809 CInode *in = p->first->inode;
3810 bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
3811 if (auth) {
3812 in->state_set(CInode::STATE_AUTH);
3813 } else {
3814 in->state_clear(CInode::STATE_AUTH);
3815 if (!replay)
3816 in->state_set(CInode::STATE_REJOINING);
3817 }
3818 }
3819
3820 std::queue<CDir*> dfq; // dirfrag queue
3821 dfq.push(p->first);
3822
3823 bool auth = p->first->authority().first == mds->get_nodeid();
3824 dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;
3825
3826 while (!dfq.empty()) {
3827 CDir *dir = dfq.front();
3828 dfq.pop();
3829
3830 // dir
3831 if (auth) {
3832 dir->state_set(CDir::STATE_AUTH);
3833 } else {
3834 dir->state_clear(CDir::STATE_AUTH);
3835 if (!replay) {
3836 // close empty non-auth dirfrag
3837 if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
3838 dir->inode->close_dirfrag(dir->get_frag());
3839 continue;
3840 }
3841 dir->state_set(CDir::STATE_REJOINING);
3842 dir->state_clear(CDir::STATE_COMPLETE);
3843 if (dir->is_dirty())
3844 dir->mark_clean();
3845 }
3846 }
3847
3848 // dentries in this dir
3849 for (auto &p : dir->items) {
3850 // dn
3851 CDentry *dn = p.second;
3852 CDentry::linkage_t *dnl = dn->get_linkage();
3853 if (auth) {
3854 dn->state_set(CDentry::STATE_AUTH);
3855 } else {
3856 dn->state_clear(CDentry::STATE_AUTH);
3857 if (!replay) {
3858 dn->state_set(CDentry::STATE_REJOINING);
3859 if (dn->is_dirty())
3860 dn->mark_clean();
3861 }
3862 }
3863
3864 if (dnl->is_primary()) {
3865 // inode
3866 CInode *in = dnl->get_inode();
3867 if (auth) {
3868 in->state_set(CInode::STATE_AUTH);
3869 } else {
3870 in->state_clear(CInode::STATE_AUTH);
3871 if (!replay) {
3872 in->state_set(CInode::STATE_REJOINING);
3873 if (in->is_dirty())
3874 in->mark_clean();
3875 if (in->is_dirty_parent())
3876 in->clear_dirty_parent();
3877 // avoid touching scatterlocks for our subtree roots!
3878 if (subtree_inodes.count(in) == 0)
3879 in->clear_scatter_dirty();
3880 }
3881 }
3882 // recurse?
3883 if (in->is_dir()) {
3884 auto&& dfv = in->get_nested_dirfrags();
3885 for (const auto& dir : dfv) {
3886 dfq.push(dir);
3887 }
3888 }
3889 }
3890 }
3891 }
3892 }
3893
3894 show_subtrees();
3895 show_cache();
3896 }
3897
3898
3899
3900 // ===========================================================================
3901 // REJOIN
3902
3903 /*
3904 * notes on scatterlock recovery:
3905 *
3906 * - recovering inode replica sends scatterlock data for any subtree
3907 * roots (the only ones that are possibly dirty).
3908 *
3909 * - surviving auth incorporates any provided scatterlock data. any
3910 * pending gathers are then finished, as with the other lock types.
3911 *
3912 * that takes care of surviving auth + (recovering replica)*.
3913 *
3914 * - surviving replica sends strong_inode, which includes current
3915 * scatterlock state, AND any dirty scatterlock data. this
3916 * provides the recovering auth with everything it might need.
3917 *
3918 * - recovering auth must pick initial scatterlock state based on
3919 * (weak|strong) rejoins.
3920 * - always assimilate scatterlock data (it can't hurt)
3921 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3922 * - include base inode in ack for all inodes that saw scatterlock content
3923 *
3924 * also, for scatter gather,
3925 *
3926 * - auth increments {frag,r}stat.version on completion of any gather.
3927 *
3928 * - auth incorporates changes in a gather _only_ if the version
3929 * matches.
3930 *
3931 * - replica discards changes any time the scatterlock syncs, and
3932 * after recovery.
3933 */
3934
3935 void MDCache::dump_rejoin_status(Formatter *f) const
3936 {
3937 f->open_object_section("rejoin_status");
3938 f->dump_stream("rejoin_gather") << rejoin_gather;
3939 f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
3940 f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
3941 f->close_section();
3942 }
3943
3944 void MDCache::rejoin_start(MDSContext *rejoin_done_)
3945 {
3946 dout(10) << "rejoin_start" << dendl;
3947 ceph_assert(!rejoin_done);
3948 rejoin_done.reset(rejoin_done_);
3949
3950 rejoin_gather = recovery_set;
3951 // need finish opening cap inodes before sending cache rejoins
3952 rejoin_gather.insert(mds->get_nodeid());
3953 process_imported_caps();
3954 }
3955
3956 /*
3957 * rejoin phase!
3958 *
3959 * this initiates rejoin. it should be called before we get any
3960 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3961 *
3962 * we start out by sending rejoins to everyone in the recovery set.
3963 *
3964 * if we are rejoin, send for all regions in our cache.
3965 * if we are active|stopping, send only to nodes that are rejoining.
3966 */
3967 void MDCache::rejoin_send_rejoins()
3968 {
3969 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
3970
3971 if (rejoin_gather.count(mds->get_nodeid())) {
3972 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
3973 rejoins_pending = true;
3974 return;
3975 }
3976 if (!resolve_gather.empty()) {
3977 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3978 << resolve_gather << ")" << dendl;
3979 rejoins_pending = true;
3980 return;
3981 }
3982
3983 ceph_assert(!migrator->is_importing());
3984 ceph_assert(!migrator->is_exporting());
3985
3986 if (!mds->is_rejoin()) {
3987 disambiguate_other_imports();
3988 }
3989
3990 map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;
3991
3992
3993 // if i am rejoining, send a rejoin to everyone.
3994 // otherwise, just send to others who are rejoining.
3995 for (const auto& rank : recovery_set) {
3996 if (rank == mds->get_nodeid()) continue; // nothing to myself!
3997 if (rejoin_sent.count(rank)) continue; // already sent a rejoin to this node!
3998 if (mds->is_rejoin())
3999 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
4000 else if (mds->mdsmap->is_rejoin(rank))
4001 rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
4002 }
4003
4004 if (mds->is_rejoin()) {
4005 map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
4006 for (auto& p : cap_exports) {
4007 mds_rank_t target = p.second.first;
4008 if (rejoins.count(target) == 0)
4009 continue;
4010 for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
4011 Session *session = nullptr;
4012 auto it = client_exports.find(q->first);
4013 if (it != client_exports.end()) {
4014 session = it->second.first;
4015 if (session)
4016 it->second.second.insert(target);
4017 } else {
4018 session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
4019 auto& r = client_exports[q->first];
4020 r.first = session;
4021 if (session)
4022 r.second.insert(target);
4023 }
4024 if (session) {
4025 ++q;
4026 } else {
4027 // remove reconnect with no session
4028 p.second.second.erase(q++);
4029 }
4030 }
4031 rejoins[target]->cap_exports[p.first] = p.second.second;
4032 }
4033 for (auto& p : client_exports) {
4034 Session *session = p.second.first;
4035 for (auto& q : p.second.second) {
4036 auto rejoin = rejoins[q];
4037 rejoin->client_map[p.first] = session->info.inst;
4038 rejoin->client_metadata_map[p.first] = session->info.client_metadata;
4039 }
4040 }
4041 }
4042
4043
4044 // check all subtrees
4045 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
4046 p != subtrees.end();
4047 ++p) {
4048 CDir *dir = p->first;
4049 ceph_assert(dir->is_subtree_root());
4050 if (dir->is_ambiguous_dir_auth()) {
4051 // exporter is recovering, importer is survivor.
4052 ceph_assert(rejoins.count(dir->authority().first));
4053 ceph_assert(!rejoins.count(dir->authority().second));
4054 continue;
4055 }
4056
4057 // my subtree?
4058 if (dir->is_auth())
4059 continue; // skip my own regions!
4060
4061 mds_rank_t auth = dir->get_dir_auth().first;
4062 ceph_assert(auth >= 0);
4063 if (rejoins.count(auth) == 0)
4064 continue; // don't care about this node's subtrees
4065
4066 rejoin_walk(dir, rejoins[auth]);
4067 }
4068
4069 // rejoin root inodes, too
4070 for (auto &p : rejoins) {
4071 if (mds->is_rejoin()) {
4072 // weak
4073 if (p.first == 0 && root) {
4074 p.second->add_weak_inode(root->vino());
4075 if (root->is_dirty_scattered()) {
4076 dout(10) << " sending scatterlock state on root " << *root << dendl;
4077 p.second->add_scatterlock_state(root);
4078 }
4079 }
4080 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4081 if (in)
4082 p.second->add_weak_inode(in->vino());
4083 }
4084 } else {
4085 // strong
4086 if (p.first == 0 && root) {
4087 p.second->add_strong_inode(root->vino(),
4088 root->get_replica_nonce(),
4089 root->get_caps_wanted(),
4090 root->filelock.get_state(),
4091 root->nestlock.get_state(),
4092 root->dirfragtreelock.get_state());
4093 root->state_set(CInode::STATE_REJOINING);
4094 if (root->is_dirty_scattered()) {
4095 dout(10) << " sending scatterlock state on root " << *root << dendl;
4096 p.second->add_scatterlock_state(root);
4097 }
4098 }
4099
4100 if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
4101 p.second->add_strong_inode(in->vino(),
4102 in->get_replica_nonce(),
4103 in->get_caps_wanted(),
4104 in->filelock.get_state(),
4105 in->nestlock.get_state(),
4106 in->dirfragtreelock.get_state());
4107 in->state_set(CInode::STATE_REJOINING);
4108 }
4109 }
4110 }
4111
4112 if (!mds->is_rejoin()) {
4113 // i am survivor. send strong rejoin.
4114 // note request remote_auth_pins, xlocks
4115 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
4116 p != active_requests.end();
4117 ++p) {
4118 MDRequestRef& mdr = p->second;
4119 if (mdr->is_slave())
4120 continue;
4121 // auth pins
4122 for (const auto& q : mdr->object_states) {
4123 if (q.second.remote_auth_pinned == MDS_RANK_NONE)
4124 continue;
4125 if (!q.first->is_auth()) {
4126 mds_rank_t target = q.second.remote_auth_pinned;
4127 ceph_assert(target == q.first->authority().first);
4128 if (rejoins.count(target) == 0) continue;
4129 const auto& rejoin = rejoins[target];
4130
4131 dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
4132 MDSCacheObjectInfo i;
4133 q.first->set_object_info(i);
4134 if (i.ino)
4135 rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
4136 else
4137 rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);
4138
4139 if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
4140 mdr->more()->rename_inode == q.first)
4141 rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
4142 mdr->reqid, mdr->attempt);
4143 }
4144 }
4145 // xlocks
4146 for (const auto& q : mdr->locks) {
4147 auto lock = q.lock;
4148 auto obj = lock->get_parent();
4149 if (q.is_xlock() && !obj->is_auth()) {
4150 mds_rank_t who = obj->authority().first;
4151 if (rejoins.count(who) == 0) continue;
4152 const auto& rejoin = rejoins[who];
4153
4154 dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
4155 MDSCacheObjectInfo i;
4156 obj->set_object_info(i);
4157 if (i.ino)
4158 rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4159 mdr->reqid, mdr->attempt);
4160 else
4161 rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
4162 mdr->reqid, mdr->attempt);
4163 } else if (q.is_remote_wrlock()) {
4164 mds_rank_t who = q.wrlock_target;
4165 if (rejoins.count(who) == 0) continue;
4166 const auto& rejoin = rejoins[who];
4167
4168 dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
4169 MDSCacheObjectInfo i;
4170 obj->set_object_info(i);
4171 ceph_assert(i.ino);
4172 rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
4173 mdr->reqid, mdr->attempt);
4174 }
4175 }
4176 }
4177 }
4178
4179 // send the messages
4180 for (auto &p : rejoins) {
4181 ceph_assert(rejoin_sent.count(p.first) == 0);
4182 ceph_assert(rejoin_ack_gather.count(p.first) == 0);
4183 rejoin_sent.insert(p.first);
4184 rejoin_ack_gather.insert(p.first);
4185 mds->send_message_mds(p.second, p.first);
4186 }
4187 rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too
4188 rejoins_pending = false;
4189
4190 // nothing?
4191 if (mds->is_rejoin() && rejoin_gather.empty()) {
4192 dout(10) << "nothing to rejoin" << dendl;
4193 rejoin_gather_finish();
4194 }
4195 }
4196
4197
4198 /**
4199 * rejoin_walk - build rejoin declarations for a subtree
4200 *
4201 * @param dir subtree root
4202 * @param rejoin rejoin message
4203 *
4204 * from a rejoining node:
4205 * weak dirfrag
4206 * weak dentries (w/ connectivity)
4207 *
4208 * from a surviving node:
4209 * strong dirfrag
4210 * strong dentries (no connectivity!)
4211 * strong inodes
4212 */
4213 void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
4214 {
4215 dout(10) << "rejoin_walk " << *dir << dendl;
4216
4217 std::vector<CDir*> nested; // finish this dir, then do nested items
4218
4219 if (mds->is_rejoin()) {
4220 // WEAK
4221 rejoin->add_weak_dirfrag(dir->dirfrag());
4222 for (auto &p : dir->items) {
4223 CDentry *dn = p.second;
4224 ceph_assert(dn->last == CEPH_NOSNAP);
4225 CDentry::linkage_t *dnl = dn->get_linkage();
4226 dout(15) << " add_weak_primary_dentry " << *dn << dendl;
4227 ceph_assert(dnl->is_primary());
4228 CInode *in = dnl->get_inode();
4229 ceph_assert(dnl->get_inode()->is_dir());
4230 rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
4231 {
4232 auto&& dirs = in->get_nested_dirfrags();
4233 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4234 }
4235 if (in->is_dirty_scattered()) {
4236 dout(10) << " sending scatterlock state on " << *in << dendl;
4237 rejoin->add_scatterlock_state(in);
4238 }
4239 }
4240 } else {
4241 // STRONG
4242 dout(15) << " add_strong_dirfrag " << *dir << dendl;
4243 rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
4244 dir->state_set(CDir::STATE_REJOINING);
4245
4246 for (auto it = dir->items.begin(); it != dir->items.end(); ) {
4247 CDentry *dn = it->second;
4248 ++it;
4249 dn->state_set(CDentry::STATE_REJOINING);
4250 CDentry::linkage_t *dnl = dn->get_linkage();
4251 CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
4252
4253 // trim snap dentries. because they may have been pruned by
4254 // their auth mds (snap deleted)
4255 if (dn->last != CEPH_NOSNAP) {
4256 if (in && !in->remote_parents.empty()) {
4257 // unlink any stale remote snap dentry.
4258 for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
4259 CDentry *remote_dn = *it2;
4260 ++it2;
4261 ceph_assert(remote_dn->last != CEPH_NOSNAP);
4262 remote_dn->unlink_remote(remote_dn->get_linkage());
4263 }
4264 }
4265 if (dn->lru_is_expireable()) {
4266 if (!dnl->is_null())
4267 dir->unlink_inode(dn, false);
4268 if (in)
4269 remove_inode(in);
4270 dir->remove_dentry(dn);
4271 continue;
4272 } else {
4273 // Inventing null/remote dentry shouldn't cause problem
4274 ceph_assert(!dnl->is_primary());
4275 }
4276 }
4277
4278 dout(15) << " add_strong_dentry " << *dn << dendl;
4279 rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4280 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
4281 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
4282 dnl->is_remote() ? dnl->get_remote_d_type():0,
4283 dn->get_replica_nonce(),
4284 dn->lock.get_state());
4285 dn->state_set(CDentry::STATE_REJOINING);
4286 if (dnl->is_primary()) {
4287 CInode *in = dnl->get_inode();
4288 dout(15) << " add_strong_inode " << *in << dendl;
4289 rejoin->add_strong_inode(in->vino(),
4290 in->get_replica_nonce(),
4291 in->get_caps_wanted(),
4292 in->filelock.get_state(),
4293 in->nestlock.get_state(),
4294 in->dirfragtreelock.get_state());
4295 in->state_set(CInode::STATE_REJOINING);
4296 {
4297 auto&& dirs = in->get_nested_dirfrags();
4298 nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
4299 }
4300 if (in->is_dirty_scattered()) {
4301 dout(10) << " sending scatterlock state on " << *in << dendl;
4302 rejoin->add_scatterlock_state(in);
4303 }
4304 }
4305 }
4306 }
4307
4308 // recurse into nested dirs
4309 for (const auto& dir : nested) {
4310 rejoin_walk(dir, rejoin);
4311 }
4312 }
4313
4314
4315 /*
4316 * i got a rejoin.
4317 * - reply with the lockstate
4318 *
4319 * if i am active|stopping,
4320 * - remove source from replica list for everything not referenced here.
4321 */
4322 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4323 {
4324 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4325 << " (" << m->get_payload().length() << " bytes)"
4326 << dendl;
4327
4328 switch (m->op) {
4329 case MMDSCacheRejoin::OP_WEAK:
4330 handle_cache_rejoin_weak(m);
4331 break;
4332 case MMDSCacheRejoin::OP_STRONG:
4333 handle_cache_rejoin_strong(m);
4334 break;
4335 case MMDSCacheRejoin::OP_ACK:
4336 handle_cache_rejoin_ack(m);
4337 break;
4338
4339 default:
4340 ceph_abort();
4341 }
4342 }
4343
4344
4345 /*
4346 * handle_cache_rejoin_weak
4347 *
4348 * the sender
4349 * - is recovering from their journal.
4350 * - may have incorrect (out of date) inode contents
4351 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4352 *
4353 * if the sender didn't trim_non_auth(), they
4354 * - may have incorrect (out of date) dentry/inode linkage
4355 * - may have deleted/purged inodes
4356 * and i may have to go to disk to get accurate inode contents. yuck.
4357 */
4358 void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
4359 {
4360 mds_rank_t from = mds_rank_t(weak->get_source().num());
4361
4362 // possible response(s)
4363 ref_t<MMDSCacheRejoin> ack; // if survivor
4364 set<vinodeno_t> acked_inodes; // if survivor
4365 set<SimpleLock *> gather_locks; // if survivor
4366 bool survivor = false; // am i a survivor?
4367
4368 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4369 survivor = true;
4370 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
4371 ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
4372
4373 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4374
4375 // check cap exports
4376 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4377 CInode *in = get_inode(p->first);
4378 ceph_assert(!in || in->is_auth());
4379 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4380 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4381 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4382 Capability::Import& im = imported_caps[p->first][q->first];
4383 if (cap) {
4384 im.cap_id = cap->get_cap_id();
4385 im.issue_seq = cap->get_last_seq();
4386 im.mseq = cap->get_mseq();
4387 } else {
4388 // all are zero
4389 }
4390 }
4391 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4392 }
4393
4394 encode(imported_caps, ack->imported_caps);
4395 } else {
4396 ceph_assert(mds->is_rejoin());
4397
4398 // we may have already received a strong rejoin from the sender.
4399 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4400 ceph_assert(gather_locks.empty());
4401
4402 // check cap exports.
4403 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4404 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4405 weak->client_metadata_map.end());
4406
4407 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4408 CInode *in = get_inode(p->first);
4409 ceph_assert(!in || in->is_auth());
4410 // note
4411 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4412 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4413 cap_imports[p->first][q->first][from] = q->second;
4414 }
4415 }
4416 }
4417
4418 // assimilate any potentially dirty scatterlock state
4419 for (const auto &p : weak->inode_scatterlocks) {
4420 CInode *in = get_inode(p.first);
4421 ceph_assert(in);
4422 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4423 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4424 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4425 if (!survivor)
4426 rejoin_potential_updated_scatterlocks.insert(in);
4427 }
4428
4429 // recovering peer may send incorrect dirfrags here. we need to
4430 // infer which dirfrag they meant. the ack will include a
4431 // strong_dirfrag that will set them straight on the fragmentation.
4432
4433 // walk weak map
4434 set<CDir*> dirs_to_share;
4435 for (const auto &p : weak->weak_dirfrags) {
4436 CInode *diri = get_inode(p.ino);
4437 if (!diri)
4438 dout(0) << " missing dir ino " << p.ino << dendl;
4439 ceph_assert(diri);
4440
4441 frag_vec_t leaves;
4442 if (diri->dirfragtree.is_leaf(p.frag)) {
4443 leaves.push_back(p.frag);
4444 } else {
4445 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4446 if (leaves.empty())
4447 leaves.push_back(diri->dirfragtree[p.frag.value()]);
4448 }
4449 for (const auto& leaf : leaves) {
4450 CDir *dir = diri->get_dirfrag(leaf);
4451 if (!dir) {
4452 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
4453 continue;
4454 }
4455 ceph_assert(dir);
4456 if (dirs_to_share.count(dir)) {
4457 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4458 } else {
4459 dirs_to_share.insert(dir);
4460 unsigned nonce = dir->add_replica(from);
4461 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4462 if (ack) {
4463 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4464 ack->add_dirfrag_base(dir);
4465 }
4466 }
4467 }
4468 }
4469
4470 for (const auto &p : weak->weak) {
4471 CInode *diri = get_inode(p.first);
4472 if (!diri)
4473 dout(0) << " missing dir ino " << p.first << dendl;
4474 ceph_assert(diri);
4475
4476 // weak dentries
4477 CDir *dir = 0;
4478 for (const auto &q : p.second) {
4479 // locate proper dirfrag.
4480 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4481 frag_t fg = diri->pick_dirfrag(q.first.name);
4482 if (!dir || dir->get_frag() != fg) {
4483 dir = diri->get_dirfrag(fg);
4484 if (!dir)
4485 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4486 ceph_assert(dir);
4487 ceph_assert(dirs_to_share.count(dir));
4488 }
4489
4490 // and dentry
4491 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4492 ceph_assert(dn);
4493 CDentry::linkage_t *dnl = dn->get_linkage();
4494 ceph_assert(dnl->is_primary());
4495
4496 if (survivor && dn->is_replica(from))
4497 dentry_remove_replica(dn, from, gather_locks);
4498 unsigned dnonce = dn->add_replica(from);
4499 dout(10) << " have " << *dn << dendl;
4500 if (ack)
4501 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4502 dnl->get_inode()->ino(), inodeno_t(0), 0,
4503 dnonce, dn->lock.get_replica_state());
4504
4505 // inode
4506 CInode *in = dnl->get_inode();
4507 ceph_assert(in);
4508
4509 if (survivor && in->is_replica(from))
4510 inode_remove_replica(in, from, true, gather_locks);
4511 unsigned inonce = in->add_replica(from);
4512 dout(10) << " have " << *in << dendl;
4513
4514 // scatter the dirlock, just in case?
4515 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4516 in->filelock.set_state(LOCK_MIX);
4517
4518 if (ack) {
4519 acked_inodes.insert(in->vino());
4520 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4521 bufferlist bl;
4522 in->_encode_locks_state_for_rejoin(bl, from);
4523 ack->add_inode_locks(in, inonce, bl);
4524 }
4525 }
4526 }
4527
4528 // weak base inodes? (root, stray, etc.)
4529 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4530 p != weak->weak_inodes.end();
4531 ++p) {
4532 CInode *in = get_inode(*p);
4533 ceph_assert(in); // hmm fixme wrt stray?
4534 if (survivor && in->is_replica(from))
4535 inode_remove_replica(in, from, true, gather_locks);
4536 unsigned inonce = in->add_replica(from);
4537 dout(10) << " have base " << *in << dendl;
4538
4539 if (ack) {
4540 acked_inodes.insert(in->vino());
4541 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4542 bufferlist bl;
4543 in->_encode_locks_state_for_rejoin(bl, from);
4544 ack->add_inode_locks(in, inonce, bl);
4545 }
4546 }
4547
4548 ceph_assert(rejoin_gather.count(from));
4549 rejoin_gather.erase(from);
4550 if (survivor) {
4551 // survivor. do everything now.
4552 for (const auto &p : weak->inode_scatterlocks) {
4553 CInode *in = get_inode(p.first);
4554 ceph_assert(in);
4555 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4556 acked_inodes.insert(in->vino());
4557 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4558 }
4559
4560 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4561 mds->send_message(ack, weak->get_connection());
4562
4563 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4564 if (!(*p)->is_stable())
4565 mds->locker->eval_gather(*p);
4566 }
4567 } else {
4568 // done?
4569 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4570 rejoin_gather_finish();
4571 } else {
4572 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4573 }
4574 }
4575 }
4576
4577 /*
4578 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4579 *
4580 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4581 * ack, the replica dne, and we can remove it from our replica maps.
4582 */
4583 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
4584 set<vinodeno_t>& acked_inodes,
4585 set<SimpleLock *>& gather_locks)
4586 {
4587 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4588
4589 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4590 // inode?
4591 if (in->is_auth() &&
4592 in->is_replica(from) &&
4593 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4594 inode_remove_replica(in, from, false, gather_locks);
4595 dout(10) << " rem " << *in << dendl;
4596 }
4597
4598 if (!in->is_dir())
4599 return;
4600
4601 const auto&& dfs = in->get_dirfrags();
4602 for (const auto& dir : dfs) {
4603 if (!dir->is_auth())
4604 continue;
4605
4606 if (dir->is_replica(from) &&
4607 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4608 dir->remove_replica(from);
4609 dout(10) << " rem " << *dir << dendl;
4610 }
4611
4612 // dentries
4613 for (auto &p : dir->items) {
4614 CDentry *dn = p.second;
4615
4616 if (dn->is_replica(from)) {
4617 if (ack) {
4618 const auto it = ack->strong_dentries.find(dir->dirfrag());
4619 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4620 continue;
4621 }
4622 }
4623 dentry_remove_replica(dn, from, gather_locks);
4624 dout(10) << " rem " << *dn << dendl;
4625 }
4626 }
4627 }
4628 };
4629
4630 for (auto &p : inode_map)
4631 scour_func(p.second);
4632 for (auto &p : snap_inode_map)
4633 scour_func(p.second);
4634 }
4635
4636
4637 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4638 {
4639 CInode *in = new CInode(this, true, 1, last);
4640 in->inode.ino = ino;
4641 in->state_set(CInode::STATE_REJOINUNDEF);
4642 add_inode(in);
4643 rejoin_undef_inodes.insert(in);
4644 dout(10) << " invented " << *in << dendl;
4645 return in;
4646 }
4647
4648 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4649 {
4650 CInode *in = get_inode(df.ino);
4651 if (!in)
4652 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4653 if (!in->is_dir()) {
4654 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4655 in->inode.mode = S_IFDIR;
4656 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4657 }
4658 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4659 dir->state_set(CDir::STATE_REJOINUNDEF);
4660 rejoin_undef_dirfrags.insert(dir);
4661 dout(10) << " invented " << *dir << dendl;
4662 return dir;
4663 }
4664
4665 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4666 {
4667 mds_rank_t from = mds_rank_t(strong->get_source().num());
4668
4669 // only a recovering node will get a strong rejoin.
4670 if (!mds->is_rejoin()) {
4671 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4672 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4673 return;
4674 }
4675 ceph_abort_msg("got unexpected rejoin message during recovery");
4676 }
4677
4678 // assimilate any potentially dirty scatterlock state
4679 for (const auto &p : strong->inode_scatterlocks) {
4680 CInode *in = get_inode(p.first);
4681 ceph_assert(in);
4682 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4683 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4684 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4685 rejoin_potential_updated_scatterlocks.insert(in);
4686 }
4687
4688 rejoin_unlinked_inodes[from].clear();
4689
4690 // surviving peer may send incorrect dirfrag here (maybe they didn't
4691 // get the fragment notify, or maybe we rolled back?). we need to
4692 // infer the right frag and get them with the program. somehow.
4693 // we don't normally send ACK.. so we'll need to bundle this with
4694 // MISSING or something.
4695
4696 // strong dirfrags/dentries.
4697 // also process auth_pins, xlocks.
4698 for (const auto &p : strong->strong_dirfrags) {
4699 auto& dirfrag = p.first;
4700 CInode *diri = get_inode(dirfrag.ino);
4701 if (!diri)
4702 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4703 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4704 bool refragged = false;
4705 if (dir) {
4706 dout(10) << " have " << *dir << dendl;
4707 } else {
4708 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4709 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4710 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4711 dir = rejoin_invent_dirfrag(dirfrag);
4712 }
4713 if (dir) {
4714 dir->add_replica(from, p.second.nonce);
4715 dir->dir_rep = p.second.dir_rep;
4716 } else {
4717 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4718 frag_vec_t leaves;
4719 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4720 if (leaves.empty())
4721 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4722 dout(10) << " maps to frag(s) " << leaves << dendl;
4723 for (const auto& leaf : leaves) {
4724 CDir *dir = diri->get_dirfrag(leaf);
4725 if (!dir)
4726 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4727 else
4728 dout(10) << " have(approx) " << *dir << dendl;
4729 dir->add_replica(from, p.second.nonce);
4730 dir->dir_rep = p.second.dir_rep;
4731 }
4732 refragged = true;
4733 }
4734
4735 const auto it = strong->strong_dentries.find(dirfrag);
4736 if (it != strong->strong_dentries.end()) {
4737 const auto& dmap = it->second;
4738 for (const auto &q : dmap) {
4739 const string_snap_t& ss = q.first;
4740 const MMDSCacheRejoin::dn_strong& d = q.second;
4741 CDentry *dn;
4742 if (!refragged)
4743 dn = dir->lookup(ss.name, ss.snapid);
4744 else {
4745 frag_t fg = diri->pick_dirfrag(ss.name);
4746 dir = diri->get_dirfrag(fg);
4747 ceph_assert(dir);
4748 dn = dir->lookup(ss.name, ss.snapid);
4749 }
4750 if (!dn) {
4751 if (d.is_remote()) {
4752 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4753 } else if (d.is_null()) {
4754 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4755 } else {
4756 CInode *in = get_inode(d.ino, ss.snapid);
4757 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4758 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4759 }
4760 dout(10) << " invented " << *dn << dendl;
4761 }
4762 CDentry::linkage_t *dnl = dn->get_linkage();
4763
4764 // dn auth_pin?
4765 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4766 if (pinned_it != strong->authpinned_dentries.end()) {
4767 const auto slave_reqid_it = pinned_it->second.find(ss);
4768 if (slave_reqid_it != pinned_it->second.end()) {
4769 for (const auto &r : slave_reqid_it->second) {
4770 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4771
4772 // get/create slave mdrequest
4773 MDRequestRef mdr;
4774 if (have_request(r.reqid))
4775 mdr = request_get(r.reqid);
4776 else
4777 mdr = request_start_slave(r.reqid, r.attempt, strong);
4778 mdr->auth_pin(dn);
4779 }
4780 }
4781 }
4782
4783 // dn xlock?
4784 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4785 if (xlocked_it != strong->xlocked_dentries.end()) {
4786 const auto ss_req_it = xlocked_it->second.find(ss);
4787 if (ss_req_it != xlocked_it->second.end()) {
4788 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4789 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4790 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4791 ceph_assert(mdr->is_auth_pinned(dn));
4792 if (!mdr->is_xlocked(&dn->versionlock)) {
4793 ceph_assert(dn->versionlock.can_xlock_local());
4794 dn->versionlock.get_xlock(mdr, mdr->get_client());
4795 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4796 }
4797 if (dn->lock.is_stable())
4798 dn->auth_pin(&dn->lock);
4799 dn->lock.set_state(LOCK_XLOCK);
4800 dn->lock.get_xlock(mdr, mdr->get_client());
4801 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4802 }
4803 }
4804
4805 dn->add_replica(from, d.nonce);
4806 dout(10) << " have " << *dn << dendl;
4807
4808 if (dnl->is_primary()) {
4809 if (d.is_primary()) {
4810 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4811 // the survivor missed MDentryUnlink+MDentryLink messages ?
4812 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4813 CInode *in = get_inode(d.ino, ss.snapid);
4814 ceph_assert(in);
4815 ceph_assert(in->get_parent_dn());
4816 rejoin_unlinked_inodes[from].insert(in);
4817 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4818 }
4819 } else {
4820 // the survivor missed MDentryLink message ?
4821 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4822 dout(7) << " sender doesn't have primay dentry" << dendl;
4823 }
4824 } else {
4825 if (d.is_primary()) {
4826 // the survivor missed MDentryUnlink message ?
4827 CInode *in = get_inode(d.ino, ss.snapid);
4828 ceph_assert(in);
4829 ceph_assert(in->get_parent_dn());
4830 rejoin_unlinked_inodes[from].insert(in);
4831 dout(7) << " sender has primary dentry but we don't" << dendl;
4832 }
4833 }
4834 }
4835 }
4836 }
4837
4838 for (const auto &p : strong->strong_inodes) {
4839 CInode *in = get_inode(p.first);
4840 ceph_assert(in);
4841 in->add_replica(from, p.second.nonce);
4842 dout(10) << " have " << *in << dendl;
4843
4844 const MMDSCacheRejoin::inode_strong& is = p.second;
4845
4846 // caps_wanted
4847 if (is.caps_wanted) {
4848 in->set_mds_caps_wanted(from, is.caps_wanted);
4849 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4850 << " on " << *in << dendl;
4851 }
4852
4853 // scatterlocks?
4854 // infer state from replica state:
4855 // * go to MIX if they might have wrlocks
4856 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4857 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4858 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4859 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4860
4861 // auth pin?
4862 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4863 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4864 for (const auto& r : authpinned_inodes_it->second) {
4865 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4866
4867 // get/create slave mdrequest
4868 MDRequestRef mdr;
4869 if (have_request(r.reqid))
4870 mdr = request_get(r.reqid);
4871 else
4872 mdr = request_start_slave(r.reqid, r.attempt, strong);
4873 if (strong->frozen_authpin_inodes.count(in->vino())) {
4874 ceph_assert(!in->get_num_auth_pins());
4875 mdr->freeze_auth_pin(in);
4876 } else {
4877 ceph_assert(!in->is_frozen_auth_pin());
4878 }
4879 mdr->auth_pin(in);
4880 }
4881 }
4882 // xlock(s)?
4883 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4884 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4885 for (const auto &q : xlocked_inodes_it->second) {
4886 SimpleLock *lock = in->get_lock(q.first);
4887 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4888 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4889 ceph_assert(mdr->is_auth_pinned(in));
4890 if (!mdr->is_xlocked(&in->versionlock)) {
4891 ceph_assert(in->versionlock.can_xlock_local());
4892 in->versionlock.get_xlock(mdr, mdr->get_client());
4893 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4894 }
4895 if (lock->is_stable())
4896 in->auth_pin(lock);
4897 lock->set_state(LOCK_XLOCK);
4898 if (lock == &in->filelock)
4899 in->loner_cap = -1;
4900 lock->get_xlock(mdr, mdr->get_client());
4901 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4902 }
4903 }
4904 }
4905 // wrlock(s)?
4906 for (const auto &p : strong->wrlocked_inodes) {
4907 CInode *in = get_inode(p.first);
4908 for (const auto &q : p.second) {
4909 SimpleLock *lock = in->get_lock(q.first);
4910 for (const auto &r : q.second) {
4911 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4912 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4913 if (in->is_auth())
4914 ceph_assert(mdr->is_auth_pinned(in));
4915 lock->set_state(LOCK_MIX);
4916 if (lock == &in->filelock)
4917 in->loner_cap = -1;
4918 lock->get_wrlock(true);
4919 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4920 }
4921 }
4922 }
4923
4924 // done?
4925 ceph_assert(rejoin_gather.count(from));
4926 rejoin_gather.erase(from);
4927 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4928 rejoin_gather_finish();
4929 } else {
4930 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4931 }
4932 }
4933
4934 void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
4935 {
4936 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4937 mds_rank_t from = mds_rank_t(ack->get_source().num());
4938
4939 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
4940 bool survivor = !mds->is_rejoin();
4941
4942 // for sending cache expire message
4943 set<CInode*> isolated_inodes;
4944 set<CInode*> refragged_inodes;
4945 list<pair<CInode*,int> > updated_realms;
4946
4947 // dirs
4948 for (const auto &p : ack->strong_dirfrags) {
4949 // we may have had incorrect dir fragmentation; refragment based
4950 // on what they auth tells us.
4951 CDir *dir = get_dirfrag(p.first);
4952 if (!dir) {
4953 dir = get_force_dirfrag(p.first, false);
4954 if (dir)
4955 refragged_inodes.insert(dir->get_inode());
4956 }
4957 if (!dir) {
4958 CInode *diri = get_inode(p.first.ino);
4959 if (!diri) {
4960 // barebones inode; the full inode loop below will clean up.
4961 diri = new CInode(this, false);
4962 diri->inode.ino = p.first.ino;
4963 diri->inode.mode = S_IFDIR;
4964 diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4965 add_inode(diri);
4966 if (MDS_INO_MDSDIR(from) == p.first.ino) {
4967 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4968 dout(10) << " add inode " << *diri << dendl;
4969 } else {
4970 diri->inode_auth = CDIR_AUTH_DEFAULT;
4971 isolated_inodes.insert(diri);
4972 dout(10) << " unconnected dirfrag " << p.first << dendl;
4973 }
4974 }
4975 // barebones dirfrag; the full dirfrag loop below will clean up.
4976 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4977 if (MDS_INO_MDSDIR(from) == p.first.ino ||
4978 (dir->authority() != CDIR_AUTH_UNDEF &&
4979 dir->authority().first != from))
4980 adjust_subtree_auth(dir, from);
4981 dout(10) << " add dirfrag " << *dir << dendl;
4982 }
4983
4984 dir->set_replica_nonce(p.second.nonce);
4985 dir->state_clear(CDir::STATE_REJOINING);
4986 dout(10) << " got " << *dir << dendl;
4987
4988 // dentries
4989 auto it = ack->strong_dentries.find(p.first);
4990 if (it != ack->strong_dentries.end()) {
4991 for (const auto &q : it->second) {
4992 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4993 if(!dn)
4994 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
4995
4996 CDentry::linkage_t *dnl = dn->get_linkage();
4997
4998 ceph_assert(dn->last == q.first.snapid);
4999 if (dn->first != q.second.first) {
5000 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
5001 dn->first = q.second.first;
5002 }
5003
5004 // may have bad linkage if we missed dentry link/unlink messages
5005 if (dnl->is_primary()) {
5006 CInode *in = dnl->get_inode();
5007 if (!q.second.is_primary() ||
5008 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
5009 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
5010 dir->unlink_inode(dn);
5011 }
5012 } else if (dnl->is_remote()) {
5013 if (!q.second.is_remote() ||
5014 q.second.remote_ino != dnl->get_remote_ino() ||
5015 q.second.remote_d_type != dnl->get_remote_d_type()) {
5016 dout(10) << " had bad linkage for " << *dn << dendl;
5017 dir->unlink_inode(dn);
5018 }
5019 } else {
5020 if (!q.second.is_null())
5021 dout(10) << " had bad linkage for " << *dn << dendl;
5022 }
5023
5024 // hmm, did we have the proper linkage here?
5025 if (dnl->is_null() && !q.second.is_null()) {
5026 if (q.second.is_remote()) {
5027 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5028 } else {
5029 CInode *in = get_inode(q.second.ino, q.first.snapid);
5030 if (!in) {
5031 // barebones inode; assume it's dir, the full inode loop below will clean up.
5032 in = new CInode(this, false, q.second.first, q.first.snapid);
5033 in->inode.ino = q.second.ino;
5034 in->inode.mode = S_IFDIR;
5035 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5036 add_inode(in);
5037 dout(10) << " add inode " << *in << dendl;
5038 } else if (in->get_parent_dn()) {
5039 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5040 << ", unlinking " << *in << dendl;
5041 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5042 }
5043 dn->dir->link_primary_inode(dn, in);
5044 isolated_inodes.erase(in);
5045 }
5046 }
5047
5048 dn->set_replica_nonce(q.second.nonce);
5049 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5050 dn->state_clear(CDentry::STATE_REJOINING);
5051 dout(10) << " got " << *dn << dendl;
5052 }
5053 }
5054 }
5055
5056 for (const auto& in : refragged_inodes) {
5057 auto&& ls = in->get_nested_dirfrags();
5058 for (const auto& dir : ls) {
5059 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
5060 continue;
5061 ceph_assert(dir->get_num_any() == 0);
5062 in->close_dirfrag(dir->get_frag());
5063 }
5064 }
5065
5066 // full dirfrags
5067 for (const auto &p : ack->dirfrag_bases) {
5068 CDir *dir = get_dirfrag(p.first);
5069 ceph_assert(dir);
5070 auto q = p.second.cbegin();
5071 dir->_decode_base(q);
5072 dout(10) << " got dir replica " << *dir << dendl;
5073 }
5074
5075 // full inodes
5076 auto p = ack->inode_base.cbegin();
5077 while (!p.end()) {
5078 inodeno_t ino;
5079 snapid_t last;
5080 bufferlist basebl;
5081 decode(ino, p);
5082 decode(last, p);
5083 decode(basebl, p);
5084 CInode *in = get_inode(ino, last);
5085 ceph_assert(in);
5086 auto q = basebl.cbegin();
5087 snapid_t sseq = 0;
5088 if (in->snaprealm)
5089 sseq = in->snaprealm->srnode.seq;
5090 in->_decode_base(q);
5091 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5092 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5093 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5094 }
5095 dout(10) << " got inode base " << *in << dendl;
5096 }
5097
5098 // inodes
5099 p = ack->inode_locks.cbegin();
5100 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5101 while (!p.end()) {
5102 inodeno_t ino;
5103 snapid_t last;
5104 __u32 nonce;
5105 bufferlist lockbl;
5106 decode(ino, p);
5107 decode(last, p);
5108 decode(nonce, p);
5109 decode(lockbl, p);
5110
5111 CInode *in = get_inode(ino, last);
5112 ceph_assert(in);
5113 in->set_replica_nonce(nonce);
5114 auto q = lockbl.cbegin();
5115 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5116 in->state_clear(CInode::STATE_REJOINING);
5117 dout(10) << " got inode locks " << *in << dendl;
5118 }
5119
5120 // FIXME: This can happen if entire subtree, together with the inode subtree root
5121 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5122 ceph_assert(isolated_inodes.empty());
5123
5124 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5125 auto bp = ack->imported_caps.cbegin();
5126 decode(peer_imported, bp);
5127
5128 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5129 p != peer_imported.end();
5130 ++p) {
5131 auto& ex = cap_exports.at(p->first);
5132 ceph_assert(ex.first == from);
5133 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5134 q != p->second.end();
5135 ++q) {
5136 auto r = ex.second.find(q->first);
5137 ceph_assert(r != ex.second.end());
5138
5139 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5140 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5141 if (!session) {
5142 dout(10) << " no session for client." << p->first << dendl;
5143 ex.second.erase(r);
5144 continue;
5145 }
5146
5147 // mark client caps stale.
5148 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
5149 r->second.capinfo.cap_id, 0,
5150 mds->get_osd_epoch_barrier());
5151 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5152 (q->second.cap_id > 0 ? from : -1), 0);
5153 mds->send_message_client_counted(m, session);
5154
5155 ex.second.erase(r);
5156 }
5157 ceph_assert(ex.second.empty());
5158 }
5159
5160 for (auto p : updated_realms) {
5161 CInode *in = p.first;
5162 bool notify_clients;
5163 if (mds->is_rejoin()) {
5164 if (!rejoin_pending_snaprealms.count(in)) {
5165 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5166 rejoin_pending_snaprealms.insert(in);
5167 }
5168 notify_clients = false;
5169 } else {
5170 // notify clients if I'm survivor
5171 notify_clients = true;
5172 }
5173 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
5174 }
5175
5176 // done?
5177 ceph_assert(rejoin_ack_gather.count(from));
5178 rejoin_ack_gather.erase(from);
5179 if (!survivor) {
5180 if (rejoin_gather.empty()) {
5181 // eval unstable scatter locks after all wrlocks are rejoined.
5182 while (!rejoin_eval_locks.empty()) {
5183 SimpleLock *lock = rejoin_eval_locks.front();
5184 rejoin_eval_locks.pop_front();
5185 if (!lock->is_stable())
5186 mds->locker->eval_gather(lock);
5187 }
5188 }
5189
5190 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5191 rejoin_ack_gather.empty()) {
5192 // finally, kickstart past snap parent opens
5193 open_snaprealms();
5194 } else {
5195 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5196 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5197 }
5198 } else {
5199 // survivor.
5200 mds->queue_waiters(rejoin_waiters);
5201 }
5202 }
5203
5204 /**
5205 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5206 *
5207 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5208 * messages that clean these guys up...
5209 */
5210 void MDCache::rejoin_trim_undef_inodes()
5211 {
5212 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5213
5214 while (!rejoin_undef_inodes.empty()) {
5215 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5216 CInode *in = *p;
5217 rejoin_undef_inodes.erase(p);
5218
5219 in->clear_replica_map();
5220
5221 // close out dirfrags
5222 if (in->is_dir()) {
5223 const auto&& dfls = in->get_dirfrags();
5224 for (const auto& dir : dfls) {
5225 dir->clear_replica_map();
5226
5227 for (auto &p : dir->items) {
5228 CDentry *dn = p.second;
5229 dn->clear_replica_map();
5230
5231 dout(10) << " trimming " << *dn << dendl;
5232 dir->remove_dentry(dn);
5233 }
5234
5235 dout(10) << " trimming " << *dir << dendl;
5236 in->close_dirfrag(dir->dirfrag().frag);
5237 }
5238 }
5239
5240 CDentry *dn = in->get_parent_dn();
5241 if (dn) {
5242 dn->clear_replica_map();
5243 dout(10) << " trimming " << *dn << dendl;
5244 dn->dir->remove_dentry(dn);
5245 } else {
5246 dout(10) << " trimming " << *in << dendl;
5247 remove_inode(in);
5248 }
5249 }
5250
5251 ceph_assert(rejoin_undef_inodes.empty());
5252 }
5253
5254 void MDCache::rejoin_gather_finish()
5255 {
5256 dout(10) << "rejoin_gather_finish" << dendl;
5257 ceph_assert(mds->is_rejoin());
5258 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
5259
5260 if (open_undef_inodes_dirfrags())
5261 return;
5262
5263 if (process_imported_caps())
5264 return;
5265
5266 choose_lock_states_and_reconnect_caps();
5267
5268 identify_files_to_recover();
5269 rejoin_send_acks();
5270
5271 // signal completion of fetches, rejoin_gather_finish, etc.
5272 rejoin_ack_gather.erase(mds->get_nodeid());
5273
5274 // did we already get our acks too?
5275 if (rejoin_ack_gather.empty()) {
5276 // finally, open snaprealms
5277 open_snaprealms();
5278 }
5279 }
5280
5281 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5282 inodeno_t ino;
5283 public:
5284 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5285 void finish(int r) override {
5286 mdcache->rejoin_open_ino_finish(ino, r);
5287 }
5288 };
5289
5290 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5291 {
5292 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5293
5294 if (ret < 0) {
5295 cap_imports_missing.insert(ino);
5296 } else if (ret == mds->get_nodeid()) {
5297 ceph_assert(get_inode(ino));
5298 } else {
5299 auto p = cap_imports.find(ino);
5300 ceph_assert(p != cap_imports.end());
5301 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5302 ceph_assert(q->second.count(MDS_RANK_NONE));
5303 ceph_assert(q->second.size() == 1);
5304 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5305 }
5306 cap_imports.erase(p);
5307 }
5308
5309 ceph_assert(cap_imports_num_opening > 0);
5310 cap_imports_num_opening--;
5311
5312 if (cap_imports_num_opening == 0) {
5313 if (rejoin_gather.empty())
5314 rejoin_gather_finish();
5315 else if (rejoin_gather.count(mds->get_nodeid()))
5316 process_imported_caps();
5317 }
5318 }
5319
5320 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5321 public:
5322 map<client_t,pair<Session*,uint64_t> > session_map;
5323 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
5324 void finish(int r) override {
5325 ceph_assert(r == 0);
5326 mdcache->rejoin_open_sessions_finish(session_map);
5327 }
5328 };
5329
5330 void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
5331 {
5332 dout(10) << "rejoin_open_sessions_finish" << dendl;
5333 mds->server->finish_force_open_sessions(session_map);
5334 rejoin_session_map.swap(session_map);
5335 if (rejoin_gather.empty())
5336 rejoin_gather_finish();
5337 }
5338
5339 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5340 {
5341 auto p = cap_imports.find(ino);
5342 if (p != cap_imports.end()) {
5343 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5344 if (ret < 0) {
5345 cap_imports_missing.insert(ino);
5346 } else if (ret != mds->get_nodeid()) {
5347 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5348 ceph_assert(q->second.count(MDS_RANK_NONE));
5349 ceph_assert(q->second.size() == 1);
5350 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5351 }
5352 cap_imports.erase(p);
5353 }
5354 }
5355 }
5356
5357 bool MDCache::process_imported_caps()
5358 {
5359 dout(10) << "process_imported_caps" << dendl;
5360
5361 if (!open_file_table.is_prefetched() &&
5362 open_file_table.prefetch_inodes()) {
5363 open_file_table.wait_for_prefetch(
5364 new MDSInternalContextWrapper(mds,
5365 new LambdaContext([this](int r) {
5366 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5367 process_imported_caps();
5368 })
5369 )
5370 );
5371 return true;
5372 }
5373
5374 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5375 CInode *in = get_inode(p->first);
5376 if (in) {
5377 ceph_assert(in->is_auth());
5378 cap_imports_missing.erase(p->first);
5379 continue;
5380 }
5381 if (cap_imports_missing.count(p->first) > 0)
5382 continue;
5383
5384 cap_imports_num_opening++;
5385 dout(10) << " opening missing ino " << p->first << dendl;
5386 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5387 if (!(cap_imports_num_opening % 1000))
5388 mds->heartbeat_reset();
5389 }
5390
5391 if (cap_imports_num_opening > 0)
5392 return true;
5393
5394 // called by rejoin_gather_finish() ?
5395 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
5396 if (!rejoin_client_map.empty() &&
5397 rejoin_session_map.empty()) {
5398 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5399 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5400 rejoin_client_metadata_map,
5401 finish->session_map);
5402 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5403 std::move(rejoin_client_metadata_map));
5404 mds->mdlog->start_submit_entry(le, finish);
5405 mds->mdlog->flush();
5406 rejoin_client_map.clear();
5407 rejoin_client_metadata_map.clear();
5408 return true;
5409 }
5410
5411 // process caps that were exported by slave rename
5412 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5413 p != rejoin_slave_exports.end();
5414 ++p) {
5415 CInode *in = get_inode(p->first);
5416 ceph_assert(in);
5417 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5418 q != p->second.second.end();
5419 ++q) {
5420 auto r = rejoin_session_map.find(q->first);
5421 if (r == rejoin_session_map.end())
5422 continue;
5423
5424 Session *session = r->second.first;
5425 Capability *cap = in->get_client_cap(q->first);
5426 if (!cap) {
5427 cap = in->add_client_cap(q->first, session);
5428 // add empty item to reconnected_caps
5429 (void)reconnected_caps[p->first][q->first];
5430 }
5431 cap->merge(q->second, true);
5432
5433 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5434 ceph_assert(cap->get_last_seq() == im.issue_seq);
5435 ceph_assert(cap->get_mseq() == im.mseq);
5436 cap->set_cap_id(im.cap_id);
5437 // send cap import because we assigned a new cap ID
5438 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5439 p->second.first, CEPH_CAP_FLAG_AUTH);
5440 }
5441 }
5442 rejoin_slave_exports.clear();
5443 rejoin_imported_caps.clear();
5444
5445 // process cap imports
5446 // ino -> client -> frommds -> capex
5447 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5448 CInode *in = get_inode(p->first);
5449 if (!in) {
5450 dout(10) << " still missing ino " << p->first
5451 << ", will try again after replayed client requests" << dendl;
5452 ++p;
5453 continue;
5454 }
5455 ceph_assert(in->is_auth());
5456 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5457 Session *session;
5458 {
5459 auto r = rejoin_session_map.find(q->first);
5460 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5461 }
5462
5463 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5464 if (!session) {
5465 if (r->first >= 0)
5466 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5467 continue;
5468 }
5469
5470 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5471 add_reconnected_cap(q->first, in->ino(), r->second);
5472 if (r->first >= 0) {
5473 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5474 cap->inc_mseq();
5475 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5476
5477 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5478 im.cap_id = cap->get_cap_id();
5479 im.issue_seq = cap->get_last_seq();
5480 im.mseq = cap->get_mseq();
5481 }
5482 }
5483 }
5484 cap_imports.erase(p++); // remove and move on
5485 }
5486 } else {
5487 trim_non_auth();
5488
5489 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5490 rejoin_gather.erase(mds->get_nodeid());
5491 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
5492 maybe_send_pending_rejoins();
5493 }
5494 return false;
5495 }
5496
5497 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5498 client_t client, snapid_t snap_follows)
5499 {
5500 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5501
5502 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5503 return;
5504
5505 const set<snapid_t>& snaps = realm->get_snaps();
5506 snapid_t follows = snap_follows;
5507
5508 while (true) {
5509 CInode *in = pick_inode_snap(head_in, follows);
5510 if (in == head_in)
5511 break;
5512
5513 bool need_snapflush = false;
5514 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5515 p != snaps.end() && *p <= in->last;
5516 ++p) {
5517 head_in->add_need_snapflush(in, *p, client);
5518 need_snapflush = true;
5519 }
5520 follows = in->last;
5521 if (!need_snapflush)
5522 continue;
5523
5524 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5525
5526 if (in->client_snap_caps.empty()) {
5527 for (int i = 0; i < num_cinode_locks; i++) {
5528 int lockid = cinode_lock_info[i].lock;
5529 SimpleLock *lock = in->get_lock(lockid);
5530 ceph_assert(lock);
5531 in->auth_pin(lock);
5532 lock->set_state(LOCK_SNAP_SYNC);
5533 lock->get_wrlock(true);
5534 }
5535 }
5536 in->client_snap_caps.insert(client);
5537 mds->locker->mark_need_snapflush_inode(in);
5538 }
5539 }
5540
5541 /*
5542 * choose lock states based on reconnected caps
5543 */
5544 void MDCache::choose_lock_states_and_reconnect_caps()
5545 {
5546 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5547
5548 int count = 0;
5549 for (auto p : inode_map) {
5550 CInode *in = p.second;
5551 if (in->last != CEPH_NOSNAP)
5552 continue;
5553
5554 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5555 in->mark_dirty_rstat();
5556
5557 int dirty_caps = 0;
5558 auto q = reconnected_caps.find(in->ino());
5559 if (q != reconnected_caps.end()) {
5560 for (const auto &it : q->second)
5561 dirty_caps |= it.second.dirty_caps;
5562 }
5563 in->choose_lock_states(dirty_caps);
5564 dout(15) << " chose lock states on " << *in << dendl;
5565
5566 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5567 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5568 rejoin_pending_snaprealms.insert(in);
5569 }
5570
5571 if (!(++count % 1000))
5572 mds->heartbeat_reset();
5573 }
5574 }
5575
5576 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5577 map<client_t,ref_t<MClientSnap>>& splits)
5578 {
5579 ref_t<MClientSnap> snap;
5580 auto it = splits.find(client);
5581 if (it != splits.end()) {
5582 snap = it->second;
5583 snap->head.op = CEPH_SNAP_OP_SPLIT;
5584 } else {
5585 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5586 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5587 snap->head.split = realm->inode->ino();
5588 snap->bl = realm->get_snap_trace();
5589
5590 for (const auto& child : realm->open_children)
5591 snap->split_realms.push_back(child->inode->ino());
5592 }
5593 snap->split_inos.push_back(ino);
5594 }
5595
5596 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5597 map<client_t,ref_t<MClientSnap>>& splits)
5598 {
5599 ceph_assert(parent_realm);
5600
5601 vector<inodeno_t> split_inos;
5602 vector<inodeno_t> split_realms;
5603
5604 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5605 !p.end();
5606 ++p)
5607 split_inos.push_back((*p)->ino());
5608 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5609 p != realm->open_children.end();
5610 ++p)
5611 split_realms.push_back((*p)->inode->ino());
5612
5613 for (const auto& p : realm->client_caps) {
5614 ceph_assert(!p.second->empty());
5615 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5616 if (em.second) {
5617 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5618 update->head.split = parent_realm->inode->ino();
5619 update->split_inos = split_inos;
5620 update->split_realms = split_realms;
5621 update->bl = parent_realm->get_snap_trace();
5622 em.first->second = std::move(update);
5623 }
5624 }
5625 }
5626
5627 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5628 {
5629 dout(10) << "send_snaps" << dendl;
5630
5631 for (auto &p : splits) {
5632 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5633 if (session) {
5634 dout(10) << " client." << p.first
5635 << " split " << p.second->head.split
5636 << " inos " << p.second->split_inos
5637 << dendl;
5638 mds->send_message_client_counted(p.second, session);
5639 } else {
5640 dout(10) << " no session for client." << p.first << dendl;
5641 }
5642 }
5643 splits.clear();
5644 }
5645
5646
5647 /*
5648 * remove any items from logsegment open_file lists that don't have
5649 * any caps
5650 */
5651 void MDCache::clean_open_file_lists()
5652 {
5653 dout(10) << "clean_open_file_lists" << dendl;
5654
5655 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5656 p != mds->mdlog->segments.end();
5657 ++p) {
5658 LogSegment *ls = p->second;
5659
5660 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5661 while (!q.end()) {
5662 CInode *in = *q;
5663 ++q;
5664 if (in->last == CEPH_NOSNAP) {
5665 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5666 in->item_open_file.remove_myself();
5667 } else {
5668 if (in->client_snap_caps.empty()) {
5669 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5670 in->item_open_file.remove_myself();
5671 }
5672 }
5673 }
5674 }
5675 }
5676
5677 void MDCache::dump_openfiles(Formatter *f)
5678 {
5679 f->open_array_section("openfiles");
5680 for (auto p = mds->mdlog->segments.begin();
5681 p != mds->mdlog->segments.end();
5682 ++p) {
5683 LogSegment *ls = p->second;
5684
5685 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5686 while (!q.end()) {
5687 CInode *in = *q;
5688 ++q;
5689 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5690 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5691 continue;
5692 f->open_object_section("file");
5693 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5694 f->close_section();
5695 }
5696 }
5697 f->close_section();
5698 }
5699
5700 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5701 {
5702 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5703 << " on " << *in << dendl;
5704 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5705 if (!session) {
5706 dout(10) << " no session for client." << client << dendl;
5707 return NULL;
5708 }
5709
5710 Capability *cap = in->reconnect_cap(client, icr, session);
5711
5712 if (frommds >= 0) {
5713 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5714 cap->inc_mseq();
5715 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5716 }
5717
5718 return cap;
5719 }
5720
5721 void MDCache::export_remaining_imported_caps()
5722 {
5723 dout(10) << "export_remaining_imported_caps" << dendl;
5724
5725 stringstream warn_str;
5726
5727 int count = 0;
5728 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5729 warn_str << " ino " << p->first << "\n";
5730 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5731 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5732 if (session) {
5733 // mark client caps stale.
5734 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5735 0, 0, 0,
5736 mds->get_osd_epoch_barrier());
5737 stale->set_cap_peer(0, 0, 0, -1, 0);
5738 mds->send_message_client_counted(stale, q->first);
5739 }
5740 }
5741
5742 if (!(++count % 1000))
5743 mds->heartbeat_reset();
5744 }
5745
5746 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5747 p != cap_reconnect_waiters.end();
5748 ++p)
5749 mds->queue_waiters(p->second);
5750
5751 cap_imports.clear();
5752 cap_reconnect_waiters.clear();
5753
5754 if (warn_str.peek() != EOF) {
5755 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5756 mds->clog->warn(warn_str);
5757 }
5758 }
5759
5760 Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
5761 {
5762 client_t client = session->info.get_client();
5763 Capability *cap = nullptr;
5764 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5765 if (rc) {
5766 cap = in->reconnect_cap(client, *rc, session);
5767 dout(10) << "try_reconnect_cap client." << client
5768 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5769 << " issue " << ccap_string(rc->capinfo.issued)
5770 << " on " << *in << dendl;
5771 remove_replay_cap_reconnect(in->ino(), client);
5772
5773 if (in->is_replicated()) {
5774 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5775 } else {
5776 int dirty_caps = 0;
5777 auto p = reconnected_caps.find(in->ino());
5778 if (p != reconnected_caps.end()) {
5779 auto q = p->second.find(client);
5780 if (q != p->second.end())
5781 dirty_caps = q->second.dirty_caps;
5782 }
5783 in->choose_lock_states(dirty_caps);
5784 dout(15) << " chose lock states on " << *in << dendl;
5785 }
5786
5787 map<inodeno_t, MDSContext::vec >::iterator it =
5788 cap_reconnect_waiters.find(in->ino());
5789 if (it != cap_reconnect_waiters.end()) {
5790 mds->queue_waiters(it->second);
5791 cap_reconnect_waiters.erase(it);
5792 }
5793 }
5794 return cap;
5795 }
5796
5797
5798
5799 // -------
5800 // cap imports and delayed snap parent opens
5801
5802 void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
5803 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
5804 int peer, int p_flags)
5805 {
5806 SnapRealm *realm = in->find_snaprealm();
5807 if (realm->have_past_parents_open()) {
5808 dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
5809 if (cap->get_last_seq() == 0) // reconnected cap
5810 cap->inc_last_seq();
5811 cap->set_last_issue();
5812 cap->set_last_issue_stamp(ceph_clock_now());
5813 cap->clear_new();
5814 auto reap = make_message<MClientCaps>(
5815 CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(),
5816 cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(),
5817 mds->get_osd_epoch_barrier());
5818 in->encode_cap_message(reap, cap);
5819 reap->snapbl = realm->get_snap_trace();
5820 reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
5821 mds->send_message_client_counted(reap, session);
5822 } else {
5823 ceph_abort();
5824 }
5825 }
5826
5827 void MDCache::do_delayed_cap_imports()
5828 {
5829 dout(10) << "do_delayed_cap_imports" << dendl;
5830
5831 ceph_assert(delayed_imported_caps.empty());
5832 }
5833
5834 struct C_MDC_OpenSnapRealms : public MDCacheContext {
5835 explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
5836 void finish(int r) override {
5837 mdcache->open_snaprealms();
5838 }
5839 };
5840
5841 void MDCache::open_snaprealms()
5842 {
5843 dout(10) << "open_snaprealms" << dendl;
5844
5845 MDSGatherBuilder gather(g_ceph_context);
5846
5847 auto it = rejoin_pending_snaprealms.begin();
5848 while (it != rejoin_pending_snaprealms.end()) {
5849 CInode *in = *it;
5850 SnapRealm *realm = in->snaprealm;
5851 ceph_assert(realm);
5852 if (realm->have_past_parents_open() ||
5853 realm->open_parents(gather.new_sub())) {
5854 dout(10) << " past parents now open on " << *in << dendl;
5855
5856 map<client_t,ref_t<MClientSnap>> splits;
5857 // finish off client snaprealm reconnects?
5858 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5859 if (q != reconnected_snaprealms.end()) {
5860 for (const auto& r : q->second)
5861 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5862 reconnected_snaprealms.erase(q);
5863 }
5864
5865 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5866 !p.end(); ++p) {
5867 CInode *child = *p;
5868 auto q = reconnected_caps.find(child->ino());
5869 ceph_assert(q != reconnected_caps.end());
5870 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5871 Capability *cap = child->get_client_cap(r->first);
5872 if (!cap)
5873 continue;
5874 if (r->second.snap_follows > 0) {
5875 if (r->second.snap_follows < child->first - 1) {
5876 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5877 } else if (r->second.snapflush) {
5878 // When processing a cap flush message that is re-sent, it's possble
5879 // that the sender has already released all WR caps. So we should
5880 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5881 cap->mark_needsnapflush();
5882 }
5883 }
5884 // make sure client's cap is in the correct snaprealm.
5885 if (r->second.realm_ino != in->ino()) {
5886 prepare_realm_split(realm, r->first, child->ino(), splits);
5887 }
5888 }
5889 }
5890
5891 rejoin_pending_snaprealms.erase(it++);
5892 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5893
5894 send_snaps(splits);
5895 } else {
5896 dout(10) << " opening past parents on " << *in << dendl;
5897 ++it;
5898 }
5899 }
5900
5901 if (gather.has_subs()) {
5902 if (gather.num_subs_remaining() == 0) {
5903 // cleanup gather
5904 gather.set_finisher(new C_MDSInternalNoop);
5905 gather.activate();
5906 } else {
5907 // for multimds, must succeed the first time
5908 ceph_assert(recovery_set.empty());
5909
5910 dout(10) << "open_snaprealms - waiting for "
5911 << gather.num_subs_remaining() << dendl;
5912 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5913 gather.activate();
5914 return;
5915 }
5916 }
5917
5918 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5919
5920 if (!reconnected_snaprealms.empty()) {
5921 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5922 for (auto& p : reconnected_snaprealms) {
5923 stringstream warn_str;
5924 warn_str << " " << p.first << " {";
5925 bool first = true;
5926 for (auto& q : p.second) {
5927 if (!first)
5928 warn_str << ", ";
5929 warn_str << "client." << q.first << "/" << q.second;
5930 }
5931 warn_str << "}";
5932 dout(5) << warn_str.str() << dendl;
5933 }
5934 }
5935 ceph_assert(rejoin_waiters.empty());
5936 ceph_assert(rejoin_pending_snaprealms.empty());
5937 dout(10) << "open_snaprealms - all open" << dendl;
5938 do_delayed_cap_imports();
5939
5940 ceph_assert(rejoin_done);
5941 rejoin_done.release()->complete(0);
5942 reconnected_caps.clear();
5943 }
5944
5945 bool MDCache::open_undef_inodes_dirfrags()
5946 {
5947 dout(10) << "open_undef_inodes_dirfrags "
5948 << rejoin_undef_inodes.size() << " inodes "
5949 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5950
5951 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5952
5953 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5954 p != rejoin_undef_inodes.end();
5955 ++p) {
5956 CInode *in = *p;
5957 ceph_assert(!in->is_base());
5958 fetch_queue.insert(in->get_parent_dir());
5959 }
5960
5961 if (fetch_queue.empty())
5962 return false;
5963
5964 MDSGatherBuilder gather(g_ceph_context,
5965 new MDSInternalContextWrapper(mds,
5966 new LambdaContext([this](int r) {
5967 if (rejoin_gather.empty())
5968 rejoin_gather_finish();
5969 })
5970 )
5971 );
5972
5973 for (set<CDir*>::iterator p = fetch_queue.begin();
5974 p != fetch_queue.end();
5975 ++p) {
5976 CDir *dir = *p;
5977 CInode *diri = dir->get_inode();
5978 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5979 continue;
5980 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5981 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5982 dir->fetch(gather.new_sub());
5983 }
5984 ceph_assert(gather.has_subs());
5985 gather.activate();
5986 return true;
5987 }
5988
5989 void MDCache::opened_undef_inode(CInode *in) {
5990 dout(10) << "opened_undef_inode " << *in << dendl;
5991 rejoin_undef_inodes.erase(in);
5992 if (in->is_dir()) {
5993 // FIXME: re-hash dentries if necessary
5994 ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
5995 if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
5996 CDir *dir = in->get_dirfrag(frag_t());
5997 ceph_assert(dir);
5998 rejoin_undef_dirfrags.erase(dir);
5999 in->force_dirfrags();
6000 auto&& ls = in->get_dirfrags();
6001 for (const auto& dir : ls) {
6002 rejoin_undef_dirfrags.insert(dir);
6003 }
6004 }
6005 }
6006 }
6007
6008 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6009 map<client_t,ref_t<MClientSnap>>& updates)
6010 {
6011 if (seq < realm->get_newest_seq()) {
6012 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
6013 << realm->get_newest_seq() << " on " << *realm << dendl;
6014 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
6015 snap->bl = realm->get_snap_trace();
6016 for (const auto& child : realm->open_children)
6017 snap->split_realms.push_back(child->inode->ino());
6018 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
6019 } else {
6020 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6021 << " on " << *realm << dendl;
6022 }
6023 }
6024
6025
6026
6027 void MDCache::rejoin_send_acks()
6028 {
6029 dout(7) << "rejoin_send_acks" << dendl;
6030
6031 // replicate stray
6032 for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
6033 p != rejoin_unlinked_inodes.end();
6034 ++p) {
6035 for (set<CInode*>::iterator q = p->second.begin();
6036 q != p->second.end();
6037 ++q) {
6038 CInode *in = *q;
6039 dout(7) << " unlinked inode " << *in << dendl;
6040 // inode expired
6041 if (!in->is_replica(p->first))
6042 continue;
6043 while (1) {
6044 CDentry *dn = in->get_parent_dn();
6045 if (dn->is_replica(p->first))
6046 break;
6047 dn->add_replica(p->first);
6048 CDir *dir = dn->get_dir();
6049 if (dir->is_replica(p->first))
6050 break;
6051 dir->add_replica(p->first);
6052 in = dir->get_inode();
6053 if (in->is_replica(p->first))
6054 break;
6055 in->add_replica(p->first);
6056 if (in->is_base())
6057 break;
6058 }
6059 }
6060 }
6061 rejoin_unlinked_inodes.clear();
6062
6063 // send acks to everyone in the recovery set
6064 map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
6065 for (set<mds_rank_t>::iterator p = recovery_set.begin();
6066 p != recovery_set.end();
6067 ++p) {
6068 if (rejoin_ack_sent.count(*p))
6069 continue;
6070 acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
6071 }
6072
6073 rejoin_ack_sent = recovery_set;
6074
6075 // walk subtrees
6076 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
6077 p != subtrees.end();
6078 ++p) {
6079 CDir *dir = p->first;
6080 if (!dir->is_auth())
6081 continue;
6082 dout(10) << "subtree " << *dir << dendl;
6083
6084 // auth items in this subtree
6085 std::queue<CDir*> dq;
6086 dq.push(dir);
6087
6088 while (!dq.empty()) {
6089 CDir *dir = dq.front();
6090 dq.pop();
6091
6092 // dir
6093 for (auto &r : dir->get_replicas()) {
6094 auto it = acks.find(r.first);
6095 if (it == acks.end())
6096 continue;
6097 it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
6098 it->second->add_dirfrag_base(dir);
6099 }
6100
6101 for (auto &p : dir->items) {
6102 CDentry *dn = p.second;
6103 CDentry::linkage_t *dnl = dn->get_linkage();
6104
6105 // inode
6106 CInode *in = NULL;
6107 if (dnl->is_primary())
6108 in = dnl->get_inode();
6109
6110 // dentry
6111 for (auto &r : dn->get_replicas()) {
6112 auto it = acks.find(r.first);
6113 if (it == acks.end())
6114 continue;
6115 it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
6116 dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
6117 dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
6118 dnl->is_remote() ? dnl->get_remote_d_type():0,
6119 ++r.second,
6120 dn->lock.get_replica_state());
6121 // peer missed MDentrylink message ?
6122 if (in && !in->is_replica(r.first))
6123 in->add_replica(r.first);
6124 }
6125
6126 if (!in)
6127 continue;
6128
6129 for (auto &r : in->get_replicas()) {
6130 auto it = acks.find(r.first);
6131 if (it == acks.end())
6132 continue;
6133 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6134 bufferlist bl;
6135 in->_encode_locks_state_for_rejoin(bl, r.first);
6136 it->second->add_inode_locks(in, ++r.second, bl);
6137 }
6138
6139 // subdirs in this subtree?
6140 {
6141 auto&& dirs = in->get_nested_dirfrags();
6142 for (const auto& dir : dirs) {
6143 dq.push(dir);
6144 }
6145 }
6146 }
6147 }
6148 }
6149
6150 // base inodes too
6151 if (root && root->is_auth())
6152 for (auto &r : root->get_replicas()) {
6153 auto it = acks.find(r.first);
6154 if (it == acks.end())
6155 continue;
6156 it->second->add_inode_base(root, mds->mdsmap->get_up_features());
6157 bufferlist bl;
6158 root->_encode_locks_state_for_rejoin(bl, r.first);
6159 it->second->add_inode_locks(root, ++r.second, bl);
6160 }
6161 if (myin)
6162 for (auto &r : myin->get_replicas()) {
6163 auto it = acks.find(r.first);
6164 if (it == acks.end())
6165 continue;
6166 it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
6167 bufferlist bl;
6168 myin->_encode_locks_state_for_rejoin(bl, r.first);
6169 it->second->add_inode_locks(myin, ++r.second, bl);
6170 }
6171
6172 // include inode base for any inodes whose scatterlocks may have updated
6173 for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
6174 p != rejoin_potential_updated_scatterlocks.end();
6175 ++p) {
6176 CInode *in = *p;
6177 for (const auto &r : in->get_replicas()) {
6178 auto it = acks.find(r.first);
6179 if (it == acks.end())
6180 continue;
6181 it->second->add_inode_base(in, mds->mdsmap->get_up_features());
6182 }
6183 }
6184
6185 // send acks
6186 for (auto p = acks.begin(); p != acks.end(); ++p) {
6187 encode(rejoin_imported_caps[p->first], p->second->imported_caps);
6188 mds->send_message_mds(p->second, p->first);
6189 }
6190
6191 rejoin_imported_caps.clear();
6192 }
6193
6194 class C_MDC_ReIssueCaps : public MDCacheContext {
6195 CInode *in;
6196 public:
6197 C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
6198 MDCacheContext(mdc), in(i)
6199 {
6200 in->get(CInode::PIN_PTRWAITER);
6201 }
6202 void finish(int r) override {
6203 if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
6204 mdcache->mds->locker->issue_caps(in);
6205 in->put(CInode::PIN_PTRWAITER);
6206 }
6207 };
6208
6209 void MDCache::reissue_all_caps()
6210 {
6211 dout(10) << "reissue_all_caps" << dendl;
6212
6213 int count = 0;
6214 for (auto &p : inode_map) {
6215 int n = 1;
6216 CInode *in = p.second;
6217 if (in->is_head() && in->is_any_caps()) {
6218 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6219 if (in->is_frozen_inode()) {
6220 in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
6221 continue;
6222 }
6223 if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
6224 n += mds->locker->issue_caps(in);
6225 }
6226
6227 if ((count % 1000) + n >= 1000)
6228 mds->heartbeat_reset();
6229 count += n;
6230 }
6231 }
6232
6233
6234 // ===============================================================================
6235
6236 struct C_MDC_QueuedCow : public MDCacheContext {
6237 CInode *in;
6238 MutationRef mut;
6239 C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
6240 MDCacheContext(mdc), in(i), mut(m) {}
6241 void finish(int r) override {
6242 mdcache->_queued_file_recover_cow(in, mut);
6243 }
6244 };
6245
6246
6247 void MDCache::queue_file_recover(CInode *in)
6248 {
6249 dout(10) << "queue_file_recover " << *in << dendl;
6250 ceph_assert(in->is_auth());
6251
6252 // cow?
6253 /*
6254 SnapRealm *realm = in->find_snaprealm();
6255 set<snapid_t> s = realm->get_snaps();
6256 while (!s.empty() && *s.begin() < in->first)
6257 s.erase(s.begin());
6258 while (!s.empty() && *s.rbegin() > in->last)
6259 s.erase(*s.rbegin());
6260 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6261 if (s.size() > 1) {
6262 CInode::mempool_inode pi = in->project_inode();
6263 pi->version = in->pre_dirty();
6264
6265 auto mut(std::make_shared<MutationImpl>());
6266 mut->ls = mds->mdlog->get_current_segment();
6267 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6268 mds->mdlog->start_entry(le);
6269 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6270
6271 s.erase(*s.begin());
6272 while (!s.empty()) {
6273 snapid_t snapid = *s.begin();
6274 CInode *cow_inode = 0;
6275 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6276 ceph_assert(cow_inode);
6277 recovery_queue.enqueue(cow_inode);
6278 s.erase(*s.begin());
6279 }
6280
6281 in->parent->first = in->first;
6282 le->metablob.add_primary_dentry(in->parent, in, true);
6283 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6284 mds->mdlog->flush();
6285 }
6286 */
6287
6288 recovery_queue.enqueue(in);
6289 }
6290
6291 void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
6292 {
6293 in->pop_and_dirty_projected_inode(mut->ls);
6294 mut->apply();
6295 mds->locker->drop_locks(mut.get());
6296 mut->cleanup();
6297 }
6298
6299
6300 /*
6301 * called after recovery to recover file sizes for previously opened (for write)
6302 * files. that is, those where max_size > size.
6303 */
6304 void MDCache::identify_files_to_recover()
6305 {
6306 dout(10) << "identify_files_to_recover" << dendl;
6307 int count = 0;
6308 for (auto &p : inode_map) {
6309 CInode *in = p.second;
6310 if (!in->is_auth())
6311 continue;
6312
6313 if (in->last != CEPH_NOSNAP)
6314 continue;
6315
6316 // Only normal files need file size recovery
6317 if (!in->is_file()) {
6318 continue;
6319 }
6320
6321 bool recover = false;
6322 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6323 p != in->inode.client_ranges.end();
6324 ++p) {
6325 Capability *cap = in->get_client_cap(p->first);
6326 if (cap) {
6327 cap->mark_clientwriteable();
6328 } else {
6329 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6330 recover = true;
6331 break;
6332 }
6333 }
6334
6335 if (recover) {
6336 if (in->filelock.is_stable()) {
6337 in->auth_pin(&in->filelock);
6338 } else {
6339 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6340 }
6341 in->filelock.set_state(LOCK_PRE_SCAN);
6342 rejoin_recover_q.push_back(in);
6343 } else {
6344 rejoin_check_q.push_back(in);
6345 }
6346
6347 if (!(++count % 1000))
6348 mds->heartbeat_reset();
6349 }
6350 }
6351
6352 void MDCache::start_files_to_recover()
6353 {
6354 for (CInode *in : rejoin_check_q) {
6355 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6356 mds->locker->issue_caps(in);
6357 mds->locker->check_inode_max_size(in);
6358 }
6359 rejoin_check_q.clear();
6360 for (CInode *in : rejoin_recover_q) {
6361 mds->locker->file_recover(&in->filelock);
6362 }
6363 if (!rejoin_recover_q.empty()) {
6364 rejoin_recover_q.clear();
6365 do_file_recover();
6366 }
6367 }
6368
6369 void MDCache::do_file_recover()
6370 {
6371 recovery_queue.advance();
6372 }
6373
6374 // ===============================================================================
6375
6376
6377 // ----------------------------
6378 // truncate
6379
6380 class C_MDC_RetryTruncate : public MDCacheContext {
6381 CInode *in;
6382 LogSegment *ls;
6383 public:
6384 C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
6385 MDCacheContext(c), in(i), ls(l) {}
6386 void finish(int r) override {
6387 mdcache->_truncate_inode(in, ls);
6388 }
6389 };
6390
6391 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
6392 {
6393 auto pi = in->get_projected_inode();
6394 dout(10) << "truncate_inode "
6395 << pi->truncate_from << " -> " << pi->truncate_size
6396 << " on " << *in
6397 << dendl;
6398
6399 ls->truncating_inodes.insert(in);
6400 in->get(CInode::PIN_TRUNCATING);
6401 in->auth_pin(this);
6402
6403 if (!in->client_need_snapflush.empty() &&
6404 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6405 ceph_assert(in->filelock.is_xlocked());
6406 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6407 mds->locker->issue_caps(in);
6408 return;
6409 }
6410
6411 _truncate_inode(in, ls);
6412 }
6413
6414 struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
6415 CInode *in;
6416 LogSegment *ls;
6417 C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
6418 MDCacheIOContext(c, false), in(i), ls(l) {
6419 }
6420 void finish(int r) override {
6421 ceph_assert(r == 0 || r == -ENOENT);
6422 mdcache->truncate_inode_finish(in, ls);
6423 }
6424 void print(ostream& out) const override {
6425 out << "file_truncate(" << in->ino() << ")";
6426 }
6427 };
6428
6429 void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
6430 {
6431 auto pi = &in->inode;
6432 dout(10) << "_truncate_inode "
6433 << pi->truncate_from << " -> " << pi->truncate_size
6434 << " on " << *in << dendl;
6435
6436 ceph_assert(pi->is_truncating());
6437 ceph_assert(pi->truncate_size < (1ULL << 63));
6438 ceph_assert(pi->truncate_from < (1ULL << 63));
6439 ceph_assert(pi->truncate_size < pi->truncate_from);
6440
6441
6442 SnapRealm *realm = in->find_snaprealm();
6443 SnapContext nullsnap;
6444 const SnapContext *snapc;
6445 if (realm) {
6446 dout(10) << " realm " << *realm << dendl;
6447 snapc = &realm->get_snap_context();
6448 } else {
6449 dout(10) << " NO realm, using null context" << dendl;
6450 snapc = &nullsnap;
6451 ceph_assert(in->last == CEPH_NOSNAP);
6452 }
6453 dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
6454 filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
6455 pi->truncate_size, pi->truncate_from-pi->truncate_size,
6456 pi->truncate_seq, ceph::real_time::min(), 0,
6457 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
6458 mds->finisher));
6459 }
6460
6461 struct C_MDC_TruncateLogged : public MDCacheLogContext {
6462 CInode *in;
6463 MutationRef mut;
6464 C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
6465 MDCacheLogContext(m), in(i), mut(mu) {}
6466 void finish(int r) override {
6467 mdcache->truncate_inode_logged(in, mut);
6468 }
6469 };
6470
6471 void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
6472 {
6473 dout(10) << "truncate_inode_finish " << *in << dendl;
6474
6475 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6476 ceph_assert(p != ls->truncating_inodes.end());
6477 ls->truncating_inodes.erase(p);
6478
6479 // update
6480 auto &pi = in->project_inode();
6481 pi.inode.version = in->pre_dirty();
6482 pi.inode.truncate_from = 0;
6483 pi.inode.truncate_pending--;
6484
6485 MutationRef mut(new MutationImpl());
6486 mut->ls = mds->mdlog->get_current_segment();
6487 mut->add_projected_inode(in);
6488
6489 EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
6490 mds->mdlog->start_entry(le);
6491 CDentry *dn = in->get_projected_parent_dn();
6492 le->metablob.add_dir_context(dn->get_dir());
6493 le->metablob.add_primary_dentry(dn, in, true);
6494 le->metablob.add_truncate_finish(in->ino(), ls->seq);
6495
6496 journal_dirty_inode(mut.get(), &le->metablob, in);
6497 mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));
6498
6499 // flush immediately if there are readers/writers waiting
6500 if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
6501 (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
6502 mds->mdlog->flush();
6503 }
6504
6505 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6506 {
6507 dout(10) << "truncate_inode_logged " << *in << dendl;
6508 mut->apply();
6509 mds->locker->drop_locks(mut.get());
6510 mut->cleanup();
6511
6512 in->put(CInode::PIN_TRUNCATING);
6513 in->auth_unpin(this);
6514
6515 MDSContext::vec waiters;
6516 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6517 mds->queue_waiters(waiters);
6518 }
6519
6520
6521 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6522 {
6523 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6524 << ls->seq << "/" << ls->offset << dendl;
6525 ls->truncating_inodes.insert(in);
6526 in->get(CInode::PIN_TRUNCATING);
6527 }
6528
6529 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6530 {
6531 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6532 << ls->seq << "/" << ls->offset << dendl;
6533 // if we have the logseg the truncate started in, it must be in our list.
6534 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6535 ceph_assert(p != ls->truncating_inodes.end());
6536 ls->truncating_inodes.erase(p);
6537 in->put(CInode::PIN_TRUNCATING);
6538 }
6539
6540 void MDCache::start_recovered_truncates()
6541 {
6542 dout(10) << "start_recovered_truncates" << dendl;
6543 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
6544 p != mds->mdlog->segments.end();
6545 ++p) {
6546 LogSegment *ls = p->second;
6547 for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
6548 q != ls->truncating_inodes.end();
6549 ++q) {
6550 CInode *in = *q;
6551 in->auth_pin(this);
6552
6553 if (!in->client_need_snapflush.empty() &&
6554 (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
6555 ceph_assert(in->filelock.is_stable());
6556 in->filelock.set_state(LOCK_XLOCKDONE);
6557 in->auth_pin(&in->filelock);
6558 in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
6559 // start_files_to_recover will revoke caps
6560 continue;
6561 }
6562 _truncate_inode(in, ls);
6563 }
6564 }
6565 }
6566
6567
6568 class C_MDS_purge_completed_finish : public MDCacheLogContext {
6569 interval_set<inodeno_t> inos;
6570 version_t inotablev;
6571 LogSegment *ls;
6572 public:
6573 C_MDS_purge_completed_finish(MDCache *m,
6574 interval_set<inodeno_t> i,
6575 version_t iv,
6576 LogSegment *_ls)
6577 : MDCacheLogContext(m),
6578 inos(std::move(i)),
6579 inotablev(iv),
6580 ls(_ls) {}
6581 void finish(int r) override {
6582 assert(r == 0);
6583 if (inotablev) {
6584 ls->purge_inodes_finish(inos);
6585 mdcache->mds->inotable->apply_release_ids(inos);
6586 assert(mdcache->mds->inotable->get_version() == inotablev);
6587 }
6588 }
6589 };
6590
6591 void MDCache::start_purge_inodes(){
6592 dout(10) << "start_purge_inodes" << dendl;
6593 for (auto& p : mds->mdlog->segments){
6594 LogSegment *ls = p.second;
6595 if (ls->purge_inodes.size()){
6596 purge_inodes(ls->purge_inodes, ls);
6597 }
6598 }
6599 }
6600
6601 void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6602 {
6603 auto cb = new LambdaContext([this, inos, ls](int r){
6604 assert(r == 0 || r == -2);
6605 mds->inotable->project_release_ids(inos);
6606 version_t piv = mds->inotable->get_projected_version();
6607 assert(piv != 0);
6608 mds->mdlog->start_submit_entry(new EPurged(inos, piv, ls->seq),
6609 new C_MDS_purge_completed_finish(this, inos, piv, ls));
6610 mds->mdlog->flush();
6611 });
6612
6613 dout(10) << __func__ << " start purge data : " << inos << dendl;
6614 C_GatherBuilder gather(g_ceph_context,
6615 new C_OnFinisher( new MDSIOContextWrapper(mds, cb), mds->finisher));
6616 SnapContext nullsnapc;
6617 uint64_t num = Striper::get_num_objects(default_file_layout, default_file_layout.get_period());
6618 for (auto p = inos.begin();
6619 p != inos.end();
6620 ++p){
6621 dout(10) << __func__
6622 << " prealloc_inos : " << inos.size()
6623 << " start : " << p.get_start().val
6624 << " length : " << p.get_len() << " "
6625 << " seq : " << ls->seq << dendl;
6626
6627 for (_inodeno_t i = 0; i < p.get_len(); i++){
6628 dout(20) << __func__ << " : " << p.get_start() + i << dendl;
6629 filer.purge_range(p.get_start() + i,
6630 &default_file_layout,
6631 nullsnapc,
6632 0, num,
6633 ceph::real_clock::now(),
6634 0, gather.new_sub());
6635 }
6636 }
6637 gather.activate();
6638 }
6639
6640 // ================================================================================
6641 // cache trimming
6642
6643 std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
6644 {
6645 bool is_standby_replay = mds->is_standby_replay();
6646 std::vector<CDentry *> unexpirables;
6647 uint64_t trimmed = 0;
6648
6649 auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
6650
6651 dout(7) << "trim_lru trimming " << count
6652 << " items from LRU"
6653 << " size=" << lru.lru_get_size()
6654 << " mid=" << lru.lru_get_top()
6655 << " pintail=" << lru.lru_get_pintail()
6656 << " pinned=" << lru.lru_get_num_pinned()
6657 << dendl;
6658
6659 const uint64_t trim_counter_start = trim_counter.get();
6660 bool throttled = false;
6661 while (1) {
6662 throttled |= trim_counter_start+trimmed >= trim_threshold;
6663 if (throttled) break;
6664 CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
6665 if (!dn)
6666 break;
6667 if (trim_dentry(dn, expiremap)) {
6668 unexpirables.push_back(dn);
6669 } else {
6670 trimmed++;
6671 }
6672 }
6673
6674 for (auto &dn : unexpirables) {
6675 bottom_lru.lru_insert_mid(dn);
6676 }
6677 unexpirables.clear();
6678
6679 // trim dentries from the LRU until count is reached
6680 // if mds is in standbyreplay and will trim all inodes which aren't in segments
6681 while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
6682 throttled |= trim_counter_start+trimmed >= trim_threshold;
6683 if (throttled) break;
6684 CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
6685 if (!dn) {
6686 break;
6687 }
6688 if ((is_standby_replay && dn->get_linkage()->inode &&
6689 dn->get_linkage()->inode->item_open_file.is_on_list())) {
6690 // we move the inodes that need to be trimmed to the end of the lru queue.
6691 // refer to MDCache::standby_trim_segment
6692 lru.lru_insert_bot(dn);
6693 break;
6694 } else if (trim_dentry(dn, expiremap)) {
6695 unexpirables.push_back(dn);
6696 } else {
6697 trimmed++;
6698 if (count > 0) count--;
6699 }
6700 }
6701 trim_counter.hit(trimmed);
6702
6703 for (auto &dn : unexpirables) {
6704 lru.lru_insert_mid(dn);
6705 }
6706 unexpirables.clear();
6707
6708 dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
6709 return std::pair<bool, uint64_t>(throttled, trimmed);
6710 }
6711
6712 /*
6713 * note: only called while MDS is active or stopping... NOT during recovery.
6714 * however, we may expire a replica whose authority is recovering.
6715 *
6716 * @param count is number of dentries to try to expire
6717 */
6718 std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
6719 {
6720 uint64_t used = cache_size();
6721 uint64_t limit = cache_memory_limit;
6722 expiremap expiremap;
6723
6724 dout(7) << "trim bytes_used=" << bytes2str(used)
6725 << " limit=" << bytes2str(limit)
6726 << " reservation=" << cache_reservation
6727 << "% count=" << count << dendl;
6728
6729 // process delayed eval_stray()
6730 stray_manager.advance_delayed();
6731
6732 auto result = trim_lru(count, expiremap);
6733 auto& trimmed = result.second;
6734
6735 // trim non-auth, non-bound subtrees
6736 for (auto p = subtrees.begin(); p != subtrees.end();) {
6737 CDir *dir = p->first;
6738 ++p;
6739 CInode *diri = dir->get_inode();
6740 if (dir->is_auth()) {
6741 if (!diri->is_auth() && !diri->is_base() &&
6742 dir->get_num_head_items() == 0) {
6743 if (dir->state_test(CDir::STATE_EXPORTING) ||
6744 !(mds->is_active() || mds->is_stopping()) ||
6745 dir->is_freezing() || dir->is_frozen())
6746 continue;
6747
6748 migrator->export_empty_import(dir);
6749 ++trimmed;
6750 }
6751 } else {
6752 if (!diri->is_auth()) {
6753 if (dir->get_num_ref() > 1) // only subtree pin
6754 continue;
6755 auto&& ls = diri->get_subtree_dirfrags();
6756 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6757 continue;
6758
6759 // don't trim subtree root if its auth MDS is recovering.
6760 // This simplify the cache rejoin code.
6761 if (dir->is_subtree_root() &&
6762 rejoin_ack_gather.count(dir->get_dir_auth().first))
6763 continue;
6764 trim_dirfrag(dir, 0, expiremap);
6765 ++trimmed;
6766 }
6767 }
6768 }
6769
6770 // trim root?
6771 if (mds->is_stopping() && root) {
6772 auto&& ls = root->get_dirfrags();
6773 for (const auto& dir : ls) {
6774 if (dir->get_num_ref() == 1) { // subtree pin
6775 trim_dirfrag(dir, 0, expiremap);
6776 ++trimmed;
6777 }
6778 }
6779 if (root->get_num_ref() == 0) {
6780 trim_inode(0, root, 0, expiremap);
6781 ++trimmed;
6782 }
6783 }
6784
6785 std::set<mds_rank_t> stopping;
6786 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6787 stopping.erase(mds->get_nodeid());
6788 for (auto rank : stopping) {
6789 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6790 if (!mdsdir_in)
6791 continue;
6792
6793 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6794 if (em.second) {
6795 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6796 }
6797
6798 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6799
6800 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6801 if (!aborted) {
6802 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6803 auto&& ls = mdsdir_in->get_dirfrags();
6804 for (auto dir : ls) {
6805 if (dir->get_num_ref() == 1) { // subtree pin
6806 trim_dirfrag(dir, dir, expiremap);
6807 ++trimmed;
6808 }
6809 }
6810 if (mdsdir_in->get_num_ref() == 0) {
6811 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6812 ++trimmed;
6813 }
6814 } else {
6815 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6816 }
6817 }
6818
6819 // Other rank's base inodes (when I'm stopping)
6820 if (mds->is_stopping()) {
6821 for (set<CInode*>::iterator p = base_inodes.begin();
6822 p != base_inodes.end();) {
6823 CInode *base_in = *p;
6824 ++p;
6825 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6826 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6827 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6828 if (base_in->get_num_ref() == 0) {
6829 trim_inode(NULL, base_in, NULL, expiremap);
6830 ++trimmed;
6831 }
6832 }
6833 }
6834 }
6835
6836 // send any expire messages
6837 send_expire_messages(expiremap);
6838
6839 return result;
6840 }
6841
6842 void MDCache::send_expire_messages(expiremap& expiremap)
6843 {
6844 // send expires
6845 for (const auto &p : expiremap) {
6846 if (mds->is_cluster_degraded() &&
6847 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6848 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6849 rejoin_sent.count(p.first) == 0))) {
6850 continue;
6851 }
6852 dout(7) << "sending cache_expire to " << p.first << dendl;
6853 mds->send_message_mds(p.second, p.first);
6854 }
6855 expiremap.clear();
6856 }
6857
6858
6859 bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
6860 {
6861 dout(12) << "trim_dentry " << *dn << dendl;
6862
6863 CDentry::linkage_t *dnl = dn->get_linkage();
6864
6865 CDir *dir = dn->get_dir();
6866 ceph_assert(dir);
6867
6868 CDir *con = get_subtree_root(dir);
6869 if (con)
6870 dout(12) << " in container " << *con << dendl;
6871 else {
6872 dout(12) << " no container; under a not-yet-linked dir" << dendl;
6873 ceph_assert(dn->is_auth());
6874 }
6875
6876 // If replica dentry is not readable, it's likely we will receive
6877 // MDentryLink/MDentryUnlink message soon (It's possible we first
6878 // receive a MDentryUnlink message, then MDentryLink message)
6879 // MDentryLink message only replicates an inode, so we should
6880 // avoid trimming the inode's parent dentry. This is because that
6881 // unconnected replicas are problematic for subtree migration.
6882 if (!dn->is_auth() && !dn->lock.can_read(-1) &&
6883 !dn->get_dir()->get_inode()->is_stray())
6884 return true;
6885
6886 // adjust the dir state
6887 // NOTE: we can safely remove a clean, null dentry without effecting
6888 // directory completeness.
6889 // (check this _before_ we unlink the inode, below!)
6890 bool clear_complete = false;
6891 if (!(dnl->is_null() && dn->is_clean()))
6892 clear_complete = true;
6893
6894 // unlink the dentry
6895 if (dnl->is_remote()) {
6896 // just unlink.
6897 dir->unlink_inode(dn, false);
6898 } else if (dnl->is_primary()) {
6899 // expire the inode, too.
6900 CInode *in = dnl->get_inode();
6901 ceph_assert(in);
6902 if (trim_inode(dn, in, con, expiremap))
6903 return true; // purging stray instead of trimming
6904 } else {
6905 ceph_assert(dnl->is_null());
6906 }
6907
6908 if (!dn->is_auth()) {
6909 // notify dentry authority.
6910 mds_authority_t auth = dn->authority();
6911
6912 for (int p=0; p<2; p++) {
6913 mds_rank_t a = auth.first;
6914 if (p) a = auth.second;
6915 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6916 if (mds->get_nodeid() == auth.second &&
6917 con->is_importing()) break; // don't send any expire while importing.
6918 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6919
6920 dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
6921 ceph_assert(a != mds->get_nodeid());
6922 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6923 if (em.second)
6924 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6925 em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
6926 }
6927 }
6928
6929 // remove dentry
6930 if (dn->last == CEPH_NOSNAP && dir->is_auth())
6931 dir->add_to_bloom(dn);
6932 dir->remove_dentry(dn);
6933
6934 if (clear_complete)
6935 dir->state_clear(CDir::STATE_COMPLETE);
6936
6937 if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
6938 return false;
6939 }
6940
6941
6942 void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
6943 {
6944 dout(15) << "trim_dirfrag " << *dir << dendl;
6945
6946 if (dir->is_subtree_root()) {
6947 ceph_assert(!dir->is_auth() ||
6948 (!dir->is_replicated() && dir->inode->is_base()));
6949 remove_subtree(dir); // remove from subtree map
6950 }
6951 ceph_assert(dir->get_num_ref() == 0);
6952
6953 CInode *in = dir->get_inode();
6954
6955 if (!dir->is_auth()) {
6956 mds_authority_t auth = dir->authority();
6957
6958 // was this an auth delegation? (if so, slightly modified container)
6959 dirfrag_t condf;
6960 if (dir->is_subtree_root()) {
6961 dout(12) << " subtree root, container is " << *dir << dendl;
6962 con = dir;
6963 condf = dir->dirfrag();
6964 } else {
6965 condf = con->dirfrag();
6966 }
6967
6968 for (int p=0; p<2; p++) {
6969 mds_rank_t a = auth.first;
6970 if (p) a = auth.second;
6971 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
6972 if (mds->get_nodeid() == auth.second &&
6973 con->is_importing()) break; // don't send any expire while importing.
6974 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
6975
6976 dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
6977 ceph_assert(a != mds->get_nodeid());
6978 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
6979 if (em.second)
6980 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
6981 em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
6982 }
6983 }
6984
6985 in->close_dirfrag(dir->dirfrag().frag);
6986 }
6987
6988 /**
6989 * Try trimming an inode from the cache
6990 *
6991 * @return true if the inode is still in cache, else false if it was trimmed
6992 */
6993 bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
6994 {
6995 dout(15) << "trim_inode " << *in << dendl;
6996 ceph_assert(in->get_num_ref() == 0);
6997
6998 if (in->is_dir()) {
6999 // If replica inode's dirfragtreelock is not readable, it's likely
7000 // some dirfrags of the inode are being fragmented and we will receive
7001 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7002 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
7003 // This is because that unconnected replicas are problematic for
7004 // subtree migration.
7005 //
7006 if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
7007 return true;
7008 }
7009
7010 // DIR
7011 auto&& dfls = in->get_dirfrags();
7012 for (const auto& dir : dfls) {
7013 ceph_assert(!dir->is_subtree_root());
7014 trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
7015 }
7016 }
7017
7018 // INODE
7019 if (in->is_auth()) {
7020 // eval stray after closing dirfrags
7021 if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
7022 maybe_eval_stray(in);
7023 if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
7024 return true;
7025 }
7026 } else {
7027 mds_authority_t auth = in->authority();
7028
7029 dirfrag_t df;
7030 if (con)
7031 df = con->dirfrag();
7032 else
7033 df = dirfrag_t(0,frag_t()); // must be a root or stray inode.
7034
7035 for (int p=0; p<2; p++) {
7036 mds_rank_t a = auth.first;
7037 if (p) a = auth.second;
7038 if (a < 0 || (p == 1 && auth.second == auth.first)) break;
7039 if (con && mds->get_nodeid() == auth.second &&
7040 con->is_importing()) break; // don't send any expire while importing.
7041 if (a == mds->get_nodeid()) continue; // on export, ignore myself.
7042
7043 dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
7044 ceph_assert(a != mds->get_nodeid());
7045 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
7046 if (em.second)
7047 em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
7048 em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
7049 }
7050 }
7051
7052 /*
7053 if (in->is_auth()) {
7054 if (in->hack_accessed)
7055 mds->logger->inc("outt");
7056 else {
7057 mds->logger->inc("outut");
7058 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7059 }
7060 }
7061 */
7062
7063 // unlink
7064 if (dn)
7065 dn->get_dir()->unlink_inode(dn, false);
7066 remove_inode(in);
7067 return false;
7068 }
7069
7070
7071 /**
7072 * trim_non_auth - remove any non-auth items from our cache
7073 *
7074 * this reduces the amount of non-auth metadata in our cache, reducing the
7075 * load incurred by the rejoin phase.
7076 *
7077 * the only non-auth items that remain are those that are needed to
7078 * attach our own subtrees to the root.
7079 *
7080 * when we are done, all dentries will be in the top bit of the lru.
7081 *
7082 * why we have to do this:
7083 * we may not have accurate linkage for non-auth items. which means we will
7084 * know which subtree it falls into, and can not be sure to declare it to the
7085 * correct authority.
7086 */
7087 void MDCache::trim_non_auth()
7088 {
7089 dout(7) << "trim_non_auth" << dendl;
7090
7091 // temporarily pin all subtree roots
7092 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7093 p != subtrees.end();
7094 ++p)
7095 p->first->get(CDir::PIN_SUBTREETEMP);
7096
7097 list<CDentry*> auth_list;
7098
7099 // trim non-auth items from the lru
7100 for (;;) {
7101 CDentry *dn = NULL;
7102 if (bottom_lru.lru_get_size() > 0)
7103 dn = static_cast<CDentry*>(bottom_lru.lru_expire());
7104 if (!dn && lru.lru_get_size() > 0)
7105 dn = static_cast<CDentry*>(lru.lru_expire());
7106 if (!dn)
7107 break;
7108
7109 CDentry::linkage_t *dnl = dn->get_linkage();
7110
7111 if (dn->is_auth()) {
7112 // add back into lru (at the top)
7113 auth_list.push_back(dn);
7114
7115 if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
7116 dn->unlink_remote(dnl);
7117 } else {
7118 // non-auth. expire.
7119 CDir *dir = dn->get_dir();
7120 ceph_assert(dir);
7121
7122 // unlink the dentry
7123 dout(10) << " removing " << *dn << dendl;
7124 if (dnl->is_remote()) {
7125 dir->unlink_inode(dn, false);
7126 }
7127 else if (dnl->is_primary()) {
7128 CInode *in = dnl->get_inode();
7129 dout(10) << " removing " << *in << dendl;
7130 auto&& ls = in->get_dirfrags();
7131 for (const auto& subdir : ls) {
7132 ceph_assert(!subdir->is_subtree_root());
7133 in->close_dirfrag(subdir->dirfrag().frag);
7134 }
7135 dir->unlink_inode(dn, false);
7136 remove_inode(in);
7137 }
7138 else {
7139 ceph_assert(dnl->is_null());
7140 }
7141
7142 ceph_assert(!dir->has_bloom());
7143 dir->remove_dentry(dn);
7144 // adjust the dir state
7145 dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
7146 // close empty non-auth dirfrag
7147 if (!dir->is_subtree_root() && dir->get_num_any() == 0)
7148 dir->inode->close_dirfrag(dir->get_frag());
7149 }
7150 }
7151
7152 for (const auto& dn : auth_list) {
7153 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
7154 bottom_lru.lru_insert_mid(dn);
7155 else
7156 lru.lru_insert_top(dn);
7157 }
7158
7159 // move everything in the pintail to the top bit of the lru.
7160 lru.lru_touch_entire_pintail();
7161
7162 // unpin all subtrees
7163 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
7164 p != subtrees.end();
7165 ++p)
7166 p->first->put(CDir::PIN_SUBTREETEMP);
7167
7168 if (lru.lru_get_size() == 0 &&
7169 bottom_lru.lru_get_size() == 0) {
7170 // root, stray, etc.?
7171 auto p = inode_map.begin();
7172 while (p != inode_map.end()) {
7173 CInode *in = p->second;
7174 ++p;
7175 if (!in->is_auth()) {
7176 auto&& ls = in->get_dirfrags();
7177 for (const auto& dir : ls) {
7178 dout(10) << " removing " << *dir << dendl;
7179 ceph_assert(dir->get_num_ref() == 1); // SUBTREE
7180 remove_subtree(dir);
7181 in->close_dirfrag(dir->dirfrag().frag);
7182 }
7183 dout(10) << " removing " << *in << dendl;
7184 ceph_assert(!in->get_parent_dn());
7185 ceph_assert(in->get_num_ref() == 0);
7186 remove_inode(in);
7187 }
7188 }
7189 }
7190
7191 show_subtrees();
7192 }
7193
7194 /**
7195 * Recursively trim the subtree rooted at directory to remove all
7196 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7197 * of those links. This is used to clear invalid data out of the cache.
7198 * Note that it doesn't clear the passed-in directory, since that's not
7199 * always safe.
7200 */
7201 bool MDCache::trim_non_auth_subtree(CDir *dir)
7202 {
7203 dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
7204
7205 bool keep_dir = !can_trim_non_auth_dirfrag(dir);
7206
7207 auto j = dir->begin();
7208 auto i = j;
7209 while (j != dir->end()) {
7210 i = j++;
7211 CDentry *dn = i->second;
7212 dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
7213 CDentry::linkage_t *dnl = dn->get_linkage();
7214 if (dnl->is_primary()) { // check for subdirectories, etc
7215 CInode *in = dnl->get_inode();
7216 bool keep_inode = false;
7217 if (in->is_dir()) {
7218 auto&& subdirs = in->get_dirfrags();
7219 for (const auto& subdir : subdirs) {
7220 if (subdir->is_subtree_root()) {
7221 keep_inode = true;
7222 dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
7223 } else {
7224 if (trim_non_auth_subtree(subdir))
7225 keep_inode = true;
7226 else {
7227 in->close_dirfrag(subdir->get_frag());
7228 dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
7229 }
7230 }
7231 }
7232
7233 }
7234 if (!keep_inode) { // remove it!
7235 dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
7236 dir->unlink_inode(dn, false);
7237 remove_inode(in);
7238 ceph_assert(!dir->has_bloom());
7239 dir->remove_dentry(dn);
7240 } else {
7241 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
7242 dn->state_clear(CDentry::STATE_AUTH);
7243 in->state_clear(CInode::STATE_AUTH);
7244 }
7245 } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
7246 dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
7247 } else { // just remove it
7248 dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
7249 if (dnl->is_remote())
7250 dir->unlink_inode(dn, false);
7251 dir->remove_dentry(dn);
7252 }
7253 }
7254 dir->state_clear(CDir::STATE_AUTH);
7255 /**
7256 * We've now checked all our children and deleted those that need it.
7257 * Now return to caller, and tell them if *we're* a keeper.
7258 */
7259 return keep_dir || dir->get_num_any();
7260 }
7261
7262 /*
7263 * during replay, when we determine a subtree is no longer ours, we
7264 * try to trim it from our cache. because subtrees must be connected
7265 * to the root, the fact that we can trim this tree may mean that our
7266 * children or parents can also be trimmed.
7267 */
7268 void MDCache::try_trim_non_auth_subtree(CDir *dir)
7269 {
7270 dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;
7271
7272 // can we now trim child subtrees?
7273 set<CDir*> bounds;
7274 get_subtree_bounds(dir, bounds);
7275 for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
7276 CDir *bd = *p;
7277 if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
7278 bd->get_num_any() == 0 && // and empty
7279 can_trim_non_auth_dirfrag(bd)) {
7280 CInode *bi = bd->get_inode();
7281 dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
7282 remove_subtree(bd);
7283 bd->mark_clean();
7284 bi->close_dirfrag(bd->get_frag());
7285 }
7286 }
7287
7288 if (trim_non_auth_subtree(dir)) {
7289 // keep
7290 try_subtree_merge(dir);
7291 } else {
7292 // can we trim this subtree (and possibly our ancestors) too?
7293 while (true) {
7294 CInode *diri = dir->get_inode();
7295 if (diri->is_base()) {
7296 if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
7297 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7298 remove_subtree(dir);
7299 dir->mark_clean();
7300 diri->close_dirfrag(dir->get_frag());
7301
7302 dout(10) << " removing " << *diri << dendl;
7303 ceph_assert(!diri->get_parent_dn());
7304 ceph_assert(diri->get_num_ref() == 0);
7305 remove_inode(diri);
7306 }
7307 break;
7308 }
7309
7310 CDir *psub = get_subtree_root(diri->get_parent_dir());
7311 dout(10) << " parent subtree is " << *psub << dendl;
7312 if (psub->get_dir_auth().first == mds->get_nodeid())
7313 break; // we are auth, keep.
7314
7315 dout(10) << " closing empty non-auth subtree " << *dir << dendl;
7316 remove_subtree(dir);
7317 dir->mark_clean();
7318 diri->close_dirfrag(dir->get_frag());
7319
7320 dout(10) << " parent subtree also non-auth: " << *psub << dendl;
7321 if (trim_non_auth_subtree(psub))
7322 break;
7323 dir = psub;
7324 }
7325 }
7326
7327 show_subtrees();
7328 }
7329
7330 void MDCache::standby_trim_segment(LogSegment *ls)
7331 {
7332 auto try_trim_inode = [this](CInode *in) {
7333 if (in->get_num_ref() == 0 &&
7334 !in->item_open_file.is_on_list() &&
7335 in->parent != NULL &&
7336 in->parent->get_num_ref() == 0){
7337 touch_dentry_bottom(in->parent);
7338 }
7339 };
7340
7341 auto try_trim_dentry = [this](CDentry *dn) {
7342 if (dn->get_num_ref() > 0)
7343 return;
7344 auto in = dn->get_linkage()->inode;
7345 if(in && in->item_open_file.is_on_list())
7346 return;
7347 touch_dentry_bottom(dn);
7348 };
7349
7350 ls->new_dirfrags.clear_list();
7351 ls->open_files.clear_list();
7352
7353 while (!ls->dirty_dirfrags.empty()) {
7354 CDir *dir = ls->dirty_dirfrags.front();
7355 dir->mark_clean();
7356 if (dir->inode)
7357 try_trim_inode(dir->inode);
7358 }
7359 while (!ls->dirty_inodes.empty()) {
7360 CInode *in = ls->dirty_inodes.front();
7361 in->mark_clean();
7362 try_trim_inode(in);
7363 }
7364 while (!ls->dirty_dentries.empty()) {
7365 CDentry *dn = ls->dirty_dentries.front();
7366 dn->mark_clean();
7367 try_trim_dentry(dn);
7368 }
7369 while (!ls->dirty_parent_inodes.empty()) {
7370 CInode *in = ls->dirty_parent_inodes.front();
7371 in->clear_dirty_parent();
7372 try_trim_inode(in);
7373 }
7374 while (!ls->dirty_dirfrag_dir.empty()) {
7375 CInode *in = ls->dirty_dirfrag_dir.front();
7376 in->filelock.remove_dirty();
7377 try_trim_inode(in);
7378 }
7379 while (!ls->dirty_dirfrag_nest.empty()) {
7380 CInode *in = ls->dirty_dirfrag_nest.front();
7381 in->nestlock.remove_dirty();
7382 try_trim_inode(in);
7383 }
7384 while (!ls->dirty_dirfrag_dirfragtree.empty()) {
7385 CInode *in = ls->dirty_dirfrag_dirfragtree.front();
7386 in->dirfragtreelock.remove_dirty();
7387 try_trim_inode(in);
7388 }
7389 while (!ls->truncating_inodes.empty()) {
7390 auto it = ls->truncating_inodes.begin();
7391 CInode *in = *it;
7392 ls->truncating_inodes.erase(it);
7393 in->put(CInode::PIN_TRUNCATING);
7394 try_trim_inode(in);
7395 }
7396 }
7397
/**
 * Handle an MCacheExpire from another rank: the sender reports that it has
 * dropped replicas of some of our inodes, dirfrags and dentries.
 *
 * The message is ignored entirely while our state is below REJOIN.  Each
 * realm in the message is keyed by a container dirfrag (ino 0 means no
 * container).  A realm is not applied but stashed in delayed_expire when we
 * are not auth for the container or an export of it is in one of the states
 * tested below; it is replayed later via process_delayed_expire().
 *
 * For every expired item the nonce carried in the message is compared with
 * the replica nonce we hold for the sender; on mismatch the expire is stale
 * and dropped.  Locks reported by the removal helpers are collected in
 * gather_locks and evaluated at the end if they are not stable.
 */
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  // too early to act on expires; drop the message
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode);  // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      // -1 means "not being exported by us"
      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
        export_state = migrator->get_export_state(parent_dir);
        ceph_assert(export_state >= 0);
      }

      // Delay when we are not auth, or when an export of the container is in
      // one of these states: WARNING with the sender already warned,
      // EXPORTING, LOGGINGFINISH, or NOTIFYING with the sender not notified.
      if (!parent_dir->is_auth() ||
          (export_state != -1 &&
           ((export_state == Migrator::EXPORT_WARNING &&
             migrator->export_has_warned(parent_dir,from)) ||
            export_state == Migrator::EXPORT_EXPORTING ||
            export_state == Migrator::EXPORT_LOGGINGFINISH ||
            (export_state == Migrator::EXPORT_NOTIFYING &&
             !migrator->export_has_notified(parent_dir,from))))) {

        // not auth.
        dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
        ceph_assert(parent_dir->is_frozen_tree_root());

        // make a message container (one per sending rank, per dirfrag)

        auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
        if (em.second)
          em.first->second = make_message<MCacheExpire>(from); /* new */

        // merge these expires into it
        em.first->second->add_realm(p.first, p.second);
        continue;
      }
      // sanity: the remaining cases are the ones handled immediately
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
             (export_state == Migrator::EXPORT_WARNING &&
              !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;

      if (!in) {
        // log what is missing before the assert below fires
        dout(0) << " inode expire on " << q.first << " from " << from
                << ", don't have it" << dendl;
        ceph_assert(in);
      }
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce
      if (nonce == in->get_replica_nonce(from)) {
        // remove from our cached_by
        dout(7) << " inode expire on " << *in << " from mds." << from
                << " cached_by was " << in->get_replicas() << dendl;
        inode_remove_replica(in, from, false, gather_locks);
      }
      else {
        // this is an old nonce, ignore expire.
        dout(7) << " inode expire on " << *in << " from mds." << from
                << " with old nonce " << nonce
                << " (current " << in->get_replica_nonce(from) << "), dropping"
                << dendl;
      }
    }

    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;

      if (!dir) {
        // We do not have this exact dirfrag.  Two cases are tolerated when
        // we still have the inode; anything else trips the assert below.
        CInode *diri = get_inode(q.first.ino);
        if (diri) {
          if (mds->is_rejoin() &&
              rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
              !diri->is_replica(from)) {
            // drop the sender from every nested dirfrag it replicates
            auto&& ls = diri->get_nested_dirfrags();
            dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
                    << " while rejoining, inode isn't replicated" << dendl;
            for (const auto& d : ls) {
              dir = d;
              if (dir->is_replica(from)) {
                dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
                dir->remove_replica(from);
              }
            }
            continue;
          }
          CDir *other = diri->get_approx_dirfrag(q.first.frag);
          if (other) {
            dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
                    << " have " << *other << ", mismatched frags, dropping" << dendl;
            continue;
          }
        }
        dout(0) << " dir expire on " << q.first << " from " << from
                << ", don't have it" << dendl;
        ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
        // remove from our cached_by
        dout(7) << " dir expire on " << *dir << " from mds." << from
                << " replicas was " << dir->get_replicas() << dendl;
        dir->remove_replica(from);
      }
      else {
        // this is an old nonce, ignore expire.
        dout(7) << " dir expire on " << *dir << " from mds." << from
                << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
                << "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);

      if (!dir) {
        dout(0) << " dn expires on " << pd.first << " from " << from
                << ", must have refragmented" << dendl;
      } else {
        ceph_assert(dir->is_auth());
      }

      // NOTE: this inner 'p' shadows the realm iterator 'p' of the outer loop.
      for (const auto &p : pd.second) {
        unsigned nonce = p.second;
        CDentry *dn;

        if (dir) {
          dn = dir->lookup(p.first.first, p.first.second);
        } else {
          // which dirfrag for this dentry?
          // NOTE: this local 'dir' shadows the outer (null) 'dir'; the
          // "missing dentry" message below still tests the outer one and so
          // takes the branch that does not print a dirfrag.
          CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
          ceph_assert(dir);
          ceph_assert(dir->is_auth());
          dn = dir->lookup(p.first.first, p.first.second);
        }

        if (!dn) {
          if (dir)
            dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
          else
            dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
        }
        ceph_assert(dn);

        // check nonce; a mismatch means the expire is stale
        if (nonce == dn->get_replica_nonce(from)) {
          dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
          dentry_remove_replica(dn, from, gather_locks);
        }
        else {
          dout(7) << " dentry_expire on " << *dn << " from mds." << from
                  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
                  << "), dropping" << dendl;
        }
      }
    }
  }

  // re-evaluate every collected lock that is not stable
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7592
7593 void MDCache::process_delayed_expire(CDir *dir)
7594 {
7595 dout(7) << "process_delayed_expire on " << *dir << dendl;
7596 for (const auto &p : delayed_expire[dir]) {
7597 handle_cache_expire(p.second);
7598 }
7599 delayed_expire.erase(dir);
7600 }
7601
7602 void MDCache::discard_delayed_expire(CDir *dir)
7603 {
7604 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7605 delayed_expire.erase(dir);
7606 }
7607
7608 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7609 set<SimpleLock *>& gather_locks)
7610 {
7611 in->remove_replica(from);
7612 in->set_mds_caps_wanted(from, 0);
7613
7614 // note: this code calls _eval more often than it needs to!
7615 // fix lock
7616 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7617 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7618 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7619 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7620 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7621 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7622
7623 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7624 // Don't remove the recovering mds from lock's gathering list because
7625 // it may hold rejoined wrlocks.
7626 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7627 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7628 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7629 }
7630
7631 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7632 {
7633 dn->remove_replica(from);
7634
7635 // fix lock
7636 if (dn->lock.remove_replica(from))
7637 gather_locks.insert(&dn->lock);
7638
7639 // Replicated strays might now be elegible for purge
7640 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7641 if (dnl->is_primary()) {
7642 maybe_eval_stray(dnl->get_inode());
7643 }
7644 }
7645
7646 void MDCache::trim_client_leases()
7647 {
7648 utime_t now = ceph_clock_now();
7649
7650 dout(10) << "trim_client_leases" << dendl;
7651
7652 std::size_t pool = 0;
7653 for (const auto& list : client_leases) {
7654 pool += 1;
7655 if (list.empty())
7656 continue;
7657
7658 auto before = list.size();
7659 while (!list.empty()) {
7660 ClientLease *r = list.front();
7661 if (r->ttl > now) break;
7662 CDentry *dn = static_cast<CDentry*>(r->parent);
7663 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7664 dn->remove_client_lease(r, mds->locker);
7665 }
7666 auto after = list.size();
7667 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7668 << (before-after) << " leases, " << after << " left" << dendl;
7669 }
7670 }
7671
7672
7673 void MDCache::check_memory_usage()
7674 {
7675 static MemoryModel mm(g_ceph_context);
7676 static MemoryModel::snap last;
7677 mm.sample(&last);
7678 static MemoryModel::snap baseline = last;
7679
7680 // check client caps
7681 ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
7682 double caps_per_inode = 0.0;
7683 if (CInode::count())
7684 caps_per_inode = (double)Capability::count() / (double)CInode::count();
7685
7686 dout(2) << "Memory usage: "
7687 << " total " << last.get_total()
7688 << ", rss " << last.get_rss()
7689 << ", heap " << last.get_heap()
7690 << ", baseline " << baseline.get_heap()
7691 << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
7692 << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
7693 << dendl;
7694
7695 mds->update_mlogger();
7696 mds->mlogger->set(l_mdm_rss, last.get_rss());
7697 mds->mlogger->set(l_mdm_heap, last.get_heap());
7698
7699 if (cache_toofull()) {
7700 mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM);
7701 }
7702
7703 // If the cache size had exceeded its limit, but we're back in bounds
7704 // now, free any unused pool memory so that our memory usage isn't
7705 // permanently bloated.
7706 if (exceeded_size_limit && !cache_toofull()) {
7707 // Only do this once we are back in bounds: otherwise the releases would
7708 // slow down whatever process caused us to exceed bounds to begin with
7709 if (ceph_using_tcmalloc()) {
7710 dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
7711 << dendl;
7712 ceph_heap_release_free_memory();
7713 }
7714 exceeded_size_limit = false;
7715 }
7716 }
7717
7718
7719
7720 // =========================================================================================
7721 // shutdown
7722
7723 class C_MDC_ShutdownCheck : public MDCacheContext {
7724 public:
7725 explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
7726 void finish(int) override {
7727 mdcache->shutdown_check();
7728 }
7729 };
7730
7731 void MDCache::shutdown_check()
7732 {
7733 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
7734
7735 // cache
7736 char old_val[32] = { 0 };
7737 char *o = old_val;
7738 g_conf().get_val("debug_mds", &o, sizeof(old_val));
7739 g_conf().set_val("debug_mds", "10");
7740 g_conf().apply_changes(nullptr);
7741 show_cache();
7742 g_conf().set_val("debug_mds", old_val);
7743 g_conf().apply_changes(nullptr);
7744 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7745
7746 // this
7747 dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7748 dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
7749
7750
7751 if (mds->objecter->is_active()) {
7752 dout(0) << "objecter still active" << dendl;
7753 mds->objecter->dump_active();
7754 }
7755 }
7756
7757
7758 void MDCache::shutdown_start()
7759 {
7760 dout(5) << "shutdown_start" << dendl;
7761
7762 if (g_conf()->mds_shutdown_check)
7763 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7764
7765 // g_conf()->debug_mds = 10;
7766 }
7767
7768
7769
// One pass of the cache shutdown sequence.
//
// Returns true when the rank is already stopped, or when every step below has
// completed and the cache has been torn down. Returns false as soon as a step
// is still outstanding; nothing here re-schedules the pass, so presumably the
// caller invokes it again later (NOTE(review): confirm against the caller).
//
// Steps, in order: export strays, trim the cache, export auth subtrees (only
// when this is not rank 0), terminate client sessions, trim the journal, drop
// the stray-dir pins, wait for subtrees / replicas / auth pins to go away, cap
// and empty the journal, write the journal head, wait for the objecter to go
// idle, wait for both LRUs to drain, then remove mydir, myin and the global
// snaprealm inode.
7770 bool MDCache::shutdown_pass()
7771 {
7772 dout(7) << "shutdown_pass" << dendl;
7773
7774 if (mds->is_stopped()) {
7775 dout(7) << " already shut down" << dendl;
7776 show_cache();
7777 show_subtrees();
7778 return true;
7779 }
7780
7781 // empty stray dir
// The result is only acted on further down, after the subtree exports for
// this pass have been queued.
7782 bool strays_all_exported = shutdown_export_strays();
7783
7784 // trim cache
7785 trim(UINT64_MAX);
7786 dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7787
7788 // Export all subtrees to another active (usually rank 0) if not rank 0
7789 int num_auth_subtree = 0;
7790 if (!subtrees.empty() &&
7791 mds->get_nodeid() != 0) {
7792 dout(7) << "looking for subtrees to export to mds0" << dendl;
7793 std::vector<CDir*> ls;
7794 for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
7795 it != subtrees.end();
7796 ++it) {
7797 CDir *dir = it->first;
7798 if (dir->get_inode()->is_mdsdir())
7799 continue;
7800 if (dir->is_auth()) {
// Every auth subtree is counted, but only those that are not frozen,
// freezing, ambiguous or already exporting are queued for export now.
7801 num_auth_subtree++;
7802 if (dir->is_frozen() ||
7803 dir->is_freezing() ||
7804 dir->is_ambiguous_dir_auth() ||
7805 dir->state_test(CDir::STATE_EXPORTING))
7806 continue;
7807 ls.push_back(dir);
7808 }
7809 }
7810
7811 migrator->clear_export_queue();
7812 for (const auto& dir : ls) {
// Target the rank that is auth for the subtree root's inode; fall back to
// rank 0 when that rank is non-zero and not active.
7813 mds_rank_t dest = dir->get_inode()->authority().first;
7814 if (dest > 0 && !mds->mdsmap->is_active(dest))
7815 dest = 0;
7816 dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
7817 migrator->export_dir_nicely(dir, dest);
7818 }
7819 }
7820
7821 if (!strays_all_exported) {
7822 dout(7) << "waiting for strays to migrate" << dendl;
7823 return false;
7824 }
7825
7826 if (num_auth_subtree > 0) {
// num_auth_subtree is only incremented inside the "not rank 0" branch above.
7827 ceph_assert(mds->get_nodeid() > 0);
7828 dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
7829 show_subtrees();
7830 return false;
7831 }
7832
7833 // close out any sessions (and open files!) before we try to trim the log, etc.
7834 if (mds->sessionmap.have_unclosed_sessions()) {
7835 if (!mds->server->terminating_sessions)
7836 mds->server->terminate_sessions();
7837 return false;
7838 }
7839
7840 // Fully trim the log so that all objects in cache are clean and may be
7841 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7842 // trim the log such that the cache eventually becomes clean.
7843 if (mds->mdlog->get_num_segments() > 0) {
7844 auto ls = mds->mdlog->get_current_segment();
7845 if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
7846 // Current segment contains events other than subtreemap or
7847 // there are dirty dirfrags (see CDir::log_mark_dirty())
7848 mds->mdlog->start_new_segment();
7849 mds->mdlog->flush();
7850 }
7851 }
7852 mds->mdlog->trim_all();
7853 if (mds->mdlog->get_num_segments() > 1) {
7854 dout(7) << "still >1 segments, waiting for log to trim" << dendl;
7855 return false;
7856 }
7857
7858 // drop our reference to our stray dir inode
// Guarded by STATE_STRAYPINNED, so a later pass does not drop the pins twice.
7859 for (int i = 0; i < NUM_STRAY; ++i) {
7860 if (strays[i] &&
7861 strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
7862 strays[i]->state_clear(CInode::STATE_STRAYPINNED);
7863 strays[i]->put(CInode::PIN_STRAY);
7864 strays[i]->put_stickydirs();
7865 }
7866 }
7867
// mydir is only of interest below if it is itself a subtree root.
7868 CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
7869 if (mydir && !mydir->is_subtree_root())
7870 mydir = NULL;
7871
7872 // subtrees map not empty yet?
// The mydir subtree (if any) is allowed to remain; it is removed at the end.
7873 if (subtrees.size() > (mydir ? 1 : 0)) {
7874 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
7875 show_subtrees();
7876 migrator->show_importing();
7877 migrator->show_exporting();
7878 if (!migrator->is_importing() && !migrator->is_exporting())
7879 show_cache();
7880 return false;
7881 }
7882 ceph_assert(!migrator->is_exporting());
7883 ceph_assert(!migrator->is_importing());
7884
7885 // replicas may dirty scatter locks
7886 if (myin && myin->is_replicated()) {
7887 dout(7) << "still have replicated objects" << dendl;
7888 return false;
7889 }
7890
7891 if ((myin && myin->get_num_auth_pins()) ||
7892 (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
7893 dout(7) << "still have auth pinned objects" << dendl;
7894 return false;
7895 }
7896
7897 // (only do this once!)
7898 if (!mds->mdlog->is_capped()) {
7899 dout(7) << "capping the log" << dendl;
7900 mds->mdlog->cap();
7901 }
7902
7903 if (!mds->mdlog->empty())
7904 mds->mdlog->trim(0);
7905
7906 if (!mds->mdlog->empty()) {
7907 dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
7908 << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
7909 return false;
7910 }
7911
// did_shutdown_log_cap makes the header write happen once; this pass returns
// false right after issuing it.
7912 if (!did_shutdown_log_cap) {
7913 // flush journal header
7914 dout(7) << "writing header for (now-empty) journal" << dendl;
7915 ceph_assert(mds->mdlog->empty());
7916 mds->mdlog->write_head(0);
7917 // NOTE: filer active checker below will block us until this completes.
7918 did_shutdown_log_cap = true;
7919 return false;
7920 }
7921
7922 // filer active?
7923 if (mds->objecter->is_active()) {
7924 dout(7) << "objecter still active" << dendl;
7925 mds->objecter->dump_active();
7926 return false;
7927 }
7928
7929 // trim what we can from the cache
// Nothing is trimmed here; this only checks that both LRUs have drained.
7930 if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
7931 dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
7932 show_cache();
7933 //dump();
7934 return false;
7935 }
7936
7937 // make mydir subtree go away
7938 if (mydir) {
// More than one reference means something besides the subtree pin still
// holds mydir.
7939 if (mydir->get_num_ref() > 1) { // subtree pin
7940 dout(7) << "there's still reference to mydir " << *mydir << dendl;
7941 show_cache();
7942 return false;
7943 }
7944
7945 remove_subtree(mydir);
7946 myin->close_dirfrag(mydir->get_frag());
7947 }
7948 ceph_assert(subtrees.empty());
7949
7950 if (myin) {
// The assert below relies on remove_inode() clearing the myin member.
7951 remove_inode(myin);
7952 ceph_assert(!myin);
7953 }
7954
7955 if (global_snaprealm) {
7956 remove_inode(global_snaprealm->inode);
7957 global_snaprealm = nullptr;
7958 }
7959
7960 // done!
7961 dout(5) << "shutdown done." << dendl;
7962 return true;
7963 }
7964
// Migrate this rank's stray dentries to rank 0 as part of shutdown.
//
// Returns true immediately on rank 0. Otherwise returns true only when a scan
// that started from the first stray dirfrag reaches the end of all stray dirs
// while shutdown_exporting_strays is empty; returns false in every other case.
//
// The scan position is kept in shutdown_export_next as (dirfrag, dentry name)
// so a later call resumes where this one stopped. A scan stops once
// shutdown_exporting_strays holds MAX_EXPORTING inos, and a new scan is not
// started while the set is at least two thirds of that size.
7965 bool MDCache::shutdown_export_strays()
7966 {
7967 static const unsigned MAX_EXPORTING = 100;
7968
7969 if (mds->get_nodeid() == 0)
7970 return true;
7971
// Too many exports already tracked (size >= 2/3 of MAX_EXPORTING): do not scan.
7972 if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
7973 return false;
7974
7975 dout(10) << "shutdown_export_strays " << shutdown_export_next.first
7976 << " '" << shutdown_export_next.second << "'" << dendl;
7977
7978 bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
7979 bool all_exported = false;
7980
// Restart point: (re)load the working cursor from the saved scan position.
7981 again:
7982 auto next = shutdown_export_next;
7983
7984 for (int i = 0; i < NUM_STRAY; ++i) {
7985 CInode *strayi = strays[i];
7986 if (!strayi ||
7987 !strayi->state_test(CInode::STATE_STRAYPINNED))
7988 continue;
// Skip stray dirs that sort before the resume position.
7989 if (strayi->ino() < next.first.ino)
7990 continue;
7991
7992 deque<CDir*> dfls;
7993 strayi->get_dirfrags(dfls);
7994
7995 while (!dfls.empty()) {
7996 CDir *dir = dfls.front();
7997 dfls.pop_front();
7998
// Skip dirfrags before the cursor; on moving past it, advance the cursor to
// this dirfrag and forget the saved dentry name.
7999 if (dir->dirfrag() < next.first)
8000 continue;
8001 if (next.first < dir->dirfrag()) {
8002 next.first = dir->dirfrag();
8003 next.second.clear();
8004 }
8005
// Dirfrag is not complete: fetch it and stop this scan. A callback that
// re-runs this function is attached only when nothing is currently tracked
// in shutdown_exporting_strays; otherwise the fetch gets a null context.
8006 if (!dir->is_complete()) {
8007 MDSContext *fin = nullptr;
8008 if (shutdown_exporting_strays.empty()) {
8009 fin = new MDSInternalContextWrapper(mds,
8010 new LambdaContext([this](int r) {
8011 shutdown_export_strays();
8012 })
8013 );
8014 }
8015 dir->fetch(fin);
8016 goto done;
8017 }
8018
// Start from the beginning of the dirfrag, or from the first dentry at or
// after the saved name.
8019 CDir::dentry_key_map::iterator it;
8020 if (next.second.empty()) {
8021 it = dir->begin();
8022 } else {
8023 auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
8024 it = dir->lower_bound(dentry_key_t(0, next.second, hash));
8025 }
8026
8027 for (; it != dir->end(); ++it) {
8028 CDentry *dn = it->second;
8029 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8030 if (dnl->is_null())
8031 continue;
8032
// Rank 0 is not active and this dentry is not being purged: remember it
// as the resume point and stop.
8033 if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
8034 next.second = it->first.name;
8035 goto done;
8036 }
8037
// Track the inode; insert() reports whether it was already tracked.
8038 auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
8039 if (!ret.second) {
8040 dout(10) << "already exporting/purging " << *dn << dendl;
8041 continue;
8042 }
8043
8044 // Don't try to migrate anything that is actually
8045 // being purged right now
8046 if (!dn->state_test(CDentry::STATE_PURGING))
8047 stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
8048
// Limit reached: record the position just after this dentry -- the next
// dentry in this dirfrag, else the next dirfrag of this stray dir, else
// the next inode number -- and stop.
8049 if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
8050 ++it;
8051 if (it != dir->end()) {
8052 next.second = it->first.name;
8053 } else {
8054 if (dfls.empty())
8055 next.first.ino.val++;
8056 else
8057 next.first = dfls.front()->dirfrag();
8058 next.second.clear();
8059 }
8060 goto done;
8061 }
8062 }
8063 }
8064 }
8065
// Reached the end of every stray dir with nothing tracked. If this scan did
// not start from the very first stray dirfrag with an empty name, rewind the
// saved position and scan again from the beginning; otherwise we are done.
8066 if (shutdown_exporting_strays.empty()) {
8067 dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
8068 if (first_df < shutdown_export_next.first ||
8069 !shutdown_export_next.second.empty()) {
8070 shutdown_export_next.first = first_df;
8071 shutdown_export_next.second.clear();
8072 goto again;
8073 }
8074 all_exported = true;
8075 }
8076
// Save the cursor for the next call.
8077 done:
8078 shutdown_export_next = next;
8079 return all_exported;
8080 }
8081
8082 // ========= messaging ==============
8083
8084 void MDCache::dispatch(const cref_t<Message> &m)
8085 {
8086 switch (m->get_type()) {
8087
8088 // RESOLVE
8089 case MSG_MDS_RESOLVE:
8090 handle_resolve(ref_cast<MMDSResolve>(m));
8091 break;
8092 case MSG_MDS_RESOLVEACK:
8093 handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
8094 break;
8095
8096 // REJOIN
8097 case MSG_MDS_CACHEREJOIN:
8098 handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
8099 break;
8100
8101 case MSG_MDS_DISCOVER:
8102 handle_discover(ref_cast<MDiscover>(m));
8103 break;
8104 case MSG_MDS_DISCOVERREPLY:
8105 handle_discover_reply(ref_cast<MDiscoverReply>(m));
8106 break;
8107
8108 case MSG_MDS_DIRUPDATE:
8109 handle_dir_update(ref_cast<MDirUpdate>(m));
8110 break;
8111
8112 case MSG_MDS_CACHEEXPIRE:
8113 handle_cache_expire(ref_cast<MCacheExpire>(m));
8114 break;
8115
8116 case MSG_MDS_DENTRYLINK:
8117 handle_dentry_link(ref_cast<MDentryLink>(m));
8118 break;
8119 case MSG_MDS_DENTRYUNLINK:
8120 handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
8121 break;
8122
8123 case MSG_MDS_FRAGMENTNOTIFY:
8124 handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
8125 break;
8126 case MSG_MDS_FRAGMENTNOTIFYACK:
8127 handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
8128 break;
8129
8130 case MSG_MDS_FINDINO:
8131 handle_find_ino(ref_cast<MMDSFindIno>(m));
8132 break;
8133 case MSG_MDS_FINDINOREPLY:
8134 handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
8135 break;
8136
8137 case MSG_MDS_OPENINO:
8138 handle_open_ino(ref_cast<MMDSOpenIno>(m));
8139 break;
8140 case MSG_MDS_OPENINOREPLY:
8141 handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
8142 break;
8143
8144 case MSG_MDS_SNAPUPDATE:
8145 handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
8146 break;
8147
8148 default:
8149 derr << "cache unknown message " << m->get_type() << dendl;
8150 ceph_abort_msg("cache unknown message");
8151 }
8152 }
8153
/**
 * path_traverse - walk a filepath through the cache, one component at a time
 *
 * @param mdr request the traversal is done for. Asserted non-null unless
 *        MDS_TRAVERSE_DISCOVER is set; the recursive call below passes a
 *        null ref together with MDS_TRAVERSE_DISCOVER.
 * @param cf factory for the context that is queued whenever the traversal
 *        has to wait for something
 * @param path path to walk; the walk starts at the inode path.get_ino()
 * @param flags MDS_TRAVERSE_* bits (decoded into the booleans at the top)
 * @param pdnvec if non-null, cleared and then filled with the dentries walked
 * @param pin if non-null, set to the last inode reached
 *
 * @return 0 on success;
 *         1 if the traversal could not finish now (a context built from cf
 *           was queued, or a lock/snaprealm step did not complete);
 *         2 if the request was forwarded to another rank;
 *         a negative errno (-ESTALE, -ENOTDIR, -EINVAL, -ENOENT, -EIO as
 *           returned below) on failure.
 */
8154 int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
8155 const filepath& path, int flags,
8156 vector<CDentry*> *pdnvec, CInode **pin)
8157 {
8158 bool discover = (flags & MDS_TRAVERSE_DISCOVER);
// Forwarding is the alternative to discovering when we are not auth.
8159 bool forward = !discover;
8160 bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
8161 bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
8162 bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
8163 bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
8164 bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
8165 bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
8166 bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
8167
8168 if (forward)
8169 ceph_assert(mdr); // forward requires a request
8170
8171 snapid_t snapid = CEPH_NOSNAP;
8172 if (mdr)
8173 mdr->snapid = snapid;
8174
// -1 when there is no request or the request did not come from a client.
8175 client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
8176
8177 if (mds->logger) mds->logger->inc(l_mds_traverse);
8178
8179 dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
8180 CInode *cur = get_inode(path.get_ino());
8181 if (!cur) {
// Base inode is not in cache: mdsdir and stray bases can be opened/discovered,
// anything else is reported as stale.
8182 if (MDS_INO_IS_MDSDIR(path.get_ino())) {
8183 open_foreign_mdsdir(path.get_ino(), cf.build());
8184 return 1;
8185 }
8186 if (MDS_INO_IS_STRAY(path.get_ino())) {
8187 mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
8188 unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
// NOTE(review): this local shadows the 'path' parameter. The dentry name is
// taken from our own strays[idx], with the owner rank's mdsdir as the base.
8189 filepath path(strays[idx]->get_parent_dn()->get_name(),
8190 MDS_INO_MDSDIR(rank));
8191 MDRequestRef null_ref;
8192 return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
8193 }
8194 return -ESTALE;
8195 }
8196 if (cur->state_test(CInode::STATE_PURGING))
8197 return -ESTALE;
8198
8199 // make sure snaprealm are open...
8200 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8201 !cur->snaprealm->open_parents(cf.build())) {
8202 return 1;
8203 }
8204
8205 if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
8206 mds->locker->find_and_attach_lock_cache(mdr, cur);
8207
// With a lock cache attached only the dir layout is copied; otherwise, if a
// snap rdlock was requested and is not yet held, try to take it on the base.
8208 if (mdr && mdr->lock_cache) {
8209 if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
8210 mdr->dir_layout = mdr->lock_cache->get_dir_layout();
8211 } else if (rdlock_snap) {
// n selects between the SNAP_LOCKED (0) and SNAP2_LOCKED (1) state bits.
8212 int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
8213 if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
8214 (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
8215 bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
8216 if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
8217 return 1;
8218 }
8219 }
8220
8221 // start trace
8222 if (pdnvec)
8223 pdnvec->clear();
8224 if (pin)
8225 *pin = cur;
8226
8227 MutationImpl::LockOpVec lov;
8228
// Main loop: depth is advanced explicitly on each successful step; every
// other path through the body returns or breaks.
8229 for (unsigned depth = 0; depth < path.depth(); ) {
8230 dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
8231 << "' snapid " << snapid << dendl;
8232
8233 if (!cur->is_dir()) {
8234 dout(7) << "traverse: " << *cur << " not a dir " << dendl;
8235 return -ENOTDIR;
8236 }
8237
8238 // walk into snapdir?
// An empty path component selects the snapdir.
8239 if (path[depth].length() == 0) {
8240 dout(10) << "traverse: snapdir" << dendl;
8241 if (!mdr || depth > 0) // snapdir must be the first component
8242 return -EINVAL;
8243 snapid = CEPH_SNAPDIR;
8244 mdr->snapid = snapid;
8245 depth++;
8246 continue;
8247 }
8248 // walk thru snapdir?
// The component after the snapdir is a snapshot name; resolve it to a snapid.
8249 if (snapid == CEPH_SNAPDIR) {
8250 if (!mdr)
8251 return -EINVAL;
8252 SnapRealm *realm = cur->find_snaprealm();
8253 snapid = realm->resolve_snapname(path[depth], cur->ino());
8254 dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
8255 if (!snapid) {
8256 if (pdnvec)
8257 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8258 return -ENOENT;
8259 }
8260 mdr->snapid = snapid;
8261 depth++;
8262 continue;
8263 }
8264
8265 // open dir
8266 frag_t fg = cur->pick_dirfrag(path[depth]);
8267 CDir *curdir = cur->get_dirfrag(fg);
8268 if (!curdir) {
8269 if (cur->is_auth()) {
8270 // parent dir frozen_dir?
8271 if (cur->is_frozen()) {
8272 dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
8273 cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8274 return 1;
8275 }
8276 curdir = cur->get_or_open_dirfrag(this, fg);
8277 } else {
8278 // discover?
8279 dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
8280 discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
8281 path_locked);
8282 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8283 return 1;
8284 }
8285 }
8286 ceph_assert(curdir);
8287
8288 #ifdef MDS_VERIFY_FRAGSTAT
8289 if (curdir->is_complete())
8290 curdir->verify_fragstat();
8291 #endif
8292
8293 // frozen?
8294 /*
8295 if (curdir->is_frozen()) {
8296 // doh!
8297 // FIXME: traverse is allowed?
8298 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8299 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8300 if (onfinish) delete onfinish;
8301 return 1;
8302 }
8303 */
8304
// When the caller wants the final dentry on its auth, the dirfrag holding
// the last component must be unambiguously ours; otherwise wait or forward.
8305 if (want_auth && want_dentry && depth == path.depth() - 1) {
8306 if (curdir->is_ambiguous_auth()) {
8307 dout(10) << "waiting for single auth on " << *curdir << dendl;
// NOTE(review): waits on a CDir using CInode::WAIT_SINGLEAUTH, while the
// forward branch further down uses CDir::WAIT_SINGLEAUTH -- confirm both
// name the same wait bit.
8308 curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8309 return 1;
8310 }
8311 if (!curdir->is_auth()) {
8312 dout(10) << "fw to auth for " << *curdir << dendl;
8313 request_forward(mdr, curdir->authority().first);
8314 return 2;
8315 }
8316 }
8317
8318 // Before doing dirfrag->dn lookup, compare with DamageTable's
8319 // record of which dentries were unreadable
8320 if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
8321 dout(4) << "traverse: stopped lookup at damaged dentry "
8322 << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
8323 return -EIO;
8324 }
8325
8326 // dentry
8327 CDentry *dn = curdir->lookup(path[depth], snapid);
8328 if (dn) {
8329 if (dn->state_test(CDentry::STATE_PURGING))
8330 return -ENOENT;
8331
8332 if (rdlock_path) {
// Build the lock set for this component: the last component gets an xlock
// (plus wrlocks on the parent's filelock/nestlock and optionally an
// authlock rdlock) when xlock_dentry is set; every other case gets an
// rdlock on the dentry.
8333 lov.clear();
8334 if (xlock_dentry && depth == path.depth() - 1) {
8335 if (depth > 0 || !mdr->lock_cache) {
8336 lov.add_wrlock(&cur->filelock);
8337 lov.add_wrlock(&cur->nestlock);
8338 if (rdlock_authlock)
8339 lov.add_rdlock(&cur->authlock);
8340 }
8341 lov.add_xlock(&dn->lock);
8342 } else {
8343 // force client to flush async dir operation if necessary
8344 if (cur->filelock.is_cached())
8345 lov.add_wrlock(&cur->filelock);
8346 lov.add_rdlock(&dn->lock);
8347 }
8348 if (!mds->locker->acquire_locks(mdr, lov)) {
8349 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8350 return 1;
8351 }
// Not taking path locks ourselves: unless the path is already locked, the
// dentry must be readable by this client or be xlocked by this request.
8352 } else if (!path_locked &&
8353 !dn->lock.can_read(client) &&
8354 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
8355 dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
8356 dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
8357 if (mds->logger)
8358 mds->logger->inc(l_mds_traverse_lock);
8359 if (dn->is_auth() && dn->lock.is_unstable_and_locked())
8360 mds->mdlog->flush();
8361 return 1;
8362 }
8363
8364 if (pdnvec)
8365 pdnvec->push_back(dn);
8366
8367 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8368 // can we conclude ENOENT?
8369 if (dnl->is_null()) {
8370 dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
// A null last component is a successful result when the caller wants the
// dentry; a null intermediate component invalidates the trace.
8371 if (depth == path.depth() - 1) {
8372 if (want_dentry)
8373 break;
8374 } else {
8375 if (pdnvec)
8376 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8377 }
8378 return -ENOENT;
8379 }
8380
8381 // do we have inode?
8382 CInode *in = dnl->get_inode();
8383 if (!in) {
8384 ceph_assert(dnl->is_remote());
8385 // do i have it?
8386 in = get_inode(dnl->get_remote_ino());
8387 if (in) {
8388 dout(7) << "linking in remote in " << *in << dendl;
8389 dn->link_remote(dnl, in);
8390 } else {
8391 dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
8392 ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8393 if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
8394 dout(4) << "traverse: remote dentry points to damaged ino "
8395 << *dn << dendl;
8396 return -EIO;
8397 }
8398 open_remote_dentry(dn, true, cf.build(),
8399 (path_locked && depth == path.depth() - 1));
8400 if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
8401 return 1;
8402 }
8403 }
8404
8405 cur = in;
8406 // make sure snaprealm are open...
8407 if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
8408 !cur->snaprealm->open_parents(cf.build())) {
8409 return 1;
8410 }
8411
// The snaplock rdlock is skipped for the final inode when the caller only
// wants the dentry.
8412 if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
8413 lov.clear();
8414 lov.add_rdlock(&cur->snaplock);
8415 if (!mds->locker->acquire_locks(mdr, lov)) {
8416 dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
8417 return 1;
8418 }
8419 }
8420
8421 // add to trace, continue.
8422 touch_inode(cur);
8423 if (pin)
8424 *pin = cur;
8425 depth++;
8426 continue;
8427 }
8428
8429 ceph_assert(!dn);
8430
8431 // MISS. dentry doesn't exist.
8432 dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
8433
8434 if (curdir->is_auth()) {
8435 // dentry is mine.
// The miss is authoritative if the dirfrag is complete, or (head only) if
// the dirfrag has a bloom filter and the name is not in it.
8436 if (curdir->is_complete() ||
8437 (snapid == CEPH_NOSNAP &&
8438 curdir->has_bloom() &&
8439 !curdir->is_in_bloom(path[depth]))) {
8440 // file not found
8441 if (pdnvec) {
8442 // instantiate a null dn?
8443 if (depth < path.depth() - 1) {
8444 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
8445 } else if (snapid < CEPH_MAXSNAP) {
8446 dout(20) << " not adding null for snapid " << snapid << dendl;
8447 } else if (curdir->is_frozen()) {
8448 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8449 curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
8450 return 1;
8451 } else {
8452 // create a null dentry
8453 dn = curdir->add_null_dentry(path[depth]);
8454 dout(20) << " added null " << *dn << dendl;
8455
// Same lock set as for an existing dentry above; this branch is only
// reached for the last path component, so xlock_dentry needs no depth
// check here.
8456 if (rdlock_path) {
8457 lov.clear();
8458 if (xlock_dentry) {
8459 if (depth > 0 || !mdr->lock_cache) {
8460 lov.add_wrlock(&cur->filelock);
8461 lov.add_wrlock(&cur->nestlock);
8462 if (rdlock_authlock)
8463 lov.add_rdlock(&cur->authlock);
8464 }
8465 lov.add_xlock(&dn->lock);
8466 } else {
8467 // force client to flush async dir operation if necessary
8468 if (cur->filelock.is_cached())
8469 lov.add_wrlock(&cur->filelock);
8470 lov.add_rdlock(&dn->lock);
8471 }
8472 if (!mds->locker->acquire_locks(mdr, lov)) {
8473 dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
8474 return 1;
8475 }
8476 }
8477 }
// dn is non-null only if a null dentry was just created above.
8478 if (dn) {
8479 pdnvec->push_back(dn);
8480 if (want_dentry)
8481 break;
8482 } else {
8483 pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
8484 }
8485 }
8486 return -ENOENT;
8487 } else {
8488
8489 // Check DamageTable for missing fragments before trying to fetch
8490 // this
8491 if (mds->damage_table.is_dirfrag_damaged(curdir)) {
8492 dout(4) << "traverse: damaged dirfrag " << *curdir
8493 << ", blocking fetch" << dendl;
8494 return -EIO;
8495 }
8496
8497 // directory isn't complete; reload
8498 dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
8499 touch_inode(cur);
8500 curdir->fetch(cf.build(), path[depth]);
8501 if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
8502 return 1;
8503 }
8504 } else {
8505 // dirfrag/dentry is not mine.
8506 mds_authority_t dauth = curdir->authority();
8507
// A client request whose forward count exceeds the current depth is switched
// from forwarding to discovering (unless forward_all_requests_to_auth is set).
8508 if (!forward_all_requests_to_auth &&
8509 forward &&
8510 mdr && mdr->client_request &&
8511 (int)depth < mdr->client_request->get_num_fwd()){
8512 dout(7) << "traverse: snap " << snapid << " and depth " << depth
8513 << " < fwd " << mdr->client_request->get_num_fwd()
8514 << ", discovering instead of forwarding" << dendl;
8515 discover = true;
8516 }
8517
8518 if ((discover)) {
8519 dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
8520 discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
8521 path_locked);
8522 if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
8523 return 1;
8524 }
8525 if (forward) {
8526 // forward
8527 dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
8528
8529 if (curdir->is_ambiguous_auth()) {
8530 // wait
8531 dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
8532 curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
8533 return 1;
8534 }
8535
8536 dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
8537
8538 request_forward(mdr, dauth.first);
8539
8540 if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
8541 return 2;
8542 }
8543 }
8544
8545 ceph_abort(); // i shouldn't get here
8546 }
8547
// When the caller wants the final inode (not the dentry) on its auth, make
// sure we are the unambiguous auth for it; otherwise wait or forward.
8548 if (want_auth && !want_dentry) {
8549 if (cur->is_ambiguous_auth()) {
8550 dout(10) << "waiting for single auth on " << *cur << dendl;
8551 cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
8552 return 1;
8553 }
8554 if (!cur->is_auth()) {
8555 dout(10) << "fw to auth for " << *cur << dendl;
8556 request_forward(mdr, cur->authority().first);
8557 return 2;
8558 }
8559 }
8560
8561 // success.
8562 if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
8563 dout(10) << "path_traverse finish on snapid " << snapid << dendl;
8564 if (mdr)
8565 ceph_assert(mdr->snapid == snapid);
8566
// Record on the request which lock stages this traversal completed.
8567 if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
8568 mdr->locking_state |= MutationImpl::SNAP_LOCKED;
8569 else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
8570 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
8571
8572 if (rdlock_path)
8573 mdr->locking_state |= MutationImpl::PATH_LOCKED;
8574
8575 return 0;
8576 }
8577
8578 CInode *MDCache::cache_traverse(const filepath& fp)
8579 {
8580 dout(10) << "cache_traverse " << fp << dendl;
8581
8582 CInode *in;
8583 if (fp.get_ino())
8584 in = get_inode(fp.get_ino());
8585 else
8586 in = root;
8587 if (!in)
8588 return NULL;
8589
8590 for (unsigned i = 0; i < fp.depth(); i++) {
8591 std::string_view dname = fp[i];
8592 frag_t fg = in->pick_dirfrag(dname);
8593 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8594 CDir *curdir = in->get_dirfrag(fg);
8595 if (!curdir)
8596 return NULL;
8597 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8598 if (!dn)
8599 return NULL;
8600 in = dn->get_linkage()->get_inode();
8601 if (!in)
8602 return NULL;
8603 }
8604 dout(10) << " got " << *in << dendl;
8605 return in;
8606 }
8607
8608
8609 /**
8610 * open_remote_dir -- open up a remote dirfrag
8611 *
8612 * @param diri base inode
8613 * @param approxfg approximate fragment.
8614 * @param fin completion callback
8615 */
8616 void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
8617 {
8618 dout(10) << "open_remote_dir on " << *diri << dendl;
8619 ceph_assert(diri->is_dir());
8620 ceph_assert(!diri->is_auth());
8621 ceph_assert(diri->get_dirfrag(approxfg) == 0);
8622
8623 discover_dir_frag(diri, approxfg, fin);
8624 }
8625
8626
8627 /**
8628 * get_dentry_inode - get or open inode
8629 *
8630 * @param dn the dentry
8631 * @param mdr current request
8632 *
8633 * will return inode for primary, or link up/open up remote link's inode as necessary.
8634 * If it's not available right now, puts mdr on wait list and returns null.
8635 */
8636 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8637 {
8638 CDentry::linkage_t *dnl;
8639 if (projected)
8640 dnl = dn->get_projected_linkage();
8641 else
8642 dnl = dn->get_linkage();
8643
8644 ceph_assert(!dnl->is_null());
8645
8646 if (dnl->is_primary())
8647 return dnl->inode;
8648
8649 ceph_assert(dnl->is_remote());
8650 CInode *in = get_inode(dnl->get_remote_ino());
8651 if (in) {
8652 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8653 dn->link_remote(dnl, in);
8654 return in;
8655 } else {
8656 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8657 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8658 return 0;
8659 }
8660 }
8661
8662 struct C_MDC_OpenRemoteDentry : public MDCacheContext {
8663 CDentry *dn;
8664 inodeno_t ino;
8665 MDSContext *onfinish;
8666 bool want_xlocked;
8667 C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
8668 MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
8669 dn->get(MDSCacheObject::PIN_PTRWAITER);
8670 }
8671 void finish(int r) override {
8672 mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
8673 dn->put(MDSCacheObject::PIN_PTRWAITER);
8674 }
8675 };
8676
8677 void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
8678 {
8679 dout(10) << "open_remote_dentry " << *dn << dendl;
8680 CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
8681 inodeno_t ino = dnl->get_remote_ino();
8682 int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
8683 open_ino(ino, pool,
8684 new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
8685 }
8686
8687 void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
8688 bool want_xlocked, int r)
8689 {
8690 if (r < 0) {
8691 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8692 if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
8693 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
8694 dn->state_set(CDentry::STATE_BADREMOTEINO);
8695
8696 std::string path;
8697 CDir *dir = dn->get_dir();
8698 if (dir) {
8699 dir->get_inode()->make_path_string(path);
8700 path += "/";
8701 path += dn->get_name();
8702 }
8703
8704 bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
8705 if (fatal) {
8706 mds->damaged();
8707 ceph_abort(); // unreachable, damaged() respawns us
8708 }
8709 } else {
8710 r = 0;
8711 }
8712 }
8713 fin->complete(r < 0 ? r : 0);
8714 }
8715
8716
8717 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8718 {
8719 // empty trace if we're a base inode
8720 if (in->is_base())
8721 return;
8722
8723 CInode *parent = in->get_parent_inode();
8724 ceph_assert(parent);
8725 make_trace(trace, parent);
8726
8727 CDentry *dn = in->get_parent_dn();
8728 dout(15) << "make_trace adding " << *dn << dendl;
8729 trace.push_back(dn);
8730 }
8731
8732
8733 // -------------------------------------------------------------------------------
8734 // Open inode by inode number
8735
8736 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8737 inodeno_t ino;
8738 public:
8739 bufferlist bl;
8740 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8741 MDCacheIOContext(c), ino(i) {}
8742 void finish(int r) override {
8743 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8744 }
8745 void print(ostream& out) const override {
8746 out << "openino_backtrace_fetch" << ino << ")";
8747 }
8748 };
8749
8750 struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
8751 inodeno_t ino;
8752 cref_t<MMDSOpenIno> msg;
8753 bool parent;
8754 public:
8755 C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
8756 MDCacheContext(c), ino(i), msg(m), parent(p) {}
8757 void finish(int r) override {
8758 if (r < 0 && !parent)
8759 r = -EAGAIN;
8760 if (msg) {
8761 mdcache->handle_open_ino(msg, r);
8762 return;
8763 }
8764 auto& info = mdcache->opening_inodes.at(ino);
8765 mdcache->_open_ino_traverse_dir(ino, info, r);
8766 }
8767 };
8768
8769 struct C_MDC_OpenInoParentOpened : public MDCacheContext {
8770 inodeno_t ino;
8771 public:
8772 C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
8773 void finish(int r) override {
8774 mdcache->_open_ino_parent_opened(ino, r);
8775 }
8776 };
8777
8778 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8779 {
8780 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8781
8782 open_ino_info_t& info = opening_inodes.at(ino);
8783
8784 CInode *in = get_inode(ino);
8785 if (in) {
8786 dout(10) << " found cached " << *in << dendl;
8787 open_ino_finish(ino, info, in->authority().first);
8788 return;
8789 }
8790
8791 inode_backtrace_t backtrace;
8792 if (err == 0) {
8793 try {
8794 decode(backtrace, bl);
8795 } catch (const buffer::error &decode_exc) {
8796 derr << "corrupt backtrace on ino x0" << std::hex << ino
8797 << std::dec << ": " << decode_exc << dendl;
8798 open_ino_finish(ino, info, -EIO);
8799 return;
8800 }
8801 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8802 dout(10) << " old object in pool " << info.pool
8803 << ", retrying pool " << backtrace.pool << dendl;
8804 info.pool = backtrace.pool;
8805 C_IO_MDC_OpenInoBacktraceFetched *fin =
8806 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8807 fetch_backtrace(ino, info.pool, fin->bl,
8808 new C_OnFinisher(fin, mds->finisher));
8809 return;
8810 }
8811 } else if (err == -ENOENT) {
8812 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8813 if (info.pool != meta_pool) {
8814 dout(10) << " no object in pool " << info.pool
8815 << ", retrying pool " << meta_pool << dendl;
8816 info.pool = meta_pool;
8817 C_IO_MDC_OpenInoBacktraceFetched *fin =
8818 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8819 fetch_backtrace(ino, info.pool, fin->bl,
8820 new C_OnFinisher(fin, mds->finisher));
8821 return;
8822 }
8823 err = 0; // backtrace.ancestors.empty() is checked below
8824 }
8825
8826 if (err == 0) {
8827 if (backtrace.ancestors.empty()) {
8828 dout(10) << " got empty backtrace " << dendl;
8829 err = -ESTALE;
8830 } else if (!info.ancestors.empty()) {
8831 if (info.ancestors[0] == backtrace.ancestors[0]) {
8832 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8833 err = -EINVAL;
8834 } else {
8835 info.last_err = 0;
8836 }
8837 }
8838 }
8839 if (err) {
8840 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8841 if (info.last_err)
8842 err = info.last_err;
8843 open_ino_finish(ino, info, err);
8844 return;
8845 }
8846
8847 dout(10) << " got backtrace " << backtrace << dendl;
8848 info.ancestors = backtrace.ancestors;
8849
8850 _open_ino_traverse_dir(ino, info, 0);
8851 }
8852
8853 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8854 {
8855 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8856
8857 open_ino_info_t& info = opening_inodes.at(ino);
8858
8859 CInode *in = get_inode(ino);
8860 if (in) {
8861 dout(10) << " found cached " << *in << dendl;
8862 open_ino_finish(ino, info, in->authority().first);
8863 return;
8864 }
8865
8866 if (ret == mds->get_nodeid()) {
8867 _open_ino_traverse_dir(ino, info, 0);
8868 } else {
8869 if (ret >= 0) {
8870 mds_rank_t checked_rank = mds_rank_t(ret);
8871 info.check_peers = true;
8872 info.auth_hint = checked_rank;
8873 info.checked.erase(checked_rank);
8874 }
8875 do_open_ino(ino, info, ret);
8876 }
8877 }
8878
8879 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8880 {
8881 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8882
8883 CInode *in = get_inode(ino);
8884 if (in) {
8885 dout(10) << " found cached " << *in << dendl;
8886 open_ino_finish(ino, info, in->authority().first);
8887 return;
8888 }
8889
8890 if (ret) {
8891 do_open_ino(ino, info, ret);
8892 return;
8893 }
8894
8895 mds_rank_t hint = info.auth_hint;
8896 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8897 info.discover, info.want_xlocked, &hint);
8898 if (ret > 0)
8899 return;
8900 if (hint != mds->get_nodeid())
8901 info.auth_hint = hint;
8902 do_open_ino(ino, info, ret);
8903 }
8904
8905 void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
8906 {
8907 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8908 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8909 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8910 if (mds->logger)
8911 mds->logger->inc(l_mds_openino_dir_fetch);
8912 }
8913
8914 int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
8915 const vector<inode_backpointer_t>& ancestors,
8916 bool discover, bool want_xlocked, mds_rank_t *hint)
8917 {
8918 dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
8919 int err = 0;
8920 for (unsigned i = 0; i < ancestors.size(); i++) {
8921 const auto& ancestor = ancestors.at(i);
8922 CInode *diri = get_inode(ancestor.dirino);
8923
8924 if (!diri) {
8925 if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
8926 open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8927 return 1;
8928 }
8929 continue;
8930 }
8931
8932 if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
8933 CDir *dir = diri->get_parent_dir();
8934 while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
8935 dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
8936 dir = dir->get_inode()->get_parent_dir();
8937 _open_ino_fetch_dir(ino, m, dir, i == 0);
8938 return 1;
8939 }
8940
8941 if (!diri->is_dir()) {
8942 dout(10) << " " << *diri << " is not dir" << dendl;
8943 if (i == 0)
8944 err = -ENOTDIR;
8945 break;
8946 }
8947
8948 const string& name = ancestor.dname;
8949 frag_t fg = diri->pick_dirfrag(name);
8950 CDir *dir = diri->get_dirfrag(fg);
8951 if (!dir) {
8952 if (diri->is_auth()) {
8953 if (diri->is_frozen()) {
8954 dout(10) << " " << *diri << " is frozen, waiting " << dendl;
8955 diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8956 return 1;
8957 }
8958 dir = diri->get_or_open_dirfrag(this, fg);
8959 } else if (discover) {
8960 open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8961 return 1;
8962 }
8963 }
8964 if (dir) {
8965 inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
8966 CDentry *dn = dir->lookup(name);
8967 CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
8968 if (dir->is_auth()) {
8969 if (dnl && dnl->is_primary() &&
8970 dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
8971 dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
8972 _open_ino_fetch_dir(ino, m, dir, i == 0);
8973 return 1;
8974 }
8975
8976 if (!dnl && !dir->is_complete() &&
8977 (!dir->has_bloom() || dir->is_in_bloom(name))) {
8978 dout(10) << " fetching incomplete " << *dir << dendl;
8979 _open_ino_fetch_dir(ino, m, dir, i == 0);
8980 return 1;
8981 }
8982
8983 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8984 if (i == 0)
8985 err = -ENOENT;
8986 } else if (discover) {
8987 if (!dnl) {
8988 filepath path(name, 0);
8989 discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
8990 (i == 0 && want_xlocked));
8991 return 1;
8992 }
8993 if (dnl->is_null() && !dn->lock.can_read(-1)) {
8994 dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
8995 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
8996 return 1;
8997 }
8998 dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
8999 if (i == 0)
9000 err = -ENOENT;
9001 }
9002 }
9003 if (hint && i == 0)
9004 *hint = dir ? dir->authority().first : diri->authority().first;
9005 break;
9006 }
9007 return err;
9008 }
9009
9010 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
9011 {
9012 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9013
9014 MDSContext::vec waiters;
9015 waiters.swap(info.waiters);
9016 opening_inodes.erase(ino);
9017 finish_contexts(g_ceph_context, waiters, ret);
9018 }
9019
9020 void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
9021 {
9022 if (err < 0 && err != -EAGAIN) {
9023 info.checked.clear();
9024 info.checking = MDS_RANK_NONE;
9025 info.check_peers = true;
9026 info.fetch_backtrace = true;
9027 if (info.discover) {
9028 info.discover = false;
9029 info.ancestors.clear();
9030 }
9031 if (err != -ENOENT && err != -ENOTDIR)
9032 info.last_err = err;
9033 }
9034
9035 if (info.check_peers || info.discover) {
9036 if (info.discover) {
9037 // got backtrace from peer, but failed to find inode. re-check peers
9038 info.discover = false;
9039 info.ancestors.clear();
9040 info.checked.clear();
9041 }
9042 info.check_peers = false;
9043 info.checking = MDS_RANK_NONE;
9044 do_open_ino_peer(ino, info);
9045 } else if (info.fetch_backtrace) {
9046 info.check_peers = true;
9047 info.fetch_backtrace = false;
9048 info.checking = mds->get_nodeid();
9049 info.checked.clear();
9050 C_IO_MDC_OpenInoBacktraceFetched *fin =
9051 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
9052 fetch_backtrace(ino, info.pool, fin->bl,
9053 new C_OnFinisher(fin, mds->finisher));
9054 } else {
9055 ceph_assert(!info.ancestors.empty());
9056 info.checking = mds->get_nodeid();
9057 open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
9058 new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
9059 }
9060 }
9061
9062 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
9063 {
9064 set<mds_rank_t> all, active;
9065 mds->mdsmap->get_mds_set(all);
9066 if (mds->get_state() == MDSMap::STATE_REJOIN)
9067 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
9068 else
9069 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9070
9071 dout(10) << "do_open_ino_peer " << ino << " active " << active
9072 << " all " << all << " checked " << info.checked << dendl;
9073
9074 mds_rank_t whoami = mds->get_nodeid();
9075 mds_rank_t peer = MDS_RANK_NONE;
9076 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
9077 if (active.count(info.auth_hint)) {
9078 peer = info.auth_hint;
9079 info.auth_hint = MDS_RANK_NONE;
9080 }
9081 } else {
9082 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9083 if (*p != whoami && info.checked.count(*p) == 0) {
9084 peer = *p;
9085 break;
9086 }
9087 }
9088 if (peer < 0) {
9089 all.erase(whoami);
9090 if (all != info.checked) {
9091 dout(10) << " waiting for more peers to be active" << dendl;
9092 } else {
9093 dout(10) << " all MDS peers have been checked " << dendl;
9094 do_open_ino(ino, info, 0);
9095 }
9096 } else {
9097 info.checking = peer;
9098 vector<inode_backpointer_t> *pa = NULL;
9099 // got backtrace from peer or backtrace just fetched
9100 if (info.discover || !info.fetch_backtrace)
9101 pa = &info.ancestors;
9102 mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
9103 if (mds->logger)
9104 mds->logger->inc(l_mds_openino_peer_discover);
9105 }
9106 }
9107
9108 void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
9109 {
9110 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9111 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
9112 return;
9113 }
9114
9115 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
9116
9117 auto from = mds_rank_t(m->get_source().num());
9118 inodeno_t ino = m->ino;
9119 ref_t<MMDSOpenInoReply> reply;
9120 CInode *in = get_inode(ino);
9121 if (in) {
9122 dout(10) << " have " << *in << dendl;
9123 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
9124 if (in->is_auth()) {
9125 touch_inode(in);
9126 while (1) {
9127 CDentry *pdn = in->get_parent_dn();
9128 if (!pdn)
9129 break;
9130 CInode *diri = pdn->get_dir()->get_inode();
9131 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
9132 in->inode.version));
9133 in = diri;
9134 }
9135 } else {
9136 reply->hint = in->authority().first;
9137 }
9138 } else if (err < 0) {
9139 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
9140 } else {
9141 mds_rank_t hint = MDS_RANK_NONE;
9142 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
9143 if (ret > 0)
9144 return;
9145 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
9146 }
9147 mds->send_message_mds(reply, from);
9148 }
9149
9150 void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
9151 {
9152 dout(10) << "handle_open_ino_reply " << *m << dendl;
9153
9154 inodeno_t ino = m->ino;
9155 mds_rank_t from = mds_rank_t(m->get_source().num());
9156 auto it = opening_inodes.find(ino);
9157 if (it != opening_inodes.end() && it->second.checking == from) {
9158 open_ino_info_t& info = it->second;
9159 info.checking = MDS_RANK_NONE;
9160 info.checked.insert(from);
9161
9162 CInode *in = get_inode(ino);
9163 if (in) {
9164 dout(10) << " found cached " << *in << dendl;
9165 open_ino_finish(ino, info, in->authority().first);
9166 } else if (!m->ancestors.empty()) {
9167 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9168 if (!info.want_replica) {
9169 open_ino_finish(ino, info, from);
9170 return;
9171 }
9172
9173 info.ancestors = m->ancestors;
9174 info.auth_hint = from;
9175 info.checking = mds->get_nodeid();
9176 info.discover = true;
9177 _open_ino_traverse_dir(ino, info, 0);
9178 } else if (m->error) {
9179 dout(10) << " error " << m->error << " from mds." << from << dendl;
9180 do_open_ino(ino, info, m->error);
9181 } else {
9182 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9183 info.auth_hint = m->hint;
9184 info.checked.erase(m->hint);
9185 }
9186 do_open_ino_peer(ino, info);
9187 }
9188 }
9189 }
9190
9191 void MDCache::kick_open_ino_peers(mds_rank_t who)
9192 {
9193 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9194
9195 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9196 p != opening_inodes.end();
9197 ++p) {
9198 open_ino_info_t& info = p->second;
9199 if (info.checking == who) {
9200 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9201 info.checking = MDS_RANK_NONE;
9202 do_open_ino_peer(p->first, info);
9203 } else if (info.checking == MDS_RANK_NONE) {
9204 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9205 do_open_ino_peer(p->first, info);
9206 }
9207 }
9208 }
9209
9210 void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
9211 bool want_replica, bool want_xlocked)
9212 {
9213 dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
9214 << want_replica << dendl;
9215
9216 auto it = opening_inodes.find(ino);
9217 if (it != opening_inodes.end()) {
9218 open_ino_info_t& info = it->second;
9219 if (want_replica) {
9220 info.want_replica = true;
9221 if (want_xlocked && !info.want_xlocked) {
9222 if (!info.ancestors.empty()) {
9223 CInode *diri = get_inode(info.ancestors[0].dirino);
9224 if (diri) {
9225 frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
9226 CDir *dir = diri->get_dirfrag(fg);
9227 if (dir && !dir->is_auth()) {
9228 filepath path(info.ancestors[0].dname, 0);
9229 discover_path(dir, CEPH_NOSNAP, path, NULL, true);
9230 }
9231 }
9232 }
9233 info.want_xlocked = true;
9234 }
9235 }
9236 info.waiters.push_back(fin);
9237 } else {
9238 open_ino_info_t& info = opening_inodes[ino];
9239 info.want_replica = want_replica;
9240 info.want_xlocked = want_xlocked;
9241 info.tid = ++open_ino_last_tid;
9242 info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
9243 info.waiters.push_back(fin);
9244 if (mds->is_rejoin() &&
9245 open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
9246 info.fetch_backtrace = false;
9247 info.checking = mds->get_nodeid();
9248 _open_ino_traverse_dir(ino, info, 0);
9249 } else {
9250 do_open_ino(ino, info, 0);
9251 }
9252 }
9253 }
9254
9255 /* ---------------------------- */
9256
9257 /*
9258 * search for a given inode on MDS peers. optionally start with the given node.
9259
9260
9261 TODO
9262 - recover from mds node failure, recovery
9263 - traverse path
9264
9265 */
9266 void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
9267 mds_rank_t hint, bool path_locked)
9268 {
9269 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
9270 CInode *in = get_inode(ino);
9271 if (in && in->state_test(CInode::STATE_PURGING)) {
9272 c->complete(-ESTALE);
9273 return;
9274 }
9275 ceph_assert(!in);
9276
9277 ceph_tid_t tid = ++find_ino_peer_last_tid;
9278 find_ino_peer_info_t& fip = find_ino_peer[tid];
9279 fip.ino = ino;
9280 fip.tid = tid;
9281 fip.fin = c;
9282 fip.path_locked = path_locked;
9283 fip.hint = hint;
9284 _do_find_ino_peer(fip);
9285 }
9286
9287 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9288 {
9289 set<mds_rank_t> all, active;
9290 mds->mdsmap->get_mds_set(all);
9291 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9292
9293 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9294 << " active " << active << " all " << all
9295 << " checked " << fip.checked
9296 << dendl;
9297
9298 mds_rank_t m = MDS_RANK_NONE;
9299 if (fip.hint >= 0) {
9300 m = fip.hint;
9301 fip.hint = MDS_RANK_NONE;
9302 } else {
9303 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9304 if (*p != mds->get_nodeid() &&
9305 fip.checked.count(*p) == 0) {
9306 m = *p;
9307 break;
9308 }
9309 }
9310 if (m == MDS_RANK_NONE) {
9311 all.erase(mds->get_nodeid());
9312 if (all != fip.checked) {
9313 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9314 } else {
9315 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9316 fip.fin->complete(-ESTALE);
9317 find_ino_peer.erase(fip.tid);
9318 }
9319 } else {
9320 fip.checking = m;
9321 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
9322 }
9323 }
9324
9325 void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
9326 {
9327 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9328 return;
9329 }
9330
9331 dout(10) << "handle_find_ino " << *m << dendl;
9332 auto r = make_message<MMDSFindInoReply>(m->tid);
9333 CInode *in = get_inode(m->ino);
9334 if (in) {
9335 in->make_path(r->path);
9336 dout(10) << " have " << r->path << " " << *in << dendl;
9337 }
9338 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
9339 }
9340
9341
9342 void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
9343 {
9344 auto p = find_ino_peer.find(m->tid);
9345 if (p != find_ino_peer.end()) {
9346 dout(10) << "handle_find_ino_reply " << *m << dendl;
9347 find_ino_peer_info_t& fip = p->second;
9348
9349 // success?
9350 if (get_inode(fip.ino)) {
9351 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9352 mds->queue_waiter(fip.fin);
9353 find_ino_peer.erase(p);
9354 return;
9355 }
9356
9357 mds_rank_t from = mds_rank_t(m->get_source().num());
9358 if (fip.checking == from)
9359 fip.checking = MDS_RANK_NONE;
9360 fip.checked.insert(from);
9361
9362 if (!m->path.empty()) {
9363 // we got a path!
9364 vector<CDentry*> trace;
9365 CF_MDS_RetryMessageFactory cf(mds, m);
9366 MDRequestRef null_ref;
9367 int flags = MDS_TRAVERSE_DISCOVER;
9368 if (fip.path_locked)
9369 flags |= MDS_TRAVERSE_PATH_LOCKED;
9370 int r = path_traverse(null_ref, cf, m->path, flags, &trace);
9371 if (r > 0)
9372 return;
9373 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9374 << ", retrying" << dendl;
9375 fip.checked.clear();
9376 _do_find_ino_peer(fip);
9377 } else {
9378 // nope, continue.
9379 _do_find_ino_peer(fip);
9380 }
9381 } else {
9382 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9383 }
9384 }
9385
9386 void MDCache::kick_find_ino_peers(mds_rank_t who)
9387 {
9388 // find_ino_peers requests we should move on from
9389 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9390 p != find_ino_peer.end();
9391 ++p) {
9392 find_ino_peer_info_t& fip = p->second;
9393 if (fip.checking == who) {
9394 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9395 fip.checking = MDS_RANK_NONE;
9396 _do_find_ino_peer(fip);
9397 } else if (fip.checking == MDS_RANK_NONE) {
9398 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9399 _do_find_ino_peer(fip);
9400 }
9401 }
9402 }
9403
9404 /* ---------------------------- */
9405
9406 int MDCache::get_num_client_requests()
9407 {
9408 int count = 0;
9409 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9410 p != active_requests.end();
9411 ++p) {
9412 MDRequestRef& mdr = p->second;
9413 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9414 count++;
9415 }
9416 return count;
9417 }
9418
9419 MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
9420 {
9421 // did we win a forward race against a slave?
9422 if (active_requests.count(req->get_reqid())) {
9423 MDRequestRef& mdr = active_requests[req->get_reqid()];
9424 ceph_assert(mdr);
9425 if (mdr->is_slave()) {
9426 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9427 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9428 } else {
9429 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9430 }
9431 return MDRequestRef();
9432 }
9433
9434 // register new client request
9435 MDRequestImpl::Params params;
9436 params.reqid = req->get_reqid();
9437 params.attempt = req->get_num_fwd();
9438 params.client_req = req;
9439 params.initiated = req->get_recv_stamp();
9440 params.throttled = req->get_throttle_stamp();
9441 params.all_read = req->get_recv_complete_stamp();
9442 params.dispatched = req->get_dispatch_stamp();
9443
9444 MDRequestRef mdr =
9445 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9446 active_requests[params.reqid] = mdr;
9447 mdr->set_op_stamp(req->get_stamp());
9448 dout(7) << "request_start " << *mdr << dendl;
9449 return mdr;
9450 }
9451
9452 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
9453 {
9454 int by = m->get_source().num();
9455 MDRequestImpl::Params params;
9456 params.reqid = ri;
9457 params.attempt = attempt;
9458 params.triggering_slave_req = m;
9459 params.slave_to = by;
9460 params.initiated = m->get_recv_stamp();
9461 params.throttled = m->get_throttle_stamp();
9462 params.all_read = m->get_recv_complete_stamp();
9463 params.dispatched = m->get_dispatch_stamp();
9464 MDRequestRef mdr =
9465 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9466 ceph_assert(active_requests.count(mdr->reqid) == 0);
9467 active_requests[mdr->reqid] = mdr;
9468 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9469 return mdr;
9470 }
9471
9472 MDRequestRef MDCache::request_start_internal(int op)
9473 {
9474 utime_t now = ceph_clock_now();
9475 MDRequestImpl::Params params;
9476 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9477 params.reqid.tid = mds->issue_tid();
9478 params.initiated = now;
9479 params.throttled = now;
9480 params.all_read = now;
9481 params.dispatched = now;
9482 params.internal_op = op;
9483 MDRequestRef mdr =
9484 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9485
9486 ceph_assert(active_requests.count(mdr->reqid) == 0);
9487 active_requests[mdr->reqid] = mdr;
9488 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9489 return mdr;
9490 }
9491
9492 MDRequestRef MDCache::request_get(metareqid_t rid)
9493 {
9494 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9495 ceph_assert(p != active_requests.end());
9496 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9497 return p->second;
9498 }
9499
9500 void MDCache::request_finish(MDRequestRef& mdr)
9501 {
9502 dout(7) << "request_finish " << *mdr << dendl;
9503 mdr->mark_event("finishing request");
9504
9505 // slave finisher?
9506 if (mdr->has_more() && mdr->more()->slave_commit) {
9507 Context *fin = mdr->more()->slave_commit;
9508 mdr->more()->slave_commit = 0;
9509 int ret;
9510 if (mdr->aborted) {
9511 mdr->aborted = false;
9512 ret = -1;
9513 mdr->more()->slave_rolling_back = true;
9514 } else {
9515 ret = 0;
9516 mdr->committing = true;
9517 }
9518 fin->complete(ret); // this must re-call request_finish.
9519 return;
9520 }
9521
9522 switch(mdr->internal_op) {
9523 case CEPH_MDS_OP_FRAGMENTDIR:
9524 logger->inc(l_mdss_ireq_fragmentdir);
9525 break;
9526 case CEPH_MDS_OP_EXPORTDIR:
9527 logger->inc(l_mdss_ireq_exportdir);
9528 break;
9529 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9530 logger->inc(l_mdss_ireq_enqueue_scrub);
9531 break;
9532 case CEPH_MDS_OP_FLUSH:
9533 logger->inc(l_mdss_ireq_flush);
9534 break;
9535 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9536 logger->inc(l_mdss_ireq_fragstats);
9537 break;
9538 case CEPH_MDS_OP_REPAIR_INODESTATS:
9539 logger->inc(l_mdss_ireq_inodestats);
9540 break;
9541 }
9542
9543 request_cleanup(mdr);
9544 }
9545
9546
9547 void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
9548 {
9549 mdr->mark_event("forwarding request");
9550 if (mdr->client_request && mdr->client_request->get_source().is_client()) {
9551 dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
9552 << *mdr->client_request << dendl;
9553 if (mdr->is_batch_head) {
9554 int mask = mdr->client_request->head.args.getattr.mask;
9555
9556 switch (mdr->client_request->get_op()) {
9557 case CEPH_MDS_OP_GETATTR:
9558 {
9559 CInode* in = mdr->in[0];
9560 if (in) {
9561 auto it = in->batch_ops.find(mask);
9562 if (it != in->batch_ops.end()) {
9563 it->second->forward(who);
9564 in->batch_ops.erase(it);
9565 }
9566 }
9567 break;
9568 }
9569 case CEPH_MDS_OP_LOOKUP:
9570 {
9571 if (mdr->dn[0].size()) {
9572 CDentry* dn = mdr->dn[0].back();
9573 auto it = dn->batch_ops.find(mask);
9574 if (it != dn->batch_ops.end()) {
9575 it->second->forward(who);
9576 dn->batch_ops.erase(it);
9577 }
9578 }
9579 break;
9580 }
9581 default:
9582 ceph_abort();
9583 }
9584 } else {
9585 mds->forward_message_mds(mdr->release_client_request(), who);
9586 }
9587 if (mds->logger) mds->logger->inc(l_mds_forward);
9588 } else if (mdr->internal_op >= 0) {
9589 dout(10) << "request_forward on internal op; cancelling" << dendl;
9590 mdr->internal_op_finish->complete(-EXDEV);
9591 } else {
9592 dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
9593 << " was from mds" << dendl;
9594 }
9595 request_cleanup(mdr);
9596 }
9597
9598
9599 void MDCache::dispatch_request(MDRequestRef& mdr)
9600 {
9601 if (mdr->client_request) {
9602 mds->server->dispatch_client_request(mdr);
9603 } else if (mdr->slave_request) {
9604 mds->server->dispatch_slave_request(mdr);
9605 } else {
9606 switch (mdr->internal_op) {
9607 case CEPH_MDS_OP_FRAGMENTDIR:
9608 dispatch_fragment_dir(mdr);
9609 break;
9610 case CEPH_MDS_OP_EXPORTDIR:
9611 migrator->dispatch_export_dir(mdr, 0);
9612 break;
9613 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9614 enqueue_scrub_work(mdr);
9615 break;
9616 case CEPH_MDS_OP_FLUSH:
9617 flush_dentry_work(mdr);
9618 break;
9619 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9620 repair_dirfrag_stats_work(mdr);
9621 break;
9622 case CEPH_MDS_OP_REPAIR_INODESTATS:
9623 repair_inode_stats_work(mdr);
9624 break;
9625 case CEPH_MDS_OP_UPGRADE_SNAPREALM:
9626 upgrade_inode_snaprealm_work(mdr);
9627 break;
9628 default:
9629 ceph_abort();
9630 }
9631 }
9632 }
9633
9634
9635 void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
9636 {
9637 if (!mdr->has_more())
9638 return;
9639
9640 // clean up slaves
9641 // (will implicitly drop remote dn pins)
9642 for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
9643 p != mdr->more()->slaves.end();
9644 ++p) {
9645 auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt,
9646 MMDSSlaveRequest::OP_FINISH);
9647
9648 if (mdr->killed && !mdr->committing) {
9649 r->mark_abort();
9650 } else if (mdr->more()->srcdn_auth_mds == *p &&
9651 mdr->more()->inode_import.length() > 0) {
9652 // information about rename imported caps
9653 r->inode_export.claim(mdr->more()->inode_import);
9654 }
9655
9656 mds->send_message_mds(r, *p);
9657 }
9658
9659 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9660 * implicitly. Note that we don't call the finishers -- there shouldn't
9661 * be any on a remote lock and the request finish wakes up all
9662 * the waiters anyway! */
9663
9664 for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
9665 SimpleLock *lock = it->lock;
9666 if (it->is_xlock() && !lock->get_parent()->is_auth()) {
9667 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9668 << " on " << lock->get_parent() << dendl;
9669 lock->put_xlock();
9670 mdr->locks.erase(it++);
9671 } else if (it->is_remote_wrlock()) {
9672 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9673 << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
9674 if (it->is_wrlock()) {
9675 it->clear_remote_wrlock();
9676 ++it;
9677 } else {
9678 mdr->locks.erase(it++);
9679 }
9680 } else {
9681 ++it;
9682 }
9683 }
9684
9685 mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
9686 * leaving them in can cause double-notifies as
9687 * this function can get called more than once */
9688 }
9689
9690 void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
9691 {
9692 request_drop_foreign_locks(mdr);
9693 mds->locker->drop_non_rdlocks(mdr.get());
9694 }
9695
9696 void MDCache::request_drop_locks(MDRequestRef& mdr)
9697 {
9698 request_drop_foreign_locks(mdr);
9699 mds->locker->drop_locks(mdr.get());
9700 }
9701
9702 void MDCache::request_cleanup(MDRequestRef& mdr)
9703 {
9704 dout(15) << "request_cleanup " << *mdr << dendl;
9705
9706 if (mdr->has_more()) {
9707 if (mdr->more()->is_ambiguous_auth)
9708 mdr->clear_ambiguous_auth();
9709 if (!mdr->more()->waiting_for_finish.empty())
9710 mds->queue_waiters(mdr->more()->waiting_for_finish);
9711 }
9712
9713 request_drop_locks(mdr);
9714
9715 // drop (local) auth pins
9716 mdr->drop_local_auth_pins();
9717
9718 // drop stickydirs
9719 mdr->put_stickydirs();
9720
9721 mds->locker->kick_cap_releases(mdr);
9722
9723 // drop cache pins
9724 mdr->drop_pins();
9725
9726 // remove from session
9727 mdr->item_session_request.remove_myself();
9728
9729 // remove from map
9730 active_requests.erase(mdr->reqid);
9731
9732 if (mds->logger)
9733 log_stat();
9734
9735 mdr->mark_event("cleaned up request");
9736 }
9737
9738 void MDCache::request_kill(MDRequestRef& mdr)
9739 {
9740 // rollback slave requests is tricky. just let the request proceed.
9741 if (mdr->has_more() &&
9742 (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
9743 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9744 ceph_assert(mdr->more()->witnessed.empty());
9745 mdr->aborted = true;
9746 dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
9747 } else {
9748 dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
9749 }
9750
9751 ceph_assert(mdr->used_prealloc_ino == 0);
9752 ceph_assert(mdr->prealloc_inos.empty());
9753
9754 mdr->session = NULL;
9755 mdr->item_session_request.remove_myself();
9756 return;
9757 }
9758
9759 mdr->killed = true;
9760 mdr->mark_event("killing request");
9761
9762 if (mdr->committing) {
9763 dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
9764 } else {
9765 dout(10) << "request_kill " << *mdr << dendl;
9766 request_cleanup(mdr);
9767 }
9768 }
9769
9770 // -------------------------------------------------------------------------------
9771 // SNAPREALMS
9772
9773 void MDCache::create_global_snaprealm()
9774 {
9775 CInode *in = new CInode(this); // dummy inode
9776 create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
9777 add_inode(in);
9778 global_snaprealm = in->snaprealm;
9779 }
9780
9781 void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
9782 {
9783 dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;
9784
9785 vector<inodeno_t> split_inos;
9786 vector<inodeno_t> split_realms;
9787
9788 if (notify_clients) {
9789 ceph_assert(in->snaprealm->have_past_parents_open());
9790 if (snapop == CEPH_SNAP_OP_SPLIT) {
9791 // notify clients of update|split
9792 for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
9793 !p.end(); ++p)
9794 split_inos.push_back((*p)->ino());
9795
9796 for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
9797 p != in->snaprealm->open_children.end();
9798 ++p)
9799 split_realms.push_back((*p)->inode->ino());
9800 }
9801 }
9802
9803 set<SnapRealm*> past_children;
9804 map<client_t, ref_t<MClientSnap>> updates;
9805 list<SnapRealm*> q;
9806 q.push_back(in->snaprealm);
9807 while (!q.empty()) {
9808 SnapRealm *realm = q.front();
9809 q.pop_front();
9810
9811 dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
9812 realm->invalidate_cached_snaps();
9813
9814 if (notify_clients) {
9815 for (const auto& p : realm->client_caps) {
9816 const auto& client = p.first;
9817 const auto& caps = p.second;
9818 ceph_assert(!caps->empty());
9819
9820 auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
9821 if (em.second) {
9822 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
9823 update->head.split = in->ino();
9824 update->split_inos = split_inos;
9825 update->split_realms = split_realms;
9826 update->bl = in->snaprealm->get_snap_trace();
9827 em.first->second = std::move(update);
9828 }
9829 }
9830 }
9831
9832 if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
9833 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9834 p != realm->open_past_children.end();
9835 ++p)
9836 past_children.insert(*p);
9837 }
9838
9839 // notify for active children, too.
9840 dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
9841 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9842 p != realm->open_children.end();
9843 ++p)
9844 q.push_back(*p);
9845 }
9846
9847 if (notify_clients)
9848 send_snaps(updates);
9849
9850 // notify past children and their descendants if we update/delete old snapshots
9851 for (set<SnapRealm*>::iterator p = past_children.begin();
9852 p != past_children.end();
9853 ++p)
9854 q.push_back(*p);
9855
9856 while (!q.empty()) {
9857 SnapRealm *realm = q.front();
9858 q.pop_front();
9859
9860 realm->invalidate_cached_snaps();
9861
9862 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
9863 p != realm->open_children.end();
9864 ++p) {
9865 if (past_children.count(*p) == 0)
9866 q.push_back(*p);
9867 }
9868
9869 for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
9870 p != realm->open_past_children.end();
9871 ++p) {
9872 if (past_children.count(*p) == 0) {
9873 q.push_back(*p);
9874 past_children.insert(*p);
9875 }
9876 }
9877 }
9878
9879 if (snapop == CEPH_SNAP_OP_DESTROY) {
9880 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9881 for (set<SnapRealm*>::iterator p = past_children.begin();
9882 p != past_children.end();
9883 ++p)
9884 maybe_eval_stray((*p)->inode, true);
9885 }
9886 }
9887
9888 void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
9889 {
9890 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9891 ceph_assert(in->is_auth());
9892
9893 set<mds_rank_t> mds_set;
9894 if (stid > 0) {
9895 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9896 mds_set.erase(mds->get_nodeid());
9897 } else {
9898 in->list_replicas(mds_set);
9899 }
9900
9901 if (!mds_set.empty()) {
9902 bufferlist snap_blob;
9903 in->encode_snap(snap_blob);
9904
9905 for (auto p : mds_set) {
9906 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
9907 m->snap_blob = snap_blob;
9908 mds->send_message_mds(m, p);
9909 }
9910 }
9911
9912 if (stid > 0)
9913 notify_global_snaprealm_update(snap_op);
9914 }
9915
9916 void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
9917 {
9918 mds_rank_t from = mds_rank_t(m->get_source().num());
9919 dout(10) << __func__ << " " << *m << " from mds." << from << dendl;
9920
9921 if (mds->get_state() < MDSMap::STATE_RESOLVE &&
9922 mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
9923 return;
9924 }
9925
9926 // null rejoin_done means open_snaprealms() has already been called
9927 bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
9928 (mds->is_rejoin() && !rejoin_done);
9929
9930 if (m->get_tid() > 0) {
9931 mds->snapclient->notify_commit(m->get_tid());
9932 if (notify_clients)
9933 notify_global_snaprealm_update(m->get_snap_op());
9934 }
9935
9936 CInode *in = get_inode(m->get_ino());
9937 if (in) {
9938 ceph_assert(!in->is_auth());
9939 if (mds->get_state() > MDSMap::STATE_REJOIN ||
9940 (mds->is_rejoin() && !in->is_rejoining())) {
9941 auto p = m->snap_blob.cbegin();
9942 in->decode_snap(p);
9943
9944 if (!notify_clients) {
9945 if (!rejoin_pending_snaprealms.count(in)) {
9946 in->get(CInode::PIN_OPENINGSNAPPARENTS);
9947 rejoin_pending_snaprealms.insert(in);
9948 }
9949 }
9950 do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
9951 }
9952 }
9953 }
9954
9955 void MDCache::notify_global_snaprealm_update(int snap_op)
9956 {
9957 if (snap_op != CEPH_SNAP_OP_DESTROY)
9958 snap_op = CEPH_SNAP_OP_UPDATE;
9959 set<Session*> sessions;
9960 mds->sessionmap.get_client_session_set(sessions);
9961 for (auto &session : sessions) {
9962 if (!session->is_open() && !session->is_stale())
9963 continue;
9964 auto update = make_message<MClientSnap>(snap_op);
9965 update->head.split = global_snaprealm->inode->ino();
9966 update->bl = global_snaprealm->get_snap_trace();
9967 mds->send_message_client_counted(update, session);
9968 }
9969 }
9970
9971 // -------------------------------------------------------------------------------
9972 // STRAYS
9973
9974 struct C_MDC_RetryScanStray : public MDCacheContext {
9975 dirfrag_t next;
9976 C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
9977 void finish(int r) override {
9978 mdcache->scan_stray_dir(next);
9979 }
9980 };
9981
9982 void MDCache::scan_stray_dir(dirfrag_t next)
9983 {
9984 dout(10) << "scan_stray_dir " << next << dendl;
9985
9986 std::vector<CDir*> ls;
9987 for (int i = 0; i < NUM_STRAY; ++i) {
9988 if (strays[i]->ino() < next.ino)
9989 continue;
9990 strays[i]->get_dirfrags(ls);
9991 }
9992
9993 for (const auto& dir : ls) {
9994 if (dir->dirfrag() < next)
9995 continue;
9996 if (!dir->is_complete()) {
9997 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9998 return;
9999 }
10000 for (auto &p : dir->items) {
10001 CDentry *dn = p.second;
10002 dn->state_set(CDentry::STATE_STRAY);
10003 CDentry::linkage_t *dnl = dn->get_projected_linkage();
10004 if (dnl->is_primary()) {
10005 CInode *in = dnl->get_inode();
10006 if (in->inode.nlink == 0)
10007 in->state_set(CInode::STATE_ORPHAN);
10008 maybe_eval_stray(in);
10009 }
10010 }
10011 }
10012 }
10013
10014 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
10015 {
10016 object_t oid = CInode::get_object_name(ino, frag_t(), "");
10017 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
10018 if (mds->logger)
10019 mds->logger->inc(l_mds_openino_backtrace_fetch);
10020 }
10021
10022
10023
10024
10025
10026 // ========================================================================================
10027 // DISCOVER
10028 /*
10029
10030 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10031 to the parent metadata object in the cache (pinning it).
10032
10033 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10034
10035 */
10036
10037 void MDCache::_send_discover(discover_info_t& d)
10038 {
10039 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
10040 d.want_base_dir, d.path_locked);
10041 dis->set_tid(d.tid);
10042 mds->send_message_mds(dis, d.mds);
10043 }
10044
10045 void MDCache::discover_base_ino(inodeno_t want_ino,
10046 MDSContext *onfinish,
10047 mds_rank_t from)
10048 {
10049 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
10050 if (waiting_for_base_ino[from].count(want_ino) == 0) {
10051 discover_info_t& d = _create_discover(from);
10052 d.ino = want_ino;
10053 _send_discover(d);
10054 }
10055 waiting_for_base_ino[from][want_ino].push_back(onfinish);
10056 }
10057
10058
10059 void MDCache::discover_dir_frag(CInode *base,
10060 frag_t approx_fg,
10061 MDSContext *onfinish,
10062 mds_rank_t from)
10063 {
10064 if (from < 0)
10065 from = base->authority().first;
10066
10067 dirfrag_t df(base->ino(), approx_fg);
10068 dout(7) << "discover_dir_frag " << df
10069 << " from mds." << from << dendl;
10070
10071 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
10072 discover_info_t& d = _create_discover(from);
10073 d.pin_base(base);
10074 d.ino = base->ino();
10075 d.frag = approx_fg;
10076 d.want_base_dir = true;
10077 _send_discover(d);
10078 }
10079
10080 if (onfinish)
10081 base->add_dir_waiter(approx_fg, onfinish);
10082 }
10083
10084 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10085 CInode *base;
10086 snapid_t snapid;
10087 filepath path;
10088 mds_rank_t from;
10089 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10090 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10091 void finish(int r) override {
10092 mdcache->discover_path(base, snapid, path, 0, from);
10093 }
10094 };
10095
10096 void MDCache::discover_path(CInode *base,
10097 snapid_t snap,
10098 filepath want_path,
10099 MDSContext *onfinish,
10100 bool path_locked,
10101 mds_rank_t from)
10102 {
10103 if (from < 0)
10104 from = base->authority().first;
10105
10106 dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
10107 << (path_locked ? " path_locked":"")
10108 << dendl;
10109
10110 if (base->is_ambiguous_auth()) {
10111 dout(10) << " waiting for single auth on " << *base << dendl;
10112 if (!onfinish)
10113 onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
10114 base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
10115 return;
10116 } else if (from == mds->get_nodeid()) {
10117 MDSContext::vec finished;
10118 base->take_waiting(CInode::WAIT_DIR, finished);
10119 mds->queue_waiters(finished);
10120 return;
10121 }
10122
10123 frag_t fg = base->pick_dirfrag(want_path[0]);
10124 if ((path_locked && want_path.depth() == 1) ||
10125 !base->is_waiting_for_dir(fg) || !onfinish) {
10126 discover_info_t& d = _create_discover(from);
10127 d.ino = base->ino();
10128 d.pin_base(base);
10129 d.frag = fg;
10130 d.snap = snap;
10131 d.want_path = want_path;
10132 d.want_base_dir = true;
10133 d.path_locked = path_locked;
10134 _send_discover(d);
10135 }
10136
10137 // register + wait
10138 if (onfinish)
10139 base->add_dir_waiter(fg, onfinish);
10140 }
10141
10142 struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
10143 CDir *base;
10144 snapid_t snapid;
10145 filepath path;
10146 C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
10147 MDCacheContext(c), base(b), snapid(s), path(p) {}
10148 void finish(int r) override {
10149 mdcache->discover_path(base, snapid, path, 0);
10150 }
10151 };
10152
10153 void MDCache::discover_path(CDir *base,
10154 snapid_t snap,
10155 filepath want_path,
10156 MDSContext *onfinish,
10157 bool path_locked)
10158 {
10159 mds_rank_t from = base->authority().first;
10160
10161 dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
10162 << (path_locked ? " path_locked":"")
10163 << dendl;
10164
10165 if (base->is_ambiguous_auth()) {
10166 dout(7) << " waiting for single auth on " << *base << dendl;
10167 if (!onfinish)
10168 onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
10169 base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
10170 return;
10171 } else if (from == mds->get_nodeid()) {
10172 MDSContext::vec finished;
10173 base->take_sub_waiting(finished);
10174 mds->queue_waiters(finished);
10175 return;
10176 }
10177
10178 if ((path_locked && want_path.depth() == 1) ||
10179 !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
10180 discover_info_t& d = _create_discover(from);
10181 d.ino = base->ino();
10182 d.pin_base(base->inode);
10183 d.frag = base->get_frag();
10184 d.snap = snap;
10185 d.want_path = want_path;
10186 d.want_base_dir = false;
10187 d.path_locked = path_locked;
10188 _send_discover(d);
10189 }
10190
10191 // register + wait
10192 if (onfinish)
10193 base->add_dentry_waiter(want_path[0], snap, onfinish);
10194 }
10195
10196 void MDCache::kick_discovers(mds_rank_t who)
10197 {
10198 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10199 p != discovers.end();
10200 ++p) {
10201 if (p->second.mds != who)
10202 continue;
10203 _send_discover(p->second);
10204 }
10205 }
10206
10207
10208 void MDCache::handle_discover(const cref_t<MDiscover> &dis)
10209 {
10210 mds_rank_t whoami = mds->get_nodeid();
10211 mds_rank_t from = mds_rank_t(dis->get_source().num());
10212
10213 ceph_assert(from != whoami);
10214
10215 if (mds->get_state() <= MDSMap::STATE_REJOIN) {
10216 if (mds->get_state() < MDSMap::STATE_REJOIN &&
10217 mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
10218 return;
10219 }
10220
10221 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10222 // delay processing request from survivor because we may not yet choose lock states.
10223 if (!mds->mdsmap->is_rejoin(from)) {
10224 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
10225 mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
10226 return;
10227 }
10228 }
10229
10230
10231 CInode *cur = 0;
10232 auto reply = make_message<MDiscoverReply>(*dis);
10233
10234 snapid_t snapid = dis->get_snapid();
10235
10236 // get started.
10237 if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
10238 !dis->wants_base_dir() && dis->get_want().depth() == 0) {
10239 // wants root
10240 dout(7) << "handle_discover from mds." << from
10241 << " wants base + " << dis->get_want().get_path()
10242 << " snap " << snapid
10243 << dendl;
10244
10245 cur = get_inode(dis->get_base_ino());
10246 ceph_assert(cur);
10247
10248 // add root
10249 reply->starts_with = MDiscoverReply::INODE;
10250 encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
10251 dout(10) << "added base " << *cur << dendl;
10252 }
10253 else {
10254 // there's a base inode
10255 cur = get_inode(dis->get_base_ino(), snapid);
10256 if (!cur && snapid != CEPH_NOSNAP) {
10257 cur = get_inode(dis->get_base_ino());
10258 if (cur && !cur->is_multiversion())
10259 cur = NULL; // nope!
10260 }
10261
10262 if (!cur) {
10263 dout(7) << "handle_discover mds." << from
10264 << " don't have base ino " << dis->get_base_ino() << "." << snapid
10265 << dendl;
10266 if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
10267 reply->set_error_dentry(dis->get_dentry(0));
10268 reply->set_flag_error_dir();
10269 } else if (dis->wants_base_dir()) {
10270 dout(7) << "handle_discover mds." << from
10271 << " wants basedir+" << dis->get_want().get_path()
10272 << " has " << *cur
10273 << dendl;
10274 } else {
10275 dout(7) << "handle_discover mds." << from
10276 << " wants " << dis->get_want().get_path()
10277 << " has " << *cur
10278 << dendl;
10279 }
10280 }
10281
10282 ceph_assert(reply);
10283
10284 // add content
10285 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10286 for (unsigned i = 0;
10287 cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
10288 i++) {
10289
10290 // -- figure out the dir
10291
10292 // is *cur even a dir at all?
10293 if (!cur->is_dir()) {
10294 dout(7) << *cur << " not a dir" << dendl;
10295 reply->set_flag_error_dir();
10296 break;
10297 }
10298
10299 // pick frag
10300 frag_t fg;
10301 if (dis->get_want().depth()) {
10302 // dentry specifies
10303 fg = cur->pick_dirfrag(dis->get_dentry(i));
10304 } else {
10305 // requester explicity specified the frag
10306 ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
10307 fg = dis->get_base_dir_frag();
10308 if (!cur->dirfragtree.is_leaf(fg))
10309 fg = cur->dirfragtree[fg.value()];
10310 }
10311 CDir *curdir = cur->get_dirfrag(fg);
10312
10313 if ((!curdir && !cur->is_auth()) ||
10314 (curdir && !curdir->is_auth())) {
10315
10316 /* before:
10317 * ONLY set flag if empty!!
10318 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10319 * resulting in duplicate discovers in flight,
10320 * which can wreak havoc when discovering rename srcdn (which may move)
10321 */
10322
10323 if (reply->is_empty()) {
10324 // only hint if empty.
10325 // someday this could be better, but right now the waiter logic isn't smart enough.
10326
10327 // hint
10328 if (curdir) {
10329 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
10330 reply->set_dir_auth_hint(curdir->authority().first);
10331 } else {
10332 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10333 << *cur << dendl;
10334 reply->set_dir_auth_hint(cur->authority().first);
10335 }
10336
10337 // note error dentry, if any
10338 // NOTE: important, as it allows requester to issue an equivalent discover
10339 // to whomever we hint at.
10340 if (dis->get_want().depth() > i)
10341 reply->set_error_dentry(dis->get_dentry(i));
10342 }
10343
10344 break;
10345 }
10346
10347 if (!curdir) { // open dir?
10348 if (cur->is_frozen()) {
10349 if (!reply->is_empty()) {
10350 dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
10351 break;
10352 }
10353 dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
10354 cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10355 return;
10356 }
10357 curdir = cur->get_or_open_dirfrag(this, fg);
10358 } else if (curdir->is_frozen_tree() ||
10359 (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
10360 if (!reply->is_empty()) {
10361 dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
10362 break;
10363 }
10364 if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
10365 dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
10366 reply->set_flag_error_dir();
10367 break;
10368 }
10369 dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
10370 curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10371 return;
10372 }
10373
10374 // add dir
10375 if (curdir->get_version() == 0) {
10376 // fetch newly opened dir
10377 } else if (reply->is_empty() && !dis->wants_base_dir()) {
10378 dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
10379 // make sure the base frag is correct, though, in there was a refragment since the
10380 // original request was sent.
10381 reply->set_base_dir_frag(curdir->get_frag());
10382 } else {
10383 ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
10384 if (!reply->trace.length())
10385 reply->starts_with = MDiscoverReply::DIR;
10386 encode_replica_dir(curdir, from, reply->trace);
10387 dout(7) << "handle_discover added dir " << *curdir << dendl;
10388 }
10389
10390 // lookup
10391 CDentry *dn = 0;
10392 if (curdir->get_version() == 0) {
10393 // fetch newly opened dir
10394 ceph_assert(!curdir->has_bloom());
10395 } else if (dis->get_want().depth() > 0) {
10396 // lookup dentry
10397 dn = curdir->lookup(dis->get_dentry(i), snapid);
10398 } else
10399 break; // done!
10400
10401 // incomplete dir?
10402 if (!dn) {
10403 if (!curdir->is_complete() &&
10404 !(snapid == CEPH_NOSNAP &&
10405 curdir->has_bloom() &&
10406 !curdir->is_in_bloom(dis->get_dentry(i)))) {
10407 // readdir
10408 dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
10409 if (reply->is_empty()) {
10410 // fetch and wait
10411 curdir->fetch(new C_MDS_RetryMessage(mds, dis),
10412 dis->wants_base_dir() && curdir->get_version() == 0);
10413 return;
10414 } else {
10415 // initiate fetch, but send what we have so far
10416 curdir->fetch(0);
10417 break;
10418 }
10419 }
10420
10421 if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
10422 dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
10423 << " dne, non-empty reply, stopping" << dendl;
10424 break;
10425 }
10426
10427 // send null dentry
10428 dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
10429 << *curdir << dendl;
10430 if (snapid == CEPH_NOSNAP)
10431 dn = curdir->add_null_dentry(dis->get_dentry(i));
10432 else
10433 dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
10434 }
10435 ceph_assert(dn);
10436
10437 // don't add replica to purging dentry/inode
10438 if (dn->state_test(CDentry::STATE_PURGING)) {
10439 if (reply->is_empty())
10440 reply->set_flag_error_dn(dis->get_dentry(i));
10441 break;
10442 }
10443
10444 CDentry::linkage_t *dnl = dn->get_linkage();
10445
10446 // xlocked dentry?
10447 // ...always block on non-tail items (they are unrelated)
10448 // ...allow xlocked tail disocvery _only_ if explicitly requested
10449 if (dn->lock.is_xlocked()) {
10450 // is this the last (tail) item in the discover traversal?
10451 if (dis->is_path_locked()) {
10452 dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
10453 } else if (reply->is_empty()) {
10454 dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
10455 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
10456 return;
10457 } else {
10458 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
10459 break;
10460 }
10461 }
10462
10463 // frozen inode?
10464 bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
10465 if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10466 if (tailitem && dis->is_path_locked()) {
10467 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
10468 } else if (reply->is_empty()) {
10469 dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
10470 dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
10471 return;
10472 } else {
10473 dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
10474 break;
10475 }
10476 }
10477
10478 // add dentry
10479 if (!reply->trace.length())
10480 reply->starts_with = MDiscoverReply::DENTRY;
10481 encode_replica_dentry(dn, from, reply->trace);
10482 dout(7) << "handle_discover added dentry " << *dn << dendl;
10483
10484 if (!dnl->is_primary()) break; // stop on null or remote link.
10485
10486 // add inode
10487 CInode *next = dnl->get_inode();
10488 ceph_assert(next->is_auth());
10489
10490 encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
10491 dout(7) << "handle_discover added inode " << *next << dendl;
10492
10493 // descend, keep going.
10494 cur = next;
10495 continue;
10496 }
10497
10498 // how did we do?
10499 ceph_assert(!reply->is_empty());
10500 dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
10501 mds->send_message(reply, dis->get_connection());
10502 }
10503
10504 void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
10505 {
10506 /*
10507 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10508 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10509 return;
10510 }
10511 */
10512 dout(7) << "discover_reply " << *m << dendl;
10513 if (m->is_flag_error_dir())
10514 dout(7) << " flag error, dir" << dendl;
10515 if (m->is_flag_error_dn())
10516 dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;
10517
10518 MDSContext::vec finished, error;
10519 mds_rank_t from = mds_rank_t(m->get_source().num());
10520
10521 // starting point
10522 CInode *cur = get_inode(m->get_base_ino());
10523 auto p = m->trace.cbegin();
10524
10525 int next = m->starts_with;
10526
10527 // decrement discover counters
10528 if (m->get_tid()) {
10529 map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
10530 if (p != discovers.end()) {
10531 dout(10) << " found tid " << m->get_tid() << dendl;
10532 discovers.erase(p);
10533 } else {
10534 dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
10535 }
10536 }
10537
10538 // discover may start with an inode
10539 if (!p.end() && next == MDiscoverReply::INODE) {
10540 decode_replica_inode(cur, p, NULL, finished);
10541 dout(7) << "discover_reply got base inode " << *cur << dendl;
10542 ceph_assert(cur->is_base());
10543
10544 next = MDiscoverReply::DIR;
10545
10546 // take waiters?
10547 if (cur->is_base() &&
10548 waiting_for_base_ino[from].count(cur->ino())) {
10549 finished.swap(waiting_for_base_ino[from][cur->ino()]);
10550 waiting_for_base_ino[from].erase(cur->ino());
10551 }
10552 }
10553 ceph_assert(cur);
10554
10555 // loop over discover results.
10556 // indexes follow each ([[dir] dentry] inode)
10557 // can start, end with any type.
10558 while (!p.end()) {
10559 // dir
10560 frag_t fg;
10561 CDir *curdir = nullptr;
10562 if (next == MDiscoverReply::DIR) {
10563 decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
10564 if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
10565 ceph_assert(m->get_wanted_base_dir());
10566 cur->take_dir_waiting(m->get_base_dir_frag(), finished);
10567 }
10568 } else {
10569 // note: this can only happen our first way around this loop.
10570 if (p.end() && m->is_flag_error_dn()) {
10571 fg = cur->pick_dirfrag(m->get_error_dentry());
10572 curdir = cur->get_dirfrag(fg);
10573 } else
10574 curdir = cur->get_dirfrag(m->get_base_dir_frag());
10575 }
10576
10577 if (p.end())
10578 break;
10579
10580 // dentry
10581 CDentry *dn = nullptr;
10582 decode_replica_dentry(dn, p, curdir, finished);
10583
10584 if (p.end())
10585 break;
10586
10587 // inode
10588 decode_replica_inode(cur, p, dn, finished);
10589
10590 next = MDiscoverReply::DIR;
10591 }
10592
10593 // dir error?
10594 // or dir_auth hint?
10595 if (m->is_flag_error_dir() && !cur->is_dir()) {
10596 // not a dir.
10597 cur->take_waiting(CInode::WAIT_DIR, error);
10598 } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
10599 mds_rank_t who = m->get_dir_auth_hint();
10600 if (who == mds->get_nodeid()) who = -1;
10601 if (who >= 0)
10602 dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
10603
10604
10605 if (m->get_wanted_base_dir()) {
10606 frag_t fg = m->get_base_dir_frag();
10607 CDir *dir = cur->get_dirfrag(fg);
10608
10609 if (cur->is_waiting_for_dir(fg)) {
10610 if (cur->is_auth())
10611 cur->take_waiting(CInode::WAIT_DIR, finished);
10612 else if (dir || !cur->dirfragtree.is_leaf(fg))
10613 cur->take_dir_waiting(fg, finished);
10614 else
10615 discover_dir_frag(cur, fg, 0, who);
10616 } else
10617 dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
10618 }
10619
10620 // try again?
10621 if (m->get_error_dentry().length()) {
10622 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10623 CDir *dir = cur->get_dirfrag(fg);
10624 // wanted a dentry
10625 if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
10626 if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
10627 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10628 m->get_wanted_snapid(), finished);
10629 } else {
10630 filepath relpath(m->get_error_dentry(), 0);
10631 discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
10632 }
10633 } else
10634 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10635 << m->get_error_dentry() << dendl;
10636 }
10637 } else if (m->is_flag_error_dn()) {
10638 frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
10639 CDir *dir = cur->get_dirfrag(fg);
10640 if (dir) {
10641 if (dir->is_auth()) {
10642 dir->take_sub_waiting(finished);
10643 } else {
10644 dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
10645 m->get_wanted_snapid(), error);
10646 }
10647 }
10648 }
10649
10650 // waiters
10651 finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
10652 mds->queue_waiters(finished);
10653 }
10654
10655
10656
10657 // ----------------------------
10658 // REPLICAS
10659
10660
10661 void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
10662 {
10663 ENCODE_START(1, 1, bl);
10664 dirfrag_t df = dir->dirfrag();
10665 encode(df, bl);
10666 __u32 nonce = dir->add_replica(to);
10667 encode(nonce, bl);
10668 dir->_encode_base(bl);
10669 ENCODE_FINISH(bl);
10670 }
10671
10672 void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
10673 {
10674 ENCODE_START(1, 1, bl);
10675 encode(dn->get_name(), bl);
10676 encode(dn->last, bl);
10677
10678 __u32 nonce = dn->add_replica(to);
10679 encode(nonce, bl);
10680 encode(dn->first, bl);
10681 encode(dn->linkage.remote_ino, bl);
10682 encode(dn->linkage.remote_d_type, bl);
10683 dn->lock.encode_state_for_replica(bl);
10684 bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
10685 encode(need_recover, bl);
10686 ENCODE_FINISH(bl);
10687 }
10688
10689 void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
10690 uint64_t features)
10691 {
10692 ENCODE_START(1, 1, bl);
10693 ceph_assert(in->is_auth());
10694 encode(in->inode.ino, bl); // bleh, minor assymetry here
10695 encode(in->last, bl);
10696
10697 __u32 nonce = in->add_replica(to);
10698 encode(nonce, bl);
10699
10700 in->_encode_base(bl, features);
10701 in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
10702 ENCODE_FINISH(bl);
10703 }
10704
10705 void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
10706 MDSContext::vec& finished)
10707 {
10708 DECODE_START(1, p);
10709 dirfrag_t df;
10710 decode(df, p);
10711
10712 ceph_assert(diri->ino() == df.ino);
10713
10714 // add it (_replica_)
10715 dir = diri->get_dirfrag(df.frag);
10716
10717 if (dir) {
10718 // had replica. update w/ new nonce.
10719 __u32 nonce;
10720 decode(nonce, p);
10721 dir->set_replica_nonce(nonce);
10722 dir->_decode_base(p);
10723 dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
10724 } else {
10725 // force frag to leaf in the diri tree
10726 if (!diri->dirfragtree.is_leaf(df.frag)) {
10727 dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
10728 << diri->dirfragtree << dendl;
10729 diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
10730 }
10731 // add replica.
10732 dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
10733 __u32 nonce;
10734 decode(nonce, p);
10735 dir->set_replica_nonce(nonce);
10736 dir->_decode_base(p);
10737 // is this a dir_auth delegation boundary?
10738 if (from != diri->authority().first ||
10739 diri->is_ambiguous_auth() ||
10740 diri->is_base())
10741 adjust_subtree_auth(dir, from);
10742
10743 dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
10744 // get waiters
10745 diri->take_dir_waiting(df.frag, finished);
10746 }
10747 DECODE_FINISH(p);
10748 }
10749
10750 void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
10751 {
10752 DECODE_START(1, p);
10753 string name;
10754 snapid_t last;
10755 decode(name, p);
10756 decode(last, p);
10757
10758 dn = dir->lookup(name, last);
10759
10760 // have it?
10761 bool is_new = false;
10762 if (dn) {
10763 is_new = false;
10764 dout(7) << __func__ << " had " << *dn << dendl;
10765 } else {
10766 is_new = true;
10767 dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
10768 dout(7) << __func__ << " added " << *dn << dendl;
10769 }
10770
10771 __u32 nonce;
10772 decode(nonce, p);
10773 dn->set_replica_nonce(nonce);
10774 decode(dn->first, p);
10775
10776 inodeno_t rino;
10777 unsigned char rdtype;
10778 decode(rino, p);
10779 decode(rdtype, p);
10780 dn->lock.decode_state(p, is_new);
10781
10782 bool need_recover;
10783 decode(need_recover, p);
10784
10785 if (is_new) {
10786 if (rino)
10787 dir->link_remote_inode(dn, rino, rdtype);
10788 if (need_recover)
10789 dn->lock.mark_need_recover();
10790 }
10791
10792 dir->take_dentry_waiting(name, dn->first, dn->last, finished);
10793 DECODE_FINISH(p);
10794 }
10795
10796 void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
10797 {
10798 DECODE_START(1, p);
10799 inodeno_t ino;
10800 snapid_t last;
10801 __u32 nonce;
10802 decode(ino, p);
10803 decode(last, p);
10804 decode(nonce, p);
10805 in = get_inode(ino, last);
10806 if (!in) {
10807 in = new CInode(this, false, 1, last);
10808 in->set_replica_nonce(nonce);
10809 in->_decode_base(p);
10810 in->_decode_locks_state_for_replica(p, true);
10811 add_inode(in);
10812 if (in->ino() == MDS_INO_ROOT)
10813 in->inode_auth.first = 0;
10814 else if (in->is_mdsdir())
10815 in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
10816 dout(10) << __func__ << " added " << *in << dendl;
10817 if (dn) {
10818 ceph_assert(dn->get_linkage()->is_null());
10819 dn->dir->link_primary_inode(dn, in);
10820 }
10821 } else {
10822 in->set_replica_nonce(nonce);
10823 in->_decode_base(p);
10824 in->_decode_locks_state_for_replica(p, false);
10825 dout(10) << __func__ << " had " << *in << dendl;
10826 }
10827
10828 if (dn) {
10829 if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
10830 dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
10831 }
10832 DECODE_FINISH(p);
10833 }
10834
10835
10836 void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
10837 {
10838 ENCODE_START(1, 1, bl);
10839 uint64_t features = mds->mdsmap->get_up_features();
10840 encode_replica_inode(get_myin(), who, bl, features);
10841 encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
10842 encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
10843 encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
10844 encode_replica_dir(straydn->get_dir(), who, bl);
10845 encode_replica_dentry(straydn, who, bl);
10846 ENCODE_FINISH(bl);
10847 }
10848
10849 void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
10850 {
10851 MDSContext::vec finished;
10852 auto p = bl.cbegin();
10853
10854 DECODE_START(1, p);
10855 CInode *mdsin = nullptr;
10856 decode_replica_inode(mdsin, p, NULL, finished);
10857 CDir *mdsdir = nullptr;
10858 decode_replica_dir(mdsdir, p, mdsin, from, finished);
10859 CDentry *straydirdn = nullptr;
10860 decode_replica_dentry(straydirdn, p, mdsdir, finished);
10861 CInode *strayin = nullptr;
10862 decode_replica_inode(strayin, p, straydirdn, finished);
10863 CDir *straydir = nullptr;
10864 decode_replica_dir(straydir, p, strayin, from, finished);
10865
10866 decode_replica_dentry(straydn, p, straydir, finished);
10867 if (!finished.empty())
10868 mds->queue_waiters(finished);
10869 DECODE_FINISH(p);
10870 }
10871
10872
10873 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10874 {
10875 // this is an FYI, re: replication
10876
10877 set<mds_rank_t> who;
10878 if (bcast) {
10879 mds->get_mds_map()->get_active_mds_set(who);
10880 } else {
10881 for (const auto &p : dir->get_replicas()) {
10882 who.insert(p.first);
10883 }
10884 }
10885
10886 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10887
10888 filepath path;
10889 dir->inode->make_path(path);
10890
10891 mds_rank_t whoami = mds->get_nodeid();
10892 for (set<mds_rank_t>::iterator it = who.begin();
10893 it != who.end();
10894 ++it) {
10895 if (*it == whoami) continue;
10896 //if (*it == except) continue;
10897 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10898
10899 std::set<int32_t> s;
10900 for (const auto &r : dir->dir_rep_by) {
10901 s.insert(r);
10902 }
10903 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
10904 }
10905
10906 return 0;
10907 }
10908
10909 void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
10910 {
10911 dirfrag_t df = m->get_dirfrag();
10912 CDir *dir = get_dirfrag(df);
10913 if (!dir) {
10914 dout(5) << "dir_update on " << df << ", don't have it" << dendl;
10915
10916 // discover it?
10917 if (m->should_discover()) {
10918 // only try once!
10919 // this is key to avoid a fragtree update race, among other things.
10920 m->inc_tried_discover();
10921 vector<CDentry*> trace;
10922 CInode *in;
10923 filepath path = m->get_path();
10924 dout(5) << "trying discover on dir_update for " << path << dendl;
10925 CF_MDS_RetryMessageFactory cf(mds, m);
10926 MDRequestRef null_ref;
10927 int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
10928 if (r > 0)
10929 return;
10930 if (r == 0 &&
10931 in->ino() == df.ino &&
10932 in->get_approx_dirfrag(df.frag) == NULL) {
10933 open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
10934 return;
10935 }
10936 }
10937
10938 return;
10939 }
10940
10941 if (!m->has_tried_discover()) {
10942 // Update if it already exists. Othwerwise it got updated by discover reply.
10943 dout(5) << "dir_update on " << *dir << dendl;
10944 dir->dir_rep = m->get_dir_rep();
10945 dir->dir_rep_by.clear();
10946 for (const auto &e : m->get_dir_rep_by()) {
10947 dir->dir_rep_by.insert(e);
10948 }
10949 }
10950 }
10951
10952
10953
10954
10955
10956 // LINK
10957
10958 void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10959 {
10960 ENCODE_START(1, 1, bl);
10961 inodeno_t ino = dnl->get_remote_ino();
10962 encode(ino, bl);
10963 __u8 d_type = dnl->get_remote_d_type();
10964 encode(d_type, bl);
10965 ENCODE_FINISH(bl);
10966 }
10967
10968 void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10969 {
10970 DECODE_START(1, p);
10971 inodeno_t ino;
10972 __u8 d_type;
10973 decode(ino, p);
10974 decode(d_type, p);
10975 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
10976 dir->link_remote_inode(dn, ino, d_type);
10977 DECODE_FINISH(p);
10978 }
10979
10980 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10981 {
10982 dout(7) << __func__ << " " << *dn << dendl;
10983
10984 CDir *subtree = get_subtree_root(dn->get_dir());
10985 for (const auto &p : dn->get_replicas()) {
10986 // don't tell (rename) witnesses; they already know
10987 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10988 continue;
10989 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10990 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10991 rejoin_gather.count(p.first)))
10992 continue;
10993 CDentry::linkage_t *dnl = dn->get_linkage();
10994 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
10995 if (dnl->is_primary()) {
10996 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
10997 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
10998 mds->mdsmap->get_up_features());
10999 } else if (dnl->is_remote()) {
11000 encode_remote_dentry_link(dnl, m->bl);
11001 } else
11002 ceph_abort(); // aie, bad caller!
11003 mds->send_message_mds(m, p.first);
11004 }
11005 }
11006
11007 void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
11008 {
11009 CDentry *dn = NULL;
11010 CDir *dir = get_dirfrag(m->get_dirfrag());
11011 if (!dir) {
11012 dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
11013 } else {
11014 dn = dir->lookup(m->get_dn());
11015 if (!dn) {
11016 dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
11017 } else {
11018 dout(7) << __func__ << " on " << *dn << dendl;
11019 CDentry::linkage_t *dnl = dn->get_linkage();
11020
11021 ceph_assert(!dn->is_auth());
11022 ceph_assert(dnl->is_null());
11023 }
11024 }
11025
11026 auto p = m->bl.cbegin();
11027 MDSContext::vec finished;
11028 if (dn) {
11029 if (m->get_is_primary()) {
11030 // primary link.
11031 CInode *in = nullptr;
11032 decode_replica_inode(in, p, dn, finished);
11033 } else {
11034 // remote link, easy enough.
11035 decode_remote_dentry_link(dir, dn, p);
11036 }
11037 } else {
11038 ceph_abort();
11039 }
11040
11041 if (!finished.empty())
11042 mds->queue_waiters(finished);
11043
11044 return;
11045 }
11046
11047
11048 // UNLINK
11049
11050 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11051 {
11052 dout(10) << __func__ << " " << *dn << dendl;
11053 // share unlink news with replicas
11054 set<mds_rank_t> replicas;
11055 dn->list_replicas(replicas);
11056 bufferlist snapbl;
11057 if (straydn) {
11058 straydn->list_replicas(replicas);
11059 CInode *strayin = straydn->get_linkage()->get_inode();
11060 strayin->encode_snap_blob(snapbl);
11061 }
11062 for (set<mds_rank_t>::iterator it = replicas.begin();
11063 it != replicas.end();
11064 ++it) {
11065 // don't tell (rmdir) witnesses; they already know
11066 if (mdr.get() && mdr->more()->witnessed.count(*it))
11067 continue;
11068
11069 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11070 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11071 rejoin_gather.count(*it)))
11072 continue;
11073
11074 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11075 if (straydn) {
11076 encode_replica_stray(straydn, *it, unlink->straybl);
11077 unlink->snapbl = snapbl;
11078 }
11079 mds->send_message_mds(unlink, *it);
11080 }
11081 }
11082
/**
 * Handle an MDentryUnlink: apply an unlink to our replica of the dentry.
 *
 * If the message carries a stray blob, the stray chain is decoded first. A
 * primary-linked inode is moved from the unlinked dentry to the stray dentry;
 * a remote linkage is simply dropped. If the stray dentry was decoded but
 * ended up unused, it is trimmed again at the end.
 */
void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
{
  // straydn
  CDentry *straydn = nullptr;
  if (m->straybl.length())
    decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
        // Move the inode from the unlinked dentry onto the stray dentry.
        CInode *in = dnl->get_inode();
        dn->dir->unlink_inode(dn);
        ceph_assert(straydn);
        straydn->dir->link_primary_inode(straydn, in);

        // in->first is lazily updated on replica; drag it forward so
        // that we always keep it in sync with the dn
        ceph_assert(straydn->first >= in->first);
        in->first = straydn->first;

        // update subtree map?
        if (in->is_dir())
          adjust_subtree_after_rename(in, dir, false);

        if (m->snapbl.length()) {
          // Remember whether a snaprealm existed before decoding, so the
          // split notification is only sent for a newly created one.
          bool hadrealm = (in->snaprealm ? true : false);
          in->decode_snap_blob(m->snapbl);
          ceph_assert(in->snaprealm);
          ceph_assert(in->snaprealm->have_past_parents_open());
          if (!hadrealm)
            do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
        }

        // send caps to auth (if we're not already)
        if (in->is_any_caps() &&
            !in->state_test(CInode::STATE_EXPORTINGCAPS))
          migrator->export_caps(in);

        // The stray dentry is now in use; keep it out of the trim below.
        straydn = NULL;
      } else {
        ceph_assert(!straydn);
        ceph_assert(dnl->is_remote());
        dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  // Reached with a non-null straydn only when the stray chain was decoded but
  // the dirfrag or dentry was not found above, so the stray was never linked.
  if (straydn) {
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
}
11150
11151
11152
11153
11154
11155
11156 // ===================================================================
11157
11158
11159
11160 // ===================================================================
11161 // FRAGMENT
11162
11163
11164 /**
11165 * adjust_dir_fragments -- adjust fragmentation for a directory
11166 *
11167 * @param diri directory inode
11168 * @param basefrag base fragment
11169 * @param bits bit adjustment. positive for split, negative for merge.
11170 */
11171 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11172 std::vector<CDir*>* resultfrags,
11173 MDSContext::vec& waiters,
11174 bool replay)
11175 {
11176 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11177 << " on " << *diri << dendl;
11178
11179 auto&& p = diri->get_dirfrags_under(basefrag);
11180
11181 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11182 }
11183
11184 CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
11185 {
11186 CDir *dir = diri->get_dirfrag(fg);
11187 if (dir)
11188 return dir;
11189
11190 dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;
11191
11192 std::vector<CDir*> src, result;
11193 MDSContext::vec waiters;
11194
11195 // split a parent?
11196 frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
11197 while (1) {
11198 CDir *pdir = diri->get_dirfrag(parent);
11199 if (pdir) {
11200 int split = fg.bits() - parent.bits();
11201 dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
11202 src.push_back(pdir);
11203 adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
11204 dir = diri->get_dirfrag(fg);
11205 if (dir) {
11206 dout(10) << "force_dir_fragment result " << *dir << dendl;
11207 break;
11208 }
11209 }
11210 if (parent == frag_t())
11211 break;
11212 frag_t last = parent;
11213 parent = parent.parent();
11214 dout(10) << " " << last << " parent is " << parent << dendl;
11215 }
11216
11217 if (!dir) {
11218 // hoover up things under fg?
11219 {
11220 auto&& p = diri->get_dirfrags_under(fg);
11221 src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
11222 }
11223 if (src.empty()) {
11224 dout(10) << "force_dir_fragment no frags under " << fg << dendl;
11225 } else {
11226 dout(10) << " will combine frags under " << fg << ": " << src << dendl;
11227 adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
11228 dir = result.front();
11229 dout(10) << "force_dir_fragment result " << *dir << dendl;
11230 }
11231 }
11232 if (!replay)
11233 mds->queue_waiters(waiters);
11234 return dir;
11235 }
11236
/**
 * Split or merge the given source dirfrags of diri and keep the dirfragtree
 * and the subtree map consistent with the result.
 *
 * @param diri        directory inode being refragmented
 * @param srcfrags    open dirfrags to operate on; if empty, only the
 *                    dirfragtree is adjusted
 * @param basefrag    frag the operation is rooted at
 * @param bits        > 0: split the single source dirfrag by this many bits;
 *                    otherwise: merge all source dirfrags into basefrag
 * @param resultfrags receives the resulting dirfrag(s); dereferenced
 *                    unconditionally once srcfrags is non-empty
 * @param waiters     waiter list passed through to CDir::split / CDir::merge
 * @param replay      passed through to CDir::split / CDir::merge
 */
void MDCache::adjust_dir_fragments(CInode *diri,
                                   const std::vector<CDir*>& srcfrags,
                                   frag_t basefrag, int bits,
                                   std::vector<CDir*>* resultfrags,
                                   MDSContext::vec& waiters,
                                   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
           << " srcfrags " << srcfrags
           << " on " << *diri << dendl;

  // adjust fragtree
  // yuck. we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  // Nothing open to split or merge; the fragtree adjustment above is all.
  if (srcfrags.empty())
    return;

  // split
  // Find the subtree that contains diri's parent dirfrag, if there is one;
  // the dirfrags handled below may be registered as bounds of that subtree.
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      // (the loop variable below shadows the outer 'dir')
      for (const auto& dir : *resultfrags) {
        subtrees[dir].clear(); // new frag is now its own subtree
      }

      // was i a bound?
      if (parent_subtree) {
        ceph_assert(subtrees[parent_subtree].count(dir));
        subtrees[parent_subtree].erase(dir);
        for (const auto& dir : *resultfrags) {
          ceph_assert(dir->is_subtree_root());
          subtrees[parent_subtree].insert(dir);
        }
      }

      // adjust my bounds.
      // Take the old dirfrag's bounds, drop its subtree entry, then re-home
      // each bound under the subtree root found from its parent dirfrag.
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
           p != bounds.end();
           ++p) {
        CDir *frag = get_subtree_root((*p)->get_parent_dir());
        subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees? if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
        any_subtree = true;
      else
        any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (const auto& dir : srcfrags) {
        // this simplifies the code that find subtrees underneath the dirfrag
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          adjust_subtree_auth(dir, mds->get_nodeid());
        }
      }

      for (const auto& dir : srcfrags) {
        ceph_assert(dir->is_subtree_root());
        dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
        // Move every bound of this source subtree into new_bounds, then drop
        // the source's own subtree entry.
        map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
        set<CDir*>::iterator r = q->second.begin();
        while (r != subtrees[dir].end()) {
          new_bounds.insert(*r);
          subtrees[dir].erase(r++);
        }
        subtrees.erase(q);

        // remove myself as my parent's bound
        if (parent_subtree)
          subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    // The merged dirfrag takes its auth flag from the first source dirfrag.
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
        subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags->push_back(f);
  }
}
11364
11365
11366 class C_MDC_FragmentFrozen : public MDSInternalContext {
11367 MDCache *mdcache;
11368 MDRequestRef mdr;
11369 public:
11370 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11371 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11372 void finish(int r) override {
11373 mdcache->fragment_frozen(mdr, r);
11374 }
11375 };
11376
11377 bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
11378 {
11379 if (is_readonly()) {
11380 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11381 return false;
11382 }
11383 if (mds->is_cluster_degraded()) {
11384 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11385 return false;
11386 }
11387 if (diri->get_parent_dir() &&
11388 diri->get_parent_dir()->get_inode()->is_stray()) {
11389 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11390 return false;
11391 }
11392 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
11393 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
11394 return false;
11395 }
11396
11397 if (diri->scrub_is_in_progress()) {
11398 dout(7) << "can_fragment: scrub in progress" << dendl;
11399 return false;
11400 }
11401
11402 for (const auto& dir : dirs) {
11403 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11404 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11405 return false;
11406 }
11407 if (!dir->is_auth()) {
11408 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11409 return false;
11410 }
11411 if (dir->is_bad()) {
11412 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11413 return false;
11414 }
11415 if (dir->is_frozen() ||
11416 dir->is_freezing()) {
11417 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11418 return false;
11419 }
11420 }
11421
11422 return true;
11423 }
11424
11425 void MDCache::split_dir(CDir *dir, int bits)
11426 {
11427 dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
11428 ceph_assert(dir->is_auth());
11429 CInode *diri = dir->inode;
11430
11431 std::vector<CDir*> dirs;
11432 dirs.push_back(dir);
11433
11434 if (!can_fragment(diri, dirs)) {
11435 dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
11436 return;
11437 }
11438
11439 if (dir->frag.bits() + bits > 24) {
11440 dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
11441 return;
11442 }
11443
11444 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11445 mdr->more()->fragment_base = dir->dirfrag();
11446
11447 ceph_assert(fragments.count(dir->dirfrag()) == 0);
11448 fragment_info_t& info = fragments[dir->dirfrag()];
11449 info.mdr = mdr;
11450 info.dirs.push_back(dir);
11451 info.bits = bits;
11452 info.last_cum_auth_pins_change = ceph_clock_now();
11453
11454 fragment_freeze_dirs(dirs);
11455 // initial mark+complete pass
11456 fragment_mark_and_complete(mdr);
11457 }
11458
11459 void MDCache::merge_dir(CInode *diri, frag_t frag)
11460 {
11461 dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
11462
11463 auto&& [all, dirs] = diri->get_dirfrags_under(frag);
11464 if (!all) {
11465 dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
11466 return;
11467 }
11468
11469 if (diri->dirfragtree.is_leaf(frag)) {
11470 dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
11471 return;
11472 }
11473
11474 if (!can_fragment(diri, dirs))
11475 return;
11476
11477 CDir *first = dirs.front();
11478 int bits = first->get_frag().bits() - frag.bits();
11479 dout(10) << " we are merging by " << bits << " bits" << dendl;
11480
11481 dirfrag_t basedirfrag(diri->ino(), frag);
11482 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
11483 mdr->more()->fragment_base = basedirfrag;
11484
11485 ceph_assert(fragments.count(basedirfrag) == 0);
11486 fragment_info_t& info = fragments[basedirfrag];
11487 info.mdr = mdr;
11488 info.dirs = dirs;
11489 info.bits = -bits;
11490 info.last_cum_auth_pins_change = ceph_clock_now();
11491
11492 fragment_freeze_dirs(dirs);
11493 // initial mark+complete pass
11494 fragment_mark_and_complete(mdr);
11495 }
11496
11497 void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
11498 {
11499 bool any_subtree = false, any_non_subtree = false;
11500 for (const auto& dir : dirs) {
11501 dir->auth_pin(dir); // until we mark and complete them
11502 dir->state_set(CDir::STATE_FRAGMENTING);
11503 dir->freeze_dir();
11504 ceph_assert(dir->is_freezing_dir());
11505
11506 if (dir->is_subtree_root())
11507 any_subtree = true;
11508 else
11509 any_non_subtree = true;
11510 }
11511
11512 if (any_subtree && any_non_subtree) {
11513 // either all dirfrags are subtree roots or all are not.
11514 for (const auto& dir : dirs) {
11515 if (dir->is_subtree_root()) {
11516 ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
11517 } else {
11518 dir->state_set(CDir::STATE_AUXSUBTREE);
11519 adjust_subtree_auth(dir, mds->get_nodeid());
11520 }
11521 }
11522 }
11523 }
11524
11525 class C_MDC_FragmentMarking : public MDCacheContext {
11526 MDRequestRef mdr;
11527 public:
11528 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11529 void finish(int r) override {
11530 mdcache->fragment_mark_and_complete(mdr);
11531 }
11532 };
11533
/**
 * Drive a pending fragment operation towards the frozen state.
 *
 * Phase 1: make every participating dirfrag complete (fetching or committing
 * as needed) and pin/mark all of its dentries. If anything had to be waited
 * for, this function is re-run via C_MDC_FragmentMarking.
 * Phase 2: wait for every dirfrag to finish freezing, then continue in
 * fragment_frozen().
 *
 * If the operation is no longer registered for this request, the request is
 * simply finished.
 */
void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  // One gather builder serves both phases; phase 2 only uses it if phase 1
  // added no subs (otherwise we return after activating it).
  MDSGatherBuilder gather(g_ceph_context);

  for (const auto& dir : info.dirs) {
    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true); // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
        dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
        dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
        ready = false;
      } else if (dir->is_new()) {
        dout(15) << " committing new " << *dir << dendl;
        ceph_assert(dir->is_dirty());
        dir->commit(0, gather.new_sub(), true);
        ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dout(15) << " marking " << *dir << dendl;
      // Pin and flag every dentry of the dirfrag.
      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        dn->get(CDentry::PIN_FRAGMENTING);
        ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
        dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      // Drops the auth pin taken in fragment_freeze_dirs() ("until we mark
      // and complete them").
      dir->auth_unpin(dir);
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  if (gather.has_subs()) {
    // Something is still being fetched/committed/journaled: come back later.
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  for (const auto& dir : info.dirs) {
    if (!dir->is_frozen_dir()) {
      ceph_assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  // Everything is already frozen: continue immediately.
  fragment_frozen(mdr, 0);
}
11610
11611 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
11612 {
11613 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11614 for (const auto& dir : dirs) {
11615 dout(10) << " frag " << *dir << dendl;
11616
11617 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
11618 dir->state_clear(CDir::STATE_FRAGMENTING);
11619
11620 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11621 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11622
11623 for (auto &p : dir->items) {
11624 CDentry *dn = p.second;
11625 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11626 dn->state_clear(CDentry::STATE_FRAGMENTING);
11627 dn->put(CDentry::PIN_FRAGMENTING);
11628 }
11629 } else {
11630 dir->auth_unpin(dir);
11631 }
11632
11633 dir->unfreeze_dir();
11634 }
11635 }
11636
11637 bool MDCache::fragment_are_all_frozen(CDir *dir)
11638 {
11639 ceph_assert(dir->is_frozen_dir());
11640 map<dirfrag_t,fragment_info_t>::iterator p;
11641 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11642 p != fragments.end() && p->first.ino == dir->ino();
11643 ++p) {
11644 if (p->first.frag.contains(dir->get_frag()))
11645 return p->second.all_frozen;
11646 }
11647 ceph_abort();
11648 return false;
11649 }
11650
11651 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11652 {
11653 map<dirfrag_t,fragment_info_t>::iterator p;
11654 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11655 p != fragments.end() && p->first.ino == dir->ino();
11656 ++p) {
11657 if (p->first.frag.contains(dir->get_frag())) {
11658 p->second.num_remote_waiters++;
11659 return;
11660 }
11661 }
11662 ceph_abort();
11663 }
11664
/**
 * Cancel fragment operations whose freeze has made no progress for longer
 * than mds_freeze_tree_timeout.
 *
 * Progress is judged by the summed auth-pin counts of the not-yet-frozen
 * dirfrags: whenever the sum changes, the timestamp is reset. A stalled
 * operation is only cancelled if a remote waiter is blocked on it or the
 * parent dirfrag is itself freezing.
 */
void MDCache::find_stale_fragment_freeze()
{
  dout(10) << "find_stale_fragment_freeze" << dendl;
  // see comment in Migrator::find_stale_export_freeze()
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf()->mds_freeze_tree_timeout;

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    // Advance before any erase below, so the loop iterator stays valid.
    ++p;
    if (info.all_frozen)
      continue;
    CDir *dir;
    int total_auth_pins = 0;
    for (const auto& d : info.dirs) {
      dir = d;
      // A dirfrag whose dentries are not marked yet is still in the
      // mark-and-complete phase; skip the whole operation (-1 sentinel).
      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
        total_auth_pins = -1;
        break;
      }
      if (dir->is_frozen_dir())
        continue;
      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
    }
    if (total_auth_pins < 0)
      continue;
    if (info.last_cum_auth_pins != total_auth_pins) {
      // Pin count changed since the last check: record it as progress.
      info.last_cum_auth_pins = total_auth_pins;
      info.last_cum_auth_pins_change = now;
      continue;
    }
    if (info.last_cum_auth_pins_change >= cutoff)
      continue;
    dir = info.dirs.front();
    if (info.num_remote_waiters > 0 ||
        (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      // Take the dirfrag list out first: 'info' refers to the map entry that
      // is erased on the next line.
      std::vector<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}
11712
11713 class C_MDC_FragmentPrep : public MDCacheLogContext {
11714 MDRequestRef mdr;
11715 public:
11716 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11717 void finish(int r) override {
11718 mdcache->_fragment_logged(mdr);
11719 }
11720 };
11721
11722 class C_MDC_FragmentStore : public MDCacheContext {
11723 MDRequestRef mdr;
11724 public:
11725 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11726 void finish(int r) override {
11727 mdcache->_fragment_stored(mdr);
11728 }
11729 };
11730
11731 class C_MDC_FragmentCommit : public MDCacheLogContext {
11732 dirfrag_t basedirfrag;
11733 MDRequestRef mdr;
11734 public:
11735 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11736 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
11737 void finish(int r) override {
11738 mdcache->_fragment_committed(basedirfrag, mdr);
11739 }
11740 };
11741
11742 class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
11743 dirfrag_t basedirfrag;
11744 int bits;
11745 MDRequestRef mdr;
11746 public:
11747 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11748 const MDRequestRef& r) :
11749 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
11750 void finish(int r) override {
11751 ceph_assert(r == 0 || r == -ENOENT);
11752 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
11753 }
11754 void print(ostream& out) const override {
11755 out << "fragment_purge_old(" << basedirfrag << ")";
11756 }
11757 };
11758
11759 void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
11760 {
11761 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11762 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11763 if (it == fragments.end() || it->second.mdr != mdr) {
11764 dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
11765 request_finish(mdr);
11766 return;
11767 }
11768
11769 ceph_assert(r == 0);
11770 fragment_info_t& info = it->second;
11771 dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
11772 << " on " << info.dirs.front()->get_inode() << dendl;
11773
11774 info.all_frozen = true;
11775 dispatch_fragment_dir(mdr);
11776 }
11777
11778 void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
11779 {
11780 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11781 map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
11782 if (it == fragments.end() || it->second.mdr != mdr) {
11783 dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
11784 request_finish(mdr);
11785 return;
11786 }
11787
11788 fragment_info_t& info = it->second;
11789 CInode *diri = info.dirs.front()->get_inode();
11790
11791 dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
11792 << " on " << *diri << dendl;
11793
11794 if (mdr->more()->slave_error)
11795 mdr->aborted = true;
11796
11797 if (!mdr->aborted) {
11798 MutationImpl::LockOpVec lov;
11799 lov.add_wrlock(&diri->dirfragtreelock);
11800 // prevent a racing gather on any other scatterlocks too
11801 lov.lock_scatter_gather(&diri->nestlock);
11802 lov.lock_scatter_gather(&diri->filelock);
11803 if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
11804 if (!mdr->aborted)
11805 return;
11806 }
11807 }
11808
11809 if (mdr->aborted) {
11810 dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
11811 << info.dirs.front()->dirfrag() << dendl;
11812 if (info.bits > 0)
11813 mds->balancer->queue_split(info.dirs.front(), false);
11814 else
11815 mds->balancer->queue_merge(info.dirs.front());
11816 fragment_unmark_unfreeze_dirs(info.dirs);
11817 fragments.erase(it);
11818 request_finish(mdr);
11819 return;
11820 }
11821
11822 mdr->ls = mds->mdlog->get_current_segment();
11823 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
11824 mds->mdlog->start_entry(le);
11825
11826 for (const auto& dir : info.dirs) {
11827 dirfrag_rollback rollback;
11828 rollback.fnode = dir->fnode;
11829 le->add_orig_frag(dir->get_frag(), &rollback);
11830 }
11831
11832 // refragment
11833 MDSContext::vec waiters;
11834 adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
11835 &info.resultfrags, waiters, false);
11836 if (g_conf()->mds_debug_frag)
11837 diri->verify_dirfrags();
11838 mds->queue_waiters(waiters);
11839
11840 for (const auto& fg : le->orig_frags)
11841 ceph_assert(!diri->dirfragtree.is_leaf(fg));
11842
11843 le->metablob.add_dir_context(info.resultfrags.front());
11844 for (const auto& dir : info.resultfrags) {
11845 if (diri->is_auth()) {
11846 le->metablob.add_fragmented_dir(dir, false, false);
11847 } else {
11848 dir->state_set(CDir::STATE_DIRTYDFT);
11849 le->metablob.add_fragmented_dir(dir, false, true);
11850 }
11851 }
11852
11853 // dft lock
11854 if (diri->is_auth()) {
11855 // journal dirfragtree
11856 auto &pi = diri->project_inode();
11857 pi.inode.version = diri->pre_dirty();
11858 journal_dirty_inode(mdr.get(), &le->metablob, diri);
11859 } else {
11860 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
11861 mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
11862 mdr->add_updated_lock(&diri->dirfragtreelock);
11863 }
11864
11865 /*
11866 // filelock
11867 mds->locker->mark_updated_scatterlock(&diri->filelock);
11868 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11869 mut->add_updated_lock(&diri->filelock);
11870
11871 // dirlock
11872 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11873 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11874 mut->add_updated_lock(&diri->nestlock);
11875 */
11876
11877 add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
11878 mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
11879 mdr, __func__);
11880 mds->mdlog->flush();
11881 }
11882
11883 void MDCache::_fragment_logged(MDRequestRef& mdr)
11884 {
11885 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11886 auto& info = fragments.at(basedirfrag);
11887 CInode *diri = info.resultfrags.front()->get_inode();
11888
11889 dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
11890 << " on " << *diri << dendl;
11891 mdr->mark_event("prepare logged");
11892
11893 if (diri->is_auth())
11894 diri->pop_and_dirty_projected_inode(mdr->ls);
11895
11896 mdr->apply(); // mark scatterlock
11897
11898 // store resulting frags
11899 MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
11900
11901 for (const auto& dir : info.resultfrags) {
11902 dout(10) << " storing result frag " << *dir << dendl;
11903
11904 // freeze and store them too
11905 dir->auth_pin(this);
11906 dir->state_set(CDir::STATE_FRAGMENTING);
11907 dir->commit(0, gather.new_sub(), true); // ignore authpinnability
11908 }
11909
11910 gather.activate();
11911 }
11912
11913 void MDCache::_fragment_stored(MDRequestRef& mdr)
11914 {
11915 dirfrag_t basedirfrag = mdr->more()->fragment_base;
11916 fragment_info_t &info = fragments.at(basedirfrag);
11917 CDir *first = info.resultfrags.front();
11918 CInode *diri = first->get_inode();
11919
11920 dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
11921 << " on " << *diri << dendl;
11922 mdr->mark_event("new frags stored");
11923
11924 // tell peers
11925 mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
11926 diri->authority().first : CDIR_AUTH_UNKNOWN;
11927 for (const auto &p : first->get_replicas()) {
11928 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
11929 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
11930 rejoin_gather.count(p.first)))
11931 continue;
11932
11933 auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
11934 if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
11935 diri_auth != p.first) { // not auth mds of diri
11936 /*
11937 * In the nornal case, mds does not trim dir inode whose child dirfrags
11938 * are likely being fragmented (see trim_inode()). But when fragmenting
11939 * subtree roots, following race can happen:
11940 *
11941 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
11942 * mds.c and drops wrlock on dirfragtreelock.
11943 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
11944 * SYNC and send lock message mds.c
11945 * - mds.c receives the lock message and changes dirfragtreelock state
11946 * to SYNC
11947 * - mds.c trim dirfrag and dir inode from its cache
11948 * - mds.c receives the fragment_notify message
11949 *
11950 * So we need to ensure replicas have received the notify, then unlock
11951 * the dirfragtreelock.
11952 */
11953 notify->mark_ack_wanted();
11954 info.notify_ack_waiting.insert(p.first);
11955 }
11956
11957 // freshly replicate new dirs to peers
11958 for (const auto& dir : info.resultfrags) {
11959 encode_replica_dir(dir, p.first, notify->basebl);
11960 }
11961
11962 mds->send_message_mds(notify, p.first);
11963 }
11964
11965 // journal commit
11966 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
11967 mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));
11968
11969
11970 // unfreeze resulting frags
11971 for (const auto& dir : info.resultfrags) {
11972 dout(10) << " result frag " << *dir << dendl;
11973
11974 for (auto &p : dir->items) {
11975 CDentry *dn = p.second;
11976 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11977 dn->state_clear(CDentry::STATE_FRAGMENTING);
11978 dn->put(CDentry::PIN_FRAGMENTING);
11979 }
11980
11981 // unfreeze
11982 dir->unfreeze_dir();
11983 }
11984
11985 if (info.notify_ack_waiting.empty()) {
11986 fragment_drop_locks(info);
11987 } else {
11988 mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
11989 }
11990 }
11991
11992 void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
11993 {
11994 dout(10) << "fragment_committed " << basedirfrag << dendl;
11995 if (mdr)
11996 mdr->mark_event("commit logged");
11997
11998 ufragment &uf = uncommitted_fragments.at(basedirfrag);
11999
12000 // remove old frags
12001 C_GatherBuilder gather(
12002 g_ceph_context,
12003 new C_OnFinisher(
12004 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
12005 mds->finisher));
12006
12007 SnapContext nullsnapc;
12008 object_locator_t oloc(mds->mdsmap->get_metadata_pool());
12009 for (const auto& fg : uf.old_frags) {
12010 object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
12011 ObjectOperation op;
12012 if (fg == frag_t()) {
12013 // backtrace object
12014 dout(10) << " truncate orphan dirfrag " << oid << dendl;
12015 op.truncate(0);
12016 op.omap_clear();
12017 } else {
12018 dout(10) << " removing orphan dirfrag " << oid << dendl;
12019 op.remove();
12020 }
12021 mds->objecter->mutate(oid, oloc, op, nullsnapc,
12022 ceph::real_clock::now(),
12023 0, gather.new_sub());
12024 }
12025
12026 ceph_assert(gather.has_subs());
12027 gather.activate();
12028 }
12029
12030 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
12031 {
12032 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12033 if (mdr)
12034 mdr->mark_event("old frags purged");
12035
12036 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12037 mds->mdlog->start_submit_entry(le);
12038
12039 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12040
12041 if (mds->logger) {
12042 if (bits > 0) {
12043 mds->logger->inc(l_mds_dir_split);
12044 } else {
12045 mds->logger->inc(l_mds_dir_merge);
12046 }
12047 }
12048
12049 if (mdr) {
12050 auto it = fragments.find(basedirfrag);
12051 ceph_assert(it != fragments.end());
12052 it->second.finishing = true;
12053 if (it->second.notify_ack_waiting.empty())
12054 fragment_maybe_finish(it);
12055 else
12056 mdr->mark_event("wating for notify acks");
12057 }
12058 }
12059
12060 void MDCache::fragment_drop_locks(fragment_info_t& info)
12061 {
12062 mds->locker->drop_locks(info.mdr.get());
12063 request_finish(info.mdr);
12064 //info.mdr.reset();
12065 }
12066
12067 void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12068 {
12069 if (!it->second.finishing)
12070 return;
12071
12072 // unmark & auth_unpin
12073 for (const auto &dir : it->second.resultfrags) {
12074 dir->state_clear(CDir::STATE_FRAGMENTING);
12075 dir->auth_unpin(this);
12076
12077 // In case the resulting fragments are beyond the split size,
12078 // we might need to split them again right away (they could
12079 // have been taking inserts between unfreezing and getting
12080 // here)
12081 mds->balancer->maybe_fragment(dir, false);
12082 }
12083
12084 fragments.erase(it);
12085 }
12086
12087
12088 void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
12089 {
12090 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12091 mds_rank_t from = mds_rank_t(ack->get_source().num());
12092
12093 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
12094 return;
12095 }
12096
12097 auto it = fragments.find(ack->get_base_dirfrag());
12098 if (it == fragments.end() ||
12099 it->second.get_tid() != ack->get_tid()) {
12100 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
12101 return;
12102 }
12103
12104 if (it->second.notify_ack_waiting.erase(from) &&
12105 it->second.notify_ack_waiting.empty()) {
12106 fragment_drop_locks(it->second);
12107 fragment_maybe_finish(it);
12108 }
12109 }
12110
12111 void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
12112 {
12113 dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
12114 mds_rank_t from = mds_rank_t(notify->get_source().num());
12115
12116 if (mds->get_state() < MDSMap::STATE_REJOIN) {
12117 return;
12118 }
12119
12120 CInode *diri = get_inode(notify->get_ino());
12121 if (diri) {
12122 frag_t base = notify->get_basefrag();
12123 int bits = notify->get_bits();
12124
12125 /*
12126 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12127 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12128 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12129 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12130 return;
12131 }
12132 */
12133
12134 // refragment
12135 MDSContext::vec waiters;
12136 std::vector<CDir*> resultfrags;
12137 adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
12138 if (g_conf()->mds_debug_frag)
12139 diri->verify_dirfrags();
12140
12141 for (const auto& dir : resultfrags) {
12142 diri->take_dir_waiting(dir->get_frag(), waiters);
12143 }
12144
12145 // add new replica dirs values
12146 auto p = notify->basebl.cbegin();
12147 while (!p.end()) {
12148 CDir *tmp_dir = nullptr;
12149 decode_replica_dir(tmp_dir, p, diri, from, waiters);
12150 }
12151
12152 mds->queue_waiters(waiters);
12153 } else {
12154 ceph_abort();
12155 }
12156
12157 if (notify->is_ack_wanted()) {
12158 auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
12159 notify->get_bits(), notify->get_tid());
12160 mds->send_message_mds(ack, from);
12161 }
12162 }
12163
12164 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12165 LogSegment *ls, bufferlist *rollback)
12166 {
12167 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12168 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12169 ufragment& uf = uncommitted_fragments[basedirfrag];
12170 uf.old_frags = old_frags;
12171 uf.bits = bits;
12172 uf.ls = ls;
12173 ls->uncommitted_fragments.insert(basedirfrag);
12174 if (rollback)
12175 uf.rollback.swap(*rollback);
12176 }
12177
12178 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12179 {
12180 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12181 << " op " << EFragment::op_name(op) << dendl;
12182 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12183 if (it != uncommitted_fragments.end()) {
12184 ufragment& uf = it->second;
12185 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12186 uf.committed = true;
12187 } else {
12188 uf.ls->uncommitted_fragments.erase(basedirfrag);
12189 mds->queue_waiters(uf.waiters);
12190 uncommitted_fragments.erase(it);
12191 }
12192 }
12193 }
12194
12195 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
12196 {
12197 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12198 << " old_frags (" << old_frags << ")" << dendl;
12199 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12200 if (it != uncommitted_fragments.end()) {
12201 ufragment& uf = it->second;
12202 if (!uf.old_frags.empty()) {
12203 uf.old_frags = std::move(old_frags);
12204 uf.committed = true;
12205 } else {
12206 uf.ls->uncommitted_fragments.erase(basedirfrag);
12207 uncommitted_fragments.erase(it);
12208 }
12209 }
12210 }
12211
12212 void MDCache::wait_for_uncommitted_fragments(MDSGather *gather)
12213 {
12214 for (auto& p : uncommitted_fragments)
12215 p.second.waiters.push_back(gather->new_sub());
12216 }
12217
12218 void MDCache::rollback_uncommitted_fragments()
12219 {
12220 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
12221 for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
12222 p != uncommitted_fragments.end();
12223 ++p) {
12224 ufragment &uf = p->second;
12225 CInode *diri = get_inode(p->first.ino);
12226 ceph_assert(diri);
12227
12228 if (uf.committed) {
12229 _fragment_committed(p->first, MDRequestRef());
12230 continue;
12231 }
12232
12233 dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
12234
12235 LogSegment *ls = mds->mdlog->get_current_segment();
12236 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
12237 mds->mdlog->start_entry(le);
12238 bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);
12239
12240 frag_vec_t old_frags;
12241 diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
12242
12243 std::vector<CDir*> resultfrags;
12244 if (uf.old_frags.empty()) {
12245 // created by old format EFragment
12246 MDSContext::vec waiters;
12247 adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
12248 } else {
12249 auto bp = uf.rollback.cbegin();
12250 for (const auto& fg : uf.old_frags) {
12251 CDir *dir = force_dir_fragment(diri, fg);
12252 resultfrags.push_back(dir);
12253
12254 dirfrag_rollback rollback;
12255 decode(rollback, bp);
12256
12257 dir->set_version(rollback.fnode.version);
12258 dir->fnode = rollback.fnode;
12259
12260 dir->_mark_dirty(ls);
12261
12262 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
12263 dout(10) << " dirty nestinfo on " << *dir << dendl;
12264 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
12265 ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
12266 }
12267 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
12268 dout(10) << " dirty fragstat on " << *dir << dendl;
12269 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
12270 ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
12271 }
12272
12273 le->add_orig_frag(dir->get_frag());
12274 le->metablob.add_dir_context(dir);
12275 if (diri_auth) {
12276 le->metablob.add_fragmented_dir(dir, true, false);
12277 } else {
12278 dout(10) << " dirty dirfragtree on " << *dir << dendl;
12279 dir->state_set(CDir::STATE_DIRTYDFT);
12280 le->metablob.add_fragmented_dir(dir, true, true);
12281 }
12282 }
12283 }
12284
12285 if (diri_auth) {
12286 auto &pi = diri->project_inode();
12287 pi.inode.version = diri->pre_dirty();
12288 diri->pop_and_dirty_projected_inode(ls); // hacky
12289 le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
12290 } else {
12291 mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
12292 ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
12293 }
12294
12295 if (g_conf()->mds_debug_frag)
12296 diri->verify_dirfrags();
12297
12298 for (const auto& leaf : old_frags) {
12299 ceph_assert(!diri->dirfragtree.is_leaf(leaf));
12300 }
12301
12302 mds->mdlog->submit_entry(le);
12303
12304 uf.old_frags.swap(old_frags);
12305 _fragment_committed(p->first, MDRequestRef());
12306 }
12307 }
12308
12309 void MDCache::force_readonly()
12310 {
12311 if (is_readonly())
12312 return;
12313
12314 dout(1) << "force file system read-only" << dendl;
12315 mds->clog->warn() << "force file system read-only";
12316
12317 set_readonly();
12318
12319 mds->server->force_clients_readonly();
12320
12321 // revoke write caps
12322 int count = 0;
12323 for (auto &p : inode_map) {
12324 CInode *in = p.second;
12325 if (in->is_head())
12326 mds->locker->eval(in, CEPH_CAP_LOCKS);
12327 if (!(++count % 1000))
12328 mds->heartbeat_reset();
12329 }
12330
12331 mds->mdlog->flush();
12332 }
12333
12334
12335 // ==============================================================
12336 // debug crap
12337
12338 void MDCache::show_subtrees(int dbl, bool force_print)
12339 {
12340 if (g_conf()->mds_thrash_exports)
12341 dbl += 15;
12342
12343 //dout(10) << "show_subtrees" << dendl;
12344
12345 if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
12346 return; // i won't print anything.
12347
12348 if (subtrees.empty()) {
12349 dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
12350 << dendl;
12351 return;
12352 }
12353
12354 if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
12355 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12356 dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
12357 "printing subtrees" << dendl;
12358 return;
12359 }
12360
12361 // root frags
12362 std::vector<CDir*> basefrags;
12363 for (set<CInode*>::iterator p = base_inodes.begin();
12364 p != base_inodes.end();
12365 ++p)
12366 (*p)->get_dirfrags(basefrags);
12367 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12368 dout(15) << "show_subtrees" << dendl;
12369
12370 // queue stuff
12371 list<pair<CDir*,int> > q;
12372 string indent;
12373 set<CDir*> seen;
12374
12375 // calc max depth
12376 for (const auto& dir : basefrags) {
12377 q.emplace_back(dir, 0);
12378 }
12379
12380 set<CDir*> subtrees_seen;
12381
12382 unsigned int depth = 0;
12383 while (!q.empty()) {
12384 CDir *dir = q.front().first;
12385 unsigned int d = q.front().second;
12386 q.pop_front();
12387
12388 if (subtrees.count(dir) == 0) continue;
12389
12390 subtrees_seen.insert(dir);
12391
12392 if (d > depth) depth = d;
12393
12394 // sanity check
12395 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12396 if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
12397 ceph_assert(seen.count(dir) == 0);
12398 seen.insert(dir);
12399
12400 // nested items?
12401 if (!subtrees[dir].empty()) {
12402 for (set<CDir*>::iterator p = subtrees[dir].begin();
12403 p != subtrees[dir].end();
12404 ++p) {
12405 //dout(25) << " saw sub " << **p << dendl;
12406 q.push_front(pair<CDir*,int>(*p, d+1));
12407 }
12408 }
12409 }
12410
12411 if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
12412 !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
12413 dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
12414 "subtrees" << dendl;
12415 return;
12416 }
12417
12418 // print tree
12419 for (const auto& dir : basefrags) {
12420 q.emplace_back(dir, 0);
12421 }
12422
12423 while (!q.empty()) {
12424 CDir *dir = q.front().first;
12425 int d = q.front().second;
12426 q.pop_front();
12427
12428 if (subtrees.count(dir) == 0) continue;
12429
12430 // adjust indenter
12431 while ((unsigned)d < indent.size())
12432 indent.resize(d);
12433
12434 // pad
12435 string pad = "______________________________________";
12436 pad.resize(depth*2+1-indent.size());
12437 if (!subtrees[dir].empty())
12438 pad[0] = '.'; // parent
12439
12440
12441 string auth;
12442 if (dir->is_auth())
12443 auth = "auth ";
12444 else
12445 auth = " rep ";
12446
12447 char s[10];
12448 if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
12449 snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
12450 else
12451 snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));
12452
12453 // print
12454 dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
12455 << " " << auth << *dir << dendl;
12456
12457 if (dir->ino() == MDS_INO_ROOT)
12458 ceph_assert(dir->inode == root);
12459 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
12460 ceph_assert(dir->inode == myin);
12461 if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
12462 ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);
12463
12464 // nested items?
12465 if (!subtrees[dir].empty()) {
12466 // more at my level?
12467 if (!q.empty() && q.front().second == d)
12468 indent += "| ";
12469 else
12470 indent += " ";
12471
12472 for (set<CDir*>::iterator p = subtrees[dir].begin();
12473 p != subtrees[dir].end();
12474 ++p)
12475 q.push_front(pair<CDir*,int>(*p, d+2));
12476 }
12477 }
12478
12479 // verify there isn't stray crap in subtree map
12480 int lost = 0;
12481 for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
12482 p != subtrees.end();
12483 ++p) {
12484 if (subtrees_seen.count(p->first)) continue;
12485 dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
12486 lost++;
12487 }
12488 ceph_assert(lost == 0);
12489 }
12490
12491 void MDCache::show_cache()
12492 {
12493 dout(7) << "show_cache" << dendl;
12494
12495 auto show_func = [this](CInode *in) {
12496 // unlinked?
12497 if (!in->parent)
12498 dout(7) << " unlinked " << *in << dendl;
12499
12500 // dirfrags?
12501 auto&& dfs = in->get_dirfrags();
12502 for (const auto& dir : dfs) {
12503 dout(7) << " dirfrag " << *dir << dendl;
12504
12505 for (auto &p : dir->items) {
12506 CDentry *dn = p.second;
12507 dout(7) << " dentry " << *dn << dendl;
12508 CDentry::linkage_t *dnl = dn->get_linkage();
12509 if (dnl->is_primary() && dnl->get_inode())
12510 dout(7) << " inode " << *dnl->get_inode() << dendl;
12511 }
12512 }
12513 };
12514
12515 for (auto &p : inode_map)
12516 show_func(p.second);
12517 for (auto &p : snap_inode_map)
12518 show_func(p.second);
12519 }
12520
12521 void MDCache::cache_status(Formatter *f)
12522 {
12523 f->open_object_section("cache");
12524
12525 f->open_object_section("pool");
12526 mempool::get_pool(mempool::mds_co::id).dump(f);
12527 f->close_section();
12528
12529 f->close_section();
12530 }
12531
12532 void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
12533 {
12534 ceph_assert(in);
12535 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12536 return;
12537 }
12538 auto&& ls = in->get_dirfrags();
12539 for (const auto &subdir : ls) {
12540 for (const auto &p : subdir->items) {
12541 CDentry *dn = p.second;
12542 CInode *in = dn->get_linkage()->get_inode();
12543 if (in) {
12544 dump_tree(in, cur_depth + 1, max_depth, f);
12545 }
12546 }
12547 }
12548 f->open_object_section("inode");
12549 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12550 f->close_section();
12551 }
12552
12553 int MDCache::dump_cache(std::string_view file_name)
12554 {
12555 return dump_cache(file_name, NULL);
12556 }
12557
12558 int MDCache::dump_cache(Formatter *f)
12559 {
12560 return dump_cache(std::string_view(""), f);
12561 }
12562
12563 /**
12564 * Dump the metadata cache, either to a Formatter, if
12565 * provided, else to a plain text file.
12566 */
12567 int MDCache::dump_cache(std::string_view fn, Formatter *f)
12568 {
12569 int r = 0;
12570
12571 // dumping large caches may cause mds to hang or worse get killed.
12572 // so, disallow the dump if the cache size exceeds the configured
12573 // threshold, which is 1G for formatter and unlimited for file (note
12574 // that this can be jacked up by the admin... and is nothing but foot
12575 // shooting, but the option itself is for devs and hence dangerous to
12576 // tune). TODO: remove this when fixed.
12577 uint64_t threshold = f ?
12578 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12579 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
12580
12581 if (threshold && cache_size() > threshold) {
12582 if (f) {
12583 std::stringstream ss;
12584 ss << "cache usage exceeds dump threshold";
12585 f->open_object_section("result");
12586 f->dump_string("error", ss.str());
12587 f->close_section();
12588 } else {
12589 derr << "cache usage exceeds dump threshold" << dendl;
12590 r = -EINVAL;
12591 }
12592 return r;
12593 }
12594
12595 r = 0;
12596 int fd = -1;
12597
12598 if (f) {
12599 f->open_array_section("inodes");
12600 } else {
12601 char path[PATH_MAX] = "";
12602 if (fn.length()) {
12603 snprintf(path, sizeof path, "%s", fn.data());
12604 } else {
12605 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
12606 }
12607
12608 dout(1) << "dump_cache to " << path << dendl;
12609
12610 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
12611 if (fd < 0) {
12612 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
12613 return errno;
12614 }
12615 }
12616
12617 auto dump_func = [fd, f](CInode *in) {
12618 int r;
12619 if (f) {
12620 f->open_object_section("inode");
12621 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12622 f->close_section();
12623 return 1;
12624 }
12625 ostringstream ss;
12626 ss << *in << std::endl;
12627 std::string s = ss.str();
12628 r = safe_write(fd, s.c_str(), s.length());
12629 if (r < 0)
12630 return r;
12631 auto&& dfs = in->get_dirfrags();
12632 for (auto &dir : dfs) {
12633 ostringstream tt;
12634 tt << " " << *dir << std::endl;
12635 std::string t = tt.str();
12636 r = safe_write(fd, t.c_str(), t.length());
12637 if (r < 0)
12638 return r;
12639 for (auto &p : dir->items) {
12640 CDentry *dn = p.second;
12641 ostringstream uu;
12642 uu << " " << *dn << std::endl;
12643 std::string u = uu.str();
12644 r = safe_write(fd, u.c_str(), u.length());
12645 if (r < 0)
12646 return r;
12647 }
12648 dir->check_rstats();
12649 }
12650 return 1;
12651 };
12652
12653 for (auto &p : inode_map) {
12654 r = dump_func(p.second);
12655 if (r < 0)
12656 goto out;
12657 }
12658 for (auto &p : snap_inode_map) {
12659 r = dump_func(p.second);
12660 if (r < 0)
12661 goto out;
12662 }
12663 r = 0;
12664
12665 out:
12666 if (f) {
12667 f->close_section(); // inodes
12668 } else {
12669 ::close(fd);
12670 }
12671 return r;
12672 }
12673
12674
12675
12676 C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
12677 : MDSInternalContext(c->mds), cache(c), mdr(r)
12678 {}
12679
12680 void C_MDS_RetryRequest::finish(int r)
12681 {
12682 mdr->retry++;
12683 cache->dispatch_request(mdr);
12684 }
12685
12686
12687 class C_MDS_EnqueueScrub : public Context
12688 {
12689 std::string tag;
12690 Formatter *formatter;
12691 Context *on_finish;
12692 public:
12693 ScrubHeaderRef header;
12694 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12695 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
12696
12697 Context *take_finisher() {
12698 Context *fin = on_finish;
12699 on_finish = NULL;
12700 return fin;
12701 }
12702
12703 void finish(int r) override {
12704 if (r == 0) {
12705 // since recursive scrub is asynchronous, dump minimal output
12706 // to not upset cli tools.
12707 if (header && header->get_recursive()) {
12708 formatter->open_object_section("results");
12709 formatter->dump_int("return_code", 0);
12710 formatter->dump_string("scrub_tag", tag);
12711 formatter->dump_string("mode", "asynchronous");
12712 formatter->close_section(); // results
12713 }
12714 } else { // we failed the lookup or something; dump ourselves
12715 formatter->open_object_section("results");
12716 formatter->dump_int("return_code", r);
12717 formatter->close_section(); // results
12718 r = 0; // already dumped in formatter
12719 }
12720 if (on_finish)
12721 on_finish->complete(r);
12722 }
12723 };
12724
12725 void MDCache::enqueue_scrub(
12726 std::string_view path,
12727 std::string_view tag,
12728 bool force, bool recursive, bool repair,
12729 Formatter *f, Context *fin)
12730 {
12731 dout(10) << __func__ << " " << path << dendl;
12732 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12733 if (path == "~mdsdir") {
12734 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12735 mdr->set_filepath(fp);
12736 } else {
12737 filepath fp(path);
12738 mdr->set_filepath(path);
12739 }
12740
12741 bool is_internal = false;
12742 std::string tag_str(tag);
12743 if (tag_str.empty()) {
12744 uuid_d uuid_gen;
12745 uuid_gen.generate_random();
12746 tag_str = uuid_gen.to_string();
12747 is_internal = true;
12748 }
12749
12750 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
12751 cs->header = std::make_shared<ScrubHeader>(
12752 tag_str, is_internal, force, recursive, repair, f);
12753
12754 mdr->internal_op_finish = cs;
12755 enqueue_scrub_work(mdr);
12756 }
12757
12758 void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
12759 {
12760 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
12761 if (NULL == in)
12762 return;
12763
12764 // TODO: Remove this restriction
12765 ceph_assert(in->is_auth());
12766
12767 C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
12768 ScrubHeaderRef header = cs->header;
12769
12770 // Cannot scrub same dentry twice at same time
12771 if (in->scrub_is_in_progress()) {
12772 mds->server->respond_to_request(mdr, -EBUSY);
12773 return;
12774 } else {
12775 in->scrub_info();
12776 }
12777
12778 header->set_origin(in);
12779
12780 Context *fin;
12781 if (header->get_recursive()) {
12782 header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
12783 fin = new MDSInternalContextWrapper(mds,
12784 new LambdaContext([this, header](int r) {
12785 recursive_scrub_finish(header);
12786 header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
12787 })
12788 );
12789 } else {
12790 fin = cs->take_finisher();
12791 }
12792
12793 // If the scrub did some repair, then flush the journal at the end of
12794 // the scrub. Otherwise in the case of e.g. rewriting a backtrace
12795 // the on disk state will still look damaged.
12796 auto scrub_finish = new LambdaContext([this, header, fin](int r){
12797 if (!header->get_repaired()) {
12798 if (fin)
12799 fin->complete(r);
12800 return;
12801 }
12802
12803 auto flush_finish = new LambdaContext([this, fin](int r){
12804 dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
12805 mds->mdlog->trim_all();
12806
12807 if (fin) {
12808 MDSGatherBuilder gather(g_ceph_context);
12809 auto& expiring_segments = mds->mdlog->get_expiring_segments();
12810 for (auto logseg : expiring_segments)
12811 logseg->wait_for_expiry(gather.new_sub());
12812 ceph_assert(gather.has_subs());
12813 gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
12814 gather.activate();
12815 }
12816 });
12817
12818 dout(4) << "Flushing journal because scrub did some repairs" << dendl;
12819 mds->mdlog->start_new_segment();
12820 mds->mdlog->flush();
12821 mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
12822 });
12823
12824 if (!header->get_recursive()) {
12825 mds->scrubstack->enqueue_inode_top(in, header,
12826 new MDSInternalContextWrapper(mds, scrub_finish));
12827 } else {
12828 mds->scrubstack->enqueue_inode_bottom(in, header,
12829 new MDSInternalContextWrapper(mds, scrub_finish));
12830 }
12831
12832 mds->server->respond_to_request(mdr, 0);
12833 return;
12834 }
12835
12836 void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12837 {
12838 if (header->get_origin()->is_base() &&
12839 header->get_force() && header->get_repair()) {
12840 // notify snapserver that base directory is recursively scrubbed.
12841 // After both root and mdsdir are recursively scrubbed, snapserver
12842 // knows that all old format snaprealms are converted to the new
12843 // format.
12844 if (mds->mdsmap->get_num_in_mds() == 1 &&
12845 mds->mdsmap->get_num_failed_mds() == 0 &&
12846 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12847 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12848 }
12849 }
12850 }
12851
12852 struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
12853 MDRequestRef mdr;
12854 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
12855 MDCacheLogContext(c), mdr(m) {}
12856 void finish(int r) override {
12857 mdr->apply();
12858 get_mds()->server->respond_to_request(mdr, r);
12859 }
12860 };
12861
12862 void MDCache::repair_dirfrag_stats(CDir *dir)
12863 {
12864 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12865 mdr->pin(dir);
12866 mdr->internal_op_private = dir;
12867 mdr->internal_op_finish = new C_MDSInternalNoop;
12868 repair_dirfrag_stats_work(mdr);
12869 }
12870
12871 void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
12872 {
12873 CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
12874 dout(10) << __func__ << " " << *dir << dendl;
12875
12876 if (!dir->is_auth()) {
12877 mds->server->respond_to_request(mdr, -ESTALE);
12878 return;
12879 }
12880
12881 if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
12882 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));
12883
12884 mds->locker->drop_locks(mdr.get());
12885 mdr->drop_local_auth_pins();
12886 if (mdr->is_any_remote_auth_pin())
12887 mds->locker->notify_freeze_waiter(dir);
12888 return;
12889 }
12890
12891 mdr->auth_pin(dir);
12892
12893 MutationImpl::LockOpVec lov;
12894 CInode *diri = dir->inode;
12895 lov.add_rdlock(&diri->dirfragtreelock);
12896 lov.add_wrlock(&diri->nestlock);
12897 lov.add_wrlock(&diri->filelock);
12898 if (!mds->locker->acquire_locks(mdr, lov))
12899 return;
12900
12901 if (!dir->is_complete()) {
12902 dir->fetch(new C_MDS_RetryRequest(this, mdr));
12903 return;
12904 }
12905
12906 frag_info_t frag_info;
12907 nest_info_t nest_info;
12908 for (auto it = dir->begin(); it != dir->end(); ++it) {
12909 CDentry *dn = it->second;
12910 if (dn->last != CEPH_NOSNAP)
12911 continue;
12912 CDentry::linkage_t *dnl = dn->get_projected_linkage();
12913 if (dnl->is_primary()) {
12914 CInode *in = dnl->get_inode();
12915 nest_info.add(in->get_projected_inode()->accounted_rstat);
12916 if (in->is_dir())
12917 frag_info.nsubdirs++;
12918 else
12919 frag_info.nfiles++;
12920 } else if (dnl->is_remote())
12921 frag_info.nfiles++;
12922 }
12923
12924 fnode_t *pf = dir->get_projected_fnode();
12925 bool good_fragstat = frag_info.same_sums(pf->fragstat);
12926 bool good_rstat = nest_info.same_sums(pf->rstat);
12927 if (good_fragstat && good_rstat) {
12928 dout(10) << __func__ << " no corruption found" << dendl;
12929 mds->server->respond_to_request(mdr, 0);
12930 return;
12931 }
12932
12933 pf = dir->project_fnode();
12934 pf->version = dir->pre_dirty();
12935 mdr->add_projected_fnode(dir);
12936
12937 mdr->ls = mds->mdlog->get_current_segment();
12938 EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
12939 mds->mdlog->start_entry(le);
12940
12941 if (!good_fragstat) {
12942 if (pf->fragstat.mtime > frag_info.mtime)
12943 frag_info.mtime = pf->fragstat.mtime;
12944 if (pf->fragstat.change_attr > frag_info.change_attr)
12945 frag_info.change_attr = pf->fragstat.change_attr;
12946 pf->fragstat = frag_info;
12947 mds->locker->mark_updated_scatterlock(&diri->filelock);
12948 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12949 mdr->add_updated_lock(&diri->filelock);
12950 }
12951
12952 if (!good_rstat) {
12953 if (pf->rstat.rctime > nest_info.rctime)
12954 nest_info.rctime = pf->rstat.rctime;
12955 pf->rstat = nest_info;
12956 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12957 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12958 mdr->add_updated_lock(&diri->nestlock);
12959 }
12960
12961 le->metablob.add_dir_context(dir);
12962 le->metablob.add_dir(dir, true);
12963
12964 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
12965 }
12966
12967 void MDCache::repair_inode_stats(CInode *diri)
12968 {
12969 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12970 mdr->pin(diri);
12971 mdr->internal_op_private = diri;
12972 mdr->internal_op_finish = new C_MDSInternalNoop;
12973 repair_inode_stats_work(mdr);
12974 }
12975
12976 void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
12977 {
12978 CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
12979 dout(10) << __func__ << " " << *diri << dendl;
12980
12981 if (!diri->is_auth()) {
12982 mds->server->respond_to_request(mdr, -ESTALE);
12983 return;
12984 }
12985 if (!diri->is_dir()) {
12986 mds->server->respond_to_request(mdr, -ENOTDIR);
12987 return;
12988 }
12989
12990 MutationImpl::LockOpVec lov;
12991
12992 if (mdr->ls) // already marked filelock/nestlock dirty ?
12993 goto do_rdlocks;
12994
12995 lov.add_rdlock(&diri->dirfragtreelock);
12996 lov.add_wrlock(&diri->nestlock);
12997 lov.add_wrlock(&diri->filelock);
12998 if (!mds->locker->acquire_locks(mdr, lov))
12999 return;
13000
13001 // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
13002 // the scatter-gather process, which will fix any fragstat/rstat errors.
13003 {
13004 frag_vec_t leaves;
13005 diri->dirfragtree.get_leaves(leaves);
13006 for (const auto& leaf : leaves) {
13007 CDir *dir = diri->get_dirfrag(leaf);
13008 if (!dir) {
13009 ceph_assert(mdr->is_auth_pinned(diri));
13010 dir = diri->get_or_open_dirfrag(this, leaf);
13011 }
13012 if (dir->get_version() == 0) {
13013 ceph_assert(dir->is_auth());
13014 dir->fetch(new C_MDS_RetryRequest(this, mdr));
13015 return;
13016 }
13017 }
13018 }
13019
13020 diri->state_set(CInode::STATE_REPAIRSTATS);
13021 mdr->ls = mds->mdlog->get_current_segment();
13022 mds->locker->mark_updated_scatterlock(&diri->filelock);
13023 mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
13024 mds->locker->mark_updated_scatterlock(&diri->nestlock);
13025 mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
13026
13027 mds->locker->drop_locks(mdr.get());
13028
13029 do_rdlocks:
13030 // force the scatter-gather process
13031 lov.clear();
13032 lov.add_rdlock(&diri->dirfragtreelock);
13033 lov.add_rdlock(&diri->nestlock);
13034 lov.add_rdlock(&diri->filelock);
13035 if (!mds->locker->acquire_locks(mdr, lov))
13036 return;
13037
13038 diri->state_clear(CInode::STATE_REPAIRSTATS);
13039
13040 frag_info_t dir_info;
13041 nest_info_t nest_info;
13042 nest_info.rsubdirs = 1; // it gets one to account for self
13043 if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
13044 nest_info.rsnaps = srnode->snaps.size();
13045
13046 {
13047 frag_vec_t leaves;
13048 diri->dirfragtree.get_leaves(leaves);
13049 for (const auto& leaf : leaves) {
13050 CDir *dir = diri->get_dirfrag(leaf);
13051 ceph_assert(dir);
13052 ceph_assert(dir->get_version() > 0);
13053 dir_info.add(dir->fnode.accounted_fragstat);
13054 nest_info.add(dir->fnode.accounted_rstat);
13055 }
13056 }
13057
13058 if (!dir_info.same_sums(diri->inode.dirstat) ||
13059 !nest_info.same_sums(diri->inode.rstat)) {
13060 dout(10) << __func__ << " failed to fix fragstat/rstat on "
13061 << *diri << dendl;
13062 }
13063
13064 mds->server->respond_to_request(mdr, 0);
13065 }
13066
13067 void MDCache::upgrade_inode_snaprealm(CInode *in)
13068 {
13069 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
13070 mdr->pin(in);
13071 mdr->internal_op_private = in;
13072 mdr->internal_op_finish = new C_MDSInternalNoop;
13073 upgrade_inode_snaprealm_work(mdr);
13074 }
13075
13076 void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
13077 {
13078 CInode *in = static_cast<CInode*>(mdr->internal_op_private);
13079 dout(10) << __func__ << " " << *in << dendl;
13080
13081 if (!in->is_auth()) {
13082 mds->server->respond_to_request(mdr, -ESTALE);
13083 return;
13084 }
13085
13086 MutationImpl::LockOpVec lov;
13087 lov.add_xlock(&in->snaplock);
13088 if (!mds->locker->acquire_locks(mdr, lov))
13089 return;
13090
13091 // project_snaprealm() upgrades snaprealm format
13092 auto &pi = in->project_inode(false, true);
13093 mdr->add_projected_inode(in);
13094 pi.inode.version = in->pre_dirty();
13095
13096 mdr->ls = mds->mdlog->get_current_segment();
13097 EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
13098 mds->mdlog->start_entry(le);
13099
13100 if (in->is_base()) {
13101 le->metablob.add_root(true, in);
13102 } else {
13103 CDentry *pdn = in->get_projected_parent_dn();
13104 le->metablob.add_dir_context(pdn->get_dir());
13105 le->metablob.add_primary_dentry(pdn, in, true);
13106 }
13107
13108 mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
13109 }
13110
13111 void MDCache::flush_dentry(std::string_view path, Context *fin)
13112 {
13113 if (is_readonly()) {
13114 dout(10) << __func__ << ": read-only FS" << dendl;
13115 fin->complete(-EROFS);
13116 return;
13117 }
13118 dout(10) << "flush_dentry " << path << dendl;
13119 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13120 filepath fp(path);
13121 mdr->set_filepath(fp);
13122 mdr->internal_op_finish = fin;
13123 flush_dentry_work(mdr);
13124 }
13125
13126 class C_FinishIOMDR : public MDSContext {
13127 protected:
13128 MDSRank *mds;
13129 MDRequestRef mdr;
13130 MDSRank *get_mds() override { return mds; }
13131 public:
13132 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13133 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13134 };
13135
13136 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13137 {
13138 MutationImpl::LockOpVec lov;
13139 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13140 if (!in)
13141 return;
13142
13143 ceph_assert(in->is_auth());
13144 in->flush(new C_FinishIOMDR(mds, mdr));
13145 }
13146
13147
13148 /**
13149 * Initialize performance counters with global perfcounter
13150 * collection.
13151 */
13152 void MDCache::register_perfcounters()
13153 {
13154 PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
13155
13156 // Stray/purge statistics
13157 pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
13158 PerfCountersBuilder::PRIO_INTERESTING);
13159 pcb.add_u64(l_mdc_num_recovering_enqueued,
13160 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13161 PerfCountersBuilder::PRIO_INTERESTING);
13162 pcb.add_u64_counter(l_mdc_recovery_completed,
13163 "recovery_completed", "File recoveries completed", "recd",
13164 PerfCountersBuilder::PRIO_INTERESTING);
13165
13166 // useful recovery queue statistics
13167 pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
13168 pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
13169 "Files currently being recovered");
13170 pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
13171 "Files waiting for recovery with elevated priority");
13172 pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
13173 "File recoveries started");
13174
13175 // along with other stray dentries stats
13176 pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
13177 "Stray dentries delayed");
13178 pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
13179 "Stray dentries enqueuing for purge");
13180 pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
13181 "Stray dentries created");
13182 pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
13183 "Stray dentries enqueued for purge");
13184 pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
13185 "Stray dentries reintegrated");
13186 pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
13187 "Stray dentries migrated");
13188
13189 // low prio internal request stats
13190 pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
13191 "Internal Request type enqueue scrub");
13192 pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
13193 "Internal Request type export dir");
13194 pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
13195 "Internal Request type flush");
13196 pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
13197 "Internal Request type fragmentdir");
13198 pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
13199 "Internal Request type frag stats");
13200 pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
13201 "Internal Request type inode stats");
13202
13203 logger.reset(pcb.create_perf_counters());
13204 g_ceph_context->get_perfcounters_collection()->add(logger.get());
13205 recovery_queue.set_logger(logger.get());
13206 stray_manager.set_logger(logger.get());
13207 }
13208
13209 /**
13210 * Call this when putting references to an inode/dentry or
13211 * when attempting to trim it.
13212 *
13213 * If this inode is no longer linked by anyone, and this MDS
13214 * rank holds the primary dentry, and that dentry is in a stray
13215 * directory, then give up the dentry to the StrayManager, never
13216 * to be seen again by MDCache.
13217 *
13218 * @param delay if true, then purgeable inodes are stashed til
13219 * the next trim(), rather than being purged right
13220 * away.
13221 */
13222 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13223 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
13224 mds->get_state() <= MDSMap::STATE_REJOIN)
13225 return;
13226
13227 CDentry *dn = in->get_projected_parent_dn();
13228
13229 if (dn->state_test(CDentry::STATE_PURGING)) {
13230 /* We have already entered the purging process, no need
13231 * to re-evaluate me ! */
13232 return;
13233 }
13234
13235 if (dn->get_dir()->get_inode()->is_stray()) {
13236 if (delay)
13237 stray_manager.queue_delayed(dn);
13238 else
13239 stray_manager.eval_stray(dn);
13240 }
13241 }
13242
13243 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13244 dout(10) << __func__ << " " << *diri << dendl;
13245 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13246 auto&& ls = diri->get_dirfrags();
13247 for (auto &p : ls) {
13248 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13249 p->try_remove_dentries_for_stray();
13250 }
13251 if (!diri->snaprealm) {
13252 if (diri->is_auth())
13253 diri->clear_dirty_rstat();
13254 diri->clear_scatter_dirty();
13255 }
13256 }
13257
13258 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13259 CInode *in = get_inode(number);
13260 if (!in) {
13261 return false;
13262 }
13263 f->open_object_section("inode");
13264 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13265 f->close_section();
13266 return true;
13267 }
13268
13269 void MDCache::handle_mdsmap(const MDSMap &mdsmap) {
13270 // process export_pin_delayed_queue whenever a new MDSMap received
13271 auto &q = export_pin_delayed_queue;
13272 for (auto it = q.begin(); it != q.end(); ) {
13273 auto *in = *it;
13274 mds_rank_t export_pin = in->get_export_pin(false);
13275 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13276 << " max_mds=" << mdsmap.get_max_mds() << dendl;
13277 if (export_pin >= mdsmap.get_max_mds()) {
13278 it++;
13279 continue;
13280 }
13281
13282 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13283 it = q.erase(it);
13284 in->maybe_export_pin();
13285 }
13286 }
13287